diff --git "a/checkpoints/checkpoint-20000/trainer_state.json" "b/checkpoints/checkpoint-20000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-20000/trainer_state.json" @@ -0,0 +1,140033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2648055181421167, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016325864250438759, + "grad_norm": 2.6033101081848145, + "learning_rate": 2e-08, + "loss": 3.4004, + "step": 1 + }, + { + "epoch": 0.00032651728500877517, + "grad_norm": 3.7327322959899902, + "learning_rate": 4e-08, + "loss": 3.5503, + "step": 2 + }, + { + "epoch": 0.0004897759275131628, + "grad_norm": 2.869396924972534, + "learning_rate": 6.000000000000001e-08, + "loss": 3.5052, + "step": 3 + }, + { + "epoch": 0.0006530345700175503, + "grad_norm": 3.605945587158203, + "learning_rate": 8e-08, + "loss": 3.5679, + "step": 4 + }, + { + "epoch": 0.0008162932125219379, + "grad_norm": 3.122082471847534, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.5535, + "step": 5 + }, + { + "epoch": 0.0009795518550263255, + "grad_norm": 3.1517856121063232, + "learning_rate": 1.2000000000000002e-07, + "loss": 3.531, + "step": 6 + }, + { + "epoch": 0.001142810497530713, + "grad_norm": 2.8109405040740967, + "learning_rate": 1.4e-07, + "loss": 3.5358, + "step": 7 + }, + { + "epoch": 0.0013060691400351007, + "grad_norm": 2.6619415283203125, + "learning_rate": 1.6e-07, + "loss": 3.4478, + "step": 8 + }, + { + "epoch": 0.0014693277825394882, + "grad_norm": 3.745173454284668, + "learning_rate": 1.8e-07, + "loss": 3.5365, + "step": 9 + }, + { + "epoch": 0.0016325864250438759, + "grad_norm": 2.8928394317626953, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.6134, + "step": 10 + }, + { + "epoch": 0.0017958450675482633, + "grad_norm": 2.726499557495117, + "learning_rate": 2.2e-07, + "loss": 3.4948, + "step": 11 + }, + { + "epoch": 0.001959103710052651, + "grad_norm": 2.678392171859741, + "learning_rate": 2.4000000000000003e-07, + "loss": 3.4073, + "step": 12 + }, + { + "epoch": 0.0021223623525570383, + "grad_norm": 2.7368736267089844, + "learning_rate": 2.6e-07, + "loss": 3.3911, + "step": 13 + }, + { + "epoch": 0.002285620995061426, + "grad_norm": 2.772325038909912, + "learning_rate": 2.8e-07, + "loss": 3.6133, + "step": 14 + }, + { + "epoch": 0.0024488796375658137, + "grad_norm": 3.1457278728485107, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.4348, + "step": 15 + }, + { + "epoch": 0.0026121382800702014, + "grad_norm": 3.166409492492676, + "learning_rate": 3.2e-07, + "loss": 3.5246, + "step": 16 + }, + { + "epoch": 0.0027753969225745886, + "grad_norm": 3.016062021255493, + "learning_rate": 3.4000000000000003e-07, + "loss": 3.5491, + "step": 17 + }, + { + "epoch": 0.0029386555650789763, + "grad_norm": 2.6820476055145264, + "learning_rate": 3.6e-07, + "loss": 3.437, + "step": 18 + }, + { + "epoch": 0.003101914207583364, + "grad_norm": 2.798854351043701, + "learning_rate": 3.8e-07, + "loss": 3.2704, + "step": 19 + }, + { + "epoch": 0.0032651728500877517, + "grad_norm": 2.497466564178467, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.3624, + "step": 20 + }, + { + "epoch": 0.003428431492592139, + "grad_norm": 2.721534252166748, + "learning_rate": 4.2000000000000006e-07, + "loss": 3.3006, + "step": 21 + }, + { + "epoch": 0.0035916901350965267, + "grad_norm": 2.7939298152923584, + "learning_rate": 4.4e-07, + "loss": 3.4231, + "step": 22 + }, + { + "epoch": 0.0037549487776009144, + "grad_norm": 3.022852897644043, + "learning_rate": 4.6000000000000004e-07, + "loss": 3.5319, + "step": 23 + }, + { + "epoch": 0.003918207420105302, + "grad_norm": 2.7380170822143555, + "learning_rate": 4.800000000000001e-07, + "loss": 3.3093, + "step": 24 + }, + { + "epoch": 0.00408146606260969, + "grad_norm": 2.8686535358428955, + "learning_rate": 5.000000000000001e-07, + "loss": 3.6839, + "step": 25 + }, + { + "epoch": 0.004244724705114077, + "grad_norm": 2.96225643157959, + "learning_rate": 5.2e-07, + "loss": 3.4209, + "step": 26 + }, + { + "epoch": 0.004407983347618464, + "grad_norm": 3.343003034591675, + "learning_rate": 5.4e-07, + "loss": 3.6372, + "step": 27 + }, + { + "epoch": 0.004571241990122852, + "grad_norm": 3.239604949951172, + "learning_rate": 5.6e-07, + "loss": 3.574, + "step": 28 + }, + { + "epoch": 0.00473450063262724, + "grad_norm": 3.0173134803771973, + "learning_rate": 5.800000000000001e-07, + "loss": 3.4681, + "step": 29 + }, + { + "epoch": 0.004897759275131627, + "grad_norm": 2.6635303497314453, + "learning_rate": 6.000000000000001e-07, + "loss": 3.3091, + "step": 30 + }, + { + "epoch": 0.005061017917636015, + "grad_norm": 3.0312557220458984, + "learning_rate": 6.200000000000001e-07, + "loss": 3.4112, + "step": 31 + }, + { + "epoch": 0.005224276560140403, + "grad_norm": 2.696537494659424, + "learning_rate": 6.4e-07, + "loss": 3.5429, + "step": 32 + }, + { + "epoch": 0.00538753520264479, + "grad_norm": 3.657029628753662, + "learning_rate": 6.6e-07, + "loss": 3.4713, + "step": 33 + }, + { + "epoch": 0.005550793845149177, + "grad_norm": 3.3929457664489746, + "learning_rate": 6.800000000000001e-07, + "loss": 3.4757, + "step": 34 + }, + { + "epoch": 0.005714052487653565, + "grad_norm": 2.9206650257110596, + "learning_rate": 7.000000000000001e-07, + "loss": 3.4167, + "step": 35 + }, + { + "epoch": 0.005877311130157953, + "grad_norm": 3.096414089202881, + "learning_rate": 7.2e-07, + "loss": 3.5128, + "step": 36 + }, + { + "epoch": 0.00604056977266234, + "grad_norm": 3.2433276176452637, + "learning_rate": 7.4e-07, + "loss": 3.5109, + "step": 37 + }, + { + "epoch": 0.006203828415166728, + "grad_norm": 3.1589174270629883, + "learning_rate": 7.6e-07, + "loss": 3.3431, + "step": 38 + }, + { + "epoch": 0.006367087057671116, + "grad_norm": 2.8945250511169434, + "learning_rate": 7.8e-07, + "loss": 3.4019, + "step": 39 + }, + { + "epoch": 0.006530345700175503, + "grad_norm": 3.9912614822387695, + "learning_rate": 8.000000000000001e-07, + "loss": 3.5928, + "step": 40 + }, + { + "epoch": 0.00669360434267989, + "grad_norm": 2.637004852294922, + "learning_rate": 8.200000000000001e-07, + "loss": 3.2395, + "step": 41 + }, + { + "epoch": 0.006856862985184278, + "grad_norm": 2.9251229763031006, + "learning_rate": 8.400000000000001e-07, + "loss": 3.4691, + "step": 42 + }, + { + "epoch": 0.007020121627688666, + "grad_norm": 2.8292365074157715, + "learning_rate": 8.6e-07, + "loss": 3.4685, + "step": 43 + }, + { + "epoch": 0.007183380270193053, + "grad_norm": 2.774548053741455, + "learning_rate": 8.8e-07, + "loss": 3.4606, + "step": 44 + }, + { + "epoch": 0.007346638912697441, + "grad_norm": 3.066887617111206, + "learning_rate": 9.000000000000001e-07, + "loss": 3.5057, + "step": 45 + }, + { + "epoch": 0.007509897555201829, + "grad_norm": 2.674781084060669, + "learning_rate": 9.200000000000001e-07, + "loss": 3.408, + "step": 46 + }, + { + "epoch": 0.007673156197706216, + "grad_norm": 2.6978938579559326, + "learning_rate": 9.400000000000001e-07, + "loss": 3.488, + "step": 47 + }, + { + "epoch": 0.007836414840210604, + "grad_norm": 3.810103416442871, + "learning_rate": 9.600000000000001e-07, + "loss": 3.4489, + "step": 48 + }, + { + "epoch": 0.007999673482714992, + "grad_norm": 3.0675759315490723, + "learning_rate": 9.800000000000001e-07, + "loss": 3.3242, + "step": 49 + }, + { + "epoch": 0.00816293212521938, + "grad_norm": 2.642082929611206, + "learning_rate": 1.0000000000000002e-06, + "loss": 3.2969, + "step": 50 + }, + { + "epoch": 0.008326190767723767, + "grad_norm": 2.9033143520355225, + "learning_rate": 1.02e-06, + "loss": 3.3804, + "step": 51 + }, + { + "epoch": 0.008489449410228153, + "grad_norm": 3.209754705429077, + "learning_rate": 1.04e-06, + "loss": 3.3548, + "step": 52 + }, + { + "epoch": 0.00865270805273254, + "grad_norm": 2.7264466285705566, + "learning_rate": 1.06e-06, + "loss": 3.1932, + "step": 53 + }, + { + "epoch": 0.008815966695236929, + "grad_norm": 2.7314743995666504, + "learning_rate": 1.08e-06, + "loss": 3.3246, + "step": 54 + }, + { + "epoch": 0.008979225337741316, + "grad_norm": 2.9919331073760986, + "learning_rate": 1.1e-06, + "loss": 3.3179, + "step": 55 + }, + { + "epoch": 0.009142483980245704, + "grad_norm": 2.994581937789917, + "learning_rate": 1.12e-06, + "loss": 3.4928, + "step": 56 + }, + { + "epoch": 0.009305742622750092, + "grad_norm": 2.964712619781494, + "learning_rate": 1.14e-06, + "loss": 3.4166, + "step": 57 + }, + { + "epoch": 0.00946900126525448, + "grad_norm": 2.652458906173706, + "learning_rate": 1.1600000000000001e-06, + "loss": 3.3717, + "step": 58 + }, + { + "epoch": 0.009632259907758867, + "grad_norm": 3.0676703453063965, + "learning_rate": 1.1800000000000001e-06, + "loss": 3.3525, + "step": 59 + }, + { + "epoch": 0.009795518550263255, + "grad_norm": 2.994270086288452, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.378, + "step": 60 + }, + { + "epoch": 0.009958777192767642, + "grad_norm": 3.4734878540039062, + "learning_rate": 1.2200000000000002e-06, + "loss": 3.3916, + "step": 61 + }, + { + "epoch": 0.01012203583527203, + "grad_norm": 2.6581945419311523, + "learning_rate": 1.2400000000000002e-06, + "loss": 3.2896, + "step": 62 + }, + { + "epoch": 0.010285294477776418, + "grad_norm": 2.923051118850708, + "learning_rate": 1.26e-06, + "loss": 3.2224, + "step": 63 + }, + { + "epoch": 0.010448553120280805, + "grad_norm": 2.9782087802886963, + "learning_rate": 1.28e-06, + "loss": 3.4146, + "step": 64 + }, + { + "epoch": 0.010611811762785193, + "grad_norm": 3.0207362174987793, + "learning_rate": 1.3e-06, + "loss": 3.5217, + "step": 65 + }, + { + "epoch": 0.01077507040528958, + "grad_norm": 2.6329057216644287, + "learning_rate": 1.32e-06, + "loss": 3.1558, + "step": 66 + }, + { + "epoch": 0.010938329047793967, + "grad_norm": 2.6853442192077637, + "learning_rate": 1.34e-06, + "loss": 3.4115, + "step": 67 + }, + { + "epoch": 0.011101587690298355, + "grad_norm": 3.012260913848877, + "learning_rate": 1.3600000000000001e-06, + "loss": 3.3941, + "step": 68 + }, + { + "epoch": 0.011264846332802742, + "grad_norm": 2.6659438610076904, + "learning_rate": 1.3800000000000001e-06, + "loss": 3.3189, + "step": 69 + }, + { + "epoch": 0.01142810497530713, + "grad_norm": 3.170567035675049, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.3776, + "step": 70 + }, + { + "epoch": 0.011591363617811518, + "grad_norm": 2.726120710372925, + "learning_rate": 1.42e-06, + "loss": 3.23, + "step": 71 + }, + { + "epoch": 0.011754622260315905, + "grad_norm": 2.7397310733795166, + "learning_rate": 1.44e-06, + "loss": 3.3241, + "step": 72 + }, + { + "epoch": 0.011917880902820293, + "grad_norm": 2.8247501850128174, + "learning_rate": 1.46e-06, + "loss": 3.2031, + "step": 73 + }, + { + "epoch": 0.01208113954532468, + "grad_norm": 2.9634339809417725, + "learning_rate": 1.48e-06, + "loss": 3.1842, + "step": 74 + }, + { + "epoch": 0.012244398187829068, + "grad_norm": 2.6376233100891113, + "learning_rate": 1.5e-06, + "loss": 3.2034, + "step": 75 + }, + { + "epoch": 0.012407656830333456, + "grad_norm": 2.740602970123291, + "learning_rate": 1.52e-06, + "loss": 3.1517, + "step": 76 + }, + { + "epoch": 0.012570915472837844, + "grad_norm": 2.8798398971557617, + "learning_rate": 1.54e-06, + "loss": 3.2775, + "step": 77 + }, + { + "epoch": 0.012734174115342231, + "grad_norm": 2.9144585132598877, + "learning_rate": 1.56e-06, + "loss": 3.2373, + "step": 78 + }, + { + "epoch": 0.01289743275784662, + "grad_norm": 2.5058372020721436, + "learning_rate": 1.5800000000000001e-06, + "loss": 3.2275, + "step": 79 + }, + { + "epoch": 0.013060691400351007, + "grad_norm": 2.738837242126465, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.0292, + "step": 80 + }, + { + "epoch": 0.013223950042855395, + "grad_norm": 3.099492073059082, + "learning_rate": 1.6200000000000002e-06, + "loss": 3.145, + "step": 81 + }, + { + "epoch": 0.01338720868535978, + "grad_norm": 2.5461785793304443, + "learning_rate": 1.6400000000000002e-06, + "loss": 3.0796, + "step": 82 + }, + { + "epoch": 0.013550467327864168, + "grad_norm": 3.377657175064087, + "learning_rate": 1.6600000000000002e-06, + "loss": 3.0228, + "step": 83 + }, + { + "epoch": 0.013713725970368556, + "grad_norm": 2.7379884719848633, + "learning_rate": 1.6800000000000002e-06, + "loss": 2.9684, + "step": 84 + }, + { + "epoch": 0.013876984612872944, + "grad_norm": 2.889983654022217, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0686, + "step": 85 + }, + { + "epoch": 0.014040243255377331, + "grad_norm": 3.0050642490386963, + "learning_rate": 1.72e-06, + "loss": 3.1926, + "step": 86 + }, + { + "epoch": 0.014203501897881719, + "grad_norm": 2.707963705062866, + "learning_rate": 1.74e-06, + "loss": 3.0134, + "step": 87 + }, + { + "epoch": 0.014366760540386107, + "grad_norm": 2.730029344558716, + "learning_rate": 1.76e-06, + "loss": 3.0117, + "step": 88 + }, + { + "epoch": 0.014530019182890494, + "grad_norm": 2.466172695159912, + "learning_rate": 1.7800000000000001e-06, + "loss": 2.8155, + "step": 89 + }, + { + "epoch": 0.014693277825394882, + "grad_norm": 2.728130340576172, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.9858, + "step": 90 + }, + { + "epoch": 0.01485653646789927, + "grad_norm": 2.8771209716796875, + "learning_rate": 1.8200000000000002e-06, + "loss": 3.1122, + "step": 91 + }, + { + "epoch": 0.015019795110403657, + "grad_norm": 2.803978204727173, + "learning_rate": 1.8400000000000002e-06, + "loss": 2.9593, + "step": 92 + }, + { + "epoch": 0.015183053752908045, + "grad_norm": 2.448578357696533, + "learning_rate": 1.8600000000000002e-06, + "loss": 2.5863, + "step": 93 + }, + { + "epoch": 0.015346312395412433, + "grad_norm": 2.9117980003356934, + "learning_rate": 1.8800000000000002e-06, + "loss": 2.6925, + "step": 94 + }, + { + "epoch": 0.01550957103791682, + "grad_norm": 2.8578615188598633, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.8129, + "step": 95 + }, + { + "epoch": 0.015672829680421208, + "grad_norm": 3.0731899738311768, + "learning_rate": 1.9200000000000003e-06, + "loss": 2.9359, + "step": 96 + }, + { + "epoch": 0.015836088322925594, + "grad_norm": 3.112687826156616, + "learning_rate": 1.94e-06, + "loss": 2.7836, + "step": 97 + }, + { + "epoch": 0.015999346965429984, + "grad_norm": 3.0433127880096436, + "learning_rate": 1.9600000000000003e-06, + "loss": 2.6969, + "step": 98 + }, + { + "epoch": 0.01616260560793437, + "grad_norm": 3.2575583457946777, + "learning_rate": 1.98e-06, + "loss": 2.7697, + "step": 99 + }, + { + "epoch": 0.01632586425043876, + "grad_norm": 2.598994493484497, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.751, + "step": 100 + }, + { + "epoch": 0.016489122892943145, + "grad_norm": 3.5013861656188965, + "learning_rate": 2.02e-06, + "loss": 2.6693, + "step": 101 + }, + { + "epoch": 0.016652381535447534, + "grad_norm": 3.2837140560150146, + "learning_rate": 2.04e-06, + "loss": 2.7556, + "step": 102 + }, + { + "epoch": 0.01681564017795192, + "grad_norm": 3.8987998962402344, + "learning_rate": 2.06e-06, + "loss": 2.6718, + "step": 103 + }, + { + "epoch": 0.016978898820456306, + "grad_norm": 3.455641508102417, + "learning_rate": 2.08e-06, + "loss": 2.7046, + "step": 104 + }, + { + "epoch": 0.017142157462960696, + "grad_norm": 3.2689836025238037, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.5543, + "step": 105 + }, + { + "epoch": 0.01730541610546508, + "grad_norm": 3.1042590141296387, + "learning_rate": 2.12e-06, + "loss": 2.4787, + "step": 106 + }, + { + "epoch": 0.01746867474796947, + "grad_norm": 3.2438113689422607, + "learning_rate": 2.1400000000000003e-06, + "loss": 2.5919, + "step": 107 + }, + { + "epoch": 0.017631933390473857, + "grad_norm": 3.271658420562744, + "learning_rate": 2.16e-06, + "loss": 2.3686, + "step": 108 + }, + { + "epoch": 0.017795192032978246, + "grad_norm": 3.2078323364257812, + "learning_rate": 2.1800000000000003e-06, + "loss": 2.4998, + "step": 109 + }, + { + "epoch": 0.017958450675482632, + "grad_norm": 3.267106771469116, + "learning_rate": 2.2e-06, + "loss": 2.5409, + "step": 110 + }, + { + "epoch": 0.018121709317987022, + "grad_norm": 2.9392611980438232, + "learning_rate": 2.2200000000000003e-06, + "loss": 2.3312, + "step": 111 + }, + { + "epoch": 0.018284967960491408, + "grad_norm": 3.393911361694336, + "learning_rate": 2.24e-06, + "loss": 2.4156, + "step": 112 + }, + { + "epoch": 0.018448226602995797, + "grad_norm": 3.66111421585083, + "learning_rate": 2.2600000000000004e-06, + "loss": 2.532, + "step": 113 + }, + { + "epoch": 0.018611485245500183, + "grad_norm": 3.183798313140869, + "learning_rate": 2.28e-06, + "loss": 2.3355, + "step": 114 + }, + { + "epoch": 0.018774743888004573, + "grad_norm": 3.639376163482666, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.4429, + "step": 115 + }, + { + "epoch": 0.01893800253050896, + "grad_norm": 2.9287357330322266, + "learning_rate": 2.3200000000000002e-06, + "loss": 2.15, + "step": 116 + }, + { + "epoch": 0.019101261173013348, + "grad_norm": 3.2446250915527344, + "learning_rate": 2.3400000000000005e-06, + "loss": 2.2276, + "step": 117 + }, + { + "epoch": 0.019264519815517734, + "grad_norm": 2.768106698989868, + "learning_rate": 2.3600000000000003e-06, + "loss": 2.2567, + "step": 118 + }, + { + "epoch": 0.01942777845802212, + "grad_norm": 2.6894707679748535, + "learning_rate": 2.38e-06, + "loss": 2.1898, + "step": 119 + }, + { + "epoch": 0.01959103710052651, + "grad_norm": 2.929439067840576, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.1807, + "step": 120 + }, + { + "epoch": 0.019754295743030895, + "grad_norm": 3.2130661010742188, + "learning_rate": 2.42e-06, + "loss": 2.1286, + "step": 121 + }, + { + "epoch": 0.019917554385535285, + "grad_norm": 3.124307870864868, + "learning_rate": 2.4400000000000004e-06, + "loss": 2.2528, + "step": 122 + }, + { + "epoch": 0.02008081302803967, + "grad_norm": 2.8239824771881104, + "learning_rate": 2.46e-06, + "loss": 2.1212, + "step": 123 + }, + { + "epoch": 0.02024407167054406, + "grad_norm": 2.901832342147827, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.8622, + "step": 124 + }, + { + "epoch": 0.020407330313048446, + "grad_norm": 2.785125732421875, + "learning_rate": 2.5e-06, + "loss": 2.1786, + "step": 125 + }, + { + "epoch": 0.020570588955552836, + "grad_norm": 2.7350783348083496, + "learning_rate": 2.52e-06, + "loss": 1.9966, + "step": 126 + }, + { + "epoch": 0.02073384759805722, + "grad_norm": 2.692490339279175, + "learning_rate": 2.5400000000000002e-06, + "loss": 1.9132, + "step": 127 + }, + { + "epoch": 0.02089710624056161, + "grad_norm": 3.1648802757263184, + "learning_rate": 2.56e-06, + "loss": 1.8623, + "step": 128 + }, + { + "epoch": 0.021060364883065997, + "grad_norm": 2.889390230178833, + "learning_rate": 2.5800000000000003e-06, + "loss": 1.7629, + "step": 129 + }, + { + "epoch": 0.021223623525570386, + "grad_norm": 3.0257678031921387, + "learning_rate": 2.6e-06, + "loss": 1.9024, + "step": 130 + }, + { + "epoch": 0.021386882168074772, + "grad_norm": 2.9072747230529785, + "learning_rate": 2.6200000000000003e-06, + "loss": 1.8467, + "step": 131 + }, + { + "epoch": 0.02155014081057916, + "grad_norm": 3.107095241546631, + "learning_rate": 2.64e-06, + "loss": 1.8584, + "step": 132 + }, + { + "epoch": 0.021713399453083548, + "grad_norm": 2.426776170730591, + "learning_rate": 2.6600000000000004e-06, + "loss": 1.8054, + "step": 133 + }, + { + "epoch": 0.021876658095587934, + "grad_norm": 2.595407724380493, + "learning_rate": 2.68e-06, + "loss": 1.6519, + "step": 134 + }, + { + "epoch": 0.022039916738092323, + "grad_norm": 2.245654821395874, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.721, + "step": 135 + }, + { + "epoch": 0.02220317538059671, + "grad_norm": 1.9480783939361572, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.7439, + "step": 136 + }, + { + "epoch": 0.0223664340231011, + "grad_norm": 2.581364870071411, + "learning_rate": 2.7400000000000004e-06, + "loss": 1.8561, + "step": 137 + }, + { + "epoch": 0.022529692665605484, + "grad_norm": 1.8394737243652344, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.7832, + "step": 138 + }, + { + "epoch": 0.022692951308109874, + "grad_norm": 1.96915864944458, + "learning_rate": 2.7800000000000005e-06, + "loss": 1.7759, + "step": 139 + }, + { + "epoch": 0.02285620995061426, + "grad_norm": 2.141920804977417, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.7082, + "step": 140 + }, + { + "epoch": 0.02301946859311865, + "grad_norm": 2.139808416366577, + "learning_rate": 2.82e-06, + "loss": 1.7296, + "step": 141 + }, + { + "epoch": 0.023182727235623035, + "grad_norm": 2.474648952484131, + "learning_rate": 2.84e-06, + "loss": 1.7241, + "step": 142 + }, + { + "epoch": 0.023345985878127425, + "grad_norm": 2.2544307708740234, + "learning_rate": 2.86e-06, + "loss": 1.6342, + "step": 143 + }, + { + "epoch": 0.02350924452063181, + "grad_norm": 1.8940359354019165, + "learning_rate": 2.88e-06, + "loss": 1.8047, + "step": 144 + }, + { + "epoch": 0.0236725031631362, + "grad_norm": 2.0767056941986084, + "learning_rate": 2.9e-06, + "loss": 1.6129, + "step": 145 + }, + { + "epoch": 0.023835761805640586, + "grad_norm": 1.9623079299926758, + "learning_rate": 2.92e-06, + "loss": 1.7396, + "step": 146 + }, + { + "epoch": 0.023999020448144975, + "grad_norm": 2.2459523677825928, + "learning_rate": 2.9400000000000002e-06, + "loss": 1.7015, + "step": 147 + }, + { + "epoch": 0.02416227909064936, + "grad_norm": 1.9305064678192139, + "learning_rate": 2.96e-06, + "loss": 1.7263, + "step": 148 + }, + { + "epoch": 0.024325537733153747, + "grad_norm": 2.420577049255371, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.6789, + "step": 149 + }, + { + "epoch": 0.024488796375658137, + "grad_norm": 2.3346197605133057, + "learning_rate": 3e-06, + "loss": 1.8634, + "step": 150 + }, + { + "epoch": 0.024652055018162523, + "grad_norm": 2.034031391143799, + "learning_rate": 3.0200000000000003e-06, + "loss": 1.5641, + "step": 151 + }, + { + "epoch": 0.024815313660666912, + "grad_norm": 2.277332067489624, + "learning_rate": 3.04e-06, + "loss": 1.7236, + "step": 152 + }, + { + "epoch": 0.024978572303171298, + "grad_norm": 2.122135639190674, + "learning_rate": 3.0600000000000003e-06, + "loss": 1.5725, + "step": 153 + }, + { + "epoch": 0.025141830945675687, + "grad_norm": 2.085005044937134, + "learning_rate": 3.08e-06, + "loss": 1.571, + "step": 154 + }, + { + "epoch": 0.025305089588180073, + "grad_norm": 2.0196194648742676, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.6876, + "step": 155 + }, + { + "epoch": 0.025468348230684463, + "grad_norm": 2.0159735679626465, + "learning_rate": 3.12e-06, + "loss": 1.5243, + "step": 156 + }, + { + "epoch": 0.02563160687318885, + "grad_norm": 1.865037441253662, + "learning_rate": 3.1400000000000004e-06, + "loss": 1.5005, + "step": 157 + }, + { + "epoch": 0.02579486551569324, + "grad_norm": 1.8383852243423462, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.6258, + "step": 158 + }, + { + "epoch": 0.025958124158197624, + "grad_norm": 2.28385329246521, + "learning_rate": 3.1800000000000005e-06, + "loss": 1.4342, + "step": 159 + }, + { + "epoch": 0.026121382800702014, + "grad_norm": 1.7752000093460083, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5506, + "step": 160 + }, + { + "epoch": 0.0262846414432064, + "grad_norm": 2.923414945602417, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.5635, + "step": 161 + }, + { + "epoch": 0.02644790008571079, + "grad_norm": 1.5688098669052124, + "learning_rate": 3.2400000000000003e-06, + "loss": 1.305, + "step": 162 + }, + { + "epoch": 0.026611158728215175, + "grad_norm": 2.4980533123016357, + "learning_rate": 3.2600000000000006e-06, + "loss": 1.5358, + "step": 163 + }, + { + "epoch": 0.02677441737071956, + "grad_norm": 1.7541378736495972, + "learning_rate": 3.2800000000000004e-06, + "loss": 1.4512, + "step": 164 + }, + { + "epoch": 0.02693767601322395, + "grad_norm": 2.055202007293701, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.668, + "step": 165 + }, + { + "epoch": 0.027100934655728336, + "grad_norm": 1.4846760034561157, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.3364, + "step": 166 + }, + { + "epoch": 0.027264193298232726, + "grad_norm": 1.6579443216323853, + "learning_rate": 3.3400000000000006e-06, + "loss": 1.432, + "step": 167 + }, + { + "epoch": 0.027427451940737112, + "grad_norm": 1.7275137901306152, + "learning_rate": 3.3600000000000004e-06, + "loss": 1.5508, + "step": 168 + }, + { + "epoch": 0.0275907105832415, + "grad_norm": 1.9515480995178223, + "learning_rate": 3.3800000000000007e-06, + "loss": 1.4596, + "step": 169 + }, + { + "epoch": 0.027753969225745887, + "grad_norm": 1.8022053241729736, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.5817, + "step": 170 + }, + { + "epoch": 0.027917227868250277, + "grad_norm": 1.7202377319335938, + "learning_rate": 3.4200000000000007e-06, + "loss": 1.4101, + "step": 171 + }, + { + "epoch": 0.028080486510754663, + "grad_norm": 2.0519022941589355, + "learning_rate": 3.44e-06, + "loss": 1.4796, + "step": 172 + }, + { + "epoch": 0.028243745153259052, + "grad_norm": 1.7433277368545532, + "learning_rate": 3.46e-06, + "loss": 1.6402, + "step": 173 + }, + { + "epoch": 0.028407003795763438, + "grad_norm": 1.6006267070770264, + "learning_rate": 3.48e-06, + "loss": 1.5036, + "step": 174 + }, + { + "epoch": 0.028570262438267827, + "grad_norm": 1.6294777393341064, + "learning_rate": 3.5e-06, + "loss": 1.4822, + "step": 175 + }, + { + "epoch": 0.028733521080772213, + "grad_norm": 1.6254267692565918, + "learning_rate": 3.52e-06, + "loss": 1.4198, + "step": 176 + }, + { + "epoch": 0.0288967797232766, + "grad_norm": 1.8645423650741577, + "learning_rate": 3.54e-06, + "loss": 1.4994, + "step": 177 + }, + { + "epoch": 0.02906003836578099, + "grad_norm": 1.8686392307281494, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.3703, + "step": 178 + }, + { + "epoch": 0.029223297008285375, + "grad_norm": 1.5615224838256836, + "learning_rate": 3.58e-06, + "loss": 1.4052, + "step": 179 + }, + { + "epoch": 0.029386555650789764, + "grad_norm": 1.4917488098144531, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4012, + "step": 180 + }, + { + "epoch": 0.02954981429329415, + "grad_norm": 1.5183836221694946, + "learning_rate": 3.62e-06, + "loss": 1.3347, + "step": 181 + }, + { + "epoch": 0.02971307293579854, + "grad_norm": 1.6634671688079834, + "learning_rate": 3.6400000000000003e-06, + "loss": 1.409, + "step": 182 + }, + { + "epoch": 0.029876331578302925, + "grad_norm": 1.7330362796783447, + "learning_rate": 3.66e-06, + "loss": 1.3344, + "step": 183 + }, + { + "epoch": 0.030039590220807315, + "grad_norm": 1.5642423629760742, + "learning_rate": 3.6800000000000003e-06, + "loss": 1.4214, + "step": 184 + }, + { + "epoch": 0.0302028488633117, + "grad_norm": 1.682076334953308, + "learning_rate": 3.7e-06, + "loss": 1.5688, + "step": 185 + }, + { + "epoch": 0.03036610750581609, + "grad_norm": 1.6651852130889893, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.4421, + "step": 186 + }, + { + "epoch": 0.030529366148320476, + "grad_norm": 1.5420279502868652, + "learning_rate": 3.74e-06, + "loss": 1.3029, + "step": 187 + }, + { + "epoch": 0.030692624790824866, + "grad_norm": 1.7709190845489502, + "learning_rate": 3.7600000000000004e-06, + "loss": 1.5403, + "step": 188 + }, + { + "epoch": 0.03085588343332925, + "grad_norm": 1.604353427886963, + "learning_rate": 3.7800000000000002e-06, + "loss": 1.4906, + "step": 189 + }, + { + "epoch": 0.03101914207583364, + "grad_norm": 2.0194711685180664, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.2699, + "step": 190 + }, + { + "epoch": 0.031182400718338027, + "grad_norm": 1.5488393306732178, + "learning_rate": 3.820000000000001e-06, + "loss": 1.61, + "step": 191 + }, + { + "epoch": 0.031345659360842416, + "grad_norm": 1.4095816612243652, + "learning_rate": 3.8400000000000005e-06, + "loss": 1.3254, + "step": 192 + }, + { + "epoch": 0.0315089180033468, + "grad_norm": 1.522632360458374, + "learning_rate": 3.86e-06, + "loss": 1.4244, + "step": 193 + }, + { + "epoch": 0.03167217664585119, + "grad_norm": 1.754141926765442, + "learning_rate": 3.88e-06, + "loss": 1.5017, + "step": 194 + }, + { + "epoch": 0.03183543528835558, + "grad_norm": 1.8344064950942993, + "learning_rate": 3.900000000000001e-06, + "loss": 1.3473, + "step": 195 + }, + { + "epoch": 0.03199869393085997, + "grad_norm": 1.5926331281661987, + "learning_rate": 3.920000000000001e-06, + "loss": 1.5758, + "step": 196 + }, + { + "epoch": 0.03216195257336435, + "grad_norm": 1.4991172552108765, + "learning_rate": 3.94e-06, + "loss": 1.3974, + "step": 197 + }, + { + "epoch": 0.03232521121586874, + "grad_norm": 1.8705028295516968, + "learning_rate": 3.96e-06, + "loss": 1.5125, + "step": 198 + }, + { + "epoch": 0.03248846985837313, + "grad_norm": 1.6602659225463867, + "learning_rate": 3.980000000000001e-06, + "loss": 1.5436, + "step": 199 + }, + { + "epoch": 0.03265172850087752, + "grad_norm": 1.819608449935913, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5024, + "step": 200 + }, + { + "epoch": 0.0328149871433819, + "grad_norm": 1.6650482416152954, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.5034, + "step": 201 + }, + { + "epoch": 0.03297824578588629, + "grad_norm": 1.758331537246704, + "learning_rate": 4.04e-06, + "loss": 1.4673, + "step": 202 + }, + { + "epoch": 0.03314150442839068, + "grad_norm": 1.3998029232025146, + "learning_rate": 4.060000000000001e-06, + "loss": 1.2635, + "step": 203 + }, + { + "epoch": 0.03330476307089507, + "grad_norm": 1.6199387311935425, + "learning_rate": 4.08e-06, + "loss": 1.3724, + "step": 204 + }, + { + "epoch": 0.03346802171339945, + "grad_norm": 1.8421322107315063, + "learning_rate": 4.1e-06, + "loss": 1.4737, + "step": 205 + }, + { + "epoch": 0.03363128035590384, + "grad_norm": 1.8446450233459473, + "learning_rate": 4.12e-06, + "loss": 1.1772, + "step": 206 + }, + { + "epoch": 0.03379453899840823, + "grad_norm": 1.6325796842575073, + "learning_rate": 4.14e-06, + "loss": 1.5462, + "step": 207 + }, + { + "epoch": 0.03395779764091261, + "grad_norm": 1.5166337490081787, + "learning_rate": 4.16e-06, + "loss": 1.2812, + "step": 208 + }, + { + "epoch": 0.034121056283417, + "grad_norm": 2.056048631668091, + "learning_rate": 4.18e-06, + "loss": 1.4828, + "step": 209 + }, + { + "epoch": 0.03428431492592139, + "grad_norm": 1.6736007928848267, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.5851, + "step": 210 + }, + { + "epoch": 0.03444757356842578, + "grad_norm": 1.7450826168060303, + "learning_rate": 4.22e-06, + "loss": 1.3061, + "step": 211 + }, + { + "epoch": 0.03461083221093016, + "grad_norm": 2.008894443511963, + "learning_rate": 4.24e-06, + "loss": 1.4816, + "step": 212 + }, + { + "epoch": 0.03477409085343455, + "grad_norm": 1.7426434755325317, + "learning_rate": 4.26e-06, + "loss": 1.3139, + "step": 213 + }, + { + "epoch": 0.03493734949593894, + "grad_norm": 1.716793417930603, + "learning_rate": 4.2800000000000005e-06, + "loss": 1.4927, + "step": 214 + }, + { + "epoch": 0.03510060813844333, + "grad_norm": 1.6640710830688477, + "learning_rate": 4.3e-06, + "loss": 1.4486, + "step": 215 + }, + { + "epoch": 0.035263866780947714, + "grad_norm": 1.6972944736480713, + "learning_rate": 4.32e-06, + "loss": 1.4054, + "step": 216 + }, + { + "epoch": 0.035427125423452104, + "grad_norm": 1.5209475755691528, + "learning_rate": 4.34e-06, + "loss": 1.1952, + "step": 217 + }, + { + "epoch": 0.03559038406595649, + "grad_norm": 1.6674305200576782, + "learning_rate": 4.360000000000001e-06, + "loss": 1.3865, + "step": 218 + }, + { + "epoch": 0.03575364270846088, + "grad_norm": 1.6292712688446045, + "learning_rate": 4.38e-06, + "loss": 1.4336, + "step": 219 + }, + { + "epoch": 0.035916901350965265, + "grad_norm": 1.8487358093261719, + "learning_rate": 4.4e-06, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.036080159993469654, + "grad_norm": 1.9384684562683105, + "learning_rate": 4.42e-06, + "loss": 1.2103, + "step": 221 + }, + { + "epoch": 0.036243418635974044, + "grad_norm": 1.9539082050323486, + "learning_rate": 4.440000000000001e-06, + "loss": 1.3662, + "step": 222 + }, + { + "epoch": 0.036406677278478426, + "grad_norm": 1.5603522062301636, + "learning_rate": 4.4600000000000005e-06, + "loss": 1.447, + "step": 223 + }, + { + "epoch": 0.036569935920982816, + "grad_norm": 1.6361925601959229, + "learning_rate": 4.48e-06, + "loss": 1.5757, + "step": 224 + }, + { + "epoch": 0.036733194563487205, + "grad_norm": 1.5589345693588257, + "learning_rate": 4.5e-06, + "loss": 1.3436, + "step": 225 + }, + { + "epoch": 0.036896453205991595, + "grad_norm": 1.6450508832931519, + "learning_rate": 4.520000000000001e-06, + "loss": 1.5028, + "step": 226 + }, + { + "epoch": 0.03705971184849598, + "grad_norm": 1.9003053903579712, + "learning_rate": 4.540000000000001e-06, + "loss": 1.5323, + "step": 227 + }, + { + "epoch": 0.037222970491000366, + "grad_norm": 1.8154126405715942, + "learning_rate": 4.56e-06, + "loss": 1.3956, + "step": 228 + }, + { + "epoch": 0.037386229133504756, + "grad_norm": 1.8339314460754395, + "learning_rate": 4.58e-06, + "loss": 1.4306, + "step": 229 + }, + { + "epoch": 0.037549487776009145, + "grad_norm": 1.834531307220459, + "learning_rate": 4.600000000000001e-06, + "loss": 1.3822, + "step": 230 + }, + { + "epoch": 0.03771274641851353, + "grad_norm": 1.6129051446914673, + "learning_rate": 4.620000000000001e-06, + "loss": 1.4301, + "step": 231 + }, + { + "epoch": 0.03787600506101792, + "grad_norm": 1.5726324319839478, + "learning_rate": 4.6400000000000005e-06, + "loss": 1.4197, + "step": 232 + }, + { + "epoch": 0.03803926370352231, + "grad_norm": 1.6806925535202026, + "learning_rate": 4.66e-06, + "loss": 1.4155, + "step": 233 + }, + { + "epoch": 0.038202522346026696, + "grad_norm": 1.9844831228256226, + "learning_rate": 4.680000000000001e-06, + "loss": 1.5435, + "step": 234 + }, + { + "epoch": 0.03836578098853108, + "grad_norm": 1.6292520761489868, + "learning_rate": 4.7e-06, + "loss": 1.3312, + "step": 235 + }, + { + "epoch": 0.03852903963103547, + "grad_norm": 1.629751443862915, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.3402, + "step": 236 + }, + { + "epoch": 0.03869229827353986, + "grad_norm": 1.5564073324203491, + "learning_rate": 4.74e-06, + "loss": 1.4724, + "step": 237 + }, + { + "epoch": 0.03885555691604424, + "grad_norm": 1.6269237995147705, + "learning_rate": 4.76e-06, + "loss": 1.2937, + "step": 238 + }, + { + "epoch": 0.03901881555854863, + "grad_norm": 1.9189403057098389, + "learning_rate": 4.78e-06, + "loss": 1.3602, + "step": 239 + }, + { + "epoch": 0.03918207420105302, + "grad_norm": 1.6265121698379517, + "learning_rate": 4.800000000000001e-06, + "loss": 1.394, + "step": 240 + }, + { + "epoch": 0.03934533284355741, + "grad_norm": 1.965876579284668, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.4015, + "step": 241 + }, + { + "epoch": 0.03950859148606179, + "grad_norm": 1.7224138975143433, + "learning_rate": 4.84e-06, + "loss": 1.4144, + "step": 242 + }, + { + "epoch": 0.03967185012856618, + "grad_norm": 1.6164450645446777, + "learning_rate": 4.86e-06, + "loss": 1.5031, + "step": 243 + }, + { + "epoch": 0.03983510877107057, + "grad_norm": 1.755733847618103, + "learning_rate": 4.880000000000001e-06, + "loss": 1.3916, + "step": 244 + }, + { + "epoch": 0.03999836741357496, + "grad_norm": 1.854887843132019, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5837, + "step": 245 + }, + { + "epoch": 0.04016162605607934, + "grad_norm": 1.747153878211975, + "learning_rate": 4.92e-06, + "loss": 1.3597, + "step": 246 + }, + { + "epoch": 0.04032488469858373, + "grad_norm": 2.0980474948883057, + "learning_rate": 4.94e-06, + "loss": 1.1637, + "step": 247 + }, + { + "epoch": 0.04048814334108812, + "grad_norm": 1.9024879932403564, + "learning_rate": 4.960000000000001e-06, + "loss": 1.3828, + "step": 248 + }, + { + "epoch": 0.04065140198359251, + "grad_norm": 1.8530932664871216, + "learning_rate": 4.980000000000001e-06, + "loss": 1.3278, + "step": 249 + }, + { + "epoch": 0.04081466062609689, + "grad_norm": 1.8176486492156982, + "learning_rate": 5e-06, + "loss": 1.4399, + "step": 250 + }, + { + "epoch": 0.04097791926860128, + "grad_norm": 2.028615713119507, + "learning_rate": 5.02e-06, + "loss": 1.3479, + "step": 251 + }, + { + "epoch": 0.04114117791110567, + "grad_norm": 1.8009893894195557, + "learning_rate": 5.04e-06, + "loss": 1.3192, + "step": 252 + }, + { + "epoch": 0.041304436553610054, + "grad_norm": 1.5895270109176636, + "learning_rate": 5.060000000000001e-06, + "loss": 1.3178, + "step": 253 + }, + { + "epoch": 0.04146769519611444, + "grad_norm": 1.7072293758392334, + "learning_rate": 5.0800000000000005e-06, + "loss": 1.2674, + "step": 254 + }, + { + "epoch": 0.04163095383861883, + "grad_norm": 1.9572274684906006, + "learning_rate": 5.1e-06, + "loss": 1.3481, + "step": 255 + }, + { + "epoch": 0.04179421248112322, + "grad_norm": 2.0055418014526367, + "learning_rate": 5.12e-06, + "loss": 1.4567, + "step": 256 + }, + { + "epoch": 0.041957471123627604, + "grad_norm": 1.632832646369934, + "learning_rate": 5.140000000000001e-06, + "loss": 1.3515, + "step": 257 + }, + { + "epoch": 0.042120729766131994, + "grad_norm": 1.7226401567459106, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.2223, + "step": 258 + }, + { + "epoch": 0.04228398840863638, + "grad_norm": 1.745452642440796, + "learning_rate": 5.18e-06, + "loss": 1.2768, + "step": 259 + }, + { + "epoch": 0.04244724705114077, + "grad_norm": 1.5732847452163696, + "learning_rate": 5.2e-06, + "loss": 1.2221, + "step": 260 + }, + { + "epoch": 0.042610505693645155, + "grad_norm": 2.1821823120117188, + "learning_rate": 5.220000000000001e-06, + "loss": 1.2095, + "step": 261 + }, + { + "epoch": 0.042773764336149545, + "grad_norm": 1.6556347608566284, + "learning_rate": 5.240000000000001e-06, + "loss": 1.3607, + "step": 262 + }, + { + "epoch": 0.042937022978653934, + "grad_norm": 2.3543052673339844, + "learning_rate": 5.2600000000000005e-06, + "loss": 1.3836, + "step": 263 + }, + { + "epoch": 0.04310028162115832, + "grad_norm": 1.6909420490264893, + "learning_rate": 5.28e-06, + "loss": 1.244, + "step": 264 + }, + { + "epoch": 0.043263540263662706, + "grad_norm": 1.8927441835403442, + "learning_rate": 5.300000000000001e-06, + "loss": 1.4352, + "step": 265 + }, + { + "epoch": 0.043426798906167095, + "grad_norm": 1.794523000717163, + "learning_rate": 5.320000000000001e-06, + "loss": 1.3977, + "step": 266 + }, + { + "epoch": 0.043590057548671485, + "grad_norm": 1.9960391521453857, + "learning_rate": 5.3400000000000005e-06, + "loss": 1.1623, + "step": 267 + }, + { + "epoch": 0.04375331619117587, + "grad_norm": 1.8895787000656128, + "learning_rate": 5.36e-06, + "loss": 1.2235, + "step": 268 + }, + { + "epoch": 0.04391657483368026, + "grad_norm": 1.9792596101760864, + "learning_rate": 5.380000000000001e-06, + "loss": 1.4607, + "step": 269 + }, + { + "epoch": 0.044079833476184646, + "grad_norm": 1.84207022190094, + "learning_rate": 5.400000000000001e-06, + "loss": 1.3426, + "step": 270 + }, + { + "epoch": 0.044243092118689036, + "grad_norm": 2.020681619644165, + "learning_rate": 5.420000000000001e-06, + "loss": 1.4241, + "step": 271 + }, + { + "epoch": 0.04440635076119342, + "grad_norm": 1.750666856765747, + "learning_rate": 5.4400000000000004e-06, + "loss": 1.1745, + "step": 272 + }, + { + "epoch": 0.04456960940369781, + "grad_norm": 1.7200289964675903, + "learning_rate": 5.460000000000001e-06, + "loss": 1.2417, + "step": 273 + }, + { + "epoch": 0.0447328680462022, + "grad_norm": 1.8889890909194946, + "learning_rate": 5.480000000000001e-06, + "loss": 1.5173, + "step": 274 + }, + { + "epoch": 0.044896126688706586, + "grad_norm": 1.6877647638320923, + "learning_rate": 5.500000000000001e-06, + "loss": 1.2195, + "step": 275 + }, + { + "epoch": 0.04505938533121097, + "grad_norm": 1.7130943536758423, + "learning_rate": 5.5200000000000005e-06, + "loss": 1.306, + "step": 276 + }, + { + "epoch": 0.04522264397371536, + "grad_norm": 2.269832134246826, + "learning_rate": 5.540000000000001e-06, + "loss": 1.3022, + "step": 277 + }, + { + "epoch": 0.04538590261621975, + "grad_norm": 2.1610982418060303, + "learning_rate": 5.560000000000001e-06, + "loss": 1.3729, + "step": 278 + }, + { + "epoch": 0.04554916125872414, + "grad_norm": 1.6647307872772217, + "learning_rate": 5.580000000000001e-06, + "loss": 1.2433, + "step": 279 + }, + { + "epoch": 0.04571241990122852, + "grad_norm": 1.9671560525894165, + "learning_rate": 5.600000000000001e-06, + "loss": 1.1031, + "step": 280 + }, + { + "epoch": 0.04587567854373291, + "grad_norm": 1.8172413110733032, + "learning_rate": 5.620000000000001e-06, + "loss": 1.0862, + "step": 281 + }, + { + "epoch": 0.0460389371862373, + "grad_norm": 2.103264331817627, + "learning_rate": 5.64e-06, + "loss": 1.2269, + "step": 282 + }, + { + "epoch": 0.04620219582874168, + "grad_norm": 1.8390167951583862, + "learning_rate": 5.66e-06, + "loss": 1.3608, + "step": 283 + }, + { + "epoch": 0.04636545447124607, + "grad_norm": 1.8240208625793457, + "learning_rate": 5.68e-06, + "loss": 1.3546, + "step": 284 + }, + { + "epoch": 0.04652871311375046, + "grad_norm": 2.0313069820404053, + "learning_rate": 5.7e-06, + "loss": 1.2245, + "step": 285 + }, + { + "epoch": 0.04669197175625485, + "grad_norm": 2.0402538776397705, + "learning_rate": 5.72e-06, + "loss": 1.2089, + "step": 286 + }, + { + "epoch": 0.04685523039875923, + "grad_norm": 2.128643274307251, + "learning_rate": 5.74e-06, + "loss": 1.4644, + "step": 287 + }, + { + "epoch": 0.04701848904126362, + "grad_norm": 2.0624396800994873, + "learning_rate": 5.76e-06, + "loss": 1.2678, + "step": 288 + }, + { + "epoch": 0.04718174768376801, + "grad_norm": 1.8395850658416748, + "learning_rate": 5.78e-06, + "loss": 1.2465, + "step": 289 + }, + { + "epoch": 0.0473450063262724, + "grad_norm": 2.0606637001037598, + "learning_rate": 5.8e-06, + "loss": 1.377, + "step": 290 + }, + { + "epoch": 0.04750826496877678, + "grad_norm": 1.9317939281463623, + "learning_rate": 5.82e-06, + "loss": 1.2789, + "step": 291 + }, + { + "epoch": 0.04767152361128117, + "grad_norm": 1.8050525188446045, + "learning_rate": 5.84e-06, + "loss": 1.3675, + "step": 292 + }, + { + "epoch": 0.04783478225378556, + "grad_norm": 1.8397572040557861, + "learning_rate": 5.86e-06, + "loss": 1.2639, + "step": 293 + }, + { + "epoch": 0.04799804089628995, + "grad_norm": 1.796512484550476, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.2712, + "step": 294 + }, + { + "epoch": 0.04816129953879433, + "grad_norm": 1.7678298950195312, + "learning_rate": 5.9e-06, + "loss": 1.4187, + "step": 295 + }, + { + "epoch": 0.04832455818129872, + "grad_norm": 2.002376079559326, + "learning_rate": 5.92e-06, + "loss": 1.3233, + "step": 296 + }, + { + "epoch": 0.04848781682380311, + "grad_norm": 2.010481595993042, + "learning_rate": 5.94e-06, + "loss": 1.3472, + "step": 297 + }, + { + "epoch": 0.048651075466307495, + "grad_norm": 1.9292447566986084, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.1185, + "step": 298 + }, + { + "epoch": 0.048814334108811884, + "grad_norm": 1.8887194395065308, + "learning_rate": 5.98e-06, + "loss": 1.2652, + "step": 299 + }, + { + "epoch": 0.04897759275131627, + "grad_norm": 2.141409397125244, + "learning_rate": 6e-06, + "loss": 1.271, + "step": 300 + }, + { + "epoch": 0.04914085139382066, + "grad_norm": 1.881855845451355, + "learning_rate": 6.02e-06, + "loss": 1.2796, + "step": 301 + }, + { + "epoch": 0.049304110036325045, + "grad_norm": 2.150317430496216, + "learning_rate": 6.040000000000001e-06, + "loss": 1.0951, + "step": 302 + }, + { + "epoch": 0.049467368678829435, + "grad_norm": 1.9875880479812622, + "learning_rate": 6.0600000000000004e-06, + "loss": 1.3246, + "step": 303 + }, + { + "epoch": 0.049630627321333824, + "grad_norm": 2.255772590637207, + "learning_rate": 6.08e-06, + "loss": 1.4989, + "step": 304 + }, + { + "epoch": 0.049793885963838214, + "grad_norm": 2.045013904571533, + "learning_rate": 6.1e-06, + "loss": 1.2866, + "step": 305 + }, + { + "epoch": 0.049957144606342596, + "grad_norm": 2.1231534481048584, + "learning_rate": 6.120000000000001e-06, + "loss": 1.5194, + "step": 306 + }, + { + "epoch": 0.050120403248846986, + "grad_norm": 2.1739463806152344, + "learning_rate": 6.1400000000000005e-06, + "loss": 1.2079, + "step": 307 + }, + { + "epoch": 0.050283661891351375, + "grad_norm": 2.1337294578552246, + "learning_rate": 6.16e-06, + "loss": 1.3547, + "step": 308 + }, + { + "epoch": 0.050446920533855764, + "grad_norm": 1.978772759437561, + "learning_rate": 6.18e-06, + "loss": 1.2434, + "step": 309 + }, + { + "epoch": 0.05061017917636015, + "grad_norm": 2.1697778701782227, + "learning_rate": 6.200000000000001e-06, + "loss": 1.5144, + "step": 310 + }, + { + "epoch": 0.050773437818864536, + "grad_norm": 2.114093780517578, + "learning_rate": 6.220000000000001e-06, + "loss": 1.482, + "step": 311 + }, + { + "epoch": 0.050936696461368926, + "grad_norm": 1.9082748889923096, + "learning_rate": 6.24e-06, + "loss": 1.3339, + "step": 312 + }, + { + "epoch": 0.05109995510387331, + "grad_norm": 1.949682593345642, + "learning_rate": 6.26e-06, + "loss": 1.145, + "step": 313 + }, + { + "epoch": 0.0512632137463777, + "grad_norm": 2.2155940532684326, + "learning_rate": 6.280000000000001e-06, + "loss": 1.2347, + "step": 314 + }, + { + "epoch": 0.05142647238888209, + "grad_norm": 2.166496992111206, + "learning_rate": 6.300000000000001e-06, + "loss": 1.503, + "step": 315 + }, + { + "epoch": 0.05158973103138648, + "grad_norm": 1.980858564376831, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.1723, + "step": 316 + }, + { + "epoch": 0.05175298967389086, + "grad_norm": 2.2432358264923096, + "learning_rate": 6.34e-06, + "loss": 1.2782, + "step": 317 + }, + { + "epoch": 0.05191624831639525, + "grad_norm": 2.169797658920288, + "learning_rate": 6.360000000000001e-06, + "loss": 1.3054, + "step": 318 + }, + { + "epoch": 0.05207950695889964, + "grad_norm": 1.794983983039856, + "learning_rate": 6.380000000000001e-06, + "loss": 1.3602, + "step": 319 + }, + { + "epoch": 0.05224276560140403, + "grad_norm": 1.9446194171905518, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.2564, + "step": 320 + }, + { + "epoch": 0.05240602424390841, + "grad_norm": 1.8768842220306396, + "learning_rate": 6.42e-06, + "loss": 1.2697, + "step": 321 + }, + { + "epoch": 0.0525692828864128, + "grad_norm": 2.0316975116729736, + "learning_rate": 6.440000000000001e-06, + "loss": 1.3533, + "step": 322 + }, + { + "epoch": 0.05273254152891719, + "grad_norm": 2.2916362285614014, + "learning_rate": 6.460000000000001e-06, + "loss": 1.0379, + "step": 323 + }, + { + "epoch": 0.05289580017142158, + "grad_norm": 2.2609684467315674, + "learning_rate": 6.480000000000001e-06, + "loss": 1.3794, + "step": 324 + }, + { + "epoch": 0.05305905881392596, + "grad_norm": 1.8895182609558105, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2537, + "step": 325 + }, + { + "epoch": 0.05322231745643035, + "grad_norm": 2.064990520477295, + "learning_rate": 6.520000000000001e-06, + "loss": 1.4369, + "step": 326 + }, + { + "epoch": 0.05338557609893474, + "grad_norm": 2.1281538009643555, + "learning_rate": 6.540000000000001e-06, + "loss": 1.4265, + "step": 327 + }, + { + "epoch": 0.05354883474143912, + "grad_norm": 1.9302514791488647, + "learning_rate": 6.560000000000001e-06, + "loss": 1.1931, + "step": 328 + }, + { + "epoch": 0.05371209338394351, + "grad_norm": 2.119605541229248, + "learning_rate": 6.5800000000000005e-06, + "loss": 1.1733, + "step": 329 + }, + { + "epoch": 0.0538753520264479, + "grad_norm": 1.921176791191101, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2275, + "step": 330 + }, + { + "epoch": 0.05403861066895229, + "grad_norm": 1.9931433200836182, + "learning_rate": 6.620000000000001e-06, + "loss": 1.3588, + "step": 331 + }, + { + "epoch": 0.05420186931145667, + "grad_norm": 2.1673290729522705, + "learning_rate": 6.640000000000001e-06, + "loss": 1.1609, + "step": 332 + }, + { + "epoch": 0.05436512795396106, + "grad_norm": 1.8807066679000854, + "learning_rate": 6.660000000000001e-06, + "loss": 1.4022, + "step": 333 + }, + { + "epoch": 0.05452838659646545, + "grad_norm": 2.072364568710327, + "learning_rate": 6.680000000000001e-06, + "loss": 1.2992, + "step": 334 + }, + { + "epoch": 0.05469164523896984, + "grad_norm": 2.0293211936950684, + "learning_rate": 6.700000000000001e-06, + "loss": 1.3371, + "step": 335 + }, + { + "epoch": 0.054854903881474223, + "grad_norm": 1.8143714666366577, + "learning_rate": 6.720000000000001e-06, + "loss": 1.2919, + "step": 336 + }, + { + "epoch": 0.05501816252397861, + "grad_norm": 1.8407166004180908, + "learning_rate": 6.740000000000001e-06, + "loss": 1.2765, + "step": 337 + }, + { + "epoch": 0.055181421166483, + "grad_norm": 2.0948259830474854, + "learning_rate": 6.760000000000001e-06, + "loss": 1.2772, + "step": 338 + }, + { + "epoch": 0.055344679808987385, + "grad_norm": 1.8377389907836914, + "learning_rate": 6.780000000000001e-06, + "loss": 1.1669, + "step": 339 + }, + { + "epoch": 0.055507938451491774, + "grad_norm": 1.992203950881958, + "learning_rate": 6.800000000000001e-06, + "loss": 1.2527, + "step": 340 + }, + { + "epoch": 0.055671197093996164, + "grad_norm": 1.9831184148788452, + "learning_rate": 6.820000000000001e-06, + "loss": 1.0334, + "step": 341 + }, + { + "epoch": 0.05583445573650055, + "grad_norm": 1.8389374017715454, + "learning_rate": 6.8400000000000014e-06, + "loss": 1.2587, + "step": 342 + }, + { + "epoch": 0.055997714379004936, + "grad_norm": 2.089749574661255, + "learning_rate": 6.860000000000001e-06, + "loss": 1.4855, + "step": 343 + }, + { + "epoch": 0.056160973021509325, + "grad_norm": 1.974974274635315, + "learning_rate": 6.88e-06, + "loss": 1.3715, + "step": 344 + }, + { + "epoch": 0.056324231664013714, + "grad_norm": 1.825980544090271, + "learning_rate": 6.9e-06, + "loss": 1.2328, + "step": 345 + }, + { + "epoch": 0.056487490306518104, + "grad_norm": 1.8808982372283936, + "learning_rate": 6.92e-06, + "loss": 1.3186, + "step": 346 + }, + { + "epoch": 0.056650748949022486, + "grad_norm": 2.0029497146606445, + "learning_rate": 6.9400000000000005e-06, + "loss": 1.3116, + "step": 347 + }, + { + "epoch": 0.056814007591526876, + "grad_norm": 2.1018147468566895, + "learning_rate": 6.96e-06, + "loss": 1.1871, + "step": 348 + }, + { + "epoch": 0.056977266234031265, + "grad_norm": 1.9819332361221313, + "learning_rate": 6.98e-06, + "loss": 1.0122, + "step": 349 + }, + { + "epoch": 0.057140524876535655, + "grad_norm": 2.356565475463867, + "learning_rate": 7e-06, + "loss": 1.2615, + "step": 350 + }, + { + "epoch": 0.05730378351904004, + "grad_norm": 1.7581766843795776, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.1528, + "step": 351 + }, + { + "epoch": 0.05746704216154443, + "grad_norm": 1.815470576286316, + "learning_rate": 7.04e-06, + "loss": 1.1911, + "step": 352 + }, + { + "epoch": 0.057630300804048816, + "grad_norm": 2.0752034187316895, + "learning_rate": 7.06e-06, + "loss": 1.1539, + "step": 353 + }, + { + "epoch": 0.0577935594465532, + "grad_norm": 1.9206302165985107, + "learning_rate": 7.08e-06, + "loss": 1.2542, + "step": 354 + }, + { + "epoch": 0.05795681808905759, + "grad_norm": 1.9105370044708252, + "learning_rate": 7.100000000000001e-06, + "loss": 1.1524, + "step": 355 + }, + { + "epoch": 0.05812007673156198, + "grad_norm": 2.051164388656616, + "learning_rate": 7.1200000000000004e-06, + "loss": 1.4537, + "step": 356 + }, + { + "epoch": 0.05828333537406637, + "grad_norm": 1.989436388015747, + "learning_rate": 7.14e-06, + "loss": 1.114, + "step": 357 + }, + { + "epoch": 0.05844659401657075, + "grad_norm": 1.846563696861267, + "learning_rate": 7.16e-06, + "loss": 1.1085, + "step": 358 + }, + { + "epoch": 0.05860985265907514, + "grad_norm": 2.179766893386841, + "learning_rate": 7.180000000000001e-06, + "loss": 1.1198, + "step": 359 + }, + { + "epoch": 0.05877311130157953, + "grad_norm": 1.6388657093048096, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.0753, + "step": 360 + }, + { + "epoch": 0.05893636994408392, + "grad_norm": 1.9925389289855957, + "learning_rate": 7.22e-06, + "loss": 1.3266, + "step": 361 + }, + { + "epoch": 0.0590996285865883, + "grad_norm": 1.8099873065948486, + "learning_rate": 7.24e-06, + "loss": 1.2354, + "step": 362 + }, + { + "epoch": 0.05926288722909269, + "grad_norm": 2.0619328022003174, + "learning_rate": 7.260000000000001e-06, + "loss": 1.3734, + "step": 363 + }, + { + "epoch": 0.05942614587159708, + "grad_norm": 1.8196241855621338, + "learning_rate": 7.280000000000001e-06, + "loss": 1.1274, + "step": 364 + }, + { + "epoch": 0.05958940451410147, + "grad_norm": 1.8758442401885986, + "learning_rate": 7.3e-06, + "loss": 1.1189, + "step": 365 + }, + { + "epoch": 0.05975266315660585, + "grad_norm": 1.8072453737258911, + "learning_rate": 7.32e-06, + "loss": 1.2812, + "step": 366 + }, + { + "epoch": 0.05991592179911024, + "grad_norm": 1.901843786239624, + "learning_rate": 7.340000000000001e-06, + "loss": 1.2839, + "step": 367 + }, + { + "epoch": 0.06007918044161463, + "grad_norm": 2.108508825302124, + "learning_rate": 7.360000000000001e-06, + "loss": 1.2649, + "step": 368 + }, + { + "epoch": 0.06024243908411901, + "grad_norm": 2.0627455711364746, + "learning_rate": 7.3800000000000005e-06, + "loss": 1.3192, + "step": 369 + }, + { + "epoch": 0.0604056977266234, + "grad_norm": 2.1426141262054443, + "learning_rate": 7.4e-06, + "loss": 1.3641, + "step": 370 + }, + { + "epoch": 0.06056895636912779, + "grad_norm": 1.8952516317367554, + "learning_rate": 7.420000000000001e-06, + "loss": 1.1985, + "step": 371 + }, + { + "epoch": 0.06073221501163218, + "grad_norm": 1.6583818197250366, + "learning_rate": 7.440000000000001e-06, + "loss": 1.0724, + "step": 372 + }, + { + "epoch": 0.06089547365413656, + "grad_norm": 2.5599281787872314, + "learning_rate": 7.4600000000000006e-06, + "loss": 1.2023, + "step": 373 + }, + { + "epoch": 0.06105873229664095, + "grad_norm": 2.0398690700531006, + "learning_rate": 7.48e-06, + "loss": 1.3233, + "step": 374 + }, + { + "epoch": 0.06122199093914534, + "grad_norm": 2.041804313659668, + "learning_rate": 7.500000000000001e-06, + "loss": 1.2496, + "step": 375 + }, + { + "epoch": 0.06138524958164973, + "grad_norm": 1.9788709878921509, + "learning_rate": 7.520000000000001e-06, + "loss": 1.4004, + "step": 376 + }, + { + "epoch": 0.061548508224154114, + "grad_norm": 2.121373414993286, + "learning_rate": 7.540000000000001e-06, + "loss": 1.2866, + "step": 377 + }, + { + "epoch": 0.0617117668666585, + "grad_norm": 2.0985896587371826, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.2359, + "step": 378 + }, + { + "epoch": 0.06187502550916289, + "grad_norm": 2.0373647212982178, + "learning_rate": 7.58e-06, + "loss": 1.1656, + "step": 379 + }, + { + "epoch": 0.06203828415166728, + "grad_norm": 1.9046670198440552, + "learning_rate": 7.600000000000001e-06, + "loss": 1.246, + "step": 380 + }, + { + "epoch": 0.062201542794171664, + "grad_norm": 1.9420348405838013, + "learning_rate": 7.620000000000001e-06, + "loss": 1.1311, + "step": 381 + }, + { + "epoch": 0.062364801436676054, + "grad_norm": 2.326917886734009, + "learning_rate": 7.640000000000001e-06, + "loss": 1.2461, + "step": 382 + }, + { + "epoch": 0.06252806007918044, + "grad_norm": 1.8557238578796387, + "learning_rate": 7.660000000000001e-06, + "loss": 1.0143, + "step": 383 + }, + { + "epoch": 0.06269131872168483, + "grad_norm": 1.9980357885360718, + "learning_rate": 7.680000000000001e-06, + "loss": 1.1776, + "step": 384 + }, + { + "epoch": 0.06285457736418922, + "grad_norm": 1.8375717401504517, + "learning_rate": 7.7e-06, + "loss": 1.2098, + "step": 385 + }, + { + "epoch": 0.0630178360066936, + "grad_norm": 2.0467495918273926, + "learning_rate": 7.72e-06, + "loss": 1.3015, + "step": 386 + }, + { + "epoch": 0.063181094649198, + "grad_norm": 1.9435759782791138, + "learning_rate": 7.74e-06, + "loss": 1.2486, + "step": 387 + }, + { + "epoch": 0.06334435329170238, + "grad_norm": 1.9634602069854736, + "learning_rate": 7.76e-06, + "loss": 1.1066, + "step": 388 + }, + { + "epoch": 0.06350761193420677, + "grad_norm": 1.8496242761611938, + "learning_rate": 7.78e-06, + "loss": 1.2167, + "step": 389 + }, + { + "epoch": 0.06367087057671116, + "grad_norm": 2.064605951309204, + "learning_rate": 7.800000000000002e-06, + "loss": 1.1604, + "step": 390 + }, + { + "epoch": 0.06383412921921554, + "grad_norm": 1.8987208604812622, + "learning_rate": 7.820000000000001e-06, + "loss": 1.3364, + "step": 391 + }, + { + "epoch": 0.06399738786171993, + "grad_norm": 2.062685012817383, + "learning_rate": 7.840000000000001e-06, + "loss": 1.1012, + "step": 392 + }, + { + "epoch": 0.06416064650422432, + "grad_norm": 2.044191598892212, + "learning_rate": 7.860000000000001e-06, + "loss": 1.2192, + "step": 393 + }, + { + "epoch": 0.0643239051467287, + "grad_norm": 1.8554444313049316, + "learning_rate": 7.88e-06, + "loss": 1.1752, + "step": 394 + }, + { + "epoch": 0.0644871637892331, + "grad_norm": 2.195918321609497, + "learning_rate": 7.9e-06, + "loss": 1.4694, + "step": 395 + }, + { + "epoch": 0.06465042243173748, + "grad_norm": 2.0799505710601807, + "learning_rate": 7.92e-06, + "loss": 1.3756, + "step": 396 + }, + { + "epoch": 0.06481368107424187, + "grad_norm": 1.980363368988037, + "learning_rate": 7.94e-06, + "loss": 1.4176, + "step": 397 + }, + { + "epoch": 0.06497693971674626, + "grad_norm": 2.225811719894409, + "learning_rate": 7.960000000000002e-06, + "loss": 1.1459, + "step": 398 + }, + { + "epoch": 0.06514019835925064, + "grad_norm": 2.4319510459899902, + "learning_rate": 7.980000000000002e-06, + "loss": 1.0911, + "step": 399 + }, + { + "epoch": 0.06530345700175504, + "grad_norm": 2.096545457839966, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3741, + "step": 400 + }, + { + "epoch": 0.06546671564425942, + "grad_norm": 2.0194480419158936, + "learning_rate": 8.020000000000001e-06, + "loss": 1.1103, + "step": 401 + }, + { + "epoch": 0.0656299742867638, + "grad_norm": 2.086824417114258, + "learning_rate": 8.040000000000001e-06, + "loss": 1.1551, + "step": 402 + }, + { + "epoch": 0.0657932329292682, + "grad_norm": 2.0286951065063477, + "learning_rate": 8.06e-06, + "loss": 1.2531, + "step": 403 + }, + { + "epoch": 0.06595649157177258, + "grad_norm": 1.861185073852539, + "learning_rate": 8.08e-06, + "loss": 1.0743, + "step": 404 + }, + { + "epoch": 0.06611975021427696, + "grad_norm": 1.8385728597640991, + "learning_rate": 8.1e-06, + "loss": 1.2443, + "step": 405 + }, + { + "epoch": 0.06628300885678136, + "grad_norm": 2.05220365524292, + "learning_rate": 8.120000000000002e-06, + "loss": 1.1598, + "step": 406 + }, + { + "epoch": 0.06644626749928574, + "grad_norm": 2.015984535217285, + "learning_rate": 8.14e-06, + "loss": 1.1157, + "step": 407 + }, + { + "epoch": 0.06660952614179014, + "grad_norm": 1.9400795698165894, + "learning_rate": 8.16e-06, + "loss": 1.2048, + "step": 408 + }, + { + "epoch": 0.06677278478429452, + "grad_norm": 1.810413122177124, + "learning_rate": 8.18e-06, + "loss": 1.1264, + "step": 409 + }, + { + "epoch": 0.0669360434267989, + "grad_norm": 1.7337501049041748, + "learning_rate": 8.2e-06, + "loss": 1.065, + "step": 410 + }, + { + "epoch": 0.0670993020693033, + "grad_norm": 1.6440634727478027, + "learning_rate": 8.220000000000001e-06, + "loss": 1.1687, + "step": 411 + }, + { + "epoch": 0.06726256071180768, + "grad_norm": 1.9802428483963013, + "learning_rate": 8.24e-06, + "loss": 1.1269, + "step": 412 + }, + { + "epoch": 0.06742581935431206, + "grad_norm": 1.740654468536377, + "learning_rate": 8.26e-06, + "loss": 0.9955, + "step": 413 + }, + { + "epoch": 0.06758907799681646, + "grad_norm": 1.9390037059783936, + "learning_rate": 8.28e-06, + "loss": 1.2966, + "step": 414 + }, + { + "epoch": 0.06775233663932084, + "grad_norm": 1.8586387634277344, + "learning_rate": 8.3e-06, + "loss": 1.0892, + "step": 415 + }, + { + "epoch": 0.06791559528182523, + "grad_norm": 1.924579381942749, + "learning_rate": 8.32e-06, + "loss": 1.1705, + "step": 416 + }, + { + "epoch": 0.06807885392432962, + "grad_norm": 2.036813974380493, + "learning_rate": 8.34e-06, + "loss": 1.2578, + "step": 417 + }, + { + "epoch": 0.068242112566834, + "grad_norm": 2.1908116340637207, + "learning_rate": 8.36e-06, + "loss": 1.0804, + "step": 418 + }, + { + "epoch": 0.0684053712093384, + "grad_norm": 2.3113887310028076, + "learning_rate": 8.380000000000001e-06, + "loss": 1.1817, + "step": 419 + }, + { + "epoch": 0.06856862985184278, + "grad_norm": 2.203634738922119, + "learning_rate": 8.400000000000001e-06, + "loss": 1.2, + "step": 420 + }, + { + "epoch": 0.06873188849434717, + "grad_norm": 2.1560781002044678, + "learning_rate": 8.42e-06, + "loss": 1.1064, + "step": 421 + }, + { + "epoch": 0.06889514713685156, + "grad_norm": 1.9895814657211304, + "learning_rate": 8.44e-06, + "loss": 1.2488, + "step": 422 + }, + { + "epoch": 0.06905840577935594, + "grad_norm": 2.2973825931549072, + "learning_rate": 8.46e-06, + "loss": 1.2532, + "step": 423 + }, + { + "epoch": 0.06922166442186033, + "grad_norm": 1.9881922006607056, + "learning_rate": 8.48e-06, + "loss": 1.1107, + "step": 424 + }, + { + "epoch": 0.06938492306436472, + "grad_norm": 1.9441295862197876, + "learning_rate": 8.5e-06, + "loss": 1.2486, + "step": 425 + }, + { + "epoch": 0.0695481817068691, + "grad_norm": 1.9968016147613525, + "learning_rate": 8.52e-06, + "loss": 1.2483, + "step": 426 + }, + { + "epoch": 0.0697114403493735, + "grad_norm": 1.8959996700286865, + "learning_rate": 8.540000000000001e-06, + "loss": 1.0312, + "step": 427 + }, + { + "epoch": 0.06987469899187788, + "grad_norm": 1.7172850370407104, + "learning_rate": 8.560000000000001e-06, + "loss": 0.9756, + "step": 428 + }, + { + "epoch": 0.07003795763438227, + "grad_norm": 3.1167006492614746, + "learning_rate": 8.580000000000001e-06, + "loss": 1.1734, + "step": 429 + }, + { + "epoch": 0.07020121627688666, + "grad_norm": 2.0712010860443115, + "learning_rate": 8.6e-06, + "loss": 1.2294, + "step": 430 + }, + { + "epoch": 0.07036447491939105, + "grad_norm": 1.8392446041107178, + "learning_rate": 8.62e-06, + "loss": 1.2863, + "step": 431 + }, + { + "epoch": 0.07052773356189543, + "grad_norm": 1.9474895000457764, + "learning_rate": 8.64e-06, + "loss": 1.247, + "step": 432 + }, + { + "epoch": 0.07069099220439982, + "grad_norm": 2.0882370471954346, + "learning_rate": 8.66e-06, + "loss": 1.2153, + "step": 433 + }, + { + "epoch": 0.07085425084690421, + "grad_norm": 1.9889891147613525, + "learning_rate": 8.68e-06, + "loss": 1.3358, + "step": 434 + }, + { + "epoch": 0.07101750948940859, + "grad_norm": 2.1056065559387207, + "learning_rate": 8.700000000000001e-06, + "loss": 1.3148, + "step": 435 + }, + { + "epoch": 0.07118076813191299, + "grad_norm": 1.9358744621276855, + "learning_rate": 8.720000000000001e-06, + "loss": 1.1772, + "step": 436 + }, + { + "epoch": 0.07134402677441737, + "grad_norm": 1.9889074563980103, + "learning_rate": 8.740000000000001e-06, + "loss": 1.1282, + "step": 437 + }, + { + "epoch": 0.07150728541692176, + "grad_norm": 1.7672903537750244, + "learning_rate": 8.76e-06, + "loss": 0.9575, + "step": 438 + }, + { + "epoch": 0.07167054405942615, + "grad_norm": 2.0771145820617676, + "learning_rate": 8.78e-06, + "loss": 1.2015, + "step": 439 + }, + { + "epoch": 0.07183380270193053, + "grad_norm": 1.8405672311782837, + "learning_rate": 8.8e-06, + "loss": 1.0683, + "step": 440 + }, + { + "epoch": 0.07199706134443493, + "grad_norm": 1.8821085691452026, + "learning_rate": 8.82e-06, + "loss": 1.1916, + "step": 441 + }, + { + "epoch": 0.07216031998693931, + "grad_norm": 2.5297975540161133, + "learning_rate": 8.84e-06, + "loss": 1.2339, + "step": 442 + }, + { + "epoch": 0.07232357862944369, + "grad_norm": 2.138904333114624, + "learning_rate": 8.860000000000002e-06, + "loss": 1.1932, + "step": 443 + }, + { + "epoch": 0.07248683727194809, + "grad_norm": 2.216179132461548, + "learning_rate": 8.880000000000001e-06, + "loss": 1.0888, + "step": 444 + }, + { + "epoch": 0.07265009591445247, + "grad_norm": 1.9238756895065308, + "learning_rate": 8.900000000000001e-06, + "loss": 1.0363, + "step": 445 + }, + { + "epoch": 0.07281335455695685, + "grad_norm": 2.006751298904419, + "learning_rate": 8.920000000000001e-06, + "loss": 1.1825, + "step": 446 + }, + { + "epoch": 0.07297661319946125, + "grad_norm": 2.1362032890319824, + "learning_rate": 8.94e-06, + "loss": 1.285, + "step": 447 + }, + { + "epoch": 0.07313987184196563, + "grad_norm": 2.2246041297912598, + "learning_rate": 8.96e-06, + "loss": 1.1811, + "step": 448 + }, + { + "epoch": 0.07330313048447003, + "grad_norm": 2.350745677947998, + "learning_rate": 8.98e-06, + "loss": 1.1997, + "step": 449 + }, + { + "epoch": 0.07346638912697441, + "grad_norm": 1.8694416284561157, + "learning_rate": 9e-06, + "loss": 1.1203, + "step": 450 + }, + { + "epoch": 0.07362964776947879, + "grad_norm": 2.395637035369873, + "learning_rate": 9.020000000000002e-06, + "loss": 1.2534, + "step": 451 + }, + { + "epoch": 0.07379290641198319, + "grad_norm": 2.384709119796753, + "learning_rate": 9.040000000000002e-06, + "loss": 1.2784, + "step": 452 + }, + { + "epoch": 0.07395616505448757, + "grad_norm": 1.963156819343567, + "learning_rate": 9.060000000000001e-06, + "loss": 1.0283, + "step": 453 + }, + { + "epoch": 0.07411942369699195, + "grad_norm": 2.0077297687530518, + "learning_rate": 9.080000000000001e-06, + "loss": 1.0107, + "step": 454 + }, + { + "epoch": 0.07428268233949635, + "grad_norm": 2.4147799015045166, + "learning_rate": 9.100000000000001e-06, + "loss": 1.1989, + "step": 455 + }, + { + "epoch": 0.07444594098200073, + "grad_norm": 2.231031894683838, + "learning_rate": 9.12e-06, + "loss": 1.1359, + "step": 456 + }, + { + "epoch": 0.07460919962450512, + "grad_norm": 2.117201089859009, + "learning_rate": 9.14e-06, + "loss": 1.1769, + "step": 457 + }, + { + "epoch": 0.07477245826700951, + "grad_norm": 2.1280832290649414, + "learning_rate": 9.16e-06, + "loss": 1.2116, + "step": 458 + }, + { + "epoch": 0.0749357169095139, + "grad_norm": 2.199897289276123, + "learning_rate": 9.180000000000002e-06, + "loss": 1.1493, + "step": 459 + }, + { + "epoch": 0.07509897555201829, + "grad_norm": 2.391735315322876, + "learning_rate": 9.200000000000002e-06, + "loss": 1.2247, + "step": 460 + }, + { + "epoch": 0.07526223419452267, + "grad_norm": 2.252683401107788, + "learning_rate": 9.220000000000002e-06, + "loss": 1.1125, + "step": 461 + }, + { + "epoch": 0.07542549283702706, + "grad_norm": 1.8622978925704956, + "learning_rate": 9.240000000000001e-06, + "loss": 1.2454, + "step": 462 + }, + { + "epoch": 0.07558875147953145, + "grad_norm": 2.054431915283203, + "learning_rate": 9.260000000000001e-06, + "loss": 1.1081, + "step": 463 + }, + { + "epoch": 0.07575201012203583, + "grad_norm": 1.9579373598098755, + "learning_rate": 9.280000000000001e-06, + "loss": 1.1217, + "step": 464 + }, + { + "epoch": 0.07591526876454022, + "grad_norm": 1.9501029253005981, + "learning_rate": 9.3e-06, + "loss": 1.0647, + "step": 465 + }, + { + "epoch": 0.07607852740704461, + "grad_norm": 1.9215545654296875, + "learning_rate": 9.32e-06, + "loss": 1.072, + "step": 466 + }, + { + "epoch": 0.076241786049549, + "grad_norm": 2.225527763366699, + "learning_rate": 9.340000000000002e-06, + "loss": 1.2011, + "step": 467 + }, + { + "epoch": 0.07640504469205339, + "grad_norm": 2.4547829627990723, + "learning_rate": 9.360000000000002e-06, + "loss": 1.2743, + "step": 468 + }, + { + "epoch": 0.07656830333455777, + "grad_norm": 2.123399496078491, + "learning_rate": 9.38e-06, + "loss": 1.1489, + "step": 469 + }, + { + "epoch": 0.07673156197706216, + "grad_norm": 2.327552556991577, + "learning_rate": 9.4e-06, + "loss": 1.1451, + "step": 470 + }, + { + "epoch": 0.07689482061956655, + "grad_norm": 2.096482038497925, + "learning_rate": 9.42e-06, + "loss": 1.1311, + "step": 471 + }, + { + "epoch": 0.07705807926207094, + "grad_norm": 2.1612589359283447, + "learning_rate": 9.440000000000001e-06, + "loss": 1.1717, + "step": 472 + }, + { + "epoch": 0.07722133790457532, + "grad_norm": 2.1531004905700684, + "learning_rate": 9.460000000000001e-06, + "loss": 1.3949, + "step": 473 + }, + { + "epoch": 0.07738459654707971, + "grad_norm": 2.128967761993408, + "learning_rate": 9.48e-06, + "loss": 1.4293, + "step": 474 + }, + { + "epoch": 0.0775478551895841, + "grad_norm": 2.3594679832458496, + "learning_rate": 9.5e-06, + "loss": 1.1379, + "step": 475 + }, + { + "epoch": 0.07771111383208848, + "grad_norm": 2.103909969329834, + "learning_rate": 9.52e-06, + "loss": 1.109, + "step": 476 + }, + { + "epoch": 0.07787437247459288, + "grad_norm": 2.041992425918579, + "learning_rate": 9.54e-06, + "loss": 1.1604, + "step": 477 + }, + { + "epoch": 0.07803763111709726, + "grad_norm": 2.0942649841308594, + "learning_rate": 9.56e-06, + "loss": 1.2305, + "step": 478 + }, + { + "epoch": 0.07820088975960166, + "grad_norm": 2.137141704559326, + "learning_rate": 9.58e-06, + "loss": 1.1487, + "step": 479 + }, + { + "epoch": 0.07836414840210604, + "grad_norm": 2.5260872840881348, + "learning_rate": 9.600000000000001e-06, + "loss": 1.0727, + "step": 480 + }, + { + "epoch": 0.07852740704461042, + "grad_norm": 1.8872711658477783, + "learning_rate": 9.620000000000001e-06, + "loss": 1.2242, + "step": 481 + }, + { + "epoch": 0.07869066568711482, + "grad_norm": 2.1657471656799316, + "learning_rate": 9.640000000000001e-06, + "loss": 1.1084, + "step": 482 + }, + { + "epoch": 0.0788539243296192, + "grad_norm": 2.072225332260132, + "learning_rate": 9.66e-06, + "loss": 1.1961, + "step": 483 + }, + { + "epoch": 0.07901718297212358, + "grad_norm": 2.4796104431152344, + "learning_rate": 9.68e-06, + "loss": 1.2055, + "step": 484 + }, + { + "epoch": 0.07918044161462798, + "grad_norm": 2.453747510910034, + "learning_rate": 9.7e-06, + "loss": 1.1408, + "step": 485 + }, + { + "epoch": 0.07934370025713236, + "grad_norm": 2.501016139984131, + "learning_rate": 9.72e-06, + "loss": 1.116, + "step": 486 + }, + { + "epoch": 0.07950695889963674, + "grad_norm": 2.1103854179382324, + "learning_rate": 9.74e-06, + "loss": 1.1007, + "step": 487 + }, + { + "epoch": 0.07967021754214114, + "grad_norm": 1.7755740880966187, + "learning_rate": 9.760000000000001e-06, + "loss": 1.0018, + "step": 488 + }, + { + "epoch": 0.07983347618464552, + "grad_norm": 2.236894369125366, + "learning_rate": 9.780000000000001e-06, + "loss": 1.2326, + "step": 489 + }, + { + "epoch": 0.07999673482714992, + "grad_norm": 1.8976962566375732, + "learning_rate": 9.800000000000001e-06, + "loss": 1.1171, + "step": 490 + }, + { + "epoch": 0.0801599934696543, + "grad_norm": 1.9749926328659058, + "learning_rate": 9.820000000000001e-06, + "loss": 1.0314, + "step": 491 + }, + { + "epoch": 0.08032325211215868, + "grad_norm": 2.452061176300049, + "learning_rate": 9.84e-06, + "loss": 1.3033, + "step": 492 + }, + { + "epoch": 0.08048651075466308, + "grad_norm": 1.8078688383102417, + "learning_rate": 9.86e-06, + "loss": 1.0124, + "step": 493 + }, + { + "epoch": 0.08064976939716746, + "grad_norm": 2.379065990447998, + "learning_rate": 9.88e-06, + "loss": 0.9952, + "step": 494 + }, + { + "epoch": 0.08081302803967184, + "grad_norm": 2.2535061836242676, + "learning_rate": 9.9e-06, + "loss": 0.9827, + "step": 495 + }, + { + "epoch": 0.08097628668217624, + "grad_norm": 1.9587634801864624, + "learning_rate": 9.920000000000002e-06, + "loss": 1.0987, + "step": 496 + }, + { + "epoch": 0.08113954532468062, + "grad_norm": 2.154000997543335, + "learning_rate": 9.940000000000001e-06, + "loss": 1.1826, + "step": 497 + }, + { + "epoch": 0.08130280396718502, + "grad_norm": 2.02744197845459, + "learning_rate": 9.960000000000001e-06, + "loss": 1.26, + "step": 498 + }, + { + "epoch": 0.0814660626096894, + "grad_norm": 2.0693888664245605, + "learning_rate": 9.980000000000001e-06, + "loss": 1.143, + "step": 499 + }, + { + "epoch": 0.08162932125219378, + "grad_norm": 2.0301690101623535, + "learning_rate": 1e-05, + "loss": 1.2021, + "step": 500 + }, + { + "epoch": 0.08179257989469818, + "grad_norm": 2.30107045173645, + "learning_rate": 1.002e-05, + "loss": 1.2953, + "step": 501 + }, + { + "epoch": 0.08195583853720256, + "grad_norm": 1.9231711626052856, + "learning_rate": 1.004e-05, + "loss": 1.1801, + "step": 502 + }, + { + "epoch": 0.08211909717970695, + "grad_norm": 2.354249954223633, + "learning_rate": 1.006e-05, + "loss": 1.117, + "step": 503 + }, + { + "epoch": 0.08228235582221134, + "grad_norm": 2.048701524734497, + "learning_rate": 1.008e-05, + "loss": 1.2776, + "step": 504 + }, + { + "epoch": 0.08244561446471572, + "grad_norm": 1.9473559856414795, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.0401, + "step": 505 + }, + { + "epoch": 0.08260887310722011, + "grad_norm": 1.8621548414230347, + "learning_rate": 1.0120000000000001e-05, + "loss": 0.9931, + "step": 506 + }, + { + "epoch": 0.0827721317497245, + "grad_norm": 2.0183658599853516, + "learning_rate": 1.0140000000000001e-05, + "loss": 1.0373, + "step": 507 + }, + { + "epoch": 0.08293539039222889, + "grad_norm": 2.3650753498077393, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.2169, + "step": 508 + }, + { + "epoch": 0.08309864903473328, + "grad_norm": 2.2234432697296143, + "learning_rate": 1.018e-05, + "loss": 1.107, + "step": 509 + }, + { + "epoch": 0.08326190767723766, + "grad_norm": 2.1484463214874268, + "learning_rate": 1.02e-05, + "loss": 1.0828, + "step": 510 + }, + { + "epoch": 0.08342516631974205, + "grad_norm": 2.2211270332336426, + "learning_rate": 1.022e-05, + "loss": 1.1425, + "step": 511 + }, + { + "epoch": 0.08358842496224644, + "grad_norm": 2.0632476806640625, + "learning_rate": 1.024e-05, + "loss": 1.0173, + "step": 512 + }, + { + "epoch": 0.08375168360475083, + "grad_norm": 2.106527328491211, + "learning_rate": 1.0260000000000002e-05, + "loss": 1.1155, + "step": 513 + }, + { + "epoch": 0.08391494224725521, + "grad_norm": 2.3124008178710938, + "learning_rate": 1.0280000000000002e-05, + "loss": 1.343, + "step": 514 + }, + { + "epoch": 0.0840782008897596, + "grad_norm": 2.581451416015625, + "learning_rate": 1.0300000000000001e-05, + "loss": 1.3468, + "step": 515 + }, + { + "epoch": 0.08424145953226399, + "grad_norm": 2.043722629547119, + "learning_rate": 1.0320000000000001e-05, + "loss": 1.0537, + "step": 516 + }, + { + "epoch": 0.08440471817476837, + "grad_norm": 2.065143585205078, + "learning_rate": 1.0340000000000001e-05, + "loss": 1.251, + "step": 517 + }, + { + "epoch": 0.08456797681727277, + "grad_norm": 2.0450820922851562, + "learning_rate": 1.036e-05, + "loss": 1.2028, + "step": 518 + }, + { + "epoch": 0.08473123545977715, + "grad_norm": 2.0890326499938965, + "learning_rate": 1.038e-05, + "loss": 1.1759, + "step": 519 + }, + { + "epoch": 0.08489449410228155, + "grad_norm": 2.080613613128662, + "learning_rate": 1.04e-05, + "loss": 1.0664, + "step": 520 + }, + { + "epoch": 0.08505775274478593, + "grad_norm": 2.1385669708251953, + "learning_rate": 1.0420000000000002e-05, + "loss": 1.164, + "step": 521 + }, + { + "epoch": 0.08522101138729031, + "grad_norm": 2.360839605331421, + "learning_rate": 1.0440000000000002e-05, + "loss": 1.444, + "step": 522 + }, + { + "epoch": 0.0853842700297947, + "grad_norm": 2.0543017387390137, + "learning_rate": 1.0460000000000001e-05, + "loss": 1.2037, + "step": 523 + }, + { + "epoch": 0.08554752867229909, + "grad_norm": 2.4931371212005615, + "learning_rate": 1.0480000000000001e-05, + "loss": 1.1363, + "step": 524 + }, + { + "epoch": 0.08571078731480347, + "grad_norm": 2.6861774921417236, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.3656, + "step": 525 + }, + { + "epoch": 0.08587404595730787, + "grad_norm": 1.9597691297531128, + "learning_rate": 1.0520000000000001e-05, + "loss": 1.0924, + "step": 526 + }, + { + "epoch": 0.08603730459981225, + "grad_norm": 2.2866878509521484, + "learning_rate": 1.054e-05, + "loss": 1.071, + "step": 527 + }, + { + "epoch": 0.08620056324231665, + "grad_norm": 2.089154005050659, + "learning_rate": 1.056e-05, + "loss": 0.9921, + "step": 528 + }, + { + "epoch": 0.08636382188482103, + "grad_norm": 2.1897799968719482, + "learning_rate": 1.0580000000000002e-05, + "loss": 1.1252, + "step": 529 + }, + { + "epoch": 0.08652708052732541, + "grad_norm": 2.064509630203247, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.1517, + "step": 530 + }, + { + "epoch": 0.08669033916982981, + "grad_norm": 2.5933384895324707, + "learning_rate": 1.0620000000000002e-05, + "loss": 1.2152, + "step": 531 + }, + { + "epoch": 0.08685359781233419, + "grad_norm": 2.1191277503967285, + "learning_rate": 1.0640000000000001e-05, + "loss": 1.179, + "step": 532 + }, + { + "epoch": 0.08701685645483857, + "grad_norm": 2.110588312149048, + "learning_rate": 1.0660000000000001e-05, + "loss": 1.1441, + "step": 533 + }, + { + "epoch": 0.08718011509734297, + "grad_norm": 2.329209566116333, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.9713, + "step": 534 + }, + { + "epoch": 0.08734337373984735, + "grad_norm": 2.2991254329681396, + "learning_rate": 1.0700000000000001e-05, + "loss": 1.2401, + "step": 535 + }, + { + "epoch": 0.08750663238235173, + "grad_norm": 2.3157129287719727, + "learning_rate": 1.072e-05, + "loss": 1.2062, + "step": 536 + }, + { + "epoch": 0.08766989102485613, + "grad_norm": 2.2499959468841553, + "learning_rate": 1.0740000000000002e-05, + "loss": 1.1536, + "step": 537 + }, + { + "epoch": 0.08783314966736051, + "grad_norm": 2.0809717178344727, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.1889, + "step": 538 + }, + { + "epoch": 0.08799640830986491, + "grad_norm": 1.8503974676132202, + "learning_rate": 1.0780000000000002e-05, + "loss": 1.1144, + "step": 539 + }, + { + "epoch": 0.08815966695236929, + "grad_norm": 2.1475627422332764, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.351, + "step": 540 + }, + { + "epoch": 0.08832292559487367, + "grad_norm": 2.1597707271575928, + "learning_rate": 1.0820000000000001e-05, + "loss": 1.2216, + "step": 541 + }, + { + "epoch": 0.08848618423737807, + "grad_norm": 2.3894410133361816, + "learning_rate": 1.0840000000000001e-05, + "loss": 1.0663, + "step": 542 + }, + { + "epoch": 0.08864944287988245, + "grad_norm": 2.2700750827789307, + "learning_rate": 1.0860000000000001e-05, + "loss": 1.1551, + "step": 543 + }, + { + "epoch": 0.08881270152238684, + "grad_norm": 2.050352096557617, + "learning_rate": 1.0880000000000001e-05, + "loss": 1.2078, + "step": 544 + }, + { + "epoch": 0.08897596016489123, + "grad_norm": 2.765122652053833, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.2681, + "step": 545 + }, + { + "epoch": 0.08913921880739561, + "grad_norm": 1.9609044790267944, + "learning_rate": 1.0920000000000002e-05, + "loss": 1.0512, + "step": 546 + }, + { + "epoch": 0.0893024774499, + "grad_norm": 2.1948235034942627, + "learning_rate": 1.0940000000000002e-05, + "loss": 1.0793, + "step": 547 + }, + { + "epoch": 0.0894657360924044, + "grad_norm": 2.0248279571533203, + "learning_rate": 1.0960000000000002e-05, + "loss": 1.2348, + "step": 548 + }, + { + "epoch": 0.08962899473490878, + "grad_norm": 1.8916188478469849, + "learning_rate": 1.0980000000000002e-05, + "loss": 1.0623, + "step": 549 + }, + { + "epoch": 0.08979225337741317, + "grad_norm": 2.194124221801758, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.25, + "step": 550 + }, + { + "epoch": 0.08995551201991756, + "grad_norm": 2.102618455886841, + "learning_rate": 1.1020000000000001e-05, + "loss": 1.2631, + "step": 551 + }, + { + "epoch": 0.09011877066242194, + "grad_norm": 2.1111202239990234, + "learning_rate": 1.1040000000000001e-05, + "loss": 1.0794, + "step": 552 + }, + { + "epoch": 0.09028202930492633, + "grad_norm": 2.1188979148864746, + "learning_rate": 1.1060000000000003e-05, + "loss": 1.2089, + "step": 553 + }, + { + "epoch": 0.09044528794743072, + "grad_norm": 2.3771398067474365, + "learning_rate": 1.1080000000000002e-05, + "loss": 1.0782, + "step": 554 + }, + { + "epoch": 0.0906085465899351, + "grad_norm": 2.024806261062622, + "learning_rate": 1.1100000000000002e-05, + "loss": 1.1943, + "step": 555 + }, + { + "epoch": 0.0907718052324395, + "grad_norm": 2.2835841178894043, + "learning_rate": 1.1120000000000002e-05, + "loss": 1.1081, + "step": 556 + }, + { + "epoch": 0.09093506387494388, + "grad_norm": 2.400186061859131, + "learning_rate": 1.1140000000000002e-05, + "loss": 1.1067, + "step": 557 + }, + { + "epoch": 0.09109832251744827, + "grad_norm": 2.1547060012817383, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.2332, + "step": 558 + }, + { + "epoch": 0.09126158115995266, + "grad_norm": 2.4429080486297607, + "learning_rate": 1.1180000000000001e-05, + "loss": 1.1745, + "step": 559 + }, + { + "epoch": 0.09142483980245704, + "grad_norm": 2.3900227546691895, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.1192, + "step": 560 + }, + { + "epoch": 0.09158809844496144, + "grad_norm": 2.234807014465332, + "learning_rate": 1.1220000000000003e-05, + "loss": 0.9934, + "step": 561 + }, + { + "epoch": 0.09175135708746582, + "grad_norm": 2.44914174079895, + "learning_rate": 1.1240000000000002e-05, + "loss": 1.2582, + "step": 562 + }, + { + "epoch": 0.0919146157299702, + "grad_norm": 2.4764840602874756, + "learning_rate": 1.126e-05, + "loss": 1.3293, + "step": 563 + }, + { + "epoch": 0.0920778743724746, + "grad_norm": 1.8345723152160645, + "learning_rate": 1.128e-05, + "loss": 0.9171, + "step": 564 + }, + { + "epoch": 0.09224113301497898, + "grad_norm": 2.7708799839019775, + "learning_rate": 1.13e-05, + "loss": 1.0781, + "step": 565 + }, + { + "epoch": 0.09240439165748336, + "grad_norm": 2.0084972381591797, + "learning_rate": 1.132e-05, + "loss": 1.0984, + "step": 566 + }, + { + "epoch": 0.09256765029998776, + "grad_norm": 2.392289638519287, + "learning_rate": 1.134e-05, + "loss": 1.1296, + "step": 567 + }, + { + "epoch": 0.09273090894249214, + "grad_norm": 2.2483296394348145, + "learning_rate": 1.136e-05, + "loss": 1.1202, + "step": 568 + }, + { + "epoch": 0.09289416758499654, + "grad_norm": 2.3153254985809326, + "learning_rate": 1.138e-05, + "loss": 1.2516, + "step": 569 + }, + { + "epoch": 0.09305742622750092, + "grad_norm": 2.4120562076568604, + "learning_rate": 1.14e-05, + "loss": 1.2738, + "step": 570 + }, + { + "epoch": 0.0932206848700053, + "grad_norm": 2.2585086822509766, + "learning_rate": 1.142e-05, + "loss": 1.0587, + "step": 571 + }, + { + "epoch": 0.0933839435125097, + "grad_norm": 2.134504795074463, + "learning_rate": 1.144e-05, + "loss": 1.1773, + "step": 572 + }, + { + "epoch": 0.09354720215501408, + "grad_norm": 2.3113505840301514, + "learning_rate": 1.146e-05, + "loss": 1.12, + "step": 573 + }, + { + "epoch": 0.09371046079751846, + "grad_norm": 2.0091545581817627, + "learning_rate": 1.148e-05, + "loss": 1.1135, + "step": 574 + }, + { + "epoch": 0.09387371944002286, + "grad_norm": 2.099668264389038, + "learning_rate": 1.15e-05, + "loss": 0.9895, + "step": 575 + }, + { + "epoch": 0.09403697808252724, + "grad_norm": 2.4951369762420654, + "learning_rate": 1.152e-05, + "loss": 0.9999, + "step": 576 + }, + { + "epoch": 0.09420023672503162, + "grad_norm": 2.508176326751709, + "learning_rate": 1.154e-05, + "loss": 1.205, + "step": 577 + }, + { + "epoch": 0.09436349536753602, + "grad_norm": 2.27274489402771, + "learning_rate": 1.156e-05, + "loss": 1.2918, + "step": 578 + }, + { + "epoch": 0.0945267540100404, + "grad_norm": 2.387101650238037, + "learning_rate": 1.1580000000000001e-05, + "loss": 1.1922, + "step": 579 + }, + { + "epoch": 0.0946900126525448, + "grad_norm": 2.4729442596435547, + "learning_rate": 1.16e-05, + "loss": 1.1973, + "step": 580 + }, + { + "epoch": 0.09485327129504918, + "grad_norm": 2.2617437839508057, + "learning_rate": 1.162e-05, + "loss": 1.0657, + "step": 581 + }, + { + "epoch": 0.09501652993755356, + "grad_norm": 2.2428550720214844, + "learning_rate": 1.164e-05, + "loss": 1.1562, + "step": 582 + }, + { + "epoch": 0.09517978858005796, + "grad_norm": 2.196427345275879, + "learning_rate": 1.166e-05, + "loss": 1.0205, + "step": 583 + }, + { + "epoch": 0.09534304722256234, + "grad_norm": 2.013038158416748, + "learning_rate": 1.168e-05, + "loss": 0.9623, + "step": 584 + }, + { + "epoch": 0.09550630586506673, + "grad_norm": 2.2273364067077637, + "learning_rate": 1.17e-05, + "loss": 1.2066, + "step": 585 + }, + { + "epoch": 0.09566956450757112, + "grad_norm": 2.2216742038726807, + "learning_rate": 1.172e-05, + "loss": 1.0945, + "step": 586 + }, + { + "epoch": 0.0958328231500755, + "grad_norm": 2.389660596847534, + "learning_rate": 1.1740000000000001e-05, + "loss": 1.3045, + "step": 587 + }, + { + "epoch": 0.0959960817925799, + "grad_norm": 2.3388900756835938, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.9157, + "step": 588 + }, + { + "epoch": 0.09615934043508428, + "grad_norm": 2.515687942504883, + "learning_rate": 1.178e-05, + "loss": 1.1609, + "step": 589 + }, + { + "epoch": 0.09632259907758867, + "grad_norm": 2.1678519248962402, + "learning_rate": 1.18e-05, + "loss": 1.1717, + "step": 590 + }, + { + "epoch": 0.09648585772009306, + "grad_norm": 1.849572777748108, + "learning_rate": 1.182e-05, + "loss": 0.8716, + "step": 591 + }, + { + "epoch": 0.09664911636259745, + "grad_norm": 2.2102913856506348, + "learning_rate": 1.184e-05, + "loss": 1.1424, + "step": 592 + }, + { + "epoch": 0.09681237500510183, + "grad_norm": 2.0526554584503174, + "learning_rate": 1.186e-05, + "loss": 1.0293, + "step": 593 + }, + { + "epoch": 0.09697563364760622, + "grad_norm": 2.3345088958740234, + "learning_rate": 1.188e-05, + "loss": 1.0374, + "step": 594 + }, + { + "epoch": 0.0971388922901106, + "grad_norm": 2.162921667098999, + "learning_rate": 1.1900000000000001e-05, + "loss": 1.0631, + "step": 595 + }, + { + "epoch": 0.09730215093261499, + "grad_norm": 2.481452465057373, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.0674, + "step": 596 + }, + { + "epoch": 0.09746540957511939, + "grad_norm": 2.1651577949523926, + "learning_rate": 1.1940000000000001e-05, + "loss": 1.1815, + "step": 597 + }, + { + "epoch": 0.09762866821762377, + "grad_norm": 2.4811413288116455, + "learning_rate": 1.196e-05, + "loss": 0.947, + "step": 598 + }, + { + "epoch": 0.09779192686012816, + "grad_norm": 2.122241973876953, + "learning_rate": 1.198e-05, + "loss": 1.222, + "step": 599 + }, + { + "epoch": 0.09795518550263255, + "grad_norm": 2.29854416847229, + "learning_rate": 1.2e-05, + "loss": 1.0891, + "step": 600 + }, + { + "epoch": 0.09811844414513693, + "grad_norm": 2.8610055446624756, + "learning_rate": 1.202e-05, + "loss": 1.296, + "step": 601 + }, + { + "epoch": 0.09828170278764133, + "grad_norm": 2.278808832168579, + "learning_rate": 1.204e-05, + "loss": 1.001, + "step": 602 + }, + { + "epoch": 0.09844496143014571, + "grad_norm": 2.1872823238372803, + "learning_rate": 1.2060000000000001e-05, + "loss": 1.0714, + "step": 603 + }, + { + "epoch": 0.09860822007265009, + "grad_norm": 2.403470516204834, + "learning_rate": 1.2080000000000001e-05, + "loss": 1.1625, + "step": 604 + }, + { + "epoch": 0.09877147871515449, + "grad_norm": 2.1634583473205566, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.1078, + "step": 605 + }, + { + "epoch": 0.09893473735765887, + "grad_norm": 2.5648088455200195, + "learning_rate": 1.2120000000000001e-05, + "loss": 1.2635, + "step": 606 + }, + { + "epoch": 0.09909799600016325, + "grad_norm": 2.313460350036621, + "learning_rate": 1.214e-05, + "loss": 1.1816, + "step": 607 + }, + { + "epoch": 0.09926125464266765, + "grad_norm": 2.6342110633850098, + "learning_rate": 1.216e-05, + "loss": 1.1403, + "step": 608 + }, + { + "epoch": 0.09942451328517203, + "grad_norm": 2.3983922004699707, + "learning_rate": 1.218e-05, + "loss": 1.115, + "step": 609 + }, + { + "epoch": 0.09958777192767643, + "grad_norm": 2.0575366020202637, + "learning_rate": 1.22e-05, + "loss": 1.0788, + "step": 610 + }, + { + "epoch": 0.09975103057018081, + "grad_norm": 2.4335927963256836, + "learning_rate": 1.2220000000000002e-05, + "loss": 1.1847, + "step": 611 + }, + { + "epoch": 0.09991428921268519, + "grad_norm": 2.651470184326172, + "learning_rate": 1.2240000000000001e-05, + "loss": 1.3627, + "step": 612 + }, + { + "epoch": 0.10007754785518959, + "grad_norm": 2.40061092376709, + "learning_rate": 1.2260000000000001e-05, + "loss": 1.1795, + "step": 613 + }, + { + "epoch": 0.10024080649769397, + "grad_norm": 2.3222713470458984, + "learning_rate": 1.2280000000000001e-05, + "loss": 1.2542, + "step": 614 + }, + { + "epoch": 0.10040406514019835, + "grad_norm": 1.927268624305725, + "learning_rate": 1.23e-05, + "loss": 0.8903, + "step": 615 + }, + { + "epoch": 0.10056732378270275, + "grad_norm": 2.3598694801330566, + "learning_rate": 1.232e-05, + "loss": 1.2318, + "step": 616 + }, + { + "epoch": 0.10073058242520713, + "grad_norm": 2.3972742557525635, + "learning_rate": 1.234e-05, + "loss": 1.1292, + "step": 617 + }, + { + "epoch": 0.10089384106771153, + "grad_norm": 2.6992738246917725, + "learning_rate": 1.236e-05, + "loss": 1.2477, + "step": 618 + }, + { + "epoch": 0.10105709971021591, + "grad_norm": 2.1505918502807617, + "learning_rate": 1.2380000000000002e-05, + "loss": 1.09, + "step": 619 + }, + { + "epoch": 0.1012203583527203, + "grad_norm": 2.234858989715576, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.0779, + "step": 620 + }, + { + "epoch": 0.10138361699522469, + "grad_norm": 1.892857313156128, + "learning_rate": 1.2420000000000001e-05, + "loss": 0.8626, + "step": 621 + }, + { + "epoch": 0.10154687563772907, + "grad_norm": 1.979612112045288, + "learning_rate": 1.2440000000000001e-05, + "loss": 1.1399, + "step": 622 + }, + { + "epoch": 0.10171013428023346, + "grad_norm": 2.1129825115203857, + "learning_rate": 1.2460000000000001e-05, + "loss": 1.1086, + "step": 623 + }, + { + "epoch": 0.10187339292273785, + "grad_norm": 2.2777364253997803, + "learning_rate": 1.248e-05, + "loss": 1.1868, + "step": 624 + }, + { + "epoch": 0.10203665156524223, + "grad_norm": 2.2597436904907227, + "learning_rate": 1.25e-05, + "loss": 1.1333, + "step": 625 + }, + { + "epoch": 0.10219991020774662, + "grad_norm": 2.003836154937744, + "learning_rate": 1.252e-05, + "loss": 1.1433, + "step": 626 + }, + { + "epoch": 0.10236316885025101, + "grad_norm": 2.2667949199676514, + "learning_rate": 1.254e-05, + "loss": 1.0817, + "step": 627 + }, + { + "epoch": 0.1025264274927554, + "grad_norm": 2.122574806213379, + "learning_rate": 1.2560000000000002e-05, + "loss": 1.2371, + "step": 628 + }, + { + "epoch": 0.10268968613525979, + "grad_norm": 2.221588611602783, + "learning_rate": 1.2580000000000002e-05, + "loss": 1.0462, + "step": 629 + }, + { + "epoch": 0.10285294477776417, + "grad_norm": 2.48842453956604, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.2567, + "step": 630 + }, + { + "epoch": 0.10301620342026856, + "grad_norm": 2.400120973587036, + "learning_rate": 1.2620000000000001e-05, + "loss": 1.2489, + "step": 631 + }, + { + "epoch": 0.10317946206277295, + "grad_norm": 2.3211960792541504, + "learning_rate": 1.2640000000000001e-05, + "loss": 1.0691, + "step": 632 + }, + { + "epoch": 0.10334272070527734, + "grad_norm": 2.183497190475464, + "learning_rate": 1.266e-05, + "loss": 1.0862, + "step": 633 + }, + { + "epoch": 0.10350597934778172, + "grad_norm": 2.494892120361328, + "learning_rate": 1.268e-05, + "loss": 1.1687, + "step": 634 + }, + { + "epoch": 0.10366923799028611, + "grad_norm": 2.3134379386901855, + "learning_rate": 1.27e-05, + "loss": 1.022, + "step": 635 + }, + { + "epoch": 0.1038324966327905, + "grad_norm": 1.8653385639190674, + "learning_rate": 1.2720000000000002e-05, + "loss": 1.0266, + "step": 636 + }, + { + "epoch": 0.10399575527529488, + "grad_norm": 2.489906072616577, + "learning_rate": 1.2740000000000002e-05, + "loss": 1.1891, + "step": 637 + }, + { + "epoch": 0.10415901391779928, + "grad_norm": 2.2346251010894775, + "learning_rate": 1.2760000000000001e-05, + "loss": 1.0892, + "step": 638 + }, + { + "epoch": 0.10432227256030366, + "grad_norm": 2.2134947776794434, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.9985, + "step": 639 + }, + { + "epoch": 0.10448553120280805, + "grad_norm": 2.2416558265686035, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.1691, + "step": 640 + }, + { + "epoch": 0.10464878984531244, + "grad_norm": 2.217766523361206, + "learning_rate": 1.2820000000000001e-05, + "loss": 0.9682, + "step": 641 + }, + { + "epoch": 0.10481204848781682, + "grad_norm": 2.610581874847412, + "learning_rate": 1.284e-05, + "loss": 1.1064, + "step": 642 + }, + { + "epoch": 0.10497530713032122, + "grad_norm": 2.012836456298828, + "learning_rate": 1.286e-05, + "loss": 0.9583, + "step": 643 + }, + { + "epoch": 0.1051385657728256, + "grad_norm": 2.2592179775238037, + "learning_rate": 1.2880000000000002e-05, + "loss": 1.1472, + "step": 644 + }, + { + "epoch": 0.10530182441532998, + "grad_norm": 2.37949275970459, + "learning_rate": 1.2900000000000002e-05, + "loss": 0.995, + "step": 645 + }, + { + "epoch": 0.10546508305783438, + "grad_norm": 2.1269478797912598, + "learning_rate": 1.2920000000000002e-05, + "loss": 1.1412, + "step": 646 + }, + { + "epoch": 0.10562834170033876, + "grad_norm": 2.4893782138824463, + "learning_rate": 1.2940000000000001e-05, + "loss": 1.1055, + "step": 647 + }, + { + "epoch": 0.10579160034284316, + "grad_norm": 2.216463804244995, + "learning_rate": 1.2960000000000001e-05, + "loss": 1.1752, + "step": 648 + }, + { + "epoch": 0.10595485898534754, + "grad_norm": 2.459069013595581, + "learning_rate": 1.2980000000000001e-05, + "loss": 1.2992, + "step": 649 + }, + { + "epoch": 0.10611811762785192, + "grad_norm": 2.3783762454986572, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.1758, + "step": 650 + }, + { + "epoch": 0.10628137627035632, + "grad_norm": 2.1655356884002686, + "learning_rate": 1.302e-05, + "loss": 1.2611, + "step": 651 + }, + { + "epoch": 0.1064446349128607, + "grad_norm": 2.161815643310547, + "learning_rate": 1.3040000000000002e-05, + "loss": 1.0408, + "step": 652 + }, + { + "epoch": 0.10660789355536508, + "grad_norm": 1.9729121923446655, + "learning_rate": 1.3060000000000002e-05, + "loss": 1.0232, + "step": 653 + }, + { + "epoch": 0.10677115219786948, + "grad_norm": 2.0821754932403564, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.9521, + "step": 654 + }, + { + "epoch": 0.10693441084037386, + "grad_norm": 2.408712863922119, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.0755, + "step": 655 + }, + { + "epoch": 0.10709766948287824, + "grad_norm": 2.3315343856811523, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.9234, + "step": 656 + }, + { + "epoch": 0.10726092812538264, + "grad_norm": 1.9373000860214233, + "learning_rate": 1.3140000000000001e-05, + "loss": 0.9108, + "step": 657 + }, + { + "epoch": 0.10742418676788702, + "grad_norm": 2.2427690029144287, + "learning_rate": 1.3160000000000001e-05, + "loss": 1.0481, + "step": 658 + }, + { + "epoch": 0.10758744541039142, + "grad_norm": 2.1169705390930176, + "learning_rate": 1.3180000000000001e-05, + "loss": 1.1872, + "step": 659 + }, + { + "epoch": 0.1077507040528958, + "grad_norm": 2.207862138748169, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.0943, + "step": 660 + }, + { + "epoch": 0.10791396269540018, + "grad_norm": 2.1384873390197754, + "learning_rate": 1.3220000000000002e-05, + "loss": 1.0693, + "step": 661 + }, + { + "epoch": 0.10807722133790458, + "grad_norm": 2.3833110332489014, + "learning_rate": 1.3240000000000002e-05, + "loss": 1.2993, + "step": 662 + }, + { + "epoch": 0.10824047998040896, + "grad_norm": 1.946217656135559, + "learning_rate": 1.3260000000000002e-05, + "loss": 0.9552, + "step": 663 + }, + { + "epoch": 0.10840373862291335, + "grad_norm": 2.142319917678833, + "learning_rate": 1.3280000000000002e-05, + "loss": 1.2534, + "step": 664 + }, + { + "epoch": 0.10856699726541774, + "grad_norm": 2.3199260234832764, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.0762, + "step": 665 + }, + { + "epoch": 0.10873025590792212, + "grad_norm": 2.9386916160583496, + "learning_rate": 1.3320000000000001e-05, + "loss": 1.2278, + "step": 666 + }, + { + "epoch": 0.1088935145504265, + "grad_norm": 2.275683879852295, + "learning_rate": 1.3340000000000001e-05, + "loss": 1.2545, + "step": 667 + }, + { + "epoch": 0.1090567731929309, + "grad_norm": 2.272437572479248, + "learning_rate": 1.3360000000000003e-05, + "loss": 1.1936, + "step": 668 + }, + { + "epoch": 0.10922003183543529, + "grad_norm": 2.2124342918395996, + "learning_rate": 1.3380000000000002e-05, + "loss": 1.2094, + "step": 669 + }, + { + "epoch": 0.10938329047793968, + "grad_norm": 2.012925386428833, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.8534, + "step": 670 + }, + { + "epoch": 0.10954654912044406, + "grad_norm": 2.936959743499756, + "learning_rate": 1.3420000000000002e-05, + "loss": 1.3426, + "step": 671 + }, + { + "epoch": 0.10970980776294845, + "grad_norm": 2.002520799636841, + "learning_rate": 1.3440000000000002e-05, + "loss": 1.1952, + "step": 672 + }, + { + "epoch": 0.10987306640545284, + "grad_norm": 2.1583547592163086, + "learning_rate": 1.3460000000000002e-05, + "loss": 0.9572, + "step": 673 + }, + { + "epoch": 0.11003632504795723, + "grad_norm": 2.11974835395813, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.9344, + "step": 674 + }, + { + "epoch": 0.11019958369046161, + "grad_norm": 2.084968328475952, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.9384, + "step": 675 + }, + { + "epoch": 0.110362842332966, + "grad_norm": 2.271674871444702, + "learning_rate": 1.3520000000000003e-05, + "loss": 1.1174, + "step": 676 + }, + { + "epoch": 0.11052610097547039, + "grad_norm": 2.2235991954803467, + "learning_rate": 1.3540000000000003e-05, + "loss": 1.137, + "step": 677 + }, + { + "epoch": 0.11068935961797477, + "grad_norm": 2.312089204788208, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.9163, + "step": 678 + }, + { + "epoch": 0.11085261826047917, + "grad_norm": 2.362788677215576, + "learning_rate": 1.3580000000000002e-05, + "loss": 1.2848, + "step": 679 + }, + { + "epoch": 0.11101587690298355, + "grad_norm": 2.237948417663574, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.2563, + "step": 680 + }, + { + "epoch": 0.11117913554548794, + "grad_norm": 2.2222514152526855, + "learning_rate": 1.3620000000000002e-05, + "loss": 0.8566, + "step": 681 + }, + { + "epoch": 0.11134239418799233, + "grad_norm": 2.18951416015625, + "learning_rate": 1.3640000000000002e-05, + "loss": 0.9657, + "step": 682 + }, + { + "epoch": 0.11150565283049671, + "grad_norm": 2.488557815551758, + "learning_rate": 1.3660000000000001e-05, + "loss": 1.1271, + "step": 683 + }, + { + "epoch": 0.1116689114730011, + "grad_norm": 2.2374722957611084, + "learning_rate": 1.3680000000000003e-05, + "loss": 1.0334, + "step": 684 + }, + { + "epoch": 0.11183217011550549, + "grad_norm": 2.1181163787841797, + "learning_rate": 1.3700000000000003e-05, + "loss": 1.0255, + "step": 685 + }, + { + "epoch": 0.11199542875800987, + "grad_norm": 2.5455894470214844, + "learning_rate": 1.3720000000000002e-05, + "loss": 1.2454, + "step": 686 + }, + { + "epoch": 0.11215868740051427, + "grad_norm": 2.715745687484741, + "learning_rate": 1.3740000000000002e-05, + "loss": 1.2491, + "step": 687 + }, + { + "epoch": 0.11232194604301865, + "grad_norm": 2.4145407676696777, + "learning_rate": 1.376e-05, + "loss": 1.1853, + "step": 688 + }, + { + "epoch": 0.11248520468552305, + "grad_norm": 2.3001222610473633, + "learning_rate": 1.378e-05, + "loss": 1.1955, + "step": 689 + }, + { + "epoch": 0.11264846332802743, + "grad_norm": 2.644507884979248, + "learning_rate": 1.38e-05, + "loss": 1.2369, + "step": 690 + }, + { + "epoch": 0.11281172197053181, + "grad_norm": 2.3529052734375, + "learning_rate": 1.382e-05, + "loss": 1.0045, + "step": 691 + }, + { + "epoch": 0.11297498061303621, + "grad_norm": 2.4610183238983154, + "learning_rate": 1.384e-05, + "loss": 1.0641, + "step": 692 + }, + { + "epoch": 0.11313823925554059, + "grad_norm": 2.415421962738037, + "learning_rate": 1.386e-05, + "loss": 1.0964, + "step": 693 + }, + { + "epoch": 0.11330149789804497, + "grad_norm": 2.3912975788116455, + "learning_rate": 1.3880000000000001e-05, + "loss": 1.149, + "step": 694 + }, + { + "epoch": 0.11346475654054937, + "grad_norm": 2.6268279552459717, + "learning_rate": 1.39e-05, + "loss": 1.3216, + "step": 695 + }, + { + "epoch": 0.11362801518305375, + "grad_norm": 2.5815670490264893, + "learning_rate": 1.392e-05, + "loss": 1.0412, + "step": 696 + }, + { + "epoch": 0.11379127382555813, + "grad_norm": 2.3840436935424805, + "learning_rate": 1.394e-05, + "loss": 1.1186, + "step": 697 + }, + { + "epoch": 0.11395453246806253, + "grad_norm": 2.1603963375091553, + "learning_rate": 1.396e-05, + "loss": 1.2135, + "step": 698 + }, + { + "epoch": 0.11411779111056691, + "grad_norm": 2.9880950450897217, + "learning_rate": 1.398e-05, + "loss": 1.1652, + "step": 699 + }, + { + "epoch": 0.11428104975307131, + "grad_norm": 2.3261570930480957, + "learning_rate": 1.4e-05, + "loss": 1.2449, + "step": 700 + }, + { + "epoch": 0.11444430839557569, + "grad_norm": 2.495210647583008, + "learning_rate": 1.402e-05, + "loss": 1.274, + "step": 701 + }, + { + "epoch": 0.11460756703808007, + "grad_norm": 2.5343966484069824, + "learning_rate": 1.4040000000000001e-05, + "loss": 1.1562, + "step": 702 + }, + { + "epoch": 0.11477082568058447, + "grad_norm": 2.2462477684020996, + "learning_rate": 1.4060000000000001e-05, + "loss": 1.0743, + "step": 703 + }, + { + "epoch": 0.11493408432308885, + "grad_norm": 2.763458251953125, + "learning_rate": 1.408e-05, + "loss": 1.3091, + "step": 704 + }, + { + "epoch": 0.11509734296559324, + "grad_norm": 2.1928839683532715, + "learning_rate": 1.41e-05, + "loss": 1.1241, + "step": 705 + }, + { + "epoch": 0.11526060160809763, + "grad_norm": 2.0539863109588623, + "learning_rate": 1.412e-05, + "loss": 1.1229, + "step": 706 + }, + { + "epoch": 0.11542386025060201, + "grad_norm": 2.0049281120300293, + "learning_rate": 1.414e-05, + "loss": 1.1066, + "step": 707 + }, + { + "epoch": 0.1155871188931064, + "grad_norm": 2.1301469802856445, + "learning_rate": 1.416e-05, + "loss": 1.0123, + "step": 708 + }, + { + "epoch": 0.1157503775356108, + "grad_norm": 2.1698412895202637, + "learning_rate": 1.418e-05, + "loss": 1.0132, + "step": 709 + }, + { + "epoch": 0.11591363617811518, + "grad_norm": 2.023695945739746, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.1145, + "step": 710 + }, + { + "epoch": 0.11607689482061957, + "grad_norm": 2.1574056148529053, + "learning_rate": 1.4220000000000001e-05, + "loss": 1.0613, + "step": 711 + }, + { + "epoch": 0.11624015346312395, + "grad_norm": 2.7349820137023926, + "learning_rate": 1.4240000000000001e-05, + "loss": 1.0168, + "step": 712 + }, + { + "epoch": 0.11640341210562834, + "grad_norm": 2.127218723297119, + "learning_rate": 1.426e-05, + "loss": 0.9757, + "step": 713 + }, + { + "epoch": 0.11656667074813273, + "grad_norm": 2.580047607421875, + "learning_rate": 1.428e-05, + "loss": 1.1039, + "step": 714 + }, + { + "epoch": 0.11672992939063712, + "grad_norm": 2.18753981590271, + "learning_rate": 1.43e-05, + "loss": 1.0925, + "step": 715 + }, + { + "epoch": 0.1168931880331415, + "grad_norm": 2.3090806007385254, + "learning_rate": 1.432e-05, + "loss": 1.2568, + "step": 716 + }, + { + "epoch": 0.1170564466756459, + "grad_norm": 2.2329230308532715, + "learning_rate": 1.434e-05, + "loss": 1.1274, + "step": 717 + }, + { + "epoch": 0.11721970531815028, + "grad_norm": 2.684112787246704, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.2778, + "step": 718 + }, + { + "epoch": 0.11738296396065467, + "grad_norm": 2.398986339569092, + "learning_rate": 1.4380000000000001e-05, + "loss": 1.0811, + "step": 719 + }, + { + "epoch": 0.11754622260315906, + "grad_norm": 2.491607427597046, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.0466, + "step": 720 + }, + { + "epoch": 0.11770948124566344, + "grad_norm": 2.4555017948150635, + "learning_rate": 1.4420000000000001e-05, + "loss": 1.1665, + "step": 721 + }, + { + "epoch": 0.11787273988816784, + "grad_norm": 2.703523874282837, + "learning_rate": 1.444e-05, + "loss": 1.2154, + "step": 722 + }, + { + "epoch": 0.11803599853067222, + "grad_norm": 2.1012768745422363, + "learning_rate": 1.446e-05, + "loss": 0.9451, + "step": 723 + }, + { + "epoch": 0.1181992571731766, + "grad_norm": 2.0012354850769043, + "learning_rate": 1.448e-05, + "loss": 0.9802, + "step": 724 + }, + { + "epoch": 0.118362515815681, + "grad_norm": 2.3113489151000977, + "learning_rate": 1.45e-05, + "loss": 1.0418, + "step": 725 + }, + { + "epoch": 0.11852577445818538, + "grad_norm": 2.1188430786132812, + "learning_rate": 1.4520000000000002e-05, + "loss": 1.1208, + "step": 726 + }, + { + "epoch": 0.11868903310068976, + "grad_norm": 2.2806811332702637, + "learning_rate": 1.4540000000000001e-05, + "loss": 1.0697, + "step": 727 + }, + { + "epoch": 0.11885229174319416, + "grad_norm": 2.280212163925171, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.9671, + "step": 728 + }, + { + "epoch": 0.11901555038569854, + "grad_norm": 2.061343193054199, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.9964, + "step": 729 + }, + { + "epoch": 0.11917880902820294, + "grad_norm": 2.514763116836548, + "learning_rate": 1.46e-05, + "loss": 1.1097, + "step": 730 + }, + { + "epoch": 0.11934206767070732, + "grad_norm": 2.141361713409424, + "learning_rate": 1.462e-05, + "loss": 1.0284, + "step": 731 + }, + { + "epoch": 0.1195053263132117, + "grad_norm": 2.612725019454956, + "learning_rate": 1.464e-05, + "loss": 1.3805, + "step": 732 + }, + { + "epoch": 0.1196685849557161, + "grad_norm": 2.377556562423706, + "learning_rate": 1.466e-05, + "loss": 1.1615, + "step": 733 + }, + { + "epoch": 0.11983184359822048, + "grad_norm": 2.8224315643310547, + "learning_rate": 1.4680000000000002e-05, + "loss": 1.1403, + "step": 734 + }, + { + "epoch": 0.11999510224072486, + "grad_norm": 2.278228998184204, + "learning_rate": 1.4700000000000002e-05, + "loss": 1.0359, + "step": 735 + }, + { + "epoch": 0.12015836088322926, + "grad_norm": 2.27772855758667, + "learning_rate": 1.4720000000000001e-05, + "loss": 1.073, + "step": 736 + }, + { + "epoch": 0.12032161952573364, + "grad_norm": 2.2087297439575195, + "learning_rate": 1.4740000000000001e-05, + "loss": 1.0801, + "step": 737 + }, + { + "epoch": 0.12048487816823802, + "grad_norm": 2.184826135635376, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.9646, + "step": 738 + }, + { + "epoch": 0.12064813681074242, + "grad_norm": 2.453735828399658, + "learning_rate": 1.478e-05, + "loss": 1.2635, + "step": 739 + }, + { + "epoch": 0.1208113954532468, + "grad_norm": 2.414106845855713, + "learning_rate": 1.48e-05, + "loss": 0.9237, + "step": 740 + }, + { + "epoch": 0.1209746540957512, + "grad_norm": 2.2158467769622803, + "learning_rate": 1.482e-05, + "loss": 1.0015, + "step": 741 + }, + { + "epoch": 0.12113791273825558, + "grad_norm": 2.1643996238708496, + "learning_rate": 1.4840000000000002e-05, + "loss": 1.1763, + "step": 742 + }, + { + "epoch": 0.12130117138075996, + "grad_norm": 2.547316551208496, + "learning_rate": 1.4860000000000002e-05, + "loss": 1.0759, + "step": 743 + }, + { + "epoch": 0.12146443002326436, + "grad_norm": 2.59954571723938, + "learning_rate": 1.4880000000000002e-05, + "loss": 1.1794, + "step": 744 + }, + { + "epoch": 0.12162768866576874, + "grad_norm": 2.4989686012268066, + "learning_rate": 1.4900000000000001e-05, + "loss": 1.0237, + "step": 745 + }, + { + "epoch": 0.12179094730827313, + "grad_norm": 2.4219133853912354, + "learning_rate": 1.4920000000000001e-05, + "loss": 1.1134, + "step": 746 + }, + { + "epoch": 0.12195420595077752, + "grad_norm": 2.3253793716430664, + "learning_rate": 1.4940000000000001e-05, + "loss": 1.0946, + "step": 747 + }, + { + "epoch": 0.1221174645932819, + "grad_norm": 2.725053071975708, + "learning_rate": 1.496e-05, + "loss": 1.0228, + "step": 748 + }, + { + "epoch": 0.1222807232357863, + "grad_norm": 2.5383968353271484, + "learning_rate": 1.498e-05, + "loss": 1.2493, + "step": 749 + }, + { + "epoch": 0.12244398187829068, + "grad_norm": 2.634434938430786, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.0577, + "step": 750 + }, + { + "epoch": 0.12260724052079507, + "grad_norm": 2.4397354125976562, + "learning_rate": 1.5020000000000002e-05, + "loss": 1.1039, + "step": 751 + }, + { + "epoch": 0.12277049916329946, + "grad_norm": 2.715039014816284, + "learning_rate": 1.5040000000000002e-05, + "loss": 1.2254, + "step": 752 + }, + { + "epoch": 0.12293375780580384, + "grad_norm": 2.3144099712371826, + "learning_rate": 1.5060000000000001e-05, + "loss": 0.9922, + "step": 753 + }, + { + "epoch": 0.12309701644830823, + "grad_norm": 2.6371517181396484, + "learning_rate": 1.5080000000000001e-05, + "loss": 1.249, + "step": 754 + }, + { + "epoch": 0.12326027509081262, + "grad_norm": 2.331406593322754, + "learning_rate": 1.5100000000000001e-05, + "loss": 1.0866, + "step": 755 + }, + { + "epoch": 0.123423533733317, + "grad_norm": 2.396252155303955, + "learning_rate": 1.5120000000000001e-05, + "loss": 1.0895, + "step": 756 + }, + { + "epoch": 0.12358679237582139, + "grad_norm": 2.273235559463501, + "learning_rate": 1.514e-05, + "loss": 1.157, + "step": 757 + }, + { + "epoch": 0.12375005101832579, + "grad_norm": 2.4729843139648438, + "learning_rate": 1.516e-05, + "loss": 1.0587, + "step": 758 + }, + { + "epoch": 0.12391330966083017, + "grad_norm": 2.291576385498047, + "learning_rate": 1.5180000000000002e-05, + "loss": 1.0536, + "step": 759 + }, + { + "epoch": 0.12407656830333456, + "grad_norm": 2.4626877307891846, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.9912, + "step": 760 + }, + { + "epoch": 0.12423982694583895, + "grad_norm": 2.3226864337921143, + "learning_rate": 1.5220000000000002e-05, + "loss": 1.0823, + "step": 761 + }, + { + "epoch": 0.12440308558834333, + "grad_norm": 2.7476718425750732, + "learning_rate": 1.5240000000000001e-05, + "loss": 1.0776, + "step": 762 + }, + { + "epoch": 0.12456634423084773, + "grad_norm": 2.017200469970703, + "learning_rate": 1.5260000000000003e-05, + "loss": 1.028, + "step": 763 + }, + { + "epoch": 0.12472960287335211, + "grad_norm": 2.230628728866577, + "learning_rate": 1.5280000000000003e-05, + "loss": 0.9139, + "step": 764 + }, + { + "epoch": 0.12489286151585649, + "grad_norm": 2.4791698455810547, + "learning_rate": 1.5300000000000003e-05, + "loss": 1.266, + "step": 765 + }, + { + "epoch": 0.12505612015836087, + "grad_norm": 2.521533489227295, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.9969, + "step": 766 + }, + { + "epoch": 0.12521937880086528, + "grad_norm": 2.473158121109009, + "learning_rate": 1.5340000000000002e-05, + "loss": 0.9861, + "step": 767 + }, + { + "epoch": 0.12538263744336967, + "grad_norm": 2.7101058959960938, + "learning_rate": 1.5360000000000002e-05, + "loss": 0.8905, + "step": 768 + }, + { + "epoch": 0.12554589608587405, + "grad_norm": 2.70375657081604, + "learning_rate": 1.5380000000000002e-05, + "loss": 1.2277, + "step": 769 + }, + { + "epoch": 0.12570915472837843, + "grad_norm": 2.231943368911743, + "learning_rate": 1.54e-05, + "loss": 1.0401, + "step": 770 + }, + { + "epoch": 0.1258724133708828, + "grad_norm": 2.251126766204834, + "learning_rate": 1.542e-05, + "loss": 1.1845, + "step": 771 + }, + { + "epoch": 0.1260356720133872, + "grad_norm": 2.613600492477417, + "learning_rate": 1.544e-05, + "loss": 1.075, + "step": 772 + }, + { + "epoch": 0.1261989306558916, + "grad_norm": 2.5316648483276367, + "learning_rate": 1.546e-05, + "loss": 1.0518, + "step": 773 + }, + { + "epoch": 0.126362189298396, + "grad_norm": 2.1738829612731934, + "learning_rate": 1.548e-05, + "loss": 0.9975, + "step": 774 + }, + { + "epoch": 0.12652544794090037, + "grad_norm": 2.2844667434692383, + "learning_rate": 1.55e-05, + "loss": 1.146, + "step": 775 + }, + { + "epoch": 0.12668870658340475, + "grad_norm": 2.266869068145752, + "learning_rate": 1.552e-05, + "loss": 1.1181, + "step": 776 + }, + { + "epoch": 0.12685196522590914, + "grad_norm": 2.207763671875, + "learning_rate": 1.554e-05, + "loss": 0.92, + "step": 777 + }, + { + "epoch": 0.12701522386841355, + "grad_norm": 2.888216733932495, + "learning_rate": 1.556e-05, + "loss": 1.1068, + "step": 778 + }, + { + "epoch": 0.12717848251091793, + "grad_norm": 2.5158331394195557, + "learning_rate": 1.5580000000000003e-05, + "loss": 1.0591, + "step": 779 + }, + { + "epoch": 0.1273417411534223, + "grad_norm": 2.4173104763031006, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.0881, + "step": 780 + }, + { + "epoch": 0.1275049997959267, + "grad_norm": 2.8312129974365234, + "learning_rate": 1.5620000000000003e-05, + "loss": 1.2619, + "step": 781 + }, + { + "epoch": 0.12766825843843108, + "grad_norm": 2.542279005050659, + "learning_rate": 1.5640000000000003e-05, + "loss": 1.0846, + "step": 782 + }, + { + "epoch": 0.12783151708093546, + "grad_norm": 2.1690378189086914, + "learning_rate": 1.5660000000000003e-05, + "loss": 0.9546, + "step": 783 + }, + { + "epoch": 0.12799477572343987, + "grad_norm": 2.281083345413208, + "learning_rate": 1.5680000000000002e-05, + "loss": 1.3403, + "step": 784 + }, + { + "epoch": 0.12815803436594425, + "grad_norm": 2.5888724327087402, + "learning_rate": 1.5700000000000002e-05, + "loss": 1.1039, + "step": 785 + }, + { + "epoch": 0.12832129300844863, + "grad_norm": 2.0737390518188477, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.9884, + "step": 786 + }, + { + "epoch": 0.12848455165095302, + "grad_norm": 2.239034652709961, + "learning_rate": 1.5740000000000002e-05, + "loss": 1.2374, + "step": 787 + }, + { + "epoch": 0.1286478102934574, + "grad_norm": 2.0441274642944336, + "learning_rate": 1.576e-05, + "loss": 0.9067, + "step": 788 + }, + { + "epoch": 0.1288110689359618, + "grad_norm": 2.762007474899292, + "learning_rate": 1.578e-05, + "loss": 1.1817, + "step": 789 + }, + { + "epoch": 0.1289743275784662, + "grad_norm": 2.167156934738159, + "learning_rate": 1.58e-05, + "loss": 1.0362, + "step": 790 + }, + { + "epoch": 0.12913758622097057, + "grad_norm": 2.466776132583618, + "learning_rate": 1.582e-05, + "loss": 1.0475, + "step": 791 + }, + { + "epoch": 0.12930084486347496, + "grad_norm": 2.870243787765503, + "learning_rate": 1.584e-05, + "loss": 1.075, + "step": 792 + }, + { + "epoch": 0.12946410350597934, + "grad_norm": 2.1869876384735107, + "learning_rate": 1.586e-05, + "loss": 0.9839, + "step": 793 + }, + { + "epoch": 0.12962736214848375, + "grad_norm": 2.2803492546081543, + "learning_rate": 1.588e-05, + "loss": 1.0328, + "step": 794 + }, + { + "epoch": 0.12979062079098813, + "grad_norm": 2.5293941497802734, + "learning_rate": 1.5900000000000004e-05, + "loss": 1.0025, + "step": 795 + }, + { + "epoch": 0.12995387943349251, + "grad_norm": 2.30033802986145, + "learning_rate": 1.5920000000000003e-05, + "loss": 0.9179, + "step": 796 + }, + { + "epoch": 0.1301171380759969, + "grad_norm": 2.378999710083008, + "learning_rate": 1.5940000000000003e-05, + "loss": 1.1092, + "step": 797 + }, + { + "epoch": 0.13028039671850128, + "grad_norm": 2.3963797092437744, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.9248, + "step": 798 + }, + { + "epoch": 0.13044365536100566, + "grad_norm": 2.2756881713867188, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.9011, + "step": 799 + }, + { + "epoch": 0.13060691400351007, + "grad_norm": 2.4396896362304688, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1119, + "step": 800 + }, + { + "epoch": 0.13077017264601445, + "grad_norm": 2.329301357269287, + "learning_rate": 1.6020000000000002e-05, + "loss": 1.1733, + "step": 801 + }, + { + "epoch": 0.13093343128851884, + "grad_norm": 2.3051140308380127, + "learning_rate": 1.6040000000000002e-05, + "loss": 1.0397, + "step": 802 + }, + { + "epoch": 0.13109668993102322, + "grad_norm": 2.466191530227661, + "learning_rate": 1.6060000000000002e-05, + "loss": 1.1942, + "step": 803 + }, + { + "epoch": 0.1312599485735276, + "grad_norm": 2.410820722579956, + "learning_rate": 1.6080000000000002e-05, + "loss": 1.1701, + "step": 804 + }, + { + "epoch": 0.131423207216032, + "grad_norm": 2.2420690059661865, + "learning_rate": 1.6100000000000002e-05, + "loss": 0.9753, + "step": 805 + }, + { + "epoch": 0.1315864658585364, + "grad_norm": 2.461653232574463, + "learning_rate": 1.612e-05, + "loss": 1.0665, + "step": 806 + }, + { + "epoch": 0.13174972450104078, + "grad_norm": 2.347559690475464, + "learning_rate": 1.614e-05, + "loss": 1.0816, + "step": 807 + }, + { + "epoch": 0.13191298314354516, + "grad_norm": 2.683488368988037, + "learning_rate": 1.616e-05, + "loss": 1.0225, + "step": 808 + }, + { + "epoch": 0.13207624178604954, + "grad_norm": 2.085374355316162, + "learning_rate": 1.618e-05, + "loss": 1.0274, + "step": 809 + }, + { + "epoch": 0.13223950042855392, + "grad_norm": 2.2985503673553467, + "learning_rate": 1.62e-05, + "loss": 1.1442, + "step": 810 + }, + { + "epoch": 0.13240275907105833, + "grad_norm": 2.378345489501953, + "learning_rate": 1.6220000000000004e-05, + "loss": 0.9335, + "step": 811 + }, + { + "epoch": 0.13256601771356272, + "grad_norm": 2.120229959487915, + "learning_rate": 1.6240000000000004e-05, + "loss": 1.0921, + "step": 812 + }, + { + "epoch": 0.1327292763560671, + "grad_norm": 2.2785110473632812, + "learning_rate": 1.626e-05, + "loss": 1.0372, + "step": 813 + }, + { + "epoch": 0.13289253499857148, + "grad_norm": 2.3823118209838867, + "learning_rate": 1.628e-05, + "loss": 1.1213, + "step": 814 + }, + { + "epoch": 0.13305579364107586, + "grad_norm": 2.504801034927368, + "learning_rate": 1.63e-05, + "loss": 1.2661, + "step": 815 + }, + { + "epoch": 0.13321905228358027, + "grad_norm": 2.092095136642456, + "learning_rate": 1.632e-05, + "loss": 1.0209, + "step": 816 + }, + { + "epoch": 0.13338231092608466, + "grad_norm": 2.267958402633667, + "learning_rate": 1.634e-05, + "loss": 1.1929, + "step": 817 + }, + { + "epoch": 0.13354556956858904, + "grad_norm": 2.91835618019104, + "learning_rate": 1.636e-05, + "loss": 1.178, + "step": 818 + }, + { + "epoch": 0.13370882821109342, + "grad_norm": 2.530365228652954, + "learning_rate": 1.638e-05, + "loss": 1.0486, + "step": 819 + }, + { + "epoch": 0.1338720868535978, + "grad_norm": 2.5693790912628174, + "learning_rate": 1.64e-05, + "loss": 1.0119, + "step": 820 + }, + { + "epoch": 0.1340353454961022, + "grad_norm": 3.007214069366455, + "learning_rate": 1.6420000000000002e-05, + "loss": 1.3315, + "step": 821 + }, + { + "epoch": 0.1341986041386066, + "grad_norm": 2.337501049041748, + "learning_rate": 1.6440000000000002e-05, + "loss": 1.0489, + "step": 822 + }, + { + "epoch": 0.13436186278111098, + "grad_norm": 2.2695515155792236, + "learning_rate": 1.646e-05, + "loss": 1.0759, + "step": 823 + }, + { + "epoch": 0.13452512142361536, + "grad_norm": 2.47705340385437, + "learning_rate": 1.648e-05, + "loss": 1.1809, + "step": 824 + }, + { + "epoch": 0.13468838006611975, + "grad_norm": 2.219982147216797, + "learning_rate": 1.65e-05, + "loss": 0.9075, + "step": 825 + }, + { + "epoch": 0.13485163870862413, + "grad_norm": 2.298489570617676, + "learning_rate": 1.652e-05, + "loss": 0.9577, + "step": 826 + }, + { + "epoch": 0.13501489735112854, + "grad_norm": 2.265995502471924, + "learning_rate": 1.654e-05, + "loss": 1.0926, + "step": 827 + }, + { + "epoch": 0.13517815599363292, + "grad_norm": 2.2554972171783447, + "learning_rate": 1.656e-05, + "loss": 1.0165, + "step": 828 + }, + { + "epoch": 0.1353414146361373, + "grad_norm": 2.1899595260620117, + "learning_rate": 1.658e-05, + "loss": 0.8842, + "step": 829 + }, + { + "epoch": 0.13550467327864169, + "grad_norm": 2.708967924118042, + "learning_rate": 1.66e-05, + "loss": 1.0506, + "step": 830 + }, + { + "epoch": 0.13566793192114607, + "grad_norm": 2.488602638244629, + "learning_rate": 1.662e-05, + "loss": 1.1737, + "step": 831 + }, + { + "epoch": 0.13583119056365045, + "grad_norm": 2.2469325065612793, + "learning_rate": 1.664e-05, + "loss": 1.1111, + "step": 832 + }, + { + "epoch": 0.13599444920615486, + "grad_norm": 2.581990957260132, + "learning_rate": 1.666e-05, + "loss": 1.0497, + "step": 833 + }, + { + "epoch": 0.13615770784865924, + "grad_norm": 2.4781339168548584, + "learning_rate": 1.668e-05, + "loss": 0.9631, + "step": 834 + }, + { + "epoch": 0.13632096649116363, + "grad_norm": 2.491020679473877, + "learning_rate": 1.67e-05, + "loss": 1.2637, + "step": 835 + }, + { + "epoch": 0.136484225133668, + "grad_norm": 2.620746612548828, + "learning_rate": 1.672e-05, + "loss": 1.2318, + "step": 836 + }, + { + "epoch": 0.1366474837761724, + "grad_norm": 2.739013671875, + "learning_rate": 1.6740000000000002e-05, + "loss": 0.9492, + "step": 837 + }, + { + "epoch": 0.1368107424186768, + "grad_norm": 2.1043741703033447, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.949, + "step": 838 + }, + { + "epoch": 0.13697400106118118, + "grad_norm": 2.4798238277435303, + "learning_rate": 1.6780000000000002e-05, + "loss": 1.2807, + "step": 839 + }, + { + "epoch": 0.13713725970368557, + "grad_norm": 2.0695459842681885, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.07, + "step": 840 + }, + { + "epoch": 0.13730051834618995, + "grad_norm": 2.578718423843384, + "learning_rate": 1.682e-05, + "loss": 1.122, + "step": 841 + }, + { + "epoch": 0.13746377698869433, + "grad_norm": 2.4468400478363037, + "learning_rate": 1.684e-05, + "loss": 1.0477, + "step": 842 + }, + { + "epoch": 0.1376270356311987, + "grad_norm": 2.2157211303710938, + "learning_rate": 1.686e-05, + "loss": 1.0999, + "step": 843 + }, + { + "epoch": 0.13779029427370312, + "grad_norm": 2.626016616821289, + "learning_rate": 1.688e-05, + "loss": 0.919, + "step": 844 + }, + { + "epoch": 0.1379535529162075, + "grad_norm": 2.4097883701324463, + "learning_rate": 1.69e-05, + "loss": 1.018, + "step": 845 + }, + { + "epoch": 0.1381168115587119, + "grad_norm": 2.5626206398010254, + "learning_rate": 1.692e-05, + "loss": 1.2025, + "step": 846 + }, + { + "epoch": 0.13828007020121627, + "grad_norm": 2.166566848754883, + "learning_rate": 1.694e-05, + "loss": 1.2181, + "step": 847 + }, + { + "epoch": 0.13844332884372065, + "grad_norm": 2.3883957862854004, + "learning_rate": 1.696e-05, + "loss": 1.179, + "step": 848 + }, + { + "epoch": 0.13860658748622506, + "grad_norm": 2.9392340183258057, + "learning_rate": 1.698e-05, + "loss": 1.194, + "step": 849 + }, + { + "epoch": 0.13876984612872945, + "grad_norm": 2.3590869903564453, + "learning_rate": 1.7e-05, + "loss": 0.9364, + "step": 850 + }, + { + "epoch": 0.13893310477123383, + "grad_norm": 2.6429126262664795, + "learning_rate": 1.702e-05, + "loss": 1.2343, + "step": 851 + }, + { + "epoch": 0.1390963634137382, + "grad_norm": 2.3160974979400635, + "learning_rate": 1.704e-05, + "loss": 1.1265, + "step": 852 + }, + { + "epoch": 0.1392596220562426, + "grad_norm": 2.275766611099243, + "learning_rate": 1.7060000000000003e-05, + "loss": 0.9642, + "step": 853 + }, + { + "epoch": 0.139422880698747, + "grad_norm": 2.634352684020996, + "learning_rate": 1.7080000000000002e-05, + "loss": 1.126, + "step": 854 + }, + { + "epoch": 0.1395861393412514, + "grad_norm": 2.3045482635498047, + "learning_rate": 1.7100000000000002e-05, + "loss": 1.1128, + "step": 855 + }, + { + "epoch": 0.13974939798375577, + "grad_norm": 2.288778066635132, + "learning_rate": 1.7120000000000002e-05, + "loss": 1.2359, + "step": 856 + }, + { + "epoch": 0.13991265662626015, + "grad_norm": 2.6486730575561523, + "learning_rate": 1.7140000000000002e-05, + "loss": 1.0841, + "step": 857 + }, + { + "epoch": 0.14007591526876453, + "grad_norm": 2.1670572757720947, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.0854, + "step": 858 + }, + { + "epoch": 0.14023917391126892, + "grad_norm": 2.4031360149383545, + "learning_rate": 1.718e-05, + "loss": 1.2407, + "step": 859 + }, + { + "epoch": 0.14040243255377333, + "grad_norm": 2.2284305095672607, + "learning_rate": 1.72e-05, + "loss": 1.0318, + "step": 860 + }, + { + "epoch": 0.1405656911962777, + "grad_norm": 2.4811019897460938, + "learning_rate": 1.722e-05, + "loss": 1.321, + "step": 861 + }, + { + "epoch": 0.1407289498387821, + "grad_norm": 2.680697441101074, + "learning_rate": 1.724e-05, + "loss": 1.1752, + "step": 862 + }, + { + "epoch": 0.14089220848128647, + "grad_norm": 2.218731641769409, + "learning_rate": 1.726e-05, + "loss": 1.1682, + "step": 863 + }, + { + "epoch": 0.14105546712379086, + "grad_norm": 3.526254177093506, + "learning_rate": 1.728e-05, + "loss": 1.2073, + "step": 864 + }, + { + "epoch": 0.14121872576629527, + "grad_norm": 2.1793603897094727, + "learning_rate": 1.73e-05, + "loss": 1.2821, + "step": 865 + }, + { + "epoch": 0.14138198440879965, + "grad_norm": 2.213533639907837, + "learning_rate": 1.732e-05, + "loss": 1.1173, + "step": 866 + }, + { + "epoch": 0.14154524305130403, + "grad_norm": 2.397644519805908, + "learning_rate": 1.734e-05, + "loss": 1.2369, + "step": 867 + }, + { + "epoch": 0.14170850169380841, + "grad_norm": 2.247380018234253, + "learning_rate": 1.736e-05, + "loss": 1.0636, + "step": 868 + }, + { + "epoch": 0.1418717603363128, + "grad_norm": 2.516901969909668, + "learning_rate": 1.7380000000000003e-05, + "loss": 1.2021, + "step": 869 + }, + { + "epoch": 0.14203501897881718, + "grad_norm": 2.1053361892700195, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.0103, + "step": 870 + }, + { + "epoch": 0.1421982776213216, + "grad_norm": 2.3672521114349365, + "learning_rate": 1.7420000000000003e-05, + "loss": 0.8694, + "step": 871 + }, + { + "epoch": 0.14236153626382597, + "grad_norm": 2.465688943862915, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.9321, + "step": 872 + }, + { + "epoch": 0.14252479490633035, + "grad_norm": 1.9444098472595215, + "learning_rate": 1.7460000000000002e-05, + "loss": 0.9203, + "step": 873 + }, + { + "epoch": 0.14268805354883474, + "grad_norm": 2.4069600105285645, + "learning_rate": 1.7480000000000002e-05, + "loss": 1.1466, + "step": 874 + }, + { + "epoch": 0.14285131219133912, + "grad_norm": 2.3116390705108643, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.062, + "step": 875 + }, + { + "epoch": 0.14301457083384353, + "grad_norm": 2.219588041305542, + "learning_rate": 1.752e-05, + "loss": 1.0319, + "step": 876 + }, + { + "epoch": 0.1431778294763479, + "grad_norm": 2.5299127101898193, + "learning_rate": 1.754e-05, + "loss": 1.0672, + "step": 877 + }, + { + "epoch": 0.1433410881188523, + "grad_norm": 2.377128839492798, + "learning_rate": 1.756e-05, + "loss": 1.0066, + "step": 878 + }, + { + "epoch": 0.14350434676135668, + "grad_norm": 2.3860764503479004, + "learning_rate": 1.758e-05, + "loss": 1.1342, + "step": 879 + }, + { + "epoch": 0.14366760540386106, + "grad_norm": 2.5924859046936035, + "learning_rate": 1.76e-05, + "loss": 0.971, + "step": 880 + }, + { + "epoch": 0.14383086404636544, + "grad_norm": 2.513972759246826, + "learning_rate": 1.762e-05, + "loss": 0.9887, + "step": 881 + }, + { + "epoch": 0.14399412268886985, + "grad_norm": 2.78017520904541, + "learning_rate": 1.764e-05, + "loss": 1.3437, + "step": 882 + }, + { + "epoch": 0.14415738133137423, + "grad_norm": 2.2695229053497314, + "learning_rate": 1.766e-05, + "loss": 1.0414, + "step": 883 + }, + { + "epoch": 0.14432063997387862, + "grad_norm": 2.2176451683044434, + "learning_rate": 1.768e-05, + "loss": 1.0601, + "step": 884 + }, + { + "epoch": 0.144483898616383, + "grad_norm": 2.479351043701172, + "learning_rate": 1.77e-05, + "loss": 1.2492, + "step": 885 + }, + { + "epoch": 0.14464715725888738, + "grad_norm": 2.792762279510498, + "learning_rate": 1.7720000000000003e-05, + "loss": 1.1413, + "step": 886 + }, + { + "epoch": 0.1448104159013918, + "grad_norm": 2.196213722229004, + "learning_rate": 1.7740000000000003e-05, + "loss": 0.9508, + "step": 887 + }, + { + "epoch": 0.14497367454389617, + "grad_norm": 2.2285099029541016, + "learning_rate": 1.7760000000000003e-05, + "loss": 1.0122, + "step": 888 + }, + { + "epoch": 0.14513693318640056, + "grad_norm": 2.2343876361846924, + "learning_rate": 1.7780000000000003e-05, + "loss": 1.1884, + "step": 889 + }, + { + "epoch": 0.14530019182890494, + "grad_norm": 2.203963279724121, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.8829, + "step": 890 + }, + { + "epoch": 0.14546345047140932, + "grad_norm": 2.469761610031128, + "learning_rate": 1.7820000000000002e-05, + "loss": 1.1417, + "step": 891 + }, + { + "epoch": 0.1456267091139137, + "grad_norm": 2.4011363983154297, + "learning_rate": 1.7840000000000002e-05, + "loss": 1.007, + "step": 892 + }, + { + "epoch": 0.14578996775641812, + "grad_norm": 2.156428575515747, + "learning_rate": 1.7860000000000002e-05, + "loss": 1.035, + "step": 893 + }, + { + "epoch": 0.1459532263989225, + "grad_norm": 2.422863483428955, + "learning_rate": 1.788e-05, + "loss": 1.1256, + "step": 894 + }, + { + "epoch": 0.14611648504142688, + "grad_norm": 2.4491419792175293, + "learning_rate": 1.79e-05, + "loss": 1.0105, + "step": 895 + }, + { + "epoch": 0.14627974368393126, + "grad_norm": 2.2466745376586914, + "learning_rate": 1.792e-05, + "loss": 1.0177, + "step": 896 + }, + { + "epoch": 0.14644300232643565, + "grad_norm": 2.443605661392212, + "learning_rate": 1.794e-05, + "loss": 1.0399, + "step": 897 + }, + { + "epoch": 0.14660626096894006, + "grad_norm": 2.762124538421631, + "learning_rate": 1.796e-05, + "loss": 1.1509, + "step": 898 + }, + { + "epoch": 0.14676951961144444, + "grad_norm": 2.450104236602783, + "learning_rate": 1.798e-05, + "loss": 1.2523, + "step": 899 + }, + { + "epoch": 0.14693277825394882, + "grad_norm": 2.320499897003174, + "learning_rate": 1.8e-05, + "loss": 1.0628, + "step": 900 + }, + { + "epoch": 0.1470960368964532, + "grad_norm": 2.3713152408599854, + "learning_rate": 1.802e-05, + "loss": 1.073, + "step": 901 + }, + { + "epoch": 0.14725929553895759, + "grad_norm": 2.3368642330169678, + "learning_rate": 1.8040000000000003e-05, + "loss": 1.0805, + "step": 902 + }, + { + "epoch": 0.14742255418146197, + "grad_norm": 2.1584417819976807, + "learning_rate": 1.8060000000000003e-05, + "loss": 1.015, + "step": 903 + }, + { + "epoch": 0.14758581282396638, + "grad_norm": 2.605186939239502, + "learning_rate": 1.8080000000000003e-05, + "loss": 1.0982, + "step": 904 + }, + { + "epoch": 0.14774907146647076, + "grad_norm": 2.4269142150878906, + "learning_rate": 1.8100000000000003e-05, + "loss": 0.9702, + "step": 905 + }, + { + "epoch": 0.14791233010897514, + "grad_norm": 2.6633119583129883, + "learning_rate": 1.8120000000000003e-05, + "loss": 1.282, + "step": 906 + }, + { + "epoch": 0.14807558875147953, + "grad_norm": 2.067863702774048, + "learning_rate": 1.8140000000000003e-05, + "loss": 0.9699, + "step": 907 + }, + { + "epoch": 0.1482388473939839, + "grad_norm": 2.3342347145080566, + "learning_rate": 1.8160000000000002e-05, + "loss": 1.105, + "step": 908 + }, + { + "epoch": 0.14840210603648832, + "grad_norm": 2.120023250579834, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.9601, + "step": 909 + }, + { + "epoch": 0.1485653646789927, + "grad_norm": 2.340888500213623, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.2264, + "step": 910 + }, + { + "epoch": 0.14872862332149708, + "grad_norm": 2.3110930919647217, + "learning_rate": 1.8220000000000002e-05, + "loss": 1.1465, + "step": 911 + }, + { + "epoch": 0.14889188196400147, + "grad_norm": 2.3530757427215576, + "learning_rate": 1.824e-05, + "loss": 1.0401, + "step": 912 + }, + { + "epoch": 0.14905514060650585, + "grad_norm": 2.5621514320373535, + "learning_rate": 1.826e-05, + "loss": 1.1464, + "step": 913 + }, + { + "epoch": 0.14921839924901023, + "grad_norm": 2.6963536739349365, + "learning_rate": 1.828e-05, + "loss": 1.6817, + "step": 914 + }, + { + "epoch": 0.14938165789151464, + "grad_norm": 2.5850436687469482, + "learning_rate": 1.83e-05, + "loss": 1.2472, + "step": 915 + }, + { + "epoch": 0.14954491653401902, + "grad_norm": 2.27846622467041, + "learning_rate": 1.832e-05, + "loss": 1.088, + "step": 916 + }, + { + "epoch": 0.1497081751765234, + "grad_norm": 2.473759412765503, + "learning_rate": 1.834e-05, + "loss": 1.0865, + "step": 917 + }, + { + "epoch": 0.1498714338190278, + "grad_norm": 2.447110176086426, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.1901, + "step": 918 + }, + { + "epoch": 0.15003469246153217, + "grad_norm": 2.13765549659729, + "learning_rate": 1.8380000000000004e-05, + "loss": 1.1359, + "step": 919 + }, + { + "epoch": 0.15019795110403658, + "grad_norm": 2.4469408988952637, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.9667, + "step": 920 + }, + { + "epoch": 0.15036120974654096, + "grad_norm": 2.3996708393096924, + "learning_rate": 1.8420000000000003e-05, + "loss": 1.0945, + "step": 921 + }, + { + "epoch": 0.15052446838904535, + "grad_norm": 2.118812084197998, + "learning_rate": 1.8440000000000003e-05, + "loss": 1.1095, + "step": 922 + }, + { + "epoch": 0.15068772703154973, + "grad_norm": 2.3676974773406982, + "learning_rate": 1.8460000000000003e-05, + "loss": 1.0102, + "step": 923 + }, + { + "epoch": 0.1508509856740541, + "grad_norm": 2.2139267921447754, + "learning_rate": 1.8480000000000003e-05, + "loss": 0.9487, + "step": 924 + }, + { + "epoch": 0.15101424431655852, + "grad_norm": 2.4325449466705322, + "learning_rate": 1.8500000000000002e-05, + "loss": 1.0409, + "step": 925 + }, + { + "epoch": 0.1511775029590629, + "grad_norm": 2.704031229019165, + "learning_rate": 1.8520000000000002e-05, + "loss": 1.0144, + "step": 926 + }, + { + "epoch": 0.1513407616015673, + "grad_norm": 2.346242904663086, + "learning_rate": 1.8540000000000002e-05, + "loss": 1.0091, + "step": 927 + }, + { + "epoch": 0.15150402024407167, + "grad_norm": 2.290635824203491, + "learning_rate": 1.8560000000000002e-05, + "loss": 1.1726, + "step": 928 + }, + { + "epoch": 0.15166727888657605, + "grad_norm": 2.4403936862945557, + "learning_rate": 1.858e-05, + "loss": 1.1241, + "step": 929 + }, + { + "epoch": 0.15183053752908043, + "grad_norm": 2.4562816619873047, + "learning_rate": 1.86e-05, + "loss": 1.1553, + "step": 930 + }, + { + "epoch": 0.15199379617158484, + "grad_norm": 2.367035388946533, + "learning_rate": 1.862e-05, + "loss": 0.9737, + "step": 931 + }, + { + "epoch": 0.15215705481408923, + "grad_norm": 2.527221202850342, + "learning_rate": 1.864e-05, + "loss": 1.2389, + "step": 932 + }, + { + "epoch": 0.1523203134565936, + "grad_norm": 2.767686128616333, + "learning_rate": 1.866e-05, + "loss": 1.0117, + "step": 933 + }, + { + "epoch": 0.152483572099098, + "grad_norm": 3.014974355697632, + "learning_rate": 1.8680000000000004e-05, + "loss": 1.0793, + "step": 934 + }, + { + "epoch": 0.15264683074160237, + "grad_norm": 2.3546905517578125, + "learning_rate": 1.8700000000000004e-05, + "loss": 1.009, + "step": 935 + }, + { + "epoch": 0.15281008938410678, + "grad_norm": 2.686230421066284, + "learning_rate": 1.8720000000000004e-05, + "loss": 1.8782, + "step": 936 + }, + { + "epoch": 0.15297334802661117, + "grad_norm": 2.307716131210327, + "learning_rate": 1.8740000000000004e-05, + "loss": 1.0153, + "step": 937 + }, + { + "epoch": 0.15313660666911555, + "grad_norm": 2.2142722606658936, + "learning_rate": 1.876e-05, + "loss": 0.9678, + "step": 938 + }, + { + "epoch": 0.15329986531161993, + "grad_norm": 2.4044694900512695, + "learning_rate": 1.878e-05, + "loss": 0.8921, + "step": 939 + }, + { + "epoch": 0.15346312395412431, + "grad_norm": 2.0629494190216064, + "learning_rate": 1.88e-05, + "loss": 0.9116, + "step": 940 + }, + { + "epoch": 0.1536263825966287, + "grad_norm": 2.3467674255371094, + "learning_rate": 1.882e-05, + "loss": 0.9945, + "step": 941 + }, + { + "epoch": 0.1537896412391331, + "grad_norm": 2.241781234741211, + "learning_rate": 1.884e-05, + "loss": 1.0151, + "step": 942 + }, + { + "epoch": 0.1539528998816375, + "grad_norm": 2.0616438388824463, + "learning_rate": 1.886e-05, + "loss": 0.9926, + "step": 943 + }, + { + "epoch": 0.15411615852414187, + "grad_norm": 2.4432897567749023, + "learning_rate": 1.8880000000000002e-05, + "loss": 0.9447, + "step": 944 + }, + { + "epoch": 0.15427941716664625, + "grad_norm": 2.038989782333374, + "learning_rate": 1.8900000000000002e-05, + "loss": 1.0205, + "step": 945 + }, + { + "epoch": 0.15444267580915064, + "grad_norm": 2.1921145915985107, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.9221, + "step": 946 + }, + { + "epoch": 0.15460593445165505, + "grad_norm": 2.7732040882110596, + "learning_rate": 1.894e-05, + "loss": 1.0628, + "step": 947 + }, + { + "epoch": 0.15476919309415943, + "grad_norm": 2.082597255706787, + "learning_rate": 1.896e-05, + "loss": 0.8795, + "step": 948 + }, + { + "epoch": 0.1549324517366638, + "grad_norm": 2.2344720363616943, + "learning_rate": 1.898e-05, + "loss": 0.9981, + "step": 949 + }, + { + "epoch": 0.1550957103791682, + "grad_norm": 2.1080291271209717, + "learning_rate": 1.9e-05, + "loss": 0.8675, + "step": 950 + }, + { + "epoch": 0.15525896902167258, + "grad_norm": 2.2417209148406982, + "learning_rate": 1.902e-05, + "loss": 0.8967, + "step": 951 + }, + { + "epoch": 0.15542222766417696, + "grad_norm": 2.2811005115509033, + "learning_rate": 1.904e-05, + "loss": 1.0032, + "step": 952 + }, + { + "epoch": 0.15558548630668137, + "grad_norm": 2.200798273086548, + "learning_rate": 1.906e-05, + "loss": 1.0215, + "step": 953 + }, + { + "epoch": 0.15574874494918575, + "grad_norm": 2.081390142440796, + "learning_rate": 1.908e-05, + "loss": 0.9008, + "step": 954 + }, + { + "epoch": 0.15591200359169013, + "grad_norm": 2.841245412826538, + "learning_rate": 1.91e-05, + "loss": 1.1497, + "step": 955 + }, + { + "epoch": 0.15607526223419452, + "grad_norm": 2.388960123062134, + "learning_rate": 1.912e-05, + "loss": 1.278, + "step": 956 + }, + { + "epoch": 0.1562385208766989, + "grad_norm": 2.2032625675201416, + "learning_rate": 1.914e-05, + "loss": 1.0945, + "step": 957 + }, + { + "epoch": 0.1564017795192033, + "grad_norm": 2.2127585411071777, + "learning_rate": 1.916e-05, + "loss": 0.888, + "step": 958 + }, + { + "epoch": 0.1565650381617077, + "grad_norm": 2.2445883750915527, + "learning_rate": 1.918e-05, + "loss": 1.0393, + "step": 959 + }, + { + "epoch": 0.15672829680421207, + "grad_norm": 2.3601393699645996, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.0214, + "step": 960 + }, + { + "epoch": 0.15689155544671646, + "grad_norm": 2.6366095542907715, + "learning_rate": 1.9220000000000002e-05, + "loss": 1.1965, + "step": 961 + }, + { + "epoch": 0.15705481408922084, + "grad_norm": 2.335256338119507, + "learning_rate": 1.9240000000000002e-05, + "loss": 1.1656, + "step": 962 + }, + { + "epoch": 0.15721807273172522, + "grad_norm": 2.935983896255493, + "learning_rate": 1.9260000000000002e-05, + "loss": 1.1075, + "step": 963 + }, + { + "epoch": 0.15738133137422963, + "grad_norm": 2.0931265354156494, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.9205, + "step": 964 + }, + { + "epoch": 0.15754459001673402, + "grad_norm": 2.435394287109375, + "learning_rate": 1.93e-05, + "loss": 1.1145, + "step": 965 + }, + { + "epoch": 0.1577078486592384, + "grad_norm": 2.206242322921753, + "learning_rate": 1.932e-05, + "loss": 1.1818, + "step": 966 + }, + { + "epoch": 0.15787110730174278, + "grad_norm": 2.281053066253662, + "learning_rate": 1.934e-05, + "loss": 0.9596, + "step": 967 + }, + { + "epoch": 0.15803436594424716, + "grad_norm": 2.3303661346435547, + "learning_rate": 1.936e-05, + "loss": 1.0887, + "step": 968 + }, + { + "epoch": 0.15819762458675157, + "grad_norm": 1.9661091566085815, + "learning_rate": 1.938e-05, + "loss": 0.9627, + "step": 969 + }, + { + "epoch": 0.15836088322925596, + "grad_norm": 2.201622724533081, + "learning_rate": 1.94e-05, + "loss": 0.9263, + "step": 970 + }, + { + "epoch": 0.15852414187176034, + "grad_norm": 2.303248882293701, + "learning_rate": 1.942e-05, + "loss": 1.0863, + "step": 971 + }, + { + "epoch": 0.15868740051426472, + "grad_norm": 2.137838840484619, + "learning_rate": 1.944e-05, + "loss": 0.9642, + "step": 972 + }, + { + "epoch": 0.1588506591567691, + "grad_norm": 2.151686906814575, + "learning_rate": 1.946e-05, + "loss": 1.0841, + "step": 973 + }, + { + "epoch": 0.15901391779927349, + "grad_norm": 2.3956427574157715, + "learning_rate": 1.948e-05, + "loss": 0.9505, + "step": 974 + }, + { + "epoch": 0.1591771764417779, + "grad_norm": 2.575671911239624, + "learning_rate": 1.95e-05, + "loss": 1.1096, + "step": 975 + }, + { + "epoch": 0.15934043508428228, + "grad_norm": 2.334066867828369, + "learning_rate": 1.9520000000000003e-05, + "loss": 1.006, + "step": 976 + }, + { + "epoch": 0.15950369372678666, + "grad_norm": 2.7651143074035645, + "learning_rate": 1.9540000000000003e-05, + "loss": 0.8839, + "step": 977 + }, + { + "epoch": 0.15966695236929104, + "grad_norm": 2.4744086265563965, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.9218, + "step": 978 + }, + { + "epoch": 0.15983021101179543, + "grad_norm": 2.304168224334717, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.979, + "step": 979 + }, + { + "epoch": 0.15999346965429984, + "grad_norm": 3.1773602962493896, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.8332, + "step": 980 + }, + { + "epoch": 0.16015672829680422, + "grad_norm": 2.3833625316619873, + "learning_rate": 1.9620000000000002e-05, + "loss": 1.0549, + "step": 981 + }, + { + "epoch": 0.1603199869393086, + "grad_norm": 2.2018184661865234, + "learning_rate": 1.9640000000000002e-05, + "loss": 1.0805, + "step": 982 + }, + { + "epoch": 0.16048324558181298, + "grad_norm": 2.5941967964172363, + "learning_rate": 1.966e-05, + "loss": 1.0817, + "step": 983 + }, + { + "epoch": 0.16064650422431737, + "grad_norm": 2.1696386337280273, + "learning_rate": 1.968e-05, + "loss": 1.0285, + "step": 984 + }, + { + "epoch": 0.16080976286682178, + "grad_norm": 2.133671283721924, + "learning_rate": 1.97e-05, + "loss": 0.9338, + "step": 985 + }, + { + "epoch": 0.16097302150932616, + "grad_norm": 2.1205811500549316, + "learning_rate": 1.972e-05, + "loss": 0.984, + "step": 986 + }, + { + "epoch": 0.16113628015183054, + "grad_norm": 2.0616884231567383, + "learning_rate": 1.974e-05, + "loss": 0.8192, + "step": 987 + }, + { + "epoch": 0.16129953879433492, + "grad_norm": 2.1773252487182617, + "learning_rate": 1.976e-05, + "loss": 1.0812, + "step": 988 + }, + { + "epoch": 0.1614627974368393, + "grad_norm": 2.3835434913635254, + "learning_rate": 1.978e-05, + "loss": 1.0698, + "step": 989 + }, + { + "epoch": 0.1616260560793437, + "grad_norm": 2.598984479904175, + "learning_rate": 1.98e-05, + "loss": 1.2068, + "step": 990 + }, + { + "epoch": 0.1617893147218481, + "grad_norm": 2.163893699645996, + "learning_rate": 1.982e-05, + "loss": 1.1473, + "step": 991 + }, + { + "epoch": 0.16195257336435248, + "grad_norm": 2.4261348247528076, + "learning_rate": 1.9840000000000003e-05, + "loss": 1.0327, + "step": 992 + }, + { + "epoch": 0.16211583200685686, + "grad_norm": 2.2704286575317383, + "learning_rate": 1.9860000000000003e-05, + "loss": 1.2738, + "step": 993 + }, + { + "epoch": 0.16227909064936125, + "grad_norm": 2.5879311561584473, + "learning_rate": 1.9880000000000003e-05, + "loss": 1.1713, + "step": 994 + }, + { + "epoch": 0.16244234929186563, + "grad_norm": 2.484565496444702, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.112, + "step": 995 + }, + { + "epoch": 0.16260560793437004, + "grad_norm": 2.3270070552825928, + "learning_rate": 1.9920000000000002e-05, + "loss": 0.9787, + "step": 996 + }, + { + "epoch": 0.16276886657687442, + "grad_norm": 2.4373741149902344, + "learning_rate": 1.9940000000000002e-05, + "loss": 0.8845, + "step": 997 + }, + { + "epoch": 0.1629321252193788, + "grad_norm": 2.1516246795654297, + "learning_rate": 1.9960000000000002e-05, + "loss": 1.0106, + "step": 998 + }, + { + "epoch": 0.1630953838618832, + "grad_norm": 2.3449010848999023, + "learning_rate": 1.9980000000000002e-05, + "loss": 1.0922, + "step": 999 + }, + { + "epoch": 0.16325864250438757, + "grad_norm": 2.336559295654297, + "learning_rate": 2e-05, + "loss": 1.1378, + "step": 1000 + }, + { + "epoch": 0.16342190114689195, + "grad_norm": 2.3514554500579834, + "learning_rate": 1.9999999994965004e-05, + "loss": 1.0644, + "step": 1001 + }, + { + "epoch": 0.16358515978939636, + "grad_norm": 2.2715625762939453, + "learning_rate": 1.9999999979860004e-05, + "loss": 1.0649, + "step": 1002 + }, + { + "epoch": 0.16374841843190074, + "grad_norm": 2.4589476585388184, + "learning_rate": 1.9999999954685013e-05, + "loss": 1.0996, + "step": 1003 + }, + { + "epoch": 0.16391167707440513, + "grad_norm": 2.614118814468384, + "learning_rate": 1.9999999919440023e-05, + "loss": 1.2055, + "step": 1004 + }, + { + "epoch": 0.1640749357169095, + "grad_norm": 2.2623252868652344, + "learning_rate": 1.9999999874125034e-05, + "loss": 0.9441, + "step": 1005 + }, + { + "epoch": 0.1642381943594139, + "grad_norm": 2.55422306060791, + "learning_rate": 1.999999981874005e-05, + "loss": 1.0869, + "step": 1006 + }, + { + "epoch": 0.1644014530019183, + "grad_norm": 2.1740806102752686, + "learning_rate": 1.9999999753285067e-05, + "loss": 0.9363, + "step": 1007 + }, + { + "epoch": 0.16456471164442268, + "grad_norm": 2.140511989593506, + "learning_rate": 1.9999999677760086e-05, + "loss": 0.9024, + "step": 1008 + }, + { + "epoch": 0.16472797028692707, + "grad_norm": 2.785182237625122, + "learning_rate": 1.999999959216511e-05, + "loss": 1.0512, + "step": 1009 + }, + { + "epoch": 0.16489122892943145, + "grad_norm": 2.6590869426727295, + "learning_rate": 1.9999999496500138e-05, + "loss": 1.129, + "step": 1010 + }, + { + "epoch": 0.16505448757193583, + "grad_norm": 3.057839870452881, + "learning_rate": 1.9999999390765168e-05, + "loss": 1.2267, + "step": 1011 + }, + { + "epoch": 0.16521774621444021, + "grad_norm": 2.4520065784454346, + "learning_rate": 1.99999992749602e-05, + "loss": 0.9163, + "step": 1012 + }, + { + "epoch": 0.16538100485694462, + "grad_norm": 2.593477487564087, + "learning_rate": 1.9999999149085238e-05, + "loss": 1.0129, + "step": 1013 + }, + { + "epoch": 0.165544263499449, + "grad_norm": 2.6609976291656494, + "learning_rate": 1.999999901314028e-05, + "loss": 1.1978, + "step": 1014 + }, + { + "epoch": 0.1657075221419534, + "grad_norm": 2.2153432369232178, + "learning_rate": 1.999999886712532e-05, + "loss": 1.022, + "step": 1015 + }, + { + "epoch": 0.16587078078445777, + "grad_norm": 2.2598860263824463, + "learning_rate": 1.999999871104037e-05, + "loss": 1.1382, + "step": 1016 + }, + { + "epoch": 0.16603403942696215, + "grad_norm": 2.4949610233306885, + "learning_rate": 1.9999998544885422e-05, + "loss": 1.0505, + "step": 1017 + }, + { + "epoch": 0.16619729806946656, + "grad_norm": 2.293769121170044, + "learning_rate": 1.9999998368660475e-05, + "loss": 1.1003, + "step": 1018 + }, + { + "epoch": 0.16636055671197095, + "grad_norm": 2.0951051712036133, + "learning_rate": 1.9999998182365536e-05, + "loss": 1.0586, + "step": 1019 + }, + { + "epoch": 0.16652381535447533, + "grad_norm": 2.4665348529815674, + "learning_rate": 1.9999997986000598e-05, + "loss": 1.0945, + "step": 1020 + }, + { + "epoch": 0.1666870739969797, + "grad_norm": 2.3877999782562256, + "learning_rate": 1.999999777956567e-05, + "loss": 1.0852, + "step": 1021 + }, + { + "epoch": 0.1668503326394841, + "grad_norm": 2.138324499130249, + "learning_rate": 1.9999997563060744e-05, + "loss": 1.1416, + "step": 1022 + }, + { + "epoch": 0.16701359128198848, + "grad_norm": 2.38706111907959, + "learning_rate": 1.999999733648582e-05, + "loss": 0.978, + "step": 1023 + }, + { + "epoch": 0.1671768499244929, + "grad_norm": 2.095360040664673, + "learning_rate": 1.9999997099840905e-05, + "loss": 1.0607, + "step": 1024 + }, + { + "epoch": 0.16734010856699727, + "grad_norm": 2.2453858852386475, + "learning_rate": 1.9999996853125995e-05, + "loss": 1.0933, + "step": 1025 + }, + { + "epoch": 0.16750336720950165, + "grad_norm": 2.563276767730713, + "learning_rate": 1.9999996596341092e-05, + "loss": 1.159, + "step": 1026 + }, + { + "epoch": 0.16766662585200603, + "grad_norm": 2.481497049331665, + "learning_rate": 1.9999996329486195e-05, + "loss": 1.5889, + "step": 1027 + }, + { + "epoch": 0.16782988449451042, + "grad_norm": 2.391479253768921, + "learning_rate": 1.9999996052561302e-05, + "loss": 1.2435, + "step": 1028 + }, + { + "epoch": 0.16799314313701483, + "grad_norm": 2.746399164199829, + "learning_rate": 1.9999995765566414e-05, + "loss": 1.3302, + "step": 1029 + }, + { + "epoch": 0.1681564017795192, + "grad_norm": 2.409726142883301, + "learning_rate": 1.9999995468501538e-05, + "loss": 1.1857, + "step": 1030 + }, + { + "epoch": 0.1683196604220236, + "grad_norm": 2.2544591426849365, + "learning_rate": 1.999999516136667e-05, + "loss": 1.0864, + "step": 1031 + }, + { + "epoch": 0.16848291906452798, + "grad_norm": 2.6994612216949463, + "learning_rate": 1.9999994844161802e-05, + "loss": 1.1999, + "step": 1032 + }, + { + "epoch": 0.16864617770703236, + "grad_norm": 1.8875113725662231, + "learning_rate": 1.9999994516886947e-05, + "loss": 0.922, + "step": 1033 + }, + { + "epoch": 0.16880943634953674, + "grad_norm": 2.148346185684204, + "learning_rate": 1.99999941795421e-05, + "loss": 1.0054, + "step": 1034 + }, + { + "epoch": 0.16897269499204115, + "grad_norm": 1.928247094154358, + "learning_rate": 1.999999383212726e-05, + "loss": 0.9847, + "step": 1035 + }, + { + "epoch": 0.16913595363454553, + "grad_norm": 2.3509058952331543, + "learning_rate": 1.999999347464243e-05, + "loss": 1.1257, + "step": 1036 + }, + { + "epoch": 0.16929921227704992, + "grad_norm": 2.087721824645996, + "learning_rate": 1.999999310708761e-05, + "loss": 0.9637, + "step": 1037 + }, + { + "epoch": 0.1694624709195543, + "grad_norm": 2.2591004371643066, + "learning_rate": 1.99999927294628e-05, + "loss": 0.9033, + "step": 1038 + }, + { + "epoch": 0.16962572956205868, + "grad_norm": 2.5685207843780518, + "learning_rate": 1.9999992341767995e-05, + "loss": 1.0999, + "step": 1039 + }, + { + "epoch": 0.1697889882045631, + "grad_norm": 2.15116286277771, + "learning_rate": 1.9999991944003204e-05, + "loss": 1.1206, + "step": 1040 + }, + { + "epoch": 0.16995224684706747, + "grad_norm": 2.097074508666992, + "learning_rate": 1.9999991536168424e-05, + "loss": 0.9521, + "step": 1041 + }, + { + "epoch": 0.17011550548957186, + "grad_norm": 2.128910541534424, + "learning_rate": 1.9999991118263655e-05, + "loss": 0.845, + "step": 1042 + }, + { + "epoch": 0.17027876413207624, + "grad_norm": 2.199960708618164, + "learning_rate": 1.9999990690288898e-05, + "loss": 1.0856, + "step": 1043 + }, + { + "epoch": 0.17044202277458062, + "grad_norm": 2.304919719696045, + "learning_rate": 1.9999990252244153e-05, + "loss": 1.0568, + "step": 1044 + }, + { + "epoch": 0.17060528141708503, + "grad_norm": 2.1934592723846436, + "learning_rate": 1.999998980412942e-05, + "loss": 1.1594, + "step": 1045 + }, + { + "epoch": 0.1707685400595894, + "grad_norm": 2.3597006797790527, + "learning_rate": 1.99999893459447e-05, + "loss": 1.0172, + "step": 1046 + }, + { + "epoch": 0.1709317987020938, + "grad_norm": 2.3874928951263428, + "learning_rate": 1.9999988877689992e-05, + "loss": 1.0739, + "step": 1047 + }, + { + "epoch": 0.17109505734459818, + "grad_norm": 2.2946035861968994, + "learning_rate": 1.99999883993653e-05, + "loss": 0.8877, + "step": 1048 + }, + { + "epoch": 0.17125831598710256, + "grad_norm": 2.2626149654388428, + "learning_rate": 1.999998791097062e-05, + "loss": 1.1662, + "step": 1049 + }, + { + "epoch": 0.17142157462960694, + "grad_norm": 2.1669869422912598, + "learning_rate": 1.9999987412505956e-05, + "loss": 0.9913, + "step": 1050 + }, + { + "epoch": 0.17158483327211135, + "grad_norm": 2.5278143882751465, + "learning_rate": 1.999998690397131e-05, + "loss": 1.1662, + "step": 1051 + }, + { + "epoch": 0.17174809191461574, + "grad_norm": 2.166165828704834, + "learning_rate": 1.9999986385366675e-05, + "loss": 1.0432, + "step": 1052 + }, + { + "epoch": 0.17191135055712012, + "grad_norm": 2.134746551513672, + "learning_rate": 1.9999985856692058e-05, + "loss": 1.0275, + "step": 1053 + }, + { + "epoch": 0.1720746091996245, + "grad_norm": 2.0434696674346924, + "learning_rate": 1.9999985317947458e-05, + "loss": 0.9469, + "step": 1054 + }, + { + "epoch": 0.17223786784212888, + "grad_norm": 2.1447222232818604, + "learning_rate": 1.999998476913288e-05, + "loss": 1.0476, + "step": 1055 + }, + { + "epoch": 0.1724011264846333, + "grad_norm": 2.585195541381836, + "learning_rate": 1.9999984210248314e-05, + "loss": 1.1478, + "step": 1056 + }, + { + "epoch": 0.17256438512713768, + "grad_norm": 2.3194594383239746, + "learning_rate": 1.999998364129377e-05, + "loss": 0.9964, + "step": 1057 + }, + { + "epoch": 0.17272764376964206, + "grad_norm": 2.095606565475464, + "learning_rate": 1.9999983062269243e-05, + "loss": 1.0483, + "step": 1058 + }, + { + "epoch": 0.17289090241214644, + "grad_norm": 2.25608229637146, + "learning_rate": 1.9999982473174738e-05, + "loss": 1.1089, + "step": 1059 + }, + { + "epoch": 0.17305416105465082, + "grad_norm": 2.320568561553955, + "learning_rate": 1.999998187401025e-05, + "loss": 1.0553, + "step": 1060 + }, + { + "epoch": 0.1732174196971552, + "grad_norm": 2.3252997398376465, + "learning_rate": 1.9999981264775784e-05, + "loss": 1.1983, + "step": 1061 + }, + { + "epoch": 0.17338067833965962, + "grad_norm": 3.0871660709381104, + "learning_rate": 1.999998064547134e-05, + "loss": 1.2339, + "step": 1062 + }, + { + "epoch": 0.173543936982164, + "grad_norm": 2.380835771560669, + "learning_rate": 1.999998001609692e-05, + "loss": 1.0403, + "step": 1063 + }, + { + "epoch": 0.17370719562466838, + "grad_norm": 2.651228666305542, + "learning_rate": 1.999997937665252e-05, + "loss": 0.916, + "step": 1064 + }, + { + "epoch": 0.17387045426717276, + "grad_norm": 2.321899652481079, + "learning_rate": 1.9999978727138146e-05, + "loss": 1.1296, + "step": 1065 + }, + { + "epoch": 0.17403371290967715, + "grad_norm": 2.0351624488830566, + "learning_rate": 1.9999978067553796e-05, + "loss": 1.026, + "step": 1066 + }, + { + "epoch": 0.17419697155218156, + "grad_norm": 2.510901689529419, + "learning_rate": 1.999997739789947e-05, + "loss": 0.9268, + "step": 1067 + }, + { + "epoch": 0.17436023019468594, + "grad_norm": 2.541374683380127, + "learning_rate": 1.9999976718175166e-05, + "loss": 1.0569, + "step": 1068 + }, + { + "epoch": 0.17452348883719032, + "grad_norm": 2.503784418106079, + "learning_rate": 1.999997602838089e-05, + "loss": 1.1805, + "step": 1069 + }, + { + "epoch": 0.1746867474796947, + "grad_norm": 2.244020938873291, + "learning_rate": 1.9999975328516643e-05, + "loss": 1.0966, + "step": 1070 + }, + { + "epoch": 0.1748500061221991, + "grad_norm": 2.1564693450927734, + "learning_rate": 1.999997461858242e-05, + "loss": 1.1774, + "step": 1071 + }, + { + "epoch": 0.17501326476470347, + "grad_norm": 1.9784401655197144, + "learning_rate": 1.999997389857823e-05, + "loss": 0.9598, + "step": 1072 + }, + { + "epoch": 0.17517652340720788, + "grad_norm": 2.2448673248291016, + "learning_rate": 1.9999973168504067e-05, + "loss": 0.9912, + "step": 1073 + }, + { + "epoch": 0.17533978204971226, + "grad_norm": 2.1335999965667725, + "learning_rate": 1.9999972428359932e-05, + "loss": 0.9893, + "step": 1074 + }, + { + "epoch": 0.17550304069221664, + "grad_norm": 2.3255131244659424, + "learning_rate": 1.999997167814583e-05, + "loss": 1.0537, + "step": 1075 + }, + { + "epoch": 0.17566629933472103, + "grad_norm": 2.0237951278686523, + "learning_rate": 1.9999970917861757e-05, + "loss": 1.0538, + "step": 1076 + }, + { + "epoch": 0.1758295579772254, + "grad_norm": 2.1195428371429443, + "learning_rate": 1.9999970147507714e-05, + "loss": 1.0867, + "step": 1077 + }, + { + "epoch": 0.17599281661972982, + "grad_norm": 1.9172940254211426, + "learning_rate": 1.999996936708371e-05, + "loss": 0.8206, + "step": 1078 + }, + { + "epoch": 0.1761560752622342, + "grad_norm": 2.0909342765808105, + "learning_rate": 1.999996857658973e-05, + "loss": 1.0318, + "step": 1079 + }, + { + "epoch": 0.17631933390473858, + "grad_norm": 2.5228865146636963, + "learning_rate": 1.9999967776025794e-05, + "loss": 1.0434, + "step": 1080 + }, + { + "epoch": 0.17648259254724297, + "grad_norm": 2.6171646118164062, + "learning_rate": 1.999996696539189e-05, + "loss": 1.0133, + "step": 1081 + }, + { + "epoch": 0.17664585118974735, + "grad_norm": 2.546786069869995, + "learning_rate": 1.9999966144688022e-05, + "loss": 0.998, + "step": 1082 + }, + { + "epoch": 0.17680910983225173, + "grad_norm": 2.314805030822754, + "learning_rate": 1.9999965313914187e-05, + "loss": 1.2567, + "step": 1083 + }, + { + "epoch": 0.17697236847475614, + "grad_norm": 2.327486515045166, + "learning_rate": 1.999996447307039e-05, + "loss": 0.9973, + "step": 1084 + }, + { + "epoch": 0.17713562711726052, + "grad_norm": 2.1672661304473877, + "learning_rate": 1.9999963622156637e-05, + "loss": 0.8303, + "step": 1085 + }, + { + "epoch": 0.1772988857597649, + "grad_norm": 2.19378662109375, + "learning_rate": 1.9999962761172918e-05, + "loss": 0.9721, + "step": 1086 + }, + { + "epoch": 0.1774621444022693, + "grad_norm": 2.725313901901245, + "learning_rate": 1.9999961890119245e-05, + "loss": 1.3225, + "step": 1087 + }, + { + "epoch": 0.17762540304477367, + "grad_norm": 2.0173120498657227, + "learning_rate": 1.9999961008995607e-05, + "loss": 1.0302, + "step": 1088 + }, + { + "epoch": 0.17778866168727808, + "grad_norm": 2.1488540172576904, + "learning_rate": 1.9999960117802014e-05, + "loss": 1.0383, + "step": 1089 + }, + { + "epoch": 0.17795192032978246, + "grad_norm": 2.299060583114624, + "learning_rate": 1.9999959216538463e-05, + "loss": 1.0007, + "step": 1090 + }, + { + "epoch": 0.17811517897228685, + "grad_norm": 2.6042659282684326, + "learning_rate": 1.9999958305204955e-05, + "loss": 1.2325, + "step": 1091 + }, + { + "epoch": 0.17827843761479123, + "grad_norm": 1.9889030456542969, + "learning_rate": 1.9999957383801495e-05, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.1784416962572956, + "grad_norm": 2.363703966140747, + "learning_rate": 1.9999956452328077e-05, + "loss": 1.1493, + "step": 1093 + }, + { + "epoch": 0.1786049548998, + "grad_norm": 2.4196512699127197, + "learning_rate": 1.9999955510784705e-05, + "loss": 1.054, + "step": 1094 + }, + { + "epoch": 0.1787682135423044, + "grad_norm": 2.3570573329925537, + "learning_rate": 1.999995455917138e-05, + "loss": 1.1088, + "step": 1095 + }, + { + "epoch": 0.1789314721848088, + "grad_norm": 2.4296398162841797, + "learning_rate": 1.9999953597488106e-05, + "loss": 0.9603, + "step": 1096 + }, + { + "epoch": 0.17909473082731317, + "grad_norm": 2.5928080081939697, + "learning_rate": 1.9999952625734884e-05, + "loss": 1.1198, + "step": 1097 + }, + { + "epoch": 0.17925798946981755, + "grad_norm": 2.5594751834869385, + "learning_rate": 1.9999951643911706e-05, + "loss": 0.9519, + "step": 1098 + }, + { + "epoch": 0.17942124811232193, + "grad_norm": 2.5569941997528076, + "learning_rate": 1.9999950652018585e-05, + "loss": 1.1821, + "step": 1099 + }, + { + "epoch": 0.17958450675482635, + "grad_norm": 1.89167058467865, + "learning_rate": 1.9999949650055512e-05, + "loss": 0.9435, + "step": 1100 + }, + { + "epoch": 0.17974776539733073, + "grad_norm": 1.994001865386963, + "learning_rate": 1.9999948638022495e-05, + "loss": 1.0539, + "step": 1101 + }, + { + "epoch": 0.1799110240398351, + "grad_norm": 2.433737277984619, + "learning_rate": 1.999994761591953e-05, + "loss": 1.1752, + "step": 1102 + }, + { + "epoch": 0.1800742826823395, + "grad_norm": 1.8177391290664673, + "learning_rate": 1.999994658374662e-05, + "loss": 0.8957, + "step": 1103 + }, + { + "epoch": 0.18023754132484388, + "grad_norm": 2.210867404937744, + "learning_rate": 1.999994554150377e-05, + "loss": 0.945, + "step": 1104 + }, + { + "epoch": 0.18040079996734826, + "grad_norm": 2.2383158206939697, + "learning_rate": 1.9999944489190975e-05, + "loss": 1.2078, + "step": 1105 + }, + { + "epoch": 0.18056405860985267, + "grad_norm": 2.2448253631591797, + "learning_rate": 1.999994342680824e-05, + "loss": 1.1679, + "step": 1106 + }, + { + "epoch": 0.18072731725235705, + "grad_norm": 2.525371789932251, + "learning_rate": 1.9999942354355566e-05, + "loss": 1.2366, + "step": 1107 + }, + { + "epoch": 0.18089057589486143, + "grad_norm": 2.216412305831909, + "learning_rate": 1.999994127183295e-05, + "loss": 1.0239, + "step": 1108 + }, + { + "epoch": 0.18105383453736582, + "grad_norm": 2.325221061706543, + "learning_rate": 1.9999940179240395e-05, + "loss": 1.1005, + "step": 1109 + }, + { + "epoch": 0.1812170931798702, + "grad_norm": 2.2306694984436035, + "learning_rate": 1.9999939076577906e-05, + "loss": 1.1815, + "step": 1110 + }, + { + "epoch": 0.1813803518223746, + "grad_norm": 2.216809034347534, + "learning_rate": 1.9999937963845478e-05, + "loss": 1.0072, + "step": 1111 + }, + { + "epoch": 0.181543610464879, + "grad_norm": 2.0775082111358643, + "learning_rate": 1.9999936841043116e-05, + "loss": 1.0162, + "step": 1112 + }, + { + "epoch": 0.18170686910738337, + "grad_norm": 2.219434976577759, + "learning_rate": 1.999993570817082e-05, + "loss": 1.0333, + "step": 1113 + }, + { + "epoch": 0.18187012774988776, + "grad_norm": 2.2550816535949707, + "learning_rate": 1.9999934565228594e-05, + "loss": 1.0297, + "step": 1114 + }, + { + "epoch": 0.18203338639239214, + "grad_norm": 1.8956637382507324, + "learning_rate": 1.9999933412216436e-05, + "loss": 0.9578, + "step": 1115 + }, + { + "epoch": 0.18219664503489655, + "grad_norm": 2.551481246948242, + "learning_rate": 1.9999932249134347e-05, + "loss": 1.0417, + "step": 1116 + }, + { + "epoch": 0.18235990367740093, + "grad_norm": 2.552155017852783, + "learning_rate": 1.9999931075982328e-05, + "loss": 1.1275, + "step": 1117 + }, + { + "epoch": 0.1825231623199053, + "grad_norm": 2.355808734893799, + "learning_rate": 1.999992989276038e-05, + "loss": 1.0488, + "step": 1118 + }, + { + "epoch": 0.1826864209624097, + "grad_norm": 2.86993670463562, + "learning_rate": 1.9999928699468506e-05, + "loss": 0.8156, + "step": 1119 + }, + { + "epoch": 0.18284967960491408, + "grad_norm": 2.4833908081054688, + "learning_rate": 1.9999927496106708e-05, + "loss": 1.2515, + "step": 1120 + }, + { + "epoch": 0.18301293824741846, + "grad_norm": 2.038306713104248, + "learning_rate": 1.9999926282674985e-05, + "loss": 1.0707, + "step": 1121 + }, + { + "epoch": 0.18317619688992287, + "grad_norm": 2.3936352729797363, + "learning_rate": 1.999992505917334e-05, + "loss": 0.9615, + "step": 1122 + }, + { + "epoch": 0.18333945553242725, + "grad_norm": 2.4925572872161865, + "learning_rate": 1.9999923825601768e-05, + "loss": 1.2249, + "step": 1123 + }, + { + "epoch": 0.18350271417493164, + "grad_norm": 2.205446243286133, + "learning_rate": 1.999992258196028e-05, + "loss": 0.9487, + "step": 1124 + }, + { + "epoch": 0.18366597281743602, + "grad_norm": 2.225482225418091, + "learning_rate": 1.999992132824887e-05, + "loss": 1.0573, + "step": 1125 + }, + { + "epoch": 0.1838292314599404, + "grad_norm": 2.1631839275360107, + "learning_rate": 1.9999920064467545e-05, + "loss": 0.979, + "step": 1126 + }, + { + "epoch": 0.1839924901024448, + "grad_norm": 2.3457841873168945, + "learning_rate": 1.9999918790616305e-05, + "loss": 1.1933, + "step": 1127 + }, + { + "epoch": 0.1841557487449492, + "grad_norm": 2.2748007774353027, + "learning_rate": 1.9999917506695144e-05, + "loss": 1.0622, + "step": 1128 + }, + { + "epoch": 0.18431900738745358, + "grad_norm": 2.6205477714538574, + "learning_rate": 1.999991621270407e-05, + "loss": 1.0085, + "step": 1129 + }, + { + "epoch": 0.18448226602995796, + "grad_norm": 2.382124423980713, + "learning_rate": 1.9999914908643084e-05, + "loss": 1.0002, + "step": 1130 + }, + { + "epoch": 0.18464552467246234, + "grad_norm": 2.582602024078369, + "learning_rate": 1.999991359451219e-05, + "loss": 1.1519, + "step": 1131 + }, + { + "epoch": 0.18480878331496672, + "grad_norm": 2.577791690826416, + "learning_rate": 1.9999912270311376e-05, + "loss": 0.8848, + "step": 1132 + }, + { + "epoch": 0.18497204195747113, + "grad_norm": 2.2415804862976074, + "learning_rate": 1.9999910936040662e-05, + "loss": 1.1085, + "step": 1133 + }, + { + "epoch": 0.18513530059997552, + "grad_norm": 2.088890314102173, + "learning_rate": 1.9999909591700035e-05, + "loss": 0.929, + "step": 1134 + }, + { + "epoch": 0.1852985592424799, + "grad_norm": 2.7266616821289062, + "learning_rate": 1.9999908237289504e-05, + "loss": 1.105, + "step": 1135 + }, + { + "epoch": 0.18546181788498428, + "grad_norm": 2.5893471240997314, + "learning_rate": 1.999990687280907e-05, + "loss": 1.1033, + "step": 1136 + }, + { + "epoch": 0.18562507652748866, + "grad_norm": 2.6022417545318604, + "learning_rate": 1.999990549825873e-05, + "loss": 1.253, + "step": 1137 + }, + { + "epoch": 0.18578833516999307, + "grad_norm": 2.2605323791503906, + "learning_rate": 1.9999904113638488e-05, + "loss": 1.1433, + "step": 1138 + }, + { + "epoch": 0.18595159381249746, + "grad_norm": 2.047659397125244, + "learning_rate": 1.9999902718948345e-05, + "loss": 0.9878, + "step": 1139 + }, + { + "epoch": 0.18611485245500184, + "grad_norm": 2.3359527587890625, + "learning_rate": 1.9999901314188302e-05, + "loss": 1.1012, + "step": 1140 + }, + { + "epoch": 0.18627811109750622, + "grad_norm": 2.625244617462158, + "learning_rate": 1.9999899899358362e-05, + "loss": 1.0277, + "step": 1141 + }, + { + "epoch": 0.1864413697400106, + "grad_norm": 2.3946313858032227, + "learning_rate": 1.9999898474458525e-05, + "loss": 0.9741, + "step": 1142 + }, + { + "epoch": 0.186604628382515, + "grad_norm": 2.119610071182251, + "learning_rate": 1.9999897039488794e-05, + "loss": 1.0047, + "step": 1143 + }, + { + "epoch": 0.1867678870250194, + "grad_norm": 2.176919937133789, + "learning_rate": 1.999989559444917e-05, + "loss": 0.9946, + "step": 1144 + }, + { + "epoch": 0.18693114566752378, + "grad_norm": 2.5582399368286133, + "learning_rate": 1.9999894139339652e-05, + "loss": 0.9903, + "step": 1145 + }, + { + "epoch": 0.18709440431002816, + "grad_norm": 2.4175796508789062, + "learning_rate": 1.9999892674160244e-05, + "loss": 1.0786, + "step": 1146 + }, + { + "epoch": 0.18725766295253254, + "grad_norm": 2.4266650676727295, + "learning_rate": 1.999989119891095e-05, + "loss": 1.0346, + "step": 1147 + }, + { + "epoch": 0.18742092159503693, + "grad_norm": 1.9651210308074951, + "learning_rate": 1.999988971359176e-05, + "loss": 0.9538, + "step": 1148 + }, + { + "epoch": 0.18758418023754134, + "grad_norm": 2.1556639671325684, + "learning_rate": 1.999988821820269e-05, + "loss": 1.0038, + "step": 1149 + }, + { + "epoch": 0.18774743888004572, + "grad_norm": 1.9531604051589966, + "learning_rate": 1.9999886712743734e-05, + "loss": 1.036, + "step": 1150 + }, + { + "epoch": 0.1879106975225501, + "grad_norm": 2.4789934158325195, + "learning_rate": 1.9999885197214895e-05, + "loss": 1.0816, + "step": 1151 + }, + { + "epoch": 0.18807395616505448, + "grad_norm": 1.9194669723510742, + "learning_rate": 1.9999883671616173e-05, + "loss": 1.0355, + "step": 1152 + }, + { + "epoch": 0.18823721480755887, + "grad_norm": 2.1420159339904785, + "learning_rate": 1.9999882135947574e-05, + "loss": 1.0958, + "step": 1153 + }, + { + "epoch": 0.18840047345006325, + "grad_norm": 2.0625131130218506, + "learning_rate": 1.999988059020909e-05, + "loss": 1.0212, + "step": 1154 + }, + { + "epoch": 0.18856373209256766, + "grad_norm": 2.0413875579833984, + "learning_rate": 1.9999879034400733e-05, + "loss": 0.9744, + "step": 1155 + }, + { + "epoch": 0.18872699073507204, + "grad_norm": 2.2596724033355713, + "learning_rate": 1.99998774685225e-05, + "loss": 1.1592, + "step": 1156 + }, + { + "epoch": 0.18889024937757642, + "grad_norm": 1.8991316556930542, + "learning_rate": 1.9999875892574395e-05, + "loss": 0.9992, + "step": 1157 + }, + { + "epoch": 0.1890535080200808, + "grad_norm": 2.1433775424957275, + "learning_rate": 1.9999874306556416e-05, + "loss": 1.0734, + "step": 1158 + }, + { + "epoch": 0.1892167666625852, + "grad_norm": 2.1161365509033203, + "learning_rate": 1.9999872710468568e-05, + "loss": 1.0906, + "step": 1159 + }, + { + "epoch": 0.1893800253050896, + "grad_norm": 2.5800371170043945, + "learning_rate": 1.9999871104310846e-05, + "loss": 1.026, + "step": 1160 + }, + { + "epoch": 0.18954328394759398, + "grad_norm": 2.267658233642578, + "learning_rate": 1.9999869488083257e-05, + "loss": 1.0681, + "step": 1161 + }, + { + "epoch": 0.18970654259009836, + "grad_norm": 2.2277655601501465, + "learning_rate": 1.9999867861785806e-05, + "loss": 0.9169, + "step": 1162 + }, + { + "epoch": 0.18986980123260275, + "grad_norm": 2.348982334136963, + "learning_rate": 1.9999866225418488e-05, + "loss": 1.137, + "step": 1163 + }, + { + "epoch": 0.19003305987510713, + "grad_norm": 2.3634753227233887, + "learning_rate": 1.999986457898131e-05, + "loss": 1.1237, + "step": 1164 + }, + { + "epoch": 0.1901963185176115, + "grad_norm": 2.1389710903167725, + "learning_rate": 1.999986292247427e-05, + "loss": 1.0627, + "step": 1165 + }, + { + "epoch": 0.19035957716011592, + "grad_norm": 2.117138624191284, + "learning_rate": 1.999986125589737e-05, + "loss": 0.856, + "step": 1166 + }, + { + "epoch": 0.1905228358026203, + "grad_norm": 2.3714962005615234, + "learning_rate": 1.9999859579250612e-05, + "loss": 1.0273, + "step": 1167 + }, + { + "epoch": 0.1906860944451247, + "grad_norm": 2.1897380352020264, + "learning_rate": 1.9999857892534e-05, + "loss": 0.8903, + "step": 1168 + }, + { + "epoch": 0.19084935308762907, + "grad_norm": 2.0271244049072266, + "learning_rate": 1.999985619574753e-05, + "loss": 0.8694, + "step": 1169 + }, + { + "epoch": 0.19101261173013345, + "grad_norm": 2.8580963611602783, + "learning_rate": 1.9999854488891214e-05, + "loss": 1.2105, + "step": 1170 + }, + { + "epoch": 0.19117587037263786, + "grad_norm": 2.389904022216797, + "learning_rate": 1.9999852771965042e-05, + "loss": 0.9998, + "step": 1171 + }, + { + "epoch": 0.19133912901514225, + "grad_norm": 2.6290574073791504, + "learning_rate": 1.999985104496902e-05, + "loss": 1.0353, + "step": 1172 + }, + { + "epoch": 0.19150238765764663, + "grad_norm": 2.6093568801879883, + "learning_rate": 1.9999849307903153e-05, + "loss": 1.049, + "step": 1173 + }, + { + "epoch": 0.191665646300151, + "grad_norm": 2.4639508724212646, + "learning_rate": 1.999984756076744e-05, + "loss": 1.0843, + "step": 1174 + }, + { + "epoch": 0.1918289049426554, + "grad_norm": 2.90348219871521, + "learning_rate": 1.9999845803561882e-05, + "loss": 1.0587, + "step": 1175 + }, + { + "epoch": 0.1919921635851598, + "grad_norm": 2.272756576538086, + "learning_rate": 1.9999844036286483e-05, + "loss": 0.9462, + "step": 1176 + }, + { + "epoch": 0.19215542222766419, + "grad_norm": 2.4548888206481934, + "learning_rate": 1.9999842258941244e-05, + "loss": 1.1625, + "step": 1177 + }, + { + "epoch": 0.19231868087016857, + "grad_norm": 2.604268789291382, + "learning_rate": 1.9999840471526166e-05, + "loss": 1.1899, + "step": 1178 + }, + { + "epoch": 0.19248193951267295, + "grad_norm": 2.066979169845581, + "learning_rate": 1.9999838674041252e-05, + "loss": 0.9286, + "step": 1179 + }, + { + "epoch": 0.19264519815517733, + "grad_norm": 2.3797128200531006, + "learning_rate": 1.9999836866486505e-05, + "loss": 1.0364, + "step": 1180 + }, + { + "epoch": 0.19280845679768172, + "grad_norm": 2.6473474502563477, + "learning_rate": 1.999983504886192e-05, + "loss": 1.2209, + "step": 1181 + }, + { + "epoch": 0.19297171544018613, + "grad_norm": 2.0787739753723145, + "learning_rate": 1.9999833221167507e-05, + "loss": 1.0413, + "step": 1182 + }, + { + "epoch": 0.1931349740826905, + "grad_norm": 2.279916763305664, + "learning_rate": 1.9999831383403263e-05, + "loss": 0.9349, + "step": 1183 + }, + { + "epoch": 0.1932982327251949, + "grad_norm": 2.120527744293213, + "learning_rate": 1.9999829535569196e-05, + "loss": 0.8887, + "step": 1184 + }, + { + "epoch": 0.19346149136769927, + "grad_norm": 2.002307891845703, + "learning_rate": 1.99998276776653e-05, + "loss": 0.9832, + "step": 1185 + }, + { + "epoch": 0.19362475001020366, + "grad_norm": 2.755699396133423, + "learning_rate": 1.9999825809691577e-05, + "loss": 1.1735, + "step": 1186 + }, + { + "epoch": 0.19378800865270807, + "grad_norm": 2.5381875038146973, + "learning_rate": 1.9999823931648036e-05, + "loss": 1.0875, + "step": 1187 + }, + { + "epoch": 0.19395126729521245, + "grad_norm": 2.0140693187713623, + "learning_rate": 1.9999822043534673e-05, + "loss": 1.0085, + "step": 1188 + }, + { + "epoch": 0.19411452593771683, + "grad_norm": 2.2564597129821777, + "learning_rate": 1.9999820145351493e-05, + "loss": 1.0603, + "step": 1189 + }, + { + "epoch": 0.1942777845802212, + "grad_norm": 2.2346925735473633, + "learning_rate": 1.9999818237098495e-05, + "loss": 0.9727, + "step": 1190 + }, + { + "epoch": 0.1944410432227256, + "grad_norm": 2.393723726272583, + "learning_rate": 1.9999816318775688e-05, + "loss": 0.8842, + "step": 1191 + }, + { + "epoch": 0.19460430186522998, + "grad_norm": 2.2890357971191406, + "learning_rate": 1.999981439038306e-05, + "loss": 0.8331, + "step": 1192 + }, + { + "epoch": 0.1947675605077344, + "grad_norm": 2.5719926357269287, + "learning_rate": 1.999981245192063e-05, + "loss": 1.0345, + "step": 1193 + }, + { + "epoch": 0.19493081915023877, + "grad_norm": 2.381460428237915, + "learning_rate": 1.9999810503388386e-05, + "loss": 1.1593, + "step": 1194 + }, + { + "epoch": 0.19509407779274315, + "grad_norm": 2.2159101963043213, + "learning_rate": 1.9999808544786336e-05, + "loss": 0.9811, + "step": 1195 + }, + { + "epoch": 0.19525733643524754, + "grad_norm": 2.201510429382324, + "learning_rate": 1.9999806576114485e-05, + "loss": 0.8886, + "step": 1196 + }, + { + "epoch": 0.19542059507775192, + "grad_norm": 2.8090901374816895, + "learning_rate": 1.999980459737283e-05, + "loss": 1.2377, + "step": 1197 + }, + { + "epoch": 0.19558385372025633, + "grad_norm": 2.294390916824341, + "learning_rate": 1.999980260856137e-05, + "loss": 1.0078, + "step": 1198 + }, + { + "epoch": 0.1957471123627607, + "grad_norm": 2.2880094051361084, + "learning_rate": 1.9999800609680117e-05, + "loss": 1.1318, + "step": 1199 + }, + { + "epoch": 0.1959103710052651, + "grad_norm": 2.2412564754486084, + "learning_rate": 1.9999798600729067e-05, + "loss": 0.9749, + "step": 1200 + }, + { + "epoch": 0.19607362964776948, + "grad_norm": 1.8755762577056885, + "learning_rate": 1.9999796581708222e-05, + "loss": 0.8569, + "step": 1201 + }, + { + "epoch": 0.19623688829027386, + "grad_norm": 2.215679407119751, + "learning_rate": 1.999979455261758e-05, + "loss": 0.9661, + "step": 1202 + }, + { + "epoch": 0.19640014693277824, + "grad_norm": 2.204582691192627, + "learning_rate": 1.9999792513457152e-05, + "loss": 0.9762, + "step": 1203 + }, + { + "epoch": 0.19656340557528265, + "grad_norm": 2.370715856552124, + "learning_rate": 1.9999790464226934e-05, + "loss": 1.2432, + "step": 1204 + }, + { + "epoch": 0.19672666421778703, + "grad_norm": 2.324059247970581, + "learning_rate": 1.999978840492693e-05, + "loss": 1.1338, + "step": 1205 + }, + { + "epoch": 0.19688992286029142, + "grad_norm": 2.1559267044067383, + "learning_rate": 1.9999786335557143e-05, + "loss": 0.981, + "step": 1206 + }, + { + "epoch": 0.1970531815027958, + "grad_norm": 2.0476436614990234, + "learning_rate": 1.9999784256117573e-05, + "loss": 0.9939, + "step": 1207 + }, + { + "epoch": 0.19721644014530018, + "grad_norm": 2.2658190727233887, + "learning_rate": 1.9999782166608225e-05, + "loss": 1.117, + "step": 1208 + }, + { + "epoch": 0.1973796987878046, + "grad_norm": 2.3181371688842773, + "learning_rate": 1.9999780067029095e-05, + "loss": 1.1727, + "step": 1209 + }, + { + "epoch": 0.19754295743030897, + "grad_norm": 2.826958179473877, + "learning_rate": 1.999977795738019e-05, + "loss": 1.5862, + "step": 1210 + }, + { + "epoch": 0.19770621607281336, + "grad_norm": 2.242987632751465, + "learning_rate": 1.9999775837661513e-05, + "loss": 0.9381, + "step": 1211 + }, + { + "epoch": 0.19786947471531774, + "grad_norm": 2.1063759326934814, + "learning_rate": 1.999977370787306e-05, + "loss": 0.9262, + "step": 1212 + }, + { + "epoch": 0.19803273335782212, + "grad_norm": 2.459761619567871, + "learning_rate": 1.9999771568014845e-05, + "loss": 1.1155, + "step": 1213 + }, + { + "epoch": 0.1981959920003265, + "grad_norm": 2.1590492725372314, + "learning_rate": 1.999976941808686e-05, + "loss": 0.9855, + "step": 1214 + }, + { + "epoch": 0.19835925064283091, + "grad_norm": 2.3454430103302, + "learning_rate": 1.9999767258089107e-05, + "loss": 1.1352, + "step": 1215 + }, + { + "epoch": 0.1985225092853353, + "grad_norm": 2.3492486476898193, + "learning_rate": 1.9999765088021596e-05, + "loss": 1.0754, + "step": 1216 + }, + { + "epoch": 0.19868576792783968, + "grad_norm": 2.1677584648132324, + "learning_rate": 1.999976290788432e-05, + "loss": 0.9619, + "step": 1217 + }, + { + "epoch": 0.19884902657034406, + "grad_norm": 2.3685142993927, + "learning_rate": 1.9999760717677286e-05, + "loss": 1.029, + "step": 1218 + }, + { + "epoch": 0.19901228521284844, + "grad_norm": 2.2057154178619385, + "learning_rate": 1.9999758517400494e-05, + "loss": 1.1021, + "step": 1219 + }, + { + "epoch": 0.19917554385535285, + "grad_norm": 2.1651723384857178, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.9394, + "step": 1220 + }, + { + "epoch": 0.19933880249785724, + "grad_norm": 1.9442890882492065, + "learning_rate": 1.9999754086637652e-05, + "loss": 0.9528, + "step": 1221 + }, + { + "epoch": 0.19950206114036162, + "grad_norm": 2.0366950035095215, + "learning_rate": 1.9999751856151606e-05, + "loss": 0.958, + "step": 1222 + }, + { + "epoch": 0.199665319782866, + "grad_norm": 2.293421506881714, + "learning_rate": 1.9999749615595813e-05, + "loss": 1.1066, + "step": 1223 + }, + { + "epoch": 0.19982857842537038, + "grad_norm": 1.8304027318954468, + "learning_rate": 1.9999747364970274e-05, + "loss": 0.9793, + "step": 1224 + }, + { + "epoch": 0.19999183706787477, + "grad_norm": 1.9746372699737549, + "learning_rate": 1.9999745104274995e-05, + "loss": 0.8131, + "step": 1225 + }, + { + "epoch": 0.20015509571037918, + "grad_norm": 2.0511436462402344, + "learning_rate": 1.999974283350997e-05, + "loss": 1.0009, + "step": 1226 + }, + { + "epoch": 0.20031835435288356, + "grad_norm": 2.3559556007385254, + "learning_rate": 1.9999740552675212e-05, + "loss": 1.2299, + "step": 1227 + }, + { + "epoch": 0.20048161299538794, + "grad_norm": 2.171827554702759, + "learning_rate": 1.9999738261770713e-05, + "loss": 0.901, + "step": 1228 + }, + { + "epoch": 0.20064487163789232, + "grad_norm": 2.365586996078491, + "learning_rate": 1.9999735960796482e-05, + "loss": 1.1892, + "step": 1229 + }, + { + "epoch": 0.2008081302803967, + "grad_norm": 2.331192970275879, + "learning_rate": 1.999973364975252e-05, + "loss": 0.7867, + "step": 1230 + }, + { + "epoch": 0.20097138892290112, + "grad_norm": 2.357205390930176, + "learning_rate": 1.9999731328638828e-05, + "loss": 0.8799, + "step": 1231 + }, + { + "epoch": 0.2011346475654055, + "grad_norm": 2.172893762588501, + "learning_rate": 1.999972899745541e-05, + "loss": 0.9546, + "step": 1232 + }, + { + "epoch": 0.20129790620790988, + "grad_norm": 2.3461246490478516, + "learning_rate": 1.999972665620227e-05, + "loss": 1.1344, + "step": 1233 + }, + { + "epoch": 0.20146116485041426, + "grad_norm": 2.1069836616516113, + "learning_rate": 1.9999724304879406e-05, + "loss": 1.0727, + "step": 1234 + }, + { + "epoch": 0.20162442349291865, + "grad_norm": 2.5568220615386963, + "learning_rate": 1.999972194348682e-05, + "loss": 1.1865, + "step": 1235 + }, + { + "epoch": 0.20178768213542306, + "grad_norm": 2.0884177684783936, + "learning_rate": 1.999971957202452e-05, + "loss": 0.8975, + "step": 1236 + }, + { + "epoch": 0.20195094077792744, + "grad_norm": 2.4405572414398193, + "learning_rate": 1.9999717190492503e-05, + "loss": 1.0843, + "step": 1237 + }, + { + "epoch": 0.20211419942043182, + "grad_norm": 2.220410108566284, + "learning_rate": 1.9999714798890775e-05, + "loss": 1.0091, + "step": 1238 + }, + { + "epoch": 0.2022774580629362, + "grad_norm": 2.249708890914917, + "learning_rate": 1.9999712397219337e-05, + "loss": 1.0198, + "step": 1239 + }, + { + "epoch": 0.2024407167054406, + "grad_norm": 2.0833353996276855, + "learning_rate": 1.9999709985478188e-05, + "loss": 0.848, + "step": 1240 + }, + { + "epoch": 0.20260397534794497, + "grad_norm": 2.45491099357605, + "learning_rate": 1.9999707563667338e-05, + "loss": 1.2241, + "step": 1241 + }, + { + "epoch": 0.20276723399044938, + "grad_norm": 2.3407816886901855, + "learning_rate": 1.999970513178678e-05, + "loss": 0.9794, + "step": 1242 + }, + { + "epoch": 0.20293049263295376, + "grad_norm": 2.091902256011963, + "learning_rate": 1.9999702689836527e-05, + "loss": 0.9687, + "step": 1243 + }, + { + "epoch": 0.20309375127545815, + "grad_norm": 2.0730838775634766, + "learning_rate": 1.999970023781657e-05, + "loss": 1.1505, + "step": 1244 + }, + { + "epoch": 0.20325700991796253, + "grad_norm": 2.111569881439209, + "learning_rate": 1.999969777572692e-05, + "loss": 0.8972, + "step": 1245 + }, + { + "epoch": 0.2034202685604669, + "grad_norm": 2.1591970920562744, + "learning_rate": 1.999969530356758e-05, + "loss": 0.9328, + "step": 1246 + }, + { + "epoch": 0.20358352720297132, + "grad_norm": 2.1966850757598877, + "learning_rate": 1.9999692821338547e-05, + "loss": 1.0219, + "step": 1247 + }, + { + "epoch": 0.2037467858454757, + "grad_norm": 2.2881972789764404, + "learning_rate": 1.999969032903983e-05, + "loss": 1.0175, + "step": 1248 + }, + { + "epoch": 0.20391004448798009, + "grad_norm": 2.8964879512786865, + "learning_rate": 1.9999687826671422e-05, + "loss": 1.1091, + "step": 1249 + }, + { + "epoch": 0.20407330313048447, + "grad_norm": 1.9811725616455078, + "learning_rate": 1.9999685314233333e-05, + "loss": 0.7789, + "step": 1250 + }, + { + "epoch": 0.20423656177298885, + "grad_norm": 2.0875306129455566, + "learning_rate": 1.9999682791725563e-05, + "loss": 0.8624, + "step": 1251 + }, + { + "epoch": 0.20439982041549323, + "grad_norm": 2.2419612407684326, + "learning_rate": 1.9999680259148117e-05, + "loss": 0.9159, + "step": 1252 + }, + { + "epoch": 0.20456307905799764, + "grad_norm": 2.273787260055542, + "learning_rate": 1.9999677716500994e-05, + "loss": 0.8924, + "step": 1253 + }, + { + "epoch": 0.20472633770050203, + "grad_norm": 2.7177908420562744, + "learning_rate": 1.9999675163784197e-05, + "loss": 1.197, + "step": 1254 + }, + { + "epoch": 0.2048895963430064, + "grad_norm": 2.159367561340332, + "learning_rate": 1.9999672600997734e-05, + "loss": 0.9107, + "step": 1255 + }, + { + "epoch": 0.2050528549855108, + "grad_norm": 2.605884313583374, + "learning_rate": 1.9999670028141598e-05, + "loss": 1.1049, + "step": 1256 + }, + { + "epoch": 0.20521611362801517, + "grad_norm": 2.570202589035034, + "learning_rate": 1.99996674452158e-05, + "loss": 0.9643, + "step": 1257 + }, + { + "epoch": 0.20537937227051958, + "grad_norm": 2.489055633544922, + "learning_rate": 1.999966485222034e-05, + "loss": 0.9722, + "step": 1258 + }, + { + "epoch": 0.20554263091302397, + "grad_norm": 2.2112061977386475, + "learning_rate": 1.999966224915522e-05, + "loss": 0.8344, + "step": 1259 + }, + { + "epoch": 0.20570588955552835, + "grad_norm": 2.590627908706665, + "learning_rate": 1.999965963602044e-05, + "loss": 1.3387, + "step": 1260 + }, + { + "epoch": 0.20586914819803273, + "grad_norm": 2.5064964294433594, + "learning_rate": 1.9999657012816008e-05, + "loss": 1.3058, + "step": 1261 + }, + { + "epoch": 0.2060324068405371, + "grad_norm": 2.611572265625, + "learning_rate": 1.9999654379541923e-05, + "loss": 1.0867, + "step": 1262 + }, + { + "epoch": 0.2061956654830415, + "grad_norm": 2.591822862625122, + "learning_rate": 1.999965173619819e-05, + "loss": 1.2636, + "step": 1263 + }, + { + "epoch": 0.2063589241255459, + "grad_norm": 2.198455810546875, + "learning_rate": 1.9999649082784807e-05, + "loss": 0.9545, + "step": 1264 + }, + { + "epoch": 0.2065221827680503, + "grad_norm": 2.234243392944336, + "learning_rate": 1.9999646419301783e-05, + "loss": 0.9726, + "step": 1265 + }, + { + "epoch": 0.20668544141055467, + "grad_norm": 2.020203113555908, + "learning_rate": 1.9999643745749116e-05, + "loss": 0.887, + "step": 1266 + }, + { + "epoch": 0.20684870005305905, + "grad_norm": 2.0185112953186035, + "learning_rate": 1.9999641062126814e-05, + "loss": 0.8834, + "step": 1267 + }, + { + "epoch": 0.20701195869556344, + "grad_norm": 2.165705680847168, + "learning_rate": 1.999963836843487e-05, + "loss": 1.0574, + "step": 1268 + }, + { + "epoch": 0.20717521733806785, + "grad_norm": 2.4223058223724365, + "learning_rate": 1.99996356646733e-05, + "loss": 0.8913, + "step": 1269 + }, + { + "epoch": 0.20733847598057223, + "grad_norm": 2.149723529815674, + "learning_rate": 1.9999632950842095e-05, + "loss": 1.2076, + "step": 1270 + }, + { + "epoch": 0.2075017346230766, + "grad_norm": 2.0023581981658936, + "learning_rate": 1.9999630226941265e-05, + "loss": 0.9207, + "step": 1271 + }, + { + "epoch": 0.207664993265581, + "grad_norm": 2.56469988822937, + "learning_rate": 1.9999627492970805e-05, + "loss": 1.251, + "step": 1272 + }, + { + "epoch": 0.20782825190808538, + "grad_norm": 2.017416477203369, + "learning_rate": 1.999962474893073e-05, + "loss": 0.9956, + "step": 1273 + }, + { + "epoch": 0.20799151055058976, + "grad_norm": 2.3790576457977295, + "learning_rate": 1.999962199482103e-05, + "loss": 1.0752, + "step": 1274 + }, + { + "epoch": 0.20815476919309417, + "grad_norm": 2.342595100402832, + "learning_rate": 1.9999619230641714e-05, + "loss": 0.9823, + "step": 1275 + }, + { + "epoch": 0.20831802783559855, + "grad_norm": 2.540520429611206, + "learning_rate": 1.9999616456392785e-05, + "loss": 1.1521, + "step": 1276 + }, + { + "epoch": 0.20848128647810293, + "grad_norm": 2.3206164836883545, + "learning_rate": 1.9999613672074246e-05, + "loss": 1.0796, + "step": 1277 + }, + { + "epoch": 0.20864454512060732, + "grad_norm": 2.3632194995880127, + "learning_rate": 1.99996108776861e-05, + "loss": 1.0677, + "step": 1278 + }, + { + "epoch": 0.2088078037631117, + "grad_norm": 2.3757784366607666, + "learning_rate": 1.9999608073228342e-05, + "loss": 0.9669, + "step": 1279 + }, + { + "epoch": 0.2089710624056161, + "grad_norm": 2.3979928493499756, + "learning_rate": 1.9999605258700983e-05, + "loss": 0.9178, + "step": 1280 + }, + { + "epoch": 0.2091343210481205, + "grad_norm": 2.3388993740081787, + "learning_rate": 1.999960243410403e-05, + "loss": 0.928, + "step": 1281 + }, + { + "epoch": 0.20929757969062487, + "grad_norm": 2.285773277282715, + "learning_rate": 1.9999599599437476e-05, + "loss": 0.9614, + "step": 1282 + }, + { + "epoch": 0.20946083833312926, + "grad_norm": 2.317728281021118, + "learning_rate": 1.9999596754701328e-05, + "loss": 0.9957, + "step": 1283 + }, + { + "epoch": 0.20962409697563364, + "grad_norm": 2.3520326614379883, + "learning_rate": 1.999959389989559e-05, + "loss": 1.0384, + "step": 1284 + }, + { + "epoch": 0.20978735561813802, + "grad_norm": 2.4376580715179443, + "learning_rate": 1.999959103502026e-05, + "loss": 1.0004, + "step": 1285 + }, + { + "epoch": 0.20995061426064243, + "grad_norm": 2.5730464458465576, + "learning_rate": 1.999958816007535e-05, + "loss": 1.1472, + "step": 1286 + }, + { + "epoch": 0.21011387290314681, + "grad_norm": 2.2769081592559814, + "learning_rate": 1.9999585275060854e-05, + "loss": 1.0164, + "step": 1287 + }, + { + "epoch": 0.2102771315456512, + "grad_norm": 2.3943207263946533, + "learning_rate": 1.999958237997678e-05, + "loss": 0.9327, + "step": 1288 + }, + { + "epoch": 0.21044039018815558, + "grad_norm": 2.2491486072540283, + "learning_rate": 1.9999579474823126e-05, + "loss": 0.9422, + "step": 1289 + }, + { + "epoch": 0.21060364883065996, + "grad_norm": 2.295269727706909, + "learning_rate": 1.99995765595999e-05, + "loss": 1.0003, + "step": 1290 + }, + { + "epoch": 0.21076690747316437, + "grad_norm": 2.0796635150909424, + "learning_rate": 1.9999573634307104e-05, + "loss": 0.9415, + "step": 1291 + }, + { + "epoch": 0.21093016611566875, + "grad_norm": 2.276812791824341, + "learning_rate": 1.9999570698944743e-05, + "loss": 1.0389, + "step": 1292 + }, + { + "epoch": 0.21109342475817314, + "grad_norm": 2.033170700073242, + "learning_rate": 1.999956775351281e-05, + "loss": 0.9762, + "step": 1293 + }, + { + "epoch": 0.21125668340067752, + "grad_norm": 2.484546184539795, + "learning_rate": 1.999956479801132e-05, + "loss": 0.9551, + "step": 1294 + }, + { + "epoch": 0.2114199420431819, + "grad_norm": 2.6277737617492676, + "learning_rate": 1.9999561832440268e-05, + "loss": 1.2138, + "step": 1295 + }, + { + "epoch": 0.2115832006856863, + "grad_norm": 2.1640849113464355, + "learning_rate": 1.999955885679966e-05, + "loss": 1.1261, + "step": 1296 + }, + { + "epoch": 0.2117464593281907, + "grad_norm": 2.3965718746185303, + "learning_rate": 1.99995558710895e-05, + "loss": 0.9749, + "step": 1297 + }, + { + "epoch": 0.21190971797069508, + "grad_norm": 2.2983665466308594, + "learning_rate": 1.999955287530979e-05, + "loss": 0.9456, + "step": 1298 + }, + { + "epoch": 0.21207297661319946, + "grad_norm": 2.103120803833008, + "learning_rate": 1.9999549869460534e-05, + "loss": 0.94, + "step": 1299 + }, + { + "epoch": 0.21223623525570384, + "grad_norm": 1.9734042882919312, + "learning_rate": 1.9999546853541728e-05, + "loss": 0.8945, + "step": 1300 + }, + { + "epoch": 0.21239949389820822, + "grad_norm": 2.0586676597595215, + "learning_rate": 1.999954382755339e-05, + "loss": 0.9791, + "step": 1301 + }, + { + "epoch": 0.21256275254071264, + "grad_norm": 2.274178981781006, + "learning_rate": 1.9999540791495507e-05, + "loss": 0.9691, + "step": 1302 + }, + { + "epoch": 0.21272601118321702, + "grad_norm": 2.022470712661743, + "learning_rate": 1.999953774536809e-05, + "loss": 0.9499, + "step": 1303 + }, + { + "epoch": 0.2128892698257214, + "grad_norm": 2.472344160079956, + "learning_rate": 1.9999534689171146e-05, + "loss": 0.9589, + "step": 1304 + }, + { + "epoch": 0.21305252846822578, + "grad_norm": 1.9726943969726562, + "learning_rate": 1.999953162290467e-05, + "loss": 0.8441, + "step": 1305 + }, + { + "epoch": 0.21321578711073016, + "grad_norm": 2.1743080615997314, + "learning_rate": 1.9999528546568667e-05, + "loss": 0.9969, + "step": 1306 + }, + { + "epoch": 0.21337904575323458, + "grad_norm": 2.4251761436462402, + "learning_rate": 1.9999525460163143e-05, + "loss": 1.1353, + "step": 1307 + }, + { + "epoch": 0.21354230439573896, + "grad_norm": 2.0615875720977783, + "learning_rate": 1.99995223636881e-05, + "loss": 0.7659, + "step": 1308 + }, + { + "epoch": 0.21370556303824334, + "grad_norm": 2.5262575149536133, + "learning_rate": 1.999951925714354e-05, + "loss": 1.2011, + "step": 1309 + }, + { + "epoch": 0.21386882168074772, + "grad_norm": 2.1753737926483154, + "learning_rate": 1.9999516140529465e-05, + "loss": 1.0763, + "step": 1310 + }, + { + "epoch": 0.2140320803232521, + "grad_norm": 2.435194492340088, + "learning_rate": 1.999951301384588e-05, + "loss": 1.0107, + "step": 1311 + }, + { + "epoch": 0.2141953389657565, + "grad_norm": 2.2366535663604736, + "learning_rate": 1.999950987709279e-05, + "loss": 0.9, + "step": 1312 + }, + { + "epoch": 0.2143585976082609, + "grad_norm": 2.2835752964019775, + "learning_rate": 1.9999506730270198e-05, + "loss": 0.9283, + "step": 1313 + }, + { + "epoch": 0.21452185625076528, + "grad_norm": 2.4992947578430176, + "learning_rate": 1.9999503573378102e-05, + "loss": 1.0813, + "step": 1314 + }, + { + "epoch": 0.21468511489326966, + "grad_norm": 2.288408041000366, + "learning_rate": 1.999950040641651e-05, + "loss": 0.9945, + "step": 1315 + }, + { + "epoch": 0.21484837353577405, + "grad_norm": 2.0417964458465576, + "learning_rate": 1.9999497229385422e-05, + "loss": 0.936, + "step": 1316 + }, + { + "epoch": 0.21501163217827843, + "grad_norm": 2.712717294692993, + "learning_rate": 1.9999494042284844e-05, + "loss": 1.0761, + "step": 1317 + }, + { + "epoch": 0.21517489082078284, + "grad_norm": 2.1315293312072754, + "learning_rate": 1.999949084511478e-05, + "loss": 0.9006, + "step": 1318 + }, + { + "epoch": 0.21533814946328722, + "grad_norm": 2.0772135257720947, + "learning_rate": 1.999948763787523e-05, + "loss": 1.0365, + "step": 1319 + }, + { + "epoch": 0.2155014081057916, + "grad_norm": 2.3228888511657715, + "learning_rate": 1.9999484420566197e-05, + "loss": 1.1043, + "step": 1320 + }, + { + "epoch": 0.21566466674829599, + "grad_norm": 2.0455996990203857, + "learning_rate": 1.9999481193187685e-05, + "loss": 0.8884, + "step": 1321 + }, + { + "epoch": 0.21582792539080037, + "grad_norm": 2.403216600418091, + "learning_rate": 1.9999477955739703e-05, + "loss": 1.0579, + "step": 1322 + }, + { + "epoch": 0.21599118403330475, + "grad_norm": 2.4208836555480957, + "learning_rate": 1.9999474708222248e-05, + "loss": 0.8545, + "step": 1323 + }, + { + "epoch": 0.21615444267580916, + "grad_norm": 2.4777541160583496, + "learning_rate": 1.9999471450635322e-05, + "loss": 0.9584, + "step": 1324 + }, + { + "epoch": 0.21631770131831354, + "grad_norm": 2.428277015686035, + "learning_rate": 1.999946818297893e-05, + "loss": 1.0093, + "step": 1325 + }, + { + "epoch": 0.21648095996081793, + "grad_norm": 2.368472099304199, + "learning_rate": 1.999946490525308e-05, + "loss": 0.9628, + "step": 1326 + }, + { + "epoch": 0.2166442186033223, + "grad_norm": 2.0551724433898926, + "learning_rate": 1.9999461617457773e-05, + "loss": 1.0424, + "step": 1327 + }, + { + "epoch": 0.2168074772458267, + "grad_norm": 2.1272008419036865, + "learning_rate": 1.9999458319593008e-05, + "loss": 1.0637, + "step": 1328 + }, + { + "epoch": 0.2169707358883311, + "grad_norm": 2.2524030208587646, + "learning_rate": 1.999945501165879e-05, + "loss": 0.9783, + "step": 1329 + }, + { + "epoch": 0.21713399453083548, + "grad_norm": 2.1017332077026367, + "learning_rate": 1.9999451693655125e-05, + "loss": 1.0354, + "step": 1330 + }, + { + "epoch": 0.21729725317333987, + "grad_norm": 2.338069200515747, + "learning_rate": 1.9999448365582014e-05, + "loss": 1.1087, + "step": 1331 + }, + { + "epoch": 0.21746051181584425, + "grad_norm": 2.274113178253174, + "learning_rate": 1.999944502743946e-05, + "loss": 1.0582, + "step": 1332 + }, + { + "epoch": 0.21762377045834863, + "grad_norm": 2.543405532836914, + "learning_rate": 1.9999441679227466e-05, + "loss": 1.0986, + "step": 1333 + }, + { + "epoch": 0.217787029100853, + "grad_norm": 2.4448163509368896, + "learning_rate": 1.999943832094604e-05, + "loss": 2.1344, + "step": 1334 + }, + { + "epoch": 0.21795028774335742, + "grad_norm": 2.404064416885376, + "learning_rate": 1.9999434952595184e-05, + "loss": 1.198, + "step": 1335 + }, + { + "epoch": 0.2181135463858618, + "grad_norm": 2.368515968322754, + "learning_rate": 1.9999431574174895e-05, + "loss": 1.0837, + "step": 1336 + }, + { + "epoch": 0.2182768050283662, + "grad_norm": 2.336869955062866, + "learning_rate": 1.9999428185685187e-05, + "loss": 1.0213, + "step": 1337 + }, + { + "epoch": 0.21844006367087057, + "grad_norm": 2.384500026702881, + "learning_rate": 1.999942478712605e-05, + "loss": 0.9344, + "step": 1338 + }, + { + "epoch": 0.21860332231337495, + "grad_norm": 2.080235004425049, + "learning_rate": 1.9999421378497504e-05, + "loss": 1.0958, + "step": 1339 + }, + { + "epoch": 0.21876658095587936, + "grad_norm": 2.1801416873931885, + "learning_rate": 1.9999417959799538e-05, + "loss": 0.965, + "step": 1340 + }, + { + "epoch": 0.21892983959838375, + "grad_norm": 2.2784318923950195, + "learning_rate": 1.999941453103216e-05, + "loss": 1.0562, + "step": 1341 + }, + { + "epoch": 0.21909309824088813, + "grad_norm": 2.4351587295532227, + "learning_rate": 1.9999411092195373e-05, + "loss": 1.1061, + "step": 1342 + }, + { + "epoch": 0.2192563568833925, + "grad_norm": 2.589292049407959, + "learning_rate": 1.9999407643289187e-05, + "loss": 1.0785, + "step": 1343 + }, + { + "epoch": 0.2194196155258969, + "grad_norm": 2.379124641418457, + "learning_rate": 1.9999404184313595e-05, + "loss": 1.174, + "step": 1344 + }, + { + "epoch": 0.21958287416840128, + "grad_norm": 2.216259002685547, + "learning_rate": 1.9999400715268607e-05, + "loss": 1.1748, + "step": 1345 + }, + { + "epoch": 0.2197461328109057, + "grad_norm": 2.670081615447998, + "learning_rate": 1.9999397236154228e-05, + "loss": 0.9275, + "step": 1346 + }, + { + "epoch": 0.21990939145341007, + "grad_norm": 2.440054416656494, + "learning_rate": 1.9999393746970456e-05, + "loss": 0.855, + "step": 1347 + }, + { + "epoch": 0.22007265009591445, + "grad_norm": 2.3037898540496826, + "learning_rate": 1.9999390247717296e-05, + "loss": 0.8509, + "step": 1348 + }, + { + "epoch": 0.22023590873841883, + "grad_norm": 2.0270252227783203, + "learning_rate": 1.9999386738394757e-05, + "loss": 1.0842, + "step": 1349 + }, + { + "epoch": 0.22039916738092322, + "grad_norm": 2.1370058059692383, + "learning_rate": 1.9999383219002836e-05, + "loss": 0.9708, + "step": 1350 + }, + { + "epoch": 0.22056242602342763, + "grad_norm": 2.2172136306762695, + "learning_rate": 1.9999379689541536e-05, + "loss": 0.9063, + "step": 1351 + }, + { + "epoch": 0.220725684665932, + "grad_norm": 2.106698751449585, + "learning_rate": 1.9999376150010868e-05, + "loss": 0.9857, + "step": 1352 + }, + { + "epoch": 0.2208889433084364, + "grad_norm": 2.2829997539520264, + "learning_rate": 1.9999372600410828e-05, + "loss": 0.9354, + "step": 1353 + }, + { + "epoch": 0.22105220195094077, + "grad_norm": 2.5648410320281982, + "learning_rate": 1.9999369040741423e-05, + "loss": 0.9826, + "step": 1354 + }, + { + "epoch": 0.22121546059344516, + "grad_norm": 2.2933948040008545, + "learning_rate": 1.9999365471002656e-05, + "loss": 1.0284, + "step": 1355 + }, + { + "epoch": 0.22137871923594954, + "grad_norm": 2.216158866882324, + "learning_rate": 1.999936189119453e-05, + "loss": 0.8352, + "step": 1356 + }, + { + "epoch": 0.22154197787845395, + "grad_norm": 2.386352777481079, + "learning_rate": 1.999935830131705e-05, + "loss": 1.1436, + "step": 1357 + }, + { + "epoch": 0.22170523652095833, + "grad_norm": 1.8827351331710815, + "learning_rate": 1.9999354701370217e-05, + "loss": 0.7611, + "step": 1358 + }, + { + "epoch": 0.22186849516346271, + "grad_norm": 2.6127867698669434, + "learning_rate": 1.9999351091354038e-05, + "loss": 0.8851, + "step": 1359 + }, + { + "epoch": 0.2220317538059671, + "grad_norm": 2.170525550842285, + "learning_rate": 1.9999347471268517e-05, + "loss": 0.9324, + "step": 1360 + }, + { + "epoch": 0.22219501244847148, + "grad_norm": 2.5319437980651855, + "learning_rate": 1.9999343841113652e-05, + "loss": 1.3992, + "step": 1361 + }, + { + "epoch": 0.2223582710909759, + "grad_norm": 2.2870419025421143, + "learning_rate": 1.9999340200889455e-05, + "loss": 1.0953, + "step": 1362 + }, + { + "epoch": 0.22252152973348027, + "grad_norm": 2.201634645462036, + "learning_rate": 1.999933655059592e-05, + "loss": 1.0753, + "step": 1363 + }, + { + "epoch": 0.22268478837598465, + "grad_norm": 2.5825555324554443, + "learning_rate": 1.999933289023306e-05, + "loss": 1.1056, + "step": 1364 + }, + { + "epoch": 0.22284804701848904, + "grad_norm": 2.396245002746582, + "learning_rate": 1.999932921980087e-05, + "loss": 1.0403, + "step": 1365 + }, + { + "epoch": 0.22301130566099342, + "grad_norm": 1.9141837358474731, + "learning_rate": 1.999932553929936e-05, + "loss": 0.8429, + "step": 1366 + }, + { + "epoch": 0.22317456430349783, + "grad_norm": 1.8894944190979004, + "learning_rate": 1.9999321848728535e-05, + "loss": 0.9629, + "step": 1367 + }, + { + "epoch": 0.2233378229460022, + "grad_norm": 2.036377191543579, + "learning_rate": 1.9999318148088392e-05, + "loss": 0.988, + "step": 1368 + }, + { + "epoch": 0.2235010815885066, + "grad_norm": 2.181813955307007, + "learning_rate": 1.9999314437378942e-05, + "loss": 0.9079, + "step": 1369 + }, + { + "epoch": 0.22366434023101098, + "grad_norm": 2.5406486988067627, + "learning_rate": 1.999931071660018e-05, + "loss": 1.0315, + "step": 1370 + }, + { + "epoch": 0.22382759887351536, + "grad_norm": 2.390594482421875, + "learning_rate": 1.999930698575212e-05, + "loss": 1.1485, + "step": 1371 + }, + { + "epoch": 0.22399085751601974, + "grad_norm": 2.5430421829223633, + "learning_rate": 1.9999303244834756e-05, + "loss": 1.1223, + "step": 1372 + }, + { + "epoch": 0.22415411615852415, + "grad_norm": 2.1572649478912354, + "learning_rate": 1.99992994938481e-05, + "loss": 0.9383, + "step": 1373 + }, + { + "epoch": 0.22431737480102854, + "grad_norm": 2.532707691192627, + "learning_rate": 1.9999295732792146e-05, + "loss": 1.0254, + "step": 1374 + }, + { + "epoch": 0.22448063344353292, + "grad_norm": 2.120185613632202, + "learning_rate": 1.999929196166691e-05, + "loss": 1.1167, + "step": 1375 + }, + { + "epoch": 0.2246438920860373, + "grad_norm": 2.017282724380493, + "learning_rate": 1.9999288180472388e-05, + "loss": 1.0245, + "step": 1376 + }, + { + "epoch": 0.22480715072854168, + "grad_norm": 2.0543930530548096, + "learning_rate": 1.9999284389208586e-05, + "loss": 1.0614, + "step": 1377 + }, + { + "epoch": 0.2249704093710461, + "grad_norm": 2.2948455810546875, + "learning_rate": 1.9999280587875504e-05, + "loss": 1.1396, + "step": 1378 + }, + { + "epoch": 0.22513366801355048, + "grad_norm": 2.0332608222961426, + "learning_rate": 1.9999276776473152e-05, + "loss": 0.9794, + "step": 1379 + }, + { + "epoch": 0.22529692665605486, + "grad_norm": 2.266589403152466, + "learning_rate": 1.999927295500153e-05, + "loss": 1.0102, + "step": 1380 + }, + { + "epoch": 0.22546018529855924, + "grad_norm": 2.2956466674804688, + "learning_rate": 1.9999269123460644e-05, + "loss": 0.7885, + "step": 1381 + }, + { + "epoch": 0.22562344394106362, + "grad_norm": 2.3948919773101807, + "learning_rate": 1.9999265281850495e-05, + "loss": 1.0625, + "step": 1382 + }, + { + "epoch": 0.225786702583568, + "grad_norm": 2.183424949645996, + "learning_rate": 1.999926143017109e-05, + "loss": 1.0145, + "step": 1383 + }, + { + "epoch": 0.22594996122607242, + "grad_norm": 2.180690288543701, + "learning_rate": 1.999925756842243e-05, + "loss": 0.9044, + "step": 1384 + }, + { + "epoch": 0.2261132198685768, + "grad_norm": 2.635434865951538, + "learning_rate": 1.9999253696604522e-05, + "loss": 1.016, + "step": 1385 + }, + { + "epoch": 0.22627647851108118, + "grad_norm": 2.303046226501465, + "learning_rate": 1.9999249814717364e-05, + "loss": 1.0416, + "step": 1386 + }, + { + "epoch": 0.22643973715358556, + "grad_norm": 2.290841817855835, + "learning_rate": 1.999924592276097e-05, + "loss": 1.145, + "step": 1387 + }, + { + "epoch": 0.22660299579608995, + "grad_norm": 2.2113678455352783, + "learning_rate": 1.9999242020735338e-05, + "loss": 0.9243, + "step": 1388 + }, + { + "epoch": 0.22676625443859436, + "grad_norm": 2.449328899383545, + "learning_rate": 1.9999238108640467e-05, + "loss": 1.0628, + "step": 1389 + }, + { + "epoch": 0.22692951308109874, + "grad_norm": 2.423537254333496, + "learning_rate": 1.9999234186476366e-05, + "loss": 1.0556, + "step": 1390 + }, + { + "epoch": 0.22709277172360312, + "grad_norm": 2.016934633255005, + "learning_rate": 1.999923025424304e-05, + "loss": 0.9526, + "step": 1391 + }, + { + "epoch": 0.2272560303661075, + "grad_norm": 2.370967388153076, + "learning_rate": 1.9999226311940494e-05, + "loss": 1.6729, + "step": 1392 + }, + { + "epoch": 0.22741928900861189, + "grad_norm": 2.3866734504699707, + "learning_rate": 1.9999222359568732e-05, + "loss": 1.0401, + "step": 1393 + }, + { + "epoch": 0.22758254765111627, + "grad_norm": 2.1386513710021973, + "learning_rate": 1.999921839712775e-05, + "loss": 0.9644, + "step": 1394 + }, + { + "epoch": 0.22774580629362068, + "grad_norm": 2.1274666786193848, + "learning_rate": 1.999921442461756e-05, + "loss": 0.977, + "step": 1395 + }, + { + "epoch": 0.22790906493612506, + "grad_norm": 2.0370278358459473, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.8958, + "step": 1396 + }, + { + "epoch": 0.22807232357862944, + "grad_norm": 2.118417501449585, + "learning_rate": 1.9999206449389566e-05, + "loss": 0.8765, + "step": 1397 + }, + { + "epoch": 0.22823558222113383, + "grad_norm": 2.3906350135803223, + "learning_rate": 1.9999202446671768e-05, + "loss": 0.9643, + "step": 1398 + }, + { + "epoch": 0.2283988408636382, + "grad_norm": 2.0891644954681396, + "learning_rate": 1.9999198433884778e-05, + "loss": 0.8123, + "step": 1399 + }, + { + "epoch": 0.22856209950614262, + "grad_norm": 2.760629892349243, + "learning_rate": 1.9999194411028596e-05, + "loss": 1.0803, + "step": 1400 + }, + { + "epoch": 0.228725358148647, + "grad_norm": 1.9869005680084229, + "learning_rate": 1.9999190378103228e-05, + "loss": 1.0466, + "step": 1401 + }, + { + "epoch": 0.22888861679115138, + "grad_norm": 2.286304473876953, + "learning_rate": 1.999918633510868e-05, + "loss": 0.8207, + "step": 1402 + }, + { + "epoch": 0.22905187543365577, + "grad_norm": 2.0858097076416016, + "learning_rate": 1.999918228204495e-05, + "loss": 0.8137, + "step": 1403 + }, + { + "epoch": 0.22921513407616015, + "grad_norm": 2.5562191009521484, + "learning_rate": 1.999917821891205e-05, + "loss": 0.9105, + "step": 1404 + }, + { + "epoch": 0.22937839271866453, + "grad_norm": 2.386957883834839, + "learning_rate": 1.9999174145709978e-05, + "loss": 1.002, + "step": 1405 + }, + { + "epoch": 0.22954165136116894, + "grad_norm": 2.51273775100708, + "learning_rate": 1.999917006243874e-05, + "loss": 1.0689, + "step": 1406 + }, + { + "epoch": 0.22970491000367332, + "grad_norm": 1.989636778831482, + "learning_rate": 1.9999165969098344e-05, + "loss": 0.8451, + "step": 1407 + }, + { + "epoch": 0.2298681686461777, + "grad_norm": 2.369889736175537, + "learning_rate": 1.9999161865688787e-05, + "loss": 1.1861, + "step": 1408 + }, + { + "epoch": 0.2300314272886821, + "grad_norm": 2.5874602794647217, + "learning_rate": 1.9999157752210078e-05, + "loss": 0.9587, + "step": 1409 + }, + { + "epoch": 0.23019468593118647, + "grad_norm": 2.3225300312042236, + "learning_rate": 1.9999153628662217e-05, + "loss": 1.0108, + "step": 1410 + }, + { + "epoch": 0.23035794457369088, + "grad_norm": 2.2423036098480225, + "learning_rate": 1.9999149495045215e-05, + "loss": 0.9892, + "step": 1411 + }, + { + "epoch": 0.23052120321619526, + "grad_norm": 2.851832628250122, + "learning_rate": 1.999914535135907e-05, + "loss": 0.9404, + "step": 1412 + }, + { + "epoch": 0.23068446185869965, + "grad_norm": 2.4121601581573486, + "learning_rate": 1.999914119760379e-05, + "loss": 0.973, + "step": 1413 + }, + { + "epoch": 0.23084772050120403, + "grad_norm": 2.440477132797241, + "learning_rate": 1.9999137033779377e-05, + "loss": 1.0385, + "step": 1414 + }, + { + "epoch": 0.2310109791437084, + "grad_norm": 2.185458183288574, + "learning_rate": 1.9999132859885836e-05, + "loss": 0.9599, + "step": 1415 + }, + { + "epoch": 0.2311742377862128, + "grad_norm": 2.3907816410064697, + "learning_rate": 1.9999128675923167e-05, + "loss": 0.8458, + "step": 1416 + }, + { + "epoch": 0.2313374964287172, + "grad_norm": 2.205493450164795, + "learning_rate": 1.999912448189138e-05, + "loss": 1.3449, + "step": 1417 + }, + { + "epoch": 0.2315007550712216, + "grad_norm": 2.369933843612671, + "learning_rate": 1.9999120277790477e-05, + "loss": 1.0692, + "step": 1418 + }, + { + "epoch": 0.23166401371372597, + "grad_norm": 2.2691874504089355, + "learning_rate": 1.9999116063620465e-05, + "loss": 1.3118, + "step": 1419 + }, + { + "epoch": 0.23182727235623035, + "grad_norm": 2.6719963550567627, + "learning_rate": 1.9999111839381346e-05, + "loss": 0.9667, + "step": 1420 + }, + { + "epoch": 0.23199053099873473, + "grad_norm": 2.344728946685791, + "learning_rate": 1.9999107605073123e-05, + "loss": 0.9749, + "step": 1421 + }, + { + "epoch": 0.23215378964123914, + "grad_norm": 2.1785640716552734, + "learning_rate": 1.9999103360695802e-05, + "loss": 1.0146, + "step": 1422 + }, + { + "epoch": 0.23231704828374353, + "grad_norm": 2.3754987716674805, + "learning_rate": 1.9999099106249384e-05, + "loss": 1.1816, + "step": 1423 + }, + { + "epoch": 0.2324803069262479, + "grad_norm": 2.326988458633423, + "learning_rate": 1.999909484173388e-05, + "loss": 1.1123, + "step": 1424 + }, + { + "epoch": 0.2326435655687523, + "grad_norm": 2.1828572750091553, + "learning_rate": 1.9999090567149283e-05, + "loss": 0.9252, + "step": 1425 + }, + { + "epoch": 0.23280682421125667, + "grad_norm": 2.2992196083068848, + "learning_rate": 1.999908628249561e-05, + "loss": 1.1553, + "step": 1426 + }, + { + "epoch": 0.23297008285376108, + "grad_norm": 2.4465243816375732, + "learning_rate": 1.999908198777286e-05, + "loss": 1.0558, + "step": 1427 + }, + { + "epoch": 0.23313334149626547, + "grad_norm": 2.6177003383636475, + "learning_rate": 1.9999077682981033e-05, + "loss": 0.9485, + "step": 1428 + }, + { + "epoch": 0.23329660013876985, + "grad_norm": 2.356412649154663, + "learning_rate": 1.9999073368120142e-05, + "loss": 1.0065, + "step": 1429 + }, + { + "epoch": 0.23345985878127423, + "grad_norm": 2.119990110397339, + "learning_rate": 1.9999069043190184e-05, + "loss": 0.9982, + "step": 1430 + }, + { + "epoch": 0.23362311742377861, + "grad_norm": 2.2722253799438477, + "learning_rate": 1.9999064708191167e-05, + "loss": 1.077, + "step": 1431 + }, + { + "epoch": 0.233786376066283, + "grad_norm": 2.3732821941375732, + "learning_rate": 1.9999060363123096e-05, + "loss": 0.9463, + "step": 1432 + }, + { + "epoch": 0.2339496347087874, + "grad_norm": 2.402902603149414, + "learning_rate": 1.999905600798597e-05, + "loss": 0.9248, + "step": 1433 + }, + { + "epoch": 0.2341128933512918, + "grad_norm": 2.02632212638855, + "learning_rate": 1.99990516427798e-05, + "loss": 0.9383, + "step": 1434 + }, + { + "epoch": 0.23427615199379617, + "grad_norm": 2.4528942108154297, + "learning_rate": 1.9999047267504587e-05, + "loss": 0.974, + "step": 1435 + }, + { + "epoch": 0.23443941063630055, + "grad_norm": 2.2176969051361084, + "learning_rate": 1.9999042882160336e-05, + "loss": 1.0099, + "step": 1436 + }, + { + "epoch": 0.23460266927880494, + "grad_norm": 2.0852878093719482, + "learning_rate": 1.9999038486747053e-05, + "loss": 0.8545, + "step": 1437 + }, + { + "epoch": 0.23476592792130935, + "grad_norm": 2.4913487434387207, + "learning_rate": 1.999903408126474e-05, + "loss": 1.051, + "step": 1438 + }, + { + "epoch": 0.23492918656381373, + "grad_norm": 2.172224283218384, + "learning_rate": 1.99990296657134e-05, + "loss": 0.8908, + "step": 1439 + }, + { + "epoch": 0.2350924452063181, + "grad_norm": 2.3027777671813965, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.7996, + "step": 1440 + }, + { + "epoch": 0.2352557038488225, + "grad_norm": 2.466632604598999, + "learning_rate": 1.999902080440367e-05, + "loss": 1.0025, + "step": 1441 + }, + { + "epoch": 0.23541896249132688, + "grad_norm": 2.1049580574035645, + "learning_rate": 1.9999016358645283e-05, + "loss": 0.7542, + "step": 1442 + }, + { + "epoch": 0.23558222113383126, + "grad_norm": 2.0810844898223877, + "learning_rate": 1.999901190281789e-05, + "loss": 0.9617, + "step": 1443 + }, + { + "epoch": 0.23574547977633567, + "grad_norm": 2.6055381298065186, + "learning_rate": 1.9999007436921497e-05, + "loss": 1.1758, + "step": 1444 + }, + { + "epoch": 0.23590873841884005, + "grad_norm": 2.4072861671447754, + "learning_rate": 1.9999002960956105e-05, + "loss": 0.9462, + "step": 1445 + }, + { + "epoch": 0.23607199706134444, + "grad_norm": 2.1017234325408936, + "learning_rate": 1.999899847492172e-05, + "loss": 0.7997, + "step": 1446 + }, + { + "epoch": 0.23623525570384882, + "grad_norm": 2.5558464527130127, + "learning_rate": 1.9998993978818345e-05, + "loss": 1.0884, + "step": 1447 + }, + { + "epoch": 0.2363985143463532, + "grad_norm": 2.5460708141326904, + "learning_rate": 1.999898947264599e-05, + "loss": 1.1385, + "step": 1448 + }, + { + "epoch": 0.2365617729888576, + "grad_norm": 2.0711004734039307, + "learning_rate": 1.999898495640465e-05, + "loss": 0.8552, + "step": 1449 + }, + { + "epoch": 0.236725031631362, + "grad_norm": 2.5267064571380615, + "learning_rate": 1.9998980430094333e-05, + "loss": 1.0751, + "step": 1450 + }, + { + "epoch": 0.23688829027386638, + "grad_norm": 1.9658653736114502, + "learning_rate": 1.999897589371505e-05, + "loss": 0.9529, + "step": 1451 + }, + { + "epoch": 0.23705154891637076, + "grad_norm": 2.2422935962677, + "learning_rate": 1.99989713472668e-05, + "loss": 0.955, + "step": 1452 + }, + { + "epoch": 0.23721480755887514, + "grad_norm": 2.404816150665283, + "learning_rate": 1.9998966790749586e-05, + "loss": 1.1408, + "step": 1453 + }, + { + "epoch": 0.23737806620137952, + "grad_norm": 2.3228487968444824, + "learning_rate": 1.999896222416342e-05, + "loss": 0.8905, + "step": 1454 + }, + { + "epoch": 0.23754132484388393, + "grad_norm": 2.3918845653533936, + "learning_rate": 1.99989576475083e-05, + "loss": 1.0516, + "step": 1455 + }, + { + "epoch": 0.23770458348638832, + "grad_norm": 2.2459566593170166, + "learning_rate": 1.999895306078423e-05, + "loss": 0.9331, + "step": 1456 + }, + { + "epoch": 0.2378678421288927, + "grad_norm": 2.4983103275299072, + "learning_rate": 1.999894846399122e-05, + "loss": 1.1322, + "step": 1457 + }, + { + "epoch": 0.23803110077139708, + "grad_norm": 2.425989866256714, + "learning_rate": 1.999894385712927e-05, + "loss": 0.8786, + "step": 1458 + }, + { + "epoch": 0.23819435941390146, + "grad_norm": 2.1459083557128906, + "learning_rate": 1.9998939240198384e-05, + "loss": 1.5598, + "step": 1459 + }, + { + "epoch": 0.23835761805640587, + "grad_norm": 2.112872362136841, + "learning_rate": 1.999893461319857e-05, + "loss": 1.0029, + "step": 1460 + }, + { + "epoch": 0.23852087669891026, + "grad_norm": 2.5439507961273193, + "learning_rate": 1.9998929976129834e-05, + "loss": 1.072, + "step": 1461 + }, + { + "epoch": 0.23868413534141464, + "grad_norm": 2.1317644119262695, + "learning_rate": 1.9998925328992175e-05, + "loss": 0.9668, + "step": 1462 + }, + { + "epoch": 0.23884739398391902, + "grad_norm": 2.0046586990356445, + "learning_rate": 1.9998920671785602e-05, + "loss": 0.8435, + "step": 1463 + }, + { + "epoch": 0.2390106526264234, + "grad_norm": 2.114110231399536, + "learning_rate": 1.999891600451012e-05, + "loss": 0.9386, + "step": 1464 + }, + { + "epoch": 0.23917391126892779, + "grad_norm": 1.9382222890853882, + "learning_rate": 1.999891132716573e-05, + "loss": 0.903, + "step": 1465 + }, + { + "epoch": 0.2393371699114322, + "grad_norm": 2.161062240600586, + "learning_rate": 1.999890663975244e-05, + "loss": 0.9577, + "step": 1466 + }, + { + "epoch": 0.23950042855393658, + "grad_norm": 2.14208722114563, + "learning_rate": 1.9998901942270254e-05, + "loss": 1.0309, + "step": 1467 + }, + { + "epoch": 0.23966368719644096, + "grad_norm": 2.3023059368133545, + "learning_rate": 1.9998897234719177e-05, + "loss": 0.9797, + "step": 1468 + }, + { + "epoch": 0.23982694583894534, + "grad_norm": 2.7637124061584473, + "learning_rate": 1.9998892517099212e-05, + "loss": 1.211, + "step": 1469 + }, + { + "epoch": 0.23999020448144973, + "grad_norm": 2.2507212162017822, + "learning_rate": 1.9998887789410363e-05, + "loss": 0.9452, + "step": 1470 + }, + { + "epoch": 0.24015346312395414, + "grad_norm": 2.3210856914520264, + "learning_rate": 1.999888305165264e-05, + "loss": 1.0974, + "step": 1471 + }, + { + "epoch": 0.24031672176645852, + "grad_norm": 2.1683733463287354, + "learning_rate": 1.9998878303826045e-05, + "loss": 0.9439, + "step": 1472 + }, + { + "epoch": 0.2404799804089629, + "grad_norm": 2.733599901199341, + "learning_rate": 1.999887354593058e-05, + "loss": 1.0075, + "step": 1473 + }, + { + "epoch": 0.24064323905146728, + "grad_norm": 2.223419189453125, + "learning_rate": 1.999886877796625e-05, + "loss": 0.9918, + "step": 1474 + }, + { + "epoch": 0.24080649769397167, + "grad_norm": 2.3882501125335693, + "learning_rate": 1.9998863999933065e-05, + "loss": 1.0608, + "step": 1475 + }, + { + "epoch": 0.24096975633647605, + "grad_norm": 2.3184542655944824, + "learning_rate": 1.9998859211831024e-05, + "loss": 0.8315, + "step": 1476 + }, + { + "epoch": 0.24113301497898046, + "grad_norm": 3.0580360889434814, + "learning_rate": 1.9998854413660137e-05, + "loss": 1.3356, + "step": 1477 + }, + { + "epoch": 0.24129627362148484, + "grad_norm": 2.5476059913635254, + "learning_rate": 1.9998849605420404e-05, + "loss": 0.8755, + "step": 1478 + }, + { + "epoch": 0.24145953226398922, + "grad_norm": 2.3434176445007324, + "learning_rate": 1.9998844787111834e-05, + "loss": 0.8769, + "step": 1479 + }, + { + "epoch": 0.2416227909064936, + "grad_norm": 2.677049160003662, + "learning_rate": 1.999883995873443e-05, + "loss": 1.0425, + "step": 1480 + }, + { + "epoch": 0.241786049548998, + "grad_norm": 2.4634435176849365, + "learning_rate": 1.9998835120288197e-05, + "loss": 1.0394, + "step": 1481 + }, + { + "epoch": 0.2419493081915024, + "grad_norm": 2.212402820587158, + "learning_rate": 1.999883027177314e-05, + "loss": 0.7754, + "step": 1482 + }, + { + "epoch": 0.24211256683400678, + "grad_norm": 2.1681182384490967, + "learning_rate": 1.9998825413189262e-05, + "loss": 1.0417, + "step": 1483 + }, + { + "epoch": 0.24227582547651116, + "grad_norm": 2.4645488262176514, + "learning_rate": 1.999882054453657e-05, + "loss": 1.0478, + "step": 1484 + }, + { + "epoch": 0.24243908411901555, + "grad_norm": 2.2799720764160156, + "learning_rate": 1.9998815665815066e-05, + "loss": 1.0213, + "step": 1485 + }, + { + "epoch": 0.24260234276151993, + "grad_norm": 2.281770944595337, + "learning_rate": 1.9998810777024762e-05, + "loss": 0.9329, + "step": 1486 + }, + { + "epoch": 0.24276560140402434, + "grad_norm": 2.2412912845611572, + "learning_rate": 1.9998805878165656e-05, + "loss": 0.9762, + "step": 1487 + }, + { + "epoch": 0.24292886004652872, + "grad_norm": 2.2016985416412354, + "learning_rate": 1.9998800969237754e-05, + "loss": 0.8546, + "step": 1488 + }, + { + "epoch": 0.2430921186890331, + "grad_norm": 2.065603733062744, + "learning_rate": 1.9998796050241066e-05, + "loss": 1.0957, + "step": 1489 + }, + { + "epoch": 0.2432553773315375, + "grad_norm": 2.2019879817962646, + "learning_rate": 1.999879112117559e-05, + "loss": 0.9278, + "step": 1490 + }, + { + "epoch": 0.24341863597404187, + "grad_norm": 2.5850536823272705, + "learning_rate": 1.9998786182041333e-05, + "loss": 1.2565, + "step": 1491 + }, + { + "epoch": 0.24358189461654625, + "grad_norm": 2.2490878105163574, + "learning_rate": 1.9998781232838302e-05, + "loss": 0.8471, + "step": 1492 + }, + { + "epoch": 0.24374515325905066, + "grad_norm": 2.3886966705322266, + "learning_rate": 1.99987762735665e-05, + "loss": 0.9601, + "step": 1493 + }, + { + "epoch": 0.24390841190155504, + "grad_norm": 2.222541093826294, + "learning_rate": 1.9998771304225933e-05, + "loss": 1.0927, + "step": 1494 + }, + { + "epoch": 0.24407167054405943, + "grad_norm": 2.399160623550415, + "learning_rate": 1.9998766324816606e-05, + "loss": 0.9948, + "step": 1495 + }, + { + "epoch": 0.2442349291865638, + "grad_norm": 2.2189831733703613, + "learning_rate": 1.9998761335338527e-05, + "loss": 1.0703, + "step": 1496 + }, + { + "epoch": 0.2443981878290682, + "grad_norm": 2.16328763961792, + "learning_rate": 1.9998756335791696e-05, + "loss": 0.9389, + "step": 1497 + }, + { + "epoch": 0.2445614464715726, + "grad_norm": 2.092993974685669, + "learning_rate": 1.999875132617612e-05, + "loss": 0.8331, + "step": 1498 + }, + { + "epoch": 0.24472470511407698, + "grad_norm": 2.309347629547119, + "learning_rate": 1.9998746306491802e-05, + "loss": 0.9673, + "step": 1499 + }, + { + "epoch": 0.24488796375658137, + "grad_norm": 2.4202919006347656, + "learning_rate": 1.9998741276738753e-05, + "loss": 0.9527, + "step": 1500 + }, + { + "epoch": 0.24505122239908575, + "grad_norm": 2.4826650619506836, + "learning_rate": 1.9998736236916973e-05, + "loss": 1.1269, + "step": 1501 + }, + { + "epoch": 0.24521448104159013, + "grad_norm": 2.2730469703674316, + "learning_rate": 1.9998731187026464e-05, + "loss": 1.1311, + "step": 1502 + }, + { + "epoch": 0.24537773968409451, + "grad_norm": 2.3420541286468506, + "learning_rate": 1.999872612706724e-05, + "loss": 1.0369, + "step": 1503 + }, + { + "epoch": 0.24554099832659892, + "grad_norm": 2.153437852859497, + "learning_rate": 1.99987210570393e-05, + "loss": 0.9874, + "step": 1504 + }, + { + "epoch": 0.2457042569691033, + "grad_norm": 2.032949209213257, + "learning_rate": 1.999871597694265e-05, + "loss": 1.045, + "step": 1505 + }, + { + "epoch": 0.2458675156116077, + "grad_norm": 2.3785126209259033, + "learning_rate": 1.9998710886777298e-05, + "loss": 0.921, + "step": 1506 + }, + { + "epoch": 0.24603077425411207, + "grad_norm": 2.154026985168457, + "learning_rate": 1.9998705786543247e-05, + "loss": 0.9143, + "step": 1507 + }, + { + "epoch": 0.24619403289661645, + "grad_norm": 2.1284878253936768, + "learning_rate": 1.99987006762405e-05, + "loss": 1.0371, + "step": 1508 + }, + { + "epoch": 0.24635729153912087, + "grad_norm": 2.11383056640625, + "learning_rate": 1.9998695555869063e-05, + "loss": 1.047, + "step": 1509 + }, + { + "epoch": 0.24652055018162525, + "grad_norm": 2.1327617168426514, + "learning_rate": 1.9998690425428943e-05, + "loss": 0.9771, + "step": 1510 + }, + { + "epoch": 0.24668380882412963, + "grad_norm": 2.110987424850464, + "learning_rate": 1.9998685284920146e-05, + "loss": 0.8581, + "step": 1511 + }, + { + "epoch": 0.246847067466634, + "grad_norm": 2.160046100616455, + "learning_rate": 1.9998680134342675e-05, + "loss": 0.861, + "step": 1512 + }, + { + "epoch": 0.2470103261091384, + "grad_norm": 2.345703125, + "learning_rate": 1.9998674973696534e-05, + "loss": 1.0274, + "step": 1513 + }, + { + "epoch": 0.24717358475164278, + "grad_norm": 1.9081501960754395, + "learning_rate": 1.999866980298173e-05, + "loss": 0.8382, + "step": 1514 + }, + { + "epoch": 0.2473368433941472, + "grad_norm": 2.9701004028320312, + "learning_rate": 1.999866462219827e-05, + "loss": 0.9373, + "step": 1515 + }, + { + "epoch": 0.24750010203665157, + "grad_norm": 2.34753155708313, + "learning_rate": 1.9998659431346158e-05, + "loss": 0.891, + "step": 1516 + }, + { + "epoch": 0.24766336067915595, + "grad_norm": 2.2245185375213623, + "learning_rate": 1.9998654230425396e-05, + "loss": 1.0074, + "step": 1517 + }, + { + "epoch": 0.24782661932166034, + "grad_norm": 2.178072929382324, + "learning_rate": 1.9998649019435994e-05, + "loss": 1.0016, + "step": 1518 + }, + { + "epoch": 0.24798987796416472, + "grad_norm": 2.284916400909424, + "learning_rate": 1.999864379837795e-05, + "loss": 0.7719, + "step": 1519 + }, + { + "epoch": 0.24815313660666913, + "grad_norm": 2.2304069995880127, + "learning_rate": 1.9998638567251283e-05, + "loss": 0.8286, + "step": 1520 + }, + { + "epoch": 0.2483163952491735, + "grad_norm": 2.528050661087036, + "learning_rate": 1.9998633326055984e-05, + "loss": 0.9881, + "step": 1521 + }, + { + "epoch": 0.2484796538916779, + "grad_norm": 2.3047289848327637, + "learning_rate": 1.9998628074792066e-05, + "loss": 1.0666, + "step": 1522 + }, + { + "epoch": 0.24864291253418228, + "grad_norm": 1.966029405593872, + "learning_rate": 1.999862281345953e-05, + "loss": 0.9146, + "step": 1523 + }, + { + "epoch": 0.24880617117668666, + "grad_norm": 2.1465721130371094, + "learning_rate": 1.9998617542058384e-05, + "loss": 0.8277, + "step": 1524 + }, + { + "epoch": 0.24896942981919104, + "grad_norm": 2.6145849227905273, + "learning_rate": 1.9998612260588634e-05, + "loss": 1.1829, + "step": 1525 + }, + { + "epoch": 0.24913268846169545, + "grad_norm": 2.1193745136260986, + "learning_rate": 1.9998606969050285e-05, + "loss": 0.9246, + "step": 1526 + }, + { + "epoch": 0.24929594710419983, + "grad_norm": 2.849768877029419, + "learning_rate": 1.999860166744334e-05, + "loss": 0.9778, + "step": 1527 + }, + { + "epoch": 0.24945920574670422, + "grad_norm": 2.1113669872283936, + "learning_rate": 1.9998596355767805e-05, + "loss": 0.8719, + "step": 1528 + }, + { + "epoch": 0.2496224643892086, + "grad_norm": 2.4507896900177, + "learning_rate": 1.9998591034023688e-05, + "loss": 1.0836, + "step": 1529 + }, + { + "epoch": 0.24978572303171298, + "grad_norm": 2.362757444381714, + "learning_rate": 1.9998585702210992e-05, + "loss": 1.0842, + "step": 1530 + }, + { + "epoch": 0.2499489816742174, + "grad_norm": 2.245173692703247, + "learning_rate": 1.999858036032972e-05, + "loss": 0.878, + "step": 1531 + }, + { + "epoch": 0.25011224031672175, + "grad_norm": 2.061924457550049, + "learning_rate": 1.9998575008379887e-05, + "loss": 0.9249, + "step": 1532 + }, + { + "epoch": 0.25027549895922613, + "grad_norm": 2.1389143466949463, + "learning_rate": 1.9998569646361484e-05, + "loss": 0.9873, + "step": 1533 + }, + { + "epoch": 0.25043875760173057, + "grad_norm": 2.0788917541503906, + "learning_rate": 1.9998564274274527e-05, + "loss": 0.9244, + "step": 1534 + }, + { + "epoch": 0.25060201624423495, + "grad_norm": 2.3605082035064697, + "learning_rate": 1.9998558892119017e-05, + "loss": 1.0441, + "step": 1535 + }, + { + "epoch": 0.25076527488673933, + "grad_norm": 2.5390236377716064, + "learning_rate": 1.9998553499894963e-05, + "loss": 1.1871, + "step": 1536 + }, + { + "epoch": 0.2509285335292437, + "grad_norm": 2.2017407417297363, + "learning_rate": 1.999854809760237e-05, + "loss": 0.8647, + "step": 1537 + }, + { + "epoch": 0.2510917921717481, + "grad_norm": 2.0021510124206543, + "learning_rate": 1.999854268524124e-05, + "loss": 0.8848, + "step": 1538 + }, + { + "epoch": 0.2512550508142525, + "grad_norm": 2.070573091506958, + "learning_rate": 1.999853726281158e-05, + "loss": 0.9853, + "step": 1539 + }, + { + "epoch": 0.25141830945675686, + "grad_norm": 2.2029378414154053, + "learning_rate": 1.9998531830313394e-05, + "loss": 0.9324, + "step": 1540 + }, + { + "epoch": 0.25158156809926124, + "grad_norm": 2.452939748764038, + "learning_rate": 1.9998526387746692e-05, + "loss": 0.9051, + "step": 1541 + }, + { + "epoch": 0.2517448267417656, + "grad_norm": 1.935509443283081, + "learning_rate": 1.999852093511147e-05, + "loss": 0.8392, + "step": 1542 + }, + { + "epoch": 0.25190808538427, + "grad_norm": 2.1782240867614746, + "learning_rate": 1.9998515472407747e-05, + "loss": 0.8205, + "step": 1543 + }, + { + "epoch": 0.2520713440267744, + "grad_norm": 2.0976052284240723, + "learning_rate": 1.999850999963552e-05, + "loss": 0.9226, + "step": 1544 + }, + { + "epoch": 0.25223460266927883, + "grad_norm": 2.405000925064087, + "learning_rate": 1.9998504516794797e-05, + "loss": 1.0881, + "step": 1545 + }, + { + "epoch": 0.2523978613117832, + "grad_norm": 2.09303879737854, + "learning_rate": 1.999849902388558e-05, + "loss": 0.8997, + "step": 1546 + }, + { + "epoch": 0.2525611199542876, + "grad_norm": 2.3805785179138184, + "learning_rate": 1.999849352090788e-05, + "loss": 0.8635, + "step": 1547 + }, + { + "epoch": 0.252724378596792, + "grad_norm": 2.4358837604522705, + "learning_rate": 1.9998488007861695e-05, + "loss": 1.0361, + "step": 1548 + }, + { + "epoch": 0.25288763723929636, + "grad_norm": 2.5397839546203613, + "learning_rate": 1.999848248474704e-05, + "loss": 1.0601, + "step": 1549 + }, + { + "epoch": 0.25305089588180074, + "grad_norm": 2.1588454246520996, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.8145, + "step": 1550 + }, + { + "epoch": 0.2532141545243051, + "grad_norm": 2.297045946121216, + "learning_rate": 1.9998471408312326e-05, + "loss": 1.0361, + "step": 1551 + }, + { + "epoch": 0.2533774131668095, + "grad_norm": 2.5115957260131836, + "learning_rate": 1.9998465854992278e-05, + "loss": 1.0172, + "step": 1552 + }, + { + "epoch": 0.2535406718093139, + "grad_norm": 2.2926783561706543, + "learning_rate": 1.9998460291603776e-05, + "loss": 1.0654, + "step": 1553 + }, + { + "epoch": 0.25370393045181827, + "grad_norm": 2.562986135482788, + "learning_rate": 1.999845471814683e-05, + "loss": 1.841, + "step": 1554 + }, + { + "epoch": 0.25386718909432265, + "grad_norm": 2.257481336593628, + "learning_rate": 1.9998449134621442e-05, + "loss": 0.9736, + "step": 1555 + }, + { + "epoch": 0.2540304477368271, + "grad_norm": 2.180178642272949, + "learning_rate": 1.999844354102762e-05, + "loss": 0.9232, + "step": 1556 + }, + { + "epoch": 0.2541937063793315, + "grad_norm": 2.0806853771209717, + "learning_rate": 1.9998437937365365e-05, + "loss": 0.9799, + "step": 1557 + }, + { + "epoch": 0.25435696502183586, + "grad_norm": 2.0185790061950684, + "learning_rate": 1.9998432323634683e-05, + "loss": 0.8551, + "step": 1558 + }, + { + "epoch": 0.25452022366434024, + "grad_norm": 1.8499635457992554, + "learning_rate": 1.9998426699835588e-05, + "loss": 0.7678, + "step": 1559 + }, + { + "epoch": 0.2546834823068446, + "grad_norm": 2.428950309753418, + "learning_rate": 1.999842106596808e-05, + "loss": 1.0661, + "step": 1560 + }, + { + "epoch": 0.254846740949349, + "grad_norm": 3.280778169631958, + "learning_rate": 1.9998415422032163e-05, + "loss": 1.0602, + "step": 1561 + }, + { + "epoch": 0.2550099995918534, + "grad_norm": 1.896915316581726, + "learning_rate": 1.9998409768027846e-05, + "loss": 0.961, + "step": 1562 + }, + { + "epoch": 0.25517325823435777, + "grad_norm": 2.0367724895477295, + "learning_rate": 1.9998404103955126e-05, + "loss": 0.9002, + "step": 1563 + }, + { + "epoch": 0.25533651687686215, + "grad_norm": 2.1089634895324707, + "learning_rate": 1.9998398429814024e-05, + "loss": 0.8505, + "step": 1564 + }, + { + "epoch": 0.25549977551936653, + "grad_norm": 2.0980021953582764, + "learning_rate": 1.9998392745604533e-05, + "loss": 0.8176, + "step": 1565 + }, + { + "epoch": 0.2556630341618709, + "grad_norm": 2.2106893062591553, + "learning_rate": 1.9998387051326665e-05, + "loss": 0.8366, + "step": 1566 + }, + { + "epoch": 0.25582629280437535, + "grad_norm": 2.1966288089752197, + "learning_rate": 1.9998381346980423e-05, + "loss": 1.1024, + "step": 1567 + }, + { + "epoch": 0.25598955144687974, + "grad_norm": 2.3940885066986084, + "learning_rate": 1.9998375632565814e-05, + "loss": 1.1565, + "step": 1568 + }, + { + "epoch": 0.2561528100893841, + "grad_norm": 1.9346933364868164, + "learning_rate": 1.9998369908082844e-05, + "loss": 1.0515, + "step": 1569 + }, + { + "epoch": 0.2563160687318885, + "grad_norm": 2.501025438308716, + "learning_rate": 1.9998364173531514e-05, + "loss": 1.052, + "step": 1570 + }, + { + "epoch": 0.2564793273743929, + "grad_norm": 2.4516477584838867, + "learning_rate": 1.999835842891184e-05, + "loss": 1.1361, + "step": 1571 + }, + { + "epoch": 0.25664258601689727, + "grad_norm": 2.2060141563415527, + "learning_rate": 1.9998352674223816e-05, + "loss": 1.0832, + "step": 1572 + }, + { + "epoch": 0.25680584465940165, + "grad_norm": 2.3892993927001953, + "learning_rate": 1.999834690946746e-05, + "loss": 1.0129, + "step": 1573 + }, + { + "epoch": 0.25696910330190603, + "grad_norm": 2.552574634552002, + "learning_rate": 1.999834113464277e-05, + "loss": 0.959, + "step": 1574 + }, + { + "epoch": 0.2571323619444104, + "grad_norm": 2.1598026752471924, + "learning_rate": 1.999833534974975e-05, + "loss": 1.1556, + "step": 1575 + }, + { + "epoch": 0.2572956205869148, + "grad_norm": 2.3585972785949707, + "learning_rate": 1.9998329554788407e-05, + "loss": 1.1857, + "step": 1576 + }, + { + "epoch": 0.2574588792294192, + "grad_norm": 2.315092086791992, + "learning_rate": 1.9998323749758756e-05, + "loss": 1.0557, + "step": 1577 + }, + { + "epoch": 0.2576221378719236, + "grad_norm": 2.0442087650299072, + "learning_rate": 1.999831793466079e-05, + "loss": 0.9384, + "step": 1578 + }, + { + "epoch": 0.257785396514428, + "grad_norm": 2.3240437507629395, + "learning_rate": 1.9998312109494523e-05, + "loss": 1.1384, + "step": 1579 + }, + { + "epoch": 0.2579486551569324, + "grad_norm": 2.2552802562713623, + "learning_rate": 1.9998306274259955e-05, + "loss": 1.049, + "step": 1580 + }, + { + "epoch": 0.25811191379943677, + "grad_norm": 2.3245856761932373, + "learning_rate": 1.9998300428957096e-05, + "loss": 0.8941, + "step": 1581 + }, + { + "epoch": 0.25827517244194115, + "grad_norm": 1.9388028383255005, + "learning_rate": 1.9998294573585953e-05, + "loss": 0.9672, + "step": 1582 + }, + { + "epoch": 0.25843843108444553, + "grad_norm": 2.3898706436157227, + "learning_rate": 1.999828870814653e-05, + "loss": 1.0688, + "step": 1583 + }, + { + "epoch": 0.2586016897269499, + "grad_norm": 2.032569169998169, + "learning_rate": 1.9998282832638834e-05, + "loss": 0.943, + "step": 1584 + }, + { + "epoch": 0.2587649483694543, + "grad_norm": 2.325777530670166, + "learning_rate": 1.999827694706287e-05, + "loss": 0.9167, + "step": 1585 + }, + { + "epoch": 0.2589282070119587, + "grad_norm": 2.3224668502807617, + "learning_rate": 1.999827105141864e-05, + "loss": 0.9083, + "step": 1586 + }, + { + "epoch": 0.25909146565446306, + "grad_norm": 2.0769941806793213, + "learning_rate": 1.9998265145706156e-05, + "loss": 0.9112, + "step": 1587 + }, + { + "epoch": 0.2592547242969675, + "grad_norm": 2.6352624893188477, + "learning_rate": 1.9998259229925422e-05, + "loss": 1.0263, + "step": 1588 + }, + { + "epoch": 0.2594179829394719, + "grad_norm": 2.4069159030914307, + "learning_rate": 1.9998253304076442e-05, + "loss": 1.1865, + "step": 1589 + }, + { + "epoch": 0.25958124158197626, + "grad_norm": 1.9527581930160522, + "learning_rate": 1.9998247368159225e-05, + "loss": 0.9937, + "step": 1590 + }, + { + "epoch": 0.25974450022448065, + "grad_norm": 2.4401156902313232, + "learning_rate": 1.9998241422173774e-05, + "loss": 0.972, + "step": 1591 + }, + { + "epoch": 0.25990775886698503, + "grad_norm": 2.6888279914855957, + "learning_rate": 1.99982354661201e-05, + "loss": 0.8185, + "step": 1592 + }, + { + "epoch": 0.2600710175094894, + "grad_norm": 2.214204788208008, + "learning_rate": 1.9998229499998203e-05, + "loss": 0.9354, + "step": 1593 + }, + { + "epoch": 0.2602342761519938, + "grad_norm": 2.2476868629455566, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.8918, + "step": 1594 + }, + { + "epoch": 0.2603975347944982, + "grad_norm": 2.2604010105133057, + "learning_rate": 1.9998217537549772e-05, + "loss": 0.9494, + "step": 1595 + }, + { + "epoch": 0.26056079343700256, + "grad_norm": 2.822908878326416, + "learning_rate": 1.9998211541223253e-05, + "loss": 1.1784, + "step": 1596 + }, + { + "epoch": 0.26072405207950694, + "grad_norm": 2.354078531265259, + "learning_rate": 1.999820553482853e-05, + "loss": 1.0766, + "step": 1597 + }, + { + "epoch": 0.2608873107220113, + "grad_norm": 2.2730464935302734, + "learning_rate": 1.9998199518365622e-05, + "loss": 0.9455, + "step": 1598 + }, + { + "epoch": 0.26105056936451576, + "grad_norm": 2.1004371643066406, + "learning_rate": 1.999819349183453e-05, + "loss": 0.8128, + "step": 1599 + }, + { + "epoch": 0.26121382800702014, + "grad_norm": 2.533642053604126, + "learning_rate": 1.999818745523526e-05, + "loss": 0.9828, + "step": 1600 + }, + { + "epoch": 0.2613770866495245, + "grad_norm": 2.904257297515869, + "learning_rate": 1.9998181408567815e-05, + "loss": 0.9588, + "step": 1601 + }, + { + "epoch": 0.2615403452920289, + "grad_norm": 2.8340792655944824, + "learning_rate": 1.9998175351832207e-05, + "loss": 1.0453, + "step": 1602 + }, + { + "epoch": 0.2617036039345333, + "grad_norm": 2.5507748126983643, + "learning_rate": 1.9998169285028436e-05, + "loss": 1.0701, + "step": 1603 + }, + { + "epoch": 0.2618668625770377, + "grad_norm": 2.3377580642700195, + "learning_rate": 1.9998163208156517e-05, + "loss": 0.9628, + "step": 1604 + }, + { + "epoch": 0.26203012121954206, + "grad_norm": 2.3866794109344482, + "learning_rate": 1.9998157121216442e-05, + "loss": 1.0395, + "step": 1605 + }, + { + "epoch": 0.26219337986204644, + "grad_norm": 2.1547176837921143, + "learning_rate": 1.9998151024208232e-05, + "loss": 0.9158, + "step": 1606 + }, + { + "epoch": 0.2623566385045508, + "grad_norm": 2.375020980834961, + "learning_rate": 1.9998144917131884e-05, + "loss": 0.9389, + "step": 1607 + }, + { + "epoch": 0.2625198971470552, + "grad_norm": 2.4806644916534424, + "learning_rate": 1.9998138799987407e-05, + "loss": 1.0151, + "step": 1608 + }, + { + "epoch": 0.2626831557895596, + "grad_norm": 2.5218520164489746, + "learning_rate": 1.999813267277481e-05, + "loss": 0.9753, + "step": 1609 + }, + { + "epoch": 0.262846414432064, + "grad_norm": 2.500267744064331, + "learning_rate": 1.999812653549409e-05, + "loss": 0.9941, + "step": 1610 + }, + { + "epoch": 0.2630096730745684, + "grad_norm": 2.330549478530884, + "learning_rate": 1.9998120388145264e-05, + "loss": 0.9842, + "step": 1611 + }, + { + "epoch": 0.2631729317170728, + "grad_norm": 2.485522747039795, + "learning_rate": 1.999811423072833e-05, + "loss": 0.973, + "step": 1612 + }, + { + "epoch": 0.26333619035957717, + "grad_norm": 2.0989956855773926, + "learning_rate": 1.9998108063243298e-05, + "loss": 0.8529, + "step": 1613 + }, + { + "epoch": 0.26349944900208155, + "grad_norm": 2.3262698650360107, + "learning_rate": 1.9998101885690176e-05, + "loss": 0.9587, + "step": 1614 + }, + { + "epoch": 0.26366270764458594, + "grad_norm": 2.1522011756896973, + "learning_rate": 1.999809569806897e-05, + "loss": 1.0415, + "step": 1615 + }, + { + "epoch": 0.2638259662870903, + "grad_norm": 2.226195812225342, + "learning_rate": 1.999808950037968e-05, + "loss": 0.9189, + "step": 1616 + }, + { + "epoch": 0.2639892249295947, + "grad_norm": 2.1276137828826904, + "learning_rate": 1.9998083292622315e-05, + "loss": 0.873, + "step": 1617 + }, + { + "epoch": 0.2641524835720991, + "grad_norm": 1.9301470518112183, + "learning_rate": 1.999807707479689e-05, + "loss": 0.9848, + "step": 1618 + }, + { + "epoch": 0.26431574221460347, + "grad_norm": 2.3828012943267822, + "learning_rate": 1.9998070846903397e-05, + "loss": 0.9448, + "step": 1619 + }, + { + "epoch": 0.26447900085710785, + "grad_norm": 2.3664534091949463, + "learning_rate": 1.999806460894185e-05, + "loss": 1.0607, + "step": 1620 + }, + { + "epoch": 0.2646422594996123, + "grad_norm": 2.5018577575683594, + "learning_rate": 1.999805836091226e-05, + "loss": 1.04, + "step": 1621 + }, + { + "epoch": 0.26480551814211667, + "grad_norm": 1.9868546724319458, + "learning_rate": 1.9998052102814624e-05, + "loss": 1.0129, + "step": 1622 + }, + { + "epoch": 0.26496877678462105, + "grad_norm": 2.041656970977783, + "learning_rate": 1.9998045834648953e-05, + "loss": 0.9558, + "step": 1623 + }, + { + "epoch": 0.26513203542712543, + "grad_norm": 2.130643367767334, + "learning_rate": 1.9998039556415253e-05, + "loss": 0.9915, + "step": 1624 + }, + { + "epoch": 0.2652952940696298, + "grad_norm": 2.3569979667663574, + "learning_rate": 1.999803326811353e-05, + "loss": 1.0052, + "step": 1625 + }, + { + "epoch": 0.2654585527121342, + "grad_norm": 2.2775113582611084, + "learning_rate": 1.9998026969743788e-05, + "loss": 1.0191, + "step": 1626 + }, + { + "epoch": 0.2656218113546386, + "grad_norm": 2.1333043575286865, + "learning_rate": 1.9998020661306037e-05, + "loss": 0.9171, + "step": 1627 + }, + { + "epoch": 0.26578506999714296, + "grad_norm": 2.5165746212005615, + "learning_rate": 1.999801434280028e-05, + "loss": 1.1021, + "step": 1628 + }, + { + "epoch": 0.26594832863964735, + "grad_norm": 2.580007791519165, + "learning_rate": 1.9998008014226527e-05, + "loss": 1.1493, + "step": 1629 + }, + { + "epoch": 0.26611158728215173, + "grad_norm": 2.4853732585906982, + "learning_rate": 1.999800167558478e-05, + "loss": 1.1057, + "step": 1630 + }, + { + "epoch": 0.2662748459246561, + "grad_norm": 2.509490966796875, + "learning_rate": 1.9997995326875053e-05, + "loss": 0.9258, + "step": 1631 + }, + { + "epoch": 0.26643810456716055, + "grad_norm": 2.2175002098083496, + "learning_rate": 1.9997988968097345e-05, + "loss": 0.7514, + "step": 1632 + }, + { + "epoch": 0.26660136320966493, + "grad_norm": 2.173711061477661, + "learning_rate": 1.999798259925166e-05, + "loss": 0.9267, + "step": 1633 + }, + { + "epoch": 0.2667646218521693, + "grad_norm": 2.260765552520752, + "learning_rate": 1.9997976220338015e-05, + "loss": 1.0368, + "step": 1634 + }, + { + "epoch": 0.2669278804946737, + "grad_norm": 2.242966413497925, + "learning_rate": 1.999796983135641e-05, + "loss": 1.0133, + "step": 1635 + }, + { + "epoch": 0.2670911391371781, + "grad_norm": 2.297445297241211, + "learning_rate": 1.9997963432306852e-05, + "loss": 0.8132, + "step": 1636 + }, + { + "epoch": 0.26725439777968246, + "grad_norm": 2.109172821044922, + "learning_rate": 1.9997957023189346e-05, + "loss": 0.7029, + "step": 1637 + }, + { + "epoch": 0.26741765642218684, + "grad_norm": 2.336223602294922, + "learning_rate": 1.99979506040039e-05, + "loss": 0.9521, + "step": 1638 + }, + { + "epoch": 0.2675809150646912, + "grad_norm": 2.297769784927368, + "learning_rate": 1.999794417475052e-05, + "loss": 0.9215, + "step": 1639 + }, + { + "epoch": 0.2677441737071956, + "grad_norm": 2.0260095596313477, + "learning_rate": 1.999793773542922e-05, + "loss": 0.8628, + "step": 1640 + }, + { + "epoch": 0.2679074323497, + "grad_norm": 2.1957921981811523, + "learning_rate": 1.9997931286039992e-05, + "loss": 0.8311, + "step": 1641 + }, + { + "epoch": 0.2680706909922044, + "grad_norm": 2.393292188644409, + "learning_rate": 1.9997924826582847e-05, + "loss": 0.913, + "step": 1642 + }, + { + "epoch": 0.2682339496347088, + "grad_norm": 2.118877410888672, + "learning_rate": 1.99979183570578e-05, + "loss": 0.8079, + "step": 1643 + }, + { + "epoch": 0.2683972082772132, + "grad_norm": 2.3472061157226562, + "learning_rate": 1.999791187746485e-05, + "loss": 1.0146, + "step": 1644 + }, + { + "epoch": 0.2685604669197176, + "grad_norm": 2.4542300701141357, + "learning_rate": 1.9997905387804007e-05, + "loss": 1.0577, + "step": 1645 + }, + { + "epoch": 0.26872372556222196, + "grad_norm": 2.2689297199249268, + "learning_rate": 1.9997898888075273e-05, + "loss": 0.8602, + "step": 1646 + }, + { + "epoch": 0.26888698420472634, + "grad_norm": 2.413187026977539, + "learning_rate": 1.999789237827866e-05, + "loss": 1.0737, + "step": 1647 + }, + { + "epoch": 0.2690502428472307, + "grad_norm": 2.8695242404937744, + "learning_rate": 1.999788585841417e-05, + "loss": 1.1135, + "step": 1648 + }, + { + "epoch": 0.2692135014897351, + "grad_norm": 2.277801275253296, + "learning_rate": 1.9997879328481816e-05, + "loss": 0.8762, + "step": 1649 + }, + { + "epoch": 0.2693767601322395, + "grad_norm": 2.2017099857330322, + "learning_rate": 1.9997872788481595e-05, + "loss": 0.998, + "step": 1650 + }, + { + "epoch": 0.2695400187747439, + "grad_norm": 2.3423523902893066, + "learning_rate": 1.999786623841352e-05, + "loss": 0.9666, + "step": 1651 + }, + { + "epoch": 0.26970327741724825, + "grad_norm": 2.263124704360962, + "learning_rate": 1.9997859678277596e-05, + "loss": 0.9831, + "step": 1652 + }, + { + "epoch": 0.26986653605975264, + "grad_norm": 2.322350025177002, + "learning_rate": 1.9997853108073833e-05, + "loss": 1.1455, + "step": 1653 + }, + { + "epoch": 0.2700297947022571, + "grad_norm": 2.317052125930786, + "learning_rate": 1.999784652780223e-05, + "loss": 1.7202, + "step": 1654 + }, + { + "epoch": 0.27019305334476146, + "grad_norm": 2.520709753036499, + "learning_rate": 1.99978399374628e-05, + "loss": 1.4526, + "step": 1655 + }, + { + "epoch": 0.27035631198726584, + "grad_norm": 2.4490184783935547, + "learning_rate": 1.9997833337055552e-05, + "loss": 1.0673, + "step": 1656 + }, + { + "epoch": 0.2705195706297702, + "grad_norm": 2.217167377471924, + "learning_rate": 1.9997826726580483e-05, + "loss": 0.9699, + "step": 1657 + }, + { + "epoch": 0.2706828292722746, + "grad_norm": 2.0539422035217285, + "learning_rate": 1.999782010603761e-05, + "loss": 1.0228, + "step": 1658 + }, + { + "epoch": 0.270846087914779, + "grad_norm": 2.2571682929992676, + "learning_rate": 1.999781347542693e-05, + "loss": 1.0471, + "step": 1659 + }, + { + "epoch": 0.27100934655728337, + "grad_norm": 2.2672970294952393, + "learning_rate": 1.9997806834748455e-05, + "loss": 1.1108, + "step": 1660 + }, + { + "epoch": 0.27117260519978775, + "grad_norm": 1.935877799987793, + "learning_rate": 1.9997800184002194e-05, + "loss": 0.9211, + "step": 1661 + }, + { + "epoch": 0.27133586384229214, + "grad_norm": 1.9514377117156982, + "learning_rate": 1.999779352318815e-05, + "loss": 0.9121, + "step": 1662 + }, + { + "epoch": 0.2714991224847965, + "grad_norm": 1.8865456581115723, + "learning_rate": 1.999778685230633e-05, + "loss": 0.798, + "step": 1663 + }, + { + "epoch": 0.2716623811273009, + "grad_norm": 2.085359573364258, + "learning_rate": 1.999778017135674e-05, + "loss": 1.0143, + "step": 1664 + }, + { + "epoch": 0.27182563976980534, + "grad_norm": 1.925869107246399, + "learning_rate": 1.999777348033939e-05, + "loss": 0.8887, + "step": 1665 + }, + { + "epoch": 0.2719888984123097, + "grad_norm": 2.187286853790283, + "learning_rate": 1.9997766779254283e-05, + "loss": 0.8771, + "step": 1666 + }, + { + "epoch": 0.2721521570548141, + "grad_norm": 2.073378801345825, + "learning_rate": 1.999776006810143e-05, + "loss": 0.8269, + "step": 1667 + }, + { + "epoch": 0.2723154156973185, + "grad_norm": 2.08562970161438, + "learning_rate": 1.9997753346880834e-05, + "loss": 0.887, + "step": 1668 + }, + { + "epoch": 0.27247867433982287, + "grad_norm": 2.305457830429077, + "learning_rate": 1.9997746615592503e-05, + "loss": 0.8997, + "step": 1669 + }, + { + "epoch": 0.27264193298232725, + "grad_norm": 2.546196699142456, + "learning_rate": 1.9997739874236444e-05, + "loss": 1.1771, + "step": 1670 + }, + { + "epoch": 0.27280519162483163, + "grad_norm": 1.9601502418518066, + "learning_rate": 1.9997733122812663e-05, + "loss": 0.8145, + "step": 1671 + }, + { + "epoch": 0.272968450267336, + "grad_norm": 2.5266425609588623, + "learning_rate": 1.999772636132117e-05, + "loss": 0.9758, + "step": 1672 + }, + { + "epoch": 0.2731317089098404, + "grad_norm": 2.358248233795166, + "learning_rate": 1.9997719589761965e-05, + "loss": 0.8518, + "step": 1673 + }, + { + "epoch": 0.2732949675523448, + "grad_norm": 2.7241437435150146, + "learning_rate": 1.999771280813506e-05, + "loss": 1.0817, + "step": 1674 + }, + { + "epoch": 0.27345822619484916, + "grad_norm": 2.140350103378296, + "learning_rate": 1.9997706016440462e-05, + "loss": 0.773, + "step": 1675 + }, + { + "epoch": 0.2736214848373536, + "grad_norm": 2.663552761077881, + "learning_rate": 1.9997699214678177e-05, + "loss": 0.9427, + "step": 1676 + }, + { + "epoch": 0.273784743479858, + "grad_norm": 1.9863176345825195, + "learning_rate": 1.9997692402848214e-05, + "loss": 0.6763, + "step": 1677 + }, + { + "epoch": 0.27394800212236237, + "grad_norm": 2.4186410903930664, + "learning_rate": 1.999768558095057e-05, + "loss": 0.9637, + "step": 1678 + }, + { + "epoch": 0.27411126076486675, + "grad_norm": 3.0716097354888916, + "learning_rate": 1.9997678748985265e-05, + "loss": 0.9685, + "step": 1679 + }, + { + "epoch": 0.27427451940737113, + "grad_norm": 2.457216501235962, + "learning_rate": 1.99976719069523e-05, + "loss": 1.0601, + "step": 1680 + }, + { + "epoch": 0.2744377780498755, + "grad_norm": 2.243696928024292, + "learning_rate": 1.999766505485168e-05, + "loss": 0.8549, + "step": 1681 + }, + { + "epoch": 0.2746010366923799, + "grad_norm": 2.188337802886963, + "learning_rate": 1.9997658192683412e-05, + "loss": 0.8794, + "step": 1682 + }, + { + "epoch": 0.2747642953348843, + "grad_norm": 2.316406726837158, + "learning_rate": 1.999765132044751e-05, + "loss": 1.0367, + "step": 1683 + }, + { + "epoch": 0.27492755397738866, + "grad_norm": 2.30904221534729, + "learning_rate": 1.9997644438143974e-05, + "loss": 0.8854, + "step": 1684 + }, + { + "epoch": 0.27509081261989304, + "grad_norm": 2.2979986667633057, + "learning_rate": 1.9997637545772812e-05, + "loss": 0.9133, + "step": 1685 + }, + { + "epoch": 0.2752540712623974, + "grad_norm": 2.2740354537963867, + "learning_rate": 1.999763064333403e-05, + "loss": 0.9431, + "step": 1686 + }, + { + "epoch": 0.27541732990490186, + "grad_norm": 2.338010549545288, + "learning_rate": 1.999762373082764e-05, + "loss": 1.025, + "step": 1687 + }, + { + "epoch": 0.27558058854740625, + "grad_norm": 2.351081371307373, + "learning_rate": 1.9997616808253645e-05, + "loss": 0.8859, + "step": 1688 + }, + { + "epoch": 0.27574384718991063, + "grad_norm": 2.4026598930358887, + "learning_rate": 1.9997609875612053e-05, + "loss": 1.0489, + "step": 1689 + }, + { + "epoch": 0.275907105832415, + "grad_norm": 2.311793327331543, + "learning_rate": 1.9997602932902866e-05, + "loss": 0.9617, + "step": 1690 + }, + { + "epoch": 0.2760703644749194, + "grad_norm": 2.0073466300964355, + "learning_rate": 1.99975959801261e-05, + "loss": 0.8674, + "step": 1691 + }, + { + "epoch": 0.2762336231174238, + "grad_norm": 2.258066415786743, + "learning_rate": 1.9997589017281755e-05, + "loss": 0.8881, + "step": 1692 + }, + { + "epoch": 0.27639688175992816, + "grad_norm": 2.2021420001983643, + "learning_rate": 1.9997582044369843e-05, + "loss": 0.9043, + "step": 1693 + }, + { + "epoch": 0.27656014040243254, + "grad_norm": 2.2010984420776367, + "learning_rate": 1.9997575061390368e-05, + "loss": 0.9344, + "step": 1694 + }, + { + "epoch": 0.2767233990449369, + "grad_norm": 2.5051543712615967, + "learning_rate": 1.9997568068343333e-05, + "loss": 1.1483, + "step": 1695 + }, + { + "epoch": 0.2768866576874413, + "grad_norm": 2.1381585597991943, + "learning_rate": 1.9997561065228753e-05, + "loss": 0.987, + "step": 1696 + }, + { + "epoch": 0.2770499163299457, + "grad_norm": 1.9149789810180664, + "learning_rate": 1.999755405204663e-05, + "loss": 0.7444, + "step": 1697 + }, + { + "epoch": 0.2772131749724501, + "grad_norm": 2.3960580825805664, + "learning_rate": 1.999754702879698e-05, + "loss": 1.1122, + "step": 1698 + }, + { + "epoch": 0.2773764336149545, + "grad_norm": 2.421442985534668, + "learning_rate": 1.9997539995479794e-05, + "loss": 1.1685, + "step": 1699 + }, + { + "epoch": 0.2775396922574589, + "grad_norm": 2.129492998123169, + "learning_rate": 1.9997532952095093e-05, + "loss": 0.9949, + "step": 1700 + }, + { + "epoch": 0.2777029508999633, + "grad_norm": 2.243894338607788, + "learning_rate": 1.9997525898642876e-05, + "loss": 0.9654, + "step": 1701 + }, + { + "epoch": 0.27786620954246766, + "grad_norm": 2.1951138973236084, + "learning_rate": 1.9997518835123155e-05, + "loss": 0.9112, + "step": 1702 + }, + { + "epoch": 0.27802946818497204, + "grad_norm": 2.2304961681365967, + "learning_rate": 1.9997511761535935e-05, + "loss": 1.0407, + "step": 1703 + }, + { + "epoch": 0.2781927268274764, + "grad_norm": 2.2933595180511475, + "learning_rate": 1.9997504677881224e-05, + "loss": 0.9932, + "step": 1704 + }, + { + "epoch": 0.2783559854699808, + "grad_norm": 2.379369020462036, + "learning_rate": 1.9997497584159028e-05, + "loss": 0.9062, + "step": 1705 + }, + { + "epoch": 0.2785192441124852, + "grad_norm": 2.7372329235076904, + "learning_rate": 1.999749048036935e-05, + "loss": 1.1248, + "step": 1706 + }, + { + "epoch": 0.27868250275498957, + "grad_norm": 2.449551820755005, + "learning_rate": 1.9997483366512206e-05, + "loss": 1.1531, + "step": 1707 + }, + { + "epoch": 0.278845761397494, + "grad_norm": 2.3896937370300293, + "learning_rate": 1.99974762425876e-05, + "loss": 1.0661, + "step": 1708 + }, + { + "epoch": 0.2790090200399984, + "grad_norm": 2.32053279876709, + "learning_rate": 1.9997469108595538e-05, + "loss": 0.9925, + "step": 1709 + }, + { + "epoch": 0.2791722786825028, + "grad_norm": 2.5755343437194824, + "learning_rate": 1.9997461964536024e-05, + "loss": 0.8253, + "step": 1710 + }, + { + "epoch": 0.27933553732500715, + "grad_norm": 2.212538003921509, + "learning_rate": 1.9997454810409073e-05, + "loss": 0.9059, + "step": 1711 + }, + { + "epoch": 0.27949879596751154, + "grad_norm": 2.488921880722046, + "learning_rate": 1.9997447646214684e-05, + "loss": 0.9493, + "step": 1712 + }, + { + "epoch": 0.2796620546100159, + "grad_norm": 2.3235771656036377, + "learning_rate": 1.9997440471952866e-05, + "loss": 1.0083, + "step": 1713 + }, + { + "epoch": 0.2798253132525203, + "grad_norm": 2.0563318729400635, + "learning_rate": 1.9997433287623633e-05, + "loss": 0.8402, + "step": 1714 + }, + { + "epoch": 0.2799885718950247, + "grad_norm": 2.5520212650299072, + "learning_rate": 1.9997426093226984e-05, + "loss": 0.9519, + "step": 1715 + }, + { + "epoch": 0.28015183053752907, + "grad_norm": 2.3819894790649414, + "learning_rate": 1.9997418888762932e-05, + "loss": 0.8547, + "step": 1716 + }, + { + "epoch": 0.28031508918003345, + "grad_norm": 2.2384724617004395, + "learning_rate": 1.999741167423148e-05, + "loss": 1.064, + "step": 1717 + }, + { + "epoch": 0.28047834782253783, + "grad_norm": 2.20548939704895, + "learning_rate": 1.9997404449632638e-05, + "loss": 1.083, + "step": 1718 + }, + { + "epoch": 0.28064160646504227, + "grad_norm": 2.0349936485290527, + "learning_rate": 1.9997397214966413e-05, + "loss": 0.9854, + "step": 1719 + }, + { + "epoch": 0.28080486510754665, + "grad_norm": 2.217069149017334, + "learning_rate": 1.999738997023281e-05, + "loss": 0.9177, + "step": 1720 + }, + { + "epoch": 0.28096812375005104, + "grad_norm": 2.2290215492248535, + "learning_rate": 1.999738271543184e-05, + "loss": 0.91, + "step": 1721 + }, + { + "epoch": 0.2811313823925554, + "grad_norm": 2.0737690925598145, + "learning_rate": 1.9997375450563504e-05, + "loss": 1.0694, + "step": 1722 + }, + { + "epoch": 0.2812946410350598, + "grad_norm": 2.2942206859588623, + "learning_rate": 1.9997368175627818e-05, + "loss": 1.0193, + "step": 1723 + }, + { + "epoch": 0.2814578996775642, + "grad_norm": 2.15690016746521, + "learning_rate": 1.9997360890624783e-05, + "loss": 0.802, + "step": 1724 + }, + { + "epoch": 0.28162115832006857, + "grad_norm": 2.163816213607788, + "learning_rate": 1.999735359555441e-05, + "loss": 0.9, + "step": 1725 + }, + { + "epoch": 0.28178441696257295, + "grad_norm": 2.829559087753296, + "learning_rate": 1.9997346290416703e-05, + "loss": 0.9583, + "step": 1726 + }, + { + "epoch": 0.28194767560507733, + "grad_norm": 2.512258291244507, + "learning_rate": 1.9997338975211668e-05, + "loss": 1.1696, + "step": 1727 + }, + { + "epoch": 0.2821109342475817, + "grad_norm": 2.7595059871673584, + "learning_rate": 1.999733164993932e-05, + "loss": 1.088, + "step": 1728 + }, + { + "epoch": 0.2822741928900861, + "grad_norm": 2.56469464302063, + "learning_rate": 1.999732431459966e-05, + "loss": 0.8273, + "step": 1729 + }, + { + "epoch": 0.28243745153259053, + "grad_norm": 1.90068781375885, + "learning_rate": 1.9997316969192696e-05, + "loss": 0.7508, + "step": 1730 + }, + { + "epoch": 0.2826007101750949, + "grad_norm": 2.1129913330078125, + "learning_rate": 1.999730961371844e-05, + "loss": 0.8875, + "step": 1731 + }, + { + "epoch": 0.2827639688175993, + "grad_norm": 2.1892049312591553, + "learning_rate": 1.9997302248176894e-05, + "loss": 0.9783, + "step": 1732 + }, + { + "epoch": 0.2829272274601037, + "grad_norm": 2.0149967670440674, + "learning_rate": 1.9997294872568066e-05, + "loss": 0.7448, + "step": 1733 + }, + { + "epoch": 0.28309048610260806, + "grad_norm": 2.2894842624664307, + "learning_rate": 1.9997287486891967e-05, + "loss": 0.8915, + "step": 1734 + }, + { + "epoch": 0.28325374474511245, + "grad_norm": 3.0624523162841797, + "learning_rate": 1.99972800911486e-05, + "loss": 1.0856, + "step": 1735 + }, + { + "epoch": 0.28341700338761683, + "grad_norm": 2.7290103435516357, + "learning_rate": 1.9997272685337975e-05, + "loss": 0.9923, + "step": 1736 + }, + { + "epoch": 0.2835802620301212, + "grad_norm": 2.2716572284698486, + "learning_rate": 1.99972652694601e-05, + "loss": 1.1229, + "step": 1737 + }, + { + "epoch": 0.2837435206726256, + "grad_norm": 2.3769171237945557, + "learning_rate": 1.999725784351498e-05, + "loss": 1.1987, + "step": 1738 + }, + { + "epoch": 0.28390677931513, + "grad_norm": 2.3682901859283447, + "learning_rate": 1.9997250407502627e-05, + "loss": 0.923, + "step": 1739 + }, + { + "epoch": 0.28407003795763436, + "grad_norm": 2.070437431335449, + "learning_rate": 1.9997242961423043e-05, + "loss": 0.9941, + "step": 1740 + }, + { + "epoch": 0.2842332966001388, + "grad_norm": 2.248260259628296, + "learning_rate": 1.9997235505276235e-05, + "loss": 1.0237, + "step": 1741 + }, + { + "epoch": 0.2843965552426432, + "grad_norm": 2.353444814682007, + "learning_rate": 1.999722803906222e-05, + "loss": 0.869, + "step": 1742 + }, + { + "epoch": 0.28455981388514756, + "grad_norm": 2.087865114212036, + "learning_rate": 1.9997220562780996e-05, + "loss": 0.9296, + "step": 1743 + }, + { + "epoch": 0.28472307252765194, + "grad_norm": 2.1050267219543457, + "learning_rate": 1.9997213076432575e-05, + "loss": 0.9705, + "step": 1744 + }, + { + "epoch": 0.2848863311701563, + "grad_norm": 2.3999109268188477, + "learning_rate": 1.9997205580016957e-05, + "loss": 0.9756, + "step": 1745 + }, + { + "epoch": 0.2850495898126607, + "grad_norm": 2.196774959564209, + "learning_rate": 1.9997198073534163e-05, + "loss": 0.9513, + "step": 1746 + }, + { + "epoch": 0.2852128484551651, + "grad_norm": 2.0695548057556152, + "learning_rate": 1.999719055698419e-05, + "loss": 0.8229, + "step": 1747 + }, + { + "epoch": 0.2853761070976695, + "grad_norm": 3.023033618927002, + "learning_rate": 1.999718303036705e-05, + "loss": 1.0365, + "step": 1748 + }, + { + "epoch": 0.28553936574017386, + "grad_norm": 2.175701141357422, + "learning_rate": 1.9997175493682745e-05, + "loss": 0.8192, + "step": 1749 + }, + { + "epoch": 0.28570262438267824, + "grad_norm": 2.2661218643188477, + "learning_rate": 1.9997167946931293e-05, + "loss": 0.9124, + "step": 1750 + }, + { + "epoch": 0.2858658830251826, + "grad_norm": 2.457913875579834, + "learning_rate": 1.9997160390112692e-05, + "loss": 0.7647, + "step": 1751 + }, + { + "epoch": 0.28602914166768706, + "grad_norm": 2.244631290435791, + "learning_rate": 1.9997152823226952e-05, + "loss": 0.9874, + "step": 1752 + }, + { + "epoch": 0.28619240031019144, + "grad_norm": 2.134220600128174, + "learning_rate": 1.999714524627409e-05, + "loss": 0.9269, + "step": 1753 + }, + { + "epoch": 0.2863556589526958, + "grad_norm": 2.4289536476135254, + "learning_rate": 1.9997137659254094e-05, + "loss": 1.0393, + "step": 1754 + }, + { + "epoch": 0.2865189175952002, + "grad_norm": 2.1581428050994873, + "learning_rate": 1.9997130062166988e-05, + "loss": 1.0649, + "step": 1755 + }, + { + "epoch": 0.2866821762377046, + "grad_norm": 2.2612667083740234, + "learning_rate": 1.9997122455012776e-05, + "loss": 0.8685, + "step": 1756 + }, + { + "epoch": 0.28684543488020897, + "grad_norm": 2.0228095054626465, + "learning_rate": 1.9997114837791462e-05, + "loss": 0.8604, + "step": 1757 + }, + { + "epoch": 0.28700869352271335, + "grad_norm": 2.2434167861938477, + "learning_rate": 1.999710721050306e-05, + "loss": 0.8253, + "step": 1758 + }, + { + "epoch": 0.28717195216521774, + "grad_norm": 2.064114570617676, + "learning_rate": 1.999709957314757e-05, + "loss": 0.8035, + "step": 1759 + }, + { + "epoch": 0.2873352108077221, + "grad_norm": 2.1393179893493652, + "learning_rate": 1.9997091925725006e-05, + "loss": 0.8987, + "step": 1760 + }, + { + "epoch": 0.2874984694502265, + "grad_norm": 2.1834259033203125, + "learning_rate": 1.999708426823537e-05, + "loss": 0.9278, + "step": 1761 + }, + { + "epoch": 0.2876617280927309, + "grad_norm": 2.135474920272827, + "learning_rate": 1.9997076600678676e-05, + "loss": 1.1741, + "step": 1762 + }, + { + "epoch": 0.2878249867352353, + "grad_norm": 2.4918086528778076, + "learning_rate": 1.9997068923054925e-05, + "loss": 1.0133, + "step": 1763 + }, + { + "epoch": 0.2879882453777397, + "grad_norm": 2.0163965225219727, + "learning_rate": 1.999706123536413e-05, + "loss": 0.8634, + "step": 1764 + }, + { + "epoch": 0.2881515040202441, + "grad_norm": 2.2787725925445557, + "learning_rate": 1.9997053537606296e-05, + "loss": 0.8585, + "step": 1765 + }, + { + "epoch": 0.28831476266274847, + "grad_norm": 2.2824740409851074, + "learning_rate": 1.9997045829781432e-05, + "loss": 0.8824, + "step": 1766 + }, + { + "epoch": 0.28847802130525285, + "grad_norm": 2.351191997528076, + "learning_rate": 1.9997038111889545e-05, + "loss": 1.0162, + "step": 1767 + }, + { + "epoch": 0.28864127994775723, + "grad_norm": 2.037731885910034, + "learning_rate": 1.9997030383930647e-05, + "loss": 0.8748, + "step": 1768 + }, + { + "epoch": 0.2888045385902616, + "grad_norm": 1.997275948524475, + "learning_rate": 1.999702264590474e-05, + "loss": 0.8632, + "step": 1769 + }, + { + "epoch": 0.288967797232766, + "grad_norm": 2.363363265991211, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.9279, + "step": 1770 + }, + { + "epoch": 0.2891310558752704, + "grad_norm": 1.9954055547714233, + "learning_rate": 1.9997007139651936e-05, + "loss": 0.9655, + "step": 1771 + }, + { + "epoch": 0.28929431451777476, + "grad_norm": 2.1882033348083496, + "learning_rate": 1.999699937142505e-05, + "loss": 1.1249, + "step": 1772 + }, + { + "epoch": 0.28945757316027915, + "grad_norm": 2.3289670944213867, + "learning_rate": 1.9996991593131197e-05, + "loss": 0.9733, + "step": 1773 + }, + { + "epoch": 0.2896208318027836, + "grad_norm": 2.334594964981079, + "learning_rate": 1.9996983804770372e-05, + "loss": 1.0032, + "step": 1774 + }, + { + "epoch": 0.28978409044528797, + "grad_norm": 2.3415305614471436, + "learning_rate": 1.999697600634259e-05, + "loss": 0.9444, + "step": 1775 + }, + { + "epoch": 0.28994734908779235, + "grad_norm": 2.154383897781372, + "learning_rate": 1.999696819784785e-05, + "loss": 1.099, + "step": 1776 + }, + { + "epoch": 0.29011060773029673, + "grad_norm": 2.133610725402832, + "learning_rate": 1.9996960379286164e-05, + "loss": 1.0719, + "step": 1777 + }, + { + "epoch": 0.2902738663728011, + "grad_norm": 2.2411067485809326, + "learning_rate": 1.999695255065755e-05, + "loss": 1.0788, + "step": 1778 + }, + { + "epoch": 0.2904371250153055, + "grad_norm": 1.896086573600769, + "learning_rate": 1.9996944711962002e-05, + "loss": 0.8539, + "step": 1779 + }, + { + "epoch": 0.2906003836578099, + "grad_norm": 2.087146282196045, + "learning_rate": 1.9996936863199537e-05, + "loss": 0.9858, + "step": 1780 + }, + { + "epoch": 0.29076364230031426, + "grad_norm": 2.506073236465454, + "learning_rate": 1.9996929004370152e-05, + "loss": 1.1566, + "step": 1781 + }, + { + "epoch": 0.29092690094281864, + "grad_norm": 2.2587239742279053, + "learning_rate": 1.999692113547387e-05, + "loss": 0.9451, + "step": 1782 + }, + { + "epoch": 0.291090159585323, + "grad_norm": 2.5805678367614746, + "learning_rate": 1.9996913256510688e-05, + "loss": 1.0205, + "step": 1783 + }, + { + "epoch": 0.2912534182278274, + "grad_norm": 2.261308431625366, + "learning_rate": 1.9996905367480618e-05, + "loss": 1.1322, + "step": 1784 + }, + { + "epoch": 0.29141667687033185, + "grad_norm": 1.8181143999099731, + "learning_rate": 1.9996897468383663e-05, + "loss": 0.724, + "step": 1785 + }, + { + "epoch": 0.29157993551283623, + "grad_norm": 1.9690752029418945, + "learning_rate": 1.9996889559219837e-05, + "loss": 0.9462, + "step": 1786 + }, + { + "epoch": 0.2917431941553406, + "grad_norm": 2.173159599304199, + "learning_rate": 1.999688163998915e-05, + "loss": 0.909, + "step": 1787 + }, + { + "epoch": 0.291906452797845, + "grad_norm": 2.365762233734131, + "learning_rate": 1.99968737106916e-05, + "loss": 1.0998, + "step": 1788 + }, + { + "epoch": 0.2920697114403494, + "grad_norm": 2.2918171882629395, + "learning_rate": 1.9996865771327205e-05, + "loss": 0.8516, + "step": 1789 + }, + { + "epoch": 0.29223297008285376, + "grad_norm": 2.4013500213623047, + "learning_rate": 1.9996857821895968e-05, + "loss": 1.1067, + "step": 1790 + }, + { + "epoch": 0.29239622872535814, + "grad_norm": 1.999075174331665, + "learning_rate": 1.9996849862397897e-05, + "loss": 0.9636, + "step": 1791 + }, + { + "epoch": 0.2925594873678625, + "grad_norm": 2.4877376556396484, + "learning_rate": 1.9996841892833e-05, + "loss": 1.0049, + "step": 1792 + }, + { + "epoch": 0.2927227460103669, + "grad_norm": 2.0414798259735107, + "learning_rate": 1.999683391320129e-05, + "loss": 0.8821, + "step": 1793 + }, + { + "epoch": 0.2928860046528713, + "grad_norm": 2.263793706893921, + "learning_rate": 1.9996825923502766e-05, + "loss": 1.0381, + "step": 1794 + }, + { + "epoch": 0.2930492632953757, + "grad_norm": 2.078835964202881, + "learning_rate": 1.9996817923737443e-05, + "loss": 0.9892, + "step": 1795 + }, + { + "epoch": 0.2932125219378801, + "grad_norm": 2.838857889175415, + "learning_rate": 1.9996809913905327e-05, + "loss": 0.7975, + "step": 1796 + }, + { + "epoch": 0.2933757805803845, + "grad_norm": 2.388136148452759, + "learning_rate": 1.9996801894006427e-05, + "loss": 1.1007, + "step": 1797 + }, + { + "epoch": 0.2935390392228889, + "grad_norm": 2.319694757461548, + "learning_rate": 1.9996793864040748e-05, + "loss": 0.9014, + "step": 1798 + }, + { + "epoch": 0.29370229786539326, + "grad_norm": 2.1210925579071045, + "learning_rate": 1.9996785824008302e-05, + "loss": 1.0515, + "step": 1799 + }, + { + "epoch": 0.29386555650789764, + "grad_norm": 2.234692096710205, + "learning_rate": 1.9996777773909093e-05, + "loss": 0.9736, + "step": 1800 + }, + { + "epoch": 0.294028815150402, + "grad_norm": 2.2412960529327393, + "learning_rate": 1.999676971374313e-05, + "loss": 0.954, + "step": 1801 + }, + { + "epoch": 0.2941920737929064, + "grad_norm": 1.9720630645751953, + "learning_rate": 1.9996761643510427e-05, + "loss": 0.9463, + "step": 1802 + }, + { + "epoch": 0.2943553324354108, + "grad_norm": 2.066502094268799, + "learning_rate": 1.9996753563210987e-05, + "loss": 0.7793, + "step": 1803 + }, + { + "epoch": 0.29451859107791517, + "grad_norm": 2.0353128910064697, + "learning_rate": 1.9996745472844817e-05, + "loss": 0.8101, + "step": 1804 + }, + { + "epoch": 0.29468184972041955, + "grad_norm": 2.017674684524536, + "learning_rate": 1.999673737241193e-05, + "loss": 0.8605, + "step": 1805 + }, + { + "epoch": 0.29484510836292394, + "grad_norm": 2.4582204818725586, + "learning_rate": 1.9996729261912325e-05, + "loss": 0.9876, + "step": 1806 + }, + { + "epoch": 0.2950083670054284, + "grad_norm": 2.1909172534942627, + "learning_rate": 1.9996721141346023e-05, + "loss": 0.7002, + "step": 1807 + }, + { + "epoch": 0.29517162564793276, + "grad_norm": 2.3859176635742188, + "learning_rate": 1.9996713010713022e-05, + "loss": 0.9628, + "step": 1808 + }, + { + "epoch": 0.29533488429043714, + "grad_norm": 2.2689006328582764, + "learning_rate": 1.9996704870013336e-05, + "loss": 1.0471, + "step": 1809 + }, + { + "epoch": 0.2954981429329415, + "grad_norm": 2.393009662628174, + "learning_rate": 1.999669671924697e-05, + "loss": 1.0094, + "step": 1810 + }, + { + "epoch": 0.2956614015754459, + "grad_norm": 2.284860134124756, + "learning_rate": 1.999668855841393e-05, + "loss": 0.8872, + "step": 1811 + }, + { + "epoch": 0.2958246602179503, + "grad_norm": 2.42022705078125, + "learning_rate": 1.999668038751423e-05, + "loss": 0.924, + "step": 1812 + }, + { + "epoch": 0.29598791886045467, + "grad_norm": 2.097926139831543, + "learning_rate": 1.9996672206547874e-05, + "loss": 0.7862, + "step": 1813 + }, + { + "epoch": 0.29615117750295905, + "grad_norm": 2.687265634536743, + "learning_rate": 1.999666401551487e-05, + "loss": 1.2101, + "step": 1814 + }, + { + "epoch": 0.29631443614546343, + "grad_norm": 2.167206048965454, + "learning_rate": 1.9996655814415235e-05, + "loss": 0.868, + "step": 1815 + }, + { + "epoch": 0.2964776947879678, + "grad_norm": 2.363760232925415, + "learning_rate": 1.9996647603248967e-05, + "loss": 1.1274, + "step": 1816 + }, + { + "epoch": 0.2966409534304722, + "grad_norm": 2.518871784210205, + "learning_rate": 1.9996639382016075e-05, + "loss": 1.0346, + "step": 1817 + }, + { + "epoch": 0.29680421207297664, + "grad_norm": 2.5150697231292725, + "learning_rate": 1.9996631150716573e-05, + "loss": 1.1477, + "step": 1818 + }, + { + "epoch": 0.296967470715481, + "grad_norm": 2.2987048625946045, + "learning_rate": 1.9996622909350463e-05, + "loss": 0.8676, + "step": 1819 + }, + { + "epoch": 0.2971307293579854, + "grad_norm": 1.8667775392532349, + "learning_rate": 1.999661465791776e-05, + "loss": 0.7664, + "step": 1820 + }, + { + "epoch": 0.2972939880004898, + "grad_norm": 2.6247787475585938, + "learning_rate": 1.9996606396418464e-05, + "loss": 1.0458, + "step": 1821 + }, + { + "epoch": 0.29745724664299417, + "grad_norm": 2.195392608642578, + "learning_rate": 1.999659812485259e-05, + "loss": 1.1182, + "step": 1822 + }, + { + "epoch": 0.29762050528549855, + "grad_norm": 1.9957014322280884, + "learning_rate": 1.9996589843220148e-05, + "loss": 0.8267, + "step": 1823 + }, + { + "epoch": 0.29778376392800293, + "grad_norm": 1.8753207921981812, + "learning_rate": 1.999658155152114e-05, + "loss": 0.7389, + "step": 1824 + }, + { + "epoch": 0.2979470225705073, + "grad_norm": 2.1934688091278076, + "learning_rate": 1.9996573249755573e-05, + "loss": 0.9375, + "step": 1825 + }, + { + "epoch": 0.2981102812130117, + "grad_norm": 2.035888910293579, + "learning_rate": 1.9996564937923464e-05, + "loss": 0.9029, + "step": 1826 + }, + { + "epoch": 0.2982735398555161, + "grad_norm": 2.503812551498413, + "learning_rate": 1.9996556616024817e-05, + "loss": 1.0332, + "step": 1827 + }, + { + "epoch": 0.29843679849802046, + "grad_norm": 2.4947330951690674, + "learning_rate": 1.999654828405964e-05, + "loss": 1.1278, + "step": 1828 + }, + { + "epoch": 0.2986000571405249, + "grad_norm": 2.050753355026245, + "learning_rate": 1.999653994202794e-05, + "loss": 0.8583, + "step": 1829 + }, + { + "epoch": 0.2987633157830293, + "grad_norm": 2.221581220626831, + "learning_rate": 1.9996531589929725e-05, + "loss": 1.1037, + "step": 1830 + }, + { + "epoch": 0.29892657442553366, + "grad_norm": 2.2006335258483887, + "learning_rate": 1.999652322776501e-05, + "loss": 1.0575, + "step": 1831 + }, + { + "epoch": 0.29908983306803805, + "grad_norm": 2.361781597137451, + "learning_rate": 1.9996514855533796e-05, + "loss": 1.0388, + "step": 1832 + }, + { + "epoch": 0.29925309171054243, + "grad_norm": 2.1568288803100586, + "learning_rate": 1.9996506473236095e-05, + "loss": 0.908, + "step": 1833 + }, + { + "epoch": 0.2994163503530468, + "grad_norm": 2.2944083213806152, + "learning_rate": 1.9996498080871913e-05, + "loss": 0.9799, + "step": 1834 + }, + { + "epoch": 0.2995796089955512, + "grad_norm": 2.647690534591675, + "learning_rate": 1.999648967844126e-05, + "loss": 0.9652, + "step": 1835 + }, + { + "epoch": 0.2997428676380556, + "grad_norm": 2.3497045040130615, + "learning_rate": 1.9996481265944146e-05, + "loss": 0.967, + "step": 1836 + }, + { + "epoch": 0.29990612628055996, + "grad_norm": 2.1509971618652344, + "learning_rate": 1.999647284338058e-05, + "loss": 0.8766, + "step": 1837 + }, + { + "epoch": 0.30006938492306434, + "grad_norm": 2.196582555770874, + "learning_rate": 1.9996464410750565e-05, + "loss": 0.9295, + "step": 1838 + }, + { + "epoch": 0.3002326435655688, + "grad_norm": 2.223280429840088, + "learning_rate": 1.9996455968054115e-05, + "loss": 0.9258, + "step": 1839 + }, + { + "epoch": 0.30039590220807316, + "grad_norm": 2.3706820011138916, + "learning_rate": 1.9996447515291236e-05, + "loss": 0.8871, + "step": 1840 + }, + { + "epoch": 0.30055916085057754, + "grad_norm": 2.203500270843506, + "learning_rate": 1.9996439052461935e-05, + "loss": 0.88, + "step": 1841 + }, + { + "epoch": 0.3007224194930819, + "grad_norm": 2.3259363174438477, + "learning_rate": 1.9996430579566227e-05, + "loss": 1.166, + "step": 1842 + }, + { + "epoch": 0.3008856781355863, + "grad_norm": 2.0105044841766357, + "learning_rate": 1.9996422096604112e-05, + "loss": 0.9735, + "step": 1843 + }, + { + "epoch": 0.3010489367780907, + "grad_norm": 2.0992789268493652, + "learning_rate": 1.9996413603575603e-05, + "loss": 0.9938, + "step": 1844 + }, + { + "epoch": 0.3012121954205951, + "grad_norm": 2.4526548385620117, + "learning_rate": 1.999640510048071e-05, + "loss": 0.9913, + "step": 1845 + }, + { + "epoch": 0.30137545406309946, + "grad_norm": 2.1673238277435303, + "learning_rate": 1.9996396587319438e-05, + "loss": 0.9126, + "step": 1846 + }, + { + "epoch": 0.30153871270560384, + "grad_norm": 2.379727840423584, + "learning_rate": 1.99963880640918e-05, + "loss": 1.467, + "step": 1847 + }, + { + "epoch": 0.3017019713481082, + "grad_norm": 2.4432058334350586, + "learning_rate": 1.99963795307978e-05, + "loss": 1.0507, + "step": 1848 + }, + { + "epoch": 0.3018652299906126, + "grad_norm": 2.0599846839904785, + "learning_rate": 1.999637098743745e-05, + "loss": 0.8815, + "step": 1849 + }, + { + "epoch": 0.30202848863311704, + "grad_norm": 2.3709397315979004, + "learning_rate": 1.9996362434010754e-05, + "loss": 1.0773, + "step": 1850 + }, + { + "epoch": 0.3021917472756214, + "grad_norm": 2.1630303859710693, + "learning_rate": 1.9996353870517727e-05, + "loss": 1.0406, + "step": 1851 + }, + { + "epoch": 0.3023550059181258, + "grad_norm": 5.419304847717285, + "learning_rate": 1.999634529695837e-05, + "loss": 0.7661, + "step": 1852 + }, + { + "epoch": 0.3025182645606302, + "grad_norm": 2.2432124614715576, + "learning_rate": 1.99963367133327e-05, + "loss": 0.9495, + "step": 1853 + }, + { + "epoch": 0.3026815232031346, + "grad_norm": 2.6006953716278076, + "learning_rate": 1.999632811964072e-05, + "loss": 0.8108, + "step": 1854 + }, + { + "epoch": 0.30284478184563896, + "grad_norm": 2.358459234237671, + "learning_rate": 1.9996319515882437e-05, + "loss": 0.9541, + "step": 1855 + }, + { + "epoch": 0.30300804048814334, + "grad_norm": 2.0351223945617676, + "learning_rate": 1.999631090205787e-05, + "loss": 0.816, + "step": 1856 + }, + { + "epoch": 0.3031712991306477, + "grad_norm": 1.9229481220245361, + "learning_rate": 1.9996302278167015e-05, + "loss": 0.928, + "step": 1857 + }, + { + "epoch": 0.3033345577731521, + "grad_norm": 2.1130125522613525, + "learning_rate": 1.9996293644209886e-05, + "loss": 0.9349, + "step": 1858 + }, + { + "epoch": 0.3034978164156565, + "grad_norm": 2.1527414321899414, + "learning_rate": 1.9996285000186496e-05, + "loss": 0.9109, + "step": 1859 + }, + { + "epoch": 0.30366107505816087, + "grad_norm": 2.6694931983947754, + "learning_rate": 1.9996276346096847e-05, + "loss": 0.9562, + "step": 1860 + }, + { + "epoch": 0.3038243337006653, + "grad_norm": 1.9501043558120728, + "learning_rate": 1.9996267681940954e-05, + "loss": 0.8062, + "step": 1861 + }, + { + "epoch": 0.3039875923431697, + "grad_norm": 2.1534764766693115, + "learning_rate": 1.9996259007718816e-05, + "loss": 0.8883, + "step": 1862 + }, + { + "epoch": 0.30415085098567407, + "grad_norm": 2.2884817123413086, + "learning_rate": 1.9996250323430454e-05, + "loss": 0.7966, + "step": 1863 + }, + { + "epoch": 0.30431410962817845, + "grad_norm": 2.2576916217803955, + "learning_rate": 1.9996241629075865e-05, + "loss": 0.8951, + "step": 1864 + }, + { + "epoch": 0.30447736827068284, + "grad_norm": 2.4795234203338623, + "learning_rate": 1.9996232924655068e-05, + "loss": 1.0171, + "step": 1865 + }, + { + "epoch": 0.3046406269131872, + "grad_norm": 2.3633334636688232, + "learning_rate": 1.9996224210168064e-05, + "loss": 0.9066, + "step": 1866 + }, + { + "epoch": 0.3048038855556916, + "grad_norm": 2.052485942840576, + "learning_rate": 1.9996215485614866e-05, + "loss": 0.8591, + "step": 1867 + }, + { + "epoch": 0.304967144198196, + "grad_norm": 2.3114430904388428, + "learning_rate": 1.999620675099548e-05, + "loss": 0.8355, + "step": 1868 + }, + { + "epoch": 0.30513040284070037, + "grad_norm": 2.459855318069458, + "learning_rate": 1.999619800630992e-05, + "loss": 0.9194, + "step": 1869 + }, + { + "epoch": 0.30529366148320475, + "grad_norm": 2.4072425365448, + "learning_rate": 1.999618925155819e-05, + "loss": 0.8884, + "step": 1870 + }, + { + "epoch": 0.30545692012570913, + "grad_norm": 2.284618854522705, + "learning_rate": 1.99961804867403e-05, + "loss": 1.0507, + "step": 1871 + }, + { + "epoch": 0.30562017876821357, + "grad_norm": 2.274831771850586, + "learning_rate": 1.9996171711856258e-05, + "loss": 0.8397, + "step": 1872 + }, + { + "epoch": 0.30578343741071795, + "grad_norm": 2.380213737487793, + "learning_rate": 1.9996162926906073e-05, + "loss": 1.0664, + "step": 1873 + }, + { + "epoch": 0.30594669605322233, + "grad_norm": 2.435422658920288, + "learning_rate": 1.9996154131889756e-05, + "loss": 0.9538, + "step": 1874 + }, + { + "epoch": 0.3061099546957267, + "grad_norm": 2.2473526000976562, + "learning_rate": 1.9996145326807313e-05, + "loss": 0.8381, + "step": 1875 + }, + { + "epoch": 0.3062732133382311, + "grad_norm": 2.5036516189575195, + "learning_rate": 1.9996136511658758e-05, + "loss": 0.9812, + "step": 1876 + }, + { + "epoch": 0.3064364719807355, + "grad_norm": 2.4131202697753906, + "learning_rate": 1.999612768644409e-05, + "loss": 1.0851, + "step": 1877 + }, + { + "epoch": 0.30659973062323986, + "grad_norm": 2.074706792831421, + "learning_rate": 1.999611885116333e-05, + "loss": 0.7702, + "step": 1878 + }, + { + "epoch": 0.30676298926574425, + "grad_norm": 2.6941747665405273, + "learning_rate": 1.9996110005816478e-05, + "loss": 1.0339, + "step": 1879 + }, + { + "epoch": 0.30692624790824863, + "grad_norm": 2.2137928009033203, + "learning_rate": 1.9996101150403543e-05, + "loss": 0.7859, + "step": 1880 + }, + { + "epoch": 0.307089506550753, + "grad_norm": 2.206000328063965, + "learning_rate": 1.999609228492454e-05, + "loss": 0.8538, + "step": 1881 + }, + { + "epoch": 0.3072527651932574, + "grad_norm": 2.4029436111450195, + "learning_rate": 1.9996083409379477e-05, + "loss": 1.0059, + "step": 1882 + }, + { + "epoch": 0.30741602383576183, + "grad_norm": 2.1993868350982666, + "learning_rate": 1.9996074523768358e-05, + "loss": 0.7822, + "step": 1883 + }, + { + "epoch": 0.3075792824782662, + "grad_norm": 2.259855270385742, + "learning_rate": 1.9996065628091194e-05, + "loss": 1.041, + "step": 1884 + }, + { + "epoch": 0.3077425411207706, + "grad_norm": 2.6312127113342285, + "learning_rate": 1.9996056722348e-05, + "loss": 1.0809, + "step": 1885 + }, + { + "epoch": 0.307905799763275, + "grad_norm": 2.402372121810913, + "learning_rate": 1.9996047806538774e-05, + "loss": 0.7589, + "step": 1886 + }, + { + "epoch": 0.30806905840577936, + "grad_norm": 2.1683380603790283, + "learning_rate": 1.999603888066353e-05, + "loss": 0.8481, + "step": 1887 + }, + { + "epoch": 0.30823231704828374, + "grad_norm": 2.2182719707489014, + "learning_rate": 1.9996029944722283e-05, + "loss": 0.8417, + "step": 1888 + }, + { + "epoch": 0.3083955756907881, + "grad_norm": 2.53362774848938, + "learning_rate": 1.999602099871503e-05, + "loss": 1.0031, + "step": 1889 + }, + { + "epoch": 0.3085588343332925, + "grad_norm": 2.3196370601654053, + "learning_rate": 1.999601204264179e-05, + "loss": 1.0138, + "step": 1890 + }, + { + "epoch": 0.3087220929757969, + "grad_norm": 2.1313841342926025, + "learning_rate": 1.9996003076502567e-05, + "loss": 0.8826, + "step": 1891 + }, + { + "epoch": 0.3088853516183013, + "grad_norm": 2.307976722717285, + "learning_rate": 1.9995994100297374e-05, + "loss": 1.1606, + "step": 1892 + }, + { + "epoch": 0.30904861026080566, + "grad_norm": 2.456875801086426, + "learning_rate": 1.9995985114026215e-05, + "loss": 0.9748, + "step": 1893 + }, + { + "epoch": 0.3092118689033101, + "grad_norm": 2.2529234886169434, + "learning_rate": 1.9995976117689103e-05, + "loss": 0.9549, + "step": 1894 + }, + { + "epoch": 0.3093751275458145, + "grad_norm": 2.2278850078582764, + "learning_rate": 1.9995967111286044e-05, + "loss": 1.0525, + "step": 1895 + }, + { + "epoch": 0.30953838618831886, + "grad_norm": 2.0293984413146973, + "learning_rate": 1.9995958094817053e-05, + "loss": 0.9524, + "step": 1896 + }, + { + "epoch": 0.30970164483082324, + "grad_norm": 2.322810173034668, + "learning_rate": 1.9995949068282128e-05, + "loss": 0.7795, + "step": 1897 + }, + { + "epoch": 0.3098649034733276, + "grad_norm": 2.898709774017334, + "learning_rate": 1.999594003168129e-05, + "loss": 0.9007, + "step": 1898 + }, + { + "epoch": 0.310028162115832, + "grad_norm": 2.153977394104004, + "learning_rate": 1.999593098501454e-05, + "loss": 0.8632, + "step": 1899 + }, + { + "epoch": 0.3101914207583364, + "grad_norm": 1.8902404308319092, + "learning_rate": 1.9995921928281893e-05, + "loss": 0.7912, + "step": 1900 + }, + { + "epoch": 0.31035467940084077, + "grad_norm": 2.064795970916748, + "learning_rate": 1.9995912861483355e-05, + "loss": 0.8242, + "step": 1901 + }, + { + "epoch": 0.31051793804334515, + "grad_norm": 2.057786703109741, + "learning_rate": 1.9995903784618936e-05, + "loss": 0.9381, + "step": 1902 + }, + { + "epoch": 0.31068119668584954, + "grad_norm": 2.115062713623047, + "learning_rate": 1.999589469768864e-05, + "loss": 0.849, + "step": 1903 + }, + { + "epoch": 0.3108444553283539, + "grad_norm": 2.1812825202941895, + "learning_rate": 1.9995885600692485e-05, + "loss": 0.8691, + "step": 1904 + }, + { + "epoch": 0.31100771397085836, + "grad_norm": 2.2959213256835938, + "learning_rate": 1.9995876493630473e-05, + "loss": 1.0265, + "step": 1905 + }, + { + "epoch": 0.31117097261336274, + "grad_norm": 2.5628418922424316, + "learning_rate": 1.9995867376502624e-05, + "loss": 0.8736, + "step": 1906 + }, + { + "epoch": 0.3113342312558671, + "grad_norm": 1.9588905572891235, + "learning_rate": 1.999585824930893e-05, + "loss": 0.8359, + "step": 1907 + }, + { + "epoch": 0.3114974898983715, + "grad_norm": 2.0684986114501953, + "learning_rate": 1.999584911204941e-05, + "loss": 0.7631, + "step": 1908 + }, + { + "epoch": 0.3116607485408759, + "grad_norm": 2.327775001525879, + "learning_rate": 1.9995839964724078e-05, + "loss": 0.9819, + "step": 1909 + }, + { + "epoch": 0.31182400718338027, + "grad_norm": 2.471426486968994, + "learning_rate": 1.9995830807332934e-05, + "loss": 1.0497, + "step": 1910 + }, + { + "epoch": 0.31198726582588465, + "grad_norm": 2.795081615447998, + "learning_rate": 1.9995821639875993e-05, + "loss": 0.9949, + "step": 1911 + }, + { + "epoch": 0.31215052446838903, + "grad_norm": 2.845214366912842, + "learning_rate": 1.999581246235326e-05, + "loss": 0.9592, + "step": 1912 + }, + { + "epoch": 0.3123137831108934, + "grad_norm": 2.3473212718963623, + "learning_rate": 1.999580327476475e-05, + "loss": 0.9192, + "step": 1913 + }, + { + "epoch": 0.3124770417533978, + "grad_norm": 2.1011221408843994, + "learning_rate": 1.9995794077110464e-05, + "loss": 0.9326, + "step": 1914 + }, + { + "epoch": 0.3126403003959022, + "grad_norm": 2.3332345485687256, + "learning_rate": 1.9995784869390418e-05, + "loss": 0.9522, + "step": 1915 + }, + { + "epoch": 0.3128035590384066, + "grad_norm": 2.337526798248291, + "learning_rate": 1.9995775651604622e-05, + "loss": 0.9752, + "step": 1916 + }, + { + "epoch": 0.312966817680911, + "grad_norm": 2.1353020668029785, + "learning_rate": 1.9995766423753077e-05, + "loss": 0.9199, + "step": 1917 + }, + { + "epoch": 0.3131300763234154, + "grad_norm": 2.2876875400543213, + "learning_rate": 1.99957571858358e-05, + "loss": 1.0367, + "step": 1918 + }, + { + "epoch": 0.31329333496591977, + "grad_norm": 2.1771531105041504, + "learning_rate": 1.9995747937852803e-05, + "loss": 0.7713, + "step": 1919 + }, + { + "epoch": 0.31345659360842415, + "grad_norm": 2.130089044570923, + "learning_rate": 1.9995738679804086e-05, + "loss": 0.908, + "step": 1920 + }, + { + "epoch": 0.31361985225092853, + "grad_norm": 2.383592367172241, + "learning_rate": 1.9995729411689663e-05, + "loss": 0.6984, + "step": 1921 + }, + { + "epoch": 0.3137831108934329, + "grad_norm": 2.2531981468200684, + "learning_rate": 1.9995720133509544e-05, + "loss": 0.8238, + "step": 1922 + }, + { + "epoch": 0.3139463695359373, + "grad_norm": 2.9069082736968994, + "learning_rate": 1.9995710845263736e-05, + "loss": 0.939, + "step": 1923 + }, + { + "epoch": 0.3141096281784417, + "grad_norm": 2.290740966796875, + "learning_rate": 1.9995701546952252e-05, + "loss": 1.0394, + "step": 1924 + }, + { + "epoch": 0.31427288682094606, + "grad_norm": 1.8700591325759888, + "learning_rate": 1.9995692238575097e-05, + "loss": 0.7602, + "step": 1925 + }, + { + "epoch": 0.31443614546345044, + "grad_norm": 2.2481889724731445, + "learning_rate": 1.9995682920132283e-05, + "loss": 0.9549, + "step": 1926 + }, + { + "epoch": 0.3145994041059549, + "grad_norm": 2.194477081298828, + "learning_rate": 1.999567359162382e-05, + "loss": 1.1404, + "step": 1927 + }, + { + "epoch": 0.31476266274845927, + "grad_norm": 2.3044304847717285, + "learning_rate": 1.9995664253049715e-05, + "loss": 0.9811, + "step": 1928 + }, + { + "epoch": 0.31492592139096365, + "grad_norm": 2.171403169631958, + "learning_rate": 1.9995654904409983e-05, + "loss": 0.8437, + "step": 1929 + }, + { + "epoch": 0.31508918003346803, + "grad_norm": 1.9033031463623047, + "learning_rate": 1.9995645545704624e-05, + "loss": 0.7277, + "step": 1930 + }, + { + "epoch": 0.3152524386759724, + "grad_norm": 2.0085577964782715, + "learning_rate": 1.9995636176933653e-05, + "loss": 0.8095, + "step": 1931 + }, + { + "epoch": 0.3154156973184768, + "grad_norm": 2.5117721557617188, + "learning_rate": 1.9995626798097082e-05, + "loss": 1.0718, + "step": 1932 + }, + { + "epoch": 0.3155789559609812, + "grad_norm": 2.1465444564819336, + "learning_rate": 1.9995617409194917e-05, + "loss": 0.9325, + "step": 1933 + }, + { + "epoch": 0.31574221460348556, + "grad_norm": 2.561150550842285, + "learning_rate": 1.9995608010227165e-05, + "loss": 1.0459, + "step": 1934 + }, + { + "epoch": 0.31590547324598994, + "grad_norm": 2.4200587272644043, + "learning_rate": 1.9995598601193842e-05, + "loss": 1.1183, + "step": 1935 + }, + { + "epoch": 0.3160687318884943, + "grad_norm": 2.4876551628112793, + "learning_rate": 1.9995589182094952e-05, + "loss": 1.0658, + "step": 1936 + }, + { + "epoch": 0.3162319905309987, + "grad_norm": 1.9665910005569458, + "learning_rate": 1.999557975293051e-05, + "loss": 0.7901, + "step": 1937 + }, + { + "epoch": 0.31639524917350315, + "grad_norm": 2.49641489982605, + "learning_rate": 1.9995570313700516e-05, + "loss": 0.9481, + "step": 1938 + }, + { + "epoch": 0.31655850781600753, + "grad_norm": 3.1538190841674805, + "learning_rate": 1.9995560864404986e-05, + "loss": 0.8603, + "step": 1939 + }, + { + "epoch": 0.3167217664585119, + "grad_norm": 2.141737937927246, + "learning_rate": 1.999555140504393e-05, + "loss": 0.9025, + "step": 1940 + }, + { + "epoch": 0.3168850251010163, + "grad_norm": 2.2524075508117676, + "learning_rate": 1.999554193561736e-05, + "loss": 0.8228, + "step": 1941 + }, + { + "epoch": 0.3170482837435207, + "grad_norm": 2.5607335567474365, + "learning_rate": 1.9995532456125276e-05, + "loss": 0.9923, + "step": 1942 + }, + { + "epoch": 0.31721154238602506, + "grad_norm": 2.2738871574401855, + "learning_rate": 1.99955229665677e-05, + "loss": 1.104, + "step": 1943 + }, + { + "epoch": 0.31737480102852944, + "grad_norm": 2.4409875869750977, + "learning_rate": 1.999551346694463e-05, + "loss": 1.0368, + "step": 1944 + }, + { + "epoch": 0.3175380596710338, + "grad_norm": 2.068826675415039, + "learning_rate": 1.999550395725608e-05, + "loss": 0.936, + "step": 1945 + }, + { + "epoch": 0.3177013183135382, + "grad_norm": 2.0867598056793213, + "learning_rate": 1.9995494437502064e-05, + "loss": 0.8273, + "step": 1946 + }, + { + "epoch": 0.3178645769560426, + "grad_norm": 1.9942004680633545, + "learning_rate": 1.9995484907682585e-05, + "loss": 0.9291, + "step": 1947 + }, + { + "epoch": 0.31802783559854697, + "grad_norm": 1.9516021013259888, + "learning_rate": 1.9995475367797657e-05, + "loss": 0.7602, + "step": 1948 + }, + { + "epoch": 0.3181910942410514, + "grad_norm": 2.075179100036621, + "learning_rate": 1.9995465817847285e-05, + "loss": 0.9861, + "step": 1949 + }, + { + "epoch": 0.3183543528835558, + "grad_norm": 2.7094974517822266, + "learning_rate": 1.9995456257831484e-05, + "loss": 0.9338, + "step": 1950 + }, + { + "epoch": 0.3185176115260602, + "grad_norm": 2.3906681537628174, + "learning_rate": 1.9995446687750262e-05, + "loss": 0.9018, + "step": 1951 + }, + { + "epoch": 0.31868087016856456, + "grad_norm": 2.440882921218872, + "learning_rate": 1.999543710760363e-05, + "loss": 0.821, + "step": 1952 + }, + { + "epoch": 0.31884412881106894, + "grad_norm": 2.168727397918701, + "learning_rate": 1.999542751739159e-05, + "loss": 0.8328, + "step": 1953 + }, + { + "epoch": 0.3190073874535733, + "grad_norm": 2.1331562995910645, + "learning_rate": 1.9995417917114158e-05, + "loss": 0.9479, + "step": 1954 + }, + { + "epoch": 0.3191706460960777, + "grad_norm": 2.4438023567199707, + "learning_rate": 1.9995408306771346e-05, + "loss": 1.1519, + "step": 1955 + }, + { + "epoch": 0.3193339047385821, + "grad_norm": 2.0842325687408447, + "learning_rate": 1.999539868636316e-05, + "loss": 0.9023, + "step": 1956 + }, + { + "epoch": 0.31949716338108647, + "grad_norm": 2.1083788871765137, + "learning_rate": 1.9995389055889607e-05, + "loss": 0.8745, + "step": 1957 + }, + { + "epoch": 0.31966042202359085, + "grad_norm": 2.7526612281799316, + "learning_rate": 1.99953794153507e-05, + "loss": 0.9408, + "step": 1958 + }, + { + "epoch": 0.3198236806660953, + "grad_norm": 2.8871612548828125, + "learning_rate": 1.999536976474645e-05, + "loss": 0.7876, + "step": 1959 + }, + { + "epoch": 0.31998693930859967, + "grad_norm": 2.1247785091400146, + "learning_rate": 1.999536010407687e-05, + "loss": 0.8282, + "step": 1960 + }, + { + "epoch": 0.32015019795110405, + "grad_norm": 2.213501453399658, + "learning_rate": 1.999535043334196e-05, + "loss": 0.8825, + "step": 1961 + }, + { + "epoch": 0.32031345659360844, + "grad_norm": 2.230036735534668, + "learning_rate": 1.9995340752541734e-05, + "loss": 0.9411, + "step": 1962 + }, + { + "epoch": 0.3204767152361128, + "grad_norm": 2.5333335399627686, + "learning_rate": 1.9995331061676202e-05, + "loss": 0.9424, + "step": 1963 + }, + { + "epoch": 0.3206399738786172, + "grad_norm": 2.2996723651885986, + "learning_rate": 1.9995321360745378e-05, + "loss": 0.946, + "step": 1964 + }, + { + "epoch": 0.3208032325211216, + "grad_norm": 2.258808135986328, + "learning_rate": 1.9995311649749265e-05, + "loss": 0.9377, + "step": 1965 + }, + { + "epoch": 0.32096649116362597, + "grad_norm": 2.205732583999634, + "learning_rate": 1.9995301928687876e-05, + "loss": 0.8508, + "step": 1966 + }, + { + "epoch": 0.32112974980613035, + "grad_norm": 2.093491315841675, + "learning_rate": 1.9995292197561222e-05, + "loss": 0.8182, + "step": 1967 + }, + { + "epoch": 0.32129300844863473, + "grad_norm": 2.314694881439209, + "learning_rate": 1.9995282456369313e-05, + "loss": 0.9964, + "step": 1968 + }, + { + "epoch": 0.3214562670911391, + "grad_norm": 2.4808578491210938, + "learning_rate": 1.9995272705112155e-05, + "loss": 1.0413, + "step": 1969 + }, + { + "epoch": 0.32161952573364355, + "grad_norm": 2.299713611602783, + "learning_rate": 1.999526294378976e-05, + "loss": 0.9024, + "step": 1970 + }, + { + "epoch": 0.32178278437614793, + "grad_norm": 2.3408026695251465, + "learning_rate": 1.999525317240214e-05, + "loss": 0.9971, + "step": 1971 + }, + { + "epoch": 0.3219460430186523, + "grad_norm": 2.387197971343994, + "learning_rate": 1.99952433909493e-05, + "loss": 0.9663, + "step": 1972 + }, + { + "epoch": 0.3221093016611567, + "grad_norm": 2.513010025024414, + "learning_rate": 1.9995233599431252e-05, + "loss": 0.7465, + "step": 1973 + }, + { + "epoch": 0.3222725603036611, + "grad_norm": 1.784935474395752, + "learning_rate": 1.9995223797848008e-05, + "loss": 0.7262, + "step": 1974 + }, + { + "epoch": 0.32243581894616546, + "grad_norm": 1.8477271795272827, + "learning_rate": 1.9995213986199576e-05, + "loss": 0.8789, + "step": 1975 + }, + { + "epoch": 0.32259907758866985, + "grad_norm": 2.1493494510650635, + "learning_rate": 1.9995204164485967e-05, + "loss": 1.0142, + "step": 1976 + }, + { + "epoch": 0.32276233623117423, + "grad_norm": 2.319883108139038, + "learning_rate": 1.9995194332707188e-05, + "loss": 0.9799, + "step": 1977 + }, + { + "epoch": 0.3229255948736786, + "grad_norm": 2.0385282039642334, + "learning_rate": 1.9995184490863254e-05, + "loss": 0.9161, + "step": 1978 + }, + { + "epoch": 0.323088853516183, + "grad_norm": 2.0282909870147705, + "learning_rate": 1.9995174638954167e-05, + "loss": 0.9606, + "step": 1979 + }, + { + "epoch": 0.3232521121586874, + "grad_norm": 2.1738038063049316, + "learning_rate": 1.999516477697995e-05, + "loss": 1.1171, + "step": 1980 + }, + { + "epoch": 0.3234153708011918, + "grad_norm": 2.3258984088897705, + "learning_rate": 1.99951549049406e-05, + "loss": 1.048, + "step": 1981 + }, + { + "epoch": 0.3235786294436962, + "grad_norm": 2.3648929595947266, + "learning_rate": 1.999514502283613e-05, + "loss": 1.0, + "step": 1982 + }, + { + "epoch": 0.3237418880862006, + "grad_norm": 2.0263330936431885, + "learning_rate": 1.9995135130666555e-05, + "loss": 0.9626, + "step": 1983 + }, + { + "epoch": 0.32390514672870496, + "grad_norm": 2.278067111968994, + "learning_rate": 1.9995125228431877e-05, + "loss": 1.0609, + "step": 1984 + }, + { + "epoch": 0.32406840537120934, + "grad_norm": 2.182239055633545, + "learning_rate": 1.9995115316132115e-05, + "loss": 0.8626, + "step": 1985 + }, + { + "epoch": 0.3242316640137137, + "grad_norm": 2.162590980529785, + "learning_rate": 1.9995105393767272e-05, + "loss": 1.0223, + "step": 1986 + }, + { + "epoch": 0.3243949226562181, + "grad_norm": 2.054163694381714, + "learning_rate": 1.999509546133736e-05, + "loss": 0.9514, + "step": 1987 + }, + { + "epoch": 0.3245581812987225, + "grad_norm": 1.880192518234253, + "learning_rate": 1.9995085518842388e-05, + "loss": 0.792, + "step": 1988 + }, + { + "epoch": 0.3247214399412269, + "grad_norm": 2.197519302368164, + "learning_rate": 1.9995075566282375e-05, + "loss": 0.9808, + "step": 1989 + }, + { + "epoch": 0.32488469858373126, + "grad_norm": 2.058189868927002, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.9672, + "step": 1990 + }, + { + "epoch": 0.32504795722623564, + "grad_norm": 2.078843832015991, + "learning_rate": 1.9995055630967234e-05, + "loss": 0.9064, + "step": 1991 + }, + { + "epoch": 0.3252112158687401, + "grad_norm": 2.2384493350982666, + "learning_rate": 1.999504564821213e-05, + "loss": 0.8692, + "step": 1992 + }, + { + "epoch": 0.32537447451124446, + "grad_norm": 2.507277250289917, + "learning_rate": 1.9995035655392017e-05, + "loss": 0.851, + "step": 1993 + }, + { + "epoch": 0.32553773315374884, + "grad_norm": 2.3797905445098877, + "learning_rate": 1.999502565250691e-05, + "loss": 1.7626, + "step": 1994 + }, + { + "epoch": 0.3257009917962532, + "grad_norm": 2.4022936820983887, + "learning_rate": 1.999501563955681e-05, + "loss": 0.9421, + "step": 1995 + }, + { + "epoch": 0.3258642504387576, + "grad_norm": 2.4655656814575195, + "learning_rate": 1.9995005616541734e-05, + "loss": 0.9923, + "step": 1996 + }, + { + "epoch": 0.326027509081262, + "grad_norm": 2.080310106277466, + "learning_rate": 1.9994995583461692e-05, + "loss": 0.9348, + "step": 1997 + }, + { + "epoch": 0.3261907677237664, + "grad_norm": 2.1733334064483643, + "learning_rate": 1.999498554031669e-05, + "loss": 0.861, + "step": 1998 + }, + { + "epoch": 0.32635402636627076, + "grad_norm": 2.284602403640747, + "learning_rate": 1.999497548710674e-05, + "loss": 0.7705, + "step": 1999 + }, + { + "epoch": 0.32651728500877514, + "grad_norm": 2.149527072906494, + "learning_rate": 1.9994965423831853e-05, + "loss": 0.7754, + "step": 2000 + }, + { + "epoch": 0.3266805436512795, + "grad_norm": 2.6528069972991943, + "learning_rate": 1.9994955350492036e-05, + "loss": 1.1085, + "step": 2001 + }, + { + "epoch": 0.3268438022937839, + "grad_norm": 2.4617483615875244, + "learning_rate": 1.9994945267087303e-05, + "loss": 1.0819, + "step": 2002 + }, + { + "epoch": 0.32700706093628834, + "grad_norm": 2.2044270038604736, + "learning_rate": 1.9994935173617668e-05, + "loss": 0.8818, + "step": 2003 + }, + { + "epoch": 0.3271703195787927, + "grad_norm": 2.2330777645111084, + "learning_rate": 1.999492507008313e-05, + "loss": 1.0008, + "step": 2004 + }, + { + "epoch": 0.3273335782212971, + "grad_norm": 1.9187195301055908, + "learning_rate": 1.9994914956483708e-05, + "loss": 0.8032, + "step": 2005 + }, + { + "epoch": 0.3274968368638015, + "grad_norm": 2.287264823913574, + "learning_rate": 1.9994904832819407e-05, + "loss": 1.0513, + "step": 2006 + }, + { + "epoch": 0.32766009550630587, + "grad_norm": 2.0997748374938965, + "learning_rate": 1.9994894699090238e-05, + "loss": 0.9403, + "step": 2007 + }, + { + "epoch": 0.32782335414881025, + "grad_norm": 2.169922351837158, + "learning_rate": 1.9994884555296216e-05, + "loss": 1.0135, + "step": 2008 + }, + { + "epoch": 0.32798661279131464, + "grad_norm": 1.8399162292480469, + "learning_rate": 1.9994874401437346e-05, + "loss": 0.9531, + "step": 2009 + }, + { + "epoch": 0.328149871433819, + "grad_norm": 2.3304731845855713, + "learning_rate": 1.9994864237513645e-05, + "loss": 0.9157, + "step": 2010 + }, + { + "epoch": 0.3283131300763234, + "grad_norm": 1.9055558443069458, + "learning_rate": 1.9994854063525114e-05, + "loss": 0.7696, + "step": 2011 + }, + { + "epoch": 0.3284763887188278, + "grad_norm": 3.275575876235962, + "learning_rate": 1.999484387947177e-05, + "loss": 1.0835, + "step": 2012 + }, + { + "epoch": 0.32863964736133217, + "grad_norm": 2.178969621658325, + "learning_rate": 1.9994833685353616e-05, + "loss": 0.8852, + "step": 2013 + }, + { + "epoch": 0.3288029060038366, + "grad_norm": 2.186030149459839, + "learning_rate": 1.9994823481170672e-05, + "loss": 0.9727, + "step": 2014 + }, + { + "epoch": 0.328966164646341, + "grad_norm": 2.189082384109497, + "learning_rate": 1.999481326692294e-05, + "loss": 0.9327, + "step": 2015 + }, + { + "epoch": 0.32912942328884537, + "grad_norm": 2.274480104446411, + "learning_rate": 1.9994803042610435e-05, + "loss": 1.1283, + "step": 2016 + }, + { + "epoch": 0.32929268193134975, + "grad_norm": 2.0124118328094482, + "learning_rate": 1.999479280823317e-05, + "loss": 0.8464, + "step": 2017 + }, + { + "epoch": 0.32945594057385413, + "grad_norm": 2.2133171558380127, + "learning_rate": 1.9994782563791145e-05, + "loss": 0.9932, + "step": 2018 + }, + { + "epoch": 0.3296191992163585, + "grad_norm": 2.128962516784668, + "learning_rate": 1.999477230928438e-05, + "loss": 1.0498, + "step": 2019 + }, + { + "epoch": 0.3297824578588629, + "grad_norm": 1.8741042613983154, + "learning_rate": 1.999476204471288e-05, + "loss": 0.9217, + "step": 2020 + }, + { + "epoch": 0.3299457165013673, + "grad_norm": 2.2913742065429688, + "learning_rate": 1.999475177007666e-05, + "loss": 1.0893, + "step": 2021 + }, + { + "epoch": 0.33010897514387166, + "grad_norm": 2.260071039199829, + "learning_rate": 1.9994741485375722e-05, + "loss": 0.975, + "step": 2022 + }, + { + "epoch": 0.33027223378637605, + "grad_norm": 1.8261796236038208, + "learning_rate": 1.999473119061009e-05, + "loss": 0.8696, + "step": 2023 + }, + { + "epoch": 0.33043549242888043, + "grad_norm": 1.9671235084533691, + "learning_rate": 1.9994720885779763e-05, + "loss": 0.7925, + "step": 2024 + }, + { + "epoch": 0.33059875107138487, + "grad_norm": 2.142930269241333, + "learning_rate": 1.999471057088475e-05, + "loss": 0.9468, + "step": 2025 + }, + { + "epoch": 0.33076200971388925, + "grad_norm": 2.091610908508301, + "learning_rate": 1.9994700245925073e-05, + "loss": 0.8645, + "step": 2026 + }, + { + "epoch": 0.33092526835639363, + "grad_norm": 2.206188917160034, + "learning_rate": 1.9994689910900733e-05, + "loss": 1.022, + "step": 2027 + }, + { + "epoch": 0.331088526998898, + "grad_norm": 2.12250018119812, + "learning_rate": 1.999467956581174e-05, + "loss": 0.8605, + "step": 2028 + }, + { + "epoch": 0.3312517856414024, + "grad_norm": 2.1260790824890137, + "learning_rate": 1.9994669210658112e-05, + "loss": 0.9439, + "step": 2029 + }, + { + "epoch": 0.3314150442839068, + "grad_norm": 2.162956476211548, + "learning_rate": 1.9994658845439853e-05, + "loss": 0.8384, + "step": 2030 + }, + { + "epoch": 0.33157830292641116, + "grad_norm": 2.5060272216796875, + "learning_rate": 1.9994648470156975e-05, + "loss": 0.8788, + "step": 2031 + }, + { + "epoch": 0.33174156156891554, + "grad_norm": 2.014723300933838, + "learning_rate": 1.9994638084809486e-05, + "loss": 0.9125, + "step": 2032 + }, + { + "epoch": 0.3319048202114199, + "grad_norm": 1.939516544342041, + "learning_rate": 1.9994627689397402e-05, + "loss": 0.7789, + "step": 2033 + }, + { + "epoch": 0.3320680788539243, + "grad_norm": 2.0263335704803467, + "learning_rate": 1.999461728392073e-05, + "loss": 0.9457, + "step": 2034 + }, + { + "epoch": 0.3322313374964287, + "grad_norm": 2.236926555633545, + "learning_rate": 1.999460686837948e-05, + "loss": 0.9398, + "step": 2035 + }, + { + "epoch": 0.33239459613893313, + "grad_norm": 2.2217295169830322, + "learning_rate": 1.9994596442773665e-05, + "loss": 0.9888, + "step": 2036 + }, + { + "epoch": 0.3325578547814375, + "grad_norm": 2.509582757949829, + "learning_rate": 1.9994586007103295e-05, + "loss": 0.9791, + "step": 2037 + }, + { + "epoch": 0.3327211134239419, + "grad_norm": 2.2319276332855225, + "learning_rate": 1.9994575561368377e-05, + "loss": 0.9423, + "step": 2038 + }, + { + "epoch": 0.3328843720664463, + "grad_norm": 2.3501384258270264, + "learning_rate": 1.9994565105568926e-05, + "loss": 0.9936, + "step": 2039 + }, + { + "epoch": 0.33304763070895066, + "grad_norm": 2.050896406173706, + "learning_rate": 1.999455463970495e-05, + "loss": 0.8577, + "step": 2040 + }, + { + "epoch": 0.33321088935145504, + "grad_norm": 2.292370319366455, + "learning_rate": 1.999454416377646e-05, + "loss": 0.9187, + "step": 2041 + }, + { + "epoch": 0.3333741479939594, + "grad_norm": 2.061749219894409, + "learning_rate": 1.999453367778347e-05, + "loss": 0.8661, + "step": 2042 + }, + { + "epoch": 0.3335374066364638, + "grad_norm": 2.3626863956451416, + "learning_rate": 1.9994523181725978e-05, + "loss": 0.899, + "step": 2043 + }, + { + "epoch": 0.3337006652789682, + "grad_norm": 2.219214916229248, + "learning_rate": 1.999451267560401e-05, + "loss": 0.8948, + "step": 2044 + }, + { + "epoch": 0.33386392392147257, + "grad_norm": 2.380796194076538, + "learning_rate": 1.9994502159417576e-05, + "loss": 0.9366, + "step": 2045 + }, + { + "epoch": 0.33402718256397695, + "grad_norm": 2.1140198707580566, + "learning_rate": 1.9994491633166674e-05, + "loss": 0.9223, + "step": 2046 + }, + { + "epoch": 0.3341904412064814, + "grad_norm": 2.3979454040527344, + "learning_rate": 1.999448109685132e-05, + "loss": 1.9587, + "step": 2047 + }, + { + "epoch": 0.3343536998489858, + "grad_norm": 2.460454225540161, + "learning_rate": 1.999447055047153e-05, + "loss": 0.9642, + "step": 2048 + }, + { + "epoch": 0.33451695849149016, + "grad_norm": 2.3328163623809814, + "learning_rate": 1.999445999402731e-05, + "loss": 0.8157, + "step": 2049 + }, + { + "epoch": 0.33468021713399454, + "grad_norm": 2.3049585819244385, + "learning_rate": 1.999444942751867e-05, + "loss": 1.0342, + "step": 2050 + }, + { + "epoch": 0.3348434757764989, + "grad_norm": 2.8461382389068604, + "learning_rate": 1.9994438850945626e-05, + "loss": 0.8226, + "step": 2051 + }, + { + "epoch": 0.3350067344190033, + "grad_norm": 2.18105149269104, + "learning_rate": 1.9994428264308177e-05, + "loss": 0.9701, + "step": 2052 + }, + { + "epoch": 0.3351699930615077, + "grad_norm": 2.1959421634674072, + "learning_rate": 1.999441766760635e-05, + "loss": 0.9388, + "step": 2053 + }, + { + "epoch": 0.33533325170401207, + "grad_norm": 2.320424795150757, + "learning_rate": 1.9994407060840142e-05, + "loss": 0.9234, + "step": 2054 + }, + { + "epoch": 0.33549651034651645, + "grad_norm": 2.2582578659057617, + "learning_rate": 1.999439644400957e-05, + "loss": 0.95, + "step": 2055 + }, + { + "epoch": 0.33565976898902083, + "grad_norm": 2.422736883163452, + "learning_rate": 1.9994385817114644e-05, + "loss": 0.8733, + "step": 2056 + }, + { + "epoch": 0.3358230276315252, + "grad_norm": 1.9901894330978394, + "learning_rate": 1.9994375180155374e-05, + "loss": 0.8736, + "step": 2057 + }, + { + "epoch": 0.33598628627402966, + "grad_norm": 2.180048942565918, + "learning_rate": 1.999436453313177e-05, + "loss": 1.003, + "step": 2058 + }, + { + "epoch": 0.33614954491653404, + "grad_norm": 1.9810211658477783, + "learning_rate": 1.9994353876043844e-05, + "loss": 0.7661, + "step": 2059 + }, + { + "epoch": 0.3363128035590384, + "grad_norm": 2.0579397678375244, + "learning_rate": 1.9994343208891607e-05, + "loss": 0.9963, + "step": 2060 + }, + { + "epoch": 0.3364760622015428, + "grad_norm": 2.465041160583496, + "learning_rate": 1.999433253167507e-05, + "loss": 1.032, + "step": 2061 + }, + { + "epoch": 0.3366393208440472, + "grad_norm": 2.716782808303833, + "learning_rate": 1.999432184439424e-05, + "loss": 0.9079, + "step": 2062 + }, + { + "epoch": 0.33680257948655157, + "grad_norm": 2.4450056552886963, + "learning_rate": 1.9994311147049134e-05, + "loss": 1.0619, + "step": 2063 + }, + { + "epoch": 0.33696583812905595, + "grad_norm": 2.1064541339874268, + "learning_rate": 1.9994300439639755e-05, + "loss": 0.9634, + "step": 2064 + }, + { + "epoch": 0.33712909677156033, + "grad_norm": 2.6513583660125732, + "learning_rate": 1.999428972216612e-05, + "loss": 0.8964, + "step": 2065 + }, + { + "epoch": 0.3372923554140647, + "grad_norm": 2.3569247722625732, + "learning_rate": 1.9994278994628238e-05, + "loss": 0.8866, + "step": 2066 + }, + { + "epoch": 0.3374556140565691, + "grad_norm": 2.380462646484375, + "learning_rate": 1.999426825702612e-05, + "loss": 0.9127, + "step": 2067 + }, + { + "epoch": 0.3376188726990735, + "grad_norm": 2.1882009506225586, + "learning_rate": 1.9994257509359775e-05, + "loss": 0.8994, + "step": 2068 + }, + { + "epoch": 0.3377821313415779, + "grad_norm": 2.030829429626465, + "learning_rate": 1.999424675162922e-05, + "loss": 0.7505, + "step": 2069 + }, + { + "epoch": 0.3379453899840823, + "grad_norm": 2.229931116104126, + "learning_rate": 1.9994235983834455e-05, + "loss": 1.0204, + "step": 2070 + }, + { + "epoch": 0.3381086486265867, + "grad_norm": 2.4455432891845703, + "learning_rate": 1.9994225205975503e-05, + "loss": 1.12, + "step": 2071 + }, + { + "epoch": 0.33827190726909107, + "grad_norm": 2.4948627948760986, + "learning_rate": 1.9994214418052363e-05, + "loss": 1.0179, + "step": 2072 + }, + { + "epoch": 0.33843516591159545, + "grad_norm": 2.1760003566741943, + "learning_rate": 1.9994203620065055e-05, + "loss": 0.8521, + "step": 2073 + }, + { + "epoch": 0.33859842455409983, + "grad_norm": 2.46781325340271, + "learning_rate": 1.9994192812013587e-05, + "loss": 1.0272, + "step": 2074 + }, + { + "epoch": 0.3387616831966042, + "grad_norm": 2.51511812210083, + "learning_rate": 1.9994181993897968e-05, + "loss": 0.8909, + "step": 2075 + }, + { + "epoch": 0.3389249418391086, + "grad_norm": 2.062363386154175, + "learning_rate": 1.999417116571821e-05, + "loss": 0.8785, + "step": 2076 + }, + { + "epoch": 0.339088200481613, + "grad_norm": 2.8004567623138428, + "learning_rate": 1.9994160327474325e-05, + "loss": 0.7429, + "step": 2077 + }, + { + "epoch": 0.33925145912411736, + "grad_norm": 2.4795336723327637, + "learning_rate": 1.9994149479166324e-05, + "loss": 1.2717, + "step": 2078 + }, + { + "epoch": 0.33941471776662174, + "grad_norm": 2.1139636039733887, + "learning_rate": 1.9994138620794218e-05, + "loss": 0.9475, + "step": 2079 + }, + { + "epoch": 0.3395779764091262, + "grad_norm": 2.1901333332061768, + "learning_rate": 1.9994127752358014e-05, + "loss": 0.8075, + "step": 2080 + }, + { + "epoch": 0.33974123505163056, + "grad_norm": 2.7274088859558105, + "learning_rate": 1.9994116873857728e-05, + "loss": 0.9243, + "step": 2081 + }, + { + "epoch": 0.33990449369413495, + "grad_norm": 2.0786941051483154, + "learning_rate": 1.9994105985293372e-05, + "loss": 0.9116, + "step": 2082 + }, + { + "epoch": 0.34006775233663933, + "grad_norm": 2.1323153972625732, + "learning_rate": 1.9994095086664947e-05, + "loss": 0.8995, + "step": 2083 + }, + { + "epoch": 0.3402310109791437, + "grad_norm": 2.4291841983795166, + "learning_rate": 1.9994084177972476e-05, + "loss": 0.8695, + "step": 2084 + }, + { + "epoch": 0.3403942696216481, + "grad_norm": 2.288094997406006, + "learning_rate": 1.9994073259215963e-05, + "loss": 0.8031, + "step": 2085 + }, + { + "epoch": 0.3405575282641525, + "grad_norm": 2.4803199768066406, + "learning_rate": 1.999406233039542e-05, + "loss": 0.886, + "step": 2086 + }, + { + "epoch": 0.34072078690665686, + "grad_norm": 2.1942403316497803, + "learning_rate": 1.999405139151086e-05, + "loss": 0.9558, + "step": 2087 + }, + { + "epoch": 0.34088404554916124, + "grad_norm": 2.8580310344696045, + "learning_rate": 1.9994040442562292e-05, + "loss": 1.115, + "step": 2088 + }, + { + "epoch": 0.3410473041916656, + "grad_norm": 2.238694667816162, + "learning_rate": 1.9994029483549732e-05, + "loss": 0.8577, + "step": 2089 + }, + { + "epoch": 0.34121056283417006, + "grad_norm": 2.4264864921569824, + "learning_rate": 1.9994018514473183e-05, + "loss": 0.835, + "step": 2090 + }, + { + "epoch": 0.34137382147667444, + "grad_norm": 2.27400279045105, + "learning_rate": 1.999400753533266e-05, + "loss": 0.8389, + "step": 2091 + }, + { + "epoch": 0.3415370801191788, + "grad_norm": 2.367957353591919, + "learning_rate": 1.9993996546128173e-05, + "loss": 1.0911, + "step": 2092 + }, + { + "epoch": 0.3417003387616832, + "grad_norm": 2.168483257293701, + "learning_rate": 1.999398554685974e-05, + "loss": 0.941, + "step": 2093 + }, + { + "epoch": 0.3418635974041876, + "grad_norm": 2.2000865936279297, + "learning_rate": 1.999397453752736e-05, + "loss": 0.9258, + "step": 2094 + }, + { + "epoch": 0.342026856046692, + "grad_norm": 2.1025407314300537, + "learning_rate": 1.9993963518131054e-05, + "loss": 0.8735, + "step": 2095 + }, + { + "epoch": 0.34219011468919636, + "grad_norm": 2.651447296142578, + "learning_rate": 1.9993952488670828e-05, + "loss": 1.0375, + "step": 2096 + }, + { + "epoch": 0.34235337333170074, + "grad_norm": 2.0821502208709717, + "learning_rate": 1.9993941449146695e-05, + "loss": 0.7812, + "step": 2097 + }, + { + "epoch": 0.3425166319742051, + "grad_norm": 2.144423484802246, + "learning_rate": 1.9993930399558662e-05, + "loss": 1.1149, + "step": 2098 + }, + { + "epoch": 0.3426798906167095, + "grad_norm": 2.2029471397399902, + "learning_rate": 1.999391933990675e-05, + "loss": 1.0421, + "step": 2099 + }, + { + "epoch": 0.3428431492592139, + "grad_norm": 2.1124396324157715, + "learning_rate": 1.999390827019096e-05, + "loss": 0.9474, + "step": 2100 + }, + { + "epoch": 0.3430064079017183, + "grad_norm": 1.9667856693267822, + "learning_rate": 1.9993897190411306e-05, + "loss": 0.9063, + "step": 2101 + }, + { + "epoch": 0.3431696665442227, + "grad_norm": 2.061030387878418, + "learning_rate": 1.9993886100567802e-05, + "loss": 0.8996, + "step": 2102 + }, + { + "epoch": 0.3433329251867271, + "grad_norm": 2.0891366004943848, + "learning_rate": 1.9993875000660455e-05, + "loss": 1.025, + "step": 2103 + }, + { + "epoch": 0.34349618382923147, + "grad_norm": 2.3173506259918213, + "learning_rate": 1.9993863890689283e-05, + "loss": 0.9889, + "step": 2104 + }, + { + "epoch": 0.34365944247173585, + "grad_norm": 2.34025239944458, + "learning_rate": 1.9993852770654293e-05, + "loss": 1.0616, + "step": 2105 + }, + { + "epoch": 0.34382270111424024, + "grad_norm": 2.251068592071533, + "learning_rate": 1.999384164055549e-05, + "loss": 0.8137, + "step": 2106 + }, + { + "epoch": 0.3439859597567446, + "grad_norm": 2.0504937171936035, + "learning_rate": 1.9993830500392896e-05, + "loss": 0.8871, + "step": 2107 + }, + { + "epoch": 0.344149218399249, + "grad_norm": 2.12589955329895, + "learning_rate": 1.9993819350166514e-05, + "loss": 1.076, + "step": 2108 + }, + { + "epoch": 0.3443124770417534, + "grad_norm": 2.095710515975952, + "learning_rate": 1.999380818987636e-05, + "loss": 0.8054, + "step": 2109 + }, + { + "epoch": 0.34447573568425777, + "grad_norm": 2.30148983001709, + "learning_rate": 1.9993797019522446e-05, + "loss": 0.8246, + "step": 2110 + }, + { + "epoch": 0.34463899432676215, + "grad_norm": 2.128077507019043, + "learning_rate": 1.999378583910478e-05, + "loss": 1.0352, + "step": 2111 + }, + { + "epoch": 0.3448022529692666, + "grad_norm": 2.619269609451294, + "learning_rate": 1.999377464862337e-05, + "loss": 0.8868, + "step": 2112 + }, + { + "epoch": 0.34496551161177097, + "grad_norm": 2.2907278537750244, + "learning_rate": 1.9993763448078237e-05, + "loss": 0.9743, + "step": 2113 + }, + { + "epoch": 0.34512877025427535, + "grad_norm": 2.165529251098633, + "learning_rate": 1.9993752237469387e-05, + "loss": 0.8966, + "step": 2114 + }, + { + "epoch": 0.34529202889677973, + "grad_norm": 1.9542646408081055, + "learning_rate": 1.999374101679683e-05, + "loss": 0.7128, + "step": 2115 + }, + { + "epoch": 0.3454552875392841, + "grad_norm": 2.1606602668762207, + "learning_rate": 1.9993729786060576e-05, + "loss": 0.9415, + "step": 2116 + }, + { + "epoch": 0.3456185461817885, + "grad_norm": 2.3581011295318604, + "learning_rate": 1.999371854526064e-05, + "loss": 0.9914, + "step": 2117 + }, + { + "epoch": 0.3457818048242929, + "grad_norm": 2.1375253200531006, + "learning_rate": 1.9993707294397035e-05, + "loss": 0.8501, + "step": 2118 + }, + { + "epoch": 0.34594506346679726, + "grad_norm": 2.8618838787078857, + "learning_rate": 1.9993696033469762e-05, + "loss": 0.7966, + "step": 2119 + }, + { + "epoch": 0.34610832210930165, + "grad_norm": 2.259526014328003, + "learning_rate": 1.9993684762478846e-05, + "loss": 0.7646, + "step": 2120 + }, + { + "epoch": 0.34627158075180603, + "grad_norm": 2.0475118160247803, + "learning_rate": 1.9993673481424292e-05, + "loss": 0.8235, + "step": 2121 + }, + { + "epoch": 0.3464348393943104, + "grad_norm": 2.335235595703125, + "learning_rate": 1.999366219030611e-05, + "loss": 1.0237, + "step": 2122 + }, + { + "epoch": 0.34659809803681485, + "grad_norm": 2.608257293701172, + "learning_rate": 1.9993650889124313e-05, + "loss": 1.7445, + "step": 2123 + }, + { + "epoch": 0.34676135667931923, + "grad_norm": 2.4408581256866455, + "learning_rate": 1.9993639577878912e-05, + "loss": 0.8553, + "step": 2124 + }, + { + "epoch": 0.3469246153218236, + "grad_norm": 2.3189711570739746, + "learning_rate": 1.999362825656992e-05, + "loss": 0.9223, + "step": 2125 + }, + { + "epoch": 0.347087873964328, + "grad_norm": 2.0570454597473145, + "learning_rate": 1.9993616925197346e-05, + "loss": 0.775, + "step": 2126 + }, + { + "epoch": 0.3472511326068324, + "grad_norm": 2.3881213665008545, + "learning_rate": 1.99936055837612e-05, + "loss": 0.975, + "step": 2127 + }, + { + "epoch": 0.34741439124933676, + "grad_norm": 2.2285678386688232, + "learning_rate": 1.99935942322615e-05, + "loss": 0.7383, + "step": 2128 + }, + { + "epoch": 0.34757764989184115, + "grad_norm": 1.9476563930511475, + "learning_rate": 1.999358287069825e-05, + "loss": 1.0025, + "step": 2129 + }, + { + "epoch": 0.3477409085343455, + "grad_norm": 2.308231830596924, + "learning_rate": 1.9993571499071465e-05, + "loss": 0.9545, + "step": 2130 + }, + { + "epoch": 0.3479041671768499, + "grad_norm": 2.1318538188934326, + "learning_rate": 1.9993560117381157e-05, + "loss": 0.836, + "step": 2131 + }, + { + "epoch": 0.3480674258193543, + "grad_norm": 2.246293067932129, + "learning_rate": 1.9993548725627338e-05, + "loss": 0.8676, + "step": 2132 + }, + { + "epoch": 0.3482306844618587, + "grad_norm": 2.2373719215393066, + "learning_rate": 1.9993537323810015e-05, + "loss": 0.9473, + "step": 2133 + }, + { + "epoch": 0.3483939431043631, + "grad_norm": 2.375591993331909, + "learning_rate": 1.9993525911929206e-05, + "loss": 1.1422, + "step": 2134 + }, + { + "epoch": 0.3485572017468675, + "grad_norm": 2.101518392562866, + "learning_rate": 1.9993514489984917e-05, + "loss": 0.8816, + "step": 2135 + }, + { + "epoch": 0.3487204603893719, + "grad_norm": 2.192919969558716, + "learning_rate": 1.999350305797716e-05, + "loss": 1.017, + "step": 2136 + }, + { + "epoch": 0.34888371903187626, + "grad_norm": 2.3834173679351807, + "learning_rate": 1.9993491615905953e-05, + "loss": 1.0242, + "step": 2137 + }, + { + "epoch": 0.34904697767438064, + "grad_norm": 2.317965030670166, + "learning_rate": 1.99934801637713e-05, + "loss": 0.8636, + "step": 2138 + }, + { + "epoch": 0.349210236316885, + "grad_norm": 2.3477907180786133, + "learning_rate": 1.9993468701573213e-05, + "loss": 1.6591, + "step": 2139 + }, + { + "epoch": 0.3493734949593894, + "grad_norm": 2.342668294906616, + "learning_rate": 1.9993457229311707e-05, + "loss": 0.8015, + "step": 2140 + }, + { + "epoch": 0.3495367536018938, + "grad_norm": 1.9518961906433105, + "learning_rate": 1.9993445746986793e-05, + "loss": 0.8932, + "step": 2141 + }, + { + "epoch": 0.3497000122443982, + "grad_norm": 2.0530073642730713, + "learning_rate": 1.999343425459848e-05, + "loss": 0.9125, + "step": 2142 + }, + { + "epoch": 0.34986327088690256, + "grad_norm": 2.084711790084839, + "learning_rate": 1.9993422752146786e-05, + "loss": 0.8997, + "step": 2143 + }, + { + "epoch": 0.35002652952940694, + "grad_norm": 2.10321044921875, + "learning_rate": 1.9993411239631713e-05, + "loss": 0.8179, + "step": 2144 + }, + { + "epoch": 0.3501897881719114, + "grad_norm": 2.346308708190918, + "learning_rate": 1.999339971705328e-05, + "loss": 0.9511, + "step": 2145 + }, + { + "epoch": 0.35035304681441576, + "grad_norm": 2.47147274017334, + "learning_rate": 1.9993388184411495e-05, + "loss": 0.9644, + "step": 2146 + }, + { + "epoch": 0.35051630545692014, + "grad_norm": 2.2305777072906494, + "learning_rate": 1.999337664170637e-05, + "loss": 1.0437, + "step": 2147 + }, + { + "epoch": 0.3506795640994245, + "grad_norm": 2.229667901992798, + "learning_rate": 1.999336508893792e-05, + "loss": 0.9786, + "step": 2148 + }, + { + "epoch": 0.3508428227419289, + "grad_norm": 2.1389808654785156, + "learning_rate": 1.9993353526106152e-05, + "loss": 0.8375, + "step": 2149 + }, + { + "epoch": 0.3510060813844333, + "grad_norm": 2.15337872505188, + "learning_rate": 1.999334195321108e-05, + "loss": 0.9677, + "step": 2150 + }, + { + "epoch": 0.35116934002693767, + "grad_norm": 2.6053457260131836, + "learning_rate": 1.9993330370252713e-05, + "loss": 0.8884, + "step": 2151 + }, + { + "epoch": 0.35133259866944205, + "grad_norm": 2.1354944705963135, + "learning_rate": 1.999331877723107e-05, + "loss": 0.9774, + "step": 2152 + }, + { + "epoch": 0.35149585731194644, + "grad_norm": 2.0861690044403076, + "learning_rate": 1.9993307174146155e-05, + "loss": 0.8699, + "step": 2153 + }, + { + "epoch": 0.3516591159544508, + "grad_norm": 2.134557008743286, + "learning_rate": 1.999329556099798e-05, + "loss": 0.976, + "step": 2154 + }, + { + "epoch": 0.3518223745969552, + "grad_norm": 1.8021939992904663, + "learning_rate": 1.9993283937786562e-05, + "loss": 0.7702, + "step": 2155 + }, + { + "epoch": 0.35198563323945964, + "grad_norm": 2.1763951778411865, + "learning_rate": 1.9993272304511913e-05, + "loss": 1.1184, + "step": 2156 + }, + { + "epoch": 0.352148891881964, + "grad_norm": 2.4434990882873535, + "learning_rate": 1.9993260661174038e-05, + "loss": 0.8916, + "step": 2157 + }, + { + "epoch": 0.3523121505244684, + "grad_norm": 2.2396368980407715, + "learning_rate": 1.999324900777295e-05, + "loss": 0.9654, + "step": 2158 + }, + { + "epoch": 0.3524754091669728, + "grad_norm": 1.6811450719833374, + "learning_rate": 1.9993237344308666e-05, + "loss": 0.7207, + "step": 2159 + }, + { + "epoch": 0.35263866780947717, + "grad_norm": 2.126481533050537, + "learning_rate": 1.9993225670781192e-05, + "loss": 0.8318, + "step": 2160 + }, + { + "epoch": 0.35280192645198155, + "grad_norm": 2.5312836170196533, + "learning_rate": 1.9993213987190543e-05, + "loss": 0.8627, + "step": 2161 + }, + { + "epoch": 0.35296518509448593, + "grad_norm": 2.12343430519104, + "learning_rate": 1.9993202293536733e-05, + "loss": 1.0181, + "step": 2162 + }, + { + "epoch": 0.3531284437369903, + "grad_norm": 2.1349616050720215, + "learning_rate": 1.9993190589819768e-05, + "loss": 0.8591, + "step": 2163 + }, + { + "epoch": 0.3532917023794947, + "grad_norm": 2.408550977706909, + "learning_rate": 1.9993178876039662e-05, + "loss": 1.1706, + "step": 2164 + }, + { + "epoch": 0.3534549610219991, + "grad_norm": 2.338921308517456, + "learning_rate": 1.9993167152196432e-05, + "loss": 0.9855, + "step": 2165 + }, + { + "epoch": 0.35361821966450346, + "grad_norm": 2.0626108646392822, + "learning_rate": 1.999315541829008e-05, + "loss": 0.7688, + "step": 2166 + }, + { + "epoch": 0.3537814783070079, + "grad_norm": 2.1387109756469727, + "learning_rate": 1.9993143674320626e-05, + "loss": 0.8958, + "step": 2167 + }, + { + "epoch": 0.3539447369495123, + "grad_norm": 1.9962046146392822, + "learning_rate": 1.9993131920288078e-05, + "loss": 0.7148, + "step": 2168 + }, + { + "epoch": 0.35410799559201667, + "grad_norm": 2.27956485748291, + "learning_rate": 1.999312015619245e-05, + "loss": 0.8794, + "step": 2169 + }, + { + "epoch": 0.35427125423452105, + "grad_norm": 2.0613083839416504, + "learning_rate": 1.9993108382033752e-05, + "loss": 0.816, + "step": 2170 + }, + { + "epoch": 0.35443451287702543, + "grad_norm": 2.4596760272979736, + "learning_rate": 1.9993096597811997e-05, + "loss": 0.9815, + "step": 2171 + }, + { + "epoch": 0.3545977715195298, + "grad_norm": 2.0031962394714355, + "learning_rate": 1.9993084803527197e-05, + "loss": 0.9398, + "step": 2172 + }, + { + "epoch": 0.3547610301620342, + "grad_norm": 2.244058609008789, + "learning_rate": 1.999307299917936e-05, + "loss": 0.9973, + "step": 2173 + }, + { + "epoch": 0.3549242888045386, + "grad_norm": 2.236382007598877, + "learning_rate": 1.9993061184768504e-05, + "loss": 0.9558, + "step": 2174 + }, + { + "epoch": 0.35508754744704296, + "grad_norm": 2.6043248176574707, + "learning_rate": 1.9993049360294636e-05, + "loss": 0.8723, + "step": 2175 + }, + { + "epoch": 0.35525080608954734, + "grad_norm": 2.1027164459228516, + "learning_rate": 1.9993037525757773e-05, + "loss": 0.7825, + "step": 2176 + }, + { + "epoch": 0.3554140647320517, + "grad_norm": 2.1460907459259033, + "learning_rate": 1.999302568115792e-05, + "loss": 0.8968, + "step": 2177 + }, + { + "epoch": 0.35557732337455616, + "grad_norm": 2.357816219329834, + "learning_rate": 1.9993013826495097e-05, + "loss": 0.8547, + "step": 2178 + }, + { + "epoch": 0.35574058201706055, + "grad_norm": 2.111359119415283, + "learning_rate": 1.999300196176931e-05, + "loss": 0.908, + "step": 2179 + }, + { + "epoch": 0.35590384065956493, + "grad_norm": 2.334989309310913, + "learning_rate": 1.999299008698057e-05, + "loss": 0.987, + "step": 2180 + }, + { + "epoch": 0.3560670993020693, + "grad_norm": 2.6345107555389404, + "learning_rate": 1.9992978202128895e-05, + "loss": 0.948, + "step": 2181 + }, + { + "epoch": 0.3562303579445737, + "grad_norm": 2.2766902446746826, + "learning_rate": 1.9992966307214293e-05, + "loss": 1.12, + "step": 2182 + }, + { + "epoch": 0.3563936165870781, + "grad_norm": 1.8305878639221191, + "learning_rate": 1.9992954402236774e-05, + "loss": 0.7865, + "step": 2183 + }, + { + "epoch": 0.35655687522958246, + "grad_norm": 2.214966058731079, + "learning_rate": 1.9992942487196358e-05, + "loss": 0.7793, + "step": 2184 + }, + { + "epoch": 0.35672013387208684, + "grad_norm": 2.3188652992248535, + "learning_rate": 1.9992930562093048e-05, + "loss": 0.8254, + "step": 2185 + }, + { + "epoch": 0.3568833925145912, + "grad_norm": 2.268418788909912, + "learning_rate": 1.9992918626926857e-05, + "loss": 0.9278, + "step": 2186 + }, + { + "epoch": 0.3570466511570956, + "grad_norm": 2.226747751235962, + "learning_rate": 1.9992906681697804e-05, + "loss": 0.837, + "step": 2187 + }, + { + "epoch": 0.3572099097996, + "grad_norm": 1.9379140138626099, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.7272, + "step": 2188 + }, + { + "epoch": 0.3573731684421044, + "grad_norm": 2.3467555046081543, + "learning_rate": 1.9992882761051145e-05, + "loss": 0.9209, + "step": 2189 + }, + { + "epoch": 0.3575364270846088, + "grad_norm": 2.590972423553467, + "learning_rate": 1.9992870785633563e-05, + "loss": 0.929, + "step": 2190 + }, + { + "epoch": 0.3576996857271132, + "grad_norm": 2.357642889022827, + "learning_rate": 1.999285880015316e-05, + "loss": 1.6963, + "step": 2191 + }, + { + "epoch": 0.3578629443696176, + "grad_norm": 2.321352243423462, + "learning_rate": 1.9992846804609955e-05, + "loss": 0.8093, + "step": 2192 + }, + { + "epoch": 0.35802620301212196, + "grad_norm": 2.1331088542938232, + "learning_rate": 1.9992834799003956e-05, + "loss": 0.8995, + "step": 2193 + }, + { + "epoch": 0.35818946165462634, + "grad_norm": 2.1914002895355225, + "learning_rate": 1.999282278333517e-05, + "loss": 0.8782, + "step": 2194 + }, + { + "epoch": 0.3583527202971307, + "grad_norm": 1.9910295009613037, + "learning_rate": 1.999281075760362e-05, + "loss": 0.8439, + "step": 2195 + }, + { + "epoch": 0.3585159789396351, + "grad_norm": 2.1690845489501953, + "learning_rate": 1.999279872180931e-05, + "loss": 0.8659, + "step": 2196 + }, + { + "epoch": 0.3586792375821395, + "grad_norm": 2.2820091247558594, + "learning_rate": 1.9992786675952256e-05, + "loss": 0.993, + "step": 2197 + }, + { + "epoch": 0.35884249622464387, + "grad_norm": 1.8423489332199097, + "learning_rate": 1.9992774620032466e-05, + "loss": 0.6868, + "step": 2198 + }, + { + "epoch": 0.35900575486714825, + "grad_norm": 2.052805185317993, + "learning_rate": 1.9992762554049955e-05, + "loss": 0.9652, + "step": 2199 + }, + { + "epoch": 0.3591690135096527, + "grad_norm": 2.368788480758667, + "learning_rate": 1.999275047800474e-05, + "loss": 1.0008, + "step": 2200 + }, + { + "epoch": 0.3593322721521571, + "grad_norm": 1.9906386137008667, + "learning_rate": 1.9992738391896818e-05, + "loss": 0.9164, + "step": 2201 + }, + { + "epoch": 0.35949553079466146, + "grad_norm": 2.1123273372650146, + "learning_rate": 1.999272629572622e-05, + "loss": 0.8087, + "step": 2202 + }, + { + "epoch": 0.35965878943716584, + "grad_norm": 2.163785219192505, + "learning_rate": 1.9992714189492943e-05, + "loss": 0.7636, + "step": 2203 + }, + { + "epoch": 0.3598220480796702, + "grad_norm": 2.182392120361328, + "learning_rate": 1.999270207319701e-05, + "loss": 1.0744, + "step": 2204 + }, + { + "epoch": 0.3599853067221746, + "grad_norm": 2.6292316913604736, + "learning_rate": 1.999268994683843e-05, + "loss": 1.0012, + "step": 2205 + }, + { + "epoch": 0.360148565364679, + "grad_norm": 2.496081829071045, + "learning_rate": 1.9992677810417207e-05, + "loss": 1.0008, + "step": 2206 + }, + { + "epoch": 0.36031182400718337, + "grad_norm": 2.3611650466918945, + "learning_rate": 1.9992665663933362e-05, + "loss": 1.0318, + "step": 2207 + }, + { + "epoch": 0.36047508264968775, + "grad_norm": 2.1103577613830566, + "learning_rate": 1.999265350738691e-05, + "loss": 0.7471, + "step": 2208 + }, + { + "epoch": 0.36063834129219213, + "grad_norm": 2.255551338195801, + "learning_rate": 1.9992641340777856e-05, + "loss": 0.945, + "step": 2209 + }, + { + "epoch": 0.3608015999346965, + "grad_norm": 2.042912721633911, + "learning_rate": 1.999262916410621e-05, + "loss": 0.8392, + "step": 2210 + }, + { + "epoch": 0.36096485857720095, + "grad_norm": 2.2842702865600586, + "learning_rate": 1.9992616977371998e-05, + "loss": 0.8474, + "step": 2211 + }, + { + "epoch": 0.36112811721970534, + "grad_norm": 2.192051410675049, + "learning_rate": 1.9992604780575216e-05, + "loss": 1.0306, + "step": 2212 + }, + { + "epoch": 0.3612913758622097, + "grad_norm": 2.4476466178894043, + "learning_rate": 1.999259257371589e-05, + "loss": 0.9405, + "step": 2213 + }, + { + "epoch": 0.3614546345047141, + "grad_norm": 2.293199062347412, + "learning_rate": 1.999258035679402e-05, + "loss": 0.8404, + "step": 2214 + }, + { + "epoch": 0.3616178931472185, + "grad_norm": 2.5179603099823, + "learning_rate": 1.9992568129809627e-05, + "loss": 1.0047, + "step": 2215 + }, + { + "epoch": 0.36178115178972287, + "grad_norm": 2.0449423789978027, + "learning_rate": 1.999255589276272e-05, + "loss": 0.788, + "step": 2216 + }, + { + "epoch": 0.36194441043222725, + "grad_norm": 2.334240198135376, + "learning_rate": 1.9992543645653313e-05, + "loss": 0.8862, + "step": 2217 + }, + { + "epoch": 0.36210766907473163, + "grad_norm": 2.43619704246521, + "learning_rate": 1.9992531388481416e-05, + "loss": 0.9423, + "step": 2218 + }, + { + "epoch": 0.362270927717236, + "grad_norm": 2.5858755111694336, + "learning_rate": 1.9992519121247045e-05, + "loss": 1.0318, + "step": 2219 + }, + { + "epoch": 0.3624341863597404, + "grad_norm": 2.763028621673584, + "learning_rate": 1.9992506843950207e-05, + "loss": 1.0611, + "step": 2220 + }, + { + "epoch": 0.36259744500224483, + "grad_norm": 2.262590169906616, + "learning_rate": 1.999249455659092e-05, + "loss": 0.9048, + "step": 2221 + }, + { + "epoch": 0.3627607036447492, + "grad_norm": 1.978287935256958, + "learning_rate": 1.9992482259169188e-05, + "loss": 0.9646, + "step": 2222 + }, + { + "epoch": 0.3629239622872536, + "grad_norm": 2.0319831371307373, + "learning_rate": 1.9992469951685034e-05, + "loss": 0.8185, + "step": 2223 + }, + { + "epoch": 0.363087220929758, + "grad_norm": 2.1010732650756836, + "learning_rate": 1.9992457634138464e-05, + "loss": 0.8115, + "step": 2224 + }, + { + "epoch": 0.36325047957226236, + "grad_norm": 1.9776264429092407, + "learning_rate": 1.9992445306529492e-05, + "loss": 0.932, + "step": 2225 + }, + { + "epoch": 0.36341373821476675, + "grad_norm": 2.70733642578125, + "learning_rate": 1.9992432968858128e-05, + "loss": 1.1287, + "step": 2226 + }, + { + "epoch": 0.36357699685727113, + "grad_norm": 2.383721351623535, + "learning_rate": 1.999242062112439e-05, + "loss": 0.9074, + "step": 2227 + }, + { + "epoch": 0.3637402554997755, + "grad_norm": 2.4274322986602783, + "learning_rate": 1.9992408263328286e-05, + "loss": 0.9591, + "step": 2228 + }, + { + "epoch": 0.3639035141422799, + "grad_norm": 2.2306063175201416, + "learning_rate": 1.999239589546983e-05, + "loss": 0.865, + "step": 2229 + }, + { + "epoch": 0.3640667727847843, + "grad_norm": 2.1650025844573975, + "learning_rate": 1.9992383517549032e-05, + "loss": 1.007, + "step": 2230 + }, + { + "epoch": 0.36423003142728866, + "grad_norm": 2.162447214126587, + "learning_rate": 1.999237112956591e-05, + "loss": 0.9327, + "step": 2231 + }, + { + "epoch": 0.3643932900697931, + "grad_norm": 2.4145126342773438, + "learning_rate": 1.999235873152047e-05, + "loss": 0.9317, + "step": 2232 + }, + { + "epoch": 0.3645565487122975, + "grad_norm": 2.1048343181610107, + "learning_rate": 1.9992346323412728e-05, + "loss": 0.9958, + "step": 2233 + }, + { + "epoch": 0.36471980735480186, + "grad_norm": 1.872110366821289, + "learning_rate": 1.9992333905242697e-05, + "loss": 1.0119, + "step": 2234 + }, + { + "epoch": 0.36488306599730624, + "grad_norm": 1.938316822052002, + "learning_rate": 1.9992321477010388e-05, + "loss": 0.8157, + "step": 2235 + }, + { + "epoch": 0.3650463246398106, + "grad_norm": 2.1497247219085693, + "learning_rate": 1.9992309038715812e-05, + "loss": 0.999, + "step": 2236 + }, + { + "epoch": 0.365209583282315, + "grad_norm": 1.9800329208374023, + "learning_rate": 1.9992296590358987e-05, + "loss": 0.9124, + "step": 2237 + }, + { + "epoch": 0.3653728419248194, + "grad_norm": 2.2014238834381104, + "learning_rate": 1.9992284131939918e-05, + "loss": 0.8728, + "step": 2238 + }, + { + "epoch": 0.3655361005673238, + "grad_norm": 2.6112349033355713, + "learning_rate": 1.9992271663458623e-05, + "loss": 0.7642, + "step": 2239 + }, + { + "epoch": 0.36569935920982816, + "grad_norm": 1.9591474533081055, + "learning_rate": 1.9992259184915115e-05, + "loss": 0.8695, + "step": 2240 + }, + { + "epoch": 0.36586261785233254, + "grad_norm": 2.0417397022247314, + "learning_rate": 1.99922466963094e-05, + "loss": 0.7663, + "step": 2241 + }, + { + "epoch": 0.3660258764948369, + "grad_norm": 2.4605541229248047, + "learning_rate": 1.99922341976415e-05, + "loss": 1.0054, + "step": 2242 + }, + { + "epoch": 0.36618913513734136, + "grad_norm": 2.291613817214966, + "learning_rate": 1.999222168891142e-05, + "loss": 0.8368, + "step": 2243 + }, + { + "epoch": 0.36635239377984574, + "grad_norm": 2.2952418327331543, + "learning_rate": 1.9992209170119177e-05, + "loss": 1.0905, + "step": 2244 + }, + { + "epoch": 0.3665156524223501, + "grad_norm": 2.679644823074341, + "learning_rate": 1.9992196641264782e-05, + "loss": 1.0988, + "step": 2245 + }, + { + "epoch": 0.3666789110648545, + "grad_norm": 2.282463788986206, + "learning_rate": 1.999218410234825e-05, + "loss": 0.9432, + "step": 2246 + }, + { + "epoch": 0.3668421697073589, + "grad_norm": 2.69142484664917, + "learning_rate": 1.9992171553369584e-05, + "loss": 0.9991, + "step": 2247 + }, + { + "epoch": 0.36700542834986327, + "grad_norm": 2.083199977874756, + "learning_rate": 1.9992158994328812e-05, + "loss": 0.7862, + "step": 2248 + }, + { + "epoch": 0.36716868699236765, + "grad_norm": 2.3319509029388428, + "learning_rate": 1.9992146425225932e-05, + "loss": 0.9877, + "step": 2249 + }, + { + "epoch": 0.36733194563487204, + "grad_norm": 1.9969801902770996, + "learning_rate": 1.999213384606097e-05, + "loss": 0.8515, + "step": 2250 + }, + { + "epoch": 0.3674952042773764, + "grad_norm": 2.3730595111846924, + "learning_rate": 1.9992121256833927e-05, + "loss": 0.9218, + "step": 2251 + }, + { + "epoch": 0.3676584629198808, + "grad_norm": 2.5518863201141357, + "learning_rate": 1.9992108657544823e-05, + "loss": 1.0718, + "step": 2252 + }, + { + "epoch": 0.3678217215623852, + "grad_norm": 1.8573054075241089, + "learning_rate": 1.9992096048193664e-05, + "loss": 0.5913, + "step": 2253 + }, + { + "epoch": 0.3679849802048896, + "grad_norm": 2.7444937229156494, + "learning_rate": 1.999208342878047e-05, + "loss": 0.9895, + "step": 2254 + }, + { + "epoch": 0.368148238847394, + "grad_norm": 2.201964855194092, + "learning_rate": 1.999207079930525e-05, + "loss": 0.8339, + "step": 2255 + }, + { + "epoch": 0.3683114974898984, + "grad_norm": 2.202101230621338, + "learning_rate": 1.9992058159768018e-05, + "loss": 0.8526, + "step": 2256 + }, + { + "epoch": 0.36847475613240277, + "grad_norm": 2.18135929107666, + "learning_rate": 1.999204551016879e-05, + "loss": 0.9135, + "step": 2257 + }, + { + "epoch": 0.36863801477490715, + "grad_norm": 2.245607852935791, + "learning_rate": 1.9992032850507566e-05, + "loss": 0.8863, + "step": 2258 + }, + { + "epoch": 0.36880127341741153, + "grad_norm": 2.2053489685058594, + "learning_rate": 1.9992020180784372e-05, + "loss": 1.71, + "step": 2259 + }, + { + "epoch": 0.3689645320599159, + "grad_norm": 2.4611237049102783, + "learning_rate": 1.9992007500999216e-05, + "loss": 0.7997, + "step": 2260 + }, + { + "epoch": 0.3691277907024203, + "grad_norm": 2.557905673980713, + "learning_rate": 1.9991994811152115e-05, + "loss": 0.8783, + "step": 2261 + }, + { + "epoch": 0.3692910493449247, + "grad_norm": 2.4489026069641113, + "learning_rate": 1.9991982111243075e-05, + "loss": 1.0168, + "step": 2262 + }, + { + "epoch": 0.36945430798742906, + "grad_norm": 1.969509243965149, + "learning_rate": 1.9991969401272107e-05, + "loss": 0.9258, + "step": 2263 + }, + { + "epoch": 0.36961756662993345, + "grad_norm": 2.2505760192871094, + "learning_rate": 1.9991956681239232e-05, + "loss": 0.7833, + "step": 2264 + }, + { + "epoch": 0.3697808252724379, + "grad_norm": 2.386566400527954, + "learning_rate": 1.9991943951144462e-05, + "loss": 0.9975, + "step": 2265 + }, + { + "epoch": 0.36994408391494227, + "grad_norm": 2.1028547286987305, + "learning_rate": 1.9991931210987805e-05, + "loss": 0.9321, + "step": 2266 + }, + { + "epoch": 0.37010734255744665, + "grad_norm": 2.2028493881225586, + "learning_rate": 1.9991918460769274e-05, + "loss": 0.9036, + "step": 2267 + }, + { + "epoch": 0.37027060119995103, + "grad_norm": 2.378711700439453, + "learning_rate": 1.9991905700488886e-05, + "loss": 1.0446, + "step": 2268 + }, + { + "epoch": 0.3704338598424554, + "grad_norm": 2.0718767642974854, + "learning_rate": 1.999189293014665e-05, + "loss": 0.8442, + "step": 2269 + }, + { + "epoch": 0.3705971184849598, + "grad_norm": 1.9486868381500244, + "learning_rate": 1.9991880149742583e-05, + "loss": 0.868, + "step": 2270 + }, + { + "epoch": 0.3707603771274642, + "grad_norm": 2.341618061065674, + "learning_rate": 1.9991867359276696e-05, + "loss": 0.6921, + "step": 2271 + }, + { + "epoch": 0.37092363576996856, + "grad_norm": 1.854557991027832, + "learning_rate": 1.9991854558749e-05, + "loss": 0.7725, + "step": 2272 + }, + { + "epoch": 0.37108689441247295, + "grad_norm": 2.4644439220428467, + "learning_rate": 1.9991841748159506e-05, + "loss": 0.9248, + "step": 2273 + }, + { + "epoch": 0.3712501530549773, + "grad_norm": 2.175028085708618, + "learning_rate": 1.9991828927508234e-05, + "loss": 0.8805, + "step": 2274 + }, + { + "epoch": 0.3714134116974817, + "grad_norm": 2.0989503860473633, + "learning_rate": 1.9991816096795193e-05, + "loss": 0.914, + "step": 2275 + }, + { + "epoch": 0.37157667033998615, + "grad_norm": 2.0568270683288574, + "learning_rate": 1.9991803256020393e-05, + "loss": 0.7703, + "step": 2276 + }, + { + "epoch": 0.37173992898249053, + "grad_norm": 1.9770033359527588, + "learning_rate": 1.999179040518385e-05, + "loss": 0.7251, + "step": 2277 + }, + { + "epoch": 0.3719031876249949, + "grad_norm": 2.2503979206085205, + "learning_rate": 1.9991777544285575e-05, + "loss": 0.9457, + "step": 2278 + }, + { + "epoch": 0.3720664462674993, + "grad_norm": 2.0288405418395996, + "learning_rate": 1.9991764673325587e-05, + "loss": 0.8405, + "step": 2279 + }, + { + "epoch": 0.3722297049100037, + "grad_norm": 2.3824329376220703, + "learning_rate": 1.9991751792303892e-05, + "loss": 0.9277, + "step": 2280 + }, + { + "epoch": 0.37239296355250806, + "grad_norm": 2.0862839221954346, + "learning_rate": 1.999173890122051e-05, + "loss": 0.9338, + "step": 2281 + }, + { + "epoch": 0.37255622219501244, + "grad_norm": 2.0769782066345215, + "learning_rate": 1.9991726000075445e-05, + "loss": 0.9175, + "step": 2282 + }, + { + "epoch": 0.3727194808375168, + "grad_norm": 2.2393691539764404, + "learning_rate": 1.9991713088868714e-05, + "loss": 0.8211, + "step": 2283 + }, + { + "epoch": 0.3728827394800212, + "grad_norm": 2.3423573970794678, + "learning_rate": 1.9991700167600333e-05, + "loss": 1.115, + "step": 2284 + }, + { + "epoch": 0.3730459981225256, + "grad_norm": 2.3422374725341797, + "learning_rate": 1.999168723627031e-05, + "loss": 1.0062, + "step": 2285 + }, + { + "epoch": 0.37320925676503, + "grad_norm": 2.2843518257141113, + "learning_rate": 1.9991674294878663e-05, + "loss": 0.9245, + "step": 2286 + }, + { + "epoch": 0.3733725154075344, + "grad_norm": 2.6438379287719727, + "learning_rate": 1.9991661343425402e-05, + "loss": 0.8941, + "step": 2287 + }, + { + "epoch": 0.3735357740500388, + "grad_norm": 2.2897679805755615, + "learning_rate": 1.999164838191054e-05, + "loss": 0.9905, + "step": 2288 + }, + { + "epoch": 0.3736990326925432, + "grad_norm": 2.055382251739502, + "learning_rate": 1.9991635410334092e-05, + "loss": 1.0715, + "step": 2289 + }, + { + "epoch": 0.37386229133504756, + "grad_norm": 2.235107660293579, + "learning_rate": 1.999162242869607e-05, + "loss": 0.8594, + "step": 2290 + }, + { + "epoch": 0.37402554997755194, + "grad_norm": 2.1087429523468018, + "learning_rate": 1.9991609436996486e-05, + "loss": 0.8611, + "step": 2291 + }, + { + "epoch": 0.3741888086200563, + "grad_norm": 2.0639665126800537, + "learning_rate": 1.9991596435235353e-05, + "loss": 0.8815, + "step": 2292 + }, + { + "epoch": 0.3743520672625607, + "grad_norm": 2.148899793624878, + "learning_rate": 1.999158342341269e-05, + "loss": 1.0174, + "step": 2293 + }, + { + "epoch": 0.3745153259050651, + "grad_norm": 2.390429973602295, + "learning_rate": 1.99915704015285e-05, + "loss": 1.1164, + "step": 2294 + }, + { + "epoch": 0.37467858454756947, + "grad_norm": 2.3499748706817627, + "learning_rate": 1.9991557369582802e-05, + "loss": 0.9772, + "step": 2295 + }, + { + "epoch": 0.37484184319007385, + "grad_norm": 1.977137565612793, + "learning_rate": 1.999154432757561e-05, + "loss": 0.9271, + "step": 2296 + }, + { + "epoch": 0.37500510183257824, + "grad_norm": 2.1660349369049072, + "learning_rate": 1.9991531275506934e-05, + "loss": 0.8688, + "step": 2297 + }, + { + "epoch": 0.3751683604750827, + "grad_norm": 1.947456955909729, + "learning_rate": 1.9991518213376787e-05, + "loss": 0.8144, + "step": 2298 + }, + { + "epoch": 0.37533161911758706, + "grad_norm": 2.258554697036743, + "learning_rate": 1.9991505141185187e-05, + "loss": 0.7922, + "step": 2299 + }, + { + "epoch": 0.37549487776009144, + "grad_norm": 2.0119235515594482, + "learning_rate": 1.9991492058932143e-05, + "loss": 1.0209, + "step": 2300 + }, + { + "epoch": 0.3756581364025958, + "grad_norm": 1.927079200744629, + "learning_rate": 1.999147896661767e-05, + "loss": 0.8222, + "step": 2301 + }, + { + "epoch": 0.3758213950451002, + "grad_norm": 1.9848893880844116, + "learning_rate": 1.9991465864241778e-05, + "loss": 0.8582, + "step": 2302 + }, + { + "epoch": 0.3759846536876046, + "grad_norm": 2.156874895095825, + "learning_rate": 1.9991452751804484e-05, + "loss": 0.8524, + "step": 2303 + }, + { + "epoch": 0.37614791233010897, + "grad_norm": 2.1641054153442383, + "learning_rate": 1.99914396293058e-05, + "loss": 1.0119, + "step": 2304 + }, + { + "epoch": 0.37631117097261335, + "grad_norm": 2.165212631225586, + "learning_rate": 1.999142649674574e-05, + "loss": 0.8027, + "step": 2305 + }, + { + "epoch": 0.37647442961511773, + "grad_norm": 2.553990364074707, + "learning_rate": 1.999141335412431e-05, + "loss": 1.1193, + "step": 2306 + }, + { + "epoch": 0.3766376882576221, + "grad_norm": 2.1941773891448975, + "learning_rate": 1.9991400201441538e-05, + "loss": 0.96, + "step": 2307 + }, + { + "epoch": 0.3768009469001265, + "grad_norm": 2.500070095062256, + "learning_rate": 1.9991387038697423e-05, + "loss": 0.9214, + "step": 2308 + }, + { + "epoch": 0.37696420554263094, + "grad_norm": 2.172396659851074, + "learning_rate": 1.9991373865891986e-05, + "loss": 0.8634, + "step": 2309 + }, + { + "epoch": 0.3771274641851353, + "grad_norm": 2.038590431213379, + "learning_rate": 1.9991360683025238e-05, + "loss": 0.7695, + "step": 2310 + }, + { + "epoch": 0.3772907228276397, + "grad_norm": 1.849043607711792, + "learning_rate": 1.999134749009719e-05, + "loss": 0.7995, + "step": 2311 + }, + { + "epoch": 0.3774539814701441, + "grad_norm": 2.282470464706421, + "learning_rate": 1.999133428710786e-05, + "loss": 0.9761, + "step": 2312 + }, + { + "epoch": 0.37761724011264847, + "grad_norm": 2.2381653785705566, + "learning_rate": 1.9991321074057263e-05, + "loss": 0.9223, + "step": 2313 + }, + { + "epoch": 0.37778049875515285, + "grad_norm": 2.0038673877716064, + "learning_rate": 1.99913078509454e-05, + "loss": 0.9387, + "step": 2314 + }, + { + "epoch": 0.37794375739765723, + "grad_norm": 2.218248128890991, + "learning_rate": 1.99912946177723e-05, + "loss": 0.9657, + "step": 2315 + }, + { + "epoch": 0.3781070160401616, + "grad_norm": 2.7542827129364014, + "learning_rate": 1.9991281374537967e-05, + "loss": 0.8709, + "step": 2316 + }, + { + "epoch": 0.378270274682666, + "grad_norm": 2.073265790939331, + "learning_rate": 1.9991268121242414e-05, + "loss": 0.8432, + "step": 2317 + }, + { + "epoch": 0.3784335333251704, + "grad_norm": 2.5492687225341797, + "learning_rate": 1.999125485788566e-05, + "loss": 0.8065, + "step": 2318 + }, + { + "epoch": 0.37859679196767476, + "grad_norm": 2.3170039653778076, + "learning_rate": 1.999124158446771e-05, + "loss": 0.8277, + "step": 2319 + }, + { + "epoch": 0.3787600506101792, + "grad_norm": 2.0927159786224365, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.7591, + "step": 2320 + }, + { + "epoch": 0.3789233092526836, + "grad_norm": 2.2113091945648193, + "learning_rate": 1.9991215007448293e-05, + "loss": 0.932, + "step": 2321 + }, + { + "epoch": 0.37908656789518796, + "grad_norm": 1.8843798637390137, + "learning_rate": 1.9991201703846858e-05, + "loss": 0.7216, + "step": 2322 + }, + { + "epoch": 0.37924982653769235, + "grad_norm": 2.3609485626220703, + "learning_rate": 1.999118839018428e-05, + "loss": 0.9213, + "step": 2323 + }, + { + "epoch": 0.37941308518019673, + "grad_norm": 2.308814525604248, + "learning_rate": 1.9991175066460578e-05, + "loss": 0.8812, + "step": 2324 + }, + { + "epoch": 0.3795763438227011, + "grad_norm": 2.204939842224121, + "learning_rate": 1.9991161732675767e-05, + "loss": 0.9319, + "step": 2325 + }, + { + "epoch": 0.3797396024652055, + "grad_norm": 2.607072591781616, + "learning_rate": 1.9991148388829855e-05, + "loss": 0.9323, + "step": 2326 + }, + { + "epoch": 0.3799028611077099, + "grad_norm": 2.0990898609161377, + "learning_rate": 1.9991135034922865e-05, + "loss": 0.9018, + "step": 2327 + }, + { + "epoch": 0.38006611975021426, + "grad_norm": 2.0443265438079834, + "learning_rate": 1.99911216709548e-05, + "loss": 0.9103, + "step": 2328 + }, + { + "epoch": 0.38022937839271864, + "grad_norm": 2.032975196838379, + "learning_rate": 1.999110829692568e-05, + "loss": 0.9044, + "step": 2329 + }, + { + "epoch": 0.380392637035223, + "grad_norm": 2.20564603805542, + "learning_rate": 1.9991094912835514e-05, + "loss": 0.7813, + "step": 2330 + }, + { + "epoch": 0.38055589567772746, + "grad_norm": 2.3233721256256104, + "learning_rate": 1.9991081518684322e-05, + "loss": 1.0568, + "step": 2331 + }, + { + "epoch": 0.38071915432023185, + "grad_norm": 2.0378851890563965, + "learning_rate": 1.9991068114472113e-05, + "loss": 0.949, + "step": 2332 + }, + { + "epoch": 0.3808824129627362, + "grad_norm": 2.1214215755462646, + "learning_rate": 1.9991054700198898e-05, + "loss": 0.8867, + "step": 2333 + }, + { + "epoch": 0.3810456716052406, + "grad_norm": 2.3817129135131836, + "learning_rate": 1.9991041275864697e-05, + "loss": 1.0808, + "step": 2334 + }, + { + "epoch": 0.381208930247745, + "grad_norm": 2.0851662158966064, + "learning_rate": 1.999102784146952e-05, + "loss": 0.8811, + "step": 2335 + }, + { + "epoch": 0.3813721888902494, + "grad_norm": 2.3774023056030273, + "learning_rate": 1.999101439701338e-05, + "loss": 1.2085, + "step": 2336 + }, + { + "epoch": 0.38153544753275376, + "grad_norm": 2.5050346851348877, + "learning_rate": 1.999100094249629e-05, + "loss": 0.9142, + "step": 2337 + }, + { + "epoch": 0.38169870617525814, + "grad_norm": 2.126155376434326, + "learning_rate": 1.9990987477918266e-05, + "loss": 1.03, + "step": 2338 + }, + { + "epoch": 0.3818619648177625, + "grad_norm": 2.551145315170288, + "learning_rate": 1.9990974003279316e-05, + "loss": 1.1277, + "step": 2339 + }, + { + "epoch": 0.3820252234602669, + "grad_norm": 2.352571487426758, + "learning_rate": 1.9990960518579462e-05, + "loss": 0.8142, + "step": 2340 + }, + { + "epoch": 0.38218848210277134, + "grad_norm": 2.0983633995056152, + "learning_rate": 1.9990947023818713e-05, + "loss": 0.95, + "step": 2341 + }, + { + "epoch": 0.3823517407452757, + "grad_norm": 2.0947790145874023, + "learning_rate": 1.9990933518997086e-05, + "loss": 0.9835, + "step": 2342 + }, + { + "epoch": 0.3825149993877801, + "grad_norm": 2.527278184890747, + "learning_rate": 1.9990920004114588e-05, + "loss": 1.2071, + "step": 2343 + }, + { + "epoch": 0.3826782580302845, + "grad_norm": 2.229099988937378, + "learning_rate": 1.9990906479171236e-05, + "loss": 0.8382, + "step": 2344 + }, + { + "epoch": 0.3828415166727889, + "grad_norm": 1.9527391195297241, + "learning_rate": 1.9990892944167044e-05, + "loss": 0.9016, + "step": 2345 + }, + { + "epoch": 0.38300477531529326, + "grad_norm": 1.9701675176620483, + "learning_rate": 1.9990879399102024e-05, + "loss": 0.7919, + "step": 2346 + }, + { + "epoch": 0.38316803395779764, + "grad_norm": 2.1456830501556396, + "learning_rate": 1.9990865843976195e-05, + "loss": 0.7835, + "step": 2347 + }, + { + "epoch": 0.383331292600302, + "grad_norm": 2.3690779209136963, + "learning_rate": 1.9990852278789562e-05, + "loss": 0.8976, + "step": 2348 + }, + { + "epoch": 0.3834945512428064, + "grad_norm": 2.124682903289795, + "learning_rate": 1.9990838703542146e-05, + "loss": 0.8693, + "step": 2349 + }, + { + "epoch": 0.3836578098853108, + "grad_norm": 1.9659172296524048, + "learning_rate": 1.9990825118233958e-05, + "loss": 0.7612, + "step": 2350 + }, + { + "epoch": 0.38382106852781517, + "grad_norm": 2.310131072998047, + "learning_rate": 1.9990811522865014e-05, + "loss": 1.0597, + "step": 2351 + }, + { + "epoch": 0.3839843271703196, + "grad_norm": 2.1698062419891357, + "learning_rate": 1.9990797917435324e-05, + "loss": 0.8984, + "step": 2352 + }, + { + "epoch": 0.384147585812824, + "grad_norm": 2.0753705501556396, + "learning_rate": 1.9990784301944902e-05, + "loss": 0.739, + "step": 2353 + }, + { + "epoch": 0.38431084445532837, + "grad_norm": 2.3093955516815186, + "learning_rate": 1.9990770676393762e-05, + "loss": 0.9565, + "step": 2354 + }, + { + "epoch": 0.38447410309783275, + "grad_norm": 2.806009292602539, + "learning_rate": 1.999075704078192e-05, + "loss": 1.1218, + "step": 2355 + }, + { + "epoch": 0.38463736174033714, + "grad_norm": 2.46855092048645, + "learning_rate": 1.999074339510939e-05, + "loss": 1.0959, + "step": 2356 + }, + { + "epoch": 0.3848006203828415, + "grad_norm": 2.4953761100769043, + "learning_rate": 1.999072973937618e-05, + "loss": 1.0218, + "step": 2357 + }, + { + "epoch": 0.3849638790253459, + "grad_norm": 2.846637725830078, + "learning_rate": 1.999071607358231e-05, + "loss": 1.0803, + "step": 2358 + }, + { + "epoch": 0.3851271376678503, + "grad_norm": 2.1669204235076904, + "learning_rate": 1.9990702397727794e-05, + "loss": 0.9435, + "step": 2359 + }, + { + "epoch": 0.38529039631035467, + "grad_norm": 2.246234893798828, + "learning_rate": 1.999068871181264e-05, + "loss": 1.0379, + "step": 2360 + }, + { + "epoch": 0.38545365495285905, + "grad_norm": 1.9008138179779053, + "learning_rate": 1.9990675015836863e-05, + "loss": 0.873, + "step": 2361 + }, + { + "epoch": 0.38561691359536343, + "grad_norm": 2.467764377593994, + "learning_rate": 1.9990661309800483e-05, + "loss": 1.1829, + "step": 2362 + }, + { + "epoch": 0.38578017223786787, + "grad_norm": 2.077467441558838, + "learning_rate": 1.999064759370351e-05, + "loss": 0.9456, + "step": 2363 + }, + { + "epoch": 0.38594343088037225, + "grad_norm": 2.39619779586792, + "learning_rate": 1.9990633867545956e-05, + "loss": 1.031, + "step": 2364 + }, + { + "epoch": 0.38610668952287663, + "grad_norm": 2.1364123821258545, + "learning_rate": 1.9990620131327836e-05, + "loss": 1.1695, + "step": 2365 + }, + { + "epoch": 0.386269948165381, + "grad_norm": 2.1466245651245117, + "learning_rate": 1.9990606385049165e-05, + "loss": 0.9586, + "step": 2366 + }, + { + "epoch": 0.3864332068078854, + "grad_norm": 2.5207529067993164, + "learning_rate": 1.9990592628709957e-05, + "loss": 1.0535, + "step": 2367 + }, + { + "epoch": 0.3865964654503898, + "grad_norm": 2.0793449878692627, + "learning_rate": 1.999057886231022e-05, + "loss": 0.8829, + "step": 2368 + }, + { + "epoch": 0.38675972409289416, + "grad_norm": 2.1145431995391846, + "learning_rate": 1.9990565085849976e-05, + "loss": 0.8804, + "step": 2369 + }, + { + "epoch": 0.38692298273539855, + "grad_norm": 2.1253602504730225, + "learning_rate": 1.999055129932924e-05, + "loss": 1.0116, + "step": 2370 + }, + { + "epoch": 0.38708624137790293, + "grad_norm": 2.194624662399292, + "learning_rate": 1.9990537502748016e-05, + "loss": 0.9817, + "step": 2371 + }, + { + "epoch": 0.3872495000204073, + "grad_norm": 2.5927858352661133, + "learning_rate": 1.9990523696106327e-05, + "loss": 0.8736, + "step": 2372 + }, + { + "epoch": 0.3874127586629117, + "grad_norm": 2.4821815490722656, + "learning_rate": 1.999050987940418e-05, + "loss": 1.0939, + "step": 2373 + }, + { + "epoch": 0.38757601730541613, + "grad_norm": 2.0023574829101562, + "learning_rate": 1.9990496052641594e-05, + "loss": 0.7837, + "step": 2374 + }, + { + "epoch": 0.3877392759479205, + "grad_norm": 2.0740838050842285, + "learning_rate": 1.999048221581858e-05, + "loss": 0.8384, + "step": 2375 + }, + { + "epoch": 0.3879025345904249, + "grad_norm": 2.616694450378418, + "learning_rate": 1.9990468368935155e-05, + "loss": 0.9406, + "step": 2376 + }, + { + "epoch": 0.3880657932329293, + "grad_norm": 2.4535982608795166, + "learning_rate": 1.9990454511991327e-05, + "loss": 1.0165, + "step": 2377 + }, + { + "epoch": 0.38822905187543366, + "grad_norm": 2.466538906097412, + "learning_rate": 1.9990440644987116e-05, + "loss": 1.1825, + "step": 2378 + }, + { + "epoch": 0.38839231051793804, + "grad_norm": 2.030176877975464, + "learning_rate": 1.9990426767922535e-05, + "loss": 0.9254, + "step": 2379 + }, + { + "epoch": 0.3885555691604424, + "grad_norm": 1.867173194885254, + "learning_rate": 1.9990412880797595e-05, + "loss": 0.8518, + "step": 2380 + }, + { + "epoch": 0.3887188278029468, + "grad_norm": 2.000919818878174, + "learning_rate": 1.9990398983612316e-05, + "loss": 0.7092, + "step": 2381 + }, + { + "epoch": 0.3888820864454512, + "grad_norm": 1.9182844161987305, + "learning_rate": 1.99903850763667e-05, + "loss": 0.898, + "step": 2382 + }, + { + "epoch": 0.3890453450879556, + "grad_norm": 2.3210976123809814, + "learning_rate": 1.9990371159060778e-05, + "loss": 1.1047, + "step": 2383 + }, + { + "epoch": 0.38920860373045996, + "grad_norm": 2.0933878421783447, + "learning_rate": 1.999035723169455e-05, + "loss": 0.9968, + "step": 2384 + }, + { + "epoch": 0.3893718623729644, + "grad_norm": 2.1206212043762207, + "learning_rate": 1.9990343294268036e-05, + "loss": 1.0121, + "step": 2385 + }, + { + "epoch": 0.3895351210154688, + "grad_norm": 1.999044418334961, + "learning_rate": 1.999032934678125e-05, + "loss": 0.8421, + "step": 2386 + }, + { + "epoch": 0.38969837965797316, + "grad_norm": 1.8623462915420532, + "learning_rate": 1.9990315389234203e-05, + "loss": 0.7388, + "step": 2387 + }, + { + "epoch": 0.38986163830047754, + "grad_norm": 2.1583359241485596, + "learning_rate": 1.999030142162691e-05, + "loss": 0.8295, + "step": 2388 + }, + { + "epoch": 0.3900248969429819, + "grad_norm": 2.46870756149292, + "learning_rate": 1.999028744395939e-05, + "loss": 0.9973, + "step": 2389 + }, + { + "epoch": 0.3901881555854863, + "grad_norm": 2.453171730041504, + "learning_rate": 1.999027345623165e-05, + "loss": 0.8719, + "step": 2390 + }, + { + "epoch": 0.3903514142279907, + "grad_norm": 2.5562548637390137, + "learning_rate": 1.999025945844371e-05, + "loss": 1.0458, + "step": 2391 + }, + { + "epoch": 0.39051467287049507, + "grad_norm": 2.0096516609191895, + "learning_rate": 1.999024545059558e-05, + "loss": 0.9288, + "step": 2392 + }, + { + "epoch": 0.39067793151299945, + "grad_norm": 2.213114023208618, + "learning_rate": 1.9990231432687274e-05, + "loss": 0.8927, + "step": 2393 + }, + { + "epoch": 0.39084119015550384, + "grad_norm": 2.1281092166900635, + "learning_rate": 1.9990217404718807e-05, + "loss": 0.7749, + "step": 2394 + }, + { + "epoch": 0.3910044487980082, + "grad_norm": 2.512016773223877, + "learning_rate": 1.9990203366690197e-05, + "loss": 0.853, + "step": 2395 + }, + { + "epoch": 0.39116770744051266, + "grad_norm": 2.199647903442383, + "learning_rate": 1.9990189318601452e-05, + "loss": 0.9074, + "step": 2396 + }, + { + "epoch": 0.39133096608301704, + "grad_norm": 2.2656641006469727, + "learning_rate": 1.999017526045259e-05, + "loss": 0.9226, + "step": 2397 + }, + { + "epoch": 0.3914942247255214, + "grad_norm": 2.167797565460205, + "learning_rate": 1.9990161192243628e-05, + "loss": 0.8112, + "step": 2398 + }, + { + "epoch": 0.3916574833680258, + "grad_norm": 1.9817966222763062, + "learning_rate": 1.999014711397457e-05, + "loss": 0.7674, + "step": 2399 + }, + { + "epoch": 0.3918207420105302, + "grad_norm": 2.379504919052124, + "learning_rate": 1.999013302564544e-05, + "loss": 0.7849, + "step": 2400 + }, + { + "epoch": 0.39198400065303457, + "grad_norm": 2.1934640407562256, + "learning_rate": 1.9990118927256247e-05, + "loss": 1.0028, + "step": 2401 + }, + { + "epoch": 0.39214725929553895, + "grad_norm": 2.2847135066986084, + "learning_rate": 1.999010481880701e-05, + "loss": 0.8633, + "step": 2402 + }, + { + "epoch": 0.39231051793804333, + "grad_norm": 2.3834688663482666, + "learning_rate": 1.9990090700297736e-05, + "loss": 0.8161, + "step": 2403 + }, + { + "epoch": 0.3924737765805477, + "grad_norm": 2.2691235542297363, + "learning_rate": 1.9990076571728447e-05, + "loss": 0.8593, + "step": 2404 + }, + { + "epoch": 0.3926370352230521, + "grad_norm": 2.3049893379211426, + "learning_rate": 1.999006243309915e-05, + "loss": 0.9053, + "step": 2405 + }, + { + "epoch": 0.3928002938655565, + "grad_norm": 2.031383514404297, + "learning_rate": 1.9990048284409867e-05, + "loss": 0.7923, + "step": 2406 + }, + { + "epoch": 0.3929635525080609, + "grad_norm": 2.201955795288086, + "learning_rate": 1.9990034125660607e-05, + "loss": 1.0715, + "step": 2407 + }, + { + "epoch": 0.3931268111505653, + "grad_norm": 2.3718245029449463, + "learning_rate": 1.9990019956851384e-05, + "loss": 0.8632, + "step": 2408 + }, + { + "epoch": 0.3932900697930697, + "grad_norm": 2.0922348499298096, + "learning_rate": 1.9990005777982212e-05, + "loss": 0.8749, + "step": 2409 + }, + { + "epoch": 0.39345332843557407, + "grad_norm": 2.5148468017578125, + "learning_rate": 1.9989991589053107e-05, + "loss": 0.9764, + "step": 2410 + }, + { + "epoch": 0.39361658707807845, + "grad_norm": 2.161440849304199, + "learning_rate": 1.9989977390064087e-05, + "loss": 0.9593, + "step": 2411 + }, + { + "epoch": 0.39377984572058283, + "grad_norm": 2.1965153217315674, + "learning_rate": 1.9989963181015158e-05, + "loss": 0.8848, + "step": 2412 + }, + { + "epoch": 0.3939431043630872, + "grad_norm": 2.3817453384399414, + "learning_rate": 1.998994896190634e-05, + "loss": 0.8978, + "step": 2413 + }, + { + "epoch": 0.3941063630055916, + "grad_norm": 2.1136372089385986, + "learning_rate": 1.9989934732737648e-05, + "loss": 1.0197, + "step": 2414 + }, + { + "epoch": 0.394269621648096, + "grad_norm": 2.265977144241333, + "learning_rate": 1.998992049350909e-05, + "loss": 0.9031, + "step": 2415 + }, + { + "epoch": 0.39443288029060036, + "grad_norm": 2.0319786071777344, + "learning_rate": 1.998990624422069e-05, + "loss": 0.9202, + "step": 2416 + }, + { + "epoch": 0.39459613893310475, + "grad_norm": 2.1427950859069824, + "learning_rate": 1.9989891984872457e-05, + "loss": 0.9407, + "step": 2417 + }, + { + "epoch": 0.3947593975756092, + "grad_norm": 2.5593388080596924, + "learning_rate": 1.9989877715464404e-05, + "loss": 0.8438, + "step": 2418 + }, + { + "epoch": 0.39492265621811357, + "grad_norm": 2.397465705871582, + "learning_rate": 1.9989863435996544e-05, + "loss": 1.123, + "step": 2419 + }, + { + "epoch": 0.39508591486061795, + "grad_norm": 1.7739402055740356, + "learning_rate": 1.9989849146468897e-05, + "loss": 0.7916, + "step": 2420 + }, + { + "epoch": 0.39524917350312233, + "grad_norm": 2.119030714035034, + "learning_rate": 1.9989834846881473e-05, + "loss": 0.9219, + "step": 2421 + }, + { + "epoch": 0.3954124321456267, + "grad_norm": 2.2377028465270996, + "learning_rate": 1.9989820537234287e-05, + "loss": 1.0082, + "step": 2422 + }, + { + "epoch": 0.3955756907881311, + "grad_norm": 2.1268718242645264, + "learning_rate": 1.9989806217527357e-05, + "loss": 0.9853, + "step": 2423 + }, + { + "epoch": 0.3957389494306355, + "grad_norm": 2.1222565174102783, + "learning_rate": 1.9989791887760695e-05, + "loss": 0.8893, + "step": 2424 + }, + { + "epoch": 0.39590220807313986, + "grad_norm": 2.537492275238037, + "learning_rate": 1.9989777547934314e-05, + "loss": 1.0533, + "step": 2425 + }, + { + "epoch": 0.39606546671564424, + "grad_norm": 2.0177674293518066, + "learning_rate": 1.998976319804823e-05, + "loss": 0.8432, + "step": 2426 + }, + { + "epoch": 0.3962287253581486, + "grad_norm": 2.4538278579711914, + "learning_rate": 1.9989748838102456e-05, + "loss": 0.9414, + "step": 2427 + }, + { + "epoch": 0.396391984000653, + "grad_norm": 2.0986878871917725, + "learning_rate": 1.998973446809701e-05, + "loss": 0.8182, + "step": 2428 + }, + { + "epoch": 0.39655524264315745, + "grad_norm": 1.7855708599090576, + "learning_rate": 1.99897200880319e-05, + "loss": 0.7151, + "step": 2429 + }, + { + "epoch": 0.39671850128566183, + "grad_norm": 2.5517826080322266, + "learning_rate": 1.998970569790715e-05, + "loss": 1.1297, + "step": 2430 + }, + { + "epoch": 0.3968817599281662, + "grad_norm": 2.115194320678711, + "learning_rate": 1.9989691297722765e-05, + "loss": 0.7574, + "step": 2431 + }, + { + "epoch": 0.3970450185706706, + "grad_norm": 2.2917094230651855, + "learning_rate": 1.9989676887478764e-05, + "loss": 0.9255, + "step": 2432 + }, + { + "epoch": 0.397208277213175, + "grad_norm": 2.540828227996826, + "learning_rate": 1.9989662467175163e-05, + "loss": 1.6806, + "step": 2433 + }, + { + "epoch": 0.39737153585567936, + "grad_norm": 1.9915798902511597, + "learning_rate": 1.998964803681197e-05, + "loss": 0.9324, + "step": 2434 + }, + { + "epoch": 0.39753479449818374, + "grad_norm": 2.2942895889282227, + "learning_rate": 1.998963359638921e-05, + "loss": 0.8661, + "step": 2435 + }, + { + "epoch": 0.3976980531406881, + "grad_norm": 2.766526460647583, + "learning_rate": 1.9989619145906888e-05, + "loss": 0.9932, + "step": 2436 + }, + { + "epoch": 0.3978613117831925, + "grad_norm": 2.6054022312164307, + "learning_rate": 1.9989604685365024e-05, + "loss": 0.9714, + "step": 2437 + }, + { + "epoch": 0.3980245704256969, + "grad_norm": 2.4717018604278564, + "learning_rate": 1.9989590214763627e-05, + "loss": 0.7983, + "step": 2438 + }, + { + "epoch": 0.39818782906820127, + "grad_norm": 2.5681324005126953, + "learning_rate": 1.998957573410272e-05, + "loss": 0.9195, + "step": 2439 + }, + { + "epoch": 0.3983510877107057, + "grad_norm": 2.060110569000244, + "learning_rate": 1.9989561243382313e-05, + "loss": 0.7392, + "step": 2440 + }, + { + "epoch": 0.3985143463532101, + "grad_norm": 2.5571093559265137, + "learning_rate": 1.9989546742602416e-05, + "loss": 0.8896, + "step": 2441 + }, + { + "epoch": 0.3986776049957145, + "grad_norm": 2.0924389362335205, + "learning_rate": 1.998953223176305e-05, + "loss": 0.8161, + "step": 2442 + }, + { + "epoch": 0.39884086363821886, + "grad_norm": 2.2974934577941895, + "learning_rate": 1.9989517710864228e-05, + "loss": 0.993, + "step": 2443 + }, + { + "epoch": 0.39900412228072324, + "grad_norm": 2.378819704055786, + "learning_rate": 1.9989503179905963e-05, + "loss": 1.0175, + "step": 2444 + }, + { + "epoch": 0.3991673809232276, + "grad_norm": 2.131253957748413, + "learning_rate": 1.9989488638888274e-05, + "loss": 0.9907, + "step": 2445 + }, + { + "epoch": 0.399330639565732, + "grad_norm": 2.3086137771606445, + "learning_rate": 1.998947408781117e-05, + "loss": 0.9663, + "step": 2446 + }, + { + "epoch": 0.3994938982082364, + "grad_norm": 2.284175157546997, + "learning_rate": 1.998945952667467e-05, + "loss": 1.0497, + "step": 2447 + }, + { + "epoch": 0.39965715685074077, + "grad_norm": 2.550283432006836, + "learning_rate": 1.9989444955478788e-05, + "loss": 1.0249, + "step": 2448 + }, + { + "epoch": 0.39982041549324515, + "grad_norm": 2.1344642639160156, + "learning_rate": 1.9989430374223534e-05, + "loss": 0.9956, + "step": 2449 + }, + { + "epoch": 0.39998367413574953, + "grad_norm": 2.0825536251068115, + "learning_rate": 1.998941578290893e-05, + "loss": 0.9194, + "step": 2450 + }, + { + "epoch": 0.40014693277825397, + "grad_norm": 2.0687222480773926, + "learning_rate": 1.9989401181534985e-05, + "loss": 0.9302, + "step": 2451 + }, + { + "epoch": 0.40031019142075835, + "grad_norm": 2.411212205886841, + "learning_rate": 1.9989386570101716e-05, + "loss": 0.9873, + "step": 2452 + }, + { + "epoch": 0.40047345006326274, + "grad_norm": 2.179976463317871, + "learning_rate": 1.9989371948609134e-05, + "loss": 0.8882, + "step": 2453 + }, + { + "epoch": 0.4006367087057671, + "grad_norm": 2.2875638008117676, + "learning_rate": 1.998935731705726e-05, + "loss": 1.0496, + "step": 2454 + }, + { + "epoch": 0.4007999673482715, + "grad_norm": 3.0647501945495605, + "learning_rate": 1.998934267544611e-05, + "loss": 1.1592, + "step": 2455 + }, + { + "epoch": 0.4009632259907759, + "grad_norm": 2.404136896133423, + "learning_rate": 1.9989328023775688e-05, + "loss": 0.9799, + "step": 2456 + }, + { + "epoch": 0.40112648463328027, + "grad_norm": 2.19797945022583, + "learning_rate": 1.998931336204602e-05, + "loss": 0.9022, + "step": 2457 + }, + { + "epoch": 0.40128974327578465, + "grad_norm": 2.4626898765563965, + "learning_rate": 1.998929869025711e-05, + "loss": 0.8586, + "step": 2458 + }, + { + "epoch": 0.40145300191828903, + "grad_norm": 1.9525703191757202, + "learning_rate": 1.9989284008408985e-05, + "loss": 0.8928, + "step": 2459 + }, + { + "epoch": 0.4016162605607934, + "grad_norm": 2.2514874935150146, + "learning_rate": 1.998926931650165e-05, + "loss": 1.0309, + "step": 2460 + }, + { + "epoch": 0.4017795192032978, + "grad_norm": 2.2697155475616455, + "learning_rate": 1.9989254614535125e-05, + "loss": 0.739, + "step": 2461 + }, + { + "epoch": 0.40194277784580223, + "grad_norm": 2.230809211730957, + "learning_rate": 1.9989239902509423e-05, + "loss": 0.9559, + "step": 2462 + }, + { + "epoch": 0.4021060364883066, + "grad_norm": 2.2936198711395264, + "learning_rate": 1.998922518042456e-05, + "loss": 0.887, + "step": 2463 + }, + { + "epoch": 0.402269295130811, + "grad_norm": 2.4923603534698486, + "learning_rate": 1.9989210448280548e-05, + "loss": 1.0061, + "step": 2464 + }, + { + "epoch": 0.4024325537733154, + "grad_norm": 2.1371591091156006, + "learning_rate": 1.9989195706077406e-05, + "loss": 0.9595, + "step": 2465 + }, + { + "epoch": 0.40259581241581976, + "grad_norm": 2.299093246459961, + "learning_rate": 1.9989180953815145e-05, + "loss": 0.9594, + "step": 2466 + }, + { + "epoch": 0.40275907105832415, + "grad_norm": 2.4092671871185303, + "learning_rate": 1.9989166191493782e-05, + "loss": 0.9845, + "step": 2467 + }, + { + "epoch": 0.40292232970082853, + "grad_norm": 2.5959017276763916, + "learning_rate": 1.998915141911333e-05, + "loss": 1.1591, + "step": 2468 + }, + { + "epoch": 0.4030855883433329, + "grad_norm": 2.149758815765381, + "learning_rate": 1.9989136636673805e-05, + "loss": 0.8009, + "step": 2469 + }, + { + "epoch": 0.4032488469858373, + "grad_norm": 2.285952091217041, + "learning_rate": 1.9989121844175226e-05, + "loss": 0.9094, + "step": 2470 + }, + { + "epoch": 0.4034121056283417, + "grad_norm": 2.2832398414611816, + "learning_rate": 1.9989107041617602e-05, + "loss": 1.0566, + "step": 2471 + }, + { + "epoch": 0.4035753642708461, + "grad_norm": 2.092925548553467, + "learning_rate": 1.9989092229000947e-05, + "loss": 0.9838, + "step": 2472 + }, + { + "epoch": 0.4037386229133505, + "grad_norm": 2.4731366634368896, + "learning_rate": 1.9989077406325285e-05, + "loss": 0.8645, + "step": 2473 + }, + { + "epoch": 0.4039018815558549, + "grad_norm": 2.2036325931549072, + "learning_rate": 1.9989062573590618e-05, + "loss": 0.9265, + "step": 2474 + }, + { + "epoch": 0.40406514019835926, + "grad_norm": 2.272914409637451, + "learning_rate": 1.998904773079697e-05, + "loss": 0.9708, + "step": 2475 + }, + { + "epoch": 0.40422839884086365, + "grad_norm": 2.1340277194976807, + "learning_rate": 1.9989032877944353e-05, + "loss": 0.7244, + "step": 2476 + }, + { + "epoch": 0.404391657483368, + "grad_norm": 2.39723539352417, + "learning_rate": 1.9989018015032785e-05, + "loss": 1.0894, + "step": 2477 + }, + { + "epoch": 0.4045549161258724, + "grad_norm": 2.237788438796997, + "learning_rate": 1.9989003142062278e-05, + "loss": 0.8802, + "step": 2478 + }, + { + "epoch": 0.4047181747683768, + "grad_norm": 2.2704155445098877, + "learning_rate": 1.9988988259032845e-05, + "loss": 1.047, + "step": 2479 + }, + { + "epoch": 0.4048814334108812, + "grad_norm": 2.0703465938568115, + "learning_rate": 1.9988973365944505e-05, + "loss": 0.8315, + "step": 2480 + }, + { + "epoch": 0.40504469205338556, + "grad_norm": 2.293002128601074, + "learning_rate": 1.998895846279727e-05, + "loss": 0.9043, + "step": 2481 + }, + { + "epoch": 0.40520795069588994, + "grad_norm": 1.9347337484359741, + "learning_rate": 1.998894354959116e-05, + "loss": 0.7863, + "step": 2482 + }, + { + "epoch": 0.4053712093383944, + "grad_norm": 2.192974090576172, + "learning_rate": 1.9988928626326184e-05, + "loss": 0.9902, + "step": 2483 + }, + { + "epoch": 0.40553446798089876, + "grad_norm": 2.1292686462402344, + "learning_rate": 1.9988913693002357e-05, + "loss": 0.802, + "step": 2484 + }, + { + "epoch": 0.40569772662340314, + "grad_norm": 1.9144996404647827, + "learning_rate": 1.9988898749619702e-05, + "loss": 0.6783, + "step": 2485 + }, + { + "epoch": 0.4058609852659075, + "grad_norm": 2.1686155796051025, + "learning_rate": 1.9988883796178225e-05, + "loss": 0.8397, + "step": 2486 + }, + { + "epoch": 0.4060242439084119, + "grad_norm": 2.360686779022217, + "learning_rate": 1.9988868832677945e-05, + "loss": 0.9389, + "step": 2487 + }, + { + "epoch": 0.4061875025509163, + "grad_norm": 2.213864803314209, + "learning_rate": 1.998885385911888e-05, + "loss": 0.7705, + "step": 2488 + }, + { + "epoch": 0.4063507611934207, + "grad_norm": 2.1179182529449463, + "learning_rate": 1.9988838875501038e-05, + "loss": 0.7862, + "step": 2489 + }, + { + "epoch": 0.40651401983592506, + "grad_norm": 2.2212228775024414, + "learning_rate": 1.9988823881824436e-05, + "loss": 0.8399, + "step": 2490 + }, + { + "epoch": 0.40667727847842944, + "grad_norm": 2.483272075653076, + "learning_rate": 1.9988808878089098e-05, + "loss": 0.9944, + "step": 2491 + }, + { + "epoch": 0.4068405371209338, + "grad_norm": 2.3988304138183594, + "learning_rate": 1.998879386429503e-05, + "loss": 0.8675, + "step": 2492 + }, + { + "epoch": 0.4070037957634382, + "grad_norm": 2.72599196434021, + "learning_rate": 1.9988778840442245e-05, + "loss": 0.9306, + "step": 2493 + }, + { + "epoch": 0.40716705440594264, + "grad_norm": 2.291930675506592, + "learning_rate": 1.9988763806530765e-05, + "loss": 0.8025, + "step": 2494 + }, + { + "epoch": 0.407330313048447, + "grad_norm": 2.6197917461395264, + "learning_rate": 1.9988748762560603e-05, + "loss": 0.6936, + "step": 2495 + }, + { + "epoch": 0.4074935716909514, + "grad_norm": 2.8044846057891846, + "learning_rate": 1.9988733708531772e-05, + "loss": 0.988, + "step": 2496 + }, + { + "epoch": 0.4076568303334558, + "grad_norm": 2.4618797302246094, + "learning_rate": 1.998871864444429e-05, + "loss": 0.961, + "step": 2497 + }, + { + "epoch": 0.40782008897596017, + "grad_norm": 2.451298713684082, + "learning_rate": 1.998870357029817e-05, + "loss": 0.9464, + "step": 2498 + }, + { + "epoch": 0.40798334761846455, + "grad_norm": 2.3801121711730957, + "learning_rate": 1.998868848609343e-05, + "loss": 0.7763, + "step": 2499 + }, + { + "epoch": 0.40814660626096894, + "grad_norm": 2.047248601913452, + "learning_rate": 1.9988673391830082e-05, + "loss": 0.9, + "step": 2500 + }, + { + "epoch": 0.4083098649034733, + "grad_norm": 2.207097053527832, + "learning_rate": 1.998865828750814e-05, + "loss": 0.9595, + "step": 2501 + }, + { + "epoch": 0.4084731235459777, + "grad_norm": 2.621706962585449, + "learning_rate": 1.9988643173127627e-05, + "loss": 0.8284, + "step": 2502 + }, + { + "epoch": 0.4086363821884821, + "grad_norm": 2.0710906982421875, + "learning_rate": 1.998862804868855e-05, + "loss": 0.7883, + "step": 2503 + }, + { + "epoch": 0.40879964083098647, + "grad_norm": 2.157914638519287, + "learning_rate": 1.9988612914190927e-05, + "loss": 0.8823, + "step": 2504 + }, + { + "epoch": 0.4089628994734909, + "grad_norm": 2.0399465560913086, + "learning_rate": 1.9988597769634775e-05, + "loss": 0.8627, + "step": 2505 + }, + { + "epoch": 0.4091261581159953, + "grad_norm": 2.9331929683685303, + "learning_rate": 1.9988582615020107e-05, + "loss": 0.992, + "step": 2506 + }, + { + "epoch": 0.40928941675849967, + "grad_norm": 2.2221031188964844, + "learning_rate": 1.9988567450346937e-05, + "loss": 0.7773, + "step": 2507 + }, + { + "epoch": 0.40945267540100405, + "grad_norm": 2.1749541759490967, + "learning_rate": 1.9988552275615287e-05, + "loss": 0.9387, + "step": 2508 + }, + { + "epoch": 0.40961593404350843, + "grad_norm": 1.8630714416503906, + "learning_rate": 1.9988537090825166e-05, + "loss": 0.7531, + "step": 2509 + }, + { + "epoch": 0.4097791926860128, + "grad_norm": 2.0631561279296875, + "learning_rate": 1.9988521895976585e-05, + "loss": 0.8155, + "step": 2510 + }, + { + "epoch": 0.4099424513285172, + "grad_norm": 2.1360857486724854, + "learning_rate": 1.998850669106957e-05, + "loss": 0.8572, + "step": 2511 + }, + { + "epoch": 0.4101057099710216, + "grad_norm": 2.1138761043548584, + "learning_rate": 1.9988491476104134e-05, + "loss": 0.8673, + "step": 2512 + }, + { + "epoch": 0.41026896861352596, + "grad_norm": 2.1313304901123047, + "learning_rate": 1.9988476251080286e-05, + "loss": 0.878, + "step": 2513 + }, + { + "epoch": 0.41043222725603035, + "grad_norm": 2.2187817096710205, + "learning_rate": 1.9988461015998044e-05, + "loss": 0.7952, + "step": 2514 + }, + { + "epoch": 0.41059548589853473, + "grad_norm": 2.1907341480255127, + "learning_rate": 1.9988445770857424e-05, + "loss": 0.9067, + "step": 2515 + }, + { + "epoch": 0.41075874454103917, + "grad_norm": 2.4918317794799805, + "learning_rate": 1.9988430515658444e-05, + "loss": 1.1319, + "step": 2516 + }, + { + "epoch": 0.41092200318354355, + "grad_norm": 2.2417404651641846, + "learning_rate": 1.9988415250401118e-05, + "loss": 0.8734, + "step": 2517 + }, + { + "epoch": 0.41108526182604793, + "grad_norm": 2.722362756729126, + "learning_rate": 1.998839997508546e-05, + "loss": 0.9068, + "step": 2518 + }, + { + "epoch": 0.4112485204685523, + "grad_norm": 2.4187746047973633, + "learning_rate": 1.9988384689711488e-05, + "loss": 0.9884, + "step": 2519 + }, + { + "epoch": 0.4114117791110567, + "grad_norm": 2.4642841815948486, + "learning_rate": 1.998836939427921e-05, + "loss": 0.9182, + "step": 2520 + }, + { + "epoch": 0.4115750377535611, + "grad_norm": 2.4722025394439697, + "learning_rate": 1.9988354088788647e-05, + "loss": 0.9368, + "step": 2521 + }, + { + "epoch": 0.41173829639606546, + "grad_norm": 1.9845893383026123, + "learning_rate": 1.998833877323982e-05, + "loss": 0.8145, + "step": 2522 + }, + { + "epoch": 0.41190155503856984, + "grad_norm": 2.298062324523926, + "learning_rate": 1.9988323447632734e-05, + "loss": 0.8222, + "step": 2523 + }, + { + "epoch": 0.4120648136810742, + "grad_norm": 2.304030656814575, + "learning_rate": 1.998830811196741e-05, + "loss": 0.8895, + "step": 2524 + }, + { + "epoch": 0.4122280723235786, + "grad_norm": 2.721147060394287, + "learning_rate": 1.998829276624386e-05, + "loss": 0.9336, + "step": 2525 + }, + { + "epoch": 0.412391330966083, + "grad_norm": 2.0208778381347656, + "learning_rate": 1.9988277410462107e-05, + "loss": 0.7763, + "step": 2526 + }, + { + "epoch": 0.41255458960858743, + "grad_norm": 2.641338348388672, + "learning_rate": 1.998826204462216e-05, + "loss": 1.0205, + "step": 2527 + }, + { + "epoch": 0.4127178482510918, + "grad_norm": 2.2045462131500244, + "learning_rate": 1.9988246668724033e-05, + "loss": 0.9223, + "step": 2528 + }, + { + "epoch": 0.4128811068935962, + "grad_norm": 2.7185170650482178, + "learning_rate": 1.9988231282767744e-05, + "loss": 1.2058, + "step": 2529 + }, + { + "epoch": 0.4130443655361006, + "grad_norm": 2.5857558250427246, + "learning_rate": 1.9988215886753308e-05, + "loss": 0.922, + "step": 2530 + }, + { + "epoch": 0.41320762417860496, + "grad_norm": 2.5655033588409424, + "learning_rate": 1.9988200480680745e-05, + "loss": 1.0105, + "step": 2531 + }, + { + "epoch": 0.41337088282110934, + "grad_norm": 2.238011121749878, + "learning_rate": 1.9988185064550065e-05, + "loss": 0.8987, + "step": 2532 + }, + { + "epoch": 0.4135341414636137, + "grad_norm": 2.143281936645508, + "learning_rate": 1.9988169638361286e-05, + "loss": 1.0788, + "step": 2533 + }, + { + "epoch": 0.4136974001061181, + "grad_norm": 2.717515230178833, + "learning_rate": 1.998815420211442e-05, + "loss": 1.0217, + "step": 2534 + }, + { + "epoch": 0.4138606587486225, + "grad_norm": 1.9467624425888062, + "learning_rate": 1.998813875580949e-05, + "loss": 0.7849, + "step": 2535 + }, + { + "epoch": 0.41402391739112687, + "grad_norm": 2.3189358711242676, + "learning_rate": 1.9988123299446505e-05, + "loss": 1.112, + "step": 2536 + }, + { + "epoch": 0.41418717603363125, + "grad_norm": 2.7086145877838135, + "learning_rate": 1.998810783302548e-05, + "loss": 1.0364, + "step": 2537 + }, + { + "epoch": 0.4143504346761357, + "grad_norm": 1.9426896572113037, + "learning_rate": 1.9988092356546434e-05, + "loss": 0.8665, + "step": 2538 + }, + { + "epoch": 0.4145136933186401, + "grad_norm": 1.9196878671646118, + "learning_rate": 1.9988076870009384e-05, + "loss": 0.8402, + "step": 2539 + }, + { + "epoch": 0.41467695196114446, + "grad_norm": 1.8388781547546387, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.8519, + "step": 2540 + }, + { + "epoch": 0.41484021060364884, + "grad_norm": 2.220960855484009, + "learning_rate": 1.9988045866761326e-05, + "loss": 0.9278, + "step": 2541 + }, + { + "epoch": 0.4150034692461532, + "grad_norm": 1.971710443496704, + "learning_rate": 1.9988030350050346e-05, + "loss": 0.9422, + "step": 2542 + }, + { + "epoch": 0.4151667278886576, + "grad_norm": 1.9760900735855103, + "learning_rate": 1.9988014823281426e-05, + "loss": 0.8995, + "step": 2543 + }, + { + "epoch": 0.415329986531162, + "grad_norm": 2.2629480361938477, + "learning_rate": 1.998799928645458e-05, + "loss": 1.0904, + "step": 2544 + }, + { + "epoch": 0.41549324517366637, + "grad_norm": 2.1864173412323, + "learning_rate": 1.9987983739569815e-05, + "loss": 1.022, + "step": 2545 + }, + { + "epoch": 0.41565650381617075, + "grad_norm": 1.9978785514831543, + "learning_rate": 1.998796818262716e-05, + "loss": 0.8582, + "step": 2546 + }, + { + "epoch": 0.41581976245867514, + "grad_norm": 2.5000813007354736, + "learning_rate": 1.9987952615626617e-05, + "loss": 0.713, + "step": 2547 + }, + { + "epoch": 0.4159830211011795, + "grad_norm": 2.153843879699707, + "learning_rate": 1.9987937038568212e-05, + "loss": 0.9524, + "step": 2548 + }, + { + "epoch": 0.41614627974368396, + "grad_norm": 2.0682623386383057, + "learning_rate": 1.998792145145196e-05, + "loss": 0.9338, + "step": 2549 + }, + { + "epoch": 0.41630953838618834, + "grad_norm": 1.8734424114227295, + "learning_rate": 1.9987905854277867e-05, + "loss": 0.9532, + "step": 2550 + }, + { + "epoch": 0.4164727970286927, + "grad_norm": 2.2436208724975586, + "learning_rate": 1.998789024704596e-05, + "loss": 0.8538, + "step": 2551 + }, + { + "epoch": 0.4166360556711971, + "grad_norm": 2.1438546180725098, + "learning_rate": 1.9987874629756248e-05, + "loss": 0.7532, + "step": 2552 + }, + { + "epoch": 0.4167993143137015, + "grad_norm": 2.1417322158813477, + "learning_rate": 1.998785900240875e-05, + "loss": 0.8365, + "step": 2553 + }, + { + "epoch": 0.41696257295620587, + "grad_norm": 2.048462152481079, + "learning_rate": 1.998784336500348e-05, + "loss": 0.803, + "step": 2554 + }, + { + "epoch": 0.41712583159871025, + "grad_norm": 1.9908592700958252, + "learning_rate": 1.9987827717540457e-05, + "loss": 0.7958, + "step": 2555 + }, + { + "epoch": 0.41728909024121463, + "grad_norm": 2.3123931884765625, + "learning_rate": 1.9987812060019692e-05, + "loss": 0.9047, + "step": 2556 + }, + { + "epoch": 0.417452348883719, + "grad_norm": 2.427549362182617, + "learning_rate": 1.99877963924412e-05, + "loss": 0.9235, + "step": 2557 + }, + { + "epoch": 0.4176156075262234, + "grad_norm": 2.414076805114746, + "learning_rate": 1.9987780714805006e-05, + "loss": 0.9491, + "step": 2558 + }, + { + "epoch": 0.4177788661687278, + "grad_norm": 2.3998637199401855, + "learning_rate": 1.9987765027111115e-05, + "loss": 0.965, + "step": 2559 + }, + { + "epoch": 0.4179421248112322, + "grad_norm": 2.331723690032959, + "learning_rate": 1.998774932935955e-05, + "loss": 0.826, + "step": 2560 + }, + { + "epoch": 0.4181053834537366, + "grad_norm": 2.5903913974761963, + "learning_rate": 1.998773362155032e-05, + "loss": 1.0947, + "step": 2561 + }, + { + "epoch": 0.418268642096241, + "grad_norm": 2.973273754119873, + "learning_rate": 1.9987717903683447e-05, + "loss": 0.9077, + "step": 2562 + }, + { + "epoch": 0.41843190073874537, + "grad_norm": 2.08506441116333, + "learning_rate": 1.9987702175758944e-05, + "loss": 0.7905, + "step": 2563 + }, + { + "epoch": 0.41859515938124975, + "grad_norm": 2.776777982711792, + "learning_rate": 1.998768643777683e-05, + "loss": 0.9312, + "step": 2564 + }, + { + "epoch": 0.41875841802375413, + "grad_norm": 2.6250064373016357, + "learning_rate": 1.9987670689737116e-05, + "loss": 1.1168, + "step": 2565 + }, + { + "epoch": 0.4189216766662585, + "grad_norm": 2.3053131103515625, + "learning_rate": 1.998765493163982e-05, + "loss": 0.9886, + "step": 2566 + }, + { + "epoch": 0.4190849353087629, + "grad_norm": 2.3273322582244873, + "learning_rate": 1.998763916348496e-05, + "loss": 1.13, + "step": 2567 + }, + { + "epoch": 0.4192481939512673, + "grad_norm": 2.0786330699920654, + "learning_rate": 1.998762338527255e-05, + "loss": 0.8755, + "step": 2568 + }, + { + "epoch": 0.41941145259377166, + "grad_norm": 2.1966702938079834, + "learning_rate": 1.9987607597002605e-05, + "loss": 0.9536, + "step": 2569 + }, + { + "epoch": 0.41957471123627604, + "grad_norm": 2.1364517211914062, + "learning_rate": 1.9987591798675142e-05, + "loss": 0.959, + "step": 2570 + }, + { + "epoch": 0.4197379698787805, + "grad_norm": 1.9524366855621338, + "learning_rate": 1.9987575990290176e-05, + "loss": 0.8255, + "step": 2571 + }, + { + "epoch": 0.41990122852128486, + "grad_norm": 2.1553242206573486, + "learning_rate": 1.9987560171847727e-05, + "loss": 1.0061, + "step": 2572 + }, + { + "epoch": 0.42006448716378925, + "grad_norm": 1.990578532218933, + "learning_rate": 1.9987544343347802e-05, + "loss": 0.95, + "step": 2573 + }, + { + "epoch": 0.42022774580629363, + "grad_norm": 2.5374364852905273, + "learning_rate": 1.9987528504790425e-05, + "loss": 0.9161, + "step": 2574 + }, + { + "epoch": 0.420391004448798, + "grad_norm": 1.9781410694122314, + "learning_rate": 1.9987512656175612e-05, + "loss": 0.8864, + "step": 2575 + }, + { + "epoch": 0.4205542630913024, + "grad_norm": 3.0853006839752197, + "learning_rate": 1.998749679750337e-05, + "loss": 0.9969, + "step": 2576 + }, + { + "epoch": 0.4207175217338068, + "grad_norm": 2.0785045623779297, + "learning_rate": 1.998748092877373e-05, + "loss": 0.9968, + "step": 2577 + }, + { + "epoch": 0.42088078037631116, + "grad_norm": 2.253516912460327, + "learning_rate": 1.9987465049986693e-05, + "loss": 0.8925, + "step": 2578 + }, + { + "epoch": 0.42104403901881554, + "grad_norm": 2.089512348175049, + "learning_rate": 1.9987449161142284e-05, + "loss": 0.934, + "step": 2579 + }, + { + "epoch": 0.4212072976613199, + "grad_norm": 1.8636101484298706, + "learning_rate": 1.9987433262240518e-05, + "loss": 0.8302, + "step": 2580 + }, + { + "epoch": 0.4213705563038243, + "grad_norm": 2.141235113143921, + "learning_rate": 1.9987417353281407e-05, + "loss": 0.8097, + "step": 2581 + }, + { + "epoch": 0.42153381494632874, + "grad_norm": 2.5590522289276123, + "learning_rate": 1.9987401434264966e-05, + "loss": 0.8327, + "step": 2582 + }, + { + "epoch": 0.4216970735888331, + "grad_norm": 2.420496940612793, + "learning_rate": 1.998738550519122e-05, + "loss": 0.9455, + "step": 2583 + }, + { + "epoch": 0.4218603322313375, + "grad_norm": 1.946527361869812, + "learning_rate": 1.998736956606018e-05, + "loss": 0.7956, + "step": 2584 + }, + { + "epoch": 0.4220235908738419, + "grad_norm": 2.544832706451416, + "learning_rate": 1.9987353616871855e-05, + "loss": 1.0669, + "step": 2585 + }, + { + "epoch": 0.4221868495163463, + "grad_norm": 2.0066325664520264, + "learning_rate": 1.9987337657626272e-05, + "loss": 0.7262, + "step": 2586 + }, + { + "epoch": 0.42235010815885066, + "grad_norm": 2.5045721530914307, + "learning_rate": 1.9987321688323444e-05, + "loss": 0.7806, + "step": 2587 + }, + { + "epoch": 0.42251336680135504, + "grad_norm": 2.1405093669891357, + "learning_rate": 1.998730570896338e-05, + "loss": 0.9533, + "step": 2588 + }, + { + "epoch": 0.4226766254438594, + "grad_norm": 2.3566653728485107, + "learning_rate": 1.9987289719546107e-05, + "loss": 1.0133, + "step": 2589 + }, + { + "epoch": 0.4228398840863638, + "grad_norm": 2.389056444168091, + "learning_rate": 1.9987273720071633e-05, + "loss": 0.9189, + "step": 2590 + }, + { + "epoch": 0.4230031427288682, + "grad_norm": 1.881344199180603, + "learning_rate": 1.998725771053998e-05, + "loss": 0.9044, + "step": 2591 + }, + { + "epoch": 0.4231664013713726, + "grad_norm": 2.144057512283325, + "learning_rate": 1.998724169095116e-05, + "loss": 0.8832, + "step": 2592 + }, + { + "epoch": 0.423329660013877, + "grad_norm": 2.0240700244903564, + "learning_rate": 1.998722566130519e-05, + "loss": 0.8022, + "step": 2593 + }, + { + "epoch": 0.4234929186563814, + "grad_norm": 2.117976427078247, + "learning_rate": 1.9987209621602086e-05, + "loss": 0.8283, + "step": 2594 + }, + { + "epoch": 0.42365617729888577, + "grad_norm": 2.322115898132324, + "learning_rate": 1.9987193571841865e-05, + "loss": 0.7451, + "step": 2595 + }, + { + "epoch": 0.42381943594139015, + "grad_norm": 2.426928758621216, + "learning_rate": 1.9987177512024543e-05, + "loss": 1.057, + "step": 2596 + }, + { + "epoch": 0.42398269458389454, + "grad_norm": 2.165200710296631, + "learning_rate": 1.9987161442150135e-05, + "loss": 0.9862, + "step": 2597 + }, + { + "epoch": 0.4241459532263989, + "grad_norm": 2.2707483768463135, + "learning_rate": 1.9987145362218658e-05, + "loss": 0.9719, + "step": 2598 + }, + { + "epoch": 0.4243092118689033, + "grad_norm": 2.195021390914917, + "learning_rate": 1.9987129272230128e-05, + "loss": 0.8666, + "step": 2599 + }, + { + "epoch": 0.4244724705114077, + "grad_norm": 2.2502899169921875, + "learning_rate": 1.9987113172184562e-05, + "loss": 1.0496, + "step": 2600 + }, + { + "epoch": 0.42463572915391207, + "grad_norm": 2.3947930335998535, + "learning_rate": 1.9987097062081978e-05, + "loss": 0.8629, + "step": 2601 + }, + { + "epoch": 0.42479898779641645, + "grad_norm": 2.5473814010620117, + "learning_rate": 1.9987080941922385e-05, + "loss": 1.2059, + "step": 2602 + }, + { + "epoch": 0.4249622464389209, + "grad_norm": 2.2321674823760986, + "learning_rate": 1.9987064811705807e-05, + "loss": 0.9677, + "step": 2603 + }, + { + "epoch": 0.42512550508142527, + "grad_norm": 2.011504650115967, + "learning_rate": 1.9987048671432258e-05, + "loss": 0.7454, + "step": 2604 + }, + { + "epoch": 0.42528876372392965, + "grad_norm": 2.228098154067993, + "learning_rate": 1.998703252110175e-05, + "loss": 0.8066, + "step": 2605 + }, + { + "epoch": 0.42545202236643404, + "grad_norm": 2.144942283630371, + "learning_rate": 1.9987016360714307e-05, + "loss": 1.0303, + "step": 2606 + }, + { + "epoch": 0.4256152810089384, + "grad_norm": 2.026698112487793, + "learning_rate": 1.9987000190269943e-05, + "loss": 0.814, + "step": 2607 + }, + { + "epoch": 0.4257785396514428, + "grad_norm": 2.0498287677764893, + "learning_rate": 1.9986984009768665e-05, + "loss": 0.9857, + "step": 2608 + }, + { + "epoch": 0.4259417982939472, + "grad_norm": 2.043485403060913, + "learning_rate": 1.9986967819210507e-05, + "loss": 0.8504, + "step": 2609 + }, + { + "epoch": 0.42610505693645156, + "grad_norm": 2.2902987003326416, + "learning_rate": 1.9986951618595466e-05, + "loss": 0.9418, + "step": 2610 + }, + { + "epoch": 0.42626831557895595, + "grad_norm": 2.1433684825897217, + "learning_rate": 1.9986935407923572e-05, + "loss": 0.9862, + "step": 2611 + }, + { + "epoch": 0.42643157422146033, + "grad_norm": 2.1448352336883545, + "learning_rate": 1.9986919187194832e-05, + "loss": 0.8599, + "step": 2612 + }, + { + "epoch": 0.4265948328639647, + "grad_norm": 2.2717056274414062, + "learning_rate": 1.9986902956409274e-05, + "loss": 0.9652, + "step": 2613 + }, + { + "epoch": 0.42675809150646915, + "grad_norm": 2.016901731491089, + "learning_rate": 1.9986886715566903e-05, + "loss": 0.9389, + "step": 2614 + }, + { + "epoch": 0.42692135014897353, + "grad_norm": 1.9714692831039429, + "learning_rate": 1.998687046466774e-05, + "loss": 0.7918, + "step": 2615 + }, + { + "epoch": 0.4270846087914779, + "grad_norm": 2.046323299407959, + "learning_rate": 1.99868542037118e-05, + "loss": 0.9182, + "step": 2616 + }, + { + "epoch": 0.4272478674339823, + "grad_norm": 2.1829674243927, + "learning_rate": 1.9986837932699103e-05, + "loss": 0.8179, + "step": 2617 + }, + { + "epoch": 0.4274111260764867, + "grad_norm": 2.3346526622772217, + "learning_rate": 1.9986821651629664e-05, + "loss": 0.8893, + "step": 2618 + }, + { + "epoch": 0.42757438471899106, + "grad_norm": 2.1561694145202637, + "learning_rate": 1.9986805360503494e-05, + "loss": 0.8789, + "step": 2619 + }, + { + "epoch": 0.42773764336149545, + "grad_norm": 2.158025026321411, + "learning_rate": 1.9986789059320614e-05, + "loss": 0.7085, + "step": 2620 + }, + { + "epoch": 0.42790090200399983, + "grad_norm": 2.2466347217559814, + "learning_rate": 1.9986772748081044e-05, + "loss": 0.9338, + "step": 2621 + }, + { + "epoch": 0.4280641606465042, + "grad_norm": 2.3954708576202393, + "learning_rate": 1.9986756426784794e-05, + "loss": 0.9137, + "step": 2622 + }, + { + "epoch": 0.4282274192890086, + "grad_norm": 2.3417820930480957, + "learning_rate": 1.9986740095431884e-05, + "loss": 0.9494, + "step": 2623 + }, + { + "epoch": 0.428390677931513, + "grad_norm": 2.4926111698150635, + "learning_rate": 1.998672375402233e-05, + "loss": 0.9806, + "step": 2624 + }, + { + "epoch": 0.4285539365740174, + "grad_norm": 2.5028328895568848, + "learning_rate": 1.9986707402556144e-05, + "loss": 0.884, + "step": 2625 + }, + { + "epoch": 0.4287171952165218, + "grad_norm": 2.1877872943878174, + "learning_rate": 1.998669104103335e-05, + "loss": 0.8646, + "step": 2626 + }, + { + "epoch": 0.4288804538590262, + "grad_norm": 2.3030076026916504, + "learning_rate": 1.998667466945396e-05, + "loss": 0.9533, + "step": 2627 + }, + { + "epoch": 0.42904371250153056, + "grad_norm": 2.288083553314209, + "learning_rate": 1.998665828781799e-05, + "loss": 0.8689, + "step": 2628 + }, + { + "epoch": 0.42920697114403494, + "grad_norm": 2.1332218647003174, + "learning_rate": 1.9986641896125457e-05, + "loss": 0.9012, + "step": 2629 + }, + { + "epoch": 0.4293702297865393, + "grad_norm": 2.2287707328796387, + "learning_rate": 1.998662549437638e-05, + "loss": 0.9217, + "step": 2630 + }, + { + "epoch": 0.4295334884290437, + "grad_norm": 2.3400299549102783, + "learning_rate": 1.9986609082570775e-05, + "loss": 0.8374, + "step": 2631 + }, + { + "epoch": 0.4296967470715481, + "grad_norm": 2.132723569869995, + "learning_rate": 1.9986592660708654e-05, + "loss": 0.8746, + "step": 2632 + }, + { + "epoch": 0.4298600057140525, + "grad_norm": 2.0846195220947266, + "learning_rate": 1.998657622879004e-05, + "loss": 0.8457, + "step": 2633 + }, + { + "epoch": 0.43002326435655686, + "grad_norm": 2.1875085830688477, + "learning_rate": 1.9986559786814946e-05, + "loss": 0.9212, + "step": 2634 + }, + { + "epoch": 0.43018652299906124, + "grad_norm": 2.014497995376587, + "learning_rate": 1.9986543334783386e-05, + "loss": 0.8776, + "step": 2635 + }, + { + "epoch": 0.4303497816415657, + "grad_norm": 2.0356993675231934, + "learning_rate": 1.9986526872695383e-05, + "loss": 0.8039, + "step": 2636 + }, + { + "epoch": 0.43051304028407006, + "grad_norm": 2.043959856033325, + "learning_rate": 1.9986510400550947e-05, + "loss": 0.8053, + "step": 2637 + }, + { + "epoch": 0.43067629892657444, + "grad_norm": 2.468763589859009, + "learning_rate": 1.99864939183501e-05, + "loss": 0.8736, + "step": 2638 + }, + { + "epoch": 0.4308395575690788, + "grad_norm": 2.263637065887451, + "learning_rate": 1.9986477426092856e-05, + "loss": 0.9566, + "step": 2639 + }, + { + "epoch": 0.4310028162115832, + "grad_norm": 2.235112428665161, + "learning_rate": 1.998646092377923e-05, + "loss": 0.9102, + "step": 2640 + }, + { + "epoch": 0.4311660748540876, + "grad_norm": 2.0359420776367188, + "learning_rate": 1.9986444411409245e-05, + "loss": 0.8232, + "step": 2641 + }, + { + "epoch": 0.43132933349659197, + "grad_norm": 2.1837315559387207, + "learning_rate": 1.9986427888982907e-05, + "loss": 0.9655, + "step": 2642 + }, + { + "epoch": 0.43149259213909635, + "grad_norm": 2.1636385917663574, + "learning_rate": 1.9986411356500242e-05, + "loss": 0.8835, + "step": 2643 + }, + { + "epoch": 0.43165585078160074, + "grad_norm": 2.4290692806243896, + "learning_rate": 1.9986394813961267e-05, + "loss": 0.9449, + "step": 2644 + }, + { + "epoch": 0.4318191094241051, + "grad_norm": 2.2007832527160645, + "learning_rate": 1.9986378261365987e-05, + "loss": 0.974, + "step": 2645 + }, + { + "epoch": 0.4319823680666095, + "grad_norm": 2.0438380241394043, + "learning_rate": 1.998636169871443e-05, + "loss": 0.9276, + "step": 2646 + }, + { + "epoch": 0.43214562670911394, + "grad_norm": 1.9163894653320312, + "learning_rate": 1.9986345126006612e-05, + "loss": 0.7641, + "step": 2647 + }, + { + "epoch": 0.4323088853516183, + "grad_norm": 2.1693336963653564, + "learning_rate": 1.9986328543242546e-05, + "loss": 0.9465, + "step": 2648 + }, + { + "epoch": 0.4324721439941227, + "grad_norm": 2.2977375984191895, + "learning_rate": 1.9986311950422252e-05, + "loss": 0.8577, + "step": 2649 + }, + { + "epoch": 0.4326354026366271, + "grad_norm": 2.289811372756958, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.914, + "step": 2650 + }, + { + "epoch": 0.43279866127913147, + "grad_norm": 2.16536808013916, + "learning_rate": 1.9986278734613032e-05, + "loss": 0.9527, + "step": 2651 + }, + { + "epoch": 0.43296191992163585, + "grad_norm": 2.135718822479248, + "learning_rate": 1.9986262111624145e-05, + "loss": 0.7753, + "step": 2652 + }, + { + "epoch": 0.43312517856414023, + "grad_norm": 2.2220683097839355, + "learning_rate": 1.9986245478579095e-05, + "loss": 0.7502, + "step": 2653 + }, + { + "epoch": 0.4332884372066446, + "grad_norm": 1.9034044742584229, + "learning_rate": 1.99862288354779e-05, + "loss": 0.8147, + "step": 2654 + }, + { + "epoch": 0.433451695849149, + "grad_norm": 2.4922430515289307, + "learning_rate": 1.9986212182320574e-05, + "loss": 1.0012, + "step": 2655 + }, + { + "epoch": 0.4336149544916534, + "grad_norm": 2.050568103790283, + "learning_rate": 1.9986195519107135e-05, + "loss": 0.8234, + "step": 2656 + }, + { + "epoch": 0.43377821313415776, + "grad_norm": 2.2101564407348633, + "learning_rate": 1.99861788458376e-05, + "loss": 0.785, + "step": 2657 + }, + { + "epoch": 0.4339414717766622, + "grad_norm": 2.09731388092041, + "learning_rate": 1.9986162162511983e-05, + "loss": 0.8506, + "step": 2658 + }, + { + "epoch": 0.4341047304191666, + "grad_norm": 2.0556869506835938, + "learning_rate": 1.9986145469130304e-05, + "loss": 0.9208, + "step": 2659 + }, + { + "epoch": 0.43426798906167097, + "grad_norm": 2.35178279876709, + "learning_rate": 1.998612876569258e-05, + "loss": 1.2053, + "step": 2660 + }, + { + "epoch": 0.43443124770417535, + "grad_norm": 2.285785675048828, + "learning_rate": 1.998611205219883e-05, + "loss": 0.9811, + "step": 2661 + }, + { + "epoch": 0.43459450634667973, + "grad_norm": 1.9938793182373047, + "learning_rate": 1.9986095328649063e-05, + "loss": 0.8602, + "step": 2662 + }, + { + "epoch": 0.4347577649891841, + "grad_norm": 1.9170814752578735, + "learning_rate": 1.9986078595043303e-05, + "loss": 0.7936, + "step": 2663 + }, + { + "epoch": 0.4349210236316885, + "grad_norm": 2.26836895942688, + "learning_rate": 1.9986061851381565e-05, + "loss": 0.9123, + "step": 2664 + }, + { + "epoch": 0.4350842822741929, + "grad_norm": 1.9311662912368774, + "learning_rate": 1.9986045097663866e-05, + "loss": 0.7011, + "step": 2665 + }, + { + "epoch": 0.43524754091669726, + "grad_norm": 2.714801788330078, + "learning_rate": 1.9986028333890218e-05, + "loss": 1.1053, + "step": 2666 + }, + { + "epoch": 0.43541079955920164, + "grad_norm": 2.334782361984253, + "learning_rate": 1.9986011560060646e-05, + "loss": 1.0644, + "step": 2667 + }, + { + "epoch": 0.435574058201706, + "grad_norm": 1.690503478050232, + "learning_rate": 1.998599477617516e-05, + "loss": 0.6588, + "step": 2668 + }, + { + "epoch": 0.43573731684421046, + "grad_norm": 1.747901439666748, + "learning_rate": 1.9985977982233783e-05, + "loss": 0.7068, + "step": 2669 + }, + { + "epoch": 0.43590057548671485, + "grad_norm": 2.215257167816162, + "learning_rate": 1.9985961178236526e-05, + "loss": 0.9213, + "step": 2670 + }, + { + "epoch": 0.43606383412921923, + "grad_norm": 2.1856789588928223, + "learning_rate": 1.9985944364183413e-05, + "loss": 1.0029, + "step": 2671 + }, + { + "epoch": 0.4362270927717236, + "grad_norm": 2.3084564208984375, + "learning_rate": 1.9985927540074453e-05, + "loss": 0.9062, + "step": 2672 + }, + { + "epoch": 0.436390351414228, + "grad_norm": 2.2608482837677, + "learning_rate": 1.9985910705909667e-05, + "loss": 1.1294, + "step": 2673 + }, + { + "epoch": 0.4365536100567324, + "grad_norm": 2.2989938259124756, + "learning_rate": 1.9985893861689076e-05, + "loss": 0.9288, + "step": 2674 + }, + { + "epoch": 0.43671686869923676, + "grad_norm": 2.4355976581573486, + "learning_rate": 1.9985877007412685e-05, + "loss": 1.0084, + "step": 2675 + }, + { + "epoch": 0.43688012734174114, + "grad_norm": 2.417752742767334, + "learning_rate": 1.9985860143080526e-05, + "loss": 0.8687, + "step": 2676 + }, + { + "epoch": 0.4370433859842455, + "grad_norm": 2.6050524711608887, + "learning_rate": 1.9985843268692605e-05, + "loss": 0.9473, + "step": 2677 + }, + { + "epoch": 0.4372066446267499, + "grad_norm": 2.0831985473632812, + "learning_rate": 1.998582638424894e-05, + "loss": 0.8053, + "step": 2678 + }, + { + "epoch": 0.4373699032692543, + "grad_norm": 2.1258373260498047, + "learning_rate": 1.9985809489749553e-05, + "loss": 0.7933, + "step": 2679 + }, + { + "epoch": 0.43753316191175873, + "grad_norm": 2.739917039871216, + "learning_rate": 1.998579258519446e-05, + "loss": 0.9547, + "step": 2680 + }, + { + "epoch": 0.4376964205542631, + "grad_norm": 1.8580518960952759, + "learning_rate": 1.998577567058367e-05, + "loss": 0.8272, + "step": 2681 + }, + { + "epoch": 0.4378596791967675, + "grad_norm": 2.168564558029175, + "learning_rate": 1.9985758745917213e-05, + "loss": 0.9615, + "step": 2682 + }, + { + "epoch": 0.4380229378392719, + "grad_norm": 2.185171127319336, + "learning_rate": 1.9985741811195098e-05, + "loss": 0.9181, + "step": 2683 + }, + { + "epoch": 0.43818619648177626, + "grad_norm": 2.091188907623291, + "learning_rate": 1.9985724866417343e-05, + "loss": 0.9006, + "step": 2684 + }, + { + "epoch": 0.43834945512428064, + "grad_norm": 2.0668110847473145, + "learning_rate": 1.9985707911583966e-05, + "loss": 0.8571, + "step": 2685 + }, + { + "epoch": 0.438512713766785, + "grad_norm": 2.063580274581909, + "learning_rate": 1.9985690946694983e-05, + "loss": 0.8711, + "step": 2686 + }, + { + "epoch": 0.4386759724092894, + "grad_norm": 1.9888142347335815, + "learning_rate": 1.9985673971750414e-05, + "loss": 0.8756, + "step": 2687 + }, + { + "epoch": 0.4388392310517938, + "grad_norm": 1.8143372535705566, + "learning_rate": 1.9985656986750273e-05, + "loss": 0.8307, + "step": 2688 + }, + { + "epoch": 0.43900248969429817, + "grad_norm": 1.866960048675537, + "learning_rate": 1.9985639991694578e-05, + "loss": 0.8031, + "step": 2689 + }, + { + "epoch": 0.43916574833680255, + "grad_norm": 2.203295946121216, + "learning_rate": 1.9985622986583347e-05, + "loss": 0.8205, + "step": 2690 + }, + { + "epoch": 0.439329006979307, + "grad_norm": 1.8331339359283447, + "learning_rate": 1.9985605971416596e-05, + "loss": 0.7562, + "step": 2691 + }, + { + "epoch": 0.4394922656218114, + "grad_norm": 2.258321523666382, + "learning_rate": 1.9985588946194343e-05, + "loss": 0.792, + "step": 2692 + }, + { + "epoch": 0.43965552426431576, + "grad_norm": 2.39616060256958, + "learning_rate": 1.9985571910916604e-05, + "loss": 0.9636, + "step": 2693 + }, + { + "epoch": 0.43981878290682014, + "grad_norm": 2.1235127449035645, + "learning_rate": 1.9985554865583394e-05, + "loss": 0.8517, + "step": 2694 + }, + { + "epoch": 0.4399820415493245, + "grad_norm": 2.282531976699829, + "learning_rate": 1.9985537810194734e-05, + "loss": 0.8666, + "step": 2695 + }, + { + "epoch": 0.4401453001918289, + "grad_norm": 2.088718891143799, + "learning_rate": 1.998552074475064e-05, + "loss": 0.8936, + "step": 2696 + }, + { + "epoch": 0.4403085588343333, + "grad_norm": 2.4440555572509766, + "learning_rate": 1.9985503669251135e-05, + "loss": 0.8472, + "step": 2697 + }, + { + "epoch": 0.44047181747683767, + "grad_norm": 1.8653243780136108, + "learning_rate": 1.9985486583696227e-05, + "loss": 0.8235, + "step": 2698 + }, + { + "epoch": 0.44063507611934205, + "grad_norm": 2.849872589111328, + "learning_rate": 1.9985469488085933e-05, + "loss": 0.8711, + "step": 2699 + }, + { + "epoch": 0.44079833476184643, + "grad_norm": 1.9888776540756226, + "learning_rate": 1.9985452382420277e-05, + "loss": 0.813, + "step": 2700 + }, + { + "epoch": 0.4409615934043508, + "grad_norm": 2.2006723880767822, + "learning_rate": 1.998543526669927e-05, + "loss": 0.9467, + "step": 2701 + }, + { + "epoch": 0.44112485204685525, + "grad_norm": 2.177020311355591, + "learning_rate": 1.9985418140922938e-05, + "loss": 0.9523, + "step": 2702 + }, + { + "epoch": 0.44128811068935964, + "grad_norm": 2.3006911277770996, + "learning_rate": 1.9985401005091288e-05, + "loss": 0.9483, + "step": 2703 + }, + { + "epoch": 0.441451369331864, + "grad_norm": 2.1592843532562256, + "learning_rate": 1.9985383859204343e-05, + "loss": 0.8812, + "step": 2704 + }, + { + "epoch": 0.4416146279743684, + "grad_norm": 1.8533493280410767, + "learning_rate": 1.998536670326212e-05, + "loss": 0.7322, + "step": 2705 + }, + { + "epoch": 0.4417778866168728, + "grad_norm": 2.0111234188079834, + "learning_rate": 1.9985349537264637e-05, + "loss": 0.8164, + "step": 2706 + }, + { + "epoch": 0.44194114525937717, + "grad_norm": 2.3743317127227783, + "learning_rate": 1.9985332361211905e-05, + "loss": 0.7042, + "step": 2707 + }, + { + "epoch": 0.44210440390188155, + "grad_norm": 2.3276257514953613, + "learning_rate": 1.9985315175103947e-05, + "loss": 1.1234, + "step": 2708 + }, + { + "epoch": 0.44226766254438593, + "grad_norm": 2.0492115020751953, + "learning_rate": 1.998529797894078e-05, + "loss": 0.8982, + "step": 2709 + }, + { + "epoch": 0.4424309211868903, + "grad_norm": 1.9858986139297485, + "learning_rate": 1.9985280772722423e-05, + "loss": 0.821, + "step": 2710 + }, + { + "epoch": 0.4425941798293947, + "grad_norm": 2.381416082382202, + "learning_rate": 1.9985263556448888e-05, + "loss": 0.9602, + "step": 2711 + }, + { + "epoch": 0.4427574384718991, + "grad_norm": 1.8716630935668945, + "learning_rate": 1.9985246330120197e-05, + "loss": 0.8154, + "step": 2712 + }, + { + "epoch": 0.4429206971144035, + "grad_norm": 2.085667133331299, + "learning_rate": 1.9985229093736365e-05, + "loss": 0.6856, + "step": 2713 + }, + { + "epoch": 0.4430839557569079, + "grad_norm": 2.112861156463623, + "learning_rate": 1.998521184729741e-05, + "loss": 0.8776, + "step": 2714 + }, + { + "epoch": 0.4432472143994123, + "grad_norm": 1.8321067094802856, + "learning_rate": 1.9985194590803346e-05, + "loss": 0.7676, + "step": 2715 + }, + { + "epoch": 0.44341047304191666, + "grad_norm": 1.997267484664917, + "learning_rate": 1.99851773242542e-05, + "loss": 0.8008, + "step": 2716 + }, + { + "epoch": 0.44357373168442105, + "grad_norm": 2.4477429389953613, + "learning_rate": 1.9985160047649978e-05, + "loss": 0.9343, + "step": 2717 + }, + { + "epoch": 0.44373699032692543, + "grad_norm": 2.2380995750427246, + "learning_rate": 1.9985142760990705e-05, + "loss": 0.8682, + "step": 2718 + }, + { + "epoch": 0.4439002489694298, + "grad_norm": 2.1809186935424805, + "learning_rate": 1.9985125464276395e-05, + "loss": 1.0538, + "step": 2719 + }, + { + "epoch": 0.4440635076119342, + "grad_norm": 2.454454183578491, + "learning_rate": 1.998510815750707e-05, + "loss": 1.0195, + "step": 2720 + }, + { + "epoch": 0.4442267662544386, + "grad_norm": 2.165220022201538, + "learning_rate": 1.9985090840682737e-05, + "loss": 0.9512, + "step": 2721 + }, + { + "epoch": 0.44439002489694296, + "grad_norm": 1.8629913330078125, + "learning_rate": 1.9985073513803425e-05, + "loss": 0.8271, + "step": 2722 + }, + { + "epoch": 0.4445532835394474, + "grad_norm": 2.238548994064331, + "learning_rate": 1.9985056176869145e-05, + "loss": 0.8329, + "step": 2723 + }, + { + "epoch": 0.4447165421819518, + "grad_norm": 2.0306456089019775, + "learning_rate": 1.9985038829879917e-05, + "loss": 0.8197, + "step": 2724 + }, + { + "epoch": 0.44487980082445616, + "grad_norm": 2.0444462299346924, + "learning_rate": 1.9985021472835756e-05, + "loss": 0.946, + "step": 2725 + }, + { + "epoch": 0.44504305946696054, + "grad_norm": 2.3781251907348633, + "learning_rate": 1.998500410573668e-05, + "loss": 1.0021, + "step": 2726 + }, + { + "epoch": 0.4452063181094649, + "grad_norm": 1.8087161779403687, + "learning_rate": 1.9984986728582712e-05, + "loss": 0.8959, + "step": 2727 + }, + { + "epoch": 0.4453695767519693, + "grad_norm": 2.05653715133667, + "learning_rate": 1.9984969341373862e-05, + "loss": 0.8544, + "step": 2728 + }, + { + "epoch": 0.4455328353944737, + "grad_norm": 2.2994933128356934, + "learning_rate": 1.9984951944110152e-05, + "loss": 1.015, + "step": 2729 + }, + { + "epoch": 0.4456960940369781, + "grad_norm": 2.725553512573242, + "learning_rate": 1.9984934536791594e-05, + "loss": 1.0603, + "step": 2730 + }, + { + "epoch": 0.44585935267948246, + "grad_norm": 2.4205844402313232, + "learning_rate": 1.9984917119418214e-05, + "loss": 1.0091, + "step": 2731 + }, + { + "epoch": 0.44602261132198684, + "grad_norm": 2.026588201522827, + "learning_rate": 1.9984899691990024e-05, + "loss": 0.8715, + "step": 2732 + }, + { + "epoch": 0.4461858699644912, + "grad_norm": 2.3007028102874756, + "learning_rate": 1.998488225450704e-05, + "loss": 0.9269, + "step": 2733 + }, + { + "epoch": 0.44634912860699566, + "grad_norm": 2.229365587234497, + "learning_rate": 1.9984864806969286e-05, + "loss": 0.8723, + "step": 2734 + }, + { + "epoch": 0.44651238724950004, + "grad_norm": 2.1601788997650146, + "learning_rate": 1.9984847349376775e-05, + "loss": 0.757, + "step": 2735 + }, + { + "epoch": 0.4466756458920044, + "grad_norm": 2.043607473373413, + "learning_rate": 1.9984829881729526e-05, + "loss": 0.8626, + "step": 2736 + }, + { + "epoch": 0.4468389045345088, + "grad_norm": 2.118283271789551, + "learning_rate": 1.9984812404027555e-05, + "loss": 1.0084, + "step": 2737 + }, + { + "epoch": 0.4470021631770132, + "grad_norm": 2.2881579399108887, + "learning_rate": 1.9984794916270876e-05, + "loss": 1.3142, + "step": 2738 + }, + { + "epoch": 0.4471654218195176, + "grad_norm": 2.0737898349761963, + "learning_rate": 1.9984777418459517e-05, + "loss": 0.8781, + "step": 2739 + }, + { + "epoch": 0.44732868046202195, + "grad_norm": 2.217668056488037, + "learning_rate": 1.998475991059349e-05, + "loss": 0.9425, + "step": 2740 + }, + { + "epoch": 0.44749193910452634, + "grad_norm": 2.0479846000671387, + "learning_rate": 1.998474239267281e-05, + "loss": 0.8539, + "step": 2741 + }, + { + "epoch": 0.4476551977470307, + "grad_norm": 2.188425302505493, + "learning_rate": 1.9984724864697495e-05, + "loss": 0.8078, + "step": 2742 + }, + { + "epoch": 0.4478184563895351, + "grad_norm": 2.477513551712036, + "learning_rate": 1.998470732666757e-05, + "loss": 0.9089, + "step": 2743 + }, + { + "epoch": 0.4479817150320395, + "grad_norm": 1.9713108539581299, + "learning_rate": 1.9984689778583044e-05, + "loss": 0.9462, + "step": 2744 + }, + { + "epoch": 0.4481449736745439, + "grad_norm": 2.284900188446045, + "learning_rate": 1.998467222044394e-05, + "loss": 1.1004, + "step": 2745 + }, + { + "epoch": 0.4483082323170483, + "grad_norm": 2.3754286766052246, + "learning_rate": 1.998465465225027e-05, + "loss": 0.8506, + "step": 2746 + }, + { + "epoch": 0.4484714909595527, + "grad_norm": 2.0146396160125732, + "learning_rate": 1.9984637074002056e-05, + "loss": 0.9111, + "step": 2747 + }, + { + "epoch": 0.44863474960205707, + "grad_norm": 2.218055486679077, + "learning_rate": 1.998461948569932e-05, + "loss": 1.0541, + "step": 2748 + }, + { + "epoch": 0.44879800824456145, + "grad_norm": 2.104807138442993, + "learning_rate": 1.998460188734207e-05, + "loss": 0.9474, + "step": 2749 + }, + { + "epoch": 0.44896126688706584, + "grad_norm": 2.4090664386749268, + "learning_rate": 1.9984584278930333e-05, + "loss": 0.7195, + "step": 2750 + }, + { + "epoch": 0.4491245255295702, + "grad_norm": 2.2388219833374023, + "learning_rate": 1.998456666046412e-05, + "loss": 0.9465, + "step": 2751 + }, + { + "epoch": 0.4492877841720746, + "grad_norm": 2.3202152252197266, + "learning_rate": 1.998454903194345e-05, + "loss": 0.9494, + "step": 2752 + }, + { + "epoch": 0.449451042814579, + "grad_norm": 2.259066343307495, + "learning_rate": 1.998453139336834e-05, + "loss": 0.8967, + "step": 2753 + }, + { + "epoch": 0.44961430145708337, + "grad_norm": 2.4667165279388428, + "learning_rate": 1.9984513744738815e-05, + "loss": 0.9511, + "step": 2754 + }, + { + "epoch": 0.44977756009958775, + "grad_norm": 1.9335856437683105, + "learning_rate": 1.9984496086054882e-05, + "loss": 0.9403, + "step": 2755 + }, + { + "epoch": 0.4499408187420922, + "grad_norm": 2.232543706893921, + "learning_rate": 1.9984478417316566e-05, + "loss": 0.8169, + "step": 2756 + }, + { + "epoch": 0.45010407738459657, + "grad_norm": 2.1563398838043213, + "learning_rate": 1.9984460738523882e-05, + "loss": 0.9334, + "step": 2757 + }, + { + "epoch": 0.45026733602710095, + "grad_norm": 1.9198511838912964, + "learning_rate": 1.998444304967685e-05, + "loss": 0.7089, + "step": 2758 + }, + { + "epoch": 0.45043059466960533, + "grad_norm": 2.1701595783233643, + "learning_rate": 1.998442535077549e-05, + "loss": 0.9623, + "step": 2759 + }, + { + "epoch": 0.4505938533121097, + "grad_norm": 2.248934745788574, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.8981, + "step": 2760 + }, + { + "epoch": 0.4507571119546141, + "grad_norm": 2.0339176654815674, + "learning_rate": 1.998438992280984e-05, + "loss": 0.7647, + "step": 2761 + }, + { + "epoch": 0.4509203705971185, + "grad_norm": 1.8736517429351807, + "learning_rate": 1.9984372193745588e-05, + "loss": 0.8505, + "step": 2762 + }, + { + "epoch": 0.45108362923962286, + "grad_norm": 1.9474809169769287, + "learning_rate": 1.9984354454627074e-05, + "loss": 0.6766, + "step": 2763 + }, + { + "epoch": 0.45124688788212725, + "grad_norm": 2.469625473022461, + "learning_rate": 1.998433670545432e-05, + "loss": 0.9633, + "step": 2764 + }, + { + "epoch": 0.45141014652463163, + "grad_norm": 2.1931521892547607, + "learning_rate": 1.9984318946227343e-05, + "loss": 0.9492, + "step": 2765 + }, + { + "epoch": 0.451573405167136, + "grad_norm": 2.0469679832458496, + "learning_rate": 1.998430117694616e-05, + "loss": 0.9221, + "step": 2766 + }, + { + "epoch": 0.45173666380964045, + "grad_norm": 2.043379306793213, + "learning_rate": 1.9984283397610785e-05, + "loss": 0.8435, + "step": 2767 + }, + { + "epoch": 0.45189992245214483, + "grad_norm": 2.3748512268066406, + "learning_rate": 1.9984265608221242e-05, + "loss": 0.9452, + "step": 2768 + }, + { + "epoch": 0.4520631810946492, + "grad_norm": 2.4516921043395996, + "learning_rate": 1.9984247808777547e-05, + "loss": 0.9168, + "step": 2769 + }, + { + "epoch": 0.4522264397371536, + "grad_norm": 2.0556933879852295, + "learning_rate": 1.9984229999279713e-05, + "loss": 0.8783, + "step": 2770 + }, + { + "epoch": 0.452389698379658, + "grad_norm": 1.9374761581420898, + "learning_rate": 1.9984212179727768e-05, + "loss": 0.7759, + "step": 2771 + }, + { + "epoch": 0.45255295702216236, + "grad_norm": 1.919141411781311, + "learning_rate": 1.998419435012172e-05, + "loss": 0.7105, + "step": 2772 + }, + { + "epoch": 0.45271621566466674, + "grad_norm": 1.9514325857162476, + "learning_rate": 1.9984176510461592e-05, + "loss": 0.8254, + "step": 2773 + }, + { + "epoch": 0.4528794743071711, + "grad_norm": 2.317502021789551, + "learning_rate": 1.9984158660747396e-05, + "loss": 0.8213, + "step": 2774 + }, + { + "epoch": 0.4530427329496755, + "grad_norm": 2.1851048469543457, + "learning_rate": 1.9984140800979163e-05, + "loss": 0.7771, + "step": 2775 + }, + { + "epoch": 0.4532059915921799, + "grad_norm": 2.4070708751678467, + "learning_rate": 1.9984122931156896e-05, + "loss": 0.8929, + "step": 2776 + }, + { + "epoch": 0.4533692502346843, + "grad_norm": 2.4133450984954834, + "learning_rate": 1.9984105051280626e-05, + "loss": 1.0678, + "step": 2777 + }, + { + "epoch": 0.4535325088771887, + "grad_norm": 2.160731554031372, + "learning_rate": 1.998408716135036e-05, + "loss": 0.881, + "step": 2778 + }, + { + "epoch": 0.4536957675196931, + "grad_norm": 1.9060003757476807, + "learning_rate": 1.998406926136612e-05, + "loss": 0.7609, + "step": 2779 + }, + { + "epoch": 0.4538590261621975, + "grad_norm": 2.153287887573242, + "learning_rate": 1.998405135132793e-05, + "loss": 0.941, + "step": 2780 + }, + { + "epoch": 0.45402228480470186, + "grad_norm": 2.3656256198883057, + "learning_rate": 1.99840334312358e-05, + "loss": 0.942, + "step": 2781 + }, + { + "epoch": 0.45418554344720624, + "grad_norm": 2.4292593002319336, + "learning_rate": 1.998401550108975e-05, + "loss": 0.9825, + "step": 2782 + }, + { + "epoch": 0.4543488020897106, + "grad_norm": 2.3295626640319824, + "learning_rate": 1.9983997560889804e-05, + "loss": 0.8922, + "step": 2783 + }, + { + "epoch": 0.454512060732215, + "grad_norm": 2.081510543823242, + "learning_rate": 1.9983979610635972e-05, + "loss": 0.7443, + "step": 2784 + }, + { + "epoch": 0.4546753193747194, + "grad_norm": 2.332350015640259, + "learning_rate": 1.9983961650328272e-05, + "loss": 1.0261, + "step": 2785 + }, + { + "epoch": 0.45483857801722377, + "grad_norm": 2.1659679412841797, + "learning_rate": 1.9983943679966728e-05, + "loss": 0.8339, + "step": 2786 + }, + { + "epoch": 0.45500183665972815, + "grad_norm": 2.744924306869507, + "learning_rate": 1.9983925699551357e-05, + "loss": 0.8831, + "step": 2787 + }, + { + "epoch": 0.45516509530223254, + "grad_norm": 2.4122302532196045, + "learning_rate": 1.9983907709082172e-05, + "loss": 1.0166, + "step": 2788 + }, + { + "epoch": 0.455328353944737, + "grad_norm": 2.241115093231201, + "learning_rate": 1.9983889708559198e-05, + "loss": 0.9517, + "step": 2789 + }, + { + "epoch": 0.45549161258724136, + "grad_norm": 2.609023094177246, + "learning_rate": 1.9983871697982448e-05, + "loss": 1.0865, + "step": 2790 + }, + { + "epoch": 0.45565487122974574, + "grad_norm": 2.022104501724243, + "learning_rate": 1.9983853677351945e-05, + "loss": 0.8671, + "step": 2791 + }, + { + "epoch": 0.4558181298722501, + "grad_norm": 1.9546022415161133, + "learning_rate": 1.99838356466677e-05, + "loss": 0.7546, + "step": 2792 + }, + { + "epoch": 0.4559813885147545, + "grad_norm": 2.274498224258423, + "learning_rate": 1.9983817605929735e-05, + "loss": 0.9117, + "step": 2793 + }, + { + "epoch": 0.4561446471572589, + "grad_norm": 2.3548507690429688, + "learning_rate": 1.998379955513807e-05, + "loss": 1.0239, + "step": 2794 + }, + { + "epoch": 0.45630790579976327, + "grad_norm": 2.3394954204559326, + "learning_rate": 1.998378149429272e-05, + "loss": 0.9683, + "step": 2795 + }, + { + "epoch": 0.45647116444226765, + "grad_norm": 1.926032304763794, + "learning_rate": 1.9983763423393703e-05, + "loss": 0.8501, + "step": 2796 + }, + { + "epoch": 0.45663442308477203, + "grad_norm": 2.6548948287963867, + "learning_rate": 1.9983745342441044e-05, + "loss": 0.936, + "step": 2797 + }, + { + "epoch": 0.4567976817272764, + "grad_norm": 2.3010077476501465, + "learning_rate": 1.9983727251434755e-05, + "loss": 0.9463, + "step": 2798 + }, + { + "epoch": 0.4569609403697808, + "grad_norm": 1.773181676864624, + "learning_rate": 1.998370915037485e-05, + "loss": 0.641, + "step": 2799 + }, + { + "epoch": 0.45712419901228524, + "grad_norm": 2.1500132083892822, + "learning_rate": 1.9983691039261358e-05, + "loss": 0.9939, + "step": 2800 + }, + { + "epoch": 0.4572874576547896, + "grad_norm": 2.171962022781372, + "learning_rate": 1.9983672918094292e-05, + "loss": 0.9821, + "step": 2801 + }, + { + "epoch": 0.457450716297294, + "grad_norm": 1.9071234464645386, + "learning_rate": 1.9983654786873666e-05, + "loss": 0.7982, + "step": 2802 + }, + { + "epoch": 0.4576139749397984, + "grad_norm": 1.9087942838668823, + "learning_rate": 1.9983636645599508e-05, + "loss": 0.8352, + "step": 2803 + }, + { + "epoch": 0.45777723358230277, + "grad_norm": 2.2338805198669434, + "learning_rate": 1.9983618494271825e-05, + "loss": 0.9627, + "step": 2804 + }, + { + "epoch": 0.45794049222480715, + "grad_norm": 2.2184669971466064, + "learning_rate": 1.9983600332890643e-05, + "loss": 0.8054, + "step": 2805 + }, + { + "epoch": 0.45810375086731153, + "grad_norm": 2.211547613143921, + "learning_rate": 1.9983582161455976e-05, + "loss": 0.8377, + "step": 2806 + }, + { + "epoch": 0.4582670095098159, + "grad_norm": 2.0472335815429688, + "learning_rate": 1.9983563979967848e-05, + "loss": 0.846, + "step": 2807 + }, + { + "epoch": 0.4584302681523203, + "grad_norm": 2.140235424041748, + "learning_rate": 1.9983545788426273e-05, + "loss": 0.8343, + "step": 2808 + }, + { + "epoch": 0.4585935267948247, + "grad_norm": 2.291480779647827, + "learning_rate": 1.998352758683127e-05, + "loss": 1.032, + "step": 2809 + }, + { + "epoch": 0.45875678543732906, + "grad_norm": 2.3100991249084473, + "learning_rate": 1.9983509375182853e-05, + "loss": 0.9329, + "step": 2810 + }, + { + "epoch": 0.4589200440798335, + "grad_norm": 2.1981215476989746, + "learning_rate": 1.998349115348105e-05, + "loss": 0.8952, + "step": 2811 + }, + { + "epoch": 0.4590833027223379, + "grad_norm": 2.182628870010376, + "learning_rate": 1.9983472921725874e-05, + "loss": 0.8301, + "step": 2812 + }, + { + "epoch": 0.45924656136484227, + "grad_norm": 2.0193445682525635, + "learning_rate": 1.998345467991734e-05, + "loss": 0.7344, + "step": 2813 + }, + { + "epoch": 0.45940982000734665, + "grad_norm": 2.03368878364563, + "learning_rate": 1.998343642805547e-05, + "loss": 0.8465, + "step": 2814 + }, + { + "epoch": 0.45957307864985103, + "grad_norm": 2.191793203353882, + "learning_rate": 1.9983418166140286e-05, + "loss": 0.9405, + "step": 2815 + }, + { + "epoch": 0.4597363372923554, + "grad_norm": 2.1480743885040283, + "learning_rate": 1.99833998941718e-05, + "loss": 0.7638, + "step": 2816 + }, + { + "epoch": 0.4598995959348598, + "grad_norm": 2.185225248336792, + "learning_rate": 1.9983381612150034e-05, + "loss": 0.9954, + "step": 2817 + }, + { + "epoch": 0.4600628545773642, + "grad_norm": 2.2997236251831055, + "learning_rate": 1.9983363320075006e-05, + "loss": 0.9414, + "step": 2818 + }, + { + "epoch": 0.46022611321986856, + "grad_norm": 2.383753776550293, + "learning_rate": 1.998334501794673e-05, + "loss": 1.1423, + "step": 2819 + }, + { + "epoch": 0.46038937186237294, + "grad_norm": 2.103109121322632, + "learning_rate": 1.998332670576523e-05, + "loss": 0.8618, + "step": 2820 + }, + { + "epoch": 0.4605526305048773, + "grad_norm": 2.23368763923645, + "learning_rate": 1.9983308383530522e-05, + "loss": 0.9832, + "step": 2821 + }, + { + "epoch": 0.46071588914738176, + "grad_norm": 2.262098550796509, + "learning_rate": 1.998329005124263e-05, + "loss": 0.8908, + "step": 2822 + }, + { + "epoch": 0.46087914778988615, + "grad_norm": 2.0950767993927, + "learning_rate": 1.998327170890156e-05, + "loss": 0.8278, + "step": 2823 + }, + { + "epoch": 0.46104240643239053, + "grad_norm": 2.093890428543091, + "learning_rate": 1.9983253356507345e-05, + "loss": 1.0121, + "step": 2824 + }, + { + "epoch": 0.4612056650748949, + "grad_norm": 2.174423933029175, + "learning_rate": 1.998323499405999e-05, + "loss": 0.8794, + "step": 2825 + }, + { + "epoch": 0.4613689237173993, + "grad_norm": 2.1634421348571777, + "learning_rate": 1.9983216621559525e-05, + "loss": 0.9161, + "step": 2826 + }, + { + "epoch": 0.4615321823599037, + "grad_norm": 2.0016865730285645, + "learning_rate": 1.9983198239005962e-05, + "loss": 0.885, + "step": 2827 + }, + { + "epoch": 0.46169544100240806, + "grad_norm": 2.013849973678589, + "learning_rate": 1.998317984639932e-05, + "loss": 0.8185, + "step": 2828 + }, + { + "epoch": 0.46185869964491244, + "grad_norm": 2.138503313064575, + "learning_rate": 1.998316144373962e-05, + "loss": 0.9567, + "step": 2829 + }, + { + "epoch": 0.4620219582874168, + "grad_norm": 2.2302157878875732, + "learning_rate": 1.998314303102688e-05, + "loss": 0.9868, + "step": 2830 + }, + { + "epoch": 0.4621852169299212, + "grad_norm": 1.9154495000839233, + "learning_rate": 1.9983124608261116e-05, + "loss": 0.7744, + "step": 2831 + }, + { + "epoch": 0.4623484755724256, + "grad_norm": 2.2065558433532715, + "learning_rate": 1.9983106175442348e-05, + "loss": 0.8315, + "step": 2832 + }, + { + "epoch": 0.46251173421493, + "grad_norm": 1.9942960739135742, + "learning_rate": 1.9983087732570596e-05, + "loss": 0.7742, + "step": 2833 + }, + { + "epoch": 0.4626749928574344, + "grad_norm": 2.16050386428833, + "learning_rate": 1.9983069279645875e-05, + "loss": 0.8622, + "step": 2834 + }, + { + "epoch": 0.4628382514999388, + "grad_norm": 2.252201795578003, + "learning_rate": 1.9983050816668207e-05, + "loss": 0.8288, + "step": 2835 + }, + { + "epoch": 0.4630015101424432, + "grad_norm": 2.4610588550567627, + "learning_rate": 1.998303234363761e-05, + "loss": 0.923, + "step": 2836 + }, + { + "epoch": 0.46316476878494756, + "grad_norm": 2.255638360977173, + "learning_rate": 1.99830138605541e-05, + "loss": 0.8623, + "step": 2837 + }, + { + "epoch": 0.46332802742745194, + "grad_norm": 3.4806549549102783, + "learning_rate": 1.99829953674177e-05, + "loss": 0.8391, + "step": 2838 + }, + { + "epoch": 0.4634912860699563, + "grad_norm": 2.0836477279663086, + "learning_rate": 1.9982976864228427e-05, + "loss": 0.9397, + "step": 2839 + }, + { + "epoch": 0.4636545447124607, + "grad_norm": 1.8030729293823242, + "learning_rate": 1.9982958350986296e-05, + "loss": 0.6601, + "step": 2840 + }, + { + "epoch": 0.4638178033549651, + "grad_norm": 1.9555153846740723, + "learning_rate": 1.998293982769133e-05, + "loss": 0.847, + "step": 2841 + }, + { + "epoch": 0.46398106199746947, + "grad_norm": 2.0813868045806885, + "learning_rate": 1.9982921294343548e-05, + "loss": 0.7273, + "step": 2842 + }, + { + "epoch": 0.4641443206399739, + "grad_norm": 2.059178590774536, + "learning_rate": 1.998290275094297e-05, + "loss": 0.8192, + "step": 2843 + }, + { + "epoch": 0.4643075792824783, + "grad_norm": 2.340812921524048, + "learning_rate": 1.9982884197489602e-05, + "loss": 0.8316, + "step": 2844 + }, + { + "epoch": 0.46447083792498267, + "grad_norm": 2.144193410873413, + "learning_rate": 1.998286563398348e-05, + "loss": 0.8057, + "step": 2845 + }, + { + "epoch": 0.46463409656748705, + "grad_norm": 2.0668280124664307, + "learning_rate": 1.998284706042461e-05, + "loss": 0.7549, + "step": 2846 + }, + { + "epoch": 0.46479735520999144, + "grad_norm": 2.1740007400512695, + "learning_rate": 1.9982828476813018e-05, + "loss": 0.8372, + "step": 2847 + }, + { + "epoch": 0.4649606138524958, + "grad_norm": 2.311825752258301, + "learning_rate": 1.998280988314872e-05, + "loss": 0.803, + "step": 2848 + }, + { + "epoch": 0.4651238724950002, + "grad_norm": 2.0905911922454834, + "learning_rate": 1.9982791279431738e-05, + "loss": 0.6485, + "step": 2849 + }, + { + "epoch": 0.4652871311375046, + "grad_norm": 2.2241992950439453, + "learning_rate": 1.9982772665662083e-05, + "loss": 0.9435, + "step": 2850 + }, + { + "epoch": 0.46545038978000897, + "grad_norm": 2.195361375808716, + "learning_rate": 1.9982754041839784e-05, + "loss": 0.9347, + "step": 2851 + }, + { + "epoch": 0.46561364842251335, + "grad_norm": 2.078212261199951, + "learning_rate": 1.9982735407964847e-05, + "loss": 0.7921, + "step": 2852 + }, + { + "epoch": 0.46577690706501773, + "grad_norm": 2.2851967811584473, + "learning_rate": 1.9982716764037303e-05, + "loss": 0.8441, + "step": 2853 + }, + { + "epoch": 0.46594016570752217, + "grad_norm": 2.23728346824646, + "learning_rate": 1.9982698110057165e-05, + "loss": 0.7958, + "step": 2854 + }, + { + "epoch": 0.46610342435002655, + "grad_norm": 2.095329999923706, + "learning_rate": 1.9982679446024457e-05, + "loss": 0.9383, + "step": 2855 + }, + { + "epoch": 0.46626668299253093, + "grad_norm": 2.822284460067749, + "learning_rate": 1.9982660771939185e-05, + "loss": 0.8709, + "step": 2856 + }, + { + "epoch": 0.4664299416350353, + "grad_norm": 2.1636219024658203, + "learning_rate": 1.998264208780138e-05, + "loss": 0.7915, + "step": 2857 + }, + { + "epoch": 0.4665932002775397, + "grad_norm": 2.226421356201172, + "learning_rate": 1.998262339361106e-05, + "loss": 0.8553, + "step": 2858 + }, + { + "epoch": 0.4667564589200441, + "grad_norm": 2.3619461059570312, + "learning_rate": 1.998260468936824e-05, + "loss": 0.9113, + "step": 2859 + }, + { + "epoch": 0.46691971756254846, + "grad_norm": 2.351224660873413, + "learning_rate": 1.9982585975072938e-05, + "loss": 0.9972, + "step": 2860 + }, + { + "epoch": 0.46708297620505285, + "grad_norm": 2.365201473236084, + "learning_rate": 1.9982567250725175e-05, + "loss": 0.8192, + "step": 2861 + }, + { + "epoch": 0.46724623484755723, + "grad_norm": 2.134860038757324, + "learning_rate": 1.9982548516324967e-05, + "loss": 0.829, + "step": 2862 + }, + { + "epoch": 0.4674094934900616, + "grad_norm": 1.9725315570831299, + "learning_rate": 1.998252977187234e-05, + "loss": 0.8187, + "step": 2863 + }, + { + "epoch": 0.467572752132566, + "grad_norm": 2.3101861476898193, + "learning_rate": 1.9982511017367307e-05, + "loss": 0.8469, + "step": 2864 + }, + { + "epoch": 0.46773601077507043, + "grad_norm": 1.8851842880249023, + "learning_rate": 1.998249225280988e-05, + "loss": 0.8447, + "step": 2865 + }, + { + "epoch": 0.4678992694175748, + "grad_norm": 2.2839772701263428, + "learning_rate": 1.9982473478200094e-05, + "loss": 0.9947, + "step": 2866 + }, + { + "epoch": 0.4680625280600792, + "grad_norm": 2.428194999694824, + "learning_rate": 1.998245469353796e-05, + "loss": 1.0399, + "step": 2867 + }, + { + "epoch": 0.4682257867025836, + "grad_norm": 2.3076398372650146, + "learning_rate": 1.9982435898823495e-05, + "loss": 1.0321, + "step": 2868 + }, + { + "epoch": 0.46838904534508796, + "grad_norm": 2.163058042526245, + "learning_rate": 1.998241709405672e-05, + "loss": 0.7943, + "step": 2869 + }, + { + "epoch": 0.46855230398759234, + "grad_norm": 1.9033005237579346, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.7803, + "step": 2870 + }, + { + "epoch": 0.4687155626300967, + "grad_norm": 1.9590667486190796, + "learning_rate": 1.9982379454366314e-05, + "loss": 0.8188, + "step": 2871 + }, + { + "epoch": 0.4688788212726011, + "grad_norm": 1.980513334274292, + "learning_rate": 1.9982360619442724e-05, + "loss": 0.7395, + "step": 2872 + }, + { + "epoch": 0.4690420799151055, + "grad_norm": 2.6382150650024414, + "learning_rate": 1.9982341774466895e-05, + "loss": 0.9217, + "step": 2873 + }, + { + "epoch": 0.4692053385576099, + "grad_norm": 2.3668220043182373, + "learning_rate": 1.9982322919438855e-05, + "loss": 0.9604, + "step": 2874 + }, + { + "epoch": 0.46936859720011426, + "grad_norm": 1.9891583919525146, + "learning_rate": 1.9982304054358615e-05, + "loss": 0.8778, + "step": 2875 + }, + { + "epoch": 0.4695318558426187, + "grad_norm": 2.083937406539917, + "learning_rate": 1.9982285179226197e-05, + "loss": 0.8809, + "step": 2876 + }, + { + "epoch": 0.4696951144851231, + "grad_norm": 2.156355619430542, + "learning_rate": 1.9982266294041623e-05, + "loss": 0.8518, + "step": 2877 + }, + { + "epoch": 0.46985837312762746, + "grad_norm": 2.194596529006958, + "learning_rate": 1.998224739880491e-05, + "loss": 0.9476, + "step": 2878 + }, + { + "epoch": 0.47002163177013184, + "grad_norm": 2.057173490524292, + "learning_rate": 1.9982228493516076e-05, + "loss": 0.7898, + "step": 2879 + }, + { + "epoch": 0.4701848904126362, + "grad_norm": 2.5394163131713867, + "learning_rate": 1.9982209578175136e-05, + "loss": 0.9684, + "step": 2880 + }, + { + "epoch": 0.4703481490551406, + "grad_norm": 2.59202241897583, + "learning_rate": 1.9982190652782122e-05, + "loss": 0.9448, + "step": 2881 + }, + { + "epoch": 0.470511407697645, + "grad_norm": 2.0157599449157715, + "learning_rate": 1.998217171733704e-05, + "loss": 0.9343, + "step": 2882 + }, + { + "epoch": 0.4706746663401494, + "grad_norm": 2.2865381240844727, + "learning_rate": 1.998215277183991e-05, + "loss": 0.8782, + "step": 2883 + }, + { + "epoch": 0.47083792498265375, + "grad_norm": 2.233272075653076, + "learning_rate": 1.9982133816290757e-05, + "loss": 0.9271, + "step": 2884 + }, + { + "epoch": 0.47100118362515814, + "grad_norm": 2.287358045578003, + "learning_rate": 1.9982114850689603e-05, + "loss": 0.9606, + "step": 2885 + }, + { + "epoch": 0.4711644422676625, + "grad_norm": 2.0011157989501953, + "learning_rate": 1.9982095875036455e-05, + "loss": 0.8636, + "step": 2886 + }, + { + "epoch": 0.47132770091016696, + "grad_norm": 2.073028564453125, + "learning_rate": 1.9982076889331344e-05, + "loss": 0.797, + "step": 2887 + }, + { + "epoch": 0.47149095955267134, + "grad_norm": 2.008582353591919, + "learning_rate": 1.9982057893574286e-05, + "loss": 0.6691, + "step": 2888 + }, + { + "epoch": 0.4716542181951757, + "grad_norm": 1.8523125648498535, + "learning_rate": 1.998203888776529e-05, + "loss": 0.8356, + "step": 2889 + }, + { + "epoch": 0.4718174768376801, + "grad_norm": 2.081228494644165, + "learning_rate": 1.998201987190439e-05, + "loss": 0.8704, + "step": 2890 + }, + { + "epoch": 0.4719807354801845, + "grad_norm": 1.900962471961975, + "learning_rate": 1.9982000845991598e-05, + "loss": 0.8424, + "step": 2891 + }, + { + "epoch": 0.47214399412268887, + "grad_norm": 2.2780520915985107, + "learning_rate": 1.9981981810026932e-05, + "loss": 0.9071, + "step": 2892 + }, + { + "epoch": 0.47230725276519325, + "grad_norm": 2.255894899368286, + "learning_rate": 1.9981962764010415e-05, + "loss": 0.9602, + "step": 2893 + }, + { + "epoch": 0.47247051140769764, + "grad_norm": 2.24461030960083, + "learning_rate": 1.9981943707942064e-05, + "loss": 0.8588, + "step": 2894 + }, + { + "epoch": 0.472633770050202, + "grad_norm": 2.2727653980255127, + "learning_rate": 1.9981924641821897e-05, + "loss": 0.9272, + "step": 2895 + }, + { + "epoch": 0.4727970286927064, + "grad_norm": 2.308840036392212, + "learning_rate": 1.9981905565649937e-05, + "loss": 0.6948, + "step": 2896 + }, + { + "epoch": 0.4729602873352108, + "grad_norm": 2.0174977779388428, + "learning_rate": 1.9981886479426195e-05, + "loss": 0.7442, + "step": 2897 + }, + { + "epoch": 0.4731235459777152, + "grad_norm": 1.8840582370758057, + "learning_rate": 1.99818673831507e-05, + "loss": 0.6555, + "step": 2898 + }, + { + "epoch": 0.4732868046202196, + "grad_norm": 1.9221245050430298, + "learning_rate": 1.9981848276823466e-05, + "loss": 0.6932, + "step": 2899 + }, + { + "epoch": 0.473450063262724, + "grad_norm": 2.556025743484497, + "learning_rate": 1.9981829160444515e-05, + "loss": 0.9103, + "step": 2900 + }, + { + "epoch": 0.47361332190522837, + "grad_norm": 2.0670883655548096, + "learning_rate": 1.9981810034013865e-05, + "loss": 0.8191, + "step": 2901 + }, + { + "epoch": 0.47377658054773275, + "grad_norm": 2.203158140182495, + "learning_rate": 1.9981790897531535e-05, + "loss": 0.8158, + "step": 2902 + }, + { + "epoch": 0.47393983919023713, + "grad_norm": 2.3072898387908936, + "learning_rate": 1.998177175099754e-05, + "loss": 0.7948, + "step": 2903 + }, + { + "epoch": 0.4741030978327415, + "grad_norm": 2.3490943908691406, + "learning_rate": 1.9981752594411908e-05, + "loss": 0.9464, + "step": 2904 + }, + { + "epoch": 0.4742663564752459, + "grad_norm": 2.2218451499938965, + "learning_rate": 1.9981733427774653e-05, + "loss": 0.9224, + "step": 2905 + }, + { + "epoch": 0.4744296151177503, + "grad_norm": 2.1985065937042236, + "learning_rate": 1.9981714251085794e-05, + "loss": 0.8522, + "step": 2906 + }, + { + "epoch": 0.47459287376025466, + "grad_norm": 2.1712403297424316, + "learning_rate": 1.998169506434535e-05, + "loss": 0.9383, + "step": 2907 + }, + { + "epoch": 0.47475613240275905, + "grad_norm": 2.305015802383423, + "learning_rate": 1.9981675867553344e-05, + "loss": 1.0712, + "step": 2908 + }, + { + "epoch": 0.4749193910452635, + "grad_norm": 1.900051236152649, + "learning_rate": 1.9981656660709794e-05, + "loss": 0.7351, + "step": 2909 + }, + { + "epoch": 0.47508264968776787, + "grad_norm": 2.1865296363830566, + "learning_rate": 1.9981637443814717e-05, + "loss": 0.9549, + "step": 2910 + }, + { + "epoch": 0.47524590833027225, + "grad_norm": 2.153003215789795, + "learning_rate": 1.9981618216868134e-05, + "loss": 0.9081, + "step": 2911 + }, + { + "epoch": 0.47540916697277663, + "grad_norm": 2.1814370155334473, + "learning_rate": 1.998159897987006e-05, + "loss": 0.7573, + "step": 2912 + }, + { + "epoch": 0.475572425615281, + "grad_norm": 2.627352237701416, + "learning_rate": 1.9981579732820523e-05, + "loss": 0.9614, + "step": 2913 + }, + { + "epoch": 0.4757356842577854, + "grad_norm": 2.1022489070892334, + "learning_rate": 1.998156047571954e-05, + "loss": 0.971, + "step": 2914 + }, + { + "epoch": 0.4758989429002898, + "grad_norm": 2.0379631519317627, + "learning_rate": 1.9981541208567122e-05, + "loss": 0.8237, + "step": 2915 + }, + { + "epoch": 0.47606220154279416, + "grad_norm": 2.286691188812256, + "learning_rate": 1.9981521931363303e-05, + "loss": 0.938, + "step": 2916 + }, + { + "epoch": 0.47622546018529854, + "grad_norm": 2.454576253890991, + "learning_rate": 1.998150264410809e-05, + "loss": 0.9036, + "step": 2917 + }, + { + "epoch": 0.4763887188278029, + "grad_norm": 1.854154109954834, + "learning_rate": 1.99814833468015e-05, + "loss": 0.8774, + "step": 2918 + }, + { + "epoch": 0.4765519774703073, + "grad_norm": 2.294463872909546, + "learning_rate": 1.9981464039443566e-05, + "loss": 0.9566, + "step": 2919 + }, + { + "epoch": 0.47671523611281175, + "grad_norm": 1.9930500984191895, + "learning_rate": 1.99814447220343e-05, + "loss": 0.8527, + "step": 2920 + }, + { + "epoch": 0.47687849475531613, + "grad_norm": 2.007268190383911, + "learning_rate": 1.998142539457372e-05, + "loss": 0.7803, + "step": 2921 + }, + { + "epoch": 0.4770417533978205, + "grad_norm": 2.0584607124328613, + "learning_rate": 1.9981406057061846e-05, + "loss": 0.7947, + "step": 2922 + }, + { + "epoch": 0.4772050120403249, + "grad_norm": 2.1352384090423584, + "learning_rate": 1.9981386709498703e-05, + "loss": 0.7785, + "step": 2923 + }, + { + "epoch": 0.4773682706828293, + "grad_norm": 2.671921730041504, + "learning_rate": 1.9981367351884305e-05, + "loss": 1.0969, + "step": 2924 + }, + { + "epoch": 0.47753152932533366, + "grad_norm": 2.308342933654785, + "learning_rate": 1.998134798421867e-05, + "loss": 0.8548, + "step": 2925 + }, + { + "epoch": 0.47769478796783804, + "grad_norm": 2.1172280311584473, + "learning_rate": 1.9981328606501826e-05, + "loss": 0.8842, + "step": 2926 + }, + { + "epoch": 0.4778580466103424, + "grad_norm": 2.494612455368042, + "learning_rate": 1.998130921873378e-05, + "loss": 1.7319, + "step": 2927 + }, + { + "epoch": 0.4780213052528468, + "grad_norm": 2.1061408519744873, + "learning_rate": 1.9981289820914562e-05, + "loss": 0.9551, + "step": 2928 + }, + { + "epoch": 0.4781845638953512, + "grad_norm": 2.1102888584136963, + "learning_rate": 1.9981270413044184e-05, + "loss": 0.9396, + "step": 2929 + }, + { + "epoch": 0.47834782253785557, + "grad_norm": 2.4565937519073486, + "learning_rate": 1.9981250995122673e-05, + "loss": 1.0906, + "step": 2930 + }, + { + "epoch": 0.47851108118036, + "grad_norm": 2.8187267780303955, + "learning_rate": 1.9981231567150042e-05, + "loss": 1.0055, + "step": 2931 + }, + { + "epoch": 0.4786743398228644, + "grad_norm": 2.0861103534698486, + "learning_rate": 1.998121212912632e-05, + "loss": 0.8419, + "step": 2932 + }, + { + "epoch": 0.4788375984653688, + "grad_norm": 2.2977378368377686, + "learning_rate": 1.9981192681051514e-05, + "loss": 0.9361, + "step": 2933 + }, + { + "epoch": 0.47900085710787316, + "grad_norm": 1.8260252475738525, + "learning_rate": 1.998117322292565e-05, + "loss": 0.8027, + "step": 2934 + }, + { + "epoch": 0.47916411575037754, + "grad_norm": 1.976986050605774, + "learning_rate": 1.9981153754748747e-05, + "loss": 0.8388, + "step": 2935 + }, + { + "epoch": 0.4793273743928819, + "grad_norm": 2.6454837322235107, + "learning_rate": 1.9981134276520828e-05, + "loss": 0.8392, + "step": 2936 + }, + { + "epoch": 0.4794906330353863, + "grad_norm": 2.001678705215454, + "learning_rate": 1.9981114788241907e-05, + "loss": 0.6858, + "step": 2937 + }, + { + "epoch": 0.4796538916778907, + "grad_norm": 2.0924317836761475, + "learning_rate": 1.9981095289912006e-05, + "loss": 0.7612, + "step": 2938 + }, + { + "epoch": 0.47981715032039507, + "grad_norm": 2.139970302581787, + "learning_rate": 1.9981075781531147e-05, + "loss": 0.7729, + "step": 2939 + }, + { + "epoch": 0.47998040896289945, + "grad_norm": 2.0188868045806885, + "learning_rate": 1.9981056263099347e-05, + "loss": 0.9477, + "step": 2940 + }, + { + "epoch": 0.48014366760540383, + "grad_norm": 1.75154709815979, + "learning_rate": 1.998103673461662e-05, + "loss": 0.7585, + "step": 2941 + }, + { + "epoch": 0.4803069262479083, + "grad_norm": 2.139803409576416, + "learning_rate": 1.9981017196082998e-05, + "loss": 0.8247, + "step": 2942 + }, + { + "epoch": 0.48047018489041265, + "grad_norm": 2.2295327186584473, + "learning_rate": 1.9980997647498493e-05, + "loss": 0.9058, + "step": 2943 + }, + { + "epoch": 0.48063344353291704, + "grad_norm": 2.1121714115142822, + "learning_rate": 1.9980978088863125e-05, + "loss": 0.9122, + "step": 2944 + }, + { + "epoch": 0.4807967021754214, + "grad_norm": 2.3188819885253906, + "learning_rate": 1.9980958520176915e-05, + "loss": 0.8868, + "step": 2945 + }, + { + "epoch": 0.4809599608179258, + "grad_norm": 2.8637523651123047, + "learning_rate": 1.9980938941439883e-05, + "loss": 1.1229, + "step": 2946 + }, + { + "epoch": 0.4811232194604302, + "grad_norm": 2.2811903953552246, + "learning_rate": 1.998091935265205e-05, + "loss": 0.7047, + "step": 2947 + }, + { + "epoch": 0.48128647810293457, + "grad_norm": 2.3774428367614746, + "learning_rate": 1.9980899753813432e-05, + "loss": 0.8565, + "step": 2948 + }, + { + "epoch": 0.48144973674543895, + "grad_norm": 2.6348989009857178, + "learning_rate": 1.998088014492405e-05, + "loss": 0.9203, + "step": 2949 + }, + { + "epoch": 0.48161299538794333, + "grad_norm": 2.54010009765625, + "learning_rate": 1.9980860525983924e-05, + "loss": 0.894, + "step": 2950 + }, + { + "epoch": 0.4817762540304477, + "grad_norm": 2.545994997024536, + "learning_rate": 1.9980840896993074e-05, + "loss": 0.8777, + "step": 2951 + }, + { + "epoch": 0.4819395126729521, + "grad_norm": 2.4247658252716064, + "learning_rate": 1.9980821257951522e-05, + "loss": 0.9417, + "step": 2952 + }, + { + "epoch": 0.48210277131545654, + "grad_norm": 1.8573678731918335, + "learning_rate": 1.9980801608859283e-05, + "loss": 0.7365, + "step": 2953 + }, + { + "epoch": 0.4822660299579609, + "grad_norm": 2.3054144382476807, + "learning_rate": 1.998078194971638e-05, + "loss": 0.9102, + "step": 2954 + }, + { + "epoch": 0.4824292886004653, + "grad_norm": 2.2980403900146484, + "learning_rate": 1.9980762280522834e-05, + "loss": 0.9507, + "step": 2955 + }, + { + "epoch": 0.4825925472429697, + "grad_norm": 2.1343464851379395, + "learning_rate": 1.9980742601278662e-05, + "loss": 0.948, + "step": 2956 + }, + { + "epoch": 0.48275580588547407, + "grad_norm": 2.2745609283447266, + "learning_rate": 1.9980722911983884e-05, + "loss": 0.8342, + "step": 2957 + }, + { + "epoch": 0.48291906452797845, + "grad_norm": 2.047905206680298, + "learning_rate": 1.9980703212638522e-05, + "loss": 0.9463, + "step": 2958 + }, + { + "epoch": 0.48308232317048283, + "grad_norm": 2.983752489089966, + "learning_rate": 1.9980683503242596e-05, + "loss": 0.9554, + "step": 2959 + }, + { + "epoch": 0.4832455818129872, + "grad_norm": 2.0807745456695557, + "learning_rate": 1.998066378379612e-05, + "loss": 0.8924, + "step": 2960 + }, + { + "epoch": 0.4834088404554916, + "grad_norm": 2.2232308387756348, + "learning_rate": 1.9980644054299122e-05, + "loss": 0.9268, + "step": 2961 + }, + { + "epoch": 0.483572099097996, + "grad_norm": 2.1287057399749756, + "learning_rate": 1.9980624314751614e-05, + "loss": 1.0554, + "step": 2962 + }, + { + "epoch": 0.48373535774050036, + "grad_norm": 2.1028623580932617, + "learning_rate": 1.9980604565153624e-05, + "loss": 0.9326, + "step": 2963 + }, + { + "epoch": 0.4838986163830048, + "grad_norm": 2.0352272987365723, + "learning_rate": 1.9980584805505167e-05, + "loss": 0.8342, + "step": 2964 + }, + { + "epoch": 0.4840618750255092, + "grad_norm": 2.1748850345611572, + "learning_rate": 1.9980565035806262e-05, + "loss": 0.8828, + "step": 2965 + }, + { + "epoch": 0.48422513366801356, + "grad_norm": 2.2086987495422363, + "learning_rate": 1.998054525605693e-05, + "loss": 0.7939, + "step": 2966 + }, + { + "epoch": 0.48438839231051795, + "grad_norm": 2.4605724811553955, + "learning_rate": 1.9980525466257195e-05, + "loss": 1.0773, + "step": 2967 + }, + { + "epoch": 0.48455165095302233, + "grad_norm": 2.135646104812622, + "learning_rate": 1.9980505666407074e-05, + "loss": 0.8832, + "step": 2968 + }, + { + "epoch": 0.4847149095955267, + "grad_norm": 2.2326395511627197, + "learning_rate": 1.9980485856506582e-05, + "loss": 0.9103, + "step": 2969 + }, + { + "epoch": 0.4848781682380311, + "grad_norm": 2.2525382041931152, + "learning_rate": 1.9980466036555746e-05, + "loss": 0.8218, + "step": 2970 + }, + { + "epoch": 0.4850414268805355, + "grad_norm": 2.2327094078063965, + "learning_rate": 1.9980446206554583e-05, + "loss": 0.9171, + "step": 2971 + }, + { + "epoch": 0.48520468552303986, + "grad_norm": 2.320629358291626, + "learning_rate": 1.9980426366503117e-05, + "loss": 0.9102, + "step": 2972 + }, + { + "epoch": 0.48536794416554424, + "grad_norm": 2.501129388809204, + "learning_rate": 1.9980406516401357e-05, + "loss": 0.8034, + "step": 2973 + }, + { + "epoch": 0.4855312028080487, + "grad_norm": 2.139223575592041, + "learning_rate": 1.9980386656249334e-05, + "loss": 0.9316, + "step": 2974 + }, + { + "epoch": 0.48569446145055306, + "grad_norm": 2.0296218395233154, + "learning_rate": 1.9980366786047066e-05, + "loss": 0.8225, + "step": 2975 + }, + { + "epoch": 0.48585772009305744, + "grad_norm": 2.0393569469451904, + "learning_rate": 1.998034690579457e-05, + "loss": 0.901, + "step": 2976 + }, + { + "epoch": 0.4860209787355618, + "grad_norm": 2.026050090789795, + "learning_rate": 1.9980327015491866e-05, + "loss": 0.84, + "step": 2977 + }, + { + "epoch": 0.4861842373780662, + "grad_norm": 2.0071427822113037, + "learning_rate": 1.9980307115138976e-05, + "loss": 0.8944, + "step": 2978 + }, + { + "epoch": 0.4863474960205706, + "grad_norm": 2.3075180053710938, + "learning_rate": 1.998028720473592e-05, + "loss": 0.9718, + "step": 2979 + }, + { + "epoch": 0.486510754663075, + "grad_norm": 2.4666945934295654, + "learning_rate": 1.9980267284282718e-05, + "loss": 1.4253, + "step": 2980 + }, + { + "epoch": 0.48667401330557936, + "grad_norm": 1.9533698558807373, + "learning_rate": 1.9980247353779388e-05, + "loss": 0.9028, + "step": 2981 + }, + { + "epoch": 0.48683727194808374, + "grad_norm": 1.9554541110992432, + "learning_rate": 1.998022741322595e-05, + "loss": 0.8571, + "step": 2982 + }, + { + "epoch": 0.4870005305905881, + "grad_norm": 2.191021203994751, + "learning_rate": 1.998020746262243e-05, + "loss": 0.7544, + "step": 2983 + }, + { + "epoch": 0.4871637892330925, + "grad_norm": 2.1417734622955322, + "learning_rate": 1.9980187501968838e-05, + "loss": 0.9186, + "step": 2984 + }, + { + "epoch": 0.48732704787559694, + "grad_norm": 2.29416561126709, + "learning_rate": 1.9980167531265206e-05, + "loss": 1.0595, + "step": 2985 + }, + { + "epoch": 0.4874903065181013, + "grad_norm": 2.046635627746582, + "learning_rate": 1.9980147550511545e-05, + "loss": 0.9007, + "step": 2986 + }, + { + "epoch": 0.4876535651606057, + "grad_norm": 1.726946234703064, + "learning_rate": 1.998012755970788e-05, + "loss": 0.716, + "step": 2987 + }, + { + "epoch": 0.4878168238031101, + "grad_norm": 2.0166027545928955, + "learning_rate": 1.9980107558854227e-05, + "loss": 0.868, + "step": 2988 + }, + { + "epoch": 0.48798008244561447, + "grad_norm": 2.3249034881591797, + "learning_rate": 1.9980087547950607e-05, + "loss": 1.0554, + "step": 2989 + }, + { + "epoch": 0.48814334108811885, + "grad_norm": 2.352238655090332, + "learning_rate": 1.9980067526997044e-05, + "loss": 0.8417, + "step": 2990 + }, + { + "epoch": 0.48830659973062324, + "grad_norm": 2.213148832321167, + "learning_rate": 1.9980047495993556e-05, + "loss": 1.0963, + "step": 2991 + }, + { + "epoch": 0.4884698583731276, + "grad_norm": 2.137869119644165, + "learning_rate": 1.998002745494016e-05, + "loss": 0.8551, + "step": 2992 + }, + { + "epoch": 0.488633117015632, + "grad_norm": 2.222379207611084, + "learning_rate": 1.9980007403836884e-05, + "loss": 1.0197, + "step": 2993 + }, + { + "epoch": 0.4887963756581364, + "grad_norm": 1.9355217218399048, + "learning_rate": 1.9979987342683736e-05, + "loss": 0.6892, + "step": 2994 + }, + { + "epoch": 0.48895963430064077, + "grad_norm": 2.2415010929107666, + "learning_rate": 1.997996727148075e-05, + "loss": 0.8633, + "step": 2995 + }, + { + "epoch": 0.4891228929431452, + "grad_norm": 1.8818628787994385, + "learning_rate": 1.9979947190227937e-05, + "loss": 0.7942, + "step": 2996 + }, + { + "epoch": 0.4892861515856496, + "grad_norm": 2.001256227493286, + "learning_rate": 1.997992709892532e-05, + "loss": 0.8923, + "step": 2997 + }, + { + "epoch": 0.48944941022815397, + "grad_norm": 2.2003233432769775, + "learning_rate": 1.997990699757292e-05, + "loss": 0.9447, + "step": 2998 + }, + { + "epoch": 0.48961266887065835, + "grad_norm": 2.0368096828460693, + "learning_rate": 1.9979886886170755e-05, + "loss": 0.8759, + "step": 2999 + }, + { + "epoch": 0.48977592751316273, + "grad_norm": 2.2755496501922607, + "learning_rate": 1.9979866764718846e-05, + "loss": 0.9196, + "step": 3000 + }, + { + "epoch": 0.4899391861556671, + "grad_norm": 1.7515732049942017, + "learning_rate": 1.9979846633217214e-05, + "loss": 0.6944, + "step": 3001 + }, + { + "epoch": 0.4901024447981715, + "grad_norm": 2.0700576305389404, + "learning_rate": 1.997982649166588e-05, + "loss": 0.9317, + "step": 3002 + }, + { + "epoch": 0.4902657034406759, + "grad_norm": 3.290846347808838, + "learning_rate": 1.997980634006486e-05, + "loss": 0.8254, + "step": 3003 + }, + { + "epoch": 0.49042896208318026, + "grad_norm": 1.9789406061172485, + "learning_rate": 1.9979786178414185e-05, + "loss": 0.8504, + "step": 3004 + }, + { + "epoch": 0.49059222072568465, + "grad_norm": 2.6427135467529297, + "learning_rate": 1.9979766006713863e-05, + "loss": 0.9686, + "step": 3005 + }, + { + "epoch": 0.49075547936818903, + "grad_norm": 2.3769781589508057, + "learning_rate": 1.9979745824963918e-05, + "loss": 1.0175, + "step": 3006 + }, + { + "epoch": 0.49091873801069347, + "grad_norm": 2.253631114959717, + "learning_rate": 1.9979725633164375e-05, + "loss": 0.9372, + "step": 3007 + }, + { + "epoch": 0.49108199665319785, + "grad_norm": 1.912671685218811, + "learning_rate": 1.997970543131525e-05, + "loss": 0.7671, + "step": 3008 + }, + { + "epoch": 0.49124525529570223, + "grad_norm": 1.8150739669799805, + "learning_rate": 1.9979685219416563e-05, + "loss": 0.727, + "step": 3009 + }, + { + "epoch": 0.4914085139382066, + "grad_norm": 1.9659093618392944, + "learning_rate": 1.997966499746834e-05, + "loss": 0.7915, + "step": 3010 + }, + { + "epoch": 0.491571772580711, + "grad_norm": 1.967844009399414, + "learning_rate": 1.997964476547059e-05, + "loss": 0.9199, + "step": 3011 + }, + { + "epoch": 0.4917350312232154, + "grad_norm": 2.3013789653778076, + "learning_rate": 1.9979624523423342e-05, + "loss": 0.6664, + "step": 3012 + }, + { + "epoch": 0.49189828986571976, + "grad_norm": 2.030122756958008, + "learning_rate": 1.9979604271326617e-05, + "loss": 0.6904, + "step": 3013 + }, + { + "epoch": 0.49206154850822414, + "grad_norm": 2.4916930198669434, + "learning_rate": 1.9979584009180435e-05, + "loss": 0.8858, + "step": 3014 + }, + { + "epoch": 0.4922248071507285, + "grad_norm": 2.3177926540374756, + "learning_rate": 1.997956373698481e-05, + "loss": 0.8444, + "step": 3015 + }, + { + "epoch": 0.4923880657932329, + "grad_norm": 2.183171272277832, + "learning_rate": 1.997954345473977e-05, + "loss": 0.7691, + "step": 3016 + }, + { + "epoch": 0.4925513244357373, + "grad_norm": 2.253082275390625, + "learning_rate": 1.997952316244533e-05, + "loss": 0.9144, + "step": 3017 + }, + { + "epoch": 0.49271458307824173, + "grad_norm": 2.270782232284546, + "learning_rate": 1.9979502860101517e-05, + "loss": 0.6775, + "step": 3018 + }, + { + "epoch": 0.4928778417207461, + "grad_norm": 2.40990948677063, + "learning_rate": 1.9979482547708344e-05, + "loss": 0.8789, + "step": 3019 + }, + { + "epoch": 0.4930411003632505, + "grad_norm": 2.1739113330841064, + "learning_rate": 1.9979462225265834e-05, + "loss": 0.9664, + "step": 3020 + }, + { + "epoch": 0.4932043590057549, + "grad_norm": 2.346156120300293, + "learning_rate": 1.997944189277401e-05, + "loss": 0.8365, + "step": 3021 + }, + { + "epoch": 0.49336761764825926, + "grad_norm": 2.339733600616455, + "learning_rate": 1.9979421550232892e-05, + "loss": 0.9033, + "step": 3022 + }, + { + "epoch": 0.49353087629076364, + "grad_norm": 1.6829936504364014, + "learning_rate": 1.9979401197642497e-05, + "loss": 0.6937, + "step": 3023 + }, + { + "epoch": 0.493694134933268, + "grad_norm": 2.2345287799835205, + "learning_rate": 1.9979380835002846e-05, + "loss": 0.8525, + "step": 3024 + }, + { + "epoch": 0.4938573935757724, + "grad_norm": 2.111694097518921, + "learning_rate": 1.9979360462313965e-05, + "loss": 0.9757, + "step": 3025 + }, + { + "epoch": 0.4940206522182768, + "grad_norm": 2.213365077972412, + "learning_rate": 1.9979340079575865e-05, + "loss": 0.9656, + "step": 3026 + }, + { + "epoch": 0.4941839108607812, + "grad_norm": 2.278170108795166, + "learning_rate": 1.997931968678858e-05, + "loss": 0.8585, + "step": 3027 + }, + { + "epoch": 0.49434716950328556, + "grad_norm": 2.8380050659179688, + "learning_rate": 1.9979299283952116e-05, + "loss": 0.8634, + "step": 3028 + }, + { + "epoch": 0.49451042814579, + "grad_norm": 2.406644582748413, + "learning_rate": 1.9979278871066504e-05, + "loss": 0.7971, + "step": 3029 + }, + { + "epoch": 0.4946736867882944, + "grad_norm": 2.6808650493621826, + "learning_rate": 1.997925844813176e-05, + "loss": 0.9972, + "step": 3030 + }, + { + "epoch": 0.49483694543079876, + "grad_norm": 1.8772902488708496, + "learning_rate": 1.9979238015147904e-05, + "loss": 0.8138, + "step": 3031 + }, + { + "epoch": 0.49500020407330314, + "grad_norm": 1.8968791961669922, + "learning_rate": 1.997921757211496e-05, + "loss": 0.846, + "step": 3032 + }, + { + "epoch": 0.4951634627158075, + "grad_norm": 2.2460570335388184, + "learning_rate": 1.9979197119032946e-05, + "loss": 0.9541, + "step": 3033 + }, + { + "epoch": 0.4953267213583119, + "grad_norm": 2.087461233139038, + "learning_rate": 1.997917665590188e-05, + "loss": 0.8338, + "step": 3034 + }, + { + "epoch": 0.4954899800008163, + "grad_norm": 2.6389570236206055, + "learning_rate": 1.997915618272179e-05, + "loss": 0.9143, + "step": 3035 + }, + { + "epoch": 0.49565323864332067, + "grad_norm": 2.1088502407073975, + "learning_rate": 1.9979135699492692e-05, + "loss": 0.8716, + "step": 3036 + }, + { + "epoch": 0.49581649728582505, + "grad_norm": 2.5845401287078857, + "learning_rate": 1.9979115206214607e-05, + "loss": 0.8624, + "step": 3037 + }, + { + "epoch": 0.49597975592832944, + "grad_norm": 2.876234292984009, + "learning_rate": 1.9979094702887554e-05, + "loss": 0.9556, + "step": 3038 + }, + { + "epoch": 0.4961430145708338, + "grad_norm": 2.5351595878601074, + "learning_rate": 1.9979074189511556e-05, + "loss": 0.9999, + "step": 3039 + }, + { + "epoch": 0.49630627321333826, + "grad_norm": 2.285290002822876, + "learning_rate": 1.9979053666086633e-05, + "loss": 0.8573, + "step": 3040 + }, + { + "epoch": 0.49646953185584264, + "grad_norm": 3.3460261821746826, + "learning_rate": 1.9979033132612806e-05, + "loss": 0.9797, + "step": 3041 + }, + { + "epoch": 0.496632790498347, + "grad_norm": 1.9459445476531982, + "learning_rate": 1.9979012589090092e-05, + "loss": 0.8252, + "step": 3042 + }, + { + "epoch": 0.4967960491408514, + "grad_norm": 2.526336669921875, + "learning_rate": 1.997899203551852e-05, + "loss": 1.1115, + "step": 3043 + }, + { + "epoch": 0.4969593077833558, + "grad_norm": 2.222221851348877, + "learning_rate": 1.9978971471898105e-05, + "loss": 0.8229, + "step": 3044 + }, + { + "epoch": 0.49712256642586017, + "grad_norm": 2.2877819538116455, + "learning_rate": 1.9978950898228865e-05, + "loss": 1.0483, + "step": 3045 + }, + { + "epoch": 0.49728582506836455, + "grad_norm": 2.0507566928863525, + "learning_rate": 1.9978930314510826e-05, + "loss": 0.957, + "step": 3046 + }, + { + "epoch": 0.49744908371086893, + "grad_norm": 1.8991285562515259, + "learning_rate": 1.9978909720744005e-05, + "loss": 0.8751, + "step": 3047 + }, + { + "epoch": 0.4976123423533733, + "grad_norm": 2.0472488403320312, + "learning_rate": 1.997888911692843e-05, + "loss": 0.8776, + "step": 3048 + }, + { + "epoch": 0.4977756009958777, + "grad_norm": 2.287355422973633, + "learning_rate": 1.997886850306411e-05, + "loss": 0.9827, + "step": 3049 + }, + { + "epoch": 0.4979388596383821, + "grad_norm": 2.0967390537261963, + "learning_rate": 1.9978847879151076e-05, + "loss": 0.9019, + "step": 3050 + }, + { + "epoch": 0.4981021182808865, + "grad_norm": 2.326050043106079, + "learning_rate": 1.997882724518934e-05, + "loss": 0.9828, + "step": 3051 + }, + { + "epoch": 0.4982653769233909, + "grad_norm": 1.8875048160552979, + "learning_rate": 1.9978806601178933e-05, + "loss": 0.76, + "step": 3052 + }, + { + "epoch": 0.4984286355658953, + "grad_norm": 2.5950992107391357, + "learning_rate": 1.9978785947119866e-05, + "loss": 0.8534, + "step": 3053 + }, + { + "epoch": 0.49859189420839967, + "grad_norm": 1.9403152465820312, + "learning_rate": 1.997876528301217e-05, + "loss": 0.8539, + "step": 3054 + }, + { + "epoch": 0.49875515285090405, + "grad_norm": 2.7445127964019775, + "learning_rate": 1.9978744608855857e-05, + "loss": 0.8984, + "step": 3055 + }, + { + "epoch": 0.49891841149340843, + "grad_norm": 1.7534751892089844, + "learning_rate": 1.997872392465095e-05, + "loss": 0.7085, + "step": 3056 + }, + { + "epoch": 0.4990816701359128, + "grad_norm": 2.113125801086426, + "learning_rate": 1.997870323039747e-05, + "loss": 0.8371, + "step": 3057 + }, + { + "epoch": 0.4992449287784172, + "grad_norm": 2.7556309700012207, + "learning_rate": 1.997868252609544e-05, + "loss": 0.8921, + "step": 3058 + }, + { + "epoch": 0.4994081874209216, + "grad_norm": 2.338942289352417, + "learning_rate": 1.9978661811744876e-05, + "loss": 0.8578, + "step": 3059 + }, + { + "epoch": 0.49957144606342596, + "grad_norm": 2.310269832611084, + "learning_rate": 1.9978641087345808e-05, + "loss": 0.8858, + "step": 3060 + }, + { + "epoch": 0.49973470470593034, + "grad_norm": 2.177549362182617, + "learning_rate": 1.9978620352898246e-05, + "loss": 0.746, + "step": 3061 + }, + { + "epoch": 0.4998979633484348, + "grad_norm": 2.3217713832855225, + "learning_rate": 1.9978599608402217e-05, + "loss": 0.9706, + "step": 3062 + }, + { + "epoch": 0.5000612219909392, + "grad_norm": 2.488421678543091, + "learning_rate": 1.9978578853857744e-05, + "loss": 0.9503, + "step": 3063 + }, + { + "epoch": 0.5002244806334435, + "grad_norm": 2.4593520164489746, + "learning_rate": 1.9978558089264842e-05, + "loss": 0.7776, + "step": 3064 + }, + { + "epoch": 0.5003877392759479, + "grad_norm": 2.1755521297454834, + "learning_rate": 1.9978537314623537e-05, + "loss": 0.8517, + "step": 3065 + }, + { + "epoch": 0.5005509979184523, + "grad_norm": 2.511699914932251, + "learning_rate": 1.9978516529933847e-05, + "loss": 0.7657, + "step": 3066 + }, + { + "epoch": 0.5007142565609567, + "grad_norm": 2.1570489406585693, + "learning_rate": 1.997849573519579e-05, + "loss": 0.899, + "step": 3067 + }, + { + "epoch": 0.5008775152034611, + "grad_norm": 2.316589832305908, + "learning_rate": 1.9978474930409396e-05, + "loss": 1.0725, + "step": 3068 + }, + { + "epoch": 0.5010407738459655, + "grad_norm": 2.532916784286499, + "learning_rate": 1.9978454115574677e-05, + "loss": 0.7456, + "step": 3069 + }, + { + "epoch": 0.5012040324884699, + "grad_norm": 2.3918509483337402, + "learning_rate": 1.9978433290691654e-05, + "loss": 1.0135, + "step": 3070 + }, + { + "epoch": 0.5013672911309742, + "grad_norm": 1.9708644151687622, + "learning_rate": 1.9978412455760356e-05, + "loss": 0.8468, + "step": 3071 + }, + { + "epoch": 0.5015305497734787, + "grad_norm": 2.123650312423706, + "learning_rate": 1.9978391610780798e-05, + "loss": 0.8918, + "step": 3072 + }, + { + "epoch": 0.501693808415983, + "grad_norm": 2.218646287918091, + "learning_rate": 1.9978370755753004e-05, + "loss": 0.7343, + "step": 3073 + }, + { + "epoch": 0.5018570670584874, + "grad_norm": 2.8331501483917236, + "learning_rate": 1.9978349890676993e-05, + "loss": 0.8768, + "step": 3074 + }, + { + "epoch": 0.5020203257009918, + "grad_norm": 1.8815797567367554, + "learning_rate": 1.9978329015552783e-05, + "loss": 0.794, + "step": 3075 + }, + { + "epoch": 0.5021835843434962, + "grad_norm": 1.9058696031570435, + "learning_rate": 1.99783081303804e-05, + "loss": 0.746, + "step": 3076 + }, + { + "epoch": 0.5023468429860005, + "grad_norm": 1.8353303670883179, + "learning_rate": 1.9978287235159866e-05, + "loss": 0.789, + "step": 3077 + }, + { + "epoch": 0.502510101628505, + "grad_norm": 1.9611802101135254, + "learning_rate": 1.9978266329891196e-05, + "loss": 0.9944, + "step": 3078 + }, + { + "epoch": 0.5026733602710094, + "grad_norm": 2.1930031776428223, + "learning_rate": 1.997824541457442e-05, + "loss": 0.9648, + "step": 3079 + }, + { + "epoch": 0.5028366189135137, + "grad_norm": 3.010266065597534, + "learning_rate": 1.9978224489209547e-05, + "loss": 0.7474, + "step": 3080 + }, + { + "epoch": 0.5029998775560182, + "grad_norm": 2.297976493835449, + "learning_rate": 1.9978203553796605e-05, + "loss": 0.82, + "step": 3081 + }, + { + "epoch": 0.5031631361985225, + "grad_norm": 2.1804168224334717, + "learning_rate": 1.9978182608335616e-05, + "loss": 0.8221, + "step": 3082 + }, + { + "epoch": 0.5033263948410269, + "grad_norm": 2.184976100921631, + "learning_rate": 1.99781616528266e-05, + "loss": 0.9469, + "step": 3083 + }, + { + "epoch": 0.5034896534835313, + "grad_norm": 1.9245314598083496, + "learning_rate": 1.997814068726958e-05, + "loss": 0.8229, + "step": 3084 + }, + { + "epoch": 0.5036529121260357, + "grad_norm": 2.4057416915893555, + "learning_rate": 1.9978119711664573e-05, + "loss": 0.8896, + "step": 3085 + }, + { + "epoch": 0.50381617076854, + "grad_norm": 2.02895450592041, + "learning_rate": 1.9978098726011603e-05, + "loss": 0.814, + "step": 3086 + }, + { + "epoch": 0.5039794294110445, + "grad_norm": 2.1146857738494873, + "learning_rate": 1.9978077730310687e-05, + "loss": 0.9083, + "step": 3087 + }, + { + "epoch": 0.5041426880535488, + "grad_norm": 2.010622978210449, + "learning_rate": 1.9978056724561853e-05, + "loss": 0.7959, + "step": 3088 + }, + { + "epoch": 0.5043059466960532, + "grad_norm": 1.9162832498550415, + "learning_rate": 1.997803570876512e-05, + "loss": 0.7084, + "step": 3089 + }, + { + "epoch": 0.5044692053385577, + "grad_norm": 2.6650588512420654, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.8471, + "step": 3090 + }, + { + "epoch": 0.504632463981062, + "grad_norm": 2.136096477508545, + "learning_rate": 1.997799364702803e-05, + "loss": 0.7691, + "step": 3091 + }, + { + "epoch": 0.5047957226235664, + "grad_norm": 1.9939074516296387, + "learning_rate": 1.997797260108772e-05, + "loss": 0.9131, + "step": 3092 + }, + { + "epoch": 0.5049589812660707, + "grad_norm": 2.473628282546997, + "learning_rate": 1.9977951545099593e-05, + "loss": 0.9144, + "step": 3093 + }, + { + "epoch": 0.5051222399085752, + "grad_norm": 2.4819302558898926, + "learning_rate": 1.997793047906367e-05, + "loss": 0.8941, + "step": 3094 + }, + { + "epoch": 0.5052854985510795, + "grad_norm": 2.5031745433807373, + "learning_rate": 1.997790940297998e-05, + "loss": 0.8393, + "step": 3095 + }, + { + "epoch": 0.505448757193584, + "grad_norm": 2.0246689319610596, + "learning_rate": 1.997788831684853e-05, + "loss": 0.6635, + "step": 3096 + }, + { + "epoch": 0.5056120158360883, + "grad_norm": 2.317363977432251, + "learning_rate": 1.9977867220669356e-05, + "loss": 1.0788, + "step": 3097 + }, + { + "epoch": 0.5057752744785927, + "grad_norm": 2.2687721252441406, + "learning_rate": 1.997784611444247e-05, + "loss": 0.953, + "step": 3098 + }, + { + "epoch": 0.505938533121097, + "grad_norm": 2.5856900215148926, + "learning_rate": 1.9977824998167894e-05, + "loss": 0.8202, + "step": 3099 + }, + { + "epoch": 0.5061017917636015, + "grad_norm": 2.1392757892608643, + "learning_rate": 1.997780387184565e-05, + "loss": 0.8015, + "step": 3100 + }, + { + "epoch": 0.5062650504061059, + "grad_norm": 2.0395615100860596, + "learning_rate": 1.9977782735475765e-05, + "loss": 0.834, + "step": 3101 + }, + { + "epoch": 0.5064283090486102, + "grad_norm": 2.0939781665802, + "learning_rate": 1.9977761589058252e-05, + "loss": 0.9268, + "step": 3102 + }, + { + "epoch": 0.5065915676911147, + "grad_norm": 1.819284439086914, + "learning_rate": 1.997774043259314e-05, + "loss": 0.6608, + "step": 3103 + }, + { + "epoch": 0.506754826333619, + "grad_norm": 2.2785489559173584, + "learning_rate": 1.997771926608044e-05, + "loss": 0.8759, + "step": 3104 + }, + { + "epoch": 0.5069180849761235, + "grad_norm": 1.9993271827697754, + "learning_rate": 1.9977698089520183e-05, + "loss": 0.7594, + "step": 3105 + }, + { + "epoch": 0.5070813436186278, + "grad_norm": 2.1211814880371094, + "learning_rate": 1.9977676902912383e-05, + "loss": 0.8613, + "step": 3106 + }, + { + "epoch": 0.5072446022611322, + "grad_norm": 2.458709716796875, + "learning_rate": 1.9977655706257068e-05, + "loss": 0.8629, + "step": 3107 + }, + { + "epoch": 0.5074078609036365, + "grad_norm": 1.8477967977523804, + "learning_rate": 1.9977634499554255e-05, + "loss": 0.7501, + "step": 3108 + }, + { + "epoch": 0.507571119546141, + "grad_norm": 2.2111892700195312, + "learning_rate": 1.9977613282803968e-05, + "loss": 0.8437, + "step": 3109 + }, + { + "epoch": 0.5077343781886453, + "grad_norm": 1.9481911659240723, + "learning_rate": 1.9977592056006226e-05, + "loss": 0.7037, + "step": 3110 + }, + { + "epoch": 0.5078976368311497, + "grad_norm": 2.0714974403381348, + "learning_rate": 1.997757081916105e-05, + "loss": 0.8396, + "step": 3111 + }, + { + "epoch": 0.5080608954736542, + "grad_norm": 2.2502570152282715, + "learning_rate": 1.997754957226847e-05, + "loss": 0.9439, + "step": 3112 + }, + { + "epoch": 0.5082241541161585, + "grad_norm": 2.034342050552368, + "learning_rate": 1.9977528315328492e-05, + "loss": 0.6994, + "step": 3113 + }, + { + "epoch": 0.508387412758663, + "grad_norm": 2.3898301124572754, + "learning_rate": 1.997750704834115e-05, + "loss": 1.0438, + "step": 3114 + }, + { + "epoch": 0.5085506714011673, + "grad_norm": 2.1666438579559326, + "learning_rate": 1.997748577130646e-05, + "loss": 0.9007, + "step": 3115 + }, + { + "epoch": 0.5087139300436717, + "grad_norm": 3.30202579498291, + "learning_rate": 1.997746448422444e-05, + "loss": 0.9208, + "step": 3116 + }, + { + "epoch": 0.508877188686176, + "grad_norm": 2.3611483573913574, + "learning_rate": 1.997744318709512e-05, + "loss": 0.7812, + "step": 3117 + }, + { + "epoch": 0.5090404473286805, + "grad_norm": 1.8441590070724487, + "learning_rate": 1.997742187991852e-05, + "loss": 0.8313, + "step": 3118 + }, + { + "epoch": 0.5092037059711848, + "grad_norm": 2.158367156982422, + "learning_rate": 1.9977400562694656e-05, + "loss": 0.9435, + "step": 3119 + }, + { + "epoch": 0.5093669646136892, + "grad_norm": 2.4378623962402344, + "learning_rate": 1.9977379235423553e-05, + "loss": 0.9167, + "step": 3120 + }, + { + "epoch": 0.5095302232561936, + "grad_norm": 2.6020264625549316, + "learning_rate": 1.997735789810523e-05, + "loss": 1.0001, + "step": 3121 + }, + { + "epoch": 0.509693481898698, + "grad_norm": 2.036524534225464, + "learning_rate": 1.9977336550739716e-05, + "loss": 0.911, + "step": 3122 + }, + { + "epoch": 0.5098567405412024, + "grad_norm": 2.365675210952759, + "learning_rate": 1.9977315193327017e-05, + "loss": 0.9513, + "step": 3123 + }, + { + "epoch": 0.5100199991837068, + "grad_norm": 2.202857732772827, + "learning_rate": 1.9977293825867173e-05, + "loss": 0.8564, + "step": 3124 + }, + { + "epoch": 0.5101832578262112, + "grad_norm": 2.1707661151885986, + "learning_rate": 1.9977272448360193e-05, + "loss": 0.9094, + "step": 3125 + }, + { + "epoch": 0.5103465164687155, + "grad_norm": 1.899152159690857, + "learning_rate": 1.9977251060806102e-05, + "loss": 0.801, + "step": 3126 + }, + { + "epoch": 0.51050977511122, + "grad_norm": 1.845895767211914, + "learning_rate": 1.9977229663204922e-05, + "loss": 0.7105, + "step": 3127 + }, + { + "epoch": 0.5106730337537243, + "grad_norm": 2.0829052925109863, + "learning_rate": 1.9977208255556675e-05, + "loss": 0.7011, + "step": 3128 + }, + { + "epoch": 0.5108362923962287, + "grad_norm": 2.175828218460083, + "learning_rate": 1.997718683786138e-05, + "loss": 0.9134, + "step": 3129 + }, + { + "epoch": 0.5109995510387331, + "grad_norm": 2.1157751083374023, + "learning_rate": 1.9977165410119065e-05, + "loss": 0.7965, + "step": 3130 + }, + { + "epoch": 0.5111628096812375, + "grad_norm": 2.2744505405426025, + "learning_rate": 1.9977143972329744e-05, + "loss": 0.9872, + "step": 3131 + }, + { + "epoch": 0.5113260683237418, + "grad_norm": 2.2209384441375732, + "learning_rate": 1.9977122524493442e-05, + "loss": 0.999, + "step": 3132 + }, + { + "epoch": 0.5114893269662463, + "grad_norm": 2.137951612472534, + "learning_rate": 1.997710106661018e-05, + "loss": 1.0127, + "step": 3133 + }, + { + "epoch": 0.5116525856087507, + "grad_norm": 2.148066997528076, + "learning_rate": 1.9977079598679978e-05, + "loss": 0.9146, + "step": 3134 + }, + { + "epoch": 0.511815844251255, + "grad_norm": 2.263364791870117, + "learning_rate": 1.9977058120702863e-05, + "loss": 0.9476, + "step": 3135 + }, + { + "epoch": 0.5119791028937595, + "grad_norm": 2.104234218597412, + "learning_rate": 1.9977036632678853e-05, + "loss": 0.8515, + "step": 3136 + }, + { + "epoch": 0.5121423615362638, + "grad_norm": 2.1755144596099854, + "learning_rate": 1.997701513460797e-05, + "loss": 0.8653, + "step": 3137 + }, + { + "epoch": 0.5123056201787682, + "grad_norm": 2.43241286277771, + "learning_rate": 1.997699362649023e-05, + "loss": 0.8202, + "step": 3138 + }, + { + "epoch": 0.5124688788212726, + "grad_norm": 2.2741568088531494, + "learning_rate": 1.9976972108325667e-05, + "loss": 0.8328, + "step": 3139 + }, + { + "epoch": 0.512632137463777, + "grad_norm": 2.2319533824920654, + "learning_rate": 1.997695058011429e-05, + "loss": 0.8935, + "step": 3140 + }, + { + "epoch": 0.5127953961062813, + "grad_norm": 2.158381700515747, + "learning_rate": 1.997692904185613e-05, + "loss": 0.6791, + "step": 3141 + }, + { + "epoch": 0.5129586547487858, + "grad_norm": 2.067470073699951, + "learning_rate": 1.99769074935512e-05, + "loss": 0.9704, + "step": 3142 + }, + { + "epoch": 0.5131219133912901, + "grad_norm": 2.209883689880371, + "learning_rate": 1.9976885935199533e-05, + "loss": 0.8688, + "step": 3143 + }, + { + "epoch": 0.5132851720337945, + "grad_norm": 2.204659938812256, + "learning_rate": 1.9976864366801146e-05, + "loss": 0.9793, + "step": 3144 + }, + { + "epoch": 0.513448430676299, + "grad_norm": 2.3818485736846924, + "learning_rate": 1.9976842788356054e-05, + "loss": 0.9124, + "step": 3145 + }, + { + "epoch": 0.5136116893188033, + "grad_norm": 2.0255539417266846, + "learning_rate": 1.9976821199864287e-05, + "loss": 0.9115, + "step": 3146 + }, + { + "epoch": 0.5137749479613077, + "grad_norm": 2.194688081741333, + "learning_rate": 1.9976799601325863e-05, + "loss": 0.9187, + "step": 3147 + }, + { + "epoch": 0.5139382066038121, + "grad_norm": 1.9303861856460571, + "learning_rate": 1.9976777992740804e-05, + "loss": 0.8442, + "step": 3148 + }, + { + "epoch": 0.5141014652463165, + "grad_norm": 2.0928473472595215, + "learning_rate": 1.997675637410913e-05, + "loss": 0.7622, + "step": 3149 + }, + { + "epoch": 0.5142647238888208, + "grad_norm": 2.3140709400177, + "learning_rate": 1.997673474543087e-05, + "loss": 1.0003, + "step": 3150 + }, + { + "epoch": 0.5144279825313253, + "grad_norm": 2.0424208641052246, + "learning_rate": 1.9976713106706036e-05, + "loss": 0.8173, + "step": 3151 + }, + { + "epoch": 0.5145912411738296, + "grad_norm": 2.2570807933807373, + "learning_rate": 1.9976691457934655e-05, + "loss": 0.9522, + "step": 3152 + }, + { + "epoch": 0.514754499816334, + "grad_norm": 2.0205392837524414, + "learning_rate": 1.997666979911675e-05, + "loss": 0.7731, + "step": 3153 + }, + { + "epoch": 0.5149177584588384, + "grad_norm": 2.14528489112854, + "learning_rate": 1.997664813025234e-05, + "loss": 0.8298, + "step": 3154 + }, + { + "epoch": 0.5150810171013428, + "grad_norm": 2.021923065185547, + "learning_rate": 1.997662645134145e-05, + "loss": 1.0278, + "step": 3155 + }, + { + "epoch": 0.5152442757438472, + "grad_norm": 1.8985143899917603, + "learning_rate": 1.99766047623841e-05, + "loss": 0.7724, + "step": 3156 + }, + { + "epoch": 0.5154075343863516, + "grad_norm": 2.1301212310791016, + "learning_rate": 1.997658306338031e-05, + "loss": 0.817, + "step": 3157 + }, + { + "epoch": 0.515570793028856, + "grad_norm": 2.3517141342163086, + "learning_rate": 1.9976561354330105e-05, + "loss": 0.9374, + "step": 3158 + }, + { + "epoch": 0.5157340516713603, + "grad_norm": 2.257906436920166, + "learning_rate": 1.99765396352335e-05, + "loss": 0.8537, + "step": 3159 + }, + { + "epoch": 0.5158973103138648, + "grad_norm": 2.4338629245758057, + "learning_rate": 1.9976517906090528e-05, + "loss": 1.1772, + "step": 3160 + }, + { + "epoch": 0.5160605689563691, + "grad_norm": 2.1121225357055664, + "learning_rate": 1.9976496166901205e-05, + "loss": 0.9339, + "step": 3161 + }, + { + "epoch": 0.5162238275988735, + "grad_norm": 2.0665366649627686, + "learning_rate": 1.997647441766555e-05, + "loss": 0.8609, + "step": 3162 + }, + { + "epoch": 0.5163870862413779, + "grad_norm": 2.125631809234619, + "learning_rate": 1.9976452658383588e-05, + "loss": 1.0179, + "step": 3163 + }, + { + "epoch": 0.5165503448838823, + "grad_norm": 2.4693562984466553, + "learning_rate": 1.9976430889055342e-05, + "loss": 0.9653, + "step": 3164 + }, + { + "epoch": 0.5167136035263867, + "grad_norm": 1.9813436269760132, + "learning_rate": 1.9976409109680835e-05, + "loss": 0.7908, + "step": 3165 + }, + { + "epoch": 0.5168768621688911, + "grad_norm": 2.1706442832946777, + "learning_rate": 1.9976387320260083e-05, + "loss": 0.9407, + "step": 3166 + }, + { + "epoch": 0.5170401208113955, + "grad_norm": 1.9704300165176392, + "learning_rate": 1.9976365520793114e-05, + "loss": 0.7262, + "step": 3167 + }, + { + "epoch": 0.5172033794538998, + "grad_norm": 2.4319238662719727, + "learning_rate": 1.9976343711279947e-05, + "loss": 1.1614, + "step": 3168 + }, + { + "epoch": 0.5173666380964043, + "grad_norm": 2.3980228900909424, + "learning_rate": 1.9976321891720604e-05, + "loss": 0.9184, + "step": 3169 + }, + { + "epoch": 0.5175298967389086, + "grad_norm": 2.2042312622070312, + "learning_rate": 1.9976300062115112e-05, + "loss": 0.907, + "step": 3170 + }, + { + "epoch": 0.517693155381413, + "grad_norm": 2.05627703666687, + "learning_rate": 1.997627822246348e-05, + "loss": 0.7615, + "step": 3171 + }, + { + "epoch": 0.5178564140239174, + "grad_norm": 2.013457775115967, + "learning_rate": 1.9976256372765746e-05, + "loss": 0.9695, + "step": 3172 + }, + { + "epoch": 0.5180196726664218, + "grad_norm": 2.065033435821533, + "learning_rate": 1.997623451302192e-05, + "loss": 0.9746, + "step": 3173 + }, + { + "epoch": 0.5181829313089261, + "grad_norm": 2.101802349090576, + "learning_rate": 1.997621264323203e-05, + "loss": 0.8493, + "step": 3174 + }, + { + "epoch": 0.5183461899514306, + "grad_norm": 1.8131414651870728, + "learning_rate": 1.9976190763396094e-05, + "loss": 0.7428, + "step": 3175 + }, + { + "epoch": 0.518509448593935, + "grad_norm": 2.0616135597229004, + "learning_rate": 1.997616887351414e-05, + "loss": 1.0356, + "step": 3176 + }, + { + "epoch": 0.5186727072364393, + "grad_norm": 2.1442439556121826, + "learning_rate": 1.9976146973586184e-05, + "loss": 0.9392, + "step": 3177 + }, + { + "epoch": 0.5188359658789438, + "grad_norm": 2.1017026901245117, + "learning_rate": 1.9976125063612254e-05, + "loss": 1.2462, + "step": 3178 + }, + { + "epoch": 0.5189992245214481, + "grad_norm": 2.22216534614563, + "learning_rate": 1.9976103143592368e-05, + "loss": 0.8796, + "step": 3179 + }, + { + "epoch": 0.5191624831639525, + "grad_norm": 2.0383503437042236, + "learning_rate": 1.9976081213526545e-05, + "loss": 0.8975, + "step": 3180 + }, + { + "epoch": 0.5193257418064569, + "grad_norm": 2.0457003116607666, + "learning_rate": 1.9976059273414813e-05, + "loss": 0.6546, + "step": 3181 + }, + { + "epoch": 0.5194890004489613, + "grad_norm": 2.2953717708587646, + "learning_rate": 1.9976037323257193e-05, + "loss": 0.9085, + "step": 3182 + }, + { + "epoch": 0.5196522590914656, + "grad_norm": 1.8370320796966553, + "learning_rate": 1.9976015363053708e-05, + "loss": 0.8271, + "step": 3183 + }, + { + "epoch": 0.5198155177339701, + "grad_norm": 1.757144570350647, + "learning_rate": 1.9975993392804374e-05, + "loss": 0.7603, + "step": 3184 + }, + { + "epoch": 0.5199787763764744, + "grad_norm": 1.935219645500183, + "learning_rate": 1.997597141250922e-05, + "loss": 0.7571, + "step": 3185 + }, + { + "epoch": 0.5201420350189788, + "grad_norm": 2.089667797088623, + "learning_rate": 1.9975949422168265e-05, + "loss": 0.9095, + "step": 3186 + }, + { + "epoch": 0.5203052936614833, + "grad_norm": 1.9769450426101685, + "learning_rate": 1.997592742178153e-05, + "loss": 0.6965, + "step": 3187 + }, + { + "epoch": 0.5204685523039876, + "grad_norm": 1.7493739128112793, + "learning_rate": 1.997590541134904e-05, + "loss": 0.6207, + "step": 3188 + }, + { + "epoch": 0.520631810946492, + "grad_norm": 2.201674699783325, + "learning_rate": 1.9975883390870817e-05, + "loss": 0.8436, + "step": 3189 + }, + { + "epoch": 0.5207950695889964, + "grad_norm": 1.9206316471099854, + "learning_rate": 1.9975861360346877e-05, + "loss": 0.8058, + "step": 3190 + }, + { + "epoch": 0.5209583282315008, + "grad_norm": 2.2072269916534424, + "learning_rate": 1.997583931977725e-05, + "loss": 0.9863, + "step": 3191 + }, + { + "epoch": 0.5211215868740051, + "grad_norm": 2.0386557579040527, + "learning_rate": 1.9975817269161957e-05, + "loss": 0.8646, + "step": 3192 + }, + { + "epoch": 0.5212848455165096, + "grad_norm": 2.0219480991363525, + "learning_rate": 1.9975795208501018e-05, + "loss": 0.8849, + "step": 3193 + }, + { + "epoch": 0.5214481041590139, + "grad_norm": 2.244020462036133, + "learning_rate": 1.9975773137794458e-05, + "loss": 0.9284, + "step": 3194 + }, + { + "epoch": 0.5216113628015183, + "grad_norm": 2.0932788848876953, + "learning_rate": 1.9975751057042294e-05, + "loss": 0.8693, + "step": 3195 + }, + { + "epoch": 0.5217746214440226, + "grad_norm": 2.072313070297241, + "learning_rate": 1.9975728966244553e-05, + "loss": 0.711, + "step": 3196 + }, + { + "epoch": 0.5219378800865271, + "grad_norm": 1.811410903930664, + "learning_rate": 1.9975706865401255e-05, + "loss": 0.6794, + "step": 3197 + }, + { + "epoch": 0.5221011387290315, + "grad_norm": 2.2907443046569824, + "learning_rate": 1.9975684754512425e-05, + "loss": 0.8816, + "step": 3198 + }, + { + "epoch": 0.5222643973715358, + "grad_norm": 2.330420732498169, + "learning_rate": 1.9975662633578078e-05, + "loss": 0.7539, + "step": 3199 + }, + { + "epoch": 0.5224276560140403, + "grad_norm": 1.6327462196350098, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.5512, + "step": 3200 + }, + { + "epoch": 0.5225909146565446, + "grad_norm": 2.2267472743988037, + "learning_rate": 1.9975618361572942e-05, + "loss": 0.7534, + "step": 3201 + }, + { + "epoch": 0.522754173299049, + "grad_norm": 2.464442729949951, + "learning_rate": 1.9975596210502197e-05, + "loss": 1.0419, + "step": 3202 + }, + { + "epoch": 0.5229174319415534, + "grad_norm": 2.3450839519500732, + "learning_rate": 1.9975574049386027e-05, + "loss": 0.7692, + "step": 3203 + }, + { + "epoch": 0.5230806905840578, + "grad_norm": 2.1445021629333496, + "learning_rate": 1.997555187822446e-05, + "loss": 1.0271, + "step": 3204 + }, + { + "epoch": 0.5232439492265621, + "grad_norm": 1.9949091672897339, + "learning_rate": 1.997552969701751e-05, + "loss": 0.8389, + "step": 3205 + }, + { + "epoch": 0.5234072078690666, + "grad_norm": 2.0966928005218506, + "learning_rate": 1.9975507505765207e-05, + "loss": 0.7554, + "step": 3206 + }, + { + "epoch": 0.5235704665115709, + "grad_norm": 1.6886711120605469, + "learning_rate": 1.997548530446757e-05, + "loss": 0.6343, + "step": 3207 + }, + { + "epoch": 0.5237337251540753, + "grad_norm": 2.2185089588165283, + "learning_rate": 1.9975463093124623e-05, + "loss": 1.0363, + "step": 3208 + }, + { + "epoch": 0.5238969837965798, + "grad_norm": 2.123464584350586, + "learning_rate": 1.9975440871736387e-05, + "loss": 0.8059, + "step": 3209 + }, + { + "epoch": 0.5240602424390841, + "grad_norm": 1.9603670835494995, + "learning_rate": 1.9975418640302885e-05, + "loss": 0.6738, + "step": 3210 + }, + { + "epoch": 0.5242235010815885, + "grad_norm": 1.8736332654953003, + "learning_rate": 1.997539639882414e-05, + "loss": 0.7986, + "step": 3211 + }, + { + "epoch": 0.5243867597240929, + "grad_norm": 2.2341456413269043, + "learning_rate": 1.9975374147300172e-05, + "loss": 0.8755, + "step": 3212 + }, + { + "epoch": 0.5245500183665973, + "grad_norm": 1.7159340381622314, + "learning_rate": 1.9975351885731004e-05, + "loss": 0.7989, + "step": 3213 + }, + { + "epoch": 0.5247132770091016, + "grad_norm": 1.766485571861267, + "learning_rate": 1.997532961411666e-05, + "loss": 0.6922, + "step": 3214 + }, + { + "epoch": 0.5248765356516061, + "grad_norm": 2.2090628147125244, + "learning_rate": 1.997530733245716e-05, + "loss": 0.9142, + "step": 3215 + }, + { + "epoch": 0.5250397942941104, + "grad_norm": 2.0952234268188477, + "learning_rate": 1.997528504075253e-05, + "loss": 0.9603, + "step": 3216 + }, + { + "epoch": 0.5252030529366148, + "grad_norm": 2.554661989212036, + "learning_rate": 1.9975262739002793e-05, + "loss": 0.8493, + "step": 3217 + }, + { + "epoch": 0.5253663115791192, + "grad_norm": 1.922166347503662, + "learning_rate": 1.9975240427207966e-05, + "loss": 0.767, + "step": 3218 + }, + { + "epoch": 0.5255295702216236, + "grad_norm": 2.4131007194519043, + "learning_rate": 1.9975218105368074e-05, + "loss": 0.8432, + "step": 3219 + }, + { + "epoch": 0.525692828864128, + "grad_norm": 2.114917278289795, + "learning_rate": 1.997519577348314e-05, + "loss": 0.8398, + "step": 3220 + }, + { + "epoch": 0.5258560875066324, + "grad_norm": 2.2072269916534424, + "learning_rate": 1.9975173431553188e-05, + "loss": 0.8589, + "step": 3221 + }, + { + "epoch": 0.5260193461491368, + "grad_norm": 2.3732430934906006, + "learning_rate": 1.9975151079578238e-05, + "loss": 0.8723, + "step": 3222 + }, + { + "epoch": 0.5261826047916411, + "grad_norm": 1.9268263578414917, + "learning_rate": 1.9975128717558318e-05, + "loss": 0.7422, + "step": 3223 + }, + { + "epoch": 0.5263458634341456, + "grad_norm": 2.1012864112854004, + "learning_rate": 1.997510634549344e-05, + "loss": 0.9143, + "step": 3224 + }, + { + "epoch": 0.5265091220766499, + "grad_norm": 2.1604623794555664, + "learning_rate": 1.9975083963383634e-05, + "loss": 0.8148, + "step": 3225 + }, + { + "epoch": 0.5266723807191543, + "grad_norm": 2.0763907432556152, + "learning_rate": 1.997506157122892e-05, + "loss": 0.8783, + "step": 3226 + }, + { + "epoch": 0.5268356393616587, + "grad_norm": 2.274029016494751, + "learning_rate": 1.9975039169029325e-05, + "loss": 1.0623, + "step": 3227 + }, + { + "epoch": 0.5269988980041631, + "grad_norm": 2.1632800102233887, + "learning_rate": 1.9975016756784868e-05, + "loss": 0.9064, + "step": 3228 + }, + { + "epoch": 0.5271621566466674, + "grad_norm": 1.9675068855285645, + "learning_rate": 1.997499433449557e-05, + "loss": 0.8688, + "step": 3229 + }, + { + "epoch": 0.5273254152891719, + "grad_norm": 2.0734686851501465, + "learning_rate": 1.9974971902161455e-05, + "loss": 1.0222, + "step": 3230 + }, + { + "epoch": 0.5274886739316763, + "grad_norm": 2.190962553024292, + "learning_rate": 1.9974949459782547e-05, + "loss": 0.8687, + "step": 3231 + }, + { + "epoch": 0.5276519325741806, + "grad_norm": 2.356158971786499, + "learning_rate": 1.9974927007358868e-05, + "loss": 0.8767, + "step": 3232 + }, + { + "epoch": 0.5278151912166851, + "grad_norm": 1.9763189554214478, + "learning_rate": 1.997490454489044e-05, + "loss": 0.815, + "step": 3233 + }, + { + "epoch": 0.5279784498591894, + "grad_norm": 2.2305474281311035, + "learning_rate": 1.9974882072377283e-05, + "loss": 0.8647, + "step": 3234 + }, + { + "epoch": 0.5281417085016938, + "grad_norm": 1.9637455940246582, + "learning_rate": 1.9974859589819428e-05, + "loss": 0.8085, + "step": 3235 + }, + { + "epoch": 0.5283049671441982, + "grad_norm": 1.7090474367141724, + "learning_rate": 1.9974837097216887e-05, + "loss": 0.6628, + "step": 3236 + }, + { + "epoch": 0.5284682257867026, + "grad_norm": 2.2208778858184814, + "learning_rate": 1.997481459456969e-05, + "loss": 0.7848, + "step": 3237 + }, + { + "epoch": 0.5286314844292069, + "grad_norm": 2.433609962463379, + "learning_rate": 1.9974792081877856e-05, + "loss": 0.8681, + "step": 3238 + }, + { + "epoch": 0.5287947430717114, + "grad_norm": 2.417046308517456, + "learning_rate": 1.997476955914141e-05, + "loss": 0.9345, + "step": 3239 + }, + { + "epoch": 0.5289580017142157, + "grad_norm": 1.9577100276947021, + "learning_rate": 1.9974747026360372e-05, + "loss": 0.8468, + "step": 3240 + }, + { + "epoch": 0.5291212603567201, + "grad_norm": 2.3044791221618652, + "learning_rate": 1.9974724483534768e-05, + "loss": 1.0128, + "step": 3241 + }, + { + "epoch": 0.5292845189992246, + "grad_norm": 2.224580764770508, + "learning_rate": 1.997470193066462e-05, + "loss": 0.9383, + "step": 3242 + }, + { + "epoch": 0.5294477776417289, + "grad_norm": 1.8904632329940796, + "learning_rate": 1.997467936774995e-05, + "loss": 0.6855, + "step": 3243 + }, + { + "epoch": 0.5296110362842333, + "grad_norm": 2.261305809020996, + "learning_rate": 1.9974656794790777e-05, + "loss": 0.9835, + "step": 3244 + }, + { + "epoch": 0.5297742949267377, + "grad_norm": 2.5073816776275635, + "learning_rate": 1.997463421178713e-05, + "loss": 1.045, + "step": 3245 + }, + { + "epoch": 0.5299375535692421, + "grad_norm": 1.9100139141082764, + "learning_rate": 1.997461161873903e-05, + "loss": 0.8544, + "step": 3246 + }, + { + "epoch": 0.5301008122117464, + "grad_norm": 2.0610477924346924, + "learning_rate": 1.9974589015646494e-05, + "loss": 0.9465, + "step": 3247 + }, + { + "epoch": 0.5302640708542509, + "grad_norm": 2.2391514778137207, + "learning_rate": 1.9974566402509556e-05, + "loss": 1.0182, + "step": 3248 + }, + { + "epoch": 0.5304273294967552, + "grad_norm": 2.4734034538269043, + "learning_rate": 1.997454377932823e-05, + "loss": 0.8235, + "step": 3249 + }, + { + "epoch": 0.5305905881392596, + "grad_norm": 1.961127758026123, + "learning_rate": 1.9974521146102535e-05, + "loss": 0.8251, + "step": 3250 + }, + { + "epoch": 0.530753846781764, + "grad_norm": 2.1797642707824707, + "learning_rate": 1.9974498502832508e-05, + "loss": 0.8809, + "step": 3251 + }, + { + "epoch": 0.5309171054242684, + "grad_norm": 2.0917508602142334, + "learning_rate": 1.9974475849518157e-05, + "loss": 0.8194, + "step": 3252 + }, + { + "epoch": 0.5310803640667728, + "grad_norm": 2.3419740200042725, + "learning_rate": 1.9974453186159517e-05, + "loss": 1.1124, + "step": 3253 + }, + { + "epoch": 0.5312436227092772, + "grad_norm": 1.7141506671905518, + "learning_rate": 1.9974430512756604e-05, + "loss": 0.6427, + "step": 3254 + }, + { + "epoch": 0.5314068813517816, + "grad_norm": 2.333385705947876, + "learning_rate": 1.9974407829309442e-05, + "loss": 0.8156, + "step": 3255 + }, + { + "epoch": 0.5315701399942859, + "grad_norm": 2.105518102645874, + "learning_rate": 1.9974385135818052e-05, + "loss": 0.6289, + "step": 3256 + }, + { + "epoch": 0.5317333986367904, + "grad_norm": 1.9439438581466675, + "learning_rate": 1.997436243228246e-05, + "loss": 0.7239, + "step": 3257 + }, + { + "epoch": 0.5318966572792947, + "grad_norm": 1.995877742767334, + "learning_rate": 1.9974339718702688e-05, + "loss": 0.8766, + "step": 3258 + }, + { + "epoch": 0.5320599159217991, + "grad_norm": 1.8711572885513306, + "learning_rate": 1.9974316995078758e-05, + "loss": 0.8147, + "step": 3259 + }, + { + "epoch": 0.5322231745643035, + "grad_norm": 2.0966269969940186, + "learning_rate": 1.9974294261410695e-05, + "loss": 0.7724, + "step": 3260 + }, + { + "epoch": 0.5323864332068079, + "grad_norm": 2.099099636077881, + "learning_rate": 1.997427151769852e-05, + "loss": 0.7857, + "step": 3261 + }, + { + "epoch": 0.5325496918493122, + "grad_norm": 2.083033323287964, + "learning_rate": 1.9974248763942255e-05, + "loss": 0.834, + "step": 3262 + }, + { + "epoch": 0.5327129504918167, + "grad_norm": 2.5886573791503906, + "learning_rate": 1.9974226000141926e-05, + "loss": 0.9564, + "step": 3263 + }, + { + "epoch": 0.5328762091343211, + "grad_norm": 2.36087965965271, + "learning_rate": 1.997420322629755e-05, + "loss": 0.8632, + "step": 3264 + }, + { + "epoch": 0.5330394677768254, + "grad_norm": 1.7981657981872559, + "learning_rate": 1.9974180442409155e-05, + "loss": 0.7335, + "step": 3265 + }, + { + "epoch": 0.5332027264193299, + "grad_norm": 2.0183613300323486, + "learning_rate": 1.9974157648476768e-05, + "loss": 0.796, + "step": 3266 + }, + { + "epoch": 0.5333659850618342, + "grad_norm": 2.4413630962371826, + "learning_rate": 1.9974134844500402e-05, + "loss": 0.848, + "step": 3267 + }, + { + "epoch": 0.5335292437043386, + "grad_norm": 2.2178828716278076, + "learning_rate": 1.997411203048009e-05, + "loss": 0.9209, + "step": 3268 + }, + { + "epoch": 0.533692502346843, + "grad_norm": 1.8141670227050781, + "learning_rate": 1.9974089206415843e-05, + "loss": 0.6697, + "step": 3269 + }, + { + "epoch": 0.5338557609893474, + "grad_norm": 2.1023662090301514, + "learning_rate": 1.9974066372307694e-05, + "loss": 0.8987, + "step": 3270 + }, + { + "epoch": 0.5340190196318517, + "grad_norm": 2.038614511489868, + "learning_rate": 1.997404352815566e-05, + "loss": 0.9465, + "step": 3271 + }, + { + "epoch": 0.5341822782743562, + "grad_norm": 2.1336827278137207, + "learning_rate": 1.997402067395977e-05, + "loss": 1.1489, + "step": 3272 + }, + { + "epoch": 0.5343455369168605, + "grad_norm": 1.8819364309310913, + "learning_rate": 1.9973997809720045e-05, + "loss": 0.8246, + "step": 3273 + }, + { + "epoch": 0.5345087955593649, + "grad_norm": 2.729527711868286, + "learning_rate": 1.9973974935436503e-05, + "loss": 0.912, + "step": 3274 + }, + { + "epoch": 0.5346720542018694, + "grad_norm": 2.132807970046997, + "learning_rate": 1.9973952051109176e-05, + "loss": 0.9008, + "step": 3275 + }, + { + "epoch": 0.5348353128443737, + "grad_norm": 2.097180128097534, + "learning_rate": 1.9973929156738078e-05, + "loss": 0.9581, + "step": 3276 + }, + { + "epoch": 0.5349985714868781, + "grad_norm": 2.1963400840759277, + "learning_rate": 1.997390625232324e-05, + "loss": 0.8222, + "step": 3277 + }, + { + "epoch": 0.5351618301293825, + "grad_norm": 1.8913047313690186, + "learning_rate": 1.9973883337864674e-05, + "loss": 0.9136, + "step": 3278 + }, + { + "epoch": 0.5353250887718869, + "grad_norm": 2.293325901031494, + "learning_rate": 1.9973860413362418e-05, + "loss": 0.8359, + "step": 3279 + }, + { + "epoch": 0.5354883474143912, + "grad_norm": 2.0164906978607178, + "learning_rate": 1.9973837478816483e-05, + "loss": 0.8161, + "step": 3280 + }, + { + "epoch": 0.5356516060568957, + "grad_norm": 1.9961150884628296, + "learning_rate": 1.9973814534226895e-05, + "loss": 0.8885, + "step": 3281 + }, + { + "epoch": 0.5358148646994, + "grad_norm": 2.101891279220581, + "learning_rate": 1.997379157959368e-05, + "loss": 0.7739, + "step": 3282 + }, + { + "epoch": 0.5359781233419044, + "grad_norm": 2.17558217048645, + "learning_rate": 1.997376861491686e-05, + "loss": 0.893, + "step": 3283 + }, + { + "epoch": 0.5361413819844087, + "grad_norm": 2.166165351867676, + "learning_rate": 1.9973745640196458e-05, + "loss": 0.8295, + "step": 3284 + }, + { + "epoch": 0.5363046406269132, + "grad_norm": 2.0578770637512207, + "learning_rate": 1.9973722655432497e-05, + "loss": 0.8408, + "step": 3285 + }, + { + "epoch": 0.5364678992694176, + "grad_norm": 1.8979583978652954, + "learning_rate": 1.9973699660625e-05, + "loss": 0.8439, + "step": 3286 + }, + { + "epoch": 0.536631157911922, + "grad_norm": 1.8617664575576782, + "learning_rate": 1.9973676655773988e-05, + "loss": 0.7567, + "step": 3287 + }, + { + "epoch": 0.5367944165544264, + "grad_norm": 2.4459118843078613, + "learning_rate": 1.9973653640879486e-05, + "loss": 0.9096, + "step": 3288 + }, + { + "epoch": 0.5369576751969307, + "grad_norm": 2.6515989303588867, + "learning_rate": 1.997363061594152e-05, + "loss": 0.8885, + "step": 3289 + }, + { + "epoch": 0.5371209338394352, + "grad_norm": 3.1581332683563232, + "learning_rate": 1.997360758096011e-05, + "loss": 0.9964, + "step": 3290 + }, + { + "epoch": 0.5372841924819395, + "grad_norm": 2.3488147258758545, + "learning_rate": 1.9973584535935277e-05, + "loss": 0.9653, + "step": 3291 + }, + { + "epoch": 0.5374474511244439, + "grad_norm": 2.1410341262817383, + "learning_rate": 1.997356148086705e-05, + "loss": 0.7864, + "step": 3292 + }, + { + "epoch": 0.5376107097669482, + "grad_norm": 2.797759771347046, + "learning_rate": 1.9973538415755448e-05, + "loss": 0.8193, + "step": 3293 + }, + { + "epoch": 0.5377739684094527, + "grad_norm": 2.1440000534057617, + "learning_rate": 1.99735153406005e-05, + "loss": 0.8494, + "step": 3294 + }, + { + "epoch": 0.537937227051957, + "grad_norm": 2.4268276691436768, + "learning_rate": 1.9973492255402215e-05, + "loss": 0.789, + "step": 3295 + }, + { + "epoch": 0.5381004856944615, + "grad_norm": 1.965209722518921, + "learning_rate": 1.9973469160160635e-05, + "loss": 0.9018, + "step": 3296 + }, + { + "epoch": 0.5382637443369659, + "grad_norm": 2.0752551555633545, + "learning_rate": 1.997344605487577e-05, + "loss": 0.8973, + "step": 3297 + }, + { + "epoch": 0.5384270029794702, + "grad_norm": 2.0615603923797607, + "learning_rate": 1.997342293954765e-05, + "loss": 0.7705, + "step": 3298 + }, + { + "epoch": 0.5385902616219747, + "grad_norm": 2.37383770942688, + "learning_rate": 1.9973399814176293e-05, + "loss": 0.7852, + "step": 3299 + }, + { + "epoch": 0.538753520264479, + "grad_norm": 2.173271656036377, + "learning_rate": 1.9973376678761726e-05, + "loss": 0.9145, + "step": 3300 + }, + { + "epoch": 0.5389167789069834, + "grad_norm": 2.4143214225769043, + "learning_rate": 1.997335353330397e-05, + "loss": 1.014, + "step": 3301 + }, + { + "epoch": 0.5390800375494877, + "grad_norm": 2.2319889068603516, + "learning_rate": 1.997333037780305e-05, + "loss": 0.9775, + "step": 3302 + }, + { + "epoch": 0.5392432961919922, + "grad_norm": 2.1811447143554688, + "learning_rate": 1.997330721225899e-05, + "loss": 0.8083, + "step": 3303 + }, + { + "epoch": 0.5394065548344965, + "grad_norm": 2.50657057762146, + "learning_rate": 1.9973284036671814e-05, + "loss": 0.7693, + "step": 3304 + }, + { + "epoch": 0.539569813477001, + "grad_norm": 2.503931999206543, + "learning_rate": 1.9973260851041542e-05, + "loss": 0.883, + "step": 3305 + }, + { + "epoch": 0.5397330721195053, + "grad_norm": 2.1914658546447754, + "learning_rate": 1.9973237655368197e-05, + "loss": 1.0242, + "step": 3306 + }, + { + "epoch": 0.5398963307620097, + "grad_norm": 2.1027865409851074, + "learning_rate": 1.9973214449651806e-05, + "loss": 0.9424, + "step": 3307 + }, + { + "epoch": 0.5400595894045142, + "grad_norm": 1.8685299158096313, + "learning_rate": 1.9973191233892393e-05, + "loss": 0.9353, + "step": 3308 + }, + { + "epoch": 0.5402228480470185, + "grad_norm": 1.9433985948562622, + "learning_rate": 1.9973168008089977e-05, + "loss": 0.7342, + "step": 3309 + }, + { + "epoch": 0.5403861066895229, + "grad_norm": 2.336198091506958, + "learning_rate": 1.997314477224458e-05, + "loss": 1.0094, + "step": 3310 + }, + { + "epoch": 0.5405493653320272, + "grad_norm": 2.2141501903533936, + "learning_rate": 1.9973121526356236e-05, + "loss": 0.8756, + "step": 3311 + }, + { + "epoch": 0.5407126239745317, + "grad_norm": 2.107102394104004, + "learning_rate": 1.9973098270424957e-05, + "loss": 0.7689, + "step": 3312 + }, + { + "epoch": 0.540875882617036, + "grad_norm": 2.1019861698150635, + "learning_rate": 1.997307500445077e-05, + "loss": 0.8061, + "step": 3313 + }, + { + "epoch": 0.5410391412595404, + "grad_norm": 2.0350253582000732, + "learning_rate": 1.99730517284337e-05, + "loss": 0.9164, + "step": 3314 + }, + { + "epoch": 0.5412023999020448, + "grad_norm": 1.9105134010314941, + "learning_rate": 1.9973028442373768e-05, + "loss": 0.6903, + "step": 3315 + }, + { + "epoch": 0.5413656585445492, + "grad_norm": 2.3066296577453613, + "learning_rate": 1.9973005146271003e-05, + "loss": 0.9312, + "step": 3316 + }, + { + "epoch": 0.5415289171870535, + "grad_norm": 2.2854549884796143, + "learning_rate": 1.9972981840125422e-05, + "loss": 0.8628, + "step": 3317 + }, + { + "epoch": 0.541692175829558, + "grad_norm": 2.052642583847046, + "learning_rate": 1.9972958523937053e-05, + "loss": 0.882, + "step": 3318 + }, + { + "epoch": 0.5418554344720624, + "grad_norm": 2.201667547225952, + "learning_rate": 1.9972935197705915e-05, + "loss": 0.7227, + "step": 3319 + }, + { + "epoch": 0.5420186931145667, + "grad_norm": 1.9283092021942139, + "learning_rate": 1.9972911861432033e-05, + "loss": 0.9028, + "step": 3320 + }, + { + "epoch": 0.5421819517570712, + "grad_norm": 2.266380548477173, + "learning_rate": 1.9972888515115433e-05, + "loss": 0.8383, + "step": 3321 + }, + { + "epoch": 0.5423452103995755, + "grad_norm": 1.8276673555374146, + "learning_rate": 1.9972865158756137e-05, + "loss": 0.6794, + "step": 3322 + }, + { + "epoch": 0.5425084690420799, + "grad_norm": 2.2632904052734375, + "learning_rate": 1.997284179235417e-05, + "loss": 0.9747, + "step": 3323 + }, + { + "epoch": 0.5426717276845843, + "grad_norm": 2.114713668823242, + "learning_rate": 1.997281841590955e-05, + "loss": 0.7752, + "step": 3324 + }, + { + "epoch": 0.5428349863270887, + "grad_norm": 1.948089361190796, + "learning_rate": 1.997279502942231e-05, + "loss": 0.8183, + "step": 3325 + }, + { + "epoch": 0.542998244969593, + "grad_norm": 2.033433437347412, + "learning_rate": 1.997277163289246e-05, + "loss": 0.8493, + "step": 3326 + }, + { + "epoch": 0.5431615036120975, + "grad_norm": 2.0917389392852783, + "learning_rate": 1.997274822632004e-05, + "loss": 0.9189, + "step": 3327 + }, + { + "epoch": 0.5433247622546018, + "grad_norm": 2.043719530105591, + "learning_rate": 1.997272480970506e-05, + "loss": 0.8304, + "step": 3328 + }, + { + "epoch": 0.5434880208971062, + "grad_norm": 2.3795104026794434, + "learning_rate": 1.9972701383047552e-05, + "loss": 0.8467, + "step": 3329 + }, + { + "epoch": 0.5436512795396107, + "grad_norm": 2.2582433223724365, + "learning_rate": 1.9972677946347536e-05, + "loss": 0.8613, + "step": 3330 + }, + { + "epoch": 0.543814538182115, + "grad_norm": 1.813295841217041, + "learning_rate": 1.9972654499605034e-05, + "loss": 0.7642, + "step": 3331 + }, + { + "epoch": 0.5439777968246194, + "grad_norm": 2.236614227294922, + "learning_rate": 1.997263104282007e-05, + "loss": 0.8329, + "step": 3332 + }, + { + "epoch": 0.5441410554671238, + "grad_norm": 1.9925373792648315, + "learning_rate": 1.9972607575992672e-05, + "loss": 0.9528, + "step": 3333 + }, + { + "epoch": 0.5443043141096282, + "grad_norm": 1.9964669942855835, + "learning_rate": 1.997258409912286e-05, + "loss": 0.7323, + "step": 3334 + }, + { + "epoch": 0.5444675727521325, + "grad_norm": 2.1535727977752686, + "learning_rate": 1.997256061221066e-05, + "loss": 0.8805, + "step": 3335 + }, + { + "epoch": 0.544630831394637, + "grad_norm": 2.2644340991973877, + "learning_rate": 1.9972537115256095e-05, + "loss": 0.9324, + "step": 3336 + }, + { + "epoch": 0.5447940900371413, + "grad_norm": 2.2130517959594727, + "learning_rate": 1.9972513608259185e-05, + "loss": 0.9007, + "step": 3337 + }, + { + "epoch": 0.5449573486796457, + "grad_norm": 2.4157769680023193, + "learning_rate": 1.9972490091219954e-05, + "loss": 1.0026, + "step": 3338 + }, + { + "epoch": 0.5451206073221501, + "grad_norm": 2.194753885269165, + "learning_rate": 1.9972466564138433e-05, + "loss": 0.9746, + "step": 3339 + }, + { + "epoch": 0.5452838659646545, + "grad_norm": 2.1495954990386963, + "learning_rate": 1.997244302701464e-05, + "loss": 0.8154, + "step": 3340 + }, + { + "epoch": 0.5454471246071589, + "grad_norm": 1.6893572807312012, + "learning_rate": 1.9972419479848597e-05, + "loss": 0.7939, + "step": 3341 + }, + { + "epoch": 0.5456103832496633, + "grad_norm": 1.875591516494751, + "learning_rate": 1.997239592264033e-05, + "loss": 0.7619, + "step": 3342 + }, + { + "epoch": 0.5457736418921677, + "grad_norm": 1.9269273281097412, + "learning_rate": 1.997237235538987e-05, + "loss": 0.7747, + "step": 3343 + }, + { + "epoch": 0.545936900534672, + "grad_norm": 1.987618327140808, + "learning_rate": 1.9972348778097225e-05, + "loss": 0.8482, + "step": 3344 + }, + { + "epoch": 0.5461001591771765, + "grad_norm": 2.366896152496338, + "learning_rate": 1.997232519076243e-05, + "loss": 0.9536, + "step": 3345 + }, + { + "epoch": 0.5462634178196808, + "grad_norm": 2.162234306335449, + "learning_rate": 1.9972301593385507e-05, + "loss": 0.8529, + "step": 3346 + }, + { + "epoch": 0.5464266764621852, + "grad_norm": 1.7913020849227905, + "learning_rate": 1.9972277985966482e-05, + "loss": 0.7579, + "step": 3347 + }, + { + "epoch": 0.5465899351046896, + "grad_norm": 1.8930449485778809, + "learning_rate": 1.997225436850537e-05, + "loss": 0.7405, + "step": 3348 + }, + { + "epoch": 0.546753193747194, + "grad_norm": 2.0314600467681885, + "learning_rate": 1.9972230741002204e-05, + "loss": 0.9037, + "step": 3349 + }, + { + "epoch": 0.5469164523896983, + "grad_norm": 2.158505916595459, + "learning_rate": 1.9972207103457e-05, + "loss": 0.7989, + "step": 3350 + }, + { + "epoch": 0.5470797110322028, + "grad_norm": 2.019305467605591, + "learning_rate": 1.9972183455869793e-05, + "loss": 0.7828, + "step": 3351 + }, + { + "epoch": 0.5472429696747072, + "grad_norm": 2.5652377605438232, + "learning_rate": 1.9972159798240596e-05, + "loss": 0.9107, + "step": 3352 + }, + { + "epoch": 0.5474062283172115, + "grad_norm": 1.9729971885681152, + "learning_rate": 1.9972136130569438e-05, + "loss": 0.8646, + "step": 3353 + }, + { + "epoch": 0.547569486959716, + "grad_norm": 1.9854708909988403, + "learning_rate": 1.997211245285634e-05, + "loss": 0.8254, + "step": 3354 + }, + { + "epoch": 0.5477327456022203, + "grad_norm": 2.113562822341919, + "learning_rate": 1.9972088765101326e-05, + "loss": 0.7603, + "step": 3355 + }, + { + "epoch": 0.5478960042447247, + "grad_norm": 2.289031744003296, + "learning_rate": 1.9972065067304424e-05, + "loss": 0.9278, + "step": 3356 + }, + { + "epoch": 0.5480592628872291, + "grad_norm": 2.354802131652832, + "learning_rate": 1.9972041359465658e-05, + "loss": 0.7929, + "step": 3357 + }, + { + "epoch": 0.5482225215297335, + "grad_norm": 2.2339534759521484, + "learning_rate": 1.9972017641585043e-05, + "loss": 0.9376, + "step": 3358 + }, + { + "epoch": 0.5483857801722378, + "grad_norm": 2.3014743328094482, + "learning_rate": 1.997199391366261e-05, + "loss": 0.8999, + "step": 3359 + }, + { + "epoch": 0.5485490388147423, + "grad_norm": 2.0174829959869385, + "learning_rate": 1.9971970175698387e-05, + "loss": 0.7391, + "step": 3360 + }, + { + "epoch": 0.5487122974572466, + "grad_norm": 2.2106211185455322, + "learning_rate": 1.9971946427692387e-05, + "loss": 0.9314, + "step": 3361 + }, + { + "epoch": 0.548875556099751, + "grad_norm": 2.4365060329437256, + "learning_rate": 1.9971922669644642e-05, + "loss": 1.0066, + "step": 3362 + }, + { + "epoch": 0.5490388147422555, + "grad_norm": 2.1324260234832764, + "learning_rate": 1.9971898901555173e-05, + "loss": 0.7237, + "step": 3363 + }, + { + "epoch": 0.5492020733847598, + "grad_norm": 2.3569600582122803, + "learning_rate": 1.9971875123424006e-05, + "loss": 0.9277, + "step": 3364 + }, + { + "epoch": 0.5493653320272642, + "grad_norm": 2.323190689086914, + "learning_rate": 1.9971851335251162e-05, + "loss": 0.8148, + "step": 3365 + }, + { + "epoch": 0.5495285906697686, + "grad_norm": 2.846837282180786, + "learning_rate": 1.9971827537036664e-05, + "loss": 1.0011, + "step": 3366 + }, + { + "epoch": 0.549691849312273, + "grad_norm": 2.1637978553771973, + "learning_rate": 1.997180372878054e-05, + "loss": 0.8356, + "step": 3367 + }, + { + "epoch": 0.5498551079547773, + "grad_norm": 2.0121238231658936, + "learning_rate": 1.997177991048281e-05, + "loss": 0.7152, + "step": 3368 + }, + { + "epoch": 0.5500183665972818, + "grad_norm": 2.341517210006714, + "learning_rate": 1.9971756082143504e-05, + "loss": 0.8444, + "step": 3369 + }, + { + "epoch": 0.5501816252397861, + "grad_norm": 2.399437427520752, + "learning_rate": 1.9971732243762643e-05, + "loss": 0.908, + "step": 3370 + }, + { + "epoch": 0.5503448838822905, + "grad_norm": 2.1694765090942383, + "learning_rate": 1.9971708395340247e-05, + "loss": 0.8396, + "step": 3371 + }, + { + "epoch": 0.5505081425247949, + "grad_norm": 2.0025227069854736, + "learning_rate": 1.9971684536876347e-05, + "loss": 0.7701, + "step": 3372 + }, + { + "epoch": 0.5506714011672993, + "grad_norm": 2.12707257270813, + "learning_rate": 1.997166066837096e-05, + "loss": 0.9347, + "step": 3373 + }, + { + "epoch": 0.5508346598098037, + "grad_norm": 2.4536960124969482, + "learning_rate": 1.9971636789824114e-05, + "loss": 0.925, + "step": 3374 + }, + { + "epoch": 0.550997918452308, + "grad_norm": 2.416557550430298, + "learning_rate": 1.9971612901235832e-05, + "loss": 0.9445, + "step": 3375 + }, + { + "epoch": 0.5511611770948125, + "grad_norm": 1.9669743776321411, + "learning_rate": 1.997158900260614e-05, + "loss": 0.7123, + "step": 3376 + }, + { + "epoch": 0.5513244357373168, + "grad_norm": 1.9152058362960815, + "learning_rate": 1.997156509393506e-05, + "loss": 0.9422, + "step": 3377 + }, + { + "epoch": 0.5514876943798213, + "grad_norm": 2.189948558807373, + "learning_rate": 1.9971541175222618e-05, + "loss": 1.3452, + "step": 3378 + }, + { + "epoch": 0.5516509530223256, + "grad_norm": 2.670701503753662, + "learning_rate": 1.9971517246468834e-05, + "loss": 0.9336, + "step": 3379 + }, + { + "epoch": 0.55181421166483, + "grad_norm": 2.3493216037750244, + "learning_rate": 1.9971493307673735e-05, + "loss": 1.0201, + "step": 3380 + }, + { + "epoch": 0.5519774703073344, + "grad_norm": 1.6180508136749268, + "learning_rate": 1.9971469358837348e-05, + "loss": 0.6577, + "step": 3381 + }, + { + "epoch": 0.5521407289498388, + "grad_norm": 2.2462778091430664, + "learning_rate": 1.997144539995969e-05, + "loss": 0.8901, + "step": 3382 + }, + { + "epoch": 0.5523039875923431, + "grad_norm": 1.9646728038787842, + "learning_rate": 1.9971421431040793e-05, + "loss": 0.9873, + "step": 3383 + }, + { + "epoch": 0.5524672462348476, + "grad_norm": 1.9699361324310303, + "learning_rate": 1.9971397452080673e-05, + "loss": 0.8236, + "step": 3384 + }, + { + "epoch": 0.552630504877352, + "grad_norm": 2.031480550765991, + "learning_rate": 1.9971373463079363e-05, + "loss": 0.8913, + "step": 3385 + }, + { + "epoch": 0.5527937635198563, + "grad_norm": 2.3946778774261475, + "learning_rate": 1.997134946403688e-05, + "loss": 0.919, + "step": 3386 + }, + { + "epoch": 0.5529570221623608, + "grad_norm": 2.1686277389526367, + "learning_rate": 1.997132545495325e-05, + "loss": 0.9801, + "step": 3387 + }, + { + "epoch": 0.5531202808048651, + "grad_norm": 2.0874907970428467, + "learning_rate": 1.99713014358285e-05, + "loss": 0.9114, + "step": 3388 + }, + { + "epoch": 0.5532835394473695, + "grad_norm": 2.0704283714294434, + "learning_rate": 1.997127740666265e-05, + "loss": 0.8794, + "step": 3389 + }, + { + "epoch": 0.5534467980898738, + "grad_norm": 2.326942205429077, + "learning_rate": 1.9971253367455728e-05, + "loss": 1.0979, + "step": 3390 + }, + { + "epoch": 0.5536100567323783, + "grad_norm": 1.8089680671691895, + "learning_rate": 1.9971229318207753e-05, + "loss": 0.8726, + "step": 3391 + }, + { + "epoch": 0.5537733153748826, + "grad_norm": 2.3091816902160645, + "learning_rate": 1.9971205258918758e-05, + "loss": 1.4083, + "step": 3392 + }, + { + "epoch": 0.553936574017387, + "grad_norm": 1.720828652381897, + "learning_rate": 1.9971181189588756e-05, + "loss": 0.7107, + "step": 3393 + }, + { + "epoch": 0.5540998326598914, + "grad_norm": 1.8170922994613647, + "learning_rate": 1.9971157110217782e-05, + "loss": 0.7238, + "step": 3394 + }, + { + "epoch": 0.5542630913023958, + "grad_norm": 2.1354002952575684, + "learning_rate": 1.9971133020805856e-05, + "loss": 0.8321, + "step": 3395 + }, + { + "epoch": 0.5544263499449003, + "grad_norm": 2.0781266689300537, + "learning_rate": 1.9971108921352998e-05, + "loss": 0.6815, + "step": 3396 + }, + { + "epoch": 0.5545896085874046, + "grad_norm": 2.0168750286102295, + "learning_rate": 1.997108481185924e-05, + "loss": 0.7504, + "step": 3397 + }, + { + "epoch": 0.554752867229909, + "grad_norm": 2.2027952671051025, + "learning_rate": 1.99710606923246e-05, + "loss": 0.795, + "step": 3398 + }, + { + "epoch": 0.5549161258724133, + "grad_norm": 2.126131057739258, + "learning_rate": 1.99710365627491e-05, + "loss": 0.7701, + "step": 3399 + }, + { + "epoch": 0.5550793845149178, + "grad_norm": 1.6917164325714111, + "learning_rate": 1.9971012423132776e-05, + "loss": 0.7215, + "step": 3400 + }, + { + "epoch": 0.5552426431574221, + "grad_norm": 1.962458848953247, + "learning_rate": 1.9970988273475642e-05, + "loss": 0.7456, + "step": 3401 + }, + { + "epoch": 0.5554059017999265, + "grad_norm": 2.2335526943206787, + "learning_rate": 1.9970964113777725e-05, + "loss": 0.8518, + "step": 3402 + }, + { + "epoch": 0.5555691604424309, + "grad_norm": 2.119255781173706, + "learning_rate": 1.9970939944039052e-05, + "loss": 0.9137, + "step": 3403 + }, + { + "epoch": 0.5557324190849353, + "grad_norm": 2.1925292015075684, + "learning_rate": 1.9970915764259644e-05, + "loss": 0.7889, + "step": 3404 + }, + { + "epoch": 0.5558956777274396, + "grad_norm": 2.054605484008789, + "learning_rate": 1.9970891574439524e-05, + "loss": 0.8136, + "step": 3405 + }, + { + "epoch": 0.5560589363699441, + "grad_norm": 2.2178456783294678, + "learning_rate": 1.9970867374578724e-05, + "loss": 0.8551, + "step": 3406 + }, + { + "epoch": 0.5562221950124485, + "grad_norm": 1.9237934350967407, + "learning_rate": 1.9970843164677262e-05, + "loss": 0.6965, + "step": 3407 + }, + { + "epoch": 0.5563854536549528, + "grad_norm": 2.1611223220825195, + "learning_rate": 1.997081894473516e-05, + "loss": 0.834, + "step": 3408 + }, + { + "epoch": 0.5565487122974573, + "grad_norm": 2.290708541870117, + "learning_rate": 1.9970794714752448e-05, + "loss": 0.8227, + "step": 3409 + }, + { + "epoch": 0.5567119709399616, + "grad_norm": 2.1337385177612305, + "learning_rate": 1.9970770474729146e-05, + "loss": 0.8777, + "step": 3410 + }, + { + "epoch": 0.556875229582466, + "grad_norm": 2.221975564956665, + "learning_rate": 1.9970746224665282e-05, + "loss": 0.9283, + "step": 3411 + }, + { + "epoch": 0.5570384882249704, + "grad_norm": 2.21610426902771, + "learning_rate": 1.9970721964560882e-05, + "loss": 0.8254, + "step": 3412 + }, + { + "epoch": 0.5572017468674748, + "grad_norm": 2.266871452331543, + "learning_rate": 1.9970697694415967e-05, + "loss": 0.9072, + "step": 3413 + }, + { + "epoch": 0.5573650055099791, + "grad_norm": 2.2100205421447754, + "learning_rate": 1.997067341423056e-05, + "loss": 0.8356, + "step": 3414 + }, + { + "epoch": 0.5575282641524836, + "grad_norm": 2.517849922180176, + "learning_rate": 1.9970649124004687e-05, + "loss": 1.0958, + "step": 3415 + }, + { + "epoch": 0.557691522794988, + "grad_norm": 1.908424973487854, + "learning_rate": 1.9970624823738376e-05, + "loss": 0.719, + "step": 3416 + }, + { + "epoch": 0.5578547814374923, + "grad_norm": 2.277839183807373, + "learning_rate": 1.9970600513431645e-05, + "loss": 0.9011, + "step": 3417 + }, + { + "epoch": 0.5580180400799968, + "grad_norm": 1.965239405632019, + "learning_rate": 1.9970576193084524e-05, + "loss": 0.8449, + "step": 3418 + }, + { + "epoch": 0.5581812987225011, + "grad_norm": 3.430443048477173, + "learning_rate": 1.9970551862697037e-05, + "loss": 1.0985, + "step": 3419 + }, + { + "epoch": 0.5583445573650055, + "grad_norm": 2.2285091876983643, + "learning_rate": 1.9970527522269204e-05, + "loss": 0.7707, + "step": 3420 + }, + { + "epoch": 0.5585078160075099, + "grad_norm": 1.9810905456542969, + "learning_rate": 1.9970503171801053e-05, + "loss": 0.8253, + "step": 3421 + }, + { + "epoch": 0.5586710746500143, + "grad_norm": 2.501267671585083, + "learning_rate": 1.997047881129261e-05, + "loss": 0.9598, + "step": 3422 + }, + { + "epoch": 0.5588343332925186, + "grad_norm": 2.3219518661499023, + "learning_rate": 1.9970454440743893e-05, + "loss": 0.9445, + "step": 3423 + }, + { + "epoch": 0.5589975919350231, + "grad_norm": 2.236375570297241, + "learning_rate": 1.9970430060154938e-05, + "loss": 0.9506, + "step": 3424 + }, + { + "epoch": 0.5591608505775274, + "grad_norm": 2.064094066619873, + "learning_rate": 1.9970405669525756e-05, + "loss": 0.7553, + "step": 3425 + }, + { + "epoch": 0.5593241092200318, + "grad_norm": 2.0976550579071045, + "learning_rate": 1.997038126885638e-05, + "loss": 0.8452, + "step": 3426 + }, + { + "epoch": 0.5594873678625363, + "grad_norm": 2.2344675064086914, + "learning_rate": 1.9970356858146833e-05, + "loss": 1.004, + "step": 3427 + }, + { + "epoch": 0.5596506265050406, + "grad_norm": 1.7670211791992188, + "learning_rate": 1.997033243739714e-05, + "loss": 0.6927, + "step": 3428 + }, + { + "epoch": 0.559813885147545, + "grad_norm": 1.8610122203826904, + "learning_rate": 1.9970308006607327e-05, + "loss": 0.8392, + "step": 3429 + }, + { + "epoch": 0.5599771437900494, + "grad_norm": 2.0461933612823486, + "learning_rate": 1.997028356577741e-05, + "loss": 0.8206, + "step": 3430 + }, + { + "epoch": 0.5601404024325538, + "grad_norm": 2.8661203384399414, + "learning_rate": 1.9970259114907428e-05, + "loss": 0.8657, + "step": 3431 + }, + { + "epoch": 0.5603036610750581, + "grad_norm": 1.7290693521499634, + "learning_rate": 1.9970234653997395e-05, + "loss": 0.7571, + "step": 3432 + }, + { + "epoch": 0.5604669197175626, + "grad_norm": 1.7906416654586792, + "learning_rate": 1.9970210183047335e-05, + "loss": 0.8287, + "step": 3433 + }, + { + "epoch": 0.5606301783600669, + "grad_norm": 2.152283191680908, + "learning_rate": 1.9970185702057278e-05, + "loss": 0.8399, + "step": 3434 + }, + { + "epoch": 0.5607934370025713, + "grad_norm": 2.637589454650879, + "learning_rate": 1.9970161211027248e-05, + "loss": 0.8584, + "step": 3435 + }, + { + "epoch": 0.5609566956450757, + "grad_norm": 1.7848567962646484, + "learning_rate": 1.9970136709957265e-05, + "loss": 0.6849, + "step": 3436 + }, + { + "epoch": 0.5611199542875801, + "grad_norm": 2.3367159366607666, + "learning_rate": 1.997011219884736e-05, + "loss": 0.7873, + "step": 3437 + }, + { + "epoch": 0.5612832129300845, + "grad_norm": 2.27604341506958, + "learning_rate": 1.997008767769755e-05, + "loss": 0.9516, + "step": 3438 + }, + { + "epoch": 0.5614464715725889, + "grad_norm": 2.1098456382751465, + "learning_rate": 1.9970063146507873e-05, + "loss": 0.8121, + "step": 3439 + }, + { + "epoch": 0.5616097302150933, + "grad_norm": 2.8377089500427246, + "learning_rate": 1.997003860527834e-05, + "loss": 1.0012, + "step": 3440 + }, + { + "epoch": 0.5617729888575976, + "grad_norm": 1.9472919702529907, + "learning_rate": 1.997001405400898e-05, + "loss": 0.7443, + "step": 3441 + }, + { + "epoch": 0.5619362475001021, + "grad_norm": 2.575282573699951, + "learning_rate": 1.996998949269982e-05, + "loss": 1.0491, + "step": 3442 + }, + { + "epoch": 0.5620995061426064, + "grad_norm": 1.89371657371521, + "learning_rate": 1.996996492135088e-05, + "loss": 0.7521, + "step": 3443 + }, + { + "epoch": 0.5622627647851108, + "grad_norm": 2.2644269466400146, + "learning_rate": 1.9969940339962192e-05, + "loss": 0.9906, + "step": 3444 + }, + { + "epoch": 0.5624260234276152, + "grad_norm": 1.9731885194778442, + "learning_rate": 1.9969915748533774e-05, + "loss": 0.873, + "step": 3445 + }, + { + "epoch": 0.5625892820701196, + "grad_norm": 2.233092784881592, + "learning_rate": 1.9969891147065657e-05, + "loss": 0.8655, + "step": 3446 + }, + { + "epoch": 0.5627525407126239, + "grad_norm": 1.8233085870742798, + "learning_rate": 1.996986653555786e-05, + "loss": 0.8324, + "step": 3447 + }, + { + "epoch": 0.5629157993551284, + "grad_norm": 1.8706403970718384, + "learning_rate": 1.996984191401041e-05, + "loss": 0.7154, + "step": 3448 + }, + { + "epoch": 0.5630790579976328, + "grad_norm": 1.9819782972335815, + "learning_rate": 1.9969817282423332e-05, + "loss": 0.8228, + "step": 3449 + }, + { + "epoch": 0.5632423166401371, + "grad_norm": 1.7624868154525757, + "learning_rate": 1.996979264079665e-05, + "loss": 0.827, + "step": 3450 + }, + { + "epoch": 0.5634055752826416, + "grad_norm": 2.0551652908325195, + "learning_rate": 1.996976798913039e-05, + "loss": 0.8641, + "step": 3451 + }, + { + "epoch": 0.5635688339251459, + "grad_norm": 2.5862843990325928, + "learning_rate": 1.9969743327424574e-05, + "loss": 1.0906, + "step": 3452 + }, + { + "epoch": 0.5637320925676503, + "grad_norm": 1.7549127340316772, + "learning_rate": 1.9969718655679235e-05, + "loss": 0.6891, + "step": 3453 + }, + { + "epoch": 0.5638953512101547, + "grad_norm": 2.14625883102417, + "learning_rate": 1.9969693973894387e-05, + "loss": 0.8222, + "step": 3454 + }, + { + "epoch": 0.5640586098526591, + "grad_norm": 2.0105526447296143, + "learning_rate": 1.996966928207006e-05, + "loss": 0.8607, + "step": 3455 + }, + { + "epoch": 0.5642218684951634, + "grad_norm": 1.8692042827606201, + "learning_rate": 1.996964458020628e-05, + "loss": 0.6152, + "step": 3456 + }, + { + "epoch": 0.5643851271376679, + "grad_norm": 2.189004421234131, + "learning_rate": 1.996961986830307e-05, + "loss": 0.9449, + "step": 3457 + }, + { + "epoch": 0.5645483857801722, + "grad_norm": 2.206759452819824, + "learning_rate": 1.996959514636046e-05, + "loss": 0.8098, + "step": 3458 + }, + { + "epoch": 0.5647116444226766, + "grad_norm": 1.8337886333465576, + "learning_rate": 1.9969570414378463e-05, + "loss": 0.7908, + "step": 3459 + }, + { + "epoch": 0.5648749030651811, + "grad_norm": 2.0800657272338867, + "learning_rate": 1.9969545672357117e-05, + "loss": 0.8759, + "step": 3460 + }, + { + "epoch": 0.5650381617076854, + "grad_norm": 2.2918269634246826, + "learning_rate": 1.9969520920296436e-05, + "loss": 0.8315, + "step": 3461 + }, + { + "epoch": 0.5652014203501898, + "grad_norm": 2.3565824031829834, + "learning_rate": 1.9969496158196452e-05, + "loss": 0.8446, + "step": 3462 + }, + { + "epoch": 0.5653646789926942, + "grad_norm": 2.054570436477661, + "learning_rate": 1.996947138605719e-05, + "loss": 0.8475, + "step": 3463 + }, + { + "epoch": 0.5655279376351986, + "grad_norm": 1.9744058847427368, + "learning_rate": 1.9969446603878673e-05, + "loss": 0.8297, + "step": 3464 + }, + { + "epoch": 0.5656911962777029, + "grad_norm": 2.1881208419799805, + "learning_rate": 1.9969421811660922e-05, + "loss": 1.0169, + "step": 3465 + }, + { + "epoch": 0.5658544549202074, + "grad_norm": 1.7953952550888062, + "learning_rate": 1.9969397009403967e-05, + "loss": 0.7628, + "step": 3466 + }, + { + "epoch": 0.5660177135627117, + "grad_norm": 2.4982221126556396, + "learning_rate": 1.9969372197107835e-05, + "loss": 0.8465, + "step": 3467 + }, + { + "epoch": 0.5661809722052161, + "grad_norm": 2.2998929023742676, + "learning_rate": 1.9969347374772547e-05, + "loss": 1.0015, + "step": 3468 + }, + { + "epoch": 0.5663442308477205, + "grad_norm": 1.9936132431030273, + "learning_rate": 1.9969322542398126e-05, + "loss": 0.7333, + "step": 3469 + }, + { + "epoch": 0.5665074894902249, + "grad_norm": 1.7774120569229126, + "learning_rate": 1.9969297699984606e-05, + "loss": 0.7523, + "step": 3470 + }, + { + "epoch": 0.5666707481327293, + "grad_norm": 2.0726122856140137, + "learning_rate": 1.9969272847532e-05, + "loss": 0.8299, + "step": 3471 + }, + { + "epoch": 0.5668340067752337, + "grad_norm": 2.009347677230835, + "learning_rate": 1.996924798504034e-05, + "loss": 1.0534, + "step": 3472 + }, + { + "epoch": 0.5669972654177381, + "grad_norm": 1.7794783115386963, + "learning_rate": 1.996922311250965e-05, + "loss": 0.6734, + "step": 3473 + }, + { + "epoch": 0.5671605240602424, + "grad_norm": 2.088350534439087, + "learning_rate": 1.9969198229939955e-05, + "loss": 0.728, + "step": 3474 + }, + { + "epoch": 0.5673237827027469, + "grad_norm": 2.16640567779541, + "learning_rate": 1.9969173337331283e-05, + "loss": 0.8941, + "step": 3475 + }, + { + "epoch": 0.5674870413452512, + "grad_norm": 2.1613821983337402, + "learning_rate": 1.996914843468365e-05, + "loss": 0.8288, + "step": 3476 + }, + { + "epoch": 0.5676502999877556, + "grad_norm": 2.108583688735962, + "learning_rate": 1.9969123521997092e-05, + "loss": 0.8162, + "step": 3477 + }, + { + "epoch": 0.56781355863026, + "grad_norm": 2.116083860397339, + "learning_rate": 1.996909859927163e-05, + "loss": 0.9621, + "step": 3478 + }, + { + "epoch": 0.5679768172727644, + "grad_norm": 2.1136112213134766, + "learning_rate": 1.9969073666507283e-05, + "loss": 0.798, + "step": 3479 + }, + { + "epoch": 0.5681400759152687, + "grad_norm": 2.1535415649414062, + "learning_rate": 1.996904872370409e-05, + "loss": 0.9049, + "step": 3480 + }, + { + "epoch": 0.5683033345577732, + "grad_norm": 2.131030559539795, + "learning_rate": 1.996902377086206e-05, + "loss": 0.7919, + "step": 3481 + }, + { + "epoch": 0.5684665932002776, + "grad_norm": 1.9921826124191284, + "learning_rate": 1.9968998807981224e-05, + "loss": 0.7865, + "step": 3482 + }, + { + "epoch": 0.5686298518427819, + "grad_norm": 2.3258039951324463, + "learning_rate": 1.9968973835061615e-05, + "loss": 0.8733, + "step": 3483 + }, + { + "epoch": 0.5687931104852864, + "grad_norm": 2.5045089721679688, + "learning_rate": 1.9968948852103252e-05, + "loss": 0.8565, + "step": 3484 + }, + { + "epoch": 0.5689563691277907, + "grad_norm": 2.0942046642303467, + "learning_rate": 1.9968923859106156e-05, + "loss": 0.9612, + "step": 3485 + }, + { + "epoch": 0.5691196277702951, + "grad_norm": 1.9553985595703125, + "learning_rate": 1.996889885607036e-05, + "loss": 0.9012, + "step": 3486 + }, + { + "epoch": 0.5692828864127994, + "grad_norm": 1.8133631944656372, + "learning_rate": 1.9968873842995884e-05, + "loss": 0.7124, + "step": 3487 + }, + { + "epoch": 0.5694461450553039, + "grad_norm": 2.2276971340179443, + "learning_rate": 1.9968848819882755e-05, + "loss": 0.6877, + "step": 3488 + }, + { + "epoch": 0.5696094036978082, + "grad_norm": 2.2094480991363525, + "learning_rate": 1.9968823786730995e-05, + "loss": 0.8508, + "step": 3489 + }, + { + "epoch": 0.5697726623403127, + "grad_norm": 1.9274355173110962, + "learning_rate": 1.9968798743540638e-05, + "loss": 0.8486, + "step": 3490 + }, + { + "epoch": 0.569935920982817, + "grad_norm": 2.271136522293091, + "learning_rate": 1.9968773690311696e-05, + "loss": 0.8311, + "step": 3491 + }, + { + "epoch": 0.5700991796253214, + "grad_norm": 1.7240419387817383, + "learning_rate": 1.996874862704421e-05, + "loss": 0.7199, + "step": 3492 + }, + { + "epoch": 0.5702624382678259, + "grad_norm": 2.30059814453125, + "learning_rate": 1.996872355373819e-05, + "loss": 0.9258, + "step": 3493 + }, + { + "epoch": 0.5704256969103302, + "grad_norm": 2.240341901779175, + "learning_rate": 1.996869847039367e-05, + "loss": 0.727, + "step": 3494 + }, + { + "epoch": 0.5705889555528346, + "grad_norm": 1.8499248027801514, + "learning_rate": 1.9968673377010672e-05, + "loss": 0.6413, + "step": 3495 + }, + { + "epoch": 0.570752214195339, + "grad_norm": 1.9977656602859497, + "learning_rate": 1.9968648273589225e-05, + "loss": 0.9071, + "step": 3496 + }, + { + "epoch": 0.5709154728378434, + "grad_norm": 1.923082947731018, + "learning_rate": 1.9968623160129353e-05, + "loss": 0.8189, + "step": 3497 + }, + { + "epoch": 0.5710787314803477, + "grad_norm": 1.8215272426605225, + "learning_rate": 1.9968598036631077e-05, + "loss": 0.7802, + "step": 3498 + }, + { + "epoch": 0.5712419901228522, + "grad_norm": 2.44059157371521, + "learning_rate": 1.9968572903094427e-05, + "loss": 0.8205, + "step": 3499 + }, + { + "epoch": 0.5714052487653565, + "grad_norm": 1.7760988473892212, + "learning_rate": 1.9968547759519426e-05, + "loss": 0.7798, + "step": 3500 + }, + { + "epoch": 0.5715685074078609, + "grad_norm": 2.06912899017334, + "learning_rate": 1.9968522605906097e-05, + "loss": 0.9287, + "step": 3501 + }, + { + "epoch": 0.5717317660503652, + "grad_norm": 2.350764036178589, + "learning_rate": 1.9968497442254474e-05, + "loss": 0.9821, + "step": 3502 + }, + { + "epoch": 0.5718950246928697, + "grad_norm": 2.0147688388824463, + "learning_rate": 1.9968472268564573e-05, + "loss": 0.8077, + "step": 3503 + }, + { + "epoch": 0.5720582833353741, + "grad_norm": 2.2154200077056885, + "learning_rate": 1.9968447084836423e-05, + "loss": 0.8475, + "step": 3504 + }, + { + "epoch": 0.5722215419778784, + "grad_norm": 2.128984212875366, + "learning_rate": 1.9968421891070052e-05, + "loss": 1.0071, + "step": 3505 + }, + { + "epoch": 0.5723848006203829, + "grad_norm": 2.417509078979492, + "learning_rate": 1.9968396687265483e-05, + "loss": 0.9865, + "step": 3506 + }, + { + "epoch": 0.5725480592628872, + "grad_norm": 2.044718027114868, + "learning_rate": 1.9968371473422737e-05, + "loss": 1.075, + "step": 3507 + }, + { + "epoch": 0.5727113179053916, + "grad_norm": 2.431800365447998, + "learning_rate": 1.9968346249541848e-05, + "loss": 0.8866, + "step": 3508 + }, + { + "epoch": 0.572874576547896, + "grad_norm": 2.267066240310669, + "learning_rate": 1.9968321015622836e-05, + "loss": 0.9074, + "step": 3509 + }, + { + "epoch": 0.5730378351904004, + "grad_norm": 1.9672914743423462, + "learning_rate": 1.9968295771665727e-05, + "loss": 0.7516, + "step": 3510 + }, + { + "epoch": 0.5732010938329047, + "grad_norm": 2.1989762783050537, + "learning_rate": 1.9968270517670546e-05, + "loss": 0.8349, + "step": 3511 + }, + { + "epoch": 0.5733643524754092, + "grad_norm": 2.4410431385040283, + "learning_rate": 1.996824525363732e-05, + "loss": 0.918, + "step": 3512 + }, + { + "epoch": 0.5735276111179135, + "grad_norm": 1.757378101348877, + "learning_rate": 1.9968219979566073e-05, + "loss": 0.818, + "step": 3513 + }, + { + "epoch": 0.5736908697604179, + "grad_norm": 2.1554975509643555, + "learning_rate": 1.996819469545683e-05, + "loss": 1.0567, + "step": 3514 + }, + { + "epoch": 0.5738541284029224, + "grad_norm": 2.4001452922821045, + "learning_rate": 1.996816940130962e-05, + "loss": 1.0932, + "step": 3515 + }, + { + "epoch": 0.5740173870454267, + "grad_norm": 2.1639411449432373, + "learning_rate": 1.9968144097124467e-05, + "loss": 0.7229, + "step": 3516 + }, + { + "epoch": 0.5741806456879311, + "grad_norm": 2.1429758071899414, + "learning_rate": 1.9968118782901395e-05, + "loss": 0.8223, + "step": 3517 + }, + { + "epoch": 0.5743439043304355, + "grad_norm": 2.041686534881592, + "learning_rate": 1.9968093458640426e-05, + "loss": 0.9954, + "step": 3518 + }, + { + "epoch": 0.5745071629729399, + "grad_norm": 1.8341180086135864, + "learning_rate": 1.9968068124341593e-05, + "loss": 0.6653, + "step": 3519 + }, + { + "epoch": 0.5746704216154442, + "grad_norm": 2.1523661613464355, + "learning_rate": 1.9968042780004917e-05, + "loss": 1.047, + "step": 3520 + }, + { + "epoch": 0.5748336802579487, + "grad_norm": 1.844008207321167, + "learning_rate": 1.9968017425630426e-05, + "loss": 0.7597, + "step": 3521 + }, + { + "epoch": 0.574996938900453, + "grad_norm": 2.1349868774414062, + "learning_rate": 1.996799206121814e-05, + "loss": 0.7207, + "step": 3522 + }, + { + "epoch": 0.5751601975429574, + "grad_norm": 2.0012898445129395, + "learning_rate": 1.9967966686768096e-05, + "loss": 0.7216, + "step": 3523 + }, + { + "epoch": 0.5753234561854618, + "grad_norm": 2.0907328128814697, + "learning_rate": 1.9967941302280307e-05, + "loss": 0.9239, + "step": 3524 + }, + { + "epoch": 0.5754867148279662, + "grad_norm": 1.8208011388778687, + "learning_rate": 1.99679159077548e-05, + "loss": 0.7155, + "step": 3525 + }, + { + "epoch": 0.5756499734704706, + "grad_norm": 2.0465950965881348, + "learning_rate": 1.9967890503191613e-05, + "loss": 1.0453, + "step": 3526 + }, + { + "epoch": 0.575813232112975, + "grad_norm": 1.9289677143096924, + "learning_rate": 1.996786508859076e-05, + "loss": 0.9206, + "step": 3527 + }, + { + "epoch": 0.5759764907554794, + "grad_norm": 2.235194444656372, + "learning_rate": 1.9967839663952267e-05, + "loss": 0.9156, + "step": 3528 + }, + { + "epoch": 0.5761397493979837, + "grad_norm": 1.7359317541122437, + "learning_rate": 1.9967814229276163e-05, + "loss": 0.7562, + "step": 3529 + }, + { + "epoch": 0.5763030080404882, + "grad_norm": 1.7814279794692993, + "learning_rate": 1.9967788784562474e-05, + "loss": 0.6828, + "step": 3530 + }, + { + "epoch": 0.5764662666829925, + "grad_norm": 2.071117639541626, + "learning_rate": 1.9967763329811222e-05, + "loss": 0.8092, + "step": 3531 + }, + { + "epoch": 0.5766295253254969, + "grad_norm": 2.035979986190796, + "learning_rate": 1.9967737865022436e-05, + "loss": 0.8125, + "step": 3532 + }, + { + "epoch": 0.5767927839680013, + "grad_norm": 2.5577938556671143, + "learning_rate": 1.9967712390196144e-05, + "loss": 0.7574, + "step": 3533 + }, + { + "epoch": 0.5769560426105057, + "grad_norm": 1.939315915107727, + "learning_rate": 1.9967686905332365e-05, + "loss": 0.7368, + "step": 3534 + }, + { + "epoch": 0.57711930125301, + "grad_norm": 1.9030582904815674, + "learning_rate": 1.9967661410431128e-05, + "loss": 0.7362, + "step": 3535 + }, + { + "epoch": 0.5772825598955145, + "grad_norm": 1.9101777076721191, + "learning_rate": 1.996763590549246e-05, + "loss": 0.9099, + "step": 3536 + }, + { + "epoch": 0.5774458185380189, + "grad_norm": 2.0086679458618164, + "learning_rate": 1.9967610390516384e-05, + "loss": 0.7953, + "step": 3537 + }, + { + "epoch": 0.5776090771805232, + "grad_norm": 2.229421854019165, + "learning_rate": 1.9967584865502925e-05, + "loss": 0.9647, + "step": 3538 + }, + { + "epoch": 0.5777723358230277, + "grad_norm": 1.8451991081237793, + "learning_rate": 1.9967559330452113e-05, + "loss": 0.7281, + "step": 3539 + }, + { + "epoch": 0.577935594465532, + "grad_norm": 2.137878894805908, + "learning_rate": 1.996753378536397e-05, + "loss": 0.9125, + "step": 3540 + }, + { + "epoch": 0.5780988531080364, + "grad_norm": 1.878936529159546, + "learning_rate": 1.9967508230238524e-05, + "loss": 0.7519, + "step": 3541 + }, + { + "epoch": 0.5782621117505408, + "grad_norm": 2.218960762023926, + "learning_rate": 1.9967482665075802e-05, + "loss": 0.8717, + "step": 3542 + }, + { + "epoch": 0.5784253703930452, + "grad_norm": 2.2280657291412354, + "learning_rate": 1.9967457089875824e-05, + "loss": 0.8948, + "step": 3543 + }, + { + "epoch": 0.5785886290355495, + "grad_norm": 2.416912317276001, + "learning_rate": 1.9967431504638624e-05, + "loss": 0.9698, + "step": 3544 + }, + { + "epoch": 0.578751887678054, + "grad_norm": 2.0678751468658447, + "learning_rate": 1.9967405909364216e-05, + "loss": 0.8646, + "step": 3545 + }, + { + "epoch": 0.5789151463205583, + "grad_norm": 2.2173993587493896, + "learning_rate": 1.996738030405264e-05, + "loss": 0.8275, + "step": 3546 + }, + { + "epoch": 0.5790784049630627, + "grad_norm": 1.7599059343338013, + "learning_rate": 1.996735468870391e-05, + "loss": 0.6167, + "step": 3547 + }, + { + "epoch": 0.5792416636055672, + "grad_norm": 2.010413646697998, + "learning_rate": 1.9967329063318058e-05, + "loss": 0.7683, + "step": 3548 + }, + { + "epoch": 0.5794049222480715, + "grad_norm": 1.968126654624939, + "learning_rate": 1.996730342789511e-05, + "loss": 0.8275, + "step": 3549 + }, + { + "epoch": 0.5795681808905759, + "grad_norm": 2.081322431564331, + "learning_rate": 1.996727778243509e-05, + "loss": 0.8272, + "step": 3550 + }, + { + "epoch": 0.5797314395330803, + "grad_norm": 2.2533774375915527, + "learning_rate": 1.996725212693802e-05, + "loss": 0.8824, + "step": 3551 + }, + { + "epoch": 0.5798946981755847, + "grad_norm": 2.0859153270721436, + "learning_rate": 1.9967226461403934e-05, + "loss": 0.7182, + "step": 3552 + }, + { + "epoch": 0.580057956818089, + "grad_norm": 1.9190319776535034, + "learning_rate": 1.9967200785832853e-05, + "loss": 0.825, + "step": 3553 + }, + { + "epoch": 0.5802212154605935, + "grad_norm": 2.0208308696746826, + "learning_rate": 1.9967175100224803e-05, + "loss": 0.773, + "step": 3554 + }, + { + "epoch": 0.5803844741030978, + "grad_norm": 2.127307653427124, + "learning_rate": 1.996714940457981e-05, + "loss": 0.813, + "step": 3555 + }, + { + "epoch": 0.5805477327456022, + "grad_norm": 2.342010021209717, + "learning_rate": 1.9967123698897896e-05, + "loss": 0.9685, + "step": 3556 + }, + { + "epoch": 0.5807109913881066, + "grad_norm": 2.3246588706970215, + "learning_rate": 1.9967097983179096e-05, + "loss": 0.7986, + "step": 3557 + }, + { + "epoch": 0.580874250030611, + "grad_norm": 1.9008607864379883, + "learning_rate": 1.996707225742343e-05, + "loss": 0.7681, + "step": 3558 + }, + { + "epoch": 0.5810375086731154, + "grad_norm": 2.0052239894866943, + "learning_rate": 1.9967046521630925e-05, + "loss": 0.9682, + "step": 3559 + }, + { + "epoch": 0.5812007673156198, + "grad_norm": 1.9885621070861816, + "learning_rate": 1.9967020775801605e-05, + "loss": 0.9662, + "step": 3560 + }, + { + "epoch": 0.5813640259581242, + "grad_norm": 2.164640426635742, + "learning_rate": 1.99669950199355e-05, + "loss": 0.9055, + "step": 3561 + }, + { + "epoch": 0.5815272846006285, + "grad_norm": 2.1173012256622314, + "learning_rate": 1.9966969254032637e-05, + "loss": 0.8021, + "step": 3562 + }, + { + "epoch": 0.581690543243133, + "grad_norm": 2.1482012271881104, + "learning_rate": 1.996694347809303e-05, + "loss": 0.6883, + "step": 3563 + }, + { + "epoch": 0.5818538018856373, + "grad_norm": 1.790844202041626, + "learning_rate": 1.996691769211672e-05, + "loss": 0.7198, + "step": 3564 + }, + { + "epoch": 0.5820170605281417, + "grad_norm": 1.9320993423461914, + "learning_rate": 1.9966891896103723e-05, + "loss": 0.8342, + "step": 3565 + }, + { + "epoch": 0.582180319170646, + "grad_norm": 2.5462567806243896, + "learning_rate": 1.996686609005407e-05, + "loss": 0.9056, + "step": 3566 + }, + { + "epoch": 0.5823435778131505, + "grad_norm": 1.8899462223052979, + "learning_rate": 1.996684027396779e-05, + "loss": 0.859, + "step": 3567 + }, + { + "epoch": 0.5825068364556548, + "grad_norm": 1.9892734289169312, + "learning_rate": 1.9966814447844898e-05, + "loss": 0.8492, + "step": 3568 + }, + { + "epoch": 0.5826700950981593, + "grad_norm": 2.21549129486084, + "learning_rate": 1.996678861168543e-05, + "loss": 0.9839, + "step": 3569 + }, + { + "epoch": 0.5828333537406637, + "grad_norm": 1.7828598022460938, + "learning_rate": 1.9966762765489407e-05, + "loss": 0.8695, + "step": 3570 + }, + { + "epoch": 0.582996612383168, + "grad_norm": 1.858689546585083, + "learning_rate": 1.9966736909256857e-05, + "loss": 0.9036, + "step": 3571 + }, + { + "epoch": 0.5831598710256725, + "grad_norm": 2.165029764175415, + "learning_rate": 1.9966711042987806e-05, + "loss": 0.9676, + "step": 3572 + }, + { + "epoch": 0.5833231296681768, + "grad_norm": 2.0980281829833984, + "learning_rate": 1.9966685166682276e-05, + "loss": 1.3083, + "step": 3573 + }, + { + "epoch": 0.5834863883106812, + "grad_norm": 1.7611467838287354, + "learning_rate": 1.99666592803403e-05, + "loss": 0.8083, + "step": 3574 + }, + { + "epoch": 0.5836496469531856, + "grad_norm": 2.7075891494750977, + "learning_rate": 1.99666333839619e-05, + "loss": 1.0356, + "step": 3575 + }, + { + "epoch": 0.58381290559569, + "grad_norm": 1.8931360244750977, + "learning_rate": 1.9966607477547105e-05, + "loss": 0.7758, + "step": 3576 + }, + { + "epoch": 0.5839761642381943, + "grad_norm": 1.759946584701538, + "learning_rate": 1.9966581561095933e-05, + "loss": 0.7916, + "step": 3577 + }, + { + "epoch": 0.5841394228806988, + "grad_norm": 2.396160125732422, + "learning_rate": 1.996655563460842e-05, + "loss": 1.0775, + "step": 3578 + }, + { + "epoch": 0.5843026815232031, + "grad_norm": 2.015117645263672, + "learning_rate": 1.996652969808459e-05, + "loss": 0.799, + "step": 3579 + }, + { + "epoch": 0.5844659401657075, + "grad_norm": 1.8990167379379272, + "learning_rate": 1.9966503751524467e-05, + "loss": 0.8666, + "step": 3580 + }, + { + "epoch": 0.584629198808212, + "grad_norm": 1.863648533821106, + "learning_rate": 1.9966477794928078e-05, + "loss": 0.8379, + "step": 3581 + }, + { + "epoch": 0.5847924574507163, + "grad_norm": 2.1056289672851562, + "learning_rate": 1.9966451828295445e-05, + "loss": 0.8087, + "step": 3582 + }, + { + "epoch": 0.5849557160932207, + "grad_norm": 2.0448837280273438, + "learning_rate": 1.9966425851626598e-05, + "loss": 0.7847, + "step": 3583 + }, + { + "epoch": 0.585118974735725, + "grad_norm": 2.1308772563934326, + "learning_rate": 1.9966399864921565e-05, + "loss": 0.9436, + "step": 3584 + }, + { + "epoch": 0.5852822333782295, + "grad_norm": 2.0299911499023438, + "learning_rate": 1.9966373868180367e-05, + "loss": 0.8106, + "step": 3585 + }, + { + "epoch": 0.5854454920207338, + "grad_norm": 1.900641679763794, + "learning_rate": 1.9966347861403035e-05, + "loss": 0.8712, + "step": 3586 + }, + { + "epoch": 0.5856087506632383, + "grad_norm": 2.2011184692382812, + "learning_rate": 1.9966321844589592e-05, + "loss": 0.9172, + "step": 3587 + }, + { + "epoch": 0.5857720093057426, + "grad_norm": 2.1134426593780518, + "learning_rate": 1.996629581774007e-05, + "loss": 0.8899, + "step": 3588 + }, + { + "epoch": 0.585935267948247, + "grad_norm": 1.8321781158447266, + "learning_rate": 1.9966269780854487e-05, + "loss": 0.7152, + "step": 3589 + }, + { + "epoch": 0.5860985265907513, + "grad_norm": 2.057455062866211, + "learning_rate": 1.9966243733932873e-05, + "loss": 1.0119, + "step": 3590 + }, + { + "epoch": 0.5862617852332558, + "grad_norm": 2.1324174404144287, + "learning_rate": 1.9966217676975256e-05, + "loss": 0.8262, + "step": 3591 + }, + { + "epoch": 0.5864250438757602, + "grad_norm": 2.0259804725646973, + "learning_rate": 1.9966191609981657e-05, + "loss": 0.9146, + "step": 3592 + }, + { + "epoch": 0.5865883025182645, + "grad_norm": 2.019693374633789, + "learning_rate": 1.996616553295211e-05, + "loss": 0.7471, + "step": 3593 + }, + { + "epoch": 0.586751561160769, + "grad_norm": 1.9416102170944214, + "learning_rate": 1.9966139445886633e-05, + "loss": 0.7142, + "step": 3594 + }, + { + "epoch": 0.5869148198032733, + "grad_norm": 1.8799490928649902, + "learning_rate": 1.9966113348785258e-05, + "loss": 0.7989, + "step": 3595 + }, + { + "epoch": 0.5870780784457778, + "grad_norm": 1.9430004358291626, + "learning_rate": 1.996608724164801e-05, + "loss": 0.7038, + "step": 3596 + }, + { + "epoch": 0.5872413370882821, + "grad_norm": 2.42558217048645, + "learning_rate": 1.9966061124474912e-05, + "loss": 1.408, + "step": 3597 + }, + { + "epoch": 0.5874045957307865, + "grad_norm": 1.786195158958435, + "learning_rate": 1.9966034997266e-05, + "loss": 0.6356, + "step": 3598 + }, + { + "epoch": 0.5875678543732908, + "grad_norm": 2.3067615032196045, + "learning_rate": 1.9966008860021286e-05, + "loss": 0.8633, + "step": 3599 + }, + { + "epoch": 0.5877311130157953, + "grad_norm": 2.1207830905914307, + "learning_rate": 1.996598271274081e-05, + "loss": 0.8098, + "step": 3600 + }, + { + "epoch": 0.5878943716582996, + "grad_norm": 2.233381986618042, + "learning_rate": 1.9965956555424587e-05, + "loss": 0.7689, + "step": 3601 + }, + { + "epoch": 0.588057630300804, + "grad_norm": 2.062166213989258, + "learning_rate": 1.996593038807265e-05, + "loss": 0.7001, + "step": 3602 + }, + { + "epoch": 0.5882208889433085, + "grad_norm": 1.9232172966003418, + "learning_rate": 1.9965904210685025e-05, + "loss": 0.886, + "step": 3603 + }, + { + "epoch": 0.5883841475858128, + "grad_norm": 2.2392020225524902, + "learning_rate": 1.996587802326173e-05, + "loss": 0.928, + "step": 3604 + }, + { + "epoch": 0.5885474062283172, + "grad_norm": 2.1075050830841064, + "learning_rate": 1.996585182580281e-05, + "loss": 0.9298, + "step": 3605 + }, + { + "epoch": 0.5887106648708216, + "grad_norm": 1.9919886589050293, + "learning_rate": 1.996582561830827e-05, + "loss": 0.7881, + "step": 3606 + }, + { + "epoch": 0.588873923513326, + "grad_norm": 2.227320909500122, + "learning_rate": 1.9965799400778154e-05, + "loss": 0.7981, + "step": 3607 + }, + { + "epoch": 0.5890371821558303, + "grad_norm": 2.0313994884490967, + "learning_rate": 1.9965773173212475e-05, + "loss": 0.7718, + "step": 3608 + }, + { + "epoch": 0.5892004407983348, + "grad_norm": 2.2753689289093018, + "learning_rate": 1.996574693561127e-05, + "loss": 0.9776, + "step": 3609 + }, + { + "epoch": 0.5893636994408391, + "grad_norm": 2.286255359649658, + "learning_rate": 1.9965720687974555e-05, + "loss": 1.0616, + "step": 3610 + }, + { + "epoch": 0.5895269580833435, + "grad_norm": 1.9817832708358765, + "learning_rate": 1.9965694430302364e-05, + "loss": 0.79, + "step": 3611 + }, + { + "epoch": 0.5896902167258479, + "grad_norm": 2.2115752696990967, + "learning_rate": 1.9965668162594723e-05, + "loss": 0.9056, + "step": 3612 + }, + { + "epoch": 0.5898534753683523, + "grad_norm": 1.92093026638031, + "learning_rate": 1.9965641884851657e-05, + "loss": 0.7717, + "step": 3613 + }, + { + "epoch": 0.5900167340108567, + "grad_norm": 2.053100347518921, + "learning_rate": 1.9965615597073188e-05, + "loss": 0.8169, + "step": 3614 + }, + { + "epoch": 0.5901799926533611, + "grad_norm": 2.5104873180389404, + "learning_rate": 1.996558929925935e-05, + "loss": 0.9791, + "step": 3615 + }, + { + "epoch": 0.5903432512958655, + "grad_norm": 1.9146260023117065, + "learning_rate": 1.9965562991410167e-05, + "loss": 0.8574, + "step": 3616 + }, + { + "epoch": 0.5905065099383698, + "grad_norm": 1.938597559928894, + "learning_rate": 1.9965536673525664e-05, + "loss": 0.7675, + "step": 3617 + }, + { + "epoch": 0.5906697685808743, + "grad_norm": 2.1565463542938232, + "learning_rate": 1.9965510345605866e-05, + "loss": 1.0027, + "step": 3618 + }, + { + "epoch": 0.5908330272233786, + "grad_norm": 1.985184669494629, + "learning_rate": 1.9965484007650805e-05, + "loss": 0.8628, + "step": 3619 + }, + { + "epoch": 0.590996285865883, + "grad_norm": 2.17162823677063, + "learning_rate": 1.9965457659660504e-05, + "loss": 0.8638, + "step": 3620 + }, + { + "epoch": 0.5911595445083874, + "grad_norm": 2.005338191986084, + "learning_rate": 1.9965431301634987e-05, + "loss": 1.0438, + "step": 3621 + }, + { + "epoch": 0.5913228031508918, + "grad_norm": 1.822651982307434, + "learning_rate": 1.9965404933574284e-05, + "loss": 0.7258, + "step": 3622 + }, + { + "epoch": 0.5914860617933961, + "grad_norm": 2.550746202468872, + "learning_rate": 1.9965378555478423e-05, + "loss": 0.9615, + "step": 3623 + }, + { + "epoch": 0.5916493204359006, + "grad_norm": 1.7623876333236694, + "learning_rate": 1.9965352167347428e-05, + "loss": 0.7797, + "step": 3624 + }, + { + "epoch": 0.591812579078405, + "grad_norm": 1.955164909362793, + "learning_rate": 1.9965325769181324e-05, + "loss": 0.8866, + "step": 3625 + }, + { + "epoch": 0.5919758377209093, + "grad_norm": 1.9095629453659058, + "learning_rate": 1.996529936098014e-05, + "loss": 0.8066, + "step": 3626 + }, + { + "epoch": 0.5921390963634138, + "grad_norm": 2.33553147315979, + "learning_rate": 1.9965272942743903e-05, + "loss": 0.8389, + "step": 3627 + }, + { + "epoch": 0.5923023550059181, + "grad_norm": 1.7946014404296875, + "learning_rate": 1.996524651447264e-05, + "loss": 0.9017, + "step": 3628 + }, + { + "epoch": 0.5924656136484225, + "grad_norm": 1.7187762260437012, + "learning_rate": 1.9965220076166376e-05, + "loss": 0.7062, + "step": 3629 + }, + { + "epoch": 0.5926288722909269, + "grad_norm": 2.0866942405700684, + "learning_rate": 1.9965193627825138e-05, + "loss": 0.8384, + "step": 3630 + }, + { + "epoch": 0.5927921309334313, + "grad_norm": 2.062596559524536, + "learning_rate": 1.9965167169448947e-05, + "loss": 1.0171, + "step": 3631 + }, + { + "epoch": 0.5929553895759356, + "grad_norm": 2.073064088821411, + "learning_rate": 1.9965140701037843e-05, + "loss": 0.9276, + "step": 3632 + }, + { + "epoch": 0.5931186482184401, + "grad_norm": 1.9622799158096313, + "learning_rate": 1.996511422259184e-05, + "loss": 0.8272, + "step": 3633 + }, + { + "epoch": 0.5932819068609444, + "grad_norm": 1.9165892601013184, + "learning_rate": 1.9965087734110974e-05, + "loss": 0.7848, + "step": 3634 + }, + { + "epoch": 0.5934451655034488, + "grad_norm": 2.123906373977661, + "learning_rate": 1.9965061235595265e-05, + "loss": 0.8469, + "step": 3635 + }, + { + "epoch": 0.5936084241459533, + "grad_norm": 1.6614809036254883, + "learning_rate": 1.9965034727044743e-05, + "loss": 0.7089, + "step": 3636 + }, + { + "epoch": 0.5937716827884576, + "grad_norm": 2.218162775039673, + "learning_rate": 1.9965008208459434e-05, + "loss": 0.9996, + "step": 3637 + }, + { + "epoch": 0.593934941430962, + "grad_norm": 2.1402623653411865, + "learning_rate": 1.996498167983936e-05, + "loss": 0.8732, + "step": 3638 + }, + { + "epoch": 0.5940982000734664, + "grad_norm": 2.083862781524658, + "learning_rate": 1.9964955141184556e-05, + "loss": 0.8834, + "step": 3639 + }, + { + "epoch": 0.5942614587159708, + "grad_norm": 2.0694124698638916, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.8436, + "step": 3640 + }, + { + "epoch": 0.5944247173584751, + "grad_norm": 1.8595224618911743, + "learning_rate": 1.9964902033770853e-05, + "loss": 0.6515, + "step": 3641 + }, + { + "epoch": 0.5945879760009796, + "grad_norm": 1.8203481435775757, + "learning_rate": 1.9964875465012005e-05, + "loss": 0.9081, + "step": 3642 + }, + { + "epoch": 0.5947512346434839, + "grad_norm": 2.117825984954834, + "learning_rate": 1.996484888621853e-05, + "loss": 0.7962, + "step": 3643 + }, + { + "epoch": 0.5949144932859883, + "grad_norm": 2.43833065032959, + "learning_rate": 1.996482229739046e-05, + "loss": 1.1591, + "step": 3644 + }, + { + "epoch": 0.5950777519284927, + "grad_norm": 1.9372369050979614, + "learning_rate": 1.9964795698527816e-05, + "loss": 0.9037, + "step": 3645 + }, + { + "epoch": 0.5952410105709971, + "grad_norm": 1.837344765663147, + "learning_rate": 1.9964769089630622e-05, + "loss": 0.7742, + "step": 3646 + }, + { + "epoch": 0.5954042692135015, + "grad_norm": 2.213494300842285, + "learning_rate": 1.9964742470698906e-05, + "loss": 0.9234, + "step": 3647 + }, + { + "epoch": 0.5955675278560059, + "grad_norm": 2.143841505050659, + "learning_rate": 1.99647158417327e-05, + "loss": 0.8837, + "step": 3648 + }, + { + "epoch": 0.5957307864985103, + "grad_norm": 2.2131524085998535, + "learning_rate": 1.996468920273203e-05, + "loss": 0.7107, + "step": 3649 + }, + { + "epoch": 0.5958940451410146, + "grad_norm": 2.2660155296325684, + "learning_rate": 1.9964662553696915e-05, + "loss": 1.0301, + "step": 3650 + }, + { + "epoch": 0.5960573037835191, + "grad_norm": 2.46110200881958, + "learning_rate": 1.996463589462739e-05, + "loss": 0.8158, + "step": 3651 + }, + { + "epoch": 0.5962205624260234, + "grad_norm": 1.8709434270858765, + "learning_rate": 1.9964609225523484e-05, + "loss": 0.7605, + "step": 3652 + }, + { + "epoch": 0.5963838210685278, + "grad_norm": 1.6768497228622437, + "learning_rate": 1.9964582546385212e-05, + "loss": 0.6069, + "step": 3653 + }, + { + "epoch": 0.5965470797110322, + "grad_norm": 1.8969913721084595, + "learning_rate": 1.9964555857212612e-05, + "loss": 0.8008, + "step": 3654 + }, + { + "epoch": 0.5967103383535366, + "grad_norm": 1.8646571636199951, + "learning_rate": 1.9964529158005707e-05, + "loss": 0.7478, + "step": 3655 + }, + { + "epoch": 0.5968735969960409, + "grad_norm": 2.075502872467041, + "learning_rate": 1.9964502448764524e-05, + "loss": 0.8409, + "step": 3656 + }, + { + "epoch": 0.5970368556385454, + "grad_norm": 1.8992784023284912, + "learning_rate": 1.9964475729489087e-05, + "loss": 0.7192, + "step": 3657 + }, + { + "epoch": 0.5972001142810498, + "grad_norm": 2.103079319000244, + "learning_rate": 1.9964449000179428e-05, + "loss": 0.9669, + "step": 3658 + }, + { + "epoch": 0.5973633729235541, + "grad_norm": 1.7781537771224976, + "learning_rate": 1.996442226083557e-05, + "loss": 0.7325, + "step": 3659 + }, + { + "epoch": 0.5975266315660586, + "grad_norm": 2.018824338912964, + "learning_rate": 1.9964395511457543e-05, + "loss": 0.7988, + "step": 3660 + }, + { + "epoch": 0.5976898902085629, + "grad_norm": 2.1745574474334717, + "learning_rate": 1.9964368752045372e-05, + "loss": 0.7999, + "step": 3661 + }, + { + "epoch": 0.5978531488510673, + "grad_norm": 2.1089706420898438, + "learning_rate": 1.996434198259908e-05, + "loss": 0.7532, + "step": 3662 + }, + { + "epoch": 0.5980164074935717, + "grad_norm": 2.1655523777008057, + "learning_rate": 1.9964315203118706e-05, + "loss": 0.7481, + "step": 3663 + }, + { + "epoch": 0.5981796661360761, + "grad_norm": 2.165886163711548, + "learning_rate": 1.9964288413604262e-05, + "loss": 0.7264, + "step": 3664 + }, + { + "epoch": 0.5983429247785804, + "grad_norm": 2.5514092445373535, + "learning_rate": 1.9964261614055788e-05, + "loss": 0.9026, + "step": 3665 + }, + { + "epoch": 0.5985061834210849, + "grad_norm": 2.8042421340942383, + "learning_rate": 1.99642348044733e-05, + "loss": 1.0191, + "step": 3666 + }, + { + "epoch": 0.5986694420635893, + "grad_norm": 2.482532262802124, + "learning_rate": 1.9964207984856833e-05, + "loss": 0.9144, + "step": 3667 + }, + { + "epoch": 0.5988327007060936, + "grad_norm": 1.8352465629577637, + "learning_rate": 1.996418115520641e-05, + "loss": 0.7136, + "step": 3668 + }, + { + "epoch": 0.5989959593485981, + "grad_norm": 2.125253438949585, + "learning_rate": 1.9964154315522062e-05, + "loss": 0.7817, + "step": 3669 + }, + { + "epoch": 0.5991592179911024, + "grad_norm": 2.0890777111053467, + "learning_rate": 1.9964127465803812e-05, + "loss": 0.8061, + "step": 3670 + }, + { + "epoch": 0.5993224766336068, + "grad_norm": 1.7184405326843262, + "learning_rate": 1.9964100606051685e-05, + "loss": 0.7482, + "step": 3671 + }, + { + "epoch": 0.5994857352761112, + "grad_norm": 2.1258692741394043, + "learning_rate": 1.9964073736265717e-05, + "loss": 0.8278, + "step": 3672 + }, + { + "epoch": 0.5996489939186156, + "grad_norm": 2.428915023803711, + "learning_rate": 1.9964046856445926e-05, + "loss": 0.8707, + "step": 3673 + }, + { + "epoch": 0.5998122525611199, + "grad_norm": 2.3831193447113037, + "learning_rate": 1.996401996659234e-05, + "loss": 0.7595, + "step": 3674 + }, + { + "epoch": 0.5999755112036244, + "grad_norm": 2.1495227813720703, + "learning_rate": 1.9963993066704995e-05, + "loss": 0.8719, + "step": 3675 + }, + { + "epoch": 0.6001387698461287, + "grad_norm": 2.247288942337036, + "learning_rate": 1.9963966156783906e-05, + "loss": 0.9942, + "step": 3676 + }, + { + "epoch": 0.6003020284886331, + "grad_norm": 2.2114663124084473, + "learning_rate": 1.9963939236829108e-05, + "loss": 0.994, + "step": 3677 + }, + { + "epoch": 0.6004652871311376, + "grad_norm": 2.113921880722046, + "learning_rate": 1.9963912306840626e-05, + "loss": 0.7393, + "step": 3678 + }, + { + "epoch": 0.6006285457736419, + "grad_norm": 2.2942121028900146, + "learning_rate": 1.9963885366818486e-05, + "loss": 0.8665, + "step": 3679 + }, + { + "epoch": 0.6007918044161463, + "grad_norm": 2.5849621295928955, + "learning_rate": 1.996385841676272e-05, + "loss": 0.8805, + "step": 3680 + }, + { + "epoch": 0.6009550630586507, + "grad_norm": 1.9586313962936401, + "learning_rate": 1.9963831456673346e-05, + "loss": 0.8416, + "step": 3681 + }, + { + "epoch": 0.6011183217011551, + "grad_norm": 1.8255609273910522, + "learning_rate": 1.9963804486550397e-05, + "loss": 0.8924, + "step": 3682 + }, + { + "epoch": 0.6012815803436594, + "grad_norm": 2.106628894805908, + "learning_rate": 1.99637775063939e-05, + "loss": 0.9633, + "step": 3683 + }, + { + "epoch": 0.6014448389861639, + "grad_norm": 1.8040400743484497, + "learning_rate": 1.9963750516203887e-05, + "loss": 0.8778, + "step": 3684 + }, + { + "epoch": 0.6016080976286682, + "grad_norm": 1.8429375886917114, + "learning_rate": 1.9963723515980372e-05, + "loss": 0.8145, + "step": 3685 + }, + { + "epoch": 0.6017713562711726, + "grad_norm": 1.9379334449768066, + "learning_rate": 1.9963696505723392e-05, + "loss": 0.7614, + "step": 3686 + }, + { + "epoch": 0.601934614913677, + "grad_norm": 2.2609550952911377, + "learning_rate": 1.9963669485432975e-05, + "loss": 0.9209, + "step": 3687 + }, + { + "epoch": 0.6020978735561814, + "grad_norm": 2.6682698726654053, + "learning_rate": 1.9963642455109144e-05, + "loss": 0.6736, + "step": 3688 + }, + { + "epoch": 0.6022611321986858, + "grad_norm": 2.4525601863861084, + "learning_rate": 1.996361541475193e-05, + "loss": 1.0349, + "step": 3689 + }, + { + "epoch": 0.6024243908411901, + "grad_norm": 2.488133430480957, + "learning_rate": 1.9963588364361356e-05, + "loss": 0.8848, + "step": 3690 + }, + { + "epoch": 0.6025876494836946, + "grad_norm": 2.042724132537842, + "learning_rate": 1.9963561303937447e-05, + "loss": 0.7803, + "step": 3691 + }, + { + "epoch": 0.6027509081261989, + "grad_norm": 2.018433094024658, + "learning_rate": 1.996353423348024e-05, + "loss": 0.7195, + "step": 3692 + }, + { + "epoch": 0.6029141667687034, + "grad_norm": 1.9370495080947876, + "learning_rate": 1.9963507152989755e-05, + "loss": 0.8553, + "step": 3693 + }, + { + "epoch": 0.6030774254112077, + "grad_norm": 2.1063125133514404, + "learning_rate": 1.9963480062466022e-05, + "loss": 0.8353, + "step": 3694 + }, + { + "epoch": 0.6032406840537121, + "grad_norm": 2.00225830078125, + "learning_rate": 1.9963452961909065e-05, + "loss": 0.9269, + "step": 3695 + }, + { + "epoch": 0.6034039426962164, + "grad_norm": 2.2192225456237793, + "learning_rate": 1.9963425851318915e-05, + "loss": 0.7334, + "step": 3696 + }, + { + "epoch": 0.6035672013387209, + "grad_norm": 1.948266863822937, + "learning_rate": 1.9963398730695598e-05, + "loss": 0.7861, + "step": 3697 + }, + { + "epoch": 0.6037304599812252, + "grad_norm": 2.260859966278076, + "learning_rate": 1.996337160003914e-05, + "loss": 0.9476, + "step": 3698 + }, + { + "epoch": 0.6038937186237296, + "grad_norm": 1.8101441860198975, + "learning_rate": 1.9963344459349572e-05, + "loss": 0.8074, + "step": 3699 + }, + { + "epoch": 0.6040569772662341, + "grad_norm": 2.1374411582946777, + "learning_rate": 1.9963317308626916e-05, + "loss": 0.8633, + "step": 3700 + }, + { + "epoch": 0.6042202359087384, + "grad_norm": 2.003028631210327, + "learning_rate": 1.9963290147871205e-05, + "loss": 0.7181, + "step": 3701 + }, + { + "epoch": 0.6043834945512429, + "grad_norm": 2.6432526111602783, + "learning_rate": 1.996326297708246e-05, + "loss": 0.8456, + "step": 3702 + }, + { + "epoch": 0.6045467531937472, + "grad_norm": 2.264915704727173, + "learning_rate": 1.9963235796260713e-05, + "loss": 0.8801, + "step": 3703 + }, + { + "epoch": 0.6047100118362516, + "grad_norm": 2.3882110118865967, + "learning_rate": 1.996320860540599e-05, + "loss": 0.9004, + "step": 3704 + }, + { + "epoch": 0.6048732704787559, + "grad_norm": 1.9614765644073486, + "learning_rate": 1.9963181404518318e-05, + "loss": 0.8522, + "step": 3705 + }, + { + "epoch": 0.6050365291212604, + "grad_norm": 2.125267744064331, + "learning_rate": 1.9963154193597728e-05, + "loss": 0.9304, + "step": 3706 + }, + { + "epoch": 0.6051997877637647, + "grad_norm": 2.4708149433135986, + "learning_rate": 1.9963126972644243e-05, + "loss": 0.9468, + "step": 3707 + }, + { + "epoch": 0.6053630464062691, + "grad_norm": 1.9877036809921265, + "learning_rate": 1.9963099741657887e-05, + "loss": 1.0306, + "step": 3708 + }, + { + "epoch": 0.6055263050487735, + "grad_norm": 1.9365112781524658, + "learning_rate": 1.9963072500638697e-05, + "loss": 0.7073, + "step": 3709 + }, + { + "epoch": 0.6056895636912779, + "grad_norm": 2.130544424057007, + "learning_rate": 1.9963045249586696e-05, + "loss": 0.8411, + "step": 3710 + }, + { + "epoch": 0.6058528223337823, + "grad_norm": 1.9434716701507568, + "learning_rate": 1.9963017988501908e-05, + "loss": 0.7616, + "step": 3711 + }, + { + "epoch": 0.6060160809762867, + "grad_norm": 2.1127660274505615, + "learning_rate": 1.9962990717384368e-05, + "loss": 0.821, + "step": 3712 + }, + { + "epoch": 0.6061793396187911, + "grad_norm": 1.9306023120880127, + "learning_rate": 1.9962963436234095e-05, + "loss": 0.8825, + "step": 3713 + }, + { + "epoch": 0.6063425982612954, + "grad_norm": 1.9786553382873535, + "learning_rate": 1.9962936145051123e-05, + "loss": 0.7718, + "step": 3714 + }, + { + "epoch": 0.6065058569037999, + "grad_norm": 1.7668818235397339, + "learning_rate": 1.9962908843835476e-05, + "loss": 0.7322, + "step": 3715 + }, + { + "epoch": 0.6066691155463042, + "grad_norm": 2.0161406993865967, + "learning_rate": 1.996288153258718e-05, + "loss": 0.8121, + "step": 3716 + }, + { + "epoch": 0.6068323741888086, + "grad_norm": 2.028794765472412, + "learning_rate": 1.996285421130627e-05, + "loss": 0.8758, + "step": 3717 + }, + { + "epoch": 0.606995632831313, + "grad_norm": 2.0440173149108887, + "learning_rate": 1.9962826879992767e-05, + "loss": 0.8263, + "step": 3718 + }, + { + "epoch": 0.6071588914738174, + "grad_norm": 1.9800260066986084, + "learning_rate": 1.9962799538646698e-05, + "loss": 0.8812, + "step": 3719 + }, + { + "epoch": 0.6073221501163217, + "grad_norm": 2.2526094913482666, + "learning_rate": 1.9962772187268093e-05, + "loss": 0.7996, + "step": 3720 + }, + { + "epoch": 0.6074854087588262, + "grad_norm": 2.237852096557617, + "learning_rate": 1.996274482585698e-05, + "loss": 1.0548, + "step": 3721 + }, + { + "epoch": 0.6076486674013306, + "grad_norm": 1.9838221073150635, + "learning_rate": 1.9962717454413384e-05, + "loss": 0.8007, + "step": 3722 + }, + { + "epoch": 0.6078119260438349, + "grad_norm": 1.9112454652786255, + "learning_rate": 1.9962690072937337e-05, + "loss": 0.6838, + "step": 3723 + }, + { + "epoch": 0.6079751846863394, + "grad_norm": 1.6715295314788818, + "learning_rate": 1.996266268142886e-05, + "loss": 0.6539, + "step": 3724 + }, + { + "epoch": 0.6081384433288437, + "grad_norm": 1.7555391788482666, + "learning_rate": 1.9962635279887987e-05, + "loss": 0.8086, + "step": 3725 + }, + { + "epoch": 0.6083017019713481, + "grad_norm": 2.1395602226257324, + "learning_rate": 1.996260786831474e-05, + "loss": 0.9322, + "step": 3726 + }, + { + "epoch": 0.6084649606138525, + "grad_norm": 1.9704563617706299, + "learning_rate": 1.9962580446709153e-05, + "loss": 0.7355, + "step": 3727 + }, + { + "epoch": 0.6086282192563569, + "grad_norm": 2.4036669731140137, + "learning_rate": 1.996255301507125e-05, + "loss": 0.9818, + "step": 3728 + }, + { + "epoch": 0.6087914778988612, + "grad_norm": 2.23121976852417, + "learning_rate": 1.9962525573401053e-05, + "loss": 0.8791, + "step": 3729 + }, + { + "epoch": 0.6089547365413657, + "grad_norm": 2.050804615020752, + "learning_rate": 1.9962498121698602e-05, + "loss": 0.8695, + "step": 3730 + }, + { + "epoch": 0.60911799518387, + "grad_norm": 2.21913743019104, + "learning_rate": 1.9962470659963914e-05, + "loss": 0.7476, + "step": 3731 + }, + { + "epoch": 0.6092812538263744, + "grad_norm": 2.0922482013702393, + "learning_rate": 1.9962443188197024e-05, + "loss": 0.8584, + "step": 3732 + }, + { + "epoch": 0.6094445124688789, + "grad_norm": 2.075019359588623, + "learning_rate": 1.9962415706397954e-05, + "loss": 0.8137, + "step": 3733 + }, + { + "epoch": 0.6096077711113832, + "grad_norm": 2.05233097076416, + "learning_rate": 1.9962388214566738e-05, + "loss": 0.8552, + "step": 3734 + }, + { + "epoch": 0.6097710297538876, + "grad_norm": 2.4691267013549805, + "learning_rate": 1.9962360712703396e-05, + "loss": 1.0121, + "step": 3735 + }, + { + "epoch": 0.609934288396392, + "grad_norm": 1.9852056503295898, + "learning_rate": 1.9962333200807958e-05, + "loss": 0.9552, + "step": 3736 + }, + { + "epoch": 0.6100975470388964, + "grad_norm": 2.222888469696045, + "learning_rate": 1.996230567888046e-05, + "loss": 0.8516, + "step": 3737 + }, + { + "epoch": 0.6102608056814007, + "grad_norm": 2.003159999847412, + "learning_rate": 1.9962278146920914e-05, + "loss": 0.8734, + "step": 3738 + }, + { + "epoch": 0.6104240643239052, + "grad_norm": 2.2001168727874756, + "learning_rate": 1.9962250604929362e-05, + "loss": 0.9279, + "step": 3739 + }, + { + "epoch": 0.6105873229664095, + "grad_norm": 1.823410153388977, + "learning_rate": 1.9962223052905823e-05, + "loss": 0.6804, + "step": 3740 + }, + { + "epoch": 0.6107505816089139, + "grad_norm": 2.1045539379119873, + "learning_rate": 1.996219549085033e-05, + "loss": 0.7195, + "step": 3741 + }, + { + "epoch": 0.6109138402514183, + "grad_norm": 1.905200481414795, + "learning_rate": 1.996216791876291e-05, + "loss": 0.8083, + "step": 3742 + }, + { + "epoch": 0.6110770988939227, + "grad_norm": 1.8150074481964111, + "learning_rate": 1.9962140336643588e-05, + "loss": 0.8253, + "step": 3743 + }, + { + "epoch": 0.6112403575364271, + "grad_norm": 2.5424857139587402, + "learning_rate": 1.996211274449239e-05, + "loss": 0.9048, + "step": 3744 + }, + { + "epoch": 0.6114036161789315, + "grad_norm": 2.2649083137512207, + "learning_rate": 1.9962085142309354e-05, + "loss": 0.9257, + "step": 3745 + }, + { + "epoch": 0.6115668748214359, + "grad_norm": 2.1318905353546143, + "learning_rate": 1.9962057530094498e-05, + "loss": 0.9251, + "step": 3746 + }, + { + "epoch": 0.6117301334639402, + "grad_norm": 2.132192611694336, + "learning_rate": 1.9962029907847852e-05, + "loss": 0.8719, + "step": 3747 + }, + { + "epoch": 0.6118933921064447, + "grad_norm": 1.7561556100845337, + "learning_rate": 1.9962002275569445e-05, + "loss": 0.7785, + "step": 3748 + }, + { + "epoch": 0.612056650748949, + "grad_norm": 1.6951043605804443, + "learning_rate": 1.9961974633259302e-05, + "loss": 0.6695, + "step": 3749 + }, + { + "epoch": 0.6122199093914534, + "grad_norm": 1.6717313528060913, + "learning_rate": 1.9961946980917457e-05, + "loss": 0.5896, + "step": 3750 + }, + { + "epoch": 0.6123831680339578, + "grad_norm": 2.0419058799743652, + "learning_rate": 1.9961919318543933e-05, + "loss": 0.7776, + "step": 3751 + }, + { + "epoch": 0.6125464266764622, + "grad_norm": 2.1031951904296875, + "learning_rate": 1.9961891646138757e-05, + "loss": 0.8297, + "step": 3752 + }, + { + "epoch": 0.6127096853189665, + "grad_norm": 1.8134374618530273, + "learning_rate": 1.9961863963701963e-05, + "loss": 0.7281, + "step": 3753 + }, + { + "epoch": 0.612872943961471, + "grad_norm": 2.130743980407715, + "learning_rate": 1.9961836271233568e-05, + "loss": 0.8177, + "step": 3754 + }, + { + "epoch": 0.6130362026039754, + "grad_norm": 2.097658157348633, + "learning_rate": 1.9961808568733612e-05, + "loss": 0.8198, + "step": 3755 + }, + { + "epoch": 0.6131994612464797, + "grad_norm": 2.359623432159424, + "learning_rate": 1.9961780856202114e-05, + "loss": 1.0107, + "step": 3756 + }, + { + "epoch": 0.6133627198889842, + "grad_norm": 2.0818188190460205, + "learning_rate": 1.996175313363911e-05, + "loss": 0.8016, + "step": 3757 + }, + { + "epoch": 0.6135259785314885, + "grad_norm": 2.3268625736236572, + "learning_rate": 1.996172540104462e-05, + "loss": 0.9352, + "step": 3758 + }, + { + "epoch": 0.6136892371739929, + "grad_norm": 2.025575876235962, + "learning_rate": 1.9961697658418674e-05, + "loss": 0.8486, + "step": 3759 + }, + { + "epoch": 0.6138524958164973, + "grad_norm": 2.0393269062042236, + "learning_rate": 1.9961669905761303e-05, + "loss": 0.8731, + "step": 3760 + }, + { + "epoch": 0.6140157544590017, + "grad_norm": 1.8359060287475586, + "learning_rate": 1.9961642143072532e-05, + "loss": 0.747, + "step": 3761 + }, + { + "epoch": 0.614179013101506, + "grad_norm": 2.3686137199401855, + "learning_rate": 1.996161437035239e-05, + "loss": 1.0249, + "step": 3762 + }, + { + "epoch": 0.6143422717440105, + "grad_norm": 2.0227677822113037, + "learning_rate": 1.9961586587600905e-05, + "loss": 0.8272, + "step": 3763 + }, + { + "epoch": 0.6145055303865148, + "grad_norm": 2.142469882965088, + "learning_rate": 1.9961558794818107e-05, + "loss": 0.8906, + "step": 3764 + }, + { + "epoch": 0.6146687890290192, + "grad_norm": 2.2985129356384277, + "learning_rate": 1.996153099200402e-05, + "loss": 0.7387, + "step": 3765 + }, + { + "epoch": 0.6148320476715237, + "grad_norm": 2.1139304637908936, + "learning_rate": 1.9961503179158673e-05, + "loss": 0.8451, + "step": 3766 + }, + { + "epoch": 0.614995306314028, + "grad_norm": 1.921873927116394, + "learning_rate": 1.9961475356282095e-05, + "loss": 0.8413, + "step": 3767 + }, + { + "epoch": 0.6151585649565324, + "grad_norm": 2.2924017906188965, + "learning_rate": 1.9961447523374316e-05, + "loss": 0.9166, + "step": 3768 + }, + { + "epoch": 0.6153218235990368, + "grad_norm": 2.0093233585357666, + "learning_rate": 1.996141968043536e-05, + "loss": 0.8167, + "step": 3769 + }, + { + "epoch": 0.6154850822415412, + "grad_norm": 2.016788959503174, + "learning_rate": 1.996139182746526e-05, + "loss": 0.8128, + "step": 3770 + }, + { + "epoch": 0.6156483408840455, + "grad_norm": 1.9738140106201172, + "learning_rate": 1.9961363964464037e-05, + "loss": 0.7093, + "step": 3771 + }, + { + "epoch": 0.61581159952655, + "grad_norm": 1.8268052339553833, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.7611, + "step": 3772 + }, + { + "epoch": 0.6159748581690543, + "grad_norm": 1.717950701713562, + "learning_rate": 1.996130820836835e-05, + "loss": 0.676, + "step": 3773 + }, + { + "epoch": 0.6161381168115587, + "grad_norm": 2.321230173110962, + "learning_rate": 1.9961280315273944e-05, + "loss": 0.8696, + "step": 3774 + }, + { + "epoch": 0.616301375454063, + "grad_norm": 2.2084743976593018, + "learning_rate": 1.9961252412148526e-05, + "loss": 0.7299, + "step": 3775 + }, + { + "epoch": 0.6164646340965675, + "grad_norm": 1.9729336500167847, + "learning_rate": 1.9961224498992134e-05, + "loss": 0.9359, + "step": 3776 + }, + { + "epoch": 0.6166278927390719, + "grad_norm": 2.1378586292266846, + "learning_rate": 1.996119657580479e-05, + "loss": 0.8089, + "step": 3777 + }, + { + "epoch": 0.6167911513815763, + "grad_norm": 2.071220636367798, + "learning_rate": 1.9961168642586523e-05, + "loss": 0.8486, + "step": 3778 + }, + { + "epoch": 0.6169544100240807, + "grad_norm": 2.0117225646972656, + "learning_rate": 1.9961140699337358e-05, + "loss": 0.7344, + "step": 3779 + }, + { + "epoch": 0.617117668666585, + "grad_norm": 2.243736982345581, + "learning_rate": 1.996111274605733e-05, + "loss": 0.9864, + "step": 3780 + }, + { + "epoch": 0.6172809273090895, + "grad_norm": 1.8221784830093384, + "learning_rate": 1.9961084782746468e-05, + "loss": 0.8879, + "step": 3781 + }, + { + "epoch": 0.6174441859515938, + "grad_norm": 2.270043134689331, + "learning_rate": 1.9961056809404793e-05, + "loss": 0.8056, + "step": 3782 + }, + { + "epoch": 0.6176074445940982, + "grad_norm": 2.108318567276001, + "learning_rate": 1.9961028826032335e-05, + "loss": 0.8774, + "step": 3783 + }, + { + "epoch": 0.6177707032366025, + "grad_norm": 2.0278093814849854, + "learning_rate": 1.9961000832629126e-05, + "loss": 0.8867, + "step": 3784 + }, + { + "epoch": 0.617933961879107, + "grad_norm": 2.036600112915039, + "learning_rate": 1.996097282919519e-05, + "loss": 0.8026, + "step": 3785 + }, + { + "epoch": 0.6180972205216113, + "grad_norm": 2.1643218994140625, + "learning_rate": 1.9960944815730558e-05, + "loss": 0.9236, + "step": 3786 + }, + { + "epoch": 0.6182604791641158, + "grad_norm": 2.492954969406128, + "learning_rate": 1.9960916792235256e-05, + "loss": 0.8474, + "step": 3787 + }, + { + "epoch": 0.6184237378066202, + "grad_norm": 1.9372669458389282, + "learning_rate": 1.9960888758709316e-05, + "loss": 0.8115, + "step": 3788 + }, + { + "epoch": 0.6185869964491245, + "grad_norm": 2.0380916595458984, + "learning_rate": 1.996086071515276e-05, + "loss": 0.8055, + "step": 3789 + }, + { + "epoch": 0.618750255091629, + "grad_norm": 1.7349754571914673, + "learning_rate": 1.9960832661565625e-05, + "loss": 0.5634, + "step": 3790 + }, + { + "epoch": 0.6189135137341333, + "grad_norm": 2.294201135635376, + "learning_rate": 1.996080459794793e-05, + "loss": 1.0117, + "step": 3791 + }, + { + "epoch": 0.6190767723766377, + "grad_norm": 2.2213833332061768, + "learning_rate": 1.9960776524299708e-05, + "loss": 0.8685, + "step": 3792 + }, + { + "epoch": 0.619240031019142, + "grad_norm": 2.023338556289673, + "learning_rate": 1.9960748440620988e-05, + "loss": 0.8657, + "step": 3793 + }, + { + "epoch": 0.6194032896616465, + "grad_norm": 2.0414113998413086, + "learning_rate": 1.9960720346911798e-05, + "loss": 0.8349, + "step": 3794 + }, + { + "epoch": 0.6195665483041508, + "grad_norm": 2.300799608230591, + "learning_rate": 1.9960692243172163e-05, + "loss": 0.842, + "step": 3795 + }, + { + "epoch": 0.6197298069466552, + "grad_norm": 2.1251487731933594, + "learning_rate": 1.9960664129402113e-05, + "loss": 0.8899, + "step": 3796 + }, + { + "epoch": 0.6198930655891596, + "grad_norm": 2.2507076263427734, + "learning_rate": 1.9960636005601678e-05, + "loss": 0.9363, + "step": 3797 + }, + { + "epoch": 0.620056324231664, + "grad_norm": 2.0603346824645996, + "learning_rate": 1.9960607871770886e-05, + "loss": 0.9506, + "step": 3798 + }, + { + "epoch": 0.6202195828741685, + "grad_norm": 2.081101417541504, + "learning_rate": 1.9960579727909763e-05, + "loss": 1.0015, + "step": 3799 + }, + { + "epoch": 0.6203828415166728, + "grad_norm": 1.9013113975524902, + "learning_rate": 1.996055157401834e-05, + "loss": 0.6796, + "step": 3800 + }, + { + "epoch": 0.6205461001591772, + "grad_norm": 2.277682304382324, + "learning_rate": 1.9960523410096645e-05, + "loss": 0.7942, + "step": 3801 + }, + { + "epoch": 0.6207093588016815, + "grad_norm": 2.6727676391601562, + "learning_rate": 1.9960495236144704e-05, + "loss": 0.8886, + "step": 3802 + }, + { + "epoch": 0.620872617444186, + "grad_norm": 2.2822396755218506, + "learning_rate": 1.9960467052162548e-05, + "loss": 0.9177, + "step": 3803 + }, + { + "epoch": 0.6210358760866903, + "grad_norm": 1.922499418258667, + "learning_rate": 1.99604388581502e-05, + "loss": 0.807, + "step": 3804 + }, + { + "epoch": 0.6211991347291947, + "grad_norm": 1.9836468696594238, + "learning_rate": 1.99604106541077e-05, + "loss": 0.8503, + "step": 3805 + }, + { + "epoch": 0.6213623933716991, + "grad_norm": 1.9225810766220093, + "learning_rate": 1.9960382440035063e-05, + "loss": 0.9294, + "step": 3806 + }, + { + "epoch": 0.6215256520142035, + "grad_norm": 1.9350652694702148, + "learning_rate": 1.9960354215932324e-05, + "loss": 0.8095, + "step": 3807 + }, + { + "epoch": 0.6216889106567078, + "grad_norm": 2.0266857147216797, + "learning_rate": 1.9960325981799516e-05, + "loss": 0.8309, + "step": 3808 + }, + { + "epoch": 0.6218521692992123, + "grad_norm": 2.285499334335327, + "learning_rate": 1.9960297737636658e-05, + "loss": 0.7849, + "step": 3809 + }, + { + "epoch": 0.6220154279417167, + "grad_norm": 1.9849332571029663, + "learning_rate": 1.9960269483443785e-05, + "loss": 0.8395, + "step": 3810 + }, + { + "epoch": 0.622178686584221, + "grad_norm": 1.8987846374511719, + "learning_rate": 1.996024121922092e-05, + "loss": 0.8899, + "step": 3811 + }, + { + "epoch": 0.6223419452267255, + "grad_norm": 1.9115079641342163, + "learning_rate": 1.9960212944968098e-05, + "loss": 0.859, + "step": 3812 + }, + { + "epoch": 0.6225052038692298, + "grad_norm": 1.9027541875839233, + "learning_rate": 1.9960184660685345e-05, + "loss": 0.7638, + "step": 3813 + }, + { + "epoch": 0.6226684625117342, + "grad_norm": 1.9608430862426758, + "learning_rate": 1.996015636637269e-05, + "loss": 0.8208, + "step": 3814 + }, + { + "epoch": 0.6228317211542386, + "grad_norm": 1.624884843826294, + "learning_rate": 1.9960128062030153e-05, + "loss": 0.6761, + "step": 3815 + }, + { + "epoch": 0.622994979796743, + "grad_norm": 2.1514575481414795, + "learning_rate": 1.9960099747657774e-05, + "loss": 0.9092, + "step": 3816 + }, + { + "epoch": 0.6231582384392473, + "grad_norm": 2.1382553577423096, + "learning_rate": 1.9960071423255577e-05, + "loss": 0.747, + "step": 3817 + }, + { + "epoch": 0.6233214970817518, + "grad_norm": 2.1861326694488525, + "learning_rate": 1.996004308882359e-05, + "loss": 0.8345, + "step": 3818 + }, + { + "epoch": 0.6234847557242561, + "grad_norm": 2.439685821533203, + "learning_rate": 1.9960014744361844e-05, + "loss": 1.1167, + "step": 3819 + }, + { + "epoch": 0.6236480143667605, + "grad_norm": 2.220843553543091, + "learning_rate": 1.9959986389870364e-05, + "loss": 0.783, + "step": 3820 + }, + { + "epoch": 0.623811273009265, + "grad_norm": 1.9950898885726929, + "learning_rate": 1.995995802534918e-05, + "loss": 0.7427, + "step": 3821 + }, + { + "epoch": 0.6239745316517693, + "grad_norm": 2.029041051864624, + "learning_rate": 1.9959929650798325e-05, + "loss": 0.8306, + "step": 3822 + }, + { + "epoch": 0.6241377902942737, + "grad_norm": 2.349630117416382, + "learning_rate": 1.9959901266217818e-05, + "loss": 0.9501, + "step": 3823 + }, + { + "epoch": 0.6243010489367781, + "grad_norm": 2.2787833213806152, + "learning_rate": 1.9959872871607696e-05, + "loss": 0.8555, + "step": 3824 + }, + { + "epoch": 0.6244643075792825, + "grad_norm": 2.0943055152893066, + "learning_rate": 1.9959844466967985e-05, + "loss": 0.6602, + "step": 3825 + }, + { + "epoch": 0.6246275662217868, + "grad_norm": 2.1828486919403076, + "learning_rate": 1.9959816052298713e-05, + "loss": 0.8173, + "step": 3826 + }, + { + "epoch": 0.6247908248642913, + "grad_norm": 2.2292890548706055, + "learning_rate": 1.9959787627599907e-05, + "loss": 0.9583, + "step": 3827 + }, + { + "epoch": 0.6249540835067956, + "grad_norm": 1.7890475988388062, + "learning_rate": 1.99597591928716e-05, + "loss": 0.8043, + "step": 3828 + }, + { + "epoch": 0.6251173421493, + "grad_norm": 2.1255531311035156, + "learning_rate": 1.9959730748113814e-05, + "loss": 0.7283, + "step": 3829 + }, + { + "epoch": 0.6252806007918044, + "grad_norm": 1.5569697618484497, + "learning_rate": 1.9959702293326585e-05, + "loss": 0.5379, + "step": 3830 + }, + { + "epoch": 0.6254438594343088, + "grad_norm": 2.226503610610962, + "learning_rate": 1.995967382850994e-05, + "loss": 0.7863, + "step": 3831 + }, + { + "epoch": 0.6256071180768132, + "grad_norm": 2.0033159255981445, + "learning_rate": 1.9959645353663904e-05, + "loss": 0.8435, + "step": 3832 + }, + { + "epoch": 0.6257703767193176, + "grad_norm": 2.1793508529663086, + "learning_rate": 1.9959616868788506e-05, + "loss": 0.7264, + "step": 3833 + }, + { + "epoch": 0.625933635361822, + "grad_norm": 2.237682342529297, + "learning_rate": 1.9959588373883784e-05, + "loss": 0.8884, + "step": 3834 + }, + { + "epoch": 0.6260968940043263, + "grad_norm": 2.142112970352173, + "learning_rate": 1.995955986894975e-05, + "loss": 0.7348, + "step": 3835 + }, + { + "epoch": 0.6262601526468308, + "grad_norm": 1.862033724784851, + "learning_rate": 1.9959531353986445e-05, + "loss": 0.7222, + "step": 3836 + }, + { + "epoch": 0.6264234112893351, + "grad_norm": 2.026902914047241, + "learning_rate": 1.9959502828993896e-05, + "loss": 0.8597, + "step": 3837 + }, + { + "epoch": 0.6265866699318395, + "grad_norm": 2.1309542655944824, + "learning_rate": 1.995947429397213e-05, + "loss": 0.8716, + "step": 3838 + }, + { + "epoch": 0.6267499285743439, + "grad_norm": 2.212858200073242, + "learning_rate": 1.9959445748921176e-05, + "loss": 0.9834, + "step": 3839 + }, + { + "epoch": 0.6269131872168483, + "grad_norm": 2.2014424800872803, + "learning_rate": 1.995941719384106e-05, + "loss": 0.9188, + "step": 3840 + }, + { + "epoch": 0.6270764458593526, + "grad_norm": 2.1766600608825684, + "learning_rate": 1.9959388628731816e-05, + "loss": 0.8088, + "step": 3841 + }, + { + "epoch": 0.6272397045018571, + "grad_norm": 1.9613821506500244, + "learning_rate": 1.9959360053593473e-05, + "loss": 0.7239, + "step": 3842 + }, + { + "epoch": 0.6274029631443615, + "grad_norm": 2.1956701278686523, + "learning_rate": 1.9959331468426054e-05, + "loss": 0.9328, + "step": 3843 + }, + { + "epoch": 0.6275662217868658, + "grad_norm": 2.0547103881835938, + "learning_rate": 1.995930287322959e-05, + "loss": 0.7885, + "step": 3844 + }, + { + "epoch": 0.6277294804293703, + "grad_norm": 2.6029984951019287, + "learning_rate": 1.995927426800411e-05, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.6278927390718746, + "grad_norm": 2.6842284202575684, + "learning_rate": 1.9959245652749647e-05, + "loss": 0.9036, + "step": 3846 + }, + { + "epoch": 0.628055997714379, + "grad_norm": 1.9465556144714355, + "learning_rate": 1.9959217027466226e-05, + "loss": 0.8334, + "step": 3847 + }, + { + "epoch": 0.6282192563568834, + "grad_norm": 2.2189435958862305, + "learning_rate": 1.9959188392153873e-05, + "loss": 0.6879, + "step": 3848 + }, + { + "epoch": 0.6283825149993878, + "grad_norm": 1.9280937910079956, + "learning_rate": 1.995915974681262e-05, + "loss": 0.8867, + "step": 3849 + }, + { + "epoch": 0.6285457736418921, + "grad_norm": 2.1597084999084473, + "learning_rate": 1.9959131091442497e-05, + "loss": 0.8926, + "step": 3850 + }, + { + "epoch": 0.6287090322843966, + "grad_norm": 2.059746742248535, + "learning_rate": 1.9959102426043534e-05, + "loss": 0.9488, + "step": 3851 + }, + { + "epoch": 0.6288722909269009, + "grad_norm": 1.934052586555481, + "learning_rate": 1.9959073750615756e-05, + "loss": 0.8256, + "step": 3852 + }, + { + "epoch": 0.6290355495694053, + "grad_norm": 2.3877503871917725, + "learning_rate": 1.995904506515919e-05, + "loss": 0.951, + "step": 3853 + }, + { + "epoch": 0.6291988082119098, + "grad_norm": 2.0603699684143066, + "learning_rate": 1.9959016369673873e-05, + "loss": 0.9771, + "step": 3854 + }, + { + "epoch": 0.6293620668544141, + "grad_norm": 2.3064584732055664, + "learning_rate": 1.9958987664159826e-05, + "loss": 0.8445, + "step": 3855 + }, + { + "epoch": 0.6295253254969185, + "grad_norm": 1.651580810546875, + "learning_rate": 1.9958958948617082e-05, + "loss": 0.6746, + "step": 3856 + }, + { + "epoch": 0.6296885841394229, + "grad_norm": 2.1996777057647705, + "learning_rate": 1.995893022304567e-05, + "loss": 0.9707, + "step": 3857 + }, + { + "epoch": 0.6298518427819273, + "grad_norm": 2.047375202178955, + "learning_rate": 1.9958901487445613e-05, + "loss": 0.8364, + "step": 3858 + }, + { + "epoch": 0.6300151014244316, + "grad_norm": 2.2454278469085693, + "learning_rate": 1.995887274181695e-05, + "loss": 0.9821, + "step": 3859 + }, + { + "epoch": 0.6301783600669361, + "grad_norm": 2.129563331604004, + "learning_rate": 1.9958843986159705e-05, + "loss": 0.7714, + "step": 3860 + }, + { + "epoch": 0.6303416187094404, + "grad_norm": 2.073392152786255, + "learning_rate": 1.9958815220473905e-05, + "loss": 0.7857, + "step": 3861 + }, + { + "epoch": 0.6305048773519448, + "grad_norm": 1.8291730880737305, + "learning_rate": 1.995878644475958e-05, + "loss": 0.7052, + "step": 3862 + }, + { + "epoch": 0.6306681359944492, + "grad_norm": 2.301786422729492, + "learning_rate": 1.9958757659016765e-05, + "loss": 0.8392, + "step": 3863 + }, + { + "epoch": 0.6308313946369536, + "grad_norm": 1.9461169242858887, + "learning_rate": 1.9958728863245475e-05, + "loss": 0.8542, + "step": 3864 + }, + { + "epoch": 0.630994653279458, + "grad_norm": 1.9911609888076782, + "learning_rate": 1.9958700057445753e-05, + "loss": 0.7682, + "step": 3865 + }, + { + "epoch": 0.6311579119219624, + "grad_norm": 1.770376205444336, + "learning_rate": 1.9958671241617625e-05, + "loss": 0.6403, + "step": 3866 + }, + { + "epoch": 0.6313211705644668, + "grad_norm": 1.987022876739502, + "learning_rate": 1.9958642415761115e-05, + "loss": 0.757, + "step": 3867 + }, + { + "epoch": 0.6314844292069711, + "grad_norm": 2.4466052055358887, + "learning_rate": 1.9958613579876253e-05, + "loss": 0.9384, + "step": 3868 + }, + { + "epoch": 0.6316476878494756, + "grad_norm": 2.070402145385742, + "learning_rate": 1.995858473396307e-05, + "loss": 0.8061, + "step": 3869 + }, + { + "epoch": 0.6318109464919799, + "grad_norm": 2.2122299671173096, + "learning_rate": 1.9958555878021596e-05, + "loss": 0.6943, + "step": 3870 + }, + { + "epoch": 0.6319742051344843, + "grad_norm": 1.8979194164276123, + "learning_rate": 1.995852701205186e-05, + "loss": 0.8448, + "step": 3871 + }, + { + "epoch": 0.6321374637769887, + "grad_norm": 1.987113356590271, + "learning_rate": 1.9958498136053888e-05, + "loss": 0.855, + "step": 3872 + }, + { + "epoch": 0.6323007224194931, + "grad_norm": 2.451205015182495, + "learning_rate": 1.995846925002771e-05, + "loss": 0.8958, + "step": 3873 + }, + { + "epoch": 0.6324639810619974, + "grad_norm": 2.0886292457580566, + "learning_rate": 1.995844035397336e-05, + "loss": 0.7626, + "step": 3874 + }, + { + "epoch": 0.6326272397045019, + "grad_norm": 1.9825876951217651, + "learning_rate": 1.995841144789086e-05, + "loss": 0.7961, + "step": 3875 + }, + { + "epoch": 0.6327904983470063, + "grad_norm": 1.8546133041381836, + "learning_rate": 1.9958382531780243e-05, + "loss": 0.8598, + "step": 3876 + }, + { + "epoch": 0.6329537569895106, + "grad_norm": 2.2103657722473145, + "learning_rate": 1.9958353605641537e-05, + "loss": 0.7279, + "step": 3877 + }, + { + "epoch": 0.6331170156320151, + "grad_norm": 2.257733106613159, + "learning_rate": 1.9958324669474774e-05, + "loss": 0.8375, + "step": 3878 + }, + { + "epoch": 0.6332802742745194, + "grad_norm": 1.9387556314468384, + "learning_rate": 1.9958295723279978e-05, + "loss": 0.6344, + "step": 3879 + }, + { + "epoch": 0.6334435329170238, + "grad_norm": 2.170278549194336, + "learning_rate": 1.9958266767057183e-05, + "loss": 0.9048, + "step": 3880 + }, + { + "epoch": 0.6336067915595281, + "grad_norm": 2.1518657207489014, + "learning_rate": 1.995823780080641e-05, + "loss": 0.9136, + "step": 3881 + }, + { + "epoch": 0.6337700502020326, + "grad_norm": 1.9989204406738281, + "learning_rate": 1.9958208824527702e-05, + "loss": 0.7156, + "step": 3882 + }, + { + "epoch": 0.6339333088445369, + "grad_norm": 1.9998705387115479, + "learning_rate": 1.9958179838221078e-05, + "loss": 0.8506, + "step": 3883 + }, + { + "epoch": 0.6340965674870414, + "grad_norm": 2.1180222034454346, + "learning_rate": 1.995815084188657e-05, + "loss": 0.7945, + "step": 3884 + }, + { + "epoch": 0.6342598261295457, + "grad_norm": 2.1036112308502197, + "learning_rate": 1.9958121835524204e-05, + "loss": 0.7738, + "step": 3885 + }, + { + "epoch": 0.6344230847720501, + "grad_norm": 2.1715307235717773, + "learning_rate": 1.9958092819134012e-05, + "loss": 0.9625, + "step": 3886 + }, + { + "epoch": 0.6345863434145546, + "grad_norm": 1.6309186220169067, + "learning_rate": 1.9958063792716028e-05, + "loss": 0.6469, + "step": 3887 + }, + { + "epoch": 0.6347496020570589, + "grad_norm": 2.154336452484131, + "learning_rate": 1.995803475627027e-05, + "loss": 0.772, + "step": 3888 + }, + { + "epoch": 0.6349128606995633, + "grad_norm": 1.9058438539505005, + "learning_rate": 1.995800570979678e-05, + "loss": 0.6833, + "step": 3889 + }, + { + "epoch": 0.6350761193420676, + "grad_norm": 2.0721659660339355, + "learning_rate": 1.9957976653295576e-05, + "loss": 0.8417, + "step": 3890 + }, + { + "epoch": 0.6352393779845721, + "grad_norm": 3.318566083908081, + "learning_rate": 1.9957947586766695e-05, + "loss": 0.9382, + "step": 3891 + }, + { + "epoch": 0.6354026366270764, + "grad_norm": 1.9561172723770142, + "learning_rate": 1.995791851021016e-05, + "loss": 0.7667, + "step": 3892 + }, + { + "epoch": 0.6355658952695808, + "grad_norm": 2.0578510761260986, + "learning_rate": 1.9957889423626006e-05, + "loss": 0.9117, + "step": 3893 + }, + { + "epoch": 0.6357291539120852, + "grad_norm": 2.0947399139404297, + "learning_rate": 1.9957860327014262e-05, + "loss": 0.976, + "step": 3894 + }, + { + "epoch": 0.6358924125545896, + "grad_norm": 2.0819363594055176, + "learning_rate": 1.9957831220374953e-05, + "loss": 0.8592, + "step": 3895 + }, + { + "epoch": 0.6360556711970939, + "grad_norm": 2.1190083026885986, + "learning_rate": 1.995780210370811e-05, + "loss": 0.8897, + "step": 3896 + }, + { + "epoch": 0.6362189298395984, + "grad_norm": 2.0488765239715576, + "learning_rate": 1.9957772977013765e-05, + "loss": 0.7649, + "step": 3897 + }, + { + "epoch": 0.6363821884821028, + "grad_norm": 2.5979888439178467, + "learning_rate": 1.9957743840291942e-05, + "loss": 0.9276, + "step": 3898 + }, + { + "epoch": 0.6365454471246071, + "grad_norm": 3.41827130317688, + "learning_rate": 1.9957714693542678e-05, + "loss": 1.1754, + "step": 3899 + }, + { + "epoch": 0.6367087057671116, + "grad_norm": 2.1105499267578125, + "learning_rate": 1.9957685536765998e-05, + "loss": 0.8533, + "step": 3900 + }, + { + "epoch": 0.6368719644096159, + "grad_norm": 1.8626378774642944, + "learning_rate": 1.9957656369961928e-05, + "loss": 0.6931, + "step": 3901 + }, + { + "epoch": 0.6370352230521203, + "grad_norm": 1.8937807083129883, + "learning_rate": 1.99576271931305e-05, + "loss": 0.8176, + "step": 3902 + }, + { + "epoch": 0.6371984816946247, + "grad_norm": 2.338810443878174, + "learning_rate": 1.9957598006271745e-05, + "loss": 0.9986, + "step": 3903 + }, + { + "epoch": 0.6373617403371291, + "grad_norm": 2.3824877738952637, + "learning_rate": 1.9957568809385693e-05, + "loss": 0.7272, + "step": 3904 + }, + { + "epoch": 0.6375249989796334, + "grad_norm": 2.1532349586486816, + "learning_rate": 1.995753960247237e-05, + "loss": 0.821, + "step": 3905 + }, + { + "epoch": 0.6376882576221379, + "grad_norm": 2.2774415016174316, + "learning_rate": 1.995751038553181e-05, + "loss": 0.7677, + "step": 3906 + }, + { + "epoch": 0.6378515162646422, + "grad_norm": 2.0611109733581543, + "learning_rate": 1.9957481158564037e-05, + "loss": 0.9521, + "step": 3907 + }, + { + "epoch": 0.6380147749071466, + "grad_norm": 2.0877084732055664, + "learning_rate": 1.9957451921569084e-05, + "loss": 0.9196, + "step": 3908 + }, + { + "epoch": 0.6381780335496511, + "grad_norm": 2.128732204437256, + "learning_rate": 1.995742267454698e-05, + "loss": 0.914, + "step": 3909 + }, + { + "epoch": 0.6383412921921554, + "grad_norm": 2.6169135570526123, + "learning_rate": 1.9957393417497753e-05, + "loss": 0.8113, + "step": 3910 + }, + { + "epoch": 0.6385045508346598, + "grad_norm": 2.217097759246826, + "learning_rate": 1.9957364150421435e-05, + "loss": 0.8716, + "step": 3911 + }, + { + "epoch": 0.6386678094771642, + "grad_norm": 1.9963403940200806, + "learning_rate": 1.9957334873318052e-05, + "loss": 0.867, + "step": 3912 + }, + { + "epoch": 0.6388310681196686, + "grad_norm": 1.984139323234558, + "learning_rate": 1.9957305586187636e-05, + "loss": 0.8685, + "step": 3913 + }, + { + "epoch": 0.6389943267621729, + "grad_norm": 2.370695114135742, + "learning_rate": 1.9957276289030217e-05, + "loss": 1.0087, + "step": 3914 + }, + { + "epoch": 0.6391575854046774, + "grad_norm": 2.2320048809051514, + "learning_rate": 1.9957246981845825e-05, + "loss": 0.9583, + "step": 3915 + }, + { + "epoch": 0.6393208440471817, + "grad_norm": 2.2000739574432373, + "learning_rate": 1.9957217664634484e-05, + "loss": 0.8816, + "step": 3916 + }, + { + "epoch": 0.6394841026896861, + "grad_norm": 2.422565460205078, + "learning_rate": 1.9957188337396228e-05, + "loss": 0.776, + "step": 3917 + }, + { + "epoch": 0.6396473613321906, + "grad_norm": 2.2651000022888184, + "learning_rate": 1.9957159000131087e-05, + "loss": 0.8537, + "step": 3918 + }, + { + "epoch": 0.6398106199746949, + "grad_norm": 2.1559486389160156, + "learning_rate": 1.9957129652839092e-05, + "loss": 0.8695, + "step": 3919 + }, + { + "epoch": 0.6399738786171993, + "grad_norm": 2.0040483474731445, + "learning_rate": 1.9957100295520267e-05, + "loss": 0.783, + "step": 3920 + }, + { + "epoch": 0.6401371372597037, + "grad_norm": 2.025927782058716, + "learning_rate": 1.9957070928174645e-05, + "loss": 0.7896, + "step": 3921 + }, + { + "epoch": 0.6403003959022081, + "grad_norm": 2.081263780593872, + "learning_rate": 1.9957041550802257e-05, + "loss": 0.8882, + "step": 3922 + }, + { + "epoch": 0.6404636545447124, + "grad_norm": 2.5506882667541504, + "learning_rate": 1.995701216340313e-05, + "loss": 0.8313, + "step": 3923 + }, + { + "epoch": 0.6406269131872169, + "grad_norm": 1.8875963687896729, + "learning_rate": 1.995698276597729e-05, + "loss": 0.895, + "step": 3924 + }, + { + "epoch": 0.6407901718297212, + "grad_norm": 1.805955171585083, + "learning_rate": 1.9956953358524774e-05, + "loss": 0.7252, + "step": 3925 + }, + { + "epoch": 0.6409534304722256, + "grad_norm": 2.4521286487579346, + "learning_rate": 1.9956923941045613e-05, + "loss": 1.0291, + "step": 3926 + }, + { + "epoch": 0.64111668911473, + "grad_norm": 2.2084898948669434, + "learning_rate": 1.995689451353983e-05, + "loss": 0.9433, + "step": 3927 + }, + { + "epoch": 0.6412799477572344, + "grad_norm": 1.8056122064590454, + "learning_rate": 1.995686507600745e-05, + "loss": 0.6321, + "step": 3928 + }, + { + "epoch": 0.6414432063997388, + "grad_norm": 2.013023853302002, + "learning_rate": 1.995683562844852e-05, + "loss": 0.8773, + "step": 3929 + }, + { + "epoch": 0.6416064650422432, + "grad_norm": 1.9381183385849, + "learning_rate": 1.995680617086305e-05, + "loss": 0.736, + "step": 3930 + }, + { + "epoch": 0.6417697236847476, + "grad_norm": 2.0055699348449707, + "learning_rate": 1.9956776703251083e-05, + "loss": 0.7631, + "step": 3931 + }, + { + "epoch": 0.6419329823272519, + "grad_norm": 1.858241319656372, + "learning_rate": 1.9956747225612643e-05, + "loss": 0.7891, + "step": 3932 + }, + { + "epoch": 0.6420962409697564, + "grad_norm": 2.552084445953369, + "learning_rate": 1.9956717737947766e-05, + "loss": 0.89, + "step": 3933 + }, + { + "epoch": 0.6422594996122607, + "grad_norm": 2.1156399250030518, + "learning_rate": 1.9956688240256473e-05, + "loss": 0.882, + "step": 3934 + }, + { + "epoch": 0.6424227582547651, + "grad_norm": 1.8388075828552246, + "learning_rate": 1.99566587325388e-05, + "loss": 0.7159, + "step": 3935 + }, + { + "epoch": 0.6425860168972695, + "grad_norm": 2.6280083656311035, + "learning_rate": 1.9956629214794773e-05, + "loss": 0.9021, + "step": 3936 + }, + { + "epoch": 0.6427492755397739, + "grad_norm": 1.8802495002746582, + "learning_rate": 1.995659968702442e-05, + "loss": 0.8772, + "step": 3937 + }, + { + "epoch": 0.6429125341822782, + "grad_norm": 1.6542173624038696, + "learning_rate": 1.9956570149227777e-05, + "loss": 0.6515, + "step": 3938 + }, + { + "epoch": 0.6430757928247827, + "grad_norm": 2.072232484817505, + "learning_rate": 1.995654060140487e-05, + "loss": 0.817, + "step": 3939 + }, + { + "epoch": 0.6432390514672871, + "grad_norm": 2.2016494274139404, + "learning_rate": 1.995651104355573e-05, + "loss": 0.8919, + "step": 3940 + }, + { + "epoch": 0.6434023101097914, + "grad_norm": 2.393803834915161, + "learning_rate": 1.9956481475680384e-05, + "loss": 0.8891, + "step": 3941 + }, + { + "epoch": 0.6435655687522959, + "grad_norm": 2.1537551879882812, + "learning_rate": 1.9956451897778864e-05, + "loss": 0.8584, + "step": 3942 + }, + { + "epoch": 0.6437288273948002, + "grad_norm": 2.1254405975341797, + "learning_rate": 1.99564223098512e-05, + "loss": 0.7949, + "step": 3943 + }, + { + "epoch": 0.6438920860373046, + "grad_norm": 2.1835193634033203, + "learning_rate": 1.995639271189742e-05, + "loss": 0.8324, + "step": 3944 + }, + { + "epoch": 0.644055344679809, + "grad_norm": 2.6670496463775635, + "learning_rate": 1.995636310391756e-05, + "loss": 0.8234, + "step": 3945 + }, + { + "epoch": 0.6442186033223134, + "grad_norm": 1.982601523399353, + "learning_rate": 1.9956333485911642e-05, + "loss": 0.7973, + "step": 3946 + }, + { + "epoch": 0.6443818619648177, + "grad_norm": 2.302584648132324, + "learning_rate": 1.9956303857879698e-05, + "loss": 0.7846, + "step": 3947 + }, + { + "epoch": 0.6445451206073222, + "grad_norm": 2.196352958679199, + "learning_rate": 1.995627421982176e-05, + "loss": 0.8901, + "step": 3948 + }, + { + "epoch": 0.6447083792498265, + "grad_norm": 2.1761200428009033, + "learning_rate": 1.9956244571737855e-05, + "loss": 0.9241, + "step": 3949 + }, + { + "epoch": 0.6448716378923309, + "grad_norm": 2.5097243785858154, + "learning_rate": 1.9956214913628015e-05, + "loss": 0.8693, + "step": 3950 + }, + { + "epoch": 0.6450348965348354, + "grad_norm": 1.8800350427627563, + "learning_rate": 1.995618524549227e-05, + "loss": 0.8748, + "step": 3951 + }, + { + "epoch": 0.6451981551773397, + "grad_norm": 1.6585696935653687, + "learning_rate": 1.9956155567330648e-05, + "loss": 0.6693, + "step": 3952 + }, + { + "epoch": 0.6453614138198441, + "grad_norm": 2.103243589401245, + "learning_rate": 1.995612587914318e-05, + "loss": 0.9704, + "step": 3953 + }, + { + "epoch": 0.6455246724623485, + "grad_norm": 2.090947389602661, + "learning_rate": 1.99560961809299e-05, + "loss": 0.9566, + "step": 3954 + }, + { + "epoch": 0.6456879311048529, + "grad_norm": 1.9720873832702637, + "learning_rate": 1.9956066472690826e-05, + "loss": 0.7959, + "step": 3955 + }, + { + "epoch": 0.6458511897473572, + "grad_norm": 1.688127875328064, + "learning_rate": 1.9956036754426004e-05, + "loss": 0.7929, + "step": 3956 + }, + { + "epoch": 0.6460144483898617, + "grad_norm": 1.9238029718399048, + "learning_rate": 1.9956007026135448e-05, + "loss": 0.7159, + "step": 3957 + }, + { + "epoch": 0.646177707032366, + "grad_norm": 2.5537431240081787, + "learning_rate": 1.99559772878192e-05, + "loss": 1.0091, + "step": 3958 + }, + { + "epoch": 0.6463409656748704, + "grad_norm": 1.8655478954315186, + "learning_rate": 1.9955947539477285e-05, + "loss": 0.7544, + "step": 3959 + }, + { + "epoch": 0.6465042243173748, + "grad_norm": 2.066704034805298, + "learning_rate": 1.995591778110973e-05, + "loss": 0.7309, + "step": 3960 + }, + { + "epoch": 0.6466674829598792, + "grad_norm": 2.049016237258911, + "learning_rate": 1.9955888012716574e-05, + "loss": 0.8132, + "step": 3961 + }, + { + "epoch": 0.6468307416023836, + "grad_norm": 2.3270530700683594, + "learning_rate": 1.995585823429784e-05, + "loss": 0.9067, + "step": 3962 + }, + { + "epoch": 0.646994000244888, + "grad_norm": 2.370431661605835, + "learning_rate": 1.995582844585356e-05, + "loss": 1.0286, + "step": 3963 + }, + { + "epoch": 0.6471572588873924, + "grad_norm": 1.9145859479904175, + "learning_rate": 1.995579864738376e-05, + "loss": 0.7595, + "step": 3964 + }, + { + "epoch": 0.6473205175298967, + "grad_norm": 2.0103914737701416, + "learning_rate": 1.9955768838888473e-05, + "loss": 0.7468, + "step": 3965 + }, + { + "epoch": 0.6474837761724012, + "grad_norm": 2.1546573638916016, + "learning_rate": 1.9955739020367733e-05, + "loss": 0.819, + "step": 3966 + }, + { + "epoch": 0.6476470348149055, + "grad_norm": 2.047351837158203, + "learning_rate": 1.9955709191821565e-05, + "loss": 0.8153, + "step": 3967 + }, + { + "epoch": 0.6478102934574099, + "grad_norm": 2.5663466453552246, + "learning_rate": 1.995567935325e-05, + "loss": 0.9616, + "step": 3968 + }, + { + "epoch": 0.6479735520999143, + "grad_norm": 2.136587142944336, + "learning_rate": 1.995564950465307e-05, + "loss": 0.7275, + "step": 3969 + }, + { + "epoch": 0.6481368107424187, + "grad_norm": 2.013615846633911, + "learning_rate": 1.99556196460308e-05, + "loss": 0.7894, + "step": 3970 + }, + { + "epoch": 0.648300069384923, + "grad_norm": 2.1864113807678223, + "learning_rate": 1.995558977738323e-05, + "loss": 0.823, + "step": 3971 + }, + { + "epoch": 0.6484633280274275, + "grad_norm": 2.0250179767608643, + "learning_rate": 1.9955559898710377e-05, + "loss": 0.8342, + "step": 3972 + }, + { + "epoch": 0.6486265866699319, + "grad_norm": 2.1324098110198975, + "learning_rate": 1.9955530010012283e-05, + "loss": 0.9318, + "step": 3973 + }, + { + "epoch": 0.6487898453124362, + "grad_norm": 1.7510625123977661, + "learning_rate": 1.995550011128897e-05, + "loss": 0.7564, + "step": 3974 + }, + { + "epoch": 0.6489531039549407, + "grad_norm": 2.1405553817749023, + "learning_rate": 1.9955470202540472e-05, + "loss": 0.8508, + "step": 3975 + }, + { + "epoch": 0.649116362597445, + "grad_norm": 2.23287296295166, + "learning_rate": 1.995544028376682e-05, + "loss": 0.9902, + "step": 3976 + }, + { + "epoch": 0.6492796212399494, + "grad_norm": 1.8539090156555176, + "learning_rate": 1.9955410354968038e-05, + "loss": 0.8302, + "step": 3977 + }, + { + "epoch": 0.6494428798824537, + "grad_norm": 1.7731791734695435, + "learning_rate": 1.9955380416144164e-05, + "loss": 0.6716, + "step": 3978 + }, + { + "epoch": 0.6496061385249582, + "grad_norm": 1.912860631942749, + "learning_rate": 1.995535046729522e-05, + "loss": 0.8558, + "step": 3979 + }, + { + "epoch": 0.6497693971674625, + "grad_norm": 2.142641305923462, + "learning_rate": 1.9955320508421247e-05, + "loss": 0.9237, + "step": 3980 + }, + { + "epoch": 0.649932655809967, + "grad_norm": 1.9753752946853638, + "learning_rate": 1.9955290539522262e-05, + "loss": 0.7924, + "step": 3981 + }, + { + "epoch": 0.6500959144524713, + "grad_norm": 1.8638825416564941, + "learning_rate": 1.9955260560598306e-05, + "loss": 0.7445, + "step": 3982 + }, + { + "epoch": 0.6502591730949757, + "grad_norm": 2.375680685043335, + "learning_rate": 1.9955230571649407e-05, + "loss": 1.0534, + "step": 3983 + }, + { + "epoch": 0.6504224317374802, + "grad_norm": 2.331519842147827, + "learning_rate": 1.9955200572675593e-05, + "loss": 0.7965, + "step": 3984 + }, + { + "epoch": 0.6505856903799845, + "grad_norm": 1.7263696193695068, + "learning_rate": 1.9955170563676892e-05, + "loss": 0.673, + "step": 3985 + }, + { + "epoch": 0.6507489490224889, + "grad_norm": 1.8704642057418823, + "learning_rate": 1.9955140544653336e-05, + "loss": 1.0084, + "step": 3986 + }, + { + "epoch": 0.6509122076649932, + "grad_norm": 1.694795846939087, + "learning_rate": 1.995511051560496e-05, + "loss": 0.7092, + "step": 3987 + }, + { + "epoch": 0.6510754663074977, + "grad_norm": 2.0374913215637207, + "learning_rate": 1.9955080476531788e-05, + "loss": 0.8642, + "step": 3988 + }, + { + "epoch": 0.651238724950002, + "grad_norm": 1.8005086183547974, + "learning_rate": 1.9955050427433857e-05, + "loss": 0.7632, + "step": 3989 + }, + { + "epoch": 0.6514019835925065, + "grad_norm": 2.0485095977783203, + "learning_rate": 1.9955020368311185e-05, + "loss": 0.7562, + "step": 3990 + }, + { + "epoch": 0.6515652422350108, + "grad_norm": 1.7770757675170898, + "learning_rate": 1.9954990299163814e-05, + "loss": 0.6999, + "step": 3991 + }, + { + "epoch": 0.6517285008775152, + "grad_norm": 1.9794975519180298, + "learning_rate": 1.995496021999177e-05, + "loss": 0.7433, + "step": 3992 + }, + { + "epoch": 0.6518917595200195, + "grad_norm": 2.2751927375793457, + "learning_rate": 1.9954930130795084e-05, + "loss": 0.9534, + "step": 3993 + }, + { + "epoch": 0.652055018162524, + "grad_norm": 2.1149494647979736, + "learning_rate": 1.9954900031573788e-05, + "loss": 0.8748, + "step": 3994 + }, + { + "epoch": 0.6522182768050284, + "grad_norm": 2.0718464851379395, + "learning_rate": 1.9954869922327908e-05, + "loss": 0.8226, + "step": 3995 + }, + { + "epoch": 0.6523815354475327, + "grad_norm": 1.923607349395752, + "learning_rate": 1.9954839803057478e-05, + "loss": 0.7806, + "step": 3996 + }, + { + "epoch": 0.6525447940900372, + "grad_norm": 2.0130627155303955, + "learning_rate": 1.9954809673762528e-05, + "loss": 0.8563, + "step": 3997 + }, + { + "epoch": 0.6527080527325415, + "grad_norm": 2.1710920333862305, + "learning_rate": 1.995477953444308e-05, + "loss": 0.9389, + "step": 3998 + }, + { + "epoch": 0.652871311375046, + "grad_norm": 1.9817372560501099, + "learning_rate": 1.995474938509918e-05, + "loss": 0.8255, + "step": 3999 + }, + { + "epoch": 0.6530345700175503, + "grad_norm": 2.010631799697876, + "learning_rate": 1.9954719225730847e-05, + "loss": 0.6978, + "step": 4000 + }, + { + "epoch": 0.6531978286600547, + "grad_norm": 1.9969439506530762, + "learning_rate": 1.9954689056338113e-05, + "loss": 0.7017, + "step": 4001 + }, + { + "epoch": 0.653361087302559, + "grad_norm": 1.8172589540481567, + "learning_rate": 1.9954658876921012e-05, + "loss": 0.6477, + "step": 4002 + }, + { + "epoch": 0.6535243459450635, + "grad_norm": 1.9709951877593994, + "learning_rate": 1.995462868747957e-05, + "loss": 0.8902, + "step": 4003 + }, + { + "epoch": 0.6536876045875678, + "grad_norm": 2.084540605545044, + "learning_rate": 1.9954598488013826e-05, + "loss": 0.8218, + "step": 4004 + }, + { + "epoch": 0.6538508632300722, + "grad_norm": 2.008793354034424, + "learning_rate": 1.9954568278523796e-05, + "loss": 0.8614, + "step": 4005 + }, + { + "epoch": 0.6540141218725767, + "grad_norm": 2.1246957778930664, + "learning_rate": 1.9954538059009523e-05, + "loss": 0.8835, + "step": 4006 + }, + { + "epoch": 0.654177380515081, + "grad_norm": 2.1777400970458984, + "learning_rate": 1.995450782947103e-05, + "loss": 1.0186, + "step": 4007 + }, + { + "epoch": 0.6543406391575854, + "grad_norm": 2.5215978622436523, + "learning_rate": 1.995447758990835e-05, + "loss": 0.9352, + "step": 4008 + }, + { + "epoch": 0.6545038978000898, + "grad_norm": 2.022606372833252, + "learning_rate": 1.9954447340321516e-05, + "loss": 0.9196, + "step": 4009 + }, + { + "epoch": 0.6546671564425942, + "grad_norm": 1.7668960094451904, + "learning_rate": 1.9954417080710557e-05, + "loss": 0.748, + "step": 4010 + }, + { + "epoch": 0.6548304150850985, + "grad_norm": 2.0687551498413086, + "learning_rate": 1.9954386811075502e-05, + "loss": 0.9036, + "step": 4011 + }, + { + "epoch": 0.654993673727603, + "grad_norm": 1.5938286781311035, + "learning_rate": 1.995435653141638e-05, + "loss": 0.6414, + "step": 4012 + }, + { + "epoch": 0.6551569323701073, + "grad_norm": 2.12983775138855, + "learning_rate": 1.9954326241733223e-05, + "loss": 0.9285, + "step": 4013 + }, + { + "epoch": 0.6553201910126117, + "grad_norm": 2.091355085372925, + "learning_rate": 1.9954295942026065e-05, + "loss": 0.9115, + "step": 4014 + }, + { + "epoch": 0.6554834496551161, + "grad_norm": 1.7807918787002563, + "learning_rate": 1.995426563229493e-05, + "loss": 0.6876, + "step": 4015 + }, + { + "epoch": 0.6556467082976205, + "grad_norm": 2.3356451988220215, + "learning_rate": 1.9954235312539855e-05, + "loss": 0.6976, + "step": 4016 + }, + { + "epoch": 0.6558099669401249, + "grad_norm": 2.1427462100982666, + "learning_rate": 1.995420498276087e-05, + "loss": 0.9099, + "step": 4017 + }, + { + "epoch": 0.6559732255826293, + "grad_norm": 1.99425208568573, + "learning_rate": 1.9954174642958e-05, + "loss": 0.8985, + "step": 4018 + }, + { + "epoch": 0.6561364842251337, + "grad_norm": 2.2237167358398438, + "learning_rate": 1.9954144293131275e-05, + "loss": 0.7749, + "step": 4019 + }, + { + "epoch": 0.656299742867638, + "grad_norm": 1.7875380516052246, + "learning_rate": 1.9954113933280737e-05, + "loss": 0.7482, + "step": 4020 + }, + { + "epoch": 0.6564630015101425, + "grad_norm": 1.9485515356063843, + "learning_rate": 1.9954083563406407e-05, + "loss": 0.8339, + "step": 4021 + }, + { + "epoch": 0.6566262601526468, + "grad_norm": 1.8322675228118896, + "learning_rate": 1.9954053183508317e-05, + "loss": 0.7418, + "step": 4022 + }, + { + "epoch": 0.6567895187951512, + "grad_norm": 1.6777312755584717, + "learning_rate": 1.9954022793586492e-05, + "loss": 0.7214, + "step": 4023 + }, + { + "epoch": 0.6569527774376556, + "grad_norm": 2.226450204849243, + "learning_rate": 1.9953992393640975e-05, + "loss": 0.753, + "step": 4024 + }, + { + "epoch": 0.65711603608016, + "grad_norm": 1.9682592153549194, + "learning_rate": 1.9953961983671792e-05, + "loss": 0.7863, + "step": 4025 + }, + { + "epoch": 0.6572792947226643, + "grad_norm": 2.1388142108917236, + "learning_rate": 1.9953931563678966e-05, + "loss": 0.7784, + "step": 4026 + }, + { + "epoch": 0.6574425533651688, + "grad_norm": 2.3465826511383057, + "learning_rate": 1.995390113366254e-05, + "loss": 0.8246, + "step": 4027 + }, + { + "epoch": 0.6576058120076732, + "grad_norm": 1.8669836521148682, + "learning_rate": 1.995387069362253e-05, + "loss": 0.7372, + "step": 4028 + }, + { + "epoch": 0.6577690706501775, + "grad_norm": 2.10113787651062, + "learning_rate": 1.9953840243558982e-05, + "loss": 0.8374, + "step": 4029 + }, + { + "epoch": 0.657932329292682, + "grad_norm": 1.9991933107376099, + "learning_rate": 1.9953809783471917e-05, + "loss": 0.8547, + "step": 4030 + }, + { + "epoch": 0.6580955879351863, + "grad_norm": 1.760053277015686, + "learning_rate": 1.9953779313361368e-05, + "loss": 0.8386, + "step": 4031 + }, + { + "epoch": 0.6582588465776907, + "grad_norm": 2.0603861808776855, + "learning_rate": 1.9953748833227364e-05, + "loss": 0.7094, + "step": 4032 + }, + { + "epoch": 0.6584221052201951, + "grad_norm": 1.8455878496170044, + "learning_rate": 1.995371834306994e-05, + "loss": 0.7988, + "step": 4033 + }, + { + "epoch": 0.6585853638626995, + "grad_norm": 2.02435040473938, + "learning_rate": 1.9953687842889126e-05, + "loss": 0.7722, + "step": 4034 + }, + { + "epoch": 0.6587486225052038, + "grad_norm": 1.9577025175094604, + "learning_rate": 1.9953657332684947e-05, + "loss": 0.8516, + "step": 4035 + }, + { + "epoch": 0.6589118811477083, + "grad_norm": 1.9614293575286865, + "learning_rate": 1.995362681245744e-05, + "loss": 0.7988, + "step": 4036 + }, + { + "epoch": 0.6590751397902126, + "grad_norm": 1.9903169870376587, + "learning_rate": 1.995359628220663e-05, + "loss": 0.8822, + "step": 4037 + }, + { + "epoch": 0.659238398432717, + "grad_norm": 2.0466933250427246, + "learning_rate": 1.995356574193256e-05, + "loss": 0.7112, + "step": 4038 + }, + { + "epoch": 0.6594016570752215, + "grad_norm": 2.1602303981781006, + "learning_rate": 1.9953535191635245e-05, + "loss": 0.8392, + "step": 4039 + }, + { + "epoch": 0.6595649157177258, + "grad_norm": 2.1674540042877197, + "learning_rate": 1.9953504631314722e-05, + "loss": 0.911, + "step": 4040 + }, + { + "epoch": 0.6597281743602302, + "grad_norm": 1.9284535646438599, + "learning_rate": 1.9953474060971024e-05, + "loss": 0.8454, + "step": 4041 + }, + { + "epoch": 0.6598914330027346, + "grad_norm": 1.9467206001281738, + "learning_rate": 1.9953443480604182e-05, + "loss": 0.8372, + "step": 4042 + }, + { + "epoch": 0.660054691645239, + "grad_norm": 1.886176347732544, + "learning_rate": 1.9953412890214223e-05, + "loss": 0.6558, + "step": 4043 + }, + { + "epoch": 0.6602179502877433, + "grad_norm": 1.801827311515808, + "learning_rate": 1.995338228980118e-05, + "loss": 0.7936, + "step": 4044 + }, + { + "epoch": 0.6603812089302478, + "grad_norm": 2.0684685707092285, + "learning_rate": 1.9953351679365086e-05, + "loss": 0.8093, + "step": 4045 + }, + { + "epoch": 0.6605444675727521, + "grad_norm": 2.105072498321533, + "learning_rate": 1.995332105890597e-05, + "loss": 0.8979, + "step": 4046 + }, + { + "epoch": 0.6607077262152565, + "grad_norm": 2.3216841220855713, + "learning_rate": 1.9953290428423857e-05, + "loss": 0.9275, + "step": 4047 + }, + { + "epoch": 0.6608709848577609, + "grad_norm": 2.2121288776397705, + "learning_rate": 1.9953259787918788e-05, + "loss": 0.8287, + "step": 4048 + }, + { + "epoch": 0.6610342435002653, + "grad_norm": 2.5402417182922363, + "learning_rate": 1.9953229137390787e-05, + "loss": 0.6928, + "step": 4049 + }, + { + "epoch": 0.6611975021427697, + "grad_norm": 2.049816846847534, + "learning_rate": 1.9953198476839886e-05, + "loss": 0.8361, + "step": 4050 + }, + { + "epoch": 0.6613607607852741, + "grad_norm": 1.8441988229751587, + "learning_rate": 1.995316780626612e-05, + "loss": 0.7696, + "step": 4051 + }, + { + "epoch": 0.6615240194277785, + "grad_norm": 2.2757208347320557, + "learning_rate": 1.9953137125669513e-05, + "loss": 0.8383, + "step": 4052 + }, + { + "epoch": 0.6616872780702828, + "grad_norm": 1.8649359941482544, + "learning_rate": 1.99531064350501e-05, + "loss": 0.7691, + "step": 4053 + }, + { + "epoch": 0.6618505367127873, + "grad_norm": 1.9752105474472046, + "learning_rate": 1.9953075734407915e-05, + "loss": 0.8499, + "step": 4054 + }, + { + "epoch": 0.6620137953552916, + "grad_norm": 2.0030970573425293, + "learning_rate": 1.9953045023742984e-05, + "loss": 0.857, + "step": 4055 + }, + { + "epoch": 0.662177053997796, + "grad_norm": 2.2038145065307617, + "learning_rate": 1.9953014303055336e-05, + "loss": 0.9591, + "step": 4056 + }, + { + "epoch": 0.6623403126403004, + "grad_norm": 1.8572810888290405, + "learning_rate": 1.9952983572345008e-05, + "loss": 0.8644, + "step": 4057 + }, + { + "epoch": 0.6625035712828048, + "grad_norm": 1.8880647420883179, + "learning_rate": 1.9952952831612027e-05, + "loss": 0.7326, + "step": 4058 + }, + { + "epoch": 0.6626668299253091, + "grad_norm": 1.7331615686416626, + "learning_rate": 1.9952922080856427e-05, + "loss": 0.7337, + "step": 4059 + }, + { + "epoch": 0.6628300885678136, + "grad_norm": 1.9105256795883179, + "learning_rate": 1.9952891320078235e-05, + "loss": 0.7964, + "step": 4060 + }, + { + "epoch": 0.662993347210318, + "grad_norm": 1.8040122985839844, + "learning_rate": 1.9952860549277485e-05, + "loss": 0.629, + "step": 4061 + }, + { + "epoch": 0.6631566058528223, + "grad_norm": 2.099109649658203, + "learning_rate": 1.9952829768454208e-05, + "loss": 0.8116, + "step": 4062 + }, + { + "epoch": 0.6633198644953268, + "grad_norm": 2.135161876678467, + "learning_rate": 1.995279897760843e-05, + "loss": 0.8711, + "step": 4063 + }, + { + "epoch": 0.6634831231378311, + "grad_norm": 2.1493113040924072, + "learning_rate": 1.9952768176740193e-05, + "loss": 0.8911, + "step": 4064 + }, + { + "epoch": 0.6636463817803355, + "grad_norm": 2.266852855682373, + "learning_rate": 1.9952737365849516e-05, + "loss": 0.9986, + "step": 4065 + }, + { + "epoch": 0.6638096404228399, + "grad_norm": 2.052060604095459, + "learning_rate": 1.9952706544936437e-05, + "loss": 0.8148, + "step": 4066 + }, + { + "epoch": 0.6639728990653443, + "grad_norm": 1.8236923217773438, + "learning_rate": 1.9952675714000983e-05, + "loss": 0.7644, + "step": 4067 + }, + { + "epoch": 0.6641361577078486, + "grad_norm": 1.8633387088775635, + "learning_rate": 1.995264487304319e-05, + "loss": 0.7575, + "step": 4068 + }, + { + "epoch": 0.6642994163503531, + "grad_norm": 2.448219060897827, + "learning_rate": 1.9952614022063085e-05, + "loss": 0.9747, + "step": 4069 + }, + { + "epoch": 0.6644626749928574, + "grad_norm": 2.0879111289978027, + "learning_rate": 1.9952583161060702e-05, + "loss": 0.8959, + "step": 4070 + }, + { + "epoch": 0.6646259336353618, + "grad_norm": 2.194457530975342, + "learning_rate": 1.9952552290036066e-05, + "loss": 1.5685, + "step": 4071 + }, + { + "epoch": 0.6647891922778663, + "grad_norm": 2.1314384937286377, + "learning_rate": 1.9952521408989215e-05, + "loss": 0.8554, + "step": 4072 + }, + { + "epoch": 0.6649524509203706, + "grad_norm": 1.9167932271957397, + "learning_rate": 1.9952490517920178e-05, + "loss": 0.8732, + "step": 4073 + }, + { + "epoch": 0.665115709562875, + "grad_norm": 2.3634629249572754, + "learning_rate": 1.9952459616828986e-05, + "loss": 0.7868, + "step": 4074 + }, + { + "epoch": 0.6652789682053794, + "grad_norm": 2.101855993270874, + "learning_rate": 1.995242870571567e-05, + "loss": 0.8442, + "step": 4075 + }, + { + "epoch": 0.6654422268478838, + "grad_norm": 1.9536573886871338, + "learning_rate": 1.995239778458026e-05, + "loss": 0.8829, + "step": 4076 + }, + { + "epoch": 0.6656054854903881, + "grad_norm": 1.6369829177856445, + "learning_rate": 1.995236685342279e-05, + "loss": 0.7549, + "step": 4077 + }, + { + "epoch": 0.6657687441328926, + "grad_norm": 2.187455654144287, + "learning_rate": 1.9952335912243284e-05, + "loss": 0.7359, + "step": 4078 + }, + { + "epoch": 0.6659320027753969, + "grad_norm": 2.4019758701324463, + "learning_rate": 1.9952304961041783e-05, + "loss": 0.9003, + "step": 4079 + }, + { + "epoch": 0.6660952614179013, + "grad_norm": 1.9343417882919312, + "learning_rate": 1.9952273999818312e-05, + "loss": 0.7625, + "step": 4080 + }, + { + "epoch": 0.6662585200604056, + "grad_norm": 2.135756015777588, + "learning_rate": 1.9952243028572904e-05, + "loss": 0.9449, + "step": 4081 + }, + { + "epoch": 0.6664217787029101, + "grad_norm": 2.0714633464813232, + "learning_rate": 1.9952212047305592e-05, + "loss": 0.7356, + "step": 4082 + }, + { + "epoch": 0.6665850373454145, + "grad_norm": 1.6581432819366455, + "learning_rate": 1.9952181056016403e-05, + "loss": 0.76, + "step": 4083 + }, + { + "epoch": 0.6667482959879188, + "grad_norm": 2.813175678253174, + "learning_rate": 1.995215005470537e-05, + "loss": 0.9086, + "step": 4084 + }, + { + "epoch": 0.6669115546304233, + "grad_norm": 2.511840343475342, + "learning_rate": 1.9952119043372526e-05, + "loss": 0.9545, + "step": 4085 + }, + { + "epoch": 0.6670748132729276, + "grad_norm": 1.8923448324203491, + "learning_rate": 1.99520880220179e-05, + "loss": 0.7757, + "step": 4086 + }, + { + "epoch": 0.667238071915432, + "grad_norm": 1.7587465047836304, + "learning_rate": 1.9952056990641523e-05, + "loss": 0.5989, + "step": 4087 + }, + { + "epoch": 0.6674013305579364, + "grad_norm": 1.9205251932144165, + "learning_rate": 1.9952025949243427e-05, + "loss": 0.6685, + "step": 4088 + }, + { + "epoch": 0.6675645892004408, + "grad_norm": 2.4398887157440186, + "learning_rate": 1.9951994897823647e-05, + "loss": 0.9893, + "step": 4089 + }, + { + "epoch": 0.6677278478429451, + "grad_norm": 2.3446192741394043, + "learning_rate": 1.9951963836382206e-05, + "loss": 0.7545, + "step": 4090 + }, + { + "epoch": 0.6678911064854496, + "grad_norm": 1.7858868837356567, + "learning_rate": 1.9951932764919143e-05, + "loss": 0.6628, + "step": 4091 + }, + { + "epoch": 0.6680543651279539, + "grad_norm": 1.98744535446167, + "learning_rate": 1.9951901683434487e-05, + "loss": 0.7449, + "step": 4092 + }, + { + "epoch": 0.6682176237704583, + "grad_norm": 1.9634106159210205, + "learning_rate": 1.9951870591928266e-05, + "loss": 0.9274, + "step": 4093 + }, + { + "epoch": 0.6683808824129628, + "grad_norm": 1.6654844284057617, + "learning_rate": 1.9951839490400514e-05, + "loss": 0.6702, + "step": 4094 + }, + { + "epoch": 0.6685441410554671, + "grad_norm": 1.8778430223464966, + "learning_rate": 1.9951808378851264e-05, + "loss": 0.7187, + "step": 4095 + }, + { + "epoch": 0.6687073996979715, + "grad_norm": 1.9327142238616943, + "learning_rate": 1.9951777257280547e-05, + "loss": 0.907, + "step": 4096 + }, + { + "epoch": 0.6688706583404759, + "grad_norm": 1.9685276746749878, + "learning_rate": 1.995174612568839e-05, + "loss": 0.7314, + "step": 4097 + }, + { + "epoch": 0.6690339169829803, + "grad_norm": 1.7189851999282837, + "learning_rate": 1.995171498407483e-05, + "loss": 0.8234, + "step": 4098 + }, + { + "epoch": 0.6691971756254846, + "grad_norm": 1.8868552446365356, + "learning_rate": 1.9951683832439892e-05, + "loss": 0.8507, + "step": 4099 + }, + { + "epoch": 0.6693604342679891, + "grad_norm": 2.4335665702819824, + "learning_rate": 1.9951652670783615e-05, + "loss": 0.8789, + "step": 4100 + }, + { + "epoch": 0.6695236929104934, + "grad_norm": 2.0909385681152344, + "learning_rate": 1.9951621499106024e-05, + "loss": 0.7983, + "step": 4101 + }, + { + "epoch": 0.6696869515529978, + "grad_norm": 1.7976733446121216, + "learning_rate": 1.9951590317407152e-05, + "loss": 0.6667, + "step": 4102 + }, + { + "epoch": 0.6698502101955022, + "grad_norm": 2.168764352798462, + "learning_rate": 1.9951559125687033e-05, + "loss": 0.7478, + "step": 4103 + }, + { + "epoch": 0.6700134688380066, + "grad_norm": 1.8159606456756592, + "learning_rate": 1.9951527923945698e-05, + "loss": 0.7724, + "step": 4104 + }, + { + "epoch": 0.670176727480511, + "grad_norm": 1.8856970071792603, + "learning_rate": 1.9951496712183177e-05, + "loss": 0.7946, + "step": 4105 + }, + { + "epoch": 0.6703399861230154, + "grad_norm": 1.654207706451416, + "learning_rate": 1.9951465490399497e-05, + "loss": 0.6858, + "step": 4106 + }, + { + "epoch": 0.6705032447655198, + "grad_norm": 1.956335425376892, + "learning_rate": 1.9951434258594696e-05, + "loss": 0.8269, + "step": 4107 + }, + { + "epoch": 0.6706665034080241, + "grad_norm": 1.9062511920928955, + "learning_rate": 1.9951403016768804e-05, + "loss": 0.7978, + "step": 4108 + }, + { + "epoch": 0.6708297620505286, + "grad_norm": 1.8814045190811157, + "learning_rate": 1.9951371764921852e-05, + "loss": 0.8816, + "step": 4109 + }, + { + "epoch": 0.6709930206930329, + "grad_norm": 2.0050930976867676, + "learning_rate": 1.995134050305387e-05, + "loss": 0.8296, + "step": 4110 + }, + { + "epoch": 0.6711562793355373, + "grad_norm": 2.005835771560669, + "learning_rate": 1.995130923116489e-05, + "loss": 0.791, + "step": 4111 + }, + { + "epoch": 0.6713195379780417, + "grad_norm": 1.7616783380508423, + "learning_rate": 1.9951277949254947e-05, + "loss": 0.6316, + "step": 4112 + }, + { + "epoch": 0.6714827966205461, + "grad_norm": 2.390568733215332, + "learning_rate": 1.995124665732407e-05, + "loss": 0.7934, + "step": 4113 + }, + { + "epoch": 0.6716460552630504, + "grad_norm": 2.151191473007202, + "learning_rate": 1.9951215355372287e-05, + "loss": 0.9134, + "step": 4114 + }, + { + "epoch": 0.6718093139055549, + "grad_norm": 1.9410500526428223, + "learning_rate": 1.9951184043399636e-05, + "loss": 0.7673, + "step": 4115 + }, + { + "epoch": 0.6719725725480593, + "grad_norm": 2.383833408355713, + "learning_rate": 1.9951152721406145e-05, + "loss": 0.7873, + "step": 4116 + }, + { + "epoch": 0.6721358311905636, + "grad_norm": 1.8013930320739746, + "learning_rate": 1.9951121389391844e-05, + "loss": 0.7774, + "step": 4117 + }, + { + "epoch": 0.6722990898330681, + "grad_norm": 1.7600305080413818, + "learning_rate": 1.9951090047356767e-05, + "loss": 0.6834, + "step": 4118 + }, + { + "epoch": 0.6724623484755724, + "grad_norm": 1.7398566007614136, + "learning_rate": 1.9951058695300945e-05, + "loss": 0.6073, + "step": 4119 + }, + { + "epoch": 0.6726256071180768, + "grad_norm": 1.8340108394622803, + "learning_rate": 1.995102733322441e-05, + "loss": 0.7637, + "step": 4120 + }, + { + "epoch": 0.6727888657605812, + "grad_norm": 1.9868963956832886, + "learning_rate": 1.9950995961127193e-05, + "loss": 0.969, + "step": 4121 + }, + { + "epoch": 0.6729521244030856, + "grad_norm": 2.232551097869873, + "learning_rate": 1.9950964579009328e-05, + "loss": 0.7824, + "step": 4122 + }, + { + "epoch": 0.6731153830455899, + "grad_norm": 1.8406410217285156, + "learning_rate": 1.995093318687084e-05, + "loss": 0.7621, + "step": 4123 + }, + { + "epoch": 0.6732786416880944, + "grad_norm": 1.7530088424682617, + "learning_rate": 1.9950901784711765e-05, + "loss": 0.7559, + "step": 4124 + }, + { + "epoch": 0.6734419003305987, + "grad_norm": 2.102611541748047, + "learning_rate": 1.9950870372532138e-05, + "loss": 0.7479, + "step": 4125 + }, + { + "epoch": 0.6736051589731031, + "grad_norm": 1.8543158769607544, + "learning_rate": 1.9950838950331986e-05, + "loss": 0.8004, + "step": 4126 + }, + { + "epoch": 0.6737684176156076, + "grad_norm": 1.7378910779953003, + "learning_rate": 1.9950807518111342e-05, + "loss": 0.8071, + "step": 4127 + }, + { + "epoch": 0.6739316762581119, + "grad_norm": 2.0280206203460693, + "learning_rate": 1.9950776075870235e-05, + "loss": 0.8395, + "step": 4128 + }, + { + "epoch": 0.6740949349006163, + "grad_norm": 2.075049638748169, + "learning_rate": 1.99507446236087e-05, + "loss": 0.9797, + "step": 4129 + }, + { + "epoch": 0.6742581935431207, + "grad_norm": 1.668195128440857, + "learning_rate": 1.995071316132677e-05, + "loss": 0.6875, + "step": 4130 + }, + { + "epoch": 0.6744214521856251, + "grad_norm": 2.2895493507385254, + "learning_rate": 1.995068168902447e-05, + "loss": 0.9833, + "step": 4131 + }, + { + "epoch": 0.6745847108281294, + "grad_norm": 1.9771149158477783, + "learning_rate": 1.995065020670184e-05, + "loss": 0.7198, + "step": 4132 + }, + { + "epoch": 0.6747479694706339, + "grad_norm": 1.9430091381072998, + "learning_rate": 1.9950618714358908e-05, + "loss": 0.8313, + "step": 4133 + }, + { + "epoch": 0.6749112281131382, + "grad_norm": 1.8720214366912842, + "learning_rate": 1.9950587211995707e-05, + "loss": 0.6809, + "step": 4134 + }, + { + "epoch": 0.6750744867556426, + "grad_norm": 2.122695207595825, + "learning_rate": 1.9950555699612265e-05, + "loss": 0.9174, + "step": 4135 + }, + { + "epoch": 0.675237745398147, + "grad_norm": 2.1797494888305664, + "learning_rate": 1.9950524177208614e-05, + "loss": 0.7063, + "step": 4136 + }, + { + "epoch": 0.6754010040406514, + "grad_norm": 1.9694017171859741, + "learning_rate": 1.9950492644784793e-05, + "loss": 0.7874, + "step": 4137 + }, + { + "epoch": 0.6755642626831558, + "grad_norm": 1.819265604019165, + "learning_rate": 1.9950461102340823e-05, + "loss": 0.6381, + "step": 4138 + }, + { + "epoch": 0.6757275213256602, + "grad_norm": 2.2781858444213867, + "learning_rate": 1.9950429549876748e-05, + "loss": 0.7844, + "step": 4139 + }, + { + "epoch": 0.6758907799681646, + "grad_norm": 2.3974759578704834, + "learning_rate": 1.9950397987392588e-05, + "loss": 0.8841, + "step": 4140 + }, + { + "epoch": 0.6760540386106689, + "grad_norm": 2.2151107788085938, + "learning_rate": 1.995036641488838e-05, + "loss": 0.8983, + "step": 4141 + }, + { + "epoch": 0.6762172972531734, + "grad_norm": 2.004086494445801, + "learning_rate": 1.9950334832364157e-05, + "loss": 0.8643, + "step": 4142 + }, + { + "epoch": 0.6763805558956777, + "grad_norm": 2.504765272140503, + "learning_rate": 1.9950303239819946e-05, + "loss": 0.9706, + "step": 4143 + }, + { + "epoch": 0.6765438145381821, + "grad_norm": 1.9661954641342163, + "learning_rate": 1.9950271637255787e-05, + "loss": 0.9314, + "step": 4144 + }, + { + "epoch": 0.6767070731806865, + "grad_norm": 1.8291016817092896, + "learning_rate": 1.9950240024671705e-05, + "loss": 0.7036, + "step": 4145 + }, + { + "epoch": 0.6768703318231909, + "grad_norm": 1.9649732112884521, + "learning_rate": 1.9950208402067735e-05, + "loss": 0.8265, + "step": 4146 + }, + { + "epoch": 0.6770335904656952, + "grad_norm": 1.8695887327194214, + "learning_rate": 1.9950176769443908e-05, + "loss": 0.7423, + "step": 4147 + }, + { + "epoch": 0.6771968491081997, + "grad_norm": 2.0008432865142822, + "learning_rate": 1.9950145126800253e-05, + "loss": 0.7986, + "step": 4148 + }, + { + "epoch": 0.6773601077507041, + "grad_norm": 1.8796616792678833, + "learning_rate": 1.9950113474136805e-05, + "loss": 0.7066, + "step": 4149 + }, + { + "epoch": 0.6775233663932084, + "grad_norm": 1.6945209503173828, + "learning_rate": 1.9950081811453598e-05, + "loss": 0.8076, + "step": 4150 + }, + { + "epoch": 0.6776866250357129, + "grad_norm": 1.897228479385376, + "learning_rate": 1.9950050138750662e-05, + "loss": 0.708, + "step": 4151 + }, + { + "epoch": 0.6778498836782172, + "grad_norm": 1.8645113706588745, + "learning_rate": 1.9950018456028024e-05, + "loss": 0.8532, + "step": 4152 + }, + { + "epoch": 0.6780131423207216, + "grad_norm": 2.045151948928833, + "learning_rate": 1.994998676328572e-05, + "loss": 0.7437, + "step": 4153 + }, + { + "epoch": 0.678176400963226, + "grad_norm": 1.9175703525543213, + "learning_rate": 1.9949955060523784e-05, + "loss": 0.8108, + "step": 4154 + }, + { + "epoch": 0.6783396596057304, + "grad_norm": 2.1187527179718018, + "learning_rate": 1.9949923347742248e-05, + "loss": 0.8112, + "step": 4155 + }, + { + "epoch": 0.6785029182482347, + "grad_norm": 1.9319872856140137, + "learning_rate": 1.9949891624941138e-05, + "loss": 0.8092, + "step": 4156 + }, + { + "epoch": 0.6786661768907392, + "grad_norm": 2.0780091285705566, + "learning_rate": 1.9949859892120492e-05, + "loss": 0.7458, + "step": 4157 + }, + { + "epoch": 0.6788294355332435, + "grad_norm": 2.0599660873413086, + "learning_rate": 1.9949828149280338e-05, + "loss": 0.903, + "step": 4158 + }, + { + "epoch": 0.6789926941757479, + "grad_norm": 2.172431230545044, + "learning_rate": 1.9949796396420712e-05, + "loss": 0.8421, + "step": 4159 + }, + { + "epoch": 0.6791559528182524, + "grad_norm": 1.6010431051254272, + "learning_rate": 1.9949764633541645e-05, + "loss": 0.6038, + "step": 4160 + }, + { + "epoch": 0.6793192114607567, + "grad_norm": 1.9309957027435303, + "learning_rate": 1.994973286064316e-05, + "loss": 0.9091, + "step": 4161 + }, + { + "epoch": 0.6794824701032611, + "grad_norm": 1.9657368659973145, + "learning_rate": 1.9949701077725304e-05, + "loss": 0.7011, + "step": 4162 + }, + { + "epoch": 0.6796457287457655, + "grad_norm": 2.3205385208129883, + "learning_rate": 1.99496692847881e-05, + "loss": 1.0717, + "step": 4163 + }, + { + "epoch": 0.6798089873882699, + "grad_norm": 1.9440120458602905, + "learning_rate": 1.994963748183158e-05, + "loss": 0.7, + "step": 4164 + }, + { + "epoch": 0.6799722460307742, + "grad_norm": 1.8481014966964722, + "learning_rate": 1.994960566885578e-05, + "loss": 0.9877, + "step": 4165 + }, + { + "epoch": 0.6801355046732787, + "grad_norm": 2.0271425247192383, + "learning_rate": 1.9949573845860727e-05, + "loss": 0.9173, + "step": 4166 + }, + { + "epoch": 0.680298763315783, + "grad_norm": 1.9089303016662598, + "learning_rate": 1.994954201284646e-05, + "loss": 0.6098, + "step": 4167 + }, + { + "epoch": 0.6804620219582874, + "grad_norm": 2.172492742538452, + "learning_rate": 1.9949510169813006e-05, + "loss": 0.8572, + "step": 4168 + }, + { + "epoch": 0.6806252806007919, + "grad_norm": 1.8873982429504395, + "learning_rate": 1.9949478316760394e-05, + "loss": 0.757, + "step": 4169 + }, + { + "epoch": 0.6807885392432962, + "grad_norm": 1.919996976852417, + "learning_rate": 1.994944645368866e-05, + "loss": 1.0166, + "step": 4170 + }, + { + "epoch": 0.6809517978858006, + "grad_norm": 2.1866002082824707, + "learning_rate": 1.994941458059784e-05, + "loss": 0.9012, + "step": 4171 + }, + { + "epoch": 0.681115056528305, + "grad_norm": 1.8691411018371582, + "learning_rate": 1.994938269748796e-05, + "loss": 0.6651, + "step": 4172 + }, + { + "epoch": 0.6812783151708094, + "grad_norm": 2.1386609077453613, + "learning_rate": 1.9949350804359057e-05, + "loss": 0.7147, + "step": 4173 + }, + { + "epoch": 0.6814415738133137, + "grad_norm": 1.673698902130127, + "learning_rate": 1.9949318901211155e-05, + "loss": 0.7399, + "step": 4174 + }, + { + "epoch": 0.6816048324558182, + "grad_norm": 2.0039803981781006, + "learning_rate": 1.9949286988044293e-05, + "loss": 0.8556, + "step": 4175 + }, + { + "epoch": 0.6817680910983225, + "grad_norm": 1.7296161651611328, + "learning_rate": 1.9949255064858505e-05, + "loss": 0.5359, + "step": 4176 + }, + { + "epoch": 0.6819313497408269, + "grad_norm": 2.1121556758880615, + "learning_rate": 1.9949223131653818e-05, + "loss": 0.8199, + "step": 4177 + }, + { + "epoch": 0.6820946083833312, + "grad_norm": 2.249812602996826, + "learning_rate": 1.9949191188430266e-05, + "loss": 0.7857, + "step": 4178 + }, + { + "epoch": 0.6822578670258357, + "grad_norm": 1.7473031282424927, + "learning_rate": 1.994915923518788e-05, + "loss": 0.6729, + "step": 4179 + }, + { + "epoch": 0.6824211256683401, + "grad_norm": 2.116805076599121, + "learning_rate": 1.9949127271926696e-05, + "loss": 0.8303, + "step": 4180 + }, + { + "epoch": 0.6825843843108444, + "grad_norm": 1.9649015665054321, + "learning_rate": 1.994909529864674e-05, + "loss": 0.7353, + "step": 4181 + }, + { + "epoch": 0.6827476429533489, + "grad_norm": 2.056251287460327, + "learning_rate": 1.994906331534805e-05, + "loss": 0.8865, + "step": 4182 + }, + { + "epoch": 0.6829109015958532, + "grad_norm": 2.0325703620910645, + "learning_rate": 1.9949031322030654e-05, + "loss": 0.7782, + "step": 4183 + }, + { + "epoch": 0.6830741602383577, + "grad_norm": 1.781244158744812, + "learning_rate": 1.994899931869459e-05, + "loss": 0.6667, + "step": 4184 + }, + { + "epoch": 0.683237418880862, + "grad_norm": 2.1162655353546143, + "learning_rate": 1.994896730533988e-05, + "loss": 0.8312, + "step": 4185 + }, + { + "epoch": 0.6834006775233664, + "grad_norm": 1.973695158958435, + "learning_rate": 1.994893528196657e-05, + "loss": 0.8368, + "step": 4186 + }, + { + "epoch": 0.6835639361658707, + "grad_norm": 1.8510578870773315, + "learning_rate": 1.994890324857468e-05, + "loss": 0.8579, + "step": 4187 + }, + { + "epoch": 0.6837271948083752, + "grad_norm": 2.1016201972961426, + "learning_rate": 1.994887120516425e-05, + "loss": 0.8654, + "step": 4188 + }, + { + "epoch": 0.6838904534508795, + "grad_norm": 2.2568845748901367, + "learning_rate": 1.9948839151735305e-05, + "loss": 1.0038, + "step": 4189 + }, + { + "epoch": 0.684053712093384, + "grad_norm": 2.1826863288879395, + "learning_rate": 1.9948807088287884e-05, + "loss": 0.885, + "step": 4190 + }, + { + "epoch": 0.6842169707358884, + "grad_norm": 2.1392626762390137, + "learning_rate": 1.9948775014822016e-05, + "loss": 0.7477, + "step": 4191 + }, + { + "epoch": 0.6843802293783927, + "grad_norm": 2.1337029933929443, + "learning_rate": 1.9948742931337736e-05, + "loss": 0.83, + "step": 4192 + }, + { + "epoch": 0.6845434880208972, + "grad_norm": 2.0879764556884766, + "learning_rate": 1.9948710837835072e-05, + "loss": 0.7925, + "step": 4193 + }, + { + "epoch": 0.6847067466634015, + "grad_norm": 2.0691444873809814, + "learning_rate": 1.9948678734314062e-05, + "loss": 0.835, + "step": 4194 + }, + { + "epoch": 0.6848700053059059, + "grad_norm": 1.8288135528564453, + "learning_rate": 1.9948646620774733e-05, + "loss": 0.7324, + "step": 4195 + }, + { + "epoch": 0.6850332639484102, + "grad_norm": 1.9561655521392822, + "learning_rate": 1.994861449721712e-05, + "loss": 0.8832, + "step": 4196 + }, + { + "epoch": 0.6851965225909147, + "grad_norm": 2.3786544799804688, + "learning_rate": 1.9948582363641254e-05, + "loss": 0.9468, + "step": 4197 + }, + { + "epoch": 0.685359781233419, + "grad_norm": 1.624997615814209, + "learning_rate": 1.9948550220047173e-05, + "loss": 0.6864, + "step": 4198 + }, + { + "epoch": 0.6855230398759234, + "grad_norm": 2.4162254333496094, + "learning_rate": 1.9948518066434898e-05, + "loss": 1.0107, + "step": 4199 + }, + { + "epoch": 0.6856862985184278, + "grad_norm": 1.8422808647155762, + "learning_rate": 1.9948485902804472e-05, + "loss": 0.7443, + "step": 4200 + }, + { + "epoch": 0.6858495571609322, + "grad_norm": 1.6834032535552979, + "learning_rate": 1.994845372915592e-05, + "loss": 0.7019, + "step": 4201 + }, + { + "epoch": 0.6860128158034366, + "grad_norm": 1.7679576873779297, + "learning_rate": 1.994842154548928e-05, + "loss": 0.6552, + "step": 4202 + }, + { + "epoch": 0.686176074445941, + "grad_norm": 1.9738646745681763, + "learning_rate": 1.994838935180458e-05, + "loss": 0.9114, + "step": 4203 + }, + { + "epoch": 0.6863393330884454, + "grad_norm": 1.7693486213684082, + "learning_rate": 1.994835714810186e-05, + "loss": 0.761, + "step": 4204 + }, + { + "epoch": 0.6865025917309497, + "grad_norm": 2.203340530395508, + "learning_rate": 1.9948324934381142e-05, + "loss": 1.3468, + "step": 4205 + }, + { + "epoch": 0.6866658503734542, + "grad_norm": 1.9346898794174194, + "learning_rate": 1.9948292710642464e-05, + "loss": 0.8564, + "step": 4206 + }, + { + "epoch": 0.6868291090159585, + "grad_norm": 2.4551684856414795, + "learning_rate": 1.994826047688586e-05, + "loss": 0.9484, + "step": 4207 + }, + { + "epoch": 0.6869923676584629, + "grad_norm": 2.2222578525543213, + "learning_rate": 1.994822823311136e-05, + "loss": 0.7566, + "step": 4208 + }, + { + "epoch": 0.6871556263009673, + "grad_norm": 1.671071171760559, + "learning_rate": 1.9948195979318995e-05, + "loss": 0.7242, + "step": 4209 + }, + { + "epoch": 0.6873188849434717, + "grad_norm": 2.14620304107666, + "learning_rate": 1.99481637155088e-05, + "loss": 0.8451, + "step": 4210 + }, + { + "epoch": 0.687482143585976, + "grad_norm": 2.0595812797546387, + "learning_rate": 1.994813144168081e-05, + "loss": 0.9181, + "step": 4211 + }, + { + "epoch": 0.6876454022284805, + "grad_norm": 1.9789401292800903, + "learning_rate": 1.994809915783505e-05, + "loss": 0.7538, + "step": 4212 + }, + { + "epoch": 0.6878086608709849, + "grad_norm": 1.8737579584121704, + "learning_rate": 1.9948066863971556e-05, + "loss": 0.7529, + "step": 4213 + }, + { + "epoch": 0.6879719195134892, + "grad_norm": 1.8760442733764648, + "learning_rate": 1.9948034560090364e-05, + "loss": 0.944, + "step": 4214 + }, + { + "epoch": 0.6881351781559937, + "grad_norm": 2.2963998317718506, + "learning_rate": 1.9948002246191503e-05, + "loss": 1.4175, + "step": 4215 + }, + { + "epoch": 0.688298436798498, + "grad_norm": 1.8749713897705078, + "learning_rate": 1.9947969922275007e-05, + "loss": 0.82, + "step": 4216 + }, + { + "epoch": 0.6884616954410024, + "grad_norm": 2.7313199043273926, + "learning_rate": 1.9947937588340907e-05, + "loss": 0.9915, + "step": 4217 + }, + { + "epoch": 0.6886249540835068, + "grad_norm": 1.8387986421585083, + "learning_rate": 1.9947905244389235e-05, + "loss": 0.6928, + "step": 4218 + }, + { + "epoch": 0.6887882127260112, + "grad_norm": 1.822058081626892, + "learning_rate": 1.994787289042003e-05, + "loss": 0.857, + "step": 4219 + }, + { + "epoch": 0.6889514713685155, + "grad_norm": 2.0355279445648193, + "learning_rate": 1.9947840526433316e-05, + "loss": 0.8637, + "step": 4220 + }, + { + "epoch": 0.68911473001102, + "grad_norm": 2.2732481956481934, + "learning_rate": 1.994780815242913e-05, + "loss": 0.8268, + "step": 4221 + }, + { + "epoch": 0.6892779886535243, + "grad_norm": 2.1448006629943848, + "learning_rate": 1.9947775768407504e-05, + "loss": 0.6977, + "step": 4222 + }, + { + "epoch": 0.6894412472960287, + "grad_norm": 2.252592086791992, + "learning_rate": 1.9947743374368467e-05, + "loss": 0.8878, + "step": 4223 + }, + { + "epoch": 0.6896045059385332, + "grad_norm": 2.1373229026794434, + "learning_rate": 1.994771097031206e-05, + "loss": 0.8448, + "step": 4224 + }, + { + "epoch": 0.6897677645810375, + "grad_norm": 1.992712140083313, + "learning_rate": 1.994767855623831e-05, + "loss": 0.9574, + "step": 4225 + }, + { + "epoch": 0.6899310232235419, + "grad_norm": 1.8699966669082642, + "learning_rate": 1.9947646132147248e-05, + "loss": 0.834, + "step": 4226 + }, + { + "epoch": 0.6900942818660463, + "grad_norm": 1.740078091621399, + "learning_rate": 1.994761369803891e-05, + "loss": 0.7823, + "step": 4227 + }, + { + "epoch": 0.6902575405085507, + "grad_norm": 1.994172215461731, + "learning_rate": 1.994758125391333e-05, + "loss": 0.7185, + "step": 4228 + }, + { + "epoch": 0.690420799151055, + "grad_norm": 1.887046456336975, + "learning_rate": 1.9947548799770535e-05, + "loss": 0.8063, + "step": 4229 + }, + { + "epoch": 0.6905840577935595, + "grad_norm": 2.1959755420684814, + "learning_rate": 1.9947516335610563e-05, + "loss": 1.3874, + "step": 4230 + }, + { + "epoch": 0.6907473164360638, + "grad_norm": 2.1626553535461426, + "learning_rate": 1.9947483861433444e-05, + "loss": 0.8176, + "step": 4231 + }, + { + "epoch": 0.6909105750785682, + "grad_norm": 2.0679123401641846, + "learning_rate": 1.9947451377239212e-05, + "loss": 0.7845, + "step": 4232 + }, + { + "epoch": 0.6910738337210726, + "grad_norm": 2.1547374725341797, + "learning_rate": 1.9947418883027894e-05, + "loss": 0.8036, + "step": 4233 + }, + { + "epoch": 0.691237092363577, + "grad_norm": 2.0083796977996826, + "learning_rate": 1.9947386378799534e-05, + "loss": 0.8588, + "step": 4234 + }, + { + "epoch": 0.6914003510060814, + "grad_norm": 2.129823684692383, + "learning_rate": 1.994735386455416e-05, + "loss": 0.8286, + "step": 4235 + }, + { + "epoch": 0.6915636096485858, + "grad_norm": 1.9867092370986938, + "learning_rate": 1.99473213402918e-05, + "loss": 0.7711, + "step": 4236 + }, + { + "epoch": 0.6917268682910902, + "grad_norm": 1.8291975259780884, + "learning_rate": 1.994728880601249e-05, + "loss": 0.7234, + "step": 4237 + }, + { + "epoch": 0.6918901269335945, + "grad_norm": 2.030412435531616, + "learning_rate": 1.994725626171626e-05, + "loss": 0.8301, + "step": 4238 + }, + { + "epoch": 0.692053385576099, + "grad_norm": 2.0023584365844727, + "learning_rate": 1.9947223707403148e-05, + "loss": 0.7924, + "step": 4239 + }, + { + "epoch": 0.6922166442186033, + "grad_norm": 1.7801696062088013, + "learning_rate": 1.9947191143073185e-05, + "loss": 0.7787, + "step": 4240 + }, + { + "epoch": 0.6923799028611077, + "grad_norm": 1.8166919946670532, + "learning_rate": 1.9947158568726404e-05, + "loss": 0.7034, + "step": 4241 + }, + { + "epoch": 0.6925431615036121, + "grad_norm": 2.013502359390259, + "learning_rate": 1.9947125984362835e-05, + "loss": 0.7165, + "step": 4242 + }, + { + "epoch": 0.6927064201461165, + "grad_norm": 2.4362733364105225, + "learning_rate": 1.9947093389982515e-05, + "loss": 0.9341, + "step": 4243 + }, + { + "epoch": 0.6928696787886208, + "grad_norm": 1.9451409578323364, + "learning_rate": 1.9947060785585472e-05, + "loss": 0.697, + "step": 4244 + }, + { + "epoch": 0.6930329374311253, + "grad_norm": 1.8926961421966553, + "learning_rate": 1.9947028171171742e-05, + "loss": 0.7593, + "step": 4245 + }, + { + "epoch": 0.6931961960736297, + "grad_norm": 2.113431930541992, + "learning_rate": 1.994699554674136e-05, + "loss": 0.9076, + "step": 4246 + }, + { + "epoch": 0.693359454716134, + "grad_norm": 1.7997654676437378, + "learning_rate": 1.9946962912294356e-05, + "loss": 0.7661, + "step": 4247 + }, + { + "epoch": 0.6935227133586385, + "grad_norm": 2.130751848220825, + "learning_rate": 1.9946930267830757e-05, + "loss": 0.9527, + "step": 4248 + }, + { + "epoch": 0.6936859720011428, + "grad_norm": 2.097210168838501, + "learning_rate": 1.9946897613350607e-05, + "loss": 0.8686, + "step": 4249 + }, + { + "epoch": 0.6938492306436472, + "grad_norm": 2.080037832260132, + "learning_rate": 1.9946864948853936e-05, + "loss": 0.9064, + "step": 4250 + }, + { + "epoch": 0.6940124892861516, + "grad_norm": 2.0504324436187744, + "learning_rate": 1.994683227434077e-05, + "loss": 0.8105, + "step": 4251 + }, + { + "epoch": 0.694175747928656, + "grad_norm": 1.935913324356079, + "learning_rate": 1.9946799589811146e-05, + "loss": 0.8818, + "step": 4252 + }, + { + "epoch": 0.6943390065711603, + "grad_norm": 1.9660416841506958, + "learning_rate": 1.99467668952651e-05, + "loss": 0.8194, + "step": 4253 + }, + { + "epoch": 0.6945022652136648, + "grad_norm": 1.6792112588882446, + "learning_rate": 1.9946734190702664e-05, + "loss": 0.7723, + "step": 4254 + }, + { + "epoch": 0.6946655238561691, + "grad_norm": 1.9246500730514526, + "learning_rate": 1.9946701476123867e-05, + "loss": 0.8484, + "step": 4255 + }, + { + "epoch": 0.6948287824986735, + "grad_norm": 2.087454080581665, + "learning_rate": 1.9946668751528745e-05, + "loss": 0.9466, + "step": 4256 + }, + { + "epoch": 0.694992041141178, + "grad_norm": 1.8705824613571167, + "learning_rate": 1.9946636016917326e-05, + "loss": 0.9323, + "step": 4257 + }, + { + "epoch": 0.6951552997836823, + "grad_norm": 1.7846181392669678, + "learning_rate": 1.9946603272289652e-05, + "loss": 0.8027, + "step": 4258 + }, + { + "epoch": 0.6953185584261867, + "grad_norm": 1.8184690475463867, + "learning_rate": 1.994657051764575e-05, + "loss": 0.8243, + "step": 4259 + }, + { + "epoch": 0.695481817068691, + "grad_norm": 1.768057107925415, + "learning_rate": 1.9946537752985653e-05, + "loss": 0.8454, + "step": 4260 + }, + { + "epoch": 0.6956450757111955, + "grad_norm": 1.7855664491653442, + "learning_rate": 1.9946504978309397e-05, + "loss": 0.8301, + "step": 4261 + }, + { + "epoch": 0.6958083343536998, + "grad_norm": 1.9005568027496338, + "learning_rate": 1.994647219361701e-05, + "loss": 0.984, + "step": 4262 + }, + { + "epoch": 0.6959715929962043, + "grad_norm": 1.8614243268966675, + "learning_rate": 1.9946439398908533e-05, + "loss": 0.7467, + "step": 4263 + }, + { + "epoch": 0.6961348516387086, + "grad_norm": 1.6746809482574463, + "learning_rate": 1.9946406594183993e-05, + "loss": 0.7641, + "step": 4264 + }, + { + "epoch": 0.696298110281213, + "grad_norm": 1.6465895175933838, + "learning_rate": 1.994637377944342e-05, + "loss": 0.6649, + "step": 4265 + }, + { + "epoch": 0.6964613689237173, + "grad_norm": 1.7396831512451172, + "learning_rate": 1.9946340954686852e-05, + "loss": 0.8663, + "step": 4266 + }, + { + "epoch": 0.6966246275662218, + "grad_norm": 1.9153978824615479, + "learning_rate": 1.9946308119914323e-05, + "loss": 0.7456, + "step": 4267 + }, + { + "epoch": 0.6967878862087262, + "grad_norm": 2.133854627609253, + "learning_rate": 1.9946275275125867e-05, + "loss": 0.7657, + "step": 4268 + }, + { + "epoch": 0.6969511448512306, + "grad_norm": 1.8593735694885254, + "learning_rate": 1.9946242420321513e-05, + "loss": 0.8253, + "step": 4269 + }, + { + "epoch": 0.697114403493735, + "grad_norm": 1.7236478328704834, + "learning_rate": 1.9946209555501293e-05, + "loss": 0.7209, + "step": 4270 + }, + { + "epoch": 0.6972776621362393, + "grad_norm": 2.262835741043091, + "learning_rate": 1.9946176680665244e-05, + "loss": 0.7842, + "step": 4271 + }, + { + "epoch": 0.6974409207787438, + "grad_norm": 2.00213623046875, + "learning_rate": 1.9946143795813396e-05, + "loss": 0.8801, + "step": 4272 + }, + { + "epoch": 0.6976041794212481, + "grad_norm": 1.6758482456207275, + "learning_rate": 1.9946110900945787e-05, + "loss": 0.7961, + "step": 4273 + }, + { + "epoch": 0.6977674380637525, + "grad_norm": 1.9252508878707886, + "learning_rate": 1.9946077996062447e-05, + "loss": 0.7542, + "step": 4274 + }, + { + "epoch": 0.6979306967062568, + "grad_norm": 2.0963165760040283, + "learning_rate": 1.9946045081163407e-05, + "loss": 0.9548, + "step": 4275 + }, + { + "epoch": 0.6980939553487613, + "grad_norm": 1.7537860870361328, + "learning_rate": 1.9946012156248703e-05, + "loss": 0.7042, + "step": 4276 + }, + { + "epoch": 0.6982572139912656, + "grad_norm": 2.062753200531006, + "learning_rate": 1.9945979221318367e-05, + "loss": 0.8515, + "step": 4277 + }, + { + "epoch": 0.69842047263377, + "grad_norm": 1.8563761711120605, + "learning_rate": 1.9945946276372435e-05, + "loss": 0.768, + "step": 4278 + }, + { + "epoch": 0.6985837312762745, + "grad_norm": 2.1685030460357666, + "learning_rate": 1.9945913321410935e-05, + "loss": 0.9328, + "step": 4279 + }, + { + "epoch": 0.6987469899187788, + "grad_norm": 1.908599853515625, + "learning_rate": 1.9945880356433904e-05, + "loss": 0.8296, + "step": 4280 + }, + { + "epoch": 0.6989102485612833, + "grad_norm": 2.0254435539245605, + "learning_rate": 1.9945847381441372e-05, + "loss": 0.8294, + "step": 4281 + }, + { + "epoch": 0.6990735072037876, + "grad_norm": 1.7319930791854858, + "learning_rate": 1.9945814396433377e-05, + "loss": 0.7415, + "step": 4282 + }, + { + "epoch": 0.699236765846292, + "grad_norm": 2.027949571609497, + "learning_rate": 1.9945781401409946e-05, + "loss": 0.8439, + "step": 4283 + }, + { + "epoch": 0.6994000244887963, + "grad_norm": 2.514307975769043, + "learning_rate": 1.994574839637112e-05, + "loss": 0.8266, + "step": 4284 + }, + { + "epoch": 0.6995632831313008, + "grad_norm": 2.7291760444641113, + "learning_rate": 1.994571538131693e-05, + "loss": 0.958, + "step": 4285 + }, + { + "epoch": 0.6997265417738051, + "grad_norm": 1.8221714496612549, + "learning_rate": 1.99456823562474e-05, + "loss": 0.8429, + "step": 4286 + }, + { + "epoch": 0.6998898004163095, + "grad_norm": 1.8138916492462158, + "learning_rate": 1.9945649321162576e-05, + "loss": 0.7084, + "step": 4287 + }, + { + "epoch": 0.7000530590588139, + "grad_norm": 2.039774179458618, + "learning_rate": 1.9945616276062482e-05, + "loss": 0.8453, + "step": 4288 + }, + { + "epoch": 0.7002163177013183, + "grad_norm": 1.8851399421691895, + "learning_rate": 1.9945583220947156e-05, + "loss": 0.8044, + "step": 4289 + }, + { + "epoch": 0.7003795763438228, + "grad_norm": 1.7872333526611328, + "learning_rate": 1.9945550155816634e-05, + "loss": 0.7419, + "step": 4290 + }, + { + "epoch": 0.7005428349863271, + "grad_norm": 2.0840182304382324, + "learning_rate": 1.994551708067094e-05, + "loss": 0.8663, + "step": 4291 + }, + { + "epoch": 0.7007060936288315, + "grad_norm": 2.122913360595703, + "learning_rate": 1.994548399551012e-05, + "loss": 0.963, + "step": 4292 + }, + { + "epoch": 0.7008693522713358, + "grad_norm": 1.951949119567871, + "learning_rate": 1.9945450900334197e-05, + "loss": 0.8804, + "step": 4293 + }, + { + "epoch": 0.7010326109138403, + "grad_norm": 1.8915026187896729, + "learning_rate": 1.99454177951432e-05, + "loss": 0.7281, + "step": 4294 + }, + { + "epoch": 0.7011958695563446, + "grad_norm": 1.8705130815505981, + "learning_rate": 1.9945384679937182e-05, + "loss": 0.7097, + "step": 4295 + }, + { + "epoch": 0.701359128198849, + "grad_norm": 2.0616843700408936, + "learning_rate": 1.9945351554716158e-05, + "loss": 0.8083, + "step": 4296 + }, + { + "epoch": 0.7015223868413534, + "grad_norm": 2.316213369369507, + "learning_rate": 1.9945318419480168e-05, + "loss": 1.0206, + "step": 4297 + }, + { + "epoch": 0.7016856454838578, + "grad_norm": 1.6560720205307007, + "learning_rate": 1.9945285274229244e-05, + "loss": 0.7628, + "step": 4298 + }, + { + "epoch": 0.7018489041263621, + "grad_norm": 1.5189416408538818, + "learning_rate": 1.994525211896342e-05, + "loss": 0.6112, + "step": 4299 + }, + { + "epoch": 0.7020121627688666, + "grad_norm": 1.9120960235595703, + "learning_rate": 1.9945218953682736e-05, + "loss": 0.7637, + "step": 4300 + }, + { + "epoch": 0.702175421411371, + "grad_norm": 1.6891099214553833, + "learning_rate": 1.9945185778387214e-05, + "loss": 0.6498, + "step": 4301 + }, + { + "epoch": 0.7023386800538753, + "grad_norm": 2.311800956726074, + "learning_rate": 1.9945152593076893e-05, + "loss": 0.9035, + "step": 4302 + }, + { + "epoch": 0.7025019386963798, + "grad_norm": 2.07788348197937, + "learning_rate": 1.9945119397751807e-05, + "loss": 1.0844, + "step": 4303 + }, + { + "epoch": 0.7026651973388841, + "grad_norm": 2.0255422592163086, + "learning_rate": 1.9945086192411986e-05, + "loss": 0.7657, + "step": 4304 + }, + { + "epoch": 0.7028284559813885, + "grad_norm": 1.9278734922409058, + "learning_rate": 1.994505297705747e-05, + "loss": 0.9272, + "step": 4305 + }, + { + "epoch": 0.7029917146238929, + "grad_norm": 2.594517469406128, + "learning_rate": 1.9945019751688284e-05, + "loss": 0.9448, + "step": 4306 + }, + { + "epoch": 0.7031549732663973, + "grad_norm": 1.808044195175171, + "learning_rate": 1.994498651630447e-05, + "loss": 0.848, + "step": 4307 + }, + { + "epoch": 0.7033182319089016, + "grad_norm": 1.9005550146102905, + "learning_rate": 1.9944953270906054e-05, + "loss": 0.7092, + "step": 4308 + }, + { + "epoch": 0.7034814905514061, + "grad_norm": 2.1721482276916504, + "learning_rate": 1.9944920015493074e-05, + "loss": 0.8934, + "step": 4309 + }, + { + "epoch": 0.7036447491939104, + "grad_norm": 1.7311025857925415, + "learning_rate": 1.994488675006556e-05, + "loss": 0.8381, + "step": 4310 + }, + { + "epoch": 0.7038080078364148, + "grad_norm": 1.9951573610305786, + "learning_rate": 1.994485347462355e-05, + "loss": 0.8851, + "step": 4311 + }, + { + "epoch": 0.7039712664789193, + "grad_norm": 2.0537636280059814, + "learning_rate": 1.9944820189167076e-05, + "loss": 0.9549, + "step": 4312 + }, + { + "epoch": 0.7041345251214236, + "grad_norm": 2.0764970779418945, + "learning_rate": 1.9944786893696166e-05, + "loss": 0.975, + "step": 4313 + }, + { + "epoch": 0.704297783763928, + "grad_norm": 1.805821180343628, + "learning_rate": 1.9944753588210864e-05, + "loss": 0.8177, + "step": 4314 + }, + { + "epoch": 0.7044610424064324, + "grad_norm": 1.931982398033142, + "learning_rate": 1.9944720272711192e-05, + "loss": 0.9035, + "step": 4315 + }, + { + "epoch": 0.7046243010489368, + "grad_norm": 2.203350305557251, + "learning_rate": 1.9944686947197196e-05, + "loss": 1.0634, + "step": 4316 + }, + { + "epoch": 0.7047875596914411, + "grad_norm": 2.152621269226074, + "learning_rate": 1.99446536116689e-05, + "loss": 0.9628, + "step": 4317 + }, + { + "epoch": 0.7049508183339456, + "grad_norm": 2.0857505798339844, + "learning_rate": 1.9944620266126338e-05, + "loss": 0.7802, + "step": 4318 + }, + { + "epoch": 0.7051140769764499, + "grad_norm": 1.7998846769332886, + "learning_rate": 1.9944586910569546e-05, + "loss": 0.7594, + "step": 4319 + }, + { + "epoch": 0.7052773356189543, + "grad_norm": 1.6078064441680908, + "learning_rate": 1.9944553544998563e-05, + "loss": 0.6447, + "step": 4320 + }, + { + "epoch": 0.7054405942614587, + "grad_norm": 1.8381255865097046, + "learning_rate": 1.9944520169413413e-05, + "loss": 0.6851, + "step": 4321 + }, + { + "epoch": 0.7056038529039631, + "grad_norm": 1.9162471294403076, + "learning_rate": 1.9944486783814135e-05, + "loss": 0.8674, + "step": 4322 + }, + { + "epoch": 0.7057671115464675, + "grad_norm": 1.7429533004760742, + "learning_rate": 1.994445338820076e-05, + "loss": 0.8149, + "step": 4323 + }, + { + "epoch": 0.7059303701889719, + "grad_norm": 2.092479944229126, + "learning_rate": 1.9944419982573323e-05, + "loss": 0.7385, + "step": 4324 + }, + { + "epoch": 0.7060936288314763, + "grad_norm": 1.873106837272644, + "learning_rate": 1.994438656693186e-05, + "loss": 0.8601, + "step": 4325 + }, + { + "epoch": 0.7062568874739806, + "grad_norm": 1.8368573188781738, + "learning_rate": 1.99443531412764e-05, + "loss": 0.7698, + "step": 4326 + }, + { + "epoch": 0.7064201461164851, + "grad_norm": 1.8464480638504028, + "learning_rate": 1.9944319705606983e-05, + "loss": 0.8331, + "step": 4327 + }, + { + "epoch": 0.7065834047589894, + "grad_norm": 2.2949576377868652, + "learning_rate": 1.9944286259923637e-05, + "loss": 0.9281, + "step": 4328 + }, + { + "epoch": 0.7067466634014938, + "grad_norm": 2.095341205596924, + "learning_rate": 1.9944252804226393e-05, + "loss": 0.878, + "step": 4329 + }, + { + "epoch": 0.7069099220439982, + "grad_norm": 2.058976888656616, + "learning_rate": 1.994421933851529e-05, + "loss": 0.9978, + "step": 4330 + }, + { + "epoch": 0.7070731806865026, + "grad_norm": 2.2933664321899414, + "learning_rate": 1.9944185862790366e-05, + "loss": 0.9138, + "step": 4331 + }, + { + "epoch": 0.7072364393290069, + "grad_norm": 2.0721192359924316, + "learning_rate": 1.9944152377051648e-05, + "loss": 0.9309, + "step": 4332 + }, + { + "epoch": 0.7073996979715114, + "grad_norm": 1.724638819694519, + "learning_rate": 1.9944118881299167e-05, + "loss": 0.7519, + "step": 4333 + }, + { + "epoch": 0.7075629566140158, + "grad_norm": 1.9347200393676758, + "learning_rate": 1.9944085375532965e-05, + "loss": 0.8812, + "step": 4334 + }, + { + "epoch": 0.7077262152565201, + "grad_norm": 2.1574108600616455, + "learning_rate": 1.9944051859753072e-05, + "loss": 0.7688, + "step": 4335 + }, + { + "epoch": 0.7078894738990246, + "grad_norm": 1.8231219053268433, + "learning_rate": 1.9944018333959518e-05, + "loss": 0.6678, + "step": 4336 + }, + { + "epoch": 0.7080527325415289, + "grad_norm": 2.307121515274048, + "learning_rate": 1.9943984798152343e-05, + "loss": 0.9512, + "step": 4337 + }, + { + "epoch": 0.7082159911840333, + "grad_norm": 1.6214911937713623, + "learning_rate": 1.9943951252331576e-05, + "loss": 0.6637, + "step": 4338 + }, + { + "epoch": 0.7083792498265377, + "grad_norm": 1.7350754737854004, + "learning_rate": 1.9943917696497256e-05, + "loss": 0.8309, + "step": 4339 + }, + { + "epoch": 0.7085425084690421, + "grad_norm": 2.1449248790740967, + "learning_rate": 1.9943884130649408e-05, + "loss": 0.9577, + "step": 4340 + }, + { + "epoch": 0.7087057671115464, + "grad_norm": 1.7031553983688354, + "learning_rate": 1.9943850554788075e-05, + "loss": 0.7667, + "step": 4341 + }, + { + "epoch": 0.7088690257540509, + "grad_norm": 2.3525102138519287, + "learning_rate": 1.9943816968913288e-05, + "loss": 0.8443, + "step": 4342 + }, + { + "epoch": 0.7090322843965552, + "grad_norm": 2.133125066757202, + "learning_rate": 1.9943783373025077e-05, + "loss": 0.9871, + "step": 4343 + }, + { + "epoch": 0.7091955430390596, + "grad_norm": 2.244927406311035, + "learning_rate": 1.994374976712348e-05, + "loss": 0.9959, + "step": 4344 + }, + { + "epoch": 0.7093588016815641, + "grad_norm": 1.6296930313110352, + "learning_rate": 1.994371615120853e-05, + "loss": 0.6568, + "step": 4345 + }, + { + "epoch": 0.7095220603240684, + "grad_norm": 1.7802700996398926, + "learning_rate": 1.9943682525280263e-05, + "loss": 0.7199, + "step": 4346 + }, + { + "epoch": 0.7096853189665728, + "grad_norm": 2.2932846546173096, + "learning_rate": 1.9943648889338707e-05, + "loss": 0.8624, + "step": 4347 + }, + { + "epoch": 0.7098485776090772, + "grad_norm": 2.2595794200897217, + "learning_rate": 1.9943615243383897e-05, + "loss": 0.9352, + "step": 4348 + }, + { + "epoch": 0.7100118362515816, + "grad_norm": 2.022770881652832, + "learning_rate": 1.9943581587415873e-05, + "loss": 0.8571, + "step": 4349 + }, + { + "epoch": 0.7101750948940859, + "grad_norm": 2.03802752494812, + "learning_rate": 1.9943547921434666e-05, + "loss": 0.7644, + "step": 4350 + }, + { + "epoch": 0.7103383535365904, + "grad_norm": 1.731443166732788, + "learning_rate": 1.9943514245440307e-05, + "loss": 0.6777, + "step": 4351 + }, + { + "epoch": 0.7105016121790947, + "grad_norm": 1.8085764646530151, + "learning_rate": 1.9943480559432832e-05, + "loss": 0.7689, + "step": 4352 + }, + { + "epoch": 0.7106648708215991, + "grad_norm": 1.8456294536590576, + "learning_rate": 1.9943446863412276e-05, + "loss": 0.7884, + "step": 4353 + }, + { + "epoch": 0.7108281294641035, + "grad_norm": 1.7384699583053589, + "learning_rate": 1.994341315737867e-05, + "loss": 0.6381, + "step": 4354 + }, + { + "epoch": 0.7109913881066079, + "grad_norm": 1.8413194417953491, + "learning_rate": 1.994337944133205e-05, + "loss": 0.6429, + "step": 4355 + }, + { + "epoch": 0.7111546467491123, + "grad_norm": 2.1186680793762207, + "learning_rate": 1.9943345715272445e-05, + "loss": 0.8943, + "step": 4356 + }, + { + "epoch": 0.7113179053916167, + "grad_norm": 1.9528472423553467, + "learning_rate": 1.99433119791999e-05, + "loss": 0.8599, + "step": 4357 + }, + { + "epoch": 0.7114811640341211, + "grad_norm": 2.0833191871643066, + "learning_rate": 1.994327823311444e-05, + "loss": 0.8997, + "step": 4358 + }, + { + "epoch": 0.7116444226766254, + "grad_norm": 2.3430826663970947, + "learning_rate": 1.99432444770161e-05, + "loss": 0.8764, + "step": 4359 + }, + { + "epoch": 0.7118076813191299, + "grad_norm": 2.094642400741577, + "learning_rate": 1.994321071090492e-05, + "loss": 0.8559, + "step": 4360 + }, + { + "epoch": 0.7119709399616342, + "grad_norm": 2.1967456340789795, + "learning_rate": 1.9943176934780926e-05, + "loss": 0.8555, + "step": 4361 + }, + { + "epoch": 0.7121341986041386, + "grad_norm": 1.8970474004745483, + "learning_rate": 1.9943143148644155e-05, + "loss": 0.9391, + "step": 4362 + }, + { + "epoch": 0.712297457246643, + "grad_norm": 1.9624043703079224, + "learning_rate": 1.994310935249464e-05, + "loss": 0.8835, + "step": 4363 + }, + { + "epoch": 0.7124607158891474, + "grad_norm": 2.1123692989349365, + "learning_rate": 1.9943075546332423e-05, + "loss": 0.8153, + "step": 4364 + }, + { + "epoch": 0.7126239745316517, + "grad_norm": 1.8385090827941895, + "learning_rate": 1.9943041730157526e-05, + "loss": 0.8518, + "step": 4365 + }, + { + "epoch": 0.7127872331741562, + "grad_norm": 1.819939136505127, + "learning_rate": 1.994300790396999e-05, + "loss": 0.7119, + "step": 4366 + }, + { + "epoch": 0.7129504918166606, + "grad_norm": 2.3111627101898193, + "learning_rate": 1.9942974067769847e-05, + "loss": 0.7905, + "step": 4367 + }, + { + "epoch": 0.7131137504591649, + "grad_norm": 2.173330783843994, + "learning_rate": 1.9942940221557135e-05, + "loss": 0.9446, + "step": 4368 + }, + { + "epoch": 0.7132770091016694, + "grad_norm": 1.9439023733139038, + "learning_rate": 1.994290636533188e-05, + "loss": 0.8962, + "step": 4369 + }, + { + "epoch": 0.7134402677441737, + "grad_norm": 2.0761358737945557, + "learning_rate": 1.9942872499094125e-05, + "loss": 0.9764, + "step": 4370 + }, + { + "epoch": 0.7136035263866781, + "grad_norm": 2.273097515106201, + "learning_rate": 1.9942838622843898e-05, + "loss": 1.0192, + "step": 4371 + }, + { + "epoch": 0.7137667850291824, + "grad_norm": 1.8375968933105469, + "learning_rate": 1.9942804736581236e-05, + "loss": 0.8392, + "step": 4372 + }, + { + "epoch": 0.7139300436716869, + "grad_norm": 1.7532120943069458, + "learning_rate": 1.994277084030617e-05, + "loss": 0.9312, + "step": 4373 + }, + { + "epoch": 0.7140933023141912, + "grad_norm": 1.9342118501663208, + "learning_rate": 1.994273693401874e-05, + "loss": 0.9675, + "step": 4374 + }, + { + "epoch": 0.7142565609566957, + "grad_norm": 1.7692539691925049, + "learning_rate": 1.9942703017718977e-05, + "loss": 0.7715, + "step": 4375 + }, + { + "epoch": 0.7144198195992, + "grad_norm": 1.4650812149047852, + "learning_rate": 1.994266909140691e-05, + "loss": 0.6356, + "step": 4376 + }, + { + "epoch": 0.7145830782417044, + "grad_norm": 1.6834259033203125, + "learning_rate": 1.994263515508258e-05, + "loss": 0.7278, + "step": 4377 + }, + { + "epoch": 0.7147463368842089, + "grad_norm": 1.982469081878662, + "learning_rate": 1.994260120874602e-05, + "loss": 0.9722, + "step": 4378 + }, + { + "epoch": 0.7149095955267132, + "grad_norm": 2.1274068355560303, + "learning_rate": 1.9942567252397262e-05, + "loss": 0.8126, + "step": 4379 + }, + { + "epoch": 0.7150728541692176, + "grad_norm": 2.203218698501587, + "learning_rate": 1.9942533286036343e-05, + "loss": 0.9513, + "step": 4380 + }, + { + "epoch": 0.715236112811722, + "grad_norm": 2.0004398822784424, + "learning_rate": 1.9942499309663294e-05, + "loss": 0.6825, + "step": 4381 + }, + { + "epoch": 0.7153993714542264, + "grad_norm": 1.9553112983703613, + "learning_rate": 1.9942465323278153e-05, + "loss": 0.84, + "step": 4382 + }, + { + "epoch": 0.7155626300967307, + "grad_norm": 1.8179467916488647, + "learning_rate": 1.994243132688095e-05, + "loss": 0.782, + "step": 4383 + }, + { + "epoch": 0.7157258887392351, + "grad_norm": 2.235081911087036, + "learning_rate": 1.9942397320471723e-05, + "loss": 0.7786, + "step": 4384 + }, + { + "epoch": 0.7158891473817395, + "grad_norm": 1.925100326538086, + "learning_rate": 1.99423633040505e-05, + "loss": 0.8828, + "step": 4385 + }, + { + "epoch": 0.7160524060242439, + "grad_norm": 2.014695405960083, + "learning_rate": 1.9942329277617325e-05, + "loss": 0.8665, + "step": 4386 + }, + { + "epoch": 0.7162156646667482, + "grad_norm": 1.8004367351531982, + "learning_rate": 1.9942295241172225e-05, + "loss": 0.6948, + "step": 4387 + }, + { + "epoch": 0.7163789233092527, + "grad_norm": 1.9331480264663696, + "learning_rate": 1.9942261194715236e-05, + "loss": 0.7294, + "step": 4388 + }, + { + "epoch": 0.7165421819517571, + "grad_norm": 1.7150667905807495, + "learning_rate": 1.9942227138246393e-05, + "loss": 0.6843, + "step": 4389 + }, + { + "epoch": 0.7167054405942614, + "grad_norm": 1.5684446096420288, + "learning_rate": 1.9942193071765728e-05, + "loss": 0.6938, + "step": 4390 + }, + { + "epoch": 0.7168686992367659, + "grad_norm": 1.931045651435852, + "learning_rate": 1.9942158995273283e-05, + "loss": 0.8214, + "step": 4391 + }, + { + "epoch": 0.7170319578792702, + "grad_norm": 2.2262299060821533, + "learning_rate": 1.994212490876908e-05, + "loss": 0.8845, + "step": 4392 + }, + { + "epoch": 0.7171952165217746, + "grad_norm": 1.936834692955017, + "learning_rate": 1.9942090812253164e-05, + "loss": 0.7213, + "step": 4393 + }, + { + "epoch": 0.717358475164279, + "grad_norm": 2.140897750854492, + "learning_rate": 1.9942056705725564e-05, + "loss": 0.9362, + "step": 4394 + }, + { + "epoch": 0.7175217338067834, + "grad_norm": 1.9691821336746216, + "learning_rate": 1.9942022589186316e-05, + "loss": 0.8167, + "step": 4395 + }, + { + "epoch": 0.7176849924492877, + "grad_norm": 1.8906147480010986, + "learning_rate": 1.994198846263545e-05, + "loss": 0.9226, + "step": 4396 + }, + { + "epoch": 0.7178482510917922, + "grad_norm": 2.099067449569702, + "learning_rate": 1.994195432607301e-05, + "loss": 0.9348, + "step": 4397 + }, + { + "epoch": 0.7180115097342965, + "grad_norm": 1.9238884449005127, + "learning_rate": 1.9941920179499022e-05, + "loss": 0.7465, + "step": 4398 + }, + { + "epoch": 0.7181747683768009, + "grad_norm": 1.9999345541000366, + "learning_rate": 1.9941886022913523e-05, + "loss": 0.8098, + "step": 4399 + }, + { + "epoch": 0.7183380270193054, + "grad_norm": 2.05483341217041, + "learning_rate": 1.994185185631655e-05, + "loss": 0.7072, + "step": 4400 + }, + { + "epoch": 0.7185012856618097, + "grad_norm": 2.3453845977783203, + "learning_rate": 1.994181767970813e-05, + "loss": 0.8156, + "step": 4401 + }, + { + "epoch": 0.7186645443043141, + "grad_norm": 2.759714126586914, + "learning_rate": 1.9941783493088304e-05, + "loss": 0.923, + "step": 4402 + }, + { + "epoch": 0.7188278029468185, + "grad_norm": 2.065807819366455, + "learning_rate": 1.9941749296457103e-05, + "loss": 0.8615, + "step": 4403 + }, + { + "epoch": 0.7189910615893229, + "grad_norm": 1.7405942678451538, + "learning_rate": 1.994171508981457e-05, + "loss": 0.6686, + "step": 4404 + }, + { + "epoch": 0.7191543202318272, + "grad_norm": 2.803520441055298, + "learning_rate": 1.9941680873160727e-05, + "loss": 0.8708, + "step": 4405 + }, + { + "epoch": 0.7193175788743317, + "grad_norm": 1.911081314086914, + "learning_rate": 1.9941646646495615e-05, + "loss": 0.8535, + "step": 4406 + }, + { + "epoch": 0.719480837516836, + "grad_norm": 1.9501444101333618, + "learning_rate": 1.9941612409819265e-05, + "loss": 0.7194, + "step": 4407 + }, + { + "epoch": 0.7196440961593404, + "grad_norm": 1.9938591718673706, + "learning_rate": 1.9941578163131717e-05, + "loss": 0.8317, + "step": 4408 + }, + { + "epoch": 0.7198073548018448, + "grad_norm": 1.904526948928833, + "learning_rate": 1.9941543906433003e-05, + "loss": 0.7517, + "step": 4409 + }, + { + "epoch": 0.7199706134443492, + "grad_norm": 2.1266376972198486, + "learning_rate": 1.9941509639723155e-05, + "loss": 0.7963, + "step": 4410 + }, + { + "epoch": 0.7201338720868536, + "grad_norm": 1.8949412107467651, + "learning_rate": 1.994147536300221e-05, + "loss": 0.8006, + "step": 4411 + }, + { + "epoch": 0.720297130729358, + "grad_norm": 1.9772933721542358, + "learning_rate": 1.99414410762702e-05, + "loss": 1.0022, + "step": 4412 + }, + { + "epoch": 0.7204603893718624, + "grad_norm": 1.732645034790039, + "learning_rate": 1.9941406779527167e-05, + "loss": 0.7157, + "step": 4413 + }, + { + "epoch": 0.7206236480143667, + "grad_norm": 1.798436164855957, + "learning_rate": 1.994137247277314e-05, + "loss": 0.6785, + "step": 4414 + }, + { + "epoch": 0.7207869066568712, + "grad_norm": 1.6932530403137207, + "learning_rate": 1.9941338156008147e-05, + "loss": 0.786, + "step": 4415 + }, + { + "epoch": 0.7209501652993755, + "grad_norm": 1.8624871969223022, + "learning_rate": 1.994130382923223e-05, + "loss": 0.7434, + "step": 4416 + }, + { + "epoch": 0.7211134239418799, + "grad_norm": 1.823852777481079, + "learning_rate": 1.9941269492445427e-05, + "loss": 0.8213, + "step": 4417 + }, + { + "epoch": 0.7212766825843843, + "grad_norm": 1.9162726402282715, + "learning_rate": 1.9941235145647763e-05, + "loss": 0.824, + "step": 4418 + }, + { + "epoch": 0.7214399412268887, + "grad_norm": 1.8077019453048706, + "learning_rate": 1.9941200788839285e-05, + "loss": 0.9382, + "step": 4419 + }, + { + "epoch": 0.721603199869393, + "grad_norm": 1.7717796564102173, + "learning_rate": 1.9941166422020016e-05, + "loss": 0.6571, + "step": 4420 + }, + { + "epoch": 0.7217664585118975, + "grad_norm": 2.067509889602661, + "learning_rate": 1.9941132045189993e-05, + "loss": 0.8508, + "step": 4421 + }, + { + "epoch": 0.7219297171544019, + "grad_norm": 2.26788592338562, + "learning_rate": 1.9941097658349256e-05, + "loss": 0.8283, + "step": 4422 + }, + { + "epoch": 0.7220929757969062, + "grad_norm": 1.917021632194519, + "learning_rate": 1.9941063261497838e-05, + "loss": 0.7346, + "step": 4423 + }, + { + "epoch": 0.7222562344394107, + "grad_norm": 1.7852272987365723, + "learning_rate": 1.9941028854635767e-05, + "loss": 0.7495, + "step": 4424 + }, + { + "epoch": 0.722419493081915, + "grad_norm": 1.7005994319915771, + "learning_rate": 1.9940994437763085e-05, + "loss": 0.6463, + "step": 4425 + }, + { + "epoch": 0.7225827517244194, + "grad_norm": 2.0141804218292236, + "learning_rate": 1.9940960010879825e-05, + "loss": 0.7524, + "step": 4426 + }, + { + "epoch": 0.7227460103669238, + "grad_norm": 2.0254030227661133, + "learning_rate": 1.9940925573986018e-05, + "loss": 0.8772, + "step": 4427 + }, + { + "epoch": 0.7229092690094282, + "grad_norm": 1.6054635047912598, + "learning_rate": 1.9940891127081704e-05, + "loss": 0.6519, + "step": 4428 + }, + { + "epoch": 0.7230725276519325, + "grad_norm": 2.340158700942993, + "learning_rate": 1.9940856670166915e-05, + "loss": 0.9471, + "step": 4429 + }, + { + "epoch": 0.723235786294437, + "grad_norm": 1.890354037284851, + "learning_rate": 1.9940822203241684e-05, + "loss": 0.7177, + "step": 4430 + }, + { + "epoch": 0.7233990449369414, + "grad_norm": 2.3571619987487793, + "learning_rate": 1.994078772630605e-05, + "loss": 0.6998, + "step": 4431 + }, + { + "epoch": 0.7235623035794457, + "grad_norm": 2.3782622814178467, + "learning_rate": 1.9940753239360047e-05, + "loss": 0.8007, + "step": 4432 + }, + { + "epoch": 0.7237255622219502, + "grad_norm": 2.500842571258545, + "learning_rate": 1.9940718742403707e-05, + "loss": 0.7501, + "step": 4433 + }, + { + "epoch": 0.7238888208644545, + "grad_norm": 1.7871969938278198, + "learning_rate": 1.9940684235437063e-05, + "loss": 0.6593, + "step": 4434 + }, + { + "epoch": 0.7240520795069589, + "grad_norm": 1.8737636804580688, + "learning_rate": 1.9940649718460157e-05, + "loss": 0.7737, + "step": 4435 + }, + { + "epoch": 0.7242153381494633, + "grad_norm": 2.200291395187378, + "learning_rate": 1.9940615191473015e-05, + "loss": 0.8039, + "step": 4436 + }, + { + "epoch": 0.7243785967919677, + "grad_norm": 1.956521987915039, + "learning_rate": 1.994058065447568e-05, + "loss": 0.7505, + "step": 4437 + }, + { + "epoch": 0.724541855434472, + "grad_norm": 1.8933959007263184, + "learning_rate": 1.994054610746818e-05, + "loss": 0.7759, + "step": 4438 + }, + { + "epoch": 0.7247051140769765, + "grad_norm": 2.11024808883667, + "learning_rate": 1.9940511550450554e-05, + "loss": 0.8867, + "step": 4439 + }, + { + "epoch": 0.7248683727194808, + "grad_norm": 1.61370050907135, + "learning_rate": 1.9940476983422834e-05, + "loss": 0.6572, + "step": 4440 + }, + { + "epoch": 0.7250316313619852, + "grad_norm": 1.6555801630020142, + "learning_rate": 1.9940442406385057e-05, + "loss": 0.8441, + "step": 4441 + }, + { + "epoch": 0.7251948900044897, + "grad_norm": 2.0151753425598145, + "learning_rate": 1.994040781933726e-05, + "loss": 0.9194, + "step": 4442 + }, + { + "epoch": 0.725358148646994, + "grad_norm": 2.0465376377105713, + "learning_rate": 1.9940373222279473e-05, + "loss": 0.8144, + "step": 4443 + }, + { + "epoch": 0.7255214072894984, + "grad_norm": 2.4903249740600586, + "learning_rate": 1.9940338615211732e-05, + "loss": 0.8174, + "step": 4444 + }, + { + "epoch": 0.7256846659320028, + "grad_norm": 1.8807685375213623, + "learning_rate": 1.9940303998134075e-05, + "loss": 0.8298, + "step": 4445 + }, + { + "epoch": 0.7258479245745072, + "grad_norm": 2.66568922996521, + "learning_rate": 1.994026937104653e-05, + "loss": 0.9182, + "step": 4446 + }, + { + "epoch": 0.7260111832170115, + "grad_norm": 1.9964847564697266, + "learning_rate": 1.994023473394914e-05, + "loss": 0.8194, + "step": 4447 + }, + { + "epoch": 0.726174441859516, + "grad_norm": 2.089733123779297, + "learning_rate": 1.9940200086841934e-05, + "loss": 0.79, + "step": 4448 + }, + { + "epoch": 0.7263377005020203, + "grad_norm": 1.882120966911316, + "learning_rate": 1.9940165429724948e-05, + "loss": 0.7946, + "step": 4449 + }, + { + "epoch": 0.7265009591445247, + "grad_norm": 1.7768546342849731, + "learning_rate": 1.9940130762598224e-05, + "loss": 0.8142, + "step": 4450 + }, + { + "epoch": 0.726664217787029, + "grad_norm": 2.1513869762420654, + "learning_rate": 1.9940096085461787e-05, + "loss": 1.0164, + "step": 4451 + }, + { + "epoch": 0.7268274764295335, + "grad_norm": 1.8286242485046387, + "learning_rate": 1.9940061398315674e-05, + "loss": 0.7776, + "step": 4452 + }, + { + "epoch": 0.7269907350720379, + "grad_norm": 1.778072714805603, + "learning_rate": 1.9940026701159928e-05, + "loss": 0.7149, + "step": 4453 + }, + { + "epoch": 0.7271539937145423, + "grad_norm": 2.6977362632751465, + "learning_rate": 1.993999199399457e-05, + "loss": 0.8836, + "step": 4454 + }, + { + "epoch": 0.7273172523570467, + "grad_norm": 1.6712161302566528, + "learning_rate": 1.993995727681965e-05, + "loss": 0.7576, + "step": 4455 + }, + { + "epoch": 0.727480510999551, + "grad_norm": 2.0784361362457275, + "learning_rate": 1.9939922549635192e-05, + "loss": 0.8192, + "step": 4456 + }, + { + "epoch": 0.7276437696420555, + "grad_norm": 1.839468002319336, + "learning_rate": 1.9939887812441233e-05, + "loss": 0.8589, + "step": 4457 + }, + { + "epoch": 0.7278070282845598, + "grad_norm": 1.9465668201446533, + "learning_rate": 1.993985306523781e-05, + "loss": 0.8724, + "step": 4458 + }, + { + "epoch": 0.7279702869270642, + "grad_norm": 2.0373129844665527, + "learning_rate": 1.9939818308024962e-05, + "loss": 0.894, + "step": 4459 + }, + { + "epoch": 0.7281335455695686, + "grad_norm": 1.747261643409729, + "learning_rate": 1.9939783540802715e-05, + "loss": 0.7493, + "step": 4460 + }, + { + "epoch": 0.728296804212073, + "grad_norm": 1.9915575981140137, + "learning_rate": 1.993974876357111e-05, + "loss": 0.9033, + "step": 4461 + }, + { + "epoch": 0.7284600628545773, + "grad_norm": 2.0744011402130127, + "learning_rate": 1.9939713976330182e-05, + "loss": 0.7934, + "step": 4462 + }, + { + "epoch": 0.7286233214970818, + "grad_norm": 1.8379077911376953, + "learning_rate": 1.9939679179079964e-05, + "loss": 0.7784, + "step": 4463 + }, + { + "epoch": 0.7287865801395862, + "grad_norm": 2.0709240436553955, + "learning_rate": 1.993964437182049e-05, + "loss": 0.6351, + "step": 4464 + }, + { + "epoch": 0.7289498387820905, + "grad_norm": 1.9836870431900024, + "learning_rate": 1.99396095545518e-05, + "loss": 0.8069, + "step": 4465 + }, + { + "epoch": 0.729113097424595, + "grad_norm": 1.9226022958755493, + "learning_rate": 1.9939574727273924e-05, + "loss": 0.6556, + "step": 4466 + }, + { + "epoch": 0.7292763560670993, + "grad_norm": 1.9196112155914307, + "learning_rate": 1.9939539889986897e-05, + "loss": 0.7753, + "step": 4467 + }, + { + "epoch": 0.7294396147096037, + "grad_norm": 1.849435567855835, + "learning_rate": 1.993950504269076e-05, + "loss": 0.5915, + "step": 4468 + }, + { + "epoch": 0.729602873352108, + "grad_norm": 2.0433857440948486, + "learning_rate": 1.993947018538554e-05, + "loss": 0.9091, + "step": 4469 + }, + { + "epoch": 0.7297661319946125, + "grad_norm": 2.2304375171661377, + "learning_rate": 1.9939435318071277e-05, + "loss": 0.8822, + "step": 4470 + }, + { + "epoch": 0.7299293906371168, + "grad_norm": 2.138397216796875, + "learning_rate": 1.9939400440748008e-05, + "loss": 0.8578, + "step": 4471 + }, + { + "epoch": 0.7300926492796213, + "grad_norm": 1.9320099353790283, + "learning_rate": 1.993936555341576e-05, + "loss": 0.7436, + "step": 4472 + }, + { + "epoch": 0.7302559079221256, + "grad_norm": 2.055783271789551, + "learning_rate": 1.993933065607458e-05, + "loss": 0.9926, + "step": 4473 + }, + { + "epoch": 0.73041916656463, + "grad_norm": 2.0557861328125, + "learning_rate": 1.9939295748724494e-05, + "loss": 0.738, + "step": 4474 + }, + { + "epoch": 0.7305824252071345, + "grad_norm": 2.1829559803009033, + "learning_rate": 1.993926083136554e-05, + "loss": 0.9589, + "step": 4475 + }, + { + "epoch": 0.7307456838496388, + "grad_norm": 1.9490411281585693, + "learning_rate": 1.9939225903997748e-05, + "loss": 0.8352, + "step": 4476 + }, + { + "epoch": 0.7309089424921432, + "grad_norm": 1.7162531614303589, + "learning_rate": 1.993919096662117e-05, + "loss": 0.663, + "step": 4477 + }, + { + "epoch": 0.7310722011346475, + "grad_norm": 2.1170153617858887, + "learning_rate": 1.9939156019235817e-05, + "loss": 0.8473, + "step": 4478 + }, + { + "epoch": 0.731235459777152, + "grad_norm": 2.079963207244873, + "learning_rate": 1.993912106184174e-05, + "loss": 0.8273, + "step": 4479 + }, + { + "epoch": 0.7313987184196563, + "grad_norm": 1.9364622831344604, + "learning_rate": 1.9939086094438975e-05, + "loss": 0.8535, + "step": 4480 + }, + { + "epoch": 0.7315619770621608, + "grad_norm": 2.1018033027648926, + "learning_rate": 1.993905111702755e-05, + "loss": 0.8389, + "step": 4481 + }, + { + "epoch": 0.7317252357046651, + "grad_norm": 3.277167320251465, + "learning_rate": 1.9939016129607503e-05, + "loss": 0.9941, + "step": 4482 + }, + { + "epoch": 0.7318884943471695, + "grad_norm": 2.1185555458068848, + "learning_rate": 1.993898113217887e-05, + "loss": 0.934, + "step": 4483 + }, + { + "epoch": 0.7320517529896738, + "grad_norm": 2.1316545009613037, + "learning_rate": 1.9938946124741684e-05, + "loss": 0.893, + "step": 4484 + }, + { + "epoch": 0.7322150116321783, + "grad_norm": 1.7952659130096436, + "learning_rate": 1.9938911107295984e-05, + "loss": 0.7511, + "step": 4485 + }, + { + "epoch": 0.7323782702746827, + "grad_norm": 2.254964590072632, + "learning_rate": 1.9938876079841804e-05, + "loss": 0.8298, + "step": 4486 + }, + { + "epoch": 0.732541528917187, + "grad_norm": 1.7881091833114624, + "learning_rate": 1.9938841042379174e-05, + "loss": 0.9017, + "step": 4487 + }, + { + "epoch": 0.7327047875596915, + "grad_norm": 1.9769015312194824, + "learning_rate": 1.993880599490814e-05, + "loss": 0.8554, + "step": 4488 + }, + { + "epoch": 0.7328680462021958, + "grad_norm": 2.071852207183838, + "learning_rate": 1.9938770937428728e-05, + "loss": 0.8367, + "step": 4489 + }, + { + "epoch": 0.7330313048447002, + "grad_norm": 1.9506504535675049, + "learning_rate": 1.9938735869940972e-05, + "loss": 0.8202, + "step": 4490 + }, + { + "epoch": 0.7331945634872046, + "grad_norm": 1.6314549446105957, + "learning_rate": 1.993870079244492e-05, + "loss": 0.7745, + "step": 4491 + }, + { + "epoch": 0.733357822129709, + "grad_norm": 1.9646632671356201, + "learning_rate": 1.9938665704940592e-05, + "loss": 0.7403, + "step": 4492 + }, + { + "epoch": 0.7335210807722133, + "grad_norm": 1.8802987337112427, + "learning_rate": 1.9938630607428033e-05, + "loss": 0.7444, + "step": 4493 + }, + { + "epoch": 0.7336843394147178, + "grad_norm": 1.703447937965393, + "learning_rate": 1.9938595499907274e-05, + "loss": 0.8167, + "step": 4494 + }, + { + "epoch": 0.7338475980572221, + "grad_norm": 1.7272857427597046, + "learning_rate": 1.9938560382378353e-05, + "loss": 0.6824, + "step": 4495 + }, + { + "epoch": 0.7340108566997265, + "grad_norm": 2.1873269081115723, + "learning_rate": 1.9938525254841305e-05, + "loss": 0.8032, + "step": 4496 + }, + { + "epoch": 0.734174115342231, + "grad_norm": 1.997214674949646, + "learning_rate": 1.9938490117296165e-05, + "loss": 0.7912, + "step": 4497 + }, + { + "epoch": 0.7343373739847353, + "grad_norm": 2.073227882385254, + "learning_rate": 1.993845496974297e-05, + "loss": 0.8176, + "step": 4498 + }, + { + "epoch": 0.7345006326272397, + "grad_norm": 2.1228950023651123, + "learning_rate": 1.9938419812181747e-05, + "loss": 0.8408, + "step": 4499 + }, + { + "epoch": 0.7346638912697441, + "grad_norm": 1.8168696165084839, + "learning_rate": 1.9938384644612542e-05, + "loss": 0.817, + "step": 4500 + }, + { + "epoch": 0.7348271499122485, + "grad_norm": 1.7455905675888062, + "learning_rate": 1.9938349467035386e-05, + "loss": 0.8409, + "step": 4501 + }, + { + "epoch": 0.7349904085547528, + "grad_norm": 1.759324312210083, + "learning_rate": 1.9938314279450318e-05, + "loss": 0.6535, + "step": 4502 + }, + { + "epoch": 0.7351536671972573, + "grad_norm": 1.982790470123291, + "learning_rate": 1.9938279081857367e-05, + "loss": 0.8982, + "step": 4503 + }, + { + "epoch": 0.7353169258397616, + "grad_norm": 2.3180794715881348, + "learning_rate": 1.9938243874256572e-05, + "loss": 0.6466, + "step": 4504 + }, + { + "epoch": 0.735480184482266, + "grad_norm": 1.9459507465362549, + "learning_rate": 1.9938208656647964e-05, + "loss": 0.8078, + "step": 4505 + }, + { + "epoch": 0.7356434431247704, + "grad_norm": 1.7770904302597046, + "learning_rate": 1.9938173429031588e-05, + "loss": 0.8085, + "step": 4506 + }, + { + "epoch": 0.7358067017672748, + "grad_norm": 1.8279656171798706, + "learning_rate": 1.9938138191407473e-05, + "loss": 0.7013, + "step": 4507 + }, + { + "epoch": 0.7359699604097792, + "grad_norm": 2.4340391159057617, + "learning_rate": 1.9938102943775653e-05, + "loss": 0.8248, + "step": 4508 + }, + { + "epoch": 0.7361332190522836, + "grad_norm": 2.24891996383667, + "learning_rate": 1.9938067686136167e-05, + "loss": 0.8393, + "step": 4509 + }, + { + "epoch": 0.736296477694788, + "grad_norm": 1.69552743434906, + "learning_rate": 1.993803241848905e-05, + "loss": 0.7323, + "step": 4510 + }, + { + "epoch": 0.7364597363372923, + "grad_norm": 2.2317330837249756, + "learning_rate": 1.9937997140834338e-05, + "loss": 0.9625, + "step": 4511 + }, + { + "epoch": 0.7366229949797968, + "grad_norm": 2.0568790435791016, + "learning_rate": 1.9937961853172064e-05, + "loss": 0.9577, + "step": 4512 + }, + { + "epoch": 0.7367862536223011, + "grad_norm": 1.8632373809814453, + "learning_rate": 1.9937926555502268e-05, + "loss": 0.7166, + "step": 4513 + }, + { + "epoch": 0.7369495122648055, + "grad_norm": 2.076192617416382, + "learning_rate": 1.9937891247824977e-05, + "loss": 0.9299, + "step": 4514 + }, + { + "epoch": 0.7371127709073099, + "grad_norm": 1.8990850448608398, + "learning_rate": 1.9937855930140237e-05, + "loss": 0.8555, + "step": 4515 + }, + { + "epoch": 0.7372760295498143, + "grad_norm": 1.7650129795074463, + "learning_rate": 1.9937820602448076e-05, + "loss": 0.6615, + "step": 4516 + }, + { + "epoch": 0.7374392881923186, + "grad_norm": 1.987490177154541, + "learning_rate": 1.9937785264748536e-05, + "loss": 0.9224, + "step": 4517 + }, + { + "epoch": 0.7376025468348231, + "grad_norm": 1.9159023761749268, + "learning_rate": 1.9937749917041645e-05, + "loss": 0.7918, + "step": 4518 + }, + { + "epoch": 0.7377658054773275, + "grad_norm": 1.8243857622146606, + "learning_rate": 1.9937714559327445e-05, + "loss": 0.8178, + "step": 4519 + }, + { + "epoch": 0.7379290641198318, + "grad_norm": 1.8879094123840332, + "learning_rate": 1.9937679191605964e-05, + "loss": 0.879, + "step": 4520 + }, + { + "epoch": 0.7380923227623363, + "grad_norm": 1.7971030473709106, + "learning_rate": 1.9937643813877247e-05, + "loss": 0.8109, + "step": 4521 + }, + { + "epoch": 0.7382555814048406, + "grad_norm": 2.050477981567383, + "learning_rate": 1.9937608426141326e-05, + "loss": 0.8226, + "step": 4522 + }, + { + "epoch": 0.738418840047345, + "grad_norm": 1.600393533706665, + "learning_rate": 1.9937573028398233e-05, + "loss": 0.664, + "step": 4523 + }, + { + "epoch": 0.7385820986898494, + "grad_norm": 2.0359764099121094, + "learning_rate": 1.993753762064801e-05, + "loss": 0.8488, + "step": 4524 + }, + { + "epoch": 0.7387453573323538, + "grad_norm": 1.86660897731781, + "learning_rate": 1.993750220289069e-05, + "loss": 0.8534, + "step": 4525 + }, + { + "epoch": 0.7389086159748581, + "grad_norm": 2.136317491531372, + "learning_rate": 1.9937466775126305e-05, + "loss": 0.8335, + "step": 4526 + }, + { + "epoch": 0.7390718746173626, + "grad_norm": 2.17376971244812, + "learning_rate": 1.9937431337354894e-05, + "loss": 0.9482, + "step": 4527 + }, + { + "epoch": 0.7392351332598669, + "grad_norm": 2.136197566986084, + "learning_rate": 1.993739588957649e-05, + "loss": 0.8684, + "step": 4528 + }, + { + "epoch": 0.7393983919023713, + "grad_norm": 1.655583143234253, + "learning_rate": 1.9937360431791136e-05, + "loss": 0.7033, + "step": 4529 + }, + { + "epoch": 0.7395616505448758, + "grad_norm": 2.09268856048584, + "learning_rate": 1.9937324963998858e-05, + "loss": 0.8713, + "step": 4530 + }, + { + "epoch": 0.7397249091873801, + "grad_norm": 1.774221420288086, + "learning_rate": 1.9937289486199696e-05, + "loss": 0.6822, + "step": 4531 + }, + { + "epoch": 0.7398881678298845, + "grad_norm": 2.0438027381896973, + "learning_rate": 1.993725399839369e-05, + "loss": 0.7528, + "step": 4532 + }, + { + "epoch": 0.7400514264723889, + "grad_norm": 1.654295563697815, + "learning_rate": 1.993721850058087e-05, + "loss": 0.6219, + "step": 4533 + }, + { + "epoch": 0.7402146851148933, + "grad_norm": 1.7724084854125977, + "learning_rate": 1.9937182992761276e-05, + "loss": 0.84, + "step": 4534 + }, + { + "epoch": 0.7403779437573976, + "grad_norm": 1.8691307306289673, + "learning_rate": 1.9937147474934936e-05, + "loss": 0.9156, + "step": 4535 + }, + { + "epoch": 0.7405412023999021, + "grad_norm": 1.8682236671447754, + "learning_rate": 1.9937111947101897e-05, + "loss": 0.7704, + "step": 4536 + }, + { + "epoch": 0.7407044610424064, + "grad_norm": 1.8265695571899414, + "learning_rate": 1.9937076409262187e-05, + "loss": 0.8736, + "step": 4537 + }, + { + "epoch": 0.7408677196849108, + "grad_norm": 2.2403807640075684, + "learning_rate": 1.9937040861415843e-05, + "loss": 1.1012, + "step": 4538 + }, + { + "epoch": 0.7410309783274152, + "grad_norm": 2.3451006412506104, + "learning_rate": 1.9937005303562903e-05, + "loss": 1.0008, + "step": 4539 + }, + { + "epoch": 0.7411942369699196, + "grad_norm": 1.5230185985565186, + "learning_rate": 1.9936969735703396e-05, + "loss": 0.5971, + "step": 4540 + }, + { + "epoch": 0.741357495612424, + "grad_norm": 2.101780414581299, + "learning_rate": 1.9936934157837368e-05, + "loss": 0.9586, + "step": 4541 + }, + { + "epoch": 0.7415207542549284, + "grad_norm": 2.001110315322876, + "learning_rate": 1.993689856996485e-05, + "loss": 0.8577, + "step": 4542 + }, + { + "epoch": 0.7416840128974328, + "grad_norm": 1.8413203954696655, + "learning_rate": 1.9936862972085874e-05, + "loss": 0.8253, + "step": 4543 + }, + { + "epoch": 0.7418472715399371, + "grad_norm": 2.646864175796509, + "learning_rate": 1.9936827364200483e-05, + "loss": 0.9044, + "step": 4544 + }, + { + "epoch": 0.7420105301824416, + "grad_norm": 1.8788542747497559, + "learning_rate": 1.993679174630871e-05, + "loss": 0.8153, + "step": 4545 + }, + { + "epoch": 0.7421737888249459, + "grad_norm": 1.7516509294509888, + "learning_rate": 1.993675611841059e-05, + "loss": 0.7019, + "step": 4546 + }, + { + "epoch": 0.7423370474674503, + "grad_norm": 2.01668119430542, + "learning_rate": 1.9936720480506158e-05, + "loss": 0.7598, + "step": 4547 + }, + { + "epoch": 0.7425003061099547, + "grad_norm": 2.1090736389160156, + "learning_rate": 1.993668483259545e-05, + "loss": 0.9602, + "step": 4548 + }, + { + "epoch": 0.7426635647524591, + "grad_norm": 2.140662431716919, + "learning_rate": 1.9936649174678508e-05, + "loss": 0.7182, + "step": 4549 + }, + { + "epoch": 0.7428268233949634, + "grad_norm": 2.0196878910064697, + "learning_rate": 1.9936613506755357e-05, + "loss": 0.8091, + "step": 4550 + }, + { + "epoch": 0.7429900820374679, + "grad_norm": 2.254681348800659, + "learning_rate": 1.993657782882604e-05, + "loss": 0.9833, + "step": 4551 + }, + { + "epoch": 0.7431533406799723, + "grad_norm": 1.9238251447677612, + "learning_rate": 1.9936542140890595e-05, + "loss": 0.8406, + "step": 4552 + }, + { + "epoch": 0.7433165993224766, + "grad_norm": 1.9392521381378174, + "learning_rate": 1.9936506442949054e-05, + "loss": 0.7429, + "step": 4553 + }, + { + "epoch": 0.7434798579649811, + "grad_norm": 1.7947956323623657, + "learning_rate": 1.9936470735001448e-05, + "loss": 0.7023, + "step": 4554 + }, + { + "epoch": 0.7436431166074854, + "grad_norm": 5.196544170379639, + "learning_rate": 1.9936435017047826e-05, + "loss": 0.9003, + "step": 4555 + }, + { + "epoch": 0.7438063752499898, + "grad_norm": 2.1067655086517334, + "learning_rate": 1.9936399289088213e-05, + "loss": 0.8855, + "step": 4556 + }, + { + "epoch": 0.7439696338924942, + "grad_norm": 1.9620591402053833, + "learning_rate": 1.993636355112265e-05, + "loss": 0.7034, + "step": 4557 + }, + { + "epoch": 0.7441328925349986, + "grad_norm": 1.7370753288269043, + "learning_rate": 1.993632780315117e-05, + "loss": 0.6889, + "step": 4558 + }, + { + "epoch": 0.7442961511775029, + "grad_norm": 1.8719536066055298, + "learning_rate": 1.993629204517381e-05, + "loss": 0.8508, + "step": 4559 + }, + { + "epoch": 0.7444594098200074, + "grad_norm": 2.041933536529541, + "learning_rate": 1.9936256277190608e-05, + "loss": 0.8183, + "step": 4560 + }, + { + "epoch": 0.7446226684625117, + "grad_norm": 2.0351321697235107, + "learning_rate": 1.99362204992016e-05, + "loss": 0.8701, + "step": 4561 + }, + { + "epoch": 0.7447859271050161, + "grad_norm": 1.898887038230896, + "learning_rate": 1.993618471120682e-05, + "loss": 0.7686, + "step": 4562 + }, + { + "epoch": 0.7449491857475206, + "grad_norm": 2.1133129596710205, + "learning_rate": 1.99361489132063e-05, + "loss": 0.7291, + "step": 4563 + }, + { + "epoch": 0.7451124443900249, + "grad_norm": 1.756273627281189, + "learning_rate": 1.9936113105200085e-05, + "loss": 0.7525, + "step": 4564 + }, + { + "epoch": 0.7452757030325293, + "grad_norm": 1.82601797580719, + "learning_rate": 1.9936077287188206e-05, + "loss": 0.6829, + "step": 4565 + }, + { + "epoch": 0.7454389616750337, + "grad_norm": 1.9084341526031494, + "learning_rate": 1.99360414591707e-05, + "loss": 0.9182, + "step": 4566 + }, + { + "epoch": 0.7456022203175381, + "grad_norm": 1.4208651781082153, + "learning_rate": 1.9936005621147604e-05, + "loss": 0.5942, + "step": 4567 + }, + { + "epoch": 0.7457654789600424, + "grad_norm": 1.8099150657653809, + "learning_rate": 1.993596977311895e-05, + "loss": 0.7734, + "step": 4568 + }, + { + "epoch": 0.7459287376025469, + "grad_norm": 1.9993034601211548, + "learning_rate": 1.993593391508478e-05, + "loss": 0.8243, + "step": 4569 + }, + { + "epoch": 0.7460919962450512, + "grad_norm": 2.488537311553955, + "learning_rate": 1.9935898047045126e-05, + "loss": 0.8958, + "step": 4570 + }, + { + "epoch": 0.7462552548875556, + "grad_norm": 2.1328558921813965, + "learning_rate": 1.9935862169000023e-05, + "loss": 0.9041, + "step": 4571 + }, + { + "epoch": 0.74641851353006, + "grad_norm": 1.924705982208252, + "learning_rate": 1.9935826280949513e-05, + "loss": 0.8744, + "step": 4572 + }, + { + "epoch": 0.7465817721725644, + "grad_norm": 1.9434748888015747, + "learning_rate": 1.9935790382893626e-05, + "loss": 0.7799, + "step": 4573 + }, + { + "epoch": 0.7467450308150688, + "grad_norm": 2.048208475112915, + "learning_rate": 1.9935754474832403e-05, + "loss": 0.8689, + "step": 4574 + }, + { + "epoch": 0.7469082894575731, + "grad_norm": 2.0442686080932617, + "learning_rate": 1.9935718556765878e-05, + "loss": 0.8505, + "step": 4575 + }, + { + "epoch": 0.7470715481000776, + "grad_norm": 2.0635268688201904, + "learning_rate": 1.9935682628694085e-05, + "loss": 0.8212, + "step": 4576 + }, + { + "epoch": 0.7472348067425819, + "grad_norm": 2.116694927215576, + "learning_rate": 1.9935646690617063e-05, + "loss": 0.8075, + "step": 4577 + }, + { + "epoch": 0.7473980653850864, + "grad_norm": 1.9998581409454346, + "learning_rate": 1.9935610742534845e-05, + "loss": 0.878, + "step": 4578 + }, + { + "epoch": 0.7475613240275907, + "grad_norm": 2.115821123123169, + "learning_rate": 1.9935574784447473e-05, + "loss": 0.6966, + "step": 4579 + }, + { + "epoch": 0.7477245826700951, + "grad_norm": 1.9248027801513672, + "learning_rate": 1.993553881635498e-05, + "loss": 0.7794, + "step": 4580 + }, + { + "epoch": 0.7478878413125994, + "grad_norm": 2.1643948554992676, + "learning_rate": 1.9935502838257403e-05, + "loss": 0.7905, + "step": 4581 + }, + { + "epoch": 0.7480510999551039, + "grad_norm": 2.010206699371338, + "learning_rate": 1.993546685015477e-05, + "loss": 0.7532, + "step": 4582 + }, + { + "epoch": 0.7482143585976082, + "grad_norm": 1.829881191253662, + "learning_rate": 1.9935430852047133e-05, + "loss": 0.8127, + "step": 4583 + }, + { + "epoch": 0.7483776172401126, + "grad_norm": 1.9014230966567993, + "learning_rate": 1.9935394843934513e-05, + "loss": 0.6807, + "step": 4584 + }, + { + "epoch": 0.7485408758826171, + "grad_norm": 1.834202766418457, + "learning_rate": 1.9935358825816958e-05, + "loss": 0.7037, + "step": 4585 + }, + { + "epoch": 0.7487041345251214, + "grad_norm": 1.898146390914917, + "learning_rate": 1.99353227976945e-05, + "loss": 0.755, + "step": 4586 + }, + { + "epoch": 0.7488673931676258, + "grad_norm": 2.0748255252838135, + "learning_rate": 1.993528675956717e-05, + "loss": 0.8054, + "step": 4587 + }, + { + "epoch": 0.7490306518101302, + "grad_norm": 1.707938551902771, + "learning_rate": 1.9935250711435016e-05, + "loss": 0.6984, + "step": 4588 + }, + { + "epoch": 0.7491939104526346, + "grad_norm": 2.3744587898254395, + "learning_rate": 1.9935214653298057e-05, + "loss": 0.7162, + "step": 4589 + }, + { + "epoch": 0.7493571690951389, + "grad_norm": 2.039430856704712, + "learning_rate": 1.993517858515635e-05, + "loss": 0.8329, + "step": 4590 + }, + { + "epoch": 0.7495204277376434, + "grad_norm": 2.0659639835357666, + "learning_rate": 1.9935142507009914e-05, + "loss": 0.9205, + "step": 4591 + }, + { + "epoch": 0.7496836863801477, + "grad_norm": 2.235739231109619, + "learning_rate": 1.9935106418858793e-05, + "loss": 0.882, + "step": 4592 + }, + { + "epoch": 0.7498469450226521, + "grad_norm": 1.8451383113861084, + "learning_rate": 1.993507032070302e-05, + "loss": 0.7907, + "step": 4593 + }, + { + "epoch": 0.7500102036651565, + "grad_norm": 1.8548585176467896, + "learning_rate": 1.993503421254264e-05, + "loss": 0.8617, + "step": 4594 + }, + { + "epoch": 0.7501734623076609, + "grad_norm": 2.1970179080963135, + "learning_rate": 1.993499809437768e-05, + "loss": 0.7481, + "step": 4595 + }, + { + "epoch": 0.7503367209501653, + "grad_norm": 1.9062014818191528, + "learning_rate": 1.993496196620818e-05, + "loss": 0.6913, + "step": 4596 + }, + { + "epoch": 0.7504999795926697, + "grad_norm": 1.8940649032592773, + "learning_rate": 1.9934925828034174e-05, + "loss": 0.8878, + "step": 4597 + }, + { + "epoch": 0.7506632382351741, + "grad_norm": 1.7794382572174072, + "learning_rate": 1.9934889679855706e-05, + "loss": 0.8393, + "step": 4598 + }, + { + "epoch": 0.7508264968776784, + "grad_norm": 1.7150719165802002, + "learning_rate": 1.9934853521672802e-05, + "loss": 0.8862, + "step": 4599 + }, + { + "epoch": 0.7509897555201829, + "grad_norm": 2.082970380783081, + "learning_rate": 1.99348173534855e-05, + "loss": 0.8946, + "step": 4600 + }, + { + "epoch": 0.7511530141626872, + "grad_norm": 1.7865692377090454, + "learning_rate": 1.9934781175293847e-05, + "loss": 0.653, + "step": 4601 + }, + { + "epoch": 0.7513162728051916, + "grad_norm": 2.137843132019043, + "learning_rate": 1.993474498709787e-05, + "loss": 0.7865, + "step": 4602 + }, + { + "epoch": 0.751479531447696, + "grad_norm": 2.069361686706543, + "learning_rate": 1.9934708788897606e-05, + "loss": 0.9781, + "step": 4603 + }, + { + "epoch": 0.7516427900902004, + "grad_norm": 2.074106454849243, + "learning_rate": 1.993467258069309e-05, + "loss": 0.7962, + "step": 4604 + }, + { + "epoch": 0.7518060487327047, + "grad_norm": 1.688706874847412, + "learning_rate": 1.9934636362484364e-05, + "loss": 0.6754, + "step": 4605 + }, + { + "epoch": 0.7519693073752092, + "grad_norm": 1.7413018941879272, + "learning_rate": 1.9934600134271463e-05, + "loss": 0.74, + "step": 4606 + }, + { + "epoch": 0.7521325660177136, + "grad_norm": 1.9484628438949585, + "learning_rate": 1.9934563896054423e-05, + "loss": 0.7287, + "step": 4607 + }, + { + "epoch": 0.7522958246602179, + "grad_norm": 1.9032983779907227, + "learning_rate": 1.9934527647833276e-05, + "loss": 0.6762, + "step": 4608 + }, + { + "epoch": 0.7524590833027224, + "grad_norm": 1.8567625284194946, + "learning_rate": 1.9934491389608067e-05, + "loss": 0.736, + "step": 4609 + }, + { + "epoch": 0.7526223419452267, + "grad_norm": 2.129917621612549, + "learning_rate": 1.9934455121378822e-05, + "loss": 0.7626, + "step": 4610 + }, + { + "epoch": 0.7527856005877311, + "grad_norm": 1.8982903957366943, + "learning_rate": 1.9934418843145587e-05, + "loss": 0.8415, + "step": 4611 + }, + { + "epoch": 0.7529488592302355, + "grad_norm": 2.0184431076049805, + "learning_rate": 1.9934382554908395e-05, + "loss": 0.8828, + "step": 4612 + }, + { + "epoch": 0.7531121178727399, + "grad_norm": 2.0772407054901123, + "learning_rate": 1.9934346256667282e-05, + "loss": 0.6562, + "step": 4613 + }, + { + "epoch": 0.7532753765152442, + "grad_norm": 2.028277635574341, + "learning_rate": 1.9934309948422287e-05, + "loss": 0.9042, + "step": 4614 + }, + { + "epoch": 0.7534386351577487, + "grad_norm": 1.9939104318618774, + "learning_rate": 1.993427363017344e-05, + "loss": 0.7808, + "step": 4615 + }, + { + "epoch": 0.753601893800253, + "grad_norm": 1.923258662223816, + "learning_rate": 1.9934237301920785e-05, + "loss": 0.8258, + "step": 4616 + }, + { + "epoch": 0.7537651524427574, + "grad_norm": 1.8246545791625977, + "learning_rate": 1.9934200963664356e-05, + "loss": 0.7033, + "step": 4617 + }, + { + "epoch": 0.7539284110852619, + "grad_norm": 2.1785573959350586, + "learning_rate": 1.993416461540419e-05, + "loss": 0.8981, + "step": 4618 + }, + { + "epoch": 0.7540916697277662, + "grad_norm": 2.1557085514068604, + "learning_rate": 1.993412825714032e-05, + "loss": 0.9544, + "step": 4619 + }, + { + "epoch": 0.7542549283702706, + "grad_norm": 2.243162155151367, + "learning_rate": 1.9934091888872785e-05, + "loss": 0.8437, + "step": 4620 + }, + { + "epoch": 0.754418187012775, + "grad_norm": 1.753284215927124, + "learning_rate": 1.9934055510601625e-05, + "loss": 0.7302, + "step": 4621 + }, + { + "epoch": 0.7545814456552794, + "grad_norm": 1.8497236967086792, + "learning_rate": 1.9934019122326873e-05, + "loss": 0.7348, + "step": 4622 + }, + { + "epoch": 0.7547447042977837, + "grad_norm": 1.950338363647461, + "learning_rate": 1.9933982724048568e-05, + "loss": 0.673, + "step": 4623 + }, + { + "epoch": 0.7549079629402882, + "grad_norm": 1.8835318088531494, + "learning_rate": 1.9933946315766742e-05, + "loss": 0.7368, + "step": 4624 + }, + { + "epoch": 0.7550712215827925, + "grad_norm": 2.0387613773345947, + "learning_rate": 1.9933909897481434e-05, + "loss": 0.7548, + "step": 4625 + }, + { + "epoch": 0.7552344802252969, + "grad_norm": 1.9279372692108154, + "learning_rate": 1.9933873469192683e-05, + "loss": 0.7164, + "step": 4626 + }, + { + "epoch": 0.7553977388678013, + "grad_norm": 2.3233654499053955, + "learning_rate": 1.993383703090052e-05, + "loss": 0.8276, + "step": 4627 + }, + { + "epoch": 0.7555609975103057, + "grad_norm": 2.139484167098999, + "learning_rate": 1.9933800582604994e-05, + "loss": 0.8029, + "step": 4628 + }, + { + "epoch": 0.7557242561528101, + "grad_norm": 1.6661276817321777, + "learning_rate": 1.993376412430613e-05, + "loss": 0.5543, + "step": 4629 + }, + { + "epoch": 0.7558875147953145, + "grad_norm": 2.394298553466797, + "learning_rate": 1.9933727656003964e-05, + "loss": 0.9944, + "step": 4630 + }, + { + "epoch": 0.7560507734378189, + "grad_norm": 2.443758249282837, + "learning_rate": 1.993369117769854e-05, + "loss": 1.0998, + "step": 4631 + }, + { + "epoch": 0.7562140320803232, + "grad_norm": 1.943636178970337, + "learning_rate": 1.9933654689389893e-05, + "loss": 0.7691, + "step": 4632 + }, + { + "epoch": 0.7563772907228277, + "grad_norm": 2.244948387145996, + "learning_rate": 1.993361819107806e-05, + "loss": 1.0348, + "step": 4633 + }, + { + "epoch": 0.756540549365332, + "grad_norm": 2.144317626953125, + "learning_rate": 1.993358168276307e-05, + "loss": 0.7825, + "step": 4634 + }, + { + "epoch": 0.7567038080078364, + "grad_norm": 2.0784707069396973, + "learning_rate": 1.9933545164444973e-05, + "loss": 0.7839, + "step": 4635 + }, + { + "epoch": 0.7568670666503408, + "grad_norm": 1.8936043977737427, + "learning_rate": 1.9933508636123793e-05, + "loss": 0.6154, + "step": 4636 + }, + { + "epoch": 0.7570303252928452, + "grad_norm": 1.9787250757217407, + "learning_rate": 1.9933472097799574e-05, + "loss": 0.878, + "step": 4637 + }, + { + "epoch": 0.7571935839353495, + "grad_norm": 2.2274301052093506, + "learning_rate": 1.9933435549472354e-05, + "loss": 0.902, + "step": 4638 + }, + { + "epoch": 0.757356842577854, + "grad_norm": 2.0145561695098877, + "learning_rate": 1.993339899114216e-05, + "loss": 0.8602, + "step": 4639 + }, + { + "epoch": 0.7575201012203584, + "grad_norm": 1.7700239419937134, + "learning_rate": 1.9933362422809043e-05, + "loss": 0.7816, + "step": 4640 + }, + { + "epoch": 0.7576833598628627, + "grad_norm": 1.8755545616149902, + "learning_rate": 1.993332584447303e-05, + "loss": 0.7278, + "step": 4641 + }, + { + "epoch": 0.7578466185053672, + "grad_norm": 1.8476940393447876, + "learning_rate": 1.9933289256134162e-05, + "loss": 0.764, + "step": 4642 + }, + { + "epoch": 0.7580098771478715, + "grad_norm": 1.626434564590454, + "learning_rate": 1.993325265779247e-05, + "loss": 0.5623, + "step": 4643 + }, + { + "epoch": 0.7581731357903759, + "grad_norm": 1.8222821950912476, + "learning_rate": 1.9933216049448003e-05, + "loss": 0.7164, + "step": 4644 + }, + { + "epoch": 0.7583363944328803, + "grad_norm": 1.6290500164031982, + "learning_rate": 1.9933179431100783e-05, + "loss": 0.5175, + "step": 4645 + }, + { + "epoch": 0.7584996530753847, + "grad_norm": 1.8754215240478516, + "learning_rate": 1.9933142802750856e-05, + "loss": 0.7563, + "step": 4646 + }, + { + "epoch": 0.758662911717889, + "grad_norm": 1.8576394319534302, + "learning_rate": 1.9933106164398257e-05, + "loss": 0.7555, + "step": 4647 + }, + { + "epoch": 0.7588261703603935, + "grad_norm": 1.7232359647750854, + "learning_rate": 1.993306951604302e-05, + "loss": 0.612, + "step": 4648 + }, + { + "epoch": 0.7589894290028978, + "grad_norm": 1.851973295211792, + "learning_rate": 1.9933032857685187e-05, + "loss": 0.7261, + "step": 4649 + }, + { + "epoch": 0.7591526876454022, + "grad_norm": 1.763386607170105, + "learning_rate": 1.9932996189324796e-05, + "loss": 0.7674, + "step": 4650 + }, + { + "epoch": 0.7593159462879067, + "grad_norm": 1.9506514072418213, + "learning_rate": 1.9932959510961877e-05, + "loss": 0.7898, + "step": 4651 + }, + { + "epoch": 0.759479204930411, + "grad_norm": 1.608551263809204, + "learning_rate": 1.993292282259647e-05, + "loss": 0.6283, + "step": 4652 + }, + { + "epoch": 0.7596424635729154, + "grad_norm": 1.7226485013961792, + "learning_rate": 1.9932886124228615e-05, + "loss": 0.6519, + "step": 4653 + }, + { + "epoch": 0.7598057222154198, + "grad_norm": 2.018169641494751, + "learning_rate": 1.9932849415858344e-05, + "loss": 0.6938, + "step": 4654 + }, + { + "epoch": 0.7599689808579242, + "grad_norm": 2.0303313732147217, + "learning_rate": 1.9932812697485695e-05, + "loss": 0.9363, + "step": 4655 + }, + { + "epoch": 0.7601322395004285, + "grad_norm": 1.918703556060791, + "learning_rate": 1.993277596911071e-05, + "loss": 0.8338, + "step": 4656 + }, + { + "epoch": 0.760295498142933, + "grad_norm": 2.071023464202881, + "learning_rate": 1.993273923073342e-05, + "loss": 0.8917, + "step": 4657 + }, + { + "epoch": 0.7604587567854373, + "grad_norm": 1.9688845872879028, + "learning_rate": 1.9932702482353864e-05, + "loss": 0.7896, + "step": 4658 + }, + { + "epoch": 0.7606220154279417, + "grad_norm": 1.9989173412322998, + "learning_rate": 1.993266572397208e-05, + "loss": 0.8769, + "step": 4659 + }, + { + "epoch": 0.760785274070446, + "grad_norm": 2.200631856918335, + "learning_rate": 1.9932628955588103e-05, + "loss": 0.7868, + "step": 4660 + }, + { + "epoch": 0.7609485327129505, + "grad_norm": 2.1566972732543945, + "learning_rate": 1.9932592177201974e-05, + "loss": 0.7045, + "step": 4661 + }, + { + "epoch": 0.7611117913554549, + "grad_norm": 1.9910906553268433, + "learning_rate": 1.9932555388813727e-05, + "loss": 0.8742, + "step": 4662 + }, + { + "epoch": 0.7612750499979593, + "grad_norm": 1.8481934070587158, + "learning_rate": 1.9932518590423396e-05, + "loss": 0.737, + "step": 4663 + }, + { + "epoch": 0.7614383086404637, + "grad_norm": 2.0628092288970947, + "learning_rate": 1.9932481782031023e-05, + "loss": 0.8166, + "step": 4664 + }, + { + "epoch": 0.761601567282968, + "grad_norm": 2.189767360687256, + "learning_rate": 1.9932444963636644e-05, + "loss": 0.869, + "step": 4665 + }, + { + "epoch": 0.7617648259254725, + "grad_norm": 2.3621294498443604, + "learning_rate": 1.9932408135240297e-05, + "loss": 0.9071, + "step": 4666 + }, + { + "epoch": 0.7619280845679768, + "grad_norm": 1.9231842756271362, + "learning_rate": 1.9932371296842015e-05, + "loss": 0.8447, + "step": 4667 + }, + { + "epoch": 0.7620913432104812, + "grad_norm": 1.7949057817459106, + "learning_rate": 1.993233444844184e-05, + "loss": 0.7444, + "step": 4668 + }, + { + "epoch": 0.7622546018529855, + "grad_norm": 1.9224872589111328, + "learning_rate": 1.9932297590039804e-05, + "loss": 0.8879, + "step": 4669 + }, + { + "epoch": 0.76241786049549, + "grad_norm": 1.7565805912017822, + "learning_rate": 1.9932260721635946e-05, + "loss": 0.6991, + "step": 4670 + }, + { + "epoch": 0.7625811191379943, + "grad_norm": 2.1124050617218018, + "learning_rate": 1.993222384323031e-05, + "loss": 0.9798, + "step": 4671 + }, + { + "epoch": 0.7627443777804988, + "grad_norm": 1.6439214944839478, + "learning_rate": 1.993218695482292e-05, + "loss": 0.6485, + "step": 4672 + }, + { + "epoch": 0.7629076364230032, + "grad_norm": 2.0205078125, + "learning_rate": 1.993215005641383e-05, + "loss": 0.872, + "step": 4673 + }, + { + "epoch": 0.7630708950655075, + "grad_norm": 1.8556334972381592, + "learning_rate": 1.9932113148003057e-05, + "loss": 0.686, + "step": 4674 + }, + { + "epoch": 0.763234153708012, + "grad_norm": 1.7366294860839844, + "learning_rate": 1.9932076229590655e-05, + "loss": 0.812, + "step": 4675 + }, + { + "epoch": 0.7633974123505163, + "grad_norm": 1.9102355241775513, + "learning_rate": 1.9932039301176654e-05, + "loss": 0.8177, + "step": 4676 + }, + { + "epoch": 0.7635606709930207, + "grad_norm": 1.694512128829956, + "learning_rate": 1.993200236276109e-05, + "loss": 0.6744, + "step": 4677 + }, + { + "epoch": 0.763723929635525, + "grad_norm": 2.0839977264404297, + "learning_rate": 1.9931965414344004e-05, + "loss": 0.8284, + "step": 4678 + }, + { + "epoch": 0.7638871882780295, + "grad_norm": 1.9487155675888062, + "learning_rate": 1.9931928455925433e-05, + "loss": 0.7822, + "step": 4679 + }, + { + "epoch": 0.7640504469205338, + "grad_norm": 1.6613975763320923, + "learning_rate": 1.993189148750541e-05, + "loss": 0.564, + "step": 4680 + }, + { + "epoch": 0.7642137055630382, + "grad_norm": 2.0825307369232178, + "learning_rate": 1.9931854509083975e-05, + "loss": 0.7952, + "step": 4681 + }, + { + "epoch": 0.7643769642055427, + "grad_norm": 1.6338077783584595, + "learning_rate": 1.9931817520661165e-05, + "loss": 0.6926, + "step": 4682 + }, + { + "epoch": 0.764540222848047, + "grad_norm": 1.9700241088867188, + "learning_rate": 1.9931780522237018e-05, + "loss": 0.8455, + "step": 4683 + }, + { + "epoch": 0.7647034814905515, + "grad_norm": 2.0447447299957275, + "learning_rate": 1.9931743513811573e-05, + "loss": 0.8625, + "step": 4684 + }, + { + "epoch": 0.7648667401330558, + "grad_norm": 1.913874864578247, + "learning_rate": 1.9931706495384865e-05, + "loss": 0.7448, + "step": 4685 + }, + { + "epoch": 0.7650299987755602, + "grad_norm": 2.4198153018951416, + "learning_rate": 1.9931669466956927e-05, + "loss": 0.8871, + "step": 4686 + }, + { + "epoch": 0.7651932574180645, + "grad_norm": 2.0204646587371826, + "learning_rate": 1.9931632428527803e-05, + "loss": 0.8744, + "step": 4687 + }, + { + "epoch": 0.765356516060569, + "grad_norm": 1.941763997077942, + "learning_rate": 1.9931595380097524e-05, + "loss": 0.8268, + "step": 4688 + }, + { + "epoch": 0.7655197747030733, + "grad_norm": 1.6127785444259644, + "learning_rate": 1.9931558321666134e-05, + "loss": 0.6734, + "step": 4689 + }, + { + "epoch": 0.7656830333455777, + "grad_norm": 1.7478084564208984, + "learning_rate": 1.993152125323367e-05, + "loss": 0.728, + "step": 4690 + }, + { + "epoch": 0.7658462919880821, + "grad_norm": 2.241806745529175, + "learning_rate": 1.9931484174800163e-05, + "loss": 0.8486, + "step": 4691 + }, + { + "epoch": 0.7660095506305865, + "grad_norm": 2.177450180053711, + "learning_rate": 1.9931447086365657e-05, + "loss": 0.8279, + "step": 4692 + }, + { + "epoch": 0.766172809273091, + "grad_norm": 2.2248799800872803, + "learning_rate": 1.9931409987930185e-05, + "loss": 1.0418, + "step": 4693 + }, + { + "epoch": 0.7663360679155953, + "grad_norm": 1.9920138120651245, + "learning_rate": 1.9931372879493788e-05, + "loss": 0.5998, + "step": 4694 + }, + { + "epoch": 0.7664993265580997, + "grad_norm": 1.8563570976257324, + "learning_rate": 1.9931335761056497e-05, + "loss": 0.8525, + "step": 4695 + }, + { + "epoch": 0.766662585200604, + "grad_norm": 1.9834612607955933, + "learning_rate": 1.9931298632618355e-05, + "loss": 0.7254, + "step": 4696 + }, + { + "epoch": 0.7668258438431085, + "grad_norm": 1.8087358474731445, + "learning_rate": 1.9931261494179398e-05, + "loss": 0.7608, + "step": 4697 + }, + { + "epoch": 0.7669891024856128, + "grad_norm": 2.1756250858306885, + "learning_rate": 1.9931224345739664e-05, + "loss": 0.7805, + "step": 4698 + }, + { + "epoch": 0.7671523611281172, + "grad_norm": 2.1738898754119873, + "learning_rate": 1.993118718729919e-05, + "loss": 0.7425, + "step": 4699 + }, + { + "epoch": 0.7673156197706216, + "grad_norm": 1.6675218343734741, + "learning_rate": 1.9931150018858013e-05, + "loss": 0.7173, + "step": 4700 + }, + { + "epoch": 0.767478878413126, + "grad_norm": 1.959957480430603, + "learning_rate": 1.993111284041617e-05, + "loss": 0.9401, + "step": 4701 + }, + { + "epoch": 0.7676421370556303, + "grad_norm": 2.18278431892395, + "learning_rate": 1.99310756519737e-05, + "loss": 0.7895, + "step": 4702 + }, + { + "epoch": 0.7678053956981348, + "grad_norm": 1.9910626411437988, + "learning_rate": 1.993103845353064e-05, + "loss": 0.8391, + "step": 4703 + }, + { + "epoch": 0.7679686543406392, + "grad_norm": 1.7729636430740356, + "learning_rate": 1.9931001245087024e-05, + "loss": 0.7613, + "step": 4704 + }, + { + "epoch": 0.7681319129831435, + "grad_norm": 2.279572010040283, + "learning_rate": 1.993096402664289e-05, + "loss": 0.7986, + "step": 4705 + }, + { + "epoch": 0.768295171625648, + "grad_norm": 1.9377883672714233, + "learning_rate": 1.9930926798198286e-05, + "loss": 0.8515, + "step": 4706 + }, + { + "epoch": 0.7684584302681523, + "grad_norm": 1.964151382446289, + "learning_rate": 1.9930889559753235e-05, + "loss": 0.6435, + "step": 4707 + }, + { + "epoch": 0.7686216889106567, + "grad_norm": 2.023728370666504, + "learning_rate": 1.993085231130778e-05, + "loss": 0.7854, + "step": 4708 + }, + { + "epoch": 0.7687849475531611, + "grad_norm": 1.7447378635406494, + "learning_rate": 1.9930815052861964e-05, + "loss": 0.6915, + "step": 4709 + }, + { + "epoch": 0.7689482061956655, + "grad_norm": 2.0710904598236084, + "learning_rate": 1.993077778441582e-05, + "loss": 0.8041, + "step": 4710 + }, + { + "epoch": 0.7691114648381698, + "grad_norm": 1.5173877477645874, + "learning_rate": 1.9930740505969383e-05, + "loss": 0.6818, + "step": 4711 + }, + { + "epoch": 0.7692747234806743, + "grad_norm": 1.8935774564743042, + "learning_rate": 1.9930703217522693e-05, + "loss": 0.7381, + "step": 4712 + }, + { + "epoch": 0.7694379821231786, + "grad_norm": 1.7899399995803833, + "learning_rate": 1.993066591907579e-05, + "loss": 0.6607, + "step": 4713 + }, + { + "epoch": 0.769601240765683, + "grad_norm": 2.166910171508789, + "learning_rate": 1.9930628610628703e-05, + "loss": 0.7556, + "step": 4714 + }, + { + "epoch": 0.7697644994081875, + "grad_norm": 1.7777835130691528, + "learning_rate": 1.993059129218148e-05, + "loss": 0.6343, + "step": 4715 + }, + { + "epoch": 0.7699277580506918, + "grad_norm": 1.8069733381271362, + "learning_rate": 1.9930553963734155e-05, + "loss": 0.8172, + "step": 4716 + }, + { + "epoch": 0.7700910166931962, + "grad_norm": 2.0318892002105713, + "learning_rate": 1.9930516625286764e-05, + "loss": 0.7681, + "step": 4717 + }, + { + "epoch": 0.7702542753357006, + "grad_norm": 2.02823543548584, + "learning_rate": 1.9930479276839347e-05, + "loss": 0.8, + "step": 4718 + }, + { + "epoch": 0.770417533978205, + "grad_norm": 1.9297562837600708, + "learning_rate": 1.9930441918391933e-05, + "loss": 0.7559, + "step": 4719 + }, + { + "epoch": 0.7705807926207093, + "grad_norm": 2.3635354042053223, + "learning_rate": 1.9930404549944577e-05, + "loss": 0.9671, + "step": 4720 + }, + { + "epoch": 0.7707440512632138, + "grad_norm": 2.5022261142730713, + "learning_rate": 1.99303671714973e-05, + "loss": 0.8454, + "step": 4721 + }, + { + "epoch": 0.7709073099057181, + "grad_norm": 2.095069169998169, + "learning_rate": 1.9930329783050146e-05, + "loss": 0.7869, + "step": 4722 + }, + { + "epoch": 0.7710705685482225, + "grad_norm": 2.1346852779388428, + "learning_rate": 1.9930292384603153e-05, + "loss": 1.0629, + "step": 4723 + }, + { + "epoch": 0.7712338271907269, + "grad_norm": 1.9955021142959595, + "learning_rate": 1.993025497615636e-05, + "loss": 0.8815, + "step": 4724 + }, + { + "epoch": 0.7713970858332313, + "grad_norm": 1.7011231184005737, + "learning_rate": 1.99302175577098e-05, + "loss": 0.6644, + "step": 4725 + }, + { + "epoch": 0.7715603444757357, + "grad_norm": 1.6701023578643799, + "learning_rate": 1.9930180129263516e-05, + "loss": 0.7036, + "step": 4726 + }, + { + "epoch": 0.7717236031182401, + "grad_norm": 2.1168112754821777, + "learning_rate": 1.993014269081754e-05, + "loss": 0.8638, + "step": 4727 + }, + { + "epoch": 0.7718868617607445, + "grad_norm": 1.7615264654159546, + "learning_rate": 1.9930105242371916e-05, + "loss": 0.6784, + "step": 4728 + }, + { + "epoch": 0.7720501204032488, + "grad_norm": 1.9493588209152222, + "learning_rate": 1.9930067783926676e-05, + "loss": 0.7227, + "step": 4729 + }, + { + "epoch": 0.7722133790457533, + "grad_norm": 2.3278698921203613, + "learning_rate": 1.9930030315481862e-05, + "loss": 0.8652, + "step": 4730 + }, + { + "epoch": 0.7723766376882576, + "grad_norm": 2.0557870864868164, + "learning_rate": 1.992999283703751e-05, + "loss": 0.7313, + "step": 4731 + }, + { + "epoch": 0.772539896330762, + "grad_norm": 1.932987928390503, + "learning_rate": 1.992995534859366e-05, + "loss": 0.9973, + "step": 4732 + }, + { + "epoch": 0.7727031549732664, + "grad_norm": 2.2142040729522705, + "learning_rate": 1.9929917850150344e-05, + "loss": 0.8792, + "step": 4733 + }, + { + "epoch": 0.7728664136157708, + "grad_norm": 2.124579906463623, + "learning_rate": 1.9929880341707605e-05, + "loss": 0.9059, + "step": 4734 + }, + { + "epoch": 0.7730296722582751, + "grad_norm": 1.956519365310669, + "learning_rate": 1.992984282326548e-05, + "loss": 0.8621, + "step": 4735 + }, + { + "epoch": 0.7731929309007796, + "grad_norm": 1.8126347064971924, + "learning_rate": 1.9929805294824004e-05, + "loss": 0.7112, + "step": 4736 + }, + { + "epoch": 0.773356189543284, + "grad_norm": 2.1600191593170166, + "learning_rate": 1.9929767756383217e-05, + "loss": 0.9415, + "step": 4737 + }, + { + "epoch": 0.7735194481857883, + "grad_norm": 2.1210319995880127, + "learning_rate": 1.992973020794316e-05, + "loss": 0.7454, + "step": 4738 + }, + { + "epoch": 0.7736827068282928, + "grad_norm": 2.0887176990509033, + "learning_rate": 1.9929692649503866e-05, + "loss": 0.9213, + "step": 4739 + }, + { + "epoch": 0.7738459654707971, + "grad_norm": 2.2875921726226807, + "learning_rate": 1.992965508106537e-05, + "loss": 1.0257, + "step": 4740 + }, + { + "epoch": 0.7740092241133015, + "grad_norm": 1.7170697450637817, + "learning_rate": 1.9929617502627717e-05, + "loss": 0.7716, + "step": 4741 + }, + { + "epoch": 0.7741724827558059, + "grad_norm": 2.331270933151245, + "learning_rate": 1.9929579914190943e-05, + "loss": 1.0241, + "step": 4742 + }, + { + "epoch": 0.7743357413983103, + "grad_norm": 1.8073878288269043, + "learning_rate": 1.9929542315755083e-05, + "loss": 0.7575, + "step": 4743 + }, + { + "epoch": 0.7744990000408146, + "grad_norm": 2.0143723487854004, + "learning_rate": 1.9929504707320176e-05, + "loss": 0.9307, + "step": 4744 + }, + { + "epoch": 0.7746622586833191, + "grad_norm": 1.470751404762268, + "learning_rate": 1.992946708888626e-05, + "loss": 0.625, + "step": 4745 + }, + { + "epoch": 0.7748255173258234, + "grad_norm": 2.1794958114624023, + "learning_rate": 1.9929429460453377e-05, + "loss": 0.9388, + "step": 4746 + }, + { + "epoch": 0.7749887759683278, + "grad_norm": 2.0387630462646484, + "learning_rate": 1.9929391822021556e-05, + "loss": 0.8402, + "step": 4747 + }, + { + "epoch": 0.7751520346108323, + "grad_norm": 2.002237319946289, + "learning_rate": 1.9929354173590844e-05, + "loss": 0.8698, + "step": 4748 + }, + { + "epoch": 0.7753152932533366, + "grad_norm": 1.8765798807144165, + "learning_rate": 1.9929316515161274e-05, + "loss": 0.8289, + "step": 4749 + }, + { + "epoch": 0.775478551895841, + "grad_norm": 2.0631752014160156, + "learning_rate": 1.9929278846732883e-05, + "loss": 0.7836, + "step": 4750 + }, + { + "epoch": 0.7756418105383454, + "grad_norm": 1.8475792407989502, + "learning_rate": 1.9929241168305715e-05, + "loss": 0.7923, + "step": 4751 + }, + { + "epoch": 0.7758050691808498, + "grad_norm": 1.8382411003112793, + "learning_rate": 1.9929203479879798e-05, + "loss": 0.7335, + "step": 4752 + }, + { + "epoch": 0.7759683278233541, + "grad_norm": 2.214232921600342, + "learning_rate": 1.992916578145518e-05, + "loss": 0.7798, + "step": 4753 + }, + { + "epoch": 0.7761315864658586, + "grad_norm": 1.7648773193359375, + "learning_rate": 1.9929128073031894e-05, + "loss": 0.7423, + "step": 4754 + }, + { + "epoch": 0.7762948451083629, + "grad_norm": 1.9422231912612915, + "learning_rate": 1.992909035460998e-05, + "loss": 0.7501, + "step": 4755 + }, + { + "epoch": 0.7764581037508673, + "grad_norm": 1.9625016450881958, + "learning_rate": 1.992905262618947e-05, + "loss": 0.7894, + "step": 4756 + }, + { + "epoch": 0.7766213623933717, + "grad_norm": 1.9452979564666748, + "learning_rate": 1.992901488777041e-05, + "loss": 1.0352, + "step": 4757 + }, + { + "epoch": 0.7767846210358761, + "grad_norm": 2.0180585384368896, + "learning_rate": 1.9928977139352836e-05, + "loss": 0.9555, + "step": 4758 + }, + { + "epoch": 0.7769478796783805, + "grad_norm": 2.206775665283203, + "learning_rate": 1.992893938093678e-05, + "loss": 0.9121, + "step": 4759 + }, + { + "epoch": 0.7771111383208849, + "grad_norm": 2.0719492435455322, + "learning_rate": 1.992890161252229e-05, + "loss": 1.0306, + "step": 4760 + }, + { + "epoch": 0.7772743969633893, + "grad_norm": 1.9695645570755005, + "learning_rate": 1.9928863834109397e-05, + "loss": 0.7072, + "step": 4761 + }, + { + "epoch": 0.7774376556058936, + "grad_norm": 2.0175983905792236, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.7949, + "step": 4762 + }, + { + "epoch": 0.7776009142483981, + "grad_norm": 1.643094539642334, + "learning_rate": 1.9928788247288557e-05, + "loss": 0.8299, + "step": 4763 + }, + { + "epoch": 0.7777641728909024, + "grad_norm": 1.770142674446106, + "learning_rate": 1.9928750438880687e-05, + "loss": 0.6598, + "step": 4764 + }, + { + "epoch": 0.7779274315334068, + "grad_norm": 2.1587982177734375, + "learning_rate": 1.992871262047457e-05, + "loss": 0.8087, + "step": 4765 + }, + { + "epoch": 0.7780906901759111, + "grad_norm": 1.7740154266357422, + "learning_rate": 1.992867479207024e-05, + "loss": 0.8044, + "step": 4766 + }, + { + "epoch": 0.7782539488184156, + "grad_norm": 1.7717618942260742, + "learning_rate": 1.9928636953667734e-05, + "loss": 0.7726, + "step": 4767 + }, + { + "epoch": 0.7784172074609199, + "grad_norm": 2.0453851222991943, + "learning_rate": 1.9928599105267098e-05, + "loss": 0.8473, + "step": 4768 + }, + { + "epoch": 0.7785804661034244, + "grad_norm": 1.8253381252288818, + "learning_rate": 1.9928561246868367e-05, + "loss": 0.6764, + "step": 4769 + }, + { + "epoch": 0.7787437247459288, + "grad_norm": 1.5525867938995361, + "learning_rate": 1.9928523378471573e-05, + "loss": 0.7257, + "step": 4770 + }, + { + "epoch": 0.7789069833884331, + "grad_norm": 2.183997631072998, + "learning_rate": 1.992848550007676e-05, + "loss": 0.8833, + "step": 4771 + }, + { + "epoch": 0.7790702420309376, + "grad_norm": 2.07392954826355, + "learning_rate": 1.9928447611683964e-05, + "loss": 1.0782, + "step": 4772 + }, + { + "epoch": 0.7792335006734419, + "grad_norm": 1.9873499870300293, + "learning_rate": 1.9928409713293226e-05, + "loss": 0.7835, + "step": 4773 + }, + { + "epoch": 0.7793967593159463, + "grad_norm": 1.7642195224761963, + "learning_rate": 1.992837180490458e-05, + "loss": 0.6764, + "step": 4774 + }, + { + "epoch": 0.7795600179584506, + "grad_norm": 1.6348297595977783, + "learning_rate": 1.992833388651807e-05, + "loss": 0.7414, + "step": 4775 + }, + { + "epoch": 0.7797232766009551, + "grad_norm": 2.057547092437744, + "learning_rate": 1.9928295958133726e-05, + "loss": 0.7635, + "step": 4776 + }, + { + "epoch": 0.7798865352434594, + "grad_norm": 2.053492784500122, + "learning_rate": 1.992825801975159e-05, + "loss": 0.7378, + "step": 4777 + }, + { + "epoch": 0.7800497938859638, + "grad_norm": 1.648028016090393, + "learning_rate": 1.9928220071371706e-05, + "loss": 0.7207, + "step": 4778 + }, + { + "epoch": 0.7802130525284682, + "grad_norm": 1.8631021976470947, + "learning_rate": 1.9928182112994105e-05, + "loss": 0.782, + "step": 4779 + }, + { + "epoch": 0.7803763111709726, + "grad_norm": 1.5458502769470215, + "learning_rate": 1.9928144144618824e-05, + "loss": 0.6672, + "step": 4780 + }, + { + "epoch": 0.780539569813477, + "grad_norm": 1.768155574798584, + "learning_rate": 1.9928106166245906e-05, + "loss": 0.7477, + "step": 4781 + }, + { + "epoch": 0.7807028284559814, + "grad_norm": 2.1225619316101074, + "learning_rate": 1.992806817787539e-05, + "loss": 0.9158, + "step": 4782 + }, + { + "epoch": 0.7808660870984858, + "grad_norm": 2.369610071182251, + "learning_rate": 1.992803017950731e-05, + "loss": 0.9061, + "step": 4783 + }, + { + "epoch": 0.7810293457409901, + "grad_norm": 2.3653757572174072, + "learning_rate": 1.9927992171141707e-05, + "loss": 0.9392, + "step": 4784 + }, + { + "epoch": 0.7811926043834946, + "grad_norm": 1.8838850259780884, + "learning_rate": 1.9927954152778618e-05, + "loss": 0.8839, + "step": 4785 + }, + { + "epoch": 0.7813558630259989, + "grad_norm": 1.7974094152450562, + "learning_rate": 1.9927916124418084e-05, + "loss": 0.7637, + "step": 4786 + }, + { + "epoch": 0.7815191216685033, + "grad_norm": 1.8508896827697754, + "learning_rate": 1.9927878086060136e-05, + "loss": 0.7516, + "step": 4787 + }, + { + "epoch": 0.7816823803110077, + "grad_norm": 1.7573643922805786, + "learning_rate": 1.9927840037704823e-05, + "loss": 0.8523, + "step": 4788 + }, + { + "epoch": 0.7818456389535121, + "grad_norm": 1.8613853454589844, + "learning_rate": 1.9927801979352174e-05, + "loss": 0.8423, + "step": 4789 + }, + { + "epoch": 0.7820088975960164, + "grad_norm": 1.8870909214019775, + "learning_rate": 1.9927763911002232e-05, + "loss": 0.9155, + "step": 4790 + }, + { + "epoch": 0.7821721562385209, + "grad_norm": 1.7808561325073242, + "learning_rate": 1.9927725832655035e-05, + "loss": 0.6944, + "step": 4791 + }, + { + "epoch": 0.7823354148810253, + "grad_norm": 1.8420054912567139, + "learning_rate": 1.992768774431062e-05, + "loss": 0.7751, + "step": 4792 + }, + { + "epoch": 0.7824986735235296, + "grad_norm": 1.9556150436401367, + "learning_rate": 1.9927649645969026e-05, + "loss": 0.7477, + "step": 4793 + }, + { + "epoch": 0.7826619321660341, + "grad_norm": 2.1239070892333984, + "learning_rate": 1.9927611537630293e-05, + "loss": 0.7029, + "step": 4794 + }, + { + "epoch": 0.7828251908085384, + "grad_norm": 2.495624542236328, + "learning_rate": 1.9927573419294456e-05, + "loss": 0.8544, + "step": 4795 + }, + { + "epoch": 0.7829884494510428, + "grad_norm": 1.9147332906723022, + "learning_rate": 1.9927535290961558e-05, + "loss": 0.8788, + "step": 4796 + }, + { + "epoch": 0.7831517080935472, + "grad_norm": 1.638369083404541, + "learning_rate": 1.992749715263163e-05, + "loss": 0.6212, + "step": 4797 + }, + { + "epoch": 0.7833149667360516, + "grad_norm": 1.7015105485916138, + "learning_rate": 1.992745900430472e-05, + "loss": 0.6655, + "step": 4798 + }, + { + "epoch": 0.7834782253785559, + "grad_norm": 1.9030895233154297, + "learning_rate": 1.9927420845980857e-05, + "loss": 0.8161, + "step": 4799 + }, + { + "epoch": 0.7836414840210604, + "grad_norm": 1.6917569637298584, + "learning_rate": 1.992738267766009e-05, + "loss": 0.8033, + "step": 4800 + }, + { + "epoch": 0.7838047426635647, + "grad_norm": 1.9744513034820557, + "learning_rate": 1.992734449934244e-05, + "loss": 0.8762, + "step": 4801 + }, + { + "epoch": 0.7839680013060691, + "grad_norm": 2.2937164306640625, + "learning_rate": 1.992730631102797e-05, + "loss": 1.0305, + "step": 4802 + }, + { + "epoch": 0.7841312599485736, + "grad_norm": 2.205116033554077, + "learning_rate": 1.9927268112716698e-05, + "loss": 0.8747, + "step": 4803 + }, + { + "epoch": 0.7842945185910779, + "grad_norm": 1.6136376857757568, + "learning_rate": 1.992722990440867e-05, + "loss": 0.673, + "step": 4804 + }, + { + "epoch": 0.7844577772335823, + "grad_norm": 1.7520679235458374, + "learning_rate": 1.9927191686103924e-05, + "loss": 0.8504, + "step": 4805 + }, + { + "epoch": 0.7846210358760867, + "grad_norm": 1.8477044105529785, + "learning_rate": 1.99271534578025e-05, + "loss": 0.8235, + "step": 4806 + }, + { + "epoch": 0.7847842945185911, + "grad_norm": 1.7849783897399902, + "learning_rate": 1.9927115219504433e-05, + "loss": 0.9249, + "step": 4807 + }, + { + "epoch": 0.7849475531610954, + "grad_norm": 1.8515146970748901, + "learning_rate": 1.9927076971209765e-05, + "loss": 0.7988, + "step": 4808 + }, + { + "epoch": 0.7851108118035999, + "grad_norm": 2.2251012325286865, + "learning_rate": 1.9927038712918532e-05, + "loss": 0.825, + "step": 4809 + }, + { + "epoch": 0.7852740704461042, + "grad_norm": 1.7487651109695435, + "learning_rate": 1.9927000444630776e-05, + "loss": 0.9116, + "step": 4810 + }, + { + "epoch": 0.7854373290886086, + "grad_norm": 2.046638250350952, + "learning_rate": 1.992696216634653e-05, + "loss": 0.7812, + "step": 4811 + }, + { + "epoch": 0.785600587731113, + "grad_norm": 2.1835498809814453, + "learning_rate": 1.992692387806584e-05, + "loss": 0.8792, + "step": 4812 + }, + { + "epoch": 0.7857638463736174, + "grad_norm": 1.9345844984054565, + "learning_rate": 1.9926885579788736e-05, + "loss": 0.7299, + "step": 4813 + }, + { + "epoch": 0.7859271050161218, + "grad_norm": 1.4853423833847046, + "learning_rate": 1.9926847271515265e-05, + "loss": 0.7281, + "step": 4814 + }, + { + "epoch": 0.7860903636586262, + "grad_norm": 1.7897347211837769, + "learning_rate": 1.9926808953245457e-05, + "loss": 0.7749, + "step": 4815 + }, + { + "epoch": 0.7862536223011306, + "grad_norm": 2.2011101245880127, + "learning_rate": 1.992677062497936e-05, + "loss": 0.9332, + "step": 4816 + }, + { + "epoch": 0.7864168809436349, + "grad_norm": 1.8387035131454468, + "learning_rate": 1.9926732286717005e-05, + "loss": 0.732, + "step": 4817 + }, + { + "epoch": 0.7865801395861394, + "grad_norm": 1.8893117904663086, + "learning_rate": 1.9926693938458432e-05, + "loss": 0.8071, + "step": 4818 + }, + { + "epoch": 0.7867433982286437, + "grad_norm": 2.084770679473877, + "learning_rate": 1.992665558020368e-05, + "loss": 0.9329, + "step": 4819 + }, + { + "epoch": 0.7869066568711481, + "grad_norm": 2.0545647144317627, + "learning_rate": 1.9926617211952793e-05, + "loss": 0.8625, + "step": 4820 + }, + { + "epoch": 0.7870699155136525, + "grad_norm": 1.4702197313308716, + "learning_rate": 1.9926578833705802e-05, + "loss": 0.6962, + "step": 4821 + }, + { + "epoch": 0.7872331741561569, + "grad_norm": 2.0203728675842285, + "learning_rate": 1.992654044546275e-05, + "loss": 0.74, + "step": 4822 + }, + { + "epoch": 0.7873964327986612, + "grad_norm": 1.8938812017440796, + "learning_rate": 1.9926502047223674e-05, + "loss": 0.8473, + "step": 4823 + }, + { + "epoch": 0.7875596914411657, + "grad_norm": 2.1865949630737305, + "learning_rate": 1.992646363898861e-05, + "loss": 0.7746, + "step": 4824 + }, + { + "epoch": 0.7877229500836701, + "grad_norm": 1.8459864854812622, + "learning_rate": 1.9926425220757607e-05, + "loss": 0.786, + "step": 4825 + }, + { + "epoch": 0.7878862087261744, + "grad_norm": 2.072862386703491, + "learning_rate": 1.992638679253069e-05, + "loss": 0.9947, + "step": 4826 + }, + { + "epoch": 0.7880494673686789, + "grad_norm": 2.2285473346710205, + "learning_rate": 1.9926348354307906e-05, + "loss": 0.7423, + "step": 4827 + }, + { + "epoch": 0.7882127260111832, + "grad_norm": 1.7071491479873657, + "learning_rate": 1.992630990608929e-05, + "loss": 0.7249, + "step": 4828 + }, + { + "epoch": 0.7883759846536876, + "grad_norm": 2.0336320400238037, + "learning_rate": 1.9926271447874885e-05, + "loss": 0.7545, + "step": 4829 + }, + { + "epoch": 0.788539243296192, + "grad_norm": 1.8453136682510376, + "learning_rate": 1.9926232979664727e-05, + "loss": 0.7947, + "step": 4830 + }, + { + "epoch": 0.7887025019386964, + "grad_norm": 1.8886222839355469, + "learning_rate": 1.9926194501458856e-05, + "loss": 0.8197, + "step": 4831 + }, + { + "epoch": 0.7888657605812007, + "grad_norm": 2.2592084407806396, + "learning_rate": 1.992615601325731e-05, + "loss": 0.8515, + "step": 4832 + }, + { + "epoch": 0.7890290192237052, + "grad_norm": 2.0957882404327393, + "learning_rate": 1.9926117515060124e-05, + "loss": 0.844, + "step": 4833 + }, + { + "epoch": 0.7891922778662095, + "grad_norm": 2.1119346618652344, + "learning_rate": 1.992607900686734e-05, + "loss": 0.9145, + "step": 4834 + }, + { + "epoch": 0.7893555365087139, + "grad_norm": 1.541168212890625, + "learning_rate": 1.9926040488679e-05, + "loss": 0.6781, + "step": 4835 + }, + { + "epoch": 0.7895187951512184, + "grad_norm": 2.149961233139038, + "learning_rate": 1.992600196049514e-05, + "loss": 0.8346, + "step": 4836 + }, + { + "epoch": 0.7896820537937227, + "grad_norm": 2.000056266784668, + "learning_rate": 1.99259634223158e-05, + "loss": 0.9461, + "step": 4837 + }, + { + "epoch": 0.7898453124362271, + "grad_norm": 1.895493745803833, + "learning_rate": 1.9925924874141014e-05, + "loss": 0.924, + "step": 4838 + }, + { + "epoch": 0.7900085710787315, + "grad_norm": 2.0710206031799316, + "learning_rate": 1.9925886315970825e-05, + "loss": 1.0897, + "step": 4839 + }, + { + "epoch": 0.7901718297212359, + "grad_norm": 1.845704436302185, + "learning_rate": 1.9925847747805274e-05, + "loss": 0.7711, + "step": 4840 + }, + { + "epoch": 0.7903350883637402, + "grad_norm": 1.8371671438217163, + "learning_rate": 1.9925809169644395e-05, + "loss": 0.6374, + "step": 4841 + }, + { + "epoch": 0.7904983470062447, + "grad_norm": 1.8138757944107056, + "learning_rate": 1.9925770581488226e-05, + "loss": 0.8355, + "step": 4842 + }, + { + "epoch": 0.790661605648749, + "grad_norm": 2.237771511077881, + "learning_rate": 1.9925731983336814e-05, + "loss": 0.9534, + "step": 4843 + }, + { + "epoch": 0.7908248642912534, + "grad_norm": 2.1276376247406006, + "learning_rate": 1.9925693375190187e-05, + "loss": 0.7991, + "step": 4844 + }, + { + "epoch": 0.7909881229337578, + "grad_norm": 2.7150862216949463, + "learning_rate": 1.9925654757048394e-05, + "loss": 0.9995, + "step": 4845 + }, + { + "epoch": 0.7911513815762622, + "grad_norm": 2.2937488555908203, + "learning_rate": 1.9925616128911467e-05, + "loss": 1.0305, + "step": 4846 + }, + { + "epoch": 0.7913146402187666, + "grad_norm": 1.6939880847930908, + "learning_rate": 1.992557749077945e-05, + "loss": 0.6885, + "step": 4847 + }, + { + "epoch": 0.791477898861271, + "grad_norm": 1.8745018243789673, + "learning_rate": 1.9925538842652376e-05, + "loss": 0.7845, + "step": 4848 + }, + { + "epoch": 0.7916411575037754, + "grad_norm": 1.8437085151672363, + "learning_rate": 1.9925500184530286e-05, + "loss": 0.8282, + "step": 4849 + }, + { + "epoch": 0.7918044161462797, + "grad_norm": 1.677315354347229, + "learning_rate": 1.9925461516413224e-05, + "loss": 0.7354, + "step": 4850 + }, + { + "epoch": 0.7919676747887842, + "grad_norm": 1.949843168258667, + "learning_rate": 1.992542283830122e-05, + "loss": 0.7897, + "step": 4851 + }, + { + "epoch": 0.7921309334312885, + "grad_norm": 1.8098435401916504, + "learning_rate": 1.992538415019432e-05, + "loss": 0.9394, + "step": 4852 + }, + { + "epoch": 0.7922941920737929, + "grad_norm": 1.827813744544983, + "learning_rate": 1.992534545209256e-05, + "loss": 0.9133, + "step": 4853 + }, + { + "epoch": 0.7924574507162973, + "grad_norm": 2.09625506401062, + "learning_rate": 1.9925306743995984e-05, + "loss": 0.8424, + "step": 4854 + }, + { + "epoch": 0.7926207093588017, + "grad_norm": 1.937185287475586, + "learning_rate": 1.9925268025904622e-05, + "loss": 0.8087, + "step": 4855 + }, + { + "epoch": 0.792783968001306, + "grad_norm": 2.0197770595550537, + "learning_rate": 1.992522929781852e-05, + "loss": 0.8478, + "step": 4856 + }, + { + "epoch": 0.7929472266438105, + "grad_norm": 1.6214985847473145, + "learning_rate": 1.9925190559737714e-05, + "loss": 0.5859, + "step": 4857 + }, + { + "epoch": 0.7931104852863149, + "grad_norm": 2.090153694152832, + "learning_rate": 1.9925151811662243e-05, + "loss": 0.9524, + "step": 4858 + }, + { + "epoch": 0.7932737439288192, + "grad_norm": 1.8472976684570312, + "learning_rate": 1.992511305359215e-05, + "loss": 0.7586, + "step": 4859 + }, + { + "epoch": 0.7934370025713237, + "grad_norm": 2.0367839336395264, + "learning_rate": 1.9925074285527467e-05, + "loss": 0.7588, + "step": 4860 + }, + { + "epoch": 0.793600261213828, + "grad_norm": 1.6678805351257324, + "learning_rate": 1.992503550746824e-05, + "loss": 0.7385, + "step": 4861 + }, + { + "epoch": 0.7937635198563324, + "grad_norm": 1.7842092514038086, + "learning_rate": 1.9924996719414503e-05, + "loss": 0.7824, + "step": 4862 + }, + { + "epoch": 0.7939267784988367, + "grad_norm": 2.0741941928863525, + "learning_rate": 1.9924957921366298e-05, + "loss": 0.9028, + "step": 4863 + }, + { + "epoch": 0.7940900371413412, + "grad_norm": 2.0686750411987305, + "learning_rate": 1.992491911332366e-05, + "loss": 0.8946, + "step": 4864 + }, + { + "epoch": 0.7942532957838455, + "grad_norm": 1.8333791494369507, + "learning_rate": 1.9924880295286634e-05, + "loss": 0.7942, + "step": 4865 + }, + { + "epoch": 0.79441655442635, + "grad_norm": 1.8150825500488281, + "learning_rate": 1.9924841467255254e-05, + "loss": 0.7016, + "step": 4866 + }, + { + "epoch": 0.7945798130688543, + "grad_norm": 1.9416899681091309, + "learning_rate": 1.9924802629229563e-05, + "loss": 0.7524, + "step": 4867 + }, + { + "epoch": 0.7947430717113587, + "grad_norm": 1.6752439737319946, + "learning_rate": 1.99247637812096e-05, + "loss": 0.777, + "step": 4868 + }, + { + "epoch": 0.7949063303538632, + "grad_norm": 2.1142969131469727, + "learning_rate": 1.9924724923195397e-05, + "loss": 0.9306, + "step": 4869 + }, + { + "epoch": 0.7950695889963675, + "grad_norm": 1.9637212753295898, + "learning_rate": 1.9924686055187003e-05, + "loss": 0.8004, + "step": 4870 + }, + { + "epoch": 0.7952328476388719, + "grad_norm": 1.7740687131881714, + "learning_rate": 1.9924647177184453e-05, + "loss": 0.6823, + "step": 4871 + }, + { + "epoch": 0.7953961062813762, + "grad_norm": 1.6591767072677612, + "learning_rate": 1.9924608289187786e-05, + "loss": 0.7141, + "step": 4872 + }, + { + "epoch": 0.7955593649238807, + "grad_norm": 1.8841469287872314, + "learning_rate": 1.9924569391197038e-05, + "loss": 0.7758, + "step": 4873 + }, + { + "epoch": 0.795722623566385, + "grad_norm": 1.7900296449661255, + "learning_rate": 1.9924530483212253e-05, + "loss": 0.7176, + "step": 4874 + }, + { + "epoch": 0.7958858822088895, + "grad_norm": 2.041778802871704, + "learning_rate": 1.992449156523347e-05, + "loss": 0.7577, + "step": 4875 + }, + { + "epoch": 0.7960491408513938, + "grad_norm": 1.648495078086853, + "learning_rate": 1.9924452637260726e-05, + "loss": 0.6537, + "step": 4876 + }, + { + "epoch": 0.7962123994938982, + "grad_norm": 1.8487268686294556, + "learning_rate": 1.9924413699294058e-05, + "loss": 0.7848, + "step": 4877 + }, + { + "epoch": 0.7963756581364025, + "grad_norm": 2.021528482437134, + "learning_rate": 1.9924374751333512e-05, + "loss": 0.8082, + "step": 4878 + }, + { + "epoch": 0.796538916778907, + "grad_norm": 2.3087358474731445, + "learning_rate": 1.9924335793379123e-05, + "loss": 0.7887, + "step": 4879 + }, + { + "epoch": 0.7967021754214114, + "grad_norm": 2.0047802925109863, + "learning_rate": 1.9924296825430928e-05, + "loss": 0.8623, + "step": 4880 + }, + { + "epoch": 0.7968654340639157, + "grad_norm": 1.678239107131958, + "learning_rate": 1.9924257847488967e-05, + "loss": 0.7003, + "step": 4881 + }, + { + "epoch": 0.7970286927064202, + "grad_norm": 1.731492519378662, + "learning_rate": 1.9924218859553288e-05, + "loss": 0.708, + "step": 4882 + }, + { + "epoch": 0.7971919513489245, + "grad_norm": 2.324237585067749, + "learning_rate": 1.9924179861623917e-05, + "loss": 0.836, + "step": 4883 + }, + { + "epoch": 0.797355209991429, + "grad_norm": 1.8351635932922363, + "learning_rate": 1.99241408537009e-05, + "loss": 0.6634, + "step": 4884 + }, + { + "epoch": 0.7975184686339333, + "grad_norm": 2.070648670196533, + "learning_rate": 1.9924101835784276e-05, + "loss": 0.8651, + "step": 4885 + }, + { + "epoch": 0.7976817272764377, + "grad_norm": 1.9946140050888062, + "learning_rate": 1.9924062807874086e-05, + "loss": 0.8188, + "step": 4886 + }, + { + "epoch": 0.797844985918942, + "grad_norm": 1.9841363430023193, + "learning_rate": 1.9924023769970368e-05, + "loss": 0.891, + "step": 4887 + }, + { + "epoch": 0.7980082445614465, + "grad_norm": 2.1555466651916504, + "learning_rate": 1.992398472207316e-05, + "loss": 0.9885, + "step": 4888 + }, + { + "epoch": 0.7981715032039508, + "grad_norm": 2.10235857963562, + "learning_rate": 1.99239456641825e-05, + "loss": 0.8864, + "step": 4889 + }, + { + "epoch": 0.7983347618464552, + "grad_norm": 2.0317468643188477, + "learning_rate": 1.992390659629843e-05, + "loss": 0.6946, + "step": 4890 + }, + { + "epoch": 0.7984980204889597, + "grad_norm": 1.592612385749817, + "learning_rate": 1.9923867518420993e-05, + "loss": 0.5966, + "step": 4891 + }, + { + "epoch": 0.798661279131464, + "grad_norm": 1.9969428777694702, + "learning_rate": 1.9923828430550215e-05, + "loss": 0.7439, + "step": 4892 + }, + { + "epoch": 0.7988245377739684, + "grad_norm": 1.888777256011963, + "learning_rate": 1.9923789332686153e-05, + "loss": 0.8761, + "step": 4893 + }, + { + "epoch": 0.7989877964164728, + "grad_norm": 1.9656144380569458, + "learning_rate": 1.9923750224828833e-05, + "loss": 0.8355, + "step": 4894 + }, + { + "epoch": 0.7991510550589772, + "grad_norm": 1.9136943817138672, + "learning_rate": 1.99237111069783e-05, + "loss": 0.8629, + "step": 4895 + }, + { + "epoch": 0.7993143137014815, + "grad_norm": 1.7606338262557983, + "learning_rate": 1.9923671979134594e-05, + "loss": 0.7806, + "step": 4896 + }, + { + "epoch": 0.799477572343986, + "grad_norm": 1.8844459056854248, + "learning_rate": 1.9923632841297753e-05, + "loss": 0.8732, + "step": 4897 + }, + { + "epoch": 0.7996408309864903, + "grad_norm": 1.7217713594436646, + "learning_rate": 1.9923593693467816e-05, + "loss": 0.6952, + "step": 4898 + }, + { + "epoch": 0.7998040896289947, + "grad_norm": 1.9081617593765259, + "learning_rate": 1.9923554535644823e-05, + "loss": 0.9705, + "step": 4899 + }, + { + "epoch": 0.7999673482714991, + "grad_norm": 1.782641887664795, + "learning_rate": 1.9923515367828812e-05, + "loss": 0.7567, + "step": 4900 + }, + { + "epoch": 0.8001306069140035, + "grad_norm": 1.5512231588363647, + "learning_rate": 1.9923476190019825e-05, + "loss": 0.5722, + "step": 4901 + }, + { + "epoch": 0.8002938655565079, + "grad_norm": 1.762524962425232, + "learning_rate": 1.99234370022179e-05, + "loss": 0.683, + "step": 4902 + }, + { + "epoch": 0.8004571241990123, + "grad_norm": 1.9655622243881226, + "learning_rate": 1.9923397804423075e-05, + "loss": 0.8997, + "step": 4903 + }, + { + "epoch": 0.8006203828415167, + "grad_norm": 2.107661008834839, + "learning_rate": 1.9923358596635393e-05, + "loss": 0.8201, + "step": 4904 + }, + { + "epoch": 0.800783641484021, + "grad_norm": 2.104870080947876, + "learning_rate": 1.9923319378854888e-05, + "loss": 0.9219, + "step": 4905 + }, + { + "epoch": 0.8009469001265255, + "grad_norm": 2.1182050704956055, + "learning_rate": 1.992328015108161e-05, + "loss": 0.8201, + "step": 4906 + }, + { + "epoch": 0.8011101587690298, + "grad_norm": 1.873949646949768, + "learning_rate": 1.9923240913315585e-05, + "loss": 0.8274, + "step": 4907 + }, + { + "epoch": 0.8012734174115342, + "grad_norm": 1.82496178150177, + "learning_rate": 1.992320166555686e-05, + "loss": 0.8341, + "step": 4908 + }, + { + "epoch": 0.8014366760540386, + "grad_norm": 2.264035940170288, + "learning_rate": 1.9923162407805475e-05, + "loss": 0.8222, + "step": 4909 + }, + { + "epoch": 0.801599934696543, + "grad_norm": 1.9848077297210693, + "learning_rate": 1.9923123140061467e-05, + "loss": 0.9032, + "step": 4910 + }, + { + "epoch": 0.8017631933390473, + "grad_norm": 1.8136736154556274, + "learning_rate": 1.9923083862324876e-05, + "loss": 0.8664, + "step": 4911 + }, + { + "epoch": 0.8019264519815518, + "grad_norm": 1.7448256015777588, + "learning_rate": 1.9923044574595746e-05, + "loss": 0.7248, + "step": 4912 + }, + { + "epoch": 0.8020897106240562, + "grad_norm": 2.084658622741699, + "learning_rate": 1.992300527687411e-05, + "loss": 0.8148, + "step": 4913 + }, + { + "epoch": 0.8022529692665605, + "grad_norm": 1.652200698852539, + "learning_rate": 1.9922965969160007e-05, + "loss": 0.7118, + "step": 4914 + }, + { + "epoch": 0.802416227909065, + "grad_norm": 1.9420719146728516, + "learning_rate": 1.9922926651453487e-05, + "loss": 0.8443, + "step": 4915 + }, + { + "epoch": 0.8025794865515693, + "grad_norm": 2.0187103748321533, + "learning_rate": 1.992288732375458e-05, + "loss": 0.7311, + "step": 4916 + }, + { + "epoch": 0.8027427451940737, + "grad_norm": 2.107795476913452, + "learning_rate": 1.9922847986063326e-05, + "loss": 0.6756, + "step": 4917 + }, + { + "epoch": 0.8029060038365781, + "grad_norm": 2.0052661895751953, + "learning_rate": 1.9922808638379767e-05, + "loss": 0.7797, + "step": 4918 + }, + { + "epoch": 0.8030692624790825, + "grad_norm": 1.8871933221817017, + "learning_rate": 1.9922769280703944e-05, + "loss": 0.7946, + "step": 4919 + }, + { + "epoch": 0.8032325211215868, + "grad_norm": 1.9102355241775513, + "learning_rate": 1.9922729913035893e-05, + "loss": 0.9598, + "step": 4920 + }, + { + "epoch": 0.8033957797640913, + "grad_norm": 1.4432445764541626, + "learning_rate": 1.9922690535375656e-05, + "loss": 0.6567, + "step": 4921 + }, + { + "epoch": 0.8035590384065956, + "grad_norm": 1.935724139213562, + "learning_rate": 1.9922651147723275e-05, + "loss": 0.8524, + "step": 4922 + }, + { + "epoch": 0.8037222970491, + "grad_norm": 1.9747815132141113, + "learning_rate": 1.9922611750078783e-05, + "loss": 0.8371, + "step": 4923 + }, + { + "epoch": 0.8038855556916045, + "grad_norm": 2.0641162395477295, + "learning_rate": 1.9922572342442225e-05, + "loss": 0.5564, + "step": 4924 + }, + { + "epoch": 0.8040488143341088, + "grad_norm": 1.9731000661849976, + "learning_rate": 1.992253292481364e-05, + "loss": 0.8307, + "step": 4925 + }, + { + "epoch": 0.8042120729766132, + "grad_norm": 2.0066750049591064, + "learning_rate": 1.992249349719307e-05, + "loss": 0.8237, + "step": 4926 + }, + { + "epoch": 0.8043753316191176, + "grad_norm": 2.580623149871826, + "learning_rate": 1.9922454059580543e-05, + "loss": 0.7891, + "step": 4927 + }, + { + "epoch": 0.804538590261622, + "grad_norm": 2.139864206314087, + "learning_rate": 1.9922414611976116e-05, + "loss": 0.8092, + "step": 4928 + }, + { + "epoch": 0.8047018489041263, + "grad_norm": 1.8119847774505615, + "learning_rate": 1.9922375154379818e-05, + "loss": 0.8028, + "step": 4929 + }, + { + "epoch": 0.8048651075466308, + "grad_norm": 1.9565775394439697, + "learning_rate": 1.992233568679169e-05, + "loss": 0.807, + "step": 4930 + }, + { + "epoch": 0.8050283661891351, + "grad_norm": 1.834346890449524, + "learning_rate": 1.992229620921177e-05, + "loss": 0.8014, + "step": 4931 + }, + { + "epoch": 0.8051916248316395, + "grad_norm": 1.9296114444732666, + "learning_rate": 1.9922256721640104e-05, + "loss": 0.7247, + "step": 4932 + }, + { + "epoch": 0.805354883474144, + "grad_norm": 1.9737929105758667, + "learning_rate": 1.992221722407673e-05, + "loss": 0.7077, + "step": 4933 + }, + { + "epoch": 0.8055181421166483, + "grad_norm": 2.2124643325805664, + "learning_rate": 1.9922177716521678e-05, + "loss": 0.9552, + "step": 4934 + }, + { + "epoch": 0.8056814007591527, + "grad_norm": 1.9864575862884521, + "learning_rate": 1.9922138198975003e-05, + "loss": 0.7147, + "step": 4935 + }, + { + "epoch": 0.8058446594016571, + "grad_norm": 2.0567729473114014, + "learning_rate": 1.9922098671436734e-05, + "loss": 0.8462, + "step": 4936 + }, + { + "epoch": 0.8060079180441615, + "grad_norm": 1.8009942770004272, + "learning_rate": 1.9922059133906915e-05, + "loss": 0.9017, + "step": 4937 + }, + { + "epoch": 0.8061711766866658, + "grad_norm": 1.920516848564148, + "learning_rate": 1.9922019586385587e-05, + "loss": 0.7646, + "step": 4938 + }, + { + "epoch": 0.8063344353291703, + "grad_norm": 1.9540677070617676, + "learning_rate": 1.9921980028872784e-05, + "loss": 0.8017, + "step": 4939 + }, + { + "epoch": 0.8064976939716746, + "grad_norm": 1.7938027381896973, + "learning_rate": 1.9921940461368552e-05, + "loss": 0.838, + "step": 4940 + }, + { + "epoch": 0.806660952614179, + "grad_norm": 1.694833755493164, + "learning_rate": 1.9921900883872927e-05, + "loss": 0.6853, + "step": 4941 + }, + { + "epoch": 0.8068242112566834, + "grad_norm": 1.9588202238082886, + "learning_rate": 1.9921861296385952e-05, + "loss": 0.8206, + "step": 4942 + }, + { + "epoch": 0.8069874698991878, + "grad_norm": 2.0975844860076904, + "learning_rate": 1.9921821698907668e-05, + "loss": 0.8198, + "step": 4943 + }, + { + "epoch": 0.8071507285416922, + "grad_norm": 1.7575784921646118, + "learning_rate": 1.9921782091438108e-05, + "loss": 0.7722, + "step": 4944 + }, + { + "epoch": 0.8073139871841966, + "grad_norm": 1.9443762302398682, + "learning_rate": 1.9921742473977317e-05, + "loss": 0.7943, + "step": 4945 + }, + { + "epoch": 0.807477245826701, + "grad_norm": 1.9529401063919067, + "learning_rate": 1.9921702846525335e-05, + "loss": 0.8789, + "step": 4946 + }, + { + "epoch": 0.8076405044692053, + "grad_norm": 1.9852482080459595, + "learning_rate": 1.99216632090822e-05, + "loss": 0.8449, + "step": 4947 + }, + { + "epoch": 0.8078037631117098, + "grad_norm": 1.942453145980835, + "learning_rate": 1.9921623561647952e-05, + "loss": 0.7073, + "step": 4948 + }, + { + "epoch": 0.8079670217542141, + "grad_norm": 1.9876902103424072, + "learning_rate": 1.9921583904222636e-05, + "loss": 0.8965, + "step": 4949 + }, + { + "epoch": 0.8081302803967185, + "grad_norm": 2.0401511192321777, + "learning_rate": 1.9921544236806284e-05, + "loss": 0.9623, + "step": 4950 + }, + { + "epoch": 0.8082935390392229, + "grad_norm": 1.8046239614486694, + "learning_rate": 1.992150455939894e-05, + "loss": 0.6899, + "step": 4951 + }, + { + "epoch": 0.8084567976817273, + "grad_norm": 1.8443927764892578, + "learning_rate": 1.9921464872000643e-05, + "loss": 0.8912, + "step": 4952 + }, + { + "epoch": 0.8086200563242316, + "grad_norm": 2.1088807582855225, + "learning_rate": 1.9921425174611435e-05, + "loss": 0.7481, + "step": 4953 + }, + { + "epoch": 0.808783314966736, + "grad_norm": 2.0166337490081787, + "learning_rate": 1.992138546723135e-05, + "loss": 0.858, + "step": 4954 + }, + { + "epoch": 0.8089465736092405, + "grad_norm": 1.9046247005462646, + "learning_rate": 1.9921345749860438e-05, + "loss": 0.8141, + "step": 4955 + }, + { + "epoch": 0.8091098322517448, + "grad_norm": 2.1999142169952393, + "learning_rate": 1.992130602249873e-05, + "loss": 0.8141, + "step": 4956 + }, + { + "epoch": 0.8092730908942493, + "grad_norm": 1.3705949783325195, + "learning_rate": 1.992126628514627e-05, + "loss": 0.5902, + "step": 4957 + }, + { + "epoch": 0.8094363495367536, + "grad_norm": 1.646638035774231, + "learning_rate": 1.99212265378031e-05, + "loss": 0.6318, + "step": 4958 + }, + { + "epoch": 0.809599608179258, + "grad_norm": 1.901304006576538, + "learning_rate": 1.9921186780469256e-05, + "loss": 0.8359, + "step": 4959 + }, + { + "epoch": 0.8097628668217624, + "grad_norm": 2.0890119075775146, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.8099261254642668, + "grad_norm": 2.219465732574463, + "learning_rate": 1.992110723582971e-05, + "loss": 0.7566, + "step": 4961 + }, + { + "epoch": 0.8100893841067711, + "grad_norm": 2.0533900260925293, + "learning_rate": 1.992106744852409e-05, + "loss": 0.8146, + "step": 4962 + }, + { + "epoch": 0.8102526427492756, + "grad_norm": 1.624820590019226, + "learning_rate": 1.9921027651227954e-05, + "loss": 0.7129, + "step": 4963 + }, + { + "epoch": 0.8104159013917799, + "grad_norm": 1.7947266101837158, + "learning_rate": 1.992098784394135e-05, + "loss": 0.7161, + "step": 4964 + }, + { + "epoch": 0.8105791600342843, + "grad_norm": 2.0412700176239014, + "learning_rate": 1.992094802666431e-05, + "loss": 0.8682, + "step": 4965 + }, + { + "epoch": 0.8107424186767888, + "grad_norm": 2.7092697620391846, + "learning_rate": 1.992090819939688e-05, + "loss": 0.7957, + "step": 4966 + }, + { + "epoch": 0.8109056773192931, + "grad_norm": 1.8372575044631958, + "learning_rate": 1.99208683621391e-05, + "loss": 0.731, + "step": 4967 + }, + { + "epoch": 0.8110689359617975, + "grad_norm": 2.0023632049560547, + "learning_rate": 1.9920828514891007e-05, + "loss": 0.7215, + "step": 4968 + }, + { + "epoch": 0.8112321946043018, + "grad_norm": 1.9355003833770752, + "learning_rate": 1.992078865765264e-05, + "loss": 0.8644, + "step": 4969 + }, + { + "epoch": 0.8113954532468063, + "grad_norm": 1.875082015991211, + "learning_rate": 1.9920748790424043e-05, + "loss": 0.8122, + "step": 4970 + }, + { + "epoch": 0.8115587118893106, + "grad_norm": 1.6935220956802368, + "learning_rate": 1.9920708913205254e-05, + "loss": 0.718, + "step": 4971 + }, + { + "epoch": 0.811721970531815, + "grad_norm": 1.9387755393981934, + "learning_rate": 1.9920669025996314e-05, + "loss": 0.7114, + "step": 4972 + }, + { + "epoch": 0.8118852291743194, + "grad_norm": 2.08963680267334, + "learning_rate": 1.9920629128797265e-05, + "loss": 0.81, + "step": 4973 + }, + { + "epoch": 0.8120484878168238, + "grad_norm": 1.8642210960388184, + "learning_rate": 1.9920589221608143e-05, + "loss": 0.9308, + "step": 4974 + }, + { + "epoch": 0.8122117464593281, + "grad_norm": 2.2590415477752686, + "learning_rate": 1.9920549304428992e-05, + "loss": 0.6884, + "step": 4975 + }, + { + "epoch": 0.8123750051018326, + "grad_norm": 2.1210553646087646, + "learning_rate": 1.992050937725985e-05, + "loss": 1.0297, + "step": 4976 + }, + { + "epoch": 0.812538263744337, + "grad_norm": 2.174837350845337, + "learning_rate": 1.9920469440100757e-05, + "loss": 0.9024, + "step": 4977 + }, + { + "epoch": 0.8127015223868413, + "grad_norm": 2.0452444553375244, + "learning_rate": 1.992042949295175e-05, + "loss": 1.0108, + "step": 4978 + }, + { + "epoch": 0.8128647810293458, + "grad_norm": 1.9397871494293213, + "learning_rate": 1.992038953581288e-05, + "loss": 0.7674, + "step": 4979 + }, + { + "epoch": 0.8130280396718501, + "grad_norm": 2.1064770221710205, + "learning_rate": 1.992034956868418e-05, + "loss": 0.8802, + "step": 4980 + }, + { + "epoch": 0.8131912983143545, + "grad_norm": 2.225207805633545, + "learning_rate": 1.9920309591565684e-05, + "loss": 0.8788, + "step": 4981 + }, + { + "epoch": 0.8133545569568589, + "grad_norm": 1.7960642576217651, + "learning_rate": 1.9920269604457444e-05, + "loss": 0.8891, + "step": 4982 + }, + { + "epoch": 0.8135178155993633, + "grad_norm": 2.098869800567627, + "learning_rate": 1.9920229607359495e-05, + "loss": 0.7232, + "step": 4983 + }, + { + "epoch": 0.8136810742418676, + "grad_norm": 1.8331701755523682, + "learning_rate": 1.9920189600271876e-05, + "loss": 0.8301, + "step": 4984 + }, + { + "epoch": 0.8138443328843721, + "grad_norm": 1.993514060974121, + "learning_rate": 1.992014958319463e-05, + "loss": 0.9245, + "step": 4985 + }, + { + "epoch": 0.8140075915268764, + "grad_norm": 1.4990359544754028, + "learning_rate": 1.9920109556127793e-05, + "loss": 0.6753, + "step": 4986 + }, + { + "epoch": 0.8141708501693808, + "grad_norm": 1.8884570598602295, + "learning_rate": 1.9920069519071414e-05, + "loss": 0.7855, + "step": 4987 + }, + { + "epoch": 0.8143341088118853, + "grad_norm": 2.021421194076538, + "learning_rate": 1.992002947202552e-05, + "loss": 0.8976, + "step": 4988 + }, + { + "epoch": 0.8144973674543896, + "grad_norm": 1.9134283065795898, + "learning_rate": 1.9919989414990164e-05, + "loss": 0.7849, + "step": 4989 + }, + { + "epoch": 0.814660626096894, + "grad_norm": 1.9964030981063843, + "learning_rate": 1.991994934796538e-05, + "loss": 0.8339, + "step": 4990 + }, + { + "epoch": 0.8148238847393984, + "grad_norm": 1.9525470733642578, + "learning_rate": 1.991990927095121e-05, + "loss": 0.868, + "step": 4991 + }, + { + "epoch": 0.8149871433819028, + "grad_norm": 2.0830307006835938, + "learning_rate": 1.9919869183947693e-05, + "loss": 0.9768, + "step": 4992 + }, + { + "epoch": 0.8151504020244071, + "grad_norm": 1.8301317691802979, + "learning_rate": 1.9919829086954872e-05, + "loss": 0.7571, + "step": 4993 + }, + { + "epoch": 0.8153136606669116, + "grad_norm": 1.6224919557571411, + "learning_rate": 1.9919788979972785e-05, + "loss": 0.6798, + "step": 4994 + }, + { + "epoch": 0.8154769193094159, + "grad_norm": 1.7822777032852173, + "learning_rate": 1.9919748863001473e-05, + "loss": 0.7875, + "step": 4995 + }, + { + "epoch": 0.8156401779519203, + "grad_norm": 1.8727947473526, + "learning_rate": 1.9919708736040976e-05, + "loss": 0.9101, + "step": 4996 + }, + { + "epoch": 0.8158034365944247, + "grad_norm": 1.812536597251892, + "learning_rate": 1.9919668599091334e-05, + "loss": 0.7774, + "step": 4997 + }, + { + "epoch": 0.8159666952369291, + "grad_norm": 1.6802173852920532, + "learning_rate": 1.9919628452152592e-05, + "loss": 0.6975, + "step": 4998 + }, + { + "epoch": 0.8161299538794335, + "grad_norm": 2.2278597354888916, + "learning_rate": 1.9919588295224784e-05, + "loss": 0.7392, + "step": 4999 + }, + { + "epoch": 0.8162932125219379, + "grad_norm": 1.9044816493988037, + "learning_rate": 1.9919548128307954e-05, + "loss": 0.8257, + "step": 5000 + }, + { + "epoch": 0.8164564711644423, + "grad_norm": 1.8235116004943848, + "learning_rate": 1.9919507951402142e-05, + "loss": 0.8094, + "step": 5001 + }, + { + "epoch": 0.8166197298069466, + "grad_norm": 1.6744729280471802, + "learning_rate": 1.991946776450739e-05, + "loss": 0.6615, + "step": 5002 + }, + { + "epoch": 0.8167829884494511, + "grad_norm": 2.3734798431396484, + "learning_rate": 1.9919427567623732e-05, + "loss": 0.873, + "step": 5003 + }, + { + "epoch": 0.8169462470919554, + "grad_norm": 1.7986395359039307, + "learning_rate": 1.9919387360751216e-05, + "loss": 0.7157, + "step": 5004 + }, + { + "epoch": 0.8171095057344598, + "grad_norm": 1.9178245067596436, + "learning_rate": 1.9919347143889877e-05, + "loss": 0.793, + "step": 5005 + }, + { + "epoch": 0.8172727643769642, + "grad_norm": 1.9701792001724243, + "learning_rate": 1.991930691703976e-05, + "loss": 0.8186, + "step": 5006 + }, + { + "epoch": 0.8174360230194686, + "grad_norm": 2.0716731548309326, + "learning_rate": 1.9919266680200905e-05, + "loss": 0.7329, + "step": 5007 + }, + { + "epoch": 0.8175992816619729, + "grad_norm": 1.9646869897842407, + "learning_rate": 1.991922643337335e-05, + "loss": 0.8934, + "step": 5008 + }, + { + "epoch": 0.8177625403044774, + "grad_norm": 1.7814162969589233, + "learning_rate": 1.9919186176557136e-05, + "loss": 0.7497, + "step": 5009 + }, + { + "epoch": 0.8179257989469818, + "grad_norm": 1.6358851194381714, + "learning_rate": 1.9919145909752305e-05, + "loss": 0.7428, + "step": 5010 + }, + { + "epoch": 0.8180890575894861, + "grad_norm": 1.7643718719482422, + "learning_rate": 1.9919105632958896e-05, + "loss": 0.7474, + "step": 5011 + }, + { + "epoch": 0.8182523162319906, + "grad_norm": 1.8401768207550049, + "learning_rate": 1.991906534617695e-05, + "loss": 0.7184, + "step": 5012 + }, + { + "epoch": 0.8184155748744949, + "grad_norm": 1.85053551197052, + "learning_rate": 1.991902504940651e-05, + "loss": 0.6583, + "step": 5013 + }, + { + "epoch": 0.8185788335169993, + "grad_norm": 1.9926639795303345, + "learning_rate": 1.991898474264761e-05, + "loss": 0.814, + "step": 5014 + }, + { + "epoch": 0.8187420921595037, + "grad_norm": 1.7965377569198608, + "learning_rate": 1.99189444259003e-05, + "loss": 0.8565, + "step": 5015 + }, + { + "epoch": 0.8189053508020081, + "grad_norm": 2.0575406551361084, + "learning_rate": 1.991890409916461e-05, + "loss": 0.947, + "step": 5016 + }, + { + "epoch": 0.8190686094445124, + "grad_norm": 2.526061773300171, + "learning_rate": 1.991886376244059e-05, + "loss": 0.9665, + "step": 5017 + }, + { + "epoch": 0.8192318680870169, + "grad_norm": 1.7794110774993896, + "learning_rate": 1.9918823415728276e-05, + "loss": 0.674, + "step": 5018 + }, + { + "epoch": 0.8193951267295212, + "grad_norm": 1.7379111051559448, + "learning_rate": 1.991878305902771e-05, + "loss": 0.6696, + "step": 5019 + }, + { + "epoch": 0.8195583853720256, + "grad_norm": 1.908668041229248, + "learning_rate": 1.991874269233893e-05, + "loss": 0.8622, + "step": 5020 + }, + { + "epoch": 0.8197216440145301, + "grad_norm": 2.2182705402374268, + "learning_rate": 1.9918702315661985e-05, + "loss": 0.84, + "step": 5021 + }, + { + "epoch": 0.8198849026570344, + "grad_norm": 2.102921962738037, + "learning_rate": 1.9918661928996903e-05, + "loss": 0.8621, + "step": 5022 + }, + { + "epoch": 0.8200481612995388, + "grad_norm": 1.9957351684570312, + "learning_rate": 1.991862153234373e-05, + "loss": 0.9756, + "step": 5023 + }, + { + "epoch": 0.8202114199420432, + "grad_norm": 2.015124797821045, + "learning_rate": 1.9918581125702512e-05, + "loss": 0.7818, + "step": 5024 + }, + { + "epoch": 0.8203746785845476, + "grad_norm": 1.8854998350143433, + "learning_rate": 1.9918540709073288e-05, + "loss": 0.7965, + "step": 5025 + }, + { + "epoch": 0.8205379372270519, + "grad_norm": 2.047118663787842, + "learning_rate": 1.991850028245609e-05, + "loss": 0.6867, + "step": 5026 + }, + { + "epoch": 0.8207011958695564, + "grad_norm": 1.8411861658096313, + "learning_rate": 1.9918459845850967e-05, + "loss": 0.7055, + "step": 5027 + }, + { + "epoch": 0.8208644545120607, + "grad_norm": 2.222003221511841, + "learning_rate": 1.991841939925796e-05, + "loss": 0.8281, + "step": 5028 + }, + { + "epoch": 0.8210277131545651, + "grad_norm": 1.9511771202087402, + "learning_rate": 1.99183789426771e-05, + "loss": 0.8471, + "step": 5029 + }, + { + "epoch": 0.8211909717970695, + "grad_norm": 1.9045811891555786, + "learning_rate": 1.991833847610844e-05, + "loss": 0.8528, + "step": 5030 + }, + { + "epoch": 0.8213542304395739, + "grad_norm": 1.8613239526748657, + "learning_rate": 1.9918297999552018e-05, + "loss": 0.7263, + "step": 5031 + }, + { + "epoch": 0.8215174890820783, + "grad_norm": 1.8274565935134888, + "learning_rate": 1.991825751300787e-05, + "loss": 0.7779, + "step": 5032 + }, + { + "epoch": 0.8216807477245827, + "grad_norm": 2.3139355182647705, + "learning_rate": 1.991821701647604e-05, + "loss": 0.727, + "step": 5033 + }, + { + "epoch": 0.8218440063670871, + "grad_norm": 2.1294779777526855, + "learning_rate": 1.9918176509956568e-05, + "loss": 0.9566, + "step": 5034 + }, + { + "epoch": 0.8220072650095914, + "grad_norm": 2.0486795902252197, + "learning_rate": 1.9918135993449494e-05, + "loss": 0.8147, + "step": 5035 + }, + { + "epoch": 0.8221705236520959, + "grad_norm": 2.0120131969451904, + "learning_rate": 1.9918095466954862e-05, + "loss": 0.7275, + "step": 5036 + }, + { + "epoch": 0.8223337822946002, + "grad_norm": 1.8299384117126465, + "learning_rate": 1.991805493047271e-05, + "loss": 0.8452, + "step": 5037 + }, + { + "epoch": 0.8224970409371046, + "grad_norm": 1.8235822916030884, + "learning_rate": 1.9918014384003074e-05, + "loss": 0.7923, + "step": 5038 + }, + { + "epoch": 0.822660299579609, + "grad_norm": 1.9736616611480713, + "learning_rate": 1.9917973827546006e-05, + "loss": 0.6447, + "step": 5039 + }, + { + "epoch": 0.8228235582221134, + "grad_norm": 1.6450153589248657, + "learning_rate": 1.991793326110154e-05, + "loss": 0.6996, + "step": 5040 + }, + { + "epoch": 0.8229868168646177, + "grad_norm": 2.1436688899993896, + "learning_rate": 1.9917892684669717e-05, + "loss": 0.9107, + "step": 5041 + }, + { + "epoch": 0.8231500755071222, + "grad_norm": 1.8279258012771606, + "learning_rate": 1.9917852098250577e-05, + "loss": 0.6802, + "step": 5042 + }, + { + "epoch": 0.8233133341496266, + "grad_norm": 2.0657525062561035, + "learning_rate": 1.9917811501844166e-05, + "loss": 0.6491, + "step": 5043 + }, + { + "epoch": 0.8234765927921309, + "grad_norm": 1.9762307405471802, + "learning_rate": 1.9917770895450518e-05, + "loss": 0.699, + "step": 5044 + }, + { + "epoch": 0.8236398514346354, + "grad_norm": 2.1198737621307373, + "learning_rate": 1.991773027906968e-05, + "loss": 0.8205, + "step": 5045 + }, + { + "epoch": 0.8238031100771397, + "grad_norm": 2.019064426422119, + "learning_rate": 1.991768965270169e-05, + "loss": 0.8918, + "step": 5046 + }, + { + "epoch": 0.8239663687196441, + "grad_norm": 2.2834441661834717, + "learning_rate": 1.991764901634659e-05, + "loss": 1.3521, + "step": 5047 + }, + { + "epoch": 0.8241296273621485, + "grad_norm": 1.8330230712890625, + "learning_rate": 1.9917608370004417e-05, + "loss": 0.7895, + "step": 5048 + }, + { + "epoch": 0.8242928860046529, + "grad_norm": 1.563226580619812, + "learning_rate": 1.9917567713675216e-05, + "loss": 0.6555, + "step": 5049 + }, + { + "epoch": 0.8244561446471572, + "grad_norm": 1.8024563789367676, + "learning_rate": 1.991752704735903e-05, + "loss": 0.7995, + "step": 5050 + }, + { + "epoch": 0.8246194032896617, + "grad_norm": 1.7589255571365356, + "learning_rate": 1.9917486371055893e-05, + "loss": 0.7803, + "step": 5051 + }, + { + "epoch": 0.824782661932166, + "grad_norm": 2.030750036239624, + "learning_rate": 1.9917445684765853e-05, + "loss": 0.8942, + "step": 5052 + }, + { + "epoch": 0.8249459205746704, + "grad_norm": 1.9861804246902466, + "learning_rate": 1.9917404988488945e-05, + "loss": 0.7845, + "step": 5053 + }, + { + "epoch": 0.8251091792171749, + "grad_norm": 1.822818398475647, + "learning_rate": 1.9917364282225213e-05, + "loss": 0.7473, + "step": 5054 + }, + { + "epoch": 0.8252724378596792, + "grad_norm": 1.8324793577194214, + "learning_rate": 1.99173235659747e-05, + "loss": 0.7691, + "step": 5055 + }, + { + "epoch": 0.8254356965021836, + "grad_norm": 1.6955355405807495, + "learning_rate": 1.9917282839737443e-05, + "loss": 0.6456, + "step": 5056 + }, + { + "epoch": 0.825598955144688, + "grad_norm": 1.7857342958450317, + "learning_rate": 1.9917242103513485e-05, + "loss": 0.7328, + "step": 5057 + }, + { + "epoch": 0.8257622137871924, + "grad_norm": 2.206937551498413, + "learning_rate": 1.9917201357302867e-05, + "loss": 0.9463, + "step": 5058 + }, + { + "epoch": 0.8259254724296967, + "grad_norm": 1.7162189483642578, + "learning_rate": 1.9917160601105632e-05, + "loss": 0.7151, + "step": 5059 + }, + { + "epoch": 0.8260887310722012, + "grad_norm": 1.7935936450958252, + "learning_rate": 1.9917119834921818e-05, + "loss": 0.7918, + "step": 5060 + }, + { + "epoch": 0.8262519897147055, + "grad_norm": 2.0773730278015137, + "learning_rate": 1.9917079058751464e-05, + "loss": 0.8335, + "step": 5061 + }, + { + "epoch": 0.8264152483572099, + "grad_norm": 1.9841821193695068, + "learning_rate": 1.9917038272594616e-05, + "loss": 0.7716, + "step": 5062 + }, + { + "epoch": 0.8265785069997142, + "grad_norm": 2.1464812755584717, + "learning_rate": 1.991699747645131e-05, + "loss": 0.8795, + "step": 5063 + }, + { + "epoch": 0.8267417656422187, + "grad_norm": 2.1702659130096436, + "learning_rate": 1.9916956670321595e-05, + "loss": 0.6632, + "step": 5064 + }, + { + "epoch": 0.8269050242847231, + "grad_norm": 1.7651402950286865, + "learning_rate": 1.9916915854205504e-05, + "loss": 0.7662, + "step": 5065 + }, + { + "epoch": 0.8270682829272274, + "grad_norm": 1.8429718017578125, + "learning_rate": 1.9916875028103083e-05, + "loss": 0.7271, + "step": 5066 + }, + { + "epoch": 0.8272315415697319, + "grad_norm": 2.1390669345855713, + "learning_rate": 1.9916834192014375e-05, + "loss": 0.7916, + "step": 5067 + }, + { + "epoch": 0.8273948002122362, + "grad_norm": 2.1748156547546387, + "learning_rate": 1.9916793345939412e-05, + "loss": 0.854, + "step": 5068 + }, + { + "epoch": 0.8275580588547407, + "grad_norm": 2.016902446746826, + "learning_rate": 1.9916752489878243e-05, + "loss": 0.8834, + "step": 5069 + }, + { + "epoch": 0.827721317497245, + "grad_norm": 1.7721856832504272, + "learning_rate": 1.9916711623830904e-05, + "loss": 0.6287, + "step": 5070 + }, + { + "epoch": 0.8278845761397494, + "grad_norm": 1.9506150484085083, + "learning_rate": 1.9916670747797444e-05, + "loss": 0.888, + "step": 5071 + }, + { + "epoch": 0.8280478347822537, + "grad_norm": 2.0228545665740967, + "learning_rate": 1.9916629861777898e-05, + "loss": 0.7147, + "step": 5072 + }, + { + "epoch": 0.8282110934247582, + "grad_norm": 1.7982124090194702, + "learning_rate": 1.9916588965772305e-05, + "loss": 0.667, + "step": 5073 + }, + { + "epoch": 0.8283743520672625, + "grad_norm": 2.1498124599456787, + "learning_rate": 1.9916548059780712e-05, + "loss": 0.8328, + "step": 5074 + }, + { + "epoch": 0.828537610709767, + "grad_norm": 1.9324437379837036, + "learning_rate": 1.991650714380316e-05, + "loss": 0.8766, + "step": 5075 + }, + { + "epoch": 0.8287008693522714, + "grad_norm": 2.0160040855407715, + "learning_rate": 1.9916466217839684e-05, + "loss": 0.8178, + "step": 5076 + }, + { + "epoch": 0.8288641279947757, + "grad_norm": 1.7618170976638794, + "learning_rate": 1.991642528189033e-05, + "loss": 0.7313, + "step": 5077 + }, + { + "epoch": 0.8290273866372802, + "grad_norm": 2.2003722190856934, + "learning_rate": 1.9916384335955138e-05, + "loss": 0.8097, + "step": 5078 + }, + { + "epoch": 0.8291906452797845, + "grad_norm": 1.9265284538269043, + "learning_rate": 1.991634338003415e-05, + "loss": 0.8687, + "step": 5079 + }, + { + "epoch": 0.8293539039222889, + "grad_norm": 2.032991409301758, + "learning_rate": 1.9916302414127408e-05, + "loss": 0.808, + "step": 5080 + }, + { + "epoch": 0.8295171625647932, + "grad_norm": 2.3782904148101807, + "learning_rate": 1.9916261438234953e-05, + "loss": 0.8414, + "step": 5081 + }, + { + "epoch": 0.8296804212072977, + "grad_norm": 1.792364239692688, + "learning_rate": 1.991622045235682e-05, + "loss": 0.78, + "step": 5082 + }, + { + "epoch": 0.829843679849802, + "grad_norm": 1.7115353345870972, + "learning_rate": 1.9916179456493062e-05, + "loss": 0.717, + "step": 5083 + }, + { + "epoch": 0.8300069384923064, + "grad_norm": 1.7939060926437378, + "learning_rate": 1.991613845064371e-05, + "loss": 0.7764, + "step": 5084 + }, + { + "epoch": 0.8301701971348108, + "grad_norm": 1.9377961158752441, + "learning_rate": 1.991609743480881e-05, + "loss": 0.9417, + "step": 5085 + }, + { + "epoch": 0.8303334557773152, + "grad_norm": 1.8728289604187012, + "learning_rate": 1.9916056408988402e-05, + "loss": 0.7574, + "step": 5086 + }, + { + "epoch": 0.8304967144198196, + "grad_norm": 1.7909367084503174, + "learning_rate": 1.9916015373182528e-05, + "loss": 0.7087, + "step": 5087 + }, + { + "epoch": 0.830659973062324, + "grad_norm": 1.8567538261413574, + "learning_rate": 1.9915974327391234e-05, + "loss": 0.7226, + "step": 5088 + }, + { + "epoch": 0.8308232317048284, + "grad_norm": 1.6564449071884155, + "learning_rate": 1.991593327161455e-05, + "loss": 0.6412, + "step": 5089 + }, + { + "epoch": 0.8309864903473327, + "grad_norm": 2.0899055004119873, + "learning_rate": 1.9915892205852528e-05, + "loss": 0.8715, + "step": 5090 + }, + { + "epoch": 0.8311497489898372, + "grad_norm": 1.9624117612838745, + "learning_rate": 1.9915851130105205e-05, + "loss": 0.8584, + "step": 5091 + }, + { + "epoch": 0.8313130076323415, + "grad_norm": 2.044499158859253, + "learning_rate": 1.9915810044372618e-05, + "loss": 0.7197, + "step": 5092 + }, + { + "epoch": 0.8314762662748459, + "grad_norm": 1.7407574653625488, + "learning_rate": 1.9915768948654816e-05, + "loss": 0.8142, + "step": 5093 + }, + { + "epoch": 0.8316395249173503, + "grad_norm": 1.9463224411010742, + "learning_rate": 1.9915727842951838e-05, + "loss": 0.7875, + "step": 5094 + }, + { + "epoch": 0.8318027835598547, + "grad_norm": 1.6472877264022827, + "learning_rate": 1.9915686727263723e-05, + "loss": 0.5924, + "step": 5095 + }, + { + "epoch": 0.831966042202359, + "grad_norm": 1.7957106828689575, + "learning_rate": 1.9915645601590517e-05, + "loss": 0.6818, + "step": 5096 + }, + { + "epoch": 0.8321293008448635, + "grad_norm": 1.837754487991333, + "learning_rate": 1.9915604465932255e-05, + "loss": 0.6845, + "step": 5097 + }, + { + "epoch": 0.8322925594873679, + "grad_norm": 1.650193452835083, + "learning_rate": 1.9915563320288983e-05, + "loss": 0.6541, + "step": 5098 + }, + { + "epoch": 0.8324558181298722, + "grad_norm": 2.0429975986480713, + "learning_rate": 1.991552216466074e-05, + "loss": 0.7325, + "step": 5099 + }, + { + "epoch": 0.8326190767723767, + "grad_norm": 1.5354825258255005, + "learning_rate": 1.9915480999047573e-05, + "loss": 0.6282, + "step": 5100 + }, + { + "epoch": 0.832782335414881, + "grad_norm": 1.7072590589523315, + "learning_rate": 1.991543982344952e-05, + "loss": 0.7017, + "step": 5101 + }, + { + "epoch": 0.8329455940573854, + "grad_norm": 1.7479791641235352, + "learning_rate": 1.9915398637866615e-05, + "loss": 0.8186, + "step": 5102 + }, + { + "epoch": 0.8331088526998898, + "grad_norm": 1.655551791191101, + "learning_rate": 1.991535744229891e-05, + "loss": 0.7231, + "step": 5103 + }, + { + "epoch": 0.8332721113423942, + "grad_norm": 1.872882604598999, + "learning_rate": 1.9915316236746443e-05, + "loss": 0.96, + "step": 5104 + }, + { + "epoch": 0.8334353699848985, + "grad_norm": 2.03730845451355, + "learning_rate": 1.9915275021209255e-05, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.833598628627403, + "grad_norm": 1.7753304243087769, + "learning_rate": 1.9915233795687388e-05, + "loss": 0.6701, + "step": 5106 + }, + { + "epoch": 0.8337618872699073, + "grad_norm": 2.580536365509033, + "learning_rate": 1.991519256018088e-05, + "loss": 0.8883, + "step": 5107 + }, + { + "epoch": 0.8339251459124117, + "grad_norm": 2.025146007537842, + "learning_rate": 1.991515131468978e-05, + "loss": 0.8724, + "step": 5108 + }, + { + "epoch": 0.8340884045549162, + "grad_norm": 1.9715062379837036, + "learning_rate": 1.9915110059214124e-05, + "loss": 0.9495, + "step": 5109 + }, + { + "epoch": 0.8342516631974205, + "grad_norm": 1.925430178642273, + "learning_rate": 1.9915068793753952e-05, + "loss": 0.6901, + "step": 5110 + }, + { + "epoch": 0.8344149218399249, + "grad_norm": 1.6726386547088623, + "learning_rate": 1.9915027518309312e-05, + "loss": 0.7338, + "step": 5111 + }, + { + "epoch": 0.8345781804824293, + "grad_norm": 1.8051495552062988, + "learning_rate": 1.991498623288024e-05, + "loss": 0.7517, + "step": 5112 + }, + { + "epoch": 0.8347414391249337, + "grad_norm": 1.6939828395843506, + "learning_rate": 1.9914944937466784e-05, + "loss": 0.7633, + "step": 5113 + }, + { + "epoch": 0.834904697767438, + "grad_norm": 1.7901723384857178, + "learning_rate": 1.9914903632068975e-05, + "loss": 0.7185, + "step": 5114 + }, + { + "epoch": 0.8350679564099425, + "grad_norm": 1.998066782951355, + "learning_rate": 1.9914862316686863e-05, + "loss": 0.7844, + "step": 5115 + }, + { + "epoch": 0.8352312150524468, + "grad_norm": 1.554466724395752, + "learning_rate": 1.9914820991320486e-05, + "loss": 0.6126, + "step": 5116 + }, + { + "epoch": 0.8353944736949512, + "grad_norm": 1.8795663118362427, + "learning_rate": 1.991477965596989e-05, + "loss": 0.9675, + "step": 5117 + }, + { + "epoch": 0.8355577323374556, + "grad_norm": 1.7835410833358765, + "learning_rate": 1.991473831063511e-05, + "loss": 0.8659, + "step": 5118 + }, + { + "epoch": 0.83572099097996, + "grad_norm": 2.07253098487854, + "learning_rate": 1.9914696955316192e-05, + "loss": 0.8165, + "step": 5119 + }, + { + "epoch": 0.8358842496224644, + "grad_norm": 1.766620397567749, + "learning_rate": 1.9914655590013177e-05, + "loss": 0.7845, + "step": 5120 + }, + { + "epoch": 0.8360475082649688, + "grad_norm": 2.0287749767303467, + "learning_rate": 1.991461421472611e-05, + "loss": 0.8063, + "step": 5121 + }, + { + "epoch": 0.8362107669074732, + "grad_norm": 1.842419147491455, + "learning_rate": 1.9914572829455022e-05, + "loss": 0.7971, + "step": 5122 + }, + { + "epoch": 0.8363740255499775, + "grad_norm": 2.0687546730041504, + "learning_rate": 1.991453143419997e-05, + "loss": 0.7034, + "step": 5123 + }, + { + "epoch": 0.836537284192482, + "grad_norm": 1.5977431535720825, + "learning_rate": 1.991449002896098e-05, + "loss": 0.6752, + "step": 5124 + }, + { + "epoch": 0.8367005428349863, + "grad_norm": 2.0692272186279297, + "learning_rate": 1.9914448613738107e-05, + "loss": 0.9777, + "step": 5125 + }, + { + "epoch": 0.8368638014774907, + "grad_norm": 1.600207805633545, + "learning_rate": 1.9914407188531383e-05, + "loss": 0.6891, + "step": 5126 + }, + { + "epoch": 0.8370270601199951, + "grad_norm": 1.544081211090088, + "learning_rate": 1.9914365753340855e-05, + "loss": 0.7329, + "step": 5127 + }, + { + "epoch": 0.8371903187624995, + "grad_norm": 2.2848329544067383, + "learning_rate": 1.9914324308166564e-05, + "loss": 0.8599, + "step": 5128 + }, + { + "epoch": 0.8373535774050038, + "grad_norm": 2.0703237056732178, + "learning_rate": 1.9914282853008552e-05, + "loss": 0.9249, + "step": 5129 + }, + { + "epoch": 0.8375168360475083, + "grad_norm": 2.01664137840271, + "learning_rate": 1.9914241387866858e-05, + "loss": 0.7939, + "step": 5130 + }, + { + "epoch": 0.8376800946900127, + "grad_norm": 1.5371699333190918, + "learning_rate": 1.9914199912741522e-05, + "loss": 0.6422, + "step": 5131 + }, + { + "epoch": 0.837843353332517, + "grad_norm": 1.905791997909546, + "learning_rate": 1.9914158427632595e-05, + "loss": 0.7184, + "step": 5132 + }, + { + "epoch": 0.8380066119750215, + "grad_norm": 1.9164390563964844, + "learning_rate": 1.991411693254011e-05, + "loss": 0.8402, + "step": 5133 + }, + { + "epoch": 0.8381698706175258, + "grad_norm": 2.4472849369049072, + "learning_rate": 1.9914075427464113e-05, + "loss": 0.9627, + "step": 5134 + }, + { + "epoch": 0.8383331292600302, + "grad_norm": 2.0392396450042725, + "learning_rate": 1.9914033912404646e-05, + "loss": 0.9576, + "step": 5135 + }, + { + "epoch": 0.8384963879025346, + "grad_norm": 1.824885606765747, + "learning_rate": 1.9913992387361747e-05, + "loss": 0.658, + "step": 5136 + }, + { + "epoch": 0.838659646545039, + "grad_norm": 1.7902811765670776, + "learning_rate": 1.991395085233546e-05, + "loss": 0.7542, + "step": 5137 + }, + { + "epoch": 0.8388229051875433, + "grad_norm": 2.062926769256592, + "learning_rate": 1.991390930732583e-05, + "loss": 0.8996, + "step": 5138 + }, + { + "epoch": 0.8389861638300478, + "grad_norm": 1.8213695287704468, + "learning_rate": 1.9913867752332897e-05, + "loss": 0.768, + "step": 5139 + }, + { + "epoch": 0.8391494224725521, + "grad_norm": 1.50505793094635, + "learning_rate": 1.99138261873567e-05, + "loss": 0.7062, + "step": 5140 + }, + { + "epoch": 0.8393126811150565, + "grad_norm": 1.6011627912521362, + "learning_rate": 1.991378461239728e-05, + "loss": 0.6055, + "step": 5141 + }, + { + "epoch": 0.839475939757561, + "grad_norm": 1.6268272399902344, + "learning_rate": 1.9913743027454686e-05, + "loss": 0.5225, + "step": 5142 + }, + { + "epoch": 0.8396391984000653, + "grad_norm": 2.160813331604004, + "learning_rate": 1.991370143252895e-05, + "loss": 0.8309, + "step": 5143 + }, + { + "epoch": 0.8398024570425697, + "grad_norm": 1.8993743658065796, + "learning_rate": 1.991365982762012e-05, + "loss": 0.9248, + "step": 5144 + }, + { + "epoch": 0.839965715685074, + "grad_norm": 1.8001459836959839, + "learning_rate": 1.991361821272824e-05, + "loss": 0.7515, + "step": 5145 + }, + { + "epoch": 0.8401289743275785, + "grad_norm": 1.91147780418396, + "learning_rate": 1.991357658785335e-05, + "loss": 0.8288, + "step": 5146 + }, + { + "epoch": 0.8402922329700828, + "grad_norm": 1.7198907136917114, + "learning_rate": 1.991353495299549e-05, + "loss": 0.7039, + "step": 5147 + }, + { + "epoch": 0.8404554916125873, + "grad_norm": 1.782019019126892, + "learning_rate": 1.9913493308154702e-05, + "loss": 0.7034, + "step": 5148 + }, + { + "epoch": 0.8406187502550916, + "grad_norm": 1.6733627319335938, + "learning_rate": 1.9913451653331028e-05, + "loss": 0.6633, + "step": 5149 + }, + { + "epoch": 0.840782008897596, + "grad_norm": 1.7069876194000244, + "learning_rate": 1.991340998852451e-05, + "loss": 0.8409, + "step": 5150 + }, + { + "epoch": 0.8409452675401003, + "grad_norm": 2.3522868156433105, + "learning_rate": 1.9913368313735194e-05, + "loss": 0.9128, + "step": 5151 + }, + { + "epoch": 0.8411085261826048, + "grad_norm": 2.088667154312134, + "learning_rate": 1.9913326628963118e-05, + "loss": 0.819, + "step": 5152 + }, + { + "epoch": 0.8412717848251092, + "grad_norm": 2.356287717819214, + "learning_rate": 1.9913284934208325e-05, + "loss": 0.9845, + "step": 5153 + }, + { + "epoch": 0.8414350434676136, + "grad_norm": 2.047569751739502, + "learning_rate": 1.9913243229470857e-05, + "loss": 0.7949, + "step": 5154 + }, + { + "epoch": 0.841598302110118, + "grad_norm": 2.131227493286133, + "learning_rate": 1.9913201514750752e-05, + "loss": 0.8029, + "step": 5155 + }, + { + "epoch": 0.8417615607526223, + "grad_norm": 1.8882328271865845, + "learning_rate": 1.991315979004806e-05, + "loss": 0.7818, + "step": 5156 + }, + { + "epoch": 0.8419248193951268, + "grad_norm": 2.1638076305389404, + "learning_rate": 1.991311805536282e-05, + "loss": 0.9051, + "step": 5157 + }, + { + "epoch": 0.8420880780376311, + "grad_norm": 1.791635274887085, + "learning_rate": 1.9913076310695068e-05, + "loss": 0.9033, + "step": 5158 + }, + { + "epoch": 0.8422513366801355, + "grad_norm": 1.9655120372772217, + "learning_rate": 1.991303455604485e-05, + "loss": 0.8673, + "step": 5159 + }, + { + "epoch": 0.8424145953226398, + "grad_norm": 1.9005126953125, + "learning_rate": 1.9912992791412212e-05, + "loss": 0.8878, + "step": 5160 + }, + { + "epoch": 0.8425778539651443, + "grad_norm": 1.7389261722564697, + "learning_rate": 1.9912951016797195e-05, + "loss": 0.8836, + "step": 5161 + }, + { + "epoch": 0.8427411126076486, + "grad_norm": 1.581194519996643, + "learning_rate": 1.9912909232199834e-05, + "loss": 0.6804, + "step": 5162 + }, + { + "epoch": 0.842904371250153, + "grad_norm": 1.9976918697357178, + "learning_rate": 1.991286743762018e-05, + "loss": 0.7426, + "step": 5163 + }, + { + "epoch": 0.8430676298926575, + "grad_norm": 1.8329269886016846, + "learning_rate": 1.991282563305827e-05, + "loss": 0.7062, + "step": 5164 + }, + { + "epoch": 0.8432308885351618, + "grad_norm": 1.6030261516571045, + "learning_rate": 1.9912783818514144e-05, + "loss": 0.6584, + "step": 5165 + }, + { + "epoch": 0.8433941471776663, + "grad_norm": 1.8029874563217163, + "learning_rate": 1.9912741993987853e-05, + "loss": 0.7975, + "step": 5166 + }, + { + "epoch": 0.8435574058201706, + "grad_norm": 2.0740880966186523, + "learning_rate": 1.9912700159479428e-05, + "loss": 0.8719, + "step": 5167 + }, + { + "epoch": 0.843720664462675, + "grad_norm": 1.8785903453826904, + "learning_rate": 1.991265831498892e-05, + "loss": 0.7047, + "step": 5168 + }, + { + "epoch": 0.8438839231051793, + "grad_norm": 2.018407106399536, + "learning_rate": 1.9912616460516364e-05, + "loss": 0.9659, + "step": 5169 + }, + { + "epoch": 0.8440471817476838, + "grad_norm": 1.6844274997711182, + "learning_rate": 1.991257459606181e-05, + "loss": 0.6877, + "step": 5170 + }, + { + "epoch": 0.8442104403901881, + "grad_norm": 1.733445167541504, + "learning_rate": 1.9912532721625295e-05, + "loss": 0.7463, + "step": 5171 + }, + { + "epoch": 0.8443736990326925, + "grad_norm": 1.9155004024505615, + "learning_rate": 1.9912490837206862e-05, + "loss": 0.7527, + "step": 5172 + }, + { + "epoch": 0.8445369576751969, + "grad_norm": 1.95585036277771, + "learning_rate": 1.9912448942806553e-05, + "loss": 0.7729, + "step": 5173 + }, + { + "epoch": 0.8447002163177013, + "grad_norm": 1.7079719305038452, + "learning_rate": 1.991240703842441e-05, + "loss": 0.628, + "step": 5174 + }, + { + "epoch": 0.8448634749602058, + "grad_norm": 1.6530548334121704, + "learning_rate": 1.9912365124060478e-05, + "loss": 0.7373, + "step": 5175 + }, + { + "epoch": 0.8450267336027101, + "grad_norm": 2.1432971954345703, + "learning_rate": 1.9912323199714796e-05, + "loss": 0.9194, + "step": 5176 + }, + { + "epoch": 0.8451899922452145, + "grad_norm": 1.7289494276046753, + "learning_rate": 1.9912281265387407e-05, + "loss": 0.6558, + "step": 5177 + }, + { + "epoch": 0.8453532508877188, + "grad_norm": 2.2957234382629395, + "learning_rate": 1.9912239321078354e-05, + "loss": 0.8881, + "step": 5178 + }, + { + "epoch": 0.8455165095302233, + "grad_norm": 2.081101894378662, + "learning_rate": 1.9912197366787676e-05, + "loss": 0.8361, + "step": 5179 + }, + { + "epoch": 0.8456797681727276, + "grad_norm": 1.7224607467651367, + "learning_rate": 1.991215540251542e-05, + "loss": 0.7167, + "step": 5180 + }, + { + "epoch": 0.845843026815232, + "grad_norm": 1.725548267364502, + "learning_rate": 1.9912113428261624e-05, + "loss": 0.5677, + "step": 5181 + }, + { + "epoch": 0.8460062854577364, + "grad_norm": 2.104229211807251, + "learning_rate": 1.9912071444026332e-05, + "loss": 0.9158, + "step": 5182 + }, + { + "epoch": 0.8461695441002408, + "grad_norm": 1.826003909111023, + "learning_rate": 1.991202944980959e-05, + "loss": 0.7274, + "step": 5183 + }, + { + "epoch": 0.8463328027427452, + "grad_norm": 1.8764857053756714, + "learning_rate": 1.991198744561144e-05, + "loss": 0.722, + "step": 5184 + }, + { + "epoch": 0.8464960613852496, + "grad_norm": 1.919197916984558, + "learning_rate": 1.9911945431431913e-05, + "loss": 0.7334, + "step": 5185 + }, + { + "epoch": 0.846659320027754, + "grad_norm": 1.8895585536956787, + "learning_rate": 1.9911903407271062e-05, + "loss": 0.733, + "step": 5186 + }, + { + "epoch": 0.8468225786702583, + "grad_norm": 1.8721678256988525, + "learning_rate": 1.991186137312893e-05, + "loss": 0.8117, + "step": 5187 + }, + { + "epoch": 0.8469858373127628, + "grad_norm": 1.9103262424468994, + "learning_rate": 1.9911819329005554e-05, + "loss": 0.828, + "step": 5188 + }, + { + "epoch": 0.8471490959552671, + "grad_norm": 1.7443757057189941, + "learning_rate": 1.991177727490098e-05, + "loss": 0.7869, + "step": 5189 + }, + { + "epoch": 0.8473123545977715, + "grad_norm": 2.033560276031494, + "learning_rate": 1.991173521081525e-05, + "loss": 0.8675, + "step": 5190 + }, + { + "epoch": 0.8474756132402759, + "grad_norm": 2.1890907287597656, + "learning_rate": 1.9911693136748403e-05, + "loss": 0.7402, + "step": 5191 + }, + { + "epoch": 0.8476388718827803, + "grad_norm": 1.9003220796585083, + "learning_rate": 1.9911651052700483e-05, + "loss": 0.741, + "step": 5192 + }, + { + "epoch": 0.8478021305252846, + "grad_norm": 1.8407002687454224, + "learning_rate": 1.9911608958671533e-05, + "loss": 0.7796, + "step": 5193 + }, + { + "epoch": 0.8479653891677891, + "grad_norm": 2.300396680831909, + "learning_rate": 1.9911566854661598e-05, + "loss": 0.7326, + "step": 5194 + }, + { + "epoch": 0.8481286478102935, + "grad_norm": 1.9121159315109253, + "learning_rate": 1.9911524740670715e-05, + "loss": 0.6953, + "step": 5195 + }, + { + "epoch": 0.8482919064527978, + "grad_norm": 1.9805223941802979, + "learning_rate": 1.991148261669893e-05, + "loss": 0.8407, + "step": 5196 + }, + { + "epoch": 0.8484551650953023, + "grad_norm": 1.764054775238037, + "learning_rate": 1.9911440482746286e-05, + "loss": 0.7854, + "step": 5197 + }, + { + "epoch": 0.8486184237378066, + "grad_norm": 1.6168181896209717, + "learning_rate": 1.9911398338812825e-05, + "loss": 0.7899, + "step": 5198 + }, + { + "epoch": 0.848781682380311, + "grad_norm": 1.5975042581558228, + "learning_rate": 1.9911356184898585e-05, + "loss": 0.6038, + "step": 5199 + }, + { + "epoch": 0.8489449410228154, + "grad_norm": 1.6113570928573608, + "learning_rate": 1.9911314021003614e-05, + "loss": 0.7886, + "step": 5200 + }, + { + "epoch": 0.8491081996653198, + "grad_norm": 2.0364503860473633, + "learning_rate": 1.991127184712795e-05, + "loss": 0.9071, + "step": 5201 + }, + { + "epoch": 0.8492714583078241, + "grad_norm": 2.098405122756958, + "learning_rate": 1.991122966327164e-05, + "loss": 0.9129, + "step": 5202 + }, + { + "epoch": 0.8494347169503286, + "grad_norm": 2.038407325744629, + "learning_rate": 1.9911187469434724e-05, + "loss": 0.9639, + "step": 5203 + }, + { + "epoch": 0.8495979755928329, + "grad_norm": 1.9227811098098755, + "learning_rate": 1.9911145265617246e-05, + "loss": 0.8318, + "step": 5204 + }, + { + "epoch": 0.8497612342353373, + "grad_norm": 1.8247493505477905, + "learning_rate": 1.9911103051819248e-05, + "loss": 0.797, + "step": 5205 + }, + { + "epoch": 0.8499244928778418, + "grad_norm": 1.9497145414352417, + "learning_rate": 1.9911060828040768e-05, + "loss": 0.8014, + "step": 5206 + }, + { + "epoch": 0.8500877515203461, + "grad_norm": 2.2105038166046143, + "learning_rate": 1.9911018594281855e-05, + "loss": 0.8664, + "step": 5207 + }, + { + "epoch": 0.8502510101628505, + "grad_norm": 1.912534236907959, + "learning_rate": 1.991097635054255e-05, + "loss": 0.7798, + "step": 5208 + }, + { + "epoch": 0.8504142688053549, + "grad_norm": 1.8173553943634033, + "learning_rate": 1.9910934096822895e-05, + "loss": 0.7025, + "step": 5209 + }, + { + "epoch": 0.8505775274478593, + "grad_norm": 2.2416248321533203, + "learning_rate": 1.9910891833122926e-05, + "loss": 0.7768, + "step": 5210 + }, + { + "epoch": 0.8507407860903636, + "grad_norm": 1.8488212823867798, + "learning_rate": 1.9910849559442697e-05, + "loss": 0.766, + "step": 5211 + }, + { + "epoch": 0.8509040447328681, + "grad_norm": 1.8061200380325317, + "learning_rate": 1.9910807275782244e-05, + "loss": 0.8663, + "step": 5212 + }, + { + "epoch": 0.8510673033753724, + "grad_norm": 1.764367938041687, + "learning_rate": 1.991076498214161e-05, + "loss": 0.7467, + "step": 5213 + }, + { + "epoch": 0.8512305620178768, + "grad_norm": 1.757871389389038, + "learning_rate": 1.991072267852084e-05, + "loss": 0.8953, + "step": 5214 + }, + { + "epoch": 0.8513938206603812, + "grad_norm": 1.8776533603668213, + "learning_rate": 1.9910680364919975e-05, + "loss": 0.8229, + "step": 5215 + }, + { + "epoch": 0.8515570793028856, + "grad_norm": 1.6906567811965942, + "learning_rate": 1.9910638041339053e-05, + "loss": 0.7582, + "step": 5216 + }, + { + "epoch": 0.85172033794539, + "grad_norm": 1.802294135093689, + "learning_rate": 1.9910595707778127e-05, + "loss": 0.9735, + "step": 5217 + }, + { + "epoch": 0.8518835965878944, + "grad_norm": 2.0317392349243164, + "learning_rate": 1.9910553364237232e-05, + "loss": 0.7421, + "step": 5218 + }, + { + "epoch": 0.8520468552303988, + "grad_norm": 2.0111281871795654, + "learning_rate": 1.9910511010716405e-05, + "loss": 0.8753, + "step": 5219 + }, + { + "epoch": 0.8522101138729031, + "grad_norm": 1.837296485900879, + "learning_rate": 1.9910468647215708e-05, + "loss": 0.7092, + "step": 5220 + }, + { + "epoch": 0.8523733725154076, + "grad_norm": 1.9931890964508057, + "learning_rate": 1.9910426273735163e-05, + "loss": 0.8305, + "step": 5221 + }, + { + "epoch": 0.8525366311579119, + "grad_norm": 1.766592264175415, + "learning_rate": 1.9910383890274825e-05, + "loss": 0.7077, + "step": 5222 + }, + { + "epoch": 0.8526998898004163, + "grad_norm": 2.0314688682556152, + "learning_rate": 1.9910341496834734e-05, + "loss": 0.7143, + "step": 5223 + }, + { + "epoch": 0.8528631484429207, + "grad_norm": 1.7196743488311768, + "learning_rate": 1.991029909341493e-05, + "loss": 0.7315, + "step": 5224 + }, + { + "epoch": 0.8530264070854251, + "grad_norm": 1.6130890846252441, + "learning_rate": 1.991025668001546e-05, + "loss": 0.7469, + "step": 5225 + }, + { + "epoch": 0.8531896657279294, + "grad_norm": 2.028916120529175, + "learning_rate": 1.991021425663636e-05, + "loss": 0.7618, + "step": 5226 + }, + { + "epoch": 0.8533529243704339, + "grad_norm": 1.955233097076416, + "learning_rate": 1.9910171823277676e-05, + "loss": 0.8082, + "step": 5227 + }, + { + "epoch": 0.8535161830129383, + "grad_norm": 1.7255843877792358, + "learning_rate": 1.9910129379939455e-05, + "loss": 0.6573, + "step": 5228 + }, + { + "epoch": 0.8536794416554426, + "grad_norm": 1.9246351718902588, + "learning_rate": 1.9910086926621738e-05, + "loss": 0.7637, + "step": 5229 + }, + { + "epoch": 0.8538427002979471, + "grad_norm": 1.8635032176971436, + "learning_rate": 1.9910044463324564e-05, + "loss": 0.7794, + "step": 5230 + }, + { + "epoch": 0.8540059589404514, + "grad_norm": 1.5943204164505005, + "learning_rate": 1.9910001990047976e-05, + "loss": 0.7402, + "step": 5231 + }, + { + "epoch": 0.8541692175829558, + "grad_norm": 1.635206937789917, + "learning_rate": 1.9909959506792022e-05, + "loss": 0.6256, + "step": 5232 + }, + { + "epoch": 0.8543324762254602, + "grad_norm": 1.894118070602417, + "learning_rate": 1.990991701355674e-05, + "loss": 0.8568, + "step": 5233 + }, + { + "epoch": 0.8544957348679646, + "grad_norm": 2.069542169570923, + "learning_rate": 1.990987451034217e-05, + "loss": 0.9139, + "step": 5234 + }, + { + "epoch": 0.8546589935104689, + "grad_norm": 1.9148963689804077, + "learning_rate": 1.9909831997148363e-05, + "loss": 0.7141, + "step": 5235 + }, + { + "epoch": 0.8548222521529734, + "grad_norm": 1.4539021253585815, + "learning_rate": 1.9909789473975358e-05, + "loss": 0.5316, + "step": 5236 + }, + { + "epoch": 0.8549855107954777, + "grad_norm": 2.009082078933716, + "learning_rate": 1.9909746940823197e-05, + "loss": 0.724, + "step": 5237 + }, + { + "epoch": 0.8551487694379821, + "grad_norm": 1.810773253440857, + "learning_rate": 1.9909704397691924e-05, + "loss": 0.7616, + "step": 5238 + }, + { + "epoch": 0.8553120280804866, + "grad_norm": 1.774091362953186, + "learning_rate": 1.990966184458158e-05, + "loss": 0.7153, + "step": 5239 + }, + { + "epoch": 0.8554752867229909, + "grad_norm": 2.0093867778778076, + "learning_rate": 1.990961928149221e-05, + "loss": 0.8105, + "step": 5240 + }, + { + "epoch": 0.8556385453654953, + "grad_norm": 1.7953866720199585, + "learning_rate": 1.9909576708423857e-05, + "loss": 0.6886, + "step": 5241 + }, + { + "epoch": 0.8558018040079997, + "grad_norm": 1.8970423936843872, + "learning_rate": 1.9909534125376563e-05, + "loss": 0.8574, + "step": 5242 + }, + { + "epoch": 0.8559650626505041, + "grad_norm": 1.8539543151855469, + "learning_rate": 1.990949153235037e-05, + "loss": 0.7792, + "step": 5243 + }, + { + "epoch": 0.8561283212930084, + "grad_norm": 1.7712501287460327, + "learning_rate": 1.990944892934532e-05, + "loss": 0.5931, + "step": 5244 + }, + { + "epoch": 0.8562915799355129, + "grad_norm": 1.84429132938385, + "learning_rate": 1.990940631636146e-05, + "loss": 0.6989, + "step": 5245 + }, + { + "epoch": 0.8564548385780172, + "grad_norm": 1.8634265661239624, + "learning_rate": 1.9909363693398828e-05, + "loss": 0.7841, + "step": 5246 + }, + { + "epoch": 0.8566180972205216, + "grad_norm": 1.753777265548706, + "learning_rate": 1.9909321060457475e-05, + "loss": 0.6982, + "step": 5247 + }, + { + "epoch": 0.856781355863026, + "grad_norm": 1.595438838005066, + "learning_rate": 1.9909278417537433e-05, + "loss": 0.6368, + "step": 5248 + }, + { + "epoch": 0.8569446145055304, + "grad_norm": 2.764350414276123, + "learning_rate": 1.990923576463875e-05, + "loss": 0.9282, + "step": 5249 + }, + { + "epoch": 0.8571078731480348, + "grad_norm": 1.5892215967178345, + "learning_rate": 1.990919310176147e-05, + "loss": 0.7189, + "step": 5250 + }, + { + "epoch": 0.8572711317905392, + "grad_norm": 2.046966552734375, + "learning_rate": 1.990915042890564e-05, + "loss": 0.6612, + "step": 5251 + }, + { + "epoch": 0.8574343904330436, + "grad_norm": 2.2284348011016846, + "learning_rate": 1.9909107746071294e-05, + "loss": 0.8726, + "step": 5252 + }, + { + "epoch": 0.8575976490755479, + "grad_norm": 1.8981080055236816, + "learning_rate": 1.9909065053258477e-05, + "loss": 0.7297, + "step": 5253 + }, + { + "epoch": 0.8577609077180524, + "grad_norm": 2.076026201248169, + "learning_rate": 1.9909022350467236e-05, + "loss": 0.8593, + "step": 5254 + }, + { + "epoch": 0.8579241663605567, + "grad_norm": 2.0104031562805176, + "learning_rate": 1.9908979637697612e-05, + "loss": 0.7239, + "step": 5255 + }, + { + "epoch": 0.8580874250030611, + "grad_norm": 1.6701771020889282, + "learning_rate": 1.990893691494965e-05, + "loss": 0.7287, + "step": 5256 + }, + { + "epoch": 0.8582506836455654, + "grad_norm": 1.668904185295105, + "learning_rate": 1.990889418222339e-05, + "loss": 0.6737, + "step": 5257 + }, + { + "epoch": 0.8584139422880699, + "grad_norm": 1.901370882987976, + "learning_rate": 1.9908851439518875e-05, + "loss": 0.6895, + "step": 5258 + }, + { + "epoch": 0.8585772009305742, + "grad_norm": 2.0240578651428223, + "learning_rate": 1.990880868683615e-05, + "loss": 0.9721, + "step": 5259 + }, + { + "epoch": 0.8587404595730787, + "grad_norm": 1.9102860689163208, + "learning_rate": 1.9908765924175256e-05, + "loss": 0.7693, + "step": 5260 + }, + { + "epoch": 0.8589037182155831, + "grad_norm": 2.2018327713012695, + "learning_rate": 1.990872315153624e-05, + "loss": 0.9316, + "step": 5261 + }, + { + "epoch": 0.8590669768580874, + "grad_norm": 2.16581130027771, + "learning_rate": 1.990868036891914e-05, + "loss": 0.8588, + "step": 5262 + }, + { + "epoch": 0.8592302355005919, + "grad_norm": 2.003629207611084, + "learning_rate": 1.9908637576324005e-05, + "loss": 0.8266, + "step": 5263 + }, + { + "epoch": 0.8593934941430962, + "grad_norm": 2.2723209857940674, + "learning_rate": 1.990859477375087e-05, + "loss": 0.7091, + "step": 5264 + }, + { + "epoch": 0.8595567527856006, + "grad_norm": 1.9962843656539917, + "learning_rate": 1.9908551961199785e-05, + "loss": 0.7861, + "step": 5265 + }, + { + "epoch": 0.859720011428105, + "grad_norm": 1.9620423316955566, + "learning_rate": 1.990850913867079e-05, + "loss": 0.8081, + "step": 5266 + }, + { + "epoch": 0.8598832700706094, + "grad_norm": 1.8551784753799438, + "learning_rate": 1.990846630616393e-05, + "loss": 0.8188, + "step": 5267 + }, + { + "epoch": 0.8600465287131137, + "grad_norm": 1.8519151210784912, + "learning_rate": 1.9908423463679246e-05, + "loss": 0.7708, + "step": 5268 + }, + { + "epoch": 0.8602097873556181, + "grad_norm": 1.7020376920700073, + "learning_rate": 1.9908380611216785e-05, + "loss": 0.6531, + "step": 5269 + }, + { + "epoch": 0.8603730459981225, + "grad_norm": 1.87134850025177, + "learning_rate": 1.9908337748776585e-05, + "loss": 0.7429, + "step": 5270 + }, + { + "epoch": 0.8605363046406269, + "grad_norm": 2.0105204582214355, + "learning_rate": 1.9908294876358692e-05, + "loss": 0.7262, + "step": 5271 + }, + { + "epoch": 0.8606995632831314, + "grad_norm": 2.0024731159210205, + "learning_rate": 1.9908251993963148e-05, + "loss": 0.8756, + "step": 5272 + }, + { + "epoch": 0.8608628219256357, + "grad_norm": 1.6941802501678467, + "learning_rate": 1.9908209101589996e-05, + "loss": 0.7362, + "step": 5273 + }, + { + "epoch": 0.8610260805681401, + "grad_norm": 1.6821627616882324, + "learning_rate": 1.9908166199239284e-05, + "loss": 0.7683, + "step": 5274 + }, + { + "epoch": 0.8611893392106444, + "grad_norm": 1.6837925910949707, + "learning_rate": 1.990812328691105e-05, + "loss": 0.7419, + "step": 5275 + }, + { + "epoch": 0.8613525978531489, + "grad_norm": 1.6824650764465332, + "learning_rate": 1.9908080364605334e-05, + "loss": 0.7107, + "step": 5276 + }, + { + "epoch": 0.8615158564956532, + "grad_norm": 1.8458701372146606, + "learning_rate": 1.990803743232219e-05, + "loss": 0.8471, + "step": 5277 + }, + { + "epoch": 0.8616791151381576, + "grad_norm": 1.8365248441696167, + "learning_rate": 1.990799449006165e-05, + "loss": 0.6981, + "step": 5278 + }, + { + "epoch": 0.861842373780662, + "grad_norm": 1.8982356786727905, + "learning_rate": 1.9907951537823762e-05, + "loss": 0.896, + "step": 5279 + }, + { + "epoch": 0.8620056324231664, + "grad_norm": 2.1281328201293945, + "learning_rate": 1.9907908575608573e-05, + "loss": 0.8622, + "step": 5280 + }, + { + "epoch": 0.8621688910656707, + "grad_norm": 1.3575146198272705, + "learning_rate": 1.990786560341612e-05, + "loss": 0.5932, + "step": 5281 + }, + { + "epoch": 0.8623321497081752, + "grad_norm": 1.8779218196868896, + "learning_rate": 1.9907822621246452e-05, + "loss": 0.9159, + "step": 5282 + }, + { + "epoch": 0.8624954083506796, + "grad_norm": 1.809924602508545, + "learning_rate": 1.9907779629099605e-05, + "loss": 0.7411, + "step": 5283 + }, + { + "epoch": 0.8626586669931839, + "grad_norm": 1.6161234378814697, + "learning_rate": 1.990773662697563e-05, + "loss": 0.757, + "step": 5284 + }, + { + "epoch": 0.8628219256356884, + "grad_norm": 1.9763113260269165, + "learning_rate": 1.9907693614874566e-05, + "loss": 0.6941, + "step": 5285 + }, + { + "epoch": 0.8629851842781927, + "grad_norm": 1.9395538568496704, + "learning_rate": 1.9907650592796454e-05, + "loss": 0.7952, + "step": 5286 + }, + { + "epoch": 0.8631484429206971, + "grad_norm": 1.9521416425704956, + "learning_rate": 1.9907607560741345e-05, + "loss": 0.7401, + "step": 5287 + }, + { + "epoch": 0.8633117015632015, + "grad_norm": 1.8089687824249268, + "learning_rate": 1.9907564518709276e-05, + "loss": 0.7943, + "step": 5288 + }, + { + "epoch": 0.8634749602057059, + "grad_norm": 1.4096678495407104, + "learning_rate": 1.9907521466700293e-05, + "loss": 0.5581, + "step": 5289 + }, + { + "epoch": 0.8636382188482102, + "grad_norm": 1.72004234790802, + "learning_rate": 1.9907478404714438e-05, + "loss": 0.783, + "step": 5290 + }, + { + "epoch": 0.8638014774907147, + "grad_norm": 1.8231446743011475, + "learning_rate": 1.9907435332751754e-05, + "loss": 0.8052, + "step": 5291 + }, + { + "epoch": 0.863964736133219, + "grad_norm": 2.055612087249756, + "learning_rate": 1.9907392250812287e-05, + "loss": 0.7756, + "step": 5292 + }, + { + "epoch": 0.8641279947757234, + "grad_norm": 1.622261643409729, + "learning_rate": 1.9907349158896075e-05, + "loss": 0.7184, + "step": 5293 + }, + { + "epoch": 0.8642912534182279, + "grad_norm": 1.9895159006118774, + "learning_rate": 1.9907306057003167e-05, + "loss": 0.8028, + "step": 5294 + }, + { + "epoch": 0.8644545120607322, + "grad_norm": 2.29270076751709, + "learning_rate": 1.9907262945133607e-05, + "loss": 0.8341, + "step": 5295 + }, + { + "epoch": 0.8646177707032366, + "grad_norm": 1.69044828414917, + "learning_rate": 1.9907219823287436e-05, + "loss": 0.7895, + "step": 5296 + }, + { + "epoch": 0.864781029345741, + "grad_norm": 1.723405122756958, + "learning_rate": 1.9907176691464693e-05, + "loss": 0.8586, + "step": 5297 + }, + { + "epoch": 0.8649442879882454, + "grad_norm": 2.107980251312256, + "learning_rate": 1.990713354966543e-05, + "loss": 0.7913, + "step": 5298 + }, + { + "epoch": 0.8651075466307497, + "grad_norm": 1.6097900867462158, + "learning_rate": 1.9907090397889682e-05, + "loss": 0.7168, + "step": 5299 + }, + { + "epoch": 0.8652708052732542, + "grad_norm": 1.8492448329925537, + "learning_rate": 1.99070472361375e-05, + "loss": 0.629, + "step": 5300 + }, + { + "epoch": 0.8654340639157585, + "grad_norm": 2.362110137939453, + "learning_rate": 1.9907004064408923e-05, + "loss": 0.7905, + "step": 5301 + }, + { + "epoch": 0.8655973225582629, + "grad_norm": 1.621491551399231, + "learning_rate": 1.9906960882703993e-05, + "loss": 0.741, + "step": 5302 + }, + { + "epoch": 0.8657605812007673, + "grad_norm": 1.8737967014312744, + "learning_rate": 1.9906917691022757e-05, + "loss": 0.7164, + "step": 5303 + }, + { + "epoch": 0.8659238398432717, + "grad_norm": 1.8647116422653198, + "learning_rate": 1.990687448936526e-05, + "loss": 0.704, + "step": 5304 + }, + { + "epoch": 0.8660870984857761, + "grad_norm": 1.8268877267837524, + "learning_rate": 1.990683127773154e-05, + "loss": 0.6797, + "step": 5305 + }, + { + "epoch": 0.8662503571282805, + "grad_norm": 2.1025314331054688, + "learning_rate": 1.9906788056121646e-05, + "loss": 0.8211, + "step": 5306 + }, + { + "epoch": 0.8664136157707849, + "grad_norm": 1.8377305269241333, + "learning_rate": 1.990674482453562e-05, + "loss": 0.7841, + "step": 5307 + }, + { + "epoch": 0.8665768744132892, + "grad_norm": 1.6889852285385132, + "learning_rate": 1.9906701582973502e-05, + "loss": 0.7654, + "step": 5308 + }, + { + "epoch": 0.8667401330557937, + "grad_norm": 1.8103327751159668, + "learning_rate": 1.9906658331435338e-05, + "loss": 0.5858, + "step": 5309 + }, + { + "epoch": 0.866903391698298, + "grad_norm": 1.8921058177947998, + "learning_rate": 1.9906615069921175e-05, + "loss": 0.8204, + "step": 5310 + }, + { + "epoch": 0.8670666503408024, + "grad_norm": 1.5741298198699951, + "learning_rate": 1.990657179843105e-05, + "loss": 0.6562, + "step": 5311 + }, + { + "epoch": 0.8672299089833068, + "grad_norm": 1.8704915046691895, + "learning_rate": 1.990652851696501e-05, + "loss": 0.79, + "step": 5312 + }, + { + "epoch": 0.8673931676258112, + "grad_norm": 1.824502944946289, + "learning_rate": 1.99064852255231e-05, + "loss": 0.8256, + "step": 5313 + }, + { + "epoch": 0.8675564262683155, + "grad_norm": 1.771916151046753, + "learning_rate": 1.990644192410536e-05, + "loss": 0.7333, + "step": 5314 + }, + { + "epoch": 0.86771968491082, + "grad_norm": 2.2634685039520264, + "learning_rate": 1.9906398612711837e-05, + "loss": 0.7429, + "step": 5315 + }, + { + "epoch": 0.8678829435533244, + "grad_norm": 2.1489388942718506, + "learning_rate": 1.9906355291342573e-05, + "loss": 0.811, + "step": 5316 + }, + { + "epoch": 0.8680462021958287, + "grad_norm": 1.9035115242004395, + "learning_rate": 1.9906311959997614e-05, + "loss": 0.7779, + "step": 5317 + }, + { + "epoch": 0.8682094608383332, + "grad_norm": 1.9477570056915283, + "learning_rate": 1.9906268618676997e-05, + "loss": 1.0146, + "step": 5318 + }, + { + "epoch": 0.8683727194808375, + "grad_norm": 2.302804470062256, + "learning_rate": 1.9906225267380774e-05, + "loss": 0.9743, + "step": 5319 + }, + { + "epoch": 0.8685359781233419, + "grad_norm": 1.984257698059082, + "learning_rate": 1.9906181906108983e-05, + "loss": 0.9706, + "step": 5320 + }, + { + "epoch": 0.8686992367658463, + "grad_norm": 2.1444292068481445, + "learning_rate": 1.990613853486167e-05, + "loss": 1.0123, + "step": 5321 + }, + { + "epoch": 0.8688624954083507, + "grad_norm": 1.9393644332885742, + "learning_rate": 1.9906095153638877e-05, + "loss": 0.7344, + "step": 5322 + }, + { + "epoch": 0.869025754050855, + "grad_norm": 1.7750895023345947, + "learning_rate": 1.990605176244065e-05, + "loss": 0.6801, + "step": 5323 + }, + { + "epoch": 0.8691890126933595, + "grad_norm": 1.9480575323104858, + "learning_rate": 1.990600836126703e-05, + "loss": 0.6649, + "step": 5324 + }, + { + "epoch": 0.8693522713358638, + "grad_norm": 1.7845743894577026, + "learning_rate": 1.9905964950118063e-05, + "loss": 0.7911, + "step": 5325 + }, + { + "epoch": 0.8695155299783682, + "grad_norm": 1.681143879890442, + "learning_rate": 1.9905921528993796e-05, + "loss": 0.8538, + "step": 5326 + }, + { + "epoch": 0.8696787886208727, + "grad_norm": 1.5916496515274048, + "learning_rate": 1.9905878097894263e-05, + "loss": 0.7182, + "step": 5327 + }, + { + "epoch": 0.869842047263377, + "grad_norm": 1.7842963933944702, + "learning_rate": 1.9905834656819513e-05, + "loss": 0.7537, + "step": 5328 + }, + { + "epoch": 0.8700053059058814, + "grad_norm": 1.7643011808395386, + "learning_rate": 1.9905791205769596e-05, + "loss": 0.6886, + "step": 5329 + }, + { + "epoch": 0.8701685645483858, + "grad_norm": 1.9059230089187622, + "learning_rate": 1.9905747744744547e-05, + "loss": 0.8013, + "step": 5330 + }, + { + "epoch": 0.8703318231908902, + "grad_norm": 1.6504101753234863, + "learning_rate": 1.990570427374441e-05, + "loss": 0.6243, + "step": 5331 + }, + { + "epoch": 0.8704950818333945, + "grad_norm": 1.8690545558929443, + "learning_rate": 1.9905660792769234e-05, + "loss": 0.7277, + "step": 5332 + }, + { + "epoch": 0.870658340475899, + "grad_norm": 1.6733988523483276, + "learning_rate": 1.9905617301819057e-05, + "loss": 0.7448, + "step": 5333 + }, + { + "epoch": 0.8708215991184033, + "grad_norm": 2.0021817684173584, + "learning_rate": 1.990557380089393e-05, + "loss": 1.0192, + "step": 5334 + }, + { + "epoch": 0.8709848577609077, + "grad_norm": 2.192671298980713, + "learning_rate": 1.990553028999389e-05, + "loss": 0.7175, + "step": 5335 + }, + { + "epoch": 0.871148116403412, + "grad_norm": 1.4054670333862305, + "learning_rate": 1.9905486769118987e-05, + "loss": 0.5802, + "step": 5336 + }, + { + "epoch": 0.8713113750459165, + "grad_norm": 1.6612898111343384, + "learning_rate": 1.990544323826926e-05, + "loss": 0.7542, + "step": 5337 + }, + { + "epoch": 0.8714746336884209, + "grad_norm": 1.7611727714538574, + "learning_rate": 1.990539969744475e-05, + "loss": 0.8509, + "step": 5338 + }, + { + "epoch": 0.8716378923309253, + "grad_norm": 1.9242517948150635, + "learning_rate": 1.9905356146645514e-05, + "loss": 0.7966, + "step": 5339 + }, + { + "epoch": 0.8718011509734297, + "grad_norm": 2.21624493598938, + "learning_rate": 1.990531258587158e-05, + "loss": 0.9661, + "step": 5340 + }, + { + "epoch": 0.871964409615934, + "grad_norm": 1.9838974475860596, + "learning_rate": 1.9905269015123e-05, + "loss": 0.9176, + "step": 5341 + }, + { + "epoch": 0.8721276682584385, + "grad_norm": 1.858581304550171, + "learning_rate": 1.990522543439982e-05, + "loss": 0.6789, + "step": 5342 + }, + { + "epoch": 0.8722909269009428, + "grad_norm": 1.9639451503753662, + "learning_rate": 1.9905181843702073e-05, + "loss": 0.8564, + "step": 5343 + }, + { + "epoch": 0.8724541855434472, + "grad_norm": 2.051272392272949, + "learning_rate": 1.990513824302982e-05, + "loss": 0.8163, + "step": 5344 + }, + { + "epoch": 0.8726174441859516, + "grad_norm": 1.8524880409240723, + "learning_rate": 1.990509463238309e-05, + "loss": 0.8013, + "step": 5345 + }, + { + "epoch": 0.872780702828456, + "grad_norm": 1.9199869632720947, + "learning_rate": 1.9905051011761933e-05, + "loss": 0.726, + "step": 5346 + }, + { + "epoch": 0.8729439614709603, + "grad_norm": 2.0056276321411133, + "learning_rate": 1.9905007381166394e-05, + "loss": 1.2549, + "step": 5347 + }, + { + "epoch": 0.8731072201134648, + "grad_norm": 2.1515982151031494, + "learning_rate": 1.9904963740596514e-05, + "loss": 0.8041, + "step": 5348 + }, + { + "epoch": 0.8732704787559692, + "grad_norm": 2.007981538772583, + "learning_rate": 1.9904920090052336e-05, + "loss": 0.8187, + "step": 5349 + }, + { + "epoch": 0.8734337373984735, + "grad_norm": 1.9774667024612427, + "learning_rate": 1.9904876429533912e-05, + "loss": 0.6288, + "step": 5350 + }, + { + "epoch": 0.873596996040978, + "grad_norm": 1.6390999555587769, + "learning_rate": 1.990483275904127e-05, + "loss": 0.6677, + "step": 5351 + }, + { + "epoch": 0.8737602546834823, + "grad_norm": 1.9435499906539917, + "learning_rate": 1.9904789078574472e-05, + "loss": 0.7302, + "step": 5352 + }, + { + "epoch": 0.8739235133259867, + "grad_norm": 1.7114388942718506, + "learning_rate": 1.9904745388133552e-05, + "loss": 0.8115, + "step": 5353 + }, + { + "epoch": 0.874086771968491, + "grad_norm": 2.0061943531036377, + "learning_rate": 1.9904701687718558e-05, + "loss": 0.9039, + "step": 5354 + }, + { + "epoch": 0.8742500306109955, + "grad_norm": 1.8601288795471191, + "learning_rate": 1.990465797732953e-05, + "loss": 0.9263, + "step": 5355 + }, + { + "epoch": 0.8744132892534998, + "grad_norm": 2.5641002655029297, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.7432, + "step": 5356 + }, + { + "epoch": 0.8745765478960043, + "grad_norm": 2.044412136077881, + "learning_rate": 1.9904570526629556e-05, + "loss": 0.7277, + "step": 5357 + }, + { + "epoch": 0.8747398065385086, + "grad_norm": 2.10294508934021, + "learning_rate": 1.9904526786318693e-05, + "loss": 0.8701, + "step": 5358 + }, + { + "epoch": 0.874903065181013, + "grad_norm": 1.6787285804748535, + "learning_rate": 1.990448303603398e-05, + "loss": 0.6683, + "step": 5359 + }, + { + "epoch": 0.8750663238235175, + "grad_norm": 1.6135354042053223, + "learning_rate": 1.990443927577545e-05, + "loss": 0.7424, + "step": 5360 + }, + { + "epoch": 0.8752295824660218, + "grad_norm": 1.7826166152954102, + "learning_rate": 1.9904395505543156e-05, + "loss": 0.7811, + "step": 5361 + }, + { + "epoch": 0.8753928411085262, + "grad_norm": 1.9096801280975342, + "learning_rate": 1.990435172533714e-05, + "loss": 0.6655, + "step": 5362 + }, + { + "epoch": 0.8755560997510305, + "grad_norm": 1.8651432991027832, + "learning_rate": 1.990430793515744e-05, + "loss": 0.8022, + "step": 5363 + }, + { + "epoch": 0.875719358393535, + "grad_norm": 1.912885069847107, + "learning_rate": 1.990426413500411e-05, + "loss": 0.9028, + "step": 5364 + }, + { + "epoch": 0.8758826170360393, + "grad_norm": 1.8068113327026367, + "learning_rate": 1.9904220324877183e-05, + "loss": 0.7282, + "step": 5365 + }, + { + "epoch": 0.8760458756785438, + "grad_norm": 2.1601099967956543, + "learning_rate": 1.9904176504776707e-05, + "loss": 0.7525, + "step": 5366 + }, + { + "epoch": 0.8762091343210481, + "grad_norm": 2.074791669845581, + "learning_rate": 1.9904132674702734e-05, + "loss": 0.9304, + "step": 5367 + }, + { + "epoch": 0.8763723929635525, + "grad_norm": 1.9066044092178345, + "learning_rate": 1.99040888346553e-05, + "loss": 0.8537, + "step": 5368 + }, + { + "epoch": 0.8765356516060568, + "grad_norm": 1.7449294328689575, + "learning_rate": 1.990404498463445e-05, + "loss": 0.6296, + "step": 5369 + }, + { + "epoch": 0.8766989102485613, + "grad_norm": 1.786722183227539, + "learning_rate": 1.990400112464023e-05, + "loss": 0.7452, + "step": 5370 + }, + { + "epoch": 0.8768621688910657, + "grad_norm": 2.097872018814087, + "learning_rate": 1.9903957254672687e-05, + "loss": 0.8531, + "step": 5371 + }, + { + "epoch": 0.87702542753357, + "grad_norm": 2.0409600734710693, + "learning_rate": 1.9903913374731858e-05, + "loss": 0.8803, + "step": 5372 + }, + { + "epoch": 0.8771886861760745, + "grad_norm": 1.5766253471374512, + "learning_rate": 1.990386948481779e-05, + "loss": 0.6391, + "step": 5373 + }, + { + "epoch": 0.8773519448185788, + "grad_norm": 1.8633641004562378, + "learning_rate": 1.9903825584930527e-05, + "loss": 0.6918, + "step": 5374 + }, + { + "epoch": 0.8775152034610832, + "grad_norm": 2.0771825313568115, + "learning_rate": 1.9903781675070116e-05, + "loss": 0.9161, + "step": 5375 + }, + { + "epoch": 0.8776784621035876, + "grad_norm": 1.7609386444091797, + "learning_rate": 1.9903737755236604e-05, + "loss": 0.7634, + "step": 5376 + }, + { + "epoch": 0.877841720746092, + "grad_norm": 1.960425615310669, + "learning_rate": 1.9903693825430027e-05, + "loss": 0.8728, + "step": 5377 + }, + { + "epoch": 0.8780049793885963, + "grad_norm": 1.8830480575561523, + "learning_rate": 1.990364988565043e-05, + "loss": 0.7346, + "step": 5378 + }, + { + "epoch": 0.8781682380311008, + "grad_norm": 1.8873202800750732, + "learning_rate": 1.990360593589786e-05, + "loss": 0.7891, + "step": 5379 + }, + { + "epoch": 0.8783314966736051, + "grad_norm": 1.6141752004623413, + "learning_rate": 1.9903561976172367e-05, + "loss": 0.927, + "step": 5380 + }, + { + "epoch": 0.8784947553161095, + "grad_norm": 1.7607455253601074, + "learning_rate": 1.9903518006473987e-05, + "loss": 0.8128, + "step": 5381 + }, + { + "epoch": 0.878658013958614, + "grad_norm": 1.829599380493164, + "learning_rate": 1.9903474026802768e-05, + "loss": 0.763, + "step": 5382 + }, + { + "epoch": 0.8788212726011183, + "grad_norm": 2.2322628498077393, + "learning_rate": 1.9903430037158755e-05, + "loss": 0.7634, + "step": 5383 + }, + { + "epoch": 0.8789845312436227, + "grad_norm": 1.962327480316162, + "learning_rate": 1.9903386037541986e-05, + "loss": 0.8711, + "step": 5384 + }, + { + "epoch": 0.8791477898861271, + "grad_norm": 2.259624719619751, + "learning_rate": 1.9903342027952512e-05, + "loss": 0.846, + "step": 5385 + }, + { + "epoch": 0.8793110485286315, + "grad_norm": 1.8829166889190674, + "learning_rate": 1.9903298008390374e-05, + "loss": 0.8885, + "step": 5386 + }, + { + "epoch": 0.8794743071711358, + "grad_norm": 1.8962863683700562, + "learning_rate": 1.990325397885562e-05, + "loss": 0.8238, + "step": 5387 + }, + { + "epoch": 0.8796375658136403, + "grad_norm": 1.6922260522842407, + "learning_rate": 1.990320993934829e-05, + "loss": 0.6972, + "step": 5388 + }, + { + "epoch": 0.8798008244561446, + "grad_norm": 1.8908518552780151, + "learning_rate": 1.990316588986843e-05, + "loss": 0.7384, + "step": 5389 + }, + { + "epoch": 0.879964083098649, + "grad_norm": 2.1582534313201904, + "learning_rate": 1.9903121830416085e-05, + "loss": 0.9081, + "step": 5390 + }, + { + "epoch": 0.8801273417411534, + "grad_norm": 1.9674888849258423, + "learning_rate": 1.99030777609913e-05, + "loss": 0.8014, + "step": 5391 + }, + { + "epoch": 0.8802906003836578, + "grad_norm": 1.6110748052597046, + "learning_rate": 1.9903033681594117e-05, + "loss": 0.7339, + "step": 5392 + }, + { + "epoch": 0.8804538590261622, + "grad_norm": 1.792742133140564, + "learning_rate": 1.990298959222458e-05, + "loss": 0.7502, + "step": 5393 + }, + { + "epoch": 0.8806171176686666, + "grad_norm": 1.8898837566375732, + "learning_rate": 1.9902945492882738e-05, + "loss": 0.7956, + "step": 5394 + }, + { + "epoch": 0.880780376311171, + "grad_norm": 1.7538906335830688, + "learning_rate": 1.9902901383568633e-05, + "loss": 0.9086, + "step": 5395 + }, + { + "epoch": 0.8809436349536753, + "grad_norm": 1.9342808723449707, + "learning_rate": 1.9902857264282307e-05, + "loss": 0.8356, + "step": 5396 + }, + { + "epoch": 0.8811068935961798, + "grad_norm": 1.7108091115951538, + "learning_rate": 1.9902813135023805e-05, + "loss": 0.6881, + "step": 5397 + }, + { + "epoch": 0.8812701522386841, + "grad_norm": 1.8676313161849976, + "learning_rate": 1.9902768995793177e-05, + "loss": 0.8874, + "step": 5398 + }, + { + "epoch": 0.8814334108811885, + "grad_norm": 1.487164855003357, + "learning_rate": 1.990272484659046e-05, + "loss": 0.7379, + "step": 5399 + }, + { + "epoch": 0.8815966695236929, + "grad_norm": 1.4870518445968628, + "learning_rate": 1.9902680687415704e-05, + "loss": 0.6003, + "step": 5400 + }, + { + "epoch": 0.8817599281661973, + "grad_norm": 1.7540218830108643, + "learning_rate": 1.990263651826895e-05, + "loss": 0.8199, + "step": 5401 + }, + { + "epoch": 0.8819231868087016, + "grad_norm": 1.9954863786697388, + "learning_rate": 1.9902592339150243e-05, + "loss": 0.8971, + "step": 5402 + }, + { + "epoch": 0.8820864454512061, + "grad_norm": 1.8785138130187988, + "learning_rate": 1.990254815005963e-05, + "loss": 0.843, + "step": 5403 + }, + { + "epoch": 0.8822497040937105, + "grad_norm": 1.6195671558380127, + "learning_rate": 1.9902503950997154e-05, + "loss": 0.6432, + "step": 5404 + }, + { + "epoch": 0.8824129627362148, + "grad_norm": 1.957841396331787, + "learning_rate": 1.9902459741962856e-05, + "loss": 0.6774, + "step": 5405 + }, + { + "epoch": 0.8825762213787193, + "grad_norm": 1.8928015232086182, + "learning_rate": 1.9902415522956785e-05, + "loss": 0.7582, + "step": 5406 + }, + { + "epoch": 0.8827394800212236, + "grad_norm": 2.031609058380127, + "learning_rate": 1.9902371293978985e-05, + "loss": 0.8242, + "step": 5407 + }, + { + "epoch": 0.882902738663728, + "grad_norm": 1.8119521141052246, + "learning_rate": 1.99023270550295e-05, + "loss": 0.8074, + "step": 5408 + }, + { + "epoch": 0.8830659973062324, + "grad_norm": 2.09952449798584, + "learning_rate": 1.9902282806108372e-05, + "loss": 0.8683, + "step": 5409 + }, + { + "epoch": 0.8832292559487368, + "grad_norm": 1.6313326358795166, + "learning_rate": 1.9902238547215652e-05, + "loss": 0.7289, + "step": 5410 + }, + { + "epoch": 0.8833925145912411, + "grad_norm": 2.0314393043518066, + "learning_rate": 1.9902194278351375e-05, + "loss": 0.7627, + "step": 5411 + }, + { + "epoch": 0.8835557732337456, + "grad_norm": 1.949706792831421, + "learning_rate": 1.9902149999515593e-05, + "loss": 0.7523, + "step": 5412 + }, + { + "epoch": 0.8837190318762499, + "grad_norm": 1.7507826089859009, + "learning_rate": 1.9902105710708352e-05, + "loss": 0.8117, + "step": 5413 + }, + { + "epoch": 0.8838822905187543, + "grad_norm": 2.0059726238250732, + "learning_rate": 1.990206141192969e-05, + "loss": 0.7035, + "step": 5414 + }, + { + "epoch": 0.8840455491612588, + "grad_norm": 1.7472230195999146, + "learning_rate": 1.9902017103179655e-05, + "loss": 0.7694, + "step": 5415 + }, + { + "epoch": 0.8842088078037631, + "grad_norm": 1.9423251152038574, + "learning_rate": 1.990197278445829e-05, + "loss": 0.7938, + "step": 5416 + }, + { + "epoch": 0.8843720664462675, + "grad_norm": 2.387195587158203, + "learning_rate": 1.9901928455765644e-05, + "loss": 0.7778, + "step": 5417 + }, + { + "epoch": 0.8845353250887719, + "grad_norm": 2.100958824157715, + "learning_rate": 1.990188411710176e-05, + "loss": 0.8895, + "step": 5418 + }, + { + "epoch": 0.8846985837312763, + "grad_norm": 1.7963606119155884, + "learning_rate": 1.9901839768466677e-05, + "loss": 0.7657, + "step": 5419 + }, + { + "epoch": 0.8848618423737806, + "grad_norm": 1.9964659214019775, + "learning_rate": 1.9901795409860445e-05, + "loss": 0.8749, + "step": 5420 + }, + { + "epoch": 0.8850251010162851, + "grad_norm": 1.9309360980987549, + "learning_rate": 1.9901751041283108e-05, + "loss": 0.8827, + "step": 5421 + }, + { + "epoch": 0.8851883596587894, + "grad_norm": 1.8912358283996582, + "learning_rate": 1.990170666273471e-05, + "loss": 0.7277, + "step": 5422 + }, + { + "epoch": 0.8853516183012938, + "grad_norm": 1.998364806175232, + "learning_rate": 1.9901662274215298e-05, + "loss": 0.7564, + "step": 5423 + }, + { + "epoch": 0.8855148769437982, + "grad_norm": 2.4111011028289795, + "learning_rate": 1.9901617875724914e-05, + "loss": 0.9263, + "step": 5424 + }, + { + "epoch": 0.8856781355863026, + "grad_norm": 1.7396621704101562, + "learning_rate": 1.9901573467263603e-05, + "loss": 0.6814, + "step": 5425 + }, + { + "epoch": 0.885841394228807, + "grad_norm": 1.8112989664077759, + "learning_rate": 1.9901529048831407e-05, + "loss": 0.819, + "step": 5426 + }, + { + "epoch": 0.8860046528713114, + "grad_norm": 2.6161205768585205, + "learning_rate": 1.990148462042838e-05, + "loss": 0.7833, + "step": 5427 + }, + { + "epoch": 0.8861679115138158, + "grad_norm": 1.402883529663086, + "learning_rate": 1.9901440182054555e-05, + "loss": 0.5433, + "step": 5428 + }, + { + "epoch": 0.8863311701563201, + "grad_norm": 1.7268433570861816, + "learning_rate": 1.9901395733709987e-05, + "loss": 0.7172, + "step": 5429 + }, + { + "epoch": 0.8864944287988246, + "grad_norm": 1.9710057973861694, + "learning_rate": 1.9901351275394712e-05, + "loss": 0.7811, + "step": 5430 + }, + { + "epoch": 0.8866576874413289, + "grad_norm": 1.5883653163909912, + "learning_rate": 1.9901306807108783e-05, + "loss": 0.614, + "step": 5431 + }, + { + "epoch": 0.8868209460838333, + "grad_norm": 1.7911334037780762, + "learning_rate": 1.990126232885224e-05, + "loss": 0.737, + "step": 5432 + }, + { + "epoch": 0.8869842047263377, + "grad_norm": 1.984209418296814, + "learning_rate": 1.990121784062512e-05, + "loss": 0.841, + "step": 5433 + }, + { + "epoch": 0.8871474633688421, + "grad_norm": 1.9430984258651733, + "learning_rate": 1.9901173342427487e-05, + "loss": 0.7965, + "step": 5434 + }, + { + "epoch": 0.8873107220113465, + "grad_norm": 1.787397861480713, + "learning_rate": 1.990112883425937e-05, + "loss": 0.6982, + "step": 5435 + }, + { + "epoch": 0.8874739806538509, + "grad_norm": 1.8417294025421143, + "learning_rate": 1.990108431612082e-05, + "loss": 0.8915, + "step": 5436 + }, + { + "epoch": 0.8876372392963553, + "grad_norm": 1.75802743434906, + "learning_rate": 1.9901039788011883e-05, + "loss": 0.7694, + "step": 5437 + }, + { + "epoch": 0.8878004979388596, + "grad_norm": 1.8526890277862549, + "learning_rate": 1.99009952499326e-05, + "loss": 0.846, + "step": 5438 + }, + { + "epoch": 0.8879637565813641, + "grad_norm": 1.3793190717697144, + "learning_rate": 1.9900950701883014e-05, + "loss": 0.6121, + "step": 5439 + }, + { + "epoch": 0.8881270152238684, + "grad_norm": 1.8159807920455933, + "learning_rate": 1.990090614386318e-05, + "loss": 0.8367, + "step": 5440 + }, + { + "epoch": 0.8882902738663728, + "grad_norm": 1.8311798572540283, + "learning_rate": 1.9900861575873128e-05, + "loss": 0.7265, + "step": 5441 + }, + { + "epoch": 0.8884535325088772, + "grad_norm": 1.9749536514282227, + "learning_rate": 1.990081699791292e-05, + "loss": 0.7355, + "step": 5442 + }, + { + "epoch": 0.8886167911513816, + "grad_norm": 1.6932133436203003, + "learning_rate": 1.9900772409982582e-05, + "loss": 0.6625, + "step": 5443 + }, + { + "epoch": 0.8887800497938859, + "grad_norm": 1.8621851205825806, + "learning_rate": 1.9900727812082177e-05, + "loss": 0.8148, + "step": 5444 + }, + { + "epoch": 0.8889433084363904, + "grad_norm": 1.9090908765792847, + "learning_rate": 1.9900683204211737e-05, + "loss": 0.7586, + "step": 5445 + }, + { + "epoch": 0.8891065670788948, + "grad_norm": 1.8892121315002441, + "learning_rate": 1.9900638586371315e-05, + "loss": 0.7747, + "step": 5446 + }, + { + "epoch": 0.8892698257213991, + "grad_norm": 2.0352370738983154, + "learning_rate": 1.9900593958560953e-05, + "loss": 0.856, + "step": 5447 + }, + { + "epoch": 0.8894330843639036, + "grad_norm": 1.8563554286956787, + "learning_rate": 1.9900549320780692e-05, + "loss": 0.7798, + "step": 5448 + }, + { + "epoch": 0.8895963430064079, + "grad_norm": 1.806465983390808, + "learning_rate": 1.9900504673030582e-05, + "loss": 0.693, + "step": 5449 + }, + { + "epoch": 0.8897596016489123, + "grad_norm": 1.9736226797103882, + "learning_rate": 1.9900460015310667e-05, + "loss": 0.8577, + "step": 5450 + }, + { + "epoch": 0.8899228602914167, + "grad_norm": 1.6509897708892822, + "learning_rate": 1.9900415347620988e-05, + "loss": 0.8087, + "step": 5451 + }, + { + "epoch": 0.8900861189339211, + "grad_norm": 1.6485549211502075, + "learning_rate": 1.99003706699616e-05, + "loss": 0.6334, + "step": 5452 + }, + { + "epoch": 0.8902493775764254, + "grad_norm": 2.000493049621582, + "learning_rate": 1.9900325982332537e-05, + "loss": 0.7635, + "step": 5453 + }, + { + "epoch": 0.8904126362189299, + "grad_norm": 2.075079917907715, + "learning_rate": 1.9900281284733847e-05, + "loss": 0.777, + "step": 5454 + }, + { + "epoch": 0.8905758948614342, + "grad_norm": 1.787853479385376, + "learning_rate": 1.990023657716558e-05, + "loss": 0.8209, + "step": 5455 + }, + { + "epoch": 0.8907391535039386, + "grad_norm": 1.917310118675232, + "learning_rate": 1.9900191859627773e-05, + "loss": 0.7167, + "step": 5456 + }, + { + "epoch": 0.8909024121464431, + "grad_norm": 1.845294713973999, + "learning_rate": 1.9900147132120478e-05, + "loss": 0.6278, + "step": 5457 + }, + { + "epoch": 0.8910656707889474, + "grad_norm": 2.0440807342529297, + "learning_rate": 1.9900102394643738e-05, + "loss": 0.788, + "step": 5458 + }, + { + "epoch": 0.8912289294314518, + "grad_norm": 2.147150754928589, + "learning_rate": 1.9900057647197595e-05, + "loss": 0.9297, + "step": 5459 + }, + { + "epoch": 0.8913921880739561, + "grad_norm": 1.6638108491897583, + "learning_rate": 1.99000128897821e-05, + "loss": 0.7667, + "step": 5460 + }, + { + "epoch": 0.8915554467164606, + "grad_norm": 2.0987725257873535, + "learning_rate": 1.9899968122397293e-05, + "loss": 0.8054, + "step": 5461 + }, + { + "epoch": 0.8917187053589649, + "grad_norm": 1.8221789598464966, + "learning_rate": 1.989992334504322e-05, + "loss": 0.6381, + "step": 5462 + }, + { + "epoch": 0.8918819640014694, + "grad_norm": 1.7546521425247192, + "learning_rate": 1.989987855771993e-05, + "loss": 0.6581, + "step": 5463 + }, + { + "epoch": 0.8920452226439737, + "grad_norm": 1.8613066673278809, + "learning_rate": 1.989983376042746e-05, + "loss": 0.7308, + "step": 5464 + }, + { + "epoch": 0.8922084812864781, + "grad_norm": 2.0199716091156006, + "learning_rate": 1.9899788953165863e-05, + "loss": 0.9019, + "step": 5465 + }, + { + "epoch": 0.8923717399289824, + "grad_norm": 1.5132046937942505, + "learning_rate": 1.989974413593518e-05, + "loss": 0.6306, + "step": 5466 + }, + { + "epoch": 0.8925349985714869, + "grad_norm": 1.5626327991485596, + "learning_rate": 1.9899699308735458e-05, + "loss": 0.6759, + "step": 5467 + }, + { + "epoch": 0.8926982572139913, + "grad_norm": 1.9975075721740723, + "learning_rate": 1.989965447156674e-05, + "loss": 0.815, + "step": 5468 + }, + { + "epoch": 0.8928615158564956, + "grad_norm": 1.8193795680999756, + "learning_rate": 1.9899609624429077e-05, + "loss": 0.7413, + "step": 5469 + }, + { + "epoch": 0.8930247744990001, + "grad_norm": 1.783850908279419, + "learning_rate": 1.989956476732251e-05, + "loss": 0.7752, + "step": 5470 + }, + { + "epoch": 0.8931880331415044, + "grad_norm": 1.9898436069488525, + "learning_rate": 1.9899519900247077e-05, + "loss": 0.8762, + "step": 5471 + }, + { + "epoch": 0.8933512917840088, + "grad_norm": 1.9051214456558228, + "learning_rate": 1.9899475023202838e-05, + "loss": 0.7845, + "step": 5472 + }, + { + "epoch": 0.8935145504265132, + "grad_norm": 1.774433970451355, + "learning_rate": 1.9899430136189823e-05, + "loss": 0.7743, + "step": 5473 + }, + { + "epoch": 0.8936778090690176, + "grad_norm": 2.087049961090088, + "learning_rate": 1.9899385239208088e-05, + "loss": 0.8172, + "step": 5474 + }, + { + "epoch": 0.8938410677115219, + "grad_norm": 1.5513324737548828, + "learning_rate": 1.9899340332257676e-05, + "loss": 0.734, + "step": 5475 + }, + { + "epoch": 0.8940043263540264, + "grad_norm": 1.6456248760223389, + "learning_rate": 1.989929541533863e-05, + "loss": 0.8016, + "step": 5476 + }, + { + "epoch": 0.8941675849965307, + "grad_norm": 1.6411423683166504, + "learning_rate": 1.9899250488450993e-05, + "loss": 0.708, + "step": 5477 + }, + { + "epoch": 0.8943308436390351, + "grad_norm": 1.74517023563385, + "learning_rate": 1.9899205551594816e-05, + "loss": 0.7394, + "step": 5478 + }, + { + "epoch": 0.8944941022815396, + "grad_norm": 1.9233719110488892, + "learning_rate": 1.989916060477014e-05, + "loss": 0.7487, + "step": 5479 + }, + { + "epoch": 0.8946573609240439, + "grad_norm": 2.1521646976470947, + "learning_rate": 1.9899115647977015e-05, + "loss": 0.7456, + "step": 5480 + }, + { + "epoch": 0.8948206195665483, + "grad_norm": 1.9030790328979492, + "learning_rate": 1.989907068121548e-05, + "loss": 0.699, + "step": 5481 + }, + { + "epoch": 0.8949838782090527, + "grad_norm": 1.69891357421875, + "learning_rate": 1.9899025704485585e-05, + "loss": 0.8169, + "step": 5482 + }, + { + "epoch": 0.8951471368515571, + "grad_norm": 2.1596570014953613, + "learning_rate": 1.9898980717787374e-05, + "loss": 0.904, + "step": 5483 + }, + { + "epoch": 0.8953103954940614, + "grad_norm": 2.141803741455078, + "learning_rate": 1.9898935721120892e-05, + "loss": 0.6872, + "step": 5484 + }, + { + "epoch": 0.8954736541365659, + "grad_norm": 1.8917534351348877, + "learning_rate": 1.9898890714486182e-05, + "loss": 0.8662, + "step": 5485 + }, + { + "epoch": 0.8956369127790702, + "grad_norm": 1.811232566833496, + "learning_rate": 1.9898845697883296e-05, + "loss": 0.8044, + "step": 5486 + }, + { + "epoch": 0.8958001714215746, + "grad_norm": 1.9061232805252075, + "learning_rate": 1.9898800671312273e-05, + "loss": 0.69, + "step": 5487 + }, + { + "epoch": 0.895963430064079, + "grad_norm": 1.9330824613571167, + "learning_rate": 1.989875563477316e-05, + "loss": 0.8772, + "step": 5488 + }, + { + "epoch": 0.8961266887065834, + "grad_norm": 1.7840790748596191, + "learning_rate": 1.9898710588266002e-05, + "loss": 0.7764, + "step": 5489 + }, + { + "epoch": 0.8962899473490878, + "grad_norm": 1.415550708770752, + "learning_rate": 1.9898665531790846e-05, + "loss": 0.5857, + "step": 5490 + }, + { + "epoch": 0.8964532059915922, + "grad_norm": 1.6764709949493408, + "learning_rate": 1.9898620465347735e-05, + "loss": 0.6888, + "step": 5491 + }, + { + "epoch": 0.8966164646340966, + "grad_norm": 1.9158803224563599, + "learning_rate": 1.9898575388936717e-05, + "loss": 0.7512, + "step": 5492 + }, + { + "epoch": 0.8967797232766009, + "grad_norm": 2.0202760696411133, + "learning_rate": 1.9898530302557836e-05, + "loss": 0.818, + "step": 5493 + }, + { + "epoch": 0.8969429819191054, + "grad_norm": 1.768582820892334, + "learning_rate": 1.9898485206211138e-05, + "loss": 0.7236, + "step": 5494 + }, + { + "epoch": 0.8971062405616097, + "grad_norm": 1.9406176805496216, + "learning_rate": 1.989844009989667e-05, + "loss": 0.8035, + "step": 5495 + }, + { + "epoch": 0.8972694992041141, + "grad_norm": 2.0315680503845215, + "learning_rate": 1.989839498361447e-05, + "loss": 0.7808, + "step": 5496 + }, + { + "epoch": 0.8974327578466185, + "grad_norm": 1.77961266040802, + "learning_rate": 1.9898349857364594e-05, + "loss": 0.7149, + "step": 5497 + }, + { + "epoch": 0.8975960164891229, + "grad_norm": 2.0483458042144775, + "learning_rate": 1.989830472114708e-05, + "loss": 0.8953, + "step": 5498 + }, + { + "epoch": 0.8977592751316272, + "grad_norm": 2.363706350326538, + "learning_rate": 1.9898259574961977e-05, + "loss": 0.854, + "step": 5499 + }, + { + "epoch": 0.8979225337741317, + "grad_norm": 2.0008885860443115, + "learning_rate": 1.989821441880933e-05, + "loss": 0.7716, + "step": 5500 + }, + { + "epoch": 0.8980857924166361, + "grad_norm": 1.823672890663147, + "learning_rate": 1.989816925268918e-05, + "loss": 0.6495, + "step": 5501 + }, + { + "epoch": 0.8982490510591404, + "grad_norm": 1.8799450397491455, + "learning_rate": 1.9898124076601578e-05, + "loss": 0.8637, + "step": 5502 + }, + { + "epoch": 0.8984123097016449, + "grad_norm": 1.800307035446167, + "learning_rate": 1.989807889054657e-05, + "loss": 0.7077, + "step": 5503 + }, + { + "epoch": 0.8985755683441492, + "grad_norm": 1.9598329067230225, + "learning_rate": 1.9898033694524196e-05, + "loss": 0.7876, + "step": 5504 + }, + { + "epoch": 0.8987388269866536, + "grad_norm": 1.796534776687622, + "learning_rate": 1.9897988488534508e-05, + "loss": 0.6227, + "step": 5505 + }, + { + "epoch": 0.898902085629158, + "grad_norm": 1.6570171117782593, + "learning_rate": 1.9897943272577546e-05, + "loss": 0.7332, + "step": 5506 + }, + { + "epoch": 0.8990653442716624, + "grad_norm": 1.778590440750122, + "learning_rate": 1.9897898046653358e-05, + "loss": 0.7386, + "step": 5507 + }, + { + "epoch": 0.8992286029141667, + "grad_norm": 1.4596481323242188, + "learning_rate": 1.9897852810761987e-05, + "loss": 0.6164, + "step": 5508 + }, + { + "epoch": 0.8993918615566712, + "grad_norm": 1.9816324710845947, + "learning_rate": 1.9897807564903485e-05, + "loss": 0.8267, + "step": 5509 + }, + { + "epoch": 0.8995551201991755, + "grad_norm": 1.7053388357162476, + "learning_rate": 1.989776230907789e-05, + "loss": 0.7752, + "step": 5510 + }, + { + "epoch": 0.8997183788416799, + "grad_norm": 2.085296392440796, + "learning_rate": 1.9897717043285255e-05, + "loss": 0.8386, + "step": 5511 + }, + { + "epoch": 0.8998816374841844, + "grad_norm": 1.8707584142684937, + "learning_rate": 1.989767176752562e-05, + "loss": 0.8964, + "step": 5512 + }, + { + "epoch": 0.9000448961266887, + "grad_norm": 1.8359700441360474, + "learning_rate": 1.989762648179903e-05, + "loss": 0.7131, + "step": 5513 + }, + { + "epoch": 0.9002081547691931, + "grad_norm": 1.7875401973724365, + "learning_rate": 1.989758118610553e-05, + "loss": 0.7612, + "step": 5514 + }, + { + "epoch": 0.9003714134116975, + "grad_norm": 1.903703212738037, + "learning_rate": 1.9897535880445174e-05, + "loss": 0.8097, + "step": 5515 + }, + { + "epoch": 0.9005346720542019, + "grad_norm": 2.0160763263702393, + "learning_rate": 1.9897490564818e-05, + "loss": 0.7252, + "step": 5516 + }, + { + "epoch": 0.9006979306967062, + "grad_norm": 1.8471753597259521, + "learning_rate": 1.989744523922406e-05, + "loss": 0.7367, + "step": 5517 + }, + { + "epoch": 0.9008611893392107, + "grad_norm": 1.9934808015823364, + "learning_rate": 1.989739990366339e-05, + "loss": 0.7985, + "step": 5518 + }, + { + "epoch": 0.901024447981715, + "grad_norm": 2.1980695724487305, + "learning_rate": 1.989735455813604e-05, + "loss": 0.9086, + "step": 5519 + }, + { + "epoch": 0.9011877066242194, + "grad_norm": 1.5855870246887207, + "learning_rate": 1.989730920264206e-05, + "loss": 0.7498, + "step": 5520 + }, + { + "epoch": 0.9013509652667238, + "grad_norm": 1.9430112838745117, + "learning_rate": 1.9897263837181492e-05, + "loss": 0.6587, + "step": 5521 + }, + { + "epoch": 0.9015142239092282, + "grad_norm": 1.9637012481689453, + "learning_rate": 1.989721846175438e-05, + "loss": 0.7715, + "step": 5522 + }, + { + "epoch": 0.9016774825517326, + "grad_norm": 1.6537851095199585, + "learning_rate": 1.989717307636077e-05, + "loss": 0.7607, + "step": 5523 + }, + { + "epoch": 0.901840741194237, + "grad_norm": 2.33439302444458, + "learning_rate": 1.9897127681000714e-05, + "loss": 0.9548, + "step": 5524 + }, + { + "epoch": 0.9020039998367414, + "grad_norm": 1.4771759510040283, + "learning_rate": 1.9897082275674252e-05, + "loss": 0.6319, + "step": 5525 + }, + { + "epoch": 0.9021672584792457, + "grad_norm": 1.8442362546920776, + "learning_rate": 1.989703686038143e-05, + "loss": 0.7391, + "step": 5526 + }, + { + "epoch": 0.9023305171217502, + "grad_norm": 2.017043113708496, + "learning_rate": 1.9896991435122294e-05, + "loss": 0.7818, + "step": 5527 + }, + { + "epoch": 0.9024937757642545, + "grad_norm": 2.36134672164917, + "learning_rate": 1.9896945999896887e-05, + "loss": 1.4161, + "step": 5528 + }, + { + "epoch": 0.9026570344067589, + "grad_norm": 1.7070966958999634, + "learning_rate": 1.9896900554705264e-05, + "loss": 0.7795, + "step": 5529 + }, + { + "epoch": 0.9028202930492633, + "grad_norm": 2.256730794906616, + "learning_rate": 1.9896855099547462e-05, + "loss": 0.895, + "step": 5530 + }, + { + "epoch": 0.9029835516917677, + "grad_norm": 2.093231201171875, + "learning_rate": 1.989680963442353e-05, + "loss": 0.7501, + "step": 5531 + }, + { + "epoch": 0.903146810334272, + "grad_norm": 2.1362226009368896, + "learning_rate": 1.989676415933351e-05, + "loss": 0.8682, + "step": 5532 + }, + { + "epoch": 0.9033100689767765, + "grad_norm": 1.9808677434921265, + "learning_rate": 1.989671867427746e-05, + "loss": 0.8651, + "step": 5533 + }, + { + "epoch": 0.9034733276192809, + "grad_norm": 1.87907874584198, + "learning_rate": 1.989667317925541e-05, + "loss": 0.8305, + "step": 5534 + }, + { + "epoch": 0.9036365862617852, + "grad_norm": 1.8500330448150635, + "learning_rate": 1.9896627674267414e-05, + "loss": 0.8166, + "step": 5535 + }, + { + "epoch": 0.9037998449042897, + "grad_norm": 2.022334098815918, + "learning_rate": 1.9896582159313517e-05, + "loss": 0.9245, + "step": 5536 + }, + { + "epoch": 0.903963103546794, + "grad_norm": 2.089448928833008, + "learning_rate": 1.9896536634393762e-05, + "loss": 0.7491, + "step": 5537 + }, + { + "epoch": 0.9041263621892984, + "grad_norm": 1.7539336681365967, + "learning_rate": 1.98964910995082e-05, + "loss": 0.8298, + "step": 5538 + }, + { + "epoch": 0.9042896208318028, + "grad_norm": 1.7201881408691406, + "learning_rate": 1.989644555465687e-05, + "loss": 0.9033, + "step": 5539 + }, + { + "epoch": 0.9044528794743072, + "grad_norm": 1.544772744178772, + "learning_rate": 1.9896399999839828e-05, + "loss": 0.6645, + "step": 5540 + }, + { + "epoch": 0.9046161381168115, + "grad_norm": 1.993411660194397, + "learning_rate": 1.989635443505711e-05, + "loss": 0.6866, + "step": 5541 + }, + { + "epoch": 0.904779396759316, + "grad_norm": 1.773689866065979, + "learning_rate": 1.989630886030877e-05, + "loss": 0.6561, + "step": 5542 + }, + { + "epoch": 0.9049426554018203, + "grad_norm": 1.8346515893936157, + "learning_rate": 1.989626327559484e-05, + "loss": 0.8791, + "step": 5543 + }, + { + "epoch": 0.9051059140443247, + "grad_norm": 1.9989535808563232, + "learning_rate": 1.9896217680915388e-05, + "loss": 0.7248, + "step": 5544 + }, + { + "epoch": 0.9052691726868292, + "grad_norm": 1.7388193607330322, + "learning_rate": 1.9896172076270436e-05, + "loss": 0.6694, + "step": 5545 + }, + { + "epoch": 0.9054324313293335, + "grad_norm": 1.682182788848877, + "learning_rate": 1.989612646166005e-05, + "loss": 0.8053, + "step": 5546 + }, + { + "epoch": 0.9055956899718379, + "grad_norm": 1.5830105543136597, + "learning_rate": 1.989608083708426e-05, + "loss": 0.7209, + "step": 5547 + }, + { + "epoch": 0.9057589486143423, + "grad_norm": 1.7821048498153687, + "learning_rate": 1.9896035202543124e-05, + "loss": 0.745, + "step": 5548 + }, + { + "epoch": 0.9059222072568467, + "grad_norm": 1.872824788093567, + "learning_rate": 1.9895989558036684e-05, + "loss": 0.7568, + "step": 5549 + }, + { + "epoch": 0.906085465899351, + "grad_norm": 1.8558952808380127, + "learning_rate": 1.989594390356498e-05, + "loss": 0.6495, + "step": 5550 + }, + { + "epoch": 0.9062487245418555, + "grad_norm": 1.6698013544082642, + "learning_rate": 1.9895898239128072e-05, + "loss": 0.6811, + "step": 5551 + }, + { + "epoch": 0.9064119831843598, + "grad_norm": 1.519544243812561, + "learning_rate": 1.989585256472599e-05, + "loss": 0.6459, + "step": 5552 + }, + { + "epoch": 0.9065752418268642, + "grad_norm": 1.902032494544983, + "learning_rate": 1.9895806880358788e-05, + "loss": 0.9176, + "step": 5553 + }, + { + "epoch": 0.9067385004693685, + "grad_norm": 1.7615669965744019, + "learning_rate": 1.989576118602651e-05, + "loss": 0.9291, + "step": 5554 + }, + { + "epoch": 0.906901759111873, + "grad_norm": 1.9209628105163574, + "learning_rate": 1.9895715481729207e-05, + "loss": 0.7969, + "step": 5555 + }, + { + "epoch": 0.9070650177543774, + "grad_norm": 1.75411057472229, + "learning_rate": 1.989566976746692e-05, + "loss": 0.6233, + "step": 5556 + }, + { + "epoch": 0.9072282763968817, + "grad_norm": 1.8862360715866089, + "learning_rate": 1.9895624043239696e-05, + "loss": 0.7266, + "step": 5557 + }, + { + "epoch": 0.9073915350393862, + "grad_norm": 1.9569650888442993, + "learning_rate": 1.9895578309047577e-05, + "loss": 0.8652, + "step": 5558 + }, + { + "epoch": 0.9075547936818905, + "grad_norm": 2.200761079788208, + "learning_rate": 1.9895532564890616e-05, + "loss": 1.0836, + "step": 5559 + }, + { + "epoch": 0.907718052324395, + "grad_norm": 2.2112808227539062, + "learning_rate": 1.989548681076886e-05, + "loss": 0.8381, + "step": 5560 + }, + { + "epoch": 0.9078813109668993, + "grad_norm": 1.8368815183639526, + "learning_rate": 1.9895441046682345e-05, + "loss": 0.7016, + "step": 5561 + }, + { + "epoch": 0.9080445696094037, + "grad_norm": 2.1557767391204834, + "learning_rate": 1.989539527263113e-05, + "loss": 0.7825, + "step": 5562 + }, + { + "epoch": 0.908207828251908, + "grad_norm": 1.9988001585006714, + "learning_rate": 1.9895349488615248e-05, + "loss": 1.0115, + "step": 5563 + }, + { + "epoch": 0.9083710868944125, + "grad_norm": 1.884873390197754, + "learning_rate": 1.9895303694634756e-05, + "loss": 0.7757, + "step": 5564 + }, + { + "epoch": 0.9085343455369168, + "grad_norm": 1.9829567670822144, + "learning_rate": 1.9895257890689698e-05, + "loss": 0.8484, + "step": 5565 + }, + { + "epoch": 0.9086976041794212, + "grad_norm": 1.9736597537994385, + "learning_rate": 1.9895212076780113e-05, + "loss": 0.8061, + "step": 5566 + }, + { + "epoch": 0.9088608628219257, + "grad_norm": 1.860804557800293, + "learning_rate": 1.9895166252906053e-05, + "loss": 0.9104, + "step": 5567 + }, + { + "epoch": 0.90902412146443, + "grad_norm": 2.1515371799468994, + "learning_rate": 1.9895120419067565e-05, + "loss": 0.8432, + "step": 5568 + }, + { + "epoch": 0.9091873801069345, + "grad_norm": 2.1283607482910156, + "learning_rate": 1.9895074575264694e-05, + "loss": 0.6855, + "step": 5569 + }, + { + "epoch": 0.9093506387494388, + "grad_norm": 1.998544454574585, + "learning_rate": 1.9895028721497482e-05, + "loss": 0.7761, + "step": 5570 + }, + { + "epoch": 0.9095138973919432, + "grad_norm": 1.8876937627792358, + "learning_rate": 1.9894982857765982e-05, + "loss": 0.8624, + "step": 5571 + }, + { + "epoch": 0.9096771560344475, + "grad_norm": 1.6461340188980103, + "learning_rate": 1.9894936984070234e-05, + "loss": 0.7732, + "step": 5572 + }, + { + "epoch": 0.909840414676952, + "grad_norm": 2.3960626125335693, + "learning_rate": 1.989489110041029e-05, + "loss": 0.7479, + "step": 5573 + }, + { + "epoch": 0.9100036733194563, + "grad_norm": 1.6843887567520142, + "learning_rate": 1.9894845206786192e-05, + "loss": 0.6725, + "step": 5574 + }, + { + "epoch": 0.9101669319619607, + "grad_norm": 2.2510833740234375, + "learning_rate": 1.9894799303197987e-05, + "loss": 0.7124, + "step": 5575 + }, + { + "epoch": 0.9103301906044651, + "grad_norm": 1.7971994876861572, + "learning_rate": 1.9894753389645723e-05, + "loss": 0.7557, + "step": 5576 + }, + { + "epoch": 0.9104934492469695, + "grad_norm": 1.6881722211837769, + "learning_rate": 1.9894707466129444e-05, + "loss": 0.6575, + "step": 5577 + }, + { + "epoch": 0.910656707889474, + "grad_norm": 1.8837109804153442, + "learning_rate": 1.98946615326492e-05, + "loss": 0.6754, + "step": 5578 + }, + { + "epoch": 0.9108199665319783, + "grad_norm": 2.2239246368408203, + "learning_rate": 1.9894615589205027e-05, + "loss": 0.882, + "step": 5579 + }, + { + "epoch": 0.9109832251744827, + "grad_norm": 1.5459859371185303, + "learning_rate": 1.9894569635796987e-05, + "loss": 0.6854, + "step": 5580 + }, + { + "epoch": 0.911146483816987, + "grad_norm": 1.7673524618148804, + "learning_rate": 1.989452367242511e-05, + "loss": 0.715, + "step": 5581 + }, + { + "epoch": 0.9113097424594915, + "grad_norm": 2.1071817874908447, + "learning_rate": 1.989447769908946e-05, + "loss": 0.801, + "step": 5582 + }, + { + "epoch": 0.9114730011019958, + "grad_norm": 1.8170974254608154, + "learning_rate": 1.989443171579007e-05, + "loss": 0.6932, + "step": 5583 + }, + { + "epoch": 0.9116362597445002, + "grad_norm": 1.7549247741699219, + "learning_rate": 1.9894385722526984e-05, + "loss": 0.7693, + "step": 5584 + }, + { + "epoch": 0.9117995183870046, + "grad_norm": 1.7795149087905884, + "learning_rate": 1.989433971930026e-05, + "loss": 0.834, + "step": 5585 + }, + { + "epoch": 0.911962777029509, + "grad_norm": 1.735330581665039, + "learning_rate": 1.9894293706109936e-05, + "loss": 0.657, + "step": 5586 + }, + { + "epoch": 0.9121260356720133, + "grad_norm": 2.176182508468628, + "learning_rate": 1.9894247682956064e-05, + "loss": 0.8754, + "step": 5587 + }, + { + "epoch": 0.9122892943145178, + "grad_norm": 1.7211261987686157, + "learning_rate": 1.9894201649838686e-05, + "loss": 0.7445, + "step": 5588 + }, + { + "epoch": 0.9124525529570222, + "grad_norm": 1.876654028892517, + "learning_rate": 1.9894155606757846e-05, + "loss": 0.7932, + "step": 5589 + }, + { + "epoch": 0.9126158115995265, + "grad_norm": 2.7031610012054443, + "learning_rate": 1.9894109553713597e-05, + "loss": 0.7621, + "step": 5590 + }, + { + "epoch": 0.912779070242031, + "grad_norm": 1.8352471590042114, + "learning_rate": 1.9894063490705982e-05, + "loss": 0.7445, + "step": 5591 + }, + { + "epoch": 0.9129423288845353, + "grad_norm": 1.967947006225586, + "learning_rate": 1.9894017417735046e-05, + "loss": 0.6841, + "step": 5592 + }, + { + "epoch": 0.9131055875270397, + "grad_norm": 1.8985121250152588, + "learning_rate": 1.989397133480084e-05, + "loss": 0.8033, + "step": 5593 + }, + { + "epoch": 0.9132688461695441, + "grad_norm": 1.9212956428527832, + "learning_rate": 1.9893925241903405e-05, + "loss": 0.8779, + "step": 5594 + }, + { + "epoch": 0.9134321048120485, + "grad_norm": 2.0976109504699707, + "learning_rate": 1.9893879139042795e-05, + "loss": 0.8377, + "step": 5595 + }, + { + "epoch": 0.9135953634545528, + "grad_norm": 1.8942443132400513, + "learning_rate": 1.9893833026219044e-05, + "loss": 0.7014, + "step": 5596 + }, + { + "epoch": 0.9137586220970573, + "grad_norm": 2.207634687423706, + "learning_rate": 1.989378690343221e-05, + "loss": 0.7141, + "step": 5597 + }, + { + "epoch": 0.9139218807395616, + "grad_norm": 1.9334526062011719, + "learning_rate": 1.9893740770682334e-05, + "loss": 0.9303, + "step": 5598 + }, + { + "epoch": 0.914085139382066, + "grad_norm": 1.7855514287948608, + "learning_rate": 1.9893694627969464e-05, + "loss": 0.6626, + "step": 5599 + }, + { + "epoch": 0.9142483980245705, + "grad_norm": 1.9264668226242065, + "learning_rate": 1.9893648475293646e-05, + "loss": 0.7269, + "step": 5600 + }, + { + "epoch": 0.9144116566670748, + "grad_norm": 1.5973302125930786, + "learning_rate": 1.989360231265493e-05, + "loss": 0.6111, + "step": 5601 + }, + { + "epoch": 0.9145749153095792, + "grad_norm": 2.0852298736572266, + "learning_rate": 1.9893556140053352e-05, + "loss": 0.949, + "step": 5602 + }, + { + "epoch": 0.9147381739520836, + "grad_norm": 2.078183889389038, + "learning_rate": 1.989350995748897e-05, + "loss": 0.8123, + "step": 5603 + }, + { + "epoch": 0.914901432594588, + "grad_norm": 1.89762544631958, + "learning_rate": 1.989346376496183e-05, + "loss": 0.8302, + "step": 5604 + }, + { + "epoch": 0.9150646912370923, + "grad_norm": 1.9650306701660156, + "learning_rate": 1.9893417562471966e-05, + "loss": 0.8212, + "step": 5605 + }, + { + "epoch": 0.9152279498795968, + "grad_norm": 2.1041293144226074, + "learning_rate": 1.989337135001944e-05, + "loss": 1.0749, + "step": 5606 + }, + { + "epoch": 0.9153912085221011, + "grad_norm": 1.7313833236694336, + "learning_rate": 1.9893325127604287e-05, + "loss": 0.6885, + "step": 5607 + }, + { + "epoch": 0.9155544671646055, + "grad_norm": 1.7360668182373047, + "learning_rate": 1.989327889522656e-05, + "loss": 0.6979, + "step": 5608 + }, + { + "epoch": 0.9157177258071099, + "grad_norm": 2.13582706451416, + "learning_rate": 1.9893232652886306e-05, + "loss": 0.8396, + "step": 5609 + }, + { + "epoch": 0.9158809844496143, + "grad_norm": 1.4471087455749512, + "learning_rate": 1.9893186400583568e-05, + "loss": 0.5425, + "step": 5610 + }, + { + "epoch": 0.9160442430921187, + "grad_norm": 1.9984731674194336, + "learning_rate": 1.9893140138318394e-05, + "loss": 0.793, + "step": 5611 + }, + { + "epoch": 0.9162075017346231, + "grad_norm": 1.6998302936553955, + "learning_rate": 1.9893093866090828e-05, + "loss": 0.75, + "step": 5612 + }, + { + "epoch": 0.9163707603771275, + "grad_norm": 1.7694296836853027, + "learning_rate": 1.9893047583900918e-05, + "loss": 0.7792, + "step": 5613 + }, + { + "epoch": 0.9165340190196318, + "grad_norm": 1.841478943824768, + "learning_rate": 1.9893001291748715e-05, + "loss": 0.6653, + "step": 5614 + }, + { + "epoch": 0.9166972776621363, + "grad_norm": 1.9038026332855225, + "learning_rate": 1.9892954989634263e-05, + "loss": 0.7349, + "step": 5615 + }, + { + "epoch": 0.9168605363046406, + "grad_norm": 1.9097278118133545, + "learning_rate": 1.989290867755761e-05, + "loss": 0.7502, + "step": 5616 + }, + { + "epoch": 0.917023794947145, + "grad_norm": 1.634843349456787, + "learning_rate": 1.9892862355518793e-05, + "loss": 0.695, + "step": 5617 + }, + { + "epoch": 0.9171870535896494, + "grad_norm": 1.6406042575836182, + "learning_rate": 1.9892816023517874e-05, + "loss": 0.7695, + "step": 5618 + }, + { + "epoch": 0.9173503122321538, + "grad_norm": 2.134953737258911, + "learning_rate": 1.9892769681554885e-05, + "loss": 0.7551, + "step": 5619 + }, + { + "epoch": 0.9175135708746581, + "grad_norm": 1.660075306892395, + "learning_rate": 1.9892723329629885e-05, + "loss": 0.6419, + "step": 5620 + }, + { + "epoch": 0.9176768295171626, + "grad_norm": 2.1675283908843994, + "learning_rate": 1.9892676967742912e-05, + "loss": 0.9836, + "step": 5621 + }, + { + "epoch": 0.917840088159667, + "grad_norm": 1.8870595693588257, + "learning_rate": 1.989263059589402e-05, + "loss": 0.8409, + "step": 5622 + }, + { + "epoch": 0.9180033468021713, + "grad_norm": 1.700561285018921, + "learning_rate": 1.9892584214083247e-05, + "loss": 0.7769, + "step": 5623 + }, + { + "epoch": 0.9181666054446758, + "grad_norm": 2.0961501598358154, + "learning_rate": 1.989253782231065e-05, + "loss": 0.953, + "step": 5624 + }, + { + "epoch": 0.9183298640871801, + "grad_norm": 1.9159821271896362, + "learning_rate": 1.9892491420576265e-05, + "loss": 0.8213, + "step": 5625 + }, + { + "epoch": 0.9184931227296845, + "grad_norm": 1.900506615638733, + "learning_rate": 1.989244500888014e-05, + "loss": 0.8112, + "step": 5626 + }, + { + "epoch": 0.9186563813721889, + "grad_norm": 2.0761029720306396, + "learning_rate": 1.9892398587222336e-05, + "loss": 0.8352, + "step": 5627 + }, + { + "epoch": 0.9188196400146933, + "grad_norm": 1.85646390914917, + "learning_rate": 1.989235215560288e-05, + "loss": 0.6232, + "step": 5628 + }, + { + "epoch": 0.9189828986571976, + "grad_norm": 1.524402141571045, + "learning_rate": 1.9892305714021832e-05, + "loss": 0.6017, + "step": 5629 + }, + { + "epoch": 0.9191461572997021, + "grad_norm": 1.9812732934951782, + "learning_rate": 1.9892259262479238e-05, + "loss": 0.6919, + "step": 5630 + }, + { + "epoch": 0.9193094159422064, + "grad_norm": 1.702762484550476, + "learning_rate": 1.9892212800975136e-05, + "loss": 0.8046, + "step": 5631 + }, + { + "epoch": 0.9194726745847108, + "grad_norm": 1.7462058067321777, + "learning_rate": 1.989216632950958e-05, + "loss": 0.7064, + "step": 5632 + }, + { + "epoch": 0.9196359332272153, + "grad_norm": 1.877686619758606, + "learning_rate": 1.9892119848082615e-05, + "loss": 0.875, + "step": 5633 + }, + { + "epoch": 0.9197991918697196, + "grad_norm": 1.9059504270553589, + "learning_rate": 1.9892073356694287e-05, + "loss": 0.7883, + "step": 5634 + }, + { + "epoch": 0.919962450512224, + "grad_norm": 1.660733699798584, + "learning_rate": 1.9892026855344648e-05, + "loss": 0.7094, + "step": 5635 + }, + { + "epoch": 0.9201257091547284, + "grad_norm": 1.8124175071716309, + "learning_rate": 1.989198034403374e-05, + "loss": 0.7422, + "step": 5636 + }, + { + "epoch": 0.9202889677972328, + "grad_norm": 1.7042763233184814, + "learning_rate": 1.9891933822761603e-05, + "loss": 0.724, + "step": 5637 + }, + { + "epoch": 0.9204522264397371, + "grad_norm": 1.8302861452102661, + "learning_rate": 1.98918872915283e-05, + "loss": 0.6867, + "step": 5638 + }, + { + "epoch": 0.9206154850822416, + "grad_norm": 2.1161882877349854, + "learning_rate": 1.9891840750333864e-05, + "loss": 0.8932, + "step": 5639 + }, + { + "epoch": 0.9207787437247459, + "grad_norm": 1.601354718208313, + "learning_rate": 1.989179419917835e-05, + "loss": 0.5357, + "step": 5640 + }, + { + "epoch": 0.9209420023672503, + "grad_norm": 1.6711939573287964, + "learning_rate": 1.98917476380618e-05, + "loss": 0.6833, + "step": 5641 + }, + { + "epoch": 0.9211052610097547, + "grad_norm": 1.7503052949905396, + "learning_rate": 1.9891701066984264e-05, + "loss": 0.7157, + "step": 5642 + }, + { + "epoch": 0.9212685196522591, + "grad_norm": 1.9473214149475098, + "learning_rate": 1.9891654485945786e-05, + "loss": 0.9499, + "step": 5643 + }, + { + "epoch": 0.9214317782947635, + "grad_norm": 1.9269390106201172, + "learning_rate": 1.9891607894946413e-05, + "loss": 0.7534, + "step": 5644 + }, + { + "epoch": 0.9215950369372679, + "grad_norm": 1.6406761407852173, + "learning_rate": 1.9891561293986197e-05, + "loss": 0.7171, + "step": 5645 + }, + { + "epoch": 0.9217582955797723, + "grad_norm": 2.1362133026123047, + "learning_rate": 1.9891514683065183e-05, + "loss": 0.9696, + "step": 5646 + }, + { + "epoch": 0.9219215542222766, + "grad_norm": 1.5797624588012695, + "learning_rate": 1.9891468062183413e-05, + "loss": 0.6037, + "step": 5647 + }, + { + "epoch": 0.9220848128647811, + "grad_norm": 1.6846390962600708, + "learning_rate": 1.989142143134094e-05, + "loss": 0.7527, + "step": 5648 + }, + { + "epoch": 0.9222480715072854, + "grad_norm": 2.2771694660186768, + "learning_rate": 1.9891374790537804e-05, + "loss": 1.0315, + "step": 5649 + }, + { + "epoch": 0.9224113301497898, + "grad_norm": 1.8506041765213013, + "learning_rate": 1.9891328139774057e-05, + "loss": 0.7388, + "step": 5650 + }, + { + "epoch": 0.9225745887922941, + "grad_norm": 1.8565744161605835, + "learning_rate": 1.9891281479049748e-05, + "loss": 0.6494, + "step": 5651 + }, + { + "epoch": 0.9227378474347986, + "grad_norm": 1.9432196617126465, + "learning_rate": 1.9891234808364917e-05, + "loss": 0.8302, + "step": 5652 + }, + { + "epoch": 0.9229011060773029, + "grad_norm": 1.4637361764907837, + "learning_rate": 1.989118812771962e-05, + "loss": 0.5649, + "step": 5653 + }, + { + "epoch": 0.9230643647198074, + "grad_norm": 1.5249428749084473, + "learning_rate": 1.9891141437113896e-05, + "loss": 0.6706, + "step": 5654 + }, + { + "epoch": 0.9232276233623118, + "grad_norm": 1.7894009351730347, + "learning_rate": 1.9891094736547796e-05, + "loss": 0.7278, + "step": 5655 + }, + { + "epoch": 0.9233908820048161, + "grad_norm": 1.8121771812438965, + "learning_rate": 1.9891048026021368e-05, + "loss": 0.8781, + "step": 5656 + }, + { + "epoch": 0.9235541406473206, + "grad_norm": 2.038479804992676, + "learning_rate": 1.9891001305534656e-05, + "loss": 0.8222, + "step": 5657 + }, + { + "epoch": 0.9237173992898249, + "grad_norm": 1.6146109104156494, + "learning_rate": 1.9890954575087708e-05, + "loss": 0.6848, + "step": 5658 + }, + { + "epoch": 0.9238806579323293, + "grad_norm": 1.6325279474258423, + "learning_rate": 1.989090783468057e-05, + "loss": 0.7504, + "step": 5659 + }, + { + "epoch": 0.9240439165748336, + "grad_norm": 1.873058795928955, + "learning_rate": 1.9890861084313293e-05, + "loss": 0.7952, + "step": 5660 + }, + { + "epoch": 0.9242071752173381, + "grad_norm": 2.0763649940490723, + "learning_rate": 1.989081432398592e-05, + "loss": 0.9199, + "step": 5661 + }, + { + "epoch": 0.9243704338598424, + "grad_norm": 1.7875324487686157, + "learning_rate": 1.98907675536985e-05, + "loss": 0.7985, + "step": 5662 + }, + { + "epoch": 0.9245336925023468, + "grad_norm": 1.8535127639770508, + "learning_rate": 1.989072077345108e-05, + "loss": 0.8475, + "step": 5663 + }, + { + "epoch": 0.9246969511448512, + "grad_norm": 1.922010898590088, + "learning_rate": 1.9890673983243708e-05, + "loss": 0.7158, + "step": 5664 + }, + { + "epoch": 0.9248602097873556, + "grad_norm": 1.8820998668670654, + "learning_rate": 1.9890627183076427e-05, + "loss": 0.784, + "step": 5665 + }, + { + "epoch": 0.92502346842986, + "grad_norm": 1.6809495687484741, + "learning_rate": 1.989058037294929e-05, + "loss": 0.6742, + "step": 5666 + }, + { + "epoch": 0.9251867270723644, + "grad_norm": 2.1938772201538086, + "learning_rate": 1.9890533552862337e-05, + "loss": 1.6547, + "step": 5667 + }, + { + "epoch": 0.9253499857148688, + "grad_norm": 1.977703332901001, + "learning_rate": 1.9890486722815624e-05, + "loss": 0.9057, + "step": 5668 + }, + { + "epoch": 0.9255132443573731, + "grad_norm": 1.9104596376419067, + "learning_rate": 1.989043988280919e-05, + "loss": 0.832, + "step": 5669 + }, + { + "epoch": 0.9256765029998776, + "grad_norm": 1.937463641166687, + "learning_rate": 1.989039303284309e-05, + "loss": 0.7084, + "step": 5670 + }, + { + "epoch": 0.9258397616423819, + "grad_norm": 1.9445589780807495, + "learning_rate": 1.9890346172917362e-05, + "loss": 0.812, + "step": 5671 + }, + { + "epoch": 0.9260030202848863, + "grad_norm": 1.9386945962905884, + "learning_rate": 1.989029930303206e-05, + "loss": 0.8995, + "step": 5672 + }, + { + "epoch": 0.9261662789273907, + "grad_norm": 2.154684066772461, + "learning_rate": 1.989025242318723e-05, + "loss": 0.9429, + "step": 5673 + }, + { + "epoch": 0.9263295375698951, + "grad_norm": 1.7646502256393433, + "learning_rate": 1.9890205533382917e-05, + "loss": 0.8514, + "step": 5674 + }, + { + "epoch": 0.9264927962123994, + "grad_norm": 1.8757656812667847, + "learning_rate": 1.989015863361917e-05, + "loss": 0.8768, + "step": 5675 + }, + { + "epoch": 0.9266560548549039, + "grad_norm": 1.8330364227294922, + "learning_rate": 1.989011172389604e-05, + "loss": 0.7772, + "step": 5676 + }, + { + "epoch": 0.9268193134974083, + "grad_norm": 1.7209479808807373, + "learning_rate": 1.989006480421356e-05, + "loss": 0.7909, + "step": 5677 + }, + { + "epoch": 0.9269825721399126, + "grad_norm": 1.7864714860916138, + "learning_rate": 1.9890017874571795e-05, + "loss": 0.7267, + "step": 5678 + }, + { + "epoch": 0.9271458307824171, + "grad_norm": 2.228712797164917, + "learning_rate": 1.9889970934970785e-05, + "loss": 0.7427, + "step": 5679 + }, + { + "epoch": 0.9273090894249214, + "grad_norm": 1.5044628381729126, + "learning_rate": 1.9889923985410576e-05, + "loss": 0.6222, + "step": 5680 + }, + { + "epoch": 0.9274723480674258, + "grad_norm": 1.8296196460723877, + "learning_rate": 1.9889877025891217e-05, + "loss": 0.7642, + "step": 5681 + }, + { + "epoch": 0.9276356067099302, + "grad_norm": 1.8356233835220337, + "learning_rate": 1.988983005641275e-05, + "loss": 0.8395, + "step": 5682 + }, + { + "epoch": 0.9277988653524346, + "grad_norm": 1.589242696762085, + "learning_rate": 1.988978307697523e-05, + "loss": 0.7172, + "step": 5683 + }, + { + "epoch": 0.9279621239949389, + "grad_norm": 1.811841607093811, + "learning_rate": 1.9889736087578703e-05, + "loss": 0.7679, + "step": 5684 + }, + { + "epoch": 0.9281253826374434, + "grad_norm": 1.8375786542892456, + "learning_rate": 1.9889689088223208e-05, + "loss": 0.8319, + "step": 5685 + }, + { + "epoch": 0.9282886412799478, + "grad_norm": 1.5653473138809204, + "learning_rate": 1.9889642078908805e-05, + "loss": 0.6628, + "step": 5686 + }, + { + "epoch": 0.9284518999224521, + "grad_norm": 1.9220969676971436, + "learning_rate": 1.988959505963553e-05, + "loss": 0.8849, + "step": 5687 + }, + { + "epoch": 0.9286151585649566, + "grad_norm": 1.7848671674728394, + "learning_rate": 1.988954803040344e-05, + "loss": 0.8502, + "step": 5688 + }, + { + "epoch": 0.9287784172074609, + "grad_norm": 1.5070096254348755, + "learning_rate": 1.9889500991212575e-05, + "loss": 0.6433, + "step": 5689 + }, + { + "epoch": 0.9289416758499653, + "grad_norm": 1.5660334825515747, + "learning_rate": 1.9889453942062988e-05, + "loss": 0.6326, + "step": 5690 + }, + { + "epoch": 0.9291049344924697, + "grad_norm": 1.6906564235687256, + "learning_rate": 1.988940688295472e-05, + "loss": 0.7252, + "step": 5691 + }, + { + "epoch": 0.9292681931349741, + "grad_norm": 1.730214238166809, + "learning_rate": 1.9889359813887824e-05, + "loss": 0.8148, + "step": 5692 + }, + { + "epoch": 0.9294314517774784, + "grad_norm": 1.95001220703125, + "learning_rate": 1.9889312734862345e-05, + "loss": 0.7926, + "step": 5693 + }, + { + "epoch": 0.9295947104199829, + "grad_norm": 1.7198446989059448, + "learning_rate": 1.988926564587833e-05, + "loss": 0.7987, + "step": 5694 + }, + { + "epoch": 0.9297579690624872, + "grad_norm": 1.6887645721435547, + "learning_rate": 1.9889218546935827e-05, + "loss": 0.7883, + "step": 5695 + }, + { + "epoch": 0.9299212277049916, + "grad_norm": 1.9154633283615112, + "learning_rate": 1.9889171438034886e-05, + "loss": 0.819, + "step": 5696 + }, + { + "epoch": 0.9300844863474961, + "grad_norm": 1.5947593450546265, + "learning_rate": 1.9889124319175548e-05, + "loss": 0.6547, + "step": 5697 + }, + { + "epoch": 0.9302477449900004, + "grad_norm": 1.7725056409835815, + "learning_rate": 1.9889077190357868e-05, + "loss": 0.7414, + "step": 5698 + }, + { + "epoch": 0.9304110036325048, + "grad_norm": 2.019804000854492, + "learning_rate": 1.9889030051581888e-05, + "loss": 0.5944, + "step": 5699 + }, + { + "epoch": 0.9305742622750092, + "grad_norm": 1.9094572067260742, + "learning_rate": 1.9888982902847658e-05, + "loss": 0.8468, + "step": 5700 + }, + { + "epoch": 0.9307375209175136, + "grad_norm": 1.995947003364563, + "learning_rate": 1.9888935744155223e-05, + "loss": 0.8157, + "step": 5701 + }, + { + "epoch": 0.9309007795600179, + "grad_norm": 1.7261426448822021, + "learning_rate": 1.9888888575504636e-05, + "loss": 0.7469, + "step": 5702 + }, + { + "epoch": 0.9310640382025224, + "grad_norm": 2.261383056640625, + "learning_rate": 1.988884139689594e-05, + "loss": 0.7302, + "step": 5703 + }, + { + "epoch": 0.9312272968450267, + "grad_norm": 1.7354803085327148, + "learning_rate": 1.9888794208329182e-05, + "loss": 0.6641, + "step": 5704 + }, + { + "epoch": 0.9313905554875311, + "grad_norm": 1.998819351196289, + "learning_rate": 1.988874700980441e-05, + "loss": 0.8802, + "step": 5705 + }, + { + "epoch": 0.9315538141300355, + "grad_norm": 1.815419316291809, + "learning_rate": 1.9888699801321675e-05, + "loss": 0.6646, + "step": 5706 + }, + { + "epoch": 0.9317170727725399, + "grad_norm": 2.088097095489502, + "learning_rate": 1.9888652582881017e-05, + "loss": 0.8867, + "step": 5707 + }, + { + "epoch": 0.9318803314150443, + "grad_norm": 1.8003586530685425, + "learning_rate": 1.9888605354482494e-05, + "loss": 0.7524, + "step": 5708 + }, + { + "epoch": 0.9320435900575487, + "grad_norm": 1.787760853767395, + "learning_rate": 1.9888558116126143e-05, + "loss": 0.7341, + "step": 5709 + }, + { + "epoch": 0.9322068487000531, + "grad_norm": 1.8817999362945557, + "learning_rate": 1.9888510867812022e-05, + "loss": 0.6457, + "step": 5710 + }, + { + "epoch": 0.9323701073425574, + "grad_norm": 1.9737091064453125, + "learning_rate": 1.988846360954017e-05, + "loss": 0.7561, + "step": 5711 + }, + { + "epoch": 0.9325333659850619, + "grad_norm": 2.175656795501709, + "learning_rate": 1.9888416341310637e-05, + "loss": 0.8052, + "step": 5712 + }, + { + "epoch": 0.9326966246275662, + "grad_norm": 1.705418586730957, + "learning_rate": 1.9888369063123473e-05, + "loss": 0.6683, + "step": 5713 + }, + { + "epoch": 0.9328598832700706, + "grad_norm": 1.9361976385116577, + "learning_rate": 1.9888321774978726e-05, + "loss": 0.7575, + "step": 5714 + }, + { + "epoch": 0.933023141912575, + "grad_norm": 1.9550631046295166, + "learning_rate": 1.988827447687644e-05, + "loss": 0.6993, + "step": 5715 + }, + { + "epoch": 0.9331864005550794, + "grad_norm": 1.8096879720687866, + "learning_rate": 1.988822716881666e-05, + "loss": 0.7994, + "step": 5716 + }, + { + "epoch": 0.9333496591975837, + "grad_norm": 1.5592663288116455, + "learning_rate": 1.988817985079944e-05, + "loss": 0.5859, + "step": 5717 + }, + { + "epoch": 0.9335129178400882, + "grad_norm": 2.107410192489624, + "learning_rate": 1.988813252282483e-05, + "loss": 0.8668, + "step": 5718 + }, + { + "epoch": 0.9336761764825926, + "grad_norm": 1.9437669515609741, + "learning_rate": 1.9888085184892868e-05, + "loss": 0.7966, + "step": 5719 + }, + { + "epoch": 0.9338394351250969, + "grad_norm": 1.9885010719299316, + "learning_rate": 1.988803783700361e-05, + "loss": 0.8751, + "step": 5720 + }, + { + "epoch": 0.9340026937676014, + "grad_norm": 2.0310909748077393, + "learning_rate": 1.9887990479157098e-05, + "loss": 0.8305, + "step": 5721 + }, + { + "epoch": 0.9341659524101057, + "grad_norm": 1.7244991064071655, + "learning_rate": 1.9887943111353385e-05, + "loss": 0.7848, + "step": 5722 + }, + { + "epoch": 0.9343292110526101, + "grad_norm": 1.6967183351516724, + "learning_rate": 1.9887895733592514e-05, + "loss": 0.7127, + "step": 5723 + }, + { + "epoch": 0.9344924696951145, + "grad_norm": 1.746645450592041, + "learning_rate": 1.9887848345874538e-05, + "loss": 0.7447, + "step": 5724 + }, + { + "epoch": 0.9346557283376189, + "grad_norm": 1.96803879737854, + "learning_rate": 1.9887800948199496e-05, + "loss": 0.7861, + "step": 5725 + }, + { + "epoch": 0.9348189869801232, + "grad_norm": 1.7303558588027954, + "learning_rate": 1.9887753540567446e-05, + "loss": 0.732, + "step": 5726 + }, + { + "epoch": 0.9349822456226277, + "grad_norm": 1.9768098592758179, + "learning_rate": 1.9887706122978426e-05, + "loss": 0.7366, + "step": 5727 + }, + { + "epoch": 0.935145504265132, + "grad_norm": 1.7681853771209717, + "learning_rate": 1.988765869543249e-05, + "loss": 0.6297, + "step": 5728 + }, + { + "epoch": 0.9353087629076364, + "grad_norm": 1.7637391090393066, + "learning_rate": 1.9887611257929687e-05, + "loss": 0.7391, + "step": 5729 + }, + { + "epoch": 0.9354720215501409, + "grad_norm": 1.6085448265075684, + "learning_rate": 1.988756381047006e-05, + "loss": 0.655, + "step": 5730 + }, + { + "epoch": 0.9356352801926452, + "grad_norm": 1.8517422676086426, + "learning_rate": 1.988751635305366e-05, + "loss": 0.8333, + "step": 5731 + }, + { + "epoch": 0.9357985388351496, + "grad_norm": 1.7325329780578613, + "learning_rate": 1.988746888568053e-05, + "loss": 0.732, + "step": 5732 + }, + { + "epoch": 0.935961797477654, + "grad_norm": 1.9422686100006104, + "learning_rate": 1.9887421408350728e-05, + "loss": 0.9066, + "step": 5733 + }, + { + "epoch": 0.9361250561201584, + "grad_norm": 1.6214052438735962, + "learning_rate": 1.988737392106429e-05, + "loss": 0.8074, + "step": 5734 + }, + { + "epoch": 0.9362883147626627, + "grad_norm": 1.999585747718811, + "learning_rate": 1.988732642382127e-05, + "loss": 1.0196, + "step": 5735 + }, + { + "epoch": 0.9364515734051672, + "grad_norm": 1.8567827939987183, + "learning_rate": 1.9887278916621717e-05, + "loss": 0.8236, + "step": 5736 + }, + { + "epoch": 0.9366148320476715, + "grad_norm": 2.0515315532684326, + "learning_rate": 1.9887231399465678e-05, + "loss": 0.8267, + "step": 5737 + }, + { + "epoch": 0.9367780906901759, + "grad_norm": 1.78919517993927, + "learning_rate": 1.9887183872353197e-05, + "loss": 0.7893, + "step": 5738 + }, + { + "epoch": 0.9369413493326803, + "grad_norm": 1.8790026903152466, + "learning_rate": 1.988713633528432e-05, + "loss": 0.9207, + "step": 5739 + }, + { + "epoch": 0.9371046079751847, + "grad_norm": 2.102926254272461, + "learning_rate": 1.9887088788259105e-05, + "loss": 0.7826, + "step": 5740 + }, + { + "epoch": 0.9372678666176891, + "grad_norm": 1.7118357419967651, + "learning_rate": 1.9887041231277593e-05, + "loss": 0.6542, + "step": 5741 + }, + { + "epoch": 0.9374311252601935, + "grad_norm": 1.5889919996261597, + "learning_rate": 1.988699366433983e-05, + "loss": 0.6889, + "step": 5742 + }, + { + "epoch": 0.9375943839026979, + "grad_norm": 1.6478652954101562, + "learning_rate": 1.9886946087445872e-05, + "loss": 0.7139, + "step": 5743 + }, + { + "epoch": 0.9377576425452022, + "grad_norm": 1.866075038909912, + "learning_rate": 1.9886898500595763e-05, + "loss": 0.7629, + "step": 5744 + }, + { + "epoch": 0.9379209011877067, + "grad_norm": 2.1026644706726074, + "learning_rate": 1.9886850903789546e-05, + "loss": 0.8349, + "step": 5745 + }, + { + "epoch": 0.938084159830211, + "grad_norm": 1.798527479171753, + "learning_rate": 1.988680329702727e-05, + "loss": 0.7032, + "step": 5746 + }, + { + "epoch": 0.9382474184727154, + "grad_norm": 1.8757797479629517, + "learning_rate": 1.988675568030899e-05, + "loss": 0.8275, + "step": 5747 + }, + { + "epoch": 0.9384106771152197, + "grad_norm": 1.7428367137908936, + "learning_rate": 1.9886708053634752e-05, + "loss": 0.7727, + "step": 5748 + }, + { + "epoch": 0.9385739357577242, + "grad_norm": 1.7845141887664795, + "learning_rate": 1.9886660417004594e-05, + "loss": 0.7315, + "step": 5749 + }, + { + "epoch": 0.9387371944002285, + "grad_norm": 2.002915620803833, + "learning_rate": 1.988661277041858e-05, + "loss": 0.8609, + "step": 5750 + }, + { + "epoch": 0.938900453042733, + "grad_norm": 1.6306358575820923, + "learning_rate": 1.9886565113876744e-05, + "loss": 0.6831, + "step": 5751 + }, + { + "epoch": 0.9390637116852374, + "grad_norm": 1.8465707302093506, + "learning_rate": 1.988651744737914e-05, + "loss": 0.8808, + "step": 5752 + }, + { + "epoch": 0.9392269703277417, + "grad_norm": 1.8070030212402344, + "learning_rate": 1.988646977092582e-05, + "loss": 0.587, + "step": 5753 + }, + { + "epoch": 0.9393902289702462, + "grad_norm": 2.4189624786376953, + "learning_rate": 1.9886422084516822e-05, + "loss": 0.8934, + "step": 5754 + }, + { + "epoch": 0.9395534876127505, + "grad_norm": 1.87638258934021, + "learning_rate": 1.9886374388152203e-05, + "loss": 0.7656, + "step": 5755 + }, + { + "epoch": 0.9397167462552549, + "grad_norm": 1.7015703916549683, + "learning_rate": 1.9886326681832006e-05, + "loss": 0.7307, + "step": 5756 + }, + { + "epoch": 0.9398800048977592, + "grad_norm": 2.803229331970215, + "learning_rate": 1.988627896555628e-05, + "loss": 0.8285, + "step": 5757 + }, + { + "epoch": 0.9400432635402637, + "grad_norm": 2.406782627105713, + "learning_rate": 1.9886231239325074e-05, + "loss": 0.8408, + "step": 5758 + }, + { + "epoch": 0.940206522182768, + "grad_norm": 1.8035948276519775, + "learning_rate": 1.9886183503138438e-05, + "loss": 0.7883, + "step": 5759 + }, + { + "epoch": 0.9403697808252724, + "grad_norm": 1.8613662719726562, + "learning_rate": 1.988613575699642e-05, + "loss": 0.9293, + "step": 5760 + }, + { + "epoch": 0.9405330394677768, + "grad_norm": 1.7085455656051636, + "learning_rate": 1.988608800089906e-05, + "loss": 0.7528, + "step": 5761 + }, + { + "epoch": 0.9406962981102812, + "grad_norm": 1.8669358491897583, + "learning_rate": 1.9886040234846415e-05, + "loss": 0.7261, + "step": 5762 + }, + { + "epoch": 0.9408595567527857, + "grad_norm": 2.2036235332489014, + "learning_rate": 1.9885992458838527e-05, + "loss": 0.9291, + "step": 5763 + }, + { + "epoch": 0.94102281539529, + "grad_norm": 1.8173681497573853, + "learning_rate": 1.988594467287545e-05, + "loss": 0.9374, + "step": 5764 + }, + { + "epoch": 0.9411860740377944, + "grad_norm": 1.7223553657531738, + "learning_rate": 1.988589687695723e-05, + "loss": 0.9217, + "step": 5765 + }, + { + "epoch": 0.9413493326802987, + "grad_norm": 1.607495903968811, + "learning_rate": 1.9885849071083912e-05, + "loss": 0.7183, + "step": 5766 + }, + { + "epoch": 0.9415125913228032, + "grad_norm": 1.7953224182128906, + "learning_rate": 1.9885801255255552e-05, + "loss": 0.7187, + "step": 5767 + }, + { + "epoch": 0.9416758499653075, + "grad_norm": 1.5894697904586792, + "learning_rate": 1.988575342947219e-05, + "loss": 0.6975, + "step": 5768 + }, + { + "epoch": 0.941839108607812, + "grad_norm": 2.0527658462524414, + "learning_rate": 1.9885705593733872e-05, + "loss": 0.7373, + "step": 5769 + }, + { + "epoch": 0.9420023672503163, + "grad_norm": 1.8515516519546509, + "learning_rate": 1.9885657748040655e-05, + "loss": 0.6878, + "step": 5770 + }, + { + "epoch": 0.9421656258928207, + "grad_norm": 1.92622971534729, + "learning_rate": 1.9885609892392584e-05, + "loss": 0.8359, + "step": 5771 + }, + { + "epoch": 0.942328884535325, + "grad_norm": 1.627686858177185, + "learning_rate": 1.9885562026789705e-05, + "loss": 0.6738, + "step": 5772 + }, + { + "epoch": 0.9424921431778295, + "grad_norm": 1.826431155204773, + "learning_rate": 1.9885514151232067e-05, + "loss": 0.7694, + "step": 5773 + }, + { + "epoch": 0.9426554018203339, + "grad_norm": 2.256471633911133, + "learning_rate": 1.9885466265719723e-05, + "loss": 0.8479, + "step": 5774 + }, + { + "epoch": 0.9428186604628382, + "grad_norm": 1.654866337776184, + "learning_rate": 1.9885418370252715e-05, + "loss": 0.6293, + "step": 5775 + }, + { + "epoch": 0.9429819191053427, + "grad_norm": 1.6537717580795288, + "learning_rate": 1.988537046483109e-05, + "loss": 0.7651, + "step": 5776 + }, + { + "epoch": 0.943145177747847, + "grad_norm": 1.6033003330230713, + "learning_rate": 1.9885322549454905e-05, + "loss": 0.9255, + "step": 5777 + }, + { + "epoch": 0.9433084363903514, + "grad_norm": 1.7734872102737427, + "learning_rate": 1.98852746241242e-05, + "loss": 0.7852, + "step": 5778 + }, + { + "epoch": 0.9434716950328558, + "grad_norm": 1.9155776500701904, + "learning_rate": 1.9885226688839023e-05, + "loss": 1.1201, + "step": 5779 + }, + { + "epoch": 0.9436349536753602, + "grad_norm": 1.5816771984100342, + "learning_rate": 1.988517874359943e-05, + "loss": 0.5468, + "step": 5780 + }, + { + "epoch": 0.9437982123178645, + "grad_norm": 2.1904280185699463, + "learning_rate": 1.9885130788405463e-05, + "loss": 0.7584, + "step": 5781 + }, + { + "epoch": 0.943961470960369, + "grad_norm": 2.0713632106781006, + "learning_rate": 1.988508282325717e-05, + "loss": 0.7614, + "step": 5782 + }, + { + "epoch": 0.9441247296028733, + "grad_norm": 2.3178539276123047, + "learning_rate": 1.9885034848154605e-05, + "loss": 0.9069, + "step": 5783 + }, + { + "epoch": 0.9442879882453777, + "grad_norm": 2.1415598392486572, + "learning_rate": 1.988498686309781e-05, + "loss": 0.7871, + "step": 5784 + }, + { + "epoch": 0.9444512468878822, + "grad_norm": 1.6048065423965454, + "learning_rate": 1.9884938868086836e-05, + "loss": 0.709, + "step": 5785 + }, + { + "epoch": 0.9446145055303865, + "grad_norm": 1.8477904796600342, + "learning_rate": 1.9884890863121734e-05, + "loss": 0.8037, + "step": 5786 + }, + { + "epoch": 0.9447777641728909, + "grad_norm": 2.049612045288086, + "learning_rate": 1.9884842848202545e-05, + "loss": 0.9281, + "step": 5787 + }, + { + "epoch": 0.9449410228153953, + "grad_norm": 1.8600860834121704, + "learning_rate": 1.9884794823329327e-05, + "loss": 0.7258, + "step": 5788 + }, + { + "epoch": 0.9451042814578997, + "grad_norm": 1.9998430013656616, + "learning_rate": 1.988474678850212e-05, + "loss": 0.8283, + "step": 5789 + }, + { + "epoch": 0.945267540100404, + "grad_norm": 1.780461072921753, + "learning_rate": 1.9884698743720973e-05, + "loss": 0.6849, + "step": 5790 + }, + { + "epoch": 0.9454307987429085, + "grad_norm": 1.8567193746566772, + "learning_rate": 1.9884650688985943e-05, + "loss": 0.6133, + "step": 5791 + }, + { + "epoch": 0.9455940573854128, + "grad_norm": 2.219817876815796, + "learning_rate": 1.988460262429707e-05, + "loss": 0.7821, + "step": 5792 + }, + { + "epoch": 0.9457573160279172, + "grad_norm": 1.7746769189834595, + "learning_rate": 1.98845545496544e-05, + "loss": 0.826, + "step": 5793 + }, + { + "epoch": 0.9459205746704216, + "grad_norm": 1.9887628555297852, + "learning_rate": 1.988450646505799e-05, + "loss": 0.7882, + "step": 5794 + }, + { + "epoch": 0.946083833312926, + "grad_norm": 1.7247023582458496, + "learning_rate": 1.9884458370507886e-05, + "loss": 0.7313, + "step": 5795 + }, + { + "epoch": 0.9462470919554304, + "grad_norm": 2.068726062774658, + "learning_rate": 1.9884410266004134e-05, + "loss": 0.7834, + "step": 5796 + }, + { + "epoch": 0.9464103505979348, + "grad_norm": 2.4623751640319824, + "learning_rate": 1.9884362151546783e-05, + "loss": 0.8833, + "step": 5797 + }, + { + "epoch": 0.9465736092404392, + "grad_norm": 2.131727457046509, + "learning_rate": 1.9884314027135883e-05, + "loss": 0.8595, + "step": 5798 + }, + { + "epoch": 0.9467368678829435, + "grad_norm": 2.0381524562835693, + "learning_rate": 1.9884265892771483e-05, + "loss": 1.3767, + "step": 5799 + }, + { + "epoch": 0.946900126525448, + "grad_norm": 1.7590593099594116, + "learning_rate": 1.9884217748453625e-05, + "loss": 0.7559, + "step": 5800 + }, + { + "epoch": 0.9470633851679523, + "grad_norm": 1.6981379985809326, + "learning_rate": 1.9884169594182364e-05, + "loss": 0.7696, + "step": 5801 + }, + { + "epoch": 0.9472266438104567, + "grad_norm": 1.8272701501846313, + "learning_rate": 1.988412142995775e-05, + "loss": 0.7707, + "step": 5802 + }, + { + "epoch": 0.9473899024529611, + "grad_norm": 1.9308500289916992, + "learning_rate": 1.9884073255779824e-05, + "loss": 0.7492, + "step": 5803 + }, + { + "epoch": 0.9475531610954655, + "grad_norm": 1.7385494709014893, + "learning_rate": 1.9884025071648643e-05, + "loss": 0.7122, + "step": 5804 + }, + { + "epoch": 0.9477164197379698, + "grad_norm": 1.829702615737915, + "learning_rate": 1.988397687756425e-05, + "loss": 0.7234, + "step": 5805 + }, + { + "epoch": 0.9478796783804743, + "grad_norm": 1.8464897871017456, + "learning_rate": 1.9883928673526692e-05, + "loss": 0.7385, + "step": 5806 + }, + { + "epoch": 0.9480429370229787, + "grad_norm": 1.8805749416351318, + "learning_rate": 1.9883880459536024e-05, + "loss": 0.8518, + "step": 5807 + }, + { + "epoch": 0.948206195665483, + "grad_norm": 2.240586519241333, + "learning_rate": 1.988383223559229e-05, + "loss": 0.8711, + "step": 5808 + }, + { + "epoch": 0.9483694543079875, + "grad_norm": 2.2569186687469482, + "learning_rate": 1.988378400169554e-05, + "loss": 0.7033, + "step": 5809 + }, + { + "epoch": 0.9485327129504918, + "grad_norm": 1.9544212818145752, + "learning_rate": 1.9883735757845822e-05, + "loss": 0.8219, + "step": 5810 + }, + { + "epoch": 0.9486959715929962, + "grad_norm": 1.8035211563110352, + "learning_rate": 1.9883687504043183e-05, + "loss": 0.6754, + "step": 5811 + }, + { + "epoch": 0.9488592302355006, + "grad_norm": 1.9403053522109985, + "learning_rate": 1.9883639240287676e-05, + "loss": 0.7626, + "step": 5812 + }, + { + "epoch": 0.949022488878005, + "grad_norm": 1.8226954936981201, + "learning_rate": 1.9883590966579342e-05, + "loss": 0.7701, + "step": 5813 + }, + { + "epoch": 0.9491857475205093, + "grad_norm": 1.732888102531433, + "learning_rate": 1.988354268291824e-05, + "loss": 0.7134, + "step": 5814 + }, + { + "epoch": 0.9493490061630138, + "grad_norm": 1.760650873184204, + "learning_rate": 1.988349438930441e-05, + "loss": 0.8481, + "step": 5815 + }, + { + "epoch": 0.9495122648055181, + "grad_norm": 1.6131792068481445, + "learning_rate": 1.9883446085737904e-05, + "loss": 0.6023, + "step": 5816 + }, + { + "epoch": 0.9496755234480225, + "grad_norm": 1.8252925872802734, + "learning_rate": 1.988339777221877e-05, + "loss": 0.6659, + "step": 5817 + }, + { + "epoch": 0.949838782090527, + "grad_norm": 1.9540563821792603, + "learning_rate": 1.988334944874706e-05, + "loss": 0.7254, + "step": 5818 + }, + { + "epoch": 0.9500020407330313, + "grad_norm": 2.14513897895813, + "learning_rate": 1.9883301115322817e-05, + "loss": 0.78, + "step": 5819 + }, + { + "epoch": 0.9501652993755357, + "grad_norm": 1.8827747106552124, + "learning_rate": 1.9883252771946094e-05, + "loss": 0.6161, + "step": 5820 + }, + { + "epoch": 0.9503285580180401, + "grad_norm": 2.076988935470581, + "learning_rate": 1.988320441861694e-05, + "loss": 0.8529, + "step": 5821 + }, + { + "epoch": 0.9504918166605445, + "grad_norm": 2.323436975479126, + "learning_rate": 1.9883156055335398e-05, + "loss": 0.8945, + "step": 5822 + }, + { + "epoch": 0.9506550753030488, + "grad_norm": 2.1867728233337402, + "learning_rate": 1.9883107682101523e-05, + "loss": 0.8147, + "step": 5823 + }, + { + "epoch": 0.9508183339455533, + "grad_norm": 2.0491859912872314, + "learning_rate": 1.988305929891536e-05, + "loss": 0.8346, + "step": 5824 + }, + { + "epoch": 0.9509815925880576, + "grad_norm": 1.8013696670532227, + "learning_rate": 1.9883010905776955e-05, + "loss": 0.8056, + "step": 5825 + }, + { + "epoch": 0.951144851230562, + "grad_norm": 1.7445491552352905, + "learning_rate": 1.988296250268637e-05, + "loss": 0.7998, + "step": 5826 + }, + { + "epoch": 0.9513081098730664, + "grad_norm": 1.6402561664581299, + "learning_rate": 1.9882914089643635e-05, + "loss": 0.7034, + "step": 5827 + }, + { + "epoch": 0.9514713685155708, + "grad_norm": 2.0749800205230713, + "learning_rate": 1.9882865666648814e-05, + "loss": 0.8125, + "step": 5828 + }, + { + "epoch": 0.9516346271580752, + "grad_norm": 1.8249925374984741, + "learning_rate": 1.988281723370195e-05, + "loss": 0.7257, + "step": 5829 + }, + { + "epoch": 0.9517978858005796, + "grad_norm": 2.4648420810699463, + "learning_rate": 1.9882768790803086e-05, + "loss": 1.1782, + "step": 5830 + }, + { + "epoch": 0.951961144443084, + "grad_norm": 1.930014967918396, + "learning_rate": 1.9882720337952278e-05, + "loss": 0.7643, + "step": 5831 + }, + { + "epoch": 0.9521244030855883, + "grad_norm": 1.962272047996521, + "learning_rate": 1.988267187514958e-05, + "loss": 0.7616, + "step": 5832 + }, + { + "epoch": 0.9522876617280928, + "grad_norm": 1.3929247856140137, + "learning_rate": 1.9882623402395027e-05, + "loss": 0.5807, + "step": 5833 + }, + { + "epoch": 0.9524509203705971, + "grad_norm": 1.6981593370437622, + "learning_rate": 1.9882574919688676e-05, + "loss": 0.717, + "step": 5834 + }, + { + "epoch": 0.9526141790131015, + "grad_norm": 1.6585187911987305, + "learning_rate": 1.988252642703058e-05, + "loss": 0.8325, + "step": 5835 + }, + { + "epoch": 0.9527774376556059, + "grad_norm": 1.8912004232406616, + "learning_rate": 1.9882477924420773e-05, + "loss": 0.7601, + "step": 5836 + }, + { + "epoch": 0.9529406962981103, + "grad_norm": 1.9597433805465698, + "learning_rate": 1.9882429411859322e-05, + "loss": 0.7029, + "step": 5837 + }, + { + "epoch": 0.9531039549406146, + "grad_norm": 1.9821723699569702, + "learning_rate": 1.988238088934626e-05, + "loss": 0.8478, + "step": 5838 + }, + { + "epoch": 0.953267213583119, + "grad_norm": 2.066716432571411, + "learning_rate": 1.9882332356881647e-05, + "loss": 0.7375, + "step": 5839 + }, + { + "epoch": 0.9534304722256235, + "grad_norm": 1.8429127931594849, + "learning_rate": 1.988228381446553e-05, + "loss": 0.7104, + "step": 5840 + }, + { + "epoch": 0.9535937308681278, + "grad_norm": 1.911085605621338, + "learning_rate": 1.9882235262097954e-05, + "loss": 0.8169, + "step": 5841 + }, + { + "epoch": 0.9537569895106323, + "grad_norm": 1.6619443893432617, + "learning_rate": 1.988218669977897e-05, + "loss": 0.8408, + "step": 5842 + }, + { + "epoch": 0.9539202481531366, + "grad_norm": 1.5614078044891357, + "learning_rate": 1.9882138127508624e-05, + "loss": 0.673, + "step": 5843 + }, + { + "epoch": 0.954083506795641, + "grad_norm": 1.6054402589797974, + "learning_rate": 1.9882089545286967e-05, + "loss": 0.6165, + "step": 5844 + }, + { + "epoch": 0.9542467654381454, + "grad_norm": 2.0731489658355713, + "learning_rate": 1.9882040953114056e-05, + "loss": 0.9001, + "step": 5845 + }, + { + "epoch": 0.9544100240806498, + "grad_norm": 1.7190436124801636, + "learning_rate": 1.9881992350989927e-05, + "loss": 0.7477, + "step": 5846 + }, + { + "epoch": 0.9545732827231541, + "grad_norm": 2.2513763904571533, + "learning_rate": 1.9881943738914634e-05, + "loss": 0.6684, + "step": 5847 + }, + { + "epoch": 0.9547365413656586, + "grad_norm": 1.3994193077087402, + "learning_rate": 1.988189511688823e-05, + "loss": 0.6258, + "step": 5848 + }, + { + "epoch": 0.9548998000081629, + "grad_norm": 1.9851990938186646, + "learning_rate": 1.9881846484910752e-05, + "loss": 0.7173, + "step": 5849 + }, + { + "epoch": 0.9550630586506673, + "grad_norm": 1.8960609436035156, + "learning_rate": 1.9881797842982265e-05, + "loss": 1.039, + "step": 5850 + }, + { + "epoch": 0.9552263172931718, + "grad_norm": 1.6367026567459106, + "learning_rate": 1.9881749191102807e-05, + "loss": 0.6784, + "step": 5851 + }, + { + "epoch": 0.9553895759356761, + "grad_norm": 1.9023123979568481, + "learning_rate": 1.988170052927243e-05, + "loss": 0.6968, + "step": 5852 + }, + { + "epoch": 0.9555528345781805, + "grad_norm": 1.7912591695785522, + "learning_rate": 1.9881651857491184e-05, + "loss": 0.9238, + "step": 5853 + }, + { + "epoch": 0.9557160932206848, + "grad_norm": 1.9536640644073486, + "learning_rate": 1.9881603175759117e-05, + "loss": 0.8356, + "step": 5854 + }, + { + "epoch": 0.9558793518631893, + "grad_norm": 2.178215503692627, + "learning_rate": 1.988155448407628e-05, + "loss": 0.7426, + "step": 5855 + }, + { + "epoch": 0.9560426105056936, + "grad_norm": 2.2000627517700195, + "learning_rate": 1.9881505782442717e-05, + "loss": 0.6558, + "step": 5856 + }, + { + "epoch": 0.956205869148198, + "grad_norm": 1.9184342622756958, + "learning_rate": 1.9881457070858482e-05, + "loss": 0.8036, + "step": 5857 + }, + { + "epoch": 0.9563691277907024, + "grad_norm": 1.7472063302993774, + "learning_rate": 1.9881408349323622e-05, + "loss": 0.6824, + "step": 5858 + }, + { + "epoch": 0.9565323864332068, + "grad_norm": 1.871843695640564, + "learning_rate": 1.988135961783819e-05, + "loss": 0.8832, + "step": 5859 + }, + { + "epoch": 0.9566956450757111, + "grad_norm": 1.9221521615982056, + "learning_rate": 1.9881310876402225e-05, + "loss": 0.8213, + "step": 5860 + }, + { + "epoch": 0.9568589037182156, + "grad_norm": 2.058335065841675, + "learning_rate": 1.9881262125015786e-05, + "loss": 0.8761, + "step": 5861 + }, + { + "epoch": 0.95702216236072, + "grad_norm": 1.7215598821640015, + "learning_rate": 1.988121336367892e-05, + "loss": 0.6424, + "step": 5862 + }, + { + "epoch": 0.9571854210032243, + "grad_norm": 1.4935752153396606, + "learning_rate": 1.9881164592391672e-05, + "loss": 0.5509, + "step": 5863 + }, + { + "epoch": 0.9573486796457288, + "grad_norm": 1.992567777633667, + "learning_rate": 1.9881115811154098e-05, + "loss": 0.9443, + "step": 5864 + }, + { + "epoch": 0.9575119382882331, + "grad_norm": 1.592947244644165, + "learning_rate": 1.988106701996624e-05, + "loss": 0.6793, + "step": 5865 + }, + { + "epoch": 0.9576751969307375, + "grad_norm": 1.5262306928634644, + "learning_rate": 1.9881018218828147e-05, + "loss": 0.7548, + "step": 5866 + }, + { + "epoch": 0.9578384555732419, + "grad_norm": 1.7564045190811157, + "learning_rate": 1.9880969407739875e-05, + "loss": 0.6788, + "step": 5867 + }, + { + "epoch": 0.9580017142157463, + "grad_norm": 2.1623117923736572, + "learning_rate": 1.988092058670147e-05, + "loss": 1.0269, + "step": 5868 + }, + { + "epoch": 0.9581649728582506, + "grad_norm": 2.032778024673462, + "learning_rate": 1.988087175571298e-05, + "loss": 0.7619, + "step": 5869 + }, + { + "epoch": 0.9583282315007551, + "grad_norm": 1.6946464776992798, + "learning_rate": 1.9880822914774453e-05, + "loss": 0.6284, + "step": 5870 + }, + { + "epoch": 0.9584914901432594, + "grad_norm": 1.8071743249893188, + "learning_rate": 1.9880774063885942e-05, + "loss": 0.8879, + "step": 5871 + }, + { + "epoch": 0.9586547487857638, + "grad_norm": 1.8844603300094604, + "learning_rate": 1.988072520304749e-05, + "loss": 0.6178, + "step": 5872 + }, + { + "epoch": 0.9588180074282683, + "grad_norm": 2.064663887023926, + "learning_rate": 1.9880676332259155e-05, + "loss": 0.9803, + "step": 5873 + }, + { + "epoch": 0.9589812660707726, + "grad_norm": 1.6574041843414307, + "learning_rate": 1.9880627451520983e-05, + "loss": 0.6165, + "step": 5874 + }, + { + "epoch": 0.959144524713277, + "grad_norm": 1.7718545198440552, + "learning_rate": 1.9880578560833017e-05, + "loss": 0.7296, + "step": 5875 + }, + { + "epoch": 0.9593077833557814, + "grad_norm": 2.145753860473633, + "learning_rate": 1.9880529660195314e-05, + "loss": 0.7915, + "step": 5876 + }, + { + "epoch": 0.9594710419982858, + "grad_norm": 2.0439789295196533, + "learning_rate": 1.988048074960792e-05, + "loss": 0.7655, + "step": 5877 + }, + { + "epoch": 0.9596343006407901, + "grad_norm": 1.9380052089691162, + "learning_rate": 1.988043182907088e-05, + "loss": 0.8097, + "step": 5878 + }, + { + "epoch": 0.9597975592832946, + "grad_norm": 1.8143365383148193, + "learning_rate": 1.9880382898584254e-05, + "loss": 0.6877, + "step": 5879 + }, + { + "epoch": 0.9599608179257989, + "grad_norm": 1.7788065671920776, + "learning_rate": 1.9880333958148085e-05, + "loss": 0.8665, + "step": 5880 + }, + { + "epoch": 0.9601240765683033, + "grad_norm": 1.9629443883895874, + "learning_rate": 1.988028500776242e-05, + "loss": 0.7228, + "step": 5881 + }, + { + "epoch": 0.9602873352108077, + "grad_norm": 1.7812095880508423, + "learning_rate": 1.9880236047427308e-05, + "loss": 0.7608, + "step": 5882 + }, + { + "epoch": 0.9604505938533121, + "grad_norm": 1.9366093873977661, + "learning_rate": 1.98801870771428e-05, + "loss": 0.874, + "step": 5883 + }, + { + "epoch": 0.9606138524958165, + "grad_norm": 1.9091942310333252, + "learning_rate": 1.9880138096908955e-05, + "loss": 0.7725, + "step": 5884 + }, + { + "epoch": 0.9607771111383209, + "grad_norm": 1.7375255823135376, + "learning_rate": 1.9880089106725805e-05, + "loss": 0.7603, + "step": 5885 + }, + { + "epoch": 0.9609403697808253, + "grad_norm": 1.4243805408477783, + "learning_rate": 1.9880040106593413e-05, + "loss": 0.6183, + "step": 5886 + }, + { + "epoch": 0.9611036284233296, + "grad_norm": 1.7426848411560059, + "learning_rate": 1.987999109651182e-05, + "loss": 0.8244, + "step": 5887 + }, + { + "epoch": 0.9612668870658341, + "grad_norm": 1.4907567501068115, + "learning_rate": 1.9879942076481082e-05, + "loss": 0.6861, + "step": 5888 + }, + { + "epoch": 0.9614301457083384, + "grad_norm": 1.6635817289352417, + "learning_rate": 1.987989304650124e-05, + "loss": 0.7142, + "step": 5889 + }, + { + "epoch": 0.9615934043508428, + "grad_norm": 1.504945158958435, + "learning_rate": 1.987984400657235e-05, + "loss": 0.6635, + "step": 5890 + }, + { + "epoch": 0.9617566629933472, + "grad_norm": 1.8974223136901855, + "learning_rate": 1.9879794956694463e-05, + "loss": 0.9495, + "step": 5891 + }, + { + "epoch": 0.9619199216358516, + "grad_norm": 1.4257986545562744, + "learning_rate": 1.9879745896867624e-05, + "loss": 0.6576, + "step": 5892 + }, + { + "epoch": 0.9620831802783559, + "grad_norm": 1.6958147287368774, + "learning_rate": 1.9879696827091882e-05, + "loss": 0.8572, + "step": 5893 + }, + { + "epoch": 0.9622464389208604, + "grad_norm": 1.65938401222229, + "learning_rate": 1.987964774736729e-05, + "loss": 0.6801, + "step": 5894 + }, + { + "epoch": 0.9624096975633648, + "grad_norm": 1.9055495262145996, + "learning_rate": 1.9879598657693894e-05, + "loss": 0.8883, + "step": 5895 + }, + { + "epoch": 0.9625729562058691, + "grad_norm": 2.2240192890167236, + "learning_rate": 1.9879549558071742e-05, + "loss": 0.7431, + "step": 5896 + }, + { + "epoch": 0.9627362148483736, + "grad_norm": 1.4063907861709595, + "learning_rate": 1.987950044850089e-05, + "loss": 0.6701, + "step": 5897 + }, + { + "epoch": 0.9628994734908779, + "grad_norm": 1.8229234218597412, + "learning_rate": 1.9879451328981384e-05, + "loss": 0.8331, + "step": 5898 + }, + { + "epoch": 0.9630627321333823, + "grad_norm": 1.8376917839050293, + "learning_rate": 1.987940219951327e-05, + "loss": 0.7301, + "step": 5899 + }, + { + "epoch": 0.9632259907758867, + "grad_norm": 2.0361239910125732, + "learning_rate": 1.98793530600966e-05, + "loss": 0.8292, + "step": 5900 + }, + { + "epoch": 0.9633892494183911, + "grad_norm": 1.9441180229187012, + "learning_rate": 1.987930391073143e-05, + "loss": 0.7768, + "step": 5901 + }, + { + "epoch": 0.9635525080608954, + "grad_norm": 2.0011613368988037, + "learning_rate": 1.9879254751417797e-05, + "loss": 0.7991, + "step": 5902 + }, + { + "epoch": 0.9637157667033999, + "grad_norm": 1.787095546722412, + "learning_rate": 1.987920558215576e-05, + "loss": 0.8272, + "step": 5903 + }, + { + "epoch": 0.9638790253459042, + "grad_norm": 1.8058454990386963, + "learning_rate": 1.9879156402945368e-05, + "loss": 0.7298, + "step": 5904 + }, + { + "epoch": 0.9640422839884086, + "grad_norm": 2.158674716949463, + "learning_rate": 1.9879107213786667e-05, + "loss": 0.9092, + "step": 5905 + }, + { + "epoch": 0.9642055426309131, + "grad_norm": 1.6222621202468872, + "learning_rate": 1.9879058014679704e-05, + "loss": 0.6428, + "step": 5906 + }, + { + "epoch": 0.9643688012734174, + "grad_norm": 2.0902462005615234, + "learning_rate": 1.9879008805624535e-05, + "loss": 0.6753, + "step": 5907 + }, + { + "epoch": 0.9645320599159218, + "grad_norm": 1.9423555135726929, + "learning_rate": 1.9878959586621204e-05, + "loss": 0.8046, + "step": 5908 + }, + { + "epoch": 0.9646953185584262, + "grad_norm": 1.8502558469772339, + "learning_rate": 1.9878910357669766e-05, + "loss": 0.8262, + "step": 5909 + }, + { + "epoch": 0.9648585772009306, + "grad_norm": 2.2460103034973145, + "learning_rate": 1.987886111877027e-05, + "loss": 0.9176, + "step": 5910 + }, + { + "epoch": 0.9650218358434349, + "grad_norm": 1.8354326486587524, + "learning_rate": 1.987881186992276e-05, + "loss": 0.9197, + "step": 5911 + }, + { + "epoch": 0.9651850944859394, + "grad_norm": 1.7509719133377075, + "learning_rate": 1.9878762611127288e-05, + "loss": 0.8023, + "step": 5912 + }, + { + "epoch": 0.9653483531284437, + "grad_norm": 1.6554336547851562, + "learning_rate": 1.987871334238391e-05, + "loss": 0.7153, + "step": 5913 + }, + { + "epoch": 0.9655116117709481, + "grad_norm": 1.7806309461593628, + "learning_rate": 1.9878664063692664e-05, + "loss": 0.6852, + "step": 5914 + }, + { + "epoch": 0.9656748704134525, + "grad_norm": 1.6907514333724976, + "learning_rate": 1.987861477505361e-05, + "loss": 0.7683, + "step": 5915 + }, + { + "epoch": 0.9658381290559569, + "grad_norm": 2.20597243309021, + "learning_rate": 1.987856547646679e-05, + "loss": 0.8144, + "step": 5916 + }, + { + "epoch": 0.9660013876984613, + "grad_norm": 1.771627426147461, + "learning_rate": 1.987851616793226e-05, + "loss": 0.8175, + "step": 5917 + }, + { + "epoch": 0.9661646463409657, + "grad_norm": 1.6341692209243774, + "learning_rate": 1.9878466849450067e-05, + "loss": 0.694, + "step": 5918 + }, + { + "epoch": 0.9663279049834701, + "grad_norm": 1.7307456731796265, + "learning_rate": 1.987841752102026e-05, + "loss": 0.6035, + "step": 5919 + }, + { + "epoch": 0.9664911636259744, + "grad_norm": 1.8764612674713135, + "learning_rate": 1.987836818264289e-05, + "loss": 0.7312, + "step": 5920 + }, + { + "epoch": 0.9666544222684789, + "grad_norm": 1.9953573942184448, + "learning_rate": 1.9878318834318005e-05, + "loss": 0.892, + "step": 5921 + }, + { + "epoch": 0.9668176809109832, + "grad_norm": 2.0008957386016846, + "learning_rate": 1.9878269476045656e-05, + "loss": 0.9351, + "step": 5922 + }, + { + "epoch": 0.9669809395534876, + "grad_norm": 1.5710453987121582, + "learning_rate": 1.9878220107825892e-05, + "loss": 0.6207, + "step": 5923 + }, + { + "epoch": 0.967144198195992, + "grad_norm": 1.9178194999694824, + "learning_rate": 1.9878170729658762e-05, + "loss": 0.8535, + "step": 5924 + }, + { + "epoch": 0.9673074568384964, + "grad_norm": 1.4948939085006714, + "learning_rate": 1.9878121341544317e-05, + "loss": 0.6455, + "step": 5925 + }, + { + "epoch": 0.9674707154810007, + "grad_norm": 1.773796796798706, + "learning_rate": 1.987807194348261e-05, + "loss": 0.71, + "step": 5926 + }, + { + "epoch": 0.9676339741235052, + "grad_norm": 2.001394748687744, + "learning_rate": 1.9878022535473682e-05, + "loss": 0.9076, + "step": 5927 + }, + { + "epoch": 0.9677972327660096, + "grad_norm": 1.7112550735473633, + "learning_rate": 1.987797311751759e-05, + "loss": 0.7714, + "step": 5928 + }, + { + "epoch": 0.9679604914085139, + "grad_norm": 1.954550862312317, + "learning_rate": 1.9877923689614382e-05, + "loss": 0.8302, + "step": 5929 + }, + { + "epoch": 0.9681237500510184, + "grad_norm": 2.1458640098571777, + "learning_rate": 1.9877874251764108e-05, + "loss": 0.843, + "step": 5930 + }, + { + "epoch": 0.9682870086935227, + "grad_norm": 1.9994858503341675, + "learning_rate": 1.9877824803966818e-05, + "loss": 0.838, + "step": 5931 + }, + { + "epoch": 0.9684502673360271, + "grad_norm": 1.6891282796859741, + "learning_rate": 1.987777534622256e-05, + "loss": 0.6915, + "step": 5932 + }, + { + "epoch": 0.9686135259785315, + "grad_norm": 1.8957222700119019, + "learning_rate": 1.987772587853138e-05, + "loss": 0.7644, + "step": 5933 + }, + { + "epoch": 0.9687767846210359, + "grad_norm": 1.6775943040847778, + "learning_rate": 1.987767640089334e-05, + "loss": 0.7307, + "step": 5934 + }, + { + "epoch": 0.9689400432635402, + "grad_norm": 1.917017936706543, + "learning_rate": 1.987762691330848e-05, + "loss": 0.9459, + "step": 5935 + }, + { + "epoch": 0.9691033019060447, + "grad_norm": 1.6119433641433716, + "learning_rate": 1.987757741577685e-05, + "loss": 0.684, + "step": 5936 + }, + { + "epoch": 0.9692665605485491, + "grad_norm": 1.8658279180526733, + "learning_rate": 1.9877527908298503e-05, + "loss": 0.8066, + "step": 5937 + }, + { + "epoch": 0.9694298191910534, + "grad_norm": 1.8829654455184937, + "learning_rate": 1.987747839087349e-05, + "loss": 0.8023, + "step": 5938 + }, + { + "epoch": 0.9695930778335579, + "grad_norm": 1.4931589365005493, + "learning_rate": 1.9877428863501857e-05, + "loss": 0.6303, + "step": 5939 + }, + { + "epoch": 0.9697563364760622, + "grad_norm": 1.7371169328689575, + "learning_rate": 1.9877379326183656e-05, + "loss": 0.6894, + "step": 5940 + }, + { + "epoch": 0.9699195951185666, + "grad_norm": 1.8759992122650146, + "learning_rate": 1.9877329778918938e-05, + "loss": 0.861, + "step": 5941 + }, + { + "epoch": 0.970082853761071, + "grad_norm": 1.5274263620376587, + "learning_rate": 1.9877280221707752e-05, + "loss": 0.6898, + "step": 5942 + }, + { + "epoch": 0.9702461124035754, + "grad_norm": 1.8152787685394287, + "learning_rate": 1.9877230654550143e-05, + "loss": 0.8718, + "step": 5943 + }, + { + "epoch": 0.9704093710460797, + "grad_norm": 1.7864818572998047, + "learning_rate": 1.9877181077446172e-05, + "loss": 0.8864, + "step": 5944 + }, + { + "epoch": 0.9705726296885842, + "grad_norm": 2.062959671020508, + "learning_rate": 1.987713149039588e-05, + "loss": 1.026, + "step": 5945 + }, + { + "epoch": 0.9707358883310885, + "grad_norm": 1.7025115489959717, + "learning_rate": 1.9877081893399315e-05, + "loss": 0.711, + "step": 5946 + }, + { + "epoch": 0.9708991469735929, + "grad_norm": 1.8405207395553589, + "learning_rate": 1.9877032286456535e-05, + "loss": 0.8961, + "step": 5947 + }, + { + "epoch": 0.9710624056160974, + "grad_norm": 1.8922622203826904, + "learning_rate": 1.9876982669567585e-05, + "loss": 0.7401, + "step": 5948 + }, + { + "epoch": 0.9712256642586017, + "grad_norm": 1.89198637008667, + "learning_rate": 1.9876933042732517e-05, + "loss": 0.9072, + "step": 5949 + }, + { + "epoch": 0.9713889229011061, + "grad_norm": 1.819451093673706, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.8554, + "step": 5950 + }, + { + "epoch": 0.9715521815436104, + "grad_norm": 2.102905511856079, + "learning_rate": 1.9876833759224223e-05, + "loss": 0.9314, + "step": 5951 + }, + { + "epoch": 0.9717154401861149, + "grad_norm": 1.8134114742279053, + "learning_rate": 1.9876784102551098e-05, + "loss": 0.6426, + "step": 5952 + }, + { + "epoch": 0.9718786988286192, + "grad_norm": 1.63399338722229, + "learning_rate": 1.987673443593205e-05, + "loss": 0.7231, + "step": 5953 + }, + { + "epoch": 0.9720419574711237, + "grad_norm": 1.5192047357559204, + "learning_rate": 1.987668475936714e-05, + "loss": 0.6071, + "step": 5954 + }, + { + "epoch": 0.972205216113628, + "grad_norm": 1.8394055366516113, + "learning_rate": 1.987663507285641e-05, + "loss": 0.8735, + "step": 5955 + }, + { + "epoch": 0.9723684747561324, + "grad_norm": 1.5879292488098145, + "learning_rate": 1.9876585376399904e-05, + "loss": 0.6748, + "step": 5956 + }, + { + "epoch": 0.9725317333986367, + "grad_norm": 1.635048270225525, + "learning_rate": 1.9876535669997685e-05, + "loss": 0.7705, + "step": 5957 + }, + { + "epoch": 0.9726949920411412, + "grad_norm": 1.8931376934051514, + "learning_rate": 1.9876485953649795e-05, + "loss": 0.7077, + "step": 5958 + }, + { + "epoch": 0.9728582506836456, + "grad_norm": 1.8646506071090698, + "learning_rate": 1.9876436227356288e-05, + "loss": 0.7208, + "step": 5959 + }, + { + "epoch": 0.97302150932615, + "grad_norm": 1.542148232460022, + "learning_rate": 1.987638649111721e-05, + "loss": 0.6866, + "step": 5960 + }, + { + "epoch": 0.9731847679686544, + "grad_norm": 1.799831748008728, + "learning_rate": 1.9876336744932616e-05, + "loss": 0.8734, + "step": 5961 + }, + { + "epoch": 0.9733480266111587, + "grad_norm": 1.745521068572998, + "learning_rate": 1.9876286988802552e-05, + "loss": 0.7456, + "step": 5962 + }, + { + "epoch": 0.9735112852536632, + "grad_norm": 1.6802253723144531, + "learning_rate": 1.9876237222727072e-05, + "loss": 0.806, + "step": 5963 + }, + { + "epoch": 0.9736745438961675, + "grad_norm": 1.6264944076538086, + "learning_rate": 1.9876187446706222e-05, + "loss": 0.7477, + "step": 5964 + }, + { + "epoch": 0.9738378025386719, + "grad_norm": 1.580211877822876, + "learning_rate": 1.9876137660740054e-05, + "loss": 0.6917, + "step": 5965 + }, + { + "epoch": 0.9740010611811762, + "grad_norm": 1.4693100452423096, + "learning_rate": 1.9876087864828617e-05, + "loss": 0.681, + "step": 5966 + }, + { + "epoch": 0.9741643198236807, + "grad_norm": 1.7324347496032715, + "learning_rate": 1.9876038058971963e-05, + "loss": 0.7502, + "step": 5967 + }, + { + "epoch": 0.974327578466185, + "grad_norm": 1.9221038818359375, + "learning_rate": 1.987598824317014e-05, + "loss": 0.8654, + "step": 5968 + }, + { + "epoch": 0.9744908371086894, + "grad_norm": 2.1047980785369873, + "learning_rate": 1.98759384174232e-05, + "loss": 0.7481, + "step": 5969 + }, + { + "epoch": 0.9746540957511939, + "grad_norm": 1.6862722635269165, + "learning_rate": 1.9875888581731194e-05, + "loss": 0.7664, + "step": 5970 + }, + { + "epoch": 0.9748173543936982, + "grad_norm": 1.769893765449524, + "learning_rate": 1.9875838736094173e-05, + "loss": 0.7537, + "step": 5971 + }, + { + "epoch": 0.9749806130362026, + "grad_norm": 2.2298662662506104, + "learning_rate": 1.9875788880512183e-05, + "loss": 0.8482, + "step": 5972 + }, + { + "epoch": 0.975143871678707, + "grad_norm": 2.0298850536346436, + "learning_rate": 1.9875739014985273e-05, + "loss": 0.8494, + "step": 5973 + }, + { + "epoch": 0.9753071303212114, + "grad_norm": 1.6761415004730225, + "learning_rate": 1.98756891395135e-05, + "loss": 0.6759, + "step": 5974 + }, + { + "epoch": 0.9754703889637157, + "grad_norm": 1.8522803783416748, + "learning_rate": 1.9875639254096908e-05, + "loss": 0.7023, + "step": 5975 + }, + { + "epoch": 0.9756336476062202, + "grad_norm": 2.02699613571167, + "learning_rate": 1.9875589358735553e-05, + "loss": 0.9165, + "step": 5976 + }, + { + "epoch": 0.9757969062487245, + "grad_norm": 1.7743932008743286, + "learning_rate": 1.987553945342948e-05, + "loss": 0.6396, + "step": 5977 + }, + { + "epoch": 0.9759601648912289, + "grad_norm": 1.6438225507736206, + "learning_rate": 1.987548953817874e-05, + "loss": 0.6977, + "step": 5978 + }, + { + "epoch": 0.9761234235337333, + "grad_norm": 1.67996084690094, + "learning_rate": 1.9875439612983388e-05, + "loss": 0.6831, + "step": 5979 + }, + { + "epoch": 0.9762866821762377, + "grad_norm": 1.6769899129867554, + "learning_rate": 1.987538967784347e-05, + "loss": 0.5868, + "step": 5980 + }, + { + "epoch": 0.9764499408187421, + "grad_norm": 1.9860998392105103, + "learning_rate": 1.9875339732759037e-05, + "loss": 0.7618, + "step": 5981 + }, + { + "epoch": 0.9766131994612465, + "grad_norm": 1.8570220470428467, + "learning_rate": 1.9875289777730137e-05, + "loss": 0.7333, + "step": 5982 + }, + { + "epoch": 0.9767764581037509, + "grad_norm": 2.0959367752075195, + "learning_rate": 1.9875239812756826e-05, + "loss": 0.8431, + "step": 5983 + }, + { + "epoch": 0.9769397167462552, + "grad_norm": 1.5867465734481812, + "learning_rate": 1.987518983783915e-05, + "loss": 0.5917, + "step": 5984 + }, + { + "epoch": 0.9771029753887597, + "grad_norm": 2.1152360439300537, + "learning_rate": 1.9875139852977162e-05, + "loss": 0.8539, + "step": 5985 + }, + { + "epoch": 0.977266234031264, + "grad_norm": 1.6980233192443848, + "learning_rate": 1.9875089858170907e-05, + "loss": 0.6967, + "step": 5986 + }, + { + "epoch": 0.9774294926737684, + "grad_norm": 1.9800946712493896, + "learning_rate": 1.9875039853420445e-05, + "loss": 0.8382, + "step": 5987 + }, + { + "epoch": 0.9775927513162728, + "grad_norm": 1.9031760692596436, + "learning_rate": 1.9874989838725812e-05, + "loss": 0.8271, + "step": 5988 + }, + { + "epoch": 0.9777560099587772, + "grad_norm": 1.8787875175476074, + "learning_rate": 1.9874939814087074e-05, + "loss": 0.8431, + "step": 5989 + }, + { + "epoch": 0.9779192686012815, + "grad_norm": 1.6274372339248657, + "learning_rate": 1.9874889779504274e-05, + "loss": 0.6774, + "step": 5990 + }, + { + "epoch": 0.978082527243786, + "grad_norm": 2.103212356567383, + "learning_rate": 1.987483973497746e-05, + "loss": 0.9484, + "step": 5991 + }, + { + "epoch": 0.9782457858862904, + "grad_norm": 2.074878454208374, + "learning_rate": 1.9874789680506685e-05, + "loss": 0.8176, + "step": 5992 + }, + { + "epoch": 0.9784090445287947, + "grad_norm": 1.7970536947250366, + "learning_rate": 1.9874739616092e-05, + "loss": 0.7182, + "step": 5993 + }, + { + "epoch": 0.9785723031712992, + "grad_norm": 1.6808812618255615, + "learning_rate": 1.9874689541733455e-05, + "loss": 0.7016, + "step": 5994 + }, + { + "epoch": 0.9787355618138035, + "grad_norm": 1.6821799278259277, + "learning_rate": 1.98746394574311e-05, + "loss": 0.7992, + "step": 5995 + }, + { + "epoch": 0.9788988204563079, + "grad_norm": 1.5709832906723022, + "learning_rate": 1.9874589363184988e-05, + "loss": 0.7204, + "step": 5996 + }, + { + "epoch": 0.9790620790988123, + "grad_norm": 1.7833362817764282, + "learning_rate": 1.987453925899516e-05, + "loss": 0.8071, + "step": 5997 + }, + { + "epoch": 0.9792253377413167, + "grad_norm": 1.6405885219573975, + "learning_rate": 1.9874489144861683e-05, + "loss": 0.7564, + "step": 5998 + }, + { + "epoch": 0.979388596383821, + "grad_norm": 1.953478217124939, + "learning_rate": 1.987443902078459e-05, + "loss": 0.7531, + "step": 5999 + }, + { + "epoch": 0.9795518550263255, + "grad_norm": 1.6311215162277222, + "learning_rate": 1.9874388886763944e-05, + "loss": 0.6578, + "step": 6000 + }, + { + "epoch": 0.9797151136688298, + "grad_norm": 1.6680793762207031, + "learning_rate": 1.987433874279979e-05, + "loss": 0.7281, + "step": 6001 + }, + { + "epoch": 0.9798783723113342, + "grad_norm": 1.6072957515716553, + "learning_rate": 1.987428858889218e-05, + "loss": 0.6269, + "step": 6002 + }, + { + "epoch": 0.9800416309538387, + "grad_norm": 1.8018102645874023, + "learning_rate": 1.9874238425041164e-05, + "loss": 0.8311, + "step": 6003 + }, + { + "epoch": 0.980204889596343, + "grad_norm": 1.9095489978790283, + "learning_rate": 1.987418825124679e-05, + "loss": 0.8519, + "step": 6004 + }, + { + "epoch": 0.9803681482388474, + "grad_norm": 1.6413098573684692, + "learning_rate": 1.9874138067509116e-05, + "loss": 0.6749, + "step": 6005 + }, + { + "epoch": 0.9805314068813518, + "grad_norm": 1.5669972896575928, + "learning_rate": 1.9874087873828185e-05, + "loss": 0.6804, + "step": 6006 + }, + { + "epoch": 0.9806946655238562, + "grad_norm": 1.9253073930740356, + "learning_rate": 1.987403767020405e-05, + "loss": 0.8547, + "step": 6007 + }, + { + "epoch": 0.9808579241663605, + "grad_norm": 1.6400352716445923, + "learning_rate": 1.987398745663676e-05, + "loss": 0.6881, + "step": 6008 + }, + { + "epoch": 0.981021182808865, + "grad_norm": 1.9098623991012573, + "learning_rate": 1.987393723312637e-05, + "loss": 0.728, + "step": 6009 + }, + { + "epoch": 0.9811844414513693, + "grad_norm": 1.4521044492721558, + "learning_rate": 1.9873886999672927e-05, + "loss": 0.5883, + "step": 6010 + }, + { + "epoch": 0.9813477000938737, + "grad_norm": 1.7754608392715454, + "learning_rate": 1.9873836756276482e-05, + "loss": 0.7166, + "step": 6011 + }, + { + "epoch": 0.9815109587363781, + "grad_norm": 1.4899892807006836, + "learning_rate": 1.9873786502937086e-05, + "loss": 0.7209, + "step": 6012 + }, + { + "epoch": 0.9816742173788825, + "grad_norm": 1.8073278665542603, + "learning_rate": 1.9873736239654787e-05, + "loss": 0.6165, + "step": 6013 + }, + { + "epoch": 0.9818374760213869, + "grad_norm": 2.1747896671295166, + "learning_rate": 1.9873685966429646e-05, + "loss": 0.9362, + "step": 6014 + }, + { + "epoch": 0.9820007346638913, + "grad_norm": 1.6571619510650635, + "learning_rate": 1.98736356832617e-05, + "loss": 0.7174, + "step": 6015 + }, + { + "epoch": 0.9821639933063957, + "grad_norm": 1.764414668083191, + "learning_rate": 1.9873585390151003e-05, + "loss": 0.6344, + "step": 6016 + }, + { + "epoch": 0.9823272519489, + "grad_norm": 1.7066560983657837, + "learning_rate": 1.9873535087097614e-05, + "loss": 0.6145, + "step": 6017 + }, + { + "epoch": 0.9824905105914045, + "grad_norm": 1.8135972023010254, + "learning_rate": 1.9873484774101576e-05, + "loss": 0.7636, + "step": 6018 + }, + { + "epoch": 0.9826537692339088, + "grad_norm": 2.093597412109375, + "learning_rate": 1.987343445116294e-05, + "loss": 0.9134, + "step": 6019 + }, + { + "epoch": 0.9828170278764132, + "grad_norm": 1.9487853050231934, + "learning_rate": 1.987338411828176e-05, + "loss": 0.7286, + "step": 6020 + }, + { + "epoch": 0.9829802865189176, + "grad_norm": 2.1160693168640137, + "learning_rate": 1.9873333775458082e-05, + "loss": 0.8427, + "step": 6021 + }, + { + "epoch": 0.983143545161422, + "grad_norm": 2.046687126159668, + "learning_rate": 1.987328342269196e-05, + "loss": 0.7701, + "step": 6022 + }, + { + "epoch": 0.9833068038039263, + "grad_norm": 2.279602527618408, + "learning_rate": 1.9873233059983446e-05, + "loss": 0.8699, + "step": 6023 + }, + { + "epoch": 0.9834700624464308, + "grad_norm": 1.8102754354476929, + "learning_rate": 1.987318268733259e-05, + "loss": 0.7004, + "step": 6024 + }, + { + "epoch": 0.9836333210889352, + "grad_norm": 1.9187486171722412, + "learning_rate": 1.987313230473944e-05, + "loss": 0.7657, + "step": 6025 + }, + { + "epoch": 0.9837965797314395, + "grad_norm": 2.090390682220459, + "learning_rate": 1.9873081912204048e-05, + "loss": 1.0659, + "step": 6026 + }, + { + "epoch": 0.983959838373944, + "grad_norm": 1.8013267517089844, + "learning_rate": 1.9873031509726463e-05, + "loss": 0.8419, + "step": 6027 + }, + { + "epoch": 0.9841230970164483, + "grad_norm": 2.1475915908813477, + "learning_rate": 1.987298109730674e-05, + "loss": 0.9188, + "step": 6028 + }, + { + "epoch": 0.9842863556589527, + "grad_norm": 2.16963529586792, + "learning_rate": 1.987293067494493e-05, + "loss": 0.8846, + "step": 6029 + }, + { + "epoch": 0.984449614301457, + "grad_norm": 2.2411036491394043, + "learning_rate": 1.9872880242641078e-05, + "loss": 0.7537, + "step": 6030 + }, + { + "epoch": 0.9846128729439615, + "grad_norm": 1.4837383031845093, + "learning_rate": 1.987282980039524e-05, + "loss": 0.6128, + "step": 6031 + }, + { + "epoch": 0.9847761315864658, + "grad_norm": 1.7958338260650635, + "learning_rate": 1.9872779348207465e-05, + "loss": 0.8465, + "step": 6032 + }, + { + "epoch": 0.9849393902289703, + "grad_norm": 1.8919084072113037, + "learning_rate": 1.9872728886077802e-05, + "loss": 0.6389, + "step": 6033 + }, + { + "epoch": 0.9851026488714746, + "grad_norm": 1.8457815647125244, + "learning_rate": 1.9872678414006306e-05, + "loss": 0.7177, + "step": 6034 + }, + { + "epoch": 0.985265907513979, + "grad_norm": 1.7271432876586914, + "learning_rate": 1.9872627931993026e-05, + "loss": 0.6808, + "step": 6035 + }, + { + "epoch": 0.9854291661564835, + "grad_norm": 1.746943473815918, + "learning_rate": 1.987257744003801e-05, + "loss": 0.7321, + "step": 6036 + }, + { + "epoch": 0.9855924247989878, + "grad_norm": 1.8206374645233154, + "learning_rate": 1.9872526938141313e-05, + "loss": 0.7911, + "step": 6037 + }, + { + "epoch": 0.9857556834414922, + "grad_norm": 1.9376498460769653, + "learning_rate": 1.9872476426302983e-05, + "loss": 0.6821, + "step": 6038 + }, + { + "epoch": 0.9859189420839966, + "grad_norm": 1.9760218858718872, + "learning_rate": 1.987242590452307e-05, + "loss": 0.8867, + "step": 6039 + }, + { + "epoch": 0.986082200726501, + "grad_norm": 1.9285062551498413, + "learning_rate": 1.9872375372801627e-05, + "loss": 0.7281, + "step": 6040 + }, + { + "epoch": 0.9862454593690053, + "grad_norm": 1.731630563735962, + "learning_rate": 1.987232483113871e-05, + "loss": 0.6779, + "step": 6041 + }, + { + "epoch": 0.9864087180115098, + "grad_norm": 1.5643675327301025, + "learning_rate": 1.987227427953436e-05, + "loss": 0.6119, + "step": 6042 + }, + { + "epoch": 0.9865719766540141, + "grad_norm": 1.5938082933425903, + "learning_rate": 1.9872223717988632e-05, + "loss": 0.7078, + "step": 6043 + }, + { + "epoch": 0.9867352352965185, + "grad_norm": 1.920799970626831, + "learning_rate": 1.9872173146501577e-05, + "loss": 0.7572, + "step": 6044 + }, + { + "epoch": 0.9868984939390228, + "grad_norm": 2.1344242095947266, + "learning_rate": 1.987212256507325e-05, + "loss": 0.984, + "step": 6045 + }, + { + "epoch": 0.9870617525815273, + "grad_norm": 1.501420021057129, + "learning_rate": 1.9872071973703695e-05, + "loss": 0.6223, + "step": 6046 + }, + { + "epoch": 0.9872250112240317, + "grad_norm": 2.3442068099975586, + "learning_rate": 1.987202137239297e-05, + "loss": 0.7741, + "step": 6047 + }, + { + "epoch": 0.987388269866536, + "grad_norm": 1.8293782472610474, + "learning_rate": 1.9871970761141116e-05, + "loss": 0.7081, + "step": 6048 + }, + { + "epoch": 0.9875515285090405, + "grad_norm": 2.0163910388946533, + "learning_rate": 1.9871920139948193e-05, + "loss": 0.6837, + "step": 6049 + }, + { + "epoch": 0.9877147871515448, + "grad_norm": 1.9918174743652344, + "learning_rate": 1.987186950881425e-05, + "loss": 0.6608, + "step": 6050 + }, + { + "epoch": 0.9878780457940493, + "grad_norm": 1.8569176197052002, + "learning_rate": 1.9871818867739336e-05, + "loss": 0.7203, + "step": 6051 + }, + { + "epoch": 0.9880413044365536, + "grad_norm": 1.4728494882583618, + "learning_rate": 1.9871768216723504e-05, + "loss": 0.6783, + "step": 6052 + }, + { + "epoch": 0.988204563079058, + "grad_norm": 1.806503415107727, + "learning_rate": 1.9871717555766802e-05, + "loss": 0.7128, + "step": 6053 + }, + { + "epoch": 0.9883678217215623, + "grad_norm": 1.6558796167373657, + "learning_rate": 1.9871666884869284e-05, + "loss": 0.7896, + "step": 6054 + }, + { + "epoch": 0.9885310803640668, + "grad_norm": 1.5507030487060547, + "learning_rate": 1.9871616204031e-05, + "loss": 0.757, + "step": 6055 + }, + { + "epoch": 0.9886943390065711, + "grad_norm": 2.423875331878662, + "learning_rate": 1.9871565513252004e-05, + "loss": 0.9425, + "step": 6056 + }, + { + "epoch": 0.9888575976490755, + "grad_norm": 1.7223186492919922, + "learning_rate": 1.9871514812532343e-05, + "loss": 0.9469, + "step": 6057 + }, + { + "epoch": 0.98902085629158, + "grad_norm": 1.9923958778381348, + "learning_rate": 1.9871464101872066e-05, + "loss": 0.9279, + "step": 6058 + }, + { + "epoch": 0.9891841149340843, + "grad_norm": 1.9704676866531372, + "learning_rate": 1.9871413381271227e-05, + "loss": 0.7176, + "step": 6059 + }, + { + "epoch": 0.9893473735765888, + "grad_norm": 1.9575555324554443, + "learning_rate": 1.987136265072988e-05, + "loss": 0.8085, + "step": 6060 + }, + { + "epoch": 0.9895106322190931, + "grad_norm": 1.8110486268997192, + "learning_rate": 1.9871311910248074e-05, + "loss": 0.8116, + "step": 6061 + }, + { + "epoch": 0.9896738908615975, + "grad_norm": 1.7949093580245972, + "learning_rate": 1.9871261159825858e-05, + "loss": 0.8535, + "step": 6062 + }, + { + "epoch": 0.9898371495041018, + "grad_norm": 1.7336087226867676, + "learning_rate": 1.9871210399463287e-05, + "loss": 0.833, + "step": 6063 + }, + { + "epoch": 0.9900004081466063, + "grad_norm": 1.7822155952453613, + "learning_rate": 1.9871159629160405e-05, + "loss": 0.6829, + "step": 6064 + }, + { + "epoch": 0.9901636667891106, + "grad_norm": 1.9211289882659912, + "learning_rate": 1.9871108848917272e-05, + "loss": 0.8141, + "step": 6065 + }, + { + "epoch": 0.990326925431615, + "grad_norm": 1.6113996505737305, + "learning_rate": 1.987105805873393e-05, + "loss": 0.7642, + "step": 6066 + }, + { + "epoch": 0.9904901840741194, + "grad_norm": 1.684557557106018, + "learning_rate": 1.987100725861044e-05, + "loss": 0.7923, + "step": 6067 + }, + { + "epoch": 0.9906534427166238, + "grad_norm": 1.6841557025909424, + "learning_rate": 1.987095644854685e-05, + "loss": 0.692, + "step": 6068 + }, + { + "epoch": 0.9908167013591282, + "grad_norm": 1.7554411888122559, + "learning_rate": 1.9870905628543204e-05, + "loss": 0.7748, + "step": 6069 + }, + { + "epoch": 0.9909799600016326, + "grad_norm": 2.031614065170288, + "learning_rate": 1.9870854798599563e-05, + "loss": 0.8937, + "step": 6070 + }, + { + "epoch": 0.991143218644137, + "grad_norm": 2.0891551971435547, + "learning_rate": 1.987080395871597e-05, + "loss": 0.8405, + "step": 6071 + }, + { + "epoch": 0.9913064772866413, + "grad_norm": 2.0731348991394043, + "learning_rate": 1.9870753108892483e-05, + "loss": 0.7968, + "step": 6072 + }, + { + "epoch": 0.9914697359291458, + "grad_norm": 1.9249430894851685, + "learning_rate": 1.987070224912915e-05, + "loss": 0.7778, + "step": 6073 + }, + { + "epoch": 0.9916329945716501, + "grad_norm": 1.6516939401626587, + "learning_rate": 1.9870651379426022e-05, + "loss": 0.6959, + "step": 6074 + }, + { + "epoch": 0.9917962532141545, + "grad_norm": 1.8190982341766357, + "learning_rate": 1.987060049978315e-05, + "loss": 0.7352, + "step": 6075 + }, + { + "epoch": 0.9919595118566589, + "grad_norm": 1.8975927829742432, + "learning_rate": 1.9870549610200587e-05, + "loss": 0.8362, + "step": 6076 + }, + { + "epoch": 0.9921227704991633, + "grad_norm": 1.655179500579834, + "learning_rate": 1.9870498710678383e-05, + "loss": 0.7377, + "step": 6077 + }, + { + "epoch": 0.9922860291416676, + "grad_norm": 1.9465856552124023, + "learning_rate": 1.987044780121659e-05, + "loss": 0.9519, + "step": 6078 + }, + { + "epoch": 0.9924492877841721, + "grad_norm": 1.9071656465530396, + "learning_rate": 1.987039688181526e-05, + "loss": 0.8051, + "step": 6079 + }, + { + "epoch": 0.9926125464266765, + "grad_norm": 1.632247805595398, + "learning_rate": 1.9870345952474436e-05, + "loss": 0.6219, + "step": 6080 + }, + { + "epoch": 0.9927758050691808, + "grad_norm": 1.689948320388794, + "learning_rate": 1.9870295013194183e-05, + "loss": 0.6615, + "step": 6081 + }, + { + "epoch": 0.9929390637116853, + "grad_norm": 2.5762290954589844, + "learning_rate": 1.987024406397454e-05, + "loss": 0.7496, + "step": 6082 + }, + { + "epoch": 0.9931023223541896, + "grad_norm": 1.4671343564987183, + "learning_rate": 1.987019310481557e-05, + "loss": 0.5928, + "step": 6083 + }, + { + "epoch": 0.993265580996694, + "grad_norm": 1.5476707220077515, + "learning_rate": 1.9870142135717314e-05, + "loss": 0.6122, + "step": 6084 + }, + { + "epoch": 0.9934288396391984, + "grad_norm": 1.7163381576538086, + "learning_rate": 1.9870091156679828e-05, + "loss": 0.6932, + "step": 6085 + }, + { + "epoch": 0.9935920982817028, + "grad_norm": 1.8964236974716187, + "learning_rate": 1.9870040167703166e-05, + "loss": 0.8945, + "step": 6086 + }, + { + "epoch": 0.9937553569242071, + "grad_norm": 1.7007569074630737, + "learning_rate": 1.9869989168787372e-05, + "loss": 0.8272, + "step": 6087 + }, + { + "epoch": 0.9939186155667116, + "grad_norm": 2.0102131366729736, + "learning_rate": 1.9869938159932504e-05, + "loss": 0.8839, + "step": 6088 + }, + { + "epoch": 0.9940818742092159, + "grad_norm": 1.9050819873809814, + "learning_rate": 1.9869887141138612e-05, + "loss": 0.6724, + "step": 6089 + }, + { + "epoch": 0.9942451328517203, + "grad_norm": 3.572845458984375, + "learning_rate": 1.9869836112405744e-05, + "loss": 0.8055, + "step": 6090 + }, + { + "epoch": 0.9944083914942248, + "grad_norm": 1.8550292253494263, + "learning_rate": 1.9869785073733953e-05, + "loss": 0.838, + "step": 6091 + }, + { + "epoch": 0.9945716501367291, + "grad_norm": 1.958802580833435, + "learning_rate": 1.9869734025123292e-05, + "loss": 0.7308, + "step": 6092 + }, + { + "epoch": 0.9947349087792335, + "grad_norm": 2.0518805980682373, + "learning_rate": 1.9869682966573814e-05, + "loss": 0.8529, + "step": 6093 + }, + { + "epoch": 0.9948981674217379, + "grad_norm": 1.6574863195419312, + "learning_rate": 1.9869631898085563e-05, + "loss": 0.7876, + "step": 6094 + }, + { + "epoch": 0.9950614260642423, + "grad_norm": 2.1080191135406494, + "learning_rate": 1.98695808196586e-05, + "loss": 0.7873, + "step": 6095 + }, + { + "epoch": 0.9952246847067466, + "grad_norm": 1.8979504108428955, + "learning_rate": 1.986952973129297e-05, + "loss": 0.8885, + "step": 6096 + }, + { + "epoch": 0.9953879433492511, + "grad_norm": 1.9292210340499878, + "learning_rate": 1.9869478632988724e-05, + "loss": 0.7992, + "step": 6097 + }, + { + "epoch": 0.9955512019917554, + "grad_norm": 1.9728667736053467, + "learning_rate": 1.986942752474592e-05, + "loss": 0.9235, + "step": 6098 + }, + { + "epoch": 0.9957144606342598, + "grad_norm": 1.4399513006210327, + "learning_rate": 1.98693764065646e-05, + "loss": 0.5895, + "step": 6099 + }, + { + "epoch": 0.9958777192767642, + "grad_norm": 1.7028934955596924, + "learning_rate": 1.9869325278444824e-05, + "loss": 0.595, + "step": 6100 + }, + { + "epoch": 0.9960409779192686, + "grad_norm": 1.6393214464187622, + "learning_rate": 1.986927414038664e-05, + "loss": 0.6776, + "step": 6101 + }, + { + "epoch": 0.996204236561773, + "grad_norm": 4.110625267028809, + "learning_rate": 1.98692229923901e-05, + "loss": 0.7828, + "step": 6102 + }, + { + "epoch": 0.9963674952042774, + "grad_norm": 1.804225206375122, + "learning_rate": 1.9869171834455253e-05, + "loss": 0.7869, + "step": 6103 + }, + { + "epoch": 0.9965307538467818, + "grad_norm": 1.6790324449539185, + "learning_rate": 1.9869120666582153e-05, + "loss": 0.6917, + "step": 6104 + }, + { + "epoch": 0.9966940124892861, + "grad_norm": 1.5569666624069214, + "learning_rate": 1.9869069488770853e-05, + "loss": 0.7547, + "step": 6105 + }, + { + "epoch": 0.9968572711317906, + "grad_norm": 1.550554871559143, + "learning_rate": 1.98690183010214e-05, + "loss": 0.6964, + "step": 6106 + }, + { + "epoch": 0.9970205297742949, + "grad_norm": 1.5028797388076782, + "learning_rate": 1.986896710333385e-05, + "loss": 0.7396, + "step": 6107 + }, + { + "epoch": 0.9971837884167993, + "grad_norm": 1.733398675918579, + "learning_rate": 1.986891589570825e-05, + "loss": 0.7703, + "step": 6108 + }, + { + "epoch": 0.9973470470593037, + "grad_norm": 2.112023115158081, + "learning_rate": 1.9868864678144658e-05, + "loss": 0.7964, + "step": 6109 + }, + { + "epoch": 0.9975103057018081, + "grad_norm": 1.8998324871063232, + "learning_rate": 1.9868813450643118e-05, + "loss": 0.7696, + "step": 6110 + }, + { + "epoch": 0.9976735643443124, + "grad_norm": 2.2745769023895264, + "learning_rate": 1.986876221320369e-05, + "loss": 0.9555, + "step": 6111 + }, + { + "epoch": 0.9978368229868169, + "grad_norm": 1.8462913036346436, + "learning_rate": 1.986871096582642e-05, + "loss": 0.7811, + "step": 6112 + }, + { + "epoch": 0.9980000816293213, + "grad_norm": 1.734758734703064, + "learning_rate": 1.9868659708511357e-05, + "loss": 0.6561, + "step": 6113 + }, + { + "epoch": 0.9981633402718256, + "grad_norm": 1.6623098850250244, + "learning_rate": 1.9868608441258558e-05, + "loss": 0.6602, + "step": 6114 + }, + { + "epoch": 0.9983265989143301, + "grad_norm": 1.757601261138916, + "learning_rate": 1.9868557164068073e-05, + "loss": 0.7259, + "step": 6115 + }, + { + "epoch": 0.9984898575568344, + "grad_norm": 2.138526439666748, + "learning_rate": 1.9868505876939954e-05, + "loss": 0.8225, + "step": 6116 + }, + { + "epoch": 0.9986531161993388, + "grad_norm": 1.7662920951843262, + "learning_rate": 1.9868454579874255e-05, + "loss": 0.723, + "step": 6117 + }, + { + "epoch": 0.9988163748418432, + "grad_norm": 1.8942501544952393, + "learning_rate": 1.9868403272871023e-05, + "loss": 0.771, + "step": 6118 + }, + { + "epoch": 0.9989796334843476, + "grad_norm": 2.097524642944336, + "learning_rate": 1.986835195593031e-05, + "loss": 0.8086, + "step": 6119 + }, + { + "epoch": 0.9991428921268519, + "grad_norm": 1.8089945316314697, + "learning_rate": 1.986830062905217e-05, + "loss": 0.7581, + "step": 6120 + }, + { + "epoch": 0.9993061507693564, + "grad_norm": 1.7585166692733765, + "learning_rate": 1.986824929223665e-05, + "loss": 0.8141, + "step": 6121 + }, + { + "epoch": 0.9994694094118607, + "grad_norm": 1.9036017656326294, + "learning_rate": 1.9868197945483813e-05, + "loss": 0.7553, + "step": 6122 + }, + { + "epoch": 0.9996326680543651, + "grad_norm": 1.3933123350143433, + "learning_rate": 1.98681465887937e-05, + "loss": 0.5856, + "step": 6123 + }, + { + "epoch": 0.9997959266968696, + "grad_norm": 1.8385380506515503, + "learning_rate": 1.9868095222166364e-05, + "loss": 0.6761, + "step": 6124 + }, + { + "epoch": 0.9999591853393739, + "grad_norm": 1.7274433374404907, + "learning_rate": 1.9868043845601863e-05, + "loss": 0.7228, + "step": 6125 + }, + { + "epoch": 1.0, + "grad_norm": 4.170963287353516, + "learning_rate": 1.986799245910024e-05, + "loss": 0.9424, + "step": 6126 + }, + { + "epoch": 1.0001632586425044, + "grad_norm": 1.607887864112854, + "learning_rate": 1.9867941062661555e-05, + "loss": 0.5744, + "step": 6127 + }, + { + "epoch": 1.0003265172850089, + "grad_norm": 1.720959186553955, + "learning_rate": 1.9867889656285854e-05, + "loss": 0.7406, + "step": 6128 + }, + { + "epoch": 1.000489775927513, + "grad_norm": 1.9235025644302368, + "learning_rate": 1.986783823997319e-05, + "loss": 0.8196, + "step": 6129 + }, + { + "epoch": 1.0006530345700175, + "grad_norm": 2.0748870372772217, + "learning_rate": 1.9867786813723615e-05, + "loss": 0.7751, + "step": 6130 + }, + { + "epoch": 1.000816293212522, + "grad_norm": 1.7143235206604004, + "learning_rate": 1.9867735377537186e-05, + "loss": 0.7054, + "step": 6131 + }, + { + "epoch": 1.0009795518550264, + "grad_norm": 1.737565040588379, + "learning_rate": 1.9867683931413942e-05, + "loss": 0.6615, + "step": 6132 + }, + { + "epoch": 1.0011428104975306, + "grad_norm": 1.840692162513733, + "learning_rate": 1.986763247535395e-05, + "loss": 0.6369, + "step": 6133 + }, + { + "epoch": 1.001306069140035, + "grad_norm": 1.8484382629394531, + "learning_rate": 1.986758100935725e-05, + "loss": 0.8318, + "step": 6134 + }, + { + "epoch": 1.0014693277825395, + "grad_norm": 1.841214895248413, + "learning_rate": 1.9867529533423903e-05, + "loss": 0.7698, + "step": 6135 + }, + { + "epoch": 1.001632586425044, + "grad_norm": 2.356635808944702, + "learning_rate": 1.9867478047553957e-05, + "loss": 0.9288, + "step": 6136 + }, + { + "epoch": 1.0017958450675484, + "grad_norm": 1.6894954442977905, + "learning_rate": 1.9867426551747457e-05, + "loss": 0.7143, + "step": 6137 + }, + { + "epoch": 1.0019591037100526, + "grad_norm": 1.8318668603897095, + "learning_rate": 1.9867375046004467e-05, + "loss": 0.7396, + "step": 6138 + }, + { + "epoch": 1.002122362352557, + "grad_norm": 1.9993305206298828, + "learning_rate": 1.986732353032503e-05, + "loss": 0.753, + "step": 6139 + }, + { + "epoch": 1.0022856209950615, + "grad_norm": 1.6902981996536255, + "learning_rate": 1.98672720047092e-05, + "loss": 0.5345, + "step": 6140 + }, + { + "epoch": 1.002448879637566, + "grad_norm": 1.8155654668807983, + "learning_rate": 1.9867220469157035e-05, + "loss": 0.7035, + "step": 6141 + }, + { + "epoch": 1.0026121382800701, + "grad_norm": 1.6589241027832031, + "learning_rate": 1.9867168923668573e-05, + "loss": 0.6555, + "step": 6142 + }, + { + "epoch": 1.0027753969225746, + "grad_norm": 2.3357906341552734, + "learning_rate": 1.986711736824388e-05, + "loss": 0.8881, + "step": 6143 + }, + { + "epoch": 1.002938655565079, + "grad_norm": 1.7395215034484863, + "learning_rate": 1.9867065802883004e-05, + "loss": 0.7562, + "step": 6144 + }, + { + "epoch": 1.0031019142075834, + "grad_norm": 1.889523983001709, + "learning_rate": 1.9867014227585992e-05, + "loss": 0.7889, + "step": 6145 + }, + { + "epoch": 1.0032651728500876, + "grad_norm": 1.6654964685440063, + "learning_rate": 1.98669626423529e-05, + "loss": 0.5987, + "step": 6146 + }, + { + "epoch": 1.003428431492592, + "grad_norm": 1.7797136306762695, + "learning_rate": 1.9866911047183782e-05, + "loss": 0.669, + "step": 6147 + }, + { + "epoch": 1.0035916901350965, + "grad_norm": 2.866408586502075, + "learning_rate": 1.986685944207868e-05, + "loss": 0.7728, + "step": 6148 + }, + { + "epoch": 1.003754948777601, + "grad_norm": 1.8780059814453125, + "learning_rate": 1.986680782703766e-05, + "loss": 0.6784, + "step": 6149 + }, + { + "epoch": 1.0039182074201054, + "grad_norm": 1.9424911737442017, + "learning_rate": 1.9866756202060764e-05, + "loss": 0.7224, + "step": 6150 + }, + { + "epoch": 1.0040814660626096, + "grad_norm": 1.5416868925094604, + "learning_rate": 1.986670456714805e-05, + "loss": 0.7022, + "step": 6151 + }, + { + "epoch": 1.004244724705114, + "grad_norm": 1.6087270975112915, + "learning_rate": 1.9866652922299563e-05, + "loss": 0.682, + "step": 6152 + }, + { + "epoch": 1.0044079833476185, + "grad_norm": 2.1810262203216553, + "learning_rate": 1.9866601267515363e-05, + "loss": 0.8818, + "step": 6153 + }, + { + "epoch": 1.004571241990123, + "grad_norm": 2.1084063053131104, + "learning_rate": 1.9866549602795494e-05, + "loss": 0.8209, + "step": 6154 + }, + { + "epoch": 1.0047345006326271, + "grad_norm": 1.9017915725708008, + "learning_rate": 1.9866497928140017e-05, + "loss": 0.8047, + "step": 6155 + }, + { + "epoch": 1.0048977592751316, + "grad_norm": 1.7743273973464966, + "learning_rate": 1.986644624354898e-05, + "loss": 0.6562, + "step": 6156 + }, + { + "epoch": 1.005061017917636, + "grad_norm": 1.8690946102142334, + "learning_rate": 1.986639454902243e-05, + "loss": 0.7204, + "step": 6157 + }, + { + "epoch": 1.0052242765601405, + "grad_norm": 1.727832555770874, + "learning_rate": 1.9866342844560422e-05, + "loss": 0.6859, + "step": 6158 + }, + { + "epoch": 1.005387535202645, + "grad_norm": 1.7663624286651611, + "learning_rate": 1.9866291130163013e-05, + "loss": 0.7951, + "step": 6159 + }, + { + "epoch": 1.0055507938451491, + "grad_norm": 1.5686070919036865, + "learning_rate": 1.986623940583025e-05, + "loss": 0.6854, + "step": 6160 + }, + { + "epoch": 1.0057140524876536, + "grad_norm": 1.9059823751449585, + "learning_rate": 1.9866187671562185e-05, + "loss": 0.7349, + "step": 6161 + }, + { + "epoch": 1.005877311130158, + "grad_norm": 1.5141847133636475, + "learning_rate": 1.9866135927358872e-05, + "loss": 0.5795, + "step": 6162 + }, + { + "epoch": 1.0060405697726624, + "grad_norm": 1.505925178527832, + "learning_rate": 1.9866084173220364e-05, + "loss": 0.5691, + "step": 6163 + }, + { + "epoch": 1.0062038284151666, + "grad_norm": 1.7032802104949951, + "learning_rate": 1.9866032409146716e-05, + "loss": 0.7281, + "step": 6164 + }, + { + "epoch": 1.006367087057671, + "grad_norm": 1.741918683052063, + "learning_rate": 1.986598063513797e-05, + "loss": 0.7371, + "step": 6165 + }, + { + "epoch": 1.0065303457001755, + "grad_norm": 1.6459757089614868, + "learning_rate": 1.986592885119419e-05, + "loss": 0.6124, + "step": 6166 + }, + { + "epoch": 1.00669360434268, + "grad_norm": 1.9404174089431763, + "learning_rate": 1.9865877057315416e-05, + "loss": 0.7492, + "step": 6167 + }, + { + "epoch": 1.0068568629851842, + "grad_norm": 1.5976184606552124, + "learning_rate": 1.9865825253501708e-05, + "loss": 0.5577, + "step": 6168 + }, + { + "epoch": 1.0070201216276886, + "grad_norm": 1.6678483486175537, + "learning_rate": 1.9865773439753118e-05, + "loss": 0.6452, + "step": 6169 + }, + { + "epoch": 1.007183380270193, + "grad_norm": 1.5222314596176147, + "learning_rate": 1.9865721616069695e-05, + "loss": 0.6555, + "step": 6170 + }, + { + "epoch": 1.0073466389126975, + "grad_norm": 1.7038832902908325, + "learning_rate": 1.9865669782451493e-05, + "loss": 0.798, + "step": 6171 + }, + { + "epoch": 1.007509897555202, + "grad_norm": 1.8904991149902344, + "learning_rate": 1.9865617938898568e-05, + "loss": 0.8571, + "step": 6172 + }, + { + "epoch": 1.0076731561977061, + "grad_norm": 1.5897445678710938, + "learning_rate": 1.9865566085410966e-05, + "loss": 0.6717, + "step": 6173 + }, + { + "epoch": 1.0078364148402106, + "grad_norm": 1.865080714225769, + "learning_rate": 1.986551422198874e-05, + "loss": 0.803, + "step": 6174 + }, + { + "epoch": 1.007999673482715, + "grad_norm": 1.6964479684829712, + "learning_rate": 1.9865462348631945e-05, + "loss": 0.6113, + "step": 6175 + }, + { + "epoch": 1.0081629321252195, + "grad_norm": 1.9108397960662842, + "learning_rate": 1.9865410465340635e-05, + "loss": 0.7235, + "step": 6176 + }, + { + "epoch": 1.0083261907677237, + "grad_norm": 2.0547409057617188, + "learning_rate": 1.9865358572114855e-05, + "loss": 0.779, + "step": 6177 + }, + { + "epoch": 1.0084894494102281, + "grad_norm": 1.8377087116241455, + "learning_rate": 1.9865306668954662e-05, + "loss": 0.678, + "step": 6178 + }, + { + "epoch": 1.0086527080527325, + "grad_norm": 1.8337879180908203, + "learning_rate": 1.9865254755860105e-05, + "loss": 0.6748, + "step": 6179 + }, + { + "epoch": 1.008815966695237, + "grad_norm": 1.7516900300979614, + "learning_rate": 1.9865202832831243e-05, + "loss": 0.7469, + "step": 6180 + }, + { + "epoch": 1.0089792253377414, + "grad_norm": 2.2443320751190186, + "learning_rate": 1.9865150899868126e-05, + "loss": 0.6674, + "step": 6181 + }, + { + "epoch": 1.0091424839802456, + "grad_norm": 1.7198677062988281, + "learning_rate": 1.9865098956970802e-05, + "loss": 0.607, + "step": 6182 + }, + { + "epoch": 1.00930574262275, + "grad_norm": 1.375084400177002, + "learning_rate": 1.9865047004139327e-05, + "loss": 0.4885, + "step": 6183 + }, + { + "epoch": 1.0094690012652545, + "grad_norm": 1.7029366493225098, + "learning_rate": 1.9864995041373755e-05, + "loss": 0.6846, + "step": 6184 + }, + { + "epoch": 1.009632259907759, + "grad_norm": 1.5492157936096191, + "learning_rate": 1.986494306867413e-05, + "loss": 0.5639, + "step": 6185 + }, + { + "epoch": 1.0097955185502632, + "grad_norm": 1.7408475875854492, + "learning_rate": 1.9864891086040515e-05, + "loss": 0.9219, + "step": 6186 + }, + { + "epoch": 1.0099587771927676, + "grad_norm": 1.8497453927993774, + "learning_rate": 1.9864839093472952e-05, + "loss": 0.6364, + "step": 6187 + }, + { + "epoch": 1.010122035835272, + "grad_norm": 1.5509625673294067, + "learning_rate": 1.9864787090971502e-05, + "loss": 0.6101, + "step": 6188 + }, + { + "epoch": 1.0102852944777765, + "grad_norm": 1.7902127504348755, + "learning_rate": 1.9864735078536213e-05, + "loss": 0.6635, + "step": 6189 + }, + { + "epoch": 1.0104485531202807, + "grad_norm": 2.0763742923736572, + "learning_rate": 1.9864683056167137e-05, + "loss": 0.767, + "step": 6190 + }, + { + "epoch": 1.0106118117627851, + "grad_norm": 1.8266562223434448, + "learning_rate": 1.986463102386433e-05, + "loss": 0.6773, + "step": 6191 + }, + { + "epoch": 1.0107750704052896, + "grad_norm": 2.068077325820923, + "learning_rate": 1.9864578981627844e-05, + "loss": 0.834, + "step": 6192 + }, + { + "epoch": 1.010938329047794, + "grad_norm": 1.7848087549209595, + "learning_rate": 1.9864526929457727e-05, + "loss": 0.7425, + "step": 6193 + }, + { + "epoch": 1.0111015876902985, + "grad_norm": 1.8313685655593872, + "learning_rate": 1.9864474867354035e-05, + "loss": 0.6767, + "step": 6194 + }, + { + "epoch": 1.0112648463328027, + "grad_norm": 1.8804821968078613, + "learning_rate": 1.9864422795316818e-05, + "loss": 0.6778, + "step": 6195 + }, + { + "epoch": 1.011428104975307, + "grad_norm": 1.555083155632019, + "learning_rate": 1.986437071334613e-05, + "loss": 0.615, + "step": 6196 + }, + { + "epoch": 1.0115913636178115, + "grad_norm": 1.920152187347412, + "learning_rate": 1.9864318621442024e-05, + "loss": 0.8044, + "step": 6197 + }, + { + "epoch": 1.011754622260316, + "grad_norm": 1.6950316429138184, + "learning_rate": 1.9864266519604554e-05, + "loss": 0.6972, + "step": 6198 + }, + { + "epoch": 1.0119178809028202, + "grad_norm": 2.0170254707336426, + "learning_rate": 1.9864214407833767e-05, + "loss": 0.8063, + "step": 6199 + }, + { + "epoch": 1.0120811395453246, + "grad_norm": 1.9544119834899902, + "learning_rate": 1.986416228612972e-05, + "loss": 0.8022, + "step": 6200 + }, + { + "epoch": 1.012244398187829, + "grad_norm": 1.925719141960144, + "learning_rate": 1.9864110154492463e-05, + "loss": 0.7458, + "step": 6201 + }, + { + "epoch": 1.0124076568303335, + "grad_norm": 2.0820651054382324, + "learning_rate": 1.986405801292205e-05, + "loss": 0.8773, + "step": 6202 + }, + { + "epoch": 1.012570915472838, + "grad_norm": 1.848922848701477, + "learning_rate": 1.9864005861418537e-05, + "loss": 0.6056, + "step": 6203 + }, + { + "epoch": 1.0127341741153422, + "grad_norm": 1.9207857847213745, + "learning_rate": 1.9863953699981966e-05, + "loss": 0.7101, + "step": 6204 + }, + { + "epoch": 1.0128974327578466, + "grad_norm": 1.673141360282898, + "learning_rate": 1.9863901528612402e-05, + "loss": 0.6234, + "step": 6205 + }, + { + "epoch": 1.013060691400351, + "grad_norm": 1.8589849472045898, + "learning_rate": 1.986384934730989e-05, + "loss": 0.7866, + "step": 6206 + }, + { + "epoch": 1.0132239500428555, + "grad_norm": 2.122983932495117, + "learning_rate": 1.9863797156074484e-05, + "loss": 0.7326, + "step": 6207 + }, + { + "epoch": 1.0133872086853597, + "grad_norm": 1.9577350616455078, + "learning_rate": 1.9863744954906233e-05, + "loss": 0.7529, + "step": 6208 + }, + { + "epoch": 1.0135504673278641, + "grad_norm": 1.7594752311706543, + "learning_rate": 1.98636927438052e-05, + "loss": 0.7112, + "step": 6209 + }, + { + "epoch": 1.0137137259703686, + "grad_norm": 1.7207964658737183, + "learning_rate": 1.986364052277143e-05, + "loss": 0.6229, + "step": 6210 + }, + { + "epoch": 1.013876984612873, + "grad_norm": 1.7278751134872437, + "learning_rate": 1.9863588291804973e-05, + "loss": 0.838, + "step": 6211 + }, + { + "epoch": 1.0140402432553772, + "grad_norm": 1.7588289976119995, + "learning_rate": 1.9863536050905886e-05, + "loss": 0.6742, + "step": 6212 + }, + { + "epoch": 1.0142035018978817, + "grad_norm": 1.84909188747406, + "learning_rate": 1.986348380007422e-05, + "loss": 0.6163, + "step": 6213 + }, + { + "epoch": 1.014366760540386, + "grad_norm": 1.7126491069793701, + "learning_rate": 1.9863431539310033e-05, + "loss": 0.6796, + "step": 6214 + }, + { + "epoch": 1.0145300191828905, + "grad_norm": 1.6727677583694458, + "learning_rate": 1.986337926861337e-05, + "loss": 0.6452, + "step": 6215 + }, + { + "epoch": 1.014693277825395, + "grad_norm": 1.600612998008728, + "learning_rate": 1.9863326987984286e-05, + "loss": 0.667, + "step": 6216 + }, + { + "epoch": 1.0148565364678992, + "grad_norm": 1.7464735507965088, + "learning_rate": 1.9863274697422835e-05, + "loss": 0.6719, + "step": 6217 + }, + { + "epoch": 1.0150197951104036, + "grad_norm": 1.9114577770233154, + "learning_rate": 1.9863222396929068e-05, + "loss": 0.7442, + "step": 6218 + }, + { + "epoch": 1.015183053752908, + "grad_norm": 1.8738914728164673, + "learning_rate": 1.9863170086503044e-05, + "loss": 0.6257, + "step": 6219 + }, + { + "epoch": 1.0153463123954125, + "grad_norm": 1.7792226076126099, + "learning_rate": 1.9863117766144807e-05, + "loss": 0.7197, + "step": 6220 + }, + { + "epoch": 1.0155095710379167, + "grad_norm": 1.628347396850586, + "learning_rate": 1.986306543585441e-05, + "loss": 0.6001, + "step": 6221 + }, + { + "epoch": 1.0156728296804212, + "grad_norm": 2.1227853298187256, + "learning_rate": 1.9863013095631912e-05, + "loss": 0.7956, + "step": 6222 + }, + { + "epoch": 1.0158360883229256, + "grad_norm": 1.9262471199035645, + "learning_rate": 1.986296074547736e-05, + "loss": 0.8666, + "step": 6223 + }, + { + "epoch": 1.01599934696543, + "grad_norm": 1.8963056802749634, + "learning_rate": 1.986290838539081e-05, + "loss": 0.7368, + "step": 6224 + }, + { + "epoch": 1.0161626056079345, + "grad_norm": 1.8621140718460083, + "learning_rate": 1.9862856015372315e-05, + "loss": 0.827, + "step": 6225 + }, + { + "epoch": 1.0163258642504387, + "grad_norm": 1.6787586212158203, + "learning_rate": 1.986280363542193e-05, + "loss": 0.6474, + "step": 6226 + }, + { + "epoch": 1.0164891228929431, + "grad_norm": 1.8509377241134644, + "learning_rate": 1.9862751245539698e-05, + "loss": 0.6509, + "step": 6227 + }, + { + "epoch": 1.0166523815354476, + "grad_norm": 1.5820976495742798, + "learning_rate": 1.9862698845725677e-05, + "loss": 0.5479, + "step": 6228 + }, + { + "epoch": 1.016815640177952, + "grad_norm": 1.8056031465530396, + "learning_rate": 1.9862646435979928e-05, + "loss": 0.6627, + "step": 6229 + }, + { + "epoch": 1.0169788988204562, + "grad_norm": 2.0000226497650146, + "learning_rate": 1.9862594016302493e-05, + "loss": 0.7842, + "step": 6230 + }, + { + "epoch": 1.0171421574629607, + "grad_norm": 2.152437686920166, + "learning_rate": 1.9862541586693428e-05, + "loss": 0.7431, + "step": 6231 + }, + { + "epoch": 1.017305416105465, + "grad_norm": 1.9848170280456543, + "learning_rate": 1.9862489147152786e-05, + "loss": 0.7583, + "step": 6232 + }, + { + "epoch": 1.0174686747479695, + "grad_norm": 1.7627274990081787, + "learning_rate": 1.986243669768062e-05, + "loss": 0.7277, + "step": 6233 + }, + { + "epoch": 1.0176319333904738, + "grad_norm": 1.600558876991272, + "learning_rate": 1.9862384238276986e-05, + "loss": 0.6123, + "step": 6234 + }, + { + "epoch": 1.0177951920329782, + "grad_norm": 2.0893051624298096, + "learning_rate": 1.986233176894193e-05, + "loss": 0.8007, + "step": 6235 + }, + { + "epoch": 1.0179584506754826, + "grad_norm": 1.8009285926818848, + "learning_rate": 1.986227928967551e-05, + "loss": 0.6951, + "step": 6236 + }, + { + "epoch": 1.018121709317987, + "grad_norm": 2.0503101348876953, + "learning_rate": 1.986222680047778e-05, + "loss": 0.7789, + "step": 6237 + }, + { + "epoch": 1.0182849679604915, + "grad_norm": 1.8305678367614746, + "learning_rate": 1.9862174301348783e-05, + "loss": 0.723, + "step": 6238 + }, + { + "epoch": 1.0184482266029957, + "grad_norm": 1.906397819519043, + "learning_rate": 1.9862121792288586e-05, + "loss": 0.7958, + "step": 6239 + }, + { + "epoch": 1.0186114852455002, + "grad_norm": 2.1317081451416016, + "learning_rate": 1.9862069273297233e-05, + "loss": 0.6731, + "step": 6240 + }, + { + "epoch": 1.0187747438880046, + "grad_norm": 2.286470413208008, + "learning_rate": 1.9862016744374778e-05, + "loss": 0.7268, + "step": 6241 + }, + { + "epoch": 1.018938002530509, + "grad_norm": 1.6786940097808838, + "learning_rate": 1.9861964205521274e-05, + "loss": 0.635, + "step": 6242 + }, + { + "epoch": 1.0191012611730133, + "grad_norm": 1.821219563484192, + "learning_rate": 1.9861911656736778e-05, + "loss": 0.7595, + "step": 6243 + }, + { + "epoch": 1.0192645198155177, + "grad_norm": 1.8271708488464355, + "learning_rate": 1.9861859098021338e-05, + "loss": 0.6791, + "step": 6244 + }, + { + "epoch": 1.0194277784580221, + "grad_norm": 1.7191015481948853, + "learning_rate": 1.9861806529375006e-05, + "loss": 0.647, + "step": 6245 + }, + { + "epoch": 1.0195910371005266, + "grad_norm": 1.7316787242889404, + "learning_rate": 1.986175395079784e-05, + "loss": 0.702, + "step": 6246 + }, + { + "epoch": 1.019754295743031, + "grad_norm": 1.6700059175491333, + "learning_rate": 1.9861701362289892e-05, + "loss": 0.5623, + "step": 6247 + }, + { + "epoch": 1.0199175543855352, + "grad_norm": 1.7218483686447144, + "learning_rate": 1.986164876385121e-05, + "loss": 0.6324, + "step": 6248 + }, + { + "epoch": 1.0200808130280397, + "grad_norm": 1.9733182191848755, + "learning_rate": 1.986159615548185e-05, + "loss": 0.8765, + "step": 6249 + }, + { + "epoch": 1.020244071670544, + "grad_norm": 1.9796265363693237, + "learning_rate": 1.986154353718187e-05, + "loss": 0.7411, + "step": 6250 + }, + { + "epoch": 1.0204073303130485, + "grad_norm": 1.5361871719360352, + "learning_rate": 1.9861490908951312e-05, + "loss": 0.6553, + "step": 6251 + }, + { + "epoch": 1.0205705889555527, + "grad_norm": 2.1760194301605225, + "learning_rate": 1.986143827079024e-05, + "loss": 0.8365, + "step": 6252 + }, + { + "epoch": 1.0207338475980572, + "grad_norm": 1.891868233680725, + "learning_rate": 1.98613856226987e-05, + "loss": 0.7455, + "step": 6253 + }, + { + "epoch": 1.0208971062405616, + "grad_norm": 1.712141513824463, + "learning_rate": 1.9861332964676747e-05, + "loss": 0.6784, + "step": 6254 + }, + { + "epoch": 1.021060364883066, + "grad_norm": 2.3060672283172607, + "learning_rate": 1.9861280296724438e-05, + "loss": 0.6736, + "step": 6255 + }, + { + "epoch": 1.0212236235255703, + "grad_norm": 1.726008415222168, + "learning_rate": 1.9861227618841817e-05, + "loss": 0.6931, + "step": 6256 + }, + { + "epoch": 1.0213868821680747, + "grad_norm": 1.6330381631851196, + "learning_rate": 1.9861174931028943e-05, + "loss": 0.5625, + "step": 6257 + }, + { + "epoch": 1.0215501408105792, + "grad_norm": 2.276740074157715, + "learning_rate": 1.9861122233285873e-05, + "loss": 0.816, + "step": 6258 + }, + { + "epoch": 1.0217133994530836, + "grad_norm": 2.1491215229034424, + "learning_rate": 1.9861069525612652e-05, + "loss": 0.8122, + "step": 6259 + }, + { + "epoch": 1.021876658095588, + "grad_norm": 1.961983561515808, + "learning_rate": 1.9861016808009335e-05, + "loss": 0.711, + "step": 6260 + }, + { + "epoch": 1.0220399167380922, + "grad_norm": 1.800482988357544, + "learning_rate": 1.986096408047598e-05, + "loss": 0.6433, + "step": 6261 + }, + { + "epoch": 1.0222031753805967, + "grad_norm": 1.8586786985397339, + "learning_rate": 1.9860911343012638e-05, + "loss": 0.6463, + "step": 6262 + }, + { + "epoch": 1.0223664340231011, + "grad_norm": 1.9219046831130981, + "learning_rate": 1.986085859561936e-05, + "loss": 0.5772, + "step": 6263 + }, + { + "epoch": 1.0225296926656056, + "grad_norm": 2.07528018951416, + "learning_rate": 1.9860805838296197e-05, + "loss": 0.7872, + "step": 6264 + }, + { + "epoch": 1.0226929513081098, + "grad_norm": 2.04831862449646, + "learning_rate": 1.9860753071043207e-05, + "loss": 0.6821, + "step": 6265 + }, + { + "epoch": 1.0228562099506142, + "grad_norm": 2.2361581325531006, + "learning_rate": 1.9860700293860444e-05, + "loss": 0.692, + "step": 6266 + }, + { + "epoch": 1.0230194685931187, + "grad_norm": 2.163529396057129, + "learning_rate": 1.9860647506747953e-05, + "loss": 0.7447, + "step": 6267 + }, + { + "epoch": 1.023182727235623, + "grad_norm": 2.1037120819091797, + "learning_rate": 1.9860594709705797e-05, + "loss": 0.9543, + "step": 6268 + }, + { + "epoch": 1.0233459858781275, + "grad_norm": 1.9110475778579712, + "learning_rate": 1.9860541902734023e-05, + "loss": 0.7081, + "step": 6269 + }, + { + "epoch": 1.0235092445206317, + "grad_norm": 1.7073440551757812, + "learning_rate": 1.9860489085832685e-05, + "loss": 0.6178, + "step": 6270 + }, + { + "epoch": 1.0236725031631362, + "grad_norm": 1.8464313745498657, + "learning_rate": 1.9860436259001837e-05, + "loss": 0.7466, + "step": 6271 + }, + { + "epoch": 1.0238357618056406, + "grad_norm": 1.9696903228759766, + "learning_rate": 1.9860383422241534e-05, + "loss": 0.7249, + "step": 6272 + }, + { + "epoch": 1.023999020448145, + "grad_norm": 2.1451330184936523, + "learning_rate": 1.9860330575551826e-05, + "loss": 0.7367, + "step": 6273 + }, + { + "epoch": 1.0241622790906493, + "grad_norm": 1.6812928915023804, + "learning_rate": 1.986027771893277e-05, + "loss": 0.6759, + "step": 6274 + }, + { + "epoch": 1.0243255377331537, + "grad_norm": 1.6618458032608032, + "learning_rate": 1.9860224852384416e-05, + "loss": 0.7328, + "step": 6275 + }, + { + "epoch": 1.0244887963756582, + "grad_norm": 2.396059513092041, + "learning_rate": 1.9860171975906815e-05, + "loss": 0.847, + "step": 6276 + }, + { + "epoch": 1.0246520550181626, + "grad_norm": 1.9034167528152466, + "learning_rate": 1.986011908950003e-05, + "loss": 0.6862, + "step": 6277 + }, + { + "epoch": 1.0248153136606668, + "grad_norm": 1.7377004623413086, + "learning_rate": 1.9860066193164102e-05, + "loss": 0.7185, + "step": 6278 + }, + { + "epoch": 1.0249785723031712, + "grad_norm": 2.1314213275909424, + "learning_rate": 1.986001328689909e-05, + "loss": 0.8233, + "step": 6279 + }, + { + "epoch": 1.0251418309456757, + "grad_norm": 1.7401275634765625, + "learning_rate": 1.985996037070505e-05, + "loss": 0.7173, + "step": 6280 + }, + { + "epoch": 1.0253050895881801, + "grad_norm": 1.8057104349136353, + "learning_rate": 1.9859907444582032e-05, + "loss": 0.6848, + "step": 6281 + }, + { + "epoch": 1.0254683482306846, + "grad_norm": 1.7019447088241577, + "learning_rate": 1.985985450853009e-05, + "loss": 0.716, + "step": 6282 + }, + { + "epoch": 1.0256316068731888, + "grad_norm": 1.6462762355804443, + "learning_rate": 1.9859801562549277e-05, + "loss": 0.6676, + "step": 6283 + }, + { + "epoch": 1.0257948655156932, + "grad_norm": 1.7857152223587036, + "learning_rate": 1.9859748606639644e-05, + "loss": 0.6541, + "step": 6284 + }, + { + "epoch": 1.0259581241581976, + "grad_norm": 1.4962654113769531, + "learning_rate": 1.985969564080125e-05, + "loss": 0.6499, + "step": 6285 + }, + { + "epoch": 1.026121382800702, + "grad_norm": 1.9302033185958862, + "learning_rate": 1.9859642665034146e-05, + "loss": 0.7586, + "step": 6286 + }, + { + "epoch": 1.0262846414432063, + "grad_norm": 1.5182029008865356, + "learning_rate": 1.985958967933838e-05, + "loss": 0.7166, + "step": 6287 + }, + { + "epoch": 1.0264479000857107, + "grad_norm": 1.6593518257141113, + "learning_rate": 1.9859536683714014e-05, + "loss": 0.7362, + "step": 6288 + }, + { + "epoch": 1.0266111587282152, + "grad_norm": 1.9190257787704468, + "learning_rate": 1.9859483678161092e-05, + "loss": 0.7138, + "step": 6289 + }, + { + "epoch": 1.0267744173707196, + "grad_norm": 1.7671862840652466, + "learning_rate": 1.9859430662679676e-05, + "loss": 0.6131, + "step": 6290 + }, + { + "epoch": 1.026937676013224, + "grad_norm": 1.673975944519043, + "learning_rate": 1.9859377637269817e-05, + "loss": 0.5403, + "step": 6291 + }, + { + "epoch": 1.0271009346557283, + "grad_norm": 1.8193539381027222, + "learning_rate": 1.9859324601931567e-05, + "loss": 0.7647, + "step": 6292 + }, + { + "epoch": 1.0272641932982327, + "grad_norm": 1.7898828983306885, + "learning_rate": 1.985927155666498e-05, + "loss": 0.7991, + "step": 6293 + }, + { + "epoch": 1.0274274519407371, + "grad_norm": 1.6811909675598145, + "learning_rate": 1.9859218501470105e-05, + "loss": 0.58, + "step": 6294 + }, + { + "epoch": 1.0275907105832416, + "grad_norm": 1.752687931060791, + "learning_rate": 1.9859165436347006e-05, + "loss": 0.7363, + "step": 6295 + }, + { + "epoch": 1.0277539692257458, + "grad_norm": 1.6997674703598022, + "learning_rate": 1.9859112361295724e-05, + "loss": 0.6956, + "step": 6296 + }, + { + "epoch": 1.0279172278682502, + "grad_norm": 1.5739259719848633, + "learning_rate": 1.9859059276316322e-05, + "loss": 0.6887, + "step": 6297 + }, + { + "epoch": 1.0280804865107547, + "grad_norm": 1.8841649293899536, + "learning_rate": 1.9859006181408847e-05, + "loss": 0.7953, + "step": 6298 + }, + { + "epoch": 1.0282437451532591, + "grad_norm": 1.8035109043121338, + "learning_rate": 1.985895307657336e-05, + "loss": 0.7746, + "step": 6299 + }, + { + "epoch": 1.0284070037957633, + "grad_norm": 1.9514487981796265, + "learning_rate": 1.9858899961809904e-05, + "loss": 0.7103, + "step": 6300 + }, + { + "epoch": 1.0285702624382678, + "grad_norm": 1.7052786350250244, + "learning_rate": 1.9858846837118545e-05, + "loss": 0.7442, + "step": 6301 + }, + { + "epoch": 1.0287335210807722, + "grad_norm": 1.7776670455932617, + "learning_rate": 1.9858793702499322e-05, + "loss": 0.6829, + "step": 6302 + }, + { + "epoch": 1.0288967797232766, + "grad_norm": 1.647672414779663, + "learning_rate": 1.9858740557952304e-05, + "loss": 0.6907, + "step": 6303 + }, + { + "epoch": 1.029060038365781, + "grad_norm": 1.936865210533142, + "learning_rate": 1.9858687403477535e-05, + "loss": 0.8271, + "step": 6304 + }, + { + "epoch": 1.0292232970082853, + "grad_norm": 1.9570997953414917, + "learning_rate": 1.9858634239075066e-05, + "loss": 0.7679, + "step": 6305 + }, + { + "epoch": 1.0293865556507897, + "grad_norm": 1.9535030126571655, + "learning_rate": 1.985858106474496e-05, + "loss": 0.7658, + "step": 6306 + }, + { + "epoch": 1.0295498142932942, + "grad_norm": 1.6436525583267212, + "learning_rate": 1.9858527880487263e-05, + "loss": 0.6313, + "step": 6307 + }, + { + "epoch": 1.0297130729357986, + "grad_norm": 1.9210878610610962, + "learning_rate": 1.985847468630203e-05, + "loss": 0.6596, + "step": 6308 + }, + { + "epoch": 1.0298763315783028, + "grad_norm": 1.8534647226333618, + "learning_rate": 1.9858421482189318e-05, + "loss": 0.7182, + "step": 6309 + }, + { + "epoch": 1.0300395902208073, + "grad_norm": 2.131354570388794, + "learning_rate": 1.985836826814918e-05, + "loss": 0.8707, + "step": 6310 + }, + { + "epoch": 1.0302028488633117, + "grad_norm": 1.7080912590026855, + "learning_rate": 1.9858315044181666e-05, + "loss": 0.5776, + "step": 6311 + }, + { + "epoch": 1.0303661075058161, + "grad_norm": 1.8762413263320923, + "learning_rate": 1.9858261810286828e-05, + "loss": 0.6788, + "step": 6312 + }, + { + "epoch": 1.0305293661483206, + "grad_norm": 2.174556255340576, + "learning_rate": 1.9858208566464726e-05, + "loss": 0.8724, + "step": 6313 + }, + { + "epoch": 1.0306926247908248, + "grad_norm": 2.1115877628326416, + "learning_rate": 1.985815531271541e-05, + "loss": 0.6962, + "step": 6314 + }, + { + "epoch": 1.0308558834333292, + "grad_norm": 1.936353325843811, + "learning_rate": 1.9858102049038933e-05, + "loss": 0.7639, + "step": 6315 + }, + { + "epoch": 1.0310191420758337, + "grad_norm": 2.2395389080047607, + "learning_rate": 1.9858048775435353e-05, + "loss": 0.6992, + "step": 6316 + }, + { + "epoch": 1.031182400718338, + "grad_norm": 2.0371451377868652, + "learning_rate": 1.985799549190472e-05, + "loss": 0.6941, + "step": 6317 + }, + { + "epoch": 1.0313456593608423, + "grad_norm": 2.105923652648926, + "learning_rate": 1.9857942198447084e-05, + "loss": 0.8578, + "step": 6318 + }, + { + "epoch": 1.0315089180033468, + "grad_norm": 1.6982399225234985, + "learning_rate": 1.9857888895062506e-05, + "loss": 0.5982, + "step": 6319 + }, + { + "epoch": 1.0316721766458512, + "grad_norm": 1.8045004606246948, + "learning_rate": 1.9857835581751038e-05, + "loss": 0.6575, + "step": 6320 + }, + { + "epoch": 1.0318354352883556, + "grad_norm": 1.7899636030197144, + "learning_rate": 1.9857782258512726e-05, + "loss": 0.5769, + "step": 6321 + }, + { + "epoch": 1.03199869393086, + "grad_norm": 2.0876801013946533, + "learning_rate": 1.9857728925347636e-05, + "loss": 0.7906, + "step": 6322 + }, + { + "epoch": 1.0321619525733643, + "grad_norm": 1.9150800704956055, + "learning_rate": 1.9857675582255814e-05, + "loss": 0.7853, + "step": 6323 + }, + { + "epoch": 1.0323252112158687, + "grad_norm": 1.9456051588058472, + "learning_rate": 1.9857622229237315e-05, + "loss": 0.8225, + "step": 6324 + }, + { + "epoch": 1.0324884698583732, + "grad_norm": 1.776223063468933, + "learning_rate": 1.9857568866292193e-05, + "loss": 0.7346, + "step": 6325 + }, + { + "epoch": 1.0326517285008776, + "grad_norm": 1.9625061750411987, + "learning_rate": 1.9857515493420502e-05, + "loss": 0.7034, + "step": 6326 + }, + { + "epoch": 1.0328149871433818, + "grad_norm": 1.841731071472168, + "learning_rate": 1.9857462110622293e-05, + "loss": 0.625, + "step": 6327 + }, + { + "epoch": 1.0329782457858863, + "grad_norm": 1.9265470504760742, + "learning_rate": 1.9857408717897627e-05, + "loss": 0.7014, + "step": 6328 + }, + { + "epoch": 1.0331415044283907, + "grad_norm": 1.795730471611023, + "learning_rate": 1.985735531524655e-05, + "loss": 0.6803, + "step": 6329 + }, + { + "epoch": 1.0333047630708951, + "grad_norm": 1.9166145324707031, + "learning_rate": 1.985730190266912e-05, + "loss": 0.653, + "step": 6330 + }, + { + "epoch": 1.0334680217133994, + "grad_norm": 1.8467719554901123, + "learning_rate": 1.9857248480165388e-05, + "loss": 0.6842, + "step": 6331 + }, + { + "epoch": 1.0336312803559038, + "grad_norm": 2.0247559547424316, + "learning_rate": 1.9857195047735412e-05, + "loss": 0.6433, + "step": 6332 + }, + { + "epoch": 1.0337945389984082, + "grad_norm": 2.146075487136841, + "learning_rate": 1.985714160537924e-05, + "loss": 0.7956, + "step": 6333 + }, + { + "epoch": 1.0339577976409127, + "grad_norm": 1.7712678909301758, + "learning_rate": 1.985708815309693e-05, + "loss": 0.5848, + "step": 6334 + }, + { + "epoch": 1.034121056283417, + "grad_norm": 1.7628885507583618, + "learning_rate": 1.985703469088854e-05, + "loss": 0.7027, + "step": 6335 + }, + { + "epoch": 1.0342843149259213, + "grad_norm": 2.074078321456909, + "learning_rate": 1.985698121875411e-05, + "loss": 0.9274, + "step": 6336 + }, + { + "epoch": 1.0344475735684258, + "grad_norm": 1.541096806526184, + "learning_rate": 1.9856927736693706e-05, + "loss": 0.6185, + "step": 6337 + }, + { + "epoch": 1.0346108322109302, + "grad_norm": 2.016784191131592, + "learning_rate": 1.9856874244707383e-05, + "loss": 0.7035, + "step": 6338 + }, + { + "epoch": 1.0347740908534346, + "grad_norm": 2.2863566875457764, + "learning_rate": 1.9856820742795183e-05, + "loss": 0.8084, + "step": 6339 + }, + { + "epoch": 1.0349373494959389, + "grad_norm": 1.9221899509429932, + "learning_rate": 1.9856767230957173e-05, + "loss": 0.8719, + "step": 6340 + }, + { + "epoch": 1.0351006081384433, + "grad_norm": 1.6395528316497803, + "learning_rate": 1.9856713709193397e-05, + "loss": 0.7319, + "step": 6341 + }, + { + "epoch": 1.0352638667809477, + "grad_norm": 2.3567001819610596, + "learning_rate": 1.9856660177503916e-05, + "loss": 0.9236, + "step": 6342 + }, + { + "epoch": 1.0354271254234522, + "grad_norm": 2.0187878608703613, + "learning_rate": 1.9856606635888777e-05, + "loss": 0.8507, + "step": 6343 + }, + { + "epoch": 1.0355903840659564, + "grad_norm": 2.2076609134674072, + "learning_rate": 1.985655308434804e-05, + "loss": 0.7411, + "step": 6344 + }, + { + "epoch": 1.0357536427084608, + "grad_norm": 1.939016342163086, + "learning_rate": 1.985649952288176e-05, + "loss": 0.657, + "step": 6345 + }, + { + "epoch": 1.0359169013509653, + "grad_norm": 1.7595069408416748, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.7662, + "step": 6346 + }, + { + "epoch": 1.0360801599934697, + "grad_norm": 1.9005506038665771, + "learning_rate": 1.9856392370172768e-05, + "loss": 0.7351, + "step": 6347 + }, + { + "epoch": 1.0362434186359741, + "grad_norm": 2.0733301639556885, + "learning_rate": 1.985633877893017e-05, + "loss": 0.8138, + "step": 6348 + }, + { + "epoch": 1.0364066772784783, + "grad_norm": 1.927930474281311, + "learning_rate": 1.985628517776224e-05, + "loss": 0.7378, + "step": 6349 + }, + { + "epoch": 1.0365699359209828, + "grad_norm": 1.5964467525482178, + "learning_rate": 1.9856231566669036e-05, + "loss": 0.6966, + "step": 6350 + }, + { + "epoch": 1.0367331945634872, + "grad_norm": 1.8646775484085083, + "learning_rate": 1.9856177945650607e-05, + "loss": 0.7725, + "step": 6351 + }, + { + "epoch": 1.0368964532059917, + "grad_norm": 1.489858865737915, + "learning_rate": 1.9856124314707008e-05, + "loss": 0.6389, + "step": 6352 + }, + { + "epoch": 1.0370597118484959, + "grad_norm": 2.1021125316619873, + "learning_rate": 1.9856070673838297e-05, + "loss": 0.7079, + "step": 6353 + }, + { + "epoch": 1.0372229704910003, + "grad_norm": 1.8392387628555298, + "learning_rate": 1.9856017023044525e-05, + "loss": 0.6704, + "step": 6354 + }, + { + "epoch": 1.0373862291335048, + "grad_norm": 1.7700083255767822, + "learning_rate": 1.9855963362325746e-05, + "loss": 0.6521, + "step": 6355 + }, + { + "epoch": 1.0375494877760092, + "grad_norm": 1.7325944900512695, + "learning_rate": 1.9855909691682014e-05, + "loss": 0.7651, + "step": 6356 + }, + { + "epoch": 1.0377127464185136, + "grad_norm": 1.5628496408462524, + "learning_rate": 1.9855856011113384e-05, + "loss": 0.6273, + "step": 6357 + }, + { + "epoch": 1.0378760050610178, + "grad_norm": 1.9006133079528809, + "learning_rate": 1.9855802320619913e-05, + "loss": 0.7559, + "step": 6358 + }, + { + "epoch": 1.0380392637035223, + "grad_norm": 1.7265238761901855, + "learning_rate": 1.9855748620201646e-05, + "loss": 0.6388, + "step": 6359 + }, + { + "epoch": 1.0382025223460267, + "grad_norm": 1.879228949546814, + "learning_rate": 1.9855694909858645e-05, + "loss": 0.7162, + "step": 6360 + }, + { + "epoch": 1.0383657809885312, + "grad_norm": 2.1982500553131104, + "learning_rate": 1.9855641189590963e-05, + "loss": 0.9425, + "step": 6361 + }, + { + "epoch": 1.0385290396310354, + "grad_norm": 1.7462934255599976, + "learning_rate": 1.9855587459398654e-05, + "loss": 0.6618, + "step": 6362 + }, + { + "epoch": 1.0386922982735398, + "grad_norm": 1.8214459419250488, + "learning_rate": 1.9855533719281768e-05, + "loss": 0.6754, + "step": 6363 + }, + { + "epoch": 1.0388555569160443, + "grad_norm": 1.4484686851501465, + "learning_rate": 1.9855479969240363e-05, + "loss": 0.5025, + "step": 6364 + }, + { + "epoch": 1.0390188155585487, + "grad_norm": 1.9273101091384888, + "learning_rate": 1.9855426209274493e-05, + "loss": 0.7877, + "step": 6365 + }, + { + "epoch": 1.0391820742010531, + "grad_norm": 1.6532548666000366, + "learning_rate": 1.985537243938421e-05, + "loss": 0.6323, + "step": 6366 + }, + { + "epoch": 1.0393453328435573, + "grad_norm": 1.9814331531524658, + "learning_rate": 1.9855318659569572e-05, + "loss": 0.7283, + "step": 6367 + }, + { + "epoch": 1.0395085914860618, + "grad_norm": 1.765197515487671, + "learning_rate": 1.985526486983063e-05, + "loss": 0.7197, + "step": 6368 + }, + { + "epoch": 1.0396718501285662, + "grad_norm": 1.5571547746658325, + "learning_rate": 1.985521107016744e-05, + "loss": 0.6677, + "step": 6369 + }, + { + "epoch": 1.0398351087710707, + "grad_norm": 1.9828777313232422, + "learning_rate": 1.9855157260580052e-05, + "loss": 0.8341, + "step": 6370 + }, + { + "epoch": 1.0399983674135749, + "grad_norm": 1.760911464691162, + "learning_rate": 1.9855103441068525e-05, + "loss": 0.7723, + "step": 6371 + }, + { + "epoch": 1.0401616260560793, + "grad_norm": 1.6029481887817383, + "learning_rate": 1.985504961163291e-05, + "loss": 0.8017, + "step": 6372 + }, + { + "epoch": 1.0403248846985838, + "grad_norm": 1.9062387943267822, + "learning_rate": 1.9854995772273266e-05, + "loss": 0.7894, + "step": 6373 + }, + { + "epoch": 1.0404881433410882, + "grad_norm": 1.6903984546661377, + "learning_rate": 1.9854941922989638e-05, + "loss": 0.6404, + "step": 6374 + }, + { + "epoch": 1.0406514019835924, + "grad_norm": 1.6882866621017456, + "learning_rate": 1.9854888063782088e-05, + "loss": 0.7031, + "step": 6375 + }, + { + "epoch": 1.0408146606260968, + "grad_norm": 2.1223995685577393, + "learning_rate": 1.9854834194650673e-05, + "loss": 0.8229, + "step": 6376 + }, + { + "epoch": 1.0409779192686013, + "grad_norm": 1.3748483657836914, + "learning_rate": 1.985478031559544e-05, + "loss": 0.5831, + "step": 6377 + }, + { + "epoch": 1.0411411779111057, + "grad_norm": 1.6554031372070312, + "learning_rate": 1.9854726426616447e-05, + "loss": 0.6084, + "step": 6378 + }, + { + "epoch": 1.0413044365536102, + "grad_norm": 1.5632424354553223, + "learning_rate": 1.9854672527713745e-05, + "loss": 0.6284, + "step": 6379 + }, + { + "epoch": 1.0414676951961144, + "grad_norm": 2.5645785331726074, + "learning_rate": 1.985461861888739e-05, + "loss": 0.9476, + "step": 6380 + }, + { + "epoch": 1.0416309538386188, + "grad_norm": 1.9751784801483154, + "learning_rate": 1.9854564700137437e-05, + "loss": 0.691, + "step": 6381 + }, + { + "epoch": 1.0417942124811232, + "grad_norm": 1.5935426950454712, + "learning_rate": 1.9854510771463942e-05, + "loss": 0.7099, + "step": 6382 + }, + { + "epoch": 1.0419574711236277, + "grad_norm": 2.161407470703125, + "learning_rate": 1.9854456832866956e-05, + "loss": 0.8541, + "step": 6383 + }, + { + "epoch": 1.042120729766132, + "grad_norm": 1.9132003784179688, + "learning_rate": 1.9854402884346534e-05, + "loss": 0.6346, + "step": 6384 + }, + { + "epoch": 1.0422839884086363, + "grad_norm": 1.8029053211212158, + "learning_rate": 1.985434892590273e-05, + "loss": 0.7083, + "step": 6385 + }, + { + "epoch": 1.0424472470511408, + "grad_norm": 1.8897643089294434, + "learning_rate": 1.98542949575356e-05, + "loss": 0.667, + "step": 6386 + }, + { + "epoch": 1.0426105056936452, + "grad_norm": 2.5108985900878906, + "learning_rate": 1.98542409792452e-05, + "loss": 0.8482, + "step": 6387 + }, + { + "epoch": 1.0427737643361494, + "grad_norm": 1.5588014125823975, + "learning_rate": 1.985418699103158e-05, + "loss": 0.6847, + "step": 6388 + }, + { + "epoch": 1.0429370229786539, + "grad_norm": 1.939668893814087, + "learning_rate": 1.9854132992894793e-05, + "loss": 0.6844, + "step": 6389 + }, + { + "epoch": 1.0431002816211583, + "grad_norm": 1.7362983226776123, + "learning_rate": 1.9854078984834904e-05, + "loss": 0.6969, + "step": 6390 + }, + { + "epoch": 1.0432635402636627, + "grad_norm": 2.0692286491394043, + "learning_rate": 1.9854024966851953e-05, + "loss": 0.8139, + "step": 6391 + }, + { + "epoch": 1.0434267989061672, + "grad_norm": 1.7947496175765991, + "learning_rate": 1.9853970938946005e-05, + "loss": 0.5697, + "step": 6392 + }, + { + "epoch": 1.0435900575486714, + "grad_norm": 1.83210027217865, + "learning_rate": 1.9853916901117112e-05, + "loss": 0.6931, + "step": 6393 + }, + { + "epoch": 1.0437533161911758, + "grad_norm": 2.1277687549591064, + "learning_rate": 1.9853862853365324e-05, + "loss": 0.726, + "step": 6394 + }, + { + "epoch": 1.0439165748336803, + "grad_norm": 1.94338858127594, + "learning_rate": 1.9853808795690704e-05, + "loss": 0.7794, + "step": 6395 + }, + { + "epoch": 1.0440798334761847, + "grad_norm": 2.382124185562134, + "learning_rate": 1.9853754728093296e-05, + "loss": 0.7814, + "step": 6396 + }, + { + "epoch": 1.044243092118689, + "grad_norm": 1.559651255607605, + "learning_rate": 1.985370065057316e-05, + "loss": 0.6314, + "step": 6397 + }, + { + "epoch": 1.0444063507611934, + "grad_norm": 1.9126427173614502, + "learning_rate": 1.9853646563130355e-05, + "loss": 0.7189, + "step": 6398 + }, + { + "epoch": 1.0445696094036978, + "grad_norm": 1.7692419290542603, + "learning_rate": 1.9853592465764926e-05, + "loss": 0.6914, + "step": 6399 + }, + { + "epoch": 1.0447328680462022, + "grad_norm": 1.9816216230392456, + "learning_rate": 1.9853538358476933e-05, + "loss": 0.8409, + "step": 6400 + }, + { + "epoch": 1.0448961266887067, + "grad_norm": 1.821016788482666, + "learning_rate": 1.985348424126643e-05, + "loss": 0.8012, + "step": 6401 + }, + { + "epoch": 1.045059385331211, + "grad_norm": 1.792432427406311, + "learning_rate": 1.985343011413347e-05, + "loss": 0.7281, + "step": 6402 + }, + { + "epoch": 1.0452226439737153, + "grad_norm": 1.7540838718414307, + "learning_rate": 1.985337597707811e-05, + "loss": 0.7064, + "step": 6403 + }, + { + "epoch": 1.0453859026162198, + "grad_norm": 1.9724094867706299, + "learning_rate": 1.98533218301004e-05, + "loss": 0.8457, + "step": 6404 + }, + { + "epoch": 1.0455491612587242, + "grad_norm": 1.963816523551941, + "learning_rate": 1.98532676732004e-05, + "loss": 0.8454, + "step": 6405 + }, + { + "epoch": 1.0457124199012284, + "grad_norm": 1.8810901641845703, + "learning_rate": 1.9853213506378163e-05, + "loss": 0.7963, + "step": 6406 + }, + { + "epoch": 1.0458756785437329, + "grad_norm": 1.9094598293304443, + "learning_rate": 1.9853159329633737e-05, + "loss": 0.8457, + "step": 6407 + }, + { + "epoch": 1.0460389371862373, + "grad_norm": 1.7724827527999878, + "learning_rate": 1.985310514296719e-05, + "loss": 0.8429, + "step": 6408 + }, + { + "epoch": 1.0462021958287417, + "grad_norm": 1.3906306028366089, + "learning_rate": 1.9853050946378563e-05, + "loss": 0.5326, + "step": 6409 + }, + { + "epoch": 1.0463654544712462, + "grad_norm": 1.5561398267745972, + "learning_rate": 1.9852996739867918e-05, + "loss": 0.6433, + "step": 6410 + }, + { + "epoch": 1.0465287131137504, + "grad_norm": 1.5456129312515259, + "learning_rate": 1.985294252343531e-05, + "loss": 0.6315, + "step": 6411 + }, + { + "epoch": 1.0466919717562548, + "grad_norm": 1.8530503511428833, + "learning_rate": 1.985288829708079e-05, + "loss": 0.7208, + "step": 6412 + }, + { + "epoch": 1.0468552303987593, + "grad_norm": 1.6401225328445435, + "learning_rate": 1.985283406080441e-05, + "loss": 0.7369, + "step": 6413 + }, + { + "epoch": 1.0470184890412637, + "grad_norm": 2.065150499343872, + "learning_rate": 1.9852779814606232e-05, + "loss": 0.825, + "step": 6414 + }, + { + "epoch": 1.047181747683768, + "grad_norm": 1.6929408311843872, + "learning_rate": 1.985272555848631e-05, + "loss": 0.7282, + "step": 6415 + }, + { + "epoch": 1.0473450063262724, + "grad_norm": 1.9294968843460083, + "learning_rate": 1.9852671292444692e-05, + "loss": 0.7184, + "step": 6416 + }, + { + "epoch": 1.0475082649687768, + "grad_norm": 2.0008933544158936, + "learning_rate": 1.9852617016481442e-05, + "loss": 0.8084, + "step": 6417 + }, + { + "epoch": 1.0476715236112812, + "grad_norm": 2.3468823432922363, + "learning_rate": 1.9852562730596606e-05, + "loss": 0.8604, + "step": 6418 + }, + { + "epoch": 1.0478347822537855, + "grad_norm": 1.7941347360610962, + "learning_rate": 1.985250843479024e-05, + "loss": 0.7205, + "step": 6419 + }, + { + "epoch": 1.04799804089629, + "grad_norm": 1.568460464477539, + "learning_rate": 1.98524541290624e-05, + "loss": 0.6615, + "step": 6420 + }, + { + "epoch": 1.0481612995387943, + "grad_norm": 1.4633936882019043, + "learning_rate": 1.9852399813413146e-05, + "loss": 0.6811, + "step": 6421 + }, + { + "epoch": 1.0483245581812988, + "grad_norm": 1.7978830337524414, + "learning_rate": 1.9852345487842527e-05, + "loss": 0.6925, + "step": 6422 + }, + { + "epoch": 1.0484878168238032, + "grad_norm": 1.621343731880188, + "learning_rate": 1.9852291152350593e-05, + "loss": 0.5299, + "step": 6423 + }, + { + "epoch": 1.0486510754663074, + "grad_norm": 2.0123748779296875, + "learning_rate": 1.985223680693741e-05, + "loss": 0.7217, + "step": 6424 + }, + { + "epoch": 1.0488143341088119, + "grad_norm": 2.2097108364105225, + "learning_rate": 1.9852182451603026e-05, + "loss": 0.9622, + "step": 6425 + }, + { + "epoch": 1.0489775927513163, + "grad_norm": 1.7552664279937744, + "learning_rate": 1.9852128086347497e-05, + "loss": 0.7548, + "step": 6426 + }, + { + "epoch": 1.0491408513938207, + "grad_norm": 2.169128179550171, + "learning_rate": 1.9852073711170877e-05, + "loss": 0.9252, + "step": 6427 + }, + { + "epoch": 1.049304110036325, + "grad_norm": 2.043309450149536, + "learning_rate": 1.985201932607322e-05, + "loss": 0.7248, + "step": 6428 + }, + { + "epoch": 1.0494673686788294, + "grad_norm": 1.878727674484253, + "learning_rate": 1.9851964931054584e-05, + "loss": 0.7259, + "step": 6429 + }, + { + "epoch": 1.0496306273213338, + "grad_norm": 1.5973033905029297, + "learning_rate": 1.9851910526115023e-05, + "loss": 0.5529, + "step": 6430 + }, + { + "epoch": 1.0497938859638383, + "grad_norm": 1.636610507965088, + "learning_rate": 1.985185611125459e-05, + "loss": 0.6918, + "step": 6431 + }, + { + "epoch": 1.0499571446063425, + "grad_norm": 2.177405595779419, + "learning_rate": 1.985180168647334e-05, + "loss": 0.8603, + "step": 6432 + }, + { + "epoch": 1.050120403248847, + "grad_norm": 2.079627513885498, + "learning_rate": 1.9851747251771328e-05, + "loss": 0.7971, + "step": 6433 + }, + { + "epoch": 1.0502836618913514, + "grad_norm": 1.9238831996917725, + "learning_rate": 1.9851692807148612e-05, + "loss": 0.8452, + "step": 6434 + }, + { + "epoch": 1.0504469205338558, + "grad_norm": 1.85007643699646, + "learning_rate": 1.985163835260524e-05, + "loss": 0.7016, + "step": 6435 + }, + { + "epoch": 1.0506101791763602, + "grad_norm": 1.5897303819656372, + "learning_rate": 1.9851583888141274e-05, + "loss": 0.6901, + "step": 6436 + }, + { + "epoch": 1.0507734378188645, + "grad_norm": 1.5689098834991455, + "learning_rate": 1.985152941375676e-05, + "loss": 0.6482, + "step": 6437 + }, + { + "epoch": 1.050936696461369, + "grad_norm": 1.6794500350952148, + "learning_rate": 1.9851474929451764e-05, + "loss": 0.6576, + "step": 6438 + }, + { + "epoch": 1.0510999551038733, + "grad_norm": 1.7521792650222778, + "learning_rate": 1.9851420435226334e-05, + "loss": 0.6242, + "step": 6439 + }, + { + "epoch": 1.0512632137463778, + "grad_norm": 1.6738594770431519, + "learning_rate": 1.9851365931080525e-05, + "loss": 0.608, + "step": 6440 + }, + { + "epoch": 1.051426472388882, + "grad_norm": 1.6715350151062012, + "learning_rate": 1.9851311417014396e-05, + "loss": 0.6584, + "step": 6441 + }, + { + "epoch": 1.0515897310313864, + "grad_norm": 1.8142921924591064, + "learning_rate": 1.9851256893027996e-05, + "loss": 0.6869, + "step": 6442 + }, + { + "epoch": 1.0517529896738909, + "grad_norm": 1.7631057500839233, + "learning_rate": 1.9851202359121383e-05, + "loss": 0.7282, + "step": 6443 + }, + { + "epoch": 1.0519162483163953, + "grad_norm": 1.8243625164031982, + "learning_rate": 1.9851147815294615e-05, + "loss": 0.6897, + "step": 6444 + }, + { + "epoch": 1.0520795069588997, + "grad_norm": 1.9253132343292236, + "learning_rate": 1.985109326154774e-05, + "loss": 0.8799, + "step": 6445 + }, + { + "epoch": 1.052242765601404, + "grad_norm": 1.9539350271224976, + "learning_rate": 1.9851038697880817e-05, + "loss": 0.581, + "step": 6446 + }, + { + "epoch": 1.0524060242439084, + "grad_norm": 2.1103930473327637, + "learning_rate": 1.9850984124293902e-05, + "loss": 0.7538, + "step": 6447 + }, + { + "epoch": 1.0525692828864128, + "grad_norm": 3.306098699569702, + "learning_rate": 1.985092954078705e-05, + "loss": 0.7329, + "step": 6448 + }, + { + "epoch": 1.0527325415289173, + "grad_norm": 1.8497775793075562, + "learning_rate": 1.9850874947360316e-05, + "loss": 0.6954, + "step": 6449 + }, + { + "epoch": 1.0528958001714215, + "grad_norm": 1.8238327503204346, + "learning_rate": 1.985082034401375e-05, + "loss": 0.6843, + "step": 6450 + }, + { + "epoch": 1.053059058813926, + "grad_norm": 2.092165470123291, + "learning_rate": 1.985076573074741e-05, + "loss": 0.6028, + "step": 6451 + }, + { + "epoch": 1.0532223174564304, + "grad_norm": 1.6197607517242432, + "learning_rate": 1.9850711107561353e-05, + "loss": 0.5844, + "step": 6452 + }, + { + "epoch": 1.0533855760989348, + "grad_norm": 1.900766372680664, + "learning_rate": 1.985065647445563e-05, + "loss": 0.6243, + "step": 6453 + }, + { + "epoch": 1.0535488347414392, + "grad_norm": 1.6840955018997192, + "learning_rate": 1.9850601831430304e-05, + "loss": 0.6925, + "step": 6454 + }, + { + "epoch": 1.0537120933839434, + "grad_norm": 1.6519290208816528, + "learning_rate": 1.9850547178485418e-05, + "loss": 0.7135, + "step": 6455 + }, + { + "epoch": 1.0538753520264479, + "grad_norm": 1.8017761707305908, + "learning_rate": 1.9850492515621038e-05, + "loss": 0.7089, + "step": 6456 + }, + { + "epoch": 1.0540386106689523, + "grad_norm": 2.288264274597168, + "learning_rate": 1.9850437842837214e-05, + "loss": 0.7643, + "step": 6457 + }, + { + "epoch": 1.0542018693114568, + "grad_norm": 1.6951135396957397, + "learning_rate": 1.9850383160134e-05, + "loss": 0.6662, + "step": 6458 + }, + { + "epoch": 1.054365127953961, + "grad_norm": 1.6866683959960938, + "learning_rate": 1.985032846751146e-05, + "loss": 0.5699, + "step": 6459 + }, + { + "epoch": 1.0545283865964654, + "grad_norm": 2.023360013961792, + "learning_rate": 1.9850273764969632e-05, + "loss": 0.8211, + "step": 6460 + }, + { + "epoch": 1.0546916452389699, + "grad_norm": 1.8363749980926514, + "learning_rate": 1.9850219052508586e-05, + "loss": 0.7046, + "step": 6461 + }, + { + "epoch": 1.0548549038814743, + "grad_norm": 1.6069679260253906, + "learning_rate": 1.985016433012837e-05, + "loss": 0.5969, + "step": 6462 + }, + { + "epoch": 1.0550181625239785, + "grad_norm": 1.8681252002716064, + "learning_rate": 1.985010959782904e-05, + "loss": 0.6333, + "step": 6463 + }, + { + "epoch": 1.055181421166483, + "grad_norm": 2.0554378032684326, + "learning_rate": 1.9850054855610656e-05, + "loss": 0.6216, + "step": 6464 + }, + { + "epoch": 1.0553446798089874, + "grad_norm": 1.68414306640625, + "learning_rate": 1.985000010347327e-05, + "loss": 0.5856, + "step": 6465 + }, + { + "epoch": 1.0555079384514918, + "grad_norm": 2.0159666538238525, + "learning_rate": 1.9849945341416932e-05, + "loss": 0.7287, + "step": 6466 + }, + { + "epoch": 1.0556711970939963, + "grad_norm": 1.9936645030975342, + "learning_rate": 1.9849890569441704e-05, + "loss": 0.6266, + "step": 6467 + }, + { + "epoch": 1.0558344557365005, + "grad_norm": 2.7062675952911377, + "learning_rate": 1.984983578754764e-05, + "loss": 0.6609, + "step": 6468 + }, + { + "epoch": 1.055997714379005, + "grad_norm": 1.8133025169372559, + "learning_rate": 1.984978099573479e-05, + "loss": 0.6778, + "step": 6469 + }, + { + "epoch": 1.0561609730215094, + "grad_norm": 1.7791633605957031, + "learning_rate": 1.9849726194003215e-05, + "loss": 0.7104, + "step": 6470 + }, + { + "epoch": 1.0563242316640138, + "grad_norm": 1.6535755395889282, + "learning_rate": 1.984967138235297e-05, + "loss": 0.7184, + "step": 6471 + }, + { + "epoch": 1.056487490306518, + "grad_norm": 1.9171730279922485, + "learning_rate": 1.984961656078411e-05, + "loss": 0.7616, + "step": 6472 + }, + { + "epoch": 1.0566507489490224, + "grad_norm": 1.7422312498092651, + "learning_rate": 1.9849561729296686e-05, + "loss": 0.6773, + "step": 6473 + }, + { + "epoch": 1.0568140075915269, + "grad_norm": 1.9317115545272827, + "learning_rate": 1.9849506887890754e-05, + "loss": 0.702, + "step": 6474 + }, + { + "epoch": 1.0569772662340313, + "grad_norm": 1.7813512086868286, + "learning_rate": 1.9849452036566374e-05, + "loss": 0.7586, + "step": 6475 + }, + { + "epoch": 1.0571405248765358, + "grad_norm": 2.199392795562744, + "learning_rate": 1.9849397175323598e-05, + "loss": 0.9489, + "step": 6476 + }, + { + "epoch": 1.05730378351904, + "grad_norm": 1.9289523363113403, + "learning_rate": 1.9849342304162482e-05, + "loss": 0.8421, + "step": 6477 + }, + { + "epoch": 1.0574670421615444, + "grad_norm": 1.667407512664795, + "learning_rate": 1.984928742308308e-05, + "loss": 0.6283, + "step": 6478 + }, + { + "epoch": 1.0576303008040489, + "grad_norm": 1.8135247230529785, + "learning_rate": 1.9849232532085447e-05, + "loss": 0.6939, + "step": 6479 + }, + { + "epoch": 1.0577935594465533, + "grad_norm": 1.814348816871643, + "learning_rate": 1.9849177631169643e-05, + "loss": 0.7166, + "step": 6480 + }, + { + "epoch": 1.0579568180890575, + "grad_norm": 2.018916368484497, + "learning_rate": 1.9849122720335717e-05, + "loss": 0.7593, + "step": 6481 + }, + { + "epoch": 1.058120076731562, + "grad_norm": 2.4966375827789307, + "learning_rate": 1.984906779958373e-05, + "loss": 0.728, + "step": 6482 + }, + { + "epoch": 1.0582833353740664, + "grad_norm": 2.3089301586151123, + "learning_rate": 1.984901286891373e-05, + "loss": 0.7133, + "step": 6483 + }, + { + "epoch": 1.0584465940165708, + "grad_norm": 1.7044378519058228, + "learning_rate": 1.9848957928325777e-05, + "loss": 0.6748, + "step": 6484 + }, + { + "epoch": 1.058609852659075, + "grad_norm": 1.6541234254837036, + "learning_rate": 1.984890297781993e-05, + "loss": 0.6403, + "step": 6485 + }, + { + "epoch": 1.0587731113015795, + "grad_norm": 2.038078784942627, + "learning_rate": 1.9848848017396237e-05, + "loss": 0.7525, + "step": 6486 + }, + { + "epoch": 1.058936369944084, + "grad_norm": 1.9732763767242432, + "learning_rate": 1.984879304705476e-05, + "loss": 0.6197, + "step": 6487 + }, + { + "epoch": 1.0590996285865883, + "grad_norm": 1.8807592391967773, + "learning_rate": 1.9848738066795547e-05, + "loss": 0.7507, + "step": 6488 + }, + { + "epoch": 1.0592628872290928, + "grad_norm": 1.9210035800933838, + "learning_rate": 1.984868307661866e-05, + "loss": 0.6903, + "step": 6489 + }, + { + "epoch": 1.059426145871597, + "grad_norm": 1.615373969078064, + "learning_rate": 1.984862807652415e-05, + "loss": 0.5683, + "step": 6490 + }, + { + "epoch": 1.0595894045141014, + "grad_norm": 1.8936339616775513, + "learning_rate": 1.9848573066512074e-05, + "loss": 0.5732, + "step": 6491 + }, + { + "epoch": 1.0597526631566059, + "grad_norm": 1.8804913759231567, + "learning_rate": 1.984851804658249e-05, + "loss": 0.8062, + "step": 6492 + }, + { + "epoch": 1.0599159217991103, + "grad_norm": 1.891694188117981, + "learning_rate": 1.984846301673545e-05, + "loss": 0.5941, + "step": 6493 + }, + { + "epoch": 1.0600791804416145, + "grad_norm": 1.7877391576766968, + "learning_rate": 1.984840797697101e-05, + "loss": 0.6229, + "step": 6494 + }, + { + "epoch": 1.060242439084119, + "grad_norm": 2.062567949295044, + "learning_rate": 1.9848352927289224e-05, + "loss": 0.8307, + "step": 6495 + }, + { + "epoch": 1.0604056977266234, + "grad_norm": 1.7834426164627075, + "learning_rate": 1.9848297867690152e-05, + "loss": 0.6428, + "step": 6496 + }, + { + "epoch": 1.0605689563691278, + "grad_norm": 2.1293575763702393, + "learning_rate": 1.9848242798173846e-05, + "loss": 0.8863, + "step": 6497 + }, + { + "epoch": 1.0607322150116323, + "grad_norm": 1.8757915496826172, + "learning_rate": 1.9848187718740363e-05, + "loss": 0.7395, + "step": 6498 + }, + { + "epoch": 1.0608954736541365, + "grad_norm": 1.5795472860336304, + "learning_rate": 1.9848132629389757e-05, + "loss": 0.632, + "step": 6499 + }, + { + "epoch": 1.061058732296641, + "grad_norm": 1.74015212059021, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.6635, + "step": 6500 + }, + { + "epoch": 1.0612219909391454, + "grad_norm": 1.6464121341705322, + "learning_rate": 1.9848022420937398e-05, + "loss": 0.6114, + "step": 6501 + }, + { + "epoch": 1.0613852495816498, + "grad_norm": 1.8635272979736328, + "learning_rate": 1.9847967301835756e-05, + "loss": 0.6404, + "step": 6502 + }, + { + "epoch": 1.061548508224154, + "grad_norm": 1.721967101097107, + "learning_rate": 1.9847912172817215e-05, + "loss": 0.7079, + "step": 6503 + }, + { + "epoch": 1.0617117668666585, + "grad_norm": 1.7030422687530518, + "learning_rate": 1.984785703388183e-05, + "loss": 0.6446, + "step": 6504 + }, + { + "epoch": 1.061875025509163, + "grad_norm": 1.8117716312408447, + "learning_rate": 1.9847801885029652e-05, + "loss": 0.7431, + "step": 6505 + }, + { + "epoch": 1.0620382841516673, + "grad_norm": 1.884779453277588, + "learning_rate": 1.9847746726260742e-05, + "loss": 0.6949, + "step": 6506 + }, + { + "epoch": 1.0622015427941716, + "grad_norm": 1.7023359537124634, + "learning_rate": 1.9847691557575153e-05, + "loss": 0.7352, + "step": 6507 + }, + { + "epoch": 1.062364801436676, + "grad_norm": 2.025390148162842, + "learning_rate": 1.9847636378972944e-05, + "loss": 0.7879, + "step": 6508 + }, + { + "epoch": 1.0625280600791804, + "grad_norm": 1.7900786399841309, + "learning_rate": 1.9847581190454166e-05, + "loss": 0.7449, + "step": 6509 + }, + { + "epoch": 1.0626913187216849, + "grad_norm": 1.8524688482284546, + "learning_rate": 1.9847525992018877e-05, + "loss": 0.5716, + "step": 6510 + }, + { + "epoch": 1.0628545773641893, + "grad_norm": 2.3486416339874268, + "learning_rate": 1.9847470783667128e-05, + "loss": 0.8288, + "step": 6511 + }, + { + "epoch": 1.0630178360066935, + "grad_norm": 1.7486556768417358, + "learning_rate": 1.9847415565398985e-05, + "loss": 0.6412, + "step": 6512 + }, + { + "epoch": 1.063181094649198, + "grad_norm": 2.2853293418884277, + "learning_rate": 1.9847360337214492e-05, + "loss": 0.8744, + "step": 6513 + }, + { + "epoch": 1.0633443532917024, + "grad_norm": 1.9964988231658936, + "learning_rate": 1.9847305099113713e-05, + "loss": 0.8406, + "step": 6514 + }, + { + "epoch": 1.0635076119342068, + "grad_norm": 1.9495255947113037, + "learning_rate": 1.9847249851096696e-05, + "loss": 0.6728, + "step": 6515 + }, + { + "epoch": 1.063670870576711, + "grad_norm": 2.054844617843628, + "learning_rate": 1.9847194593163504e-05, + "loss": 0.8373, + "step": 6516 + }, + { + "epoch": 1.0638341292192155, + "grad_norm": 1.7116367816925049, + "learning_rate": 1.984713932531419e-05, + "loss": 0.5758, + "step": 6517 + }, + { + "epoch": 1.06399738786172, + "grad_norm": 1.9038788080215454, + "learning_rate": 1.984708404754881e-05, + "loss": 0.8737, + "step": 6518 + }, + { + "epoch": 1.0641606465042244, + "grad_norm": 1.902789831161499, + "learning_rate": 1.9847028759867417e-05, + "loss": 0.8002, + "step": 6519 + }, + { + "epoch": 1.0643239051467286, + "grad_norm": 2.3554506301879883, + "learning_rate": 1.984697346227007e-05, + "loss": 0.8593, + "step": 6520 + }, + { + "epoch": 1.064487163789233, + "grad_norm": 1.751593828201294, + "learning_rate": 1.9846918154756823e-05, + "loss": 0.7558, + "step": 6521 + }, + { + "epoch": 1.0646504224317375, + "grad_norm": 1.6111421585083008, + "learning_rate": 1.9846862837327733e-05, + "loss": 0.6807, + "step": 6522 + }, + { + "epoch": 1.064813681074242, + "grad_norm": 1.8458054065704346, + "learning_rate": 1.9846807509982854e-05, + "loss": 0.6997, + "step": 6523 + }, + { + "epoch": 1.0649769397167463, + "grad_norm": 1.7240962982177734, + "learning_rate": 1.9846752172722242e-05, + "loss": 0.6612, + "step": 6524 + }, + { + "epoch": 1.0651401983592506, + "grad_norm": 1.822969913482666, + "learning_rate": 1.984669682554595e-05, + "loss": 0.732, + "step": 6525 + }, + { + "epoch": 1.065303457001755, + "grad_norm": 2.0365076065063477, + "learning_rate": 1.984664146845404e-05, + "loss": 0.7476, + "step": 6526 + }, + { + "epoch": 1.0654667156442594, + "grad_norm": 2.104407548904419, + "learning_rate": 1.9846586101446567e-05, + "loss": 0.7717, + "step": 6527 + }, + { + "epoch": 1.0656299742867639, + "grad_norm": 1.7804725170135498, + "learning_rate": 1.9846530724523583e-05, + "loss": 0.7185, + "step": 6528 + }, + { + "epoch": 1.065793232929268, + "grad_norm": 1.8573238849639893, + "learning_rate": 1.9846475337685143e-05, + "loss": 0.7154, + "step": 6529 + }, + { + "epoch": 1.0659564915717725, + "grad_norm": 1.7709726095199585, + "learning_rate": 1.9846419940931305e-05, + "loss": 0.723, + "step": 6530 + }, + { + "epoch": 1.066119750214277, + "grad_norm": 1.8850891590118408, + "learning_rate": 1.9846364534262127e-05, + "loss": 0.9694, + "step": 6531 + }, + { + "epoch": 1.0662830088567814, + "grad_norm": 1.6783599853515625, + "learning_rate": 1.9846309117677662e-05, + "loss": 0.6751, + "step": 6532 + }, + { + "epoch": 1.0664462674992858, + "grad_norm": 2.040072441101074, + "learning_rate": 1.9846253691177965e-05, + "loss": 0.7128, + "step": 6533 + }, + { + "epoch": 1.06660952614179, + "grad_norm": 1.5698174238204956, + "learning_rate": 1.9846198254763097e-05, + "loss": 0.5784, + "step": 6534 + }, + { + "epoch": 1.0667727847842945, + "grad_norm": 1.7402125597000122, + "learning_rate": 1.9846142808433108e-05, + "loss": 0.7817, + "step": 6535 + }, + { + "epoch": 1.066936043426799, + "grad_norm": 1.799605369567871, + "learning_rate": 1.9846087352188053e-05, + "loss": 0.7424, + "step": 6536 + }, + { + "epoch": 1.0670993020693034, + "grad_norm": 1.6422302722930908, + "learning_rate": 1.9846031886027994e-05, + "loss": 0.6358, + "step": 6537 + }, + { + "epoch": 1.0672625607118076, + "grad_norm": 1.985588788986206, + "learning_rate": 1.984597640995298e-05, + "loss": 0.7278, + "step": 6538 + }, + { + "epoch": 1.067425819354312, + "grad_norm": 1.6890209913253784, + "learning_rate": 1.9845920923963072e-05, + "loss": 0.6707, + "step": 6539 + }, + { + "epoch": 1.0675890779968165, + "grad_norm": 2.059126853942871, + "learning_rate": 1.9845865428058325e-05, + "loss": 0.8534, + "step": 6540 + }, + { + "epoch": 1.067752336639321, + "grad_norm": 1.724249243736267, + "learning_rate": 1.9845809922238794e-05, + "loss": 0.7028, + "step": 6541 + }, + { + "epoch": 1.0679155952818253, + "grad_norm": 1.9257467985153198, + "learning_rate": 1.9845754406504535e-05, + "loss": 0.7676, + "step": 6542 + }, + { + "epoch": 1.0680788539243296, + "grad_norm": 1.7396624088287354, + "learning_rate": 1.9845698880855603e-05, + "loss": 0.7737, + "step": 6543 + }, + { + "epoch": 1.068242112566834, + "grad_norm": 1.868084192276001, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.6294, + "step": 6544 + }, + { + "epoch": 1.0684053712093384, + "grad_norm": 2.02315092086792, + "learning_rate": 1.9845587799813947e-05, + "loss": 0.6757, + "step": 6545 + }, + { + "epoch": 1.0685686298518429, + "grad_norm": 1.730357050895691, + "learning_rate": 1.9845532244421334e-05, + "loss": 0.7325, + "step": 6546 + }, + { + "epoch": 1.068731888494347, + "grad_norm": 1.9987250566482544, + "learning_rate": 1.9845476679114276e-05, + "loss": 0.632, + "step": 6547 + }, + { + "epoch": 1.0688951471368515, + "grad_norm": 2.0440049171447754, + "learning_rate": 1.984542110389282e-05, + "loss": 0.7285, + "step": 6548 + }, + { + "epoch": 1.069058405779356, + "grad_norm": 1.7097535133361816, + "learning_rate": 1.9845365518757033e-05, + "loss": 0.654, + "step": 6549 + }, + { + "epoch": 1.0692216644218604, + "grad_norm": 1.4935684204101562, + "learning_rate": 1.9845309923706965e-05, + "loss": 0.556, + "step": 6550 + }, + { + "epoch": 1.0693849230643648, + "grad_norm": 1.8168354034423828, + "learning_rate": 1.9845254318742668e-05, + "loss": 0.6495, + "step": 6551 + }, + { + "epoch": 1.069548181706869, + "grad_norm": 1.7025024890899658, + "learning_rate": 1.9845198703864205e-05, + "loss": 0.5683, + "step": 6552 + }, + { + "epoch": 1.0697114403493735, + "grad_norm": 2.0251781940460205, + "learning_rate": 1.9845143079071632e-05, + "loss": 0.8429, + "step": 6553 + }, + { + "epoch": 1.069874698991878, + "grad_norm": 1.65981924533844, + "learning_rate": 1.9845087444365002e-05, + "loss": 0.6155, + "step": 6554 + }, + { + "epoch": 1.0700379576343824, + "grad_norm": 2.0557613372802734, + "learning_rate": 1.9845031799744367e-05, + "loss": 0.7641, + "step": 6555 + }, + { + "epoch": 1.0702012162768866, + "grad_norm": 1.9714964628219604, + "learning_rate": 1.984497614520979e-05, + "loss": 0.6825, + "step": 6556 + }, + { + "epoch": 1.070364474919391, + "grad_norm": 1.9259599447250366, + "learning_rate": 1.9844920480761327e-05, + "loss": 0.6782, + "step": 6557 + }, + { + "epoch": 1.0705277335618955, + "grad_norm": 1.8293898105621338, + "learning_rate": 1.984486480639903e-05, + "loss": 0.7082, + "step": 6558 + }, + { + "epoch": 1.0706909922044, + "grad_norm": 1.7960492372512817, + "learning_rate": 1.9844809122122955e-05, + "loss": 0.7651, + "step": 6559 + }, + { + "epoch": 1.070854250846904, + "grad_norm": 1.6418561935424805, + "learning_rate": 1.9844753427933163e-05, + "loss": 0.5683, + "step": 6560 + }, + { + "epoch": 1.0710175094894085, + "grad_norm": 2.100053310394287, + "learning_rate": 1.9844697723829703e-05, + "loss": 0.7285, + "step": 6561 + }, + { + "epoch": 1.071180768131913, + "grad_norm": 2.004775285720825, + "learning_rate": 1.9844642009812637e-05, + "loss": 0.8311, + "step": 6562 + }, + { + "epoch": 1.0713440267744174, + "grad_norm": 1.6053495407104492, + "learning_rate": 1.984458628588202e-05, + "loss": 0.5773, + "step": 6563 + }, + { + "epoch": 1.0715072854169219, + "grad_norm": 1.648263931274414, + "learning_rate": 1.984453055203791e-05, + "loss": 0.5895, + "step": 6564 + }, + { + "epoch": 1.071670544059426, + "grad_norm": 2.4731173515319824, + "learning_rate": 1.9844474808280355e-05, + "loss": 0.7566, + "step": 6565 + }, + { + "epoch": 1.0718338027019305, + "grad_norm": 1.8368890285491943, + "learning_rate": 1.9844419054609418e-05, + "loss": 0.6419, + "step": 6566 + }, + { + "epoch": 1.071997061344435, + "grad_norm": 1.9592753648757935, + "learning_rate": 1.9844363291025154e-05, + "loss": 0.6806, + "step": 6567 + }, + { + "epoch": 1.0721603199869394, + "grad_norm": 2.030353546142578, + "learning_rate": 1.9844307517527622e-05, + "loss": 0.8372, + "step": 6568 + }, + { + "epoch": 1.0723235786294436, + "grad_norm": 2.108290195465088, + "learning_rate": 1.9844251734116867e-05, + "loss": 0.6782, + "step": 6569 + }, + { + "epoch": 1.072486837271948, + "grad_norm": 1.9641799926757812, + "learning_rate": 1.984419594079296e-05, + "loss": 0.855, + "step": 6570 + }, + { + "epoch": 1.0726500959144525, + "grad_norm": 2.051434278488159, + "learning_rate": 1.9844140137555946e-05, + "loss": 0.7145, + "step": 6571 + }, + { + "epoch": 1.072813354556957, + "grad_norm": 1.8669847249984741, + "learning_rate": 1.984408432440589e-05, + "loss": 0.6731, + "step": 6572 + }, + { + "epoch": 1.0729766131994611, + "grad_norm": 1.75222909450531, + "learning_rate": 1.984402850134284e-05, + "loss": 0.6348, + "step": 6573 + }, + { + "epoch": 1.0731398718419656, + "grad_norm": 1.7861241102218628, + "learning_rate": 1.984397266836686e-05, + "loss": 0.7489, + "step": 6574 + }, + { + "epoch": 1.07330313048447, + "grad_norm": 1.9785913228988647, + "learning_rate": 1.9843916825477997e-05, + "loss": 0.6934, + "step": 6575 + }, + { + "epoch": 1.0734663891269745, + "grad_norm": 1.661584496498108, + "learning_rate": 1.9843860972676318e-05, + "loss": 0.6539, + "step": 6576 + }, + { + "epoch": 1.073629647769479, + "grad_norm": 2.224846601486206, + "learning_rate": 1.984380510996187e-05, + "loss": 0.9455, + "step": 6577 + }, + { + "epoch": 1.073792906411983, + "grad_norm": 1.7586872577667236, + "learning_rate": 1.9843749237334714e-05, + "loss": 0.6399, + "step": 6578 + }, + { + "epoch": 1.0739561650544875, + "grad_norm": 1.882796049118042, + "learning_rate": 1.98436933547949e-05, + "loss": 0.672, + "step": 6579 + }, + { + "epoch": 1.074119423696992, + "grad_norm": 1.898260235786438, + "learning_rate": 1.9843637462342498e-05, + "loss": 0.7816, + "step": 6580 + }, + { + "epoch": 1.0742826823394964, + "grad_norm": 1.8620870113372803, + "learning_rate": 1.9843581559977553e-05, + "loss": 0.7026, + "step": 6581 + }, + { + "epoch": 1.0744459409820006, + "grad_norm": 1.5727565288543701, + "learning_rate": 1.984352564770012e-05, + "loss": 0.5662, + "step": 6582 + }, + { + "epoch": 1.074609199624505, + "grad_norm": 1.3328168392181396, + "learning_rate": 1.984346972551026e-05, + "loss": 0.5136, + "step": 6583 + }, + { + "epoch": 1.0747724582670095, + "grad_norm": 2.0531411170959473, + "learning_rate": 1.984341379340803e-05, + "loss": 0.726, + "step": 6584 + }, + { + "epoch": 1.074935716909514, + "grad_norm": 2.056612253189087, + "learning_rate": 1.984335785139349e-05, + "loss": 0.8, + "step": 6585 + }, + { + "epoch": 1.0750989755520184, + "grad_norm": 1.8384311199188232, + "learning_rate": 1.9843301899466682e-05, + "loss": 0.706, + "step": 6586 + }, + { + "epoch": 1.0752622341945226, + "grad_norm": 2.2263176441192627, + "learning_rate": 1.984324593762768e-05, + "loss": 0.8344, + "step": 6587 + }, + { + "epoch": 1.075425492837027, + "grad_norm": 1.9422591924667358, + "learning_rate": 1.9843189965876525e-05, + "loss": 0.71, + "step": 6588 + }, + { + "epoch": 1.0755887514795315, + "grad_norm": 1.6306496858596802, + "learning_rate": 1.9843133984213284e-05, + "loss": 0.5425, + "step": 6589 + }, + { + "epoch": 1.075752010122036, + "grad_norm": 2.018303155899048, + "learning_rate": 1.984307799263801e-05, + "loss": 0.7406, + "step": 6590 + }, + { + "epoch": 1.0759152687645401, + "grad_norm": 1.973543643951416, + "learning_rate": 1.984302199115076e-05, + "loss": 0.6378, + "step": 6591 + }, + { + "epoch": 1.0760785274070446, + "grad_norm": 2.021148443222046, + "learning_rate": 1.9842965979751586e-05, + "loss": 0.7338, + "step": 6592 + }, + { + "epoch": 1.076241786049549, + "grad_norm": 1.8328499794006348, + "learning_rate": 1.9842909958440552e-05, + "loss": 0.7935, + "step": 6593 + }, + { + "epoch": 1.0764050446920534, + "grad_norm": 1.772855520248413, + "learning_rate": 1.9842853927217708e-05, + "loss": 0.5776, + "step": 6594 + }, + { + "epoch": 1.0765683033345579, + "grad_norm": 1.8396480083465576, + "learning_rate": 1.984279788608311e-05, + "loss": 0.6988, + "step": 6595 + }, + { + "epoch": 1.076731561977062, + "grad_norm": 2.009335517883301, + "learning_rate": 1.9842741835036817e-05, + "loss": 0.7981, + "step": 6596 + }, + { + "epoch": 1.0768948206195665, + "grad_norm": 1.584448218345642, + "learning_rate": 1.984268577407889e-05, + "loss": 0.67, + "step": 6597 + }, + { + "epoch": 1.077058079262071, + "grad_norm": 2.006624460220337, + "learning_rate": 1.984262970320938e-05, + "loss": 0.787, + "step": 6598 + }, + { + "epoch": 1.0772213379045754, + "grad_norm": 1.5471735000610352, + "learning_rate": 1.9842573622428346e-05, + "loss": 0.5102, + "step": 6599 + }, + { + "epoch": 1.0773845965470796, + "grad_norm": 1.5162869691848755, + "learning_rate": 1.9842517531735837e-05, + "loss": 0.5633, + "step": 6600 + }, + { + "epoch": 1.077547855189584, + "grad_norm": 1.5768474340438843, + "learning_rate": 1.9842461431131922e-05, + "loss": 0.6684, + "step": 6601 + }, + { + "epoch": 1.0777111138320885, + "grad_norm": 1.9167141914367676, + "learning_rate": 1.9842405320616647e-05, + "loss": 0.7495, + "step": 6602 + }, + { + "epoch": 1.077874372474593, + "grad_norm": 2.0941808223724365, + "learning_rate": 1.9842349200190073e-05, + "loss": 0.7309, + "step": 6603 + }, + { + "epoch": 1.0780376311170972, + "grad_norm": 1.6425756216049194, + "learning_rate": 1.9842293069852258e-05, + "loss": 0.5606, + "step": 6604 + }, + { + "epoch": 1.0782008897596016, + "grad_norm": 2.034040689468384, + "learning_rate": 1.9842236929603253e-05, + "loss": 0.7984, + "step": 6605 + }, + { + "epoch": 1.078364148402106, + "grad_norm": 1.6111505031585693, + "learning_rate": 1.984218077944312e-05, + "loss": 0.6204, + "step": 6606 + }, + { + "epoch": 1.0785274070446105, + "grad_norm": 1.7184796333312988, + "learning_rate": 1.9842124619371918e-05, + "loss": 0.6918, + "step": 6607 + }, + { + "epoch": 1.078690665687115, + "grad_norm": 1.7415813207626343, + "learning_rate": 1.984206844938969e-05, + "loss": 0.6403, + "step": 6608 + }, + { + "epoch": 1.0788539243296191, + "grad_norm": 1.9236661195755005, + "learning_rate": 1.984201226949651e-05, + "loss": 0.6855, + "step": 6609 + }, + { + "epoch": 1.0790171829721236, + "grad_norm": 1.8409168720245361, + "learning_rate": 1.984195607969242e-05, + "loss": 0.6473, + "step": 6610 + }, + { + "epoch": 1.079180441614628, + "grad_norm": 1.800718069076538, + "learning_rate": 1.9841899879977485e-05, + "loss": 0.7693, + "step": 6611 + }, + { + "epoch": 1.0793437002571324, + "grad_norm": 2.3091940879821777, + "learning_rate": 1.9841843670351762e-05, + "loss": 0.7447, + "step": 6612 + }, + { + "epoch": 1.0795069588996367, + "grad_norm": 1.992472529411316, + "learning_rate": 1.9841787450815303e-05, + "loss": 0.7803, + "step": 6613 + }, + { + "epoch": 1.079670217542141, + "grad_norm": 2.1091084480285645, + "learning_rate": 1.9841731221368166e-05, + "loss": 0.7881, + "step": 6614 + }, + { + "epoch": 1.0798334761846455, + "grad_norm": 1.6654093265533447, + "learning_rate": 1.9841674982010408e-05, + "loss": 0.6653, + "step": 6615 + }, + { + "epoch": 1.07999673482715, + "grad_norm": 1.7147135734558105, + "learning_rate": 1.984161873274209e-05, + "loss": 0.6555, + "step": 6616 + }, + { + "epoch": 1.0801599934696542, + "grad_norm": 1.780023217201233, + "learning_rate": 1.984156247356326e-05, + "loss": 0.6606, + "step": 6617 + }, + { + "epoch": 1.0803232521121586, + "grad_norm": 1.875758171081543, + "learning_rate": 1.984150620447398e-05, + "loss": 0.6295, + "step": 6618 + }, + { + "epoch": 1.080486510754663, + "grad_norm": 1.5792806148529053, + "learning_rate": 1.9841449925474307e-05, + "loss": 0.7325, + "step": 6619 + }, + { + "epoch": 1.0806497693971675, + "grad_norm": 1.7335821390151978, + "learning_rate": 1.9841393636564295e-05, + "loss": 0.6596, + "step": 6620 + }, + { + "epoch": 1.080813028039672, + "grad_norm": 1.9912279844284058, + "learning_rate": 1.9841337337744004e-05, + "loss": 0.6337, + "step": 6621 + }, + { + "epoch": 1.0809762866821762, + "grad_norm": 1.6391801834106445, + "learning_rate": 1.9841281029013488e-05, + "loss": 0.6601, + "step": 6622 + }, + { + "epoch": 1.0811395453246806, + "grad_norm": 2.2245352268218994, + "learning_rate": 1.9841224710372805e-05, + "loss": 0.7679, + "step": 6623 + }, + { + "epoch": 1.081302803967185, + "grad_norm": 1.8162120580673218, + "learning_rate": 1.984116838182201e-05, + "loss": 0.8066, + "step": 6624 + }, + { + "epoch": 1.0814660626096895, + "grad_norm": 2.0029609203338623, + "learning_rate": 1.984111204336116e-05, + "loss": 0.7806, + "step": 6625 + }, + { + "epoch": 1.0816293212521937, + "grad_norm": 1.8661329746246338, + "learning_rate": 1.9841055694990315e-05, + "loss": 0.8323, + "step": 6626 + }, + { + "epoch": 1.0817925798946981, + "grad_norm": 2.160513162612915, + "learning_rate": 1.984099933670953e-05, + "loss": 0.82, + "step": 6627 + }, + { + "epoch": 1.0819558385372026, + "grad_norm": 2.184948682785034, + "learning_rate": 1.984094296851886e-05, + "loss": 0.7703, + "step": 6628 + }, + { + "epoch": 1.082119097179707, + "grad_norm": 1.5797338485717773, + "learning_rate": 1.9840886590418366e-05, + "loss": 0.5749, + "step": 6629 + }, + { + "epoch": 1.0822823558222114, + "grad_norm": 2.1470658779144287, + "learning_rate": 1.98408302024081e-05, + "loss": 0.7633, + "step": 6630 + }, + { + "epoch": 1.0824456144647157, + "grad_norm": 1.7773462533950806, + "learning_rate": 1.984077380448812e-05, + "loss": 0.7237, + "step": 6631 + }, + { + "epoch": 1.08260887310722, + "grad_norm": 1.9119062423706055, + "learning_rate": 1.9840717396658483e-05, + "loss": 0.6825, + "step": 6632 + }, + { + "epoch": 1.0827721317497245, + "grad_norm": 1.8408288955688477, + "learning_rate": 1.984066097891925e-05, + "loss": 0.614, + "step": 6633 + }, + { + "epoch": 1.082935390392229, + "grad_norm": 1.7402695417404175, + "learning_rate": 1.9840604551270467e-05, + "loss": 0.7003, + "step": 6634 + }, + { + "epoch": 1.0830986490347332, + "grad_norm": 1.747113585472107, + "learning_rate": 1.98405481137122e-05, + "loss": 0.7841, + "step": 6635 + }, + { + "epoch": 1.0832619076772376, + "grad_norm": 2.0237865447998047, + "learning_rate": 1.9840491666244508e-05, + "loss": 0.7104, + "step": 6636 + }, + { + "epoch": 1.083425166319742, + "grad_norm": 1.9401023387908936, + "learning_rate": 1.984043520886744e-05, + "loss": 0.8054, + "step": 6637 + }, + { + "epoch": 1.0835884249622465, + "grad_norm": 1.8557326793670654, + "learning_rate": 1.984037874158106e-05, + "loss": 0.6534, + "step": 6638 + }, + { + "epoch": 1.083751683604751, + "grad_norm": 1.9136977195739746, + "learning_rate": 1.9840322264385418e-05, + "loss": 0.6825, + "step": 6639 + }, + { + "epoch": 1.0839149422472552, + "grad_norm": 1.8995481729507446, + "learning_rate": 1.984026577728057e-05, + "loss": 0.7275, + "step": 6640 + }, + { + "epoch": 1.0840782008897596, + "grad_norm": 1.9766427278518677, + "learning_rate": 1.9840209280266585e-05, + "loss": 0.6974, + "step": 6641 + }, + { + "epoch": 1.084241459532264, + "grad_norm": 1.9898711442947388, + "learning_rate": 1.9840152773343506e-05, + "loss": 0.7818, + "step": 6642 + }, + { + "epoch": 1.0844047181747685, + "grad_norm": 1.780605435371399, + "learning_rate": 1.9840096256511398e-05, + "loss": 0.7254, + "step": 6643 + }, + { + "epoch": 1.0845679768172727, + "grad_norm": 2.2149736881256104, + "learning_rate": 1.9840039729770316e-05, + "loss": 0.7445, + "step": 6644 + }, + { + "epoch": 1.0847312354597771, + "grad_norm": 2.1186225414276123, + "learning_rate": 1.9839983193120317e-05, + "loss": 0.8586, + "step": 6645 + }, + { + "epoch": 1.0848944941022816, + "grad_norm": 1.6025733947753906, + "learning_rate": 1.9839926646561456e-05, + "loss": 0.5964, + "step": 6646 + }, + { + "epoch": 1.085057752744786, + "grad_norm": 1.7277116775512695, + "learning_rate": 1.9839870090093793e-05, + "loss": 0.706, + "step": 6647 + }, + { + "epoch": 1.0852210113872902, + "grad_norm": 1.8434275388717651, + "learning_rate": 1.9839813523717383e-05, + "loss": 0.5565, + "step": 6648 + }, + { + "epoch": 1.0853842700297947, + "grad_norm": 1.7056304216384888, + "learning_rate": 1.9839756947432283e-05, + "loss": 0.6498, + "step": 6649 + }, + { + "epoch": 1.085547528672299, + "grad_norm": 1.8487529754638672, + "learning_rate": 1.9839700361238548e-05, + "loss": 0.8111, + "step": 6650 + }, + { + "epoch": 1.0857107873148035, + "grad_norm": 1.73468017578125, + "learning_rate": 1.9839643765136242e-05, + "loss": 0.6909, + "step": 6651 + }, + { + "epoch": 1.085874045957308, + "grad_norm": 1.5972963571548462, + "learning_rate": 1.9839587159125415e-05, + "loss": 0.6562, + "step": 6652 + }, + { + "epoch": 1.0860373045998122, + "grad_norm": 1.9899357557296753, + "learning_rate": 1.9839530543206126e-05, + "loss": 0.7233, + "step": 6653 + }, + { + "epoch": 1.0862005632423166, + "grad_norm": 1.976824164390564, + "learning_rate": 1.9839473917378432e-05, + "loss": 0.7709, + "step": 6654 + }, + { + "epoch": 1.086363821884821, + "grad_norm": 1.8326069116592407, + "learning_rate": 1.9839417281642394e-05, + "loss": 0.7178, + "step": 6655 + }, + { + "epoch": 1.0865270805273255, + "grad_norm": 1.8693517446517944, + "learning_rate": 1.9839360635998062e-05, + "loss": 0.6715, + "step": 6656 + }, + { + "epoch": 1.0866903391698297, + "grad_norm": 1.7195690870285034, + "learning_rate": 1.9839303980445498e-05, + "loss": 0.5223, + "step": 6657 + }, + { + "epoch": 1.0868535978123341, + "grad_norm": 2.2858951091766357, + "learning_rate": 1.9839247314984756e-05, + "loss": 0.8267, + "step": 6658 + }, + { + "epoch": 1.0870168564548386, + "grad_norm": 1.7932099103927612, + "learning_rate": 1.9839190639615894e-05, + "loss": 0.6833, + "step": 6659 + }, + { + "epoch": 1.087180115097343, + "grad_norm": 1.4854425191879272, + "learning_rate": 1.9839133954338972e-05, + "loss": 0.5713, + "step": 6660 + }, + { + "epoch": 1.0873433737398472, + "grad_norm": 1.6302744150161743, + "learning_rate": 1.983907725915404e-05, + "loss": 0.5961, + "step": 6661 + }, + { + "epoch": 1.0875066323823517, + "grad_norm": 1.8020672798156738, + "learning_rate": 1.9839020554061167e-05, + "loss": 0.9097, + "step": 6662 + }, + { + "epoch": 1.0876698910248561, + "grad_norm": 1.9074314832687378, + "learning_rate": 1.9838963839060395e-05, + "loss": 0.7573, + "step": 6663 + }, + { + "epoch": 1.0878331496673606, + "grad_norm": 1.7917287349700928, + "learning_rate": 1.9838907114151794e-05, + "loss": 0.5639, + "step": 6664 + }, + { + "epoch": 1.087996408309865, + "grad_norm": 1.9362990856170654, + "learning_rate": 1.983885037933542e-05, + "loss": 0.8913, + "step": 6665 + }, + { + "epoch": 1.0881596669523692, + "grad_norm": 1.9013866186141968, + "learning_rate": 1.983879363461132e-05, + "loss": 0.79, + "step": 6666 + }, + { + "epoch": 1.0883229255948736, + "grad_norm": 1.799142837524414, + "learning_rate": 1.983873687997956e-05, + "loss": 0.6431, + "step": 6667 + }, + { + "epoch": 1.088486184237378, + "grad_norm": 1.4831737279891968, + "learning_rate": 1.983868011544019e-05, + "loss": 0.5704, + "step": 6668 + }, + { + "epoch": 1.0886494428798825, + "grad_norm": 2.0343286991119385, + "learning_rate": 1.983862334099328e-05, + "loss": 0.8351, + "step": 6669 + }, + { + "epoch": 1.0888127015223867, + "grad_norm": 1.5970900058746338, + "learning_rate": 1.9838566556638872e-05, + "loss": 0.5001, + "step": 6670 + }, + { + "epoch": 1.0889759601648912, + "grad_norm": 2.1249027252197266, + "learning_rate": 1.9838509762377033e-05, + "loss": 0.7897, + "step": 6671 + }, + { + "epoch": 1.0891392188073956, + "grad_norm": 1.9271678924560547, + "learning_rate": 1.9838452958207816e-05, + "loss": 0.6199, + "step": 6672 + }, + { + "epoch": 1.0893024774499, + "grad_norm": 2.1807351112365723, + "learning_rate": 1.983839614413128e-05, + "loss": 1.1245, + "step": 6673 + }, + { + "epoch": 1.0894657360924045, + "grad_norm": 2.3089358806610107, + "learning_rate": 1.9838339320147483e-05, + "loss": 0.7157, + "step": 6674 + }, + { + "epoch": 1.0896289947349087, + "grad_norm": 1.7235743999481201, + "learning_rate": 1.983828248625648e-05, + "loss": 0.6675, + "step": 6675 + }, + { + "epoch": 1.0897922533774131, + "grad_norm": 2.100759744644165, + "learning_rate": 1.983822564245833e-05, + "loss": 0.6916, + "step": 6676 + }, + { + "epoch": 1.0899555120199176, + "grad_norm": 1.8754830360412598, + "learning_rate": 1.9838168788753088e-05, + "loss": 0.8093, + "step": 6677 + }, + { + "epoch": 1.090118770662422, + "grad_norm": 1.998581051826477, + "learning_rate": 1.983811192514081e-05, + "loss": 0.6973, + "step": 6678 + }, + { + "epoch": 1.0902820293049262, + "grad_norm": 1.96390962600708, + "learning_rate": 1.983805505162156e-05, + "loss": 0.7281, + "step": 6679 + }, + { + "epoch": 1.0904452879474307, + "grad_norm": 1.7440420389175415, + "learning_rate": 1.983799816819539e-05, + "loss": 0.6581, + "step": 6680 + }, + { + "epoch": 1.0906085465899351, + "grad_norm": 1.7326900959014893, + "learning_rate": 1.9837941274862358e-05, + "loss": 0.7065, + "step": 6681 + }, + { + "epoch": 1.0907718052324396, + "grad_norm": 1.9566411972045898, + "learning_rate": 1.9837884371622524e-05, + "loss": 0.6938, + "step": 6682 + }, + { + "epoch": 1.090935063874944, + "grad_norm": 1.9278169870376587, + "learning_rate": 1.983782745847594e-05, + "loss": 0.7652, + "step": 6683 + }, + { + "epoch": 1.0910983225174482, + "grad_norm": 1.527809500694275, + "learning_rate": 1.9837770535422668e-05, + "loss": 0.5814, + "step": 6684 + }, + { + "epoch": 1.0912615811599526, + "grad_norm": 1.8250073194503784, + "learning_rate": 1.9837713602462762e-05, + "loss": 0.732, + "step": 6685 + }, + { + "epoch": 1.091424839802457, + "grad_norm": 1.928114652633667, + "learning_rate": 1.9837656659596283e-05, + "loss": 0.737, + "step": 6686 + }, + { + "epoch": 1.0915880984449615, + "grad_norm": 1.8044427633285522, + "learning_rate": 1.9837599706823284e-05, + "loss": 0.6829, + "step": 6687 + }, + { + "epoch": 1.0917513570874657, + "grad_norm": 2.185000419616699, + "learning_rate": 1.9837542744143827e-05, + "loss": 0.7051, + "step": 6688 + }, + { + "epoch": 1.0919146157299702, + "grad_norm": 1.6885191202163696, + "learning_rate": 1.9837485771557968e-05, + "loss": 0.5987, + "step": 6689 + }, + { + "epoch": 1.0920778743724746, + "grad_norm": 1.9911586046218872, + "learning_rate": 1.983742878906576e-05, + "loss": 0.7108, + "step": 6690 + }, + { + "epoch": 1.092241133014979, + "grad_norm": 1.8788814544677734, + "learning_rate": 1.9837371796667265e-05, + "loss": 0.5904, + "step": 6691 + }, + { + "epoch": 1.0924043916574833, + "grad_norm": 2.096450090408325, + "learning_rate": 1.983731479436254e-05, + "loss": 0.7639, + "step": 6692 + }, + { + "epoch": 1.0925676502999877, + "grad_norm": 1.6665771007537842, + "learning_rate": 1.9837257782151643e-05, + "loss": 0.5899, + "step": 6693 + }, + { + "epoch": 1.0927309089424921, + "grad_norm": 2.044680118560791, + "learning_rate": 1.9837200760034627e-05, + "loss": 0.8299, + "step": 6694 + }, + { + "epoch": 1.0928941675849966, + "grad_norm": 1.6426305770874023, + "learning_rate": 1.9837143728011555e-05, + "loss": 0.652, + "step": 6695 + }, + { + "epoch": 1.093057426227501, + "grad_norm": 1.7755138874053955, + "learning_rate": 1.983708668608248e-05, + "loss": 0.6615, + "step": 6696 + }, + { + "epoch": 1.0932206848700052, + "grad_norm": 2.2207517623901367, + "learning_rate": 1.9837029634247465e-05, + "loss": 0.7706, + "step": 6697 + }, + { + "epoch": 1.0933839435125097, + "grad_norm": 1.7230175733566284, + "learning_rate": 1.9836972572506557e-05, + "loss": 0.7526, + "step": 6698 + }, + { + "epoch": 1.093547202155014, + "grad_norm": 1.8972721099853516, + "learning_rate": 1.9836915500859825e-05, + "loss": 0.7701, + "step": 6699 + }, + { + "epoch": 1.0937104607975185, + "grad_norm": 1.9653180837631226, + "learning_rate": 1.9836858419307325e-05, + "loss": 0.7227, + "step": 6700 + }, + { + "epoch": 1.0938737194400228, + "grad_norm": 2.07397198677063, + "learning_rate": 1.9836801327849105e-05, + "loss": 0.7456, + "step": 6701 + }, + { + "epoch": 1.0940369780825272, + "grad_norm": 1.9206992387771606, + "learning_rate": 1.9836744226485232e-05, + "loss": 0.6984, + "step": 6702 + }, + { + "epoch": 1.0942002367250316, + "grad_norm": 2.120697259902954, + "learning_rate": 1.983668711521576e-05, + "loss": 0.8265, + "step": 6703 + }, + { + "epoch": 1.094363495367536, + "grad_norm": 1.6795170307159424, + "learning_rate": 1.983662999404074e-05, + "loss": 0.7036, + "step": 6704 + }, + { + "epoch": 1.0945267540100403, + "grad_norm": 1.9468168020248413, + "learning_rate": 1.9836572862960242e-05, + "loss": 0.7614, + "step": 6705 + }, + { + "epoch": 1.0946900126525447, + "grad_norm": 1.7960861921310425, + "learning_rate": 1.983651572197432e-05, + "loss": 0.7041, + "step": 6706 + }, + { + "epoch": 1.0948532712950492, + "grad_norm": 1.7694761753082275, + "learning_rate": 1.9836458571083027e-05, + "loss": 0.7586, + "step": 6707 + }, + { + "epoch": 1.0950165299375536, + "grad_norm": 1.8289145231246948, + "learning_rate": 1.983640141028642e-05, + "loss": 0.8302, + "step": 6708 + }, + { + "epoch": 1.095179788580058, + "grad_norm": 1.9909168481826782, + "learning_rate": 1.9836344239584566e-05, + "loss": 0.817, + "step": 6709 + }, + { + "epoch": 1.0953430472225623, + "grad_norm": 1.8028284311294556, + "learning_rate": 1.983628705897751e-05, + "loss": 0.7386, + "step": 6710 + }, + { + "epoch": 1.0955063058650667, + "grad_norm": 1.9929534196853638, + "learning_rate": 1.9836229868465318e-05, + "loss": 0.7484, + "step": 6711 + }, + { + "epoch": 1.0956695645075711, + "grad_norm": 2.0461676120758057, + "learning_rate": 1.9836172668048043e-05, + "loss": 1.3219, + "step": 6712 + }, + { + "epoch": 1.0958328231500756, + "grad_norm": 1.7320629358291626, + "learning_rate": 1.9836115457725745e-05, + "loss": 0.7366, + "step": 6713 + }, + { + "epoch": 1.0959960817925798, + "grad_norm": 2.1333844661712646, + "learning_rate": 1.9836058237498487e-05, + "loss": 1.1913, + "step": 6714 + }, + { + "epoch": 1.0961593404350842, + "grad_norm": 1.4836969375610352, + "learning_rate": 1.9836001007366317e-05, + "loss": 0.5272, + "step": 6715 + }, + { + "epoch": 1.0963225990775887, + "grad_norm": 1.635049819946289, + "learning_rate": 1.9835943767329297e-05, + "loss": 0.5816, + "step": 6716 + }, + { + "epoch": 1.096485857720093, + "grad_norm": 1.8632649183273315, + "learning_rate": 1.9835886517387483e-05, + "loss": 0.7868, + "step": 6717 + }, + { + "epoch": 1.0966491163625975, + "grad_norm": 1.587315559387207, + "learning_rate": 1.9835829257540933e-05, + "loss": 0.61, + "step": 6718 + }, + { + "epoch": 1.0968123750051018, + "grad_norm": 1.6050032377243042, + "learning_rate": 1.9835771987789706e-05, + "loss": 0.6116, + "step": 6719 + }, + { + "epoch": 1.0969756336476062, + "grad_norm": 2.281839370727539, + "learning_rate": 1.983571470813386e-05, + "loss": 0.7951, + "step": 6720 + }, + { + "epoch": 1.0971388922901106, + "grad_norm": 1.5932621955871582, + "learning_rate": 1.9835657418573453e-05, + "loss": 0.6, + "step": 6721 + }, + { + "epoch": 1.097302150932615, + "grad_norm": 1.9839459657669067, + "learning_rate": 1.983560011910854e-05, + "loss": 0.7756, + "step": 6722 + }, + { + "epoch": 1.0974654095751193, + "grad_norm": 2.067816734313965, + "learning_rate": 1.9835542809739183e-05, + "loss": 0.6887, + "step": 6723 + }, + { + "epoch": 1.0976286682176237, + "grad_norm": 1.8881237506866455, + "learning_rate": 1.9835485490465435e-05, + "loss": 0.8247, + "step": 6724 + }, + { + "epoch": 1.0977919268601282, + "grad_norm": 1.849304437637329, + "learning_rate": 1.9835428161287355e-05, + "loss": 0.6657, + "step": 6725 + }, + { + "epoch": 1.0979551855026326, + "grad_norm": 1.6956336498260498, + "learning_rate": 1.9835370822205e-05, + "loss": 0.6685, + "step": 6726 + }, + { + "epoch": 1.098118444145137, + "grad_norm": 1.6432068347930908, + "learning_rate": 1.983531347321843e-05, + "loss": 0.6313, + "step": 6727 + }, + { + "epoch": 1.0982817027876413, + "grad_norm": 1.582260012626648, + "learning_rate": 1.9835256114327706e-05, + "loss": 0.5925, + "step": 6728 + }, + { + "epoch": 1.0984449614301457, + "grad_norm": 2.0107574462890625, + "learning_rate": 1.983519874553288e-05, + "loss": 0.7573, + "step": 6729 + }, + { + "epoch": 1.0986082200726501, + "grad_norm": 1.5984526872634888, + "learning_rate": 1.9835141366834006e-05, + "loss": 0.664, + "step": 6730 + }, + { + "epoch": 1.0987714787151546, + "grad_norm": 1.8791431188583374, + "learning_rate": 1.9835083978231157e-05, + "loss": 0.626, + "step": 6731 + }, + { + "epoch": 1.0989347373576588, + "grad_norm": 1.9137969017028809, + "learning_rate": 1.9835026579724372e-05, + "loss": 0.8716, + "step": 6732 + }, + { + "epoch": 1.0990979960001632, + "grad_norm": 1.7189158201217651, + "learning_rate": 1.9834969171313722e-05, + "loss": 0.6836, + "step": 6733 + }, + { + "epoch": 1.0992612546426677, + "grad_norm": 2.041017532348633, + "learning_rate": 1.983491175299926e-05, + "loss": 0.7335, + "step": 6734 + }, + { + "epoch": 1.099424513285172, + "grad_norm": 1.7586238384246826, + "learning_rate": 1.9834854324781044e-05, + "loss": 0.6964, + "step": 6735 + }, + { + "epoch": 1.0995877719276765, + "grad_norm": 3.443878412246704, + "learning_rate": 1.9834796886659135e-05, + "loss": 0.8846, + "step": 6736 + }, + { + "epoch": 1.0997510305701808, + "grad_norm": 1.6668941974639893, + "learning_rate": 1.9834739438633584e-05, + "loss": 0.5762, + "step": 6737 + }, + { + "epoch": 1.0999142892126852, + "grad_norm": 2.1670210361480713, + "learning_rate": 1.9834681980704456e-05, + "loss": 0.7511, + "step": 6738 + }, + { + "epoch": 1.1000775478551896, + "grad_norm": 1.9482485055923462, + "learning_rate": 1.9834624512871806e-05, + "loss": 0.7927, + "step": 6739 + }, + { + "epoch": 1.100240806497694, + "grad_norm": 1.5745545625686646, + "learning_rate": 1.983456703513569e-05, + "loss": 0.5912, + "step": 6740 + }, + { + "epoch": 1.1004040651401983, + "grad_norm": 1.7891712188720703, + "learning_rate": 1.9834509547496167e-05, + "loss": 0.719, + "step": 6741 + }, + { + "epoch": 1.1005673237827027, + "grad_norm": 1.8192144632339478, + "learning_rate": 1.98344520499533e-05, + "loss": 0.7566, + "step": 6742 + }, + { + "epoch": 1.1007305824252072, + "grad_norm": 1.8048319816589355, + "learning_rate": 1.9834394542507138e-05, + "loss": 0.7487, + "step": 6743 + }, + { + "epoch": 1.1008938410677116, + "grad_norm": 2.1811366081237793, + "learning_rate": 1.9834337025157745e-05, + "loss": 0.7314, + "step": 6744 + }, + { + "epoch": 1.1010570997102158, + "grad_norm": 1.6925514936447144, + "learning_rate": 1.9834279497905177e-05, + "loss": 0.6426, + "step": 6745 + }, + { + "epoch": 1.1012203583527203, + "grad_norm": 1.8848170042037964, + "learning_rate": 1.983422196074949e-05, + "loss": 0.6891, + "step": 6746 + }, + { + "epoch": 1.1013836169952247, + "grad_norm": 1.9195700883865356, + "learning_rate": 1.9834164413690748e-05, + "loss": 0.794, + "step": 6747 + }, + { + "epoch": 1.1015468756377291, + "grad_norm": 2.074894428253174, + "learning_rate": 1.9834106856729e-05, + "loss": 0.8428, + "step": 6748 + }, + { + "epoch": 1.1017101342802333, + "grad_norm": 2.0318331718444824, + "learning_rate": 1.983404928986431e-05, + "loss": 0.8508, + "step": 6749 + }, + { + "epoch": 1.1018733929227378, + "grad_norm": 2.1480116844177246, + "learning_rate": 1.9833991713096742e-05, + "loss": 0.7183, + "step": 6750 + }, + { + "epoch": 1.1020366515652422, + "grad_norm": 2.1867852210998535, + "learning_rate": 1.9833934126426338e-05, + "loss": 0.7751, + "step": 6751 + }, + { + "epoch": 1.1021999102077467, + "grad_norm": 2.1268882751464844, + "learning_rate": 1.983387652985317e-05, + "loss": 0.7264, + "step": 6752 + }, + { + "epoch": 1.102363168850251, + "grad_norm": 1.6672708988189697, + "learning_rate": 1.9833818923377293e-05, + "loss": 0.7489, + "step": 6753 + }, + { + "epoch": 1.1025264274927553, + "grad_norm": 1.8561550378799438, + "learning_rate": 1.9833761306998757e-05, + "loss": 0.7351, + "step": 6754 + }, + { + "epoch": 1.1026896861352598, + "grad_norm": 1.5503875017166138, + "learning_rate": 1.983370368071763e-05, + "loss": 0.6543, + "step": 6755 + }, + { + "epoch": 1.1028529447777642, + "grad_norm": 2.2268285751342773, + "learning_rate": 1.9833646044533962e-05, + "loss": 0.756, + "step": 6756 + }, + { + "epoch": 1.1030162034202686, + "grad_norm": 1.775679349899292, + "learning_rate": 1.9833588398447822e-05, + "loss": 0.579, + "step": 6757 + }, + { + "epoch": 1.1031794620627728, + "grad_norm": 1.82843816280365, + "learning_rate": 1.9833530742459253e-05, + "loss": 0.6761, + "step": 6758 + }, + { + "epoch": 1.1033427207052773, + "grad_norm": 1.8779449462890625, + "learning_rate": 1.9833473076568328e-05, + "loss": 0.7181, + "step": 6759 + }, + { + "epoch": 1.1035059793477817, + "grad_norm": 2.030198574066162, + "learning_rate": 1.9833415400775092e-05, + "loss": 0.817, + "step": 6760 + }, + { + "epoch": 1.1036692379902862, + "grad_norm": 1.7051976919174194, + "learning_rate": 1.9833357715079615e-05, + "loss": 0.7074, + "step": 6761 + }, + { + "epoch": 1.1038324966327906, + "grad_norm": 1.8799012899398804, + "learning_rate": 1.9833300019481946e-05, + "loss": 0.6918, + "step": 6762 + }, + { + "epoch": 1.1039957552752948, + "grad_norm": 1.8130028247833252, + "learning_rate": 1.9833242313982147e-05, + "loss": 0.6648, + "step": 6763 + }, + { + "epoch": 1.1041590139177992, + "grad_norm": 1.9656734466552734, + "learning_rate": 1.983318459858028e-05, + "loss": 0.7135, + "step": 6764 + }, + { + "epoch": 1.1043222725603037, + "grad_norm": 1.6402291059494019, + "learning_rate": 1.9833126873276392e-05, + "loss": 0.8452, + "step": 6765 + }, + { + "epoch": 1.1044855312028081, + "grad_norm": 1.6347730159759521, + "learning_rate": 1.983306913807055e-05, + "loss": 0.627, + "step": 6766 + }, + { + "epoch": 1.1046487898453123, + "grad_norm": 1.9236478805541992, + "learning_rate": 1.983301139296281e-05, + "loss": 0.6807, + "step": 6767 + }, + { + "epoch": 1.1048120484878168, + "grad_norm": 2.181328773498535, + "learning_rate": 1.983295363795323e-05, + "loss": 0.8568, + "step": 6768 + }, + { + "epoch": 1.1049753071303212, + "grad_norm": 1.7246285676956177, + "learning_rate": 1.983289587304187e-05, + "loss": 0.5648, + "step": 6769 + }, + { + "epoch": 1.1051385657728257, + "grad_norm": 1.7487621307373047, + "learning_rate": 1.9832838098228786e-05, + "loss": 0.696, + "step": 6770 + }, + { + "epoch": 1.10530182441533, + "grad_norm": 1.9745725393295288, + "learning_rate": 1.9832780313514036e-05, + "loss": 0.753, + "step": 6771 + }, + { + "epoch": 1.1054650830578343, + "grad_norm": 1.7327139377593994, + "learning_rate": 1.983272251889768e-05, + "loss": 0.7353, + "step": 6772 + }, + { + "epoch": 1.1056283417003387, + "grad_norm": 1.7728241682052612, + "learning_rate": 1.9832664714379774e-05, + "loss": 0.7146, + "step": 6773 + }, + { + "epoch": 1.1057916003428432, + "grad_norm": 1.7190454006195068, + "learning_rate": 1.9832606899960377e-05, + "loss": 0.6562, + "step": 6774 + }, + { + "epoch": 1.1059548589853476, + "grad_norm": 1.6471067667007446, + "learning_rate": 1.983254907563955e-05, + "loss": 0.6505, + "step": 6775 + }, + { + "epoch": 1.1061181176278518, + "grad_norm": 1.693933129310608, + "learning_rate": 1.9832491241417345e-05, + "loss": 0.5357, + "step": 6776 + }, + { + "epoch": 1.1062813762703563, + "grad_norm": 1.8552799224853516, + "learning_rate": 1.9832433397293825e-05, + "loss": 0.7356, + "step": 6777 + }, + { + "epoch": 1.1064446349128607, + "grad_norm": 1.868455410003662, + "learning_rate": 1.9832375543269048e-05, + "loss": 0.6095, + "step": 6778 + }, + { + "epoch": 1.1066078935553652, + "grad_norm": 1.9555726051330566, + "learning_rate": 1.983231767934307e-05, + "loss": 0.8013, + "step": 6779 + }, + { + "epoch": 1.1067711521978696, + "grad_norm": 1.9471238851547241, + "learning_rate": 1.9832259805515954e-05, + "loss": 0.7786, + "step": 6780 + }, + { + "epoch": 1.1069344108403738, + "grad_norm": 2.1935322284698486, + "learning_rate": 1.983220192178775e-05, + "loss": 0.7145, + "step": 6781 + }, + { + "epoch": 1.1070976694828782, + "grad_norm": 1.7586286067962646, + "learning_rate": 1.9832144028158523e-05, + "loss": 0.5735, + "step": 6782 + }, + { + "epoch": 1.1072609281253827, + "grad_norm": 1.8207159042358398, + "learning_rate": 1.9832086124628333e-05, + "loss": 0.7068, + "step": 6783 + }, + { + "epoch": 1.1074241867678871, + "grad_norm": 1.9500941038131714, + "learning_rate": 1.983202821119723e-05, + "loss": 0.6724, + "step": 6784 + }, + { + "epoch": 1.1075874454103913, + "grad_norm": 1.7856190204620361, + "learning_rate": 1.983197028786528e-05, + "loss": 0.7329, + "step": 6785 + }, + { + "epoch": 1.1077507040528958, + "grad_norm": 1.9484974145889282, + "learning_rate": 1.9831912354632537e-05, + "loss": 0.9002, + "step": 6786 + }, + { + "epoch": 1.1079139626954002, + "grad_norm": 2.2160184383392334, + "learning_rate": 1.983185441149906e-05, + "loss": 0.7222, + "step": 6787 + }, + { + "epoch": 1.1080772213379046, + "grad_norm": 1.9722622632980347, + "learning_rate": 1.9831796458464915e-05, + "loss": 0.7333, + "step": 6788 + }, + { + "epoch": 1.1082404799804089, + "grad_norm": 1.6124356985092163, + "learning_rate": 1.9831738495530147e-05, + "loss": 0.6508, + "step": 6789 + }, + { + "epoch": 1.1084037386229133, + "grad_norm": 2.224576950073242, + "learning_rate": 1.9831680522694823e-05, + "loss": 1.2893, + "step": 6790 + }, + { + "epoch": 1.1085669972654177, + "grad_norm": 1.9055266380310059, + "learning_rate": 1.9831622539958996e-05, + "loss": 0.7919, + "step": 6791 + }, + { + "epoch": 1.1087302559079222, + "grad_norm": 1.8393490314483643, + "learning_rate": 1.9831564547322733e-05, + "loss": 0.7646, + "step": 6792 + }, + { + "epoch": 1.1088935145504264, + "grad_norm": 1.7349016666412354, + "learning_rate": 1.9831506544786087e-05, + "loss": 0.8456, + "step": 6793 + }, + { + "epoch": 1.1090567731929308, + "grad_norm": 1.774179220199585, + "learning_rate": 1.983144853234911e-05, + "loss": 0.7852, + "step": 6794 + }, + { + "epoch": 1.1092200318354353, + "grad_norm": 1.796922206878662, + "learning_rate": 1.9831390510011874e-05, + "loss": 0.7277, + "step": 6795 + }, + { + "epoch": 1.1093832904779397, + "grad_norm": 1.7811744213104248, + "learning_rate": 1.9831332477774428e-05, + "loss": 0.7266, + "step": 6796 + }, + { + "epoch": 1.1095465491204441, + "grad_norm": 1.5025882720947266, + "learning_rate": 1.983127443563683e-05, + "loss": 0.6427, + "step": 6797 + }, + { + "epoch": 1.1097098077629484, + "grad_norm": 1.9513026475906372, + "learning_rate": 1.9831216383599146e-05, + "loss": 0.8602, + "step": 6798 + }, + { + "epoch": 1.1098730664054528, + "grad_norm": 1.953722596168518, + "learning_rate": 1.9831158321661425e-05, + "loss": 0.7889, + "step": 6799 + }, + { + "epoch": 1.1100363250479572, + "grad_norm": 1.7625430822372437, + "learning_rate": 1.9831100249823732e-05, + "loss": 0.6398, + "step": 6800 + }, + { + "epoch": 1.1101995836904617, + "grad_norm": 1.7166260480880737, + "learning_rate": 1.9831042168086125e-05, + "loss": 0.6793, + "step": 6801 + }, + { + "epoch": 1.110362842332966, + "grad_norm": 1.7962063550949097, + "learning_rate": 1.983098407644866e-05, + "loss": 0.6801, + "step": 6802 + }, + { + "epoch": 1.1105261009754703, + "grad_norm": 1.418904185295105, + "learning_rate": 1.98309259749114e-05, + "loss": 0.5075, + "step": 6803 + }, + { + "epoch": 1.1106893596179748, + "grad_norm": 1.7665963172912598, + "learning_rate": 1.9830867863474395e-05, + "loss": 0.6998, + "step": 6804 + }, + { + "epoch": 1.1108526182604792, + "grad_norm": 2.005910634994507, + "learning_rate": 1.983080974213771e-05, + "loss": 0.7852, + "step": 6805 + }, + { + "epoch": 1.1110158769029836, + "grad_norm": 1.6989368200302124, + "learning_rate": 1.9830751610901404e-05, + "loss": 0.6165, + "step": 6806 + }, + { + "epoch": 1.1111791355454879, + "grad_norm": 1.8590635061264038, + "learning_rate": 1.9830693469765534e-05, + "loss": 0.7582, + "step": 6807 + }, + { + "epoch": 1.1113423941879923, + "grad_norm": 2.2276952266693115, + "learning_rate": 1.9830635318730155e-05, + "loss": 0.7856, + "step": 6808 + }, + { + "epoch": 1.1115056528304967, + "grad_norm": 1.7470234632492065, + "learning_rate": 1.9830577157795333e-05, + "loss": 0.5781, + "step": 6809 + }, + { + "epoch": 1.1116689114730012, + "grad_norm": 1.6071053743362427, + "learning_rate": 1.983051898696112e-05, + "loss": 0.6398, + "step": 6810 + }, + { + "epoch": 1.1118321701155054, + "grad_norm": 1.5759453773498535, + "learning_rate": 1.9830460806227575e-05, + "loss": 0.7185, + "step": 6811 + }, + { + "epoch": 1.1119954287580098, + "grad_norm": 1.892674446105957, + "learning_rate": 1.9830402615594765e-05, + "loss": 0.698, + "step": 6812 + }, + { + "epoch": 1.1121586874005143, + "grad_norm": 2.023233652114868, + "learning_rate": 1.9830344415062735e-05, + "loss": 0.7472, + "step": 6813 + }, + { + "epoch": 1.1123219460430187, + "grad_norm": 1.8775781393051147, + "learning_rate": 1.9830286204631556e-05, + "loss": 0.7512, + "step": 6814 + }, + { + "epoch": 1.1124852046855231, + "grad_norm": 2.041203498840332, + "learning_rate": 1.9830227984301276e-05, + "loss": 0.7348, + "step": 6815 + }, + { + "epoch": 1.1126484633280274, + "grad_norm": 1.5735390186309814, + "learning_rate": 1.9830169754071963e-05, + "loss": 0.7381, + "step": 6816 + }, + { + "epoch": 1.1128117219705318, + "grad_norm": 1.788425326347351, + "learning_rate": 1.9830111513943673e-05, + "loss": 0.7612, + "step": 6817 + }, + { + "epoch": 1.1129749806130362, + "grad_norm": 1.8437559604644775, + "learning_rate": 1.983005326391646e-05, + "loss": 0.78, + "step": 6818 + }, + { + "epoch": 1.1131382392555407, + "grad_norm": 1.8134492635726929, + "learning_rate": 1.9829995003990387e-05, + "loss": 0.7635, + "step": 6819 + }, + { + "epoch": 1.113301497898045, + "grad_norm": 1.636628270149231, + "learning_rate": 1.9829936734165512e-05, + "loss": 0.7704, + "step": 6820 + }, + { + "epoch": 1.1134647565405493, + "grad_norm": 1.6763769388198853, + "learning_rate": 1.982987845444189e-05, + "loss": 0.6359, + "step": 6821 + }, + { + "epoch": 1.1136280151830538, + "grad_norm": 1.7487847805023193, + "learning_rate": 1.982982016481959e-05, + "loss": 0.8518, + "step": 6822 + }, + { + "epoch": 1.1137912738255582, + "grad_norm": 2.1973586082458496, + "learning_rate": 1.9829761865298658e-05, + "loss": 0.7173, + "step": 6823 + }, + { + "epoch": 1.1139545324680626, + "grad_norm": 1.5976512432098389, + "learning_rate": 1.982970355587916e-05, + "loss": 0.5756, + "step": 6824 + }, + { + "epoch": 1.1141177911105669, + "grad_norm": 1.6528211832046509, + "learning_rate": 1.9829645236561154e-05, + "loss": 0.6819, + "step": 6825 + }, + { + "epoch": 1.1142810497530713, + "grad_norm": 1.726807713508606, + "learning_rate": 1.9829586907344697e-05, + "loss": 0.7371, + "step": 6826 + }, + { + "epoch": 1.1144443083955757, + "grad_norm": 1.6804795265197754, + "learning_rate": 1.982952856822985e-05, + "loss": 0.5118, + "step": 6827 + }, + { + "epoch": 1.1146075670380802, + "grad_norm": 1.7619603872299194, + "learning_rate": 1.982947021921667e-05, + "loss": 0.6357, + "step": 6828 + }, + { + "epoch": 1.1147708256805844, + "grad_norm": 1.529826045036316, + "learning_rate": 1.9829411860305215e-05, + "loss": 0.5842, + "step": 6829 + }, + { + "epoch": 1.1149340843230888, + "grad_norm": 1.8984102010726929, + "learning_rate": 1.9829353491495545e-05, + "loss": 0.7354, + "step": 6830 + }, + { + "epoch": 1.1150973429655933, + "grad_norm": 1.5339477062225342, + "learning_rate": 1.982929511278772e-05, + "loss": 0.6427, + "step": 6831 + }, + { + "epoch": 1.1152606016080977, + "grad_norm": 1.8459383249282837, + "learning_rate": 1.9829236724181794e-05, + "loss": 0.7263, + "step": 6832 + }, + { + "epoch": 1.115423860250602, + "grad_norm": 1.7333714962005615, + "learning_rate": 1.9829178325677832e-05, + "loss": 0.6706, + "step": 6833 + }, + { + "epoch": 1.1155871188931064, + "grad_norm": 1.841983437538147, + "learning_rate": 1.9829119917275887e-05, + "loss": 0.8563, + "step": 6834 + }, + { + "epoch": 1.1157503775356108, + "grad_norm": 1.87663733959198, + "learning_rate": 1.9829061498976026e-05, + "loss": 0.6598, + "step": 6835 + }, + { + "epoch": 1.1159136361781152, + "grad_norm": 1.8269145488739014, + "learning_rate": 1.9829003070778298e-05, + "loss": 0.6476, + "step": 6836 + }, + { + "epoch": 1.1160768948206197, + "grad_norm": 1.9679805040359497, + "learning_rate": 1.982894463268277e-05, + "loss": 0.7917, + "step": 6837 + }, + { + "epoch": 1.1162401534631239, + "grad_norm": 2.2173662185668945, + "learning_rate": 1.9828886184689494e-05, + "loss": 0.7781, + "step": 6838 + }, + { + "epoch": 1.1164034121056283, + "grad_norm": 2.1593503952026367, + "learning_rate": 1.9828827726798538e-05, + "loss": 0.8614, + "step": 6839 + }, + { + "epoch": 1.1165666707481328, + "grad_norm": 2.1898624897003174, + "learning_rate": 1.9828769259009947e-05, + "loss": 0.6893, + "step": 6840 + }, + { + "epoch": 1.1167299293906372, + "grad_norm": 1.8274704217910767, + "learning_rate": 1.9828710781323793e-05, + "loss": 0.6357, + "step": 6841 + }, + { + "epoch": 1.1168931880331414, + "grad_norm": 1.6828792095184326, + "learning_rate": 1.982865229374013e-05, + "loss": 0.5303, + "step": 6842 + }, + { + "epoch": 1.1170564466756459, + "grad_norm": 1.3662415742874146, + "learning_rate": 1.9828593796259013e-05, + "loss": 0.4519, + "step": 6843 + }, + { + "epoch": 1.1172197053181503, + "grad_norm": 1.6215028762817383, + "learning_rate": 1.982853528888051e-05, + "loss": 0.6232, + "step": 6844 + }, + { + "epoch": 1.1173829639606547, + "grad_norm": 1.80733323097229, + "learning_rate": 1.9828476771604673e-05, + "loss": 0.7416, + "step": 6845 + }, + { + "epoch": 1.117546222603159, + "grad_norm": 2.053267478942871, + "learning_rate": 1.982841824443156e-05, + "loss": 0.6888, + "step": 6846 + }, + { + "epoch": 1.1177094812456634, + "grad_norm": 2.2380573749542236, + "learning_rate": 1.9828359707361232e-05, + "loss": 0.8681, + "step": 6847 + }, + { + "epoch": 1.1178727398881678, + "grad_norm": 1.7616380453109741, + "learning_rate": 1.9828301160393753e-05, + "loss": 0.5855, + "step": 6848 + }, + { + "epoch": 1.1180359985306723, + "grad_norm": 2.110002040863037, + "learning_rate": 1.9828242603529175e-05, + "loss": 0.8064, + "step": 6849 + }, + { + "epoch": 1.1181992571731767, + "grad_norm": 1.8278428316116333, + "learning_rate": 1.9828184036767556e-05, + "loss": 0.6246, + "step": 6850 + }, + { + "epoch": 1.118362515815681, + "grad_norm": 1.9694550037384033, + "learning_rate": 1.9828125460108964e-05, + "loss": 0.6068, + "step": 6851 + }, + { + "epoch": 1.1185257744581854, + "grad_norm": 2.0617997646331787, + "learning_rate": 1.982806687355345e-05, + "loss": 0.6433, + "step": 6852 + }, + { + "epoch": 1.1186890331006898, + "grad_norm": 2.086246967315674, + "learning_rate": 1.9828008277101075e-05, + "loss": 0.8958, + "step": 6853 + }, + { + "epoch": 1.1188522917431942, + "grad_norm": 2.010230541229248, + "learning_rate": 1.9827949670751897e-05, + "loss": 0.7442, + "step": 6854 + }, + { + "epoch": 1.1190155503856984, + "grad_norm": 1.8758224248886108, + "learning_rate": 1.9827891054505976e-05, + "loss": 0.7477, + "step": 6855 + }, + { + "epoch": 1.1191788090282029, + "grad_norm": 2.02659010887146, + "learning_rate": 1.9827832428363373e-05, + "loss": 0.7538, + "step": 6856 + }, + { + "epoch": 1.1193420676707073, + "grad_norm": 2.0248045921325684, + "learning_rate": 1.9827773792324146e-05, + "loss": 0.8045, + "step": 6857 + }, + { + "epoch": 1.1195053263132118, + "grad_norm": 1.6483783721923828, + "learning_rate": 1.982771514638835e-05, + "loss": 0.6714, + "step": 6858 + }, + { + "epoch": 1.1196685849557162, + "grad_norm": 2.2637393474578857, + "learning_rate": 1.982765649055605e-05, + "loss": 0.898, + "step": 6859 + }, + { + "epoch": 1.1198318435982204, + "grad_norm": 2.0730416774749756, + "learning_rate": 1.9827597824827306e-05, + "loss": 0.7739, + "step": 6860 + }, + { + "epoch": 1.1199951022407248, + "grad_norm": 1.6173166036605835, + "learning_rate": 1.982753914920217e-05, + "loss": 0.5648, + "step": 6861 + }, + { + "epoch": 1.1201583608832293, + "grad_norm": 1.6421947479248047, + "learning_rate": 1.98274804636807e-05, + "loss": 0.6001, + "step": 6862 + }, + { + "epoch": 1.1203216195257337, + "grad_norm": 1.5277462005615234, + "learning_rate": 1.9827421768262966e-05, + "loss": 0.6189, + "step": 6863 + }, + { + "epoch": 1.120484878168238, + "grad_norm": 2.0833053588867188, + "learning_rate": 1.982736306294902e-05, + "loss": 0.7544, + "step": 6864 + }, + { + "epoch": 1.1206481368107424, + "grad_norm": 1.8574639558792114, + "learning_rate": 1.982730434773892e-05, + "loss": 0.5558, + "step": 6865 + }, + { + "epoch": 1.1208113954532468, + "grad_norm": 1.820709228515625, + "learning_rate": 1.982724562263273e-05, + "loss": 0.6608, + "step": 6866 + }, + { + "epoch": 1.1209746540957513, + "grad_norm": 1.5128601789474487, + "learning_rate": 1.9827186887630505e-05, + "loss": 0.5032, + "step": 6867 + }, + { + "epoch": 1.1211379127382557, + "grad_norm": 2.0339748859405518, + "learning_rate": 1.9827128142732304e-05, + "loss": 0.7245, + "step": 6868 + }, + { + "epoch": 1.12130117138076, + "grad_norm": 1.9182184934616089, + "learning_rate": 1.9827069387938187e-05, + "loss": 0.7249, + "step": 6869 + }, + { + "epoch": 1.1214644300232643, + "grad_norm": 1.9852222204208374, + "learning_rate": 1.9827010623248217e-05, + "loss": 0.7908, + "step": 6870 + }, + { + "epoch": 1.1216276886657688, + "grad_norm": 2.0301201343536377, + "learning_rate": 1.9826951848662447e-05, + "loss": 0.7463, + "step": 6871 + }, + { + "epoch": 1.1217909473082732, + "grad_norm": 2.267519235610962, + "learning_rate": 1.9826893064180942e-05, + "loss": 1.2568, + "step": 6872 + }, + { + "epoch": 1.1219542059507774, + "grad_norm": 2.128164529800415, + "learning_rate": 1.9826834269803756e-05, + "loss": 0.897, + "step": 6873 + }, + { + "epoch": 1.1221174645932819, + "grad_norm": 1.8943778276443481, + "learning_rate": 1.982677546553095e-05, + "loss": 0.6516, + "step": 6874 + }, + { + "epoch": 1.1222807232357863, + "grad_norm": 1.9815822839736938, + "learning_rate": 1.9826716651362585e-05, + "loss": 0.7964, + "step": 6875 + }, + { + "epoch": 1.1224439818782908, + "grad_norm": 1.4860695600509644, + "learning_rate": 1.982665782729872e-05, + "loss": 0.6318, + "step": 6876 + }, + { + "epoch": 1.122607240520795, + "grad_norm": 2.015270471572876, + "learning_rate": 1.9826598993339412e-05, + "loss": 0.7627, + "step": 6877 + }, + { + "epoch": 1.1227704991632994, + "grad_norm": 1.7218314409255981, + "learning_rate": 1.982654014948472e-05, + "loss": 0.5873, + "step": 6878 + }, + { + "epoch": 1.1229337578058038, + "grad_norm": 1.775505781173706, + "learning_rate": 1.9826481295734708e-05, + "loss": 0.6014, + "step": 6879 + }, + { + "epoch": 1.1230970164483083, + "grad_norm": 1.2582354545593262, + "learning_rate": 1.982642243208943e-05, + "loss": 0.4573, + "step": 6880 + }, + { + "epoch": 1.1232602750908127, + "grad_norm": 1.7087342739105225, + "learning_rate": 1.9826363558548947e-05, + "loss": 0.6562, + "step": 6881 + }, + { + "epoch": 1.123423533733317, + "grad_norm": 1.7182787656784058, + "learning_rate": 1.982630467511332e-05, + "loss": 0.6444, + "step": 6882 + }, + { + "epoch": 1.1235867923758214, + "grad_norm": 1.9483979940414429, + "learning_rate": 1.9826245781782604e-05, + "loss": 0.7652, + "step": 6883 + }, + { + "epoch": 1.1237500510183258, + "grad_norm": 2.0538694858551025, + "learning_rate": 1.9826186878556862e-05, + "loss": 0.8064, + "step": 6884 + }, + { + "epoch": 1.1239133096608303, + "grad_norm": 1.6622323989868164, + "learning_rate": 1.9826127965436153e-05, + "loss": 0.698, + "step": 6885 + }, + { + "epoch": 1.1240765683033345, + "grad_norm": 1.5758603811264038, + "learning_rate": 1.9826069042420537e-05, + "loss": 0.7471, + "step": 6886 + }, + { + "epoch": 1.124239826945839, + "grad_norm": 2.111633539199829, + "learning_rate": 1.982601010951007e-05, + "loss": 0.7788, + "step": 6887 + }, + { + "epoch": 1.1244030855883433, + "grad_norm": 1.8844319581985474, + "learning_rate": 1.9825951166704814e-05, + "loss": 0.7439, + "step": 6888 + }, + { + "epoch": 1.1245663442308478, + "grad_norm": 1.818794846534729, + "learning_rate": 1.982589221400483e-05, + "loss": 0.6433, + "step": 6889 + }, + { + "epoch": 1.124729602873352, + "grad_norm": 1.547567367553711, + "learning_rate": 1.9825833251410173e-05, + "loss": 0.6391, + "step": 6890 + }, + { + "epoch": 1.1248928615158564, + "grad_norm": 1.7218223810195923, + "learning_rate": 1.9825774278920904e-05, + "loss": 0.7365, + "step": 6891 + }, + { + "epoch": 1.1250561201583609, + "grad_norm": 1.700149655342102, + "learning_rate": 1.9825715296537083e-05, + "loss": 0.6649, + "step": 6892 + }, + { + "epoch": 1.1252193788008653, + "grad_norm": 1.910664439201355, + "learning_rate": 1.982565630425877e-05, + "loss": 0.6421, + "step": 6893 + }, + { + "epoch": 1.1253826374433697, + "grad_norm": 1.937506079673767, + "learning_rate": 1.9825597302086024e-05, + "loss": 0.6958, + "step": 6894 + }, + { + "epoch": 1.125545896085874, + "grad_norm": 1.831484079360962, + "learning_rate": 1.9825538290018903e-05, + "loss": 0.7641, + "step": 6895 + }, + { + "epoch": 1.1257091547283784, + "grad_norm": 1.9198724031448364, + "learning_rate": 1.982547926805747e-05, + "loss": 0.768, + "step": 6896 + }, + { + "epoch": 1.1258724133708828, + "grad_norm": 1.6930948495864868, + "learning_rate": 1.982542023620178e-05, + "loss": 0.6376, + "step": 6897 + }, + { + "epoch": 1.1260356720133873, + "grad_norm": 1.803470492362976, + "learning_rate": 1.9825361194451895e-05, + "loss": 0.7914, + "step": 6898 + }, + { + "epoch": 1.1261989306558915, + "grad_norm": 1.538540244102478, + "learning_rate": 1.982530214280787e-05, + "loss": 0.571, + "step": 6899 + }, + { + "epoch": 1.126362189298396, + "grad_norm": 1.7105906009674072, + "learning_rate": 1.9825243081269778e-05, + "loss": 0.6593, + "step": 6900 + }, + { + "epoch": 1.1265254479409004, + "grad_norm": 1.6625746488571167, + "learning_rate": 1.982518400983766e-05, + "loss": 0.6883, + "step": 6901 + }, + { + "epoch": 1.1266887065834048, + "grad_norm": 1.6381312608718872, + "learning_rate": 1.9825124928511588e-05, + "loss": 0.7135, + "step": 6902 + }, + { + "epoch": 1.1268519652259092, + "grad_norm": 1.7359404563903809, + "learning_rate": 1.9825065837291616e-05, + "loss": 0.7304, + "step": 6903 + }, + { + "epoch": 1.1270152238684135, + "grad_norm": 2.1568503379821777, + "learning_rate": 1.982500673617781e-05, + "loss": 0.7397, + "step": 6904 + }, + { + "epoch": 1.127178482510918, + "grad_norm": 1.876868486404419, + "learning_rate": 1.9824947625170216e-05, + "loss": 0.7381, + "step": 6905 + }, + { + "epoch": 1.1273417411534223, + "grad_norm": 1.8094137907028198, + "learning_rate": 1.982488850426891e-05, + "loss": 0.7562, + "step": 6906 + }, + { + "epoch": 1.1275049997959268, + "grad_norm": 1.7405377626419067, + "learning_rate": 1.9824829373473943e-05, + "loss": 0.6241, + "step": 6907 + }, + { + "epoch": 1.127668258438431, + "grad_norm": 1.9996654987335205, + "learning_rate": 1.982477023278537e-05, + "loss": 0.7316, + "step": 6908 + }, + { + "epoch": 1.1278315170809354, + "grad_norm": 2.032029867172241, + "learning_rate": 1.982471108220326e-05, + "loss": 0.6398, + "step": 6909 + }, + { + "epoch": 1.1279947757234399, + "grad_norm": 1.5801384449005127, + "learning_rate": 1.982465192172767e-05, + "loss": 0.5926, + "step": 6910 + }, + { + "epoch": 1.1281580343659443, + "grad_norm": 1.6958065032958984, + "learning_rate": 1.9824592751358656e-05, + "loss": 0.5564, + "step": 6911 + }, + { + "epoch": 1.1283212930084487, + "grad_norm": 1.8347259759902954, + "learning_rate": 1.9824533571096278e-05, + "loss": 0.6802, + "step": 6912 + }, + { + "epoch": 1.128484551650953, + "grad_norm": 1.6899250745773315, + "learning_rate": 1.9824474380940598e-05, + "loss": 0.5774, + "step": 6913 + }, + { + "epoch": 1.1286478102934574, + "grad_norm": 1.5960321426391602, + "learning_rate": 1.982441518089168e-05, + "loss": 0.6177, + "step": 6914 + }, + { + "epoch": 1.1288110689359618, + "grad_norm": 1.7834609746932983, + "learning_rate": 1.9824355970949574e-05, + "loss": 0.752, + "step": 6915 + }, + { + "epoch": 1.1289743275784663, + "grad_norm": 2.014601469039917, + "learning_rate": 1.9824296751114345e-05, + "loss": 0.8105, + "step": 6916 + }, + { + "epoch": 1.1291375862209705, + "grad_norm": 1.9496270418167114, + "learning_rate": 1.9824237521386052e-05, + "loss": 0.7297, + "step": 6917 + }, + { + "epoch": 1.129300844863475, + "grad_norm": 1.708036184310913, + "learning_rate": 1.9824178281764753e-05, + "loss": 0.6581, + "step": 6918 + }, + { + "epoch": 1.1294641035059794, + "grad_norm": 1.8401415348052979, + "learning_rate": 1.982411903225051e-05, + "loss": 0.6348, + "step": 6919 + }, + { + "epoch": 1.1296273621484838, + "grad_norm": 1.9307568073272705, + "learning_rate": 1.982405977284338e-05, + "loss": 0.7795, + "step": 6920 + }, + { + "epoch": 1.1297906207909882, + "grad_norm": 2.1299707889556885, + "learning_rate": 1.9824000503543427e-05, + "loss": 0.7177, + "step": 6921 + }, + { + "epoch": 1.1299538794334925, + "grad_norm": 1.9480713605880737, + "learning_rate": 1.982394122435071e-05, + "loss": 0.7472, + "step": 6922 + }, + { + "epoch": 1.130117138075997, + "grad_norm": 2.2438302040100098, + "learning_rate": 1.9823881935265283e-05, + "loss": 0.5738, + "step": 6923 + }, + { + "epoch": 1.1302803967185013, + "grad_norm": 1.8637434244155884, + "learning_rate": 1.982382263628721e-05, + "loss": 0.6903, + "step": 6924 + }, + { + "epoch": 1.1304436553610056, + "grad_norm": 1.7981104850769043, + "learning_rate": 1.982376332741655e-05, + "loss": 0.6454, + "step": 6925 + }, + { + "epoch": 1.13060691400351, + "grad_norm": 1.9311376810073853, + "learning_rate": 1.9823704008653365e-05, + "loss": 0.6819, + "step": 6926 + }, + { + "epoch": 1.1307701726460144, + "grad_norm": 1.835029125213623, + "learning_rate": 1.9823644679997713e-05, + "loss": 0.666, + "step": 6927 + }, + { + "epoch": 1.1309334312885189, + "grad_norm": 1.8951245546340942, + "learning_rate": 1.982358534144965e-05, + "loss": 0.6749, + "step": 6928 + }, + { + "epoch": 1.1310966899310233, + "grad_norm": 1.8287206888198853, + "learning_rate": 1.9823525993009243e-05, + "loss": 0.6901, + "step": 6929 + }, + { + "epoch": 1.1312599485735275, + "grad_norm": 2.0255959033966064, + "learning_rate": 1.9823466634676544e-05, + "loss": 0.69, + "step": 6930 + }, + { + "epoch": 1.131423207216032, + "grad_norm": 1.9068493843078613, + "learning_rate": 1.982340726645162e-05, + "loss": 0.6113, + "step": 6931 + }, + { + "epoch": 1.1315864658585364, + "grad_norm": 1.5973666906356812, + "learning_rate": 1.9823347888334527e-05, + "loss": 0.6205, + "step": 6932 + }, + { + "epoch": 1.1317497245010408, + "grad_norm": 2.0450289249420166, + "learning_rate": 1.9823288500325324e-05, + "loss": 0.7166, + "step": 6933 + }, + { + "epoch": 1.131912983143545, + "grad_norm": 2.0293185710906982, + "learning_rate": 1.9823229102424074e-05, + "loss": 0.7862, + "step": 6934 + }, + { + "epoch": 1.1320762417860495, + "grad_norm": 1.8720734119415283, + "learning_rate": 1.9823169694630834e-05, + "loss": 0.688, + "step": 6935 + }, + { + "epoch": 1.132239500428554, + "grad_norm": 2.0822865962982178, + "learning_rate": 1.9823110276945663e-05, + "loss": 0.6992, + "step": 6936 + }, + { + "epoch": 1.1324027590710584, + "grad_norm": 2.3082008361816406, + "learning_rate": 1.9823050849368624e-05, + "loss": 0.8385, + "step": 6937 + }, + { + "epoch": 1.1325660177135628, + "grad_norm": 1.9729644060134888, + "learning_rate": 1.9822991411899774e-05, + "loss": 0.7139, + "step": 6938 + }, + { + "epoch": 1.132729276356067, + "grad_norm": 1.766169548034668, + "learning_rate": 1.9822931964539176e-05, + "loss": 0.6878, + "step": 6939 + }, + { + "epoch": 1.1328925349985715, + "grad_norm": 1.7650502920150757, + "learning_rate": 1.982287250728689e-05, + "loss": 0.6276, + "step": 6940 + }, + { + "epoch": 1.133055793641076, + "grad_norm": 1.562251091003418, + "learning_rate": 1.982281304014297e-05, + "loss": 0.6002, + "step": 6941 + }, + { + "epoch": 1.1332190522835803, + "grad_norm": 1.899066686630249, + "learning_rate": 1.982275356310748e-05, + "loss": 0.6751, + "step": 6942 + }, + { + "epoch": 1.1333823109260845, + "grad_norm": 1.3933321237564087, + "learning_rate": 1.9822694076180486e-05, + "loss": 0.506, + "step": 6943 + }, + { + "epoch": 1.133545569568589, + "grad_norm": 1.668020248413086, + "learning_rate": 1.9822634579362034e-05, + "loss": 0.6858, + "step": 6944 + }, + { + "epoch": 1.1337088282110934, + "grad_norm": 2.193957805633545, + "learning_rate": 1.9822575072652195e-05, + "loss": 0.8343, + "step": 6945 + }, + { + "epoch": 1.1338720868535979, + "grad_norm": 2.141026020050049, + "learning_rate": 1.9822515556051024e-05, + "loss": 0.7218, + "step": 6946 + }, + { + "epoch": 1.1340353454961023, + "grad_norm": 2.370328664779663, + "learning_rate": 1.9822456029558582e-05, + "loss": 0.8837, + "step": 6947 + }, + { + "epoch": 1.1341986041386065, + "grad_norm": 1.751165509223938, + "learning_rate": 1.9822396493174933e-05, + "loss": 0.7019, + "step": 6948 + }, + { + "epoch": 1.134361862781111, + "grad_norm": 2.104771375656128, + "learning_rate": 1.982233694690013e-05, + "loss": 0.8495, + "step": 6949 + }, + { + "epoch": 1.1345251214236154, + "grad_norm": 1.864200472831726, + "learning_rate": 1.982227739073424e-05, + "loss": 0.7545, + "step": 6950 + }, + { + "epoch": 1.1346883800661198, + "grad_norm": 1.9873508214950562, + "learning_rate": 1.9822217824677313e-05, + "loss": 0.6727, + "step": 6951 + }, + { + "epoch": 1.134851638708624, + "grad_norm": 1.7002352476119995, + "learning_rate": 1.9822158248729422e-05, + "loss": 0.6552, + "step": 6952 + }, + { + "epoch": 1.1350148973511285, + "grad_norm": 1.9444044828414917, + "learning_rate": 1.9822098662890616e-05, + "loss": 0.7116, + "step": 6953 + }, + { + "epoch": 1.135178155993633, + "grad_norm": 1.8556920289993286, + "learning_rate": 1.9822039067160962e-05, + "loss": 0.6784, + "step": 6954 + }, + { + "epoch": 1.1353414146361374, + "grad_norm": 1.7590546607971191, + "learning_rate": 1.9821979461540515e-05, + "loss": 0.6829, + "step": 6955 + }, + { + "epoch": 1.1355046732786418, + "grad_norm": 1.465651512145996, + "learning_rate": 1.9821919846029338e-05, + "loss": 0.5867, + "step": 6956 + }, + { + "epoch": 1.135667931921146, + "grad_norm": 1.775283932685852, + "learning_rate": 1.982186022062749e-05, + "loss": 0.6874, + "step": 6957 + }, + { + "epoch": 1.1358311905636505, + "grad_norm": 1.9980140924453735, + "learning_rate": 1.982180058533503e-05, + "loss": 0.7226, + "step": 6958 + }, + { + "epoch": 1.1359944492061549, + "grad_norm": 1.4668790102005005, + "learning_rate": 1.9821740940152022e-05, + "loss": 0.6143, + "step": 6959 + }, + { + "epoch": 1.1361577078486593, + "grad_norm": 2.04335355758667, + "learning_rate": 1.9821681285078522e-05, + "loss": 0.7531, + "step": 6960 + }, + { + "epoch": 1.1363209664911635, + "grad_norm": 1.9430917501449585, + "learning_rate": 1.9821621620114594e-05, + "loss": 0.7174, + "step": 6961 + }, + { + "epoch": 1.136484225133668, + "grad_norm": 1.930712342262268, + "learning_rate": 1.9821561945260292e-05, + "loss": 0.7249, + "step": 6962 + }, + { + "epoch": 1.1366474837761724, + "grad_norm": 1.9937719106674194, + "learning_rate": 1.982150226051568e-05, + "loss": 0.6685, + "step": 6963 + }, + { + "epoch": 1.1368107424186769, + "grad_norm": 1.8762524127960205, + "learning_rate": 1.9821442565880823e-05, + "loss": 0.7068, + "step": 6964 + }, + { + "epoch": 1.1369740010611813, + "grad_norm": 1.6878684759140015, + "learning_rate": 1.982138286135577e-05, + "loss": 0.6735, + "step": 6965 + }, + { + "epoch": 1.1371372597036855, + "grad_norm": 1.6689066886901855, + "learning_rate": 1.982132314694059e-05, + "loss": 0.7429, + "step": 6966 + }, + { + "epoch": 1.13730051834619, + "grad_norm": 1.8950519561767578, + "learning_rate": 1.982126342263534e-05, + "loss": 0.749, + "step": 6967 + }, + { + "epoch": 1.1374637769886944, + "grad_norm": 1.7202976942062378, + "learning_rate": 1.982120368844008e-05, + "loss": 0.6911, + "step": 6968 + }, + { + "epoch": 1.1376270356311986, + "grad_norm": 1.565660834312439, + "learning_rate": 1.982114394435487e-05, + "loss": 0.6263, + "step": 6969 + }, + { + "epoch": 1.137790294273703, + "grad_norm": 1.8304948806762695, + "learning_rate": 1.982108419037977e-05, + "loss": 0.6627, + "step": 6970 + }, + { + "epoch": 1.1379535529162075, + "grad_norm": 1.6983762979507446, + "learning_rate": 1.9821024426514843e-05, + "loss": 0.6886, + "step": 6971 + }, + { + "epoch": 1.138116811558712, + "grad_norm": 1.974568486213684, + "learning_rate": 1.9820964652760147e-05, + "loss": 0.6252, + "step": 6972 + }, + { + "epoch": 1.1382800702012164, + "grad_norm": 1.5335415601730347, + "learning_rate": 1.982090486911574e-05, + "loss": 0.4782, + "step": 6973 + }, + { + "epoch": 1.1384433288437206, + "grad_norm": 2.145669460296631, + "learning_rate": 1.9820845075581686e-05, + "loss": 0.793, + "step": 6974 + }, + { + "epoch": 1.138606587486225, + "grad_norm": 1.9396700859069824, + "learning_rate": 1.9820785272158043e-05, + "loss": 0.8486, + "step": 6975 + }, + { + "epoch": 1.1387698461287294, + "grad_norm": 1.717542290687561, + "learning_rate": 1.9820725458844873e-05, + "loss": 0.6977, + "step": 6976 + }, + { + "epoch": 1.1389331047712339, + "grad_norm": 1.8559787273406982, + "learning_rate": 1.982066563564223e-05, + "loss": 0.6823, + "step": 6977 + }, + { + "epoch": 1.139096363413738, + "grad_norm": 1.7434706687927246, + "learning_rate": 1.9820605802550187e-05, + "loss": 0.6563, + "step": 6978 + }, + { + "epoch": 1.1392596220562425, + "grad_norm": 1.943930745124817, + "learning_rate": 1.9820545959568793e-05, + "loss": 0.87, + "step": 6979 + }, + { + "epoch": 1.139422880698747, + "grad_norm": 1.7395485639572144, + "learning_rate": 1.982048610669811e-05, + "loss": 0.6553, + "step": 6980 + }, + { + "epoch": 1.1395861393412514, + "grad_norm": 2.004188060760498, + "learning_rate": 1.9820426243938203e-05, + "loss": 0.7147, + "step": 6981 + }, + { + "epoch": 1.1397493979837559, + "grad_norm": 1.7561771869659424, + "learning_rate": 1.9820366371289128e-05, + "loss": 0.7855, + "step": 6982 + }, + { + "epoch": 1.13991265662626, + "grad_norm": 1.744246006011963, + "learning_rate": 1.9820306488750947e-05, + "loss": 0.7072, + "step": 6983 + }, + { + "epoch": 1.1400759152687645, + "grad_norm": 1.7740131616592407, + "learning_rate": 1.982024659632372e-05, + "loss": 0.6253, + "step": 6984 + }, + { + "epoch": 1.140239173911269, + "grad_norm": 2.2510316371917725, + "learning_rate": 1.9820186694007506e-05, + "loss": 0.6562, + "step": 6985 + }, + { + "epoch": 1.1404024325537734, + "grad_norm": 1.735775113105774, + "learning_rate": 1.9820126781802365e-05, + "loss": 0.6647, + "step": 6986 + }, + { + "epoch": 1.1405656911962776, + "grad_norm": 1.6217494010925293, + "learning_rate": 1.9820066859708366e-05, + "loss": 0.6782, + "step": 6987 + }, + { + "epoch": 1.140728949838782, + "grad_norm": 1.8319271802902222, + "learning_rate": 1.9820006927725558e-05, + "loss": 0.6714, + "step": 6988 + }, + { + "epoch": 1.1408922084812865, + "grad_norm": 1.9567840099334717, + "learning_rate": 1.9819946985854003e-05, + "loss": 0.5958, + "step": 6989 + }, + { + "epoch": 1.141055467123791, + "grad_norm": 1.9703575372695923, + "learning_rate": 1.9819887034093768e-05, + "loss": 0.715, + "step": 6990 + }, + { + "epoch": 1.1412187257662953, + "grad_norm": 2.015397310256958, + "learning_rate": 1.9819827072444905e-05, + "loss": 0.8536, + "step": 6991 + }, + { + "epoch": 1.1413819844087996, + "grad_norm": 2.131084442138672, + "learning_rate": 1.9819767100907485e-05, + "loss": 0.9671, + "step": 6992 + }, + { + "epoch": 1.141545243051304, + "grad_norm": 1.6590217351913452, + "learning_rate": 1.9819707119481558e-05, + "loss": 0.6252, + "step": 6993 + }, + { + "epoch": 1.1417085016938084, + "grad_norm": 1.8757551908493042, + "learning_rate": 1.981964712816719e-05, + "loss": 0.6873, + "step": 6994 + }, + { + "epoch": 1.1418717603363129, + "grad_norm": 1.8032195568084717, + "learning_rate": 1.981958712696444e-05, + "loss": 0.719, + "step": 6995 + }, + { + "epoch": 1.142035018978817, + "grad_norm": 1.5081963539123535, + "learning_rate": 1.9819527115873365e-05, + "loss": 0.6256, + "step": 6996 + }, + { + "epoch": 1.1421982776213215, + "grad_norm": 2.0163052082061768, + "learning_rate": 1.981946709489403e-05, + "loss": 0.768, + "step": 6997 + }, + { + "epoch": 1.142361536263826, + "grad_norm": 1.6024318933486938, + "learning_rate": 1.9819407064026497e-05, + "loss": 0.5547, + "step": 6998 + }, + { + "epoch": 1.1425247949063304, + "grad_norm": 1.6843394041061401, + "learning_rate": 1.9819347023270825e-05, + "loss": 0.6469, + "step": 6999 + }, + { + "epoch": 1.1426880535488348, + "grad_norm": 1.9185224771499634, + "learning_rate": 1.9819286972627066e-05, + "loss": 0.7551, + "step": 7000 + }, + { + "epoch": 1.142851312191339, + "grad_norm": 1.907623052597046, + "learning_rate": 1.9819226912095296e-05, + "loss": 0.7487, + "step": 7001 + }, + { + "epoch": 1.1430145708338435, + "grad_norm": 1.6331384181976318, + "learning_rate": 1.981916684167556e-05, + "loss": 0.6189, + "step": 7002 + }, + { + "epoch": 1.143177829476348, + "grad_norm": 1.7796825170516968, + "learning_rate": 1.981910676136793e-05, + "loss": 0.7001, + "step": 7003 + }, + { + "epoch": 1.1433410881188524, + "grad_norm": 1.947471261024475, + "learning_rate": 1.9819046671172462e-05, + "loss": 0.6548, + "step": 7004 + }, + { + "epoch": 1.1435043467613566, + "grad_norm": 1.510225534439087, + "learning_rate": 1.9818986571089213e-05, + "loss": 0.6269, + "step": 7005 + }, + { + "epoch": 1.143667605403861, + "grad_norm": 2.231996536254883, + "learning_rate": 1.9818926461118254e-05, + "loss": 0.6445, + "step": 7006 + }, + { + "epoch": 1.1438308640463655, + "grad_norm": 1.8167731761932373, + "learning_rate": 1.981886634125963e-05, + "loss": 0.746, + "step": 7007 + }, + { + "epoch": 1.14399412268887, + "grad_norm": 1.9367519617080688, + "learning_rate": 1.9818806211513414e-05, + "loss": 0.773, + "step": 7008 + }, + { + "epoch": 1.1441573813313743, + "grad_norm": 1.8689509630203247, + "learning_rate": 1.9818746071879666e-05, + "loss": 0.7417, + "step": 7009 + }, + { + "epoch": 1.1443206399738786, + "grad_norm": 1.78598153591156, + "learning_rate": 1.981868592235844e-05, + "loss": 0.6231, + "step": 7010 + }, + { + "epoch": 1.144483898616383, + "grad_norm": 2.1730446815490723, + "learning_rate": 1.98186257629498e-05, + "loss": 0.8011, + "step": 7011 + }, + { + "epoch": 1.1446471572588874, + "grad_norm": 1.4840019941329956, + "learning_rate": 1.981856559365381e-05, + "loss": 0.6577, + "step": 7012 + }, + { + "epoch": 1.1448104159013919, + "grad_norm": 1.9703127145767212, + "learning_rate": 1.981850541447052e-05, + "loss": 0.8455, + "step": 7013 + }, + { + "epoch": 1.144973674543896, + "grad_norm": 1.7421993017196655, + "learning_rate": 1.9818445225400004e-05, + "loss": 0.6875, + "step": 7014 + }, + { + "epoch": 1.1451369331864005, + "grad_norm": 1.710767388343811, + "learning_rate": 1.9818385026442314e-05, + "loss": 0.6877, + "step": 7015 + }, + { + "epoch": 1.145300191828905, + "grad_norm": 1.7985020875930786, + "learning_rate": 1.981832481759751e-05, + "loss": 0.6734, + "step": 7016 + }, + { + "epoch": 1.1454634504714094, + "grad_norm": 1.6702628135681152, + "learning_rate": 1.981826459886566e-05, + "loss": 0.6299, + "step": 7017 + }, + { + "epoch": 1.1456267091139136, + "grad_norm": 1.8287038803100586, + "learning_rate": 1.981820437024682e-05, + "loss": 0.6199, + "step": 7018 + }, + { + "epoch": 1.145789967756418, + "grad_norm": 1.996075987815857, + "learning_rate": 1.981814413174105e-05, + "loss": 0.7018, + "step": 7019 + }, + { + "epoch": 1.1459532263989225, + "grad_norm": 1.8047541379928589, + "learning_rate": 1.981808388334841e-05, + "loss": 0.6946, + "step": 7020 + }, + { + "epoch": 1.146116485041427, + "grad_norm": 1.895222783088684, + "learning_rate": 1.981802362506896e-05, + "loss": 0.7188, + "step": 7021 + }, + { + "epoch": 1.1462797436839312, + "grad_norm": 1.7218928337097168, + "learning_rate": 1.9817963356902768e-05, + "loss": 0.6843, + "step": 7022 + }, + { + "epoch": 1.1464430023264356, + "grad_norm": 2.0415101051330566, + "learning_rate": 1.9817903078849884e-05, + "loss": 0.706, + "step": 7023 + }, + { + "epoch": 1.14660626096894, + "grad_norm": 2.0164825916290283, + "learning_rate": 1.981784279091038e-05, + "loss": 0.7478, + "step": 7024 + }, + { + "epoch": 1.1467695196114445, + "grad_norm": 1.7963260412216187, + "learning_rate": 1.981778249308431e-05, + "loss": 0.7191, + "step": 7025 + }, + { + "epoch": 1.146932778253949, + "grad_norm": 1.9597198963165283, + "learning_rate": 1.9817722185371733e-05, + "loss": 0.6932, + "step": 7026 + }, + { + "epoch": 1.1470960368964531, + "grad_norm": 2.1291117668151855, + "learning_rate": 1.981766186777271e-05, + "loss": 0.828, + "step": 7027 + }, + { + "epoch": 1.1472592955389576, + "grad_norm": 1.851850152015686, + "learning_rate": 1.981760154028731e-05, + "loss": 0.6523, + "step": 7028 + }, + { + "epoch": 1.147422554181462, + "grad_norm": 1.885074496269226, + "learning_rate": 1.9817541202915586e-05, + "loss": 0.6985, + "step": 7029 + }, + { + "epoch": 1.1475858128239664, + "grad_norm": 1.7407574653625488, + "learning_rate": 1.98174808556576e-05, + "loss": 0.6714, + "step": 7030 + }, + { + "epoch": 1.1477490714664706, + "grad_norm": 2.3079099655151367, + "learning_rate": 1.981742049851341e-05, + "loss": 0.8315, + "step": 7031 + }, + { + "epoch": 1.147912330108975, + "grad_norm": 2.117048501968384, + "learning_rate": 1.9817360131483086e-05, + "loss": 0.8309, + "step": 7032 + }, + { + "epoch": 1.1480755887514795, + "grad_norm": 1.8486849069595337, + "learning_rate": 1.981729975456668e-05, + "loss": 0.7698, + "step": 7033 + }, + { + "epoch": 1.148238847393984, + "grad_norm": 1.8877975940704346, + "learning_rate": 1.9817239367764257e-05, + "loss": 0.7791, + "step": 7034 + }, + { + "epoch": 1.1484021060364884, + "grad_norm": 2.020747184753418, + "learning_rate": 1.981717897107587e-05, + "loss": 0.74, + "step": 7035 + }, + { + "epoch": 1.1485653646789926, + "grad_norm": 2.266770601272583, + "learning_rate": 1.9817118564501597e-05, + "loss": 0.7876, + "step": 7036 + }, + { + "epoch": 1.148728623321497, + "grad_norm": 1.847443699836731, + "learning_rate": 1.981705814804148e-05, + "loss": 0.7213, + "step": 7037 + }, + { + "epoch": 1.1488918819640015, + "grad_norm": 1.7767583131790161, + "learning_rate": 1.9816997721695593e-05, + "loss": 0.7806, + "step": 7038 + }, + { + "epoch": 1.149055140606506, + "grad_norm": 1.6770697832107544, + "learning_rate": 1.9816937285463992e-05, + "loss": 0.6639, + "step": 7039 + }, + { + "epoch": 1.1492183992490101, + "grad_norm": 1.8690392971038818, + "learning_rate": 1.9816876839346734e-05, + "loss": 0.8025, + "step": 7040 + }, + { + "epoch": 1.1493816578915146, + "grad_norm": 2.367093324661255, + "learning_rate": 1.9816816383343886e-05, + "loss": 0.607, + "step": 7041 + }, + { + "epoch": 1.149544916534019, + "grad_norm": 1.757157325744629, + "learning_rate": 1.9816755917455507e-05, + "loss": 0.663, + "step": 7042 + }, + { + "epoch": 1.1497081751765235, + "grad_norm": 1.756466269493103, + "learning_rate": 1.9816695441681653e-05, + "loss": 0.858, + "step": 7043 + }, + { + "epoch": 1.149871433819028, + "grad_norm": 1.6544139385223389, + "learning_rate": 1.9816634956022397e-05, + "loss": 0.7734, + "step": 7044 + }, + { + "epoch": 1.1500346924615321, + "grad_norm": 1.8360061645507812, + "learning_rate": 1.9816574460477788e-05, + "loss": 0.7147, + "step": 7045 + }, + { + "epoch": 1.1501979511040366, + "grad_norm": 1.7487989664077759, + "learning_rate": 1.9816513955047888e-05, + "loss": 0.6487, + "step": 7046 + }, + { + "epoch": 1.150361209746541, + "grad_norm": 2.075815439224243, + "learning_rate": 1.9816453439732764e-05, + "loss": 0.7866, + "step": 7047 + }, + { + "epoch": 1.1505244683890454, + "grad_norm": 1.8106496334075928, + "learning_rate": 1.9816392914532475e-05, + "loss": 0.7671, + "step": 7048 + }, + { + "epoch": 1.1506877270315496, + "grad_norm": 1.978122591972351, + "learning_rate": 1.981633237944708e-05, + "loss": 0.6661, + "step": 7049 + }, + { + "epoch": 1.150850985674054, + "grad_norm": 1.87139093875885, + "learning_rate": 1.9816271834476642e-05, + "loss": 0.7516, + "step": 7050 + }, + { + "epoch": 1.1510142443165585, + "grad_norm": 2.12522292137146, + "learning_rate": 1.981621127962122e-05, + "loss": 0.73, + "step": 7051 + }, + { + "epoch": 1.151177502959063, + "grad_norm": 1.7314891815185547, + "learning_rate": 1.9816150714880874e-05, + "loss": 0.7127, + "step": 7052 + }, + { + "epoch": 1.1513407616015674, + "grad_norm": 1.875504970550537, + "learning_rate": 1.9816090140255667e-05, + "loss": 0.7278, + "step": 7053 + }, + { + "epoch": 1.1515040202440716, + "grad_norm": 1.7297595739364624, + "learning_rate": 1.9816029555745663e-05, + "loss": 0.719, + "step": 7054 + }, + { + "epoch": 1.151667278886576, + "grad_norm": 1.7305926084518433, + "learning_rate": 1.9815968961350916e-05, + "loss": 0.7204, + "step": 7055 + }, + { + "epoch": 1.1518305375290805, + "grad_norm": 2.2047476768493652, + "learning_rate": 1.9815908357071496e-05, + "loss": 0.8031, + "step": 7056 + }, + { + "epoch": 1.151993796171585, + "grad_norm": 1.5716166496276855, + "learning_rate": 1.9815847742907458e-05, + "loss": 0.6744, + "step": 7057 + }, + { + "epoch": 1.1521570548140891, + "grad_norm": 1.672675609588623, + "learning_rate": 1.9815787118858857e-05, + "loss": 0.6443, + "step": 7058 + }, + { + "epoch": 1.1523203134565936, + "grad_norm": 1.9502196311950684, + "learning_rate": 1.9815726484925768e-05, + "loss": 0.7718, + "step": 7059 + }, + { + "epoch": 1.152483572099098, + "grad_norm": 1.7567880153656006, + "learning_rate": 1.981566584110824e-05, + "loss": 0.6525, + "step": 7060 + }, + { + "epoch": 1.1526468307416025, + "grad_norm": 1.9837549924850464, + "learning_rate": 1.9815605187406345e-05, + "loss": 0.7317, + "step": 7061 + }, + { + "epoch": 1.152810089384107, + "grad_norm": 1.8186602592468262, + "learning_rate": 1.9815544523820134e-05, + "loss": 0.7096, + "step": 7062 + }, + { + "epoch": 1.1529733480266111, + "grad_norm": 1.7165849208831787, + "learning_rate": 1.9815483850349675e-05, + "loss": 0.7509, + "step": 7063 + }, + { + "epoch": 1.1531366066691155, + "grad_norm": 1.9084129333496094, + "learning_rate": 1.9815423166995025e-05, + "loss": 0.7224, + "step": 7064 + }, + { + "epoch": 1.15329986531162, + "grad_norm": 2.2137844562530518, + "learning_rate": 1.9815362473756247e-05, + "loss": 0.8647, + "step": 7065 + }, + { + "epoch": 1.1534631239541242, + "grad_norm": 2.0009689331054688, + "learning_rate": 1.98153017706334e-05, + "loss": 0.7212, + "step": 7066 + }, + { + "epoch": 1.1536263825966286, + "grad_norm": 1.6575508117675781, + "learning_rate": 1.9815241057626547e-05, + "loss": 0.746, + "step": 7067 + }, + { + "epoch": 1.153789641239133, + "grad_norm": 1.8437013626098633, + "learning_rate": 1.981518033473575e-05, + "loss": 0.7092, + "step": 7068 + }, + { + "epoch": 1.1539528998816375, + "grad_norm": 1.8319963216781616, + "learning_rate": 1.981511960196107e-05, + "loss": 0.7405, + "step": 7069 + }, + { + "epoch": 1.154116158524142, + "grad_norm": 1.8586317300796509, + "learning_rate": 1.9815058859302563e-05, + "loss": 0.668, + "step": 7070 + }, + { + "epoch": 1.1542794171666462, + "grad_norm": 1.9352737665176392, + "learning_rate": 1.9814998106760297e-05, + "loss": 0.8728, + "step": 7071 + }, + { + "epoch": 1.1544426758091506, + "grad_norm": 1.8377000093460083, + "learning_rate": 1.981493734433433e-05, + "loss": 0.6248, + "step": 7072 + }, + { + "epoch": 1.154605934451655, + "grad_norm": 1.7349605560302734, + "learning_rate": 1.9814876572024727e-05, + "loss": 0.7057, + "step": 7073 + }, + { + "epoch": 1.1547691930941595, + "grad_norm": 2.0657670497894287, + "learning_rate": 1.981481578983154e-05, + "loss": 0.6793, + "step": 7074 + }, + { + "epoch": 1.1549324517366637, + "grad_norm": 1.988625168800354, + "learning_rate": 1.981475499775484e-05, + "loss": 0.7845, + "step": 7075 + }, + { + "epoch": 1.1550957103791681, + "grad_norm": 1.8430049419403076, + "learning_rate": 1.9814694195794683e-05, + "loss": 0.7203, + "step": 7076 + }, + { + "epoch": 1.1552589690216726, + "grad_norm": 1.4854589700698853, + "learning_rate": 1.9814633383951133e-05, + "loss": 0.5439, + "step": 7077 + }, + { + "epoch": 1.155422227664177, + "grad_norm": 1.7490549087524414, + "learning_rate": 1.981457256222425e-05, + "loss": 0.5914, + "step": 7078 + }, + { + "epoch": 1.1555854863066815, + "grad_norm": 2.2118566036224365, + "learning_rate": 1.981451173061409e-05, + "loss": 0.678, + "step": 7079 + }, + { + "epoch": 1.1557487449491857, + "grad_norm": 1.812168002128601, + "learning_rate": 1.9814450889120725e-05, + "loss": 0.6848, + "step": 7080 + }, + { + "epoch": 1.15591200359169, + "grad_norm": 1.582257628440857, + "learning_rate": 1.981439003774421e-05, + "loss": 0.7262, + "step": 7081 + }, + { + "epoch": 1.1560752622341945, + "grad_norm": 1.7081013917922974, + "learning_rate": 1.9814329176484604e-05, + "loss": 0.6169, + "step": 7082 + }, + { + "epoch": 1.156238520876699, + "grad_norm": 1.8678871393203735, + "learning_rate": 1.9814268305341974e-05, + "loss": 0.6994, + "step": 7083 + }, + { + "epoch": 1.1564017795192032, + "grad_norm": 1.8009268045425415, + "learning_rate": 1.9814207424316378e-05, + "loss": 0.6198, + "step": 7084 + }, + { + "epoch": 1.1565650381617076, + "grad_norm": 1.7349498271942139, + "learning_rate": 1.9814146533407875e-05, + "loss": 0.6545, + "step": 7085 + }, + { + "epoch": 1.156728296804212, + "grad_norm": 1.4686235189437866, + "learning_rate": 1.981408563261653e-05, + "loss": 0.5118, + "step": 7086 + }, + { + "epoch": 1.1568915554467165, + "grad_norm": 2.028930902481079, + "learning_rate": 1.9814024721942403e-05, + "loss": 0.7321, + "step": 7087 + }, + { + "epoch": 1.157054814089221, + "grad_norm": 1.8459935188293457, + "learning_rate": 1.9813963801385558e-05, + "loss": 0.6153, + "step": 7088 + }, + { + "epoch": 1.1572180727317252, + "grad_norm": 2.0305066108703613, + "learning_rate": 1.9813902870946055e-05, + "loss": 0.7784, + "step": 7089 + }, + { + "epoch": 1.1573813313742296, + "grad_norm": 1.7402966022491455, + "learning_rate": 1.981384193062395e-05, + "loss": 0.6681, + "step": 7090 + }, + { + "epoch": 1.157544590016734, + "grad_norm": 1.8188283443450928, + "learning_rate": 1.981378098041931e-05, + "loss": 0.6946, + "step": 7091 + }, + { + "epoch": 1.1577078486592385, + "grad_norm": 2.270312786102295, + "learning_rate": 1.98137200203322e-05, + "loss": 0.8649, + "step": 7092 + }, + { + "epoch": 1.1578711073017427, + "grad_norm": 1.8433678150177002, + "learning_rate": 1.981365905036267e-05, + "loss": 0.8585, + "step": 7093 + }, + { + "epoch": 1.1580343659442471, + "grad_norm": 2.6229636669158936, + "learning_rate": 1.981359807051079e-05, + "loss": 0.8618, + "step": 7094 + }, + { + "epoch": 1.1581976245867516, + "grad_norm": 2.3593556880950928, + "learning_rate": 1.981353708077662e-05, + "loss": 0.9448, + "step": 7095 + }, + { + "epoch": 1.158360883229256, + "grad_norm": 1.933457374572754, + "learning_rate": 1.981347608116022e-05, + "loss": 0.7786, + "step": 7096 + }, + { + "epoch": 1.1585241418717604, + "grad_norm": 1.6249339580535889, + "learning_rate": 1.9813415071661657e-05, + "loss": 0.5231, + "step": 7097 + }, + { + "epoch": 1.1586874005142647, + "grad_norm": 1.7684881687164307, + "learning_rate": 1.981335405228098e-05, + "loss": 0.6819, + "step": 7098 + }, + { + "epoch": 1.158850659156769, + "grad_norm": 2.005373239517212, + "learning_rate": 1.9813293023018266e-05, + "loss": 0.7291, + "step": 7099 + }, + { + "epoch": 1.1590139177992735, + "grad_norm": 1.5159815549850464, + "learning_rate": 1.9813231983873563e-05, + "loss": 0.6002, + "step": 7100 + }, + { + "epoch": 1.159177176441778, + "grad_norm": 1.6615958213806152, + "learning_rate": 1.9813170934846937e-05, + "loss": 0.6358, + "step": 7101 + }, + { + "epoch": 1.1593404350842822, + "grad_norm": 1.8222267627716064, + "learning_rate": 1.9813109875938455e-05, + "loss": 0.7597, + "step": 7102 + }, + { + "epoch": 1.1595036937267866, + "grad_norm": 1.5389870405197144, + "learning_rate": 1.981304880714817e-05, + "loss": 0.7253, + "step": 7103 + }, + { + "epoch": 1.159666952369291, + "grad_norm": 1.5359859466552734, + "learning_rate": 1.981298772847615e-05, + "loss": 0.7347, + "step": 7104 + }, + { + "epoch": 1.1598302110117955, + "grad_norm": 1.9134548902511597, + "learning_rate": 1.981292663992245e-05, + "loss": 0.7431, + "step": 7105 + }, + { + "epoch": 1.1599934696543, + "grad_norm": 1.759498119354248, + "learning_rate": 1.9812865541487142e-05, + "loss": 0.6685, + "step": 7106 + }, + { + "epoch": 1.1601567282968042, + "grad_norm": 1.4908102750778198, + "learning_rate": 1.9812804433170276e-05, + "loss": 0.5898, + "step": 7107 + }, + { + "epoch": 1.1603199869393086, + "grad_norm": 1.699192762374878, + "learning_rate": 1.981274331497192e-05, + "loss": 0.6301, + "step": 7108 + }, + { + "epoch": 1.160483245581813, + "grad_norm": 2.5019733905792236, + "learning_rate": 1.9812682186892136e-05, + "loss": 0.8335, + "step": 7109 + }, + { + "epoch": 1.1606465042243173, + "grad_norm": 1.8765603303909302, + "learning_rate": 1.981262104893098e-05, + "loss": 0.7594, + "step": 7110 + }, + { + "epoch": 1.1608097628668217, + "grad_norm": 1.539469838142395, + "learning_rate": 1.9812559901088518e-05, + "loss": 0.5843, + "step": 7111 + }, + { + "epoch": 1.1609730215093261, + "grad_norm": 2.524442434310913, + "learning_rate": 1.9812498743364814e-05, + "loss": 0.8364, + "step": 7112 + }, + { + "epoch": 1.1611362801518306, + "grad_norm": 1.8964955806732178, + "learning_rate": 1.9812437575759924e-05, + "loss": 0.7686, + "step": 7113 + }, + { + "epoch": 1.161299538794335, + "grad_norm": 1.8319218158721924, + "learning_rate": 1.9812376398273914e-05, + "loss": 0.7677, + "step": 7114 + }, + { + "epoch": 1.1614627974368392, + "grad_norm": 2.158236265182495, + "learning_rate": 1.981231521090684e-05, + "loss": 0.7319, + "step": 7115 + }, + { + "epoch": 1.1616260560793437, + "grad_norm": 2.2673230171203613, + "learning_rate": 1.981225401365877e-05, + "loss": 0.8539, + "step": 7116 + }, + { + "epoch": 1.161789314721848, + "grad_norm": 1.8791080713272095, + "learning_rate": 1.981219280652976e-05, + "loss": 0.7454, + "step": 7117 + }, + { + "epoch": 1.1619525733643525, + "grad_norm": 1.9308931827545166, + "learning_rate": 1.9812131589519876e-05, + "loss": 0.7057, + "step": 7118 + }, + { + "epoch": 1.1621158320068568, + "grad_norm": 1.5517247915267944, + "learning_rate": 1.981207036262918e-05, + "loss": 0.5228, + "step": 7119 + }, + { + "epoch": 1.1622790906493612, + "grad_norm": 1.773578405380249, + "learning_rate": 1.981200912585773e-05, + "loss": 0.6027, + "step": 7120 + }, + { + "epoch": 1.1624423492918656, + "grad_norm": 1.693326473236084, + "learning_rate": 1.981194787920559e-05, + "loss": 0.7101, + "step": 7121 + }, + { + "epoch": 1.16260560793437, + "grad_norm": 1.614210605621338, + "learning_rate": 1.981188662267282e-05, + "loss": 0.5765, + "step": 7122 + }, + { + "epoch": 1.1627688665768745, + "grad_norm": 1.8770637512207031, + "learning_rate": 1.9811825356259483e-05, + "loss": 0.6339, + "step": 7123 + }, + { + "epoch": 1.1629321252193787, + "grad_norm": 1.9384770393371582, + "learning_rate": 1.981176407996564e-05, + "loss": 0.6792, + "step": 7124 + }, + { + "epoch": 1.1630953838618832, + "grad_norm": 2.167144775390625, + "learning_rate": 1.9811702793791357e-05, + "loss": 0.6558, + "step": 7125 + }, + { + "epoch": 1.1632586425043876, + "grad_norm": 2.067514419555664, + "learning_rate": 1.9811641497736686e-05, + "loss": 0.8768, + "step": 7126 + }, + { + "epoch": 1.163421901146892, + "grad_norm": 1.8030084371566772, + "learning_rate": 1.9811580191801697e-05, + "loss": 0.6911, + "step": 7127 + }, + { + "epoch": 1.1635851597893963, + "grad_norm": 2.123981475830078, + "learning_rate": 1.9811518875986452e-05, + "loss": 0.6905, + "step": 7128 + }, + { + "epoch": 1.1637484184319007, + "grad_norm": 1.6889708042144775, + "learning_rate": 1.9811457550291008e-05, + "loss": 0.604, + "step": 7129 + }, + { + "epoch": 1.1639116770744051, + "grad_norm": 1.786833643913269, + "learning_rate": 1.981139621471543e-05, + "loss": 0.7208, + "step": 7130 + }, + { + "epoch": 1.1640749357169096, + "grad_norm": 2.3827736377716064, + "learning_rate": 1.9811334869259774e-05, + "loss": 0.688, + "step": 7131 + }, + { + "epoch": 1.164238194359414, + "grad_norm": 1.7948240041732788, + "learning_rate": 1.9811273513924112e-05, + "loss": 0.6452, + "step": 7132 + }, + { + "epoch": 1.1644014530019182, + "grad_norm": 1.8083299398422241, + "learning_rate": 1.9811212148708496e-05, + "loss": 0.6758, + "step": 7133 + }, + { + "epoch": 1.1645647116444227, + "grad_norm": 1.6124064922332764, + "learning_rate": 1.9811150773612996e-05, + "loss": 0.6251, + "step": 7134 + }, + { + "epoch": 1.164727970286927, + "grad_norm": 1.8798433542251587, + "learning_rate": 1.9811089388637667e-05, + "loss": 0.6362, + "step": 7135 + }, + { + "epoch": 1.1648912289294315, + "grad_norm": 1.766593098640442, + "learning_rate": 1.9811027993782575e-05, + "loss": 0.7453, + "step": 7136 + }, + { + "epoch": 1.1650544875719357, + "grad_norm": 1.8099079132080078, + "learning_rate": 1.981096658904778e-05, + "loss": 0.705, + "step": 7137 + }, + { + "epoch": 1.1652177462144402, + "grad_norm": 1.97365403175354, + "learning_rate": 1.981090517443334e-05, + "loss": 0.7224, + "step": 7138 + }, + { + "epoch": 1.1653810048569446, + "grad_norm": 1.5255546569824219, + "learning_rate": 1.9810843749939327e-05, + "loss": 0.614, + "step": 7139 + }, + { + "epoch": 1.165544263499449, + "grad_norm": 2.3167102336883545, + "learning_rate": 1.9810782315565795e-05, + "loss": 0.7847, + "step": 7140 + }, + { + "epoch": 1.1657075221419535, + "grad_norm": 1.9067007303237915, + "learning_rate": 1.9810720871312806e-05, + "loss": 0.5857, + "step": 7141 + }, + { + "epoch": 1.1658707807844577, + "grad_norm": 1.5282741785049438, + "learning_rate": 1.9810659417180428e-05, + "loss": 0.6213, + "step": 7142 + }, + { + "epoch": 1.1660340394269622, + "grad_norm": 1.8993463516235352, + "learning_rate": 1.9810597953168715e-05, + "loss": 0.7016, + "step": 7143 + }, + { + "epoch": 1.1661972980694666, + "grad_norm": 2.0074756145477295, + "learning_rate": 1.9810536479277735e-05, + "loss": 0.8246, + "step": 7144 + }, + { + "epoch": 1.166360556711971, + "grad_norm": 1.938750147819519, + "learning_rate": 1.9810474995507545e-05, + "loss": 0.7049, + "step": 7145 + }, + { + "epoch": 1.1665238153544752, + "grad_norm": 1.690386414527893, + "learning_rate": 1.981041350185821e-05, + "loss": 0.5837, + "step": 7146 + }, + { + "epoch": 1.1666870739969797, + "grad_norm": 1.9899473190307617, + "learning_rate": 1.981035199832979e-05, + "loss": 0.6591, + "step": 7147 + }, + { + "epoch": 1.1668503326394841, + "grad_norm": 1.6416652202606201, + "learning_rate": 1.9810290484922352e-05, + "loss": 0.7038, + "step": 7148 + }, + { + "epoch": 1.1670135912819886, + "grad_norm": 1.7029551267623901, + "learning_rate": 1.981022896163595e-05, + "loss": 0.6901, + "step": 7149 + }, + { + "epoch": 1.167176849924493, + "grad_norm": 1.770218014717102, + "learning_rate": 1.9810167428470653e-05, + "loss": 0.7067, + "step": 7150 + }, + { + "epoch": 1.1673401085669972, + "grad_norm": 1.8394699096679688, + "learning_rate": 1.9810105885426517e-05, + "loss": 0.627, + "step": 7151 + }, + { + "epoch": 1.1675033672095017, + "grad_norm": 1.7034099102020264, + "learning_rate": 1.9810044332503612e-05, + "loss": 0.6397, + "step": 7152 + }, + { + "epoch": 1.167666625852006, + "grad_norm": 1.5363147258758545, + "learning_rate": 1.980998276970199e-05, + "loss": 0.5937, + "step": 7153 + }, + { + "epoch": 1.1678298844945103, + "grad_norm": 1.547714352607727, + "learning_rate": 1.980992119702172e-05, + "loss": 0.5913, + "step": 7154 + }, + { + "epoch": 1.1679931431370147, + "grad_norm": 1.6826876401901245, + "learning_rate": 1.980985961446286e-05, + "loss": 0.61, + "step": 7155 + }, + { + "epoch": 1.1681564017795192, + "grad_norm": 2.023994207382202, + "learning_rate": 1.9809798022025475e-05, + "loss": 0.6641, + "step": 7156 + }, + { + "epoch": 1.1683196604220236, + "grad_norm": 1.753090739250183, + "learning_rate": 1.9809736419709626e-05, + "loss": 0.6604, + "step": 7157 + }, + { + "epoch": 1.168482919064528, + "grad_norm": 2.070065975189209, + "learning_rate": 1.9809674807515374e-05, + "loss": 0.7202, + "step": 7158 + }, + { + "epoch": 1.1686461777070323, + "grad_norm": 2.0827131271362305, + "learning_rate": 1.9809613185442783e-05, + "loss": 0.8166, + "step": 7159 + }, + { + "epoch": 1.1688094363495367, + "grad_norm": 1.8187520503997803, + "learning_rate": 1.9809551553491918e-05, + "loss": 0.614, + "step": 7160 + }, + { + "epoch": 1.1689726949920412, + "grad_norm": 1.7988108396530151, + "learning_rate": 1.9809489911662832e-05, + "loss": 0.5808, + "step": 7161 + }, + { + "epoch": 1.1691359536345456, + "grad_norm": 1.5943621397018433, + "learning_rate": 1.9809428259955594e-05, + "loss": 0.5103, + "step": 7162 + }, + { + "epoch": 1.1692992122770498, + "grad_norm": 2.5683858394622803, + "learning_rate": 1.9809366598370265e-05, + "loss": 0.8241, + "step": 7163 + }, + { + "epoch": 1.1694624709195542, + "grad_norm": 1.8795560598373413, + "learning_rate": 1.980930492690691e-05, + "loss": 0.7316, + "step": 7164 + }, + { + "epoch": 1.1696257295620587, + "grad_norm": 1.7260730266571045, + "learning_rate": 1.9809243245565583e-05, + "loss": 0.6325, + "step": 7165 + }, + { + "epoch": 1.1697889882045631, + "grad_norm": 2.002794027328491, + "learning_rate": 1.9809181554346348e-05, + "loss": 0.8228, + "step": 7166 + }, + { + "epoch": 1.1699522468470676, + "grad_norm": 1.757325291633606, + "learning_rate": 1.9809119853249273e-05, + "loss": 0.5626, + "step": 7167 + }, + { + "epoch": 1.1701155054895718, + "grad_norm": 2.499027967453003, + "learning_rate": 1.9809058142274415e-05, + "loss": 0.6445, + "step": 7168 + }, + { + "epoch": 1.1702787641320762, + "grad_norm": 1.6577476263046265, + "learning_rate": 1.980899642142184e-05, + "loss": 0.5466, + "step": 7169 + }, + { + "epoch": 1.1704420227745806, + "grad_norm": 1.8361775875091553, + "learning_rate": 1.980893469069161e-05, + "loss": 0.6365, + "step": 7170 + }, + { + "epoch": 1.170605281417085, + "grad_norm": 1.7803117036819458, + "learning_rate": 1.9808872950083785e-05, + "loss": 0.6641, + "step": 7171 + }, + { + "epoch": 1.1707685400595893, + "grad_norm": 1.928062081336975, + "learning_rate": 1.9808811199598424e-05, + "loss": 0.6656, + "step": 7172 + }, + { + "epoch": 1.1709317987020937, + "grad_norm": 1.6976220607757568, + "learning_rate": 1.9808749439235595e-05, + "loss": 0.6067, + "step": 7173 + }, + { + "epoch": 1.1710950573445982, + "grad_norm": 1.857807993888855, + "learning_rate": 1.9808687668995356e-05, + "loss": 0.6502, + "step": 7174 + }, + { + "epoch": 1.1712583159871026, + "grad_norm": 1.7393978834152222, + "learning_rate": 1.9808625888877775e-05, + "loss": 0.6222, + "step": 7175 + }, + { + "epoch": 1.171421574629607, + "grad_norm": 1.9331141710281372, + "learning_rate": 1.9808564098882908e-05, + "loss": 0.6521, + "step": 7176 + }, + { + "epoch": 1.1715848332721113, + "grad_norm": 1.793966293334961, + "learning_rate": 1.9808502299010817e-05, + "loss": 0.6389, + "step": 7177 + }, + { + "epoch": 1.1717480919146157, + "grad_norm": 1.8630450963974, + "learning_rate": 1.980844048926157e-05, + "loss": 0.7533, + "step": 7178 + }, + { + "epoch": 1.1719113505571201, + "grad_norm": 1.7854267358779907, + "learning_rate": 1.9808378669635227e-05, + "loss": 0.7015, + "step": 7179 + }, + { + "epoch": 1.1720746091996246, + "grad_norm": 1.8687618970870972, + "learning_rate": 1.9808316840131846e-05, + "loss": 0.6534, + "step": 7180 + }, + { + "epoch": 1.1722378678421288, + "grad_norm": 1.9814318418502808, + "learning_rate": 1.9808255000751496e-05, + "loss": 0.7849, + "step": 7181 + }, + { + "epoch": 1.1724011264846332, + "grad_norm": 1.961942195892334, + "learning_rate": 1.9808193151494233e-05, + "loss": 0.7385, + "step": 7182 + }, + { + "epoch": 1.1725643851271377, + "grad_norm": 1.8831777572631836, + "learning_rate": 1.9808131292360123e-05, + "loss": 0.6908, + "step": 7183 + }, + { + "epoch": 1.1727276437696421, + "grad_norm": 1.4788898229599, + "learning_rate": 1.9808069423349228e-05, + "loss": 0.5731, + "step": 7184 + }, + { + "epoch": 1.1728909024121466, + "grad_norm": 1.8709713220596313, + "learning_rate": 1.9808007544461608e-05, + "loss": 0.7836, + "step": 7185 + }, + { + "epoch": 1.1730541610546508, + "grad_norm": 1.7641911506652832, + "learning_rate": 1.980794565569733e-05, + "loss": 0.6387, + "step": 7186 + }, + { + "epoch": 1.1732174196971552, + "grad_norm": 1.7214722633361816, + "learning_rate": 1.980788375705645e-05, + "loss": 0.6267, + "step": 7187 + }, + { + "epoch": 1.1733806783396596, + "grad_norm": 1.7868555784225464, + "learning_rate": 1.9807821848539034e-05, + "loss": 0.7555, + "step": 7188 + }, + { + "epoch": 1.173543936982164, + "grad_norm": 1.7970112562179565, + "learning_rate": 1.9807759930145146e-05, + "loss": 0.7551, + "step": 7189 + }, + { + "epoch": 1.1737071956246683, + "grad_norm": 2.7307162284851074, + "learning_rate": 1.9807698001874848e-05, + "loss": 0.853, + "step": 7190 + }, + { + "epoch": 1.1738704542671727, + "grad_norm": 1.9283397197723389, + "learning_rate": 1.9807636063728196e-05, + "loss": 0.6791, + "step": 7191 + }, + { + "epoch": 1.1740337129096772, + "grad_norm": 1.6616342067718506, + "learning_rate": 1.980757411570526e-05, + "loss": 0.6663, + "step": 7192 + }, + { + "epoch": 1.1741969715521816, + "grad_norm": 1.6126534938812256, + "learning_rate": 1.98075121578061e-05, + "loss": 0.5349, + "step": 7193 + }, + { + "epoch": 1.174360230194686, + "grad_norm": 1.5531113147735596, + "learning_rate": 1.9807450190030777e-05, + "loss": 0.6795, + "step": 7194 + }, + { + "epoch": 1.1745234888371903, + "grad_norm": 2.0304970741271973, + "learning_rate": 1.9807388212379352e-05, + "loss": 0.7633, + "step": 7195 + }, + { + "epoch": 1.1746867474796947, + "grad_norm": 1.7407114505767822, + "learning_rate": 1.980732622485189e-05, + "loss": 0.7003, + "step": 7196 + }, + { + "epoch": 1.1748500061221991, + "grad_norm": 1.731769323348999, + "learning_rate": 1.9807264227448456e-05, + "loss": 0.7286, + "step": 7197 + }, + { + "epoch": 1.1750132647647034, + "grad_norm": 1.772623062133789, + "learning_rate": 1.980720222016911e-05, + "loss": 0.7105, + "step": 7198 + }, + { + "epoch": 1.1751765234072078, + "grad_norm": 1.680614948272705, + "learning_rate": 1.9807140203013914e-05, + "loss": 0.6188, + "step": 7199 + }, + { + "epoch": 1.1753397820497122, + "grad_norm": 1.6053986549377441, + "learning_rate": 1.9807078175982925e-05, + "loss": 0.6323, + "step": 7200 + }, + { + "epoch": 1.1755030406922167, + "grad_norm": 1.564581274986267, + "learning_rate": 1.9807016139076216e-05, + "loss": 0.5888, + "step": 7201 + }, + { + "epoch": 1.175666299334721, + "grad_norm": 1.6494383811950684, + "learning_rate": 1.980695409229384e-05, + "loss": 0.6808, + "step": 7202 + }, + { + "epoch": 1.1758295579772253, + "grad_norm": 2.1700117588043213, + "learning_rate": 1.980689203563587e-05, + "loss": 0.6326, + "step": 7203 + }, + { + "epoch": 1.1759928166197298, + "grad_norm": 2.066725015640259, + "learning_rate": 1.9806829969102356e-05, + "loss": 0.7514, + "step": 7204 + }, + { + "epoch": 1.1761560752622342, + "grad_norm": 1.9142897129058838, + "learning_rate": 1.980676789269337e-05, + "loss": 0.6411, + "step": 7205 + }, + { + "epoch": 1.1763193339047386, + "grad_norm": 1.920900821685791, + "learning_rate": 1.980670580640897e-05, + "loss": 0.7701, + "step": 7206 + }, + { + "epoch": 1.1764825925472429, + "grad_norm": 1.7631487846374512, + "learning_rate": 1.9806643710249224e-05, + "loss": 0.6583, + "step": 7207 + }, + { + "epoch": 1.1766458511897473, + "grad_norm": 2.0889523029327393, + "learning_rate": 1.9806581604214184e-05, + "loss": 0.7652, + "step": 7208 + }, + { + "epoch": 1.1768091098322517, + "grad_norm": 1.8101285696029663, + "learning_rate": 1.9806519488303926e-05, + "loss": 0.6677, + "step": 7209 + }, + { + "epoch": 1.1769723684747562, + "grad_norm": 2.164069414138794, + "learning_rate": 1.98064573625185e-05, + "loss": 0.9241, + "step": 7210 + }, + { + "epoch": 1.1771356271172606, + "grad_norm": 2.1131157875061035, + "learning_rate": 1.9806395226857975e-05, + "loss": 0.8211, + "step": 7211 + }, + { + "epoch": 1.1772988857597648, + "grad_norm": 1.6238783597946167, + "learning_rate": 1.9806333081322414e-05, + "loss": 0.6878, + "step": 7212 + }, + { + "epoch": 1.1774621444022693, + "grad_norm": 1.96669340133667, + "learning_rate": 1.980627092591188e-05, + "loss": 0.6012, + "step": 7213 + }, + { + "epoch": 1.1776254030447737, + "grad_norm": 1.8521089553833008, + "learning_rate": 1.980620876062643e-05, + "loss": 0.6935, + "step": 7214 + }, + { + "epoch": 1.1777886616872781, + "grad_norm": 2.254666328430176, + "learning_rate": 1.980614658546613e-05, + "loss": 0.6948, + "step": 7215 + }, + { + "epoch": 1.1779519203297824, + "grad_norm": 1.829430341720581, + "learning_rate": 1.9806084400431048e-05, + "loss": 0.6497, + "step": 7216 + }, + { + "epoch": 1.1781151789722868, + "grad_norm": 1.8784542083740234, + "learning_rate": 1.9806022205521235e-05, + "loss": 0.6555, + "step": 7217 + }, + { + "epoch": 1.1782784376147912, + "grad_norm": 1.8420171737670898, + "learning_rate": 1.9805960000736767e-05, + "loss": 0.6848, + "step": 7218 + }, + { + "epoch": 1.1784416962572957, + "grad_norm": 1.7938734292984009, + "learning_rate": 1.9805897786077693e-05, + "loss": 0.7145, + "step": 7219 + }, + { + "epoch": 1.1786049548998, + "grad_norm": 1.8854244947433472, + "learning_rate": 1.9805835561544086e-05, + "loss": 0.698, + "step": 7220 + }, + { + "epoch": 1.1787682135423043, + "grad_norm": 1.705085277557373, + "learning_rate": 1.9805773327136005e-05, + "loss": 0.7466, + "step": 7221 + }, + { + "epoch": 1.1789314721848088, + "grad_norm": 1.8277993202209473, + "learning_rate": 1.9805711082853513e-05, + "loss": 0.6877, + "step": 7222 + }, + { + "epoch": 1.1790947308273132, + "grad_norm": 1.8434219360351562, + "learning_rate": 1.9805648828696676e-05, + "loss": 0.6875, + "step": 7223 + }, + { + "epoch": 1.1792579894698176, + "grad_norm": 1.9891518354415894, + "learning_rate": 1.980558656466555e-05, + "loss": 0.7142, + "step": 7224 + }, + { + "epoch": 1.1794212481123219, + "grad_norm": 1.7292863130569458, + "learning_rate": 1.98055242907602e-05, + "loss": 0.6988, + "step": 7225 + }, + { + "epoch": 1.1795845067548263, + "grad_norm": 1.930353045463562, + "learning_rate": 1.9805462006980688e-05, + "loss": 0.7645, + "step": 7226 + }, + { + "epoch": 1.1797477653973307, + "grad_norm": 2.107757091522217, + "learning_rate": 1.980539971332708e-05, + "loss": 0.6784, + "step": 7227 + }, + { + "epoch": 1.1799110240398352, + "grad_norm": 1.8817429542541504, + "learning_rate": 1.9805337409799442e-05, + "loss": 0.7438, + "step": 7228 + }, + { + "epoch": 1.1800742826823396, + "grad_norm": 2.0765912532806396, + "learning_rate": 1.9805275096397828e-05, + "loss": 0.7681, + "step": 7229 + }, + { + "epoch": 1.1802375413248438, + "grad_norm": 1.9435827732086182, + "learning_rate": 1.9805212773122303e-05, + "loss": 0.619, + "step": 7230 + }, + { + "epoch": 1.1804007999673483, + "grad_norm": 1.7972595691680908, + "learning_rate": 1.9805150439972933e-05, + "loss": 0.658, + "step": 7231 + }, + { + "epoch": 1.1805640586098527, + "grad_norm": 1.7873631715774536, + "learning_rate": 1.9805088096949777e-05, + "loss": 0.6799, + "step": 7232 + }, + { + "epoch": 1.1807273172523571, + "grad_norm": 1.9250143766403198, + "learning_rate": 1.98050257440529e-05, + "loss": 0.7841, + "step": 7233 + }, + { + "epoch": 1.1808905758948613, + "grad_norm": 1.7598320245742798, + "learning_rate": 1.9804963381282367e-05, + "loss": 0.6523, + "step": 7234 + }, + { + "epoch": 1.1810538345373658, + "grad_norm": 2.061849355697632, + "learning_rate": 1.980490100863824e-05, + "loss": 0.8347, + "step": 7235 + }, + { + "epoch": 1.1812170931798702, + "grad_norm": 1.8773934841156006, + "learning_rate": 1.9804838626120576e-05, + "loss": 0.6891, + "step": 7236 + }, + { + "epoch": 1.1813803518223747, + "grad_norm": 1.5814512968063354, + "learning_rate": 1.9804776233729446e-05, + "loss": 0.6159, + "step": 7237 + }, + { + "epoch": 1.181543610464879, + "grad_norm": 1.5928398370742798, + "learning_rate": 1.9804713831464908e-05, + "loss": 0.5482, + "step": 7238 + }, + { + "epoch": 1.1817068691073833, + "grad_norm": 1.9759432077407837, + "learning_rate": 1.9804651419327025e-05, + "loss": 0.8435, + "step": 7239 + }, + { + "epoch": 1.1818701277498878, + "grad_norm": 1.6878821849822998, + "learning_rate": 1.9804588997315858e-05, + "loss": 0.6631, + "step": 7240 + }, + { + "epoch": 1.1820333863923922, + "grad_norm": 1.6341748237609863, + "learning_rate": 1.9804526565431478e-05, + "loss": 0.5929, + "step": 7241 + }, + { + "epoch": 1.1821966450348966, + "grad_norm": 1.8746899366378784, + "learning_rate": 1.980446412367394e-05, + "loss": 0.5766, + "step": 7242 + }, + { + "epoch": 1.1823599036774008, + "grad_norm": 1.624620795249939, + "learning_rate": 1.980440167204331e-05, + "loss": 0.5421, + "step": 7243 + }, + { + "epoch": 1.1825231623199053, + "grad_norm": 1.8365871906280518, + "learning_rate": 1.9804339210539644e-05, + "loss": 0.6809, + "step": 7244 + }, + { + "epoch": 1.1826864209624097, + "grad_norm": 1.7437423467636108, + "learning_rate": 1.9804276739163017e-05, + "loss": 0.761, + "step": 7245 + }, + { + "epoch": 1.1828496796049142, + "grad_norm": 1.7837105989456177, + "learning_rate": 1.9804214257913483e-05, + "loss": 0.6267, + "step": 7246 + }, + { + "epoch": 1.1830129382474184, + "grad_norm": 1.418367624282837, + "learning_rate": 1.980415176679111e-05, + "loss": 0.5915, + "step": 7247 + }, + { + "epoch": 1.1831761968899228, + "grad_norm": 2.2641959190368652, + "learning_rate": 1.980408926579596e-05, + "loss": 0.9123, + "step": 7248 + }, + { + "epoch": 1.1833394555324273, + "grad_norm": 2.065375804901123, + "learning_rate": 1.9804026754928092e-05, + "loss": 0.6492, + "step": 7249 + }, + { + "epoch": 1.1835027141749317, + "grad_norm": 1.9078385829925537, + "learning_rate": 1.980396423418757e-05, + "loss": 0.7211, + "step": 7250 + }, + { + "epoch": 1.183665972817436, + "grad_norm": 2.0276901721954346, + "learning_rate": 1.9803901703574465e-05, + "loss": 0.7703, + "step": 7251 + }, + { + "epoch": 1.1838292314599403, + "grad_norm": 2.091717004776001, + "learning_rate": 1.980383916308883e-05, + "loss": 0.8034, + "step": 7252 + }, + { + "epoch": 1.1839924901024448, + "grad_norm": 1.7219983339309692, + "learning_rate": 1.9803776612730728e-05, + "loss": 0.5072, + "step": 7253 + }, + { + "epoch": 1.1841557487449492, + "grad_norm": 1.8698267936706543, + "learning_rate": 1.980371405250023e-05, + "loss": 0.5958, + "step": 7254 + }, + { + "epoch": 1.1843190073874537, + "grad_norm": 1.6547167301177979, + "learning_rate": 1.9803651482397394e-05, + "loss": 0.6008, + "step": 7255 + }, + { + "epoch": 1.1844822660299579, + "grad_norm": 2.0773849487304688, + "learning_rate": 1.9803588902422283e-05, + "loss": 0.6719, + "step": 7256 + }, + { + "epoch": 1.1846455246724623, + "grad_norm": 2.084836721420288, + "learning_rate": 1.980352631257496e-05, + "loss": 0.7241, + "step": 7257 + }, + { + "epoch": 1.1848087833149668, + "grad_norm": 1.738813877105713, + "learning_rate": 1.980346371285549e-05, + "loss": 0.6673, + "step": 7258 + }, + { + "epoch": 1.1849720419574712, + "grad_norm": 1.6602576971054077, + "learning_rate": 1.980340110326393e-05, + "loss": 0.6159, + "step": 7259 + }, + { + "epoch": 1.1851353005999754, + "grad_norm": 2.4889349937438965, + "learning_rate": 1.9803338483800355e-05, + "loss": 0.7961, + "step": 7260 + }, + { + "epoch": 1.1852985592424798, + "grad_norm": 1.765291452407837, + "learning_rate": 1.9803275854464817e-05, + "loss": 0.5899, + "step": 7261 + }, + { + "epoch": 1.1854618178849843, + "grad_norm": 1.704529047012329, + "learning_rate": 1.9803213215257383e-05, + "loss": 0.607, + "step": 7262 + }, + { + "epoch": 1.1856250765274887, + "grad_norm": 1.74045729637146, + "learning_rate": 1.9803150566178117e-05, + "loss": 0.5892, + "step": 7263 + }, + { + "epoch": 1.1857883351699932, + "grad_norm": 3.506566047668457, + "learning_rate": 1.9803087907227077e-05, + "loss": 0.7507, + "step": 7264 + }, + { + "epoch": 1.1859515938124974, + "grad_norm": 2.0792431831359863, + "learning_rate": 1.9803025238404333e-05, + "loss": 0.8694, + "step": 7265 + }, + { + "epoch": 1.1861148524550018, + "grad_norm": 1.960637092590332, + "learning_rate": 1.980296255970995e-05, + "loss": 0.6969, + "step": 7266 + }, + { + "epoch": 1.1862781110975062, + "grad_norm": 1.9349961280822754, + "learning_rate": 1.9802899871143978e-05, + "loss": 0.7018, + "step": 7267 + }, + { + "epoch": 1.1864413697400107, + "grad_norm": 1.7124922275543213, + "learning_rate": 1.980283717270649e-05, + "loss": 0.6634, + "step": 7268 + }, + { + "epoch": 1.186604628382515, + "grad_norm": 2.0709786415100098, + "learning_rate": 1.980277446439755e-05, + "loss": 0.6693, + "step": 7269 + }, + { + "epoch": 1.1867678870250193, + "grad_norm": 1.8755772113800049, + "learning_rate": 1.9802711746217222e-05, + "loss": 0.6189, + "step": 7270 + }, + { + "epoch": 1.1869311456675238, + "grad_norm": 2.1060848236083984, + "learning_rate": 1.980264901816556e-05, + "loss": 0.6175, + "step": 7271 + }, + { + "epoch": 1.1870944043100282, + "grad_norm": 1.7640128135681152, + "learning_rate": 1.9802586280242634e-05, + "loss": 0.6187, + "step": 7272 + }, + { + "epoch": 1.1872576629525327, + "grad_norm": 1.568260669708252, + "learning_rate": 1.980252353244851e-05, + "loss": 0.6559, + "step": 7273 + }, + { + "epoch": 1.1874209215950369, + "grad_norm": 1.6387568712234497, + "learning_rate": 1.9802460774783242e-05, + "loss": 0.5489, + "step": 7274 + }, + { + "epoch": 1.1875841802375413, + "grad_norm": 1.8237985372543335, + "learning_rate": 1.9802398007246902e-05, + "loss": 0.6579, + "step": 7275 + }, + { + "epoch": 1.1877474388800457, + "grad_norm": 1.8865231275558472, + "learning_rate": 1.9802335229839552e-05, + "loss": 0.6619, + "step": 7276 + }, + { + "epoch": 1.1879106975225502, + "grad_norm": 1.8043243885040283, + "learning_rate": 1.980227244256125e-05, + "loss": 0.6802, + "step": 7277 + }, + { + "epoch": 1.1880739561650544, + "grad_norm": 1.956106424331665, + "learning_rate": 1.980220964541206e-05, + "loss": 0.6292, + "step": 7278 + }, + { + "epoch": 1.1882372148075588, + "grad_norm": 1.6089187860488892, + "learning_rate": 1.980214683839205e-05, + "loss": 0.5624, + "step": 7279 + }, + { + "epoch": 1.1884004734500633, + "grad_norm": 1.9813133478164673, + "learning_rate": 1.9802084021501282e-05, + "loss": 0.6202, + "step": 7280 + }, + { + "epoch": 1.1885637320925677, + "grad_norm": 2.019984006881714, + "learning_rate": 1.9802021194739815e-05, + "loss": 0.7063, + "step": 7281 + }, + { + "epoch": 1.1887269907350722, + "grad_norm": 1.9593933820724487, + "learning_rate": 1.9801958358107718e-05, + "loss": 0.625, + "step": 7282 + }, + { + "epoch": 1.1888902493775764, + "grad_norm": 1.495078206062317, + "learning_rate": 1.980189551160505e-05, + "loss": 0.5563, + "step": 7283 + }, + { + "epoch": 1.1890535080200808, + "grad_norm": 1.9710201025009155, + "learning_rate": 1.9801832655231876e-05, + "loss": 0.671, + "step": 7284 + }, + { + "epoch": 1.1892167666625852, + "grad_norm": 1.5401666164398193, + "learning_rate": 1.980176978898826e-05, + "loss": 0.5619, + "step": 7285 + }, + { + "epoch": 1.1893800253050897, + "grad_norm": 2.0315983295440674, + "learning_rate": 1.9801706912874262e-05, + "loss": 0.7018, + "step": 7286 + }, + { + "epoch": 1.189543283947594, + "grad_norm": 1.9702680110931396, + "learning_rate": 1.980164402688995e-05, + "loss": 0.6616, + "step": 7287 + }, + { + "epoch": 1.1897065425900983, + "grad_norm": 1.7691479921340942, + "learning_rate": 1.9801581131035385e-05, + "loss": 0.713, + "step": 7288 + }, + { + "epoch": 1.1898698012326028, + "grad_norm": 1.7871026992797852, + "learning_rate": 1.980151822531063e-05, + "loss": 0.6984, + "step": 7289 + }, + { + "epoch": 1.1900330598751072, + "grad_norm": 1.8511483669281006, + "learning_rate": 1.9801455309715748e-05, + "loss": 0.6633, + "step": 7290 + }, + { + "epoch": 1.1901963185176114, + "grad_norm": 2.142162799835205, + "learning_rate": 1.9801392384250806e-05, + "loss": 0.8227, + "step": 7291 + }, + { + "epoch": 1.1903595771601159, + "grad_norm": 1.9787348508834839, + "learning_rate": 1.9801329448915863e-05, + "loss": 0.7722, + "step": 7292 + }, + { + "epoch": 1.1905228358026203, + "grad_norm": 2.010305166244507, + "learning_rate": 1.980126650371098e-05, + "loss": 0.7682, + "step": 7293 + }, + { + "epoch": 1.1906860944451247, + "grad_norm": 1.6822975873947144, + "learning_rate": 1.980120354863623e-05, + "loss": 0.6293, + "step": 7294 + }, + { + "epoch": 1.190849353087629, + "grad_norm": 1.685307502746582, + "learning_rate": 1.9801140583691666e-05, + "loss": 0.6219, + "step": 7295 + }, + { + "epoch": 1.1910126117301334, + "grad_norm": 1.659957766532898, + "learning_rate": 1.980107760887736e-05, + "loss": 0.5978, + "step": 7296 + }, + { + "epoch": 1.1911758703726378, + "grad_norm": 1.998290777206421, + "learning_rate": 1.980101462419337e-05, + "loss": 0.6747, + "step": 7297 + }, + { + "epoch": 1.1913391290151423, + "grad_norm": 1.5270265340805054, + "learning_rate": 1.9800951629639758e-05, + "loss": 0.5715, + "step": 7298 + }, + { + "epoch": 1.1915023876576467, + "grad_norm": 2.379990577697754, + "learning_rate": 1.9800888625216593e-05, + "loss": 0.6822, + "step": 7299 + }, + { + "epoch": 1.191665646300151, + "grad_norm": 1.9211238622665405, + "learning_rate": 1.9800825610923937e-05, + "loss": 0.7268, + "step": 7300 + }, + { + "epoch": 1.1918289049426554, + "grad_norm": 2.146958589553833, + "learning_rate": 1.9800762586761852e-05, + "loss": 0.7994, + "step": 7301 + }, + { + "epoch": 1.1919921635851598, + "grad_norm": 1.6015897989273071, + "learning_rate": 1.9800699552730397e-05, + "loss": 0.677, + "step": 7302 + }, + { + "epoch": 1.1921554222276642, + "grad_norm": 1.5932108163833618, + "learning_rate": 1.9800636508829646e-05, + "loss": 0.5979, + "step": 7303 + }, + { + "epoch": 1.1923186808701685, + "grad_norm": 2.099510431289673, + "learning_rate": 1.9800573455059653e-05, + "loss": 0.8138, + "step": 7304 + }, + { + "epoch": 1.192481939512673, + "grad_norm": 1.926492691040039, + "learning_rate": 1.9800510391420483e-05, + "loss": 0.7524, + "step": 7305 + }, + { + "epoch": 1.1926451981551773, + "grad_norm": 1.57281494140625, + "learning_rate": 1.9800447317912207e-05, + "loss": 0.571, + "step": 7306 + }, + { + "epoch": 1.1928084567976818, + "grad_norm": 1.8618874549865723, + "learning_rate": 1.9800384234534882e-05, + "loss": 0.8058, + "step": 7307 + }, + { + "epoch": 1.1929717154401862, + "grad_norm": 1.6998305320739746, + "learning_rate": 1.9800321141288573e-05, + "loss": 0.6863, + "step": 7308 + }, + { + "epoch": 1.1931349740826904, + "grad_norm": 1.6840764284133911, + "learning_rate": 1.980025803817334e-05, + "loss": 0.7451, + "step": 7309 + }, + { + "epoch": 1.1932982327251949, + "grad_norm": 1.5975855588912964, + "learning_rate": 1.980019492518925e-05, + "loss": 0.6996, + "step": 7310 + }, + { + "epoch": 1.1934614913676993, + "grad_norm": 2.0803775787353516, + "learning_rate": 1.9800131802336372e-05, + "loss": 0.6648, + "step": 7311 + }, + { + "epoch": 1.1936247500102037, + "grad_norm": 1.947798728942871, + "learning_rate": 1.9800068669614757e-05, + "loss": 0.7646, + "step": 7312 + }, + { + "epoch": 1.193788008652708, + "grad_norm": 1.6884888410568237, + "learning_rate": 1.980000552702448e-05, + "loss": 0.7267, + "step": 7313 + }, + { + "epoch": 1.1939512672952124, + "grad_norm": 1.6648905277252197, + "learning_rate": 1.9799942374565597e-05, + "loss": 0.6768, + "step": 7314 + }, + { + "epoch": 1.1941145259377168, + "grad_norm": 1.9926567077636719, + "learning_rate": 1.9799879212238175e-05, + "loss": 0.8388, + "step": 7315 + }, + { + "epoch": 1.1942777845802213, + "grad_norm": 1.9007270336151123, + "learning_rate": 1.979981604004228e-05, + "loss": 0.742, + "step": 7316 + }, + { + "epoch": 1.1944410432227257, + "grad_norm": 1.920228362083435, + "learning_rate": 1.979975285797797e-05, + "loss": 0.6888, + "step": 7317 + }, + { + "epoch": 1.19460430186523, + "grad_norm": 1.8172974586486816, + "learning_rate": 1.9799689666045313e-05, + "loss": 0.8582, + "step": 7318 + }, + { + "epoch": 1.1947675605077344, + "grad_norm": 1.8705157041549683, + "learning_rate": 1.979962646424437e-05, + "loss": 0.7773, + "step": 7319 + }, + { + "epoch": 1.1949308191502388, + "grad_norm": 1.8717389106750488, + "learning_rate": 1.9799563252575208e-05, + "loss": 0.6498, + "step": 7320 + }, + { + "epoch": 1.1950940777927432, + "grad_norm": 1.9353580474853516, + "learning_rate": 1.9799500031037885e-05, + "loss": 0.6992, + "step": 7321 + }, + { + "epoch": 1.1952573364352475, + "grad_norm": 2.026383876800537, + "learning_rate": 1.979943679963247e-05, + "loss": 0.7626, + "step": 7322 + }, + { + "epoch": 1.195420595077752, + "grad_norm": 1.6690312623977661, + "learning_rate": 1.9799373558359025e-05, + "loss": 0.6714, + "step": 7323 + }, + { + "epoch": 1.1955838537202563, + "grad_norm": 1.6994627714157104, + "learning_rate": 1.9799310307217613e-05, + "loss": 0.6626, + "step": 7324 + }, + { + "epoch": 1.1957471123627608, + "grad_norm": 2.0497868061065674, + "learning_rate": 1.9799247046208297e-05, + "loss": 0.5524, + "step": 7325 + }, + { + "epoch": 1.1959103710052652, + "grad_norm": 1.9385045766830444, + "learning_rate": 1.9799183775331143e-05, + "loss": 0.8281, + "step": 7326 + }, + { + "epoch": 1.1960736296477694, + "grad_norm": 1.5171760320663452, + "learning_rate": 1.9799120494586214e-05, + "loss": 0.6417, + "step": 7327 + }, + { + "epoch": 1.1962368882902739, + "grad_norm": 2.231236457824707, + "learning_rate": 1.9799057203973574e-05, + "loss": 0.7207, + "step": 7328 + }, + { + "epoch": 1.1964001469327783, + "grad_norm": 1.6312599182128906, + "learning_rate": 1.9798993903493282e-05, + "loss": 0.7419, + "step": 7329 + }, + { + "epoch": 1.1965634055752827, + "grad_norm": 1.8975082635879517, + "learning_rate": 1.9798930593145412e-05, + "loss": 0.7752, + "step": 7330 + }, + { + "epoch": 1.196726664217787, + "grad_norm": 1.7579262256622314, + "learning_rate": 1.9798867272930015e-05, + "loss": 0.695, + "step": 7331 + }, + { + "epoch": 1.1968899228602914, + "grad_norm": 1.8946683406829834, + "learning_rate": 1.9798803942847165e-05, + "loss": 0.8183, + "step": 7332 + }, + { + "epoch": 1.1970531815027958, + "grad_norm": 1.6872467994689941, + "learning_rate": 1.9798740602896924e-05, + "loss": 0.5919, + "step": 7333 + }, + { + "epoch": 1.1972164401453003, + "grad_norm": 1.8607194423675537, + "learning_rate": 1.9798677253079348e-05, + "loss": 0.742, + "step": 7334 + }, + { + "epoch": 1.1973796987878047, + "grad_norm": 1.941464900970459, + "learning_rate": 1.9798613893394512e-05, + "loss": 0.8724, + "step": 7335 + }, + { + "epoch": 1.197542957430309, + "grad_norm": 1.8272013664245605, + "learning_rate": 1.979855052384247e-05, + "loss": 0.6446, + "step": 7336 + }, + { + "epoch": 1.1977062160728134, + "grad_norm": 1.8992843627929688, + "learning_rate": 1.9798487144423293e-05, + "loss": 0.6438, + "step": 7337 + }, + { + "epoch": 1.1978694747153178, + "grad_norm": 1.8274142742156982, + "learning_rate": 1.9798423755137038e-05, + "loss": 0.7506, + "step": 7338 + }, + { + "epoch": 1.198032733357822, + "grad_norm": 1.6475982666015625, + "learning_rate": 1.9798360355983777e-05, + "loss": 0.591, + "step": 7339 + }, + { + "epoch": 1.1981959920003264, + "grad_norm": 1.7964210510253906, + "learning_rate": 1.9798296946963572e-05, + "loss": 0.6687, + "step": 7340 + }, + { + "epoch": 1.1983592506428309, + "grad_norm": 1.7517529726028442, + "learning_rate": 1.979823352807648e-05, + "loss": 0.7453, + "step": 7341 + }, + { + "epoch": 1.1985225092853353, + "grad_norm": 1.6476244926452637, + "learning_rate": 1.9798170099322573e-05, + "loss": 0.6486, + "step": 7342 + }, + { + "epoch": 1.1986857679278398, + "grad_norm": 1.8606644868850708, + "learning_rate": 1.9798106660701907e-05, + "loss": 0.7362, + "step": 7343 + }, + { + "epoch": 1.198849026570344, + "grad_norm": 1.2816218137741089, + "learning_rate": 1.9798043212214554e-05, + "loss": 0.5284, + "step": 7344 + }, + { + "epoch": 1.1990122852128484, + "grad_norm": 1.9276995658874512, + "learning_rate": 1.979797975386057e-05, + "loss": 0.7632, + "step": 7345 + }, + { + "epoch": 1.1991755438553529, + "grad_norm": 1.9830139875411987, + "learning_rate": 1.9797916285640028e-05, + "loss": 0.6872, + "step": 7346 + }, + { + "epoch": 1.1993388024978573, + "grad_norm": 2.1543588638305664, + "learning_rate": 1.9797852807552983e-05, + "loss": 0.7702, + "step": 7347 + }, + { + "epoch": 1.1995020611403615, + "grad_norm": 1.978318452835083, + "learning_rate": 1.9797789319599506e-05, + "loss": 0.8007, + "step": 7348 + }, + { + "epoch": 1.199665319782866, + "grad_norm": 1.8143136501312256, + "learning_rate": 1.9797725821779656e-05, + "loss": 0.6891, + "step": 7349 + }, + { + "epoch": 1.1998285784253704, + "grad_norm": 1.9532885551452637, + "learning_rate": 1.9797662314093496e-05, + "loss": 0.6453, + "step": 7350 + }, + { + "epoch": 1.1999918370678748, + "grad_norm": 2.064199209213257, + "learning_rate": 1.9797598796541096e-05, + "loss": 0.8316, + "step": 7351 + }, + { + "epoch": 1.2001550957103793, + "grad_norm": 1.9671635627746582, + "learning_rate": 1.9797535269122517e-05, + "loss": 0.8231, + "step": 7352 + }, + { + "epoch": 1.2003183543528835, + "grad_norm": 1.6491751670837402, + "learning_rate": 1.9797471731837824e-05, + "loss": 0.5953, + "step": 7353 + }, + { + "epoch": 1.200481612995388, + "grad_norm": 2.0630359649658203, + "learning_rate": 1.9797408184687074e-05, + "loss": 0.7362, + "step": 7354 + }, + { + "epoch": 1.2006448716378924, + "grad_norm": 1.8140848875045776, + "learning_rate": 1.9797344627670338e-05, + "loss": 0.7408, + "step": 7355 + }, + { + "epoch": 1.2008081302803968, + "grad_norm": 2.2212843894958496, + "learning_rate": 1.9797281060787684e-05, + "loss": 0.7596, + "step": 7356 + }, + { + "epoch": 1.200971388922901, + "grad_norm": 1.7277783155441284, + "learning_rate": 1.9797217484039164e-05, + "loss": 0.6246, + "step": 7357 + }, + { + "epoch": 1.2011346475654054, + "grad_norm": 1.7584041357040405, + "learning_rate": 1.9797153897424854e-05, + "loss": 0.6868, + "step": 7358 + }, + { + "epoch": 1.2012979062079099, + "grad_norm": 1.5274354219436646, + "learning_rate": 1.979709030094481e-05, + "loss": 0.5934, + "step": 7359 + }, + { + "epoch": 1.2014611648504143, + "grad_norm": 2.0911166667938232, + "learning_rate": 1.9797026694599097e-05, + "loss": 0.7944, + "step": 7360 + }, + { + "epoch": 1.2016244234929188, + "grad_norm": 1.8187841176986694, + "learning_rate": 1.9796963078387782e-05, + "loss": 0.5665, + "step": 7361 + }, + { + "epoch": 1.201787682135423, + "grad_norm": 1.5789079666137695, + "learning_rate": 1.979689945231093e-05, + "loss": 0.6763, + "step": 7362 + }, + { + "epoch": 1.2019509407779274, + "grad_norm": 1.6954537630081177, + "learning_rate": 1.97968358163686e-05, + "loss": 0.6646, + "step": 7363 + }, + { + "epoch": 1.2021141994204319, + "grad_norm": 1.6038451194763184, + "learning_rate": 1.9796772170560858e-05, + "loss": 0.5968, + "step": 7364 + }, + { + "epoch": 1.2022774580629363, + "grad_norm": 1.8748506307601929, + "learning_rate": 1.9796708514887773e-05, + "loss": 0.7605, + "step": 7365 + }, + { + "epoch": 1.2024407167054405, + "grad_norm": 2.053863763809204, + "learning_rate": 1.9796644849349403e-05, + "loss": 0.6618, + "step": 7366 + }, + { + "epoch": 1.202603975347945, + "grad_norm": 1.8704049587249756, + "learning_rate": 1.9796581173945816e-05, + "loss": 0.7232, + "step": 7367 + }, + { + "epoch": 1.2027672339904494, + "grad_norm": 2.0381157398223877, + "learning_rate": 1.979651748867707e-05, + "loss": 0.6162, + "step": 7368 + }, + { + "epoch": 1.2029304926329538, + "grad_norm": 1.7795929908752441, + "learning_rate": 1.9796453793543237e-05, + "loss": 0.7657, + "step": 7369 + }, + { + "epoch": 1.2030937512754583, + "grad_norm": 2.1925785541534424, + "learning_rate": 1.9796390088544377e-05, + "loss": 0.8536, + "step": 7370 + }, + { + "epoch": 1.2032570099179625, + "grad_norm": 2.0009453296661377, + "learning_rate": 1.9796326373680555e-05, + "loss": 0.8244, + "step": 7371 + }, + { + "epoch": 1.203420268560467, + "grad_norm": 1.6859406232833862, + "learning_rate": 1.9796262648951834e-05, + "loss": 0.582, + "step": 7372 + }, + { + "epoch": 1.2035835272029713, + "grad_norm": 1.9571212530136108, + "learning_rate": 1.979619891435828e-05, + "loss": 0.7924, + "step": 7373 + }, + { + "epoch": 1.2037467858454758, + "grad_norm": 1.5581319332122803, + "learning_rate": 1.9796135169899956e-05, + "loss": 0.669, + "step": 7374 + }, + { + "epoch": 1.20391004448798, + "grad_norm": 2.0011534690856934, + "learning_rate": 1.9796071415576925e-05, + "loss": 1.0497, + "step": 7375 + }, + { + "epoch": 1.2040733031304844, + "grad_norm": 1.766790747642517, + "learning_rate": 1.9796007651389255e-05, + "loss": 0.6862, + "step": 7376 + }, + { + "epoch": 1.2042365617729889, + "grad_norm": 2.090240001678467, + "learning_rate": 1.9795943877337007e-05, + "loss": 0.8743, + "step": 7377 + }, + { + "epoch": 1.2043998204154933, + "grad_norm": 1.8979412317276, + "learning_rate": 1.9795880093420246e-05, + "loss": 0.6517, + "step": 7378 + }, + { + "epoch": 1.2045630790579978, + "grad_norm": 1.5886799097061157, + "learning_rate": 1.9795816299639035e-05, + "loss": 0.6421, + "step": 7379 + }, + { + "epoch": 1.204726337700502, + "grad_norm": 2.1308624744415283, + "learning_rate": 1.979575249599344e-05, + "loss": 0.8061, + "step": 7380 + }, + { + "epoch": 1.2048895963430064, + "grad_norm": 2.105107307434082, + "learning_rate": 1.979568868248353e-05, + "loss": 0.7815, + "step": 7381 + }, + { + "epoch": 1.2050528549855108, + "grad_norm": 1.7962852716445923, + "learning_rate": 1.9795624859109357e-05, + "loss": 0.6426, + "step": 7382 + }, + { + "epoch": 1.205216113628015, + "grad_norm": 2.0234248638153076, + "learning_rate": 1.9795561025870996e-05, + "loss": 0.6454, + "step": 7383 + }, + { + "epoch": 1.2053793722705195, + "grad_norm": 1.873284935951233, + "learning_rate": 1.9795497182768506e-05, + "loss": 0.5997, + "step": 7384 + }, + { + "epoch": 1.205542630913024, + "grad_norm": 1.996983528137207, + "learning_rate": 1.9795433329801955e-05, + "loss": 0.6538, + "step": 7385 + }, + { + "epoch": 1.2057058895555284, + "grad_norm": 1.926608920097351, + "learning_rate": 1.9795369466971404e-05, + "loss": 0.6682, + "step": 7386 + }, + { + "epoch": 1.2058691481980328, + "grad_norm": 1.704397201538086, + "learning_rate": 1.979530559427692e-05, + "loss": 0.6599, + "step": 7387 + }, + { + "epoch": 1.206032406840537, + "grad_norm": 1.5180952548980713, + "learning_rate": 1.9795241711718565e-05, + "loss": 0.6247, + "step": 7388 + }, + { + "epoch": 1.2061956654830415, + "grad_norm": 1.8134716749191284, + "learning_rate": 1.9795177819296403e-05, + "loss": 0.7213, + "step": 7389 + }, + { + "epoch": 1.206358924125546, + "grad_norm": 1.9658679962158203, + "learning_rate": 1.97951139170105e-05, + "loss": 0.6354, + "step": 7390 + }, + { + "epoch": 1.2065221827680503, + "grad_norm": 1.792641282081604, + "learning_rate": 1.9795050004860918e-05, + "loss": 0.7693, + "step": 7391 + }, + { + "epoch": 1.2066854414105546, + "grad_norm": 1.9670463800430298, + "learning_rate": 1.9794986082847728e-05, + "loss": 0.789, + "step": 7392 + }, + { + "epoch": 1.206848700053059, + "grad_norm": 1.7635657787322998, + "learning_rate": 1.9794922150970984e-05, + "loss": 0.6938, + "step": 7393 + }, + { + "epoch": 1.2070119586955634, + "grad_norm": 2.0216426849365234, + "learning_rate": 1.979485820923076e-05, + "loss": 0.7903, + "step": 7394 + }, + { + "epoch": 1.2071752173380679, + "grad_norm": 1.8245142698287964, + "learning_rate": 1.9794794257627117e-05, + "loss": 0.5638, + "step": 7395 + }, + { + "epoch": 1.2073384759805723, + "grad_norm": 1.6951179504394531, + "learning_rate": 1.9794730296160117e-05, + "loss": 0.7518, + "step": 7396 + }, + { + "epoch": 1.2075017346230765, + "grad_norm": 1.9777193069458008, + "learning_rate": 1.9794666324829826e-05, + "loss": 0.8065, + "step": 7397 + }, + { + "epoch": 1.207664993265581, + "grad_norm": 2.2356173992156982, + "learning_rate": 1.979460234363631e-05, + "loss": 0.8131, + "step": 7398 + }, + { + "epoch": 1.2078282519080854, + "grad_norm": 1.884108066558838, + "learning_rate": 1.9794538352579628e-05, + "loss": 0.8158, + "step": 7399 + }, + { + "epoch": 1.2079915105505898, + "grad_norm": 1.6056896448135376, + "learning_rate": 1.9794474351659854e-05, + "loss": 0.5784, + "step": 7400 + }, + { + "epoch": 1.208154769193094, + "grad_norm": 1.8474441766738892, + "learning_rate": 1.9794410340877045e-05, + "loss": 0.6167, + "step": 7401 + }, + { + "epoch": 1.2083180278355985, + "grad_norm": 1.7950236797332764, + "learning_rate": 1.9794346320231265e-05, + "loss": 0.6855, + "step": 7402 + }, + { + "epoch": 1.208481286478103, + "grad_norm": 2.13885235786438, + "learning_rate": 1.979428228972258e-05, + "loss": 0.8704, + "step": 7403 + }, + { + "epoch": 1.2086445451206074, + "grad_norm": 1.4476736783981323, + "learning_rate": 1.979421824935106e-05, + "loss": 0.6461, + "step": 7404 + }, + { + "epoch": 1.2088078037631118, + "grad_norm": 2.2221169471740723, + "learning_rate": 1.9794154199116763e-05, + "loss": 0.7934, + "step": 7405 + }, + { + "epoch": 1.208971062405616, + "grad_norm": 1.5845366716384888, + "learning_rate": 1.9794090139019757e-05, + "loss": 0.5762, + "step": 7406 + }, + { + "epoch": 1.2091343210481205, + "grad_norm": 1.6272059679031372, + "learning_rate": 1.9794026069060102e-05, + "loss": 0.6487, + "step": 7407 + }, + { + "epoch": 1.209297579690625, + "grad_norm": 1.706217646598816, + "learning_rate": 1.9793961989237867e-05, + "loss": 0.7021, + "step": 7408 + }, + { + "epoch": 1.2094608383331293, + "grad_norm": 1.7049869298934937, + "learning_rate": 1.9793897899553116e-05, + "loss": 0.6993, + "step": 7409 + }, + { + "epoch": 1.2096240969756336, + "grad_norm": 1.7050567865371704, + "learning_rate": 1.979383380000591e-05, + "loss": 0.701, + "step": 7410 + }, + { + "epoch": 1.209787355618138, + "grad_norm": 1.7534161806106567, + "learning_rate": 1.9793769690596315e-05, + "loss": 0.8432, + "step": 7411 + }, + { + "epoch": 1.2099506142606424, + "grad_norm": 2.0166826248168945, + "learning_rate": 1.9793705571324397e-05, + "loss": 0.7088, + "step": 7412 + }, + { + "epoch": 1.2101138729031469, + "grad_norm": 1.508067011833191, + "learning_rate": 1.979364144219022e-05, + "loss": 0.6877, + "step": 7413 + }, + { + "epoch": 1.2102771315456513, + "grad_norm": 1.8355813026428223, + "learning_rate": 1.979357730319385e-05, + "loss": 0.7764, + "step": 7414 + }, + { + "epoch": 1.2104403901881555, + "grad_norm": 1.6093283891677856, + "learning_rate": 1.9793513154335354e-05, + "loss": 0.5639, + "step": 7415 + }, + { + "epoch": 1.21060364883066, + "grad_norm": 1.8645378351211548, + "learning_rate": 1.9793448995614785e-05, + "loss": 0.7806, + "step": 7416 + }, + { + "epoch": 1.2107669074731644, + "grad_norm": 2.003026008605957, + "learning_rate": 1.979338482703222e-05, + "loss": 0.7819, + "step": 7417 + }, + { + "epoch": 1.2109301661156688, + "grad_norm": 1.5802714824676514, + "learning_rate": 1.979332064858772e-05, + "loss": 0.6399, + "step": 7418 + }, + { + "epoch": 1.211093424758173, + "grad_norm": 2.0080184936523438, + "learning_rate": 1.9793256460281348e-05, + "loss": 0.74, + "step": 7419 + }, + { + "epoch": 1.2112566834006775, + "grad_norm": 1.827246904373169, + "learning_rate": 1.9793192262113167e-05, + "loss": 0.7025, + "step": 7420 + }, + { + "epoch": 1.211419942043182, + "grad_norm": 2.1109707355499268, + "learning_rate": 1.9793128054083245e-05, + "loss": 0.8903, + "step": 7421 + }, + { + "epoch": 1.2115832006856864, + "grad_norm": 1.726699948310852, + "learning_rate": 1.9793063836191648e-05, + "loss": 0.7622, + "step": 7422 + }, + { + "epoch": 1.2117464593281908, + "grad_norm": 1.724717617034912, + "learning_rate": 1.9792999608438436e-05, + "loss": 0.6586, + "step": 7423 + }, + { + "epoch": 1.211909717970695, + "grad_norm": 1.565402626991272, + "learning_rate": 1.9792935370823676e-05, + "loss": 0.5615, + "step": 7424 + }, + { + "epoch": 1.2120729766131995, + "grad_norm": 1.5589874982833862, + "learning_rate": 1.9792871123347434e-05, + "loss": 0.6316, + "step": 7425 + }, + { + "epoch": 1.212236235255704, + "grad_norm": 1.5304598808288574, + "learning_rate": 1.9792806866009773e-05, + "loss": 0.5517, + "step": 7426 + }, + { + "epoch": 1.2123994938982081, + "grad_norm": 1.8662205934524536, + "learning_rate": 1.9792742598810758e-05, + "loss": 0.7624, + "step": 7427 + }, + { + "epoch": 1.2125627525407126, + "grad_norm": 1.615124225616455, + "learning_rate": 1.979267832175045e-05, + "loss": 0.6313, + "step": 7428 + }, + { + "epoch": 1.212726011183217, + "grad_norm": 2.4157090187072754, + "learning_rate": 1.9792614034828923e-05, + "loss": 0.8412, + "step": 7429 + }, + { + "epoch": 1.2128892698257214, + "grad_norm": 1.7291311025619507, + "learning_rate": 1.9792549738046232e-05, + "loss": 0.6257, + "step": 7430 + }, + { + "epoch": 1.2130525284682259, + "grad_norm": 2.1258742809295654, + "learning_rate": 1.979248543140245e-05, + "loss": 0.8124, + "step": 7431 + }, + { + "epoch": 1.21321578711073, + "grad_norm": 1.614032506942749, + "learning_rate": 1.9792421114897635e-05, + "loss": 0.5492, + "step": 7432 + }, + { + "epoch": 1.2133790457532345, + "grad_norm": 1.8756171464920044, + "learning_rate": 1.9792356788531854e-05, + "loss": 0.8174, + "step": 7433 + }, + { + "epoch": 1.213542304395739, + "grad_norm": 1.996145248413086, + "learning_rate": 1.9792292452305174e-05, + "loss": 0.7093, + "step": 7434 + }, + { + "epoch": 1.2137055630382434, + "grad_norm": 1.7242380380630493, + "learning_rate": 1.979222810621766e-05, + "loss": 0.7007, + "step": 7435 + }, + { + "epoch": 1.2138688216807476, + "grad_norm": 1.8769707679748535, + "learning_rate": 1.9792163750269373e-05, + "loss": 0.6683, + "step": 7436 + }, + { + "epoch": 1.214032080323252, + "grad_norm": 1.8688710927963257, + "learning_rate": 1.9792099384460378e-05, + "loss": 0.7083, + "step": 7437 + }, + { + "epoch": 1.2141953389657565, + "grad_norm": 2.06347393989563, + "learning_rate": 1.9792035008790744e-05, + "loss": 0.8888, + "step": 7438 + }, + { + "epoch": 1.214358597608261, + "grad_norm": 1.9763275384902954, + "learning_rate": 1.979197062326053e-05, + "loss": 0.8097, + "step": 7439 + }, + { + "epoch": 1.2145218562507654, + "grad_norm": 1.916993260383606, + "learning_rate": 1.979190622786981e-05, + "loss": 0.6881, + "step": 7440 + }, + { + "epoch": 1.2146851148932696, + "grad_norm": 1.4702719449996948, + "learning_rate": 1.9791841822618637e-05, + "loss": 0.7119, + "step": 7441 + }, + { + "epoch": 1.214848373535774, + "grad_norm": 1.6330474615097046, + "learning_rate": 1.9791777407507083e-05, + "loss": 0.6813, + "step": 7442 + }, + { + "epoch": 1.2150116321782785, + "grad_norm": 1.896609902381897, + "learning_rate": 1.9791712982535215e-05, + "loss": 0.7386, + "step": 7443 + }, + { + "epoch": 1.215174890820783, + "grad_norm": 1.6126973628997803, + "learning_rate": 1.9791648547703095e-05, + "loss": 0.637, + "step": 7444 + }, + { + "epoch": 1.215338149463287, + "grad_norm": 1.545535683631897, + "learning_rate": 1.9791584103010785e-05, + "loss": 0.5901, + "step": 7445 + }, + { + "epoch": 1.2155014081057915, + "grad_norm": 1.709138035774231, + "learning_rate": 1.9791519648458352e-05, + "loss": 0.6097, + "step": 7446 + }, + { + "epoch": 1.215664666748296, + "grad_norm": 1.778785228729248, + "learning_rate": 1.979145518404586e-05, + "loss": 0.6195, + "step": 7447 + }, + { + "epoch": 1.2158279253908004, + "grad_norm": 1.553452968597412, + "learning_rate": 1.979139070977338e-05, + "loss": 0.7041, + "step": 7448 + }, + { + "epoch": 1.2159911840333049, + "grad_norm": 1.8251323699951172, + "learning_rate": 1.9791326225640967e-05, + "loss": 0.7159, + "step": 7449 + }, + { + "epoch": 1.216154442675809, + "grad_norm": 1.6588391065597534, + "learning_rate": 1.9791261731648694e-05, + "loss": 0.7381, + "step": 7450 + }, + { + "epoch": 1.2163177013183135, + "grad_norm": 2.0273489952087402, + "learning_rate": 1.9791197227796622e-05, + "loss": 0.8044, + "step": 7451 + }, + { + "epoch": 1.216480959960818, + "grad_norm": 1.787227988243103, + "learning_rate": 1.979113271408482e-05, + "loss": 0.6639, + "step": 7452 + }, + { + "epoch": 1.2166442186033224, + "grad_norm": 1.8415346145629883, + "learning_rate": 1.9791068190513346e-05, + "loss": 0.7877, + "step": 7453 + }, + { + "epoch": 1.2168074772458266, + "grad_norm": 1.6401402950286865, + "learning_rate": 1.979100365708227e-05, + "loss": 0.6705, + "step": 7454 + }, + { + "epoch": 1.216970735888331, + "grad_norm": 1.6079798936843872, + "learning_rate": 1.9790939113791657e-05, + "loss": 0.5733, + "step": 7455 + }, + { + "epoch": 1.2171339945308355, + "grad_norm": 2.3078153133392334, + "learning_rate": 1.979087456064157e-05, + "loss": 0.8324, + "step": 7456 + }, + { + "epoch": 1.21729725317334, + "grad_norm": 1.5739309787750244, + "learning_rate": 1.9790809997632076e-05, + "loss": 0.6777, + "step": 7457 + }, + { + "epoch": 1.2174605118158444, + "grad_norm": 1.692357063293457, + "learning_rate": 1.9790745424763238e-05, + "loss": 0.6101, + "step": 7458 + }, + { + "epoch": 1.2176237704583486, + "grad_norm": 1.6537089347839355, + "learning_rate": 1.9790680842035123e-05, + "loss": 0.6955, + "step": 7459 + }, + { + "epoch": 1.217787029100853, + "grad_norm": 1.7182763814926147, + "learning_rate": 1.9790616249447795e-05, + "loss": 0.6761, + "step": 7460 + }, + { + "epoch": 1.2179502877433575, + "grad_norm": 2.062626838684082, + "learning_rate": 1.979055164700132e-05, + "loss": 0.7244, + "step": 7461 + }, + { + "epoch": 1.218113546385862, + "grad_norm": 2.0550293922424316, + "learning_rate": 1.9790487034695754e-05, + "loss": 1.1498, + "step": 7462 + }, + { + "epoch": 1.218276805028366, + "grad_norm": 1.920966625213623, + "learning_rate": 1.9790422412531178e-05, + "loss": 0.7786, + "step": 7463 + }, + { + "epoch": 1.2184400636708705, + "grad_norm": 1.987795114517212, + "learning_rate": 1.979035778050765e-05, + "loss": 0.657, + "step": 7464 + }, + { + "epoch": 1.218603322313375, + "grad_norm": 1.7071714401245117, + "learning_rate": 1.979029313862523e-05, + "loss": 0.657, + "step": 7465 + }, + { + "epoch": 1.2187665809558794, + "grad_norm": 2.6193277835845947, + "learning_rate": 1.979022848688399e-05, + "loss": 0.9032, + "step": 7466 + }, + { + "epoch": 1.2189298395983839, + "grad_norm": 1.8673352003097534, + "learning_rate": 1.979016382528399e-05, + "loss": 0.7123, + "step": 7467 + }, + { + "epoch": 1.219093098240888, + "grad_norm": 1.7711907625198364, + "learning_rate": 1.97900991538253e-05, + "loss": 0.6199, + "step": 7468 + }, + { + "epoch": 1.2192563568833925, + "grad_norm": 1.7074365615844727, + "learning_rate": 1.9790034472507983e-05, + "loss": 0.567, + "step": 7469 + }, + { + "epoch": 1.219419615525897, + "grad_norm": 2.19486927986145, + "learning_rate": 1.9789969781332102e-05, + "loss": 0.682, + "step": 7470 + }, + { + "epoch": 1.2195828741684012, + "grad_norm": 2.1546709537506104, + "learning_rate": 1.9789905080297726e-05, + "loss": 0.8092, + "step": 7471 + }, + { + "epoch": 1.2197461328109056, + "grad_norm": 2.123701810836792, + "learning_rate": 1.9789840369404917e-05, + "loss": 0.6304, + "step": 7472 + }, + { + "epoch": 1.21990939145341, + "grad_norm": 1.953593373298645, + "learning_rate": 1.978977564865374e-05, + "loss": 0.7079, + "step": 7473 + }, + { + "epoch": 1.2200726500959145, + "grad_norm": 1.8833160400390625, + "learning_rate": 1.9789710918044265e-05, + "loss": 0.6876, + "step": 7474 + }, + { + "epoch": 1.220235908738419, + "grad_norm": 1.8395326137542725, + "learning_rate": 1.9789646177576548e-05, + "loss": 0.7133, + "step": 7475 + }, + { + "epoch": 1.2203991673809231, + "grad_norm": 1.5282162427902222, + "learning_rate": 1.9789581427250665e-05, + "loss": 0.5327, + "step": 7476 + }, + { + "epoch": 1.2205624260234276, + "grad_norm": 1.685650110244751, + "learning_rate": 1.9789516667066677e-05, + "loss": 0.7168, + "step": 7477 + }, + { + "epoch": 1.220725684665932, + "grad_norm": 1.9821810722351074, + "learning_rate": 1.9789451897024646e-05, + "loss": 0.7603, + "step": 7478 + }, + { + "epoch": 1.2208889433084364, + "grad_norm": 2.0764896869659424, + "learning_rate": 1.9789387117124638e-05, + "loss": 0.7399, + "step": 7479 + }, + { + "epoch": 1.2210522019509407, + "grad_norm": 1.5705757141113281, + "learning_rate": 1.9789322327366722e-05, + "loss": 0.6538, + "step": 7480 + }, + { + "epoch": 1.221215460593445, + "grad_norm": 1.7783465385437012, + "learning_rate": 1.9789257527750962e-05, + "loss": 0.7886, + "step": 7481 + }, + { + "epoch": 1.2213787192359495, + "grad_norm": 1.8706525564193726, + "learning_rate": 1.978919271827742e-05, + "loss": 0.8065, + "step": 7482 + }, + { + "epoch": 1.221541977878454, + "grad_norm": 1.6382510662078857, + "learning_rate": 1.9789127898946164e-05, + "loss": 0.6214, + "step": 7483 + }, + { + "epoch": 1.2217052365209584, + "grad_norm": 1.5115511417388916, + "learning_rate": 1.9789063069757258e-05, + "loss": 0.5285, + "step": 7484 + }, + { + "epoch": 1.2218684951634626, + "grad_norm": 1.820772409439087, + "learning_rate": 1.978899823071077e-05, + "loss": 0.7185, + "step": 7485 + }, + { + "epoch": 1.222031753805967, + "grad_norm": 2.119948387145996, + "learning_rate": 1.9788933381806762e-05, + "loss": 0.7139, + "step": 7486 + }, + { + "epoch": 1.2221950124484715, + "grad_norm": 1.9118001461029053, + "learning_rate": 1.97888685230453e-05, + "loss": 0.733, + "step": 7487 + }, + { + "epoch": 1.222358271090976, + "grad_norm": 1.7695504426956177, + "learning_rate": 1.9788803654426455e-05, + "loss": 0.6999, + "step": 7488 + }, + { + "epoch": 1.2225215297334802, + "grad_norm": 1.9525483846664429, + "learning_rate": 1.978873877595028e-05, + "loss": 0.6499, + "step": 7489 + }, + { + "epoch": 1.2226847883759846, + "grad_norm": 2.044201612472534, + "learning_rate": 1.9788673887616852e-05, + "loss": 0.7619, + "step": 7490 + }, + { + "epoch": 1.222848047018489, + "grad_norm": 1.5164257287979126, + "learning_rate": 1.9788608989426234e-05, + "loss": 0.6218, + "step": 7491 + }, + { + "epoch": 1.2230113056609935, + "grad_norm": 1.8486992120742798, + "learning_rate": 1.9788544081378483e-05, + "loss": 0.6926, + "step": 7492 + }, + { + "epoch": 1.223174564303498, + "grad_norm": 2.0172231197357178, + "learning_rate": 1.9788479163473674e-05, + "loss": 0.7996, + "step": 7493 + }, + { + "epoch": 1.2233378229460021, + "grad_norm": 1.7185602188110352, + "learning_rate": 1.978841423571187e-05, + "loss": 0.6985, + "step": 7494 + }, + { + "epoch": 1.2235010815885066, + "grad_norm": 2.0612454414367676, + "learning_rate": 1.9788349298093136e-05, + "loss": 0.7728, + "step": 7495 + }, + { + "epoch": 1.223664340231011, + "grad_norm": 1.6618387699127197, + "learning_rate": 1.9788284350617536e-05, + "loss": 0.577, + "step": 7496 + }, + { + "epoch": 1.2238275988735154, + "grad_norm": 1.7117677927017212, + "learning_rate": 1.9788219393285133e-05, + "loss": 0.7177, + "step": 7497 + }, + { + "epoch": 1.2239908575160197, + "grad_norm": 1.8012889623641968, + "learning_rate": 1.9788154426096e-05, + "loss": 0.7537, + "step": 7498 + }, + { + "epoch": 1.224154116158524, + "grad_norm": 1.5860058069229126, + "learning_rate": 1.9788089449050195e-05, + "loss": 0.5982, + "step": 7499 + }, + { + "epoch": 1.2243173748010285, + "grad_norm": 2.0279064178466797, + "learning_rate": 1.978802446214779e-05, + "loss": 0.7333, + "step": 7500 + }, + { + "epoch": 1.224480633443533, + "grad_norm": 1.8839024305343628, + "learning_rate": 1.9787959465388845e-05, + "loss": 0.7533, + "step": 7501 + }, + { + "epoch": 1.2246438920860374, + "grad_norm": 1.6589711904525757, + "learning_rate": 1.9787894458773428e-05, + "loss": 0.7363, + "step": 7502 + }, + { + "epoch": 1.2248071507285416, + "grad_norm": 1.5441715717315674, + "learning_rate": 1.9787829442301604e-05, + "loss": 0.6431, + "step": 7503 + }, + { + "epoch": 1.224970409371046, + "grad_norm": 1.8326237201690674, + "learning_rate": 1.9787764415973437e-05, + "loss": 0.6777, + "step": 7504 + }, + { + "epoch": 1.2251336680135505, + "grad_norm": 1.619685173034668, + "learning_rate": 1.9787699379788995e-05, + "loss": 0.617, + "step": 7505 + }, + { + "epoch": 1.225296926656055, + "grad_norm": 1.4395554065704346, + "learning_rate": 1.9787634333748342e-05, + "loss": 0.5516, + "step": 7506 + }, + { + "epoch": 1.2254601852985592, + "grad_norm": 1.9953265190124512, + "learning_rate": 1.9787569277851542e-05, + "loss": 0.6325, + "step": 7507 + }, + { + "epoch": 1.2256234439410636, + "grad_norm": 1.965248703956604, + "learning_rate": 1.9787504212098664e-05, + "loss": 0.7321, + "step": 7508 + }, + { + "epoch": 1.225786702583568, + "grad_norm": 1.9028277397155762, + "learning_rate": 1.978743913648977e-05, + "loss": 0.8295, + "step": 7509 + }, + { + "epoch": 1.2259499612260725, + "grad_norm": 1.8230926990509033, + "learning_rate": 1.978737405102493e-05, + "loss": 0.6857, + "step": 7510 + }, + { + "epoch": 1.226113219868577, + "grad_norm": 1.6447460651397705, + "learning_rate": 1.9787308955704206e-05, + "loss": 0.6885, + "step": 7511 + }, + { + "epoch": 1.2262764785110811, + "grad_norm": 1.6718796491622925, + "learning_rate": 1.9787243850527663e-05, + "loss": 0.7368, + "step": 7512 + }, + { + "epoch": 1.2264397371535856, + "grad_norm": 2.4728660583496094, + "learning_rate": 1.978717873549537e-05, + "loss": 0.6239, + "step": 7513 + }, + { + "epoch": 1.22660299579609, + "grad_norm": 1.6918766498565674, + "learning_rate": 1.9787113610607394e-05, + "loss": 0.5896, + "step": 7514 + }, + { + "epoch": 1.2267662544385944, + "grad_norm": 1.5402162075042725, + "learning_rate": 1.978704847586379e-05, + "loss": 0.6032, + "step": 7515 + }, + { + "epoch": 1.2269295130810987, + "grad_norm": 1.9853535890579224, + "learning_rate": 1.9786983331264634e-05, + "loss": 0.813, + "step": 7516 + }, + { + "epoch": 1.227092771723603, + "grad_norm": 1.6917046308517456, + "learning_rate": 1.978691817680999e-05, + "loss": 0.6085, + "step": 7517 + }, + { + "epoch": 1.2272560303661075, + "grad_norm": 1.9412026405334473, + "learning_rate": 1.9786853012499923e-05, + "loss": 0.7982, + "step": 7518 + }, + { + "epoch": 1.227419289008612, + "grad_norm": 1.9164992570877075, + "learning_rate": 1.9786787838334494e-05, + "loss": 0.8812, + "step": 7519 + }, + { + "epoch": 1.2275825476511162, + "grad_norm": 1.9447320699691772, + "learning_rate": 1.9786722654313773e-05, + "loss": 0.7351, + "step": 7520 + }, + { + "epoch": 1.2277458062936206, + "grad_norm": 1.603618860244751, + "learning_rate": 1.9786657460437824e-05, + "loss": 0.5899, + "step": 7521 + }, + { + "epoch": 1.227909064936125, + "grad_norm": 1.685020089149475, + "learning_rate": 1.9786592256706717e-05, + "loss": 0.6576, + "step": 7522 + }, + { + "epoch": 1.2280723235786295, + "grad_norm": 1.6966872215270996, + "learning_rate": 1.978652704312051e-05, + "loss": 0.6758, + "step": 7523 + }, + { + "epoch": 1.2282355822211337, + "grad_norm": 2.2998721599578857, + "learning_rate": 1.9786461819679276e-05, + "loss": 0.8076, + "step": 7524 + }, + { + "epoch": 1.2283988408636382, + "grad_norm": 1.8802968263626099, + "learning_rate": 1.9786396586383078e-05, + "loss": 0.9473, + "step": 7525 + }, + { + "epoch": 1.2285620995061426, + "grad_norm": 2.023282289505005, + "learning_rate": 1.978633134323198e-05, + "loss": 0.765, + "step": 7526 + }, + { + "epoch": 1.228725358148647, + "grad_norm": 1.6884429454803467, + "learning_rate": 1.978626609022605e-05, + "loss": 0.6562, + "step": 7527 + }, + { + "epoch": 1.2288886167911515, + "grad_norm": 1.9121553897857666, + "learning_rate": 1.978620082736535e-05, + "loss": 0.784, + "step": 7528 + }, + { + "epoch": 1.2290518754336557, + "grad_norm": 1.818278431892395, + "learning_rate": 1.9786135554649946e-05, + "loss": 0.7256, + "step": 7529 + }, + { + "epoch": 1.2292151340761601, + "grad_norm": 1.631551742553711, + "learning_rate": 1.978607027207991e-05, + "loss": 0.6141, + "step": 7530 + }, + { + "epoch": 1.2293783927186646, + "grad_norm": 1.8147847652435303, + "learning_rate": 1.9786004979655306e-05, + "loss": 0.6736, + "step": 7531 + }, + { + "epoch": 1.229541651361169, + "grad_norm": 1.8590754270553589, + "learning_rate": 1.9785939677376195e-05, + "loss": 0.6144, + "step": 7532 + }, + { + "epoch": 1.2297049100036732, + "grad_norm": 1.6786022186279297, + "learning_rate": 1.9785874365242645e-05, + "loss": 0.757, + "step": 7533 + }, + { + "epoch": 1.2298681686461777, + "grad_norm": 2.0117757320404053, + "learning_rate": 1.978580904325472e-05, + "loss": 0.7214, + "step": 7534 + }, + { + "epoch": 1.230031427288682, + "grad_norm": 1.4406912326812744, + "learning_rate": 1.9785743711412493e-05, + "loss": 0.6053, + "step": 7535 + }, + { + "epoch": 1.2301946859311865, + "grad_norm": 1.8663067817687988, + "learning_rate": 1.9785678369716023e-05, + "loss": 0.7099, + "step": 7536 + }, + { + "epoch": 1.230357944573691, + "grad_norm": 1.7292609214782715, + "learning_rate": 1.978561301816538e-05, + "loss": 0.6191, + "step": 7537 + }, + { + "epoch": 1.2305212032161952, + "grad_norm": 1.6086658239364624, + "learning_rate": 1.978554765676062e-05, + "loss": 0.6372, + "step": 7538 + }, + { + "epoch": 1.2306844618586996, + "grad_norm": 2.157968521118164, + "learning_rate": 1.978548228550182e-05, + "loss": 0.857, + "step": 7539 + }, + { + "epoch": 1.230847720501204, + "grad_norm": 1.5795437097549438, + "learning_rate": 1.9785416904389044e-05, + "loss": 0.661, + "step": 7540 + }, + { + "epoch": 1.2310109791437085, + "grad_norm": 1.3310855627059937, + "learning_rate": 1.9785351513422354e-05, + "loss": 0.5867, + "step": 7541 + }, + { + "epoch": 1.2311742377862127, + "grad_norm": 1.9253145456314087, + "learning_rate": 1.9785286112601816e-05, + "loss": 0.7932, + "step": 7542 + }, + { + "epoch": 1.2313374964287171, + "grad_norm": 1.7828696966171265, + "learning_rate": 1.97852207019275e-05, + "loss": 0.7077, + "step": 7543 + }, + { + "epoch": 1.2315007550712216, + "grad_norm": 1.7212601900100708, + "learning_rate": 1.9785155281399465e-05, + "loss": 0.6967, + "step": 7544 + }, + { + "epoch": 1.231664013713726, + "grad_norm": 1.7045612335205078, + "learning_rate": 1.9785089851017788e-05, + "loss": 0.7531, + "step": 7545 + }, + { + "epoch": 1.2318272723562305, + "grad_norm": 1.7298681735992432, + "learning_rate": 1.978502441078252e-05, + "loss": 0.6396, + "step": 7546 + }, + { + "epoch": 1.2319905309987347, + "grad_norm": 1.7949471473693848, + "learning_rate": 1.978495896069374e-05, + "loss": 0.785, + "step": 7547 + }, + { + "epoch": 1.2321537896412391, + "grad_norm": 1.9116441011428833, + "learning_rate": 1.978489350075151e-05, + "loss": 0.7124, + "step": 7548 + }, + { + "epoch": 1.2323170482837436, + "grad_norm": 2.066927909851074, + "learning_rate": 1.9784828030955894e-05, + "loss": 0.8599, + "step": 7549 + }, + { + "epoch": 1.232480306926248, + "grad_norm": 2.042468309402466, + "learning_rate": 1.978476255130696e-05, + "loss": 0.8423, + "step": 7550 + }, + { + "epoch": 1.2326435655687522, + "grad_norm": 1.9192731380462646, + "learning_rate": 1.978469706180477e-05, + "loss": 0.8324, + "step": 7551 + }, + { + "epoch": 1.2328068242112566, + "grad_norm": 1.5339199304580688, + "learning_rate": 1.978463156244939e-05, + "loss": 0.6332, + "step": 7552 + }, + { + "epoch": 1.232970082853761, + "grad_norm": 1.9158146381378174, + "learning_rate": 1.9784566053240893e-05, + "loss": 0.7756, + "step": 7553 + }, + { + "epoch": 1.2331333414962655, + "grad_norm": 1.5751270055770874, + "learning_rate": 1.978450053417934e-05, + "loss": 0.6795, + "step": 7554 + }, + { + "epoch": 1.23329660013877, + "grad_norm": 1.721064567565918, + "learning_rate": 1.97844350052648e-05, + "loss": 0.8753, + "step": 7555 + }, + { + "epoch": 1.2334598587812742, + "grad_norm": 1.827374815940857, + "learning_rate": 1.9784369466497333e-05, + "loss": 0.6286, + "step": 7556 + }, + { + "epoch": 1.2336231174237786, + "grad_norm": 1.7738226652145386, + "learning_rate": 1.978430391787701e-05, + "loss": 0.7312, + "step": 7557 + }, + { + "epoch": 1.233786376066283, + "grad_norm": 1.9537365436553955, + "learning_rate": 1.9784238359403893e-05, + "loss": 0.7627, + "step": 7558 + }, + { + "epoch": 1.2339496347087875, + "grad_norm": 1.6540791988372803, + "learning_rate": 1.9784172791078052e-05, + "loss": 0.6382, + "step": 7559 + }, + { + "epoch": 1.2341128933512917, + "grad_norm": 1.8772302865982056, + "learning_rate": 1.978410721289955e-05, + "loss": 0.7602, + "step": 7560 + }, + { + "epoch": 1.2342761519937961, + "grad_norm": 1.7176569700241089, + "learning_rate": 1.9784041624868458e-05, + "loss": 0.6428, + "step": 7561 + }, + { + "epoch": 1.2344394106363006, + "grad_norm": 1.8083763122558594, + "learning_rate": 1.978397602698484e-05, + "loss": 0.6789, + "step": 7562 + }, + { + "epoch": 1.234602669278805, + "grad_norm": 1.871168851852417, + "learning_rate": 1.9783910419248755e-05, + "loss": 0.7171, + "step": 7563 + }, + { + "epoch": 1.2347659279213095, + "grad_norm": 1.5781874656677246, + "learning_rate": 1.9783844801660278e-05, + "loss": 0.548, + "step": 7564 + }, + { + "epoch": 1.2349291865638137, + "grad_norm": 1.7105425596237183, + "learning_rate": 1.9783779174219472e-05, + "loss": 0.6713, + "step": 7565 + }, + { + "epoch": 1.2350924452063181, + "grad_norm": 1.7158540487289429, + "learning_rate": 1.9783713536926403e-05, + "loss": 0.7682, + "step": 7566 + }, + { + "epoch": 1.2352557038488226, + "grad_norm": 1.7054301500320435, + "learning_rate": 1.9783647889781138e-05, + "loss": 0.8025, + "step": 7567 + }, + { + "epoch": 1.2354189624913268, + "grad_norm": 2.0687429904937744, + "learning_rate": 1.978358223278374e-05, + "loss": 0.7486, + "step": 7568 + }, + { + "epoch": 1.2355822211338312, + "grad_norm": 1.774264931678772, + "learning_rate": 1.9783516565934278e-05, + "loss": 0.7578, + "step": 7569 + }, + { + "epoch": 1.2357454797763356, + "grad_norm": 2.1112060546875, + "learning_rate": 1.9783450889232818e-05, + "loss": 1.2941, + "step": 7570 + }, + { + "epoch": 1.23590873841884, + "grad_norm": 2.18243670463562, + "learning_rate": 1.9783385202679426e-05, + "loss": 0.9275, + "step": 7571 + }, + { + "epoch": 1.2360719970613445, + "grad_norm": 1.6945784091949463, + "learning_rate": 1.9783319506274167e-05, + "loss": 0.6898, + "step": 7572 + }, + { + "epoch": 1.2362352557038487, + "grad_norm": 2.231377363204956, + "learning_rate": 1.9783253800017105e-05, + "loss": 0.8277, + "step": 7573 + }, + { + "epoch": 1.2363985143463532, + "grad_norm": 1.7557791471481323, + "learning_rate": 1.9783188083908315e-05, + "loss": 0.6462, + "step": 7574 + }, + { + "epoch": 1.2365617729888576, + "grad_norm": 1.9476746320724487, + "learning_rate": 1.9783122357947854e-05, + "loss": 0.7874, + "step": 7575 + }, + { + "epoch": 1.236725031631362, + "grad_norm": 1.7728602886199951, + "learning_rate": 1.978305662213579e-05, + "loss": 0.7225, + "step": 7576 + }, + { + "epoch": 1.2368882902738663, + "grad_norm": 1.6741236448287964, + "learning_rate": 1.9782990876472193e-05, + "loss": 0.6549, + "step": 7577 + }, + { + "epoch": 1.2370515489163707, + "grad_norm": 1.8071931600570679, + "learning_rate": 1.9782925120957123e-05, + "loss": 0.6933, + "step": 7578 + }, + { + "epoch": 1.2372148075588751, + "grad_norm": 1.6948342323303223, + "learning_rate": 1.9782859355590656e-05, + "loss": 0.6599, + "step": 7579 + }, + { + "epoch": 1.2373780662013796, + "grad_norm": 1.8294676542282104, + "learning_rate": 1.9782793580372848e-05, + "loss": 0.7019, + "step": 7580 + }, + { + "epoch": 1.237541324843884, + "grad_norm": 1.938857078552246, + "learning_rate": 1.9782727795303768e-05, + "loss": 0.7807, + "step": 7581 + }, + { + "epoch": 1.2377045834863882, + "grad_norm": 1.955514669418335, + "learning_rate": 1.9782662000383488e-05, + "loss": 0.7311, + "step": 7582 + }, + { + "epoch": 1.2378678421288927, + "grad_norm": 1.8646504878997803, + "learning_rate": 1.978259619561207e-05, + "loss": 0.787, + "step": 7583 + }, + { + "epoch": 1.238031100771397, + "grad_norm": 1.7462517023086548, + "learning_rate": 1.9782530380989576e-05, + "loss": 0.7283, + "step": 7584 + }, + { + "epoch": 1.2381943594139015, + "grad_norm": 1.7047559022903442, + "learning_rate": 1.978246455651608e-05, + "loss": 0.6147, + "step": 7585 + }, + { + "epoch": 1.2383576180564058, + "grad_norm": 2.0425400733947754, + "learning_rate": 1.978239872219164e-05, + "loss": 0.7465, + "step": 7586 + }, + { + "epoch": 1.2385208766989102, + "grad_norm": 1.8751040697097778, + "learning_rate": 1.978233287801633e-05, + "loss": 0.772, + "step": 7587 + }, + { + "epoch": 1.2386841353414146, + "grad_norm": 1.9482773542404175, + "learning_rate": 1.9782267023990214e-05, + "loss": 0.7237, + "step": 7588 + }, + { + "epoch": 1.238847393983919, + "grad_norm": 1.8579981327056885, + "learning_rate": 1.9782201160113362e-05, + "loss": 0.7704, + "step": 7589 + }, + { + "epoch": 1.2390106526264235, + "grad_norm": 1.8677568435668945, + "learning_rate": 1.978213528638583e-05, + "loss": 0.714, + "step": 7590 + }, + { + "epoch": 1.2391739112689277, + "grad_norm": 1.916264295578003, + "learning_rate": 1.978206940280769e-05, + "loss": 0.6573, + "step": 7591 + }, + { + "epoch": 1.2393371699114322, + "grad_norm": 1.900738000869751, + "learning_rate": 1.9782003509379014e-05, + "loss": 0.9007, + "step": 7592 + }, + { + "epoch": 1.2395004285539366, + "grad_norm": 1.8706860542297363, + "learning_rate": 1.978193760609986e-05, + "loss": 0.9908, + "step": 7593 + }, + { + "epoch": 1.239663687196441, + "grad_norm": 1.7971549034118652, + "learning_rate": 1.9781871692970297e-05, + "loss": 0.7879, + "step": 7594 + }, + { + "epoch": 1.2398269458389453, + "grad_norm": 1.6671955585479736, + "learning_rate": 1.9781805769990393e-05, + "loss": 0.6745, + "step": 7595 + }, + { + "epoch": 1.2399902044814497, + "grad_norm": 1.849905014038086, + "learning_rate": 1.9781739837160213e-05, + "loss": 0.7276, + "step": 7596 + }, + { + "epoch": 1.2401534631239541, + "grad_norm": 1.7919561862945557, + "learning_rate": 1.978167389447982e-05, + "loss": 0.7663, + "step": 7597 + }, + { + "epoch": 1.2403167217664586, + "grad_norm": 1.9800949096679688, + "learning_rate": 1.9781607941949287e-05, + "loss": 0.848, + "step": 7598 + }, + { + "epoch": 1.240479980408963, + "grad_norm": 1.8648529052734375, + "learning_rate": 1.978154197956868e-05, + "loss": 0.6196, + "step": 7599 + }, + { + "epoch": 1.2406432390514672, + "grad_norm": 1.9672514200210571, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.7438, + "step": 7600 + }, + { + "epoch": 1.2408064976939717, + "grad_norm": 1.766627311706543, + "learning_rate": 1.9781410025257493e-05, + "loss": 0.6668, + "step": 7601 + }, + { + "epoch": 1.240969756336476, + "grad_norm": 1.8047432899475098, + "learning_rate": 1.9781344033327054e-05, + "loss": 0.679, + "step": 7602 + }, + { + "epoch": 1.2411330149789805, + "grad_norm": 1.577041506767273, + "learning_rate": 1.97812780315468e-05, + "loss": 0.7485, + "step": 7603 + }, + { + "epoch": 1.2412962736214848, + "grad_norm": 1.663478136062622, + "learning_rate": 1.9781212019916807e-05, + "loss": 0.7359, + "step": 7604 + }, + { + "epoch": 1.2414595322639892, + "grad_norm": 2.0024514198303223, + "learning_rate": 1.978114599843713e-05, + "loss": 0.8007, + "step": 7605 + }, + { + "epoch": 1.2416227909064936, + "grad_norm": 1.5555124282836914, + "learning_rate": 1.9781079967107845e-05, + "loss": 0.7269, + "step": 7606 + }, + { + "epoch": 1.241786049548998, + "grad_norm": 1.819848656654358, + "learning_rate": 1.9781013925929016e-05, + "loss": 0.7114, + "step": 7607 + }, + { + "epoch": 1.2419493081915025, + "grad_norm": 1.5645266771316528, + "learning_rate": 1.9780947874900708e-05, + "loss": 0.6312, + "step": 7608 + }, + { + "epoch": 1.2421125668340067, + "grad_norm": 2.1099443435668945, + "learning_rate": 1.9780881814022986e-05, + "loss": 0.8755, + "step": 7609 + }, + { + "epoch": 1.2422758254765112, + "grad_norm": 1.6440852880477905, + "learning_rate": 1.978081574329592e-05, + "loss": 0.6424, + "step": 7610 + }, + { + "epoch": 1.2424390841190156, + "grad_norm": 1.9759521484375, + "learning_rate": 1.9780749662719573e-05, + "loss": 0.7619, + "step": 7611 + }, + { + "epoch": 1.2426023427615198, + "grad_norm": 1.7985447645187378, + "learning_rate": 1.9780683572294018e-05, + "loss": 0.6113, + "step": 7612 + }, + { + "epoch": 1.2427656014040243, + "grad_norm": 1.6091721057891846, + "learning_rate": 1.9780617472019313e-05, + "loss": 0.7038, + "step": 7613 + }, + { + "epoch": 1.2429288600465287, + "grad_norm": 1.8612099885940552, + "learning_rate": 1.978055136189553e-05, + "loss": 0.7097, + "step": 7614 + }, + { + "epoch": 1.2430921186890331, + "grad_norm": 1.870557188987732, + "learning_rate": 1.9780485241922734e-05, + "loss": 0.7068, + "step": 7615 + }, + { + "epoch": 1.2432553773315376, + "grad_norm": 1.7763323783874512, + "learning_rate": 1.9780419112100995e-05, + "loss": 0.7164, + "step": 7616 + }, + { + "epoch": 1.2434186359740418, + "grad_norm": 1.7473191022872925, + "learning_rate": 1.9780352972430375e-05, + "loss": 0.6909, + "step": 7617 + }, + { + "epoch": 1.2435818946165462, + "grad_norm": 1.9092015027999878, + "learning_rate": 1.9780286822910942e-05, + "loss": 0.7368, + "step": 7618 + }, + { + "epoch": 1.2437451532590507, + "grad_norm": 1.9576643705368042, + "learning_rate": 1.9780220663542764e-05, + "loss": 0.6332, + "step": 7619 + }, + { + "epoch": 1.243908411901555, + "grad_norm": 1.6331299543380737, + "learning_rate": 1.9780154494325902e-05, + "loss": 0.6185, + "step": 7620 + }, + { + "epoch": 1.2440716705440593, + "grad_norm": 1.696237325668335, + "learning_rate": 1.978008831526043e-05, + "loss": 0.6288, + "step": 7621 + }, + { + "epoch": 1.2442349291865638, + "grad_norm": 1.8367983102798462, + "learning_rate": 1.9780022126346413e-05, + "loss": 0.7056, + "step": 7622 + }, + { + "epoch": 1.2443981878290682, + "grad_norm": 1.749759554862976, + "learning_rate": 1.9779955927583914e-05, + "loss": 0.773, + "step": 7623 + }, + { + "epoch": 1.2445614464715726, + "grad_norm": 1.8033175468444824, + "learning_rate": 1.9779889718973004e-05, + "loss": 0.798, + "step": 7624 + }, + { + "epoch": 1.244724705114077, + "grad_norm": 1.7719976902008057, + "learning_rate": 1.9779823500513747e-05, + "loss": 0.5895, + "step": 7625 + }, + { + "epoch": 1.2448879637565813, + "grad_norm": 1.870887041091919, + "learning_rate": 1.9779757272206207e-05, + "loss": 0.724, + "step": 7626 + }, + { + "epoch": 1.2450512223990857, + "grad_norm": 1.4038174152374268, + "learning_rate": 1.977969103405046e-05, + "loss": 0.5393, + "step": 7627 + }, + { + "epoch": 1.2452144810415902, + "grad_norm": 1.7500932216644287, + "learning_rate": 1.977962478604656e-05, + "loss": 0.6522, + "step": 7628 + }, + { + "epoch": 1.2453777396840946, + "grad_norm": 2.207958459854126, + "learning_rate": 1.9779558528194585e-05, + "loss": 0.6067, + "step": 7629 + }, + { + "epoch": 1.2455409983265988, + "grad_norm": 1.92695152759552, + "learning_rate": 1.9779492260494596e-05, + "loss": 0.6675, + "step": 7630 + }, + { + "epoch": 1.2457042569691033, + "grad_norm": 1.93916916847229, + "learning_rate": 1.9779425982946662e-05, + "loss": 0.6948, + "step": 7631 + }, + { + "epoch": 1.2458675156116077, + "grad_norm": 1.7454262971878052, + "learning_rate": 1.977935969555085e-05, + "loss": 0.7128, + "step": 7632 + }, + { + "epoch": 1.2460307742541121, + "grad_norm": 1.8135579824447632, + "learning_rate": 1.977929339830722e-05, + "loss": 0.7945, + "step": 7633 + }, + { + "epoch": 1.2461940328966166, + "grad_norm": 1.382270097732544, + "learning_rate": 1.9779227091215846e-05, + "loss": 0.6059, + "step": 7634 + }, + { + "epoch": 1.2463572915391208, + "grad_norm": 2.013002634048462, + "learning_rate": 1.9779160774276794e-05, + "loss": 0.7905, + "step": 7635 + }, + { + "epoch": 1.2465205501816252, + "grad_norm": 1.7730835676193237, + "learning_rate": 1.977909444749013e-05, + "loss": 0.6421, + "step": 7636 + }, + { + "epoch": 1.2466838088241297, + "grad_norm": 1.6572157144546509, + "learning_rate": 1.977902811085592e-05, + "loss": 0.7893, + "step": 7637 + }, + { + "epoch": 1.246847067466634, + "grad_norm": 1.3499289751052856, + "learning_rate": 1.9778961764374232e-05, + "loss": 0.5738, + "step": 7638 + }, + { + "epoch": 1.2470103261091383, + "grad_norm": 1.7579472064971924, + "learning_rate": 1.977889540804513e-05, + "loss": 0.6667, + "step": 7639 + }, + { + "epoch": 1.2471735847516427, + "grad_norm": 1.6760621070861816, + "learning_rate": 1.9778829041868687e-05, + "loss": 0.6109, + "step": 7640 + }, + { + "epoch": 1.2473368433941472, + "grad_norm": 1.7900716066360474, + "learning_rate": 1.977876266584496e-05, + "loss": 0.6302, + "step": 7641 + }, + { + "epoch": 1.2475001020366516, + "grad_norm": 1.535232663154602, + "learning_rate": 1.977869627997403e-05, + "loss": 0.6592, + "step": 7642 + }, + { + "epoch": 1.247663360679156, + "grad_norm": 2.1644678115844727, + "learning_rate": 1.977862988425595e-05, + "loss": 0.7755, + "step": 7643 + }, + { + "epoch": 1.2478266193216603, + "grad_norm": 2.142556667327881, + "learning_rate": 1.977856347869079e-05, + "loss": 0.7794, + "step": 7644 + }, + { + "epoch": 1.2479898779641647, + "grad_norm": 1.6852294206619263, + "learning_rate": 1.9778497063278622e-05, + "loss": 0.7434, + "step": 7645 + }, + { + "epoch": 1.2481531366066692, + "grad_norm": 2.489095687866211, + "learning_rate": 1.977843063801951e-05, + "loss": 1.0666, + "step": 7646 + }, + { + "epoch": 1.2483163952491736, + "grad_norm": 1.9005029201507568, + "learning_rate": 1.977836420291352e-05, + "loss": 0.7712, + "step": 7647 + }, + { + "epoch": 1.2484796538916778, + "grad_norm": 1.8809047937393188, + "learning_rate": 1.9778297757960723e-05, + "loss": 0.6823, + "step": 7648 + }, + { + "epoch": 1.2486429125341822, + "grad_norm": 2.559624433517456, + "learning_rate": 1.977823130316118e-05, + "loss": 0.861, + "step": 7649 + }, + { + "epoch": 1.2488061711766867, + "grad_norm": 2.0007712841033936, + "learning_rate": 1.977816483851496e-05, + "loss": 0.8188, + "step": 7650 + }, + { + "epoch": 1.2489694298191911, + "grad_norm": 1.69324791431427, + "learning_rate": 1.977809836402213e-05, + "loss": 0.6908, + "step": 7651 + }, + { + "epoch": 1.2491326884616956, + "grad_norm": 2.074950695037842, + "learning_rate": 1.977803187968276e-05, + "loss": 0.8386, + "step": 7652 + }, + { + "epoch": 1.2492959471041998, + "grad_norm": 1.6235787868499756, + "learning_rate": 1.9777965385496912e-05, + "loss": 0.6974, + "step": 7653 + }, + { + "epoch": 1.2494592057467042, + "grad_norm": 1.6162400245666504, + "learning_rate": 1.9777898881464657e-05, + "loss": 0.6999, + "step": 7654 + }, + { + "epoch": 1.2496224643892087, + "grad_norm": 1.6110872030258179, + "learning_rate": 1.977783236758606e-05, + "loss": 0.6352, + "step": 7655 + }, + { + "epoch": 1.2497857230317129, + "grad_norm": 2.1014626026153564, + "learning_rate": 1.977776584386119e-05, + "loss": 0.8632, + "step": 7656 + }, + { + "epoch": 1.2499489816742173, + "grad_norm": 1.6196662187576294, + "learning_rate": 1.9777699310290113e-05, + "loss": 0.7205, + "step": 7657 + }, + { + "epoch": 1.2501122403167217, + "grad_norm": 1.626107931137085, + "learning_rate": 1.9777632766872893e-05, + "loss": 0.6002, + "step": 7658 + }, + { + "epoch": 1.2502754989592262, + "grad_norm": 1.876182198524475, + "learning_rate": 1.97775662136096e-05, + "loss": 0.7505, + "step": 7659 + }, + { + "epoch": 1.2504387576017306, + "grad_norm": 1.9309813976287842, + "learning_rate": 1.9777499650500303e-05, + "loss": 0.6913, + "step": 7660 + }, + { + "epoch": 1.250602016244235, + "grad_norm": 1.8079001903533936, + "learning_rate": 1.977743307754506e-05, + "loss": 0.7033, + "step": 7661 + }, + { + "epoch": 1.2507652748867393, + "grad_norm": 1.7961138486862183, + "learning_rate": 1.977736649474395e-05, + "loss": 0.8187, + "step": 7662 + }, + { + "epoch": 1.2509285335292437, + "grad_norm": 1.737817645072937, + "learning_rate": 1.9777299902097033e-05, + "loss": 0.7598, + "step": 7663 + }, + { + "epoch": 1.2510917921717482, + "grad_norm": 1.5884069204330444, + "learning_rate": 1.977723329960438e-05, + "loss": 0.6067, + "step": 7664 + }, + { + "epoch": 1.2512550508142524, + "grad_norm": 1.7477344274520874, + "learning_rate": 1.9777166687266055e-05, + "loss": 0.6746, + "step": 7665 + }, + { + "epoch": 1.2514183094567568, + "grad_norm": 1.7770012617111206, + "learning_rate": 1.977710006508212e-05, + "loss": 0.6714, + "step": 7666 + }, + { + "epoch": 1.2515815680992612, + "grad_norm": 1.717362403869629, + "learning_rate": 1.9777033433052653e-05, + "loss": 0.6306, + "step": 7667 + }, + { + "epoch": 1.2517448267417657, + "grad_norm": 2.1102142333984375, + "learning_rate": 1.9776966791177715e-05, + "loss": 0.8492, + "step": 7668 + }, + { + "epoch": 1.2519080853842701, + "grad_norm": 1.5703054666519165, + "learning_rate": 1.9776900139457375e-05, + "loss": 0.6426, + "step": 7669 + }, + { + "epoch": 1.2520713440267743, + "grad_norm": 1.5975521802902222, + "learning_rate": 1.9776833477891696e-05, + "loss": 0.5865, + "step": 7670 + }, + { + "epoch": 1.2522346026692788, + "grad_norm": 1.959734559059143, + "learning_rate": 1.977676680648075e-05, + "loss": 0.7569, + "step": 7671 + }, + { + "epoch": 1.2523978613117832, + "grad_norm": 1.758931279182434, + "learning_rate": 1.9776700125224605e-05, + "loss": 0.7334, + "step": 7672 + }, + { + "epoch": 1.2525611199542876, + "grad_norm": 1.7981336116790771, + "learning_rate": 1.977663343412332e-05, + "loss": 0.7665, + "step": 7673 + }, + { + "epoch": 1.2527243785967919, + "grad_norm": 2.122328281402588, + "learning_rate": 1.9776566733176974e-05, + "loss": 0.8494, + "step": 7674 + }, + { + "epoch": 1.2528876372392963, + "grad_norm": 1.827448844909668, + "learning_rate": 1.9776500022385623e-05, + "loss": 0.7345, + "step": 7675 + }, + { + "epoch": 1.2530508958818007, + "grad_norm": 1.9876526594161987, + "learning_rate": 1.9776433301749344e-05, + "loss": 0.8205, + "step": 7676 + }, + { + "epoch": 1.2532141545243052, + "grad_norm": 1.6124964952468872, + "learning_rate": 1.9776366571268194e-05, + "loss": 0.6994, + "step": 7677 + }, + { + "epoch": 1.2533774131668096, + "grad_norm": 2.0584192276000977, + "learning_rate": 1.9776299830942248e-05, + "loss": 0.8039, + "step": 7678 + }, + { + "epoch": 1.2535406718093138, + "grad_norm": 1.963331699371338, + "learning_rate": 1.977623308077157e-05, + "loss": 0.6681, + "step": 7679 + }, + { + "epoch": 1.2537039304518183, + "grad_norm": 1.6164655685424805, + "learning_rate": 1.9776166320756228e-05, + "loss": 0.6163, + "step": 7680 + }, + { + "epoch": 1.2538671890943227, + "grad_norm": 2.323378801345825, + "learning_rate": 1.977609955089629e-05, + "loss": 0.6766, + "step": 7681 + }, + { + "epoch": 1.2540304477368271, + "grad_norm": 1.8354038000106812, + "learning_rate": 1.977603277119182e-05, + "loss": 0.7172, + "step": 7682 + }, + { + "epoch": 1.2541937063793314, + "grad_norm": 2.1104421615600586, + "learning_rate": 1.977596598164289e-05, + "loss": 0.8365, + "step": 7683 + }, + { + "epoch": 1.2543569650218358, + "grad_norm": 1.6812859773635864, + "learning_rate": 1.9775899182249563e-05, + "loss": 0.6215, + "step": 7684 + }, + { + "epoch": 1.2545202236643402, + "grad_norm": 1.4956614971160889, + "learning_rate": 1.977583237301191e-05, + "loss": 0.6295, + "step": 7685 + }, + { + "epoch": 1.2546834823068447, + "grad_norm": 1.9258769750595093, + "learning_rate": 1.9775765553929995e-05, + "loss": 0.7462, + "step": 7686 + }, + { + "epoch": 1.2548467409493491, + "grad_norm": 2.1310577392578125, + "learning_rate": 1.9775698725003888e-05, + "loss": 0.888, + "step": 7687 + }, + { + "epoch": 1.2550099995918533, + "grad_norm": 1.5908682346343994, + "learning_rate": 1.9775631886233655e-05, + "loss": 0.6188, + "step": 7688 + }, + { + "epoch": 1.2551732582343578, + "grad_norm": 1.9881330728530884, + "learning_rate": 1.977556503761936e-05, + "loss": 0.7757, + "step": 7689 + }, + { + "epoch": 1.2553365168768622, + "grad_norm": 1.6291533708572388, + "learning_rate": 1.9775498179161077e-05, + "loss": 0.6383, + "step": 7690 + }, + { + "epoch": 1.2554997755193664, + "grad_norm": 1.8524706363677979, + "learning_rate": 1.977543131085887e-05, + "loss": 0.6524, + "step": 7691 + }, + { + "epoch": 1.2556630341618709, + "grad_norm": 1.8678884506225586, + "learning_rate": 1.9775364432712804e-05, + "loss": 0.6793, + "step": 7692 + }, + { + "epoch": 1.2558262928043753, + "grad_norm": 1.7547338008880615, + "learning_rate": 1.977529754472295e-05, + "loss": 0.7139, + "step": 7693 + }, + { + "epoch": 1.2559895514468797, + "grad_norm": 1.5473077297210693, + "learning_rate": 1.9775230646889374e-05, + "loss": 0.6014, + "step": 7694 + }, + { + "epoch": 1.2561528100893842, + "grad_norm": 1.750540018081665, + "learning_rate": 1.9775163739212143e-05, + "loss": 0.6479, + "step": 7695 + }, + { + "epoch": 1.2563160687318886, + "grad_norm": 1.7480374574661255, + "learning_rate": 1.9775096821691323e-05, + "loss": 0.651, + "step": 7696 + }, + { + "epoch": 1.2564793273743928, + "grad_norm": 1.787429928779602, + "learning_rate": 1.9775029894326987e-05, + "loss": 0.7655, + "step": 7697 + }, + { + "epoch": 1.2566425860168973, + "grad_norm": 1.5788383483886719, + "learning_rate": 1.9774962957119196e-05, + "loss": 0.8434, + "step": 7698 + }, + { + "epoch": 1.2568058446594017, + "grad_norm": 1.7003456354141235, + "learning_rate": 1.9774896010068022e-05, + "loss": 0.6534, + "step": 7699 + }, + { + "epoch": 1.256969103301906, + "grad_norm": 1.9412543773651123, + "learning_rate": 1.977482905317353e-05, + "loss": 0.8285, + "step": 7700 + }, + { + "epoch": 1.2571323619444104, + "grad_norm": 1.6494460105895996, + "learning_rate": 1.9774762086435784e-05, + "loss": 0.6975, + "step": 7701 + }, + { + "epoch": 1.2572956205869148, + "grad_norm": 1.571478009223938, + "learning_rate": 1.977469510985486e-05, + "loss": 0.6001, + "step": 7702 + }, + { + "epoch": 1.2574588792294192, + "grad_norm": 1.7906489372253418, + "learning_rate": 1.9774628123430815e-05, + "loss": 0.7018, + "step": 7703 + }, + { + "epoch": 1.2576221378719237, + "grad_norm": 1.9880855083465576, + "learning_rate": 1.9774561127163723e-05, + "loss": 0.6201, + "step": 7704 + }, + { + "epoch": 1.2577853965144281, + "grad_norm": 1.9657167196273804, + "learning_rate": 1.9774494121053655e-05, + "loss": 0.7326, + "step": 7705 + }, + { + "epoch": 1.2579486551569323, + "grad_norm": 1.5162639617919922, + "learning_rate": 1.9774427105100674e-05, + "loss": 0.5814, + "step": 7706 + }, + { + "epoch": 1.2581119137994368, + "grad_norm": 1.9122223854064941, + "learning_rate": 1.9774360079304844e-05, + "loss": 0.7358, + "step": 7707 + }, + { + "epoch": 1.2582751724419412, + "grad_norm": 1.7837613821029663, + "learning_rate": 1.9774293043666238e-05, + "loss": 0.811, + "step": 7708 + }, + { + "epoch": 1.2584384310844454, + "grad_norm": 1.8560820817947388, + "learning_rate": 1.9774225998184923e-05, + "loss": 0.7148, + "step": 7709 + }, + { + "epoch": 1.2586016897269499, + "grad_norm": 1.8998948335647583, + "learning_rate": 1.9774158942860962e-05, + "loss": 0.6309, + "step": 7710 + }, + { + "epoch": 1.2587649483694543, + "grad_norm": 1.9607232809066772, + "learning_rate": 1.9774091877694425e-05, + "loss": 0.6746, + "step": 7711 + }, + { + "epoch": 1.2589282070119587, + "grad_norm": 1.683692455291748, + "learning_rate": 1.977402480268538e-05, + "loss": 0.5997, + "step": 7712 + }, + { + "epoch": 1.2590914656544632, + "grad_norm": 1.9616204500198364, + "learning_rate": 1.9773957717833897e-05, + "loss": 0.8368, + "step": 7713 + }, + { + "epoch": 1.2592547242969676, + "grad_norm": 1.7093327045440674, + "learning_rate": 1.977389062314004e-05, + "loss": 0.6858, + "step": 7714 + }, + { + "epoch": 1.2594179829394718, + "grad_norm": 1.8307019472122192, + "learning_rate": 1.9773823518603877e-05, + "loss": 0.7845, + "step": 7715 + }, + { + "epoch": 1.2595812415819763, + "grad_norm": 1.880604863166809, + "learning_rate": 1.9773756404225478e-05, + "loss": 0.6558, + "step": 7716 + }, + { + "epoch": 1.2597445002244807, + "grad_norm": 1.7975512742996216, + "learning_rate": 1.9773689280004908e-05, + "loss": 0.6869, + "step": 7717 + }, + { + "epoch": 1.259907758866985, + "grad_norm": 1.8914908170700073, + "learning_rate": 1.977362214594224e-05, + "loss": 0.7408, + "step": 7718 + }, + { + "epoch": 1.2600710175094894, + "grad_norm": 2.219393014907837, + "learning_rate": 1.977355500203753e-05, + "loss": 0.7213, + "step": 7719 + }, + { + "epoch": 1.2602342761519938, + "grad_norm": 1.5083045959472656, + "learning_rate": 1.9773487848290856e-05, + "loss": 0.6135, + "step": 7720 + }, + { + "epoch": 1.2603975347944982, + "grad_norm": 1.557945728302002, + "learning_rate": 1.977342068470228e-05, + "loss": 0.5062, + "step": 7721 + }, + { + "epoch": 1.2605607934370027, + "grad_norm": 1.8384490013122559, + "learning_rate": 1.9773353511271876e-05, + "loss": 0.6928, + "step": 7722 + }, + { + "epoch": 1.2607240520795069, + "grad_norm": 1.7242950201034546, + "learning_rate": 1.9773286327999703e-05, + "loss": 0.675, + "step": 7723 + }, + { + "epoch": 1.2608873107220113, + "grad_norm": 1.8644347190856934, + "learning_rate": 1.9773219134885834e-05, + "loss": 0.6784, + "step": 7724 + }, + { + "epoch": 1.2610505693645158, + "grad_norm": 1.5499472618103027, + "learning_rate": 1.977315193193034e-05, + "loss": 0.5699, + "step": 7725 + }, + { + "epoch": 1.2612138280070202, + "grad_norm": 1.7247308492660522, + "learning_rate": 1.977308471913328e-05, + "loss": 0.6725, + "step": 7726 + }, + { + "epoch": 1.2613770866495244, + "grad_norm": 1.889413833618164, + "learning_rate": 1.977301749649473e-05, + "loss": 0.8358, + "step": 7727 + }, + { + "epoch": 1.2615403452920289, + "grad_norm": 1.627528190612793, + "learning_rate": 1.977295026401475e-05, + "loss": 0.6034, + "step": 7728 + }, + { + "epoch": 1.2617036039345333, + "grad_norm": 1.7325314283370972, + "learning_rate": 1.9772883021693417e-05, + "loss": 0.7557, + "step": 7729 + }, + { + "epoch": 1.2618668625770377, + "grad_norm": 1.6630433797836304, + "learning_rate": 1.977281576953079e-05, + "loss": 0.667, + "step": 7730 + }, + { + "epoch": 1.2620301212195422, + "grad_norm": 1.6435582637786865, + "learning_rate": 1.9772748507526942e-05, + "loss": 0.6536, + "step": 7731 + }, + { + "epoch": 1.2621933798620464, + "grad_norm": 1.7910091876983643, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.7353, + "step": 7732 + }, + { + "epoch": 1.2623566385045508, + "grad_norm": 1.982434868812561, + "learning_rate": 1.9772613953995844e-05, + "loss": 0.7628, + "step": 7733 + }, + { + "epoch": 1.2625198971470553, + "grad_norm": 1.97652268409729, + "learning_rate": 1.9772546662468734e-05, + "loss": 0.6555, + "step": 7734 + }, + { + "epoch": 1.2626831557895595, + "grad_norm": 1.81621515750885, + "learning_rate": 1.977247936110067e-05, + "loss": 0.7882, + "step": 7735 + }, + { + "epoch": 1.262846414432064, + "grad_norm": 1.7060892581939697, + "learning_rate": 1.977241204989172e-05, + "loss": 0.635, + "step": 7736 + }, + { + "epoch": 1.2630096730745684, + "grad_norm": 1.807970643043518, + "learning_rate": 1.977234472884196e-05, + "loss": 0.7383, + "step": 7737 + }, + { + "epoch": 1.2631729317170728, + "grad_norm": 1.8335809707641602, + "learning_rate": 1.9772277397951445e-05, + "loss": 0.7604, + "step": 7738 + }, + { + "epoch": 1.2633361903595772, + "grad_norm": 1.91744065284729, + "learning_rate": 1.977221005722025e-05, + "loss": 0.7054, + "step": 7739 + }, + { + "epoch": 1.2634994490020817, + "grad_norm": 1.6507023572921753, + "learning_rate": 1.9772142706648446e-05, + "loss": 0.744, + "step": 7740 + }, + { + "epoch": 1.2636627076445859, + "grad_norm": 2.0793471336364746, + "learning_rate": 1.9772075346236096e-05, + "loss": 0.9387, + "step": 7741 + }, + { + "epoch": 1.2638259662870903, + "grad_norm": 1.916503667831421, + "learning_rate": 1.9772007975983263e-05, + "loss": 0.781, + "step": 7742 + }, + { + "epoch": 1.2639892249295948, + "grad_norm": 1.8557668924331665, + "learning_rate": 1.9771940595890025e-05, + "loss": 0.6865, + "step": 7743 + }, + { + "epoch": 1.264152483572099, + "grad_norm": 1.9744746685028076, + "learning_rate": 1.9771873205956446e-05, + "loss": 1.1702, + "step": 7744 + }, + { + "epoch": 1.2643157422146034, + "grad_norm": 1.8330345153808594, + "learning_rate": 1.9771805806182594e-05, + "loss": 0.7574, + "step": 7745 + }, + { + "epoch": 1.2644790008571078, + "grad_norm": 1.5432018041610718, + "learning_rate": 1.9771738396568537e-05, + "loss": 0.5835, + "step": 7746 + }, + { + "epoch": 1.2646422594996123, + "grad_norm": 1.8032466173171997, + "learning_rate": 1.977167097711434e-05, + "loss": 0.7626, + "step": 7747 + }, + { + "epoch": 1.2648055181421167, + "grad_norm": 2.006655693054199, + "learning_rate": 1.977160354782007e-05, + "loss": 0.7264, + "step": 7748 + }, + { + "epoch": 1.2649687767846212, + "grad_norm": 2.0377323627471924, + "learning_rate": 1.97715361086858e-05, + "loss": 0.8226, + "step": 7749 + }, + { + "epoch": 1.2651320354271254, + "grad_norm": 1.6945147514343262, + "learning_rate": 1.9771468659711595e-05, + "loss": 0.5963, + "step": 7750 + }, + { + "epoch": 1.2652952940696298, + "grad_norm": 1.6919796466827393, + "learning_rate": 1.9771401200897527e-05, + "loss": 0.7075, + "step": 7751 + }, + { + "epoch": 1.2654585527121343, + "grad_norm": 1.703816533088684, + "learning_rate": 1.9771333732243657e-05, + "loss": 0.5682, + "step": 7752 + }, + { + "epoch": 1.2656218113546385, + "grad_norm": 1.3699593544006348, + "learning_rate": 1.977126625375006e-05, + "loss": 0.5468, + "step": 7753 + }, + { + "epoch": 1.265785069997143, + "grad_norm": 1.692429780960083, + "learning_rate": 1.97711987654168e-05, + "loss": 0.698, + "step": 7754 + }, + { + "epoch": 1.2659483286396473, + "grad_norm": 2.1237411499023438, + "learning_rate": 1.9771131267243942e-05, + "loss": 0.7937, + "step": 7755 + }, + { + "epoch": 1.2661115872821518, + "grad_norm": 1.583988904953003, + "learning_rate": 1.977106375923156e-05, + "loss": 0.6226, + "step": 7756 + }, + { + "epoch": 1.2662748459246562, + "grad_norm": 1.7571914196014404, + "learning_rate": 1.9770996241379718e-05, + "loss": 0.7396, + "step": 7757 + }, + { + "epoch": 1.2664381045671607, + "grad_norm": 1.635484218597412, + "learning_rate": 1.9770928713688488e-05, + "loss": 0.7162, + "step": 7758 + }, + { + "epoch": 1.2666013632096649, + "grad_norm": 1.8203418254852295, + "learning_rate": 1.977086117615793e-05, + "loss": 0.6953, + "step": 7759 + }, + { + "epoch": 1.2667646218521693, + "grad_norm": 1.6656782627105713, + "learning_rate": 1.9770793628788123e-05, + "loss": 0.6281, + "step": 7760 + }, + { + "epoch": 1.2669278804946738, + "grad_norm": 2.008305788040161, + "learning_rate": 1.9770726071579127e-05, + "loss": 1.1971, + "step": 7761 + }, + { + "epoch": 1.267091139137178, + "grad_norm": 1.805480718612671, + "learning_rate": 1.977065850453101e-05, + "loss": 1.2614, + "step": 7762 + }, + { + "epoch": 1.2672543977796824, + "grad_norm": 2.207421064376831, + "learning_rate": 1.9770590927643846e-05, + "loss": 0.88, + "step": 7763 + }, + { + "epoch": 1.2674176564221868, + "grad_norm": 1.8903216123580933, + "learning_rate": 1.97705233409177e-05, + "loss": 0.7529, + "step": 7764 + }, + { + "epoch": 1.2675809150646913, + "grad_norm": 2.075789451599121, + "learning_rate": 1.977045574435264e-05, + "loss": 0.8089, + "step": 7765 + }, + { + "epoch": 1.2677441737071957, + "grad_norm": 1.9584882259368896, + "learning_rate": 1.9770388137948728e-05, + "loss": 0.8219, + "step": 7766 + }, + { + "epoch": 1.2679074323497, + "grad_norm": 1.5612034797668457, + "learning_rate": 1.9770320521706045e-05, + "loss": 0.5108, + "step": 7767 + }, + { + "epoch": 1.2680706909922044, + "grad_norm": 1.9009487628936768, + "learning_rate": 1.9770252895624647e-05, + "loss": 0.6953, + "step": 7768 + }, + { + "epoch": 1.2682339496347088, + "grad_norm": 1.9827795028686523, + "learning_rate": 1.977018525970461e-05, + "loss": 0.7968, + "step": 7769 + }, + { + "epoch": 1.2683972082772133, + "grad_norm": 1.7919560670852661, + "learning_rate": 1.9770117613945996e-05, + "loss": 0.647, + "step": 7770 + }, + { + "epoch": 1.2685604669197175, + "grad_norm": 1.7263643741607666, + "learning_rate": 1.9770049958348875e-05, + "loss": 0.6824, + "step": 7771 + }, + { + "epoch": 1.268723725562222, + "grad_norm": 1.5347492694854736, + "learning_rate": 1.976998229291332e-05, + "loss": 0.5474, + "step": 7772 + }, + { + "epoch": 1.2688869842047263, + "grad_norm": 2.062748432159424, + "learning_rate": 1.9769914617639397e-05, + "loss": 0.8293, + "step": 7773 + }, + { + "epoch": 1.2690502428472308, + "grad_norm": 1.599362850189209, + "learning_rate": 1.976984693252717e-05, + "loss": 0.6123, + "step": 7774 + }, + { + "epoch": 1.2692135014897352, + "grad_norm": 1.8141447305679321, + "learning_rate": 1.9769779237576707e-05, + "loss": 0.73, + "step": 7775 + }, + { + "epoch": 1.2693767601322394, + "grad_norm": 1.947662353515625, + "learning_rate": 1.9769711532788083e-05, + "loss": 0.7038, + "step": 7776 + }, + { + "epoch": 1.2695400187747439, + "grad_norm": 2.233693838119507, + "learning_rate": 1.976964381816136e-05, + "loss": 0.5868, + "step": 7777 + }, + { + "epoch": 1.2697032774172483, + "grad_norm": 1.6217938661575317, + "learning_rate": 1.976957609369661e-05, + "loss": 0.5973, + "step": 7778 + }, + { + "epoch": 1.2698665360597525, + "grad_norm": 1.966148853302002, + "learning_rate": 1.9769508359393897e-05, + "loss": 0.773, + "step": 7779 + }, + { + "epoch": 1.270029794702257, + "grad_norm": 1.6554374694824219, + "learning_rate": 1.9769440615253295e-05, + "loss": 0.7291, + "step": 7780 + }, + { + "epoch": 1.2701930533447614, + "grad_norm": 1.6029794216156006, + "learning_rate": 1.9769372861274865e-05, + "loss": 0.5961, + "step": 7781 + }, + { + "epoch": 1.2703563119872658, + "grad_norm": 1.7516472339630127, + "learning_rate": 1.9769305097458684e-05, + "loss": 0.6267, + "step": 7782 + }, + { + "epoch": 1.2705195706297703, + "grad_norm": 1.6798949241638184, + "learning_rate": 1.976923732380481e-05, + "loss": 0.5832, + "step": 7783 + }, + { + "epoch": 1.2706828292722747, + "grad_norm": 2.195237874984741, + "learning_rate": 1.976916954031332e-05, + "loss": 0.5794, + "step": 7784 + }, + { + "epoch": 1.270846087914779, + "grad_norm": 2.1128997802734375, + "learning_rate": 1.9769101746984275e-05, + "loss": 0.6734, + "step": 7785 + }, + { + "epoch": 1.2710093465572834, + "grad_norm": 2.115288257598877, + "learning_rate": 1.976903394381775e-05, + "loss": 0.8272, + "step": 7786 + }, + { + "epoch": 1.2711726051997878, + "grad_norm": 1.61924147605896, + "learning_rate": 1.976896613081381e-05, + "loss": 0.612, + "step": 7787 + }, + { + "epoch": 1.271335863842292, + "grad_norm": 1.5435549020767212, + "learning_rate": 1.9768898307972527e-05, + "loss": 0.5269, + "step": 7788 + }, + { + "epoch": 1.2714991224847965, + "grad_norm": 1.6413521766662598, + "learning_rate": 1.976883047529396e-05, + "loss": 0.6241, + "step": 7789 + }, + { + "epoch": 1.271662381127301, + "grad_norm": 1.605469822883606, + "learning_rate": 1.9768762632778188e-05, + "loss": 0.6143, + "step": 7790 + }, + { + "epoch": 1.2718256397698053, + "grad_norm": 1.7283122539520264, + "learning_rate": 1.9768694780425274e-05, + "loss": 0.7243, + "step": 7791 + }, + { + "epoch": 1.2719888984123098, + "grad_norm": 1.983344316482544, + "learning_rate": 1.9768626918235286e-05, + "loss": 0.7424, + "step": 7792 + }, + { + "epoch": 1.2721521570548142, + "grad_norm": 1.8348256349563599, + "learning_rate": 1.976855904620829e-05, + "loss": 0.7129, + "step": 7793 + }, + { + "epoch": 1.2723154156973184, + "grad_norm": 1.776906132698059, + "learning_rate": 1.9768491164344362e-05, + "loss": 0.6994, + "step": 7794 + }, + { + "epoch": 1.2724786743398229, + "grad_norm": 1.7686011791229248, + "learning_rate": 1.9768423272643566e-05, + "loss": 0.638, + "step": 7795 + }, + { + "epoch": 1.2726419329823273, + "grad_norm": 1.5960432291030884, + "learning_rate": 1.976835537110597e-05, + "loss": 0.601, + "step": 7796 + }, + { + "epoch": 1.2728051916248315, + "grad_norm": 1.6386282444000244, + "learning_rate": 1.976828745973164e-05, + "loss": 0.6379, + "step": 7797 + }, + { + "epoch": 1.272968450267336, + "grad_norm": 1.861646294593811, + "learning_rate": 1.976821953852065e-05, + "loss": 0.6203, + "step": 7798 + }, + { + "epoch": 1.2731317089098404, + "grad_norm": 1.7679351568222046, + "learning_rate": 1.9768151607473064e-05, + "loss": 0.7021, + "step": 7799 + }, + { + "epoch": 1.2732949675523448, + "grad_norm": 1.7943164110183716, + "learning_rate": 1.9768083666588954e-05, + "loss": 0.6469, + "step": 7800 + }, + { + "epoch": 1.2734582261948493, + "grad_norm": 1.6696391105651855, + "learning_rate": 1.9768015715868386e-05, + "loss": 0.6664, + "step": 7801 + }, + { + "epoch": 1.2736214848373537, + "grad_norm": 1.4941529035568237, + "learning_rate": 1.9767947755311425e-05, + "loss": 0.6303, + "step": 7802 + }, + { + "epoch": 1.273784743479858, + "grad_norm": 2.3338375091552734, + "learning_rate": 1.9767879784918148e-05, + "loss": 0.9233, + "step": 7803 + }, + { + "epoch": 1.2739480021223624, + "grad_norm": 1.883003830909729, + "learning_rate": 1.9767811804688616e-05, + "loss": 0.6893, + "step": 7804 + }, + { + "epoch": 1.2741112607648668, + "grad_norm": 1.6260789632797241, + "learning_rate": 1.97677438146229e-05, + "loss": 0.5862, + "step": 7805 + }, + { + "epoch": 1.274274519407371, + "grad_norm": 1.9480656385421753, + "learning_rate": 1.976767581472107e-05, + "loss": 0.6986, + "step": 7806 + }, + { + "epoch": 1.2744377780498755, + "grad_norm": 1.780089259147644, + "learning_rate": 1.9767607804983192e-05, + "loss": 0.5556, + "step": 7807 + }, + { + "epoch": 1.27460103669238, + "grad_norm": 1.6911287307739258, + "learning_rate": 1.9767539785409332e-05, + "loss": 0.6794, + "step": 7808 + }, + { + "epoch": 1.2747642953348843, + "grad_norm": 1.6296827793121338, + "learning_rate": 1.976747175599957e-05, + "loss": 0.5866, + "step": 7809 + }, + { + "epoch": 1.2749275539773888, + "grad_norm": 1.5960873365402222, + "learning_rate": 1.976740371675396e-05, + "loss": 0.6731, + "step": 7810 + }, + { + "epoch": 1.275090812619893, + "grad_norm": 1.8045580387115479, + "learning_rate": 1.976733566767258e-05, + "loss": 0.7185, + "step": 7811 + }, + { + "epoch": 1.2752540712623974, + "grad_norm": 1.6713072061538696, + "learning_rate": 1.9767267608755497e-05, + "loss": 0.7675, + "step": 7812 + }, + { + "epoch": 1.2754173299049019, + "grad_norm": 1.6557163000106812, + "learning_rate": 1.9767199540002772e-05, + "loss": 0.6629, + "step": 7813 + }, + { + "epoch": 1.2755805885474063, + "grad_norm": 1.3956446647644043, + "learning_rate": 1.9767131461414487e-05, + "loss": 0.6063, + "step": 7814 + }, + { + "epoch": 1.2757438471899105, + "grad_norm": 1.9779384136199951, + "learning_rate": 1.97670633729907e-05, + "loss": 0.7719, + "step": 7815 + }, + { + "epoch": 1.275907105832415, + "grad_norm": 1.9786607027053833, + "learning_rate": 1.976699527473148e-05, + "loss": 0.827, + "step": 7816 + }, + { + "epoch": 1.2760703644749194, + "grad_norm": 1.7793630361557007, + "learning_rate": 1.9766927166636903e-05, + "loss": 0.6342, + "step": 7817 + }, + { + "epoch": 1.2762336231174238, + "grad_norm": 1.824869990348816, + "learning_rate": 1.976685904870703e-05, + "loss": 0.6645, + "step": 7818 + }, + { + "epoch": 1.2763968817599283, + "grad_norm": 1.8695714473724365, + "learning_rate": 1.9766790920941933e-05, + "loss": 0.7893, + "step": 7819 + }, + { + "epoch": 1.2765601404024325, + "grad_norm": 1.6905813217163086, + "learning_rate": 1.9766722783341682e-05, + "loss": 0.6688, + "step": 7820 + }, + { + "epoch": 1.276723399044937, + "grad_norm": 1.794819951057434, + "learning_rate": 1.9766654635906342e-05, + "loss": 0.6051, + "step": 7821 + }, + { + "epoch": 1.2768866576874414, + "grad_norm": 2.206143856048584, + "learning_rate": 1.9766586478635984e-05, + "loss": 0.627, + "step": 7822 + }, + { + "epoch": 1.2770499163299456, + "grad_norm": 1.6245406866073608, + "learning_rate": 1.9766518311530675e-05, + "loss": 0.6305, + "step": 7823 + }, + { + "epoch": 1.27721317497245, + "grad_norm": 1.692724347114563, + "learning_rate": 1.9766450134590484e-05, + "loss": 0.6676, + "step": 7824 + }, + { + "epoch": 1.2773764336149545, + "grad_norm": 1.8956235647201538, + "learning_rate": 1.9766381947815484e-05, + "loss": 0.7416, + "step": 7825 + }, + { + "epoch": 1.277539692257459, + "grad_norm": 1.786946415901184, + "learning_rate": 1.9766313751205738e-05, + "loss": 0.6977, + "step": 7826 + }, + { + "epoch": 1.2777029508999633, + "grad_norm": 1.944962501525879, + "learning_rate": 1.9766245544761316e-05, + "loss": 0.8914, + "step": 7827 + }, + { + "epoch": 1.2778662095424678, + "grad_norm": 2.5685558319091797, + "learning_rate": 1.9766177328482285e-05, + "loss": 1.2777, + "step": 7828 + }, + { + "epoch": 1.278029468184972, + "grad_norm": 1.7290880680084229, + "learning_rate": 1.9766109102368717e-05, + "loss": 0.6887, + "step": 7829 + }, + { + "epoch": 1.2781927268274764, + "grad_norm": 1.6054240465164185, + "learning_rate": 1.9766040866420684e-05, + "loss": 0.6621, + "step": 7830 + }, + { + "epoch": 1.2783559854699809, + "grad_norm": 2.233534574508667, + "learning_rate": 1.976597262063825e-05, + "loss": 0.7421, + "step": 7831 + }, + { + "epoch": 1.278519244112485, + "grad_norm": 2.093970537185669, + "learning_rate": 1.976590436502148e-05, + "loss": 0.6147, + "step": 7832 + }, + { + "epoch": 1.2786825027549895, + "grad_norm": 1.656832218170166, + "learning_rate": 1.976583609957045e-05, + "loss": 0.6692, + "step": 7833 + }, + { + "epoch": 1.278845761397494, + "grad_norm": 1.8849139213562012, + "learning_rate": 1.9765767824285223e-05, + "loss": 0.7561, + "step": 7834 + }, + { + "epoch": 1.2790090200399984, + "grad_norm": 1.7517282962799072, + "learning_rate": 1.9765699539165873e-05, + "loss": 0.7164, + "step": 7835 + }, + { + "epoch": 1.2791722786825028, + "grad_norm": 1.7697194814682007, + "learning_rate": 1.9765631244212464e-05, + "loss": 0.6427, + "step": 7836 + }, + { + "epoch": 1.2793355373250073, + "grad_norm": 1.7115105390548706, + "learning_rate": 1.9765562939425067e-05, + "loss": 0.569, + "step": 7837 + }, + { + "epoch": 1.2794987959675115, + "grad_norm": 1.9331912994384766, + "learning_rate": 1.9765494624803753e-05, + "loss": 0.5535, + "step": 7838 + }, + { + "epoch": 1.279662054610016, + "grad_norm": 1.931525468826294, + "learning_rate": 1.9765426300348586e-05, + "loss": 0.8501, + "step": 7839 + }, + { + "epoch": 1.2798253132525204, + "grad_norm": 1.9250407218933105, + "learning_rate": 1.9765357966059638e-05, + "loss": 0.7133, + "step": 7840 + }, + { + "epoch": 1.2799885718950246, + "grad_norm": 1.63577401638031, + "learning_rate": 1.976528962193698e-05, + "loss": 0.7053, + "step": 7841 + }, + { + "epoch": 1.280151830537529, + "grad_norm": 2.1839020252227783, + "learning_rate": 1.9765221267980675e-05, + "loss": 0.824, + "step": 7842 + }, + { + "epoch": 1.2803150891800334, + "grad_norm": 2.152672052383423, + "learning_rate": 1.9765152904190795e-05, + "loss": 0.7613, + "step": 7843 + }, + { + "epoch": 1.2804783478225379, + "grad_norm": 2.0152671337127686, + "learning_rate": 1.9765084530567406e-05, + "loss": 0.7149, + "step": 7844 + }, + { + "epoch": 1.2806416064650423, + "grad_norm": 1.336719274520874, + "learning_rate": 1.9765016147110583e-05, + "loss": 0.5689, + "step": 7845 + }, + { + "epoch": 1.2808048651075468, + "grad_norm": 1.9520338773727417, + "learning_rate": 1.9764947753820393e-05, + "loss": 0.8217, + "step": 7846 + }, + { + "epoch": 1.280968123750051, + "grad_norm": 1.8902567625045776, + "learning_rate": 1.97648793506969e-05, + "loss": 0.6859, + "step": 7847 + }, + { + "epoch": 1.2811313823925554, + "grad_norm": 1.731766700744629, + "learning_rate": 1.976481093774018e-05, + "loss": 0.6289, + "step": 7848 + }, + { + "epoch": 1.2812946410350599, + "grad_norm": 1.7768175601959229, + "learning_rate": 1.976474251495029e-05, + "loss": 0.6862, + "step": 7849 + }, + { + "epoch": 1.281457899677564, + "grad_norm": 1.7624109983444214, + "learning_rate": 1.9764674082327313e-05, + "loss": 0.6904, + "step": 7850 + }, + { + "epoch": 1.2816211583200685, + "grad_norm": 1.9158507585525513, + "learning_rate": 1.9764605639871312e-05, + "loss": 0.7408, + "step": 7851 + }, + { + "epoch": 1.281784416962573, + "grad_norm": 1.5964126586914062, + "learning_rate": 1.9764537187582353e-05, + "loss": 0.6257, + "step": 7852 + }, + { + "epoch": 1.2819476756050774, + "grad_norm": 1.9964649677276611, + "learning_rate": 1.976446872546051e-05, + "loss": 0.8109, + "step": 7853 + }, + { + "epoch": 1.2821109342475818, + "grad_norm": 2.534752368927002, + "learning_rate": 1.9764400253505848e-05, + "loss": 0.5853, + "step": 7854 + }, + { + "epoch": 1.282274192890086, + "grad_norm": 2.2597615718841553, + "learning_rate": 1.9764331771718438e-05, + "loss": 0.7133, + "step": 7855 + }, + { + "epoch": 1.2824374515325905, + "grad_norm": 1.4301605224609375, + "learning_rate": 1.976426328009835e-05, + "loss": 0.6284, + "step": 7856 + }, + { + "epoch": 1.282600710175095, + "grad_norm": 1.9297692775726318, + "learning_rate": 1.9764194778645648e-05, + "loss": 0.6202, + "step": 7857 + }, + { + "epoch": 1.2827639688175994, + "grad_norm": 1.8295924663543701, + "learning_rate": 1.9764126267360407e-05, + "loss": 0.7669, + "step": 7858 + }, + { + "epoch": 1.2829272274601036, + "grad_norm": 1.9317713975906372, + "learning_rate": 1.9764057746242693e-05, + "loss": 0.7333, + "step": 7859 + }, + { + "epoch": 1.283090486102608, + "grad_norm": 1.668118953704834, + "learning_rate": 1.9763989215292576e-05, + "loss": 0.5821, + "step": 7860 + }, + { + "epoch": 1.2832537447451124, + "grad_norm": 2.004638910293579, + "learning_rate": 1.9763920674510124e-05, + "loss": 0.7335, + "step": 7861 + }, + { + "epoch": 1.2834170033876169, + "grad_norm": 1.9131665229797363, + "learning_rate": 1.9763852123895405e-05, + "loss": 0.7544, + "step": 7862 + }, + { + "epoch": 1.2835802620301213, + "grad_norm": 1.7500256299972534, + "learning_rate": 1.9763783563448494e-05, + "loss": 0.6237, + "step": 7863 + }, + { + "epoch": 1.2837435206726255, + "grad_norm": 1.7953238487243652, + "learning_rate": 1.976371499316945e-05, + "loss": 0.6667, + "step": 7864 + }, + { + "epoch": 1.28390677931513, + "grad_norm": 1.908550500869751, + "learning_rate": 1.9763646413058352e-05, + "loss": 0.7049, + "step": 7865 + }, + { + "epoch": 1.2840700379576344, + "grad_norm": 1.4671032428741455, + "learning_rate": 1.976357782311526e-05, + "loss": 0.4892, + "step": 7866 + }, + { + "epoch": 1.2842332966001389, + "grad_norm": 1.6461607217788696, + "learning_rate": 1.9763509223340253e-05, + "loss": 0.6934, + "step": 7867 + }, + { + "epoch": 1.284396555242643, + "grad_norm": 1.7850788831710815, + "learning_rate": 1.9763440613733393e-05, + "loss": 0.6971, + "step": 7868 + }, + { + "epoch": 1.2845598138851475, + "grad_norm": 1.7590973377227783, + "learning_rate": 1.976337199429475e-05, + "loss": 0.6887, + "step": 7869 + }, + { + "epoch": 1.284723072527652, + "grad_norm": 1.560594081878662, + "learning_rate": 1.9763303365024392e-05, + "loss": 0.6946, + "step": 7870 + }, + { + "epoch": 1.2848863311701564, + "grad_norm": 1.957930564880371, + "learning_rate": 1.976323472592239e-05, + "loss": 0.7353, + "step": 7871 + }, + { + "epoch": 1.2850495898126608, + "grad_norm": 2.104898691177368, + "learning_rate": 1.9763166076988818e-05, + "loss": 0.8932, + "step": 7872 + }, + { + "epoch": 1.285212848455165, + "grad_norm": 1.911778450012207, + "learning_rate": 1.9763097418223736e-05, + "loss": 0.7654, + "step": 7873 + }, + { + "epoch": 1.2853761070976695, + "grad_norm": 2.049375057220459, + "learning_rate": 1.976302874962722e-05, + "loss": 0.7244, + "step": 7874 + }, + { + "epoch": 1.285539365740174, + "grad_norm": 1.8312249183654785, + "learning_rate": 1.9762960071199334e-05, + "loss": 0.7212, + "step": 7875 + }, + { + "epoch": 1.2857026243826781, + "grad_norm": 1.8828734159469604, + "learning_rate": 1.9762891382940152e-05, + "loss": 0.6881, + "step": 7876 + }, + { + "epoch": 1.2858658830251826, + "grad_norm": 1.6369315385818481, + "learning_rate": 1.9762822684849743e-05, + "loss": 0.6802, + "step": 7877 + }, + { + "epoch": 1.286029141667687, + "grad_norm": 1.9573595523834229, + "learning_rate": 1.9762753976928172e-05, + "loss": 0.8298, + "step": 7878 + }, + { + "epoch": 1.2861924003101914, + "grad_norm": 1.815604329109192, + "learning_rate": 1.9762685259175506e-05, + "loss": 0.6406, + "step": 7879 + }, + { + "epoch": 1.2863556589526959, + "grad_norm": 1.72400963306427, + "learning_rate": 1.9762616531591826e-05, + "loss": 0.7195, + "step": 7880 + }, + { + "epoch": 1.2865189175952003, + "grad_norm": 1.8507131338119507, + "learning_rate": 1.9762547794177188e-05, + "loss": 0.7279, + "step": 7881 + }, + { + "epoch": 1.2866821762377045, + "grad_norm": 1.7912498712539673, + "learning_rate": 1.976247904693167e-05, + "loss": 0.7648, + "step": 7882 + }, + { + "epoch": 1.286845434880209, + "grad_norm": 1.6017032861709595, + "learning_rate": 1.976241028985534e-05, + "loss": 0.6642, + "step": 7883 + }, + { + "epoch": 1.2870086935227134, + "grad_norm": 1.5813000202178955, + "learning_rate": 1.976234152294826e-05, + "loss": 0.7493, + "step": 7884 + }, + { + "epoch": 1.2871719521652176, + "grad_norm": 1.9897912740707397, + "learning_rate": 1.9762272746210506e-05, + "loss": 0.6715, + "step": 7885 + }, + { + "epoch": 1.287335210807722, + "grad_norm": 2.0365378856658936, + "learning_rate": 1.976220395964215e-05, + "loss": 0.8664, + "step": 7886 + }, + { + "epoch": 1.2874984694502265, + "grad_norm": 1.9992727041244507, + "learning_rate": 1.9762135163243253e-05, + "loss": 0.6703, + "step": 7887 + }, + { + "epoch": 1.287661728092731, + "grad_norm": 1.9783709049224854, + "learning_rate": 1.9762066357013893e-05, + "loss": 0.7249, + "step": 7888 + }, + { + "epoch": 1.2878249867352354, + "grad_norm": 1.850899338722229, + "learning_rate": 1.9761997540954132e-05, + "loss": 0.7121, + "step": 7889 + }, + { + "epoch": 1.2879882453777398, + "grad_norm": 2.0041005611419678, + "learning_rate": 1.976192871506404e-05, + "loss": 0.8162, + "step": 7890 + }, + { + "epoch": 1.288151504020244, + "grad_norm": 1.9776133298873901, + "learning_rate": 1.9761859879343692e-05, + "loss": 0.6506, + "step": 7891 + }, + { + "epoch": 1.2883147626627485, + "grad_norm": 1.7234914302825928, + "learning_rate": 1.9761791033793152e-05, + "loss": 0.6508, + "step": 7892 + }, + { + "epoch": 1.288478021305253, + "grad_norm": 1.662414312362671, + "learning_rate": 1.976172217841249e-05, + "loss": 0.7318, + "step": 7893 + }, + { + "epoch": 1.2886412799477571, + "grad_norm": 1.699636459350586, + "learning_rate": 1.9761653313201775e-05, + "loss": 0.5838, + "step": 7894 + }, + { + "epoch": 1.2888045385902616, + "grad_norm": 1.5659583806991577, + "learning_rate": 1.9761584438161084e-05, + "loss": 0.6985, + "step": 7895 + }, + { + "epoch": 1.288967797232766, + "grad_norm": 1.8006304502487183, + "learning_rate": 1.9761515553290474e-05, + "loss": 0.6567, + "step": 7896 + }, + { + "epoch": 1.2891310558752704, + "grad_norm": 1.6377241611480713, + "learning_rate": 1.9761446658590024e-05, + "loss": 0.6389, + "step": 7897 + }, + { + "epoch": 1.2892943145177749, + "grad_norm": 1.77823007106781, + "learning_rate": 1.97613777540598e-05, + "loss": 0.7368, + "step": 7898 + }, + { + "epoch": 1.289457573160279, + "grad_norm": 1.7207746505737305, + "learning_rate": 1.9761308839699866e-05, + "loss": 0.7228, + "step": 7899 + }, + { + "epoch": 1.2896208318027835, + "grad_norm": 1.7510266304016113, + "learning_rate": 1.9761239915510302e-05, + "loss": 0.7386, + "step": 7900 + }, + { + "epoch": 1.289784090445288, + "grad_norm": 1.668355107307434, + "learning_rate": 1.976117098149117e-05, + "loss": 0.6338, + "step": 7901 + }, + { + "epoch": 1.2899473490877924, + "grad_norm": 1.5757437944412231, + "learning_rate": 1.9761102037642542e-05, + "loss": 0.5099, + "step": 7902 + }, + { + "epoch": 1.2901106077302966, + "grad_norm": 1.8097087144851685, + "learning_rate": 1.976103308396449e-05, + "loss": 0.6304, + "step": 7903 + }, + { + "epoch": 1.290273866372801, + "grad_norm": 1.902798056602478, + "learning_rate": 1.9760964120457075e-05, + "loss": 0.773, + "step": 7904 + }, + { + "epoch": 1.2904371250153055, + "grad_norm": 1.7141273021697998, + "learning_rate": 1.9760895147120372e-05, + "loss": 0.5703, + "step": 7905 + }, + { + "epoch": 1.29060038365781, + "grad_norm": 2.1649088859558105, + "learning_rate": 1.9760826163954452e-05, + "loss": 0.892, + "step": 7906 + }, + { + "epoch": 1.2907636423003144, + "grad_norm": 2.2144346237182617, + "learning_rate": 1.9760757170959382e-05, + "loss": 0.7149, + "step": 7907 + }, + { + "epoch": 1.2909269009428186, + "grad_norm": 1.6204133033752441, + "learning_rate": 1.9760688168135233e-05, + "loss": 0.5849, + "step": 7908 + }, + { + "epoch": 1.291090159585323, + "grad_norm": 1.9654207229614258, + "learning_rate": 1.9760619155482073e-05, + "loss": 0.6406, + "step": 7909 + }, + { + "epoch": 1.2912534182278275, + "grad_norm": 1.6770851612091064, + "learning_rate": 1.976055013299997e-05, + "loss": 0.6645, + "step": 7910 + }, + { + "epoch": 1.291416676870332, + "grad_norm": 1.4437510967254639, + "learning_rate": 1.9760481100688998e-05, + "loss": 0.5364, + "step": 7911 + }, + { + "epoch": 1.2915799355128361, + "grad_norm": 1.6092826128005981, + "learning_rate": 1.9760412058549226e-05, + "loss": 0.5882, + "step": 7912 + }, + { + "epoch": 1.2917431941553406, + "grad_norm": 1.610540747642517, + "learning_rate": 1.9760343006580716e-05, + "loss": 0.6371, + "step": 7913 + }, + { + "epoch": 1.291906452797845, + "grad_norm": 1.7379995584487915, + "learning_rate": 1.976027394478355e-05, + "loss": 0.6655, + "step": 7914 + }, + { + "epoch": 1.2920697114403494, + "grad_norm": 2.1518821716308594, + "learning_rate": 1.9760204873157786e-05, + "loss": 1.3302, + "step": 7915 + }, + { + "epoch": 1.2922329700828539, + "grad_norm": 2.120119094848633, + "learning_rate": 1.97601357917035e-05, + "loss": 0.7598, + "step": 7916 + }, + { + "epoch": 1.292396228725358, + "grad_norm": 2.0103819370269775, + "learning_rate": 1.9760066700420758e-05, + "loss": 0.7538, + "step": 7917 + }, + { + "epoch": 1.2925594873678625, + "grad_norm": 1.6739648580551147, + "learning_rate": 1.9759997599309636e-05, + "loss": 0.6042, + "step": 7918 + }, + { + "epoch": 1.292722746010367, + "grad_norm": 1.6913206577301025, + "learning_rate": 1.9759928488370195e-05, + "loss": 0.6321, + "step": 7919 + }, + { + "epoch": 1.2928860046528712, + "grad_norm": 1.6233775615692139, + "learning_rate": 1.975985936760251e-05, + "loss": 0.6157, + "step": 7920 + }, + { + "epoch": 1.2930492632953756, + "grad_norm": 1.6490920782089233, + "learning_rate": 1.975979023700665e-05, + "loss": 0.5568, + "step": 7921 + }, + { + "epoch": 1.29321252193788, + "grad_norm": 1.834374189376831, + "learning_rate": 1.9759721096582682e-05, + "loss": 0.787, + "step": 7922 + }, + { + "epoch": 1.2933757805803845, + "grad_norm": 1.9529129266738892, + "learning_rate": 1.9759651946330676e-05, + "loss": 0.762, + "step": 7923 + }, + { + "epoch": 1.293539039222889, + "grad_norm": 2.000338554382324, + "learning_rate": 1.9759582786250707e-05, + "loss": 0.7009, + "step": 7924 + }, + { + "epoch": 1.2937022978653934, + "grad_norm": 3.424417734146118, + "learning_rate": 1.9759513616342838e-05, + "loss": 0.7751, + "step": 7925 + }, + { + "epoch": 1.2938655565078976, + "grad_norm": 1.6400361061096191, + "learning_rate": 1.9759444436607144e-05, + "loss": 0.6112, + "step": 7926 + }, + { + "epoch": 1.294028815150402, + "grad_norm": 2.056544303894043, + "learning_rate": 1.975937524704369e-05, + "loss": 0.6866, + "step": 7927 + }, + { + "epoch": 1.2941920737929065, + "grad_norm": 1.823322057723999, + "learning_rate": 1.9759306047652544e-05, + "loss": 0.5563, + "step": 7928 + }, + { + "epoch": 1.2943553324354107, + "grad_norm": 1.6962625980377197, + "learning_rate": 1.9759236838433785e-05, + "loss": 0.6942, + "step": 7929 + }, + { + "epoch": 1.2945185910779151, + "grad_norm": 1.8685322999954224, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.9046, + "step": 7930 + }, + { + "epoch": 1.2946818497204196, + "grad_norm": 1.8325632810592651, + "learning_rate": 1.9759098390513688e-05, + "loss": 0.7726, + "step": 7931 + }, + { + "epoch": 1.294845108362924, + "grad_norm": 1.7968077659606934, + "learning_rate": 1.9759029151812486e-05, + "loss": 0.6975, + "step": 7932 + }, + { + "epoch": 1.2950083670054284, + "grad_norm": 2.505927562713623, + "learning_rate": 1.975895990328395e-05, + "loss": 0.7527, + "step": 7933 + }, + { + "epoch": 1.2951716256479329, + "grad_norm": 1.9175455570220947, + "learning_rate": 1.9758890644928142e-05, + "loss": 0.7382, + "step": 7934 + }, + { + "epoch": 1.295334884290437, + "grad_norm": 1.942936897277832, + "learning_rate": 1.9758821376745136e-05, + "loss": 0.7598, + "step": 7935 + }, + { + "epoch": 1.2954981429329415, + "grad_norm": 2.1784725189208984, + "learning_rate": 1.9758752098734995e-05, + "loss": 0.9112, + "step": 7936 + }, + { + "epoch": 1.295661401575446, + "grad_norm": 1.7469836473464966, + "learning_rate": 1.9758682810897795e-05, + "loss": 0.6725, + "step": 7937 + }, + { + "epoch": 1.2958246602179502, + "grad_norm": 1.668153166770935, + "learning_rate": 1.9758613513233603e-05, + "loss": 0.5533, + "step": 7938 + }, + { + "epoch": 1.2959879188604546, + "grad_norm": 1.8061392307281494, + "learning_rate": 1.9758544205742495e-05, + "loss": 0.6103, + "step": 7939 + }, + { + "epoch": 1.296151177502959, + "grad_norm": 1.781225323677063, + "learning_rate": 1.975847488842453e-05, + "loss": 0.5765, + "step": 7940 + }, + { + "epoch": 1.2963144361454635, + "grad_norm": 1.6806485652923584, + "learning_rate": 1.9758405561279787e-05, + "loss": 0.6042, + "step": 7941 + }, + { + "epoch": 1.296477694787968, + "grad_norm": 1.7812618017196655, + "learning_rate": 1.9758336224308326e-05, + "loss": 0.6573, + "step": 7942 + }, + { + "epoch": 1.2966409534304721, + "grad_norm": 2.053067207336426, + "learning_rate": 1.975826687751023e-05, + "loss": 0.8159, + "step": 7943 + }, + { + "epoch": 1.2968042120729766, + "grad_norm": 2.0060856342315674, + "learning_rate": 1.975819752088556e-05, + "loss": 0.8457, + "step": 7944 + }, + { + "epoch": 1.296967470715481, + "grad_norm": 1.792073369026184, + "learning_rate": 1.9758128154434387e-05, + "loss": 0.6568, + "step": 7945 + }, + { + "epoch": 1.2971307293579855, + "grad_norm": 2.4094059467315674, + "learning_rate": 1.975805877815678e-05, + "loss": 0.7995, + "step": 7946 + }, + { + "epoch": 1.2972939880004897, + "grad_norm": 1.7473502159118652, + "learning_rate": 1.975798939205281e-05, + "loss": 0.5127, + "step": 7947 + }, + { + "epoch": 1.297457246642994, + "grad_norm": 1.8392223119735718, + "learning_rate": 1.9757919996122548e-05, + "loss": 0.6454, + "step": 7948 + }, + { + "epoch": 1.2976205052854985, + "grad_norm": 1.753745436668396, + "learning_rate": 1.9757850590366066e-05, + "loss": 0.6928, + "step": 7949 + }, + { + "epoch": 1.297783763928003, + "grad_norm": 1.9866712093353271, + "learning_rate": 1.975778117478343e-05, + "loss": 0.73, + "step": 7950 + }, + { + "epoch": 1.2979470225705074, + "grad_norm": 1.5264848470687866, + "learning_rate": 1.975771174937471e-05, + "loss": 0.5966, + "step": 7951 + }, + { + "epoch": 1.2981102812130116, + "grad_norm": 1.7728151082992554, + "learning_rate": 1.9757642314139977e-05, + "loss": 0.6844, + "step": 7952 + }, + { + "epoch": 1.298273539855516, + "grad_norm": 1.826828122138977, + "learning_rate": 1.9757572869079303e-05, + "loss": 0.7398, + "step": 7953 + }, + { + "epoch": 1.2984367984980205, + "grad_norm": 1.8208987712860107, + "learning_rate": 1.9757503414192752e-05, + "loss": 0.6533, + "step": 7954 + }, + { + "epoch": 1.298600057140525, + "grad_norm": 1.841678261756897, + "learning_rate": 1.9757433949480398e-05, + "loss": 0.56, + "step": 7955 + }, + { + "epoch": 1.2987633157830292, + "grad_norm": 2.5227131843566895, + "learning_rate": 1.975736447494231e-05, + "loss": 0.8545, + "step": 7956 + }, + { + "epoch": 1.2989265744255336, + "grad_norm": 1.783561110496521, + "learning_rate": 1.9757294990578565e-05, + "loss": 0.7527, + "step": 7957 + }, + { + "epoch": 1.299089833068038, + "grad_norm": 1.6390873193740845, + "learning_rate": 1.9757225496389218e-05, + "loss": 0.6454, + "step": 7958 + }, + { + "epoch": 1.2992530917105425, + "grad_norm": 1.9630640745162964, + "learning_rate": 1.9757155992374353e-05, + "loss": 0.6226, + "step": 7959 + }, + { + "epoch": 1.299416350353047, + "grad_norm": 1.6627589464187622, + "learning_rate": 1.975708647853403e-05, + "loss": 0.6783, + "step": 7960 + }, + { + "epoch": 1.2995796089955511, + "grad_norm": 1.7812631130218506, + "learning_rate": 1.975701695486833e-05, + "loss": 0.5985, + "step": 7961 + }, + { + "epoch": 1.2997428676380556, + "grad_norm": 1.687026858329773, + "learning_rate": 1.975694742137731e-05, + "loss": 0.7377, + "step": 7962 + }, + { + "epoch": 1.29990612628056, + "grad_norm": 1.7239652872085571, + "learning_rate": 1.9756877878061053e-05, + "loss": 0.7842, + "step": 7963 + }, + { + "epoch": 1.3000693849230642, + "grad_norm": 2.248943567276001, + "learning_rate": 1.9756808324919618e-05, + "loss": 0.8301, + "step": 7964 + }, + { + "epoch": 1.3002326435655687, + "grad_norm": 1.955113172531128, + "learning_rate": 1.975673876195308e-05, + "loss": 0.7522, + "step": 7965 + }, + { + "epoch": 1.300395902208073, + "grad_norm": 1.861124038696289, + "learning_rate": 1.9756669189161507e-05, + "loss": 0.7477, + "step": 7966 + }, + { + "epoch": 1.3005591608505775, + "grad_norm": 1.788654088973999, + "learning_rate": 1.9756599606544976e-05, + "loss": 0.6239, + "step": 7967 + }, + { + "epoch": 1.300722419493082, + "grad_norm": 1.594048023223877, + "learning_rate": 1.9756530014103548e-05, + "loss": 0.6346, + "step": 7968 + }, + { + "epoch": 1.3008856781355864, + "grad_norm": 1.4570657014846802, + "learning_rate": 1.9756460411837296e-05, + "loss": 0.7204, + "step": 7969 + }, + { + "epoch": 1.3010489367780906, + "grad_norm": 1.7533776760101318, + "learning_rate": 1.9756390799746295e-05, + "loss": 0.6831, + "step": 7970 + }, + { + "epoch": 1.301212195420595, + "grad_norm": 1.9141697883605957, + "learning_rate": 1.975632117783061e-05, + "loss": 0.8091, + "step": 7971 + }, + { + "epoch": 1.3013754540630995, + "grad_norm": 1.6265720129013062, + "learning_rate": 1.975625154609031e-05, + "loss": 0.6001, + "step": 7972 + }, + { + "epoch": 1.3015387127056037, + "grad_norm": 1.7112325429916382, + "learning_rate": 1.9756181904525468e-05, + "loss": 0.6261, + "step": 7973 + }, + { + "epoch": 1.3017019713481082, + "grad_norm": 1.7691562175750732, + "learning_rate": 1.9756112253136154e-05, + "loss": 0.6276, + "step": 7974 + }, + { + "epoch": 1.3018652299906126, + "grad_norm": 1.780705213546753, + "learning_rate": 1.9756042591922436e-05, + "loss": 0.6415, + "step": 7975 + }, + { + "epoch": 1.302028488633117, + "grad_norm": 1.6425621509552002, + "learning_rate": 1.9755972920884387e-05, + "loss": 0.6327, + "step": 7976 + }, + { + "epoch": 1.3021917472756215, + "grad_norm": 1.8404335975646973, + "learning_rate": 1.9755903240022073e-05, + "loss": 0.6882, + "step": 7977 + }, + { + "epoch": 1.302355005918126, + "grad_norm": 1.753991723060608, + "learning_rate": 1.975583354933557e-05, + "loss": 0.6555, + "step": 7978 + }, + { + "epoch": 1.3025182645606301, + "grad_norm": 1.7840451002120972, + "learning_rate": 1.9755763848824944e-05, + "loss": 0.6578, + "step": 7979 + }, + { + "epoch": 1.3026815232031346, + "grad_norm": 2.0250473022460938, + "learning_rate": 1.9755694138490268e-05, + "loss": 0.821, + "step": 7980 + }, + { + "epoch": 1.302844781845639, + "grad_norm": 1.6208056211471558, + "learning_rate": 1.975562441833161e-05, + "loss": 0.6431, + "step": 7981 + }, + { + "epoch": 1.3030080404881432, + "grad_norm": 1.5214015245437622, + "learning_rate": 1.975555468834904e-05, + "loss": 0.5933, + "step": 7982 + }, + { + "epoch": 1.3031712991306477, + "grad_norm": 1.5170669555664062, + "learning_rate": 1.975548494854263e-05, + "loss": 0.6946, + "step": 7983 + }, + { + "epoch": 1.303334557773152, + "grad_norm": 2.0292656421661377, + "learning_rate": 1.975541519891245e-05, + "loss": 0.8115, + "step": 7984 + }, + { + "epoch": 1.3034978164156565, + "grad_norm": 2.1278674602508545, + "learning_rate": 1.9755345439458566e-05, + "loss": 0.7302, + "step": 7985 + }, + { + "epoch": 1.303661075058161, + "grad_norm": 2.096353769302368, + "learning_rate": 1.9755275670181055e-05, + "loss": 0.6779, + "step": 7986 + }, + { + "epoch": 1.3038243337006654, + "grad_norm": 1.8634445667266846, + "learning_rate": 1.9755205891079986e-05, + "loss": 0.7434, + "step": 7987 + }, + { + "epoch": 1.3039875923431696, + "grad_norm": 2.119852066040039, + "learning_rate": 1.975513610215542e-05, + "loss": 0.7847, + "step": 7988 + }, + { + "epoch": 1.304150850985674, + "grad_norm": 1.674684762954712, + "learning_rate": 1.975506630340744e-05, + "loss": 0.6287, + "step": 7989 + }, + { + "epoch": 1.3043141096281785, + "grad_norm": 1.7489315271377563, + "learning_rate": 1.975499649483611e-05, + "loss": 0.7182, + "step": 7990 + }, + { + "epoch": 1.3044773682706827, + "grad_norm": 1.8974401950836182, + "learning_rate": 1.97549266764415e-05, + "loss": 0.7091, + "step": 7991 + }, + { + "epoch": 1.3046406269131872, + "grad_norm": 1.9123034477233887, + "learning_rate": 1.975485684822368e-05, + "loss": 0.7825, + "step": 7992 + }, + { + "epoch": 1.3048038855556916, + "grad_norm": 1.835289478302002, + "learning_rate": 1.975478701018273e-05, + "loss": 0.67, + "step": 7993 + }, + { + "epoch": 1.304967144198196, + "grad_norm": 1.5446045398712158, + "learning_rate": 1.97547171623187e-05, + "loss": 0.6607, + "step": 7994 + }, + { + "epoch": 1.3051304028407005, + "grad_norm": 1.5740768909454346, + "learning_rate": 1.975464730463168e-05, + "loss": 0.6209, + "step": 7995 + }, + { + "epoch": 1.3052936614832047, + "grad_norm": 1.8842756748199463, + "learning_rate": 1.9754577437121733e-05, + "loss": 0.6936, + "step": 7996 + }, + { + "epoch": 1.3054569201257091, + "grad_norm": 1.884804129600525, + "learning_rate": 1.9754507559788928e-05, + "loss": 0.6859, + "step": 7997 + }, + { + "epoch": 1.3056201787682136, + "grad_norm": 1.6712723970413208, + "learning_rate": 1.9754437672633332e-05, + "loss": 0.7219, + "step": 7998 + }, + { + "epoch": 1.305783437410718, + "grad_norm": 1.7728461027145386, + "learning_rate": 1.9754367775655024e-05, + "loss": 0.6836, + "step": 7999 + }, + { + "epoch": 1.3059466960532222, + "grad_norm": 1.7760695219039917, + "learning_rate": 1.9754297868854075e-05, + "loss": 0.6534, + "step": 8000 + }, + { + "epoch": 1.3061099546957267, + "grad_norm": 1.550175666809082, + "learning_rate": 1.975422795223054e-05, + "loss": 0.5462, + "step": 8001 + }, + { + "epoch": 1.306273213338231, + "grad_norm": 1.7566498517990112, + "learning_rate": 1.975415802578451e-05, + "loss": 0.7351, + "step": 8002 + }, + { + "epoch": 1.3064364719807355, + "grad_norm": 1.7652151584625244, + "learning_rate": 1.975408808951604e-05, + "loss": 0.5762, + "step": 8003 + }, + { + "epoch": 1.30659973062324, + "grad_norm": 1.7197532653808594, + "learning_rate": 1.9754018143425206e-05, + "loss": 0.6578, + "step": 8004 + }, + { + "epoch": 1.3067629892657442, + "grad_norm": 1.623373031616211, + "learning_rate": 1.975394818751208e-05, + "loss": 0.7892, + "step": 8005 + }, + { + "epoch": 1.3069262479082486, + "grad_norm": 1.8877060413360596, + "learning_rate": 1.9753878221776726e-05, + "loss": 0.7061, + "step": 8006 + }, + { + "epoch": 1.307089506550753, + "grad_norm": 1.9480184316635132, + "learning_rate": 1.9753808246219226e-05, + "loss": 0.6747, + "step": 8007 + }, + { + "epoch": 1.3072527651932573, + "grad_norm": 1.763015866279602, + "learning_rate": 1.975373826083964e-05, + "loss": 0.5604, + "step": 8008 + }, + { + "epoch": 1.3074160238357617, + "grad_norm": 1.736615777015686, + "learning_rate": 1.9753668265638043e-05, + "loss": 0.7349, + "step": 8009 + }, + { + "epoch": 1.3075792824782662, + "grad_norm": 1.5803017616271973, + "learning_rate": 1.9753598260614506e-05, + "loss": 0.5913, + "step": 8010 + }, + { + "epoch": 1.3077425411207706, + "grad_norm": 1.8399187326431274, + "learning_rate": 1.9753528245769096e-05, + "loss": 0.6181, + "step": 8011 + }, + { + "epoch": 1.307905799763275, + "grad_norm": 1.6089118719100952, + "learning_rate": 1.9753458221101886e-05, + "loss": 0.6655, + "step": 8012 + }, + { + "epoch": 1.3080690584057795, + "grad_norm": 2.0973799228668213, + "learning_rate": 1.9753388186612946e-05, + "loss": 0.6441, + "step": 8013 + }, + { + "epoch": 1.3082323170482837, + "grad_norm": 1.724628210067749, + "learning_rate": 1.9753318142302347e-05, + "loss": 0.7289, + "step": 8014 + }, + { + "epoch": 1.3083955756907881, + "grad_norm": 1.7998989820480347, + "learning_rate": 1.975324808817016e-05, + "loss": 0.7638, + "step": 8015 + }, + { + "epoch": 1.3085588343332926, + "grad_norm": 1.7368721961975098, + "learning_rate": 1.9753178024216454e-05, + "loss": 0.6393, + "step": 8016 + }, + { + "epoch": 1.3087220929757968, + "grad_norm": 1.6947457790374756, + "learning_rate": 1.9753107950441303e-05, + "loss": 0.5748, + "step": 8017 + }, + { + "epoch": 1.3088853516183012, + "grad_norm": 1.7658355236053467, + "learning_rate": 1.975303786684477e-05, + "loss": 0.6786, + "step": 8018 + }, + { + "epoch": 1.3090486102608057, + "grad_norm": 1.8872051239013672, + "learning_rate": 1.975296777342693e-05, + "loss": 0.7267, + "step": 8019 + }, + { + "epoch": 1.30921186890331, + "grad_norm": 1.6568111181259155, + "learning_rate": 1.975289767018786e-05, + "loss": 0.6146, + "step": 8020 + }, + { + "epoch": 1.3093751275458145, + "grad_norm": 1.6706973314285278, + "learning_rate": 1.9752827557127622e-05, + "loss": 0.6413, + "step": 8021 + }, + { + "epoch": 1.309538386188319, + "grad_norm": 1.4978121519088745, + "learning_rate": 1.975275743424629e-05, + "loss": 0.5876, + "step": 8022 + }, + { + "epoch": 1.3097016448308232, + "grad_norm": 1.4152112007141113, + "learning_rate": 1.9752687301543932e-05, + "loss": 0.5421, + "step": 8023 + }, + { + "epoch": 1.3098649034733276, + "grad_norm": 1.8553547859191895, + "learning_rate": 1.9752617159020618e-05, + "loss": 0.8561, + "step": 8024 + }, + { + "epoch": 1.310028162115832, + "grad_norm": 1.5379799604415894, + "learning_rate": 1.975254700667643e-05, + "loss": 0.6795, + "step": 8025 + }, + { + "epoch": 1.3101914207583363, + "grad_norm": 1.6830320358276367, + "learning_rate": 1.975247684451142e-05, + "loss": 0.8293, + "step": 8026 + }, + { + "epoch": 1.3103546794008407, + "grad_norm": 1.6142410039901733, + "learning_rate": 1.9752406672525675e-05, + "loss": 0.5814, + "step": 8027 + }, + { + "epoch": 1.3105179380433452, + "grad_norm": 1.6834763288497925, + "learning_rate": 1.9752336490719254e-05, + "loss": 0.5859, + "step": 8028 + }, + { + "epoch": 1.3106811966858496, + "grad_norm": 1.9352947473526, + "learning_rate": 1.9752266299092234e-05, + "loss": 0.7103, + "step": 8029 + }, + { + "epoch": 1.310844455328354, + "grad_norm": 2.2926385402679443, + "learning_rate": 1.9752196097644687e-05, + "loss": 0.8156, + "step": 8030 + }, + { + "epoch": 1.3110077139708585, + "grad_norm": 2.0337791442871094, + "learning_rate": 1.975212588637668e-05, + "loss": 0.6592, + "step": 8031 + }, + { + "epoch": 1.3111709726133627, + "grad_norm": 2.0646533966064453, + "learning_rate": 1.9752055665288283e-05, + "loss": 0.743, + "step": 8032 + }, + { + "epoch": 1.3113342312558671, + "grad_norm": 1.8210393190383911, + "learning_rate": 1.9751985434379572e-05, + "loss": 0.7813, + "step": 8033 + }, + { + "epoch": 1.3114974898983716, + "grad_norm": 1.658936619758606, + "learning_rate": 1.975191519365061e-05, + "loss": 0.5223, + "step": 8034 + }, + { + "epoch": 1.3116607485408758, + "grad_norm": 1.6187835931777954, + "learning_rate": 1.9751844943101476e-05, + "loss": 0.6131, + "step": 8035 + }, + { + "epoch": 1.3118240071833802, + "grad_norm": 2.1905157566070557, + "learning_rate": 1.9751774682732234e-05, + "loss": 0.7696, + "step": 8036 + }, + { + "epoch": 1.3119872658258847, + "grad_norm": 1.9017592668533325, + "learning_rate": 1.975170441254296e-05, + "loss": 0.7994, + "step": 8037 + }, + { + "epoch": 1.312150524468389, + "grad_norm": 1.7407249212265015, + "learning_rate": 1.975163413253372e-05, + "loss": 0.7763, + "step": 8038 + }, + { + "epoch": 1.3123137831108935, + "grad_norm": 1.7648754119873047, + "learning_rate": 1.975156384270459e-05, + "loss": 0.7295, + "step": 8039 + }, + { + "epoch": 1.3124770417533977, + "grad_norm": 1.6347990036010742, + "learning_rate": 1.9751493543055634e-05, + "loss": 0.6003, + "step": 8040 + }, + { + "epoch": 1.3126403003959022, + "grad_norm": 1.660689115524292, + "learning_rate": 1.9751423233586928e-05, + "loss": 0.705, + "step": 8041 + }, + { + "epoch": 1.3128035590384066, + "grad_norm": 1.5299161672592163, + "learning_rate": 1.975135291429854e-05, + "loss": 0.5818, + "step": 8042 + }, + { + "epoch": 1.312966817680911, + "grad_norm": 1.6677756309509277, + "learning_rate": 1.9751282585190546e-05, + "loss": 0.765, + "step": 8043 + }, + { + "epoch": 1.3131300763234153, + "grad_norm": 1.5151625871658325, + "learning_rate": 1.975121224626301e-05, + "loss": 0.7289, + "step": 8044 + }, + { + "epoch": 1.3132933349659197, + "grad_norm": 1.716366171836853, + "learning_rate": 1.9751141897516006e-05, + "loss": 0.7082, + "step": 8045 + }, + { + "epoch": 1.3134565936084241, + "grad_norm": 1.7935971021652222, + "learning_rate": 1.9751071538949607e-05, + "loss": 0.748, + "step": 8046 + }, + { + "epoch": 1.3136198522509286, + "grad_norm": 1.9383065700531006, + "learning_rate": 1.975100117056388e-05, + "loss": 0.61, + "step": 8047 + }, + { + "epoch": 1.313783110893433, + "grad_norm": 2.078526496887207, + "learning_rate": 1.9750930792358898e-05, + "loss": 0.8679, + "step": 8048 + }, + { + "epoch": 1.3139463695359372, + "grad_norm": 1.8518353700637817, + "learning_rate": 1.975086040433473e-05, + "loss": 0.6961, + "step": 8049 + }, + { + "epoch": 1.3141096281784417, + "grad_norm": 1.9666190147399902, + "learning_rate": 1.9750790006491447e-05, + "loss": 0.6792, + "step": 8050 + }, + { + "epoch": 1.3142728868209461, + "grad_norm": 1.8078874349594116, + "learning_rate": 1.975071959882912e-05, + "loss": 0.6854, + "step": 8051 + }, + { + "epoch": 1.3144361454634503, + "grad_norm": 1.619911551475525, + "learning_rate": 1.9750649181347827e-05, + "loss": 0.6506, + "step": 8052 + }, + { + "epoch": 1.3145994041059548, + "grad_norm": 1.752761721611023, + "learning_rate": 1.975057875404763e-05, + "loss": 0.6942, + "step": 8053 + }, + { + "epoch": 1.3147626627484592, + "grad_norm": 1.930680751800537, + "learning_rate": 1.97505083169286e-05, + "loss": 0.7038, + "step": 8054 + }, + { + "epoch": 1.3149259213909636, + "grad_norm": 1.8137420415878296, + "learning_rate": 1.9750437869990814e-05, + "loss": 0.7447, + "step": 8055 + }, + { + "epoch": 1.315089180033468, + "grad_norm": 1.6947797536849976, + "learning_rate": 1.975036741323434e-05, + "loss": 0.6758, + "step": 8056 + }, + { + "epoch": 1.3152524386759725, + "grad_norm": 1.9128025770187378, + "learning_rate": 1.9750296946659247e-05, + "loss": 0.7756, + "step": 8057 + }, + { + "epoch": 1.3154156973184767, + "grad_norm": 1.7921639680862427, + "learning_rate": 1.9750226470265608e-05, + "loss": 0.6835, + "step": 8058 + }, + { + "epoch": 1.3155789559609812, + "grad_norm": 1.5529574155807495, + "learning_rate": 1.9750155984053492e-05, + "loss": 0.6173, + "step": 8059 + }, + { + "epoch": 1.3157422146034856, + "grad_norm": 1.6517568826675415, + "learning_rate": 1.9750085488022974e-05, + "loss": 0.6938, + "step": 8060 + }, + { + "epoch": 1.3159054732459898, + "grad_norm": 1.534742832183838, + "learning_rate": 1.975001498217412e-05, + "loss": 0.5277, + "step": 8061 + }, + { + "epoch": 1.3160687318884943, + "grad_norm": 2.2611887454986572, + "learning_rate": 1.9749944466507007e-05, + "loss": 0.7019, + "step": 8062 + }, + { + "epoch": 1.3162319905309987, + "grad_norm": 1.742044448852539, + "learning_rate": 1.97498739410217e-05, + "loss": 0.6104, + "step": 8063 + }, + { + "epoch": 1.3163952491735031, + "grad_norm": 1.5628321170806885, + "learning_rate": 1.9749803405718272e-05, + "loss": 0.6045, + "step": 8064 + }, + { + "epoch": 1.3165585078160076, + "grad_norm": 1.9653853178024292, + "learning_rate": 1.9749732860596797e-05, + "loss": 0.7479, + "step": 8065 + }, + { + "epoch": 1.316721766458512, + "grad_norm": 1.7425636053085327, + "learning_rate": 1.9749662305657342e-05, + "loss": 0.715, + "step": 8066 + }, + { + "epoch": 1.3168850251010162, + "grad_norm": 1.893271565437317, + "learning_rate": 1.9749591740899977e-05, + "loss": 0.6489, + "step": 8067 + }, + { + "epoch": 1.3170482837435207, + "grad_norm": 1.837673544883728, + "learning_rate": 1.974952116632478e-05, + "loss": 0.7078, + "step": 8068 + }, + { + "epoch": 1.3172115423860251, + "grad_norm": 1.7138010263442993, + "learning_rate": 1.9749450581931816e-05, + "loss": 0.6381, + "step": 8069 + }, + { + "epoch": 1.3173748010285293, + "grad_norm": 1.743113398551941, + "learning_rate": 1.974937998772116e-05, + "loss": 0.7032, + "step": 8070 + }, + { + "epoch": 1.3175380596710338, + "grad_norm": 1.8289601802825928, + "learning_rate": 1.974930938369288e-05, + "loss": 0.7865, + "step": 8071 + }, + { + "epoch": 1.3177013183135382, + "grad_norm": 1.6350865364074707, + "learning_rate": 1.9749238769847044e-05, + "loss": 0.535, + "step": 8072 + }, + { + "epoch": 1.3178645769560426, + "grad_norm": 1.6765069961547852, + "learning_rate": 1.9749168146183734e-05, + "loss": 0.623, + "step": 8073 + }, + { + "epoch": 1.318027835598547, + "grad_norm": 1.5076298713684082, + "learning_rate": 1.9749097512703008e-05, + "loss": 0.6332, + "step": 8074 + }, + { + "epoch": 1.3181910942410515, + "grad_norm": 1.5401374101638794, + "learning_rate": 1.9749026869404945e-05, + "loss": 0.5836, + "step": 8075 + }, + { + "epoch": 1.3183543528835557, + "grad_norm": 1.817402958869934, + "learning_rate": 1.9748956216289616e-05, + "loss": 0.6298, + "step": 8076 + }, + { + "epoch": 1.3185176115260602, + "grad_norm": 1.966994047164917, + "learning_rate": 1.9748885553357093e-05, + "loss": 0.822, + "step": 8077 + }, + { + "epoch": 1.3186808701685646, + "grad_norm": 1.9398101568222046, + "learning_rate": 1.9748814880607437e-05, + "loss": 0.7489, + "step": 8078 + }, + { + "epoch": 1.3188441288110688, + "grad_norm": 1.5898211002349854, + "learning_rate": 1.974874419804073e-05, + "loss": 0.6641, + "step": 8079 + }, + { + "epoch": 1.3190073874535733, + "grad_norm": 2.002545118331909, + "learning_rate": 1.9748673505657045e-05, + "loss": 0.6466, + "step": 8080 + }, + { + "epoch": 1.3191706460960777, + "grad_norm": 1.5677013397216797, + "learning_rate": 1.9748602803456448e-05, + "loss": 0.6405, + "step": 8081 + }, + { + "epoch": 1.3193339047385821, + "grad_norm": 1.4841282367706299, + "learning_rate": 1.9748532091439006e-05, + "loss": 0.5822, + "step": 8082 + }, + { + "epoch": 1.3194971633810866, + "grad_norm": 1.5994740724563599, + "learning_rate": 1.9748461369604797e-05, + "loss": 0.6568, + "step": 8083 + }, + { + "epoch": 1.3196604220235908, + "grad_norm": 2.0916876792907715, + "learning_rate": 1.974839063795389e-05, + "loss": 0.8002, + "step": 8084 + }, + { + "epoch": 1.3198236806660952, + "grad_norm": 1.5649064779281616, + "learning_rate": 1.9748319896486357e-05, + "loss": 0.6308, + "step": 8085 + }, + { + "epoch": 1.3199869393085997, + "grad_norm": 1.684648036956787, + "learning_rate": 1.9748249145202266e-05, + "loss": 0.5968, + "step": 8086 + }, + { + "epoch": 1.320150197951104, + "grad_norm": 1.6424453258514404, + "learning_rate": 1.9748178384101694e-05, + "loss": 0.6824, + "step": 8087 + }, + { + "epoch": 1.3203134565936083, + "grad_norm": 2.351764678955078, + "learning_rate": 1.9748107613184705e-05, + "loss": 0.7244, + "step": 8088 + }, + { + "epoch": 1.3204767152361128, + "grad_norm": 1.6778290271759033, + "learning_rate": 1.9748036832451377e-05, + "loss": 0.6595, + "step": 8089 + }, + { + "epoch": 1.3206399738786172, + "grad_norm": 1.7173930406570435, + "learning_rate": 1.9747966041901776e-05, + "loss": 0.7211, + "step": 8090 + }, + { + "epoch": 1.3208032325211216, + "grad_norm": 1.709312915802002, + "learning_rate": 1.974789524153598e-05, + "loss": 0.6519, + "step": 8091 + }, + { + "epoch": 1.320966491163626, + "grad_norm": 1.7854859828948975, + "learning_rate": 1.9747824431354052e-05, + "loss": 0.7465, + "step": 8092 + }, + { + "epoch": 1.3211297498061303, + "grad_norm": 2.0232982635498047, + "learning_rate": 1.974775361135607e-05, + "loss": 0.7568, + "step": 8093 + }, + { + "epoch": 1.3212930084486347, + "grad_norm": 1.856859564781189, + "learning_rate": 1.9747682781542103e-05, + "loss": 0.8868, + "step": 8094 + }, + { + "epoch": 1.3214562670911392, + "grad_norm": 1.8469316959381104, + "learning_rate": 1.974761194191222e-05, + "loss": 0.743, + "step": 8095 + }, + { + "epoch": 1.3216195257336436, + "grad_norm": 1.8788880109786987, + "learning_rate": 1.9747541092466496e-05, + "loss": 0.7537, + "step": 8096 + }, + { + "epoch": 1.3217827843761478, + "grad_norm": 1.8057591915130615, + "learning_rate": 1.9747470233204997e-05, + "loss": 0.7791, + "step": 8097 + }, + { + "epoch": 1.3219460430186523, + "grad_norm": 1.8077384233474731, + "learning_rate": 1.9747399364127803e-05, + "loss": 0.6298, + "step": 8098 + }, + { + "epoch": 1.3221093016611567, + "grad_norm": 2.701310873031616, + "learning_rate": 1.9747328485234977e-05, + "loss": 0.7212, + "step": 8099 + }, + { + "epoch": 1.3222725603036611, + "grad_norm": 2.286334753036499, + "learning_rate": 1.9747257596526594e-05, + "loss": 0.7267, + "step": 8100 + }, + { + "epoch": 1.3224358189461656, + "grad_norm": 1.9304906129837036, + "learning_rate": 1.974718669800273e-05, + "loss": 0.713, + "step": 8101 + }, + { + "epoch": 1.3225990775886698, + "grad_norm": 1.848392367362976, + "learning_rate": 1.9747115789663443e-05, + "loss": 0.7195, + "step": 8102 + }, + { + "epoch": 1.3227623362311742, + "grad_norm": 1.5515904426574707, + "learning_rate": 1.974704487150882e-05, + "loss": 0.6979, + "step": 8103 + }, + { + "epoch": 1.3229255948736787, + "grad_norm": 1.6959209442138672, + "learning_rate": 1.9746973943538923e-05, + "loss": 0.6765, + "step": 8104 + }, + { + "epoch": 1.3230888535161829, + "grad_norm": 1.8557239770889282, + "learning_rate": 1.9746903005753828e-05, + "loss": 0.6677, + "step": 8105 + }, + { + "epoch": 1.3232521121586873, + "grad_norm": 1.8728312253952026, + "learning_rate": 1.9746832058153602e-05, + "loss": 0.6587, + "step": 8106 + }, + { + "epoch": 1.3234153708011918, + "grad_norm": 1.5390833616256714, + "learning_rate": 1.9746761100738318e-05, + "loss": 0.5208, + "step": 8107 + }, + { + "epoch": 1.3235786294436962, + "grad_norm": 2.1384029388427734, + "learning_rate": 1.974669013350805e-05, + "loss": 0.7188, + "step": 8108 + }, + { + "epoch": 1.3237418880862006, + "grad_norm": 2.177511692047119, + "learning_rate": 1.9746619156462866e-05, + "loss": 0.7011, + "step": 8109 + }, + { + "epoch": 1.323905146728705, + "grad_norm": 1.6290374994277954, + "learning_rate": 1.9746548169602843e-05, + "loss": 0.5815, + "step": 8110 + }, + { + "epoch": 1.3240684053712093, + "grad_norm": 2.136305093765259, + "learning_rate": 1.9746477172928043e-05, + "loss": 1.0092, + "step": 8111 + }, + { + "epoch": 1.3242316640137137, + "grad_norm": 1.8016566038131714, + "learning_rate": 1.9746406166438546e-05, + "loss": 0.7505, + "step": 8112 + }, + { + "epoch": 1.3243949226562182, + "grad_norm": 1.9030386209487915, + "learning_rate": 1.9746335150134418e-05, + "loss": 0.7526, + "step": 8113 + }, + { + "epoch": 1.3245581812987224, + "grad_norm": 1.9278345108032227, + "learning_rate": 1.9746264124015736e-05, + "loss": 0.6744, + "step": 8114 + }, + { + "epoch": 1.3247214399412268, + "grad_norm": 1.8157521486282349, + "learning_rate": 1.9746193088082568e-05, + "loss": 0.7071, + "step": 8115 + }, + { + "epoch": 1.3248846985837313, + "grad_norm": 1.6955139636993408, + "learning_rate": 1.974612204233499e-05, + "loss": 0.641, + "step": 8116 + }, + { + "epoch": 1.3250479572262357, + "grad_norm": 1.770561933517456, + "learning_rate": 1.9746050986773062e-05, + "loss": 0.5339, + "step": 8117 + }, + { + "epoch": 1.3252112158687401, + "grad_norm": 1.8109534978866577, + "learning_rate": 1.9745979921396866e-05, + "loss": 0.7265, + "step": 8118 + }, + { + "epoch": 1.3253744745112446, + "grad_norm": 2.2096080780029297, + "learning_rate": 1.9745908846206472e-05, + "loss": 0.7263, + "step": 8119 + }, + { + "epoch": 1.3255377331537488, + "grad_norm": 1.6850926876068115, + "learning_rate": 1.9745837761201948e-05, + "loss": 0.6671, + "step": 8120 + }, + { + "epoch": 1.3257009917962532, + "grad_norm": 1.6506586074829102, + "learning_rate": 1.974576666638337e-05, + "loss": 0.747, + "step": 8121 + }, + { + "epoch": 1.3258642504387577, + "grad_norm": 1.7427412271499634, + "learning_rate": 1.974569556175081e-05, + "loss": 0.7494, + "step": 8122 + }, + { + "epoch": 1.3260275090812619, + "grad_norm": 2.103715181350708, + "learning_rate": 1.9745624447304335e-05, + "loss": 0.7641, + "step": 8123 + }, + { + "epoch": 1.3261907677237663, + "grad_norm": 1.9754912853240967, + "learning_rate": 1.9745553323044018e-05, + "loss": 0.7744, + "step": 8124 + }, + { + "epoch": 1.3263540263662708, + "grad_norm": 1.8413755893707275, + "learning_rate": 1.974548218896993e-05, + "loss": 0.825, + "step": 8125 + }, + { + "epoch": 1.3265172850087752, + "grad_norm": 1.590610146522522, + "learning_rate": 1.974541104508215e-05, + "loss": 0.6029, + "step": 8126 + }, + { + "epoch": 1.3266805436512796, + "grad_norm": 1.9644966125488281, + "learning_rate": 1.974533989138074e-05, + "loss": 0.7711, + "step": 8127 + }, + { + "epoch": 1.3268438022937838, + "grad_norm": 1.9963268041610718, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.7562, + "step": 8128 + }, + { + "epoch": 1.3270070609362883, + "grad_norm": 1.7514228820800781, + "learning_rate": 1.9745197554537326e-05, + "loss": 0.712, + "step": 8129 + }, + { + "epoch": 1.3271703195787927, + "grad_norm": 1.4804366827011108, + "learning_rate": 1.9745126371395463e-05, + "loss": 0.5842, + "step": 8130 + }, + { + "epoch": 1.3273335782212972, + "grad_norm": 1.6377873420715332, + "learning_rate": 1.9745055178440266e-05, + "loss": 0.7235, + "step": 8131 + }, + { + "epoch": 1.3274968368638014, + "grad_norm": 1.6488251686096191, + "learning_rate": 1.97449839756718e-05, + "loss": 0.7401, + "step": 8132 + }, + { + "epoch": 1.3276600955063058, + "grad_norm": 1.6622512340545654, + "learning_rate": 1.9744912763090136e-05, + "loss": 0.6575, + "step": 8133 + }, + { + "epoch": 1.3278233541488103, + "grad_norm": 1.5592079162597656, + "learning_rate": 1.9744841540695347e-05, + "loss": 0.6029, + "step": 8134 + }, + { + "epoch": 1.3279866127913147, + "grad_norm": 1.8984745740890503, + "learning_rate": 1.9744770308487508e-05, + "loss": 0.7554, + "step": 8135 + }, + { + "epoch": 1.3281498714338191, + "grad_norm": 1.8811886310577393, + "learning_rate": 1.9744699066466687e-05, + "loss": 0.6936, + "step": 8136 + }, + { + "epoch": 1.3283131300763233, + "grad_norm": 1.97040855884552, + "learning_rate": 1.9744627814632956e-05, + "loss": 0.6338, + "step": 8137 + }, + { + "epoch": 1.3284763887188278, + "grad_norm": 1.8400938510894775, + "learning_rate": 1.9744556552986385e-05, + "loss": 0.6148, + "step": 8138 + }, + { + "epoch": 1.3286396473613322, + "grad_norm": 1.9490940570831299, + "learning_rate": 1.974448528152705e-05, + "loss": 0.8123, + "step": 8139 + }, + { + "epoch": 1.3288029060038367, + "grad_norm": 1.7277472019195557, + "learning_rate": 1.974441400025502e-05, + "loss": 0.7226, + "step": 8140 + }, + { + "epoch": 1.3289661646463409, + "grad_norm": 1.7936655282974243, + "learning_rate": 1.974434270917037e-05, + "loss": 0.7749, + "step": 8141 + }, + { + "epoch": 1.3291294232888453, + "grad_norm": 1.713377833366394, + "learning_rate": 1.9744271408273168e-05, + "loss": 0.7318, + "step": 8142 + }, + { + "epoch": 1.3292926819313498, + "grad_norm": 2.0047049522399902, + "learning_rate": 1.9744200097563487e-05, + "loss": 0.8208, + "step": 8143 + }, + { + "epoch": 1.3294559405738542, + "grad_norm": 1.46015465259552, + "learning_rate": 1.97441287770414e-05, + "loss": 0.6884, + "step": 8144 + }, + { + "epoch": 1.3296191992163586, + "grad_norm": 1.7572280168533325, + "learning_rate": 1.9744057446706977e-05, + "loss": 0.6906, + "step": 8145 + }, + { + "epoch": 1.3297824578588628, + "grad_norm": 1.5216182470321655, + "learning_rate": 1.9743986106560293e-05, + "loss": 0.5991, + "step": 8146 + }, + { + "epoch": 1.3299457165013673, + "grad_norm": 2.378322124481201, + "learning_rate": 1.9743914756601413e-05, + "loss": 0.7505, + "step": 8147 + }, + { + "epoch": 1.3301089751438717, + "grad_norm": 1.7232651710510254, + "learning_rate": 1.974384339683042e-05, + "loss": 0.6279, + "step": 8148 + }, + { + "epoch": 1.330272233786376, + "grad_norm": 1.5912830829620361, + "learning_rate": 1.9743772027247375e-05, + "loss": 0.6606, + "step": 8149 + }, + { + "epoch": 1.3304354924288804, + "grad_norm": 1.8480104207992554, + "learning_rate": 1.9743700647852356e-05, + "loss": 0.8552, + "step": 8150 + }, + { + "epoch": 1.3305987510713848, + "grad_norm": 1.7613649368286133, + "learning_rate": 1.9743629258645428e-05, + "loss": 0.6728, + "step": 8151 + }, + { + "epoch": 1.3307620097138892, + "grad_norm": 1.889197587966919, + "learning_rate": 1.974355785962667e-05, + "loss": 0.6592, + "step": 8152 + }, + { + "epoch": 1.3309252683563937, + "grad_norm": 2.1506028175354004, + "learning_rate": 1.9743486450796156e-05, + "loss": 0.598, + "step": 8153 + }, + { + "epoch": 1.3310885269988981, + "grad_norm": 1.7290527820587158, + "learning_rate": 1.9743415032153953e-05, + "loss": 0.7441, + "step": 8154 + }, + { + "epoch": 1.3312517856414023, + "grad_norm": 1.5732158422470093, + "learning_rate": 1.974334360370013e-05, + "loss": 0.7231, + "step": 8155 + }, + { + "epoch": 1.3314150442839068, + "grad_norm": 1.9649945497512817, + "learning_rate": 1.9743272165434765e-05, + "loss": 0.7538, + "step": 8156 + }, + { + "epoch": 1.3315783029264112, + "grad_norm": 1.9967423677444458, + "learning_rate": 1.9743200717357927e-05, + "loss": 0.6029, + "step": 8157 + }, + { + "epoch": 1.3317415615689154, + "grad_norm": 1.8611247539520264, + "learning_rate": 1.974312925946969e-05, + "loss": 0.7021, + "step": 8158 + }, + { + "epoch": 1.3319048202114199, + "grad_norm": 1.3805376291275024, + "learning_rate": 1.9743057791770122e-05, + "loss": 0.5725, + "step": 8159 + }, + { + "epoch": 1.3320680788539243, + "grad_norm": 2.0026237964630127, + "learning_rate": 1.97429863142593e-05, + "loss": 0.7716, + "step": 8160 + }, + { + "epoch": 1.3322313374964287, + "grad_norm": 1.6132175922393799, + "learning_rate": 1.974291482693729e-05, + "loss": 0.7247, + "step": 8161 + }, + { + "epoch": 1.3323945961389332, + "grad_norm": 1.7901742458343506, + "learning_rate": 1.9742843329804168e-05, + "loss": 0.6654, + "step": 8162 + }, + { + "epoch": 1.3325578547814376, + "grad_norm": 1.7907439470291138, + "learning_rate": 1.9742771822860007e-05, + "loss": 0.6095, + "step": 8163 + }, + { + "epoch": 1.3327211134239418, + "grad_norm": 1.5700527429580688, + "learning_rate": 1.9742700306104876e-05, + "loss": 0.5795, + "step": 8164 + }, + { + "epoch": 1.3328843720664463, + "grad_norm": 1.961586356163025, + "learning_rate": 1.9742628779538848e-05, + "loss": 0.7602, + "step": 8165 + }, + { + "epoch": 1.3330476307089507, + "grad_norm": 1.613202691078186, + "learning_rate": 1.9742557243162e-05, + "loss": 0.6464, + "step": 8166 + }, + { + "epoch": 1.333210889351455, + "grad_norm": 1.7981504201889038, + "learning_rate": 1.9742485696974395e-05, + "loss": 0.7119, + "step": 8167 + }, + { + "epoch": 1.3333741479939594, + "grad_norm": 2.0982704162597656, + "learning_rate": 1.974241414097611e-05, + "loss": 0.5979, + "step": 8168 + }, + { + "epoch": 1.3335374066364638, + "grad_norm": 1.7805267572402954, + "learning_rate": 1.9742342575167213e-05, + "loss": 0.6941, + "step": 8169 + }, + { + "epoch": 1.3337006652789682, + "grad_norm": 1.9152238368988037, + "learning_rate": 1.974227099954779e-05, + "loss": 0.7014, + "step": 8170 + }, + { + "epoch": 1.3338639239214727, + "grad_norm": 1.9404221773147583, + "learning_rate": 1.9742199414117894e-05, + "loss": 0.7491, + "step": 8171 + }, + { + "epoch": 1.334027182563977, + "grad_norm": 1.9786393642425537, + "learning_rate": 1.9742127818877605e-05, + "loss": 0.6749, + "step": 8172 + }, + { + "epoch": 1.3341904412064813, + "grad_norm": 1.8398462533950806, + "learning_rate": 1.9742056213827e-05, + "loss": 0.6962, + "step": 8173 + }, + { + "epoch": 1.3343536998489858, + "grad_norm": 1.7491601705551147, + "learning_rate": 1.9741984598966146e-05, + "loss": 0.7358, + "step": 8174 + }, + { + "epoch": 1.3345169584914902, + "grad_norm": 1.5813140869140625, + "learning_rate": 1.9741912974295115e-05, + "loss": 0.6308, + "step": 8175 + }, + { + "epoch": 1.3346802171339944, + "grad_norm": 1.6845985651016235, + "learning_rate": 1.9741841339813982e-05, + "loss": 0.6448, + "step": 8176 + }, + { + "epoch": 1.3348434757764989, + "grad_norm": 2.0108141899108887, + "learning_rate": 1.9741769695522815e-05, + "loss": 0.8839, + "step": 8177 + }, + { + "epoch": 1.3350067344190033, + "grad_norm": 1.986931324005127, + "learning_rate": 1.9741698041421692e-05, + "loss": 0.7152, + "step": 8178 + }, + { + "epoch": 1.3351699930615077, + "grad_norm": 1.5547858476638794, + "learning_rate": 1.9741626377510677e-05, + "loss": 0.6183, + "step": 8179 + }, + { + "epoch": 1.3353332517040122, + "grad_norm": 1.5541472434997559, + "learning_rate": 1.9741554703789852e-05, + "loss": 0.6218, + "step": 8180 + }, + { + "epoch": 1.3354965103465164, + "grad_norm": 2.258358955383301, + "learning_rate": 1.974148302025928e-05, + "loss": 0.6975, + "step": 8181 + }, + { + "epoch": 1.3356597689890208, + "grad_norm": 1.9497052431106567, + "learning_rate": 1.974141132691904e-05, + "loss": 0.63, + "step": 8182 + }, + { + "epoch": 1.3358230276315253, + "grad_norm": 1.7772135734558105, + "learning_rate": 1.97413396237692e-05, + "loss": 0.6188, + "step": 8183 + }, + { + "epoch": 1.3359862862740297, + "grad_norm": 1.7281135320663452, + "learning_rate": 1.9741267910809832e-05, + "loss": 0.7195, + "step": 8184 + }, + { + "epoch": 1.336149544916534, + "grad_norm": 1.7641513347625732, + "learning_rate": 1.9741196188041015e-05, + "loss": 0.7744, + "step": 8185 + }, + { + "epoch": 1.3363128035590384, + "grad_norm": 1.8051087856292725, + "learning_rate": 1.974112445546281e-05, + "loss": 0.8182, + "step": 8186 + }, + { + "epoch": 1.3364760622015428, + "grad_norm": 1.5499836206436157, + "learning_rate": 1.97410527130753e-05, + "loss": 0.5419, + "step": 8187 + }, + { + "epoch": 1.3366393208440472, + "grad_norm": 1.8288648128509521, + "learning_rate": 1.9740980960878548e-05, + "loss": 0.6964, + "step": 8188 + }, + { + "epoch": 1.3368025794865517, + "grad_norm": 1.9618353843688965, + "learning_rate": 1.9740909198872634e-05, + "loss": 0.7829, + "step": 8189 + }, + { + "epoch": 1.336965838129056, + "grad_norm": 1.8466296195983887, + "learning_rate": 1.9740837427057627e-05, + "loss": 0.657, + "step": 8190 + }, + { + "epoch": 1.3371290967715603, + "grad_norm": 1.6712092161178589, + "learning_rate": 1.9740765645433597e-05, + "loss": 0.6126, + "step": 8191 + }, + { + "epoch": 1.3372923554140648, + "grad_norm": 1.8561662435531616, + "learning_rate": 1.974069385400062e-05, + "loss": 0.7387, + "step": 8192 + }, + { + "epoch": 1.337455614056569, + "grad_norm": 1.8573241233825684, + "learning_rate": 1.9740622052758767e-05, + "loss": 0.7766, + "step": 8193 + }, + { + "epoch": 1.3376188726990734, + "grad_norm": 1.5488899946212769, + "learning_rate": 1.974055024170811e-05, + "loss": 0.6702, + "step": 8194 + }, + { + "epoch": 1.3377821313415779, + "grad_norm": 1.6390408277511597, + "learning_rate": 1.974047842084872e-05, + "loss": 0.6084, + "step": 8195 + }, + { + "epoch": 1.3379453899840823, + "grad_norm": 1.8268978595733643, + "learning_rate": 1.9740406590180673e-05, + "loss": 0.6841, + "step": 8196 + }, + { + "epoch": 1.3381086486265867, + "grad_norm": 1.70668625831604, + "learning_rate": 1.974033474970404e-05, + "loss": 0.6748, + "step": 8197 + }, + { + "epoch": 1.3382719072690912, + "grad_norm": 1.8198738098144531, + "learning_rate": 1.974026289941889e-05, + "loss": 0.6791, + "step": 8198 + }, + { + "epoch": 1.3384351659115954, + "grad_norm": 1.6487101316452026, + "learning_rate": 1.9740191039325297e-05, + "loss": 0.6012, + "step": 8199 + }, + { + "epoch": 1.3385984245540998, + "grad_norm": 1.8011170625686646, + "learning_rate": 1.9740119169423337e-05, + "loss": 0.895, + "step": 8200 + }, + { + "epoch": 1.3387616831966043, + "grad_norm": 1.9904978275299072, + "learning_rate": 1.9740047289713077e-05, + "loss": 0.7848, + "step": 8201 + }, + { + "epoch": 1.3389249418391085, + "grad_norm": 2.1504199504852295, + "learning_rate": 1.9739975400194593e-05, + "loss": 0.7995, + "step": 8202 + }, + { + "epoch": 1.339088200481613, + "grad_norm": 1.6668440103530884, + "learning_rate": 1.9739903500867958e-05, + "loss": 0.6498, + "step": 8203 + }, + { + "epoch": 1.3392514591241174, + "grad_norm": 1.6868622303009033, + "learning_rate": 1.973983159173324e-05, + "loss": 0.6898, + "step": 8204 + }, + { + "epoch": 1.3394147177666218, + "grad_norm": 1.7088543176651, + "learning_rate": 1.973975967279052e-05, + "loss": 0.6725, + "step": 8205 + }, + { + "epoch": 1.3395779764091262, + "grad_norm": 2.2173802852630615, + "learning_rate": 1.9739687744039858e-05, + "loss": 0.8342, + "step": 8206 + }, + { + "epoch": 1.3397412350516307, + "grad_norm": 1.6543817520141602, + "learning_rate": 1.9739615805481337e-05, + "loss": 0.6462, + "step": 8207 + }, + { + "epoch": 1.339904493694135, + "grad_norm": 1.6901558637619019, + "learning_rate": 1.9739543857115022e-05, + "loss": 0.6842, + "step": 8208 + }, + { + "epoch": 1.3400677523366393, + "grad_norm": 1.7962465286254883, + "learning_rate": 1.973947189894099e-05, + "loss": 0.7414, + "step": 8209 + }, + { + "epoch": 1.3402310109791438, + "grad_norm": 2.0246405601501465, + "learning_rate": 1.9739399930959316e-05, + "loss": 0.7772, + "step": 8210 + }, + { + "epoch": 1.340394269621648, + "grad_norm": 1.8361958265304565, + "learning_rate": 1.9739327953170065e-05, + "loss": 0.7662, + "step": 8211 + }, + { + "epoch": 1.3405575282641524, + "grad_norm": 1.6218116283416748, + "learning_rate": 1.973925596557331e-05, + "loss": 0.6298, + "step": 8212 + }, + { + "epoch": 1.3407207869066569, + "grad_norm": 1.82340407371521, + "learning_rate": 1.9739183968169134e-05, + "loss": 0.8959, + "step": 8213 + }, + { + "epoch": 1.3408840455491613, + "grad_norm": 1.682847261428833, + "learning_rate": 1.97391119609576e-05, + "loss": 0.5834, + "step": 8214 + }, + { + "epoch": 1.3410473041916657, + "grad_norm": 1.565724492073059, + "learning_rate": 1.9739039943938784e-05, + "loss": 0.5932, + "step": 8215 + }, + { + "epoch": 1.3412105628341702, + "grad_norm": 1.6493486166000366, + "learning_rate": 1.9738967917112752e-05, + "loss": 0.5719, + "step": 8216 + }, + { + "epoch": 1.3413738214766744, + "grad_norm": 1.4760262966156006, + "learning_rate": 1.9738895880479586e-05, + "loss": 0.5388, + "step": 8217 + }, + { + "epoch": 1.3415370801191788, + "grad_norm": 1.8590553998947144, + "learning_rate": 1.9738823834039354e-05, + "loss": 0.7256, + "step": 8218 + }, + { + "epoch": 1.3417003387616833, + "grad_norm": 1.8090965747833252, + "learning_rate": 1.9738751777792128e-05, + "loss": 0.7672, + "step": 8219 + }, + { + "epoch": 1.3418635974041875, + "grad_norm": 1.9499576091766357, + "learning_rate": 1.9738679711737984e-05, + "loss": 0.7934, + "step": 8220 + }, + { + "epoch": 1.342026856046692, + "grad_norm": 1.5775538682937622, + "learning_rate": 1.9738607635876992e-05, + "loss": 0.7127, + "step": 8221 + }, + { + "epoch": 1.3421901146891964, + "grad_norm": 1.8453025817871094, + "learning_rate": 1.9738535550209224e-05, + "loss": 0.713, + "step": 8222 + }, + { + "epoch": 1.3423533733317008, + "grad_norm": 1.6190437078475952, + "learning_rate": 1.973846345473475e-05, + "loss": 0.6475, + "step": 8223 + }, + { + "epoch": 1.3425166319742052, + "grad_norm": 1.601019024848938, + "learning_rate": 1.973839134945365e-05, + "loss": 0.6121, + "step": 8224 + }, + { + "epoch": 1.3426798906167094, + "grad_norm": 1.6232457160949707, + "learning_rate": 1.973831923436599e-05, + "loss": 0.5755, + "step": 8225 + }, + { + "epoch": 1.3428431492592139, + "grad_norm": 1.4971954822540283, + "learning_rate": 1.9738247109471848e-05, + "loss": 0.5409, + "step": 8226 + }, + { + "epoch": 1.3430064079017183, + "grad_norm": 1.6471303701400757, + "learning_rate": 1.9738174974771288e-05, + "loss": 0.6231, + "step": 8227 + }, + { + "epoch": 1.3431696665442228, + "grad_norm": 1.7278141975402832, + "learning_rate": 1.9738102830264397e-05, + "loss": 0.6721, + "step": 8228 + }, + { + "epoch": 1.343332925186727, + "grad_norm": 2.037161350250244, + "learning_rate": 1.9738030675951233e-05, + "loss": 0.6543, + "step": 8229 + }, + { + "epoch": 1.3434961838292314, + "grad_norm": 1.4915215969085693, + "learning_rate": 1.9737958511831878e-05, + "loss": 0.5908, + "step": 8230 + }, + { + "epoch": 1.3436594424717359, + "grad_norm": 1.5434035062789917, + "learning_rate": 1.9737886337906394e-05, + "loss": 0.5733, + "step": 8231 + }, + { + "epoch": 1.3438227011142403, + "grad_norm": 1.872965693473816, + "learning_rate": 1.973781415417487e-05, + "loss": 0.6923, + "step": 8232 + }, + { + "epoch": 1.3439859597567447, + "grad_norm": 1.7643595933914185, + "learning_rate": 1.9737741960637363e-05, + "loss": 0.6127, + "step": 8233 + }, + { + "epoch": 1.344149218399249, + "grad_norm": 1.779660701751709, + "learning_rate": 1.9737669757293955e-05, + "loss": 0.7227, + "step": 8234 + }, + { + "epoch": 1.3443124770417534, + "grad_norm": 1.5762369632720947, + "learning_rate": 1.9737597544144717e-05, + "loss": 0.664, + "step": 8235 + }, + { + "epoch": 1.3444757356842578, + "grad_norm": 1.7186492681503296, + "learning_rate": 1.9737525321189724e-05, + "loss": 0.7634, + "step": 8236 + }, + { + "epoch": 1.344638994326762, + "grad_norm": 2.5254247188568115, + "learning_rate": 1.973745308842904e-05, + "loss": 0.8074, + "step": 8237 + }, + { + "epoch": 1.3448022529692665, + "grad_norm": 1.7636942863464355, + "learning_rate": 1.9737380845862745e-05, + "loss": 0.6222, + "step": 8238 + }, + { + "epoch": 1.344965511611771, + "grad_norm": 1.6847233772277832, + "learning_rate": 1.973730859349091e-05, + "loss": 0.7141, + "step": 8239 + }, + { + "epoch": 1.3451287702542754, + "grad_norm": 1.8828792572021484, + "learning_rate": 1.973723633131361e-05, + "loss": 0.7007, + "step": 8240 + }, + { + "epoch": 1.3452920288967798, + "grad_norm": 2.0926733016967773, + "learning_rate": 1.9737164059330914e-05, + "loss": 0.7448, + "step": 8241 + }, + { + "epoch": 1.3454552875392842, + "grad_norm": 2.036897897720337, + "learning_rate": 1.9737091777542896e-05, + "loss": 0.7313, + "step": 8242 + }, + { + "epoch": 1.3456185461817884, + "grad_norm": 1.5905730724334717, + "learning_rate": 1.973701948594963e-05, + "loss": 0.6581, + "step": 8243 + }, + { + "epoch": 1.3457818048242929, + "grad_norm": 1.699825644493103, + "learning_rate": 1.973694718455119e-05, + "loss": 0.6663, + "step": 8244 + }, + { + "epoch": 1.3459450634667973, + "grad_norm": 1.9438644647598267, + "learning_rate": 1.973687487334764e-05, + "loss": 0.7834, + "step": 8245 + }, + { + "epoch": 1.3461083221093015, + "grad_norm": 1.8610684871673584, + "learning_rate": 1.9736802552339064e-05, + "loss": 0.5558, + "step": 8246 + }, + { + "epoch": 1.346271580751806, + "grad_norm": 1.677097201347351, + "learning_rate": 1.9736730221525532e-05, + "loss": 0.5716, + "step": 8247 + }, + { + "epoch": 1.3464348393943104, + "grad_norm": 1.4830352067947388, + "learning_rate": 1.9736657880907112e-05, + "loss": 0.6042, + "step": 8248 + }, + { + "epoch": 1.3465980980368149, + "grad_norm": 1.85309636592865, + "learning_rate": 1.973658553048388e-05, + "loss": 0.7583, + "step": 8249 + }, + { + "epoch": 1.3467613566793193, + "grad_norm": 1.703464388847351, + "learning_rate": 1.973651317025591e-05, + "loss": 0.6163, + "step": 8250 + }, + { + "epoch": 1.3469246153218237, + "grad_norm": 1.9636685848236084, + "learning_rate": 1.9736440800223276e-05, + "loss": 0.9122, + "step": 8251 + }, + { + "epoch": 1.347087873964328, + "grad_norm": 1.645098328590393, + "learning_rate": 1.973636842038605e-05, + "loss": 0.6377, + "step": 8252 + }, + { + "epoch": 1.3472511326068324, + "grad_norm": 1.6568094491958618, + "learning_rate": 1.9736296030744296e-05, + "loss": 0.6104, + "step": 8253 + }, + { + "epoch": 1.3474143912493368, + "grad_norm": 1.670772910118103, + "learning_rate": 1.9736223631298103e-05, + "loss": 0.574, + "step": 8254 + }, + { + "epoch": 1.347577649891841, + "grad_norm": 2.1538875102996826, + "learning_rate": 1.9736151222047527e-05, + "loss": 0.7407, + "step": 8255 + }, + { + "epoch": 1.3477409085343455, + "grad_norm": 1.7135804891586304, + "learning_rate": 1.9736078802992654e-05, + "loss": 0.7331, + "step": 8256 + }, + { + "epoch": 1.34790416717685, + "grad_norm": 1.590151309967041, + "learning_rate": 1.9736006374133553e-05, + "loss": 0.6461, + "step": 8257 + }, + { + "epoch": 1.3480674258193543, + "grad_norm": 1.4683552980422974, + "learning_rate": 1.9735933935470294e-05, + "loss": 0.5678, + "step": 8258 + }, + { + "epoch": 1.3482306844618588, + "grad_norm": 1.7317265272140503, + "learning_rate": 1.9735861487002954e-05, + "loss": 0.6819, + "step": 8259 + }, + { + "epoch": 1.3483939431043632, + "grad_norm": 1.912006139755249, + "learning_rate": 1.9735789028731603e-05, + "loss": 0.8791, + "step": 8260 + }, + { + "epoch": 1.3485572017468674, + "grad_norm": 1.7017078399658203, + "learning_rate": 1.9735716560656314e-05, + "loss": 0.559, + "step": 8261 + }, + { + "epoch": 1.3487204603893719, + "grad_norm": 1.7753666639328003, + "learning_rate": 1.9735644082777164e-05, + "loss": 0.7439, + "step": 8262 + }, + { + "epoch": 1.3488837190318763, + "grad_norm": 1.7905590534210205, + "learning_rate": 1.9735571595094224e-05, + "loss": 0.8084, + "step": 8263 + }, + { + "epoch": 1.3490469776743805, + "grad_norm": 2.5920846462249756, + "learning_rate": 1.973549909760756e-05, + "loss": 0.709, + "step": 8264 + }, + { + "epoch": 1.349210236316885, + "grad_norm": 1.9217218160629272, + "learning_rate": 1.9735426590317256e-05, + "loss": 0.7488, + "step": 8265 + }, + { + "epoch": 1.3493734949593894, + "grad_norm": 1.7934809923171997, + "learning_rate": 1.9735354073223378e-05, + "loss": 0.6592, + "step": 8266 + }, + { + "epoch": 1.3495367536018938, + "grad_norm": 1.7547911405563354, + "learning_rate": 1.9735281546326e-05, + "loss": 0.718, + "step": 8267 + }, + { + "epoch": 1.3497000122443983, + "grad_norm": 1.705729603767395, + "learning_rate": 1.97352090096252e-05, + "loss": 0.7656, + "step": 8268 + }, + { + "epoch": 1.3498632708869025, + "grad_norm": 1.8302091360092163, + "learning_rate": 1.9735136463121043e-05, + "loss": 0.7812, + "step": 8269 + }, + { + "epoch": 1.350026529529407, + "grad_norm": 1.8392120599746704, + "learning_rate": 1.9735063906813608e-05, + "loss": 0.8434, + "step": 8270 + }, + { + "epoch": 1.3501897881719114, + "grad_norm": 2.2930102348327637, + "learning_rate": 1.9734991340702966e-05, + "loss": 1.2712, + "step": 8271 + }, + { + "epoch": 1.3503530468144158, + "grad_norm": 1.7212575674057007, + "learning_rate": 1.973491876478919e-05, + "loss": 0.7042, + "step": 8272 + }, + { + "epoch": 1.35051630545692, + "grad_norm": 1.8384109735488892, + "learning_rate": 1.9734846179072352e-05, + "loss": 0.8172, + "step": 8273 + }, + { + "epoch": 1.3506795640994245, + "grad_norm": 1.7737782001495361, + "learning_rate": 1.973477358355253e-05, + "loss": 0.7011, + "step": 8274 + }, + { + "epoch": 1.350842822741929, + "grad_norm": 1.3445665836334229, + "learning_rate": 1.973470097822979e-05, + "loss": 0.5368, + "step": 8275 + }, + { + "epoch": 1.3510060813844333, + "grad_norm": 1.594993233680725, + "learning_rate": 1.9734628363104213e-05, + "loss": 0.6287, + "step": 8276 + }, + { + "epoch": 1.3511693400269378, + "grad_norm": 1.858632206916809, + "learning_rate": 1.9734555738175864e-05, + "loss": 0.8417, + "step": 8277 + }, + { + "epoch": 1.351332598669442, + "grad_norm": 1.6728861331939697, + "learning_rate": 1.9734483103444823e-05, + "loss": 0.6752, + "step": 8278 + }, + { + "epoch": 1.3514958573119464, + "grad_norm": 1.7547121047973633, + "learning_rate": 1.9734410458911158e-05, + "loss": 0.7308, + "step": 8279 + }, + { + "epoch": 1.3516591159544509, + "grad_norm": 1.7454992532730103, + "learning_rate": 1.9734337804574943e-05, + "loss": 0.687, + "step": 8280 + }, + { + "epoch": 1.351822374596955, + "grad_norm": 1.7882388830184937, + "learning_rate": 1.9734265140436256e-05, + "loss": 0.6725, + "step": 8281 + }, + { + "epoch": 1.3519856332394595, + "grad_norm": 2.1416709423065186, + "learning_rate": 1.9734192466495162e-05, + "loss": 0.7862, + "step": 8282 + }, + { + "epoch": 1.352148891881964, + "grad_norm": 1.6497728824615479, + "learning_rate": 1.9734119782751742e-05, + "loss": 0.7424, + "step": 8283 + }, + { + "epoch": 1.3523121505244684, + "grad_norm": 1.6605168581008911, + "learning_rate": 1.9734047089206064e-05, + "loss": 0.7964, + "step": 8284 + }, + { + "epoch": 1.3524754091669728, + "grad_norm": 1.5457677841186523, + "learning_rate": 1.9733974385858205e-05, + "loss": 0.657, + "step": 8285 + }, + { + "epoch": 1.3526386678094773, + "grad_norm": 1.933210015296936, + "learning_rate": 1.9733901672708234e-05, + "loss": 0.79, + "step": 8286 + }, + { + "epoch": 1.3528019264519815, + "grad_norm": 1.7117021083831787, + "learning_rate": 1.973382894975623e-05, + "loss": 0.5921, + "step": 8287 + }, + { + "epoch": 1.352965185094486, + "grad_norm": 1.8295398950576782, + "learning_rate": 1.973375621700226e-05, + "loss": 0.7953, + "step": 8288 + }, + { + "epoch": 1.3531284437369904, + "grad_norm": 1.7313681840896606, + "learning_rate": 1.97336834744464e-05, + "loss": 0.7181, + "step": 8289 + }, + { + "epoch": 1.3532917023794946, + "grad_norm": 1.9124257564544678, + "learning_rate": 1.9733610722088724e-05, + "loss": 0.7201, + "step": 8290 + }, + { + "epoch": 1.353454961021999, + "grad_norm": 1.8623815774917603, + "learning_rate": 1.9733537959929308e-05, + "loss": 0.7329, + "step": 8291 + }, + { + "epoch": 1.3536182196645035, + "grad_norm": 1.6922436952590942, + "learning_rate": 1.973346518796822e-05, + "loss": 0.5764, + "step": 8292 + }, + { + "epoch": 1.353781478307008, + "grad_norm": 1.5830777883529663, + "learning_rate": 1.973339240620553e-05, + "loss": 0.7261, + "step": 8293 + }, + { + "epoch": 1.3539447369495123, + "grad_norm": 1.9336048364639282, + "learning_rate": 1.9733319614641325e-05, + "loss": 0.8334, + "step": 8294 + }, + { + "epoch": 1.3541079955920168, + "grad_norm": 1.6308181285858154, + "learning_rate": 1.9733246813275664e-05, + "loss": 0.7336, + "step": 8295 + }, + { + "epoch": 1.354271254234521, + "grad_norm": 1.9394677877426147, + "learning_rate": 1.9733174002108626e-05, + "loss": 0.7514, + "step": 8296 + }, + { + "epoch": 1.3544345128770254, + "grad_norm": 1.7411705255508423, + "learning_rate": 1.9733101181140285e-05, + "loss": 0.6415, + "step": 8297 + }, + { + "epoch": 1.3545977715195299, + "grad_norm": 1.7820934057235718, + "learning_rate": 1.9733028350370715e-05, + "loss": 0.7054, + "step": 8298 + }, + { + "epoch": 1.354761030162034, + "grad_norm": 1.9797648191452026, + "learning_rate": 1.973295550979999e-05, + "loss": 0.8253, + "step": 8299 + }, + { + "epoch": 1.3549242888045385, + "grad_norm": 1.7091258764266968, + "learning_rate": 1.973288265942818e-05, + "loss": 0.7096, + "step": 8300 + }, + { + "epoch": 1.355087547447043, + "grad_norm": 2.037309169769287, + "learning_rate": 1.9732809799255356e-05, + "loss": 0.6262, + "step": 8301 + }, + { + "epoch": 1.3552508060895474, + "grad_norm": 1.6845673322677612, + "learning_rate": 1.9732736929281598e-05, + "loss": 0.6478, + "step": 8302 + }, + { + "epoch": 1.3554140647320518, + "grad_norm": 1.7483186721801758, + "learning_rate": 1.9732664049506975e-05, + "loss": 0.6602, + "step": 8303 + }, + { + "epoch": 1.3555773233745563, + "grad_norm": 1.6480823755264282, + "learning_rate": 1.9732591159931564e-05, + "loss": 0.7562, + "step": 8304 + }, + { + "epoch": 1.3557405820170605, + "grad_norm": 2.142760753631592, + "learning_rate": 1.9732518260555436e-05, + "loss": 0.7359, + "step": 8305 + }, + { + "epoch": 1.355903840659565, + "grad_norm": 1.6228832006454468, + "learning_rate": 1.9732445351378663e-05, + "loss": 0.6438, + "step": 8306 + }, + { + "epoch": 1.3560670993020694, + "grad_norm": 2.149221420288086, + "learning_rate": 1.9732372432401322e-05, + "loss": 0.901, + "step": 8307 + }, + { + "epoch": 1.3562303579445736, + "grad_norm": 2.020387649536133, + "learning_rate": 1.9732299503623484e-05, + "loss": 0.6961, + "step": 8308 + }, + { + "epoch": 1.356393616587078, + "grad_norm": 1.8501379489898682, + "learning_rate": 1.973222656504522e-05, + "loss": 0.8253, + "step": 8309 + }, + { + "epoch": 1.3565568752295825, + "grad_norm": 1.7977896928787231, + "learning_rate": 1.9732153616666608e-05, + "loss": 0.7699, + "step": 8310 + }, + { + "epoch": 1.356720133872087, + "grad_norm": 1.849785327911377, + "learning_rate": 1.9732080658487725e-05, + "loss": 0.699, + "step": 8311 + }, + { + "epoch": 1.3568833925145913, + "grad_norm": 1.7371591329574585, + "learning_rate": 1.9732007690508634e-05, + "loss": 0.6855, + "step": 8312 + }, + { + "epoch": 1.3570466511570956, + "grad_norm": 1.8989543914794922, + "learning_rate": 1.9731934712729415e-05, + "loss": 0.7579, + "step": 8313 + }, + { + "epoch": 1.3572099097996, + "grad_norm": 1.6754093170166016, + "learning_rate": 1.973186172515014e-05, + "loss": 0.8034, + "step": 8314 + }, + { + "epoch": 1.3573731684421044, + "grad_norm": 1.8184170722961426, + "learning_rate": 1.9731788727770885e-05, + "loss": 0.7807, + "step": 8315 + }, + { + "epoch": 1.3575364270846089, + "grad_norm": 1.8979014158248901, + "learning_rate": 1.973171572059172e-05, + "loss": 0.7901, + "step": 8316 + }, + { + "epoch": 1.357699685727113, + "grad_norm": 1.8058786392211914, + "learning_rate": 1.973164270361272e-05, + "loss": 0.6959, + "step": 8317 + }, + { + "epoch": 1.3578629443696175, + "grad_norm": 1.9051337242126465, + "learning_rate": 1.9731569676833956e-05, + "loss": 0.682, + "step": 8318 + }, + { + "epoch": 1.358026203012122, + "grad_norm": 1.8176357746124268, + "learning_rate": 1.9731496640255506e-05, + "loss": 0.724, + "step": 8319 + }, + { + "epoch": 1.3581894616546264, + "grad_norm": 1.7155450582504272, + "learning_rate": 1.973142359387744e-05, + "loss": 0.7476, + "step": 8320 + }, + { + "epoch": 1.3583527202971308, + "grad_norm": 2.5844178199768066, + "learning_rate": 1.9731350537699834e-05, + "loss": 0.723, + "step": 8321 + }, + { + "epoch": 1.358515978939635, + "grad_norm": 1.807054042816162, + "learning_rate": 1.973127747172276e-05, + "loss": 0.8197, + "step": 8322 + }, + { + "epoch": 1.3586792375821395, + "grad_norm": 1.4998122453689575, + "learning_rate": 1.9731204395946296e-05, + "loss": 0.7217, + "step": 8323 + }, + { + "epoch": 1.358842496224644, + "grad_norm": 1.4987891912460327, + "learning_rate": 1.9731131310370507e-05, + "loss": 0.6774, + "step": 8324 + }, + { + "epoch": 1.3590057548671481, + "grad_norm": 2.301417589187622, + "learning_rate": 1.9731058214995475e-05, + "loss": 0.7901, + "step": 8325 + }, + { + "epoch": 1.3591690135096526, + "grad_norm": 1.8290350437164307, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.8052, + "step": 8326 + }, + { + "epoch": 1.359332272152157, + "grad_norm": 1.6052716970443726, + "learning_rate": 1.973091199484796e-05, + "loss": 0.6198, + "step": 8327 + }, + { + "epoch": 1.3594955307946615, + "grad_norm": 1.7550108432769775, + "learning_rate": 1.973083887007563e-05, + "loss": 0.7344, + "step": 8328 + }, + { + "epoch": 1.359658789437166, + "grad_norm": 1.6170223951339722, + "learning_rate": 1.9730765735504345e-05, + "loss": 0.5764, + "step": 8329 + }, + { + "epoch": 1.3598220480796703, + "grad_norm": 1.6686460971832275, + "learning_rate": 1.9730692591134182e-05, + "loss": 0.6056, + "step": 8330 + }, + { + "epoch": 1.3599853067221745, + "grad_norm": 1.6460363864898682, + "learning_rate": 1.9730619436965215e-05, + "loss": 0.6323, + "step": 8331 + }, + { + "epoch": 1.360148565364679, + "grad_norm": 1.4896211624145508, + "learning_rate": 1.9730546272997516e-05, + "loss": 0.5703, + "step": 8332 + }, + { + "epoch": 1.3603118240071834, + "grad_norm": 1.9263339042663574, + "learning_rate": 1.973047309923116e-05, + "loss": 0.6436, + "step": 8333 + }, + { + "epoch": 1.3604750826496876, + "grad_norm": 1.7282179594039917, + "learning_rate": 1.973039991566622e-05, + "loss": 0.6072, + "step": 8334 + }, + { + "epoch": 1.360638341292192, + "grad_norm": 1.7295589447021484, + "learning_rate": 1.973032672230277e-05, + "loss": 0.6472, + "step": 8335 + }, + { + "epoch": 1.3608015999346965, + "grad_norm": 1.7596914768218994, + "learning_rate": 1.973025351914088e-05, + "loss": 0.7705, + "step": 8336 + }, + { + "epoch": 1.360964858577201, + "grad_norm": 2.0013060569763184, + "learning_rate": 1.973018030618063e-05, + "loss": 0.6773, + "step": 8337 + }, + { + "epoch": 1.3611281172197054, + "grad_norm": 1.6858500242233276, + "learning_rate": 1.9730107083422095e-05, + "loss": 0.7056, + "step": 8338 + }, + { + "epoch": 1.3612913758622098, + "grad_norm": 1.834686517715454, + "learning_rate": 1.973003385086534e-05, + "loss": 0.6982, + "step": 8339 + }, + { + "epoch": 1.361454634504714, + "grad_norm": 1.5939444303512573, + "learning_rate": 1.9729960608510444e-05, + "loss": 0.6232, + "step": 8340 + }, + { + "epoch": 1.3616178931472185, + "grad_norm": 2.04644775390625, + "learning_rate": 1.972988735635748e-05, + "loss": 0.6972, + "step": 8341 + }, + { + "epoch": 1.361781151789723, + "grad_norm": 1.950292944908142, + "learning_rate": 1.9729814094406525e-05, + "loss": 0.6302, + "step": 8342 + }, + { + "epoch": 1.3619444104322271, + "grad_norm": 1.7416141033172607, + "learning_rate": 1.9729740822657648e-05, + "loss": 0.5998, + "step": 8343 + }, + { + "epoch": 1.3621076690747316, + "grad_norm": 1.8044193983078003, + "learning_rate": 1.9729667541110923e-05, + "loss": 0.6572, + "step": 8344 + }, + { + "epoch": 1.362270927717236, + "grad_norm": 2.0061612129211426, + "learning_rate": 1.972959424976643e-05, + "loss": 0.678, + "step": 8345 + }, + { + "epoch": 1.3624341863597405, + "grad_norm": 2.8036489486694336, + "learning_rate": 1.9729520948624232e-05, + "loss": 0.7473, + "step": 8346 + }, + { + "epoch": 1.362597445002245, + "grad_norm": 2.05035138130188, + "learning_rate": 1.972944763768441e-05, + "loss": 0.7647, + "step": 8347 + }, + { + "epoch": 1.3627607036447493, + "grad_norm": 1.7214281558990479, + "learning_rate": 1.972937431694704e-05, + "loss": 0.6989, + "step": 8348 + }, + { + "epoch": 1.3629239622872535, + "grad_norm": 1.5661144256591797, + "learning_rate": 1.9729300986412193e-05, + "loss": 0.5184, + "step": 8349 + }, + { + "epoch": 1.363087220929758, + "grad_norm": 1.9768506288528442, + "learning_rate": 1.9729227646079942e-05, + "loss": 0.7588, + "step": 8350 + }, + { + "epoch": 1.3632504795722624, + "grad_norm": 2.1065573692321777, + "learning_rate": 1.972915429595036e-05, + "loss": 0.7968, + "step": 8351 + }, + { + "epoch": 1.3634137382147666, + "grad_norm": 1.6749401092529297, + "learning_rate": 1.9729080936023522e-05, + "loss": 0.6017, + "step": 8352 + }, + { + "epoch": 1.363576996857271, + "grad_norm": 1.9006191492080688, + "learning_rate": 1.97290075662995e-05, + "loss": 0.7278, + "step": 8353 + }, + { + "epoch": 1.3637402554997755, + "grad_norm": 1.978261947631836, + "learning_rate": 1.9728934186778374e-05, + "loss": 0.7953, + "step": 8354 + }, + { + "epoch": 1.36390351414228, + "grad_norm": 1.6980211734771729, + "learning_rate": 1.9728860797460213e-05, + "loss": 0.6293, + "step": 8355 + }, + { + "epoch": 1.3640667727847844, + "grad_norm": 1.7803338766098022, + "learning_rate": 1.972878739834509e-05, + "loss": 0.7124, + "step": 8356 + }, + { + "epoch": 1.3642300314272886, + "grad_norm": 2.0836610794067383, + "learning_rate": 1.9728713989433082e-05, + "loss": 0.7916, + "step": 8357 + }, + { + "epoch": 1.364393290069793, + "grad_norm": 2.059305429458618, + "learning_rate": 1.972864057072426e-05, + "loss": 0.7813, + "step": 8358 + }, + { + "epoch": 1.3645565487122975, + "grad_norm": 1.6354259252548218, + "learning_rate": 1.9728567142218705e-05, + "loss": 0.6936, + "step": 8359 + }, + { + "epoch": 1.364719807354802, + "grad_norm": 1.8395733833312988, + "learning_rate": 1.972849370391648e-05, + "loss": 0.6993, + "step": 8360 + }, + { + "epoch": 1.3648830659973061, + "grad_norm": 1.5453143119812012, + "learning_rate": 1.9728420255817666e-05, + "loss": 0.5523, + "step": 8361 + }, + { + "epoch": 1.3650463246398106, + "grad_norm": 1.8116971254348755, + "learning_rate": 1.9728346797922334e-05, + "loss": 0.5959, + "step": 8362 + }, + { + "epoch": 1.365209583282315, + "grad_norm": 1.4897252321243286, + "learning_rate": 1.972827333023056e-05, + "loss": 0.5545, + "step": 8363 + }, + { + "epoch": 1.3653728419248194, + "grad_norm": 2.1675009727478027, + "learning_rate": 1.972819985274242e-05, + "loss": 0.709, + "step": 8364 + }, + { + "epoch": 1.3655361005673239, + "grad_norm": 1.7929800748825073, + "learning_rate": 1.9728126365457982e-05, + "loss": 0.7585, + "step": 8365 + }, + { + "epoch": 1.365699359209828, + "grad_norm": 1.8542165756225586, + "learning_rate": 1.9728052868377324e-05, + "loss": 0.6729, + "step": 8366 + }, + { + "epoch": 1.3658626178523325, + "grad_norm": 1.7181657552719116, + "learning_rate": 1.972797936150052e-05, + "loss": 0.6551, + "step": 8367 + }, + { + "epoch": 1.366025876494837, + "grad_norm": 1.775887131690979, + "learning_rate": 1.9727905844827644e-05, + "loss": 0.7949, + "step": 8368 + }, + { + "epoch": 1.3661891351373414, + "grad_norm": 1.7418924570083618, + "learning_rate": 1.9727832318358765e-05, + "loss": 0.5483, + "step": 8369 + }, + { + "epoch": 1.3663523937798456, + "grad_norm": 1.5054987668991089, + "learning_rate": 1.972775878209397e-05, + "loss": 0.6097, + "step": 8370 + }, + { + "epoch": 1.36651565242235, + "grad_norm": 2.1408700942993164, + "learning_rate": 1.9727685236033317e-05, + "loss": 0.8409, + "step": 8371 + }, + { + "epoch": 1.3666789110648545, + "grad_norm": 1.7996951341629028, + "learning_rate": 1.972761168017689e-05, + "loss": 0.6961, + "step": 8372 + }, + { + "epoch": 1.366842169707359, + "grad_norm": 1.6046819686889648, + "learning_rate": 1.972753811452476e-05, + "loss": 0.589, + "step": 8373 + }, + { + "epoch": 1.3670054283498634, + "grad_norm": 1.8360761404037476, + "learning_rate": 1.9727464539077004e-05, + "loss": 0.6544, + "step": 8374 + }, + { + "epoch": 1.3671686869923676, + "grad_norm": 1.8914958238601685, + "learning_rate": 1.972739095383369e-05, + "loss": 0.6764, + "step": 8375 + }, + { + "epoch": 1.367331945634872, + "grad_norm": 1.4578981399536133, + "learning_rate": 1.97273173587949e-05, + "loss": 0.5745, + "step": 8376 + }, + { + "epoch": 1.3674952042773765, + "grad_norm": 1.8077691793441772, + "learning_rate": 1.97272437539607e-05, + "loss": 0.6334, + "step": 8377 + }, + { + "epoch": 1.3676584629198807, + "grad_norm": 1.7290151119232178, + "learning_rate": 1.972717013933117e-05, + "loss": 0.6115, + "step": 8378 + }, + { + "epoch": 1.3678217215623851, + "grad_norm": 1.860940933227539, + "learning_rate": 1.9727096514906384e-05, + "loss": 0.7351, + "step": 8379 + }, + { + "epoch": 1.3679849802048896, + "grad_norm": 1.5859614610671997, + "learning_rate": 1.9727022880686413e-05, + "loss": 0.6391, + "step": 8380 + }, + { + "epoch": 1.368148238847394, + "grad_norm": 1.8746851682662964, + "learning_rate": 1.9726949236671332e-05, + "loss": 0.8745, + "step": 8381 + }, + { + "epoch": 1.3683114974898984, + "grad_norm": 1.9152168035507202, + "learning_rate": 1.9726875582861217e-05, + "loss": 0.7353, + "step": 8382 + }, + { + "epoch": 1.3684747561324029, + "grad_norm": 1.7264350652694702, + "learning_rate": 1.9726801919256138e-05, + "loss": 0.633, + "step": 8383 + }, + { + "epoch": 1.368638014774907, + "grad_norm": 1.8237354755401611, + "learning_rate": 1.9726728245856174e-05, + "loss": 0.8483, + "step": 8384 + }, + { + "epoch": 1.3688012734174115, + "grad_norm": 1.8498300313949585, + "learning_rate": 1.9726654562661398e-05, + "loss": 0.6789, + "step": 8385 + }, + { + "epoch": 1.368964532059916, + "grad_norm": 1.6092272996902466, + "learning_rate": 1.9726580869671882e-05, + "loss": 0.6186, + "step": 8386 + }, + { + "epoch": 1.3691277907024202, + "grad_norm": 1.704261302947998, + "learning_rate": 1.9726507166887704e-05, + "loss": 0.5943, + "step": 8387 + }, + { + "epoch": 1.3692910493449246, + "grad_norm": 1.932511806488037, + "learning_rate": 1.9726433454308932e-05, + "loss": 0.7102, + "step": 8388 + }, + { + "epoch": 1.369454307987429, + "grad_norm": 1.6476396322250366, + "learning_rate": 1.9726359731935646e-05, + "loss": 0.6825, + "step": 8389 + }, + { + "epoch": 1.3696175666299335, + "grad_norm": 1.7646197080612183, + "learning_rate": 1.972628599976792e-05, + "loss": 0.6763, + "step": 8390 + }, + { + "epoch": 1.369780825272438, + "grad_norm": 1.9146653413772583, + "learning_rate": 1.9726212257805825e-05, + "loss": 0.6274, + "step": 8391 + }, + { + "epoch": 1.3699440839149424, + "grad_norm": 2.177828311920166, + "learning_rate": 1.9726138506049438e-05, + "loss": 0.8401, + "step": 8392 + }, + { + "epoch": 1.3701073425574466, + "grad_norm": 1.65445077419281, + "learning_rate": 1.972606474449883e-05, + "loss": 0.7258, + "step": 8393 + }, + { + "epoch": 1.370270601199951, + "grad_norm": 1.7009174823760986, + "learning_rate": 1.972599097315408e-05, + "loss": 0.63, + "step": 8394 + }, + { + "epoch": 1.3704338598424555, + "grad_norm": 1.6280282735824585, + "learning_rate": 1.9725917192015254e-05, + "loss": 0.6838, + "step": 8395 + }, + { + "epoch": 1.3705971184849597, + "grad_norm": 1.4994230270385742, + "learning_rate": 1.9725843401082438e-05, + "loss": 0.6918, + "step": 8396 + }, + { + "epoch": 1.3707603771274641, + "grad_norm": 1.7638756036758423, + "learning_rate": 1.97257696003557e-05, + "loss": 0.6169, + "step": 8397 + }, + { + "epoch": 1.3709236357699686, + "grad_norm": 1.8542600870132446, + "learning_rate": 1.9725695789835112e-05, + "loss": 0.6576, + "step": 8398 + }, + { + "epoch": 1.371086894412473, + "grad_norm": 1.7300984859466553, + "learning_rate": 1.9725621969520753e-05, + "loss": 0.598, + "step": 8399 + }, + { + "epoch": 1.3712501530549774, + "grad_norm": 1.796614170074463, + "learning_rate": 1.9725548139412693e-05, + "loss": 0.5613, + "step": 8400 + }, + { + "epoch": 1.3714134116974817, + "grad_norm": 1.8692671060562134, + "learning_rate": 1.9725474299511008e-05, + "loss": 0.6775, + "step": 8401 + }, + { + "epoch": 1.371576670339986, + "grad_norm": 1.8673346042633057, + "learning_rate": 1.972540044981578e-05, + "loss": 0.6933, + "step": 8402 + }, + { + "epoch": 1.3717399289824905, + "grad_norm": 1.405704379081726, + "learning_rate": 1.9725326590327066e-05, + "loss": 0.538, + "step": 8403 + }, + { + "epoch": 1.371903187624995, + "grad_norm": 1.8175907135009766, + "learning_rate": 1.9725252721044956e-05, + "loss": 0.6977, + "step": 8404 + }, + { + "epoch": 1.3720664462674992, + "grad_norm": 1.5783928632736206, + "learning_rate": 1.9725178841969522e-05, + "loss": 0.7136, + "step": 8405 + }, + { + "epoch": 1.3722297049100036, + "grad_norm": 1.7211568355560303, + "learning_rate": 1.972510495310083e-05, + "loss": 0.6778, + "step": 8406 + }, + { + "epoch": 1.372392963552508, + "grad_norm": 1.9487515687942505, + "learning_rate": 1.972503105443896e-05, + "loss": 0.7074, + "step": 8407 + }, + { + "epoch": 1.3725562221950125, + "grad_norm": 2.003492593765259, + "learning_rate": 1.972495714598399e-05, + "loss": 1.4301, + "step": 8408 + }, + { + "epoch": 1.372719480837517, + "grad_norm": 1.524784803390503, + "learning_rate": 1.972488322773599e-05, + "loss": 0.592, + "step": 8409 + }, + { + "epoch": 1.3728827394800212, + "grad_norm": 2.12522554397583, + "learning_rate": 1.9724809299695035e-05, + "loss": 0.718, + "step": 8410 + }, + { + "epoch": 1.3730459981225256, + "grad_norm": 1.7248750925064087, + "learning_rate": 1.97247353618612e-05, + "loss": 0.6978, + "step": 8411 + }, + { + "epoch": 1.37320925676503, + "grad_norm": 1.4759026765823364, + "learning_rate": 1.9724661414234556e-05, + "loss": 0.5639, + "step": 8412 + }, + { + "epoch": 1.3733725154075345, + "grad_norm": 1.5456266403198242, + "learning_rate": 1.9724587456815184e-05, + "loss": 0.5608, + "step": 8413 + }, + { + "epoch": 1.3735357740500387, + "grad_norm": 1.664540410041809, + "learning_rate": 1.9724513489603153e-05, + "loss": 0.6107, + "step": 8414 + }, + { + "epoch": 1.3736990326925431, + "grad_norm": 1.495896816253662, + "learning_rate": 1.972443951259854e-05, + "loss": 0.5233, + "step": 8415 + }, + { + "epoch": 1.3738622913350476, + "grad_norm": 1.6689540147781372, + "learning_rate": 1.972436552580142e-05, + "loss": 0.7058, + "step": 8416 + }, + { + "epoch": 1.374025549977552, + "grad_norm": 1.6329103708267212, + "learning_rate": 1.9724291529211866e-05, + "loss": 0.6715, + "step": 8417 + }, + { + "epoch": 1.3741888086200564, + "grad_norm": 1.9518340826034546, + "learning_rate": 1.972421752282995e-05, + "loss": 0.6714, + "step": 8418 + }, + { + "epoch": 1.3743520672625607, + "grad_norm": 2.0537843704223633, + "learning_rate": 1.9724143506655757e-05, + "loss": 0.8565, + "step": 8419 + }, + { + "epoch": 1.374515325905065, + "grad_norm": 1.83000910282135, + "learning_rate": 1.9724069480689346e-05, + "loss": 0.6048, + "step": 8420 + }, + { + "epoch": 1.3746785845475695, + "grad_norm": 2.023005962371826, + "learning_rate": 1.9723995444930804e-05, + "loss": 0.7768, + "step": 8421 + }, + { + "epoch": 1.3748418431900737, + "grad_norm": 1.9521901607513428, + "learning_rate": 1.97239213993802e-05, + "loss": 0.7615, + "step": 8422 + }, + { + "epoch": 1.3750051018325782, + "grad_norm": 2.0240862369537354, + "learning_rate": 1.972384734403761e-05, + "loss": 0.84, + "step": 8423 + }, + { + "epoch": 1.3751683604750826, + "grad_norm": 1.9210774898529053, + "learning_rate": 1.9723773278903106e-05, + "loss": 0.7791, + "step": 8424 + }, + { + "epoch": 1.375331619117587, + "grad_norm": 1.6599315404891968, + "learning_rate": 1.9723699203976768e-05, + "loss": 0.6974, + "step": 8425 + }, + { + "epoch": 1.3754948777600915, + "grad_norm": 1.7277590036392212, + "learning_rate": 1.9723625119258668e-05, + "loss": 0.6386, + "step": 8426 + }, + { + "epoch": 1.375658136402596, + "grad_norm": 1.7206592559814453, + "learning_rate": 1.9723551024748878e-05, + "loss": 0.6766, + "step": 8427 + }, + { + "epoch": 1.3758213950451001, + "grad_norm": 1.845370888710022, + "learning_rate": 1.9723476920447474e-05, + "loss": 0.6321, + "step": 8428 + }, + { + "epoch": 1.3759846536876046, + "grad_norm": 1.9309136867523193, + "learning_rate": 1.9723402806354533e-05, + "loss": 0.796, + "step": 8429 + }, + { + "epoch": 1.376147912330109, + "grad_norm": 1.4519853591918945, + "learning_rate": 1.9723328682470127e-05, + "loss": 0.5607, + "step": 8430 + }, + { + "epoch": 1.3763111709726132, + "grad_norm": 1.7722735404968262, + "learning_rate": 1.972325454879433e-05, + "loss": 0.7211, + "step": 8431 + }, + { + "epoch": 1.3764744296151177, + "grad_norm": 1.5729968547821045, + "learning_rate": 1.9723180405327224e-05, + "loss": 0.596, + "step": 8432 + }, + { + "epoch": 1.3766376882576221, + "grad_norm": 1.8054226636886597, + "learning_rate": 1.972310625206887e-05, + "loss": 0.675, + "step": 8433 + }, + { + "epoch": 1.3768009469001266, + "grad_norm": 1.6092917919158936, + "learning_rate": 1.9723032089019356e-05, + "loss": 0.6061, + "step": 8434 + }, + { + "epoch": 1.376964205542631, + "grad_norm": 1.646209478378296, + "learning_rate": 1.972295791617875e-05, + "loss": 0.7091, + "step": 8435 + }, + { + "epoch": 1.3771274641851354, + "grad_norm": 1.661869764328003, + "learning_rate": 1.9722883733547128e-05, + "loss": 0.6502, + "step": 8436 + }, + { + "epoch": 1.3772907228276396, + "grad_norm": 2.031036853790283, + "learning_rate": 1.9722809541124567e-05, + "loss": 0.6234, + "step": 8437 + }, + { + "epoch": 1.377453981470144, + "grad_norm": 1.404412865638733, + "learning_rate": 1.9722735338911136e-05, + "loss": 0.5742, + "step": 8438 + }, + { + "epoch": 1.3776172401126485, + "grad_norm": 1.6166009902954102, + "learning_rate": 1.9722661126906915e-05, + "loss": 0.6311, + "step": 8439 + }, + { + "epoch": 1.3777804987551527, + "grad_norm": 1.8461928367614746, + "learning_rate": 1.9722586905111975e-05, + "loss": 0.7471, + "step": 8440 + }, + { + "epoch": 1.3779437573976572, + "grad_norm": 1.308459758758545, + "learning_rate": 1.9722512673526392e-05, + "loss": 0.5337, + "step": 8441 + }, + { + "epoch": 1.3781070160401616, + "grad_norm": 1.8069099187850952, + "learning_rate": 1.9722438432150243e-05, + "loss": 0.6939, + "step": 8442 + }, + { + "epoch": 1.378270274682666, + "grad_norm": 1.8021799325942993, + "learning_rate": 1.9722364180983603e-05, + "loss": 0.636, + "step": 8443 + }, + { + "epoch": 1.3784335333251705, + "grad_norm": 1.7545087337493896, + "learning_rate": 1.9722289920026543e-05, + "loss": 0.606, + "step": 8444 + }, + { + "epoch": 1.3785967919676747, + "grad_norm": 1.8423954248428345, + "learning_rate": 1.9722215649279136e-05, + "loss": 0.8203, + "step": 8445 + }, + { + "epoch": 1.3787600506101791, + "grad_norm": 1.6208566427230835, + "learning_rate": 1.9722141368741466e-05, + "loss": 0.6257, + "step": 8446 + }, + { + "epoch": 1.3789233092526836, + "grad_norm": 1.9168658256530762, + "learning_rate": 1.97220670784136e-05, + "loss": 0.7611, + "step": 8447 + }, + { + "epoch": 1.379086567895188, + "grad_norm": 1.3439618349075317, + "learning_rate": 1.9721992778295617e-05, + "loss": 0.427, + "step": 8448 + }, + { + "epoch": 1.3792498265376922, + "grad_norm": 1.9709808826446533, + "learning_rate": 1.9721918468387588e-05, + "loss": 0.7794, + "step": 8449 + }, + { + "epoch": 1.3794130851801967, + "grad_norm": 2.0591931343078613, + "learning_rate": 1.972184414868959e-05, + "loss": 0.7943, + "step": 8450 + }, + { + "epoch": 1.3795763438227011, + "grad_norm": 2.0306127071380615, + "learning_rate": 1.9721769819201694e-05, + "loss": 0.7248, + "step": 8451 + }, + { + "epoch": 1.3797396024652056, + "grad_norm": 2.016033887863159, + "learning_rate": 1.9721695479923983e-05, + "loss": 0.7622, + "step": 8452 + }, + { + "epoch": 1.37990286110771, + "grad_norm": 1.3038848638534546, + "learning_rate": 1.9721621130856526e-05, + "loss": 0.5002, + "step": 8453 + }, + { + "epoch": 1.3800661197502142, + "grad_norm": 1.81847083568573, + "learning_rate": 1.9721546771999398e-05, + "loss": 0.7565, + "step": 8454 + }, + { + "epoch": 1.3802293783927186, + "grad_norm": 1.5064761638641357, + "learning_rate": 1.9721472403352676e-05, + "loss": 0.6592, + "step": 8455 + }, + { + "epoch": 1.380392637035223, + "grad_norm": 1.9900223016738892, + "learning_rate": 1.9721398024916435e-05, + "loss": 0.8305, + "step": 8456 + }, + { + "epoch": 1.3805558956777275, + "grad_norm": 2.13301420211792, + "learning_rate": 1.9721323636690746e-05, + "loss": 0.8289, + "step": 8457 + }, + { + "epoch": 1.3807191543202317, + "grad_norm": 2.0816996097564697, + "learning_rate": 1.9721249238675688e-05, + "loss": 0.8236, + "step": 8458 + }, + { + "epoch": 1.3808824129627362, + "grad_norm": 1.7025365829467773, + "learning_rate": 1.9721174830871334e-05, + "loss": 0.7274, + "step": 8459 + }, + { + "epoch": 1.3810456716052406, + "grad_norm": 1.9764474630355835, + "learning_rate": 1.9721100413277763e-05, + "loss": 0.6989, + "step": 8460 + }, + { + "epoch": 1.381208930247745, + "grad_norm": 2.0128912925720215, + "learning_rate": 1.9721025985895042e-05, + "loss": 0.7921, + "step": 8461 + }, + { + "epoch": 1.3813721888902495, + "grad_norm": 1.8539080619812012, + "learning_rate": 1.9720951548723253e-05, + "loss": 0.8018, + "step": 8462 + }, + { + "epoch": 1.3815354475327537, + "grad_norm": 1.7069453001022339, + "learning_rate": 1.9720877101762467e-05, + "loss": 0.6945, + "step": 8463 + }, + { + "epoch": 1.3816987061752581, + "grad_norm": 1.773798942565918, + "learning_rate": 1.9720802645012762e-05, + "loss": 0.7168, + "step": 8464 + }, + { + "epoch": 1.3818619648177626, + "grad_norm": 1.5558085441589355, + "learning_rate": 1.9720728178474208e-05, + "loss": 0.6448, + "step": 8465 + }, + { + "epoch": 1.3820252234602668, + "grad_norm": 1.868607997894287, + "learning_rate": 1.9720653702146885e-05, + "loss": 0.742, + "step": 8466 + }, + { + "epoch": 1.3821884821027712, + "grad_norm": 1.3872383832931519, + "learning_rate": 1.972057921603087e-05, + "loss": 0.524, + "step": 8467 + }, + { + "epoch": 1.3823517407452757, + "grad_norm": 1.7063668966293335, + "learning_rate": 1.972050472012623e-05, + "loss": 0.7065, + "step": 8468 + }, + { + "epoch": 1.38251499938778, + "grad_norm": 1.5860522985458374, + "learning_rate": 1.9720430214433045e-05, + "loss": 0.6801, + "step": 8469 + }, + { + "epoch": 1.3826782580302845, + "grad_norm": 1.6875216960906982, + "learning_rate": 1.9720355698951387e-05, + "loss": 0.6728, + "step": 8470 + }, + { + "epoch": 1.382841516672789, + "grad_norm": 1.8280620574951172, + "learning_rate": 1.972028117368134e-05, + "loss": 0.7737, + "step": 8471 + }, + { + "epoch": 1.3830047753152932, + "grad_norm": 1.9733465909957886, + "learning_rate": 1.972020663862297e-05, + "loss": 0.7799, + "step": 8472 + }, + { + "epoch": 1.3831680339577976, + "grad_norm": 1.6831047534942627, + "learning_rate": 1.972013209377635e-05, + "loss": 0.6566, + "step": 8473 + }, + { + "epoch": 1.383331292600302, + "grad_norm": 2.041114330291748, + "learning_rate": 1.9720057539141563e-05, + "loss": 0.8495, + "step": 8474 + }, + { + "epoch": 1.3834945512428063, + "grad_norm": 1.895912528038025, + "learning_rate": 1.971998297471868e-05, + "loss": 0.7798, + "step": 8475 + }, + { + "epoch": 1.3836578098853107, + "grad_norm": 1.6067018508911133, + "learning_rate": 1.9719908400507778e-05, + "loss": 0.6908, + "step": 8476 + }, + { + "epoch": 1.3838210685278152, + "grad_norm": 1.7075824737548828, + "learning_rate": 1.971983381650893e-05, + "loss": 0.6489, + "step": 8477 + }, + { + "epoch": 1.3839843271703196, + "grad_norm": 1.7513080835342407, + "learning_rate": 1.971975922272221e-05, + "loss": 0.8004, + "step": 8478 + }, + { + "epoch": 1.384147585812824, + "grad_norm": 2.122152805328369, + "learning_rate": 1.9719684619147698e-05, + "loss": 0.7697, + "step": 8479 + }, + { + "epoch": 1.3843108444553285, + "grad_norm": 1.645156979560852, + "learning_rate": 1.9719610005785466e-05, + "loss": 0.6356, + "step": 8480 + }, + { + "epoch": 1.3844741030978327, + "grad_norm": 1.6937166452407837, + "learning_rate": 1.9719535382635586e-05, + "loss": 0.8289, + "step": 8481 + }, + { + "epoch": 1.3846373617403371, + "grad_norm": 1.561585545539856, + "learning_rate": 1.9719460749698142e-05, + "loss": 0.5439, + "step": 8482 + }, + { + "epoch": 1.3848006203828416, + "grad_norm": 1.5185095071792603, + "learning_rate": 1.97193861069732e-05, + "loss": 0.5596, + "step": 8483 + }, + { + "epoch": 1.3849638790253458, + "grad_norm": 1.5849449634552002, + "learning_rate": 1.9719311454460836e-05, + "loss": 0.5882, + "step": 8484 + }, + { + "epoch": 1.3851271376678502, + "grad_norm": 1.8550829887390137, + "learning_rate": 1.971923679216113e-05, + "loss": 0.663, + "step": 8485 + }, + { + "epoch": 1.3852903963103547, + "grad_norm": 1.7792022228240967, + "learning_rate": 1.9719162120074156e-05, + "loss": 0.8061, + "step": 8486 + }, + { + "epoch": 1.385453654952859, + "grad_norm": 1.7230370044708252, + "learning_rate": 1.971908743819999e-05, + "loss": 0.5245, + "step": 8487 + }, + { + "epoch": 1.3856169135953635, + "grad_norm": 1.7696950435638428, + "learning_rate": 1.9719012746538704e-05, + "loss": 0.7117, + "step": 8488 + }, + { + "epoch": 1.385780172237868, + "grad_norm": 1.7883312702178955, + "learning_rate": 1.9718938045090375e-05, + "loss": 0.7331, + "step": 8489 + }, + { + "epoch": 1.3859434308803722, + "grad_norm": 1.7699166536331177, + "learning_rate": 1.971886333385508e-05, + "loss": 0.6431, + "step": 8490 + }, + { + "epoch": 1.3861066895228766, + "grad_norm": 1.5653668642044067, + "learning_rate": 1.9718788612832886e-05, + "loss": 0.5702, + "step": 8491 + }, + { + "epoch": 1.386269948165381, + "grad_norm": 1.8735970258712769, + "learning_rate": 1.971871388202388e-05, + "loss": 0.803, + "step": 8492 + }, + { + "epoch": 1.3864332068078853, + "grad_norm": 1.7645715475082397, + "learning_rate": 1.971863914142813e-05, + "loss": 0.6647, + "step": 8493 + }, + { + "epoch": 1.3865964654503897, + "grad_norm": 1.8286306858062744, + "learning_rate": 1.9718564391045712e-05, + "loss": 0.7621, + "step": 8494 + }, + { + "epoch": 1.3867597240928942, + "grad_norm": 1.7723857164382935, + "learning_rate": 1.9718489630876703e-05, + "loss": 0.7225, + "step": 8495 + }, + { + "epoch": 1.3869229827353986, + "grad_norm": 1.737130880355835, + "learning_rate": 1.9718414860921176e-05, + "loss": 0.7053, + "step": 8496 + }, + { + "epoch": 1.387086241377903, + "grad_norm": 2.0651021003723145, + "learning_rate": 1.971834008117921e-05, + "loss": 0.8018, + "step": 8497 + }, + { + "epoch": 1.3872495000204073, + "grad_norm": 2.4117748737335205, + "learning_rate": 1.971826529165088e-05, + "loss": 0.7719, + "step": 8498 + }, + { + "epoch": 1.3874127586629117, + "grad_norm": 1.6619681119918823, + "learning_rate": 1.9718190492336257e-05, + "loss": 0.6618, + "step": 8499 + }, + { + "epoch": 1.3875760173054161, + "grad_norm": 1.7703887224197388, + "learning_rate": 1.9718115683235418e-05, + "loss": 0.6855, + "step": 8500 + }, + { + "epoch": 1.3877392759479206, + "grad_norm": 1.65807044506073, + "learning_rate": 1.971804086434844e-05, + "loss": 0.5962, + "step": 8501 + }, + { + "epoch": 1.3879025345904248, + "grad_norm": 1.7825350761413574, + "learning_rate": 1.97179660356754e-05, + "loss": 0.6879, + "step": 8502 + }, + { + "epoch": 1.3880657932329292, + "grad_norm": 1.7502659559249878, + "learning_rate": 1.9717891197216367e-05, + "loss": 0.5989, + "step": 8503 + }, + { + "epoch": 1.3882290518754337, + "grad_norm": 1.7276153564453125, + "learning_rate": 1.971781634897142e-05, + "loss": 0.6905, + "step": 8504 + }, + { + "epoch": 1.388392310517938, + "grad_norm": 1.6576237678527832, + "learning_rate": 1.9717741490940637e-05, + "loss": 0.6274, + "step": 8505 + }, + { + "epoch": 1.3885555691604425, + "grad_norm": 1.527543306350708, + "learning_rate": 1.9717666623124087e-05, + "loss": 0.5538, + "step": 8506 + }, + { + "epoch": 1.3887188278029468, + "grad_norm": 1.7954412698745728, + "learning_rate": 1.9717591745521854e-05, + "loss": 0.6967, + "step": 8507 + }, + { + "epoch": 1.3888820864454512, + "grad_norm": 1.86507248878479, + "learning_rate": 1.9717516858134008e-05, + "loss": 0.9216, + "step": 8508 + }, + { + "epoch": 1.3890453450879556, + "grad_norm": 2.1200945377349854, + "learning_rate": 1.9717441960960624e-05, + "loss": 0.7155, + "step": 8509 + }, + { + "epoch": 1.3892086037304598, + "grad_norm": 1.9099801778793335, + "learning_rate": 1.9717367054001774e-05, + "loss": 0.8699, + "step": 8510 + }, + { + "epoch": 1.3893718623729643, + "grad_norm": 1.8651418685913086, + "learning_rate": 1.9717292137257545e-05, + "loss": 0.7518, + "step": 8511 + }, + { + "epoch": 1.3895351210154687, + "grad_norm": 1.9073890447616577, + "learning_rate": 1.9717217210728002e-05, + "loss": 0.7477, + "step": 8512 + }, + { + "epoch": 1.3896983796579732, + "grad_norm": 1.4458518028259277, + "learning_rate": 1.9717142274413223e-05, + "loss": 0.5297, + "step": 8513 + }, + { + "epoch": 1.3898616383004776, + "grad_norm": 2.2982072830200195, + "learning_rate": 1.971706732831329e-05, + "loss": 0.8076, + "step": 8514 + }, + { + "epoch": 1.390024896942982, + "grad_norm": 1.7847076654434204, + "learning_rate": 1.9716992372428267e-05, + "loss": 0.8157, + "step": 8515 + }, + { + "epoch": 1.3901881555854863, + "grad_norm": 1.7081732749938965, + "learning_rate": 1.9716917406758236e-05, + "loss": 0.657, + "step": 8516 + }, + { + "epoch": 1.3903514142279907, + "grad_norm": 1.6708954572677612, + "learning_rate": 1.9716842431303273e-05, + "loss": 0.6302, + "step": 8517 + }, + { + "epoch": 1.3905146728704951, + "grad_norm": 1.8395391702651978, + "learning_rate": 1.971676744606345e-05, + "loss": 0.6845, + "step": 8518 + }, + { + "epoch": 1.3906779315129993, + "grad_norm": 1.5903441905975342, + "learning_rate": 1.971669245103885e-05, + "loss": 0.5573, + "step": 8519 + }, + { + "epoch": 1.3908411901555038, + "grad_norm": 1.6655635833740234, + "learning_rate": 1.9716617446229537e-05, + "loss": 0.6297, + "step": 8520 + }, + { + "epoch": 1.3910044487980082, + "grad_norm": 1.5163934230804443, + "learning_rate": 1.9716542431635598e-05, + "loss": 0.6018, + "step": 8521 + }, + { + "epoch": 1.3911677074405127, + "grad_norm": 1.642490029335022, + "learning_rate": 1.97164674072571e-05, + "loss": 0.7182, + "step": 8522 + }, + { + "epoch": 1.391330966083017, + "grad_norm": 1.523086428642273, + "learning_rate": 1.971639237309412e-05, + "loss": 0.681, + "step": 8523 + }, + { + "epoch": 1.3914942247255215, + "grad_norm": 2.526933431625366, + "learning_rate": 1.971631732914674e-05, + "loss": 0.7705, + "step": 8524 + }, + { + "epoch": 1.3916574833680257, + "grad_norm": 1.7300961017608643, + "learning_rate": 1.971624227541503e-05, + "loss": 0.7853, + "step": 8525 + }, + { + "epoch": 1.3918207420105302, + "grad_norm": 1.736971139907837, + "learning_rate": 1.9716167211899067e-05, + "loss": 0.6571, + "step": 8526 + }, + { + "epoch": 1.3919840006530346, + "grad_norm": 1.771987795829773, + "learning_rate": 1.9716092138598924e-05, + "loss": 0.6783, + "step": 8527 + }, + { + "epoch": 1.3921472592955388, + "grad_norm": 1.593761920928955, + "learning_rate": 1.971601705551468e-05, + "loss": 0.7499, + "step": 8528 + }, + { + "epoch": 1.3923105179380433, + "grad_norm": 1.7637159824371338, + "learning_rate": 1.971594196264641e-05, + "loss": 0.7221, + "step": 8529 + }, + { + "epoch": 1.3924737765805477, + "grad_norm": 1.7757338285446167, + "learning_rate": 1.9715866859994187e-05, + "loss": 0.7513, + "step": 8530 + }, + { + "epoch": 1.3926370352230522, + "grad_norm": 1.885926365852356, + "learning_rate": 1.9715791747558093e-05, + "loss": 0.7363, + "step": 8531 + }, + { + "epoch": 1.3928002938655566, + "grad_norm": 1.9472426176071167, + "learning_rate": 1.9715716625338196e-05, + "loss": 0.7103, + "step": 8532 + }, + { + "epoch": 1.392963552508061, + "grad_norm": 1.7686352729797363, + "learning_rate": 1.9715641493334574e-05, + "loss": 0.6079, + "step": 8533 + }, + { + "epoch": 1.3931268111505652, + "grad_norm": 1.8642363548278809, + "learning_rate": 1.9715566351547305e-05, + "loss": 0.8424, + "step": 8534 + }, + { + "epoch": 1.3932900697930697, + "grad_norm": 1.9722398519515991, + "learning_rate": 1.9715491199976462e-05, + "loss": 0.8831, + "step": 8535 + }, + { + "epoch": 1.3934533284355741, + "grad_norm": 1.8228189945220947, + "learning_rate": 1.9715416038622125e-05, + "loss": 0.5906, + "step": 8536 + }, + { + "epoch": 1.3936165870780783, + "grad_norm": 1.7515661716461182, + "learning_rate": 1.971534086748436e-05, + "loss": 0.6965, + "step": 8537 + }, + { + "epoch": 1.3937798457205828, + "grad_norm": 1.684076189994812, + "learning_rate": 1.971526568656326e-05, + "loss": 0.6883, + "step": 8538 + }, + { + "epoch": 1.3939431043630872, + "grad_norm": 1.8726963996887207, + "learning_rate": 1.971519049585888e-05, + "loss": 0.757, + "step": 8539 + }, + { + "epoch": 1.3941063630055917, + "grad_norm": 1.4726639986038208, + "learning_rate": 1.9715115295371313e-05, + "loss": 0.6563, + "step": 8540 + }, + { + "epoch": 1.394269621648096, + "grad_norm": 1.849305510520935, + "learning_rate": 1.9715040085100624e-05, + "loss": 0.6945, + "step": 8541 + }, + { + "epoch": 1.3944328802906003, + "grad_norm": 1.706517219543457, + "learning_rate": 1.9714964865046893e-05, + "loss": 0.6534, + "step": 8542 + }, + { + "epoch": 1.3945961389331047, + "grad_norm": 1.6469526290893555, + "learning_rate": 1.9714889635210193e-05, + "loss": 0.6445, + "step": 8543 + }, + { + "epoch": 1.3947593975756092, + "grad_norm": 1.6730015277862549, + "learning_rate": 1.97148143955906e-05, + "loss": 0.6667, + "step": 8544 + }, + { + "epoch": 1.3949226562181136, + "grad_norm": 2.0204241275787354, + "learning_rate": 1.9714739146188196e-05, + "loss": 0.7039, + "step": 8545 + }, + { + "epoch": 1.3950859148606178, + "grad_norm": 1.676442265510559, + "learning_rate": 1.9714663887003055e-05, + "loss": 0.6555, + "step": 8546 + }, + { + "epoch": 1.3952491735031223, + "grad_norm": 1.7082538604736328, + "learning_rate": 1.9714588618035245e-05, + "loss": 0.6539, + "step": 8547 + }, + { + "epoch": 1.3954124321456267, + "grad_norm": 2.2412357330322266, + "learning_rate": 1.9714513339284845e-05, + "loss": 0.8125, + "step": 8548 + }, + { + "epoch": 1.3955756907881312, + "grad_norm": 1.7997382879257202, + "learning_rate": 1.9714438050751937e-05, + "loss": 0.5491, + "step": 8549 + }, + { + "epoch": 1.3957389494306356, + "grad_norm": 1.772769570350647, + "learning_rate": 1.971436275243659e-05, + "loss": 0.6555, + "step": 8550 + }, + { + "epoch": 1.3959022080731398, + "grad_norm": 1.7535885572433472, + "learning_rate": 1.9714287444338884e-05, + "loss": 0.7208, + "step": 8551 + }, + { + "epoch": 1.3960654667156442, + "grad_norm": 2.0372729301452637, + "learning_rate": 1.9714212126458893e-05, + "loss": 0.6892, + "step": 8552 + }, + { + "epoch": 1.3962287253581487, + "grad_norm": 1.8749943971633911, + "learning_rate": 1.971413679879669e-05, + "loss": 0.7605, + "step": 8553 + }, + { + "epoch": 1.396391984000653, + "grad_norm": 1.5726737976074219, + "learning_rate": 1.971406146135236e-05, + "loss": 0.6779, + "step": 8554 + }, + { + "epoch": 1.3965552426431573, + "grad_norm": 1.4064544439315796, + "learning_rate": 1.9713986114125966e-05, + "loss": 0.5512, + "step": 8555 + }, + { + "epoch": 1.3967185012856618, + "grad_norm": 1.8914309740066528, + "learning_rate": 1.9713910757117593e-05, + "loss": 0.7273, + "step": 8556 + }, + { + "epoch": 1.3968817599281662, + "grad_norm": 1.82962167263031, + "learning_rate": 1.9713835390327317e-05, + "loss": 0.7273, + "step": 8557 + }, + { + "epoch": 1.3970450185706706, + "grad_norm": 1.9358083009719849, + "learning_rate": 1.9713760013755207e-05, + "loss": 0.6718, + "step": 8558 + }, + { + "epoch": 1.397208277213175, + "grad_norm": 1.5062962770462036, + "learning_rate": 1.9713684627401346e-05, + "loss": 0.5279, + "step": 8559 + }, + { + "epoch": 1.3973715358556793, + "grad_norm": 2.1053473949432373, + "learning_rate": 1.9713609231265807e-05, + "loss": 0.9349, + "step": 8560 + }, + { + "epoch": 1.3975347944981837, + "grad_norm": 1.8815748691558838, + "learning_rate": 1.9713533825348664e-05, + "loss": 0.7169, + "step": 8561 + }, + { + "epoch": 1.3976980531406882, + "grad_norm": 2.067535877227783, + "learning_rate": 1.971345840965e-05, + "loss": 0.7811, + "step": 8562 + }, + { + "epoch": 1.3978613117831924, + "grad_norm": 1.6335721015930176, + "learning_rate": 1.971338298416988e-05, + "loss": 0.7191, + "step": 8563 + }, + { + "epoch": 1.3980245704256968, + "grad_norm": 1.6634080410003662, + "learning_rate": 1.971330754890839e-05, + "loss": 0.6422, + "step": 8564 + }, + { + "epoch": 1.3981878290682013, + "grad_norm": 1.6815085411071777, + "learning_rate": 1.97132321038656e-05, + "loss": 0.7019, + "step": 8565 + }, + { + "epoch": 1.3983510877107057, + "grad_norm": 1.504040002822876, + "learning_rate": 1.9713156649041587e-05, + "loss": 0.5265, + "step": 8566 + }, + { + "epoch": 1.3985143463532101, + "grad_norm": 1.8774583339691162, + "learning_rate": 1.971308118443643e-05, + "loss": 0.7659, + "step": 8567 + }, + { + "epoch": 1.3986776049957146, + "grad_norm": 1.7899527549743652, + "learning_rate": 1.9713005710050203e-05, + "loss": 0.7163, + "step": 8568 + }, + { + "epoch": 1.3988408636382188, + "grad_norm": 1.8520280122756958, + "learning_rate": 1.9712930225882976e-05, + "loss": 0.6883, + "step": 8569 + }, + { + "epoch": 1.3990041222807232, + "grad_norm": 1.8585915565490723, + "learning_rate": 1.9712854731934837e-05, + "loss": 0.7027, + "step": 8570 + }, + { + "epoch": 1.3991673809232277, + "grad_norm": 1.9673842191696167, + "learning_rate": 1.9712779228205856e-05, + "loss": 0.8631, + "step": 8571 + }, + { + "epoch": 1.399330639565732, + "grad_norm": 1.672910213470459, + "learning_rate": 1.9712703714696108e-05, + "loss": 0.7067, + "step": 8572 + }, + { + "epoch": 1.3994938982082363, + "grad_norm": 1.9864866733551025, + "learning_rate": 1.971262819140567e-05, + "loss": 0.9096, + "step": 8573 + }, + { + "epoch": 1.3996571568507408, + "grad_norm": 1.9118305444717407, + "learning_rate": 1.9712552658334617e-05, + "loss": 0.7571, + "step": 8574 + }, + { + "epoch": 1.3998204154932452, + "grad_norm": 1.7030729055404663, + "learning_rate": 1.9712477115483027e-05, + "loss": 0.6483, + "step": 8575 + }, + { + "epoch": 1.3999836741357496, + "grad_norm": 1.8887096643447876, + "learning_rate": 1.9712401562850975e-05, + "loss": 0.9116, + "step": 8576 + }, + { + "epoch": 1.400146932778254, + "grad_norm": 1.522375464439392, + "learning_rate": 1.9712326000438535e-05, + "loss": 0.6523, + "step": 8577 + }, + { + "epoch": 1.4003101914207583, + "grad_norm": 1.520180583000183, + "learning_rate": 1.9712250428245788e-05, + "loss": 0.6304, + "step": 8578 + }, + { + "epoch": 1.4004734500632627, + "grad_norm": 1.7145904302597046, + "learning_rate": 1.9712174846272806e-05, + "loss": 0.6621, + "step": 8579 + }, + { + "epoch": 1.4006367087057672, + "grad_norm": 1.7909493446350098, + "learning_rate": 1.9712099254519666e-05, + "loss": 0.6608, + "step": 8580 + }, + { + "epoch": 1.4007999673482714, + "grad_norm": 1.867161750793457, + "learning_rate": 1.9712023652986444e-05, + "loss": 0.9778, + "step": 8581 + }, + { + "epoch": 1.4009632259907758, + "grad_norm": 1.8501540422439575, + "learning_rate": 1.971194804167322e-05, + "loss": 0.7388, + "step": 8582 + }, + { + "epoch": 1.4011264846332803, + "grad_norm": 1.9510643482208252, + "learning_rate": 1.971187242058006e-05, + "loss": 0.5734, + "step": 8583 + }, + { + "epoch": 1.4012897432757847, + "grad_norm": 1.752379059791565, + "learning_rate": 1.9711796789707057e-05, + "loss": 0.6218, + "step": 8584 + }, + { + "epoch": 1.4014530019182891, + "grad_norm": 2.176736354827881, + "learning_rate": 1.9711721149054272e-05, + "loss": 0.7749, + "step": 8585 + }, + { + "epoch": 1.4016162605607934, + "grad_norm": 1.680879831314087, + "learning_rate": 1.9711645498621787e-05, + "loss": 0.7899, + "step": 8586 + }, + { + "epoch": 1.4017795192032978, + "grad_norm": 1.8759444952011108, + "learning_rate": 1.9711569838409675e-05, + "loss": 0.8015, + "step": 8587 + }, + { + "epoch": 1.4019427778458022, + "grad_norm": 1.7387847900390625, + "learning_rate": 1.9711494168418017e-05, + "loss": 0.6439, + "step": 8588 + }, + { + "epoch": 1.4021060364883067, + "grad_norm": 2.000535011291504, + "learning_rate": 1.971141848864689e-05, + "loss": 0.8285, + "step": 8589 + }, + { + "epoch": 1.4022692951308109, + "grad_norm": 1.91960871219635, + "learning_rate": 1.971134279909636e-05, + "loss": 0.6128, + "step": 8590 + }, + { + "epoch": 1.4024325537733153, + "grad_norm": 1.8916155099868774, + "learning_rate": 1.9711267099766517e-05, + "loss": 0.7893, + "step": 8591 + }, + { + "epoch": 1.4025958124158198, + "grad_norm": 1.5142968893051147, + "learning_rate": 1.971119139065743e-05, + "loss": 0.639, + "step": 8592 + }, + { + "epoch": 1.4027590710583242, + "grad_norm": 2.034125566482544, + "learning_rate": 1.9711115671769172e-05, + "loss": 0.7668, + "step": 8593 + }, + { + "epoch": 1.4029223297008286, + "grad_norm": 1.9606164693832397, + "learning_rate": 1.9711039943101826e-05, + "loss": 0.8288, + "step": 8594 + }, + { + "epoch": 1.4030855883433329, + "grad_norm": 1.683655023574829, + "learning_rate": 1.9710964204655462e-05, + "loss": 0.7308, + "step": 8595 + }, + { + "epoch": 1.4032488469858373, + "grad_norm": 2.032135248184204, + "learning_rate": 1.9710888456430163e-05, + "loss": 0.7335, + "step": 8596 + }, + { + "epoch": 1.4034121056283417, + "grad_norm": 1.6199026107788086, + "learning_rate": 1.9710812698426e-05, + "loss": 0.5885, + "step": 8597 + }, + { + "epoch": 1.4035753642708462, + "grad_norm": 1.8474699258804321, + "learning_rate": 1.9710736930643054e-05, + "loss": 0.7112, + "step": 8598 + }, + { + "epoch": 1.4037386229133504, + "grad_norm": 1.4029401540756226, + "learning_rate": 1.9710661153081396e-05, + "loss": 0.5002, + "step": 8599 + }, + { + "epoch": 1.4039018815558548, + "grad_norm": 1.5530167818069458, + "learning_rate": 1.9710585365741105e-05, + "loss": 0.6494, + "step": 8600 + }, + { + "epoch": 1.4040651401983593, + "grad_norm": 1.735854983329773, + "learning_rate": 1.971050956862226e-05, + "loss": 0.648, + "step": 8601 + }, + { + "epoch": 1.4042283988408637, + "grad_norm": 1.6918120384216309, + "learning_rate": 1.971043376172493e-05, + "loss": 0.6145, + "step": 8602 + }, + { + "epoch": 1.4043916574833681, + "grad_norm": 1.797217607498169, + "learning_rate": 1.97103579450492e-05, + "loss": 0.5966, + "step": 8603 + }, + { + "epoch": 1.4045549161258724, + "grad_norm": 1.838832974433899, + "learning_rate": 1.9710282118595137e-05, + "loss": 0.6548, + "step": 8604 + }, + { + "epoch": 1.4047181747683768, + "grad_norm": 1.9912137985229492, + "learning_rate": 1.9710206282362823e-05, + "loss": 0.719, + "step": 8605 + }, + { + "epoch": 1.4048814334108812, + "grad_norm": 1.36310613155365, + "learning_rate": 1.9710130436352338e-05, + "loss": 0.6004, + "step": 8606 + }, + { + "epoch": 1.4050446920533854, + "grad_norm": 1.7034072875976562, + "learning_rate": 1.971005458056375e-05, + "loss": 0.609, + "step": 8607 + }, + { + "epoch": 1.4052079506958899, + "grad_norm": 2.064579963684082, + "learning_rate": 1.9709978714997146e-05, + "loss": 0.7673, + "step": 8608 + }, + { + "epoch": 1.4053712093383943, + "grad_norm": 2.082993507385254, + "learning_rate": 1.970990283965259e-05, + "loss": 1.4007, + "step": 8609 + }, + { + "epoch": 1.4055344679808988, + "grad_norm": 1.8789527416229248, + "learning_rate": 1.970982695453017e-05, + "loss": 0.6507, + "step": 8610 + }, + { + "epoch": 1.4056977266234032, + "grad_norm": 1.537828803062439, + "learning_rate": 1.9709751059629952e-05, + "loss": 0.639, + "step": 8611 + }, + { + "epoch": 1.4058609852659076, + "grad_norm": 1.7417701482772827, + "learning_rate": 1.9709675154952017e-05, + "loss": 0.6875, + "step": 8612 + }, + { + "epoch": 1.4060242439084119, + "grad_norm": 1.5917898416519165, + "learning_rate": 1.970959924049644e-05, + "loss": 0.5525, + "step": 8613 + }, + { + "epoch": 1.4061875025509163, + "grad_norm": 1.656715989112854, + "learning_rate": 1.9709523316263306e-05, + "loss": 0.6078, + "step": 8614 + }, + { + "epoch": 1.4063507611934207, + "grad_norm": 1.5196971893310547, + "learning_rate": 1.970944738225268e-05, + "loss": 0.6065, + "step": 8615 + }, + { + "epoch": 1.406514019835925, + "grad_norm": 1.885778546333313, + "learning_rate": 1.9709371438464646e-05, + "loss": 0.8296, + "step": 8616 + }, + { + "epoch": 1.4066772784784294, + "grad_norm": 1.5454318523406982, + "learning_rate": 1.9709295484899275e-05, + "loss": 0.6074, + "step": 8617 + }, + { + "epoch": 1.4068405371209338, + "grad_norm": 1.496180772781372, + "learning_rate": 1.9709219521556647e-05, + "loss": 0.5919, + "step": 8618 + }, + { + "epoch": 1.4070037957634383, + "grad_norm": 1.583449363708496, + "learning_rate": 1.9709143548436837e-05, + "loss": 0.6574, + "step": 8619 + }, + { + "epoch": 1.4071670544059427, + "grad_norm": 2.218541383743286, + "learning_rate": 1.970906756553992e-05, + "loss": 0.6347, + "step": 8620 + }, + { + "epoch": 1.4073303130484471, + "grad_norm": 1.5744132995605469, + "learning_rate": 1.970899157286598e-05, + "loss": 0.6046, + "step": 8621 + }, + { + "epoch": 1.4074935716909514, + "grad_norm": 1.9045554399490356, + "learning_rate": 1.970891557041508e-05, + "loss": 0.6964, + "step": 8622 + }, + { + "epoch": 1.4076568303334558, + "grad_norm": 1.8541924953460693, + "learning_rate": 1.9708839558187313e-05, + "loss": 0.7256, + "step": 8623 + }, + { + "epoch": 1.4078200889759602, + "grad_norm": 1.7579020261764526, + "learning_rate": 1.9708763536182744e-05, + "loss": 0.6599, + "step": 8624 + }, + { + "epoch": 1.4079833476184644, + "grad_norm": 1.97645902633667, + "learning_rate": 1.970868750440145e-05, + "loss": 0.7083, + "step": 8625 + }, + { + "epoch": 1.4081466062609689, + "grad_norm": 1.5480568408966064, + "learning_rate": 1.9708611462843512e-05, + "loss": 0.5439, + "step": 8626 + }, + { + "epoch": 1.4083098649034733, + "grad_norm": 1.7147431373596191, + "learning_rate": 1.9708535411509008e-05, + "loss": 0.7496, + "step": 8627 + }, + { + "epoch": 1.4084731235459778, + "grad_norm": 1.6770806312561035, + "learning_rate": 1.9708459350398005e-05, + "loss": 0.6306, + "step": 8628 + }, + { + "epoch": 1.4086363821884822, + "grad_norm": 1.5218342542648315, + "learning_rate": 1.970838327951059e-05, + "loss": 0.6411, + "step": 8629 + }, + { + "epoch": 1.4087996408309864, + "grad_norm": 1.7869845628738403, + "learning_rate": 1.9708307198846837e-05, + "loss": 0.6951, + "step": 8630 + }, + { + "epoch": 1.4089628994734908, + "grad_norm": 1.572424292564392, + "learning_rate": 1.9708231108406817e-05, + "loss": 0.6699, + "step": 8631 + }, + { + "epoch": 1.4091261581159953, + "grad_norm": 1.7874997854232788, + "learning_rate": 1.9708155008190614e-05, + "loss": 0.6934, + "step": 8632 + }, + { + "epoch": 1.4092894167584997, + "grad_norm": 1.6763380765914917, + "learning_rate": 1.9708078898198303e-05, + "loss": 0.5726, + "step": 8633 + }, + { + "epoch": 1.409452675401004, + "grad_norm": 1.6334975957870483, + "learning_rate": 1.9708002778429957e-05, + "loss": 0.5428, + "step": 8634 + }, + { + "epoch": 1.4096159340435084, + "grad_norm": 2.127756357192993, + "learning_rate": 1.9707926648885655e-05, + "loss": 0.7091, + "step": 8635 + }, + { + "epoch": 1.4097791926860128, + "grad_norm": 1.5964088439941406, + "learning_rate": 1.970785050956547e-05, + "loss": 0.5284, + "step": 8636 + }, + { + "epoch": 1.4099424513285173, + "grad_norm": 1.9498860836029053, + "learning_rate": 1.9707774360469487e-05, + "loss": 0.7433, + "step": 8637 + }, + { + "epoch": 1.4101057099710217, + "grad_norm": 1.544713020324707, + "learning_rate": 1.9707698201597777e-05, + "loss": 0.6993, + "step": 8638 + }, + { + "epoch": 1.410268968613526, + "grad_norm": 1.9762260913848877, + "learning_rate": 1.9707622032950416e-05, + "loss": 0.6025, + "step": 8639 + }, + { + "epoch": 1.4104322272560303, + "grad_norm": 1.6134401559829712, + "learning_rate": 1.970754585452748e-05, + "loss": 0.6924, + "step": 8640 + }, + { + "epoch": 1.4105954858985348, + "grad_norm": 1.6906917095184326, + "learning_rate": 1.970746966632905e-05, + "loss": 0.5938, + "step": 8641 + }, + { + "epoch": 1.4107587445410392, + "grad_norm": 2.017824172973633, + "learning_rate": 1.9707393468355204e-05, + "loss": 0.7295, + "step": 8642 + }, + { + "epoch": 1.4109220031835434, + "grad_norm": 1.6652110815048218, + "learning_rate": 1.9707317260606014e-05, + "loss": 0.5972, + "step": 8643 + }, + { + "epoch": 1.4110852618260479, + "grad_norm": 1.6441612243652344, + "learning_rate": 1.9707241043081555e-05, + "loss": 0.6796, + "step": 8644 + }, + { + "epoch": 1.4112485204685523, + "grad_norm": 1.569763422012329, + "learning_rate": 1.970716481578191e-05, + "loss": 0.5633, + "step": 8645 + }, + { + "epoch": 1.4114117791110568, + "grad_norm": 1.7102195024490356, + "learning_rate": 1.9707088578707154e-05, + "loss": 0.6562, + "step": 8646 + }, + { + "epoch": 1.4115750377535612, + "grad_norm": 1.9142568111419678, + "learning_rate": 1.9707012331857357e-05, + "loss": 0.7614, + "step": 8647 + }, + { + "epoch": 1.4117382963960654, + "grad_norm": 2.4415085315704346, + "learning_rate": 1.97069360752326e-05, + "loss": 0.7942, + "step": 8648 + }, + { + "epoch": 1.4119015550385698, + "grad_norm": 1.930181860923767, + "learning_rate": 1.9706859808832968e-05, + "loss": 0.7382, + "step": 8649 + }, + { + "epoch": 1.4120648136810743, + "grad_norm": 1.903906226158142, + "learning_rate": 1.9706783532658528e-05, + "loss": 0.724, + "step": 8650 + }, + { + "epoch": 1.4122280723235785, + "grad_norm": 1.726954698562622, + "learning_rate": 1.970670724670936e-05, + "loss": 0.7078, + "step": 8651 + }, + { + "epoch": 1.412391330966083, + "grad_norm": 1.8145008087158203, + "learning_rate": 1.9706630950985537e-05, + "loss": 0.7004, + "step": 8652 + }, + { + "epoch": 1.4125545896085874, + "grad_norm": 2.449061632156372, + "learning_rate": 1.9706554645487142e-05, + "loss": 0.7962, + "step": 8653 + }, + { + "epoch": 1.4127178482510918, + "grad_norm": 2.031891345977783, + "learning_rate": 1.9706478330214248e-05, + "loss": 0.8449, + "step": 8654 + }, + { + "epoch": 1.4128811068935963, + "grad_norm": 1.7556610107421875, + "learning_rate": 1.9706402005166935e-05, + "loss": 0.6719, + "step": 8655 + }, + { + "epoch": 1.4130443655361007, + "grad_norm": 1.8780179023742676, + "learning_rate": 1.9706325670345276e-05, + "loss": 0.7667, + "step": 8656 + }, + { + "epoch": 1.413207624178605, + "grad_norm": 1.9974851608276367, + "learning_rate": 1.9706249325749348e-05, + "loss": 1.1717, + "step": 8657 + }, + { + "epoch": 1.4133708828211093, + "grad_norm": 1.8004084825515747, + "learning_rate": 1.970617297137923e-05, + "loss": 0.6947, + "step": 8658 + }, + { + "epoch": 1.4135341414636138, + "grad_norm": 1.635007381439209, + "learning_rate": 1.9706096607235003e-05, + "loss": 0.7377, + "step": 8659 + }, + { + "epoch": 1.413697400106118, + "grad_norm": 1.5896035432815552, + "learning_rate": 1.9706020233316735e-05, + "loss": 0.6808, + "step": 8660 + }, + { + "epoch": 1.4138606587486224, + "grad_norm": 1.8063644170761108, + "learning_rate": 1.9705943849624507e-05, + "loss": 0.8254, + "step": 8661 + }, + { + "epoch": 1.4140239173911269, + "grad_norm": 1.8654104471206665, + "learning_rate": 1.97058674561584e-05, + "loss": 0.9091, + "step": 8662 + }, + { + "epoch": 1.4141871760336313, + "grad_norm": 2.1827971935272217, + "learning_rate": 1.970579105291848e-05, + "loss": 0.8624, + "step": 8663 + }, + { + "epoch": 1.4143504346761357, + "grad_norm": 1.82045316696167, + "learning_rate": 1.9705714639904838e-05, + "loss": 0.8159, + "step": 8664 + }, + { + "epoch": 1.4145136933186402, + "grad_norm": 2.075427532196045, + "learning_rate": 1.9705638217117543e-05, + "loss": 0.7499, + "step": 8665 + }, + { + "epoch": 1.4146769519611444, + "grad_norm": 1.4834234714508057, + "learning_rate": 1.970556178455667e-05, + "loss": 0.6925, + "step": 8666 + }, + { + "epoch": 1.4148402106036488, + "grad_norm": 1.8697466850280762, + "learning_rate": 1.9705485342222302e-05, + "loss": 0.4816, + "step": 8667 + }, + { + "epoch": 1.4150034692461533, + "grad_norm": 1.6457347869873047, + "learning_rate": 1.970540889011451e-05, + "loss": 0.7459, + "step": 8668 + }, + { + "epoch": 1.4151667278886575, + "grad_norm": 1.6625922918319702, + "learning_rate": 1.9705332428233373e-05, + "loss": 0.6969, + "step": 8669 + }, + { + "epoch": 1.415329986531162, + "grad_norm": 1.704240322113037, + "learning_rate": 1.9705255956578972e-05, + "loss": 0.7391, + "step": 8670 + }, + { + "epoch": 1.4154932451736664, + "grad_norm": 1.9477009773254395, + "learning_rate": 1.9705179475151377e-05, + "loss": 0.705, + "step": 8671 + }, + { + "epoch": 1.4156565038161708, + "grad_norm": 1.6242307424545288, + "learning_rate": 1.9705102983950674e-05, + "loss": 0.6855, + "step": 8672 + }, + { + "epoch": 1.4158197624586752, + "grad_norm": 1.8269885778427124, + "learning_rate": 1.9705026482976933e-05, + "loss": 0.7559, + "step": 8673 + }, + { + "epoch": 1.4159830211011795, + "grad_norm": 1.6294444799423218, + "learning_rate": 1.970494997223023e-05, + "loss": 0.645, + "step": 8674 + }, + { + "epoch": 1.416146279743684, + "grad_norm": 1.6249995231628418, + "learning_rate": 1.9704873451710647e-05, + "loss": 0.6799, + "step": 8675 + }, + { + "epoch": 1.4163095383861883, + "grad_norm": 1.5880489349365234, + "learning_rate": 1.970479692141826e-05, + "loss": 0.6422, + "step": 8676 + }, + { + "epoch": 1.4164727970286928, + "grad_norm": 1.3066085577011108, + "learning_rate": 1.970472038135314e-05, + "loss": 0.5293, + "step": 8677 + }, + { + "epoch": 1.416636055671197, + "grad_norm": 1.8472591638565063, + "learning_rate": 1.9704643831515377e-05, + "loss": 0.7655, + "step": 8678 + }, + { + "epoch": 1.4167993143137014, + "grad_norm": 1.6922271251678467, + "learning_rate": 1.9704567271905034e-05, + "loss": 0.6675, + "step": 8679 + }, + { + "epoch": 1.4169625729562059, + "grad_norm": 1.7736334800720215, + "learning_rate": 1.9704490702522198e-05, + "loss": 0.6286, + "step": 8680 + }, + { + "epoch": 1.4171258315987103, + "grad_norm": 1.7819796800613403, + "learning_rate": 1.970441412336694e-05, + "loss": 0.6429, + "step": 8681 + }, + { + "epoch": 1.4172890902412147, + "grad_norm": 1.796210765838623, + "learning_rate": 1.9704337534439343e-05, + "loss": 0.74, + "step": 8682 + }, + { + "epoch": 1.417452348883719, + "grad_norm": 1.7743102312088013, + "learning_rate": 1.9704260935739477e-05, + "loss": 0.5991, + "step": 8683 + }, + { + "epoch": 1.4176156075262234, + "grad_norm": 1.7765942811965942, + "learning_rate": 1.9704184327267425e-05, + "loss": 0.7021, + "step": 8684 + }, + { + "epoch": 1.4177788661687278, + "grad_norm": 2.0916383266448975, + "learning_rate": 1.970410770902326e-05, + "loss": 0.9195, + "step": 8685 + }, + { + "epoch": 1.4179421248112323, + "grad_norm": 1.8041877746582031, + "learning_rate": 1.970403108100706e-05, + "loss": 0.7846, + "step": 8686 + }, + { + "epoch": 1.4181053834537365, + "grad_norm": 1.8553102016448975, + "learning_rate": 1.9703954443218908e-05, + "loss": 0.7172, + "step": 8687 + }, + { + "epoch": 1.418268642096241, + "grad_norm": 1.6210575103759766, + "learning_rate": 1.9703877795658874e-05, + "loss": 0.6126, + "step": 8688 + }, + { + "epoch": 1.4184319007387454, + "grad_norm": 1.5161592960357666, + "learning_rate": 1.970380113832704e-05, + "loss": 0.6379, + "step": 8689 + }, + { + "epoch": 1.4185951593812498, + "grad_norm": 1.5917373895645142, + "learning_rate": 1.9703724471223475e-05, + "loss": 0.6061, + "step": 8690 + }, + { + "epoch": 1.4187584180237542, + "grad_norm": 1.9403901100158691, + "learning_rate": 1.9703647794348268e-05, + "loss": 0.7945, + "step": 8691 + }, + { + "epoch": 1.4189216766662585, + "grad_norm": 2.016359329223633, + "learning_rate": 1.9703571107701486e-05, + "loss": 0.8547, + "step": 8692 + }, + { + "epoch": 1.419084935308763, + "grad_norm": 1.7833921909332275, + "learning_rate": 1.9703494411283213e-05, + "loss": 0.6531, + "step": 8693 + }, + { + "epoch": 1.4192481939512673, + "grad_norm": 1.3497540950775146, + "learning_rate": 1.9703417705093524e-05, + "loss": 0.5188, + "step": 8694 + }, + { + "epoch": 1.4194114525937715, + "grad_norm": 1.8821314573287964, + "learning_rate": 1.9703340989132493e-05, + "loss": 0.6633, + "step": 8695 + }, + { + "epoch": 1.419574711236276, + "grad_norm": 2.2048559188842773, + "learning_rate": 1.9703264263400202e-05, + "loss": 0.5689, + "step": 8696 + }, + { + "epoch": 1.4197379698787804, + "grad_norm": 1.8296291828155518, + "learning_rate": 1.970318752789673e-05, + "loss": 0.7196, + "step": 8697 + }, + { + "epoch": 1.4199012285212849, + "grad_norm": 1.7738217115402222, + "learning_rate": 1.9703110782622145e-05, + "loss": 0.7041, + "step": 8698 + }, + { + "epoch": 1.4200644871637893, + "grad_norm": 1.960978388786316, + "learning_rate": 1.970303402757653e-05, + "loss": 0.5878, + "step": 8699 + }, + { + "epoch": 1.4202277458062937, + "grad_norm": 1.8107755184173584, + "learning_rate": 1.9702957262759964e-05, + "loss": 0.6232, + "step": 8700 + }, + { + "epoch": 1.420391004448798, + "grad_norm": 1.9930211305618286, + "learning_rate": 1.9702880488172527e-05, + "loss": 0.8347, + "step": 8701 + }, + { + "epoch": 1.4205542630913024, + "grad_norm": 1.8488643169403076, + "learning_rate": 1.9702803703814288e-05, + "loss": 0.7217, + "step": 8702 + }, + { + "epoch": 1.4207175217338068, + "grad_norm": 1.8284389972686768, + "learning_rate": 1.970272690968533e-05, + "loss": 0.592, + "step": 8703 + }, + { + "epoch": 1.420880780376311, + "grad_norm": 1.5800256729125977, + "learning_rate": 1.9702650105785725e-05, + "loss": 0.6384, + "step": 8704 + }, + { + "epoch": 1.4210440390188155, + "grad_norm": 1.4261492490768433, + "learning_rate": 1.9702573292115554e-05, + "loss": 0.5645, + "step": 8705 + }, + { + "epoch": 1.42120729766132, + "grad_norm": 1.4399464130401611, + "learning_rate": 1.9702496468674896e-05, + "loss": 0.5822, + "step": 8706 + }, + { + "epoch": 1.4213705563038244, + "grad_norm": 1.874807357788086, + "learning_rate": 1.9702419635463827e-05, + "loss": 0.6434, + "step": 8707 + }, + { + "epoch": 1.4215338149463288, + "grad_norm": 1.5424790382385254, + "learning_rate": 1.9702342792482424e-05, + "loss": 0.5829, + "step": 8708 + }, + { + "epoch": 1.4216970735888332, + "grad_norm": 1.8187928199768066, + "learning_rate": 1.9702265939730766e-05, + "loss": 0.777, + "step": 8709 + }, + { + "epoch": 1.4218603322313375, + "grad_norm": 2.059610366821289, + "learning_rate": 1.9702189077208922e-05, + "loss": 0.711, + "step": 8710 + }, + { + "epoch": 1.422023590873842, + "grad_norm": 1.433377981185913, + "learning_rate": 1.9702112204916984e-05, + "loss": 0.6031, + "step": 8711 + }, + { + "epoch": 1.4221868495163463, + "grad_norm": 1.5410363674163818, + "learning_rate": 1.970203532285502e-05, + "loss": 0.6234, + "step": 8712 + }, + { + "epoch": 1.4223501081588505, + "grad_norm": 1.857173204421997, + "learning_rate": 1.9701958431023107e-05, + "loss": 0.7091, + "step": 8713 + }, + { + "epoch": 1.422513366801355, + "grad_norm": 1.596121072769165, + "learning_rate": 1.9701881529421325e-05, + "loss": 0.575, + "step": 8714 + }, + { + "epoch": 1.4226766254438594, + "grad_norm": 2.114319086074829, + "learning_rate": 1.9701804618049753e-05, + "loss": 0.8154, + "step": 8715 + }, + { + "epoch": 1.4228398840863639, + "grad_norm": 1.5954052209854126, + "learning_rate": 1.9701727696908462e-05, + "loss": 0.7106, + "step": 8716 + }, + { + "epoch": 1.4230031427288683, + "grad_norm": 1.8025619983673096, + "learning_rate": 1.9701650765997537e-05, + "loss": 0.7304, + "step": 8717 + }, + { + "epoch": 1.4231664013713727, + "grad_norm": 1.7387498617172241, + "learning_rate": 1.970157382531705e-05, + "loss": 0.6477, + "step": 8718 + }, + { + "epoch": 1.423329660013877, + "grad_norm": 1.6344341039657593, + "learning_rate": 1.9701496874867084e-05, + "loss": 0.6726, + "step": 8719 + }, + { + "epoch": 1.4234929186563814, + "grad_norm": 1.7721304893493652, + "learning_rate": 1.970141991464771e-05, + "loss": 0.6554, + "step": 8720 + }, + { + "epoch": 1.4236561772988858, + "grad_norm": 1.947679042816162, + "learning_rate": 1.970134294465901e-05, + "loss": 0.6028, + "step": 8721 + }, + { + "epoch": 1.42381943594139, + "grad_norm": 1.831961989402771, + "learning_rate": 1.970126596490106e-05, + "loss": 0.6697, + "step": 8722 + }, + { + "epoch": 1.4239826945838945, + "grad_norm": 2.8235700130462646, + "learning_rate": 1.9701188975373937e-05, + "loss": 0.7072, + "step": 8723 + }, + { + "epoch": 1.424145953226399, + "grad_norm": 1.8825876712799072, + "learning_rate": 1.9701111976077722e-05, + "loss": 0.7553, + "step": 8724 + }, + { + "epoch": 1.4243092118689034, + "grad_norm": 1.498405933380127, + "learning_rate": 1.9701034967012487e-05, + "loss": 0.5215, + "step": 8725 + }, + { + "epoch": 1.4244724705114078, + "grad_norm": 2.080427646636963, + "learning_rate": 1.9700957948178313e-05, + "loss": 0.7098, + "step": 8726 + }, + { + "epoch": 1.424635729153912, + "grad_norm": 1.6905525922775269, + "learning_rate": 1.970088091957528e-05, + "loss": 0.6493, + "step": 8727 + }, + { + "epoch": 1.4247989877964164, + "grad_norm": 1.7911847829818726, + "learning_rate": 1.9700803881203457e-05, + "loss": 0.7161, + "step": 8728 + }, + { + "epoch": 1.4249622464389209, + "grad_norm": 1.8361754417419434, + "learning_rate": 1.970072683306293e-05, + "loss": 0.622, + "step": 8729 + }, + { + "epoch": 1.4251255050814253, + "grad_norm": 1.5975465774536133, + "learning_rate": 1.9700649775153775e-05, + "loss": 0.6475, + "step": 8730 + }, + { + "epoch": 1.4252887637239295, + "grad_norm": 1.692894697189331, + "learning_rate": 1.9700572707476066e-05, + "loss": 0.6282, + "step": 8731 + }, + { + "epoch": 1.425452022366434, + "grad_norm": 1.785949468612671, + "learning_rate": 1.9700495630029884e-05, + "loss": 0.7739, + "step": 8732 + }, + { + "epoch": 1.4256152810089384, + "grad_norm": 1.609365701675415, + "learning_rate": 1.9700418542815306e-05, + "loss": 0.6576, + "step": 8733 + }, + { + "epoch": 1.4257785396514429, + "grad_norm": 2.2793776988983154, + "learning_rate": 1.9700341445832408e-05, + "loss": 0.8449, + "step": 8734 + }, + { + "epoch": 1.4259417982939473, + "grad_norm": 1.681693196296692, + "learning_rate": 1.9700264339081268e-05, + "loss": 0.6439, + "step": 8735 + }, + { + "epoch": 1.4261050569364515, + "grad_norm": 1.8200008869171143, + "learning_rate": 1.9700187222561965e-05, + "loss": 0.5901, + "step": 8736 + }, + { + "epoch": 1.426268315578956, + "grad_norm": 2.0122389793395996, + "learning_rate": 1.9700110096274577e-05, + "loss": 0.7284, + "step": 8737 + }, + { + "epoch": 1.4264315742214604, + "grad_norm": 1.8310739994049072, + "learning_rate": 1.970003296021918e-05, + "loss": 0.6716, + "step": 8738 + }, + { + "epoch": 1.4265948328639646, + "grad_norm": 1.7809780836105347, + "learning_rate": 1.969995581439585e-05, + "loss": 0.6034, + "step": 8739 + }, + { + "epoch": 1.426758091506469, + "grad_norm": 1.7563358545303345, + "learning_rate": 1.9699878658804673e-05, + "loss": 0.6303, + "step": 8740 + }, + { + "epoch": 1.4269213501489735, + "grad_norm": 2.5034730434417725, + "learning_rate": 1.9699801493445715e-05, + "loss": 0.7325, + "step": 8741 + }, + { + "epoch": 1.427084608791478, + "grad_norm": 1.8000407218933105, + "learning_rate": 1.9699724318319064e-05, + "loss": 0.7488, + "step": 8742 + }, + { + "epoch": 1.4272478674339824, + "grad_norm": 1.9862347841262817, + "learning_rate": 1.969964713342479e-05, + "loss": 0.812, + "step": 8743 + }, + { + "epoch": 1.4274111260764868, + "grad_norm": 2.2829973697662354, + "learning_rate": 1.9699569938762975e-05, + "loss": 0.7595, + "step": 8744 + }, + { + "epoch": 1.427574384718991, + "grad_norm": 1.8009029626846313, + "learning_rate": 1.9699492734333697e-05, + "loss": 0.6809, + "step": 8745 + }, + { + "epoch": 1.4277376433614954, + "grad_norm": 1.894785761833191, + "learning_rate": 1.9699415520137028e-05, + "loss": 0.6845, + "step": 8746 + }, + { + "epoch": 1.4279009020039999, + "grad_norm": 1.7852375507354736, + "learning_rate": 1.9699338296173053e-05, + "loss": 0.6888, + "step": 8747 + }, + { + "epoch": 1.428064160646504, + "grad_norm": 1.8087503910064697, + "learning_rate": 1.969926106244185e-05, + "loss": 0.6605, + "step": 8748 + }, + { + "epoch": 1.4282274192890085, + "grad_norm": 1.8518214225769043, + "learning_rate": 1.969918381894349e-05, + "loss": 0.7444, + "step": 8749 + }, + { + "epoch": 1.428390677931513, + "grad_norm": 1.5850977897644043, + "learning_rate": 1.969910656567805e-05, + "loss": 0.6481, + "step": 8750 + }, + { + "epoch": 1.4285539365740174, + "grad_norm": 2.013476610183716, + "learning_rate": 1.9699029302645622e-05, + "loss": 0.7409, + "step": 8751 + }, + { + "epoch": 1.4287171952165219, + "grad_norm": 1.9487580060958862, + "learning_rate": 1.9698952029846267e-05, + "loss": 0.7502, + "step": 8752 + }, + { + "epoch": 1.4288804538590263, + "grad_norm": 1.5989595651626587, + "learning_rate": 1.9698874747280074e-05, + "loss": 0.5553, + "step": 8753 + }, + { + "epoch": 1.4290437125015305, + "grad_norm": 1.3954507112503052, + "learning_rate": 1.9698797454947115e-05, + "loss": 0.542, + "step": 8754 + }, + { + "epoch": 1.429206971144035, + "grad_norm": 1.8733289241790771, + "learning_rate": 1.969872015284747e-05, + "loss": 0.7858, + "step": 8755 + }, + { + "epoch": 1.4293702297865394, + "grad_norm": 1.565057635307312, + "learning_rate": 1.9698642840981215e-05, + "loss": 0.5471, + "step": 8756 + }, + { + "epoch": 1.4295334884290436, + "grad_norm": 1.8549431562423706, + "learning_rate": 1.969856551934843e-05, + "loss": 0.6572, + "step": 8757 + }, + { + "epoch": 1.429696747071548, + "grad_norm": 2.008464813232422, + "learning_rate": 1.9698488187949193e-05, + "loss": 0.6773, + "step": 8758 + }, + { + "epoch": 1.4298600057140525, + "grad_norm": 1.900803804397583, + "learning_rate": 1.969841084678358e-05, + "loss": 0.6809, + "step": 8759 + }, + { + "epoch": 1.430023264356557, + "grad_norm": 1.7062910795211792, + "learning_rate": 1.9698333495851672e-05, + "loss": 0.6733, + "step": 8760 + }, + { + "epoch": 1.4301865229990613, + "grad_norm": 1.7559092044830322, + "learning_rate": 1.969825613515354e-05, + "loss": 0.7575, + "step": 8761 + }, + { + "epoch": 1.4303497816415658, + "grad_norm": 2.0171620845794678, + "learning_rate": 1.969817876468927e-05, + "loss": 0.7365, + "step": 8762 + }, + { + "epoch": 1.43051304028407, + "grad_norm": 1.7397462129592896, + "learning_rate": 1.9698101384458937e-05, + "loss": 0.6802, + "step": 8763 + }, + { + "epoch": 1.4306762989265744, + "grad_norm": 1.8439674377441406, + "learning_rate": 1.9698023994462616e-05, + "loss": 0.8354, + "step": 8764 + }, + { + "epoch": 1.4308395575690789, + "grad_norm": 1.7651145458221436, + "learning_rate": 1.969794659470039e-05, + "loss": 0.6704, + "step": 8765 + }, + { + "epoch": 1.431002816211583, + "grad_norm": 1.5611993074417114, + "learning_rate": 1.969786918517233e-05, + "loss": 0.5461, + "step": 8766 + }, + { + "epoch": 1.4311660748540875, + "grad_norm": 1.6152845621109009, + "learning_rate": 1.9697791765878526e-05, + "loss": 0.663, + "step": 8767 + }, + { + "epoch": 1.431329333496592, + "grad_norm": 1.5751186609268188, + "learning_rate": 1.9697714336819044e-05, + "loss": 0.7085, + "step": 8768 + }, + { + "epoch": 1.4314925921390964, + "grad_norm": 1.459073781967163, + "learning_rate": 1.9697636897993964e-05, + "loss": 0.5974, + "step": 8769 + }, + { + "epoch": 1.4316558507816008, + "grad_norm": 1.4595152139663696, + "learning_rate": 1.969755944940337e-05, + "loss": 0.6075, + "step": 8770 + }, + { + "epoch": 1.431819109424105, + "grad_norm": 1.6487301588058472, + "learning_rate": 1.9697481991047332e-05, + "loss": 0.6795, + "step": 8771 + }, + { + "epoch": 1.4319823680666095, + "grad_norm": 1.5374324321746826, + "learning_rate": 1.9697404522925937e-05, + "loss": 0.6009, + "step": 8772 + }, + { + "epoch": 1.432145626709114, + "grad_norm": 1.6228915452957153, + "learning_rate": 1.9697327045039255e-05, + "loss": 0.678, + "step": 8773 + }, + { + "epoch": 1.4323088853516184, + "grad_norm": 2.013782262802124, + "learning_rate": 1.9697249557387367e-05, + "loss": 0.6317, + "step": 8774 + }, + { + "epoch": 1.4324721439941226, + "grad_norm": 2.1244795322418213, + "learning_rate": 1.9697172059970355e-05, + "loss": 0.8958, + "step": 8775 + }, + { + "epoch": 1.432635402636627, + "grad_norm": 2.0930838584899902, + "learning_rate": 1.969709455278829e-05, + "loss": 0.7222, + "step": 8776 + }, + { + "epoch": 1.4327986612791315, + "grad_norm": 1.6331270933151245, + "learning_rate": 1.969701703584125e-05, + "loss": 0.5573, + "step": 8777 + }, + { + "epoch": 1.432961919921636, + "grad_norm": 1.7399390935897827, + "learning_rate": 1.9696939509129324e-05, + "loss": 0.709, + "step": 8778 + }, + { + "epoch": 1.4331251785641403, + "grad_norm": 1.8957643508911133, + "learning_rate": 1.969686197265258e-05, + "loss": 0.7133, + "step": 8779 + }, + { + "epoch": 1.4332884372066446, + "grad_norm": 2.001325845718384, + "learning_rate": 1.9696784426411097e-05, + "loss": 0.6728, + "step": 8780 + }, + { + "epoch": 1.433451695849149, + "grad_norm": 2.9992613792419434, + "learning_rate": 1.9696706870404955e-05, + "loss": 0.8508, + "step": 8781 + }, + { + "epoch": 1.4336149544916534, + "grad_norm": 1.7546895742416382, + "learning_rate": 1.969662930463423e-05, + "loss": 0.632, + "step": 8782 + }, + { + "epoch": 1.4337782131341577, + "grad_norm": 1.5804767608642578, + "learning_rate": 1.9696551729099005e-05, + "loss": 0.6293, + "step": 8783 + }, + { + "epoch": 1.433941471776662, + "grad_norm": 1.536778211593628, + "learning_rate": 1.9696474143799353e-05, + "loss": 0.5573, + "step": 8784 + }, + { + "epoch": 1.4341047304191665, + "grad_norm": 1.7767776250839233, + "learning_rate": 1.9696396548735356e-05, + "loss": 0.7113, + "step": 8785 + }, + { + "epoch": 1.434267989061671, + "grad_norm": 1.680504560470581, + "learning_rate": 1.9696318943907088e-05, + "loss": 0.6455, + "step": 8786 + }, + { + "epoch": 1.4344312477041754, + "grad_norm": 1.624477744102478, + "learning_rate": 1.969624132931463e-05, + "loss": 0.6571, + "step": 8787 + }, + { + "epoch": 1.4345945063466798, + "grad_norm": 1.6071616411209106, + "learning_rate": 1.969616370495806e-05, + "loss": 0.6614, + "step": 8788 + }, + { + "epoch": 1.434757764989184, + "grad_norm": 1.9092326164245605, + "learning_rate": 1.9696086070837458e-05, + "loss": 0.6732, + "step": 8789 + }, + { + "epoch": 1.4349210236316885, + "grad_norm": 1.5513759851455688, + "learning_rate": 1.9696008426952897e-05, + "loss": 0.6378, + "step": 8790 + }, + { + "epoch": 1.435084282274193, + "grad_norm": 1.7605129480361938, + "learning_rate": 1.969593077330446e-05, + "loss": 0.7416, + "step": 8791 + }, + { + "epoch": 1.4352475409166972, + "grad_norm": 1.7378876209259033, + "learning_rate": 1.969585310989222e-05, + "loss": 0.6029, + "step": 8792 + }, + { + "epoch": 1.4354107995592016, + "grad_norm": 1.768326759338379, + "learning_rate": 1.9695775436716263e-05, + "loss": 0.7506, + "step": 8793 + }, + { + "epoch": 1.435574058201706, + "grad_norm": 1.810838222503662, + "learning_rate": 1.969569775377666e-05, + "loss": 0.6507, + "step": 8794 + }, + { + "epoch": 1.4357373168442105, + "grad_norm": 2.145047664642334, + "learning_rate": 1.969562006107349e-05, + "loss": 0.8034, + "step": 8795 + }, + { + "epoch": 1.435900575486715, + "grad_norm": 1.648671269416809, + "learning_rate": 1.9695542358606838e-05, + "loss": 0.61, + "step": 8796 + }, + { + "epoch": 1.4360638341292193, + "grad_norm": 1.6534838676452637, + "learning_rate": 1.969546464637677e-05, + "loss": 0.7447, + "step": 8797 + }, + { + "epoch": 1.4362270927717236, + "grad_norm": 1.6590546369552612, + "learning_rate": 1.969538692438338e-05, + "loss": 0.6742, + "step": 8798 + }, + { + "epoch": 1.436390351414228, + "grad_norm": 1.8258951902389526, + "learning_rate": 1.9695309192626736e-05, + "loss": 0.6259, + "step": 8799 + }, + { + "epoch": 1.4365536100567324, + "grad_norm": 1.8605329990386963, + "learning_rate": 1.9695231451106914e-05, + "loss": 0.7122, + "step": 8800 + }, + { + "epoch": 1.4367168686992366, + "grad_norm": 2.0588760375976562, + "learning_rate": 1.9695153699824e-05, + "loss": 0.7439, + "step": 8801 + }, + { + "epoch": 1.436880127341741, + "grad_norm": 1.859056830406189, + "learning_rate": 1.9695075938778066e-05, + "loss": 0.7721, + "step": 8802 + }, + { + "epoch": 1.4370433859842455, + "grad_norm": 1.6266734600067139, + "learning_rate": 1.9694998167969196e-05, + "loss": 0.6049, + "step": 8803 + }, + { + "epoch": 1.43720664462675, + "grad_norm": 2.1042253971099854, + "learning_rate": 1.9694920387397466e-05, + "loss": 0.708, + "step": 8804 + }, + { + "epoch": 1.4373699032692544, + "grad_norm": 1.7716577053070068, + "learning_rate": 1.969484259706295e-05, + "loss": 0.7349, + "step": 8805 + }, + { + "epoch": 1.4375331619117588, + "grad_norm": 1.8162206411361694, + "learning_rate": 1.9694764796965736e-05, + "loss": 0.7807, + "step": 8806 + }, + { + "epoch": 1.437696420554263, + "grad_norm": 1.6888149976730347, + "learning_rate": 1.9694686987105893e-05, + "loss": 0.7046, + "step": 8807 + }, + { + "epoch": 1.4378596791967675, + "grad_norm": 1.7741550207138062, + "learning_rate": 1.96946091674835e-05, + "loss": 0.6955, + "step": 8808 + }, + { + "epoch": 1.438022937839272, + "grad_norm": 1.968450665473938, + "learning_rate": 1.969453133809864e-05, + "loss": 0.847, + "step": 8809 + }, + { + "epoch": 1.4381861964817761, + "grad_norm": 1.718488097190857, + "learning_rate": 1.9694453498951392e-05, + "loss": 0.7291, + "step": 8810 + }, + { + "epoch": 1.4383494551242806, + "grad_norm": 1.6668022871017456, + "learning_rate": 1.969437565004183e-05, + "loss": 0.7082, + "step": 8811 + }, + { + "epoch": 1.438512713766785, + "grad_norm": 1.5092474222183228, + "learning_rate": 1.9694297791370035e-05, + "loss": 0.5709, + "step": 8812 + }, + { + "epoch": 1.4386759724092895, + "grad_norm": 1.7654494047164917, + "learning_rate": 1.9694219922936082e-05, + "loss": 0.6133, + "step": 8813 + }, + { + "epoch": 1.438839231051794, + "grad_norm": 2.369175434112549, + "learning_rate": 1.9694142044740053e-05, + "loss": 0.77, + "step": 8814 + }, + { + "epoch": 1.4390024896942981, + "grad_norm": 1.8999091386795044, + "learning_rate": 1.9694064156782027e-05, + "loss": 0.7538, + "step": 8815 + }, + { + "epoch": 1.4391657483368026, + "grad_norm": 1.6721547842025757, + "learning_rate": 1.9693986259062082e-05, + "loss": 0.6238, + "step": 8816 + }, + { + "epoch": 1.439329006979307, + "grad_norm": 1.805945634841919, + "learning_rate": 1.9693908351580293e-05, + "loss": 0.5796, + "step": 8817 + }, + { + "epoch": 1.4394922656218114, + "grad_norm": 1.6975070238113403, + "learning_rate": 1.9693830434336743e-05, + "loss": 0.7425, + "step": 8818 + }, + { + "epoch": 1.4396555242643156, + "grad_norm": 1.7397018671035767, + "learning_rate": 1.969375250733151e-05, + "loss": 0.6267, + "step": 8819 + }, + { + "epoch": 1.43981878290682, + "grad_norm": 2.073922634124756, + "learning_rate": 1.9693674570564663e-05, + "loss": 0.6892, + "step": 8820 + }, + { + "epoch": 1.4399820415493245, + "grad_norm": 1.7947229146957397, + "learning_rate": 1.9693596624036294e-05, + "loss": 0.669, + "step": 8821 + }, + { + "epoch": 1.440145300191829, + "grad_norm": 1.7729836702346802, + "learning_rate": 1.9693518667746474e-05, + "loss": 0.7091, + "step": 8822 + }, + { + "epoch": 1.4403085588343334, + "grad_norm": 1.8461151123046875, + "learning_rate": 1.9693440701695287e-05, + "loss": 0.7911, + "step": 8823 + }, + { + "epoch": 1.4404718174768376, + "grad_norm": 1.8674719333648682, + "learning_rate": 1.9693362725882804e-05, + "loss": 0.6728, + "step": 8824 + }, + { + "epoch": 1.440635076119342, + "grad_norm": 1.5845552682876587, + "learning_rate": 1.969328474030911e-05, + "loss": 0.662, + "step": 8825 + }, + { + "epoch": 1.4407983347618465, + "grad_norm": 1.7984018325805664, + "learning_rate": 1.9693206744974276e-05, + "loss": 0.6898, + "step": 8826 + }, + { + "epoch": 1.4409615934043507, + "grad_norm": 1.4857940673828125, + "learning_rate": 1.969312873987839e-05, + "loss": 0.5708, + "step": 8827 + }, + { + "epoch": 1.4411248520468551, + "grad_norm": 1.6860756874084473, + "learning_rate": 1.9693050725021523e-05, + "loss": 0.634, + "step": 8828 + }, + { + "epoch": 1.4412881106893596, + "grad_norm": 1.8787457942962646, + "learning_rate": 1.969297270040376e-05, + "loss": 0.6958, + "step": 8829 + }, + { + "epoch": 1.441451369331864, + "grad_norm": 1.9820045232772827, + "learning_rate": 1.9692894666025174e-05, + "loss": 0.6795, + "step": 8830 + }, + { + "epoch": 1.4416146279743685, + "grad_norm": 1.8316231966018677, + "learning_rate": 1.9692816621885846e-05, + "loss": 0.5626, + "step": 8831 + }, + { + "epoch": 1.441777886616873, + "grad_norm": 1.7729562520980835, + "learning_rate": 1.9692738567985853e-05, + "loss": 0.7078, + "step": 8832 + }, + { + "epoch": 1.441941145259377, + "grad_norm": 1.785528302192688, + "learning_rate": 1.9692660504325276e-05, + "loss": 0.7462, + "step": 8833 + }, + { + "epoch": 1.4421044039018815, + "grad_norm": 1.8229944705963135, + "learning_rate": 1.9692582430904193e-05, + "loss": 0.6522, + "step": 8834 + }, + { + "epoch": 1.442267662544386, + "grad_norm": 1.9123971462249756, + "learning_rate": 1.969250434772268e-05, + "loss": 0.7042, + "step": 8835 + }, + { + "epoch": 1.4424309211868902, + "grad_norm": 1.9770201444625854, + "learning_rate": 1.9692426254780818e-05, + "loss": 0.6078, + "step": 8836 + }, + { + "epoch": 1.4425941798293946, + "grad_norm": 2.1088626384735107, + "learning_rate": 1.969234815207869e-05, + "loss": 0.7078, + "step": 8837 + }, + { + "epoch": 1.442757438471899, + "grad_norm": 1.9816428422927856, + "learning_rate": 1.9692270039616367e-05, + "loss": 0.69, + "step": 8838 + }, + { + "epoch": 1.4429206971144035, + "grad_norm": 1.7847563028335571, + "learning_rate": 1.9692191917393927e-05, + "loss": 0.6528, + "step": 8839 + }, + { + "epoch": 1.443083955756908, + "grad_norm": 1.604605793952942, + "learning_rate": 1.9692113785411456e-05, + "loss": 0.6549, + "step": 8840 + }, + { + "epoch": 1.4432472143994124, + "grad_norm": 2.4138731956481934, + "learning_rate": 1.969203564366903e-05, + "loss": 0.7697, + "step": 8841 + }, + { + "epoch": 1.4434104730419166, + "grad_norm": 1.5967594385147095, + "learning_rate": 1.9691957492166725e-05, + "loss": 0.5896, + "step": 8842 + }, + { + "epoch": 1.443573731684421, + "grad_norm": 1.6789475679397583, + "learning_rate": 1.9691879330904618e-05, + "loss": 0.6122, + "step": 8843 + }, + { + "epoch": 1.4437369903269255, + "grad_norm": 1.994183897972107, + "learning_rate": 1.9691801159882798e-05, + "loss": 0.6192, + "step": 8844 + }, + { + "epoch": 1.4439002489694297, + "grad_norm": 1.6905208826065063, + "learning_rate": 1.9691722979101332e-05, + "loss": 0.5792, + "step": 8845 + }, + { + "epoch": 1.4440635076119341, + "grad_norm": 1.8205161094665527, + "learning_rate": 1.96916447885603e-05, + "loss": 0.7432, + "step": 8846 + }, + { + "epoch": 1.4442267662544386, + "grad_norm": 2.0391979217529297, + "learning_rate": 1.969156658825979e-05, + "loss": 0.829, + "step": 8847 + }, + { + "epoch": 1.444390024896943, + "grad_norm": 1.4860278367996216, + "learning_rate": 1.9691488378199875e-05, + "loss": 0.5302, + "step": 8848 + }, + { + "epoch": 1.4445532835394475, + "grad_norm": 2.521899461746216, + "learning_rate": 1.9691410158380636e-05, + "loss": 0.7345, + "step": 8849 + }, + { + "epoch": 1.444716542181952, + "grad_norm": 1.5531392097473145, + "learning_rate": 1.9691331928802144e-05, + "loss": 0.5653, + "step": 8850 + }, + { + "epoch": 1.444879800824456, + "grad_norm": 1.8882185220718384, + "learning_rate": 1.9691253689464487e-05, + "loss": 0.6665, + "step": 8851 + }, + { + "epoch": 1.4450430594669605, + "grad_norm": 1.9289053678512573, + "learning_rate": 1.9691175440367737e-05, + "loss": 0.8581, + "step": 8852 + }, + { + "epoch": 1.445206318109465, + "grad_norm": 1.838510513305664, + "learning_rate": 1.9691097181511978e-05, + "loss": 0.6403, + "step": 8853 + }, + { + "epoch": 1.4453695767519692, + "grad_norm": 1.6136966943740845, + "learning_rate": 1.9691018912897285e-05, + "loss": 0.5826, + "step": 8854 + }, + { + "epoch": 1.4455328353944736, + "grad_norm": 1.6708736419677734, + "learning_rate": 1.9690940634523742e-05, + "loss": 0.618, + "step": 8855 + }, + { + "epoch": 1.445696094036978, + "grad_norm": 1.8291469812393188, + "learning_rate": 1.9690862346391424e-05, + "loss": 0.5951, + "step": 8856 + }, + { + "epoch": 1.4458593526794825, + "grad_norm": 1.8768844604492188, + "learning_rate": 1.9690784048500406e-05, + "loss": 0.6985, + "step": 8857 + }, + { + "epoch": 1.446022611321987, + "grad_norm": 1.7540936470031738, + "learning_rate": 1.9690705740850772e-05, + "loss": 0.7089, + "step": 8858 + }, + { + "epoch": 1.4461858699644912, + "grad_norm": 1.8675518035888672, + "learning_rate": 1.9690627423442607e-05, + "loss": 0.7893, + "step": 8859 + }, + { + "epoch": 1.4463491286069956, + "grad_norm": 1.8266639709472656, + "learning_rate": 1.9690549096275972e-05, + "loss": 0.6541, + "step": 8860 + }, + { + "epoch": 1.4465123872495, + "grad_norm": 1.8189997673034668, + "learning_rate": 1.9690470759350965e-05, + "loss": 0.8061, + "step": 8861 + }, + { + "epoch": 1.4466756458920045, + "grad_norm": 1.9061384201049805, + "learning_rate": 1.969039241266765e-05, + "loss": 0.658, + "step": 8862 + }, + { + "epoch": 1.4468389045345087, + "grad_norm": 1.792631983757019, + "learning_rate": 1.969031405622612e-05, + "loss": 0.664, + "step": 8863 + }, + { + "epoch": 1.4470021631770131, + "grad_norm": 1.602497935295105, + "learning_rate": 1.9690235690026438e-05, + "loss": 0.6939, + "step": 8864 + }, + { + "epoch": 1.4471654218195176, + "grad_norm": 1.7190130949020386, + "learning_rate": 1.9690157314068696e-05, + "loss": 0.627, + "step": 8865 + }, + { + "epoch": 1.447328680462022, + "grad_norm": 1.664627194404602, + "learning_rate": 1.969007892835297e-05, + "loss": 0.6644, + "step": 8866 + }, + { + "epoch": 1.4474919391045264, + "grad_norm": 1.7824649810791016, + "learning_rate": 1.9690000532879333e-05, + "loss": 0.7233, + "step": 8867 + }, + { + "epoch": 1.4476551977470307, + "grad_norm": 1.507237434387207, + "learning_rate": 1.9689922127647868e-05, + "loss": 0.5911, + "step": 8868 + }, + { + "epoch": 1.447818456389535, + "grad_norm": 1.8902795314788818, + "learning_rate": 1.9689843712658655e-05, + "loss": 0.702, + "step": 8869 + }, + { + "epoch": 1.4479817150320395, + "grad_norm": 3.1822617053985596, + "learning_rate": 1.9689765287911774e-05, + "loss": 0.7296, + "step": 8870 + }, + { + "epoch": 1.448144973674544, + "grad_norm": 2.177860736846924, + "learning_rate": 1.9689686853407298e-05, + "loss": 0.8308, + "step": 8871 + }, + { + "epoch": 1.4483082323170482, + "grad_norm": 1.6397078037261963, + "learning_rate": 1.9689608409145317e-05, + "loss": 0.6511, + "step": 8872 + }, + { + "epoch": 1.4484714909595526, + "grad_norm": 1.585407018661499, + "learning_rate": 1.9689529955125896e-05, + "loss": 0.5585, + "step": 8873 + }, + { + "epoch": 1.448634749602057, + "grad_norm": 1.9478856325149536, + "learning_rate": 1.9689451491349123e-05, + "loss": 0.5887, + "step": 8874 + }, + { + "epoch": 1.4487980082445615, + "grad_norm": 1.236572504043579, + "learning_rate": 1.9689373017815076e-05, + "loss": 0.4629, + "step": 8875 + }, + { + "epoch": 1.448961266887066, + "grad_norm": 1.6886515617370605, + "learning_rate": 1.968929453452383e-05, + "loss": 0.6839, + "step": 8876 + }, + { + "epoch": 1.4491245255295702, + "grad_norm": 1.6924022436141968, + "learning_rate": 1.968921604147547e-05, + "loss": 0.6596, + "step": 8877 + }, + { + "epoch": 1.4492877841720746, + "grad_norm": 1.924888253211975, + "learning_rate": 1.968913753867007e-05, + "loss": 0.8504, + "step": 8878 + }, + { + "epoch": 1.449451042814579, + "grad_norm": 1.2581098079681396, + "learning_rate": 1.9689059026107712e-05, + "loss": 0.5499, + "step": 8879 + }, + { + "epoch": 1.4496143014570833, + "grad_norm": 1.4864521026611328, + "learning_rate": 1.9688980503788474e-05, + "loss": 0.6178, + "step": 8880 + }, + { + "epoch": 1.4497775600995877, + "grad_norm": 1.8477166891098022, + "learning_rate": 1.9688901971712436e-05, + "loss": 0.749, + "step": 8881 + }, + { + "epoch": 1.4499408187420921, + "grad_norm": 1.677072286605835, + "learning_rate": 1.9688823429879676e-05, + "loss": 0.647, + "step": 8882 + }, + { + "epoch": 1.4501040773845966, + "grad_norm": 1.9211666584014893, + "learning_rate": 1.9688744878290273e-05, + "loss": 0.7423, + "step": 8883 + }, + { + "epoch": 1.450267336027101, + "grad_norm": 2.0923755168914795, + "learning_rate": 1.9688666316944306e-05, + "loss": 0.7062, + "step": 8884 + }, + { + "epoch": 1.4504305946696054, + "grad_norm": 1.7697687149047852, + "learning_rate": 1.9688587745841856e-05, + "loss": 0.7711, + "step": 8885 + }, + { + "epoch": 1.4505938533121097, + "grad_norm": 1.7329542636871338, + "learning_rate": 1.9688509164982998e-05, + "loss": 0.7016, + "step": 8886 + }, + { + "epoch": 1.450757111954614, + "grad_norm": 1.7890706062316895, + "learning_rate": 1.968843057436782e-05, + "loss": 0.6942, + "step": 8887 + }, + { + "epoch": 1.4509203705971185, + "grad_norm": 1.8924055099487305, + "learning_rate": 1.9688351973996388e-05, + "loss": 0.7323, + "step": 8888 + }, + { + "epoch": 1.4510836292396228, + "grad_norm": 2.0330381393432617, + "learning_rate": 1.968827336386879e-05, + "loss": 0.7495, + "step": 8889 + }, + { + "epoch": 1.4512468878821272, + "grad_norm": 1.9958441257476807, + "learning_rate": 1.9688194743985103e-05, + "loss": 0.8014, + "step": 8890 + }, + { + "epoch": 1.4514101465246316, + "grad_norm": 1.8537036180496216, + "learning_rate": 1.968811611434541e-05, + "loss": 0.5945, + "step": 8891 + }, + { + "epoch": 1.451573405167136, + "grad_norm": 1.607219934463501, + "learning_rate": 1.9688037474949784e-05, + "loss": 0.6038, + "step": 8892 + }, + { + "epoch": 1.4517366638096405, + "grad_norm": 1.8624411821365356, + "learning_rate": 1.9687958825798306e-05, + "loss": 0.747, + "step": 8893 + }, + { + "epoch": 1.451899922452145, + "grad_norm": 1.9149376153945923, + "learning_rate": 1.9687880166891058e-05, + "loss": 0.9008, + "step": 8894 + }, + { + "epoch": 1.4520631810946492, + "grad_norm": 1.603529453277588, + "learning_rate": 1.9687801498228114e-05, + "loss": 0.6193, + "step": 8895 + }, + { + "epoch": 1.4522264397371536, + "grad_norm": 1.8671441078186035, + "learning_rate": 1.968772281980956e-05, + "loss": 0.597, + "step": 8896 + }, + { + "epoch": 1.452389698379658, + "grad_norm": 1.612332820892334, + "learning_rate": 1.9687644131635467e-05, + "loss": 0.6899, + "step": 8897 + }, + { + "epoch": 1.4525529570221622, + "grad_norm": 1.4945591688156128, + "learning_rate": 1.9687565433705926e-05, + "loss": 0.5442, + "step": 8898 + }, + { + "epoch": 1.4527162156646667, + "grad_norm": 1.6702260971069336, + "learning_rate": 1.9687486726021005e-05, + "loss": 0.6935, + "step": 8899 + }, + { + "epoch": 1.4528794743071711, + "grad_norm": 1.7897847890853882, + "learning_rate": 1.9687408008580785e-05, + "loss": 0.6721, + "step": 8900 + }, + { + "epoch": 1.4530427329496756, + "grad_norm": 1.9352015256881714, + "learning_rate": 1.968732928138535e-05, + "loss": 0.696, + "step": 8901 + }, + { + "epoch": 1.45320599159218, + "grad_norm": 1.6360608339309692, + "learning_rate": 1.968725054443478e-05, + "loss": 0.6205, + "step": 8902 + }, + { + "epoch": 1.4533692502346842, + "grad_norm": 1.6993275880813599, + "learning_rate": 1.968717179772915e-05, + "loss": 0.6969, + "step": 8903 + }, + { + "epoch": 1.4535325088771887, + "grad_norm": 1.7121833562850952, + "learning_rate": 1.968709304126854e-05, + "loss": 0.7229, + "step": 8904 + }, + { + "epoch": 1.453695767519693, + "grad_norm": 1.625565528869629, + "learning_rate": 1.9687014275053026e-05, + "loss": 0.7491, + "step": 8905 + }, + { + "epoch": 1.4538590261621975, + "grad_norm": 1.7381452322006226, + "learning_rate": 1.9686935499082697e-05, + "loss": 0.7959, + "step": 8906 + }, + { + "epoch": 1.4540222848047017, + "grad_norm": 1.6734864711761475, + "learning_rate": 1.9686856713357625e-05, + "loss": 0.6764, + "step": 8907 + }, + { + "epoch": 1.4541855434472062, + "grad_norm": 1.7063758373260498, + "learning_rate": 1.9686777917877888e-05, + "loss": 0.6148, + "step": 8908 + }, + { + "epoch": 1.4543488020897106, + "grad_norm": 1.736620306968689, + "learning_rate": 1.9686699112643574e-05, + "loss": 0.7172, + "step": 8909 + }, + { + "epoch": 1.454512060732215, + "grad_norm": 1.738698959350586, + "learning_rate": 1.9686620297654747e-05, + "loss": 0.6342, + "step": 8910 + }, + { + "epoch": 1.4546753193747195, + "grad_norm": 1.5827399492263794, + "learning_rate": 1.9686541472911506e-05, + "loss": 0.6034, + "step": 8911 + }, + { + "epoch": 1.4548385780172237, + "grad_norm": 1.6460212469100952, + "learning_rate": 1.9686462638413914e-05, + "loss": 0.7573, + "step": 8912 + }, + { + "epoch": 1.4550018366597282, + "grad_norm": 1.6471400260925293, + "learning_rate": 1.9686383794162057e-05, + "loss": 0.6466, + "step": 8913 + }, + { + "epoch": 1.4551650953022326, + "grad_norm": 1.5103318691253662, + "learning_rate": 1.968630494015602e-05, + "loss": 0.5164, + "step": 8914 + }, + { + "epoch": 1.455328353944737, + "grad_norm": 1.4050352573394775, + "learning_rate": 1.9686226076395873e-05, + "loss": 0.6073, + "step": 8915 + }, + { + "epoch": 1.4554916125872412, + "grad_norm": 1.7616324424743652, + "learning_rate": 1.9686147202881694e-05, + "loss": 0.6873, + "step": 8916 + }, + { + "epoch": 1.4556548712297457, + "grad_norm": 1.7206569910049438, + "learning_rate": 1.968606831961357e-05, + "loss": 0.6892, + "step": 8917 + }, + { + "epoch": 1.4558181298722501, + "grad_norm": 1.9095330238342285, + "learning_rate": 1.968598942659158e-05, + "loss": 0.8287, + "step": 8918 + }, + { + "epoch": 1.4559813885147546, + "grad_norm": 2.0181291103363037, + "learning_rate": 1.96859105238158e-05, + "loss": 0.9043, + "step": 8919 + }, + { + "epoch": 1.456144647157259, + "grad_norm": 1.5471086502075195, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.5298, + "step": 8920 + }, + { + "epoch": 1.4563079057997632, + "grad_norm": 2.033290386199951, + "learning_rate": 1.9685752689003195e-05, + "loss": 0.8341, + "step": 8921 + }, + { + "epoch": 1.4564711644422677, + "grad_norm": 1.501089334487915, + "learning_rate": 1.9685673756966524e-05, + "loss": 0.6193, + "step": 8922 + }, + { + "epoch": 1.456634423084772, + "grad_norm": 1.7674059867858887, + "learning_rate": 1.9685594815176384e-05, + "loss": 0.6865, + "step": 8923 + }, + { + "epoch": 1.4567976817272763, + "grad_norm": 1.8313947916030884, + "learning_rate": 1.9685515863632852e-05, + "loss": 0.6695, + "step": 8924 + }, + { + "epoch": 1.4569609403697807, + "grad_norm": 1.558915376663208, + "learning_rate": 1.968543690233601e-05, + "loss": 0.6673, + "step": 8925 + }, + { + "epoch": 1.4571241990122852, + "grad_norm": 1.7462880611419678, + "learning_rate": 1.9685357931285934e-05, + "loss": 0.6312, + "step": 8926 + }, + { + "epoch": 1.4572874576547896, + "grad_norm": 1.8824939727783203, + "learning_rate": 1.9685278950482707e-05, + "loss": 0.7016, + "step": 8927 + }, + { + "epoch": 1.457450716297294, + "grad_norm": 2.0164718627929688, + "learning_rate": 1.9685199959926403e-05, + "loss": 0.846, + "step": 8928 + }, + { + "epoch": 1.4576139749397985, + "grad_norm": 1.7564702033996582, + "learning_rate": 1.9685120959617108e-05, + "loss": 0.7853, + "step": 8929 + }, + { + "epoch": 1.4577772335823027, + "grad_norm": 1.609257698059082, + "learning_rate": 1.9685041949554896e-05, + "loss": 0.6617, + "step": 8930 + }, + { + "epoch": 1.4579404922248071, + "grad_norm": 2.1343986988067627, + "learning_rate": 1.968496292973985e-05, + "loss": 0.7274, + "step": 8931 + }, + { + "epoch": 1.4581037508673116, + "grad_norm": 1.4686297178268433, + "learning_rate": 1.9684883900172053e-05, + "loss": 0.5256, + "step": 8932 + }, + { + "epoch": 1.4582670095098158, + "grad_norm": 1.8336305618286133, + "learning_rate": 1.9684804860851578e-05, + "loss": 0.7734, + "step": 8933 + }, + { + "epoch": 1.4584302681523202, + "grad_norm": 1.5332152843475342, + "learning_rate": 1.9684725811778507e-05, + "loss": 0.6916, + "step": 8934 + }, + { + "epoch": 1.4585935267948247, + "grad_norm": 1.8223613500595093, + "learning_rate": 1.9684646752952917e-05, + "loss": 0.627, + "step": 8935 + }, + { + "epoch": 1.4587567854373291, + "grad_norm": 1.650575041770935, + "learning_rate": 1.9684567684374897e-05, + "loss": 0.6455, + "step": 8936 + }, + { + "epoch": 1.4589200440798336, + "grad_norm": 1.8065630197525024, + "learning_rate": 1.9684488606044513e-05, + "loss": 0.708, + "step": 8937 + }, + { + "epoch": 1.459083302722338, + "grad_norm": 1.6888772249221802, + "learning_rate": 1.9684409517961852e-05, + "loss": 0.6138, + "step": 8938 + }, + { + "epoch": 1.4592465613648422, + "grad_norm": 1.49433434009552, + "learning_rate": 1.9684330420127e-05, + "loss": 0.5789, + "step": 8939 + }, + { + "epoch": 1.4594098200073466, + "grad_norm": 1.7103060483932495, + "learning_rate": 1.9684251312540023e-05, + "loss": 0.5977, + "step": 8940 + }, + { + "epoch": 1.459573078649851, + "grad_norm": 1.4001315832138062, + "learning_rate": 1.968417219520101e-05, + "loss": 0.5701, + "step": 8941 + }, + { + "epoch": 1.4597363372923553, + "grad_norm": 1.755624771118164, + "learning_rate": 1.968409306811004e-05, + "loss": 0.7509, + "step": 8942 + }, + { + "epoch": 1.4598995959348597, + "grad_norm": 1.7058089971542358, + "learning_rate": 1.9684013931267184e-05, + "loss": 0.6316, + "step": 8943 + }, + { + "epoch": 1.4600628545773642, + "grad_norm": 1.8235746622085571, + "learning_rate": 1.9683934784672535e-05, + "loss": 0.7068, + "step": 8944 + }, + { + "epoch": 1.4602261132198686, + "grad_norm": 2.3905768394470215, + "learning_rate": 1.9683855628326164e-05, + "loss": 0.7308, + "step": 8945 + }, + { + "epoch": 1.460389371862373, + "grad_norm": 1.8120334148406982, + "learning_rate": 1.9683776462228153e-05, + "loss": 0.6979, + "step": 8946 + }, + { + "epoch": 1.4605526305048773, + "grad_norm": 1.9066009521484375, + "learning_rate": 1.968369728637858e-05, + "loss": 0.6391, + "step": 8947 + }, + { + "epoch": 1.4607158891473817, + "grad_norm": 2.0647151470184326, + "learning_rate": 1.9683618100777532e-05, + "loss": 0.6905, + "step": 8948 + }, + { + "epoch": 1.4608791477898861, + "grad_norm": 1.4807260036468506, + "learning_rate": 1.968353890542508e-05, + "loss": 0.6011, + "step": 8949 + }, + { + "epoch": 1.4610424064323906, + "grad_norm": 1.5642013549804688, + "learning_rate": 1.9683459700321305e-05, + "loss": 0.5416, + "step": 8950 + }, + { + "epoch": 1.4612056650748948, + "grad_norm": 2.0606138706207275, + "learning_rate": 1.9683380485466292e-05, + "loss": 0.8551, + "step": 8951 + }, + { + "epoch": 1.4613689237173992, + "grad_norm": 1.5649231672286987, + "learning_rate": 1.9683301260860115e-05, + "loss": 0.552, + "step": 8952 + }, + { + "epoch": 1.4615321823599037, + "grad_norm": 1.970979928970337, + "learning_rate": 1.9683222026502856e-05, + "loss": 0.7977, + "step": 8953 + }, + { + "epoch": 1.4616954410024081, + "grad_norm": 1.6149612665176392, + "learning_rate": 1.96831427823946e-05, + "loss": 0.5893, + "step": 8954 + }, + { + "epoch": 1.4618586996449126, + "grad_norm": 2.6173040866851807, + "learning_rate": 1.9683063528535417e-05, + "loss": 0.7842, + "step": 8955 + }, + { + "epoch": 1.4620219582874168, + "grad_norm": 1.8031708002090454, + "learning_rate": 1.968298426492539e-05, + "loss": 0.7307, + "step": 8956 + }, + { + "epoch": 1.4621852169299212, + "grad_norm": 1.7610505819320679, + "learning_rate": 1.9682904991564603e-05, + "loss": 0.7092, + "step": 8957 + }, + { + "epoch": 1.4623484755724256, + "grad_norm": 1.911726951599121, + "learning_rate": 1.968282570845313e-05, + "loss": 0.7592, + "step": 8958 + }, + { + "epoch": 1.46251173421493, + "grad_norm": 1.5143975019454956, + "learning_rate": 1.968274641559106e-05, + "loss": 0.5559, + "step": 8959 + }, + { + "epoch": 1.4626749928574343, + "grad_norm": 1.7001502513885498, + "learning_rate": 1.9682667112978464e-05, + "loss": 0.6005, + "step": 8960 + }, + { + "epoch": 1.4628382514999387, + "grad_norm": 1.7902227640151978, + "learning_rate": 1.9682587800615425e-05, + "loss": 0.715, + "step": 8961 + }, + { + "epoch": 1.4630015101424432, + "grad_norm": 1.785680890083313, + "learning_rate": 1.968250847850202e-05, + "loss": 0.684, + "step": 8962 + }, + { + "epoch": 1.4631647687849476, + "grad_norm": 1.8269716501235962, + "learning_rate": 1.9682429146638336e-05, + "loss": 0.6348, + "step": 8963 + }, + { + "epoch": 1.463328027427452, + "grad_norm": 1.5608235597610474, + "learning_rate": 1.9682349805024447e-05, + "loss": 0.5834, + "step": 8964 + }, + { + "epoch": 1.4634912860699563, + "grad_norm": 2.2523598670959473, + "learning_rate": 1.9682270453660432e-05, + "loss": 0.787, + "step": 8965 + }, + { + "epoch": 1.4636545447124607, + "grad_norm": 1.7545371055603027, + "learning_rate": 1.9682191092546374e-05, + "loss": 0.6705, + "step": 8966 + }, + { + "epoch": 1.4638178033549651, + "grad_norm": 2.055321455001831, + "learning_rate": 1.9682111721682353e-05, + "loss": 0.6296, + "step": 8967 + }, + { + "epoch": 1.4639810619974694, + "grad_norm": 1.9529483318328857, + "learning_rate": 1.9682032341068448e-05, + "loss": 0.6944, + "step": 8968 + }, + { + "epoch": 1.4641443206399738, + "grad_norm": 1.7448019981384277, + "learning_rate": 1.9681952950704743e-05, + "loss": 0.6622, + "step": 8969 + }, + { + "epoch": 1.4643075792824782, + "grad_norm": 1.8738882541656494, + "learning_rate": 1.9681873550591306e-05, + "loss": 0.7063, + "step": 8970 + }, + { + "epoch": 1.4644708379249827, + "grad_norm": 1.9551479816436768, + "learning_rate": 1.968179414072823e-05, + "loss": 0.6939, + "step": 8971 + }, + { + "epoch": 1.464634096567487, + "grad_norm": 1.868450403213501, + "learning_rate": 1.9681714721115587e-05, + "loss": 0.7406, + "step": 8972 + }, + { + "epoch": 1.4647973552099915, + "grad_norm": 1.7002311944961548, + "learning_rate": 1.9681635291753462e-05, + "loss": 0.7327, + "step": 8973 + }, + { + "epoch": 1.4649606138524958, + "grad_norm": 1.853320598602295, + "learning_rate": 1.9681555852641933e-05, + "loss": 0.6743, + "step": 8974 + }, + { + "epoch": 1.4651238724950002, + "grad_norm": 1.9320247173309326, + "learning_rate": 1.968147640378108e-05, + "loss": 0.7754, + "step": 8975 + }, + { + "epoch": 1.4652871311375046, + "grad_norm": 1.8944302797317505, + "learning_rate": 1.968139694517098e-05, + "loss": 0.7493, + "step": 8976 + }, + { + "epoch": 1.4654503897800089, + "grad_norm": 1.8367654085159302, + "learning_rate": 1.968131747681172e-05, + "loss": 0.6369, + "step": 8977 + }, + { + "epoch": 1.4656136484225133, + "grad_norm": 1.6192930936813354, + "learning_rate": 1.9681237998703373e-05, + "loss": 0.6345, + "step": 8978 + }, + { + "epoch": 1.4657769070650177, + "grad_norm": 1.45469069480896, + "learning_rate": 1.968115851084602e-05, + "loss": 0.6031, + "step": 8979 + }, + { + "epoch": 1.4659401657075222, + "grad_norm": 1.8003584146499634, + "learning_rate": 1.9681079013239748e-05, + "loss": 0.6345, + "step": 8980 + }, + { + "epoch": 1.4661034243500266, + "grad_norm": 1.8090649843215942, + "learning_rate": 1.968099950588463e-05, + "loss": 0.6823, + "step": 8981 + }, + { + "epoch": 1.466266682992531, + "grad_norm": 1.8707369565963745, + "learning_rate": 1.968091998878075e-05, + "loss": 0.6424, + "step": 8982 + }, + { + "epoch": 1.4664299416350353, + "grad_norm": 1.453446388244629, + "learning_rate": 1.968084046192818e-05, + "loss": 0.5033, + "step": 8983 + }, + { + "epoch": 1.4665932002775397, + "grad_norm": 1.6681972742080688, + "learning_rate": 1.968076092532701e-05, + "loss": 0.7276, + "step": 8984 + }, + { + "epoch": 1.4667564589200441, + "grad_norm": 1.7357639074325562, + "learning_rate": 1.9680681378977317e-05, + "loss": 0.6636, + "step": 8985 + }, + { + "epoch": 1.4669197175625484, + "grad_norm": 1.9360885620117188, + "learning_rate": 1.968060182287918e-05, + "loss": 0.8142, + "step": 8986 + }, + { + "epoch": 1.4670829762050528, + "grad_norm": 1.8844066858291626, + "learning_rate": 1.968052225703268e-05, + "loss": 0.6701, + "step": 8987 + }, + { + "epoch": 1.4672462348475572, + "grad_norm": 1.8245185613632202, + "learning_rate": 1.9680442681437895e-05, + "loss": 0.7221, + "step": 8988 + }, + { + "epoch": 1.4674094934900617, + "grad_norm": 1.888819932937622, + "learning_rate": 1.9680363096094906e-05, + "loss": 0.7516, + "step": 8989 + }, + { + "epoch": 1.467572752132566, + "grad_norm": 2.142040967941284, + "learning_rate": 1.96802835010038e-05, + "loss": 0.7106, + "step": 8990 + }, + { + "epoch": 1.4677360107750705, + "grad_norm": 1.72548508644104, + "learning_rate": 1.9680203896164646e-05, + "loss": 0.6289, + "step": 8991 + }, + { + "epoch": 1.4678992694175748, + "grad_norm": 1.670089602470398, + "learning_rate": 1.968012428157753e-05, + "loss": 0.6565, + "step": 8992 + }, + { + "epoch": 1.4680625280600792, + "grad_norm": 1.9842661619186401, + "learning_rate": 1.9680044657242532e-05, + "loss": 0.6647, + "step": 8993 + }, + { + "epoch": 1.4682257867025836, + "grad_norm": 1.5729340314865112, + "learning_rate": 1.967996502315973e-05, + "loss": 0.6357, + "step": 8994 + }, + { + "epoch": 1.4683890453450879, + "grad_norm": 1.904778242111206, + "learning_rate": 1.9679885379329208e-05, + "loss": 0.815, + "step": 8995 + }, + { + "epoch": 1.4685523039875923, + "grad_norm": 1.6344839334487915, + "learning_rate": 1.967980572575104e-05, + "loss": 0.6655, + "step": 8996 + }, + { + "epoch": 1.4687155626300967, + "grad_norm": 1.3686703443527222, + "learning_rate": 1.9679726062425314e-05, + "loss": 0.5549, + "step": 8997 + }, + { + "epoch": 1.4688788212726012, + "grad_norm": 2.151660442352295, + "learning_rate": 1.9679646389352104e-05, + "loss": 0.7694, + "step": 8998 + }, + { + "epoch": 1.4690420799151056, + "grad_norm": 1.7980393171310425, + "learning_rate": 1.9679566706531497e-05, + "loss": 0.7108, + "step": 8999 + }, + { + "epoch": 1.4692053385576098, + "grad_norm": 1.281118392944336, + "learning_rate": 1.9679487013963566e-05, + "loss": 0.5591, + "step": 9000 + }, + { + "epoch": 1.4693685972001143, + "grad_norm": 1.8228079080581665, + "learning_rate": 1.9679407311648394e-05, + "loss": 0.8789, + "step": 9001 + }, + { + "epoch": 1.4695318558426187, + "grad_norm": 1.6603171825408936, + "learning_rate": 1.967932759958606e-05, + "loss": 0.6465, + "step": 9002 + }, + { + "epoch": 1.4696951144851231, + "grad_norm": 1.6893362998962402, + "learning_rate": 1.9679247877776647e-05, + "loss": 0.753, + "step": 9003 + }, + { + "epoch": 1.4698583731276273, + "grad_norm": 1.5887150764465332, + "learning_rate": 1.9679168146220237e-05, + "loss": 0.6982, + "step": 9004 + }, + { + "epoch": 1.4700216317701318, + "grad_norm": 2.0297529697418213, + "learning_rate": 1.96790884049169e-05, + "loss": 0.9495, + "step": 9005 + }, + { + "epoch": 1.4701848904126362, + "grad_norm": 1.7772586345672607, + "learning_rate": 1.967900865386673e-05, + "loss": 0.5847, + "step": 9006 + }, + { + "epoch": 1.4703481490551407, + "grad_norm": 1.844789981842041, + "learning_rate": 1.9678928893069797e-05, + "loss": 0.6561, + "step": 9007 + }, + { + "epoch": 1.470511407697645, + "grad_norm": 1.8459062576293945, + "learning_rate": 1.967884912252619e-05, + "loss": 0.6643, + "step": 9008 + }, + { + "epoch": 1.4706746663401493, + "grad_norm": 1.3451393842697144, + "learning_rate": 1.967876934223598e-05, + "loss": 0.5516, + "step": 9009 + }, + { + "epoch": 1.4708379249826538, + "grad_norm": 1.5642448663711548, + "learning_rate": 1.9678689552199252e-05, + "loss": 0.6731, + "step": 9010 + }, + { + "epoch": 1.4710011836251582, + "grad_norm": 1.9455457925796509, + "learning_rate": 1.967860975241609e-05, + "loss": 0.8433, + "step": 9011 + }, + { + "epoch": 1.4711644422676624, + "grad_norm": 1.8119231462478638, + "learning_rate": 1.9678529942886567e-05, + "loss": 0.7301, + "step": 9012 + }, + { + "epoch": 1.4713277009101668, + "grad_norm": 1.4559024572372437, + "learning_rate": 1.967845012361077e-05, + "loss": 0.6607, + "step": 9013 + }, + { + "epoch": 1.4714909595526713, + "grad_norm": 1.8885624408721924, + "learning_rate": 1.967837029458877e-05, + "loss": 0.6219, + "step": 9014 + }, + { + "epoch": 1.4716542181951757, + "grad_norm": 1.8942476511001587, + "learning_rate": 1.967829045582066e-05, + "loss": 0.8689, + "step": 9015 + }, + { + "epoch": 1.4718174768376802, + "grad_norm": 1.6232682466506958, + "learning_rate": 1.967821060730651e-05, + "loss": 0.7082, + "step": 9016 + }, + { + "epoch": 1.4719807354801846, + "grad_norm": 1.6811983585357666, + "learning_rate": 1.967813074904641e-05, + "loss": 0.7271, + "step": 9017 + }, + { + "epoch": 1.4721439941226888, + "grad_norm": 1.6239910125732422, + "learning_rate": 1.967805088104043e-05, + "loss": 0.6812, + "step": 9018 + }, + { + "epoch": 1.4723072527651933, + "grad_norm": 1.9905056953430176, + "learning_rate": 1.9677971003288657e-05, + "loss": 0.7455, + "step": 9019 + }, + { + "epoch": 1.4724705114076977, + "grad_norm": 1.909723162651062, + "learning_rate": 1.967789111579117e-05, + "loss": 0.8209, + "step": 9020 + }, + { + "epoch": 1.472633770050202, + "grad_norm": 1.8239691257476807, + "learning_rate": 1.9677811218548046e-05, + "loss": 0.6477, + "step": 9021 + }, + { + "epoch": 1.4727970286927063, + "grad_norm": 1.5077400207519531, + "learning_rate": 1.9677731311559373e-05, + "loss": 0.5737, + "step": 9022 + }, + { + "epoch": 1.4729602873352108, + "grad_norm": 1.6240965127944946, + "learning_rate": 1.9677651394825227e-05, + "loss": 0.6272, + "step": 9023 + }, + { + "epoch": 1.4731235459777152, + "grad_norm": 1.628836989402771, + "learning_rate": 1.9677571468345686e-05, + "loss": 0.6922, + "step": 9024 + }, + { + "epoch": 1.4732868046202197, + "grad_norm": 1.8529151678085327, + "learning_rate": 1.9677491532120834e-05, + "loss": 0.6906, + "step": 9025 + }, + { + "epoch": 1.473450063262724, + "grad_norm": 1.7926055192947388, + "learning_rate": 1.9677411586150753e-05, + "loss": 0.787, + "step": 9026 + }, + { + "epoch": 1.4736133219052283, + "grad_norm": 1.7347469329833984, + "learning_rate": 1.9677331630435517e-05, + "loss": 0.6541, + "step": 9027 + }, + { + "epoch": 1.4737765805477328, + "grad_norm": 1.986503005027771, + "learning_rate": 1.967725166497521e-05, + "loss": 0.7756, + "step": 9028 + }, + { + "epoch": 1.4739398391902372, + "grad_norm": 2.0473058223724365, + "learning_rate": 1.9677171689769916e-05, + "loss": 0.7631, + "step": 9029 + }, + { + "epoch": 1.4741030978327414, + "grad_norm": 1.802970290184021, + "learning_rate": 1.9677091704819714e-05, + "loss": 0.6057, + "step": 9030 + }, + { + "epoch": 1.4742663564752458, + "grad_norm": 2.1840810775756836, + "learning_rate": 1.9677011710124683e-05, + "loss": 0.7156, + "step": 9031 + }, + { + "epoch": 1.4744296151177503, + "grad_norm": 1.8086096048355103, + "learning_rate": 1.96769317056849e-05, + "loss": 0.6159, + "step": 9032 + }, + { + "epoch": 1.4745928737602547, + "grad_norm": 1.9140915870666504, + "learning_rate": 1.9676851691500453e-05, + "loss": 0.78, + "step": 9033 + }, + { + "epoch": 1.4747561324027592, + "grad_norm": 1.899588704109192, + "learning_rate": 1.9676771667571418e-05, + "loss": 0.7481, + "step": 9034 + }, + { + "epoch": 1.4749193910452636, + "grad_norm": 1.7222570180892944, + "learning_rate": 1.9676691633897875e-05, + "loss": 0.6163, + "step": 9035 + }, + { + "epoch": 1.4750826496877678, + "grad_norm": 1.9422483444213867, + "learning_rate": 1.9676611590479906e-05, + "loss": 0.7814, + "step": 9036 + }, + { + "epoch": 1.4752459083302722, + "grad_norm": 1.7540614604949951, + "learning_rate": 1.9676531537317595e-05, + "loss": 0.8064, + "step": 9037 + }, + { + "epoch": 1.4754091669727767, + "grad_norm": 1.5926730632781982, + "learning_rate": 1.9676451474411017e-05, + "loss": 0.7934, + "step": 9038 + }, + { + "epoch": 1.475572425615281, + "grad_norm": 1.6391246318817139, + "learning_rate": 1.9676371401760254e-05, + "loss": 0.6613, + "step": 9039 + }, + { + "epoch": 1.4757356842577853, + "grad_norm": 2.1777079105377197, + "learning_rate": 1.967629131936539e-05, + "loss": 0.8736, + "step": 9040 + }, + { + "epoch": 1.4758989429002898, + "grad_norm": 1.7378536462783813, + "learning_rate": 1.96762112272265e-05, + "loss": 0.7168, + "step": 9041 + }, + { + "epoch": 1.4760622015427942, + "grad_norm": 1.8915256261825562, + "learning_rate": 1.967613112534367e-05, + "loss": 0.6616, + "step": 9042 + }, + { + "epoch": 1.4762254601852987, + "grad_norm": 1.9580936431884766, + "learning_rate": 1.967605101371698e-05, + "loss": 0.82, + "step": 9043 + }, + { + "epoch": 1.4763887188278029, + "grad_norm": 1.555055022239685, + "learning_rate": 1.9675970892346507e-05, + "loss": 0.6338, + "step": 9044 + }, + { + "epoch": 1.4765519774703073, + "grad_norm": 1.7066384553909302, + "learning_rate": 1.9675890761232333e-05, + "loss": 0.7131, + "step": 9045 + }, + { + "epoch": 1.4767152361128117, + "grad_norm": 2.2390172481536865, + "learning_rate": 1.967581062037454e-05, + "loss": 0.7875, + "step": 9046 + }, + { + "epoch": 1.4768784947553162, + "grad_norm": 1.7735713720321655, + "learning_rate": 1.967573046977321e-05, + "loss": 0.7758, + "step": 9047 + }, + { + "epoch": 1.4770417533978204, + "grad_norm": 1.4973689317703247, + "learning_rate": 1.9675650309428422e-05, + "loss": 0.5297, + "step": 9048 + }, + { + "epoch": 1.4772050120403248, + "grad_norm": 2.062238931655884, + "learning_rate": 1.9675570139340253e-05, + "loss": 0.6909, + "step": 9049 + }, + { + "epoch": 1.4773682706828293, + "grad_norm": 1.9142225980758667, + "learning_rate": 1.9675489959508794e-05, + "loss": 0.7513, + "step": 9050 + }, + { + "epoch": 1.4775315293253337, + "grad_norm": 1.7617467641830444, + "learning_rate": 1.9675409769934114e-05, + "loss": 0.7313, + "step": 9051 + }, + { + "epoch": 1.4776947879678382, + "grad_norm": 1.784393072128296, + "learning_rate": 1.96753295706163e-05, + "loss": 0.817, + "step": 9052 + }, + { + "epoch": 1.4778580466103424, + "grad_norm": 1.799633502960205, + "learning_rate": 1.9675249361555432e-05, + "loss": 0.7463, + "step": 9053 + }, + { + "epoch": 1.4780213052528468, + "grad_norm": 1.5760775804519653, + "learning_rate": 1.967516914275159e-05, + "loss": 0.5882, + "step": 9054 + }, + { + "epoch": 1.4781845638953512, + "grad_norm": 1.7817902565002441, + "learning_rate": 1.9675088914204857e-05, + "loss": 0.7736, + "step": 9055 + }, + { + "epoch": 1.4783478225378555, + "grad_norm": 1.8306775093078613, + "learning_rate": 1.967500867591531e-05, + "loss": 0.6716, + "step": 9056 + }, + { + "epoch": 1.47851108118036, + "grad_norm": 1.504346251487732, + "learning_rate": 1.9674928427883034e-05, + "loss": 0.6421, + "step": 9057 + }, + { + "epoch": 1.4786743398228643, + "grad_norm": 1.5553910732269287, + "learning_rate": 1.9674848170108104e-05, + "loss": 0.6441, + "step": 9058 + }, + { + "epoch": 1.4788375984653688, + "grad_norm": 1.7634400129318237, + "learning_rate": 1.967476790259061e-05, + "loss": 0.6661, + "step": 9059 + }, + { + "epoch": 1.4790008571078732, + "grad_norm": 1.7333580255508423, + "learning_rate": 1.9674687625330623e-05, + "loss": 0.6357, + "step": 9060 + }, + { + "epoch": 1.4791641157503777, + "grad_norm": 1.7884796857833862, + "learning_rate": 1.9674607338328228e-05, + "loss": 0.5948, + "step": 9061 + }, + { + "epoch": 1.4793273743928819, + "grad_norm": 1.8649100065231323, + "learning_rate": 1.967452704158351e-05, + "loss": 0.78, + "step": 9062 + }, + { + "epoch": 1.4794906330353863, + "grad_norm": 2.2516536712646484, + "learning_rate": 1.9674446735096542e-05, + "loss": 0.7381, + "step": 9063 + }, + { + "epoch": 1.4796538916778907, + "grad_norm": 1.592813491821289, + "learning_rate": 1.967436641886741e-05, + "loss": 0.5817, + "step": 9064 + }, + { + "epoch": 1.479817150320395, + "grad_norm": 1.4915244579315186, + "learning_rate": 1.9674286092896195e-05, + "loss": 0.4999, + "step": 9065 + }, + { + "epoch": 1.4799804089628994, + "grad_norm": 1.7625669240951538, + "learning_rate": 1.9674205757182974e-05, + "loss": 0.7207, + "step": 9066 + }, + { + "epoch": 1.4801436676054038, + "grad_norm": 1.804884433746338, + "learning_rate": 1.967412541172783e-05, + "loss": 0.6145, + "step": 9067 + }, + { + "epoch": 1.4803069262479083, + "grad_norm": 1.881123661994934, + "learning_rate": 1.9674045056530845e-05, + "loss": 0.7023, + "step": 9068 + }, + { + "epoch": 1.4804701848904127, + "grad_norm": 1.9595675468444824, + "learning_rate": 1.9673964691592098e-05, + "loss": 0.8453, + "step": 9069 + }, + { + "epoch": 1.4806334435329171, + "grad_norm": 1.7436041831970215, + "learning_rate": 1.9673884316911673e-05, + "loss": 0.7787, + "step": 9070 + }, + { + "epoch": 1.4807967021754214, + "grad_norm": 2.031121253967285, + "learning_rate": 1.967380393248965e-05, + "loss": 0.7452, + "step": 9071 + }, + { + "epoch": 1.4809599608179258, + "grad_norm": 1.909403920173645, + "learning_rate": 1.9673723538326105e-05, + "loss": 0.7403, + "step": 9072 + }, + { + "epoch": 1.4811232194604302, + "grad_norm": 1.9976520538330078, + "learning_rate": 1.967364313442113e-05, + "loss": 0.9728, + "step": 9073 + }, + { + "epoch": 1.4812864781029345, + "grad_norm": 1.8859496116638184, + "learning_rate": 1.9673562720774792e-05, + "loss": 0.7759, + "step": 9074 + }, + { + "epoch": 1.481449736745439, + "grad_norm": 2.164538621902466, + "learning_rate": 1.9673482297387184e-05, + "loss": 0.7303, + "step": 9075 + }, + { + "epoch": 1.4816129953879433, + "grad_norm": 1.7385215759277344, + "learning_rate": 1.967340186425838e-05, + "loss": 0.6137, + "step": 9076 + }, + { + "epoch": 1.4817762540304478, + "grad_norm": 2.0848400592803955, + "learning_rate": 1.967332142138846e-05, + "loss": 0.7787, + "step": 9077 + }, + { + "epoch": 1.4819395126729522, + "grad_norm": 1.7590702772140503, + "learning_rate": 1.967324096877751e-05, + "loss": 0.7461, + "step": 9078 + }, + { + "epoch": 1.4821027713154566, + "grad_norm": 1.5054785013198853, + "learning_rate": 1.9673160506425607e-05, + "loss": 0.5668, + "step": 9079 + }, + { + "epoch": 1.4822660299579609, + "grad_norm": 1.7041674852371216, + "learning_rate": 1.967308003433284e-05, + "loss": 0.6293, + "step": 9080 + }, + { + "epoch": 1.4824292886004653, + "grad_norm": 1.8820064067840576, + "learning_rate": 1.967299955249928e-05, + "loss": 0.6842, + "step": 9081 + }, + { + "epoch": 1.4825925472429697, + "grad_norm": 2.2066659927368164, + "learning_rate": 1.967291906092501e-05, + "loss": 0.6683, + "step": 9082 + }, + { + "epoch": 1.482755805885474, + "grad_norm": 2.0514180660247803, + "learning_rate": 1.9672838559610118e-05, + "loss": 0.6462, + "step": 9083 + }, + { + "epoch": 1.4829190645279784, + "grad_norm": 2.01760196685791, + "learning_rate": 1.9672758048554677e-05, + "loss": 0.7397, + "step": 9084 + }, + { + "epoch": 1.4830823231704828, + "grad_norm": 2.738631010055542, + "learning_rate": 1.967267752775877e-05, + "loss": 0.5749, + "step": 9085 + }, + { + "epoch": 1.4832455818129873, + "grad_norm": 1.6711211204528809, + "learning_rate": 1.967259699722248e-05, + "loss": 0.6433, + "step": 9086 + }, + { + "epoch": 1.4834088404554917, + "grad_norm": 1.727579951286316, + "learning_rate": 1.9672516456945888e-05, + "loss": 0.6293, + "step": 9087 + }, + { + "epoch": 1.483572099097996, + "grad_norm": 1.698111891746521, + "learning_rate": 1.9672435906929074e-05, + "loss": 0.8248, + "step": 9088 + }, + { + "epoch": 1.4837353577405004, + "grad_norm": 1.9113200902938843, + "learning_rate": 1.9672355347172122e-05, + "loss": 0.9747, + "step": 9089 + }, + { + "epoch": 1.4838986163830048, + "grad_norm": 1.4510388374328613, + "learning_rate": 1.9672274777675108e-05, + "loss": 0.598, + "step": 9090 + }, + { + "epoch": 1.4840618750255092, + "grad_norm": 1.572234034538269, + "learning_rate": 1.9672194198438117e-05, + "loss": 0.6069, + "step": 9091 + }, + { + "epoch": 1.4842251336680135, + "grad_norm": 1.679648995399475, + "learning_rate": 1.967211360946123e-05, + "loss": 0.6325, + "step": 9092 + }, + { + "epoch": 1.484388392310518, + "grad_norm": 1.8022860288619995, + "learning_rate": 1.9672033010744526e-05, + "loss": 0.6567, + "step": 9093 + }, + { + "epoch": 1.4845516509530223, + "grad_norm": 1.748140811920166, + "learning_rate": 1.967195240228809e-05, + "loss": 0.7216, + "step": 9094 + }, + { + "epoch": 1.4847149095955268, + "grad_norm": 2.0105020999908447, + "learning_rate": 1.9671871784091997e-05, + "loss": 0.8218, + "step": 9095 + }, + { + "epoch": 1.4848781682380312, + "grad_norm": 1.7798386812210083, + "learning_rate": 1.967179115615633e-05, + "loss": 0.6534, + "step": 9096 + }, + { + "epoch": 1.4850414268805354, + "grad_norm": 1.692250370979309, + "learning_rate": 1.9671710518481177e-05, + "loss": 0.5302, + "step": 9097 + }, + { + "epoch": 1.4852046855230399, + "grad_norm": 1.853210210800171, + "learning_rate": 1.967162987106661e-05, + "loss": 0.6747, + "step": 9098 + }, + { + "epoch": 1.4853679441655443, + "grad_norm": 2.030076265335083, + "learning_rate": 1.9671549213912716e-05, + "loss": 0.8537, + "step": 9099 + }, + { + "epoch": 1.4855312028080487, + "grad_norm": 1.8264412879943848, + "learning_rate": 1.9671468547019575e-05, + "loss": 0.7458, + "step": 9100 + }, + { + "epoch": 1.485694461450553, + "grad_norm": 1.701270580291748, + "learning_rate": 1.9671387870387266e-05, + "loss": 0.733, + "step": 9101 + }, + { + "epoch": 1.4858577200930574, + "grad_norm": 1.9856560230255127, + "learning_rate": 1.9671307184015873e-05, + "loss": 0.6115, + "step": 9102 + }, + { + "epoch": 1.4860209787355618, + "grad_norm": 1.9717539548873901, + "learning_rate": 1.9671226487905476e-05, + "loss": 0.8944, + "step": 9103 + }, + { + "epoch": 1.4861842373780663, + "grad_norm": 1.9335845708847046, + "learning_rate": 1.9671145782056157e-05, + "loss": 0.667, + "step": 9104 + }, + { + "epoch": 1.4863474960205707, + "grad_norm": 1.6994874477386475, + "learning_rate": 1.9671065066467996e-05, + "loss": 0.5546, + "step": 9105 + }, + { + "epoch": 1.486510754663075, + "grad_norm": 1.7942761182785034, + "learning_rate": 1.9670984341141074e-05, + "loss": 0.6887, + "step": 9106 + }, + { + "epoch": 1.4866740133055794, + "grad_norm": 2.0976054668426514, + "learning_rate": 1.9670903606075475e-05, + "loss": 0.7733, + "step": 9107 + }, + { + "epoch": 1.4868372719480838, + "grad_norm": 1.624464988708496, + "learning_rate": 1.9670822861271278e-05, + "loss": 0.6162, + "step": 9108 + }, + { + "epoch": 1.487000530590588, + "grad_norm": 1.6011568307876587, + "learning_rate": 1.9670742106728567e-05, + "loss": 0.6705, + "step": 9109 + }, + { + "epoch": 1.4871637892330924, + "grad_norm": 1.576406717300415, + "learning_rate": 1.967066134244742e-05, + "loss": 0.6255, + "step": 9110 + }, + { + "epoch": 1.4873270478755969, + "grad_norm": 1.797203779220581, + "learning_rate": 1.9670580568427917e-05, + "loss": 0.6619, + "step": 9111 + }, + { + "epoch": 1.4874903065181013, + "grad_norm": 1.3939777612686157, + "learning_rate": 1.9670499784670145e-05, + "loss": 0.55, + "step": 9112 + }, + { + "epoch": 1.4876535651606058, + "grad_norm": 1.9019756317138672, + "learning_rate": 1.9670418991174184e-05, + "loss": 0.7272, + "step": 9113 + }, + { + "epoch": 1.4878168238031102, + "grad_norm": 1.8233121633529663, + "learning_rate": 1.967033818794011e-05, + "loss": 0.6993, + "step": 9114 + }, + { + "epoch": 1.4879800824456144, + "grad_norm": 1.8720684051513672, + "learning_rate": 1.967025737496801e-05, + "loss": 0.7981, + "step": 9115 + }, + { + "epoch": 1.4881433410881189, + "grad_norm": 1.3731341361999512, + "learning_rate": 1.967017655225796e-05, + "loss": 0.5393, + "step": 9116 + }, + { + "epoch": 1.4883065997306233, + "grad_norm": 1.834725260734558, + "learning_rate": 1.9670095719810048e-05, + "loss": 0.7205, + "step": 9117 + }, + { + "epoch": 1.4884698583731275, + "grad_norm": 2.1068835258483887, + "learning_rate": 1.9670014877624353e-05, + "loss": 0.9391, + "step": 9118 + }, + { + "epoch": 1.488633117015632, + "grad_norm": 1.5917508602142334, + "learning_rate": 1.966993402570095e-05, + "loss": 0.6302, + "step": 9119 + }, + { + "epoch": 1.4887963756581364, + "grad_norm": 1.9223756790161133, + "learning_rate": 1.9669853164039935e-05, + "loss": 0.8191, + "step": 9120 + }, + { + "epoch": 1.4889596343006408, + "grad_norm": 1.9295414686203003, + "learning_rate": 1.9669772292641375e-05, + "loss": 0.811, + "step": 9121 + }, + { + "epoch": 1.4891228929431453, + "grad_norm": 1.7273468971252441, + "learning_rate": 1.9669691411505354e-05, + "loss": 0.6129, + "step": 9122 + }, + { + "epoch": 1.4892861515856497, + "grad_norm": 1.4255917072296143, + "learning_rate": 1.966961052063196e-05, + "loss": 0.566, + "step": 9123 + }, + { + "epoch": 1.489449410228154, + "grad_norm": 1.725005030632019, + "learning_rate": 1.966952962002127e-05, + "loss": 0.7371, + "step": 9124 + }, + { + "epoch": 1.4896126688706584, + "grad_norm": 1.9894649982452393, + "learning_rate": 1.9669448709673368e-05, + "loss": 0.8302, + "step": 9125 + }, + { + "epoch": 1.4897759275131628, + "grad_norm": 1.9463542699813843, + "learning_rate": 1.9669367789588333e-05, + "loss": 0.9191, + "step": 9126 + }, + { + "epoch": 1.489939186155667, + "grad_norm": 1.8729273080825806, + "learning_rate": 1.9669286859766248e-05, + "loss": 0.6541, + "step": 9127 + }, + { + "epoch": 1.4901024447981714, + "grad_norm": 1.723410964012146, + "learning_rate": 1.9669205920207194e-05, + "loss": 0.8172, + "step": 9128 + }, + { + "epoch": 1.4902657034406759, + "grad_norm": 1.819735050201416, + "learning_rate": 1.9669124970911245e-05, + "loss": 0.6752, + "step": 9129 + }, + { + "epoch": 1.4904289620831803, + "grad_norm": 1.7257792949676514, + "learning_rate": 1.9669044011878497e-05, + "loss": 0.5587, + "step": 9130 + }, + { + "epoch": 1.4905922207256848, + "grad_norm": 1.8726648092269897, + "learning_rate": 1.9668963043109023e-05, + "loss": 0.8271, + "step": 9131 + }, + { + "epoch": 1.490755479368189, + "grad_norm": 1.4942326545715332, + "learning_rate": 1.9668882064602906e-05, + "loss": 0.5533, + "step": 9132 + }, + { + "epoch": 1.4909187380106934, + "grad_norm": 1.7653864622116089, + "learning_rate": 1.9668801076360227e-05, + "loss": 0.7308, + "step": 9133 + }, + { + "epoch": 1.4910819966531978, + "grad_norm": 1.714220643043518, + "learning_rate": 1.9668720078381066e-05, + "loss": 0.6165, + "step": 9134 + }, + { + "epoch": 1.4912452552957023, + "grad_norm": 1.9725171327590942, + "learning_rate": 1.966863907066551e-05, + "loss": 0.8641, + "step": 9135 + }, + { + "epoch": 1.4914085139382065, + "grad_norm": 1.8221533298492432, + "learning_rate": 1.9668558053213634e-05, + "loss": 0.7162, + "step": 9136 + }, + { + "epoch": 1.491571772580711, + "grad_norm": 1.8465925455093384, + "learning_rate": 1.9668477026025525e-05, + "loss": 0.8089, + "step": 9137 + }, + { + "epoch": 1.4917350312232154, + "grad_norm": 2.0709924697875977, + "learning_rate": 1.966839598910126e-05, + "loss": 0.7927, + "step": 9138 + }, + { + "epoch": 1.4918982898657198, + "grad_norm": 1.331532597541809, + "learning_rate": 1.9668314942440923e-05, + "loss": 0.5162, + "step": 9139 + }, + { + "epoch": 1.4920615485082243, + "grad_norm": 1.3391073942184448, + "learning_rate": 1.9668233886044597e-05, + "loss": 0.5695, + "step": 9140 + }, + { + "epoch": 1.4922248071507285, + "grad_norm": 1.4688925743103027, + "learning_rate": 1.966815281991236e-05, + "loss": 0.5507, + "step": 9141 + }, + { + "epoch": 1.492388065793233, + "grad_norm": 1.8267731666564941, + "learning_rate": 1.9668071744044295e-05, + "loss": 0.7267, + "step": 9142 + }, + { + "epoch": 1.4925513244357373, + "grad_norm": 2.8579459190368652, + "learning_rate": 1.9667990658440487e-05, + "loss": 0.5176, + "step": 9143 + }, + { + "epoch": 1.4927145830782418, + "grad_norm": 1.6282614469528198, + "learning_rate": 1.9667909563101015e-05, + "loss": 0.6303, + "step": 9144 + }, + { + "epoch": 1.492877841720746, + "grad_norm": 2.0728390216827393, + "learning_rate": 1.966782845802596e-05, + "loss": 0.8314, + "step": 9145 + }, + { + "epoch": 1.4930411003632504, + "grad_norm": 1.6684895753860474, + "learning_rate": 1.9667747343215402e-05, + "loss": 0.5929, + "step": 9146 + }, + { + "epoch": 1.4932043590057549, + "grad_norm": 1.7978848218917847, + "learning_rate": 1.9667666218669428e-05, + "loss": 0.6771, + "step": 9147 + }, + { + "epoch": 1.4933676176482593, + "grad_norm": 1.8945505619049072, + "learning_rate": 1.9667585084388117e-05, + "loss": 0.8041, + "step": 9148 + }, + { + "epoch": 1.4935308762907638, + "grad_norm": 1.6281328201293945, + "learning_rate": 1.966750394037155e-05, + "loss": 0.6738, + "step": 9149 + }, + { + "epoch": 1.493694134933268, + "grad_norm": 1.9344099760055542, + "learning_rate": 1.9667422786619804e-05, + "loss": 0.7119, + "step": 9150 + }, + { + "epoch": 1.4938573935757724, + "grad_norm": 1.782348871231079, + "learning_rate": 1.966734162313297e-05, + "loss": 0.6851, + "step": 9151 + }, + { + "epoch": 1.4940206522182768, + "grad_norm": 2.119680643081665, + "learning_rate": 1.9667260449911126e-05, + "loss": 0.6951, + "step": 9152 + }, + { + "epoch": 1.494183910860781, + "grad_norm": 1.817963719367981, + "learning_rate": 1.9667179266954352e-05, + "loss": 0.7162, + "step": 9153 + }, + { + "epoch": 1.4943471695032855, + "grad_norm": 1.91780424118042, + "learning_rate": 1.9667098074262734e-05, + "loss": 0.6791, + "step": 9154 + }, + { + "epoch": 1.49451042814579, + "grad_norm": 1.9460184574127197, + "learning_rate": 1.9667016871836346e-05, + "loss": 0.7164, + "step": 9155 + }, + { + "epoch": 1.4946736867882944, + "grad_norm": 1.706470012664795, + "learning_rate": 1.966693565967528e-05, + "loss": 0.5374, + "step": 9156 + }, + { + "epoch": 1.4948369454307988, + "grad_norm": 1.5911709070205688, + "learning_rate": 1.9666854437779608e-05, + "loss": 0.6614, + "step": 9157 + }, + { + "epoch": 1.4950002040733033, + "grad_norm": 1.8055311441421509, + "learning_rate": 1.9666773206149417e-05, + "loss": 0.763, + "step": 9158 + }, + { + "epoch": 1.4951634627158075, + "grad_norm": 1.8154795169830322, + "learning_rate": 1.966669196478479e-05, + "loss": 0.7054, + "step": 9159 + }, + { + "epoch": 1.495326721358312, + "grad_norm": 1.8464452028274536, + "learning_rate": 1.9666610713685804e-05, + "loss": 0.807, + "step": 9160 + }, + { + "epoch": 1.4954899800008163, + "grad_norm": 1.5572665929794312, + "learning_rate": 1.9666529452852546e-05, + "loss": 0.6909, + "step": 9161 + }, + { + "epoch": 1.4956532386433206, + "grad_norm": 1.8463075160980225, + "learning_rate": 1.9666448182285095e-05, + "loss": 0.7549, + "step": 9162 + }, + { + "epoch": 1.495816497285825, + "grad_norm": 1.896497368812561, + "learning_rate": 1.966636690198353e-05, + "loss": 0.6934, + "step": 9163 + }, + { + "epoch": 1.4959797559283294, + "grad_norm": 2.0354511737823486, + "learning_rate": 1.966628561194794e-05, + "loss": 0.6209, + "step": 9164 + }, + { + "epoch": 1.4961430145708339, + "grad_norm": 1.7633668184280396, + "learning_rate": 1.96662043121784e-05, + "loss": 0.6536, + "step": 9165 + }, + { + "epoch": 1.4963062732133383, + "grad_norm": 1.7485493421554565, + "learning_rate": 1.9666123002675e-05, + "loss": 0.7377, + "step": 9166 + }, + { + "epoch": 1.4964695318558427, + "grad_norm": 1.7833659648895264, + "learning_rate": 1.966604168343781e-05, + "loss": 0.6842, + "step": 9167 + }, + { + "epoch": 1.496632790498347, + "grad_norm": 1.9445688724517822, + "learning_rate": 1.9665960354466925e-05, + "loss": 0.7661, + "step": 9168 + }, + { + "epoch": 1.4967960491408514, + "grad_norm": 1.648514986038208, + "learning_rate": 1.9665879015762416e-05, + "loss": 0.6517, + "step": 9169 + }, + { + "epoch": 1.4969593077833558, + "grad_norm": 1.8444446325302124, + "learning_rate": 1.966579766732437e-05, + "loss": 0.6036, + "step": 9170 + }, + { + "epoch": 1.49712256642586, + "grad_norm": 1.5765466690063477, + "learning_rate": 1.9665716309152868e-05, + "loss": 0.7006, + "step": 9171 + }, + { + "epoch": 1.4972858250683645, + "grad_norm": 1.851354718208313, + "learning_rate": 1.966563494124799e-05, + "loss": 0.8096, + "step": 9172 + }, + { + "epoch": 1.497449083710869, + "grad_norm": 1.39980947971344, + "learning_rate": 1.9665553563609826e-05, + "loss": 0.5258, + "step": 9173 + }, + { + "epoch": 1.4976123423533734, + "grad_norm": 1.563685655593872, + "learning_rate": 1.9665472176238452e-05, + "loss": 0.6536, + "step": 9174 + }, + { + "epoch": 1.4977756009958778, + "grad_norm": 1.8312667608261108, + "learning_rate": 1.9665390779133945e-05, + "loss": 0.751, + "step": 9175 + }, + { + "epoch": 1.497938859638382, + "grad_norm": 1.9688165187835693, + "learning_rate": 1.9665309372296396e-05, + "loss": 0.7066, + "step": 9176 + }, + { + "epoch": 1.4981021182808865, + "grad_norm": 1.7810673713684082, + "learning_rate": 1.9665227955725882e-05, + "loss": 0.6123, + "step": 9177 + }, + { + "epoch": 1.498265376923391, + "grad_norm": 2.097825527191162, + "learning_rate": 1.9665146529422485e-05, + "loss": 0.6828, + "step": 9178 + }, + { + "epoch": 1.4984286355658953, + "grad_norm": 1.5779398679733276, + "learning_rate": 1.9665065093386287e-05, + "loss": 0.6153, + "step": 9179 + }, + { + "epoch": 1.4985918942083996, + "grad_norm": 1.778352975845337, + "learning_rate": 1.9664983647617375e-05, + "loss": 0.6283, + "step": 9180 + }, + { + "epoch": 1.498755152850904, + "grad_norm": 1.5818698406219482, + "learning_rate": 1.9664902192115825e-05, + "loss": 0.6047, + "step": 9181 + }, + { + "epoch": 1.4989184114934084, + "grad_norm": 1.9181369543075562, + "learning_rate": 1.966482072688172e-05, + "loss": 0.8305, + "step": 9182 + }, + { + "epoch": 1.4990816701359129, + "grad_norm": 1.7858409881591797, + "learning_rate": 1.9664739251915142e-05, + "loss": 0.64, + "step": 9183 + }, + { + "epoch": 1.4992449287784173, + "grad_norm": 1.523867130279541, + "learning_rate": 1.9664657767216176e-05, + "loss": 0.6484, + "step": 9184 + }, + { + "epoch": 1.4994081874209215, + "grad_norm": 1.4505800008773804, + "learning_rate": 1.9664576272784903e-05, + "loss": 0.5926, + "step": 9185 + }, + { + "epoch": 1.499571446063426, + "grad_norm": 1.8463571071624756, + "learning_rate": 1.9664494768621405e-05, + "loss": 0.7281, + "step": 9186 + }, + { + "epoch": 1.4997347047059304, + "grad_norm": 1.5847265720367432, + "learning_rate": 1.9664413254725762e-05, + "loss": 0.5892, + "step": 9187 + }, + { + "epoch": 1.4998979633484348, + "grad_norm": 2.2423532009124756, + "learning_rate": 1.9664331731098056e-05, + "loss": 0.7726, + "step": 9188 + }, + { + "epoch": 1.500061221990939, + "grad_norm": 2.0647356510162354, + "learning_rate": 1.9664250197738372e-05, + "loss": 0.617, + "step": 9189 + }, + { + "epoch": 1.5002244806334435, + "grad_norm": 1.7320938110351562, + "learning_rate": 1.966416865464679e-05, + "loss": 0.6483, + "step": 9190 + }, + { + "epoch": 1.500387739275948, + "grad_norm": 1.760252833366394, + "learning_rate": 1.9664087101823394e-05, + "loss": 0.6737, + "step": 9191 + }, + { + "epoch": 1.5005509979184524, + "grad_norm": 1.7635743618011475, + "learning_rate": 1.9664005539268263e-05, + "loss": 0.7009, + "step": 9192 + }, + { + "epoch": 1.5007142565609568, + "grad_norm": 1.603127360343933, + "learning_rate": 1.9663923966981482e-05, + "loss": 0.6958, + "step": 9193 + }, + { + "epoch": 1.5008775152034612, + "grad_norm": 1.9792087078094482, + "learning_rate": 1.9663842384963133e-05, + "loss": 0.8413, + "step": 9194 + }, + { + "epoch": 1.5010407738459655, + "grad_norm": 1.7429194450378418, + "learning_rate": 1.9663760793213297e-05, + "loss": 0.7146, + "step": 9195 + }, + { + "epoch": 1.50120403248847, + "grad_norm": 1.8340983390808105, + "learning_rate": 1.9663679191732052e-05, + "loss": 0.6614, + "step": 9196 + }, + { + "epoch": 1.5013672911309741, + "grad_norm": 1.8444374799728394, + "learning_rate": 1.966359758051949e-05, + "loss": 0.7719, + "step": 9197 + }, + { + "epoch": 1.5015305497734786, + "grad_norm": 1.5356831550598145, + "learning_rate": 1.9663515959575687e-05, + "loss": 0.5604, + "step": 9198 + }, + { + "epoch": 1.501693808415983, + "grad_norm": 1.9556879997253418, + "learning_rate": 1.9663434328900727e-05, + "loss": 0.8634, + "step": 9199 + }, + { + "epoch": 1.5018570670584874, + "grad_norm": 1.7234108448028564, + "learning_rate": 1.9663352688494686e-05, + "loss": 0.6602, + "step": 9200 + }, + { + "epoch": 1.5020203257009919, + "grad_norm": 1.6151106357574463, + "learning_rate": 1.9663271038357656e-05, + "loss": 0.669, + "step": 9201 + }, + { + "epoch": 1.5021835843434963, + "grad_norm": 1.6345739364624023, + "learning_rate": 1.966318937848971e-05, + "loss": 0.6766, + "step": 9202 + }, + { + "epoch": 1.5023468429860005, + "grad_norm": 1.4169303178787231, + "learning_rate": 1.966310770889094e-05, + "loss": 0.5183, + "step": 9203 + }, + { + "epoch": 1.502510101628505, + "grad_norm": 1.6124773025512695, + "learning_rate": 1.9663026029561422e-05, + "loss": 0.5786, + "step": 9204 + }, + { + "epoch": 1.5026733602710094, + "grad_norm": 2.07743501663208, + "learning_rate": 1.966294434050124e-05, + "loss": 0.7735, + "step": 9205 + }, + { + "epoch": 1.5028366189135136, + "grad_norm": 1.560802936553955, + "learning_rate": 1.966286264171047e-05, + "loss": 0.5835, + "step": 9206 + }, + { + "epoch": 1.502999877556018, + "grad_norm": 1.7170870304107666, + "learning_rate": 1.9662780933189208e-05, + "loss": 0.615, + "step": 9207 + }, + { + "epoch": 1.5031631361985225, + "grad_norm": 1.8181757926940918, + "learning_rate": 1.9662699214937525e-05, + "loss": 0.7619, + "step": 9208 + }, + { + "epoch": 1.503326394841027, + "grad_norm": 1.8135143518447876, + "learning_rate": 1.9662617486955505e-05, + "loss": 0.6889, + "step": 9209 + }, + { + "epoch": 1.5034896534835314, + "grad_norm": 1.8922208547592163, + "learning_rate": 1.9662535749243233e-05, + "loss": 0.699, + "step": 9210 + }, + { + "epoch": 1.5036529121260358, + "grad_norm": 1.7311550378799438, + "learning_rate": 1.966245400180079e-05, + "loss": 0.7802, + "step": 9211 + }, + { + "epoch": 1.50381617076854, + "grad_norm": 1.93391752243042, + "learning_rate": 1.9662372244628255e-05, + "loss": 0.6381, + "step": 9212 + }, + { + "epoch": 1.5039794294110445, + "grad_norm": 1.8411240577697754, + "learning_rate": 1.9662290477725717e-05, + "loss": 0.7454, + "step": 9213 + }, + { + "epoch": 1.5041426880535487, + "grad_norm": 1.8079935312271118, + "learning_rate": 1.9662208701093255e-05, + "loss": 0.6248, + "step": 9214 + }, + { + "epoch": 1.504305946696053, + "grad_norm": 1.8441911935806274, + "learning_rate": 1.966212691473095e-05, + "loss": 0.7209, + "step": 9215 + }, + { + "epoch": 1.5044692053385575, + "grad_norm": 1.901990532875061, + "learning_rate": 1.966204511863889e-05, + "loss": 0.8821, + "step": 9216 + }, + { + "epoch": 1.504632463981062, + "grad_norm": 1.8280116319656372, + "learning_rate": 1.966196331281715e-05, + "loss": 0.7127, + "step": 9217 + }, + { + "epoch": 1.5047957226235664, + "grad_norm": 1.7730761766433716, + "learning_rate": 1.9661881497265813e-05, + "loss": 0.8725, + "step": 9218 + }, + { + "epoch": 1.5049589812660709, + "grad_norm": 1.665906310081482, + "learning_rate": 1.966179967198497e-05, + "loss": 0.6916, + "step": 9219 + }, + { + "epoch": 1.5051222399085753, + "grad_norm": 2.053025960922241, + "learning_rate": 1.9661717836974694e-05, + "loss": 0.8238, + "step": 9220 + }, + { + "epoch": 1.5052854985510795, + "grad_norm": 1.7003999948501587, + "learning_rate": 1.966163599223507e-05, + "loss": 0.8462, + "step": 9221 + }, + { + "epoch": 1.505448757193584, + "grad_norm": 1.9179718494415283, + "learning_rate": 1.9661554137766178e-05, + "loss": 0.6394, + "step": 9222 + }, + { + "epoch": 1.5056120158360882, + "grad_norm": 1.6990323066711426, + "learning_rate": 1.9661472273568106e-05, + "loss": 0.6942, + "step": 9223 + }, + { + "epoch": 1.5057752744785926, + "grad_norm": 1.8510026931762695, + "learning_rate": 1.9661390399640936e-05, + "loss": 0.7517, + "step": 9224 + }, + { + "epoch": 1.505938533121097, + "grad_norm": 1.67653226852417, + "learning_rate": 1.966130851598475e-05, + "loss": 0.6773, + "step": 9225 + }, + { + "epoch": 1.5061017917636015, + "grad_norm": 2.137173891067505, + "learning_rate": 1.9661226622599627e-05, + "loss": 0.7611, + "step": 9226 + }, + { + "epoch": 1.506265050406106, + "grad_norm": 1.999756932258606, + "learning_rate": 1.966114471948565e-05, + "loss": 0.7817, + "step": 9227 + }, + { + "epoch": 1.5064283090486104, + "grad_norm": 1.7564342021942139, + "learning_rate": 1.9661062806642903e-05, + "loss": 0.6394, + "step": 9228 + }, + { + "epoch": 1.5065915676911148, + "grad_norm": 1.843449592590332, + "learning_rate": 1.9660980884071468e-05, + "loss": 0.7945, + "step": 9229 + }, + { + "epoch": 1.506754826333619, + "grad_norm": 1.749974250793457, + "learning_rate": 1.966089895177143e-05, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.5069180849761235, + "grad_norm": 1.9351595640182495, + "learning_rate": 1.9660817009742867e-05, + "loss": 0.7638, + "step": 9231 + }, + { + "epoch": 1.5070813436186277, + "grad_norm": 1.7174971103668213, + "learning_rate": 1.9660735057985865e-05, + "loss": 0.7143, + "step": 9232 + }, + { + "epoch": 1.507244602261132, + "grad_norm": 1.7320640087127686, + "learning_rate": 1.9660653096500506e-05, + "loss": 0.6668, + "step": 9233 + }, + { + "epoch": 1.5074078609036365, + "grad_norm": 1.7025355100631714, + "learning_rate": 1.9660571125286873e-05, + "loss": 0.6427, + "step": 9234 + }, + { + "epoch": 1.507571119546141, + "grad_norm": 1.5663793087005615, + "learning_rate": 1.9660489144345042e-05, + "loss": 0.6787, + "step": 9235 + }, + { + "epoch": 1.5077343781886454, + "grad_norm": 1.5784380435943604, + "learning_rate": 1.966040715367511e-05, + "loss": 0.5122, + "step": 9236 + }, + { + "epoch": 1.5078976368311499, + "grad_norm": 1.7486788034439087, + "learning_rate": 1.966032515327714e-05, + "loss": 0.7114, + "step": 9237 + }, + { + "epoch": 1.5080608954736543, + "grad_norm": 1.7123584747314453, + "learning_rate": 1.966024314315123e-05, + "loss": 0.6606, + "step": 9238 + }, + { + "epoch": 1.5082241541161585, + "grad_norm": 1.9746376276016235, + "learning_rate": 1.966016112329746e-05, + "loss": 0.8135, + "step": 9239 + }, + { + "epoch": 1.508387412758663, + "grad_norm": 1.6528640985488892, + "learning_rate": 1.9660079093715906e-05, + "loss": 0.679, + "step": 9240 + }, + { + "epoch": 1.5085506714011672, + "grad_norm": 2.140596628189087, + "learning_rate": 1.9659997054406657e-05, + "loss": 0.8388, + "step": 9241 + }, + { + "epoch": 1.5087139300436716, + "grad_norm": 1.8062299489974976, + "learning_rate": 1.9659915005369795e-05, + "loss": 0.8103, + "step": 9242 + }, + { + "epoch": 1.508877188686176, + "grad_norm": 1.6751171350479126, + "learning_rate": 1.96598329466054e-05, + "loss": 0.6851, + "step": 9243 + }, + { + "epoch": 1.5090404473286805, + "grad_norm": 1.5374935865402222, + "learning_rate": 1.9659750878113555e-05, + "loss": 0.6603, + "step": 9244 + }, + { + "epoch": 1.509203705971185, + "grad_norm": 1.698721170425415, + "learning_rate": 1.9659668799894344e-05, + "loss": 0.5855, + "step": 9245 + }, + { + "epoch": 1.5093669646136894, + "grad_norm": 1.7462859153747559, + "learning_rate": 1.965958671194785e-05, + "loss": 0.7167, + "step": 9246 + }, + { + "epoch": 1.5095302232561936, + "grad_norm": 1.5849785804748535, + "learning_rate": 1.9659504614274153e-05, + "loss": 0.5612, + "step": 9247 + }, + { + "epoch": 1.509693481898698, + "grad_norm": 1.6543867588043213, + "learning_rate": 1.965942250687334e-05, + "loss": 0.7275, + "step": 9248 + }, + { + "epoch": 1.5098567405412024, + "grad_norm": 1.5935299396514893, + "learning_rate": 1.965934038974549e-05, + "loss": 0.6857, + "step": 9249 + }, + { + "epoch": 1.5100199991837067, + "grad_norm": 1.6700538396835327, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.6465, + "step": 9250 + }, + { + "epoch": 1.510183257826211, + "grad_norm": 1.8055061101913452, + "learning_rate": 1.965917612630901e-05, + "loss": 0.7505, + "step": 9251 + }, + { + "epoch": 1.5103465164687155, + "grad_norm": 2.0924155712127686, + "learning_rate": 1.965909398000055e-05, + "loss": 0.7012, + "step": 9252 + }, + { + "epoch": 1.51050977511122, + "grad_norm": 1.4190868139266968, + "learning_rate": 1.965901182396538e-05, + "loss": 0.5304, + "step": 9253 + }, + { + "epoch": 1.5106730337537244, + "grad_norm": 1.8384699821472168, + "learning_rate": 1.9658929658203593e-05, + "loss": 0.7084, + "step": 9254 + }, + { + "epoch": 1.5108362923962289, + "grad_norm": 1.6622133255004883, + "learning_rate": 1.965884748271526e-05, + "loss": 0.6267, + "step": 9255 + }, + { + "epoch": 1.510999551038733, + "grad_norm": 1.6499580144882202, + "learning_rate": 1.9658765297500478e-05, + "loss": 0.6374, + "step": 9256 + }, + { + "epoch": 1.5111628096812375, + "grad_norm": 2.0668561458587646, + "learning_rate": 1.9658683102559317e-05, + "loss": 0.7817, + "step": 9257 + }, + { + "epoch": 1.5113260683237417, + "grad_norm": 1.9733960628509521, + "learning_rate": 1.9658600897891867e-05, + "loss": 0.7533, + "step": 9258 + }, + { + "epoch": 1.5114893269662462, + "grad_norm": 1.5487282276153564, + "learning_rate": 1.9658518683498204e-05, + "loss": 0.6505, + "step": 9259 + }, + { + "epoch": 1.5116525856087506, + "grad_norm": 1.4914352893829346, + "learning_rate": 1.9658436459378422e-05, + "loss": 0.6095, + "step": 9260 + }, + { + "epoch": 1.511815844251255, + "grad_norm": 1.7780154943466187, + "learning_rate": 1.965835422553259e-05, + "loss": 0.7749, + "step": 9261 + }, + { + "epoch": 1.5119791028937595, + "grad_norm": 1.8552535772323608, + "learning_rate": 1.96582719819608e-05, + "loss": 0.7629, + "step": 9262 + }, + { + "epoch": 1.512142361536264, + "grad_norm": 1.5163869857788086, + "learning_rate": 1.9658189728663136e-05, + "loss": 0.6201, + "step": 9263 + }, + { + "epoch": 1.5123056201787684, + "grad_norm": 1.7470884323120117, + "learning_rate": 1.9658107465639676e-05, + "loss": 0.7801, + "step": 9264 + }, + { + "epoch": 1.5124688788212726, + "grad_norm": 1.929487705230713, + "learning_rate": 1.9658025192890502e-05, + "loss": 0.6101, + "step": 9265 + }, + { + "epoch": 1.512632137463777, + "grad_norm": 1.523269772529602, + "learning_rate": 1.9657942910415703e-05, + "loss": 0.6186, + "step": 9266 + }, + { + "epoch": 1.5127953961062812, + "grad_norm": 1.6428195238113403, + "learning_rate": 1.9657860618215353e-05, + "loss": 0.725, + "step": 9267 + }, + { + "epoch": 1.5129586547487857, + "grad_norm": 1.737205147743225, + "learning_rate": 1.9657778316289543e-05, + "loss": 0.6692, + "step": 9268 + }, + { + "epoch": 1.51312191339129, + "grad_norm": 1.7569655179977417, + "learning_rate": 1.9657696004638355e-05, + "loss": 0.6871, + "step": 9269 + }, + { + "epoch": 1.5132851720337945, + "grad_norm": 1.7467747926712036, + "learning_rate": 1.9657613683261866e-05, + "loss": 0.6881, + "step": 9270 + }, + { + "epoch": 1.513448430676299, + "grad_norm": 1.6222825050354004, + "learning_rate": 1.9657531352160163e-05, + "loss": 0.6495, + "step": 9271 + }, + { + "epoch": 1.5136116893188034, + "grad_norm": 1.6093761920928955, + "learning_rate": 1.9657449011333328e-05, + "loss": 0.62, + "step": 9272 + }, + { + "epoch": 1.5137749479613078, + "grad_norm": 1.7211860418319702, + "learning_rate": 1.965736666078145e-05, + "loss": 0.6791, + "step": 9273 + }, + { + "epoch": 1.513938206603812, + "grad_norm": 1.9281294345855713, + "learning_rate": 1.96572843005046e-05, + "loss": 0.8709, + "step": 9274 + }, + { + "epoch": 1.5141014652463165, + "grad_norm": 1.6260515451431274, + "learning_rate": 1.9657201930502872e-05, + "loss": 0.6259, + "step": 9275 + }, + { + "epoch": 1.5142647238888207, + "grad_norm": 1.791630744934082, + "learning_rate": 1.965711955077634e-05, + "loss": 0.7608, + "step": 9276 + }, + { + "epoch": 1.5144279825313252, + "grad_norm": 1.6905677318572998, + "learning_rate": 1.9657037161325095e-05, + "loss": 0.8017, + "step": 9277 + }, + { + "epoch": 1.5145912411738296, + "grad_norm": 1.9632296562194824, + "learning_rate": 1.9656954762149213e-05, + "loss": 0.7707, + "step": 9278 + }, + { + "epoch": 1.514754499816334, + "grad_norm": 1.8477476835250854, + "learning_rate": 1.965687235324878e-05, + "loss": 0.5879, + "step": 9279 + }, + { + "epoch": 1.5149177584588385, + "grad_norm": 1.8311630487442017, + "learning_rate": 1.965678993462388e-05, + "loss": 0.7546, + "step": 9280 + }, + { + "epoch": 1.515081017101343, + "grad_norm": 1.6614177227020264, + "learning_rate": 1.9656707506274595e-05, + "loss": 0.6978, + "step": 9281 + }, + { + "epoch": 1.5152442757438473, + "grad_norm": 1.8868060111999512, + "learning_rate": 1.965662506820101e-05, + "loss": 0.8386, + "step": 9282 + }, + { + "epoch": 1.5154075343863516, + "grad_norm": 1.8684314489364624, + "learning_rate": 1.9656542620403203e-05, + "loss": 0.6841, + "step": 9283 + }, + { + "epoch": 1.515570793028856, + "grad_norm": 1.99854576587677, + "learning_rate": 1.9656460162881262e-05, + "loss": 0.7589, + "step": 9284 + }, + { + "epoch": 1.5157340516713602, + "grad_norm": 1.7998199462890625, + "learning_rate": 1.965637769563527e-05, + "loss": 0.6682, + "step": 9285 + }, + { + "epoch": 1.5158973103138647, + "grad_norm": 1.6971325874328613, + "learning_rate": 1.9656295218665306e-05, + "loss": 0.7699, + "step": 9286 + }, + { + "epoch": 1.516060568956369, + "grad_norm": 2.063441753387451, + "learning_rate": 1.9656212731971452e-05, + "loss": 0.8038, + "step": 9287 + }, + { + "epoch": 1.5162238275988735, + "grad_norm": 1.5106374025344849, + "learning_rate": 1.96561302355538e-05, + "loss": 0.6174, + "step": 9288 + }, + { + "epoch": 1.516387086241378, + "grad_norm": 2.0775322914123535, + "learning_rate": 1.9656047729412426e-05, + "loss": 0.7731, + "step": 9289 + }, + { + "epoch": 1.5165503448838824, + "grad_norm": 1.535444974899292, + "learning_rate": 1.965596521354741e-05, + "loss": 0.6611, + "step": 9290 + }, + { + "epoch": 1.5167136035263868, + "grad_norm": 1.8570891618728638, + "learning_rate": 1.9655882687958845e-05, + "loss": 0.6318, + "step": 9291 + }, + { + "epoch": 1.516876862168891, + "grad_norm": 1.9117178916931152, + "learning_rate": 1.965580015264681e-05, + "loss": 0.7406, + "step": 9292 + }, + { + "epoch": 1.5170401208113955, + "grad_norm": 2.491406202316284, + "learning_rate": 1.965571760761138e-05, + "loss": 0.8373, + "step": 9293 + }, + { + "epoch": 1.5172033794538997, + "grad_norm": 1.6160461902618408, + "learning_rate": 1.9655635052852648e-05, + "loss": 0.6843, + "step": 9294 + }, + { + "epoch": 1.5173666380964042, + "grad_norm": 1.84751558303833, + "learning_rate": 1.9655552488370694e-05, + "loss": 0.6786, + "step": 9295 + }, + { + "epoch": 1.5175298967389086, + "grad_norm": 1.9708728790283203, + "learning_rate": 1.9655469914165604e-05, + "loss": 0.6661, + "step": 9296 + }, + { + "epoch": 1.517693155381413, + "grad_norm": 1.6271531581878662, + "learning_rate": 1.9655387330237454e-05, + "loss": 0.6595, + "step": 9297 + }, + { + "epoch": 1.5178564140239175, + "grad_norm": 1.6992237567901611, + "learning_rate": 1.9655304736586335e-05, + "loss": 0.6635, + "step": 9298 + }, + { + "epoch": 1.518019672666422, + "grad_norm": 1.8905822038650513, + "learning_rate": 1.9655222133212327e-05, + "loss": 0.7588, + "step": 9299 + }, + { + "epoch": 1.5181829313089261, + "grad_norm": 1.8169599771499634, + "learning_rate": 1.965513952011551e-05, + "loss": 0.706, + "step": 9300 + }, + { + "epoch": 1.5183461899514306, + "grad_norm": 1.8131130933761597, + "learning_rate": 1.965505689729597e-05, + "loss": 0.6845, + "step": 9301 + }, + { + "epoch": 1.518509448593935, + "grad_norm": 1.4808218479156494, + "learning_rate": 1.965497426475379e-05, + "loss": 0.6094, + "step": 9302 + }, + { + "epoch": 1.5186727072364392, + "grad_norm": 1.6372148990631104, + "learning_rate": 1.965489162248906e-05, + "loss": 0.5549, + "step": 9303 + }, + { + "epoch": 1.5188359658789437, + "grad_norm": 1.836823582649231, + "learning_rate": 1.9654808970501852e-05, + "loss": 0.6963, + "step": 9304 + }, + { + "epoch": 1.518999224521448, + "grad_norm": 1.5312870740890503, + "learning_rate": 1.9654726308792252e-05, + "loss": 0.6237, + "step": 9305 + }, + { + "epoch": 1.5191624831639525, + "grad_norm": 1.572156548500061, + "learning_rate": 1.9654643637360347e-05, + "loss": 0.5897, + "step": 9306 + }, + { + "epoch": 1.519325741806457, + "grad_norm": 1.3847283124923706, + "learning_rate": 1.9654560956206218e-05, + "loss": 0.5061, + "step": 9307 + }, + { + "epoch": 1.5194890004489614, + "grad_norm": 1.8717001676559448, + "learning_rate": 1.965447826532995e-05, + "loss": 0.7329, + "step": 9308 + }, + { + "epoch": 1.5196522590914656, + "grad_norm": 1.6560218334197998, + "learning_rate": 1.9654395564731624e-05, + "loss": 0.5534, + "step": 9309 + }, + { + "epoch": 1.51981551773397, + "grad_norm": 1.86985445022583, + "learning_rate": 1.9654312854411325e-05, + "loss": 0.7143, + "step": 9310 + }, + { + "epoch": 1.5199787763764743, + "grad_norm": 1.5928958654403687, + "learning_rate": 1.9654230134369134e-05, + "loss": 0.7016, + "step": 9311 + }, + { + "epoch": 1.5201420350189787, + "grad_norm": 1.5898302793502808, + "learning_rate": 1.9654147404605136e-05, + "loss": 0.6971, + "step": 9312 + }, + { + "epoch": 1.5203052936614831, + "grad_norm": 1.5732944011688232, + "learning_rate": 1.9654064665119415e-05, + "loss": 0.6445, + "step": 9313 + }, + { + "epoch": 1.5204685523039876, + "grad_norm": 1.8308943510055542, + "learning_rate": 1.9653981915912054e-05, + "loss": 0.6964, + "step": 9314 + }, + { + "epoch": 1.520631810946492, + "grad_norm": 1.5813237428665161, + "learning_rate": 1.9653899156983132e-05, + "loss": 0.7283, + "step": 9315 + }, + { + "epoch": 1.5207950695889965, + "grad_norm": 1.772891640663147, + "learning_rate": 1.965381638833274e-05, + "loss": 0.86, + "step": 9316 + }, + { + "epoch": 1.520958328231501, + "grad_norm": 1.5530004501342773, + "learning_rate": 1.9653733609960956e-05, + "loss": 0.6495, + "step": 9317 + }, + { + "epoch": 1.5211215868740051, + "grad_norm": 1.4502285718917847, + "learning_rate": 1.9653650821867867e-05, + "loss": 0.5297, + "step": 9318 + }, + { + "epoch": 1.5212848455165096, + "grad_norm": 1.927639365196228, + "learning_rate": 1.965356802405355e-05, + "loss": 0.8034, + "step": 9319 + }, + { + "epoch": 1.5214481041590138, + "grad_norm": 1.9937869310379028, + "learning_rate": 1.9653485216518094e-05, + "loss": 0.7011, + "step": 9320 + }, + { + "epoch": 1.5216113628015182, + "grad_norm": 1.6955549716949463, + "learning_rate": 1.9653402399261586e-05, + "loss": 0.65, + "step": 9321 + }, + { + "epoch": 1.5217746214440226, + "grad_norm": 2.036925792694092, + "learning_rate": 1.9653319572284098e-05, + "loss": 0.6836, + "step": 9322 + }, + { + "epoch": 1.521937880086527, + "grad_norm": 1.7867528200149536, + "learning_rate": 1.9653236735585724e-05, + "loss": 0.7357, + "step": 9323 + }, + { + "epoch": 1.5221011387290315, + "grad_norm": 1.7286356687545776, + "learning_rate": 1.965315388916654e-05, + "loss": 0.7656, + "step": 9324 + }, + { + "epoch": 1.522264397371536, + "grad_norm": 1.7868379354476929, + "learning_rate": 1.9653071033026635e-05, + "loss": 0.7108, + "step": 9325 + }, + { + "epoch": 1.5224276560140404, + "grad_norm": 1.4771361351013184, + "learning_rate": 1.9652988167166086e-05, + "loss": 0.5685, + "step": 9326 + }, + { + "epoch": 1.5225909146565446, + "grad_norm": 1.7919398546218872, + "learning_rate": 1.9652905291584987e-05, + "loss": 0.7258, + "step": 9327 + }, + { + "epoch": 1.522754173299049, + "grad_norm": 1.9175596237182617, + "learning_rate": 1.9652822406283408e-05, + "loss": 0.7582, + "step": 9328 + }, + { + "epoch": 1.5229174319415533, + "grad_norm": 1.8960909843444824, + "learning_rate": 1.9652739511261446e-05, + "loss": 0.7412, + "step": 9329 + }, + { + "epoch": 1.5230806905840577, + "grad_norm": 1.8308054208755493, + "learning_rate": 1.9652656606519174e-05, + "loss": 0.7085, + "step": 9330 + }, + { + "epoch": 1.5232439492265621, + "grad_norm": 1.6823025941848755, + "learning_rate": 1.9652573692056677e-05, + "loss": 0.6263, + "step": 9331 + }, + { + "epoch": 1.5234072078690666, + "grad_norm": 1.8177499771118164, + "learning_rate": 1.9652490767874047e-05, + "loss": 0.7205, + "step": 9332 + }, + { + "epoch": 1.523570466511571, + "grad_norm": 1.3807194232940674, + "learning_rate": 1.9652407833971358e-05, + "loss": 0.546, + "step": 9333 + }, + { + "epoch": 1.5237337251540755, + "grad_norm": 1.6263338327407837, + "learning_rate": 1.9652324890348695e-05, + "loss": 0.6379, + "step": 9334 + }, + { + "epoch": 1.52389698379658, + "grad_norm": 1.8905985355377197, + "learning_rate": 1.9652241937006146e-05, + "loss": 0.7782, + "step": 9335 + }, + { + "epoch": 1.5240602424390841, + "grad_norm": 1.5670658349990845, + "learning_rate": 1.9652158973943792e-05, + "loss": 0.7025, + "step": 9336 + }, + { + "epoch": 1.5242235010815885, + "grad_norm": 1.6258268356323242, + "learning_rate": 1.9652076001161715e-05, + "loss": 0.6802, + "step": 9337 + }, + { + "epoch": 1.5243867597240928, + "grad_norm": 2.497962236404419, + "learning_rate": 1.9651993018660002e-05, + "loss": 0.9689, + "step": 9338 + }, + { + "epoch": 1.5245500183665972, + "grad_norm": 2.0998306274414062, + "learning_rate": 1.9651910026438732e-05, + "loss": 0.8589, + "step": 9339 + }, + { + "epoch": 1.5247132770091016, + "grad_norm": 1.6235616207122803, + "learning_rate": 1.9651827024497993e-05, + "loss": 0.597, + "step": 9340 + }, + { + "epoch": 1.524876535651606, + "grad_norm": 1.7903447151184082, + "learning_rate": 1.9651744012837866e-05, + "loss": 0.7067, + "step": 9341 + }, + { + "epoch": 1.5250397942941105, + "grad_norm": 1.436903715133667, + "learning_rate": 1.9651660991458435e-05, + "loss": 0.6438, + "step": 9342 + }, + { + "epoch": 1.525203052936615, + "grad_norm": 1.8167574405670166, + "learning_rate": 1.9651577960359783e-05, + "loss": 0.7403, + "step": 9343 + }, + { + "epoch": 1.5253663115791192, + "grad_norm": 1.548518419265747, + "learning_rate": 1.9651494919541997e-05, + "loss": 0.6544, + "step": 9344 + }, + { + "epoch": 1.5255295702216236, + "grad_norm": 1.7820303440093994, + "learning_rate": 1.9651411869005158e-05, + "loss": 0.5357, + "step": 9345 + }, + { + "epoch": 1.525692828864128, + "grad_norm": 1.4347519874572754, + "learning_rate": 1.965132880874935e-05, + "loss": 0.575, + "step": 9346 + }, + { + "epoch": 1.5258560875066323, + "grad_norm": 2.0128748416900635, + "learning_rate": 1.9651245738774655e-05, + "loss": 0.7017, + "step": 9347 + }, + { + "epoch": 1.5260193461491367, + "grad_norm": 1.557662010192871, + "learning_rate": 1.9651162659081156e-05, + "loss": 0.6049, + "step": 9348 + }, + { + "epoch": 1.5261826047916411, + "grad_norm": 1.7654147148132324, + "learning_rate": 1.9651079569668944e-05, + "loss": 0.7448, + "step": 9349 + }, + { + "epoch": 1.5263458634341456, + "grad_norm": 1.6691595315933228, + "learning_rate": 1.9650996470538093e-05, + "loss": 0.5857, + "step": 9350 + }, + { + "epoch": 1.52650912207665, + "grad_norm": 1.4767142534255981, + "learning_rate": 1.9650913361688694e-05, + "loss": 0.5637, + "step": 9351 + }, + { + "epoch": 1.5266723807191545, + "grad_norm": 1.6665476560592651, + "learning_rate": 1.9650830243120828e-05, + "loss": 0.7047, + "step": 9352 + }, + { + "epoch": 1.5268356393616587, + "grad_norm": 2.0503389835357666, + "learning_rate": 1.9650747114834578e-05, + "loss": 0.7518, + "step": 9353 + }, + { + "epoch": 1.526998898004163, + "grad_norm": 1.7322858572006226, + "learning_rate": 1.9650663976830023e-05, + "loss": 0.6056, + "step": 9354 + }, + { + "epoch": 1.5271621566466673, + "grad_norm": 1.836872935295105, + "learning_rate": 1.965058082910726e-05, + "loss": 0.8339, + "step": 9355 + }, + { + "epoch": 1.5273254152891718, + "grad_norm": 2.110888957977295, + "learning_rate": 1.965049767166636e-05, + "loss": 0.6538, + "step": 9356 + }, + { + "epoch": 1.5274886739316762, + "grad_norm": 1.788400411605835, + "learning_rate": 1.9650414504507412e-05, + "loss": 0.7375, + "step": 9357 + }, + { + "epoch": 1.5276519325741806, + "grad_norm": 1.6644761562347412, + "learning_rate": 1.96503313276305e-05, + "loss": 0.7288, + "step": 9358 + }, + { + "epoch": 1.527815191216685, + "grad_norm": 1.7994214296340942, + "learning_rate": 1.9650248141035707e-05, + "loss": 0.6819, + "step": 9359 + }, + { + "epoch": 1.5279784498591895, + "grad_norm": 1.8320527076721191, + "learning_rate": 1.9650164944723116e-05, + "loss": 0.7593, + "step": 9360 + }, + { + "epoch": 1.528141708501694, + "grad_norm": 1.9133325815200806, + "learning_rate": 1.9650081738692813e-05, + "loss": 0.7984, + "step": 9361 + }, + { + "epoch": 1.5283049671441982, + "grad_norm": 1.5597805976867676, + "learning_rate": 1.9649998522944878e-05, + "loss": 0.6208, + "step": 9362 + }, + { + "epoch": 1.5284682257867026, + "grad_norm": 2.3474831581115723, + "learning_rate": 1.9649915297479398e-05, + "loss": 0.7769, + "step": 9363 + }, + { + "epoch": 1.5286314844292068, + "grad_norm": 1.6871731281280518, + "learning_rate": 1.9649832062296458e-05, + "loss": 0.6851, + "step": 9364 + }, + { + "epoch": 1.5287947430717113, + "grad_norm": 2.04590106010437, + "learning_rate": 1.9649748817396136e-05, + "loss": 0.7258, + "step": 9365 + }, + { + "epoch": 1.5289580017142157, + "grad_norm": 1.6969974040985107, + "learning_rate": 1.964966556277852e-05, + "loss": 0.7712, + "step": 9366 + }, + { + "epoch": 1.5291212603567201, + "grad_norm": 1.7721267938613892, + "learning_rate": 1.9649582298443693e-05, + "loss": 0.7927, + "step": 9367 + }, + { + "epoch": 1.5292845189992246, + "grad_norm": 1.5970799922943115, + "learning_rate": 1.964949902439174e-05, + "loss": 0.6102, + "step": 9368 + }, + { + "epoch": 1.529447777641729, + "grad_norm": 1.5986828804016113, + "learning_rate": 1.964941574062275e-05, + "loss": 0.7066, + "step": 9369 + }, + { + "epoch": 1.5296110362842334, + "grad_norm": 1.8751593828201294, + "learning_rate": 1.964933244713679e-05, + "loss": 0.6897, + "step": 9370 + }, + { + "epoch": 1.5297742949267377, + "grad_norm": 1.4945485591888428, + "learning_rate": 1.9649249143933963e-05, + "loss": 0.562, + "step": 9371 + }, + { + "epoch": 1.529937553569242, + "grad_norm": 1.695502519607544, + "learning_rate": 1.964916583101434e-05, + "loss": 0.6652, + "step": 9372 + }, + { + "epoch": 1.5301008122117463, + "grad_norm": 1.6750316619873047, + "learning_rate": 1.9649082508378013e-05, + "loss": 0.693, + "step": 9373 + }, + { + "epoch": 1.5302640708542508, + "grad_norm": 1.7775930166244507, + "learning_rate": 1.964899917602506e-05, + "loss": 0.6717, + "step": 9374 + }, + { + "epoch": 1.5304273294967552, + "grad_norm": 1.643917202949524, + "learning_rate": 1.964891583395557e-05, + "loss": 0.6035, + "step": 9375 + }, + { + "epoch": 1.5305905881392596, + "grad_norm": 1.6897881031036377, + "learning_rate": 1.964883248216962e-05, + "loss": 0.6478, + "step": 9376 + }, + { + "epoch": 1.530753846781764, + "grad_norm": 1.842919945716858, + "learning_rate": 1.9648749120667302e-05, + "loss": 0.766, + "step": 9377 + }, + { + "epoch": 1.5309171054242685, + "grad_norm": 1.752646565437317, + "learning_rate": 1.9648665749448695e-05, + "loss": 0.6809, + "step": 9378 + }, + { + "epoch": 1.531080364066773, + "grad_norm": 1.7405864000320435, + "learning_rate": 1.9648582368513885e-05, + "loss": 0.7546, + "step": 9379 + }, + { + "epoch": 1.5312436227092772, + "grad_norm": 1.726799488067627, + "learning_rate": 1.9648498977862954e-05, + "loss": 0.6971, + "step": 9380 + }, + { + "epoch": 1.5314068813517816, + "grad_norm": 1.6051521301269531, + "learning_rate": 1.964841557749599e-05, + "loss": 0.6073, + "step": 9381 + }, + { + "epoch": 1.5315701399942858, + "grad_norm": 2.2582545280456543, + "learning_rate": 1.9648332167413067e-05, + "loss": 0.6857, + "step": 9382 + }, + { + "epoch": 1.5317333986367903, + "grad_norm": 2.403881072998047, + "learning_rate": 1.9648248747614285e-05, + "loss": 0.7608, + "step": 9383 + }, + { + "epoch": 1.5318966572792947, + "grad_norm": 1.3908957242965698, + "learning_rate": 1.9648165318099714e-05, + "loss": 0.5469, + "step": 9384 + }, + { + "epoch": 1.5320599159217991, + "grad_norm": 1.831740140914917, + "learning_rate": 1.9648081878869443e-05, + "loss": 0.7482, + "step": 9385 + }, + { + "epoch": 1.5322231745643036, + "grad_norm": 1.8593417406082153, + "learning_rate": 1.964799842992356e-05, + "loss": 0.6722, + "step": 9386 + }, + { + "epoch": 1.532386433206808, + "grad_norm": 1.6878613233566284, + "learning_rate": 1.9647914971262137e-05, + "loss": 0.5754, + "step": 9387 + }, + { + "epoch": 1.5325496918493122, + "grad_norm": 1.3713314533233643, + "learning_rate": 1.9647831502885273e-05, + "loss": 0.6178, + "step": 9388 + }, + { + "epoch": 1.5327129504918167, + "grad_norm": 1.7864274978637695, + "learning_rate": 1.9647748024793044e-05, + "loss": 0.6046, + "step": 9389 + }, + { + "epoch": 1.532876209134321, + "grad_norm": 2.0263280868530273, + "learning_rate": 1.9647664536985536e-05, + "loss": 0.6397, + "step": 9390 + }, + { + "epoch": 1.5330394677768253, + "grad_norm": 2.020458459854126, + "learning_rate": 1.964758103946283e-05, + "loss": 0.5877, + "step": 9391 + }, + { + "epoch": 1.5332027264193298, + "grad_norm": 2.217482328414917, + "learning_rate": 1.9647497532225014e-05, + "loss": 0.7866, + "step": 9392 + }, + { + "epoch": 1.5333659850618342, + "grad_norm": 1.3647398948669434, + "learning_rate": 1.964741401527217e-05, + "loss": 0.594, + "step": 9393 + }, + { + "epoch": 1.5335292437043386, + "grad_norm": 1.7567780017852783, + "learning_rate": 1.9647330488604382e-05, + "loss": 0.6781, + "step": 9394 + }, + { + "epoch": 1.533692502346843, + "grad_norm": 1.8904190063476562, + "learning_rate": 1.9647246952221734e-05, + "loss": 0.7468, + "step": 9395 + }, + { + "epoch": 1.5338557609893475, + "grad_norm": 1.7654463052749634, + "learning_rate": 1.9647163406124315e-05, + "loss": 0.6503, + "step": 9396 + }, + { + "epoch": 1.5340190196318517, + "grad_norm": 1.9758763313293457, + "learning_rate": 1.96470798503122e-05, + "loss": 0.8037, + "step": 9397 + }, + { + "epoch": 1.5341822782743562, + "grad_norm": 1.7731866836547852, + "learning_rate": 1.964699628478548e-05, + "loss": 0.6418, + "step": 9398 + }, + { + "epoch": 1.5343455369168604, + "grad_norm": 1.700462818145752, + "learning_rate": 1.964691270954424e-05, + "loss": 0.6968, + "step": 9399 + }, + { + "epoch": 1.5345087955593648, + "grad_norm": 1.5746897459030151, + "learning_rate": 1.964682912458856e-05, + "loss": 0.6294, + "step": 9400 + }, + { + "epoch": 1.5346720542018693, + "grad_norm": 1.8645474910736084, + "learning_rate": 1.9646745529918526e-05, + "loss": 0.6567, + "step": 9401 + }, + { + "epoch": 1.5348353128443737, + "grad_norm": 1.8743840456008911, + "learning_rate": 1.964666192553422e-05, + "loss": 0.7049, + "step": 9402 + }, + { + "epoch": 1.5349985714868781, + "grad_norm": 1.6272512674331665, + "learning_rate": 1.9646578311435728e-05, + "loss": 0.6325, + "step": 9403 + }, + { + "epoch": 1.5351618301293826, + "grad_norm": 1.9324036836624146, + "learning_rate": 1.9646494687623135e-05, + "loss": 0.6345, + "step": 9404 + }, + { + "epoch": 1.535325088771887, + "grad_norm": 1.6970335245132446, + "learning_rate": 1.9646411054096524e-05, + "loss": 0.7013, + "step": 9405 + }, + { + "epoch": 1.5354883474143912, + "grad_norm": 1.935442328453064, + "learning_rate": 1.964632741085598e-05, + "loss": 0.6793, + "step": 9406 + }, + { + "epoch": 1.5356516060568957, + "grad_norm": 1.86101233959198, + "learning_rate": 1.9646243757901587e-05, + "loss": 0.623, + "step": 9407 + }, + { + "epoch": 1.5358148646993999, + "grad_norm": 1.621456503868103, + "learning_rate": 1.9646160095233428e-05, + "loss": 0.561, + "step": 9408 + }, + { + "epoch": 1.5359781233419043, + "grad_norm": 2.1596784591674805, + "learning_rate": 1.964607642285159e-05, + "loss": 0.6999, + "step": 9409 + }, + { + "epoch": 1.5361413819844087, + "grad_norm": 1.689790964126587, + "learning_rate": 1.9645992740756153e-05, + "loss": 0.6402, + "step": 9410 + }, + { + "epoch": 1.5363046406269132, + "grad_norm": 1.6741058826446533, + "learning_rate": 1.9645909048947207e-05, + "loss": 0.6824, + "step": 9411 + }, + { + "epoch": 1.5364678992694176, + "grad_norm": 1.7208133935928345, + "learning_rate": 1.9645825347424833e-05, + "loss": 0.6777, + "step": 9412 + }, + { + "epoch": 1.536631157911922, + "grad_norm": 1.6265747547149658, + "learning_rate": 1.9645741636189112e-05, + "loss": 0.704, + "step": 9413 + }, + { + "epoch": 1.5367944165544265, + "grad_norm": 1.759395956993103, + "learning_rate": 1.9645657915240136e-05, + "loss": 0.4509, + "step": 9414 + }, + { + "epoch": 1.5369576751969307, + "grad_norm": 1.398733139038086, + "learning_rate": 1.9645574184577982e-05, + "loss": 0.6261, + "step": 9415 + }, + { + "epoch": 1.5371209338394352, + "grad_norm": 2.0852603912353516, + "learning_rate": 1.964549044420274e-05, + "loss": 0.8101, + "step": 9416 + }, + { + "epoch": 1.5372841924819394, + "grad_norm": 1.7341099977493286, + "learning_rate": 1.964540669411449e-05, + "loss": 0.5394, + "step": 9417 + }, + { + "epoch": 1.5374474511244438, + "grad_norm": 1.7706143856048584, + "learning_rate": 1.964532293431332e-05, + "loss": 0.7079, + "step": 9418 + }, + { + "epoch": 1.5376107097669482, + "grad_norm": 2.027695894241333, + "learning_rate": 1.964523916479931e-05, + "loss": 0.8463, + "step": 9419 + }, + { + "epoch": 1.5377739684094527, + "grad_norm": 1.492868185043335, + "learning_rate": 1.9645155385572545e-05, + "loss": 0.6326, + "step": 9420 + }, + { + "epoch": 1.5379372270519571, + "grad_norm": 1.6397600173950195, + "learning_rate": 1.9645071596633115e-05, + "loss": 0.6157, + "step": 9421 + }, + { + "epoch": 1.5381004856944616, + "grad_norm": 1.9635564088821411, + "learning_rate": 1.9644987797981097e-05, + "loss": 0.7325, + "step": 9422 + }, + { + "epoch": 1.538263744336966, + "grad_norm": 2.169646739959717, + "learning_rate": 1.9644903989616582e-05, + "loss": 0.8427, + "step": 9423 + }, + { + "epoch": 1.5384270029794702, + "grad_norm": 1.6734992265701294, + "learning_rate": 1.964482017153965e-05, + "loss": 0.6713, + "step": 9424 + }, + { + "epoch": 1.5385902616219747, + "grad_norm": 1.5758618116378784, + "learning_rate": 1.9644736343750385e-05, + "loss": 0.6104, + "step": 9425 + }, + { + "epoch": 1.5387535202644789, + "grad_norm": 1.783199429512024, + "learning_rate": 1.9644652506248872e-05, + "loss": 0.6291, + "step": 9426 + }, + { + "epoch": 1.5389167789069833, + "grad_norm": 1.4725489616394043, + "learning_rate": 1.96445686590352e-05, + "loss": 0.6347, + "step": 9427 + }, + { + "epoch": 1.5390800375494877, + "grad_norm": 2.1908011436462402, + "learning_rate": 1.964448480210945e-05, + "loss": 0.6561, + "step": 9428 + }, + { + "epoch": 1.5392432961919922, + "grad_norm": 1.6816900968551636, + "learning_rate": 1.9644400935471706e-05, + "loss": 0.5446, + "step": 9429 + }, + { + "epoch": 1.5394065548344966, + "grad_norm": 2.009796142578125, + "learning_rate": 1.964431705912205e-05, + "loss": 0.6878, + "step": 9430 + }, + { + "epoch": 1.539569813477001, + "grad_norm": 1.8893064260482788, + "learning_rate": 1.9644233173060575e-05, + "loss": 0.716, + "step": 9431 + }, + { + "epoch": 1.5397330721195053, + "grad_norm": 1.784517765045166, + "learning_rate": 1.9644149277287353e-05, + "loss": 0.6896, + "step": 9432 + }, + { + "epoch": 1.5398963307620097, + "grad_norm": 1.4236937761306763, + "learning_rate": 1.9644065371802478e-05, + "loss": 0.5781, + "step": 9433 + }, + { + "epoch": 1.5400595894045142, + "grad_norm": 1.6893795728683472, + "learning_rate": 1.9643981456606034e-05, + "loss": 0.6674, + "step": 9434 + }, + { + "epoch": 1.5402228480470184, + "grad_norm": 1.608532190322876, + "learning_rate": 1.96438975316981e-05, + "loss": 0.5973, + "step": 9435 + }, + { + "epoch": 1.5403861066895228, + "grad_norm": 1.5652847290039062, + "learning_rate": 1.9643813597078768e-05, + "loss": 0.6186, + "step": 9436 + }, + { + "epoch": 1.5405493653320272, + "grad_norm": 1.8109371662139893, + "learning_rate": 1.9643729652748115e-05, + "loss": 0.6145, + "step": 9437 + }, + { + "epoch": 1.5407126239745317, + "grad_norm": 1.8994207382202148, + "learning_rate": 1.964364569870623e-05, + "loss": 0.8121, + "step": 9438 + }, + { + "epoch": 1.5408758826170361, + "grad_norm": 2.2794837951660156, + "learning_rate": 1.9643561734953195e-05, + "loss": 0.7427, + "step": 9439 + }, + { + "epoch": 1.5410391412595406, + "grad_norm": 1.7784830331802368, + "learning_rate": 1.9643477761489097e-05, + "loss": 0.747, + "step": 9440 + }, + { + "epoch": 1.5412023999020448, + "grad_norm": 1.7229548692703247, + "learning_rate": 1.9643393778314018e-05, + "loss": 0.698, + "step": 9441 + }, + { + "epoch": 1.5413656585445492, + "grad_norm": 1.675310730934143, + "learning_rate": 1.9643309785428045e-05, + "loss": 0.6494, + "step": 9442 + }, + { + "epoch": 1.5415289171870534, + "grad_norm": 1.901889681816101, + "learning_rate": 1.9643225782831262e-05, + "loss": 0.6561, + "step": 9443 + }, + { + "epoch": 1.5416921758295579, + "grad_norm": 1.9107081890106201, + "learning_rate": 1.964314177052375e-05, + "loss": 0.6224, + "step": 9444 + }, + { + "epoch": 1.5418554344720623, + "grad_norm": 1.5033913850784302, + "learning_rate": 1.96430577485056e-05, + "loss": 0.5995, + "step": 9445 + }, + { + "epoch": 1.5420186931145667, + "grad_norm": 1.827280044555664, + "learning_rate": 1.9642973716776892e-05, + "loss": 0.6724, + "step": 9446 + }, + { + "epoch": 1.5421819517570712, + "grad_norm": 1.7390449047088623, + "learning_rate": 1.9642889675337717e-05, + "loss": 0.6183, + "step": 9447 + }, + { + "epoch": 1.5423452103995756, + "grad_norm": 1.7406712770462036, + "learning_rate": 1.964280562418815e-05, + "loss": 0.6435, + "step": 9448 + }, + { + "epoch": 1.54250846904208, + "grad_norm": 1.9621742963790894, + "learning_rate": 1.964272156332828e-05, + "loss": 0.7979, + "step": 9449 + }, + { + "epoch": 1.5426717276845843, + "grad_norm": 2.3159193992614746, + "learning_rate": 1.9642637492758193e-05, + "loss": 0.8412, + "step": 9450 + }, + { + "epoch": 1.5428349863270887, + "grad_norm": 1.9173122644424438, + "learning_rate": 1.9642553412477973e-05, + "loss": 0.6118, + "step": 9451 + }, + { + "epoch": 1.542998244969593, + "grad_norm": 1.4431688785552979, + "learning_rate": 1.9642469322487702e-05, + "loss": 0.5739, + "step": 9452 + }, + { + "epoch": 1.5431615036120974, + "grad_norm": 1.9666340351104736, + "learning_rate": 1.964238522278747e-05, + "loss": 0.6618, + "step": 9453 + }, + { + "epoch": 1.5433247622546018, + "grad_norm": 1.8873834609985352, + "learning_rate": 1.9642301113377355e-05, + "loss": 0.8051, + "step": 9454 + }, + { + "epoch": 1.5434880208971062, + "grad_norm": 1.6692343950271606, + "learning_rate": 1.9642216994257448e-05, + "loss": 0.6213, + "step": 9455 + }, + { + "epoch": 1.5436512795396107, + "grad_norm": 1.4786773920059204, + "learning_rate": 1.964213286542783e-05, + "loss": 0.6224, + "step": 9456 + }, + { + "epoch": 1.5438145381821151, + "grad_norm": 1.3987064361572266, + "learning_rate": 1.964204872688859e-05, + "loss": 0.4941, + "step": 9457 + }, + { + "epoch": 1.5439777968246196, + "grad_norm": 1.7440259456634521, + "learning_rate": 1.9641964578639805e-05, + "loss": 0.7177, + "step": 9458 + }, + { + "epoch": 1.5441410554671238, + "grad_norm": 1.8765745162963867, + "learning_rate": 1.9641880420681567e-05, + "loss": 0.7719, + "step": 9459 + }, + { + "epoch": 1.5443043141096282, + "grad_norm": 1.6687086820602417, + "learning_rate": 1.9641796253013957e-05, + "loss": 0.695, + "step": 9460 + }, + { + "epoch": 1.5444675727521324, + "grad_norm": 2.0355796813964844, + "learning_rate": 1.9641712075637062e-05, + "loss": 0.6932, + "step": 9461 + }, + { + "epoch": 1.5446308313946369, + "grad_norm": 1.624402403831482, + "learning_rate": 1.9641627888550964e-05, + "loss": 0.7277, + "step": 9462 + }, + { + "epoch": 1.5447940900371413, + "grad_norm": 1.74776291847229, + "learning_rate": 1.9641543691755747e-05, + "loss": 0.672, + "step": 9463 + }, + { + "epoch": 1.5449573486796457, + "grad_norm": 1.9137241840362549, + "learning_rate": 1.9641459485251504e-05, + "loss": 0.6882, + "step": 9464 + }, + { + "epoch": 1.5451206073221502, + "grad_norm": 1.885491132736206, + "learning_rate": 1.964137526903831e-05, + "loss": 0.8219, + "step": 9465 + }, + { + "epoch": 1.5452838659646546, + "grad_norm": 1.469007134437561, + "learning_rate": 1.9641291043116254e-05, + "loss": 0.6622, + "step": 9466 + }, + { + "epoch": 1.545447124607159, + "grad_norm": 1.6938213109970093, + "learning_rate": 1.964120680748542e-05, + "loss": 0.6371, + "step": 9467 + }, + { + "epoch": 1.5456103832496633, + "grad_norm": 1.6654342412948608, + "learning_rate": 1.9641122562145895e-05, + "loss": 0.6855, + "step": 9468 + }, + { + "epoch": 1.5457736418921677, + "grad_norm": 2.3407859802246094, + "learning_rate": 1.964103830709776e-05, + "loss": 0.8867, + "step": 9469 + }, + { + "epoch": 1.545936900534672, + "grad_norm": 1.5600014925003052, + "learning_rate": 1.96409540423411e-05, + "loss": 0.6878, + "step": 9470 + }, + { + "epoch": 1.5461001591771764, + "grad_norm": 1.5103298425674438, + "learning_rate": 1.9640869767876006e-05, + "loss": 0.4963, + "step": 9471 + }, + { + "epoch": 1.5462634178196808, + "grad_norm": 1.6041033267974854, + "learning_rate": 1.9640785483702558e-05, + "loss": 0.6611, + "step": 9472 + }, + { + "epoch": 1.5464266764621852, + "grad_norm": 1.6403136253356934, + "learning_rate": 1.964070118982084e-05, + "loss": 0.6864, + "step": 9473 + }, + { + "epoch": 1.5465899351046897, + "grad_norm": 1.4710911512374878, + "learning_rate": 1.9640616886230942e-05, + "loss": 0.5796, + "step": 9474 + }, + { + "epoch": 1.546753193747194, + "grad_norm": 1.810124397277832, + "learning_rate": 1.9640532572932944e-05, + "loss": 0.7551, + "step": 9475 + }, + { + "epoch": 1.5469164523896983, + "grad_norm": 1.5248197317123413, + "learning_rate": 1.9640448249926928e-05, + "loss": 0.5875, + "step": 9476 + }, + { + "epoch": 1.5470797110322028, + "grad_norm": 1.9090111255645752, + "learning_rate": 1.964036391721299e-05, + "loss": 0.7733, + "step": 9477 + }, + { + "epoch": 1.5472429696747072, + "grad_norm": 1.6833593845367432, + "learning_rate": 1.9640279574791203e-05, + "loss": 0.7213, + "step": 9478 + }, + { + "epoch": 1.5474062283172114, + "grad_norm": 1.626120686531067, + "learning_rate": 1.964019522266166e-05, + "loss": 0.5786, + "step": 9479 + }, + { + "epoch": 1.5475694869597159, + "grad_norm": 1.9080774784088135, + "learning_rate": 1.9640110860824442e-05, + "loss": 0.8009, + "step": 9480 + }, + { + "epoch": 1.5477327456022203, + "grad_norm": 1.9155495166778564, + "learning_rate": 1.9640026489279633e-05, + "loss": 0.7603, + "step": 9481 + }, + { + "epoch": 1.5478960042447247, + "grad_norm": 1.9354203939437866, + "learning_rate": 1.9639942108027322e-05, + "loss": 0.8244, + "step": 9482 + }, + { + "epoch": 1.5480592628872292, + "grad_norm": 1.695570707321167, + "learning_rate": 1.963985771706759e-05, + "loss": 0.7406, + "step": 9483 + }, + { + "epoch": 1.5482225215297336, + "grad_norm": 1.7488254308700562, + "learning_rate": 1.9639773316400525e-05, + "loss": 0.6846, + "step": 9484 + }, + { + "epoch": 1.5483857801722378, + "grad_norm": 1.9334919452667236, + "learning_rate": 1.9639688906026212e-05, + "loss": 0.8951, + "step": 9485 + }, + { + "epoch": 1.5485490388147423, + "grad_norm": 2.0990407466888428, + "learning_rate": 1.9639604485944735e-05, + "loss": 0.6024, + "step": 9486 + }, + { + "epoch": 1.5487122974572465, + "grad_norm": 1.7125242948532104, + "learning_rate": 1.9639520056156176e-05, + "loss": 0.6593, + "step": 9487 + }, + { + "epoch": 1.548875556099751, + "grad_norm": 1.9527026414871216, + "learning_rate": 1.9639435616660622e-05, + "loss": 0.7042, + "step": 9488 + }, + { + "epoch": 1.5490388147422554, + "grad_norm": 1.6159279346466064, + "learning_rate": 1.9639351167458163e-05, + "loss": 0.632, + "step": 9489 + }, + { + "epoch": 1.5492020733847598, + "grad_norm": 1.655697226524353, + "learning_rate": 1.9639266708548878e-05, + "loss": 0.6587, + "step": 9490 + }, + { + "epoch": 1.5493653320272642, + "grad_norm": 1.8595929145812988, + "learning_rate": 1.9639182239932854e-05, + "loss": 0.6823, + "step": 9491 + }, + { + "epoch": 1.5495285906697687, + "grad_norm": 1.7073559761047363, + "learning_rate": 1.9639097761610174e-05, + "loss": 0.6504, + "step": 9492 + }, + { + "epoch": 1.549691849312273, + "grad_norm": 1.7612725496292114, + "learning_rate": 1.963901327358093e-05, + "loss": 0.5967, + "step": 9493 + }, + { + "epoch": 1.5498551079547773, + "grad_norm": 1.9051953554153442, + "learning_rate": 1.9638928775845197e-05, + "loss": 0.7358, + "step": 9494 + }, + { + "epoch": 1.5500183665972818, + "grad_norm": 1.7650326490402222, + "learning_rate": 1.963884426840307e-05, + "loss": 0.6547, + "step": 9495 + }, + { + "epoch": 1.550181625239786, + "grad_norm": 1.5050806999206543, + "learning_rate": 1.963875975125463e-05, + "loss": 0.6222, + "step": 9496 + }, + { + "epoch": 1.5503448838822904, + "grad_norm": 1.9541840553283691, + "learning_rate": 1.9638675224399957e-05, + "loss": 0.7282, + "step": 9497 + }, + { + "epoch": 1.5505081425247949, + "grad_norm": 1.9103825092315674, + "learning_rate": 1.963859068783914e-05, + "loss": 0.7999, + "step": 9498 + }, + { + "epoch": 1.5506714011672993, + "grad_norm": 1.9060308933258057, + "learning_rate": 1.963850614157227e-05, + "loss": 0.6585, + "step": 9499 + }, + { + "epoch": 1.5508346598098037, + "grad_norm": 1.8326544761657715, + "learning_rate": 1.9638421585599422e-05, + "loss": 0.74, + "step": 9500 + }, + { + "epoch": 1.5509979184523082, + "grad_norm": 2.0242741107940674, + "learning_rate": 1.963833701992069e-05, + "loss": 0.7776, + "step": 9501 + }, + { + "epoch": 1.5511611770948126, + "grad_norm": 1.8957571983337402, + "learning_rate": 1.9638252444536152e-05, + "loss": 0.7789, + "step": 9502 + }, + { + "epoch": 1.5513244357373168, + "grad_norm": 1.5746911764144897, + "learning_rate": 1.9638167859445894e-05, + "loss": 0.6695, + "step": 9503 + }, + { + "epoch": 1.5514876943798213, + "grad_norm": 1.419299840927124, + "learning_rate": 1.963808326465001e-05, + "loss": 0.6859, + "step": 9504 + }, + { + "epoch": 1.5516509530223255, + "grad_norm": 1.813405156135559, + "learning_rate": 1.9637998660148577e-05, + "loss": 0.6895, + "step": 9505 + }, + { + "epoch": 1.55181421166483, + "grad_norm": 1.8447346687316895, + "learning_rate": 1.9637914045941677e-05, + "loss": 0.7008, + "step": 9506 + }, + { + "epoch": 1.5519774703073344, + "grad_norm": 1.4773714542388916, + "learning_rate": 1.9637829422029405e-05, + "loss": 0.6266, + "step": 9507 + }, + { + "epoch": 1.5521407289498388, + "grad_norm": 2.018190622329712, + "learning_rate": 1.963774478841184e-05, + "loss": 0.7602, + "step": 9508 + }, + { + "epoch": 1.5523039875923432, + "grad_norm": 1.513196587562561, + "learning_rate": 1.9637660145089067e-05, + "loss": 0.6334, + "step": 9509 + }, + { + "epoch": 1.5524672462348477, + "grad_norm": 2.016235828399658, + "learning_rate": 1.9637575492061176e-05, + "loss": 0.7618, + "step": 9510 + }, + { + "epoch": 1.552630504877352, + "grad_norm": 1.7155921459197998, + "learning_rate": 1.9637490829328247e-05, + "loss": 0.7685, + "step": 9511 + }, + { + "epoch": 1.5527937635198563, + "grad_norm": 1.4299681186676025, + "learning_rate": 1.963740615689037e-05, + "loss": 0.5304, + "step": 9512 + }, + { + "epoch": 1.5529570221623608, + "grad_norm": 1.7057603597640991, + "learning_rate": 1.9637321474747625e-05, + "loss": 0.6019, + "step": 9513 + }, + { + "epoch": 1.553120280804865, + "grad_norm": 1.5450531244277954, + "learning_rate": 1.96372367829001e-05, + "loss": 0.5715, + "step": 9514 + }, + { + "epoch": 1.5532835394473694, + "grad_norm": 1.659255027770996, + "learning_rate": 1.963715208134788e-05, + "loss": 0.7389, + "step": 9515 + }, + { + "epoch": 1.5534467980898738, + "grad_norm": 1.8471099138259888, + "learning_rate": 1.9637067370091048e-05, + "loss": 0.7451, + "step": 9516 + }, + { + "epoch": 1.5536100567323783, + "grad_norm": 1.7966442108154297, + "learning_rate": 1.9636982649129695e-05, + "loss": 0.7506, + "step": 9517 + }, + { + "epoch": 1.5537733153748827, + "grad_norm": 1.3493247032165527, + "learning_rate": 1.9636897918463903e-05, + "loss": 0.5638, + "step": 9518 + }, + { + "epoch": 1.5539365740173872, + "grad_norm": 1.52828049659729, + "learning_rate": 1.9636813178093756e-05, + "loss": 0.6087, + "step": 9519 + }, + { + "epoch": 1.5540998326598914, + "grad_norm": 1.4387402534484863, + "learning_rate": 1.963672842801934e-05, + "loss": 0.555, + "step": 9520 + }, + { + "epoch": 1.5542630913023958, + "grad_norm": 1.6602486371994019, + "learning_rate": 1.9636643668240743e-05, + "loss": 0.6016, + "step": 9521 + }, + { + "epoch": 1.5544263499449003, + "grad_norm": 1.9072169065475464, + "learning_rate": 1.963655889875805e-05, + "loss": 0.7039, + "step": 9522 + }, + { + "epoch": 1.5545896085874045, + "grad_norm": 1.6349225044250488, + "learning_rate": 1.963647411957134e-05, + "loss": 0.5079, + "step": 9523 + }, + { + "epoch": 1.554752867229909, + "grad_norm": 1.445678949356079, + "learning_rate": 1.9636389330680708e-05, + "loss": 0.6049, + "step": 9524 + }, + { + "epoch": 1.5549161258724133, + "grad_norm": 1.4620128870010376, + "learning_rate": 1.963630453208623e-05, + "loss": 0.5696, + "step": 9525 + }, + { + "epoch": 1.5550793845149178, + "grad_norm": 1.8855654001235962, + "learning_rate": 1.9636219723788e-05, + "loss": 0.689, + "step": 9526 + }, + { + "epoch": 1.5552426431574222, + "grad_norm": 2.1212708950042725, + "learning_rate": 1.9636134905786096e-05, + "loss": 0.8069, + "step": 9527 + }, + { + "epoch": 1.5554059017999267, + "grad_norm": 2.0869486331939697, + "learning_rate": 1.9636050078080608e-05, + "loss": 0.8693, + "step": 9528 + }, + { + "epoch": 1.5555691604424309, + "grad_norm": 2.024595260620117, + "learning_rate": 1.9635965240671622e-05, + "loss": 0.7426, + "step": 9529 + }, + { + "epoch": 1.5557324190849353, + "grad_norm": 1.9686275720596313, + "learning_rate": 1.963588039355922e-05, + "loss": 0.7862, + "step": 9530 + }, + { + "epoch": 1.5558956777274395, + "grad_norm": 1.6909815073013306, + "learning_rate": 1.9635795536743487e-05, + "loss": 0.7284, + "step": 9531 + }, + { + "epoch": 1.556058936369944, + "grad_norm": 2.011925458908081, + "learning_rate": 1.9635710670224513e-05, + "loss": 0.8127, + "step": 9532 + }, + { + "epoch": 1.5562221950124484, + "grad_norm": 1.8116381168365479, + "learning_rate": 1.963562579400238e-05, + "loss": 0.7137, + "step": 9533 + }, + { + "epoch": 1.5563854536549528, + "grad_norm": 1.5640140771865845, + "learning_rate": 1.9635540908077173e-05, + "loss": 0.5296, + "step": 9534 + }, + { + "epoch": 1.5565487122974573, + "grad_norm": 1.7783217430114746, + "learning_rate": 1.963545601244898e-05, + "loss": 0.6481, + "step": 9535 + }, + { + "epoch": 1.5567119709399617, + "grad_norm": 1.8671517372131348, + "learning_rate": 1.963537110711789e-05, + "loss": 0.7734, + "step": 9536 + }, + { + "epoch": 1.5568752295824662, + "grad_norm": 1.9079513549804688, + "learning_rate": 1.963528619208398e-05, + "loss": 0.7461, + "step": 9537 + }, + { + "epoch": 1.5570384882249704, + "grad_norm": 1.9567031860351562, + "learning_rate": 1.9635201267347336e-05, + "loss": 0.7161, + "step": 9538 + }, + { + "epoch": 1.5572017468674748, + "grad_norm": 1.6965001821517944, + "learning_rate": 1.963511633290805e-05, + "loss": 0.7697, + "step": 9539 + }, + { + "epoch": 1.557365005509979, + "grad_norm": 1.786491870880127, + "learning_rate": 1.9635031388766204e-05, + "loss": 0.6188, + "step": 9540 + }, + { + "epoch": 1.5575282641524835, + "grad_norm": 1.981308937072754, + "learning_rate": 1.9634946434921884e-05, + "loss": 0.9482, + "step": 9541 + }, + { + "epoch": 1.557691522794988, + "grad_norm": 2.067786455154419, + "learning_rate": 1.9634861471375174e-05, + "loss": 0.8016, + "step": 9542 + }, + { + "epoch": 1.5578547814374923, + "grad_norm": 1.5726114511489868, + "learning_rate": 1.9634776498126166e-05, + "loss": 0.6652, + "step": 9543 + }, + { + "epoch": 1.5580180400799968, + "grad_norm": 1.5051180124282837, + "learning_rate": 1.9634691515174934e-05, + "loss": 0.5873, + "step": 9544 + }, + { + "epoch": 1.5581812987225012, + "grad_norm": 1.5987205505371094, + "learning_rate": 1.9634606522521574e-05, + "loss": 0.5854, + "step": 9545 + }, + { + "epoch": 1.5583445573650057, + "grad_norm": 1.6799548864364624, + "learning_rate": 1.963452152016617e-05, + "loss": 0.6978, + "step": 9546 + }, + { + "epoch": 1.5585078160075099, + "grad_norm": 1.5968040227890015, + "learning_rate": 1.96344365081088e-05, + "loss": 0.6686, + "step": 9547 + }, + { + "epoch": 1.5586710746500143, + "grad_norm": 2.0574891567230225, + "learning_rate": 1.9634351486349556e-05, + "loss": 0.6583, + "step": 9548 + }, + { + "epoch": 1.5588343332925185, + "grad_norm": 1.7352831363677979, + "learning_rate": 1.9634266454888527e-05, + "loss": 0.6494, + "step": 9549 + }, + { + "epoch": 1.558997591935023, + "grad_norm": 1.7633482217788696, + "learning_rate": 1.963418141372579e-05, + "loss": 0.714, + "step": 9550 + }, + { + "epoch": 1.5591608505775274, + "grad_norm": 1.8283002376556396, + "learning_rate": 1.963409636286144e-05, + "loss": 0.7036, + "step": 9551 + }, + { + "epoch": 1.5593241092200318, + "grad_norm": 1.66374933719635, + "learning_rate": 1.963401130229555e-05, + "loss": 0.6001, + "step": 9552 + }, + { + "epoch": 1.5594873678625363, + "grad_norm": 1.5216139554977417, + "learning_rate": 1.9633926232028216e-05, + "loss": 0.5998, + "step": 9553 + }, + { + "epoch": 1.5596506265050407, + "grad_norm": 1.7688652276992798, + "learning_rate": 1.9633841152059525e-05, + "loss": 0.6438, + "step": 9554 + }, + { + "epoch": 1.5598138851475452, + "grad_norm": 1.620160698890686, + "learning_rate": 1.963375606238955e-05, + "loss": 0.6136, + "step": 9555 + }, + { + "epoch": 1.5599771437900494, + "grad_norm": 1.5937694311141968, + "learning_rate": 1.9633670963018395e-05, + "loss": 0.6295, + "step": 9556 + }, + { + "epoch": 1.5601404024325538, + "grad_norm": 1.5217456817626953, + "learning_rate": 1.9633585853946132e-05, + "loss": 0.6039, + "step": 9557 + }, + { + "epoch": 1.560303661075058, + "grad_norm": 1.7440733909606934, + "learning_rate": 1.963350073517285e-05, + "loss": 0.5994, + "step": 9558 + }, + { + "epoch": 1.5604669197175625, + "grad_norm": 1.9913594722747803, + "learning_rate": 1.9633415606698633e-05, + "loss": 0.8227, + "step": 9559 + }, + { + "epoch": 1.560630178360067, + "grad_norm": 1.7856299877166748, + "learning_rate": 1.9633330468523572e-05, + "loss": 0.7041, + "step": 9560 + }, + { + "epoch": 1.5607934370025713, + "grad_norm": 1.4807101488113403, + "learning_rate": 1.963324532064775e-05, + "loss": 0.5649, + "step": 9561 + }, + { + "epoch": 1.5609566956450758, + "grad_norm": 1.810655951499939, + "learning_rate": 1.9633160163071255e-05, + "loss": 0.6781, + "step": 9562 + }, + { + "epoch": 1.5611199542875802, + "grad_norm": 1.9491256475448608, + "learning_rate": 1.9633074995794165e-05, + "loss": 0.7119, + "step": 9563 + }, + { + "epoch": 1.5612832129300847, + "grad_norm": 1.8171676397323608, + "learning_rate": 1.9632989818816572e-05, + "loss": 0.6558, + "step": 9564 + }, + { + "epoch": 1.5614464715725889, + "grad_norm": 1.7695295810699463, + "learning_rate": 1.963290463213856e-05, + "loss": 0.8233, + "step": 9565 + }, + { + "epoch": 1.5616097302150933, + "grad_norm": 1.6034908294677734, + "learning_rate": 1.963281943576022e-05, + "loss": 0.6301, + "step": 9566 + }, + { + "epoch": 1.5617729888575975, + "grad_norm": 1.45742666721344, + "learning_rate": 1.963273422968163e-05, + "loss": 0.645, + "step": 9567 + }, + { + "epoch": 1.561936247500102, + "grad_norm": 1.860463261604309, + "learning_rate": 1.963264901390288e-05, + "loss": 0.6393, + "step": 9568 + }, + { + "epoch": 1.5620995061426064, + "grad_norm": 1.9907405376434326, + "learning_rate": 1.9632563788424055e-05, + "loss": 0.837, + "step": 9569 + }, + { + "epoch": 1.5622627647851108, + "grad_norm": 1.9828431606292725, + "learning_rate": 1.9632478553245243e-05, + "loss": 0.7032, + "step": 9570 + }, + { + "epoch": 1.5624260234276153, + "grad_norm": 1.9811490774154663, + "learning_rate": 1.9632393308366525e-05, + "loss": 0.7237, + "step": 9571 + }, + { + "epoch": 1.5625892820701197, + "grad_norm": 1.7508249282836914, + "learning_rate": 1.963230805378799e-05, + "loss": 0.6036, + "step": 9572 + }, + { + "epoch": 1.562752540712624, + "grad_norm": 1.7123987674713135, + "learning_rate": 1.9632222789509722e-05, + "loss": 0.6487, + "step": 9573 + }, + { + "epoch": 1.5629157993551284, + "grad_norm": 1.7815738916397095, + "learning_rate": 1.963213751553181e-05, + "loss": 0.665, + "step": 9574 + }, + { + "epoch": 1.5630790579976328, + "grad_norm": 1.7878392934799194, + "learning_rate": 1.9632052231854337e-05, + "loss": 0.6659, + "step": 9575 + }, + { + "epoch": 1.563242316640137, + "grad_norm": 2.019481897354126, + "learning_rate": 1.9631966938477392e-05, + "loss": 0.803, + "step": 9576 + }, + { + "epoch": 1.5634055752826415, + "grad_norm": 1.6701409816741943, + "learning_rate": 1.9631881635401056e-05, + "loss": 0.629, + "step": 9577 + }, + { + "epoch": 1.563568833925146, + "grad_norm": 1.3975422382354736, + "learning_rate": 1.963179632262542e-05, + "loss": 0.5311, + "step": 9578 + }, + { + "epoch": 1.5637320925676503, + "grad_norm": 1.4472918510437012, + "learning_rate": 1.963171100015057e-05, + "loss": 0.4911, + "step": 9579 + }, + { + "epoch": 1.5638953512101548, + "grad_norm": 1.8500815629959106, + "learning_rate": 1.9631625667976584e-05, + "loss": 0.638, + "step": 9580 + }, + { + "epoch": 1.5640586098526592, + "grad_norm": 1.8440124988555908, + "learning_rate": 1.9631540326103554e-05, + "loss": 0.7963, + "step": 9581 + }, + { + "epoch": 1.5642218684951634, + "grad_norm": 1.9842220544815063, + "learning_rate": 1.963145497453157e-05, + "loss": 0.727, + "step": 9582 + }, + { + "epoch": 1.5643851271376679, + "grad_norm": 1.47636079788208, + "learning_rate": 1.963136961326071e-05, + "loss": 0.5364, + "step": 9583 + }, + { + "epoch": 1.564548385780172, + "grad_norm": 1.92975652217865, + "learning_rate": 1.9631284242291063e-05, + "loss": 0.6864, + "step": 9584 + }, + { + "epoch": 1.5647116444226765, + "grad_norm": 1.669007658958435, + "learning_rate": 1.9631198861622714e-05, + "loss": 0.7495, + "step": 9585 + }, + { + "epoch": 1.564874903065181, + "grad_norm": 1.7122869491577148, + "learning_rate": 1.9631113471255757e-05, + "loss": 0.7092, + "step": 9586 + }, + { + "epoch": 1.5650381617076854, + "grad_norm": 1.8365459442138672, + "learning_rate": 1.9631028071190265e-05, + "loss": 0.6647, + "step": 9587 + }, + { + "epoch": 1.5652014203501898, + "grad_norm": 1.5710625648498535, + "learning_rate": 1.9630942661426335e-05, + "loss": 0.676, + "step": 9588 + }, + { + "epoch": 1.5653646789926943, + "grad_norm": 1.5955301523208618, + "learning_rate": 1.9630857241964043e-05, + "loss": 0.6991, + "step": 9589 + }, + { + "epoch": 1.5655279376351987, + "grad_norm": 1.9265618324279785, + "learning_rate": 1.9630771812803484e-05, + "loss": 0.8455, + "step": 9590 + }, + { + "epoch": 1.565691196277703, + "grad_norm": 1.7142196893692017, + "learning_rate": 1.9630686373944738e-05, + "loss": 0.6454, + "step": 9591 + }, + { + "epoch": 1.5658544549202074, + "grad_norm": 1.727154016494751, + "learning_rate": 1.9630600925387894e-05, + "loss": 0.7186, + "step": 9592 + }, + { + "epoch": 1.5660177135627116, + "grad_norm": 1.8517849445343018, + "learning_rate": 1.9630515467133038e-05, + "loss": 0.6316, + "step": 9593 + }, + { + "epoch": 1.566180972205216, + "grad_norm": 1.6774290800094604, + "learning_rate": 1.9630429999180255e-05, + "loss": 0.6176, + "step": 9594 + }, + { + "epoch": 1.5663442308477205, + "grad_norm": 1.7105761766433716, + "learning_rate": 1.9630344521529635e-05, + "loss": 0.5854, + "step": 9595 + }, + { + "epoch": 1.566507489490225, + "grad_norm": 1.564675211906433, + "learning_rate": 1.9630259034181258e-05, + "loss": 0.5798, + "step": 9596 + }, + { + "epoch": 1.5666707481327293, + "grad_norm": 1.781473994255066, + "learning_rate": 1.963017353713521e-05, + "loss": 0.6623, + "step": 9597 + }, + { + "epoch": 1.5668340067752338, + "grad_norm": 1.386099100112915, + "learning_rate": 1.9630088030391584e-05, + "loss": 0.453, + "step": 9598 + }, + { + "epoch": 1.5669972654177382, + "grad_norm": 1.7640938758850098, + "learning_rate": 1.963000251395046e-05, + "loss": 0.6131, + "step": 9599 + }, + { + "epoch": 1.5671605240602424, + "grad_norm": 1.4326421022415161, + "learning_rate": 1.9629916987811924e-05, + "loss": 0.6078, + "step": 9600 + }, + { + "epoch": 1.5673237827027469, + "grad_norm": 1.5772504806518555, + "learning_rate": 1.962983145197607e-05, + "loss": 0.6182, + "step": 9601 + }, + { + "epoch": 1.567487041345251, + "grad_norm": 1.5269213914871216, + "learning_rate": 1.9629745906442973e-05, + "loss": 0.6665, + "step": 9602 + }, + { + "epoch": 1.5676502999877555, + "grad_norm": 1.963954210281372, + "learning_rate": 1.9629660351212725e-05, + "loss": 0.6247, + "step": 9603 + }, + { + "epoch": 1.56781355863026, + "grad_norm": 1.3504971265792847, + "learning_rate": 1.9629574786285413e-05, + "loss": 0.5662, + "step": 9604 + }, + { + "epoch": 1.5679768172727644, + "grad_norm": 1.725431203842163, + "learning_rate": 1.9629489211661122e-05, + "loss": 0.6482, + "step": 9605 + }, + { + "epoch": 1.5681400759152688, + "grad_norm": 1.8823821544647217, + "learning_rate": 1.9629403627339937e-05, + "loss": 0.5387, + "step": 9606 + }, + { + "epoch": 1.5683033345577733, + "grad_norm": 1.6015703678131104, + "learning_rate": 1.9629318033321945e-05, + "loss": 0.5905, + "step": 9607 + }, + { + "epoch": 1.5684665932002777, + "grad_norm": 1.7509411573410034, + "learning_rate": 1.9629232429607233e-05, + "loss": 0.6801, + "step": 9608 + }, + { + "epoch": 1.568629851842782, + "grad_norm": 1.9154456853866577, + "learning_rate": 1.9629146816195887e-05, + "loss": 0.6379, + "step": 9609 + }, + { + "epoch": 1.5687931104852864, + "grad_norm": 1.942124366760254, + "learning_rate": 1.962906119308799e-05, + "loss": 0.8034, + "step": 9610 + }, + { + "epoch": 1.5689563691277906, + "grad_norm": 1.6453465223312378, + "learning_rate": 1.9628975560283634e-05, + "loss": 0.5165, + "step": 9611 + }, + { + "epoch": 1.569119627770295, + "grad_norm": 1.6520541906356812, + "learning_rate": 1.96288899177829e-05, + "loss": 0.6191, + "step": 9612 + }, + { + "epoch": 1.5692828864127994, + "grad_norm": 2.0133602619171143, + "learning_rate": 1.9628804265585878e-05, + "loss": 0.6422, + "step": 9613 + }, + { + "epoch": 1.5694461450553039, + "grad_norm": 1.7317301034927368, + "learning_rate": 1.962871860369265e-05, + "loss": 0.598, + "step": 9614 + }, + { + "epoch": 1.5696094036978083, + "grad_norm": 1.7891112565994263, + "learning_rate": 1.962863293210331e-05, + "loss": 0.6364, + "step": 9615 + }, + { + "epoch": 1.5697726623403128, + "grad_norm": 2.103178024291992, + "learning_rate": 1.9628547250817937e-05, + "loss": 0.7649, + "step": 9616 + }, + { + "epoch": 1.569935920982817, + "grad_norm": 1.7231590747833252, + "learning_rate": 1.9628461559836615e-05, + "loss": 0.6916, + "step": 9617 + }, + { + "epoch": 1.5700991796253214, + "grad_norm": 1.6548823118209839, + "learning_rate": 1.962837585915944e-05, + "loss": 0.6397, + "step": 9618 + }, + { + "epoch": 1.5702624382678259, + "grad_norm": 1.6805055141448975, + "learning_rate": 1.962829014878649e-05, + "loss": 0.6595, + "step": 9619 + }, + { + "epoch": 1.57042569691033, + "grad_norm": 1.789267659187317, + "learning_rate": 1.9628204428717856e-05, + "loss": 0.6939, + "step": 9620 + }, + { + "epoch": 1.5705889555528345, + "grad_norm": 1.7846240997314453, + "learning_rate": 1.9628118698953623e-05, + "loss": 0.7044, + "step": 9621 + }, + { + "epoch": 1.570752214195339, + "grad_norm": 2.000779628753662, + "learning_rate": 1.9628032959493878e-05, + "loss": 0.7943, + "step": 9622 + }, + { + "epoch": 1.5709154728378434, + "grad_norm": 2.066596746444702, + "learning_rate": 1.9627947210338702e-05, + "loss": 0.6147, + "step": 9623 + }, + { + "epoch": 1.5710787314803478, + "grad_norm": 1.521427869796753, + "learning_rate": 1.962786145148819e-05, + "loss": 0.5556, + "step": 9624 + }, + { + "epoch": 1.5712419901228523, + "grad_norm": 1.427954912185669, + "learning_rate": 1.962777568294242e-05, + "loss": 0.5564, + "step": 9625 + }, + { + "epoch": 1.5714052487653565, + "grad_norm": 1.943752408027649, + "learning_rate": 1.9627689904701486e-05, + "loss": 0.8123, + "step": 9626 + }, + { + "epoch": 1.571568507407861, + "grad_norm": 1.705159306526184, + "learning_rate": 1.962760411676547e-05, + "loss": 0.6693, + "step": 9627 + }, + { + "epoch": 1.5717317660503651, + "grad_norm": 1.8894529342651367, + "learning_rate": 1.9627518319134463e-05, + "loss": 0.8278, + "step": 9628 + }, + { + "epoch": 1.5718950246928696, + "grad_norm": 1.8755106925964355, + "learning_rate": 1.962743251180854e-05, + "loss": 0.696, + "step": 9629 + }, + { + "epoch": 1.572058283335374, + "grad_norm": 2.0930750370025635, + "learning_rate": 1.9627346694787798e-05, + "loss": 0.7219, + "step": 9630 + }, + { + "epoch": 1.5722215419778784, + "grad_norm": 1.7161297798156738, + "learning_rate": 1.9627260868072322e-05, + "loss": 0.7818, + "step": 9631 + }, + { + "epoch": 1.5723848006203829, + "grad_norm": 1.7828248739242554, + "learning_rate": 1.96271750316622e-05, + "loss": 0.6071, + "step": 9632 + }, + { + "epoch": 1.5725480592628873, + "grad_norm": 1.4134396314620972, + "learning_rate": 1.962708918555751e-05, + "loss": 0.5648, + "step": 9633 + }, + { + "epoch": 1.5727113179053918, + "grad_norm": 2.270962715148926, + "learning_rate": 1.9627003329758344e-05, + "loss": 0.788, + "step": 9634 + }, + { + "epoch": 1.572874576547896, + "grad_norm": 1.729169249534607, + "learning_rate": 1.962691746426479e-05, + "loss": 0.6177, + "step": 9635 + }, + { + "epoch": 1.5730378351904004, + "grad_norm": 1.5509408712387085, + "learning_rate": 1.9626831589076932e-05, + "loss": 0.611, + "step": 9636 + }, + { + "epoch": 1.5732010938329046, + "grad_norm": 1.3985943794250488, + "learning_rate": 1.9626745704194857e-05, + "loss": 0.56, + "step": 9637 + }, + { + "epoch": 1.573364352475409, + "grad_norm": 1.8105063438415527, + "learning_rate": 1.9626659809618652e-05, + "loss": 0.7991, + "step": 9638 + }, + { + "epoch": 1.5735276111179135, + "grad_norm": 2.213312864303589, + "learning_rate": 1.9626573905348403e-05, + "loss": 0.8037, + "step": 9639 + }, + { + "epoch": 1.573690869760418, + "grad_norm": 1.7250983715057373, + "learning_rate": 1.9626487991384194e-05, + "loss": 0.7364, + "step": 9640 + }, + { + "epoch": 1.5738541284029224, + "grad_norm": 1.6180000305175781, + "learning_rate": 1.962640206772612e-05, + "loss": 0.6066, + "step": 9641 + }, + { + "epoch": 1.5740173870454268, + "grad_norm": 1.8795654773712158, + "learning_rate": 1.9626316134374255e-05, + "loss": 0.9205, + "step": 9642 + }, + { + "epoch": 1.5741806456879313, + "grad_norm": 1.9067094326019287, + "learning_rate": 1.9626230191328697e-05, + "loss": 0.6631, + "step": 9643 + }, + { + "epoch": 1.5743439043304355, + "grad_norm": 1.6158788204193115, + "learning_rate": 1.9626144238589525e-05, + "loss": 0.6429, + "step": 9644 + }, + { + "epoch": 1.57450716297294, + "grad_norm": 1.6274019479751587, + "learning_rate": 1.962605827615683e-05, + "loss": 0.6413, + "step": 9645 + }, + { + "epoch": 1.5746704216154441, + "grad_norm": 1.8255980014801025, + "learning_rate": 1.9625972304030697e-05, + "loss": 0.7866, + "step": 9646 + }, + { + "epoch": 1.5748336802579486, + "grad_norm": 1.7922167778015137, + "learning_rate": 1.962588632221121e-05, + "loss": 0.6896, + "step": 9647 + }, + { + "epoch": 1.574996938900453, + "grad_norm": 1.7598341703414917, + "learning_rate": 1.9625800330698462e-05, + "loss": 0.7013, + "step": 9648 + }, + { + "epoch": 1.5751601975429574, + "grad_norm": 1.2833775281906128, + "learning_rate": 1.9625714329492532e-05, + "loss": 0.4882, + "step": 9649 + }, + { + "epoch": 1.5753234561854619, + "grad_norm": 1.2973188161849976, + "learning_rate": 1.9625628318593514e-05, + "loss": 0.4906, + "step": 9650 + }, + { + "epoch": 1.5754867148279663, + "grad_norm": 1.550643801689148, + "learning_rate": 1.9625542298001487e-05, + "loss": 0.5901, + "step": 9651 + }, + { + "epoch": 1.5756499734704708, + "grad_norm": 1.6381577253341675, + "learning_rate": 1.9625456267716544e-05, + "loss": 0.6771, + "step": 9652 + }, + { + "epoch": 1.575813232112975, + "grad_norm": 1.581642508506775, + "learning_rate": 1.962537022773877e-05, + "loss": 0.5938, + "step": 9653 + }, + { + "epoch": 1.5759764907554794, + "grad_norm": 1.5784766674041748, + "learning_rate": 1.9625284178068246e-05, + "loss": 0.6929, + "step": 9654 + }, + { + "epoch": 1.5761397493979836, + "grad_norm": 1.7521753311157227, + "learning_rate": 1.9625198118705065e-05, + "loss": 0.7275, + "step": 9655 + }, + { + "epoch": 1.576303008040488, + "grad_norm": 1.6148757934570312, + "learning_rate": 1.9625112049649316e-05, + "loss": 0.5798, + "step": 9656 + }, + { + "epoch": 1.5764662666829925, + "grad_norm": 1.9123215675354004, + "learning_rate": 1.9625025970901078e-05, + "loss": 0.7116, + "step": 9657 + }, + { + "epoch": 1.576629525325497, + "grad_norm": 1.487979531288147, + "learning_rate": 1.962493988246044e-05, + "loss": 0.6228, + "step": 9658 + }, + { + "epoch": 1.5767927839680014, + "grad_norm": 1.454133152961731, + "learning_rate": 1.9624853784327495e-05, + "loss": 0.5909, + "step": 9659 + }, + { + "epoch": 1.5769560426105058, + "grad_norm": 1.2248520851135254, + "learning_rate": 1.962476767650232e-05, + "loss": 0.5306, + "step": 9660 + }, + { + "epoch": 1.57711930125301, + "grad_norm": 1.439718246459961, + "learning_rate": 1.962468155898501e-05, + "loss": 0.6166, + "step": 9661 + }, + { + "epoch": 1.5772825598955145, + "grad_norm": 1.550396203994751, + "learning_rate": 1.962459543177565e-05, + "loss": 0.6085, + "step": 9662 + }, + { + "epoch": 1.577445818538019, + "grad_norm": 1.7822028398513794, + "learning_rate": 1.962450929487432e-05, + "loss": 0.609, + "step": 9663 + }, + { + "epoch": 1.5776090771805231, + "grad_norm": 1.931081771850586, + "learning_rate": 1.9624423148281114e-05, + "loss": 0.691, + "step": 9664 + }, + { + "epoch": 1.5777723358230276, + "grad_norm": 1.650681495666504, + "learning_rate": 1.9624336991996117e-05, + "loss": 0.5518, + "step": 9665 + }, + { + "epoch": 1.577935594465532, + "grad_norm": 1.5022908449172974, + "learning_rate": 1.9624250826019413e-05, + "loss": 0.6965, + "step": 9666 + }, + { + "epoch": 1.5780988531080364, + "grad_norm": 1.8011267185211182, + "learning_rate": 1.9624164650351093e-05, + "loss": 0.6192, + "step": 9667 + }, + { + "epoch": 1.5782621117505409, + "grad_norm": 2.0723159313201904, + "learning_rate": 1.962407846499124e-05, + "loss": 0.7164, + "step": 9668 + }, + { + "epoch": 1.5784253703930453, + "grad_norm": 2.1561760902404785, + "learning_rate": 1.9623992269939946e-05, + "loss": 0.9268, + "step": 9669 + }, + { + "epoch": 1.5785886290355495, + "grad_norm": 1.8231765031814575, + "learning_rate": 1.9623906065197288e-05, + "loss": 0.745, + "step": 9670 + }, + { + "epoch": 1.578751887678054, + "grad_norm": 1.982771396636963, + "learning_rate": 1.9623819850763364e-05, + "loss": 0.8288, + "step": 9671 + }, + { + "epoch": 1.5789151463205582, + "grad_norm": 1.9587383270263672, + "learning_rate": 1.9623733626638258e-05, + "loss": 0.755, + "step": 9672 + }, + { + "epoch": 1.5790784049630626, + "grad_norm": 1.9257392883300781, + "learning_rate": 1.962364739282205e-05, + "loss": 0.6861, + "step": 9673 + }, + { + "epoch": 1.579241663605567, + "grad_norm": 1.9680873155593872, + "learning_rate": 1.9623561149314832e-05, + "loss": 0.687, + "step": 9674 + }, + { + "epoch": 1.5794049222480715, + "grad_norm": 1.7040352821350098, + "learning_rate": 1.9623474896116696e-05, + "loss": 0.641, + "step": 9675 + }, + { + "epoch": 1.579568180890576, + "grad_norm": 2.0922768115997314, + "learning_rate": 1.9623388633227716e-05, + "loss": 0.6539, + "step": 9676 + }, + { + "epoch": 1.5797314395330804, + "grad_norm": 1.504915714263916, + "learning_rate": 1.9623302360647992e-05, + "loss": 0.6375, + "step": 9677 + }, + { + "epoch": 1.5798946981755848, + "grad_norm": 2.211939573287964, + "learning_rate": 1.96232160783776e-05, + "loss": 0.8916, + "step": 9678 + }, + { + "epoch": 1.580057956818089, + "grad_norm": 1.6597124338150024, + "learning_rate": 1.9623129786416635e-05, + "loss": 0.7033, + "step": 9679 + }, + { + "epoch": 1.5802212154605935, + "grad_norm": 1.8660601377487183, + "learning_rate": 1.962304348476518e-05, + "loss": 0.7886, + "step": 9680 + }, + { + "epoch": 1.5803844741030977, + "grad_norm": 2.0038650035858154, + "learning_rate": 1.9622957173423325e-05, + "loss": 0.751, + "step": 9681 + }, + { + "epoch": 1.5805477327456021, + "grad_norm": 1.450705885887146, + "learning_rate": 1.962287085239115e-05, + "loss": 0.5394, + "step": 9682 + }, + { + "epoch": 1.5807109913881066, + "grad_norm": 1.3762441873550415, + "learning_rate": 1.962278452166875e-05, + "loss": 0.5902, + "step": 9683 + }, + { + "epoch": 1.580874250030611, + "grad_norm": 1.8528920412063599, + "learning_rate": 1.9622698181256207e-05, + "loss": 0.7836, + "step": 9684 + }, + { + "epoch": 1.5810375086731154, + "grad_norm": 1.8280096054077148, + "learning_rate": 1.962261183115361e-05, + "loss": 0.7014, + "step": 9685 + }, + { + "epoch": 1.5812007673156199, + "grad_norm": 1.8464151620864868, + "learning_rate": 1.962252547136105e-05, + "loss": 0.6491, + "step": 9686 + }, + { + "epoch": 1.5813640259581243, + "grad_norm": 1.6274847984313965, + "learning_rate": 1.9622439101878603e-05, + "loss": 0.7113, + "step": 9687 + }, + { + "epoch": 1.5815272846006285, + "grad_norm": 1.425610899925232, + "learning_rate": 1.9622352722706365e-05, + "loss": 0.5823, + "step": 9688 + }, + { + "epoch": 1.581690543243133, + "grad_norm": 1.72265625, + "learning_rate": 1.962226633384442e-05, + "loss": 0.6272, + "step": 9689 + }, + { + "epoch": 1.5818538018856372, + "grad_norm": 1.851369023323059, + "learning_rate": 1.9622179935292855e-05, + "loss": 0.6878, + "step": 9690 + }, + { + "epoch": 1.5820170605281416, + "grad_norm": 1.6060316562652588, + "learning_rate": 1.9622093527051758e-05, + "loss": 0.7342, + "step": 9691 + }, + { + "epoch": 1.582180319170646, + "grad_norm": 1.7634220123291016, + "learning_rate": 1.9622007109121214e-05, + "loss": 0.6128, + "step": 9692 + }, + { + "epoch": 1.5823435778131505, + "grad_norm": 1.73552405834198, + "learning_rate": 1.9621920681501314e-05, + "loss": 0.6906, + "step": 9693 + }, + { + "epoch": 1.582506836455655, + "grad_norm": 1.418906807899475, + "learning_rate": 1.962183424419214e-05, + "loss": 0.5313, + "step": 9694 + }, + { + "epoch": 1.5826700950981594, + "grad_norm": 1.5142323970794678, + "learning_rate": 1.9621747797193784e-05, + "loss": 0.5362, + "step": 9695 + }, + { + "epoch": 1.5828333537406638, + "grad_norm": 1.5299216508865356, + "learning_rate": 1.962166134050633e-05, + "loss": 0.6117, + "step": 9696 + }, + { + "epoch": 1.582996612383168, + "grad_norm": 1.7742419242858887, + "learning_rate": 1.962157487412986e-05, + "loss": 0.7099, + "step": 9697 + }, + { + "epoch": 1.5831598710256725, + "grad_norm": 2.0223584175109863, + "learning_rate": 1.962148839806447e-05, + "loss": 0.8548, + "step": 9698 + }, + { + "epoch": 1.5833231296681767, + "grad_norm": 1.4292356967926025, + "learning_rate": 1.9621401912310247e-05, + "loss": 0.5669, + "step": 9699 + }, + { + "epoch": 1.5834863883106811, + "grad_norm": 1.6820064783096313, + "learning_rate": 1.9621315416867274e-05, + "loss": 0.6238, + "step": 9700 + }, + { + "epoch": 1.5836496469531856, + "grad_norm": 1.8175023794174194, + "learning_rate": 1.9621228911735637e-05, + "loss": 0.6982, + "step": 9701 + }, + { + "epoch": 1.58381290559569, + "grad_norm": 1.7978641986846924, + "learning_rate": 1.9621142396915423e-05, + "loss": 0.7152, + "step": 9702 + }, + { + "epoch": 1.5839761642381944, + "grad_norm": 1.5713573694229126, + "learning_rate": 1.962105587240673e-05, + "loss": 0.6519, + "step": 9703 + }, + { + "epoch": 1.5841394228806989, + "grad_norm": 1.7591252326965332, + "learning_rate": 1.9620969338209626e-05, + "loss": 0.7534, + "step": 9704 + }, + { + "epoch": 1.584302681523203, + "grad_norm": 1.8434993028640747, + "learning_rate": 1.9620882794324213e-05, + "loss": 0.6487, + "step": 9705 + }, + { + "epoch": 1.5844659401657075, + "grad_norm": 1.859194278717041, + "learning_rate": 1.962079624075057e-05, + "loss": 0.7473, + "step": 9706 + }, + { + "epoch": 1.584629198808212, + "grad_norm": 1.8070935010910034, + "learning_rate": 1.9620709677488794e-05, + "loss": 0.701, + "step": 9707 + }, + { + "epoch": 1.5847924574507162, + "grad_norm": 1.6189990043640137, + "learning_rate": 1.9620623104538963e-05, + "loss": 0.7371, + "step": 9708 + }, + { + "epoch": 1.5849557160932206, + "grad_norm": 1.8244355916976929, + "learning_rate": 1.9620536521901168e-05, + "loss": 0.6861, + "step": 9709 + }, + { + "epoch": 1.585118974735725, + "grad_norm": 2.8467419147491455, + "learning_rate": 1.9620449929575495e-05, + "loss": 0.8209, + "step": 9710 + }, + { + "epoch": 1.5852822333782295, + "grad_norm": 1.955719232559204, + "learning_rate": 1.9620363327562028e-05, + "loss": 0.6263, + "step": 9711 + }, + { + "epoch": 1.585445492020734, + "grad_norm": 2.0077857971191406, + "learning_rate": 1.962027671586086e-05, + "loss": 0.6842, + "step": 9712 + }, + { + "epoch": 1.5856087506632384, + "grad_norm": 1.3232954740524292, + "learning_rate": 1.9620190094472077e-05, + "loss": 0.5559, + "step": 9713 + }, + { + "epoch": 1.5857720093057426, + "grad_norm": 1.67738938331604, + "learning_rate": 1.9620103463395764e-05, + "loss": 0.7409, + "step": 9714 + }, + { + "epoch": 1.585935267948247, + "grad_norm": 1.9344784021377563, + "learning_rate": 1.962001682263201e-05, + "loss": 0.8101, + "step": 9715 + }, + { + "epoch": 1.5860985265907512, + "grad_norm": 1.799301266670227, + "learning_rate": 1.96199301721809e-05, + "loss": 0.6269, + "step": 9716 + }, + { + "epoch": 1.5862617852332557, + "grad_norm": 1.8337783813476562, + "learning_rate": 1.9619843512042525e-05, + "loss": 0.7435, + "step": 9717 + }, + { + "epoch": 1.58642504387576, + "grad_norm": 1.773411750793457, + "learning_rate": 1.961975684221697e-05, + "loss": 0.825, + "step": 9718 + }, + { + "epoch": 1.5865883025182645, + "grad_norm": 1.6811023950576782, + "learning_rate": 1.9619670162704322e-05, + "loss": 0.6637, + "step": 9719 + }, + { + "epoch": 1.586751561160769, + "grad_norm": 2.137704610824585, + "learning_rate": 1.961958347350467e-05, + "loss": 0.6936, + "step": 9720 + }, + { + "epoch": 1.5869148198032734, + "grad_norm": 1.6904270648956299, + "learning_rate": 1.9619496774618098e-05, + "loss": 0.6042, + "step": 9721 + }, + { + "epoch": 1.5870780784457779, + "grad_norm": 1.9231176376342773, + "learning_rate": 1.961941006604469e-05, + "loss": 0.7972, + "step": 9722 + }, + { + "epoch": 1.587241337088282, + "grad_norm": 1.6723405122756958, + "learning_rate": 1.961932334778455e-05, + "loss": 0.6414, + "step": 9723 + }, + { + "epoch": 1.5874045957307865, + "grad_norm": 1.6982735395431519, + "learning_rate": 1.9619236619837747e-05, + "loss": 0.6742, + "step": 9724 + }, + { + "epoch": 1.5875678543732907, + "grad_norm": 1.5371806621551514, + "learning_rate": 1.9619149882204375e-05, + "loss": 0.6047, + "step": 9725 + }, + { + "epoch": 1.5877311130157952, + "grad_norm": 1.7299354076385498, + "learning_rate": 1.961906313488452e-05, + "loss": 0.66, + "step": 9726 + }, + { + "epoch": 1.5878943716582996, + "grad_norm": 1.7391077280044556, + "learning_rate": 1.9618976377878275e-05, + "loss": 0.669, + "step": 9727 + }, + { + "epoch": 1.588057630300804, + "grad_norm": 1.5976592302322388, + "learning_rate": 1.9618889611185726e-05, + "loss": 0.661, + "step": 9728 + }, + { + "epoch": 1.5882208889433085, + "grad_norm": 1.6944645643234253, + "learning_rate": 1.9618802834806953e-05, + "loss": 0.7033, + "step": 9729 + }, + { + "epoch": 1.588384147585813, + "grad_norm": 1.7903512716293335, + "learning_rate": 1.961871604874205e-05, + "loss": 0.7511, + "step": 9730 + }, + { + "epoch": 1.5885474062283174, + "grad_norm": 1.5805833339691162, + "learning_rate": 1.96186292529911e-05, + "loss": 0.6207, + "step": 9731 + }, + { + "epoch": 1.5887106648708216, + "grad_norm": 2.1036124229431152, + "learning_rate": 1.9618542447554195e-05, + "loss": 0.918, + "step": 9732 + }, + { + "epoch": 1.588873923513326, + "grad_norm": 1.789934515953064, + "learning_rate": 1.961845563243142e-05, + "loss": 0.6784, + "step": 9733 + }, + { + "epoch": 1.5890371821558302, + "grad_norm": 1.4626398086547852, + "learning_rate": 1.9618368807622863e-05, + "loss": 0.7582, + "step": 9734 + }, + { + "epoch": 1.5892004407983347, + "grad_norm": 2.245030164718628, + "learning_rate": 1.961828197312861e-05, + "loss": 0.7274, + "step": 9735 + }, + { + "epoch": 1.589363699440839, + "grad_norm": 1.7031570672988892, + "learning_rate": 1.9618195128948753e-05, + "loss": 0.668, + "step": 9736 + }, + { + "epoch": 1.5895269580833435, + "grad_norm": 1.381311297416687, + "learning_rate": 1.961810827508337e-05, + "loss": 0.6, + "step": 9737 + }, + { + "epoch": 1.589690216725848, + "grad_norm": 1.9340081214904785, + "learning_rate": 1.9618021411532558e-05, + "loss": 0.8211, + "step": 9738 + }, + { + "epoch": 1.5898534753683524, + "grad_norm": 1.357742428779602, + "learning_rate": 1.9617934538296404e-05, + "loss": 0.6042, + "step": 9739 + }, + { + "epoch": 1.5900167340108569, + "grad_norm": 1.8038944005966187, + "learning_rate": 1.9617847655374988e-05, + "loss": 0.7426, + "step": 9740 + }, + { + "epoch": 1.590179992653361, + "grad_norm": 1.5917192697525024, + "learning_rate": 1.9617760762768406e-05, + "loss": 0.7127, + "step": 9741 + }, + { + "epoch": 1.5903432512958655, + "grad_norm": 1.8327029943466187, + "learning_rate": 1.9617673860476735e-05, + "loss": 0.6998, + "step": 9742 + }, + { + "epoch": 1.5905065099383697, + "grad_norm": 1.6016114950180054, + "learning_rate": 1.9617586948500076e-05, + "loss": 0.7127, + "step": 9743 + }, + { + "epoch": 1.5906697685808742, + "grad_norm": 1.760743498802185, + "learning_rate": 1.9617500026838506e-05, + "loss": 0.8217, + "step": 9744 + }, + { + "epoch": 1.5908330272233786, + "grad_norm": 1.8219060897827148, + "learning_rate": 1.9617413095492114e-05, + "loss": 0.7175, + "step": 9745 + }, + { + "epoch": 1.590996285865883, + "grad_norm": 1.7804641723632812, + "learning_rate": 1.9617326154460992e-05, + "loss": 0.5986, + "step": 9746 + }, + { + "epoch": 1.5911595445083875, + "grad_norm": 1.873049020767212, + "learning_rate": 1.9617239203745226e-05, + "loss": 0.7798, + "step": 9747 + }, + { + "epoch": 1.591322803150892, + "grad_norm": 1.6832165718078613, + "learning_rate": 1.9617152243344903e-05, + "loss": 0.6572, + "step": 9748 + }, + { + "epoch": 1.5914860617933961, + "grad_norm": 1.6698039770126343, + "learning_rate": 1.9617065273260106e-05, + "loss": 0.6375, + "step": 9749 + }, + { + "epoch": 1.5916493204359006, + "grad_norm": 1.550872564315796, + "learning_rate": 1.961697829349093e-05, + "loss": 0.5906, + "step": 9750 + }, + { + "epoch": 1.591812579078405, + "grad_norm": 1.819465160369873, + "learning_rate": 1.961689130403746e-05, + "loss": 0.6161, + "step": 9751 + }, + { + "epoch": 1.5919758377209092, + "grad_norm": 1.714302659034729, + "learning_rate": 1.9616804304899785e-05, + "loss": 0.6474, + "step": 9752 + }, + { + "epoch": 1.5921390963634137, + "grad_norm": 1.7182247638702393, + "learning_rate": 1.9616717296077986e-05, + "loss": 0.7266, + "step": 9753 + }, + { + "epoch": 1.592302355005918, + "grad_norm": 1.7214372158050537, + "learning_rate": 1.9616630277572158e-05, + "loss": 0.6537, + "step": 9754 + }, + { + "epoch": 1.5924656136484225, + "grad_norm": 2.0256571769714355, + "learning_rate": 1.9616543249382385e-05, + "loss": 0.7473, + "step": 9755 + }, + { + "epoch": 1.592628872290927, + "grad_norm": 1.7589939832687378, + "learning_rate": 1.9616456211508756e-05, + "loss": 0.7651, + "step": 9756 + }, + { + "epoch": 1.5927921309334314, + "grad_norm": 1.6143070459365845, + "learning_rate": 1.9616369163951354e-05, + "loss": 0.5442, + "step": 9757 + }, + { + "epoch": 1.5929553895759356, + "grad_norm": 2.2633056640625, + "learning_rate": 1.9616282106710276e-05, + "loss": 0.8147, + "step": 9758 + }, + { + "epoch": 1.59311864821844, + "grad_norm": 1.8863481283187866, + "learning_rate": 1.96161950397856e-05, + "loss": 0.5512, + "step": 9759 + }, + { + "epoch": 1.5932819068609443, + "grad_norm": 1.6044455766677856, + "learning_rate": 1.9616107963177423e-05, + "loss": 0.6513, + "step": 9760 + }, + { + "epoch": 1.5934451655034487, + "grad_norm": 1.6253830194473267, + "learning_rate": 1.9616020876885825e-05, + "loss": 0.6259, + "step": 9761 + }, + { + "epoch": 1.5936084241459532, + "grad_norm": 1.7241250276565552, + "learning_rate": 1.96159337809109e-05, + "loss": 0.5976, + "step": 9762 + }, + { + "epoch": 1.5937716827884576, + "grad_norm": 1.7269858121871948, + "learning_rate": 1.9615846675252726e-05, + "loss": 0.7293, + "step": 9763 + }, + { + "epoch": 1.593934941430962, + "grad_norm": 1.744154453277588, + "learning_rate": 1.96157595599114e-05, + "loss": 0.6749, + "step": 9764 + }, + { + "epoch": 1.5940982000734665, + "grad_norm": 1.6006718873977661, + "learning_rate": 1.961567243488701e-05, + "loss": 0.6577, + "step": 9765 + }, + { + "epoch": 1.594261458715971, + "grad_norm": 1.3282138109207153, + "learning_rate": 1.9615585300179638e-05, + "loss": 0.5595, + "step": 9766 + }, + { + "epoch": 1.5944247173584751, + "grad_norm": 1.617884874343872, + "learning_rate": 1.9615498155789373e-05, + "loss": 0.6207, + "step": 9767 + }, + { + "epoch": 1.5945879760009796, + "grad_norm": 1.6365516185760498, + "learning_rate": 1.9615411001716308e-05, + "loss": 0.6657, + "step": 9768 + }, + { + "epoch": 1.5947512346434838, + "grad_norm": 1.6910046339035034, + "learning_rate": 1.961532383796052e-05, + "loss": 0.6715, + "step": 9769 + }, + { + "epoch": 1.5949144932859882, + "grad_norm": 1.6335252523422241, + "learning_rate": 1.9615236664522108e-05, + "loss": 0.5335, + "step": 9770 + }, + { + "epoch": 1.5950777519284927, + "grad_norm": 1.5872143507003784, + "learning_rate": 1.9615149481401152e-05, + "loss": 0.6462, + "step": 9771 + }, + { + "epoch": 1.595241010570997, + "grad_norm": 1.7322975397109985, + "learning_rate": 1.9615062288597746e-05, + "loss": 0.6994, + "step": 9772 + }, + { + "epoch": 1.5954042692135015, + "grad_norm": 1.4623985290527344, + "learning_rate": 1.9614975086111974e-05, + "loss": 0.6346, + "step": 9773 + }, + { + "epoch": 1.595567527856006, + "grad_norm": 1.7291159629821777, + "learning_rate": 1.9614887873943928e-05, + "loss": 0.662, + "step": 9774 + }, + { + "epoch": 1.5957307864985104, + "grad_norm": 1.6546647548675537, + "learning_rate": 1.9614800652093685e-05, + "loss": 0.6841, + "step": 9775 + }, + { + "epoch": 1.5958940451410146, + "grad_norm": 2.449777603149414, + "learning_rate": 1.9614713420561348e-05, + "loss": 1.1406, + "step": 9776 + }, + { + "epoch": 1.596057303783519, + "grad_norm": 1.750852108001709, + "learning_rate": 1.961462617934699e-05, + "loss": 0.6832, + "step": 9777 + }, + { + "epoch": 1.5962205624260233, + "grad_norm": 1.6234248876571655, + "learning_rate": 1.961453892845071e-05, + "loss": 0.5867, + "step": 9778 + }, + { + "epoch": 1.5963838210685277, + "grad_norm": 2.2385153770446777, + "learning_rate": 1.9614451667872593e-05, + "loss": 0.6472, + "step": 9779 + }, + { + "epoch": 1.5965470797110322, + "grad_norm": 1.9185349941253662, + "learning_rate": 1.9614364397612723e-05, + "loss": 0.765, + "step": 9780 + }, + { + "epoch": 1.5967103383535366, + "grad_norm": 2.019627571105957, + "learning_rate": 1.9614277117671193e-05, + "loss": 0.7736, + "step": 9781 + }, + { + "epoch": 1.596873596996041, + "grad_norm": 1.8009591102600098, + "learning_rate": 1.9614189828048085e-05, + "loss": 0.6769, + "step": 9782 + }, + { + "epoch": 1.5970368556385455, + "grad_norm": 1.7499608993530273, + "learning_rate": 1.961410252874349e-05, + "loss": 0.6106, + "step": 9783 + }, + { + "epoch": 1.59720011428105, + "grad_norm": 1.6766297817230225, + "learning_rate": 1.96140152197575e-05, + "loss": 0.6041, + "step": 9784 + }, + { + "epoch": 1.5973633729235541, + "grad_norm": 1.8669971227645874, + "learning_rate": 1.9613927901090196e-05, + "loss": 0.8988, + "step": 9785 + }, + { + "epoch": 1.5975266315660586, + "grad_norm": 1.4861773252487183, + "learning_rate": 1.9613840572741674e-05, + "loss": 0.5519, + "step": 9786 + }, + { + "epoch": 1.5976898902085628, + "grad_norm": 1.947023868560791, + "learning_rate": 1.9613753234712013e-05, + "loss": 0.8491, + "step": 9787 + }, + { + "epoch": 1.5978531488510672, + "grad_norm": 1.5674654245376587, + "learning_rate": 1.9613665887001307e-05, + "loss": 0.6673, + "step": 9788 + }, + { + "epoch": 1.5980164074935717, + "grad_norm": 1.5393977165222168, + "learning_rate": 1.9613578529609642e-05, + "loss": 0.5741, + "step": 9789 + }, + { + "epoch": 1.598179666136076, + "grad_norm": 1.576027512550354, + "learning_rate": 1.9613491162537105e-05, + "loss": 0.7173, + "step": 9790 + }, + { + "epoch": 1.5983429247785805, + "grad_norm": 1.6827441453933716, + "learning_rate": 1.9613403785783784e-05, + "loss": 0.7371, + "step": 9791 + }, + { + "epoch": 1.598506183421085, + "grad_norm": 1.2762179374694824, + "learning_rate": 1.961331639934977e-05, + "loss": 0.4867, + "step": 9792 + }, + { + "epoch": 1.5986694420635894, + "grad_norm": 1.607686996459961, + "learning_rate": 1.9613229003235147e-05, + "loss": 0.6327, + "step": 9793 + }, + { + "epoch": 1.5988327007060936, + "grad_norm": 1.6687633991241455, + "learning_rate": 1.9613141597440008e-05, + "loss": 0.6449, + "step": 9794 + }, + { + "epoch": 1.598995959348598, + "grad_norm": 1.7764214277267456, + "learning_rate": 1.9613054181964433e-05, + "loss": 0.7078, + "step": 9795 + }, + { + "epoch": 1.5991592179911023, + "grad_norm": 1.3742303848266602, + "learning_rate": 1.961296675680852e-05, + "loss": 0.6207, + "step": 9796 + }, + { + "epoch": 1.5993224766336067, + "grad_norm": 2.047309398651123, + "learning_rate": 1.961287932197235e-05, + "loss": 0.7094, + "step": 9797 + }, + { + "epoch": 1.5994857352761112, + "grad_norm": 1.6702216863632202, + "learning_rate": 1.961279187745601e-05, + "loss": 0.6823, + "step": 9798 + }, + { + "epoch": 1.5996489939186156, + "grad_norm": 1.7494348287582397, + "learning_rate": 1.9612704423259596e-05, + "loss": 0.7207, + "step": 9799 + }, + { + "epoch": 1.59981225256112, + "grad_norm": 1.8203837871551514, + "learning_rate": 1.961261695938319e-05, + "loss": 0.6547, + "step": 9800 + }, + { + "epoch": 1.5999755112036245, + "grad_norm": 1.4315557479858398, + "learning_rate": 1.9612529485826882e-05, + "loss": 0.5449, + "step": 9801 + }, + { + "epoch": 1.6001387698461287, + "grad_norm": 1.6292190551757812, + "learning_rate": 1.9612442002590756e-05, + "loss": 0.6805, + "step": 9802 + }, + { + "epoch": 1.6003020284886331, + "grad_norm": 1.9793721437454224, + "learning_rate": 1.9612354509674906e-05, + "loss": 0.908, + "step": 9803 + }, + { + "epoch": 1.6004652871311376, + "grad_norm": 1.7853200435638428, + "learning_rate": 1.961226700707942e-05, + "loss": 0.723, + "step": 9804 + }, + { + "epoch": 1.6006285457736418, + "grad_norm": 1.6329513788223267, + "learning_rate": 1.9612179494804377e-05, + "loss": 0.6071, + "step": 9805 + }, + { + "epoch": 1.6007918044161462, + "grad_norm": 1.4837390184402466, + "learning_rate": 1.9612091972849876e-05, + "loss": 0.6296, + "step": 9806 + }, + { + "epoch": 1.6009550630586507, + "grad_norm": 1.8138179779052734, + "learning_rate": 1.9612004441216e-05, + "loss": 0.5592, + "step": 9807 + }, + { + "epoch": 1.601118321701155, + "grad_norm": 1.7431972026824951, + "learning_rate": 1.961191689990284e-05, + "loss": 0.6708, + "step": 9808 + }, + { + "epoch": 1.6012815803436595, + "grad_norm": 1.8263843059539795, + "learning_rate": 1.961182934891048e-05, + "loss": 0.7488, + "step": 9809 + }, + { + "epoch": 1.601444838986164, + "grad_norm": 1.8995561599731445, + "learning_rate": 1.961174178823901e-05, + "loss": 0.713, + "step": 9810 + }, + { + "epoch": 1.6016080976286682, + "grad_norm": 1.5727001428604126, + "learning_rate": 1.961165421788852e-05, + "loss": 0.5515, + "step": 9811 + }, + { + "epoch": 1.6017713562711726, + "grad_norm": 1.6766053438186646, + "learning_rate": 1.96115666378591e-05, + "loss": 0.5497, + "step": 9812 + }, + { + "epoch": 1.6019346149136768, + "grad_norm": 1.4809424877166748, + "learning_rate": 1.961147904815083e-05, + "loss": 0.548, + "step": 9813 + }, + { + "epoch": 1.6020978735561813, + "grad_norm": 1.7999751567840576, + "learning_rate": 1.9611391448763804e-05, + "loss": 0.6371, + "step": 9814 + }, + { + "epoch": 1.6022611321986857, + "grad_norm": 1.8068703413009644, + "learning_rate": 1.961130383969811e-05, + "loss": 0.7006, + "step": 9815 + }, + { + "epoch": 1.6024243908411901, + "grad_norm": 1.6693416833877563, + "learning_rate": 1.9611216220953833e-05, + "loss": 0.6286, + "step": 9816 + }, + { + "epoch": 1.6025876494836946, + "grad_norm": 2.175062417984009, + "learning_rate": 1.961112859253107e-05, + "loss": 0.7422, + "step": 9817 + }, + { + "epoch": 1.602750908126199, + "grad_norm": 2.143648862838745, + "learning_rate": 1.96110409544299e-05, + "loss": 0.7869, + "step": 9818 + }, + { + "epoch": 1.6029141667687035, + "grad_norm": 1.748962640762329, + "learning_rate": 1.961095330665041e-05, + "loss": 0.6511, + "step": 9819 + }, + { + "epoch": 1.6030774254112077, + "grad_norm": 1.6824527978897095, + "learning_rate": 1.9610865649192695e-05, + "loss": 0.5978, + "step": 9820 + }, + { + "epoch": 1.6032406840537121, + "grad_norm": 1.792402744293213, + "learning_rate": 1.9610777982056842e-05, + "loss": 0.7499, + "step": 9821 + }, + { + "epoch": 1.6034039426962163, + "grad_norm": 1.67743718624115, + "learning_rate": 1.961069030524294e-05, + "loss": 0.6277, + "step": 9822 + }, + { + "epoch": 1.6035672013387208, + "grad_norm": 2.0172066688537598, + "learning_rate": 1.9610602618751073e-05, + "loss": 0.6842, + "step": 9823 + }, + { + "epoch": 1.6037304599812252, + "grad_norm": 2.0673320293426514, + "learning_rate": 1.9610514922581333e-05, + "loss": 0.7876, + "step": 9824 + }, + { + "epoch": 1.6038937186237296, + "grad_norm": 1.6833585500717163, + "learning_rate": 1.9610427216733808e-05, + "loss": 0.6936, + "step": 9825 + }, + { + "epoch": 1.604056977266234, + "grad_norm": 1.5806291103363037, + "learning_rate": 1.9610339501208583e-05, + "loss": 0.5307, + "step": 9826 + }, + { + "epoch": 1.6042202359087385, + "grad_norm": 1.6642920970916748, + "learning_rate": 1.961025177600575e-05, + "loss": 0.6086, + "step": 9827 + }, + { + "epoch": 1.604383494551243, + "grad_norm": 2.024425983428955, + "learning_rate": 1.9610164041125393e-05, + "loss": 0.7676, + "step": 9828 + }, + { + "epoch": 1.6045467531937472, + "grad_norm": 1.5578705072402954, + "learning_rate": 1.9610076296567605e-05, + "loss": 0.6732, + "step": 9829 + }, + { + "epoch": 1.6047100118362516, + "grad_norm": 1.6982609033584595, + "learning_rate": 1.9609988542332473e-05, + "loss": 0.6038, + "step": 9830 + }, + { + "epoch": 1.6048732704787558, + "grad_norm": 1.5754255056381226, + "learning_rate": 1.9609900778420087e-05, + "loss": 0.6385, + "step": 9831 + }, + { + "epoch": 1.6050365291212603, + "grad_norm": 1.5281360149383545, + "learning_rate": 1.9609813004830533e-05, + "loss": 0.7216, + "step": 9832 + }, + { + "epoch": 1.6051997877637647, + "grad_norm": 1.5441087484359741, + "learning_rate": 1.9609725221563898e-05, + "loss": 0.5731, + "step": 9833 + }, + { + "epoch": 1.6053630464062691, + "grad_norm": 1.8557145595550537, + "learning_rate": 1.960963742862027e-05, + "loss": 0.6719, + "step": 9834 + }, + { + "epoch": 1.6055263050487736, + "grad_norm": 1.631670355796814, + "learning_rate": 1.9609549625999747e-05, + "loss": 0.6293, + "step": 9835 + }, + { + "epoch": 1.605689563691278, + "grad_norm": 1.6434465646743774, + "learning_rate": 1.9609461813702407e-05, + "loss": 0.5957, + "step": 9836 + }, + { + "epoch": 1.6058528223337825, + "grad_norm": 1.421085000038147, + "learning_rate": 1.9609373991728338e-05, + "loss": 0.5742, + "step": 9837 + }, + { + "epoch": 1.6060160809762867, + "grad_norm": 1.6619635820388794, + "learning_rate": 1.9609286160077633e-05, + "loss": 0.7523, + "step": 9838 + }, + { + "epoch": 1.6061793396187911, + "grad_norm": 1.789896011352539, + "learning_rate": 1.9609198318750383e-05, + "loss": 0.7153, + "step": 9839 + }, + { + "epoch": 1.6063425982612953, + "grad_norm": 1.8744020462036133, + "learning_rate": 1.960911046774667e-05, + "loss": 0.7169, + "step": 9840 + }, + { + "epoch": 1.6065058569037998, + "grad_norm": 1.7132737636566162, + "learning_rate": 1.9609022607066587e-05, + "loss": 0.6895, + "step": 9841 + }, + { + "epoch": 1.6066691155463042, + "grad_norm": 1.7173854112625122, + "learning_rate": 1.9608934736710217e-05, + "loss": 0.7473, + "step": 9842 + }, + { + "epoch": 1.6068323741888086, + "grad_norm": 1.2728602886199951, + "learning_rate": 1.9608846856677654e-05, + "loss": 0.4593, + "step": 9843 + }, + { + "epoch": 1.606995632831313, + "grad_norm": 1.7314422130584717, + "learning_rate": 1.9608758966968987e-05, + "loss": 0.6712, + "step": 9844 + }, + { + "epoch": 1.6071588914738175, + "grad_norm": 1.923888087272644, + "learning_rate": 1.9608671067584303e-05, + "loss": 0.8787, + "step": 9845 + }, + { + "epoch": 1.6073221501163217, + "grad_norm": 2.0452096462249756, + "learning_rate": 1.9608583158523687e-05, + "loss": 0.6715, + "step": 9846 + }, + { + "epoch": 1.6074854087588262, + "grad_norm": 1.6187763214111328, + "learning_rate": 1.9608495239787228e-05, + "loss": 0.6473, + "step": 9847 + }, + { + "epoch": 1.6076486674013306, + "grad_norm": 1.8139688968658447, + "learning_rate": 1.9608407311375023e-05, + "loss": 0.776, + "step": 9848 + }, + { + "epoch": 1.6078119260438348, + "grad_norm": 1.7220070362091064, + "learning_rate": 1.960831937328715e-05, + "loss": 0.654, + "step": 9849 + }, + { + "epoch": 1.6079751846863393, + "grad_norm": 1.7548117637634277, + "learning_rate": 1.9608231425523702e-05, + "loss": 0.6715, + "step": 9850 + }, + { + "epoch": 1.6081384433288437, + "grad_norm": 1.5513639450073242, + "learning_rate": 1.960814346808477e-05, + "loss": 0.6412, + "step": 9851 + }, + { + "epoch": 1.6083017019713481, + "grad_norm": 1.6692925691604614, + "learning_rate": 1.9608055500970437e-05, + "loss": 0.6454, + "step": 9852 + }, + { + "epoch": 1.6084649606138526, + "grad_norm": 1.9834835529327393, + "learning_rate": 1.96079675241808e-05, + "loss": 0.8211, + "step": 9853 + }, + { + "epoch": 1.608628219256357, + "grad_norm": 1.7180918455123901, + "learning_rate": 1.9607879537715937e-05, + "loss": 0.7711, + "step": 9854 + }, + { + "epoch": 1.6087914778988612, + "grad_norm": 1.7019182443618774, + "learning_rate": 1.9607791541575944e-05, + "loss": 0.7384, + "step": 9855 + }, + { + "epoch": 1.6089547365413657, + "grad_norm": 1.4341639280319214, + "learning_rate": 1.9607703535760906e-05, + "loss": 0.6709, + "step": 9856 + }, + { + "epoch": 1.6091179951838699, + "grad_norm": 1.759806513786316, + "learning_rate": 1.9607615520270916e-05, + "loss": 0.6351, + "step": 9857 + }, + { + "epoch": 1.6092812538263743, + "grad_norm": 1.6097996234893799, + "learning_rate": 1.9607527495106057e-05, + "loss": 0.6205, + "step": 9858 + }, + { + "epoch": 1.6094445124688788, + "grad_norm": 1.626193881034851, + "learning_rate": 1.960743946026642e-05, + "loss": 0.7315, + "step": 9859 + }, + { + "epoch": 1.6096077711113832, + "grad_norm": 1.7317359447479248, + "learning_rate": 1.9607351415752096e-05, + "loss": 0.7009, + "step": 9860 + }, + { + "epoch": 1.6097710297538876, + "grad_norm": 1.506295084953308, + "learning_rate": 1.960726336156317e-05, + "loss": 0.5231, + "step": 9861 + }, + { + "epoch": 1.609934288396392, + "grad_norm": 2.4288222789764404, + "learning_rate": 1.9607175297699734e-05, + "loss": 0.882, + "step": 9862 + }, + { + "epoch": 1.6100975470388965, + "grad_norm": 1.2939797639846802, + "learning_rate": 1.9607087224161874e-05, + "loss": 0.5435, + "step": 9863 + }, + { + "epoch": 1.6102608056814007, + "grad_norm": 1.8660036325454712, + "learning_rate": 1.960699914094968e-05, + "loss": 0.8991, + "step": 9864 + }, + { + "epoch": 1.6104240643239052, + "grad_norm": 1.5615251064300537, + "learning_rate": 1.960691104806324e-05, + "loss": 0.6336, + "step": 9865 + }, + { + "epoch": 1.6105873229664094, + "grad_norm": 1.701054334640503, + "learning_rate": 1.9606822945502642e-05, + "loss": 0.8322, + "step": 9866 + }, + { + "epoch": 1.6107505816089138, + "grad_norm": 1.5319828987121582, + "learning_rate": 1.9606734833267977e-05, + "loss": 0.7, + "step": 9867 + }, + { + "epoch": 1.6109138402514183, + "grad_norm": 1.693289875984192, + "learning_rate": 1.9606646711359335e-05, + "loss": 0.5494, + "step": 9868 + }, + { + "epoch": 1.6110770988939227, + "grad_norm": 1.6100633144378662, + "learning_rate": 1.9606558579776798e-05, + "loss": 0.6064, + "step": 9869 + }, + { + "epoch": 1.6112403575364271, + "grad_norm": 1.537405014038086, + "learning_rate": 1.960647043852046e-05, + "loss": 0.5941, + "step": 9870 + }, + { + "epoch": 1.6114036161789316, + "grad_norm": 1.6685229539871216, + "learning_rate": 1.960638228759041e-05, + "loss": 0.6318, + "step": 9871 + }, + { + "epoch": 1.611566874821436, + "grad_norm": 1.8390123844146729, + "learning_rate": 1.9606294126986738e-05, + "loss": 0.7122, + "step": 9872 + }, + { + "epoch": 1.6117301334639402, + "grad_norm": 1.894135594367981, + "learning_rate": 1.9606205956709527e-05, + "loss": 0.7576, + "step": 9873 + }, + { + "epoch": 1.6118933921064447, + "grad_norm": 1.9738019704818726, + "learning_rate": 1.960611777675887e-05, + "loss": 0.7139, + "step": 9874 + }, + { + "epoch": 1.6120566507489489, + "grad_norm": 1.9650458097457886, + "learning_rate": 1.9606029587134858e-05, + "loss": 0.7973, + "step": 9875 + }, + { + "epoch": 1.6122199093914533, + "grad_norm": 1.879150152206421, + "learning_rate": 1.960594138783757e-05, + "loss": 0.7883, + "step": 9876 + }, + { + "epoch": 1.6123831680339578, + "grad_norm": 1.5775115489959717, + "learning_rate": 1.9605853178867107e-05, + "loss": 0.7187, + "step": 9877 + }, + { + "epoch": 1.6125464266764622, + "grad_norm": 1.7599941492080688, + "learning_rate": 1.960576496022355e-05, + "loss": 0.7485, + "step": 9878 + }, + { + "epoch": 1.6127096853189666, + "grad_norm": 1.7559595108032227, + "learning_rate": 1.960567673190699e-05, + "loss": 0.7859, + "step": 9879 + }, + { + "epoch": 1.612872943961471, + "grad_norm": 1.7789117097854614, + "learning_rate": 1.960558849391752e-05, + "loss": 0.7202, + "step": 9880 + }, + { + "epoch": 1.6130362026039755, + "grad_norm": 1.8371061086654663, + "learning_rate": 1.960550024625522e-05, + "loss": 0.6554, + "step": 9881 + }, + { + "epoch": 1.6131994612464797, + "grad_norm": 1.47001314163208, + "learning_rate": 1.9605411988920185e-05, + "loss": 0.6252, + "step": 9882 + }, + { + "epoch": 1.6133627198889842, + "grad_norm": 1.7572476863861084, + "learning_rate": 1.9605323721912506e-05, + "loss": 0.6784, + "step": 9883 + }, + { + "epoch": 1.6135259785314884, + "grad_norm": 1.819886565208435, + "learning_rate": 1.9605235445232266e-05, + "loss": 0.71, + "step": 9884 + }, + { + "epoch": 1.6136892371739928, + "grad_norm": 1.5366079807281494, + "learning_rate": 1.9605147158879557e-05, + "loss": 0.617, + "step": 9885 + }, + { + "epoch": 1.6138524958164973, + "grad_norm": 1.7918781042099, + "learning_rate": 1.9605058862854464e-05, + "loss": 0.6851, + "step": 9886 + }, + { + "epoch": 1.6140157544590017, + "grad_norm": 1.583812952041626, + "learning_rate": 1.9604970557157084e-05, + "loss": 0.5696, + "step": 9887 + }, + { + "epoch": 1.6141790131015061, + "grad_norm": 1.5877928733825684, + "learning_rate": 1.96048822417875e-05, + "loss": 0.6823, + "step": 9888 + }, + { + "epoch": 1.6143422717440106, + "grad_norm": 2.041011095046997, + "learning_rate": 1.96047939167458e-05, + "loss": 0.9112, + "step": 9889 + }, + { + "epoch": 1.6145055303865148, + "grad_norm": 2.0541915893554688, + "learning_rate": 1.960470558203208e-05, + "loss": 0.7779, + "step": 9890 + }, + { + "epoch": 1.6146687890290192, + "grad_norm": 1.7293528318405151, + "learning_rate": 1.960461723764642e-05, + "loss": 0.7389, + "step": 9891 + }, + { + "epoch": 1.6148320476715237, + "grad_norm": 1.5841768980026245, + "learning_rate": 1.9604528883588912e-05, + "loss": 0.6533, + "step": 9892 + }, + { + "epoch": 1.6149953063140279, + "grad_norm": 1.6819813251495361, + "learning_rate": 1.9604440519859645e-05, + "loss": 0.6759, + "step": 9893 + }, + { + "epoch": 1.6151585649565323, + "grad_norm": 1.4191339015960693, + "learning_rate": 1.9604352146458715e-05, + "loss": 0.5056, + "step": 9894 + }, + { + "epoch": 1.6153218235990368, + "grad_norm": 1.487356424331665, + "learning_rate": 1.96042637633862e-05, + "loss": 0.5664, + "step": 9895 + }, + { + "epoch": 1.6154850822415412, + "grad_norm": 1.6558493375778198, + "learning_rate": 1.9604175370642196e-05, + "loss": 0.6178, + "step": 9896 + }, + { + "epoch": 1.6156483408840456, + "grad_norm": 1.8192882537841797, + "learning_rate": 1.960408696822679e-05, + "loss": 0.7783, + "step": 9897 + }, + { + "epoch": 1.61581159952655, + "grad_norm": 1.5951824188232422, + "learning_rate": 1.9603998556140066e-05, + "loss": 0.6511, + "step": 9898 + }, + { + "epoch": 1.6159748581690543, + "grad_norm": 1.6060268878936768, + "learning_rate": 1.9603910134382124e-05, + "loss": 0.6585, + "step": 9899 + }, + { + "epoch": 1.6161381168115587, + "grad_norm": 2.108264207839966, + "learning_rate": 1.9603821702953047e-05, + "loss": 0.7558, + "step": 9900 + }, + { + "epoch": 1.616301375454063, + "grad_norm": 1.8553142547607422, + "learning_rate": 1.960373326185292e-05, + "loss": 0.7651, + "step": 9901 + }, + { + "epoch": 1.6164646340965674, + "grad_norm": 1.7204899787902832, + "learning_rate": 1.960364481108184e-05, + "loss": 0.769, + "step": 9902 + }, + { + "epoch": 1.6166278927390718, + "grad_norm": 1.4396463632583618, + "learning_rate": 1.960355635063989e-05, + "loss": 0.573, + "step": 9903 + }, + { + "epoch": 1.6167911513815763, + "grad_norm": 1.8877251148223877, + "learning_rate": 1.9603467880527164e-05, + "loss": 0.7424, + "step": 9904 + }, + { + "epoch": 1.6169544100240807, + "grad_norm": 1.5495846271514893, + "learning_rate": 1.9603379400743744e-05, + "loss": 0.7612, + "step": 9905 + }, + { + "epoch": 1.6171176686665851, + "grad_norm": 1.8033593893051147, + "learning_rate": 1.960329091128973e-05, + "loss": 0.7705, + "step": 9906 + }, + { + "epoch": 1.6172809273090896, + "grad_norm": 1.632195234298706, + "learning_rate": 1.9603202412165196e-05, + "loss": 0.5575, + "step": 9907 + }, + { + "epoch": 1.6174441859515938, + "grad_norm": 1.864630103111267, + "learning_rate": 1.9603113903370245e-05, + "loss": 0.772, + "step": 9908 + }, + { + "epoch": 1.6176074445940982, + "grad_norm": 1.2784122228622437, + "learning_rate": 1.960302538490496e-05, + "loss": 0.6011, + "step": 9909 + }, + { + "epoch": 1.6177707032366024, + "grad_norm": 1.7583242654800415, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.6197, + "step": 9910 + }, + { + "epoch": 1.6179339618791069, + "grad_norm": 1.7336595058441162, + "learning_rate": 1.9602848318963747e-05, + "loss": 0.6143, + "step": 9911 + }, + { + "epoch": 1.6180972205216113, + "grad_norm": 1.9464582204818726, + "learning_rate": 1.9602759771488e-05, + "loss": 0.8194, + "step": 9912 + }, + { + "epoch": 1.6182604791641158, + "grad_norm": 1.3688303232192993, + "learning_rate": 1.9602671214342272e-05, + "loss": 0.571, + "step": 9913 + }, + { + "epoch": 1.6184237378066202, + "grad_norm": 1.538170576095581, + "learning_rate": 1.960258264752666e-05, + "loss": 0.6398, + "step": 9914 + }, + { + "epoch": 1.6185869964491246, + "grad_norm": 1.807430624961853, + "learning_rate": 1.960249407104125e-05, + "loss": 0.7037, + "step": 9915 + }, + { + "epoch": 1.618750255091629, + "grad_norm": 1.639836072921753, + "learning_rate": 1.9602405484886126e-05, + "loss": 0.6233, + "step": 9916 + }, + { + "epoch": 1.6189135137341333, + "grad_norm": 1.4518277645111084, + "learning_rate": 1.9602316889061388e-05, + "loss": 0.5711, + "step": 9917 + }, + { + "epoch": 1.6190767723766377, + "grad_norm": 1.6341288089752197, + "learning_rate": 1.960222828356712e-05, + "loss": 0.6874, + "step": 9918 + }, + { + "epoch": 1.619240031019142, + "grad_norm": 1.7196002006530762, + "learning_rate": 1.9602139668403402e-05, + "loss": 0.664, + "step": 9919 + }, + { + "epoch": 1.6194032896616464, + "grad_norm": 1.426687240600586, + "learning_rate": 1.960205104357034e-05, + "loss": 0.5229, + "step": 9920 + }, + { + "epoch": 1.6195665483041508, + "grad_norm": 1.812554121017456, + "learning_rate": 1.960196240906801e-05, + "loss": 0.654, + "step": 9921 + }, + { + "epoch": 1.6197298069466552, + "grad_norm": 2.414975643157959, + "learning_rate": 1.960187376489651e-05, + "loss": 0.8779, + "step": 9922 + }, + { + "epoch": 1.6198930655891597, + "grad_norm": 1.5978323221206665, + "learning_rate": 1.9601785111055928e-05, + "loss": 0.5948, + "step": 9923 + }, + { + "epoch": 1.6200563242316641, + "grad_norm": 1.3697410821914673, + "learning_rate": 1.960169644754635e-05, + "loss": 0.5699, + "step": 9924 + }, + { + "epoch": 1.6202195828741686, + "grad_norm": 1.5069769620895386, + "learning_rate": 1.9601607774367862e-05, + "loss": 0.7036, + "step": 9925 + }, + { + "epoch": 1.6203828415166728, + "grad_norm": 1.8763434886932373, + "learning_rate": 1.960151909152056e-05, + "loss": 0.7439, + "step": 9926 + }, + { + "epoch": 1.6205461001591772, + "grad_norm": 1.546112298965454, + "learning_rate": 1.960143039900453e-05, + "loss": 0.6093, + "step": 9927 + }, + { + "epoch": 1.6207093588016814, + "grad_norm": 1.5846492052078247, + "learning_rate": 1.960134169681986e-05, + "loss": 0.5848, + "step": 9928 + }, + { + "epoch": 1.6208726174441859, + "grad_norm": 1.5905203819274902, + "learning_rate": 1.9601252984966645e-05, + "loss": 0.6502, + "step": 9929 + }, + { + "epoch": 1.6210358760866903, + "grad_norm": 1.6415377855300903, + "learning_rate": 1.960116426344497e-05, + "loss": 0.6603, + "step": 9930 + }, + { + "epoch": 1.6211991347291947, + "grad_norm": 1.7289812564849854, + "learning_rate": 1.9601075532254924e-05, + "loss": 0.7701, + "step": 9931 + }, + { + "epoch": 1.6213623933716992, + "grad_norm": 1.66036057472229, + "learning_rate": 1.96009867913966e-05, + "loss": 0.5878, + "step": 9932 + }, + { + "epoch": 1.6215256520142036, + "grad_norm": 1.8340846300125122, + "learning_rate": 1.9600898040870084e-05, + "loss": 0.8278, + "step": 9933 + }, + { + "epoch": 1.6216889106567078, + "grad_norm": 1.6465085744857788, + "learning_rate": 1.9600809280675465e-05, + "loss": 0.708, + "step": 9934 + }, + { + "epoch": 1.6218521692992123, + "grad_norm": 1.9307135343551636, + "learning_rate": 1.9600720510812833e-05, + "loss": 0.6504, + "step": 9935 + }, + { + "epoch": 1.6220154279417167, + "grad_norm": 1.862109661102295, + "learning_rate": 1.9600631731282278e-05, + "loss": 0.8577, + "step": 9936 + }, + { + "epoch": 1.622178686584221, + "grad_norm": 1.3749257326126099, + "learning_rate": 1.9600542942083893e-05, + "loss": 0.6912, + "step": 9937 + }, + { + "epoch": 1.6223419452267254, + "grad_norm": 1.867097020149231, + "learning_rate": 1.960045414321776e-05, + "loss": 0.6886, + "step": 9938 + }, + { + "epoch": 1.6225052038692298, + "grad_norm": 1.4429829120635986, + "learning_rate": 1.9600365334683972e-05, + "loss": 0.5363, + "step": 9939 + }, + { + "epoch": 1.6226684625117342, + "grad_norm": 1.7537142038345337, + "learning_rate": 1.9600276516482623e-05, + "loss": 0.6658, + "step": 9940 + }, + { + "epoch": 1.6228317211542387, + "grad_norm": 1.5109978914260864, + "learning_rate": 1.9600187688613795e-05, + "loss": 0.5686, + "step": 9941 + }, + { + "epoch": 1.6229949797967431, + "grad_norm": 1.927838683128357, + "learning_rate": 1.960009885107758e-05, + "loss": 0.7586, + "step": 9942 + }, + { + "epoch": 1.6231582384392473, + "grad_norm": 1.647126317024231, + "learning_rate": 1.9600010003874067e-05, + "loss": 0.6356, + "step": 9943 + }, + { + "epoch": 1.6233214970817518, + "grad_norm": 1.43631112575531, + "learning_rate": 1.959992114700335e-05, + "loss": 0.5849, + "step": 9944 + }, + { + "epoch": 1.623484755724256, + "grad_norm": 1.5692118406295776, + "learning_rate": 1.9599832280465513e-05, + "loss": 0.6029, + "step": 9945 + }, + { + "epoch": 1.6236480143667604, + "grad_norm": 1.751539945602417, + "learning_rate": 1.9599743404260646e-05, + "loss": 0.7169, + "step": 9946 + }, + { + "epoch": 1.6238112730092649, + "grad_norm": 1.729548692703247, + "learning_rate": 1.959965451838884e-05, + "loss": 0.5295, + "step": 9947 + }, + { + "epoch": 1.6239745316517693, + "grad_norm": 1.495545506477356, + "learning_rate": 1.959956562285019e-05, + "loss": 0.5669, + "step": 9948 + }, + { + "epoch": 1.6241377902942737, + "grad_norm": 1.667177438735962, + "learning_rate": 1.9599476717644777e-05, + "loss": 0.7324, + "step": 9949 + }, + { + "epoch": 1.6243010489367782, + "grad_norm": 1.6816600561141968, + "learning_rate": 1.9599387802772693e-05, + "loss": 0.6178, + "step": 9950 + }, + { + "epoch": 1.6244643075792826, + "grad_norm": 1.9057495594024658, + "learning_rate": 1.9599298878234024e-05, + "loss": 0.9183, + "step": 9951 + }, + { + "epoch": 1.6246275662217868, + "grad_norm": 2.120357036590576, + "learning_rate": 1.9599209944028867e-05, + "loss": 0.9988, + "step": 9952 + }, + { + "epoch": 1.6247908248642913, + "grad_norm": 1.665749430656433, + "learning_rate": 1.9599121000157312e-05, + "loss": 0.7956, + "step": 9953 + }, + { + "epoch": 1.6249540835067955, + "grad_norm": 1.842893123626709, + "learning_rate": 1.9599032046619437e-05, + "loss": 0.9737, + "step": 9954 + }, + { + "epoch": 1.6251173421493, + "grad_norm": 1.9717347621917725, + "learning_rate": 1.9598943083415345e-05, + "loss": 0.6459, + "step": 9955 + }, + { + "epoch": 1.6252806007918044, + "grad_norm": 1.6530388593673706, + "learning_rate": 1.959885411054512e-05, + "loss": 0.6793, + "step": 9956 + }, + { + "epoch": 1.6254438594343088, + "grad_norm": 1.346403956413269, + "learning_rate": 1.9598765128008847e-05, + "loss": 0.5108, + "step": 9957 + }, + { + "epoch": 1.6256071180768132, + "grad_norm": 1.682734489440918, + "learning_rate": 1.9598676135806622e-05, + "loss": 0.6677, + "step": 9958 + }, + { + "epoch": 1.6257703767193177, + "grad_norm": 1.8322635889053345, + "learning_rate": 1.9598587133938535e-05, + "loss": 0.8418, + "step": 9959 + }, + { + "epoch": 1.6259336353618221, + "grad_norm": 1.9128732681274414, + "learning_rate": 1.9598498122404674e-05, + "loss": 0.9465, + "step": 9960 + }, + { + "epoch": 1.6260968940043263, + "grad_norm": 1.4682658910751343, + "learning_rate": 1.9598409101205123e-05, + "loss": 0.6055, + "step": 9961 + }, + { + "epoch": 1.6262601526468308, + "grad_norm": 1.4844465255737305, + "learning_rate": 1.9598320070339977e-05, + "loss": 0.5966, + "step": 9962 + }, + { + "epoch": 1.626423411289335, + "grad_norm": 1.6079275608062744, + "learning_rate": 1.959823102980933e-05, + "loss": 0.6999, + "step": 9963 + }, + { + "epoch": 1.6265866699318394, + "grad_norm": 1.523175597190857, + "learning_rate": 1.9598141979613265e-05, + "loss": 0.5925, + "step": 9964 + }, + { + "epoch": 1.6267499285743439, + "grad_norm": 1.7228418588638306, + "learning_rate": 1.959805291975187e-05, + "loss": 0.7186, + "step": 9965 + }, + { + "epoch": 1.6269131872168483, + "grad_norm": 1.6734815835952759, + "learning_rate": 1.959796385022524e-05, + "loss": 0.5445, + "step": 9966 + }, + { + "epoch": 1.6270764458593527, + "grad_norm": 1.660291075706482, + "learning_rate": 1.9597874771033468e-05, + "loss": 0.7424, + "step": 9967 + }, + { + "epoch": 1.6272397045018572, + "grad_norm": 1.5311706066131592, + "learning_rate": 1.9597785682176632e-05, + "loss": 0.6522, + "step": 9968 + }, + { + "epoch": 1.6274029631443616, + "grad_norm": 1.8155827522277832, + "learning_rate": 1.959769658365483e-05, + "loss": 0.7351, + "step": 9969 + }, + { + "epoch": 1.6275662217868658, + "grad_norm": 1.6888411045074463, + "learning_rate": 1.959760747546815e-05, + "loss": 0.6635, + "step": 9970 + }, + { + "epoch": 1.6277294804293703, + "grad_norm": 1.5978652238845825, + "learning_rate": 1.9597518357616686e-05, + "loss": 0.6719, + "step": 9971 + }, + { + "epoch": 1.6278927390718745, + "grad_norm": 1.7928999662399292, + "learning_rate": 1.959742923010052e-05, + "loss": 0.7563, + "step": 9972 + }, + { + "epoch": 1.628055997714379, + "grad_norm": 1.623977541923523, + "learning_rate": 1.9597340092919747e-05, + "loss": 0.69, + "step": 9973 + }, + { + "epoch": 1.6282192563568834, + "grad_norm": 1.878576397895813, + "learning_rate": 1.9597250946074453e-05, + "loss": 1.0453, + "step": 9974 + }, + { + "epoch": 1.6283825149993878, + "grad_norm": 1.8737051486968994, + "learning_rate": 1.9597161789564732e-05, + "loss": 0.7073, + "step": 9975 + }, + { + "epoch": 1.6285457736418922, + "grad_norm": 1.7054731845855713, + "learning_rate": 1.9597072623390668e-05, + "loss": 0.7787, + "step": 9976 + }, + { + "epoch": 1.6287090322843967, + "grad_norm": 1.9507187604904175, + "learning_rate": 1.9596983447552357e-05, + "loss": 0.8347, + "step": 9977 + }, + { + "epoch": 1.628872290926901, + "grad_norm": 1.6655948162078857, + "learning_rate": 1.9596894262049885e-05, + "loss": 0.6526, + "step": 9978 + }, + { + "epoch": 1.6290355495694053, + "grad_norm": 1.2738479375839233, + "learning_rate": 1.9596805066883347e-05, + "loss": 0.4985, + "step": 9979 + }, + { + "epoch": 1.6291988082119098, + "grad_norm": 1.6888433694839478, + "learning_rate": 1.9596715862052823e-05, + "loss": 0.6368, + "step": 9980 + }, + { + "epoch": 1.629362066854414, + "grad_norm": 1.5171151161193848, + "learning_rate": 1.9596626647558412e-05, + "loss": 0.6109, + "step": 9981 + }, + { + "epoch": 1.6295253254969184, + "grad_norm": 1.7136192321777344, + "learning_rate": 1.9596537423400202e-05, + "loss": 0.7133, + "step": 9982 + }, + { + "epoch": 1.6296885841394229, + "grad_norm": 1.7613364458084106, + "learning_rate": 1.9596448189578278e-05, + "loss": 0.8167, + "step": 9983 + }, + { + "epoch": 1.6298518427819273, + "grad_norm": 1.7794079780578613, + "learning_rate": 1.9596358946092735e-05, + "loss": 0.6288, + "step": 9984 + }, + { + "epoch": 1.6300151014244317, + "grad_norm": 1.5747590065002441, + "learning_rate": 1.959626969294366e-05, + "loss": 0.6126, + "step": 9985 + }, + { + "epoch": 1.6301783600669362, + "grad_norm": 1.7081397771835327, + "learning_rate": 1.9596180430131143e-05, + "loss": 0.6379, + "step": 9986 + }, + { + "epoch": 1.6303416187094404, + "grad_norm": 1.5682002305984497, + "learning_rate": 1.959609115765528e-05, + "loss": 0.6423, + "step": 9987 + }, + { + "epoch": 1.6305048773519448, + "grad_norm": 1.8774763345718384, + "learning_rate": 1.959600187551615e-05, + "loss": 0.8596, + "step": 9988 + }, + { + "epoch": 1.630668135994449, + "grad_norm": 1.760628342628479, + "learning_rate": 1.959591258371385e-05, + "loss": 0.6936, + "step": 9989 + }, + { + "epoch": 1.6308313946369535, + "grad_norm": 1.5259106159210205, + "learning_rate": 1.959582328224847e-05, + "loss": 0.5439, + "step": 9990 + }, + { + "epoch": 1.630994653279458, + "grad_norm": 1.7732175588607788, + "learning_rate": 1.95957339711201e-05, + "loss": 0.6592, + "step": 9991 + }, + { + "epoch": 1.6311579119219624, + "grad_norm": 1.2790030241012573, + "learning_rate": 1.9595644650328823e-05, + "loss": 0.4824, + "step": 9992 + }, + { + "epoch": 1.6313211705644668, + "grad_norm": 1.744803786277771, + "learning_rate": 1.9595555319874738e-05, + "loss": 0.6015, + "step": 9993 + }, + { + "epoch": 1.6314844292069712, + "grad_norm": 1.7231745719909668, + "learning_rate": 1.959546597975793e-05, + "loss": 0.6848, + "step": 9994 + }, + { + "epoch": 1.6316476878494757, + "grad_norm": 1.746822714805603, + "learning_rate": 1.9595376629978494e-05, + "loss": 0.7217, + "step": 9995 + }, + { + "epoch": 1.6318109464919799, + "grad_norm": 1.5989131927490234, + "learning_rate": 1.9595287270536512e-05, + "loss": 0.6715, + "step": 9996 + }, + { + "epoch": 1.6319742051344843, + "grad_norm": 1.746046781539917, + "learning_rate": 1.959519790143208e-05, + "loss": 0.6402, + "step": 9997 + }, + { + "epoch": 1.6321374637769885, + "grad_norm": 1.799950361251831, + "learning_rate": 1.959510852266529e-05, + "loss": 0.6902, + "step": 9998 + }, + { + "epoch": 1.632300722419493, + "grad_norm": 1.9233204126358032, + "learning_rate": 1.9595019134236223e-05, + "loss": 0.7194, + "step": 9999 + }, + { + "epoch": 1.6324639810619974, + "grad_norm": 1.7570171356201172, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.5658, + "step": 10000 + }, + { + "epoch": 1.6326272397045019, + "grad_norm": 1.7283004522323608, + "learning_rate": 1.9594840328391638e-05, + "loss": 0.7391, + "step": 10001 + }, + { + "epoch": 1.6327904983470063, + "grad_norm": 1.602363109588623, + "learning_rate": 1.9594750910976295e-05, + "loss": 0.6407, + "step": 10002 + }, + { + "epoch": 1.6329537569895107, + "grad_norm": 1.53365957736969, + "learning_rate": 1.9594661483899044e-05, + "loss": 0.6107, + "step": 10003 + }, + { + "epoch": 1.6331170156320152, + "grad_norm": 1.4159650802612305, + "learning_rate": 1.959457204715997e-05, + "loss": 0.5889, + "step": 10004 + }, + { + "epoch": 1.6332802742745194, + "grad_norm": 1.5799587965011597, + "learning_rate": 1.9594482600759166e-05, + "loss": 0.537, + "step": 10005 + }, + { + "epoch": 1.6334435329170238, + "grad_norm": 1.404411792755127, + "learning_rate": 1.9594393144696718e-05, + "loss": 0.4909, + "step": 10006 + }, + { + "epoch": 1.633606791559528, + "grad_norm": 1.931803822517395, + "learning_rate": 1.959430367897272e-05, + "loss": 0.7136, + "step": 10007 + }, + { + "epoch": 1.6337700502020325, + "grad_norm": 1.7706828117370605, + "learning_rate": 1.959421420358726e-05, + "loss": 0.6407, + "step": 10008 + }, + { + "epoch": 1.633933308844537, + "grad_norm": 2.162289619445801, + "learning_rate": 1.959412471854043e-05, + "loss": 0.759, + "step": 10009 + }, + { + "epoch": 1.6340965674870414, + "grad_norm": 1.8142954111099243, + "learning_rate": 1.959403522383232e-05, + "loss": 0.7213, + "step": 10010 + }, + { + "epoch": 1.6342598261295458, + "grad_norm": 1.6594425439834595, + "learning_rate": 1.9593945719463018e-05, + "loss": 0.7162, + "step": 10011 + }, + { + "epoch": 1.6344230847720502, + "grad_norm": 1.7348077297210693, + "learning_rate": 1.9593856205432614e-05, + "loss": 0.6746, + "step": 10012 + }, + { + "epoch": 1.6345863434145547, + "grad_norm": 1.941354513168335, + "learning_rate": 1.95937666817412e-05, + "loss": 0.8183, + "step": 10013 + }, + { + "epoch": 1.6347496020570589, + "grad_norm": 1.7943456172943115, + "learning_rate": 1.959367714838886e-05, + "loss": 0.7826, + "step": 10014 + }, + { + "epoch": 1.6349128606995633, + "grad_norm": 1.5109128952026367, + "learning_rate": 1.95935876053757e-05, + "loss": 0.4138, + "step": 10015 + }, + { + "epoch": 1.6350761193420675, + "grad_norm": 1.621902585029602, + "learning_rate": 1.9593498052701796e-05, + "loss": 0.7606, + "step": 10016 + }, + { + "epoch": 1.635239377984572, + "grad_norm": 1.5043554306030273, + "learning_rate": 1.9593408490367237e-05, + "loss": 0.6461, + "step": 10017 + }, + { + "epoch": 1.6354026366270764, + "grad_norm": 2.1163289546966553, + "learning_rate": 1.9593318918372126e-05, + "loss": 0.616, + "step": 10018 + }, + { + "epoch": 1.6355658952695808, + "grad_norm": 1.6585100889205933, + "learning_rate": 1.9593229336716542e-05, + "loss": 0.5696, + "step": 10019 + }, + { + "epoch": 1.6357291539120853, + "grad_norm": 1.8742451667785645, + "learning_rate": 1.9593139745400575e-05, + "loss": 0.787, + "step": 10020 + }, + { + "epoch": 1.6358924125545897, + "grad_norm": 1.6877708435058594, + "learning_rate": 1.9593050144424322e-05, + "loss": 0.7967, + "step": 10021 + }, + { + "epoch": 1.636055671197094, + "grad_norm": 1.4806536436080933, + "learning_rate": 1.9592960533787872e-05, + "loss": 0.597, + "step": 10022 + }, + { + "epoch": 1.6362189298395984, + "grad_norm": 1.715296745300293, + "learning_rate": 1.959287091349131e-05, + "loss": 0.6379, + "step": 10023 + }, + { + "epoch": 1.6363821884821028, + "grad_norm": 1.9717532396316528, + "learning_rate": 1.9592781283534733e-05, + "loss": 0.7833, + "step": 10024 + }, + { + "epoch": 1.636545447124607, + "grad_norm": 1.6474546194076538, + "learning_rate": 1.9592691643918226e-05, + "loss": 0.6992, + "step": 10025 + }, + { + "epoch": 1.6367087057671115, + "grad_norm": 1.6564373970031738, + "learning_rate": 1.959260199464188e-05, + "loss": 0.6613, + "step": 10026 + }, + { + "epoch": 1.636871964409616, + "grad_norm": 1.668402075767517, + "learning_rate": 1.9592512335705786e-05, + "loss": 0.6779, + "step": 10027 + }, + { + "epoch": 1.6370352230521203, + "grad_norm": 1.4628677368164062, + "learning_rate": 1.959242266711004e-05, + "loss": 0.6286, + "step": 10028 + }, + { + "epoch": 1.6371984816946248, + "grad_norm": 1.64640474319458, + "learning_rate": 1.959233298885472e-05, + "loss": 0.653, + "step": 10029 + }, + { + "epoch": 1.6373617403371292, + "grad_norm": 1.6356184482574463, + "learning_rate": 1.9592243300939926e-05, + "loss": 0.7245, + "step": 10030 + }, + { + "epoch": 1.6375249989796334, + "grad_norm": 1.652172565460205, + "learning_rate": 1.9592153603365746e-05, + "loss": 0.5934, + "step": 10031 + }, + { + "epoch": 1.6376882576221379, + "grad_norm": 1.905864953994751, + "learning_rate": 1.9592063896132266e-05, + "loss": 0.5969, + "step": 10032 + }, + { + "epoch": 1.637851516264642, + "grad_norm": 1.9108155965805054, + "learning_rate": 1.9591974179239585e-05, + "loss": 0.8251, + "step": 10033 + }, + { + "epoch": 1.6380147749071465, + "grad_norm": 1.6361916065216064, + "learning_rate": 1.9591884452687788e-05, + "loss": 0.6828, + "step": 10034 + }, + { + "epoch": 1.638178033549651, + "grad_norm": 1.8573397397994995, + "learning_rate": 1.9591794716476965e-05, + "loss": 0.7239, + "step": 10035 + }, + { + "epoch": 1.6383412921921554, + "grad_norm": 1.657787799835205, + "learning_rate": 1.9591704970607206e-05, + "loss": 0.6302, + "step": 10036 + }, + { + "epoch": 1.6385045508346598, + "grad_norm": 1.922446370124817, + "learning_rate": 1.9591615215078604e-05, + "loss": 0.6878, + "step": 10037 + }, + { + "epoch": 1.6386678094771643, + "grad_norm": 1.6157526969909668, + "learning_rate": 1.959152544989125e-05, + "loss": 0.6133, + "step": 10038 + }, + { + "epoch": 1.6388310681196687, + "grad_norm": 1.8786157369613647, + "learning_rate": 1.9591435675045227e-05, + "loss": 0.6246, + "step": 10039 + }, + { + "epoch": 1.638994326762173, + "grad_norm": 1.5269620418548584, + "learning_rate": 1.9591345890540635e-05, + "loss": 0.5995, + "step": 10040 + }, + { + "epoch": 1.6391575854046774, + "grad_norm": 1.5697052478790283, + "learning_rate": 1.959125609637756e-05, + "loss": 0.6434, + "step": 10041 + }, + { + "epoch": 1.6393208440471816, + "grad_norm": 1.6759674549102783, + "learning_rate": 1.9591166292556093e-05, + "loss": 0.7986, + "step": 10042 + }, + { + "epoch": 1.639484102689686, + "grad_norm": 1.3773276805877686, + "learning_rate": 1.959107647907632e-05, + "loss": 0.5152, + "step": 10043 + }, + { + "epoch": 1.6396473613321905, + "grad_norm": 1.9850319623947144, + "learning_rate": 1.959098665593834e-05, + "loss": 0.788, + "step": 10044 + }, + { + "epoch": 1.639810619974695, + "grad_norm": 1.3423821926116943, + "learning_rate": 1.959089682314224e-05, + "loss": 0.5296, + "step": 10045 + }, + { + "epoch": 1.6399738786171993, + "grad_norm": 2.013733148574829, + "learning_rate": 1.9590806980688108e-05, + "loss": 0.6571, + "step": 10046 + }, + { + "epoch": 1.6401371372597038, + "grad_norm": 1.755739688873291, + "learning_rate": 1.9590717128576032e-05, + "loss": 0.7032, + "step": 10047 + }, + { + "epoch": 1.6403003959022082, + "grad_norm": 1.750331997871399, + "learning_rate": 1.959062726680611e-05, + "loss": 0.6705, + "step": 10048 + }, + { + "epoch": 1.6404636545447124, + "grad_norm": 1.7550911903381348, + "learning_rate": 1.9590537395378428e-05, + "loss": 0.709, + "step": 10049 + }, + { + "epoch": 1.6406269131872169, + "grad_norm": 2.080109119415283, + "learning_rate": 1.959044751429308e-05, + "loss": 0.83, + "step": 10050 + }, + { + "epoch": 1.640790171829721, + "grad_norm": 1.7505443096160889, + "learning_rate": 1.959035762355015e-05, + "loss": 0.688, + "step": 10051 + }, + { + "epoch": 1.6409534304722255, + "grad_norm": 1.67646324634552, + "learning_rate": 1.959026772314973e-05, + "loss": 0.6329, + "step": 10052 + }, + { + "epoch": 1.64111668911473, + "grad_norm": 1.65740168094635, + "learning_rate": 1.9590177813091918e-05, + "loss": 0.6158, + "step": 10053 + }, + { + "epoch": 1.6412799477572344, + "grad_norm": 1.6394296884536743, + "learning_rate": 1.95900878933768e-05, + "loss": 0.5268, + "step": 10054 + }, + { + "epoch": 1.6414432063997388, + "grad_norm": 1.5952215194702148, + "learning_rate": 1.9589997964004466e-05, + "loss": 0.6528, + "step": 10055 + }, + { + "epoch": 1.6416064650422433, + "grad_norm": 1.866600513458252, + "learning_rate": 1.9589908024975002e-05, + "loss": 0.7535, + "step": 10056 + }, + { + "epoch": 1.6417697236847477, + "grad_norm": 2.013523578643799, + "learning_rate": 1.9589818076288506e-05, + "loss": 0.6565, + "step": 10057 + }, + { + "epoch": 1.641932982327252, + "grad_norm": 1.9465855360031128, + "learning_rate": 1.958972811794507e-05, + "loss": 0.7267, + "step": 10058 + }, + { + "epoch": 1.6420962409697564, + "grad_norm": 2.1008381843566895, + "learning_rate": 1.9589638149944774e-05, + "loss": 0.6813, + "step": 10059 + }, + { + "epoch": 1.6422594996122606, + "grad_norm": 1.7441473007202148, + "learning_rate": 1.958954817228772e-05, + "loss": 0.6865, + "step": 10060 + }, + { + "epoch": 1.642422758254765, + "grad_norm": 1.651950478553772, + "learning_rate": 1.958945818497399e-05, + "loss": 0.6578, + "step": 10061 + }, + { + "epoch": 1.6425860168972695, + "grad_norm": 1.8425498008728027, + "learning_rate": 1.9589368188003677e-05, + "loss": 0.7691, + "step": 10062 + }, + { + "epoch": 1.642749275539774, + "grad_norm": 1.5721261501312256, + "learning_rate": 1.9589278181376875e-05, + "loss": 0.5981, + "step": 10063 + }, + { + "epoch": 1.6429125341822783, + "grad_norm": 1.6836590766906738, + "learning_rate": 1.958918816509367e-05, + "loss": 0.7757, + "step": 10064 + }, + { + "epoch": 1.6430757928247828, + "grad_norm": 2.3528804779052734, + "learning_rate": 1.958909813915416e-05, + "loss": 0.7057, + "step": 10065 + }, + { + "epoch": 1.6432390514672872, + "grad_norm": 1.9285943508148193, + "learning_rate": 1.9589008103558428e-05, + "loss": 0.7917, + "step": 10066 + }, + { + "epoch": 1.6434023101097914, + "grad_norm": 1.6935430765151978, + "learning_rate": 1.9588918058306564e-05, + "loss": 0.653, + "step": 10067 + }, + { + "epoch": 1.6435655687522959, + "grad_norm": 1.6889852285385132, + "learning_rate": 1.9588828003398667e-05, + "loss": 0.7663, + "step": 10068 + }, + { + "epoch": 1.6437288273948, + "grad_norm": 1.6337755918502808, + "learning_rate": 1.958873793883482e-05, + "loss": 0.6401, + "step": 10069 + }, + { + "epoch": 1.6438920860373045, + "grad_norm": 1.941815733909607, + "learning_rate": 1.9588647864615118e-05, + "loss": 0.7321, + "step": 10070 + }, + { + "epoch": 1.644055344679809, + "grad_norm": 1.7820794582366943, + "learning_rate": 1.958855778073965e-05, + "loss": 0.7407, + "step": 10071 + }, + { + "epoch": 1.6442186033223134, + "grad_norm": 2.0601255893707275, + "learning_rate": 1.9588467687208506e-05, + "loss": 0.7374, + "step": 10072 + }, + { + "epoch": 1.6443818619648178, + "grad_norm": 1.642119288444519, + "learning_rate": 1.9588377584021778e-05, + "loss": 0.6998, + "step": 10073 + }, + { + "epoch": 1.6445451206073223, + "grad_norm": 1.5206775665283203, + "learning_rate": 1.9588287471179558e-05, + "loss": 0.5039, + "step": 10074 + }, + { + "epoch": 1.6447083792498265, + "grad_norm": 1.826811671257019, + "learning_rate": 1.958819734868193e-05, + "loss": 0.7671, + "step": 10075 + }, + { + "epoch": 1.644871637892331, + "grad_norm": 1.8885239362716675, + "learning_rate": 1.9588107216528996e-05, + "loss": 0.743, + "step": 10076 + }, + { + "epoch": 1.6450348965348354, + "grad_norm": 1.4056293964385986, + "learning_rate": 1.9588017074720838e-05, + "loss": 0.5183, + "step": 10077 + }, + { + "epoch": 1.6451981551773396, + "grad_norm": 1.5689443349838257, + "learning_rate": 1.958792692325755e-05, + "loss": 0.6731, + "step": 10078 + }, + { + "epoch": 1.645361413819844, + "grad_norm": 1.500549077987671, + "learning_rate": 1.958783676213922e-05, + "loss": 0.6045, + "step": 10079 + }, + { + "epoch": 1.6455246724623485, + "grad_norm": 1.8358031511306763, + "learning_rate": 1.958774659136594e-05, + "loss": 0.7516, + "step": 10080 + }, + { + "epoch": 1.645687931104853, + "grad_norm": 1.5842143297195435, + "learning_rate": 1.9587656410937806e-05, + "loss": 0.6669, + "step": 10081 + }, + { + "epoch": 1.6458511897473573, + "grad_norm": 1.6282466650009155, + "learning_rate": 1.9587566220854902e-05, + "loss": 0.7071, + "step": 10082 + }, + { + "epoch": 1.6460144483898618, + "grad_norm": 1.4204384088516235, + "learning_rate": 1.9587476021117324e-05, + "loss": 0.5283, + "step": 10083 + }, + { + "epoch": 1.646177707032366, + "grad_norm": 1.7629834413528442, + "learning_rate": 1.9587385811725155e-05, + "loss": 0.7256, + "step": 10084 + }, + { + "epoch": 1.6463409656748704, + "grad_norm": 1.685285210609436, + "learning_rate": 1.9587295592678495e-05, + "loss": 0.6713, + "step": 10085 + }, + { + "epoch": 1.6465042243173746, + "grad_norm": 1.574859380722046, + "learning_rate": 1.9587205363977428e-05, + "loss": 0.5617, + "step": 10086 + }, + { + "epoch": 1.646667482959879, + "grad_norm": 1.6142386198043823, + "learning_rate": 1.9587115125622052e-05, + "loss": 0.6672, + "step": 10087 + }, + { + "epoch": 1.6468307416023835, + "grad_norm": 1.6259280443191528, + "learning_rate": 1.958702487761245e-05, + "loss": 0.6546, + "step": 10088 + }, + { + "epoch": 1.646994000244888, + "grad_norm": 2.188521385192871, + "learning_rate": 1.958693461994872e-05, + "loss": 0.7794, + "step": 10089 + }, + { + "epoch": 1.6471572588873924, + "grad_norm": 2.423936367034912, + "learning_rate": 1.9586844352630943e-05, + "loss": 0.8402, + "step": 10090 + }, + { + "epoch": 1.6473205175298968, + "grad_norm": 1.6714799404144287, + "learning_rate": 1.9586754075659223e-05, + "loss": 0.5497, + "step": 10091 + }, + { + "epoch": 1.6474837761724013, + "grad_norm": 1.7275614738464355, + "learning_rate": 1.9586663789033642e-05, + "loss": 0.7274, + "step": 10092 + }, + { + "epoch": 1.6476470348149055, + "grad_norm": 1.4821561574935913, + "learning_rate": 1.958657349275429e-05, + "loss": 0.5739, + "step": 10093 + }, + { + "epoch": 1.64781029345741, + "grad_norm": 1.535975694656372, + "learning_rate": 1.9586483186821265e-05, + "loss": 0.6603, + "step": 10094 + }, + { + "epoch": 1.6479735520999141, + "grad_norm": 1.6246373653411865, + "learning_rate": 1.9586392871234655e-05, + "loss": 0.6254, + "step": 10095 + }, + { + "epoch": 1.6481368107424186, + "grad_norm": 1.7950057983398438, + "learning_rate": 1.9586302545994546e-05, + "loss": 0.7346, + "step": 10096 + }, + { + "epoch": 1.648300069384923, + "grad_norm": 1.9512600898742676, + "learning_rate": 1.9586212211101036e-05, + "loss": 0.6961, + "step": 10097 + }, + { + "epoch": 1.6484633280274275, + "grad_norm": 1.7083897590637207, + "learning_rate": 1.958612186655421e-05, + "loss": 0.6507, + "step": 10098 + }, + { + "epoch": 1.648626586669932, + "grad_norm": 1.8619893789291382, + "learning_rate": 1.9586031512354163e-05, + "loss": 0.7431, + "step": 10099 + }, + { + "epoch": 1.6487898453124363, + "grad_norm": 2.0211124420166016, + "learning_rate": 1.9585941148500987e-05, + "loss": 0.9511, + "step": 10100 + }, + { + "epoch": 1.6489531039549408, + "grad_norm": 1.8720773458480835, + "learning_rate": 1.958585077499477e-05, + "loss": 0.7469, + "step": 10101 + }, + { + "epoch": 1.649116362597445, + "grad_norm": 1.8143525123596191, + "learning_rate": 1.95857603918356e-05, + "loss": 0.7227, + "step": 10102 + }, + { + "epoch": 1.6492796212399494, + "grad_norm": 1.8428373336791992, + "learning_rate": 1.9585669999023573e-05, + "loss": 0.6781, + "step": 10103 + }, + { + "epoch": 1.6494428798824536, + "grad_norm": 1.5892603397369385, + "learning_rate": 1.9585579596558783e-05, + "loss": 0.4987, + "step": 10104 + }, + { + "epoch": 1.649606138524958, + "grad_norm": 1.902796745300293, + "learning_rate": 1.9585489184441313e-05, + "loss": 0.7587, + "step": 10105 + }, + { + "epoch": 1.6497693971674625, + "grad_norm": 2.0730555057525635, + "learning_rate": 1.958539876267126e-05, + "loss": 0.8772, + "step": 10106 + }, + { + "epoch": 1.649932655809967, + "grad_norm": 1.7063666582107544, + "learning_rate": 1.9585308331248713e-05, + "loss": 0.6221, + "step": 10107 + }, + { + "epoch": 1.6500959144524714, + "grad_norm": 1.4406131505966187, + "learning_rate": 1.958521789017376e-05, + "loss": 0.5969, + "step": 10108 + }, + { + "epoch": 1.6502591730949758, + "grad_norm": 1.9638994932174683, + "learning_rate": 1.9585127439446497e-05, + "loss": 0.7371, + "step": 10109 + }, + { + "epoch": 1.6504224317374803, + "grad_norm": 1.8706518411636353, + "learning_rate": 1.9585036979067015e-05, + "loss": 0.8596, + "step": 10110 + }, + { + "epoch": 1.6505856903799845, + "grad_norm": 1.835471272468567, + "learning_rate": 1.9584946509035402e-05, + "loss": 0.597, + "step": 10111 + }, + { + "epoch": 1.650748949022489, + "grad_norm": 1.612286925315857, + "learning_rate": 1.9584856029351747e-05, + "loss": 0.6687, + "step": 10112 + }, + { + "epoch": 1.6509122076649931, + "grad_norm": 1.4977374076843262, + "learning_rate": 1.9584765540016152e-05, + "loss": 0.6834, + "step": 10113 + }, + { + "epoch": 1.6510754663074976, + "grad_norm": 1.689755916595459, + "learning_rate": 1.9584675041028694e-05, + "loss": 0.7033, + "step": 10114 + }, + { + "epoch": 1.651238724950002, + "grad_norm": 1.6142290830612183, + "learning_rate": 1.9584584532389472e-05, + "loss": 0.6833, + "step": 10115 + }, + { + "epoch": 1.6514019835925065, + "grad_norm": 1.7026444673538208, + "learning_rate": 1.9584494014098578e-05, + "loss": 0.6793, + "step": 10116 + }, + { + "epoch": 1.6515652422350109, + "grad_norm": 1.5553830862045288, + "learning_rate": 1.95844034861561e-05, + "loss": 0.6086, + "step": 10117 + }, + { + "epoch": 1.6517285008775153, + "grad_norm": 1.4408942461013794, + "learning_rate": 1.958431294856213e-05, + "loss": 0.6797, + "step": 10118 + }, + { + "epoch": 1.6518917595200195, + "grad_norm": 1.7629644870758057, + "learning_rate": 1.958422240131676e-05, + "loss": 0.7397, + "step": 10119 + }, + { + "epoch": 1.652055018162524, + "grad_norm": 1.6951879262924194, + "learning_rate": 1.9584131844420084e-05, + "loss": 0.6441, + "step": 10120 + }, + { + "epoch": 1.6522182768050284, + "grad_norm": 1.603844165802002, + "learning_rate": 1.9584041277872184e-05, + "loss": 0.6324, + "step": 10121 + }, + { + "epoch": 1.6523815354475326, + "grad_norm": 1.7437760829925537, + "learning_rate": 1.958395070167316e-05, + "loss": 0.6812, + "step": 10122 + }, + { + "epoch": 1.652544794090037, + "grad_norm": 1.8784271478652954, + "learning_rate": 1.95838601158231e-05, + "loss": 0.8148, + "step": 10123 + }, + { + "epoch": 1.6527080527325415, + "grad_norm": 1.9167051315307617, + "learning_rate": 1.9583769520322093e-05, + "loss": 0.7725, + "step": 10124 + }, + { + "epoch": 1.652871311375046, + "grad_norm": 1.5413975715637207, + "learning_rate": 1.9583678915170236e-05, + "loss": 0.6852, + "step": 10125 + }, + { + "epoch": 1.6530345700175504, + "grad_norm": 1.6568725109100342, + "learning_rate": 1.9583588300367614e-05, + "loss": 0.8522, + "step": 10126 + }, + { + "epoch": 1.6531978286600548, + "grad_norm": 1.7261161804199219, + "learning_rate": 1.958349767591432e-05, + "loss": 0.6722, + "step": 10127 + }, + { + "epoch": 1.653361087302559, + "grad_norm": 1.7336678504943848, + "learning_rate": 1.958340704181045e-05, + "loss": 0.7354, + "step": 10128 + }, + { + "epoch": 1.6535243459450635, + "grad_norm": 1.8548883199691772, + "learning_rate": 1.958331639805609e-05, + "loss": 0.6778, + "step": 10129 + }, + { + "epoch": 1.6536876045875677, + "grad_norm": 1.915725827217102, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.8282, + "step": 10130 + }, + { + "epoch": 1.6538508632300721, + "grad_norm": 1.5114574432373047, + "learning_rate": 1.958313508159627e-05, + "loss": 0.5429, + "step": 10131 + }, + { + "epoch": 1.6540141218725766, + "grad_norm": 1.8226715326309204, + "learning_rate": 1.9583044408890995e-05, + "loss": 0.7102, + "step": 10132 + }, + { + "epoch": 1.654177380515081, + "grad_norm": 1.694242238998413, + "learning_rate": 1.9582953726535595e-05, + "loss": 0.7684, + "step": 10133 + }, + { + "epoch": 1.6543406391575854, + "grad_norm": 1.9058992862701416, + "learning_rate": 1.9582863034530163e-05, + "loss": 0.7588, + "step": 10134 + }, + { + "epoch": 1.6545038978000899, + "grad_norm": 1.7672829627990723, + "learning_rate": 1.9582772332874792e-05, + "loss": 0.849, + "step": 10135 + }, + { + "epoch": 1.6546671564425943, + "grad_norm": 1.5302428007125854, + "learning_rate": 1.9582681621569568e-05, + "loss": 0.5718, + "step": 10136 + }, + { + "epoch": 1.6548304150850985, + "grad_norm": 1.6432819366455078, + "learning_rate": 1.958259090061459e-05, + "loss": 0.6586, + "step": 10137 + }, + { + "epoch": 1.654993673727603, + "grad_norm": 2.004427433013916, + "learning_rate": 1.9582500170009942e-05, + "loss": 0.6784, + "step": 10138 + }, + { + "epoch": 1.6551569323701072, + "grad_norm": 1.5160046815872192, + "learning_rate": 1.958240942975572e-05, + "loss": 0.5158, + "step": 10139 + }, + { + "epoch": 1.6553201910126116, + "grad_norm": 1.744916558265686, + "learning_rate": 1.9582318679852018e-05, + "loss": 0.6407, + "step": 10140 + }, + { + "epoch": 1.655483449655116, + "grad_norm": 1.7982934713363647, + "learning_rate": 1.9582227920298916e-05, + "loss": 0.8292, + "step": 10141 + }, + { + "epoch": 1.6556467082976205, + "grad_norm": 1.6873327493667603, + "learning_rate": 1.958213715109652e-05, + "loss": 0.6604, + "step": 10142 + }, + { + "epoch": 1.655809966940125, + "grad_norm": 1.8037564754486084, + "learning_rate": 1.9582046372244914e-05, + "loss": 0.7041, + "step": 10143 + }, + { + "epoch": 1.6559732255826294, + "grad_norm": 1.7531410455703735, + "learning_rate": 1.9581955583744187e-05, + "loss": 0.6746, + "step": 10144 + }, + { + "epoch": 1.6561364842251338, + "grad_norm": 1.8789091110229492, + "learning_rate": 1.9581864785594433e-05, + "loss": 0.6639, + "step": 10145 + }, + { + "epoch": 1.656299742867638, + "grad_norm": 1.7338184118270874, + "learning_rate": 1.9581773977795744e-05, + "loss": 0.742, + "step": 10146 + }, + { + "epoch": 1.6564630015101425, + "grad_norm": 1.6008660793304443, + "learning_rate": 1.9581683160348212e-05, + "loss": 0.6257, + "step": 10147 + }, + { + "epoch": 1.6566262601526467, + "grad_norm": 1.777541160583496, + "learning_rate": 1.9581592333251927e-05, + "loss": 0.6574, + "step": 10148 + }, + { + "epoch": 1.6567895187951511, + "grad_norm": 1.7196379899978638, + "learning_rate": 1.9581501496506985e-05, + "loss": 0.58, + "step": 10149 + }, + { + "epoch": 1.6569527774376556, + "grad_norm": 1.6762124300003052, + "learning_rate": 1.958141065011347e-05, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.65711603608016, + "grad_norm": 1.7658483982086182, + "learning_rate": 1.9581319794071477e-05, + "loss": 0.6609, + "step": 10151 + }, + { + "epoch": 1.6572792947226644, + "grad_norm": 1.8976913690567017, + "learning_rate": 1.95812289283811e-05, + "loss": 0.6555, + "step": 10152 + }, + { + "epoch": 1.6574425533651689, + "grad_norm": 1.6834828853607178, + "learning_rate": 1.9581138053042425e-05, + "loss": 0.6291, + "step": 10153 + }, + { + "epoch": 1.6576058120076733, + "grad_norm": 2.1503491401672363, + "learning_rate": 1.9581047168055548e-05, + "loss": 0.7018, + "step": 10154 + }, + { + "epoch": 1.6577690706501775, + "grad_norm": 1.9483917951583862, + "learning_rate": 1.9580956273420556e-05, + "loss": 0.6285, + "step": 10155 + }, + { + "epoch": 1.657932329292682, + "grad_norm": 1.728080153465271, + "learning_rate": 1.9580865369137546e-05, + "loss": 0.703, + "step": 10156 + }, + { + "epoch": 1.6580955879351862, + "grad_norm": 1.6704884767532349, + "learning_rate": 1.9580774455206608e-05, + "loss": 0.6044, + "step": 10157 + }, + { + "epoch": 1.6582588465776906, + "grad_norm": 1.386238694190979, + "learning_rate": 1.958068353162783e-05, + "loss": 0.6133, + "step": 10158 + }, + { + "epoch": 1.658422105220195, + "grad_norm": 1.697726845741272, + "learning_rate": 1.9580592598401308e-05, + "loss": 0.6337, + "step": 10159 + }, + { + "epoch": 1.6585853638626995, + "grad_norm": 1.5584254264831543, + "learning_rate": 1.9580501655527132e-05, + "loss": 0.6383, + "step": 10160 + }, + { + "epoch": 1.658748622505204, + "grad_norm": 1.5844731330871582, + "learning_rate": 1.9580410703005393e-05, + "loss": 0.6442, + "step": 10161 + }, + { + "epoch": 1.6589118811477084, + "grad_norm": 1.3618371486663818, + "learning_rate": 1.9580319740836183e-05, + "loss": 0.5364, + "step": 10162 + }, + { + "epoch": 1.6590751397902126, + "grad_norm": 1.4608584642410278, + "learning_rate": 1.9580228769019593e-05, + "loss": 0.4875, + "step": 10163 + }, + { + "epoch": 1.659238398432717, + "grad_norm": 1.603968858718872, + "learning_rate": 1.9580137787555717e-05, + "loss": 0.5694, + "step": 10164 + }, + { + "epoch": 1.6594016570752215, + "grad_norm": 1.8860054016113281, + "learning_rate": 1.958004679644464e-05, + "loss": 0.707, + "step": 10165 + }, + { + "epoch": 1.6595649157177257, + "grad_norm": 1.8118892908096313, + "learning_rate": 1.9579955795686466e-05, + "loss": 0.7506, + "step": 10166 + }, + { + "epoch": 1.6597281743602301, + "grad_norm": 1.7888940572738647, + "learning_rate": 1.9579864785281274e-05, + "loss": 0.747, + "step": 10167 + }, + { + "epoch": 1.6598914330027346, + "grad_norm": 2.2828190326690674, + "learning_rate": 1.9579773765229163e-05, + "loss": 0.9094, + "step": 10168 + }, + { + "epoch": 1.660054691645239, + "grad_norm": 1.816757082939148, + "learning_rate": 1.957968273553022e-05, + "loss": 0.7806, + "step": 10169 + }, + { + "epoch": 1.6602179502877434, + "grad_norm": 2.0269222259521484, + "learning_rate": 1.957959169618454e-05, + "loss": 0.6969, + "step": 10170 + }, + { + "epoch": 1.6603812089302479, + "grad_norm": 1.7957701683044434, + "learning_rate": 1.9579500647192214e-05, + "loss": 0.6597, + "step": 10171 + }, + { + "epoch": 1.660544467572752, + "grad_norm": 1.608731985092163, + "learning_rate": 1.9579409588553334e-05, + "loss": 0.5678, + "step": 10172 + }, + { + "epoch": 1.6607077262152565, + "grad_norm": 1.8682831525802612, + "learning_rate": 1.9579318520267992e-05, + "loss": 0.6903, + "step": 10173 + }, + { + "epoch": 1.6608709848577607, + "grad_norm": 2.083582639694214, + "learning_rate": 1.9579227442336276e-05, + "loss": 0.6915, + "step": 10174 + }, + { + "epoch": 1.6610342435002652, + "grad_norm": 1.598814606666565, + "learning_rate": 1.957913635475828e-05, + "loss": 0.6622, + "step": 10175 + }, + { + "epoch": 1.6611975021427696, + "grad_norm": 2.0166127681732178, + "learning_rate": 1.95790452575341e-05, + "loss": 0.7541, + "step": 10176 + }, + { + "epoch": 1.661360760785274, + "grad_norm": 1.907181978225708, + "learning_rate": 1.957895415066382e-05, + "loss": 0.7164, + "step": 10177 + }, + { + "epoch": 1.6615240194277785, + "grad_norm": 1.5327273607254028, + "learning_rate": 1.957886303414754e-05, + "loss": 0.591, + "step": 10178 + }, + { + "epoch": 1.661687278070283, + "grad_norm": 1.5933223962783813, + "learning_rate": 1.9578771907985344e-05, + "loss": 0.7314, + "step": 10179 + }, + { + "epoch": 1.6618505367127874, + "grad_norm": 1.9094136953353882, + "learning_rate": 1.9578680772177327e-05, + "loss": 0.7447, + "step": 10180 + }, + { + "epoch": 1.6620137953552916, + "grad_norm": 1.566475749015808, + "learning_rate": 1.9578589626723583e-05, + "loss": 0.6369, + "step": 10181 + }, + { + "epoch": 1.662177053997796, + "grad_norm": 1.7418193817138672, + "learning_rate": 1.9578498471624205e-05, + "loss": 0.6863, + "step": 10182 + }, + { + "epoch": 1.6623403126403002, + "grad_norm": 1.7187318801879883, + "learning_rate": 1.9578407306879276e-05, + "loss": 0.5922, + "step": 10183 + }, + { + "epoch": 1.6625035712828047, + "grad_norm": 1.514847755432129, + "learning_rate": 1.957831613248889e-05, + "loss": 0.5694, + "step": 10184 + }, + { + "epoch": 1.6626668299253091, + "grad_norm": 1.9945544004440308, + "learning_rate": 1.957822494845315e-05, + "loss": 0.7401, + "step": 10185 + }, + { + "epoch": 1.6628300885678136, + "grad_norm": 1.730692982673645, + "learning_rate": 1.957813375477214e-05, + "loss": 0.5652, + "step": 10186 + }, + { + "epoch": 1.662993347210318, + "grad_norm": 1.4212172031402588, + "learning_rate": 1.957804255144595e-05, + "loss": 0.6398, + "step": 10187 + }, + { + "epoch": 1.6631566058528224, + "grad_norm": 1.857919454574585, + "learning_rate": 1.957795133847467e-05, + "loss": 0.7642, + "step": 10188 + }, + { + "epoch": 1.6633198644953269, + "grad_norm": 1.8148958683013916, + "learning_rate": 1.9577860115858398e-05, + "loss": 0.6605, + "step": 10189 + }, + { + "epoch": 1.663483123137831, + "grad_norm": 1.7965545654296875, + "learning_rate": 1.9577768883597225e-05, + "loss": 0.7725, + "step": 10190 + }, + { + "epoch": 1.6636463817803355, + "grad_norm": 1.9494998455047607, + "learning_rate": 1.9577677641691237e-05, + "loss": 0.6359, + "step": 10191 + }, + { + "epoch": 1.6638096404228397, + "grad_norm": 1.5641560554504395, + "learning_rate": 1.9577586390140535e-05, + "loss": 0.703, + "step": 10192 + }, + { + "epoch": 1.6639728990653442, + "grad_norm": 1.679521083831787, + "learning_rate": 1.9577495128945204e-05, + "loss": 0.7099, + "step": 10193 + }, + { + "epoch": 1.6641361577078486, + "grad_norm": 1.698891282081604, + "learning_rate": 1.9577403858105336e-05, + "loss": 0.6898, + "step": 10194 + }, + { + "epoch": 1.664299416350353, + "grad_norm": 1.6322013139724731, + "learning_rate": 1.957731257762103e-05, + "loss": 0.7258, + "step": 10195 + }, + { + "epoch": 1.6644626749928575, + "grad_norm": 1.7092620134353638, + "learning_rate": 1.9577221287492368e-05, + "loss": 0.7265, + "step": 10196 + }, + { + "epoch": 1.664625933635362, + "grad_norm": 1.7884491682052612, + "learning_rate": 1.957712998771945e-05, + "loss": 0.6462, + "step": 10197 + }, + { + "epoch": 1.6647891922778664, + "grad_norm": 1.5271003246307373, + "learning_rate": 1.957703867830236e-05, + "loss": 0.6393, + "step": 10198 + }, + { + "epoch": 1.6649524509203706, + "grad_norm": 1.7022309303283691, + "learning_rate": 1.9576947359241202e-05, + "loss": 0.695, + "step": 10199 + }, + { + "epoch": 1.665115709562875, + "grad_norm": 1.8166682720184326, + "learning_rate": 1.9576856030536055e-05, + "loss": 0.7662, + "step": 10200 + }, + { + "epoch": 1.6652789682053792, + "grad_norm": 1.8851487636566162, + "learning_rate": 1.9576764692187017e-05, + "loss": 0.8524, + "step": 10201 + }, + { + "epoch": 1.6654422268478837, + "grad_norm": 1.9430642127990723, + "learning_rate": 1.957667334419418e-05, + "loss": 0.7425, + "step": 10202 + }, + { + "epoch": 1.6656054854903881, + "grad_norm": 2.2020504474639893, + "learning_rate": 1.9576581986557634e-05, + "loss": 0.8968, + "step": 10203 + }, + { + "epoch": 1.6657687441328926, + "grad_norm": 1.8822375535964966, + "learning_rate": 1.9576490619277474e-05, + "loss": 0.778, + "step": 10204 + }, + { + "epoch": 1.665932002775397, + "grad_norm": 1.7037522792816162, + "learning_rate": 1.9576399242353794e-05, + "loss": 0.6555, + "step": 10205 + }, + { + "epoch": 1.6660952614179014, + "grad_norm": 1.496363639831543, + "learning_rate": 1.9576307855786677e-05, + "loss": 0.5289, + "step": 10206 + }, + { + "epoch": 1.6662585200604056, + "grad_norm": 1.4819103479385376, + "learning_rate": 1.9576216459576222e-05, + "loss": 0.6322, + "step": 10207 + }, + { + "epoch": 1.66642177870291, + "grad_norm": 1.5589786767959595, + "learning_rate": 1.9576125053722525e-05, + "loss": 0.663, + "step": 10208 + }, + { + "epoch": 1.6665850373454145, + "grad_norm": 1.579542636871338, + "learning_rate": 1.9576033638225665e-05, + "loss": 0.568, + "step": 10209 + }, + { + "epoch": 1.6667482959879187, + "grad_norm": 1.6150691509246826, + "learning_rate": 1.9575942213085746e-05, + "loss": 0.6987, + "step": 10210 + }, + { + "epoch": 1.6669115546304232, + "grad_norm": 2.034017562866211, + "learning_rate": 1.9575850778302854e-05, + "loss": 0.8668, + "step": 10211 + }, + { + "epoch": 1.6670748132729276, + "grad_norm": 1.6262333393096924, + "learning_rate": 1.9575759333877082e-05, + "loss": 0.6307, + "step": 10212 + }, + { + "epoch": 1.667238071915432, + "grad_norm": 1.530486822128296, + "learning_rate": 1.9575667879808524e-05, + "loss": 0.6651, + "step": 10213 + }, + { + "epoch": 1.6674013305579365, + "grad_norm": 1.7640174627304077, + "learning_rate": 1.9575576416097272e-05, + "loss": 0.7037, + "step": 10214 + }, + { + "epoch": 1.667564589200441, + "grad_norm": 1.4648715257644653, + "learning_rate": 1.9575484942743417e-05, + "loss": 0.4878, + "step": 10215 + }, + { + "epoch": 1.6677278478429451, + "grad_norm": 1.3118517398834229, + "learning_rate": 1.9575393459747047e-05, + "loss": 0.4846, + "step": 10216 + }, + { + "epoch": 1.6678911064854496, + "grad_norm": 2.0088562965393066, + "learning_rate": 1.957530196710826e-05, + "loss": 0.7943, + "step": 10217 + }, + { + "epoch": 1.6680543651279538, + "grad_norm": 2.0152411460876465, + "learning_rate": 1.957521046482715e-05, + "loss": 0.6612, + "step": 10218 + }, + { + "epoch": 1.6682176237704582, + "grad_norm": 1.4945473670959473, + "learning_rate": 1.9575118952903803e-05, + "loss": 0.5284, + "step": 10219 + }, + { + "epoch": 1.6683808824129627, + "grad_norm": 1.4183968305587769, + "learning_rate": 1.9575027431338317e-05, + "loss": 0.5313, + "step": 10220 + }, + { + "epoch": 1.6685441410554671, + "grad_norm": 1.8439114093780518, + "learning_rate": 1.9574935900130777e-05, + "loss": 0.7539, + "step": 10221 + }, + { + "epoch": 1.6687073996979715, + "grad_norm": 1.5393112897872925, + "learning_rate": 1.957484435928128e-05, + "loss": 0.6298, + "step": 10222 + }, + { + "epoch": 1.668870658340476, + "grad_norm": 1.5791175365447998, + "learning_rate": 1.9574752808789918e-05, + "loss": 0.6354, + "step": 10223 + }, + { + "epoch": 1.6690339169829804, + "grad_norm": 1.7705105543136597, + "learning_rate": 1.9574661248656782e-05, + "loss": 0.7537, + "step": 10224 + }, + { + "epoch": 1.6691971756254846, + "grad_norm": 1.8599151372909546, + "learning_rate": 1.9574569678881965e-05, + "loss": 0.7374, + "step": 10225 + }, + { + "epoch": 1.669360434267989, + "grad_norm": 1.5086878538131714, + "learning_rate": 1.9574478099465558e-05, + "loss": 0.6066, + "step": 10226 + }, + { + "epoch": 1.6695236929104933, + "grad_norm": 1.995839238166809, + "learning_rate": 1.9574386510407656e-05, + "loss": 0.7075, + "step": 10227 + }, + { + "epoch": 1.6696869515529977, + "grad_norm": 1.6829890012741089, + "learning_rate": 1.9574294911708348e-05, + "loss": 0.7006, + "step": 10228 + }, + { + "epoch": 1.6698502101955022, + "grad_norm": 1.7326596975326538, + "learning_rate": 1.9574203303367728e-05, + "loss": 0.7382, + "step": 10229 + }, + { + "epoch": 1.6700134688380066, + "grad_norm": 1.5273820161819458, + "learning_rate": 1.9574111685385887e-05, + "loss": 0.5658, + "step": 10230 + }, + { + "epoch": 1.670176727480511, + "grad_norm": 1.5078397989273071, + "learning_rate": 1.9574020057762918e-05, + "loss": 0.6716, + "step": 10231 + }, + { + "epoch": 1.6703399861230155, + "grad_norm": 1.477341651916504, + "learning_rate": 1.9573928420498914e-05, + "loss": 0.531, + "step": 10232 + }, + { + "epoch": 1.67050324476552, + "grad_norm": 1.7649548053741455, + "learning_rate": 1.9573836773593968e-05, + "loss": 0.7227, + "step": 10233 + }, + { + "epoch": 1.6706665034080241, + "grad_norm": 1.749860405921936, + "learning_rate": 1.957374511704817e-05, + "loss": 0.5455, + "step": 10234 + }, + { + "epoch": 1.6708297620505286, + "grad_norm": 1.804117202758789, + "learning_rate": 1.957365345086161e-05, + "loss": 0.6986, + "step": 10235 + }, + { + "epoch": 1.6709930206930328, + "grad_norm": 1.6919574737548828, + "learning_rate": 1.9573561775034386e-05, + "loss": 0.7731, + "step": 10236 + }, + { + "epoch": 1.6711562793355372, + "grad_norm": 1.8538540601730347, + "learning_rate": 1.957347008956659e-05, + "loss": 0.6698, + "step": 10237 + }, + { + "epoch": 1.6713195379780417, + "grad_norm": 1.742448329925537, + "learning_rate": 1.957337839445831e-05, + "loss": 0.6145, + "step": 10238 + }, + { + "epoch": 1.671482796620546, + "grad_norm": 1.5862600803375244, + "learning_rate": 1.957328668970964e-05, + "loss": 0.5309, + "step": 10239 + }, + { + "epoch": 1.6716460552630505, + "grad_norm": 1.4562309980392456, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.5538, + "step": 10240 + }, + { + "epoch": 1.671809313905555, + "grad_norm": 1.452170968055725, + "learning_rate": 1.9573103251291503e-05, + "loss": 0.6983, + "step": 10241 + }, + { + "epoch": 1.6719725725480594, + "grad_norm": 1.381913185119629, + "learning_rate": 1.9573011517622217e-05, + "loss": 0.5774, + "step": 10242 + }, + { + "epoch": 1.6721358311905636, + "grad_norm": 1.5298691987991333, + "learning_rate": 1.9572919774312914e-05, + "loss": 0.5737, + "step": 10243 + }, + { + "epoch": 1.672299089833068, + "grad_norm": 1.8400554656982422, + "learning_rate": 1.9572828021363682e-05, + "loss": 0.6692, + "step": 10244 + }, + { + "epoch": 1.6724623484755723, + "grad_norm": 1.9534602165222168, + "learning_rate": 1.9572736258774618e-05, + "loss": 0.7591, + "step": 10245 + }, + { + "epoch": 1.6726256071180767, + "grad_norm": 1.6134785413742065, + "learning_rate": 1.9572644486545808e-05, + "loss": 0.7493, + "step": 10246 + }, + { + "epoch": 1.6727888657605812, + "grad_norm": 1.6433796882629395, + "learning_rate": 1.957255270467735e-05, + "loss": 0.7084, + "step": 10247 + }, + { + "epoch": 1.6729521244030856, + "grad_norm": 2.2395172119140625, + "learning_rate": 1.9572460913169327e-05, + "loss": 0.7314, + "step": 10248 + }, + { + "epoch": 1.67311538304559, + "grad_norm": 1.334981918334961, + "learning_rate": 1.9572369112021846e-05, + "loss": 0.5017, + "step": 10249 + }, + { + "epoch": 1.6732786416880945, + "grad_norm": 1.6467465162277222, + "learning_rate": 1.957227730123499e-05, + "loss": 0.6632, + "step": 10250 + }, + { + "epoch": 1.6734419003305987, + "grad_norm": 1.725786805152893, + "learning_rate": 1.9572185480808848e-05, + "loss": 0.6755, + "step": 10251 + }, + { + "epoch": 1.6736051589731031, + "grad_norm": 1.988476276397705, + "learning_rate": 1.9572093650743524e-05, + "loss": 0.6922, + "step": 10252 + }, + { + "epoch": 1.6737684176156076, + "grad_norm": 1.3541333675384521, + "learning_rate": 1.9572001811039107e-05, + "loss": 0.455, + "step": 10253 + }, + { + "epoch": 1.6739316762581118, + "grad_norm": 1.5863642692565918, + "learning_rate": 1.9571909961695678e-05, + "loss": 0.5986, + "step": 10254 + }, + { + "epoch": 1.6740949349006162, + "grad_norm": 1.620029330253601, + "learning_rate": 1.9571818102713343e-05, + "loss": 0.5714, + "step": 10255 + }, + { + "epoch": 1.6742581935431207, + "grad_norm": 2.104813575744629, + "learning_rate": 1.957172623409219e-05, + "loss": 0.8655, + "step": 10256 + }, + { + "epoch": 1.674421452185625, + "grad_norm": 1.7959791421890259, + "learning_rate": 1.957163435583231e-05, + "loss": 0.6586, + "step": 10257 + }, + { + "epoch": 1.6745847108281295, + "grad_norm": 1.5776945352554321, + "learning_rate": 1.9571542467933796e-05, + "loss": 0.5792, + "step": 10258 + }, + { + "epoch": 1.674747969470634, + "grad_norm": 1.9580739736557007, + "learning_rate": 1.957145057039674e-05, + "loss": 0.7608, + "step": 10259 + }, + { + "epoch": 1.6749112281131382, + "grad_norm": 1.6778337955474854, + "learning_rate": 1.957135866322124e-05, + "loss": 0.6908, + "step": 10260 + }, + { + "epoch": 1.6750744867556426, + "grad_norm": 1.6201846599578857, + "learning_rate": 1.9571266746407382e-05, + "loss": 0.6089, + "step": 10261 + }, + { + "epoch": 1.6752377453981468, + "grad_norm": 1.748227596282959, + "learning_rate": 1.9571174819955264e-05, + "loss": 0.6888, + "step": 10262 + }, + { + "epoch": 1.6754010040406513, + "grad_norm": 1.6993547677993774, + "learning_rate": 1.9571082883864973e-05, + "loss": 0.655, + "step": 10263 + }, + { + "epoch": 1.6755642626831557, + "grad_norm": 1.7010806798934937, + "learning_rate": 1.95709909381366e-05, + "loss": 0.6351, + "step": 10264 + }, + { + "epoch": 1.6757275213256602, + "grad_norm": 1.4872894287109375, + "learning_rate": 1.957089898277025e-05, + "loss": 0.6166, + "step": 10265 + }, + { + "epoch": 1.6758907799681646, + "grad_norm": 1.5588740110397339, + "learning_rate": 1.9570807017766e-05, + "loss": 0.5526, + "step": 10266 + }, + { + "epoch": 1.676054038610669, + "grad_norm": 1.8015319108963013, + "learning_rate": 1.9570715043123955e-05, + "loss": 0.7538, + "step": 10267 + }, + { + "epoch": 1.6762172972531735, + "grad_norm": 1.6484651565551758, + "learning_rate": 1.9570623058844197e-05, + "loss": 0.6181, + "step": 10268 + }, + { + "epoch": 1.6763805558956777, + "grad_norm": 1.5493019819259644, + "learning_rate": 1.957053106492683e-05, + "loss": 0.524, + "step": 10269 + }, + { + "epoch": 1.6765438145381821, + "grad_norm": 1.588729739189148, + "learning_rate": 1.9570439061371936e-05, + "loss": 0.5845, + "step": 10270 + }, + { + "epoch": 1.6767070731806863, + "grad_norm": 1.5978519916534424, + "learning_rate": 1.9570347048179617e-05, + "loss": 0.6038, + "step": 10271 + }, + { + "epoch": 1.6768703318231908, + "grad_norm": 1.2826318740844727, + "learning_rate": 1.957025502534996e-05, + "loss": 0.5497, + "step": 10272 + }, + { + "epoch": 1.6770335904656952, + "grad_norm": 1.5943729877471924, + "learning_rate": 1.9570162992883056e-05, + "loss": 0.6067, + "step": 10273 + }, + { + "epoch": 1.6771968491081997, + "grad_norm": 1.4985885620117188, + "learning_rate": 1.9570070950779002e-05, + "loss": 0.5986, + "step": 10274 + }, + { + "epoch": 1.677360107750704, + "grad_norm": 1.361028790473938, + "learning_rate": 1.9569978899037887e-05, + "loss": 0.6321, + "step": 10275 + }, + { + "epoch": 1.6775233663932085, + "grad_norm": 1.8935763835906982, + "learning_rate": 1.9569886837659808e-05, + "loss": 0.7333, + "step": 10276 + }, + { + "epoch": 1.677686625035713, + "grad_norm": 1.4201124906539917, + "learning_rate": 1.9569794766644856e-05, + "loss": 0.5616, + "step": 10277 + }, + { + "epoch": 1.6778498836782172, + "grad_norm": 1.9005646705627441, + "learning_rate": 1.956970268599312e-05, + "loss": 0.6629, + "step": 10278 + }, + { + "epoch": 1.6780131423207216, + "grad_norm": 1.4880372285842896, + "learning_rate": 1.95696105957047e-05, + "loss": 0.5651, + "step": 10279 + }, + { + "epoch": 1.6781764009632258, + "grad_norm": 1.7914921045303345, + "learning_rate": 1.9569518495779682e-05, + "loss": 0.613, + "step": 10280 + }, + { + "epoch": 1.6783396596057303, + "grad_norm": 1.5072354078292847, + "learning_rate": 1.956942638621816e-05, + "loss": 0.5642, + "step": 10281 + }, + { + "epoch": 1.6785029182482347, + "grad_norm": 1.6398664712905884, + "learning_rate": 1.9569334267020234e-05, + "loss": 0.6774, + "step": 10282 + }, + { + "epoch": 1.6786661768907392, + "grad_norm": 1.95987069606781, + "learning_rate": 1.9569242138185986e-05, + "loss": 0.7528, + "step": 10283 + }, + { + "epoch": 1.6788294355332436, + "grad_norm": 1.4311730861663818, + "learning_rate": 1.9569149999715514e-05, + "loss": 0.5813, + "step": 10284 + }, + { + "epoch": 1.678992694175748, + "grad_norm": 1.842061996459961, + "learning_rate": 1.9569057851608915e-05, + "loss": 0.7237, + "step": 10285 + }, + { + "epoch": 1.6791559528182525, + "grad_norm": 1.4468880891799927, + "learning_rate": 1.9568965693866273e-05, + "loss": 0.608, + "step": 10286 + }, + { + "epoch": 1.6793192114607567, + "grad_norm": 1.6308139562606812, + "learning_rate": 1.9568873526487685e-05, + "loss": 0.7146, + "step": 10287 + }, + { + "epoch": 1.6794824701032611, + "grad_norm": 1.9013844728469849, + "learning_rate": 1.9568781349473244e-05, + "loss": 0.7412, + "step": 10288 + }, + { + "epoch": 1.6796457287457653, + "grad_norm": 1.6285382509231567, + "learning_rate": 1.9568689162823044e-05, + "loss": 0.6613, + "step": 10289 + }, + { + "epoch": 1.6798089873882698, + "grad_norm": 1.9121285676956177, + "learning_rate": 1.9568596966537177e-05, + "loss": 0.6193, + "step": 10290 + }, + { + "epoch": 1.6799722460307742, + "grad_norm": 1.769327998161316, + "learning_rate": 1.9568504760615734e-05, + "loss": 0.7526, + "step": 10291 + }, + { + "epoch": 1.6801355046732787, + "grad_norm": 1.6257365942001343, + "learning_rate": 1.956841254505881e-05, + "loss": 0.7587, + "step": 10292 + }, + { + "epoch": 1.680298763315783, + "grad_norm": 1.6507139205932617, + "learning_rate": 1.9568320319866497e-05, + "loss": 0.7255, + "step": 10293 + }, + { + "epoch": 1.6804620219582875, + "grad_norm": 1.782328486442566, + "learning_rate": 1.9568228085038886e-05, + "loss": 0.6294, + "step": 10294 + }, + { + "epoch": 1.680625280600792, + "grad_norm": 1.5725504159927368, + "learning_rate": 1.956813584057608e-05, + "loss": 0.6373, + "step": 10295 + }, + { + "epoch": 1.6807885392432962, + "grad_norm": 1.7458202838897705, + "learning_rate": 1.9568043586478153e-05, + "loss": 0.6937, + "step": 10296 + }, + { + "epoch": 1.6809517978858006, + "grad_norm": 1.6937462091445923, + "learning_rate": 1.9567951322745214e-05, + "loss": 0.6359, + "step": 10297 + }, + { + "epoch": 1.6811150565283048, + "grad_norm": 1.3553416728973389, + "learning_rate": 1.956785904937735e-05, + "loss": 0.5682, + "step": 10298 + }, + { + "epoch": 1.6812783151708093, + "grad_norm": 1.9182075262069702, + "learning_rate": 1.9567766766374655e-05, + "loss": 0.7347, + "step": 10299 + }, + { + "epoch": 1.6814415738133137, + "grad_norm": 1.767186164855957, + "learning_rate": 1.956767447373722e-05, + "loss": 0.6102, + "step": 10300 + }, + { + "epoch": 1.6816048324558182, + "grad_norm": 1.6177268028259277, + "learning_rate": 1.9567582171465137e-05, + "loss": 0.7695, + "step": 10301 + }, + { + "epoch": 1.6817680910983226, + "grad_norm": 1.9765204191207886, + "learning_rate": 1.9567489859558506e-05, + "loss": 0.7826, + "step": 10302 + }, + { + "epoch": 1.681931349740827, + "grad_norm": 1.5368883609771729, + "learning_rate": 1.9567397538017415e-05, + "loss": 0.6195, + "step": 10303 + }, + { + "epoch": 1.6820946083833312, + "grad_norm": 1.9082727432250977, + "learning_rate": 1.9567305206841954e-05, + "loss": 0.7628, + "step": 10304 + }, + { + "epoch": 1.6822578670258357, + "grad_norm": 1.620835542678833, + "learning_rate": 1.9567212866032222e-05, + "loss": 0.6095, + "step": 10305 + }, + { + "epoch": 1.6824211256683401, + "grad_norm": 1.4895868301391602, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.6356, + "step": 10306 + }, + { + "epoch": 1.6825843843108443, + "grad_norm": 1.3408678770065308, + "learning_rate": 1.9567028155510303e-05, + "loss": 0.5468, + "step": 10307 + }, + { + "epoch": 1.6827476429533488, + "grad_norm": 1.746203899383545, + "learning_rate": 1.956693578579831e-05, + "loss": 0.8664, + "step": 10308 + }, + { + "epoch": 1.6829109015958532, + "grad_norm": 1.7717413902282715, + "learning_rate": 1.956684340645241e-05, + "loss": 0.7478, + "step": 10309 + }, + { + "epoch": 1.6830741602383577, + "grad_norm": 1.9842815399169922, + "learning_rate": 1.9566751017472704e-05, + "loss": 0.5489, + "step": 10310 + }, + { + "epoch": 1.683237418880862, + "grad_norm": 2.006906270980835, + "learning_rate": 1.956665861885928e-05, + "loss": 0.6401, + "step": 10311 + }, + { + "epoch": 1.6834006775233665, + "grad_norm": 2.0050320625305176, + "learning_rate": 1.9566566210612232e-05, + "loss": 0.7387, + "step": 10312 + }, + { + "epoch": 1.6835639361658707, + "grad_norm": 1.6769111156463623, + "learning_rate": 1.9566473792731656e-05, + "loss": 0.7487, + "step": 10313 + }, + { + "epoch": 1.6837271948083752, + "grad_norm": 1.5057857036590576, + "learning_rate": 1.9566381365217646e-05, + "loss": 0.689, + "step": 10314 + }, + { + "epoch": 1.6838904534508794, + "grad_norm": 1.684083342552185, + "learning_rate": 1.956628892807029e-05, + "loss": 0.7788, + "step": 10315 + }, + { + "epoch": 1.6840537120933838, + "grad_norm": 1.783150553703308, + "learning_rate": 1.9566196481289685e-05, + "loss": 0.8543, + "step": 10316 + }, + { + "epoch": 1.6842169707358883, + "grad_norm": 1.849509596824646, + "learning_rate": 1.9566104024875924e-05, + "loss": 0.7419, + "step": 10317 + }, + { + "epoch": 1.6843802293783927, + "grad_norm": 1.7741681337356567, + "learning_rate": 1.9566011558829095e-05, + "loss": 0.7793, + "step": 10318 + }, + { + "epoch": 1.6845434880208972, + "grad_norm": 1.7577685117721558, + "learning_rate": 1.9565919083149295e-05, + "loss": 0.8548, + "step": 10319 + }, + { + "epoch": 1.6847067466634016, + "grad_norm": 1.7868727445602417, + "learning_rate": 1.9565826597836623e-05, + "loss": 0.7788, + "step": 10320 + }, + { + "epoch": 1.684870005305906, + "grad_norm": 1.92340087890625, + "learning_rate": 1.956573410289116e-05, + "loss": 0.692, + "step": 10321 + }, + { + "epoch": 1.6850332639484102, + "grad_norm": 1.3995797634124756, + "learning_rate": 1.9565641598313005e-05, + "loss": 0.5757, + "step": 10322 + }, + { + "epoch": 1.6851965225909147, + "grad_norm": 1.6776593923568726, + "learning_rate": 1.9565549084102255e-05, + "loss": 0.6972, + "step": 10323 + }, + { + "epoch": 1.685359781233419, + "grad_norm": 1.6626075506210327, + "learning_rate": 1.9565456560258997e-05, + "loss": 0.6976, + "step": 10324 + }, + { + "epoch": 1.6855230398759233, + "grad_norm": 1.7676337957382202, + "learning_rate": 1.956536402678333e-05, + "loss": 0.7629, + "step": 10325 + }, + { + "epoch": 1.6856862985184278, + "grad_norm": 1.4399057626724243, + "learning_rate": 1.956527148367534e-05, + "loss": 0.5663, + "step": 10326 + }, + { + "epoch": 1.6858495571609322, + "grad_norm": 1.4094305038452148, + "learning_rate": 1.956517893093513e-05, + "loss": 0.6158, + "step": 10327 + }, + { + "epoch": 1.6860128158034366, + "grad_norm": 1.7661116123199463, + "learning_rate": 1.956508636856278e-05, + "loss": 0.6303, + "step": 10328 + }, + { + "epoch": 1.686176074445941, + "grad_norm": 1.622406244277954, + "learning_rate": 1.9564993796558394e-05, + "loss": 0.6957, + "step": 10329 + }, + { + "epoch": 1.6863393330884455, + "grad_norm": 1.8783899545669556, + "learning_rate": 1.9564901214922063e-05, + "loss": 0.6477, + "step": 10330 + }, + { + "epoch": 1.6865025917309497, + "grad_norm": 1.7842068672180176, + "learning_rate": 1.9564808623653877e-05, + "loss": 0.7592, + "step": 10331 + }, + { + "epoch": 1.6866658503734542, + "grad_norm": 1.5500167608261108, + "learning_rate": 1.9564716022753934e-05, + "loss": 0.6556, + "step": 10332 + }, + { + "epoch": 1.6868291090159584, + "grad_norm": 1.8271013498306274, + "learning_rate": 1.956462341222232e-05, + "loss": 0.8031, + "step": 10333 + }, + { + "epoch": 1.6869923676584628, + "grad_norm": 1.6053193807601929, + "learning_rate": 1.9564530792059134e-05, + "loss": 0.5971, + "step": 10334 + }, + { + "epoch": 1.6871556263009673, + "grad_norm": 1.6521666049957275, + "learning_rate": 1.956443816226447e-05, + "loss": 0.6919, + "step": 10335 + }, + { + "epoch": 1.6873188849434717, + "grad_norm": 2.3303864002227783, + "learning_rate": 1.956434552283842e-05, + "loss": 0.7317, + "step": 10336 + }, + { + "epoch": 1.6874821435859761, + "grad_norm": 1.5630825757980347, + "learning_rate": 1.9564252873781076e-05, + "loss": 0.557, + "step": 10337 + }, + { + "epoch": 1.6876454022284806, + "grad_norm": 1.9298499822616577, + "learning_rate": 1.956416021509253e-05, + "loss": 0.7512, + "step": 10338 + }, + { + "epoch": 1.687808660870985, + "grad_norm": 1.9832113981246948, + "learning_rate": 1.9564067546772877e-05, + "loss": 0.6703, + "step": 10339 + }, + { + "epoch": 1.6879719195134892, + "grad_norm": 1.7148081064224243, + "learning_rate": 1.9563974868822212e-05, + "loss": 0.6572, + "step": 10340 + }, + { + "epoch": 1.6881351781559937, + "grad_norm": 1.9668340682983398, + "learning_rate": 1.9563882181240627e-05, + "loss": 0.734, + "step": 10341 + }, + { + "epoch": 1.688298436798498, + "grad_norm": 1.9375584125518799, + "learning_rate": 1.9563789484028217e-05, + "loss": 0.8926, + "step": 10342 + }, + { + "epoch": 1.6884616954410023, + "grad_norm": 1.7779438495635986, + "learning_rate": 1.956369677718507e-05, + "loss": 0.762, + "step": 10343 + }, + { + "epoch": 1.6886249540835068, + "grad_norm": 1.7154210805892944, + "learning_rate": 1.9563604060711284e-05, + "loss": 0.6738, + "step": 10344 + }, + { + "epoch": 1.6887882127260112, + "grad_norm": 1.7179269790649414, + "learning_rate": 1.9563511334606952e-05, + "loss": 0.7045, + "step": 10345 + }, + { + "epoch": 1.6889514713685156, + "grad_norm": 1.8920576572418213, + "learning_rate": 1.9563418598872167e-05, + "loss": 0.6675, + "step": 10346 + }, + { + "epoch": 1.68911473001102, + "grad_norm": 1.863937258720398, + "learning_rate": 1.956332585350702e-05, + "loss": 0.6899, + "step": 10347 + }, + { + "epoch": 1.6892779886535243, + "grad_norm": 1.4510084390640259, + "learning_rate": 1.956323309851161e-05, + "loss": 0.6185, + "step": 10348 + }, + { + "epoch": 1.6894412472960287, + "grad_norm": 2.0105741024017334, + "learning_rate": 1.956314033388602e-05, + "loss": 0.7947, + "step": 10349 + }, + { + "epoch": 1.6896045059385332, + "grad_norm": 1.918426752090454, + "learning_rate": 1.9563047559630356e-05, + "loss": 0.7547, + "step": 10350 + }, + { + "epoch": 1.6897677645810374, + "grad_norm": 1.740210771560669, + "learning_rate": 1.9562954775744706e-05, + "loss": 0.6539, + "step": 10351 + }, + { + "epoch": 1.6899310232235418, + "grad_norm": 1.9839599132537842, + "learning_rate": 1.956286198222916e-05, + "loss": 0.6686, + "step": 10352 + }, + { + "epoch": 1.6900942818660463, + "grad_norm": 1.5641770362854004, + "learning_rate": 1.9562769179083816e-05, + "loss": 0.6648, + "step": 10353 + }, + { + "epoch": 1.6902575405085507, + "grad_norm": 1.8603949546813965, + "learning_rate": 1.9562676366308765e-05, + "loss": 0.753, + "step": 10354 + }, + { + "epoch": 1.6904207991510551, + "grad_norm": 1.5097898244857788, + "learning_rate": 1.9562583543904102e-05, + "loss": 0.5592, + "step": 10355 + }, + { + "epoch": 1.6905840577935596, + "grad_norm": 1.6280597448349, + "learning_rate": 1.956249071186992e-05, + "loss": 0.6519, + "step": 10356 + }, + { + "epoch": 1.6907473164360638, + "grad_norm": 1.6530081033706665, + "learning_rate": 1.9562397870206315e-05, + "loss": 0.6407, + "step": 10357 + }, + { + "epoch": 1.6909105750785682, + "grad_norm": 1.8442081212997437, + "learning_rate": 1.9562305018913373e-05, + "loss": 0.6626, + "step": 10358 + }, + { + "epoch": 1.6910738337210725, + "grad_norm": 1.640952706336975, + "learning_rate": 1.9562212157991194e-05, + "loss": 0.6632, + "step": 10359 + }, + { + "epoch": 1.6912370923635769, + "grad_norm": 1.7141512632369995, + "learning_rate": 1.9562119287439874e-05, + "loss": 0.6672, + "step": 10360 + }, + { + "epoch": 1.6914003510060813, + "grad_norm": 1.871482491493225, + "learning_rate": 1.9562026407259497e-05, + "loss": 0.7884, + "step": 10361 + }, + { + "epoch": 1.6915636096485858, + "grad_norm": 1.4997402429580688, + "learning_rate": 1.9561933517450164e-05, + "loss": 0.6638, + "step": 10362 + }, + { + "epoch": 1.6917268682910902, + "grad_norm": 1.7773383855819702, + "learning_rate": 1.9561840618011968e-05, + "loss": 0.6184, + "step": 10363 + }, + { + "epoch": 1.6918901269335946, + "grad_norm": 1.4365946054458618, + "learning_rate": 1.9561747708945e-05, + "loss": 0.5647, + "step": 10364 + }, + { + "epoch": 1.692053385576099, + "grad_norm": 1.459729790687561, + "learning_rate": 1.9561654790249353e-05, + "loss": 0.66, + "step": 10365 + }, + { + "epoch": 1.6922166442186033, + "grad_norm": 1.6671079397201538, + "learning_rate": 1.9561561861925124e-05, + "loss": 0.6473, + "step": 10366 + }, + { + "epoch": 1.6923799028611077, + "grad_norm": 1.923784613609314, + "learning_rate": 1.95614689239724e-05, + "loss": 0.7695, + "step": 10367 + }, + { + "epoch": 1.692543161503612, + "grad_norm": 1.9499075412750244, + "learning_rate": 1.9561375976391287e-05, + "loss": 0.7813, + "step": 10368 + }, + { + "epoch": 1.6927064201461164, + "grad_norm": 1.6839816570281982, + "learning_rate": 1.9561283019181866e-05, + "loss": 0.6053, + "step": 10369 + }, + { + "epoch": 1.6928696787886208, + "grad_norm": 1.5903276205062866, + "learning_rate": 1.956119005234424e-05, + "loss": 0.6747, + "step": 10370 + }, + { + "epoch": 1.6930329374311253, + "grad_norm": 1.5954084396362305, + "learning_rate": 1.9561097075878492e-05, + "loss": 0.6271, + "step": 10371 + }, + { + "epoch": 1.6931961960736297, + "grad_norm": 1.581481695175171, + "learning_rate": 1.9561004089784726e-05, + "loss": 0.606, + "step": 10372 + }, + { + "epoch": 1.6933594547161341, + "grad_norm": 1.6933077573776245, + "learning_rate": 1.956091109406303e-05, + "loss": 0.7371, + "step": 10373 + }, + { + "epoch": 1.6935227133586386, + "grad_norm": 1.7279263734817505, + "learning_rate": 1.9560818088713498e-05, + "loss": 0.638, + "step": 10374 + }, + { + "epoch": 1.6936859720011428, + "grad_norm": 1.3933202028274536, + "learning_rate": 1.9560725073736226e-05, + "loss": 0.5544, + "step": 10375 + }, + { + "epoch": 1.6938492306436472, + "grad_norm": 2.2993035316467285, + "learning_rate": 1.9560632049131307e-05, + "loss": 1.0912, + "step": 10376 + }, + { + "epoch": 1.6940124892861514, + "grad_norm": 1.5983117818832397, + "learning_rate": 1.9560539014898832e-05, + "loss": 0.6224, + "step": 10377 + }, + { + "epoch": 1.6941757479286559, + "grad_norm": 1.974234938621521, + "learning_rate": 1.95604459710389e-05, + "loss": 0.8273, + "step": 10378 + }, + { + "epoch": 1.6943390065711603, + "grad_norm": 1.6951218843460083, + "learning_rate": 1.95603529175516e-05, + "loss": 0.6049, + "step": 10379 + }, + { + "epoch": 1.6945022652136648, + "grad_norm": 1.792216181755066, + "learning_rate": 1.9560259854437026e-05, + "loss": 0.7101, + "step": 10380 + }, + { + "epoch": 1.6946655238561692, + "grad_norm": 1.470623254776001, + "learning_rate": 1.9560166781695272e-05, + "loss": 0.7339, + "step": 10381 + }, + { + "epoch": 1.6948287824986736, + "grad_norm": 1.7016723155975342, + "learning_rate": 1.9560073699326433e-05, + "loss": 0.6647, + "step": 10382 + }, + { + "epoch": 1.694992041141178, + "grad_norm": 1.8739635944366455, + "learning_rate": 1.9559980607330607e-05, + "loss": 0.7532, + "step": 10383 + }, + { + "epoch": 1.6951552997836823, + "grad_norm": 1.57029128074646, + "learning_rate": 1.955988750570788e-05, + "loss": 0.7869, + "step": 10384 + }, + { + "epoch": 1.6953185584261867, + "grad_norm": 1.65780770778656, + "learning_rate": 1.9559794394458347e-05, + "loss": 0.7125, + "step": 10385 + }, + { + "epoch": 1.695481817068691, + "grad_norm": 1.6460646390914917, + "learning_rate": 1.9559701273582106e-05, + "loss": 0.6449, + "step": 10386 + }, + { + "epoch": 1.6956450757111954, + "grad_norm": 1.6348038911819458, + "learning_rate": 1.9559608143079244e-05, + "loss": 0.6261, + "step": 10387 + }, + { + "epoch": 1.6958083343536998, + "grad_norm": 1.7019981145858765, + "learning_rate": 1.9559515002949866e-05, + "loss": 0.6643, + "step": 10388 + }, + { + "epoch": 1.6959715929962043, + "grad_norm": 1.745544195175171, + "learning_rate": 1.9559421853194057e-05, + "loss": 0.6021, + "step": 10389 + }, + { + "epoch": 1.6961348516387087, + "grad_norm": 1.295472264289856, + "learning_rate": 1.955932869381191e-05, + "loss": 0.5443, + "step": 10390 + }, + { + "epoch": 1.6962981102812131, + "grad_norm": 2.1479787826538086, + "learning_rate": 1.955923552480352e-05, + "loss": 0.6557, + "step": 10391 + }, + { + "epoch": 1.6964613689237173, + "grad_norm": 1.8739631175994873, + "learning_rate": 1.9559142346168988e-05, + "loss": 0.7394, + "step": 10392 + }, + { + "epoch": 1.6966246275662218, + "grad_norm": 1.8318933248519897, + "learning_rate": 1.9559049157908396e-05, + "loss": 0.6462, + "step": 10393 + }, + { + "epoch": 1.6967878862087262, + "grad_norm": 1.789330244064331, + "learning_rate": 1.9558955960021847e-05, + "loss": 0.6907, + "step": 10394 + }, + { + "epoch": 1.6969511448512304, + "grad_norm": 1.499477505683899, + "learning_rate": 1.9558862752509433e-05, + "loss": 0.6474, + "step": 10395 + }, + { + "epoch": 1.6971144034937349, + "grad_norm": 1.9187700748443604, + "learning_rate": 1.955876953537125e-05, + "loss": 0.7723, + "step": 10396 + }, + { + "epoch": 1.6972776621362393, + "grad_norm": 1.9851876497268677, + "learning_rate": 1.955867630860738e-05, + "loss": 0.7118, + "step": 10397 + }, + { + "epoch": 1.6974409207787438, + "grad_norm": 1.7708414793014526, + "learning_rate": 1.955858307221793e-05, + "loss": 0.6746, + "step": 10398 + }, + { + "epoch": 1.6976041794212482, + "grad_norm": 1.5312581062316895, + "learning_rate": 1.955848982620299e-05, + "loss": 0.6438, + "step": 10399 + }, + { + "epoch": 1.6977674380637526, + "grad_norm": 1.7737969160079956, + "learning_rate": 1.955839657056265e-05, + "loss": 0.7262, + "step": 10400 + }, + { + "epoch": 1.6979306967062568, + "grad_norm": 1.7462016344070435, + "learning_rate": 1.9558303305297014e-05, + "loss": 0.781, + "step": 10401 + }, + { + "epoch": 1.6980939553487613, + "grad_norm": 1.778375506401062, + "learning_rate": 1.9558210030406165e-05, + "loss": 0.7123, + "step": 10402 + }, + { + "epoch": 1.6982572139912655, + "grad_norm": 1.536126971244812, + "learning_rate": 1.9558116745890202e-05, + "loss": 0.6312, + "step": 10403 + }, + { + "epoch": 1.69842047263377, + "grad_norm": 1.692869782447815, + "learning_rate": 1.9558023451749214e-05, + "loss": 0.7088, + "step": 10404 + }, + { + "epoch": 1.6985837312762744, + "grad_norm": 1.5854151248931885, + "learning_rate": 1.9557930147983303e-05, + "loss": 0.6595, + "step": 10405 + }, + { + "epoch": 1.6987469899187788, + "grad_norm": 1.4214800596237183, + "learning_rate": 1.9557836834592557e-05, + "loss": 0.5733, + "step": 10406 + }, + { + "epoch": 1.6989102485612833, + "grad_norm": 1.7989214658737183, + "learning_rate": 1.9557743511577073e-05, + "loss": 0.618, + "step": 10407 + }, + { + "epoch": 1.6990735072037877, + "grad_norm": 1.518678903579712, + "learning_rate": 1.955765017893694e-05, + "loss": 0.6208, + "step": 10408 + }, + { + "epoch": 1.6992367658462921, + "grad_norm": 2.008467674255371, + "learning_rate": 1.9557556836672264e-05, + "loss": 0.773, + "step": 10409 + }, + { + "epoch": 1.6994000244887963, + "grad_norm": 1.423970341682434, + "learning_rate": 1.9557463484783125e-05, + "loss": 0.5142, + "step": 10410 + }, + { + "epoch": 1.6995632831313008, + "grad_norm": 1.3258861303329468, + "learning_rate": 1.9557370123269624e-05, + "loss": 0.5938, + "step": 10411 + }, + { + "epoch": 1.699726541773805, + "grad_norm": 1.4250050783157349, + "learning_rate": 1.9557276752131855e-05, + "loss": 0.6355, + "step": 10412 + }, + { + "epoch": 1.6998898004163094, + "grad_norm": 1.88783860206604, + "learning_rate": 1.9557183371369907e-05, + "loss": 0.7328, + "step": 10413 + }, + { + "epoch": 1.7000530590588139, + "grad_norm": 1.9888169765472412, + "learning_rate": 1.9557089980983882e-05, + "loss": 0.7534, + "step": 10414 + }, + { + "epoch": 1.7002163177013183, + "grad_norm": 2.227163314819336, + "learning_rate": 1.9556996580973866e-05, + "loss": 0.6676, + "step": 10415 + }, + { + "epoch": 1.7003795763438228, + "grad_norm": 1.3720204830169678, + "learning_rate": 1.9556903171339963e-05, + "loss": 0.4934, + "step": 10416 + }, + { + "epoch": 1.7005428349863272, + "grad_norm": 1.652794361114502, + "learning_rate": 1.9556809752082255e-05, + "loss": 0.6418, + "step": 10417 + }, + { + "epoch": 1.7007060936288316, + "grad_norm": 1.6741639375686646, + "learning_rate": 1.9556716323200846e-05, + "loss": 0.6612, + "step": 10418 + }, + { + "epoch": 1.7008693522713358, + "grad_norm": 2.026855230331421, + "learning_rate": 1.9556622884695825e-05, + "loss": 0.7411, + "step": 10419 + }, + { + "epoch": 1.7010326109138403, + "grad_norm": 1.985580325126648, + "learning_rate": 1.9556529436567287e-05, + "loss": 0.6397, + "step": 10420 + }, + { + "epoch": 1.7011958695563445, + "grad_norm": 1.7391304969787598, + "learning_rate": 1.9556435978815326e-05, + "loss": 0.6854, + "step": 10421 + }, + { + "epoch": 1.701359128198849, + "grad_norm": 2.043095111846924, + "learning_rate": 1.9556342511440034e-05, + "loss": 0.8242, + "step": 10422 + }, + { + "epoch": 1.7015223868413534, + "grad_norm": 2.246267318725586, + "learning_rate": 1.955624903444151e-05, + "loss": 0.8087, + "step": 10423 + }, + { + "epoch": 1.7016856454838578, + "grad_norm": 1.598379135131836, + "learning_rate": 1.9556155547819847e-05, + "loss": 0.6944, + "step": 10424 + }, + { + "epoch": 1.7018489041263622, + "grad_norm": 1.617615818977356, + "learning_rate": 1.9556062051575138e-05, + "loss": 0.5413, + "step": 10425 + }, + { + "epoch": 1.7020121627688667, + "grad_norm": 1.6692211627960205, + "learning_rate": 1.9555968545707474e-05, + "loss": 0.6898, + "step": 10426 + }, + { + "epoch": 1.7021754214113711, + "grad_norm": 1.9588664770126343, + "learning_rate": 1.9555875030216957e-05, + "loss": 0.7431, + "step": 10427 + }, + { + "epoch": 1.7023386800538753, + "grad_norm": 1.6091630458831787, + "learning_rate": 1.9555781505103674e-05, + "loss": 0.6907, + "step": 10428 + }, + { + "epoch": 1.7025019386963798, + "grad_norm": 1.8552803993225098, + "learning_rate": 1.955568797036772e-05, + "loss": 0.6197, + "step": 10429 + }, + { + "epoch": 1.702665197338884, + "grad_norm": 1.7571922540664673, + "learning_rate": 1.9555594426009193e-05, + "loss": 0.7279, + "step": 10430 + }, + { + "epoch": 1.7028284559813884, + "grad_norm": 1.8002344369888306, + "learning_rate": 1.9555500872028184e-05, + "loss": 0.742, + "step": 10431 + }, + { + "epoch": 1.7029917146238929, + "grad_norm": 1.7845065593719482, + "learning_rate": 1.9555407308424786e-05, + "loss": 0.6341, + "step": 10432 + }, + { + "epoch": 1.7031549732663973, + "grad_norm": 1.9372565746307373, + "learning_rate": 1.95553137351991e-05, + "loss": 0.7339, + "step": 10433 + }, + { + "epoch": 1.7033182319089017, + "grad_norm": 1.5389457941055298, + "learning_rate": 1.9555220152351212e-05, + "loss": 0.6166, + "step": 10434 + }, + { + "epoch": 1.7034814905514062, + "grad_norm": 1.7296741008758545, + "learning_rate": 1.9555126559881222e-05, + "loss": 0.6789, + "step": 10435 + }, + { + "epoch": 1.7036447491939104, + "grad_norm": 1.4303754568099976, + "learning_rate": 1.9555032957789223e-05, + "loss": 0.5658, + "step": 10436 + }, + { + "epoch": 1.7038080078364148, + "grad_norm": 1.2684929370880127, + "learning_rate": 1.9554939346075302e-05, + "loss": 0.4861, + "step": 10437 + }, + { + "epoch": 1.7039712664789193, + "grad_norm": 1.8279582262039185, + "learning_rate": 1.9554845724739565e-05, + "loss": 0.7368, + "step": 10438 + }, + { + "epoch": 1.7041345251214235, + "grad_norm": 1.4415745735168457, + "learning_rate": 1.9554752093782102e-05, + "loss": 0.5763, + "step": 10439 + }, + { + "epoch": 1.704297783763928, + "grad_norm": 1.8117859363555908, + "learning_rate": 1.9554658453203003e-05, + "loss": 0.6321, + "step": 10440 + }, + { + "epoch": 1.7044610424064324, + "grad_norm": 1.9942576885223389, + "learning_rate": 1.9554564803002364e-05, + "loss": 0.6893, + "step": 10441 + }, + { + "epoch": 1.7046243010489368, + "grad_norm": 2.0419869422912598, + "learning_rate": 1.9554471143180286e-05, + "loss": 0.909, + "step": 10442 + }, + { + "epoch": 1.7047875596914412, + "grad_norm": 1.8236314058303833, + "learning_rate": 1.9554377473736858e-05, + "loss": 0.8843, + "step": 10443 + }, + { + "epoch": 1.7049508183339457, + "grad_norm": 1.4909166097640991, + "learning_rate": 1.955428379467217e-05, + "loss": 0.6325, + "step": 10444 + }, + { + "epoch": 1.70511407697645, + "grad_norm": 1.6972407102584839, + "learning_rate": 1.955419010598632e-05, + "loss": 0.628, + "step": 10445 + }, + { + "epoch": 1.7052773356189543, + "grad_norm": 1.5303136110305786, + "learning_rate": 1.9554096407679406e-05, + "loss": 0.6242, + "step": 10446 + }, + { + "epoch": 1.7054405942614586, + "grad_norm": 1.540179967880249, + "learning_rate": 1.955400269975152e-05, + "loss": 0.545, + "step": 10447 + }, + { + "epoch": 1.705603852903963, + "grad_norm": 1.9699978828430176, + "learning_rate": 1.9553908982202758e-05, + "loss": 0.7026, + "step": 10448 + }, + { + "epoch": 1.7057671115464674, + "grad_norm": 1.5259641408920288, + "learning_rate": 1.9553815255033208e-05, + "loss": 0.4893, + "step": 10449 + }, + { + "epoch": 1.7059303701889719, + "grad_norm": 1.7998427152633667, + "learning_rate": 1.955372151824297e-05, + "loss": 0.6406, + "step": 10450 + }, + { + "epoch": 1.7060936288314763, + "grad_norm": 1.4915443658828735, + "learning_rate": 1.9553627771832135e-05, + "loss": 0.5489, + "step": 10451 + }, + { + "epoch": 1.7062568874739807, + "grad_norm": 2.0249814987182617, + "learning_rate": 1.95535340158008e-05, + "loss": 0.7057, + "step": 10452 + }, + { + "epoch": 1.7064201461164852, + "grad_norm": 1.745384931564331, + "learning_rate": 1.955344025014906e-05, + "loss": 0.5929, + "step": 10453 + }, + { + "epoch": 1.7065834047589894, + "grad_norm": 1.53267240524292, + "learning_rate": 1.9553346474877008e-05, + "loss": 0.6604, + "step": 10454 + }, + { + "epoch": 1.7067466634014938, + "grad_norm": 1.744869351387024, + "learning_rate": 1.955325268998474e-05, + "loss": 0.7361, + "step": 10455 + }, + { + "epoch": 1.706909922043998, + "grad_norm": 2.003955364227295, + "learning_rate": 1.955315889547235e-05, + "loss": 0.8103, + "step": 10456 + }, + { + "epoch": 1.7070731806865025, + "grad_norm": 1.9372624158859253, + "learning_rate": 1.9553065091339925e-05, + "loss": 0.8346, + "step": 10457 + }, + { + "epoch": 1.707236439329007, + "grad_norm": 1.4719704389572144, + "learning_rate": 1.9552971277587572e-05, + "loss": 0.63, + "step": 10458 + }, + { + "epoch": 1.7073996979715114, + "grad_norm": 1.672316312789917, + "learning_rate": 1.9552877454215378e-05, + "loss": 0.699, + "step": 10459 + }, + { + "epoch": 1.7075629566140158, + "grad_norm": 1.620107889175415, + "learning_rate": 1.9552783621223437e-05, + "loss": 0.6643, + "step": 10460 + }, + { + "epoch": 1.7077262152565202, + "grad_norm": 1.8448437452316284, + "learning_rate": 1.9552689778611848e-05, + "loss": 0.6809, + "step": 10461 + }, + { + "epoch": 1.7078894738990247, + "grad_norm": 1.763595461845398, + "learning_rate": 1.9552595926380703e-05, + "loss": 0.7254, + "step": 10462 + }, + { + "epoch": 1.708052732541529, + "grad_norm": 1.423767328262329, + "learning_rate": 1.9552502064530096e-05, + "loss": 0.6175, + "step": 10463 + }, + { + "epoch": 1.7082159911840333, + "grad_norm": 1.5384984016418457, + "learning_rate": 1.9552408193060118e-05, + "loss": 0.628, + "step": 10464 + }, + { + "epoch": 1.7083792498265375, + "grad_norm": 1.9554121494293213, + "learning_rate": 1.9552314311970875e-05, + "loss": 0.741, + "step": 10465 + }, + { + "epoch": 1.708542508469042, + "grad_norm": 1.7623236179351807, + "learning_rate": 1.9552220421262448e-05, + "loss": 0.8253, + "step": 10466 + }, + { + "epoch": 1.7087057671115464, + "grad_norm": 1.346286416053772, + "learning_rate": 1.955212652093494e-05, + "loss": 0.5285, + "step": 10467 + }, + { + "epoch": 1.7088690257540509, + "grad_norm": 2.1291158199310303, + "learning_rate": 1.9552032610988442e-05, + "loss": 0.8457, + "step": 10468 + }, + { + "epoch": 1.7090322843965553, + "grad_norm": 1.638199806213379, + "learning_rate": 1.955193869142305e-05, + "loss": 0.6124, + "step": 10469 + }, + { + "epoch": 1.7091955430390597, + "grad_norm": 1.5657198429107666, + "learning_rate": 1.955184476223886e-05, + "loss": 0.5607, + "step": 10470 + }, + { + "epoch": 1.7093588016815642, + "grad_norm": 1.9055685997009277, + "learning_rate": 1.9551750823435963e-05, + "loss": 0.8028, + "step": 10471 + }, + { + "epoch": 1.7095220603240684, + "grad_norm": 1.5759227275848389, + "learning_rate": 1.9551656875014454e-05, + "loss": 0.6475, + "step": 10472 + }, + { + "epoch": 1.7096853189665728, + "grad_norm": 1.6537609100341797, + "learning_rate": 1.9551562916974433e-05, + "loss": 0.6772, + "step": 10473 + }, + { + "epoch": 1.709848577609077, + "grad_norm": 1.5753872394561768, + "learning_rate": 1.9551468949315987e-05, + "loss": 0.5938, + "step": 10474 + }, + { + "epoch": 1.7100118362515815, + "grad_norm": 1.4278922080993652, + "learning_rate": 1.9551374972039218e-05, + "loss": 0.6247, + "step": 10475 + }, + { + "epoch": 1.710175094894086, + "grad_norm": 1.8292039632797241, + "learning_rate": 1.9551280985144213e-05, + "loss": 0.6646, + "step": 10476 + }, + { + "epoch": 1.7103383535365904, + "grad_norm": 1.732792615890503, + "learning_rate": 1.955118698863107e-05, + "loss": 0.6285, + "step": 10477 + }, + { + "epoch": 1.7105016121790948, + "grad_norm": 1.8926588296890259, + "learning_rate": 1.955109298249989e-05, + "loss": 0.7023, + "step": 10478 + }, + { + "epoch": 1.7106648708215992, + "grad_norm": 1.47390878200531, + "learning_rate": 1.955099896675076e-05, + "loss": 0.5713, + "step": 10479 + }, + { + "epoch": 1.7108281294641035, + "grad_norm": 1.9202725887298584, + "learning_rate": 1.955090494138377e-05, + "loss": 0.7725, + "step": 10480 + }, + { + "epoch": 1.710991388106608, + "grad_norm": 1.5037055015563965, + "learning_rate": 1.955081090639903e-05, + "loss": 0.599, + "step": 10481 + }, + { + "epoch": 1.7111546467491123, + "grad_norm": 1.7637490034103394, + "learning_rate": 1.9550716861796623e-05, + "loss": 0.793, + "step": 10482 + }, + { + "epoch": 1.7113179053916165, + "grad_norm": 1.5457683801651, + "learning_rate": 1.9550622807576647e-05, + "loss": 0.6077, + "step": 10483 + }, + { + "epoch": 1.711481164034121, + "grad_norm": 1.6197036504745483, + "learning_rate": 1.9550528743739196e-05, + "loss": 0.682, + "step": 10484 + }, + { + "epoch": 1.7116444226766254, + "grad_norm": 1.9876668453216553, + "learning_rate": 1.9550434670284362e-05, + "loss": 0.7511, + "step": 10485 + }, + { + "epoch": 1.7118076813191299, + "grad_norm": 1.875082015991211, + "learning_rate": 1.9550340587212246e-05, + "loss": 0.8692, + "step": 10486 + }, + { + "epoch": 1.7119709399616343, + "grad_norm": 1.7174609899520874, + "learning_rate": 1.9550246494522938e-05, + "loss": 0.6743, + "step": 10487 + }, + { + "epoch": 1.7121341986041387, + "grad_norm": 1.7721953392028809, + "learning_rate": 1.9550152392216536e-05, + "loss": 0.6547, + "step": 10488 + }, + { + "epoch": 1.712297457246643, + "grad_norm": 1.6389391422271729, + "learning_rate": 1.9550058280293132e-05, + "loss": 0.7924, + "step": 10489 + }, + { + "epoch": 1.7124607158891474, + "grad_norm": 1.6384506225585938, + "learning_rate": 1.9549964158752825e-05, + "loss": 0.6512, + "step": 10490 + }, + { + "epoch": 1.7126239745316516, + "grad_norm": 1.6584464311599731, + "learning_rate": 1.9549870027595702e-05, + "loss": 0.7006, + "step": 10491 + }, + { + "epoch": 1.712787233174156, + "grad_norm": 1.862842082977295, + "learning_rate": 1.954977588682186e-05, + "loss": 0.732, + "step": 10492 + }, + { + "epoch": 1.7129504918166605, + "grad_norm": 2.126932382583618, + "learning_rate": 1.95496817364314e-05, + "loss": 0.8006, + "step": 10493 + }, + { + "epoch": 1.713113750459165, + "grad_norm": 1.738707184791565, + "learning_rate": 1.9549587576424418e-05, + "loss": 0.6475, + "step": 10494 + }, + { + "epoch": 1.7132770091016694, + "grad_norm": 1.613210678100586, + "learning_rate": 1.9549493406800997e-05, + "loss": 0.6921, + "step": 10495 + }, + { + "epoch": 1.7134402677441738, + "grad_norm": 1.5360653400421143, + "learning_rate": 1.9549399227561243e-05, + "loss": 0.7315, + "step": 10496 + }, + { + "epoch": 1.7136035263866782, + "grad_norm": 1.7634958028793335, + "learning_rate": 1.954930503870524e-05, + "loss": 0.6274, + "step": 10497 + }, + { + "epoch": 1.7137667850291824, + "grad_norm": 1.334031105041504, + "learning_rate": 1.9549210840233095e-05, + "loss": 0.5568, + "step": 10498 + }, + { + "epoch": 1.7139300436716869, + "grad_norm": 1.8970280885696411, + "learning_rate": 1.95491166321449e-05, + "loss": 0.6877, + "step": 10499 + }, + { + "epoch": 1.714093302314191, + "grad_norm": 1.5677272081375122, + "learning_rate": 1.9549022414440738e-05, + "loss": 0.6232, + "step": 10500 + }, + { + "epoch": 1.7142565609566955, + "grad_norm": 1.9785327911376953, + "learning_rate": 1.954892818712072e-05, + "loss": 0.7382, + "step": 10501 + }, + { + "epoch": 1.7144198195992, + "grad_norm": 1.5939688682556152, + "learning_rate": 1.9548833950184933e-05, + "loss": 0.7328, + "step": 10502 + }, + { + "epoch": 1.7145830782417044, + "grad_norm": 1.7605940103530884, + "learning_rate": 1.9548739703633472e-05, + "loss": 0.6365, + "step": 10503 + }, + { + "epoch": 1.7147463368842089, + "grad_norm": 1.9104679822921753, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.7344, + "step": 10504 + }, + { + "epoch": 1.7149095955267133, + "grad_norm": 1.8453248739242554, + "learning_rate": 1.954855118168391e-05, + "loss": 0.6519, + "step": 10505 + }, + { + "epoch": 1.7150728541692177, + "grad_norm": 1.4626961946487427, + "learning_rate": 1.9548456906285996e-05, + "loss": 0.6188, + "step": 10506 + }, + { + "epoch": 1.715236112811722, + "grad_norm": 1.644094705581665, + "learning_rate": 1.954836262127279e-05, + "loss": 0.6216, + "step": 10507 + }, + { + "epoch": 1.7153993714542264, + "grad_norm": 1.6551867723464966, + "learning_rate": 1.9548268326644385e-05, + "loss": 0.5509, + "step": 10508 + }, + { + "epoch": 1.7155626300967306, + "grad_norm": 1.7874832153320312, + "learning_rate": 1.954817402240088e-05, + "loss": 0.5736, + "step": 10509 + }, + { + "epoch": 1.715725888739235, + "grad_norm": 1.7282418012619019, + "learning_rate": 1.9548079708542365e-05, + "loss": 0.6743, + "step": 10510 + }, + { + "epoch": 1.7158891473817395, + "grad_norm": 1.7626570463180542, + "learning_rate": 1.9547985385068932e-05, + "loss": 0.7101, + "step": 10511 + }, + { + "epoch": 1.716052406024244, + "grad_norm": 1.702007532119751, + "learning_rate": 1.9547891051980686e-05, + "loss": 0.6577, + "step": 10512 + }, + { + "epoch": 1.7162156646667484, + "grad_norm": 1.5831881761550903, + "learning_rate": 1.954779670927771e-05, + "loss": 0.5977, + "step": 10513 + }, + { + "epoch": 1.7163789233092528, + "grad_norm": 1.6448665857315063, + "learning_rate": 1.9547702356960112e-05, + "loss": 0.544, + "step": 10514 + }, + { + "epoch": 1.7165421819517572, + "grad_norm": 1.9218090772628784, + "learning_rate": 1.954760799502798e-05, + "loss": 0.7229, + "step": 10515 + }, + { + "epoch": 1.7167054405942614, + "grad_norm": 1.7794705629348755, + "learning_rate": 1.95475136234814e-05, + "loss": 0.6561, + "step": 10516 + }, + { + "epoch": 1.7168686992367659, + "grad_norm": 1.788110375404358, + "learning_rate": 1.9547419242320484e-05, + "loss": 0.5809, + "step": 10517 + }, + { + "epoch": 1.71703195787927, + "grad_norm": 2.137939453125, + "learning_rate": 1.9547324851545316e-05, + "loss": 0.7334, + "step": 10518 + }, + { + "epoch": 1.7171952165217745, + "grad_norm": 1.6484497785568237, + "learning_rate": 1.9547230451156e-05, + "loss": 0.6901, + "step": 10519 + }, + { + "epoch": 1.717358475164279, + "grad_norm": 1.7820580005645752, + "learning_rate": 1.954713604115262e-05, + "loss": 0.7329, + "step": 10520 + }, + { + "epoch": 1.7175217338067834, + "grad_norm": 1.6823458671569824, + "learning_rate": 1.954704162153528e-05, + "loss": 0.5886, + "step": 10521 + }, + { + "epoch": 1.7176849924492879, + "grad_norm": 1.5869474411010742, + "learning_rate": 1.9546947192304068e-05, + "loss": 0.638, + "step": 10522 + }, + { + "epoch": 1.7178482510917923, + "grad_norm": 1.735540509223938, + "learning_rate": 1.9546852753459086e-05, + "loss": 0.5968, + "step": 10523 + }, + { + "epoch": 1.7180115097342965, + "grad_norm": 1.676934838294983, + "learning_rate": 1.9546758305000422e-05, + "loss": 0.697, + "step": 10524 + }, + { + "epoch": 1.718174768376801, + "grad_norm": 2.215573787689209, + "learning_rate": 1.954666384692818e-05, + "loss": 0.9233, + "step": 10525 + }, + { + "epoch": 1.7183380270193054, + "grad_norm": 1.4089696407318115, + "learning_rate": 1.9546569379242446e-05, + "loss": 0.5532, + "step": 10526 + }, + { + "epoch": 1.7185012856618096, + "grad_norm": 1.9756587743759155, + "learning_rate": 1.954647490194332e-05, + "loss": 0.8333, + "step": 10527 + }, + { + "epoch": 1.718664544304314, + "grad_norm": 1.6064860820770264, + "learning_rate": 1.95463804150309e-05, + "loss": 0.6479, + "step": 10528 + }, + { + "epoch": 1.7188278029468185, + "grad_norm": 1.8488706350326538, + "learning_rate": 1.9546285918505274e-05, + "loss": 0.7614, + "step": 10529 + }, + { + "epoch": 1.718991061589323, + "grad_norm": 1.8997039794921875, + "learning_rate": 1.9546191412366543e-05, + "loss": 0.7544, + "step": 10530 + }, + { + "epoch": 1.7191543202318273, + "grad_norm": 1.8627084493637085, + "learning_rate": 1.9546096896614795e-05, + "loss": 0.7237, + "step": 10531 + }, + { + "epoch": 1.7193175788743318, + "grad_norm": 1.663925290107727, + "learning_rate": 1.9546002371250134e-05, + "loss": 0.5658, + "step": 10532 + }, + { + "epoch": 1.719480837516836, + "grad_norm": 1.6518642902374268, + "learning_rate": 1.954590783627265e-05, + "loss": 0.6932, + "step": 10533 + }, + { + "epoch": 1.7196440961593404, + "grad_norm": 1.8122893571853638, + "learning_rate": 1.9545813291682437e-05, + "loss": 0.7529, + "step": 10534 + }, + { + "epoch": 1.7198073548018447, + "grad_norm": 1.7759326696395874, + "learning_rate": 1.9545718737479594e-05, + "loss": 0.797, + "step": 10535 + }, + { + "epoch": 1.719970613444349, + "grad_norm": 1.7745518684387207, + "learning_rate": 1.9545624173664218e-05, + "loss": 0.7905, + "step": 10536 + }, + { + "epoch": 1.7201338720868535, + "grad_norm": 1.8253284692764282, + "learning_rate": 1.95455296002364e-05, + "loss": 0.6049, + "step": 10537 + }, + { + "epoch": 1.720297130729358, + "grad_norm": 1.4455910921096802, + "learning_rate": 1.9545435017196233e-05, + "loss": 0.6269, + "step": 10538 + }, + { + "epoch": 1.7204603893718624, + "grad_norm": 1.3832334280014038, + "learning_rate": 1.9545340424543816e-05, + "loss": 0.5732, + "step": 10539 + }, + { + "epoch": 1.7206236480143668, + "grad_norm": 1.675455093383789, + "learning_rate": 1.9545245822279243e-05, + "loss": 0.7821, + "step": 10540 + }, + { + "epoch": 1.7207869066568713, + "grad_norm": 1.5102587938308716, + "learning_rate": 1.9545151210402615e-05, + "loss": 0.6382, + "step": 10541 + }, + { + "epoch": 1.7209501652993755, + "grad_norm": 1.5390368700027466, + "learning_rate": 1.954505658891402e-05, + "loss": 0.6797, + "step": 10542 + }, + { + "epoch": 1.72111342394188, + "grad_norm": 1.714815378189087, + "learning_rate": 1.9544961957813554e-05, + "loss": 0.6286, + "step": 10543 + }, + { + "epoch": 1.7212766825843842, + "grad_norm": 1.7877800464630127, + "learning_rate": 1.9544867317101315e-05, + "loss": 0.6626, + "step": 10544 + }, + { + "epoch": 1.7214399412268886, + "grad_norm": 1.5722830295562744, + "learning_rate": 1.9544772666777397e-05, + "loss": 0.621, + "step": 10545 + }, + { + "epoch": 1.721603199869393, + "grad_norm": 1.7013568878173828, + "learning_rate": 1.9544678006841894e-05, + "loss": 0.6813, + "step": 10546 + }, + { + "epoch": 1.7217664585118975, + "grad_norm": 1.8173472881317139, + "learning_rate": 1.9544583337294902e-05, + "loss": 0.6543, + "step": 10547 + }, + { + "epoch": 1.721929717154402, + "grad_norm": 1.436177134513855, + "learning_rate": 1.9544488658136522e-05, + "loss": 0.5965, + "step": 10548 + }, + { + "epoch": 1.7220929757969063, + "grad_norm": 1.8009873628616333, + "learning_rate": 1.954439396936684e-05, + "loss": 0.7044, + "step": 10549 + }, + { + "epoch": 1.7222562344394108, + "grad_norm": 1.9202760457992554, + "learning_rate": 1.9544299270985958e-05, + "loss": 0.6664, + "step": 10550 + }, + { + "epoch": 1.722419493081915, + "grad_norm": 1.5490577220916748, + "learning_rate": 1.954420456299397e-05, + "loss": 0.6142, + "step": 10551 + }, + { + "epoch": 1.7225827517244194, + "grad_norm": 1.6230870485305786, + "learning_rate": 1.954410984539097e-05, + "loss": 0.8167, + "step": 10552 + }, + { + "epoch": 1.7227460103669237, + "grad_norm": 1.6663614511489868, + "learning_rate": 1.954401511817705e-05, + "loss": 0.6875, + "step": 10553 + }, + { + "epoch": 1.722909269009428, + "grad_norm": 1.8314623832702637, + "learning_rate": 1.954392038135231e-05, + "loss": 0.765, + "step": 10554 + }, + { + "epoch": 1.7230725276519325, + "grad_norm": 1.8969863653182983, + "learning_rate": 1.954382563491685e-05, + "loss": 0.7448, + "step": 10555 + }, + { + "epoch": 1.723235786294437, + "grad_norm": 1.8221365213394165, + "learning_rate": 1.9543730878870757e-05, + "loss": 0.7622, + "step": 10556 + }, + { + "epoch": 1.7233990449369414, + "grad_norm": 1.4239269495010376, + "learning_rate": 1.954363611321413e-05, + "loss": 0.5685, + "step": 10557 + }, + { + "epoch": 1.7235623035794458, + "grad_norm": 1.8908969163894653, + "learning_rate": 1.9543541337947063e-05, + "loss": 0.7043, + "step": 10558 + }, + { + "epoch": 1.7237255622219503, + "grad_norm": 2.024406909942627, + "learning_rate": 1.954344655306965e-05, + "loss": 0.7301, + "step": 10559 + }, + { + "epoch": 1.7238888208644545, + "grad_norm": 1.7487016916275024, + "learning_rate": 1.9543351758581995e-05, + "loss": 0.7452, + "step": 10560 + }, + { + "epoch": 1.724052079506959, + "grad_norm": 1.455611228942871, + "learning_rate": 1.9543256954484185e-05, + "loss": 0.6436, + "step": 10561 + }, + { + "epoch": 1.7242153381494632, + "grad_norm": 1.5850998163223267, + "learning_rate": 1.9543162140776316e-05, + "loss": 0.5881, + "step": 10562 + }, + { + "epoch": 1.7243785967919676, + "grad_norm": 2.248586654663086, + "learning_rate": 1.9543067317458485e-05, + "loss": 1.486, + "step": 10563 + }, + { + "epoch": 1.724541855434472, + "grad_norm": 1.5249744653701782, + "learning_rate": 1.954297248453079e-05, + "loss": 0.5178, + "step": 10564 + }, + { + "epoch": 1.7247051140769765, + "grad_norm": 1.6350375413894653, + "learning_rate": 1.9542877641993324e-05, + "loss": 0.6248, + "step": 10565 + }, + { + "epoch": 1.724868372719481, + "grad_norm": 1.7565058469772339, + "learning_rate": 1.954278278984618e-05, + "loss": 0.6314, + "step": 10566 + }, + { + "epoch": 1.7250316313619853, + "grad_norm": 1.6368539333343506, + "learning_rate": 1.954268792808946e-05, + "loss": 0.6719, + "step": 10567 + }, + { + "epoch": 1.7251948900044898, + "grad_norm": 1.6005159616470337, + "learning_rate": 1.9542593056723254e-05, + "loss": 0.6295, + "step": 10568 + }, + { + "epoch": 1.725358148646994, + "grad_norm": 1.4432224035263062, + "learning_rate": 1.9542498175747657e-05, + "loss": 0.6748, + "step": 10569 + }, + { + "epoch": 1.7255214072894984, + "grad_norm": 1.737804889678955, + "learning_rate": 1.954240328516277e-05, + "loss": 0.6874, + "step": 10570 + }, + { + "epoch": 1.7256846659320026, + "grad_norm": 1.7625536918640137, + "learning_rate": 1.9542308384968685e-05, + "loss": 0.7523, + "step": 10571 + }, + { + "epoch": 1.725847924574507, + "grad_norm": 1.730936884880066, + "learning_rate": 1.95422134751655e-05, + "loss": 0.7392, + "step": 10572 + }, + { + "epoch": 1.7260111832170115, + "grad_norm": 1.934177041053772, + "learning_rate": 1.9542118555753302e-05, + "loss": 0.7962, + "step": 10573 + }, + { + "epoch": 1.726174441859516, + "grad_norm": 1.6105421781539917, + "learning_rate": 1.9542023626732197e-05, + "loss": 0.5954, + "step": 10574 + }, + { + "epoch": 1.7263377005020204, + "grad_norm": 1.7367273569107056, + "learning_rate": 1.9541928688102278e-05, + "loss": 0.6114, + "step": 10575 + }, + { + "epoch": 1.7265009591445248, + "grad_norm": 1.6149802207946777, + "learning_rate": 1.954183373986364e-05, + "loss": 0.6535, + "step": 10576 + }, + { + "epoch": 1.726664217787029, + "grad_norm": 2.0760560035705566, + "learning_rate": 1.954173878201638e-05, + "loss": 0.752, + "step": 10577 + }, + { + "epoch": 1.7268274764295335, + "grad_norm": 1.5624138116836548, + "learning_rate": 1.9541643814560584e-05, + "loss": 0.6165, + "step": 10578 + }, + { + "epoch": 1.726990735072038, + "grad_norm": 1.9820643663406372, + "learning_rate": 1.954154883749636e-05, + "loss": 0.684, + "step": 10579 + }, + { + "epoch": 1.7271539937145421, + "grad_norm": 1.8963795900344849, + "learning_rate": 1.9541453850823796e-05, + "loss": 0.7319, + "step": 10580 + }, + { + "epoch": 1.7273172523570466, + "grad_norm": 1.5788037776947021, + "learning_rate": 1.9541358854542993e-05, + "loss": 0.735, + "step": 10581 + }, + { + "epoch": 1.727480510999551, + "grad_norm": 1.7232866287231445, + "learning_rate": 1.9541263848654044e-05, + "loss": 0.6275, + "step": 10582 + }, + { + "epoch": 1.7276437696420555, + "grad_norm": 1.9382745027542114, + "learning_rate": 1.9541168833157044e-05, + "loss": 0.8312, + "step": 10583 + }, + { + "epoch": 1.72780702828456, + "grad_norm": 1.3694684505462646, + "learning_rate": 1.954107380805209e-05, + "loss": 0.6456, + "step": 10584 + }, + { + "epoch": 1.7279702869270643, + "grad_norm": 1.4017574787139893, + "learning_rate": 1.954097877333928e-05, + "loss": 0.5594, + "step": 10585 + }, + { + "epoch": 1.7281335455695686, + "grad_norm": 1.7509292364120483, + "learning_rate": 1.95408837290187e-05, + "loss": 0.5151, + "step": 10586 + }, + { + "epoch": 1.728296804212073, + "grad_norm": 1.4830998182296753, + "learning_rate": 1.9540788675090458e-05, + "loss": 0.5924, + "step": 10587 + }, + { + "epoch": 1.7284600628545772, + "grad_norm": 1.7681158781051636, + "learning_rate": 1.9540693611554645e-05, + "loss": 0.7085, + "step": 10588 + }, + { + "epoch": 1.7286233214970816, + "grad_norm": 1.490742564201355, + "learning_rate": 1.954059853841135e-05, + "loss": 0.5884, + "step": 10589 + }, + { + "epoch": 1.728786580139586, + "grad_norm": 1.3024935722351074, + "learning_rate": 1.954050345566068e-05, + "loss": 0.517, + "step": 10590 + }, + { + "epoch": 1.7289498387820905, + "grad_norm": 1.6590701341629028, + "learning_rate": 1.9540408363302726e-05, + "loss": 0.6199, + "step": 10591 + }, + { + "epoch": 1.729113097424595, + "grad_norm": 1.4368504285812378, + "learning_rate": 1.954031326133758e-05, + "loss": 0.5192, + "step": 10592 + }, + { + "epoch": 1.7292763560670994, + "grad_norm": 1.7187329530715942, + "learning_rate": 1.9540218149765344e-05, + "loss": 0.6571, + "step": 10593 + }, + { + "epoch": 1.7294396147096038, + "grad_norm": 1.534321665763855, + "learning_rate": 1.9540123028586107e-05, + "loss": 0.6891, + "step": 10594 + }, + { + "epoch": 1.729602873352108, + "grad_norm": 1.7339712381362915, + "learning_rate": 1.9540027897799975e-05, + "loss": 0.767, + "step": 10595 + }, + { + "epoch": 1.7297661319946125, + "grad_norm": 1.9697129726409912, + "learning_rate": 1.9539932757407036e-05, + "loss": 0.7218, + "step": 10596 + }, + { + "epoch": 1.7299293906371167, + "grad_norm": 1.7139936685562134, + "learning_rate": 1.953983760740738e-05, + "loss": 0.6109, + "step": 10597 + }, + { + "epoch": 1.7300926492796211, + "grad_norm": 1.6778162717819214, + "learning_rate": 1.9539742447801115e-05, + "loss": 0.716, + "step": 10598 + }, + { + "epoch": 1.7302559079221256, + "grad_norm": 1.817650318145752, + "learning_rate": 1.9539647278588334e-05, + "loss": 0.6826, + "step": 10599 + }, + { + "epoch": 1.73041916656463, + "grad_norm": 1.484668493270874, + "learning_rate": 1.9539552099769128e-05, + "loss": 0.55, + "step": 10600 + }, + { + "epoch": 1.7305824252071345, + "grad_norm": 1.3935587406158447, + "learning_rate": 1.9539456911343596e-05, + "loss": 0.5205, + "step": 10601 + }, + { + "epoch": 1.730745683849639, + "grad_norm": 1.8158220052719116, + "learning_rate": 1.9539361713311833e-05, + "loss": 0.7207, + "step": 10602 + }, + { + "epoch": 1.7309089424921433, + "grad_norm": 1.341210126876831, + "learning_rate": 1.9539266505673938e-05, + "loss": 0.5224, + "step": 10603 + }, + { + "epoch": 1.7310722011346475, + "grad_norm": 1.9447590112686157, + "learning_rate": 1.953917128843e-05, + "loss": 0.6645, + "step": 10604 + }, + { + "epoch": 1.731235459777152, + "grad_norm": 1.3831266164779663, + "learning_rate": 1.9539076061580124e-05, + "loss": 0.6196, + "step": 10605 + }, + { + "epoch": 1.7313987184196562, + "grad_norm": 1.7333645820617676, + "learning_rate": 1.9538980825124395e-05, + "loss": 0.5062, + "step": 10606 + }, + { + "epoch": 1.7315619770621606, + "grad_norm": 1.782394528388977, + "learning_rate": 1.953888557906292e-05, + "loss": 0.6182, + "step": 10607 + }, + { + "epoch": 1.731725235704665, + "grad_norm": 2.1425974369049072, + "learning_rate": 1.9538790323395786e-05, + "loss": 0.7764, + "step": 10608 + }, + { + "epoch": 1.7318884943471695, + "grad_norm": 2.034274101257324, + "learning_rate": 1.9538695058123095e-05, + "loss": 0.69, + "step": 10609 + }, + { + "epoch": 1.732051752989674, + "grad_norm": 1.4751893281936646, + "learning_rate": 1.953859978324494e-05, + "loss": 0.5055, + "step": 10610 + }, + { + "epoch": 1.7322150116321784, + "grad_norm": 1.6922223567962646, + "learning_rate": 1.953850449876142e-05, + "loss": 0.7228, + "step": 10611 + }, + { + "epoch": 1.7323782702746828, + "grad_norm": 1.9303778409957886, + "learning_rate": 1.9538409204672624e-05, + "loss": 0.7537, + "step": 10612 + }, + { + "epoch": 1.732541528917187, + "grad_norm": 1.9266541004180908, + "learning_rate": 1.9538313900978654e-05, + "loss": 0.847, + "step": 10613 + }, + { + "epoch": 1.7327047875596915, + "grad_norm": 1.6379420757293701, + "learning_rate": 1.9538218587679605e-05, + "loss": 0.6329, + "step": 10614 + }, + { + "epoch": 1.7328680462021957, + "grad_norm": 1.579006552696228, + "learning_rate": 1.9538123264775572e-05, + "loss": 0.6304, + "step": 10615 + }, + { + "epoch": 1.7330313048447001, + "grad_norm": 1.603655219078064, + "learning_rate": 1.9538027932266653e-05, + "loss": 0.6743, + "step": 10616 + }, + { + "epoch": 1.7331945634872046, + "grad_norm": 1.7723690271377563, + "learning_rate": 1.953793259015294e-05, + "loss": 0.618, + "step": 10617 + }, + { + "epoch": 1.733357822129709, + "grad_norm": 1.7787483930587769, + "learning_rate": 1.9537837238434532e-05, + "loss": 0.6411, + "step": 10618 + }, + { + "epoch": 1.7335210807722135, + "grad_norm": 1.984925627708435, + "learning_rate": 1.9537741877111527e-05, + "loss": 0.7424, + "step": 10619 + }, + { + "epoch": 1.733684339414718, + "grad_norm": 1.7774463891983032, + "learning_rate": 1.9537646506184016e-05, + "loss": 0.7405, + "step": 10620 + }, + { + "epoch": 1.733847598057222, + "grad_norm": 1.6227116584777832, + "learning_rate": 1.9537551125652096e-05, + "loss": 0.7284, + "step": 10621 + }, + { + "epoch": 1.7340108566997265, + "grad_norm": 1.7611488103866577, + "learning_rate": 1.953745573551587e-05, + "loss": 0.7101, + "step": 10622 + }, + { + "epoch": 1.734174115342231, + "grad_norm": 1.513990044593811, + "learning_rate": 1.9537360335775425e-05, + "loss": 0.5484, + "step": 10623 + }, + { + "epoch": 1.7343373739847352, + "grad_norm": 1.768786072731018, + "learning_rate": 1.9537264926430856e-05, + "loss": 0.7907, + "step": 10624 + }, + { + "epoch": 1.7345006326272396, + "grad_norm": 2.017465829849243, + "learning_rate": 1.953716950748227e-05, + "loss": 0.7448, + "step": 10625 + }, + { + "epoch": 1.734663891269744, + "grad_norm": 1.4632030725479126, + "learning_rate": 1.9537074078929757e-05, + "loss": 0.5172, + "step": 10626 + }, + { + "epoch": 1.7348271499122485, + "grad_norm": 1.745485782623291, + "learning_rate": 1.953697864077341e-05, + "loss": 0.6497, + "step": 10627 + }, + { + "epoch": 1.734990408554753, + "grad_norm": 1.7179838418960571, + "learning_rate": 1.953688319301333e-05, + "loss": 0.5464, + "step": 10628 + }, + { + "epoch": 1.7351536671972574, + "grad_norm": 1.576981544494629, + "learning_rate": 1.9536787735649612e-05, + "loss": 0.6319, + "step": 10629 + }, + { + "epoch": 1.7353169258397616, + "grad_norm": 1.8889379501342773, + "learning_rate": 1.9536692268682348e-05, + "loss": 0.7445, + "step": 10630 + }, + { + "epoch": 1.735480184482266, + "grad_norm": 1.4420335292816162, + "learning_rate": 1.953659679211164e-05, + "loss": 0.5888, + "step": 10631 + }, + { + "epoch": 1.7356434431247703, + "grad_norm": 1.801537036895752, + "learning_rate": 1.9536501305937578e-05, + "loss": 0.712, + "step": 10632 + }, + { + "epoch": 1.7358067017672747, + "grad_norm": 1.8794673681259155, + "learning_rate": 1.9536405810160267e-05, + "loss": 0.6791, + "step": 10633 + }, + { + "epoch": 1.7359699604097791, + "grad_norm": 1.5929036140441895, + "learning_rate": 1.9536310304779797e-05, + "loss": 0.6131, + "step": 10634 + }, + { + "epoch": 1.7361332190522836, + "grad_norm": 1.7105309963226318, + "learning_rate": 1.953621478979626e-05, + "loss": 0.8359, + "step": 10635 + }, + { + "epoch": 1.736296477694788, + "grad_norm": 1.4507215023040771, + "learning_rate": 1.9536119265209763e-05, + "loss": 0.6477, + "step": 10636 + }, + { + "epoch": 1.7364597363372924, + "grad_norm": 1.645018458366394, + "learning_rate": 1.953602373102039e-05, + "loss": 0.7053, + "step": 10637 + }, + { + "epoch": 1.7366229949797969, + "grad_norm": 1.6468052864074707, + "learning_rate": 1.953592818722825e-05, + "loss": 0.7371, + "step": 10638 + }, + { + "epoch": 1.736786253622301, + "grad_norm": 1.5783240795135498, + "learning_rate": 1.953583263383343e-05, + "loss": 0.6765, + "step": 10639 + }, + { + "epoch": 1.7369495122648055, + "grad_norm": 1.2791000604629517, + "learning_rate": 1.9535737070836028e-05, + "loss": 0.5194, + "step": 10640 + }, + { + "epoch": 1.7371127709073098, + "grad_norm": 1.3839515447616577, + "learning_rate": 1.9535641498236145e-05, + "loss": 0.5557, + "step": 10641 + }, + { + "epoch": 1.7372760295498142, + "grad_norm": 1.5100871324539185, + "learning_rate": 1.953554591603387e-05, + "loss": 0.5942, + "step": 10642 + }, + { + "epoch": 1.7374392881923186, + "grad_norm": 1.7086377143859863, + "learning_rate": 1.9535450324229307e-05, + "loss": 0.5811, + "step": 10643 + }, + { + "epoch": 1.737602546834823, + "grad_norm": 1.68819260597229, + "learning_rate": 1.9535354722822545e-05, + "loss": 0.5964, + "step": 10644 + }, + { + "epoch": 1.7377658054773275, + "grad_norm": 1.4723917245864868, + "learning_rate": 1.9535259111813682e-05, + "loss": 0.6529, + "step": 10645 + }, + { + "epoch": 1.737929064119832, + "grad_norm": 1.4796161651611328, + "learning_rate": 1.9535163491202817e-05, + "loss": 0.6473, + "step": 10646 + }, + { + "epoch": 1.7380923227623364, + "grad_norm": 1.490499496459961, + "learning_rate": 1.9535067860990046e-05, + "loss": 0.5595, + "step": 10647 + }, + { + "epoch": 1.7382555814048406, + "grad_norm": 1.803398609161377, + "learning_rate": 1.9534972221175463e-05, + "loss": 0.6914, + "step": 10648 + }, + { + "epoch": 1.738418840047345, + "grad_norm": 1.7491391897201538, + "learning_rate": 1.9534876571759165e-05, + "loss": 0.7305, + "step": 10649 + }, + { + "epoch": 1.7385820986898493, + "grad_norm": 1.6702085733413696, + "learning_rate": 1.953478091274125e-05, + "loss": 0.7222, + "step": 10650 + }, + { + "epoch": 1.7387453573323537, + "grad_norm": 1.575265645980835, + "learning_rate": 1.9534685244121814e-05, + "loss": 0.5918, + "step": 10651 + }, + { + "epoch": 1.7389086159748581, + "grad_norm": 1.8249541521072388, + "learning_rate": 1.953458956590095e-05, + "loss": 0.7912, + "step": 10652 + }, + { + "epoch": 1.7390718746173626, + "grad_norm": 1.6756296157836914, + "learning_rate": 1.9534493878078756e-05, + "loss": 0.7391, + "step": 10653 + }, + { + "epoch": 1.739235133259867, + "grad_norm": 1.5395827293395996, + "learning_rate": 1.953439818065533e-05, + "loss": 0.6402, + "step": 10654 + }, + { + "epoch": 1.7393983919023714, + "grad_norm": 1.6690785884857178, + "learning_rate": 1.9534302473630774e-05, + "loss": 0.6763, + "step": 10655 + }, + { + "epoch": 1.7395616505448759, + "grad_norm": 2.1039669513702393, + "learning_rate": 1.953420675700517e-05, + "loss": 0.8104, + "step": 10656 + }, + { + "epoch": 1.73972490918738, + "grad_norm": 2.0114612579345703, + "learning_rate": 1.9534111030778623e-05, + "loss": 0.6973, + "step": 10657 + }, + { + "epoch": 1.7398881678298845, + "grad_norm": 1.9010531902313232, + "learning_rate": 1.9534015294951235e-05, + "loss": 0.8051, + "step": 10658 + }, + { + "epoch": 1.7400514264723888, + "grad_norm": 2.1739630699157715, + "learning_rate": 1.9533919549523092e-05, + "loss": 0.7386, + "step": 10659 + }, + { + "epoch": 1.7402146851148932, + "grad_norm": 1.7289904356002808, + "learning_rate": 1.9533823794494294e-05, + "loss": 0.6817, + "step": 10660 + }, + { + "epoch": 1.7403779437573976, + "grad_norm": 1.670079231262207, + "learning_rate": 1.9533728029864937e-05, + "loss": 0.6543, + "step": 10661 + }, + { + "epoch": 1.740541202399902, + "grad_norm": 1.462788701057434, + "learning_rate": 1.953363225563512e-05, + "loss": 0.593, + "step": 10662 + }, + { + "epoch": 1.7407044610424065, + "grad_norm": 1.5636025667190552, + "learning_rate": 1.9533536471804938e-05, + "loss": 0.6495, + "step": 10663 + }, + { + "epoch": 1.740867719684911, + "grad_norm": 1.6936249732971191, + "learning_rate": 1.9533440678374486e-05, + "loss": 0.7034, + "step": 10664 + }, + { + "epoch": 1.7410309783274152, + "grad_norm": 1.4976593255996704, + "learning_rate": 1.9533344875343863e-05, + "loss": 0.6615, + "step": 10665 + }, + { + "epoch": 1.7411942369699196, + "grad_norm": 1.439508318901062, + "learning_rate": 1.9533249062713163e-05, + "loss": 0.5519, + "step": 10666 + }, + { + "epoch": 1.741357495612424, + "grad_norm": 1.4585577249526978, + "learning_rate": 1.953315324048249e-05, + "loss": 0.627, + "step": 10667 + }, + { + "epoch": 1.7415207542549282, + "grad_norm": 1.6699813604354858, + "learning_rate": 1.9533057408651926e-05, + "loss": 0.6389, + "step": 10668 + }, + { + "epoch": 1.7416840128974327, + "grad_norm": 1.5431424379348755, + "learning_rate": 1.9532961567221577e-05, + "loss": 0.551, + "step": 10669 + }, + { + "epoch": 1.7418472715399371, + "grad_norm": 1.5191174745559692, + "learning_rate": 1.953286571619154e-05, + "loss": 0.6245, + "step": 10670 + }, + { + "epoch": 1.7420105301824416, + "grad_norm": 1.5608497858047485, + "learning_rate": 1.953276985556191e-05, + "loss": 0.6348, + "step": 10671 + }, + { + "epoch": 1.742173788824946, + "grad_norm": 1.6324998140335083, + "learning_rate": 1.9532673985332783e-05, + "loss": 0.6509, + "step": 10672 + }, + { + "epoch": 1.7423370474674504, + "grad_norm": 2.0397937297821045, + "learning_rate": 1.9532578105504255e-05, + "loss": 0.7965, + "step": 10673 + }, + { + "epoch": 1.7425003061099547, + "grad_norm": 1.3539122343063354, + "learning_rate": 1.9532482216076425e-05, + "loss": 0.5818, + "step": 10674 + }, + { + "epoch": 1.742663564752459, + "grad_norm": 1.609420895576477, + "learning_rate": 1.9532386317049387e-05, + "loss": 0.6696, + "step": 10675 + }, + { + "epoch": 1.7428268233949633, + "grad_norm": 1.7696812152862549, + "learning_rate": 1.9532290408423236e-05, + "loss": 0.6299, + "step": 10676 + }, + { + "epoch": 1.7429900820374677, + "grad_norm": 2.063328742980957, + "learning_rate": 1.9532194490198074e-05, + "loss": 0.817, + "step": 10677 + }, + { + "epoch": 1.7431533406799722, + "grad_norm": 1.7363944053649902, + "learning_rate": 1.9532098562373997e-05, + "loss": 0.6601, + "step": 10678 + }, + { + "epoch": 1.7433165993224766, + "grad_norm": 1.9815986156463623, + "learning_rate": 1.9532002624951097e-05, + "loss": 0.7904, + "step": 10679 + }, + { + "epoch": 1.743479857964981, + "grad_norm": 1.4014389514923096, + "learning_rate": 1.9531906677929472e-05, + "loss": 0.534, + "step": 10680 + }, + { + "epoch": 1.7436431166074855, + "grad_norm": 1.5112221240997314, + "learning_rate": 1.953181072130922e-05, + "loss": 0.6423, + "step": 10681 + }, + { + "epoch": 1.74380637524999, + "grad_norm": 1.6072019338607788, + "learning_rate": 1.9531714755090438e-05, + "loss": 0.6744, + "step": 10682 + }, + { + "epoch": 1.7439696338924942, + "grad_norm": 1.5799765586853027, + "learning_rate": 1.953161877927322e-05, + "loss": 0.7014, + "step": 10683 + }, + { + "epoch": 1.7441328925349986, + "grad_norm": 1.667450189590454, + "learning_rate": 1.9531522793857663e-05, + "loss": 0.6508, + "step": 10684 + }, + { + "epoch": 1.7442961511775028, + "grad_norm": 1.6054002046585083, + "learning_rate": 1.953142679884387e-05, + "loss": 0.613, + "step": 10685 + }, + { + "epoch": 1.7444594098200072, + "grad_norm": 1.5304906368255615, + "learning_rate": 1.9531330794231928e-05, + "loss": 0.6034, + "step": 10686 + }, + { + "epoch": 1.7446226684625117, + "grad_norm": 1.3575116395950317, + "learning_rate": 1.953123478002194e-05, + "loss": 0.5364, + "step": 10687 + }, + { + "epoch": 1.7447859271050161, + "grad_norm": 1.4334372282028198, + "learning_rate": 1.9531138756214004e-05, + "loss": 0.5706, + "step": 10688 + }, + { + "epoch": 1.7449491857475206, + "grad_norm": 1.613806128501892, + "learning_rate": 1.953104272280821e-05, + "loss": 0.6063, + "step": 10689 + }, + { + "epoch": 1.745112444390025, + "grad_norm": 1.4269624948501587, + "learning_rate": 1.953094667980466e-05, + "loss": 0.5717, + "step": 10690 + }, + { + "epoch": 1.7452757030325294, + "grad_norm": 1.5211204290390015, + "learning_rate": 1.953085062720345e-05, + "loss": 0.6624, + "step": 10691 + }, + { + "epoch": 1.7454389616750337, + "grad_norm": 1.7872917652130127, + "learning_rate": 1.9530754565004674e-05, + "loss": 0.8588, + "step": 10692 + }, + { + "epoch": 1.745602220317538, + "grad_norm": 1.6194450855255127, + "learning_rate": 1.953065849320843e-05, + "loss": 0.6213, + "step": 10693 + }, + { + "epoch": 1.7457654789600423, + "grad_norm": 1.3746157884597778, + "learning_rate": 1.953056241181482e-05, + "loss": 0.5409, + "step": 10694 + }, + { + "epoch": 1.7459287376025467, + "grad_norm": 1.9657628536224365, + "learning_rate": 1.9530466320823933e-05, + "loss": 0.7654, + "step": 10695 + }, + { + "epoch": 1.7460919962450512, + "grad_norm": 1.8772079944610596, + "learning_rate": 1.953037022023587e-05, + "loss": 0.5858, + "step": 10696 + }, + { + "epoch": 1.7462552548875556, + "grad_norm": 2.121143341064453, + "learning_rate": 1.9530274110050726e-05, + "loss": 0.7453, + "step": 10697 + }, + { + "epoch": 1.74641851353006, + "grad_norm": 2.0563087463378906, + "learning_rate": 1.95301779902686e-05, + "loss": 0.7033, + "step": 10698 + }, + { + "epoch": 1.7465817721725645, + "grad_norm": 1.6286327838897705, + "learning_rate": 1.9530081860889586e-05, + "loss": 0.6415, + "step": 10699 + }, + { + "epoch": 1.746745030815069, + "grad_norm": 1.7814826965332031, + "learning_rate": 1.952998572191378e-05, + "loss": 0.6404, + "step": 10700 + }, + { + "epoch": 1.7469082894575731, + "grad_norm": 1.6063657999038696, + "learning_rate": 1.952988957334128e-05, + "loss": 0.7378, + "step": 10701 + }, + { + "epoch": 1.7470715481000776, + "grad_norm": 1.5934327840805054, + "learning_rate": 1.952979341517219e-05, + "loss": 0.615, + "step": 10702 + }, + { + "epoch": 1.7472348067425818, + "grad_norm": 1.725294828414917, + "learning_rate": 1.9529697247406596e-05, + "loss": 0.7344, + "step": 10703 + }, + { + "epoch": 1.7473980653850862, + "grad_norm": 1.5010987520217896, + "learning_rate": 1.9529601070044603e-05, + "loss": 0.6444, + "step": 10704 + }, + { + "epoch": 1.7475613240275907, + "grad_norm": 1.8272624015808105, + "learning_rate": 1.9529504883086302e-05, + "loss": 0.7557, + "step": 10705 + }, + { + "epoch": 1.7477245826700951, + "grad_norm": 1.7367616891860962, + "learning_rate": 1.952940868653179e-05, + "loss": 0.7205, + "step": 10706 + }, + { + "epoch": 1.7478878413125996, + "grad_norm": 1.749585747718811, + "learning_rate": 1.952931248038117e-05, + "loss": 0.7402, + "step": 10707 + }, + { + "epoch": 1.748051099955104, + "grad_norm": 1.794198989868164, + "learning_rate": 1.9529216264634533e-05, + "loss": 0.7114, + "step": 10708 + }, + { + "epoch": 1.7482143585976082, + "grad_norm": 1.5261139869689941, + "learning_rate": 1.9529120039291975e-05, + "loss": 0.7004, + "step": 10709 + }, + { + "epoch": 1.7483776172401126, + "grad_norm": 1.5516608953475952, + "learning_rate": 1.9529023804353598e-05, + "loss": 0.6309, + "step": 10710 + }, + { + "epoch": 1.748540875882617, + "grad_norm": 1.5820624828338623, + "learning_rate": 1.9528927559819497e-05, + "loss": 0.5994, + "step": 10711 + }, + { + "epoch": 1.7487041345251213, + "grad_norm": 1.7250977754592896, + "learning_rate": 1.952883130568977e-05, + "loss": 0.7757, + "step": 10712 + }, + { + "epoch": 1.7488673931676257, + "grad_norm": 1.8915307521820068, + "learning_rate": 1.952873504196451e-05, + "loss": 0.8007, + "step": 10713 + }, + { + "epoch": 1.7490306518101302, + "grad_norm": 1.682375192642212, + "learning_rate": 1.9528638768643814e-05, + "loss": 0.5906, + "step": 10714 + }, + { + "epoch": 1.7491939104526346, + "grad_norm": 1.6707977056503296, + "learning_rate": 1.9528542485727787e-05, + "loss": 0.7422, + "step": 10715 + }, + { + "epoch": 1.749357169095139, + "grad_norm": 1.7416025400161743, + "learning_rate": 1.9528446193216516e-05, + "loss": 0.7058, + "step": 10716 + }, + { + "epoch": 1.7495204277376435, + "grad_norm": 1.7810405492782593, + "learning_rate": 1.9528349891110104e-05, + "loss": 0.7959, + "step": 10717 + }, + { + "epoch": 1.7496836863801477, + "grad_norm": 1.707619071006775, + "learning_rate": 1.9528253579408644e-05, + "loss": 0.7189, + "step": 10718 + }, + { + "epoch": 1.7498469450226521, + "grad_norm": 1.7502764463424683, + "learning_rate": 1.952815725811224e-05, + "loss": 0.7811, + "step": 10719 + }, + { + "epoch": 1.7500102036651564, + "grad_norm": 1.6503678560256958, + "learning_rate": 1.952806092722098e-05, + "loss": 0.6026, + "step": 10720 + }, + { + "epoch": 1.7501734623076608, + "grad_norm": 1.7462319135665894, + "learning_rate": 1.9527964586734967e-05, + "loss": 0.5974, + "step": 10721 + }, + { + "epoch": 1.7503367209501652, + "grad_norm": 1.6383886337280273, + "learning_rate": 1.9527868236654296e-05, + "loss": 0.7407, + "step": 10722 + }, + { + "epoch": 1.7504999795926697, + "grad_norm": 1.77699875831604, + "learning_rate": 1.9527771876979062e-05, + "loss": 0.5341, + "step": 10723 + }, + { + "epoch": 1.7506632382351741, + "grad_norm": 1.639184832572937, + "learning_rate": 1.9527675507709368e-05, + "loss": 0.6405, + "step": 10724 + }, + { + "epoch": 1.7508264968776786, + "grad_norm": 1.7103420495986938, + "learning_rate": 1.9527579128845304e-05, + "loss": 0.5437, + "step": 10725 + }, + { + "epoch": 1.750989755520183, + "grad_norm": 1.494810938835144, + "learning_rate": 1.9527482740386972e-05, + "loss": 0.5921, + "step": 10726 + }, + { + "epoch": 1.7511530141626872, + "grad_norm": 1.5288208723068237, + "learning_rate": 1.9527386342334468e-05, + "loss": 0.5907, + "step": 10727 + }, + { + "epoch": 1.7513162728051916, + "grad_norm": 1.9668656587600708, + "learning_rate": 1.9527289934687886e-05, + "loss": 0.7954, + "step": 10728 + }, + { + "epoch": 1.7514795314476959, + "grad_norm": 1.6285535097122192, + "learning_rate": 1.9527193517447328e-05, + "loss": 0.6791, + "step": 10729 + }, + { + "epoch": 1.7516427900902003, + "grad_norm": 1.7145187854766846, + "learning_rate": 1.9527097090612888e-05, + "loss": 0.6132, + "step": 10730 + }, + { + "epoch": 1.7518060487327047, + "grad_norm": 1.630653977394104, + "learning_rate": 1.952700065418466e-05, + "loss": 0.6366, + "step": 10731 + }, + { + "epoch": 1.7519693073752092, + "grad_norm": 1.727787971496582, + "learning_rate": 1.952690420816275e-05, + "loss": 0.6666, + "step": 10732 + }, + { + "epoch": 1.7521325660177136, + "grad_norm": 1.9507721662521362, + "learning_rate": 1.952680775254725e-05, + "loss": 0.6634, + "step": 10733 + }, + { + "epoch": 1.752295824660218, + "grad_norm": 1.7663540840148926, + "learning_rate": 1.9526711287338256e-05, + "loss": 0.7122, + "step": 10734 + }, + { + "epoch": 1.7524590833027225, + "grad_norm": 1.9366360902786255, + "learning_rate": 1.9526614812535866e-05, + "loss": 0.7152, + "step": 10735 + }, + { + "epoch": 1.7526223419452267, + "grad_norm": 1.7407790422439575, + "learning_rate": 1.9526518328140177e-05, + "loss": 0.6915, + "step": 10736 + }, + { + "epoch": 1.7527856005877311, + "grad_norm": 1.3461657762527466, + "learning_rate": 1.9526421834151284e-05, + "loss": 0.5267, + "step": 10737 + }, + { + "epoch": 1.7529488592302354, + "grad_norm": 1.7447257041931152, + "learning_rate": 1.952632533056929e-05, + "loss": 0.6183, + "step": 10738 + }, + { + "epoch": 1.7531121178727398, + "grad_norm": 1.6941795349121094, + "learning_rate": 1.952622881739429e-05, + "loss": 0.6457, + "step": 10739 + }, + { + "epoch": 1.7532753765152442, + "grad_norm": 1.592056155204773, + "learning_rate": 1.9526132294626377e-05, + "loss": 0.709, + "step": 10740 + }, + { + "epoch": 1.7534386351577487, + "grad_norm": 1.7101908922195435, + "learning_rate": 1.9526035762265652e-05, + "loss": 0.6995, + "step": 10741 + }, + { + "epoch": 1.753601893800253, + "grad_norm": 1.7220239639282227, + "learning_rate": 1.9525939220312215e-05, + "loss": 0.5336, + "step": 10742 + }, + { + "epoch": 1.7537651524427575, + "grad_norm": 2.173704147338867, + "learning_rate": 1.9525842668766156e-05, + "loss": 0.8378, + "step": 10743 + }, + { + "epoch": 1.753928411085262, + "grad_norm": 1.9480594396591187, + "learning_rate": 1.952574610762758e-05, + "loss": 0.6174, + "step": 10744 + }, + { + "epoch": 1.7540916697277662, + "grad_norm": 1.6421117782592773, + "learning_rate": 1.9525649536896573e-05, + "loss": 0.6432, + "step": 10745 + }, + { + "epoch": 1.7542549283702706, + "grad_norm": 1.4242048263549805, + "learning_rate": 1.9525552956573244e-05, + "loss": 0.5078, + "step": 10746 + }, + { + "epoch": 1.7544181870127749, + "grad_norm": 1.7420896291732788, + "learning_rate": 1.9525456366657684e-05, + "loss": 0.6498, + "step": 10747 + }, + { + "epoch": 1.7545814456552793, + "grad_norm": 1.5830692052841187, + "learning_rate": 1.9525359767149994e-05, + "loss": 0.5773, + "step": 10748 + }, + { + "epoch": 1.7547447042977837, + "grad_norm": 1.6917366981506348, + "learning_rate": 1.952526315805027e-05, + "loss": 0.6092, + "step": 10749 + }, + { + "epoch": 1.7549079629402882, + "grad_norm": 1.762800931930542, + "learning_rate": 1.9525166539358608e-05, + "loss": 0.6278, + "step": 10750 + }, + { + "epoch": 1.7550712215827926, + "grad_norm": 1.2863050699234009, + "learning_rate": 1.9525069911075105e-05, + "loss": 0.4441, + "step": 10751 + }, + { + "epoch": 1.755234480225297, + "grad_norm": 1.832234501838684, + "learning_rate": 1.9524973273199855e-05, + "loss": 0.6494, + "step": 10752 + }, + { + "epoch": 1.7553977388678013, + "grad_norm": 1.9815207719802856, + "learning_rate": 1.9524876625732963e-05, + "loss": 0.7388, + "step": 10753 + }, + { + "epoch": 1.7555609975103057, + "grad_norm": 1.674972653388977, + "learning_rate": 1.9524779968674528e-05, + "loss": 0.6514, + "step": 10754 + }, + { + "epoch": 1.7557242561528101, + "grad_norm": 1.6049739122390747, + "learning_rate": 1.9524683302024634e-05, + "loss": 0.5816, + "step": 10755 + }, + { + "epoch": 1.7558875147953144, + "grad_norm": 1.5166209936141968, + "learning_rate": 1.952458662578339e-05, + "loss": 0.5147, + "step": 10756 + }, + { + "epoch": 1.7560507734378188, + "grad_norm": 2.034099817276001, + "learning_rate": 1.9524489939950892e-05, + "loss": 0.7691, + "step": 10757 + }, + { + "epoch": 1.7562140320803232, + "grad_norm": 1.544663667678833, + "learning_rate": 1.952439324452723e-05, + "loss": 0.6134, + "step": 10758 + }, + { + "epoch": 1.7563772907228277, + "grad_norm": 1.7768033742904663, + "learning_rate": 1.952429653951251e-05, + "loss": 0.6369, + "step": 10759 + }, + { + "epoch": 1.756540549365332, + "grad_norm": 1.825433611869812, + "learning_rate": 1.9524199824906826e-05, + "loss": 0.7808, + "step": 10760 + }, + { + "epoch": 1.7567038080078365, + "grad_norm": 2.1548619270324707, + "learning_rate": 1.9524103100710276e-05, + "loss": 0.5696, + "step": 10761 + }, + { + "epoch": 1.7568670666503408, + "grad_norm": 1.6438004970550537, + "learning_rate": 1.9524006366922954e-05, + "loss": 0.5874, + "step": 10762 + }, + { + "epoch": 1.7570303252928452, + "grad_norm": 1.8138939142227173, + "learning_rate": 1.952390962354496e-05, + "loss": 0.6316, + "step": 10763 + }, + { + "epoch": 1.7571935839353494, + "grad_norm": 1.9225834608078003, + "learning_rate": 1.9523812870576395e-05, + "loss": 0.6014, + "step": 10764 + }, + { + "epoch": 1.7573568425778539, + "grad_norm": 1.7963004112243652, + "learning_rate": 1.952371610801735e-05, + "loss": 0.6416, + "step": 10765 + }, + { + "epoch": 1.7575201012203583, + "grad_norm": 1.8769713640213013, + "learning_rate": 1.9523619335867926e-05, + "loss": 0.8003, + "step": 10766 + }, + { + "epoch": 1.7576833598628627, + "grad_norm": 1.5693458318710327, + "learning_rate": 1.952352255412822e-05, + "loss": 0.5159, + "step": 10767 + }, + { + "epoch": 1.7578466185053672, + "grad_norm": 1.669358491897583, + "learning_rate": 1.9523425762798328e-05, + "loss": 0.7201, + "step": 10768 + }, + { + "epoch": 1.7580098771478716, + "grad_norm": 1.6975306272506714, + "learning_rate": 1.9523328961878353e-05, + "loss": 0.6903, + "step": 10769 + }, + { + "epoch": 1.758173135790376, + "grad_norm": 1.60775625705719, + "learning_rate": 1.9523232151368383e-05, + "loss": 0.6171, + "step": 10770 + }, + { + "epoch": 1.7583363944328803, + "grad_norm": 1.6108731031417847, + "learning_rate": 1.9523135331268523e-05, + "loss": 0.665, + "step": 10771 + }, + { + "epoch": 1.7584996530753847, + "grad_norm": 2.1032893657684326, + "learning_rate": 1.952303850157887e-05, + "loss": 0.7419, + "step": 10772 + }, + { + "epoch": 1.758662911717889, + "grad_norm": 1.6460249423980713, + "learning_rate": 1.9522941662299518e-05, + "loss": 0.7546, + "step": 10773 + }, + { + "epoch": 1.7588261703603933, + "grad_norm": 1.875888466835022, + "learning_rate": 1.9522844813430567e-05, + "loss": 0.7806, + "step": 10774 + }, + { + "epoch": 1.7589894290028978, + "grad_norm": 1.989449143409729, + "learning_rate": 1.952274795497211e-05, + "loss": 0.7138, + "step": 10775 + }, + { + "epoch": 1.7591526876454022, + "grad_norm": 1.8132860660552979, + "learning_rate": 1.9522651086924254e-05, + "loss": 0.8015, + "step": 10776 + }, + { + "epoch": 1.7593159462879067, + "grad_norm": 1.652529239654541, + "learning_rate": 1.952255420928709e-05, + "loss": 0.6898, + "step": 10777 + }, + { + "epoch": 1.759479204930411, + "grad_norm": 1.75728440284729, + "learning_rate": 1.9522457322060714e-05, + "loss": 0.6754, + "step": 10778 + }, + { + "epoch": 1.7596424635729155, + "grad_norm": 1.5660400390625, + "learning_rate": 1.9522360425245226e-05, + "loss": 0.6484, + "step": 10779 + }, + { + "epoch": 1.7598057222154198, + "grad_norm": 1.6051851511001587, + "learning_rate": 1.952226351884072e-05, + "loss": 0.7157, + "step": 10780 + }, + { + "epoch": 1.7599689808579242, + "grad_norm": 2.1345932483673096, + "learning_rate": 1.9522166602847305e-05, + "loss": 0.6072, + "step": 10781 + }, + { + "epoch": 1.7601322395004284, + "grad_norm": 1.5944386720657349, + "learning_rate": 1.9522069677265067e-05, + "loss": 0.5425, + "step": 10782 + }, + { + "epoch": 1.7602954981429328, + "grad_norm": 1.6194030046463013, + "learning_rate": 1.9521972742094107e-05, + "loss": 0.6101, + "step": 10783 + }, + { + "epoch": 1.7604587567854373, + "grad_norm": 1.5638209581375122, + "learning_rate": 1.9521875797334524e-05, + "loss": 0.6524, + "step": 10784 + }, + { + "epoch": 1.7606220154279417, + "grad_norm": 1.5616692304611206, + "learning_rate": 1.9521778842986413e-05, + "loss": 0.694, + "step": 10785 + }, + { + "epoch": 1.7607852740704462, + "grad_norm": 1.8090053796768188, + "learning_rate": 1.9521681879049876e-05, + "loss": 0.6927, + "step": 10786 + }, + { + "epoch": 1.7609485327129506, + "grad_norm": 1.8393393754959106, + "learning_rate": 1.952158490552501e-05, + "loss": 0.7657, + "step": 10787 + }, + { + "epoch": 1.761111791355455, + "grad_norm": 1.587812066078186, + "learning_rate": 1.9521487922411904e-05, + "loss": 0.5942, + "step": 10788 + }, + { + "epoch": 1.7612750499979593, + "grad_norm": 1.7164409160614014, + "learning_rate": 1.9521390929710663e-05, + "loss": 0.6661, + "step": 10789 + }, + { + "epoch": 1.7614383086404637, + "grad_norm": 1.659538984298706, + "learning_rate": 1.9521293927421388e-05, + "loss": 0.7343, + "step": 10790 + }, + { + "epoch": 1.761601567282968, + "grad_norm": 1.581421971321106, + "learning_rate": 1.952119691554417e-05, + "loss": 0.6272, + "step": 10791 + }, + { + "epoch": 1.7617648259254723, + "grad_norm": 1.690076231956482, + "learning_rate": 1.952109989407911e-05, + "loss": 0.6768, + "step": 10792 + }, + { + "epoch": 1.7619280845679768, + "grad_norm": 2.370445489883423, + "learning_rate": 1.9521002863026305e-05, + "loss": 0.8073, + "step": 10793 + }, + { + "epoch": 1.7620913432104812, + "grad_norm": 1.941224455833435, + "learning_rate": 1.9520905822385852e-05, + "loss": 0.7596, + "step": 10794 + }, + { + "epoch": 1.7622546018529857, + "grad_norm": 1.7497076988220215, + "learning_rate": 1.952080877215785e-05, + "loss": 0.5879, + "step": 10795 + }, + { + "epoch": 1.76241786049549, + "grad_norm": 1.739261507987976, + "learning_rate": 1.9520711712342394e-05, + "loss": 0.6218, + "step": 10796 + }, + { + "epoch": 1.7625811191379943, + "grad_norm": 1.5725771188735962, + "learning_rate": 1.952061464293959e-05, + "loss": 0.6383, + "step": 10797 + }, + { + "epoch": 1.7627443777804988, + "grad_norm": 1.619012713432312, + "learning_rate": 1.9520517563949522e-05, + "loss": 0.5779, + "step": 10798 + }, + { + "epoch": 1.7629076364230032, + "grad_norm": 1.9557584524154663, + "learning_rate": 1.95204204753723e-05, + "loss": 0.8222, + "step": 10799 + }, + { + "epoch": 1.7630708950655074, + "grad_norm": 1.8133105039596558, + "learning_rate": 1.9520323377208017e-05, + "loss": 0.6207, + "step": 10800 + }, + { + "epoch": 1.7632341537080118, + "grad_norm": 1.490153431892395, + "learning_rate": 1.9520226269456767e-05, + "loss": 0.5551, + "step": 10801 + }, + { + "epoch": 1.7633974123505163, + "grad_norm": 1.5352156162261963, + "learning_rate": 1.9520129152118653e-05, + "loss": 0.6902, + "step": 10802 + }, + { + "epoch": 1.7635606709930207, + "grad_norm": 1.7540897130966187, + "learning_rate": 1.9520032025193772e-05, + "loss": 0.7109, + "step": 10803 + }, + { + "epoch": 1.7637239296355252, + "grad_norm": 1.677668809890747, + "learning_rate": 1.9519934888682224e-05, + "loss": 0.6896, + "step": 10804 + }, + { + "epoch": 1.7638871882780296, + "grad_norm": 1.6755605936050415, + "learning_rate": 1.9519837742584102e-05, + "loss": 0.6232, + "step": 10805 + }, + { + "epoch": 1.7640504469205338, + "grad_norm": 1.5990185737609863, + "learning_rate": 1.951974058689951e-05, + "loss": 0.6892, + "step": 10806 + }, + { + "epoch": 1.7642137055630382, + "grad_norm": 1.263320803642273, + "learning_rate": 1.9519643421628535e-05, + "loss": 0.455, + "step": 10807 + }, + { + "epoch": 1.7643769642055427, + "grad_norm": 1.7494382858276367, + "learning_rate": 1.9519546246771283e-05, + "loss": 0.7423, + "step": 10808 + }, + { + "epoch": 1.764540222848047, + "grad_norm": 1.8148205280303955, + "learning_rate": 1.951944906232785e-05, + "loss": 0.6589, + "step": 10809 + }, + { + "epoch": 1.7647034814905513, + "grad_norm": 1.7060492038726807, + "learning_rate": 1.9519351868298337e-05, + "loss": 0.5839, + "step": 10810 + }, + { + "epoch": 1.7648667401330558, + "grad_norm": 1.5221731662750244, + "learning_rate": 1.951925466468284e-05, + "loss": 0.6025, + "step": 10811 + }, + { + "epoch": 1.7650299987755602, + "grad_norm": 1.5526695251464844, + "learning_rate": 1.9519157451481453e-05, + "loss": 0.6064, + "step": 10812 + }, + { + "epoch": 1.7651932574180647, + "grad_norm": 1.7610591650009155, + "learning_rate": 1.951906022869428e-05, + "loss": 0.8321, + "step": 10813 + }, + { + "epoch": 1.765356516060569, + "grad_norm": 1.6823147535324097, + "learning_rate": 1.9518962996321413e-05, + "loss": 0.5976, + "step": 10814 + }, + { + "epoch": 1.7655197747030733, + "grad_norm": 1.4573984146118164, + "learning_rate": 1.9518865754362953e-05, + "loss": 0.5635, + "step": 10815 + }, + { + "epoch": 1.7656830333455777, + "grad_norm": 1.669206142425537, + "learning_rate": 1.9518768502819e-05, + "loss": 0.6118, + "step": 10816 + }, + { + "epoch": 1.765846291988082, + "grad_norm": 1.8541523218154907, + "learning_rate": 1.9518671241689648e-05, + "loss": 0.6255, + "step": 10817 + }, + { + "epoch": 1.7660095506305864, + "grad_norm": 1.257539987564087, + "learning_rate": 1.9518573970975e-05, + "loss": 0.5502, + "step": 10818 + }, + { + "epoch": 1.7661728092730908, + "grad_norm": 1.723966121673584, + "learning_rate": 1.9518476690675145e-05, + "loss": 0.6388, + "step": 10819 + }, + { + "epoch": 1.7663360679155953, + "grad_norm": 1.5228670835494995, + "learning_rate": 1.9518379400790192e-05, + "loss": 0.578, + "step": 10820 + }, + { + "epoch": 1.7664993265580997, + "grad_norm": 1.5448646545410156, + "learning_rate": 1.9518282101320228e-05, + "loss": 0.5407, + "step": 10821 + }, + { + "epoch": 1.7666625852006042, + "grad_norm": 1.7004462480545044, + "learning_rate": 1.9518184792265357e-05, + "loss": 0.7933, + "step": 10822 + }, + { + "epoch": 1.7668258438431086, + "grad_norm": 1.948448657989502, + "learning_rate": 1.951808747362568e-05, + "loss": 0.7763, + "step": 10823 + }, + { + "epoch": 1.7669891024856128, + "grad_norm": 1.7008239030838013, + "learning_rate": 1.951799014540129e-05, + "loss": 0.6474, + "step": 10824 + }, + { + "epoch": 1.7671523611281172, + "grad_norm": 1.5442135334014893, + "learning_rate": 1.9517892807592288e-05, + "loss": 0.5985, + "step": 10825 + }, + { + "epoch": 1.7673156197706215, + "grad_norm": 1.7470180988311768, + "learning_rate": 1.9517795460198768e-05, + "loss": 0.7397, + "step": 10826 + }, + { + "epoch": 1.767478878413126, + "grad_norm": 1.6956099271774292, + "learning_rate": 1.951769810322083e-05, + "loss": 0.7092, + "step": 10827 + }, + { + "epoch": 1.7676421370556303, + "grad_norm": 1.9263503551483154, + "learning_rate": 1.9517600736658572e-05, + "loss": 0.6956, + "step": 10828 + }, + { + "epoch": 1.7678053956981348, + "grad_norm": 1.7992496490478516, + "learning_rate": 1.9517503360512095e-05, + "loss": 0.6904, + "step": 10829 + }, + { + "epoch": 1.7679686543406392, + "grad_norm": 1.6746838092803955, + "learning_rate": 1.9517405974781495e-05, + "loss": 0.6545, + "step": 10830 + }, + { + "epoch": 1.7681319129831436, + "grad_norm": 1.8178722858428955, + "learning_rate": 1.9517308579466866e-05, + "loss": 0.6159, + "step": 10831 + }, + { + "epoch": 1.768295171625648, + "grad_norm": 1.7905980348587036, + "learning_rate": 1.9517211174568317e-05, + "loss": 0.7204, + "step": 10832 + }, + { + "epoch": 1.7684584302681523, + "grad_norm": 1.4039496183395386, + "learning_rate": 1.9517113760085932e-05, + "loss": 0.5344, + "step": 10833 + }, + { + "epoch": 1.7686216889106567, + "grad_norm": 1.5899111032485962, + "learning_rate": 1.9517016336019817e-05, + "loss": 0.6479, + "step": 10834 + }, + { + "epoch": 1.768784947553161, + "grad_norm": 1.8183836936950684, + "learning_rate": 1.9516918902370073e-05, + "loss": 0.6267, + "step": 10835 + }, + { + "epoch": 1.7689482061956654, + "grad_norm": 1.6399550437927246, + "learning_rate": 1.951682145913679e-05, + "loss": 0.5889, + "step": 10836 + }, + { + "epoch": 1.7691114648381698, + "grad_norm": 1.7220840454101562, + "learning_rate": 1.951672400632007e-05, + "loss": 0.6122, + "step": 10837 + }, + { + "epoch": 1.7692747234806743, + "grad_norm": 1.6463868618011475, + "learning_rate": 1.9516626543920015e-05, + "loss": 0.618, + "step": 10838 + }, + { + "epoch": 1.7694379821231787, + "grad_norm": 1.4979952573776245, + "learning_rate": 1.9516529071936717e-05, + "loss": 0.5792, + "step": 10839 + }, + { + "epoch": 1.7696012407656831, + "grad_norm": 1.7675654888153076, + "learning_rate": 1.951643159037028e-05, + "loss": 0.6128, + "step": 10840 + }, + { + "epoch": 1.7697644994081876, + "grad_norm": 1.794031023979187, + "learning_rate": 1.9516334099220793e-05, + "loss": 0.657, + "step": 10841 + }, + { + "epoch": 1.7699277580506918, + "grad_norm": 1.63181471824646, + "learning_rate": 1.9516236598488364e-05, + "loss": 0.6537, + "step": 10842 + }, + { + "epoch": 1.7700910166931962, + "grad_norm": 1.6753329038619995, + "learning_rate": 1.951613908817309e-05, + "loss": 0.661, + "step": 10843 + }, + { + "epoch": 1.7702542753357005, + "grad_norm": 1.980138897895813, + "learning_rate": 1.951604156827506e-05, + "loss": 0.7286, + "step": 10844 + }, + { + "epoch": 1.770417533978205, + "grad_norm": 1.683947205543518, + "learning_rate": 1.9515944038794384e-05, + "loss": 0.5825, + "step": 10845 + }, + { + "epoch": 1.7705807926207093, + "grad_norm": 1.768072485923767, + "learning_rate": 1.9515846499731153e-05, + "loss": 0.8085, + "step": 10846 + }, + { + "epoch": 1.7707440512632138, + "grad_norm": 1.9497138261795044, + "learning_rate": 1.9515748951085468e-05, + "loss": 0.7266, + "step": 10847 + }, + { + "epoch": 1.7709073099057182, + "grad_norm": 1.6978740692138672, + "learning_rate": 1.951565139285742e-05, + "loss": 0.7681, + "step": 10848 + }, + { + "epoch": 1.7710705685482226, + "grad_norm": 1.4797953367233276, + "learning_rate": 1.951555382504712e-05, + "loss": 0.5991, + "step": 10849 + }, + { + "epoch": 1.7712338271907269, + "grad_norm": 1.6163244247436523, + "learning_rate": 1.951545624765466e-05, + "loss": 0.5965, + "step": 10850 + }, + { + "epoch": 1.7713970858332313, + "grad_norm": 1.559449315071106, + "learning_rate": 1.9515358660680137e-05, + "loss": 0.7004, + "step": 10851 + }, + { + "epoch": 1.7715603444757357, + "grad_norm": 1.8776028156280518, + "learning_rate": 1.9515261064123653e-05, + "loss": 0.6821, + "step": 10852 + }, + { + "epoch": 1.77172360311824, + "grad_norm": 1.8561935424804688, + "learning_rate": 1.9515163457985298e-05, + "loss": 0.5908, + "step": 10853 + }, + { + "epoch": 1.7718868617607444, + "grad_norm": 1.7763408422470093, + "learning_rate": 1.9515065842265178e-05, + "loss": 0.5461, + "step": 10854 + }, + { + "epoch": 1.7720501204032488, + "grad_norm": 1.7872830629348755, + "learning_rate": 1.951496821696339e-05, + "loss": 0.7028, + "step": 10855 + }, + { + "epoch": 1.7722133790457533, + "grad_norm": 1.627462387084961, + "learning_rate": 1.951487058208003e-05, + "loss": 0.5856, + "step": 10856 + }, + { + "epoch": 1.7723766376882577, + "grad_norm": 1.431952714920044, + "learning_rate": 1.95147729376152e-05, + "loss": 0.6125, + "step": 10857 + }, + { + "epoch": 1.7725398963307621, + "grad_norm": 1.858451247215271, + "learning_rate": 1.9514675283569e-05, + "loss": 0.8164, + "step": 10858 + }, + { + "epoch": 1.7727031549732664, + "grad_norm": 1.650089979171753, + "learning_rate": 1.951457761994152e-05, + "loss": 0.6906, + "step": 10859 + }, + { + "epoch": 1.7728664136157708, + "grad_norm": 1.8180104494094849, + "learning_rate": 1.951447994673286e-05, + "loss": 0.8398, + "step": 10860 + }, + { + "epoch": 1.773029672258275, + "grad_norm": 1.6162959337234497, + "learning_rate": 1.9514382263943125e-05, + "loss": 0.6292, + "step": 10861 + }, + { + "epoch": 1.7731929309007795, + "grad_norm": 1.700246810913086, + "learning_rate": 1.951428457157241e-05, + "loss": 0.7498, + "step": 10862 + }, + { + "epoch": 1.773356189543284, + "grad_norm": 2.20633864402771, + "learning_rate": 1.951418686962081e-05, + "loss": 0.6972, + "step": 10863 + }, + { + "epoch": 1.7735194481857883, + "grad_norm": 1.798872470855713, + "learning_rate": 1.951408915808843e-05, + "loss": 0.6647, + "step": 10864 + }, + { + "epoch": 1.7736827068282928, + "grad_norm": 1.6198927164077759, + "learning_rate": 1.951399143697536e-05, + "loss": 0.688, + "step": 10865 + }, + { + "epoch": 1.7738459654707972, + "grad_norm": 1.865727186203003, + "learning_rate": 1.9513893706281707e-05, + "loss": 0.8842, + "step": 10866 + }, + { + "epoch": 1.7740092241133016, + "grad_norm": 1.6654359102249146, + "learning_rate": 1.9513795966007563e-05, + "loss": 0.6891, + "step": 10867 + }, + { + "epoch": 1.7741724827558059, + "grad_norm": 1.60623037815094, + "learning_rate": 1.951369821615303e-05, + "loss": 0.7686, + "step": 10868 + }, + { + "epoch": 1.7743357413983103, + "grad_norm": 1.9587477445602417, + "learning_rate": 1.9513600456718206e-05, + "loss": 0.8218, + "step": 10869 + }, + { + "epoch": 1.7744990000408145, + "grad_norm": 1.5646796226501465, + "learning_rate": 1.951350268770319e-05, + "loss": 0.665, + "step": 10870 + }, + { + "epoch": 1.774662258683319, + "grad_norm": 1.5532495975494385, + "learning_rate": 1.9513404909108078e-05, + "loss": 0.6639, + "step": 10871 + }, + { + "epoch": 1.7748255173258234, + "grad_norm": 1.6693313121795654, + "learning_rate": 1.9513307120932968e-05, + "loss": 0.7177, + "step": 10872 + }, + { + "epoch": 1.7749887759683278, + "grad_norm": 1.5305919647216797, + "learning_rate": 1.9513209323177963e-05, + "loss": 0.6372, + "step": 10873 + }, + { + "epoch": 1.7751520346108323, + "grad_norm": 1.5721503496170044, + "learning_rate": 1.951311151584316e-05, + "loss": 0.6643, + "step": 10874 + }, + { + "epoch": 1.7753152932533367, + "grad_norm": 1.835545539855957, + "learning_rate": 1.9513013698928654e-05, + "loss": 0.8045, + "step": 10875 + }, + { + "epoch": 1.7754785518958411, + "grad_norm": 1.7381603717803955, + "learning_rate": 1.9512915872434542e-05, + "loss": 0.801, + "step": 10876 + }, + { + "epoch": 1.7756418105383454, + "grad_norm": 1.596463680267334, + "learning_rate": 1.9512818036360932e-05, + "loss": 0.6944, + "step": 10877 + }, + { + "epoch": 1.7758050691808498, + "grad_norm": 1.408737063407898, + "learning_rate": 1.9512720190707915e-05, + "loss": 0.5943, + "step": 10878 + }, + { + "epoch": 1.775968327823354, + "grad_norm": 1.5177603960037231, + "learning_rate": 1.951262233547559e-05, + "loss": 0.6052, + "step": 10879 + }, + { + "epoch": 1.7761315864658584, + "grad_norm": 1.3990031480789185, + "learning_rate": 1.9512524470664058e-05, + "loss": 0.5602, + "step": 10880 + }, + { + "epoch": 1.7762948451083629, + "grad_norm": 1.5601325035095215, + "learning_rate": 1.9512426596273418e-05, + "loss": 0.6958, + "step": 10881 + }, + { + "epoch": 1.7764581037508673, + "grad_norm": 2.587397336959839, + "learning_rate": 1.9512328712303764e-05, + "loss": 0.571, + "step": 10882 + }, + { + "epoch": 1.7766213623933718, + "grad_norm": 1.969022274017334, + "learning_rate": 1.95122308187552e-05, + "loss": 0.7032, + "step": 10883 + }, + { + "epoch": 1.7767846210358762, + "grad_norm": 1.7399964332580566, + "learning_rate": 1.951213291562782e-05, + "loss": 0.692, + "step": 10884 + }, + { + "epoch": 1.7769478796783806, + "grad_norm": 1.9375702142715454, + "learning_rate": 1.9512035002921726e-05, + "loss": 0.7376, + "step": 10885 + }, + { + "epoch": 1.7771111383208849, + "grad_norm": 1.8940571546554565, + "learning_rate": 1.9511937080637015e-05, + "loss": 0.7081, + "step": 10886 + }, + { + "epoch": 1.7772743969633893, + "grad_norm": 1.5788077116012573, + "learning_rate": 1.9511839148773783e-05, + "loss": 0.6081, + "step": 10887 + }, + { + "epoch": 1.7774376556058935, + "grad_norm": 1.567949891090393, + "learning_rate": 1.9511741207332134e-05, + "loss": 0.7016, + "step": 10888 + }, + { + "epoch": 1.777600914248398, + "grad_norm": 1.5725597143173218, + "learning_rate": 1.9511643256312165e-05, + "loss": 0.7019, + "step": 10889 + }, + { + "epoch": 1.7777641728909024, + "grad_norm": 1.3835197687149048, + "learning_rate": 1.9511545295713975e-05, + "loss": 0.5229, + "step": 10890 + }, + { + "epoch": 1.7779274315334068, + "grad_norm": 1.983066439628601, + "learning_rate": 1.9511447325537658e-05, + "loss": 0.7461, + "step": 10891 + }, + { + "epoch": 1.7780906901759113, + "grad_norm": 1.9186986684799194, + "learning_rate": 1.9511349345783316e-05, + "loss": 0.7311, + "step": 10892 + }, + { + "epoch": 1.7782539488184157, + "grad_norm": 1.739055871963501, + "learning_rate": 1.951125135645105e-05, + "loss": 0.7455, + "step": 10893 + }, + { + "epoch": 1.77841720746092, + "grad_norm": 1.6442842483520508, + "learning_rate": 1.9511153357540954e-05, + "loss": 0.7079, + "step": 10894 + }, + { + "epoch": 1.7785804661034244, + "grad_norm": 1.6049189567565918, + "learning_rate": 1.951105534905313e-05, + "loss": 0.6143, + "step": 10895 + }, + { + "epoch": 1.7787437247459288, + "grad_norm": 1.2695266008377075, + "learning_rate": 1.951095733098768e-05, + "loss": 0.4519, + "step": 10896 + }, + { + "epoch": 1.778906983388433, + "grad_norm": 1.5489606857299805, + "learning_rate": 1.9510859303344695e-05, + "loss": 0.6004, + "step": 10897 + }, + { + "epoch": 1.7790702420309374, + "grad_norm": 1.873193383216858, + "learning_rate": 1.9510761266124277e-05, + "loss": 0.7072, + "step": 10898 + }, + { + "epoch": 1.7792335006734419, + "grad_norm": 1.5930944681167603, + "learning_rate": 1.9510663219326525e-05, + "loss": 0.6252, + "step": 10899 + }, + { + "epoch": 1.7793967593159463, + "grad_norm": 2.268545389175415, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8394, + "step": 10900 + }, + { + "epoch": 1.7795600179584508, + "grad_norm": 1.66386878490448, + "learning_rate": 1.9510467096999413e-05, + "loss": 0.7654, + "step": 10901 + }, + { + "epoch": 1.7797232766009552, + "grad_norm": 2.10508394241333, + "learning_rate": 1.9510369021470253e-05, + "loss": 0.736, + "step": 10902 + }, + { + "epoch": 1.7798865352434594, + "grad_norm": 1.7712843418121338, + "learning_rate": 1.9510270936364152e-05, + "loss": 0.6082, + "step": 10903 + }, + { + "epoch": 1.7800497938859638, + "grad_norm": 1.484865427017212, + "learning_rate": 1.951017284168121e-05, + "loss": 0.6835, + "step": 10904 + }, + { + "epoch": 1.780213052528468, + "grad_norm": 1.6732131242752075, + "learning_rate": 1.9510074737421528e-05, + "loss": 0.7047, + "step": 10905 + }, + { + "epoch": 1.7803763111709725, + "grad_norm": 1.665887713432312, + "learning_rate": 1.9509976623585204e-05, + "loss": 0.8443, + "step": 10906 + }, + { + "epoch": 1.780539569813477, + "grad_norm": 1.6836378574371338, + "learning_rate": 1.9509878500172333e-05, + "loss": 0.6747, + "step": 10907 + }, + { + "epoch": 1.7807028284559814, + "grad_norm": 2.0392541885375977, + "learning_rate": 1.950978036718302e-05, + "loss": 0.7848, + "step": 10908 + }, + { + "epoch": 1.7808660870984858, + "grad_norm": 1.3736915588378906, + "learning_rate": 1.950968222461736e-05, + "loss": 0.4559, + "step": 10909 + }, + { + "epoch": 1.7810293457409903, + "grad_norm": 1.583001732826233, + "learning_rate": 1.950958407247545e-05, + "loss": 0.6005, + "step": 10910 + }, + { + "epoch": 1.7811926043834947, + "grad_norm": 1.6234726905822754, + "learning_rate": 1.9509485910757393e-05, + "loss": 0.6487, + "step": 10911 + }, + { + "epoch": 1.781355863025999, + "grad_norm": 1.5186219215393066, + "learning_rate": 1.9509387739463284e-05, + "loss": 0.5275, + "step": 10912 + }, + { + "epoch": 1.7815191216685033, + "grad_norm": 1.4962098598480225, + "learning_rate": 1.9509289558593227e-05, + "loss": 0.5547, + "step": 10913 + }, + { + "epoch": 1.7816823803110076, + "grad_norm": 1.8600186109542847, + "learning_rate": 1.950919136814732e-05, + "loss": 0.6616, + "step": 10914 + }, + { + "epoch": 1.781845638953512, + "grad_norm": 1.588076114654541, + "learning_rate": 1.9509093168125656e-05, + "loss": 0.5861, + "step": 10915 + }, + { + "epoch": 1.7820088975960164, + "grad_norm": 1.5765563249588013, + "learning_rate": 1.9508994958528337e-05, + "loss": 0.6691, + "step": 10916 + }, + { + "epoch": 1.7821721562385209, + "grad_norm": 1.6238212585449219, + "learning_rate": 1.9508896739355467e-05, + "loss": 0.5909, + "step": 10917 + }, + { + "epoch": 1.7823354148810253, + "grad_norm": 1.9103422164916992, + "learning_rate": 1.9508798510607137e-05, + "loss": 0.6421, + "step": 10918 + }, + { + "epoch": 1.7824986735235298, + "grad_norm": 1.6012688875198364, + "learning_rate": 1.9508700272283452e-05, + "loss": 0.7118, + "step": 10919 + }, + { + "epoch": 1.7826619321660342, + "grad_norm": 1.9534409046173096, + "learning_rate": 1.9508602024384507e-05, + "loss": 1.2235, + "step": 10920 + }, + { + "epoch": 1.7828251908085384, + "grad_norm": 1.7806932926177979, + "learning_rate": 1.95085037669104e-05, + "loss": 0.7503, + "step": 10921 + }, + { + "epoch": 1.7829884494510428, + "grad_norm": 1.7099485397338867, + "learning_rate": 1.9508405499861235e-05, + "loss": 0.6738, + "step": 10922 + }, + { + "epoch": 1.783151708093547, + "grad_norm": 1.6799393892288208, + "learning_rate": 1.9508307223237105e-05, + "loss": 0.6989, + "step": 10923 + }, + { + "epoch": 1.7833149667360515, + "grad_norm": 1.8940770626068115, + "learning_rate": 1.950820893703812e-05, + "loss": 0.6656, + "step": 10924 + }, + { + "epoch": 1.783478225378556, + "grad_norm": 1.5303853750228882, + "learning_rate": 1.9508110641264363e-05, + "loss": 0.5947, + "step": 10925 + }, + { + "epoch": 1.7836414840210604, + "grad_norm": 1.6698651313781738, + "learning_rate": 1.9508012335915943e-05, + "loss": 0.5975, + "step": 10926 + }, + { + "epoch": 1.7838047426635648, + "grad_norm": 1.6515226364135742, + "learning_rate": 1.950791402099296e-05, + "loss": 0.6616, + "step": 10927 + }, + { + "epoch": 1.7839680013060693, + "grad_norm": 1.8534449338912964, + "learning_rate": 1.950781569649551e-05, + "loss": 0.7861, + "step": 10928 + }, + { + "epoch": 1.7841312599485737, + "grad_norm": 1.7015877962112427, + "learning_rate": 1.9507717362423686e-05, + "loss": 0.6197, + "step": 10929 + }, + { + "epoch": 1.784294518591078, + "grad_norm": 1.470147728919983, + "learning_rate": 1.9507619018777597e-05, + "loss": 0.5684, + "step": 10930 + }, + { + "epoch": 1.7844577772335823, + "grad_norm": 1.8318754434585571, + "learning_rate": 1.950752066555734e-05, + "loss": 0.7034, + "step": 10931 + }, + { + "epoch": 1.7846210358760866, + "grad_norm": 2.2507457733154297, + "learning_rate": 1.9507422302763012e-05, + "loss": 0.7711, + "step": 10932 + }, + { + "epoch": 1.784784294518591, + "grad_norm": 1.7883604764938354, + "learning_rate": 1.950732393039471e-05, + "loss": 0.5786, + "step": 10933 + }, + { + "epoch": 1.7849475531610954, + "grad_norm": 1.744693636894226, + "learning_rate": 1.9507225548452536e-05, + "loss": 0.861, + "step": 10934 + }, + { + "epoch": 1.7851108118035999, + "grad_norm": 1.6499569416046143, + "learning_rate": 1.950712715693659e-05, + "loss": 0.6393, + "step": 10935 + }, + { + "epoch": 1.7852740704461043, + "grad_norm": 1.3422837257385254, + "learning_rate": 1.9507028755846965e-05, + "loss": 0.4565, + "step": 10936 + }, + { + "epoch": 1.7854373290886087, + "grad_norm": 1.9206533432006836, + "learning_rate": 1.950693034518377e-05, + "loss": 0.6765, + "step": 10937 + }, + { + "epoch": 1.785600587731113, + "grad_norm": 1.8371315002441406, + "learning_rate": 1.9506831924947096e-05, + "loss": 0.6633, + "step": 10938 + }, + { + "epoch": 1.7857638463736174, + "grad_norm": 1.7113640308380127, + "learning_rate": 1.9506733495137044e-05, + "loss": 0.636, + "step": 10939 + }, + { + "epoch": 1.7859271050161218, + "grad_norm": 1.4425818920135498, + "learning_rate": 1.9506635055753714e-05, + "loss": 0.6064, + "step": 10940 + }, + { + "epoch": 1.786090363658626, + "grad_norm": 1.6489992141723633, + "learning_rate": 1.9506536606797207e-05, + "loss": 0.7039, + "step": 10941 + }, + { + "epoch": 1.7862536223011305, + "grad_norm": 1.999359369277954, + "learning_rate": 1.9506438148267623e-05, + "loss": 0.695, + "step": 10942 + }, + { + "epoch": 1.786416880943635, + "grad_norm": 1.3801072835922241, + "learning_rate": 1.950633968016505e-05, + "loss": 0.5638, + "step": 10943 + }, + { + "epoch": 1.7865801395861394, + "grad_norm": 1.8162299394607544, + "learning_rate": 1.95062412024896e-05, + "loss": 0.6687, + "step": 10944 + }, + { + "epoch": 1.7867433982286438, + "grad_norm": 1.202967643737793, + "learning_rate": 1.950614271524137e-05, + "loss": 0.541, + "step": 10945 + }, + { + "epoch": 1.7869066568711482, + "grad_norm": 1.8636294603347778, + "learning_rate": 1.9506044218420452e-05, + "loss": 0.7896, + "step": 10946 + }, + { + "epoch": 1.7870699155136525, + "grad_norm": 1.6534351110458374, + "learning_rate": 1.9505945712026953e-05, + "loss": 0.6448, + "step": 10947 + }, + { + "epoch": 1.787233174156157, + "grad_norm": 1.6277782917022705, + "learning_rate": 1.950584719606097e-05, + "loss": 0.6023, + "step": 10948 + }, + { + "epoch": 1.7873964327986611, + "grad_norm": 1.7879890203475952, + "learning_rate": 1.95057486705226e-05, + "loss": 0.7853, + "step": 10949 + }, + { + "epoch": 1.7875596914411656, + "grad_norm": 1.671383261680603, + "learning_rate": 1.950565013541194e-05, + "loss": 0.6176, + "step": 10950 + }, + { + "epoch": 1.78772295008367, + "grad_norm": 2.2748847007751465, + "learning_rate": 1.95055515907291e-05, + "loss": 0.6725, + "step": 10951 + }, + { + "epoch": 1.7878862087261744, + "grad_norm": 1.761818528175354, + "learning_rate": 1.9505453036474167e-05, + "loss": 0.6628, + "step": 10952 + }, + { + "epoch": 1.7880494673686789, + "grad_norm": 1.8150115013122559, + "learning_rate": 1.950535447264725e-05, + "loss": 0.7235, + "step": 10953 + }, + { + "epoch": 1.7882127260111833, + "grad_norm": 1.8136879205703735, + "learning_rate": 1.950525589924844e-05, + "loss": 0.7979, + "step": 10954 + }, + { + "epoch": 1.7883759846536877, + "grad_norm": 1.7881759405136108, + "learning_rate": 1.950515731627784e-05, + "loss": 0.6053, + "step": 10955 + }, + { + "epoch": 1.788539243296192, + "grad_norm": 1.6208127737045288, + "learning_rate": 1.9505058723735547e-05, + "loss": 0.719, + "step": 10956 + }, + { + "epoch": 1.7887025019386964, + "grad_norm": 1.79038405418396, + "learning_rate": 1.9504960121621664e-05, + "loss": 0.6503, + "step": 10957 + }, + { + "epoch": 1.7888657605812006, + "grad_norm": 1.914711833000183, + "learning_rate": 1.950486150993629e-05, + "loss": 0.7484, + "step": 10958 + }, + { + "epoch": 1.789029019223705, + "grad_norm": 1.4020333290100098, + "learning_rate": 1.9504762888679524e-05, + "loss": 0.5701, + "step": 10959 + }, + { + "epoch": 1.7891922778662095, + "grad_norm": 1.9173874855041504, + "learning_rate": 1.9504664257851464e-05, + "loss": 0.7554, + "step": 10960 + }, + { + "epoch": 1.789355536508714, + "grad_norm": 1.640149712562561, + "learning_rate": 1.9504565617452206e-05, + "loss": 0.6913, + "step": 10961 + }, + { + "epoch": 1.7895187951512184, + "grad_norm": 1.6545718908309937, + "learning_rate": 1.9504466967481855e-05, + "loss": 0.637, + "step": 10962 + }, + { + "epoch": 1.7896820537937228, + "grad_norm": 1.9426301717758179, + "learning_rate": 1.950436830794051e-05, + "loss": 0.6781, + "step": 10963 + }, + { + "epoch": 1.7898453124362272, + "grad_norm": 1.5893805027008057, + "learning_rate": 1.9504269638828265e-05, + "loss": 0.6844, + "step": 10964 + }, + { + "epoch": 1.7900085710787315, + "grad_norm": 1.4105567932128906, + "learning_rate": 1.9504170960145226e-05, + "loss": 0.598, + "step": 10965 + }, + { + "epoch": 1.790171829721236, + "grad_norm": 1.6279853582382202, + "learning_rate": 1.9504072271891486e-05, + "loss": 0.7025, + "step": 10966 + }, + { + "epoch": 1.7903350883637401, + "grad_norm": 1.7547433376312256, + "learning_rate": 1.9503973574067152e-05, + "loss": 0.6982, + "step": 10967 + }, + { + "epoch": 1.7904983470062446, + "grad_norm": 1.6257331371307373, + "learning_rate": 1.9503874866672318e-05, + "loss": 0.7528, + "step": 10968 + }, + { + "epoch": 1.790661605648749, + "grad_norm": 1.776758074760437, + "learning_rate": 1.9503776149707082e-05, + "loss": 0.716, + "step": 10969 + }, + { + "epoch": 1.7908248642912534, + "grad_norm": 1.7773115634918213, + "learning_rate": 1.950367742317155e-05, + "loss": 0.6629, + "step": 10970 + }, + { + "epoch": 1.7909881229337579, + "grad_norm": 1.7956613302230835, + "learning_rate": 1.9503578687065816e-05, + "loss": 0.5401, + "step": 10971 + }, + { + "epoch": 1.7911513815762623, + "grad_norm": 1.4034658670425415, + "learning_rate": 1.950347994138998e-05, + "loss": 0.5993, + "step": 10972 + }, + { + "epoch": 1.7913146402187667, + "grad_norm": 1.7711936235427856, + "learning_rate": 1.9503381186144145e-05, + "loss": 0.639, + "step": 10973 + }, + { + "epoch": 1.791477898861271, + "grad_norm": 1.958225965499878, + "learning_rate": 1.9503282421328402e-05, + "loss": 0.703, + "step": 10974 + }, + { + "epoch": 1.7916411575037754, + "grad_norm": 1.7456353902816772, + "learning_rate": 1.9503183646942857e-05, + "loss": 0.6142, + "step": 10975 + }, + { + "epoch": 1.7918044161462796, + "grad_norm": 1.5796600580215454, + "learning_rate": 1.950308486298761e-05, + "loss": 0.5672, + "step": 10976 + }, + { + "epoch": 1.791967674788784, + "grad_norm": 1.3280037641525269, + "learning_rate": 1.950298606946276e-05, + "loss": 0.5625, + "step": 10977 + }, + { + "epoch": 1.7921309334312885, + "grad_norm": 1.709578037261963, + "learning_rate": 1.9502887266368406e-05, + "loss": 0.7647, + "step": 10978 + }, + { + "epoch": 1.792294192073793, + "grad_norm": 1.7585728168487549, + "learning_rate": 1.950278845370465e-05, + "loss": 0.6456, + "step": 10979 + }, + { + "epoch": 1.7924574507162974, + "grad_norm": 1.3722118139266968, + "learning_rate": 1.9502689631471582e-05, + "loss": 0.622, + "step": 10980 + }, + { + "epoch": 1.7926207093588018, + "grad_norm": 1.5017002820968628, + "learning_rate": 1.950259079966931e-05, + "loss": 0.6302, + "step": 10981 + }, + { + "epoch": 1.792783968001306, + "grad_norm": 1.9167367219924927, + "learning_rate": 1.9502491958297932e-05, + "loss": 0.6902, + "step": 10982 + }, + { + "epoch": 1.7929472266438105, + "grad_norm": 1.8151402473449707, + "learning_rate": 1.950239310735755e-05, + "loss": 0.6232, + "step": 10983 + }, + { + "epoch": 1.793110485286315, + "grad_norm": 1.6310298442840576, + "learning_rate": 1.950229424684826e-05, + "loss": 0.5942, + "step": 10984 + }, + { + "epoch": 1.793273743928819, + "grad_norm": 1.7940322160720825, + "learning_rate": 1.9502195376770156e-05, + "loss": 0.7581, + "step": 10985 + }, + { + "epoch": 1.7934370025713235, + "grad_norm": 1.4893049001693726, + "learning_rate": 1.9502096497123352e-05, + "loss": 0.5904, + "step": 10986 + }, + { + "epoch": 1.793600261213828, + "grad_norm": 1.6086452007293701, + "learning_rate": 1.9501997607907936e-05, + "loss": 0.544, + "step": 10987 + }, + { + "epoch": 1.7937635198563324, + "grad_norm": 2.5124011039733887, + "learning_rate": 1.950189870912401e-05, + "loss": 0.6022, + "step": 10988 + }, + { + "epoch": 1.7939267784988369, + "grad_norm": 1.6374303102493286, + "learning_rate": 1.9501799800771674e-05, + "loss": 0.6678, + "step": 10989 + }, + { + "epoch": 1.7940900371413413, + "grad_norm": 1.6227725744247437, + "learning_rate": 1.950170088285103e-05, + "loss": 0.5933, + "step": 10990 + }, + { + "epoch": 1.7942532957838455, + "grad_norm": 1.307260513305664, + "learning_rate": 1.9501601955362172e-05, + "loss": 0.5326, + "step": 10991 + }, + { + "epoch": 1.79441655442635, + "grad_norm": 1.6664745807647705, + "learning_rate": 1.9501503018305206e-05, + "loss": 0.5674, + "step": 10992 + }, + { + "epoch": 1.7945798130688542, + "grad_norm": 1.581840991973877, + "learning_rate": 1.950140407168023e-05, + "loss": 0.6307, + "step": 10993 + }, + { + "epoch": 1.7947430717113586, + "grad_norm": 2.0847830772399902, + "learning_rate": 1.9501305115487345e-05, + "loss": 0.8225, + "step": 10994 + }, + { + "epoch": 1.794906330353863, + "grad_norm": 1.8122012615203857, + "learning_rate": 1.950120614972664e-05, + "loss": 0.7781, + "step": 10995 + }, + { + "epoch": 1.7950695889963675, + "grad_norm": 1.956193447113037, + "learning_rate": 1.950110717439823e-05, + "loss": 0.7514, + "step": 10996 + }, + { + "epoch": 1.795232847638872, + "grad_norm": 1.7041512727737427, + "learning_rate": 1.9501008189502206e-05, + "loss": 0.5448, + "step": 10997 + }, + { + "epoch": 1.7953961062813764, + "grad_norm": 1.6150808334350586, + "learning_rate": 1.950090919503867e-05, + "loss": 0.6938, + "step": 10998 + }, + { + "epoch": 1.7955593649238808, + "grad_norm": 2.1417901515960693, + "learning_rate": 1.9500810191007717e-05, + "loss": 0.7258, + "step": 10999 + }, + { + "epoch": 1.795722623566385, + "grad_norm": 2.011456251144409, + "learning_rate": 1.9500711177409456e-05, + "loss": 1.074, + "step": 11000 + }, + { + "epoch": 1.7958858822088895, + "grad_norm": 1.5073484182357788, + "learning_rate": 1.950061215424398e-05, + "loss": 0.6574, + "step": 11001 + }, + { + "epoch": 1.7960491408513937, + "grad_norm": 1.8210022449493408, + "learning_rate": 1.9500513121511386e-05, + "loss": 0.6497, + "step": 11002 + }, + { + "epoch": 1.796212399493898, + "grad_norm": 1.3758474588394165, + "learning_rate": 1.9500414079211782e-05, + "loss": 0.6458, + "step": 11003 + }, + { + "epoch": 1.7963756581364025, + "grad_norm": 1.6705750226974487, + "learning_rate": 1.9500315027345264e-05, + "loss": 0.6876, + "step": 11004 + }, + { + "epoch": 1.796538916778907, + "grad_norm": 1.51702082157135, + "learning_rate": 1.950021596591193e-05, + "loss": 0.5106, + "step": 11005 + }, + { + "epoch": 1.7967021754214114, + "grad_norm": 1.6331568956375122, + "learning_rate": 1.9500116894911878e-05, + "loss": 0.5728, + "step": 11006 + }, + { + "epoch": 1.7968654340639159, + "grad_norm": 1.6726586818695068, + "learning_rate": 1.9500017814345213e-05, + "loss": 0.6824, + "step": 11007 + }, + { + "epoch": 1.7970286927064203, + "grad_norm": 1.8206206560134888, + "learning_rate": 1.9499918724212034e-05, + "loss": 0.7046, + "step": 11008 + }, + { + "epoch": 1.7971919513489245, + "grad_norm": 1.7003638744354248, + "learning_rate": 1.949981962451244e-05, + "loss": 0.6858, + "step": 11009 + }, + { + "epoch": 1.797355209991429, + "grad_norm": 1.58940589427948, + "learning_rate": 1.9499720515246524e-05, + "loss": 0.7, + "step": 11010 + }, + { + "epoch": 1.7975184686339332, + "grad_norm": 1.6202011108398438, + "learning_rate": 1.94996213964144e-05, + "loss": 0.6121, + "step": 11011 + }, + { + "epoch": 1.7976817272764376, + "grad_norm": 1.948164701461792, + "learning_rate": 1.9499522268016155e-05, + "loss": 0.6405, + "step": 11012 + }, + { + "epoch": 1.797844985918942, + "grad_norm": 1.833451747894287, + "learning_rate": 1.9499423130051895e-05, + "loss": 0.6793, + "step": 11013 + }, + { + "epoch": 1.7980082445614465, + "grad_norm": 1.7831531763076782, + "learning_rate": 1.949932398252172e-05, + "loss": 0.7406, + "step": 11014 + }, + { + "epoch": 1.798171503203951, + "grad_norm": 1.360020637512207, + "learning_rate": 1.9499224825425727e-05, + "loss": 0.5295, + "step": 11015 + }, + { + "epoch": 1.7983347618464554, + "grad_norm": 1.5655766725540161, + "learning_rate": 1.9499125658764014e-05, + "loss": 0.5825, + "step": 11016 + }, + { + "epoch": 1.7984980204889598, + "grad_norm": 1.7183139324188232, + "learning_rate": 1.9499026482536688e-05, + "loss": 0.7503, + "step": 11017 + }, + { + "epoch": 1.798661279131464, + "grad_norm": 1.521177887916565, + "learning_rate": 1.9498927296743842e-05, + "loss": 0.5439, + "step": 11018 + }, + { + "epoch": 1.7988245377739684, + "grad_norm": 1.4163738489151, + "learning_rate": 1.9498828101385582e-05, + "loss": 0.4802, + "step": 11019 + }, + { + "epoch": 1.7989877964164727, + "grad_norm": 1.779533863067627, + "learning_rate": 1.9498728896462002e-05, + "loss": 0.6561, + "step": 11020 + }, + { + "epoch": 1.799151055058977, + "grad_norm": 1.7703741788864136, + "learning_rate": 1.9498629681973208e-05, + "loss": 0.6692, + "step": 11021 + }, + { + "epoch": 1.7993143137014815, + "grad_norm": 1.3677408695220947, + "learning_rate": 1.949853045791929e-05, + "loss": 0.5202, + "step": 11022 + }, + { + "epoch": 1.799477572343986, + "grad_norm": 1.752852201461792, + "learning_rate": 1.949843122430036e-05, + "loss": 0.6024, + "step": 11023 + }, + { + "epoch": 1.7996408309864904, + "grad_norm": 1.8039292097091675, + "learning_rate": 1.9498331981116512e-05, + "loss": 0.6691, + "step": 11024 + }, + { + "epoch": 1.7998040896289949, + "grad_norm": 1.6552469730377197, + "learning_rate": 1.9498232728367843e-05, + "loss": 0.5659, + "step": 11025 + }, + { + "epoch": 1.799967348271499, + "grad_norm": 1.9618523120880127, + "learning_rate": 1.9498133466054457e-05, + "loss": 0.7139, + "step": 11026 + }, + { + "epoch": 1.8001306069140035, + "grad_norm": 1.5022468566894531, + "learning_rate": 1.9498034194176454e-05, + "loss": 0.5399, + "step": 11027 + }, + { + "epoch": 1.800293865556508, + "grad_norm": 1.4631116390228271, + "learning_rate": 1.9497934912733933e-05, + "loss": 0.6006, + "step": 11028 + }, + { + "epoch": 1.8004571241990122, + "grad_norm": 1.6841527223587036, + "learning_rate": 1.9497835621726995e-05, + "loss": 0.6096, + "step": 11029 + }, + { + "epoch": 1.8006203828415166, + "grad_norm": 1.5290027856826782, + "learning_rate": 1.9497736321155737e-05, + "loss": 0.6097, + "step": 11030 + }, + { + "epoch": 1.800783641484021, + "grad_norm": 1.6787018775939941, + "learning_rate": 1.9497637011020264e-05, + "loss": 0.6967, + "step": 11031 + }, + { + "epoch": 1.8009469001265255, + "grad_norm": 1.9323915243148804, + "learning_rate": 1.949753769132067e-05, + "loss": 0.7356, + "step": 11032 + }, + { + "epoch": 1.80111015876903, + "grad_norm": 1.8231273889541626, + "learning_rate": 1.949743836205706e-05, + "loss": 0.608, + "step": 11033 + }, + { + "epoch": 1.8012734174115343, + "grad_norm": 1.6533604860305786, + "learning_rate": 1.949733902322953e-05, + "loss": 0.6386, + "step": 11034 + }, + { + "epoch": 1.8014366760540386, + "grad_norm": 1.9590764045715332, + "learning_rate": 1.9497239674838183e-05, + "loss": 0.8171, + "step": 11035 + }, + { + "epoch": 1.801599934696543, + "grad_norm": 1.7612595558166504, + "learning_rate": 1.949714031688312e-05, + "loss": 0.6796, + "step": 11036 + }, + { + "epoch": 1.8017631933390472, + "grad_norm": 1.6914145946502686, + "learning_rate": 1.949704094936444e-05, + "loss": 0.6809, + "step": 11037 + }, + { + "epoch": 1.8019264519815517, + "grad_norm": 1.405837893486023, + "learning_rate": 1.9496941572282237e-05, + "loss": 0.6175, + "step": 11038 + }, + { + "epoch": 1.802089710624056, + "grad_norm": 2.1227335929870605, + "learning_rate": 1.949684218563662e-05, + "loss": 0.6949, + "step": 11039 + }, + { + "epoch": 1.8022529692665605, + "grad_norm": 1.6746982336044312, + "learning_rate": 1.9496742789427684e-05, + "loss": 0.6762, + "step": 11040 + }, + { + "epoch": 1.802416227909065, + "grad_norm": 1.447788953781128, + "learning_rate": 1.9496643383655533e-05, + "loss": 0.5899, + "step": 11041 + }, + { + "epoch": 1.8025794865515694, + "grad_norm": 1.5123610496520996, + "learning_rate": 1.9496543968320262e-05, + "loss": 0.5845, + "step": 11042 + }, + { + "epoch": 1.8027427451940738, + "grad_norm": 1.5610320568084717, + "learning_rate": 1.9496444543421975e-05, + "loss": 0.5345, + "step": 11043 + }, + { + "epoch": 1.802906003836578, + "grad_norm": 1.5995087623596191, + "learning_rate": 1.949634510896077e-05, + "loss": 0.7436, + "step": 11044 + }, + { + "epoch": 1.8030692624790825, + "grad_norm": 1.524634599685669, + "learning_rate": 1.9496245664936752e-05, + "loss": 0.6011, + "step": 11045 + }, + { + "epoch": 1.8032325211215867, + "grad_norm": 1.7527985572814941, + "learning_rate": 1.9496146211350015e-05, + "loss": 0.7144, + "step": 11046 + }, + { + "epoch": 1.8033957797640912, + "grad_norm": 2.2442309856414795, + "learning_rate": 1.9496046748200655e-05, + "loss": 0.7303, + "step": 11047 + }, + { + "epoch": 1.8035590384065956, + "grad_norm": 1.8784981966018677, + "learning_rate": 1.9495947275488785e-05, + "loss": 0.6966, + "step": 11048 + }, + { + "epoch": 1.8037222970491, + "grad_norm": 2.1215879917144775, + "learning_rate": 1.94958477932145e-05, + "loss": 0.8389, + "step": 11049 + }, + { + "epoch": 1.8038855556916045, + "grad_norm": 1.6216996908187866, + "learning_rate": 1.9495748301377895e-05, + "loss": 0.6188, + "step": 11050 + }, + { + "epoch": 1.804048814334109, + "grad_norm": 1.7379426956176758, + "learning_rate": 1.9495648799979076e-05, + "loss": 0.6953, + "step": 11051 + }, + { + "epoch": 1.8042120729766133, + "grad_norm": 1.5838842391967773, + "learning_rate": 1.949554928901814e-05, + "loss": 0.6626, + "step": 11052 + }, + { + "epoch": 1.8043753316191176, + "grad_norm": 1.4576048851013184, + "learning_rate": 1.9495449768495185e-05, + "loss": 0.4798, + "step": 11053 + }, + { + "epoch": 1.804538590261622, + "grad_norm": 1.676416039466858, + "learning_rate": 1.949535023841032e-05, + "loss": 0.6886, + "step": 11054 + }, + { + "epoch": 1.8047018489041262, + "grad_norm": 1.896082878112793, + "learning_rate": 1.9495250698763637e-05, + "loss": 0.7831, + "step": 11055 + }, + { + "epoch": 1.8048651075466307, + "grad_norm": 2.168466329574585, + "learning_rate": 1.9495151149555243e-05, + "loss": 0.6503, + "step": 11056 + }, + { + "epoch": 1.805028366189135, + "grad_norm": 2.002229928970337, + "learning_rate": 1.949505159078523e-05, + "loss": 0.7434, + "step": 11057 + }, + { + "epoch": 1.8051916248316395, + "grad_norm": 1.827767252922058, + "learning_rate": 1.9494952022453703e-05, + "loss": 0.6399, + "step": 11058 + }, + { + "epoch": 1.805354883474144, + "grad_norm": 1.5005276203155518, + "learning_rate": 1.9494852444560764e-05, + "loss": 0.6119, + "step": 11059 + }, + { + "epoch": 1.8055181421166484, + "grad_norm": 1.9106030464172363, + "learning_rate": 1.949475285710651e-05, + "loss": 0.6963, + "step": 11060 + }, + { + "epoch": 1.8056814007591528, + "grad_norm": 1.6177887916564941, + "learning_rate": 1.949465326009104e-05, + "loss": 0.5945, + "step": 11061 + }, + { + "epoch": 1.805844659401657, + "grad_norm": 2.0745201110839844, + "learning_rate": 1.949455365351446e-05, + "loss": 0.7515, + "step": 11062 + }, + { + "epoch": 1.8060079180441615, + "grad_norm": 1.9425119161605835, + "learning_rate": 1.9494454037376866e-05, + "loss": 0.8745, + "step": 11063 + }, + { + "epoch": 1.8061711766866657, + "grad_norm": 1.957694411277771, + "learning_rate": 1.949435441167836e-05, + "loss": 0.7694, + "step": 11064 + }, + { + "epoch": 1.8063344353291702, + "grad_norm": 1.573350429534912, + "learning_rate": 1.949425477641904e-05, + "loss": 0.5876, + "step": 11065 + }, + { + "epoch": 1.8064976939716746, + "grad_norm": 1.619499921798706, + "learning_rate": 1.9494155131599007e-05, + "loss": 0.5589, + "step": 11066 + }, + { + "epoch": 1.806660952614179, + "grad_norm": 1.5997307300567627, + "learning_rate": 1.9494055477218366e-05, + "loss": 0.6226, + "step": 11067 + }, + { + "epoch": 1.8068242112566835, + "grad_norm": 1.6940546035766602, + "learning_rate": 1.949395581327721e-05, + "loss": 0.6639, + "step": 11068 + }, + { + "epoch": 1.806987469899188, + "grad_norm": 2.0067224502563477, + "learning_rate": 1.9493856139775645e-05, + "loss": 0.7095, + "step": 11069 + }, + { + "epoch": 1.8071507285416923, + "grad_norm": 1.8388453722000122, + "learning_rate": 1.949375645671377e-05, + "loss": 0.8511, + "step": 11070 + }, + { + "epoch": 1.8073139871841966, + "grad_norm": 1.4517158269882202, + "learning_rate": 1.9493656764091687e-05, + "loss": 0.579, + "step": 11071 + }, + { + "epoch": 1.807477245826701, + "grad_norm": 1.6384245157241821, + "learning_rate": 1.9493557061909487e-05, + "loss": 0.6546, + "step": 11072 + }, + { + "epoch": 1.8076405044692052, + "grad_norm": 1.8635280132293701, + "learning_rate": 1.949345735016728e-05, + "loss": 0.6664, + "step": 11073 + }, + { + "epoch": 1.8078037631117096, + "grad_norm": 1.9850223064422607, + "learning_rate": 1.949335762886517e-05, + "loss": 0.7521, + "step": 11074 + }, + { + "epoch": 1.807967021754214, + "grad_norm": 1.514289140701294, + "learning_rate": 1.9493257898003247e-05, + "loss": 0.5888, + "step": 11075 + }, + { + "epoch": 1.8081302803967185, + "grad_norm": 1.883723497390747, + "learning_rate": 1.9493158157581617e-05, + "loss": 0.749, + "step": 11076 + }, + { + "epoch": 1.808293539039223, + "grad_norm": 2.0146236419677734, + "learning_rate": 1.9493058407600375e-05, + "loss": 0.6451, + "step": 11077 + }, + { + "epoch": 1.8084567976817274, + "grad_norm": 1.5513850450515747, + "learning_rate": 1.949295864805963e-05, + "loss": 0.5897, + "step": 11078 + }, + { + "epoch": 1.8086200563242316, + "grad_norm": 1.7357043027877808, + "learning_rate": 1.949285887895948e-05, + "loss": 0.6933, + "step": 11079 + }, + { + "epoch": 1.808783314966736, + "grad_norm": 1.5706382989883423, + "learning_rate": 1.949275910030002e-05, + "loss": 0.6496, + "step": 11080 + }, + { + "epoch": 1.8089465736092405, + "grad_norm": 1.4792040586471558, + "learning_rate": 1.9492659312081353e-05, + "loss": 0.6011, + "step": 11081 + }, + { + "epoch": 1.8091098322517447, + "grad_norm": 1.7933368682861328, + "learning_rate": 1.949255951430358e-05, + "loss": 0.8305, + "step": 11082 + }, + { + "epoch": 1.8092730908942491, + "grad_norm": 1.4616010189056396, + "learning_rate": 1.9492459706966805e-05, + "loss": 0.6949, + "step": 11083 + }, + { + "epoch": 1.8094363495367536, + "grad_norm": 1.5476882457733154, + "learning_rate": 1.9492359890071126e-05, + "loss": 0.6311, + "step": 11084 + }, + { + "epoch": 1.809599608179258, + "grad_norm": 1.691097378730774, + "learning_rate": 1.949226006361664e-05, + "loss": 0.5961, + "step": 11085 + }, + { + "epoch": 1.8097628668217625, + "grad_norm": 1.705325722694397, + "learning_rate": 1.9492160227603455e-05, + "loss": 0.6277, + "step": 11086 + }, + { + "epoch": 1.809926125464267, + "grad_norm": 1.5117729902267456, + "learning_rate": 1.9492060382031663e-05, + "loss": 0.6084, + "step": 11087 + }, + { + "epoch": 1.8100893841067711, + "grad_norm": 1.4805537462234497, + "learning_rate": 1.949196052690137e-05, + "loss": 0.7033, + "step": 11088 + }, + { + "epoch": 1.8102526427492756, + "grad_norm": 1.5191841125488281, + "learning_rate": 1.949186066221268e-05, + "loss": 0.6451, + "step": 11089 + }, + { + "epoch": 1.8104159013917798, + "grad_norm": 1.7236820459365845, + "learning_rate": 1.949176078796568e-05, + "loss": 0.5559, + "step": 11090 + }, + { + "epoch": 1.8105791600342842, + "grad_norm": 1.9414710998535156, + "learning_rate": 1.9491660904160487e-05, + "loss": 0.7223, + "step": 11091 + }, + { + "epoch": 1.8107424186767886, + "grad_norm": 1.4370970726013184, + "learning_rate": 1.9491561010797188e-05, + "loss": 0.6223, + "step": 11092 + }, + { + "epoch": 1.810905677319293, + "grad_norm": 1.5884202718734741, + "learning_rate": 1.949146110787589e-05, + "loss": 0.6707, + "step": 11093 + }, + { + "epoch": 1.8110689359617975, + "grad_norm": 1.7541905641555786, + "learning_rate": 1.9491361195396702e-05, + "loss": 0.6518, + "step": 11094 + }, + { + "epoch": 1.811232194604302, + "grad_norm": 1.494960904121399, + "learning_rate": 1.9491261273359708e-05, + "loss": 0.6113, + "step": 11095 + }, + { + "epoch": 1.8113954532468064, + "grad_norm": 1.6405655145645142, + "learning_rate": 1.9491161341765018e-05, + "loss": 0.6551, + "step": 11096 + }, + { + "epoch": 1.8115587118893106, + "grad_norm": 1.634151816368103, + "learning_rate": 1.949106140061273e-05, + "loss": 0.5169, + "step": 11097 + }, + { + "epoch": 1.811721970531815, + "grad_norm": 1.7516169548034668, + "learning_rate": 1.9490961449902946e-05, + "loss": 0.6103, + "step": 11098 + }, + { + "epoch": 1.8118852291743193, + "grad_norm": 2.7211694717407227, + "learning_rate": 1.9490861489635767e-05, + "loss": 0.7449, + "step": 11099 + }, + { + "epoch": 1.8120484878168237, + "grad_norm": 1.6961859464645386, + "learning_rate": 1.9490761519811295e-05, + "loss": 0.683, + "step": 11100 + }, + { + "epoch": 1.8122117464593281, + "grad_norm": 1.8151664733886719, + "learning_rate": 1.949066154042963e-05, + "loss": 0.6443, + "step": 11101 + }, + { + "epoch": 1.8123750051018326, + "grad_norm": 1.6227350234985352, + "learning_rate": 1.9490561551490863e-05, + "loss": 0.6272, + "step": 11102 + }, + { + "epoch": 1.812538263744337, + "grad_norm": 1.4189320802688599, + "learning_rate": 1.949046155299511e-05, + "loss": 0.6091, + "step": 11103 + }, + { + "epoch": 1.8127015223868415, + "grad_norm": 1.7289100885391235, + "learning_rate": 1.949036154494246e-05, + "loss": 0.5913, + "step": 11104 + }, + { + "epoch": 1.812864781029346, + "grad_norm": 1.2770830392837524, + "learning_rate": 1.949026152733302e-05, + "loss": 0.4446, + "step": 11105 + }, + { + "epoch": 1.8130280396718501, + "grad_norm": 1.9457275867462158, + "learning_rate": 1.9490161500166892e-05, + "loss": 0.6833, + "step": 11106 + }, + { + "epoch": 1.8131912983143545, + "grad_norm": 1.361099362373352, + "learning_rate": 1.9490061463444175e-05, + "loss": 0.6293, + "step": 11107 + }, + { + "epoch": 1.8133545569568588, + "grad_norm": 1.961525321006775, + "learning_rate": 1.9489961417164968e-05, + "loss": 0.8395, + "step": 11108 + }, + { + "epoch": 1.8135178155993632, + "grad_norm": 1.8126193284988403, + "learning_rate": 1.948986136132937e-05, + "loss": 0.6789, + "step": 11109 + }, + { + "epoch": 1.8136810742418676, + "grad_norm": 1.8337323665618896, + "learning_rate": 1.9489761295937483e-05, + "loss": 0.6231, + "step": 11110 + }, + { + "epoch": 1.813844332884372, + "grad_norm": 1.478893518447876, + "learning_rate": 1.948966122098941e-05, + "loss": 0.7619, + "step": 11111 + }, + { + "epoch": 1.8140075915268765, + "grad_norm": 1.8692429065704346, + "learning_rate": 1.9489561136485252e-05, + "loss": 0.6684, + "step": 11112 + }, + { + "epoch": 1.814170850169381, + "grad_norm": 1.9728418588638306, + "learning_rate": 1.9489461042425106e-05, + "loss": 0.7428, + "step": 11113 + }, + { + "epoch": 1.8143341088118854, + "grad_norm": 1.6161795854568481, + "learning_rate": 1.9489360938809076e-05, + "loss": 0.7056, + "step": 11114 + }, + { + "epoch": 1.8144973674543896, + "grad_norm": 1.6895562410354614, + "learning_rate": 1.9489260825637265e-05, + "loss": 0.5905, + "step": 11115 + }, + { + "epoch": 1.814660626096894, + "grad_norm": 1.5149509906768799, + "learning_rate": 1.948916070290977e-05, + "loss": 0.5592, + "step": 11116 + }, + { + "epoch": 1.8148238847393983, + "grad_norm": 1.7208253145217896, + "learning_rate": 1.948906057062669e-05, + "loss": 0.721, + "step": 11117 + }, + { + "epoch": 1.8149871433819027, + "grad_norm": 2.1671388149261475, + "learning_rate": 1.948896042878813e-05, + "loss": 0.8171, + "step": 11118 + }, + { + "epoch": 1.8151504020244071, + "grad_norm": 1.679381251335144, + "learning_rate": 1.948886027739419e-05, + "loss": 0.6085, + "step": 11119 + }, + { + "epoch": 1.8153136606669116, + "grad_norm": 1.335627555847168, + "learning_rate": 1.9488760116444966e-05, + "loss": 0.5721, + "step": 11120 + }, + { + "epoch": 1.815476919309416, + "grad_norm": 2.0207953453063965, + "learning_rate": 1.9488659945940568e-05, + "loss": 0.7299, + "step": 11121 + }, + { + "epoch": 1.8156401779519205, + "grad_norm": 1.7400703430175781, + "learning_rate": 1.948855976588109e-05, + "loss": 0.7527, + "step": 11122 + }, + { + "epoch": 1.8158034365944247, + "grad_norm": 1.504014253616333, + "learning_rate": 1.9488459576266634e-05, + "loss": 0.647, + "step": 11123 + }, + { + "epoch": 1.815966695236929, + "grad_norm": 1.774139165878296, + "learning_rate": 1.9488359377097302e-05, + "loss": 0.7673, + "step": 11124 + }, + { + "epoch": 1.8161299538794335, + "grad_norm": 1.3680108785629272, + "learning_rate": 1.9488259168373198e-05, + "loss": 0.5301, + "step": 11125 + }, + { + "epoch": 1.8162932125219378, + "grad_norm": 1.664029598236084, + "learning_rate": 1.9488158950094417e-05, + "loss": 0.6912, + "step": 11126 + }, + { + "epoch": 1.8164564711644422, + "grad_norm": 1.821329116821289, + "learning_rate": 1.9488058722261063e-05, + "loss": 0.7851, + "step": 11127 + }, + { + "epoch": 1.8166197298069466, + "grad_norm": 1.7535254955291748, + "learning_rate": 1.948795848487324e-05, + "loss": 0.5771, + "step": 11128 + }, + { + "epoch": 1.816782988449451, + "grad_norm": 1.8704736232757568, + "learning_rate": 1.9487858237931035e-05, + "loss": 0.7331, + "step": 11129 + }, + { + "epoch": 1.8169462470919555, + "grad_norm": 1.6518393754959106, + "learning_rate": 1.9487757981434568e-05, + "loss": 0.7037, + "step": 11130 + }, + { + "epoch": 1.81710950573446, + "grad_norm": 1.6832218170166016, + "learning_rate": 1.9487657715383928e-05, + "loss": 0.6977, + "step": 11131 + }, + { + "epoch": 1.8172727643769642, + "grad_norm": 1.6304597854614258, + "learning_rate": 1.948755743977922e-05, + "loss": 0.6028, + "step": 11132 + }, + { + "epoch": 1.8174360230194686, + "grad_norm": 2.182384490966797, + "learning_rate": 1.9487457154620544e-05, + "loss": 0.6757, + "step": 11133 + }, + { + "epoch": 1.8175992816619728, + "grad_norm": 1.6859127283096313, + "learning_rate": 1.9487356859908003e-05, + "loss": 0.699, + "step": 11134 + }, + { + "epoch": 1.8177625403044773, + "grad_norm": 1.6341341733932495, + "learning_rate": 1.9487256555641692e-05, + "loss": 0.6604, + "step": 11135 + }, + { + "epoch": 1.8179257989469817, + "grad_norm": 1.5923099517822266, + "learning_rate": 1.948715624182172e-05, + "loss": 0.6565, + "step": 11136 + }, + { + "epoch": 1.8180890575894861, + "grad_norm": 1.6098967790603638, + "learning_rate": 1.9487055918448184e-05, + "loss": 0.6846, + "step": 11137 + }, + { + "epoch": 1.8182523162319906, + "grad_norm": 1.581106185913086, + "learning_rate": 1.9486955585521183e-05, + "loss": 0.7483, + "step": 11138 + }, + { + "epoch": 1.818415574874495, + "grad_norm": 1.449518084526062, + "learning_rate": 1.9486855243040822e-05, + "loss": 0.6561, + "step": 11139 + }, + { + "epoch": 1.8185788335169994, + "grad_norm": 1.4742976427078247, + "learning_rate": 1.9486754891007197e-05, + "loss": 0.5969, + "step": 11140 + }, + { + "epoch": 1.8187420921595037, + "grad_norm": 1.4525151252746582, + "learning_rate": 1.9486654529420415e-05, + "loss": 0.5824, + "step": 11141 + }, + { + "epoch": 1.818905350802008, + "grad_norm": 2.0713491439819336, + "learning_rate": 1.9486554158280576e-05, + "loss": 0.6884, + "step": 11142 + }, + { + "epoch": 1.8190686094445123, + "grad_norm": 1.5504671335220337, + "learning_rate": 1.9486453777587777e-05, + "loss": 0.6468, + "step": 11143 + }, + { + "epoch": 1.8192318680870168, + "grad_norm": 1.671420693397522, + "learning_rate": 1.9486353387342124e-05, + "loss": 0.6752, + "step": 11144 + }, + { + "epoch": 1.8193951267295212, + "grad_norm": 1.8407148122787476, + "learning_rate": 1.9486252987543715e-05, + "loss": 0.7886, + "step": 11145 + }, + { + "epoch": 1.8195583853720256, + "grad_norm": 1.8456368446350098, + "learning_rate": 1.948615257819265e-05, + "loss": 0.7491, + "step": 11146 + }, + { + "epoch": 1.81972164401453, + "grad_norm": 1.5769352912902832, + "learning_rate": 1.9486052159289033e-05, + "loss": 0.6474, + "step": 11147 + }, + { + "epoch": 1.8198849026570345, + "grad_norm": 1.8910030126571655, + "learning_rate": 1.9485951730832965e-05, + "loss": 0.7523, + "step": 11148 + }, + { + "epoch": 1.820048161299539, + "grad_norm": 2.1184542179107666, + "learning_rate": 1.9485851292824543e-05, + "loss": 0.8583, + "step": 11149 + }, + { + "epoch": 1.8202114199420432, + "grad_norm": 1.5855315923690796, + "learning_rate": 1.9485750845263874e-05, + "loss": 0.598, + "step": 11150 + }, + { + "epoch": 1.8203746785845476, + "grad_norm": 1.4453893899917603, + "learning_rate": 1.9485650388151054e-05, + "loss": 0.562, + "step": 11151 + }, + { + "epoch": 1.8205379372270518, + "grad_norm": 1.5758185386657715, + "learning_rate": 1.948554992148619e-05, + "loss": 0.6767, + "step": 11152 + }, + { + "epoch": 1.8207011958695563, + "grad_norm": 1.5604318380355835, + "learning_rate": 1.9485449445269376e-05, + "loss": 0.7189, + "step": 11153 + }, + { + "epoch": 1.8208644545120607, + "grad_norm": 1.8015416860580444, + "learning_rate": 1.948534895950072e-05, + "loss": 0.7494, + "step": 11154 + }, + { + "epoch": 1.8210277131545651, + "grad_norm": 1.725684404373169, + "learning_rate": 1.948524846418032e-05, + "loss": 0.6621, + "step": 11155 + }, + { + "epoch": 1.8211909717970696, + "grad_norm": 1.4514883756637573, + "learning_rate": 1.9485147959308275e-05, + "loss": 0.5491, + "step": 11156 + }, + { + "epoch": 1.821354230439574, + "grad_norm": 1.6093614101409912, + "learning_rate": 1.9485047444884688e-05, + "loss": 0.683, + "step": 11157 + }, + { + "epoch": 1.8215174890820784, + "grad_norm": 1.4736398458480835, + "learning_rate": 1.9484946920909663e-05, + "loss": 0.6712, + "step": 11158 + }, + { + "epoch": 1.8216807477245827, + "grad_norm": 1.6235793828964233, + "learning_rate": 1.9484846387383298e-05, + "loss": 0.7192, + "step": 11159 + }, + { + "epoch": 1.821844006367087, + "grad_norm": 1.9765546321868896, + "learning_rate": 1.9484745844305694e-05, + "loss": 0.7272, + "step": 11160 + }, + { + "epoch": 1.8220072650095913, + "grad_norm": 1.62570321559906, + "learning_rate": 1.9484645291676957e-05, + "loss": 0.6545, + "step": 11161 + }, + { + "epoch": 1.8221705236520958, + "grad_norm": 1.3362847566604614, + "learning_rate": 1.948454472949718e-05, + "loss": 0.5445, + "step": 11162 + }, + { + "epoch": 1.8223337822946002, + "grad_norm": 1.806139349937439, + "learning_rate": 1.9484444157766473e-05, + "loss": 0.6307, + "step": 11163 + }, + { + "epoch": 1.8224970409371046, + "grad_norm": 1.8467718362808228, + "learning_rate": 1.9484343576484935e-05, + "loss": 0.6713, + "step": 11164 + }, + { + "epoch": 1.822660299579609, + "grad_norm": 1.821985125541687, + "learning_rate": 1.948424298565266e-05, + "loss": 0.6373, + "step": 11165 + }, + { + "epoch": 1.8228235582221135, + "grad_norm": 1.55324387550354, + "learning_rate": 1.9484142385269758e-05, + "loss": 0.6142, + "step": 11166 + }, + { + "epoch": 1.8229868168646177, + "grad_norm": 1.7355291843414307, + "learning_rate": 1.9484041775336325e-05, + "loss": 0.752, + "step": 11167 + }, + { + "epoch": 1.8231500755071222, + "grad_norm": 1.7012298107147217, + "learning_rate": 1.9483941155852467e-05, + "loss": 0.7618, + "step": 11168 + }, + { + "epoch": 1.8233133341496266, + "grad_norm": 2.0106306076049805, + "learning_rate": 1.948384052681828e-05, + "loss": 0.899, + "step": 11169 + }, + { + "epoch": 1.8234765927921308, + "grad_norm": 1.5781197547912598, + "learning_rate": 1.948373988823387e-05, + "loss": 0.6533, + "step": 11170 + }, + { + "epoch": 1.8236398514346353, + "grad_norm": 1.6318929195404053, + "learning_rate": 1.9483639240099337e-05, + "loss": 0.6613, + "step": 11171 + }, + { + "epoch": 1.8238031100771397, + "grad_norm": 1.4863784313201904, + "learning_rate": 1.9483538582414784e-05, + "loss": 0.5246, + "step": 11172 + }, + { + "epoch": 1.8239663687196441, + "grad_norm": 1.7090933322906494, + "learning_rate": 1.9483437915180307e-05, + "loss": 0.6889, + "step": 11173 + }, + { + "epoch": 1.8241296273621486, + "grad_norm": 1.8445919752120972, + "learning_rate": 1.9483337238396008e-05, + "loss": 0.6615, + "step": 11174 + }, + { + "epoch": 1.824292886004653, + "grad_norm": 1.5850337743759155, + "learning_rate": 1.9483236552061996e-05, + "loss": 0.6031, + "step": 11175 + }, + { + "epoch": 1.8244561446471572, + "grad_norm": 2.027003765106201, + "learning_rate": 1.9483135856178364e-05, + "loss": 0.8073, + "step": 11176 + }, + { + "epoch": 1.8246194032896617, + "grad_norm": 1.7942676544189453, + "learning_rate": 1.948303515074522e-05, + "loss": 0.6936, + "step": 11177 + }, + { + "epoch": 1.8247826619321659, + "grad_norm": 1.6726871728897095, + "learning_rate": 1.9482934435762658e-05, + "loss": 0.5856, + "step": 11178 + }, + { + "epoch": 1.8249459205746703, + "grad_norm": 1.5492547750473022, + "learning_rate": 1.9482833711230783e-05, + "loss": 0.5953, + "step": 11179 + }, + { + "epoch": 1.8251091792171747, + "grad_norm": 1.7913460731506348, + "learning_rate": 1.94827329771497e-05, + "loss": 0.7202, + "step": 11180 + }, + { + "epoch": 1.8252724378596792, + "grad_norm": 2.021027088165283, + "learning_rate": 1.9482632233519506e-05, + "loss": 0.7561, + "step": 11181 + }, + { + "epoch": 1.8254356965021836, + "grad_norm": 1.536083459854126, + "learning_rate": 1.948253148034031e-05, + "loss": 0.5773, + "step": 11182 + }, + { + "epoch": 1.825598955144688, + "grad_norm": 1.5779736042022705, + "learning_rate": 1.94824307176122e-05, + "loss": 0.6059, + "step": 11183 + }, + { + "epoch": 1.8257622137871925, + "grad_norm": 2.1145031452178955, + "learning_rate": 1.9482329945335286e-05, + "loss": 0.7257, + "step": 11184 + }, + { + "epoch": 1.8259254724296967, + "grad_norm": 1.800539493560791, + "learning_rate": 1.948222916350967e-05, + "loss": 0.7636, + "step": 11185 + }, + { + "epoch": 1.8260887310722012, + "grad_norm": 1.9278708696365356, + "learning_rate": 1.9482128372135446e-05, + "loss": 0.8386, + "step": 11186 + }, + { + "epoch": 1.8262519897147054, + "grad_norm": 2.0208096504211426, + "learning_rate": 1.9482027571212726e-05, + "loss": 0.7656, + "step": 11187 + }, + { + "epoch": 1.8264152483572098, + "grad_norm": 1.288275957107544, + "learning_rate": 1.948192676074161e-05, + "loss": 0.4425, + "step": 11188 + }, + { + "epoch": 1.8265785069997142, + "grad_norm": 1.6276092529296875, + "learning_rate": 1.9481825940722193e-05, + "loss": 0.6872, + "step": 11189 + }, + { + "epoch": 1.8267417656422187, + "grad_norm": 1.625482439994812, + "learning_rate": 1.9481725111154577e-05, + "loss": 0.7175, + "step": 11190 + }, + { + "epoch": 1.8269050242847231, + "grad_norm": 1.7352010011672974, + "learning_rate": 1.948162427203887e-05, + "loss": 0.7251, + "step": 11191 + }, + { + "epoch": 1.8270682829272276, + "grad_norm": 1.900902509689331, + "learning_rate": 1.948152342337517e-05, + "loss": 0.731, + "step": 11192 + }, + { + "epoch": 1.827231541569732, + "grad_norm": 1.9057289361953735, + "learning_rate": 1.9481422565163577e-05, + "loss": 0.7099, + "step": 11193 + }, + { + "epoch": 1.8273948002122362, + "grad_norm": 1.4270421266555786, + "learning_rate": 1.9481321697404194e-05, + "loss": 0.6543, + "step": 11194 + }, + { + "epoch": 1.8275580588547407, + "grad_norm": 1.5865037441253662, + "learning_rate": 1.948122082009712e-05, + "loss": 0.5415, + "step": 11195 + }, + { + "epoch": 1.8277213174972449, + "grad_norm": 1.4957873821258545, + "learning_rate": 1.9481119933242464e-05, + "loss": 0.677, + "step": 11196 + }, + { + "epoch": 1.8278845761397493, + "grad_norm": 1.8010326623916626, + "learning_rate": 1.948101903684032e-05, + "loss": 0.5954, + "step": 11197 + }, + { + "epoch": 1.8280478347822537, + "grad_norm": 1.446526288986206, + "learning_rate": 1.9480918130890796e-05, + "loss": 0.6009, + "step": 11198 + }, + { + "epoch": 1.8282110934247582, + "grad_norm": 2.0267856121063232, + "learning_rate": 1.9480817215393985e-05, + "loss": 0.7043, + "step": 11199 + }, + { + "epoch": 1.8283743520672626, + "grad_norm": 1.6669411659240723, + "learning_rate": 1.9480716290349998e-05, + "loss": 0.6874, + "step": 11200 + }, + { + "epoch": 1.828537610709767, + "grad_norm": 1.96531343460083, + "learning_rate": 1.9480615355758926e-05, + "loss": 0.8037, + "step": 11201 + }, + { + "epoch": 1.8287008693522715, + "grad_norm": 1.7916260957717896, + "learning_rate": 1.9480514411620884e-05, + "loss": 0.6397, + "step": 11202 + }, + { + "epoch": 1.8288641279947757, + "grad_norm": 1.780898928642273, + "learning_rate": 1.9480413457935962e-05, + "loss": 0.638, + "step": 11203 + }, + { + "epoch": 1.8290273866372802, + "grad_norm": 1.5533488988876343, + "learning_rate": 1.9480312494704267e-05, + "loss": 0.5975, + "step": 11204 + }, + { + "epoch": 1.8291906452797844, + "grad_norm": 1.6473267078399658, + "learning_rate": 1.94802115219259e-05, + "loss": 0.5847, + "step": 11205 + }, + { + "epoch": 1.8293539039222888, + "grad_norm": 1.6156991720199585, + "learning_rate": 1.9480110539600964e-05, + "loss": 0.7599, + "step": 11206 + }, + { + "epoch": 1.8295171625647932, + "grad_norm": 1.3906582593917847, + "learning_rate": 1.948000954772956e-05, + "loss": 0.5807, + "step": 11207 + }, + { + "epoch": 1.8296804212072977, + "grad_norm": 1.4848023653030396, + "learning_rate": 1.9479908546311783e-05, + "loss": 0.6078, + "step": 11208 + }, + { + "epoch": 1.8298436798498021, + "grad_norm": 1.6552269458770752, + "learning_rate": 1.9479807535347745e-05, + "loss": 0.6589, + "step": 11209 + }, + { + "epoch": 1.8300069384923066, + "grad_norm": 1.9116599559783936, + "learning_rate": 1.9479706514837544e-05, + "loss": 0.6627, + "step": 11210 + }, + { + "epoch": 1.8301701971348108, + "grad_norm": 1.8586033582687378, + "learning_rate": 1.9479605484781282e-05, + "loss": 0.7771, + "step": 11211 + }, + { + "epoch": 1.8303334557773152, + "grad_norm": 2.0048351287841797, + "learning_rate": 1.9479504445179053e-05, + "loss": 0.7115, + "step": 11212 + }, + { + "epoch": 1.8304967144198196, + "grad_norm": 1.5767651796340942, + "learning_rate": 1.9479403396030973e-05, + "loss": 0.7223, + "step": 11213 + }, + { + "epoch": 1.8306599730623239, + "grad_norm": 1.8348174095153809, + "learning_rate": 1.9479302337337133e-05, + "loss": 0.7359, + "step": 11214 + }, + { + "epoch": 1.8308232317048283, + "grad_norm": 1.330308437347412, + "learning_rate": 1.947920126909764e-05, + "loss": 0.5897, + "step": 11215 + }, + { + "epoch": 1.8309864903473327, + "grad_norm": 1.792144775390625, + "learning_rate": 1.947910019131259e-05, + "loss": 0.7036, + "step": 11216 + }, + { + "epoch": 1.8311497489898372, + "grad_norm": 1.7983901500701904, + "learning_rate": 1.947899910398209e-05, + "loss": 0.6751, + "step": 11217 + }, + { + "epoch": 1.8313130076323416, + "grad_norm": 1.6563245058059692, + "learning_rate": 1.9478898007106243e-05, + "loss": 0.6748, + "step": 11218 + }, + { + "epoch": 1.831476266274846, + "grad_norm": 2.09661602973938, + "learning_rate": 1.9478796900685145e-05, + "loss": 0.7451, + "step": 11219 + }, + { + "epoch": 1.8316395249173503, + "grad_norm": 1.7088379859924316, + "learning_rate": 1.9478695784718905e-05, + "loss": 0.6741, + "step": 11220 + }, + { + "epoch": 1.8318027835598547, + "grad_norm": 1.6102770566940308, + "learning_rate": 1.947859465920762e-05, + "loss": 0.6785, + "step": 11221 + }, + { + "epoch": 1.831966042202359, + "grad_norm": 1.98444664478302, + "learning_rate": 1.9478493524151388e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 1.8321293008448634, + "grad_norm": 1.8480366468429565, + "learning_rate": 1.9478392379550318e-05, + "loss": 0.7385, + "step": 11223 + }, + { + "epoch": 1.8322925594873678, + "grad_norm": 2.0223300457000732, + "learning_rate": 1.9478291225404512e-05, + "loss": 0.7017, + "step": 11224 + }, + { + "epoch": 1.8324558181298722, + "grad_norm": 1.3953683376312256, + "learning_rate": 1.9478190061714067e-05, + "loss": 0.66, + "step": 11225 + }, + { + "epoch": 1.8326190767723767, + "grad_norm": 1.421250343322754, + "learning_rate": 1.947808888847909e-05, + "loss": 0.568, + "step": 11226 + }, + { + "epoch": 1.8327823354148811, + "grad_norm": 1.884398341178894, + "learning_rate": 1.9477987705699676e-05, + "loss": 0.6555, + "step": 11227 + }, + { + "epoch": 1.8329455940573856, + "grad_norm": 1.7607218027114868, + "learning_rate": 1.947788651337593e-05, + "loss": 0.7077, + "step": 11228 + }, + { + "epoch": 1.8331088526998898, + "grad_norm": 1.7825236320495605, + "learning_rate": 1.947778531150796e-05, + "loss": 0.7313, + "step": 11229 + }, + { + "epoch": 1.8332721113423942, + "grad_norm": 1.4085471630096436, + "learning_rate": 1.947768410009586e-05, + "loss": 0.6416, + "step": 11230 + }, + { + "epoch": 1.8334353699848984, + "grad_norm": 1.522440791130066, + "learning_rate": 1.9477582879139733e-05, + "loss": 0.5594, + "step": 11231 + }, + { + "epoch": 1.8335986286274029, + "grad_norm": 1.6018871068954468, + "learning_rate": 1.9477481648639683e-05, + "loss": 0.6622, + "step": 11232 + }, + { + "epoch": 1.8337618872699073, + "grad_norm": 1.839788556098938, + "learning_rate": 1.9477380408595815e-05, + "loss": 0.6462, + "step": 11233 + }, + { + "epoch": 1.8339251459124117, + "grad_norm": 1.8539906740188599, + "learning_rate": 1.9477279159008223e-05, + "loss": 0.7035, + "step": 11234 + }, + { + "epoch": 1.8340884045549162, + "grad_norm": 1.6447192430496216, + "learning_rate": 1.9477177899877016e-05, + "loss": 0.6764, + "step": 11235 + }, + { + "epoch": 1.8342516631974206, + "grad_norm": 1.8596315383911133, + "learning_rate": 1.9477076631202293e-05, + "loss": 0.5726, + "step": 11236 + }, + { + "epoch": 1.834414921839925, + "grad_norm": 1.9673501253128052, + "learning_rate": 1.9476975352984157e-05, + "loss": 0.795, + "step": 11237 + }, + { + "epoch": 1.8345781804824293, + "grad_norm": 1.5047483444213867, + "learning_rate": 1.9476874065222708e-05, + "loss": 0.5813, + "step": 11238 + }, + { + "epoch": 1.8347414391249337, + "grad_norm": 1.5316208600997925, + "learning_rate": 1.9476772767918047e-05, + "loss": 0.6202, + "step": 11239 + }, + { + "epoch": 1.834904697767438, + "grad_norm": 1.510323166847229, + "learning_rate": 1.9476671461070286e-05, + "loss": 0.5993, + "step": 11240 + }, + { + "epoch": 1.8350679564099424, + "grad_norm": 1.815978765487671, + "learning_rate": 1.9476570144679513e-05, + "loss": 0.7495, + "step": 11241 + }, + { + "epoch": 1.8352312150524468, + "grad_norm": 1.5339387655258179, + "learning_rate": 1.9476468818745836e-05, + "loss": 0.6798, + "step": 11242 + }, + { + "epoch": 1.8353944736949512, + "grad_norm": 1.6140520572662354, + "learning_rate": 1.947636748326936e-05, + "loss": 0.6674, + "step": 11243 + }, + { + "epoch": 1.8355577323374557, + "grad_norm": 1.4420990943908691, + "learning_rate": 1.9476266138250186e-05, + "loss": 0.5735, + "step": 11244 + }, + { + "epoch": 1.83572099097996, + "grad_norm": 1.9324593544006348, + "learning_rate": 1.947616478368841e-05, + "loss": 0.6141, + "step": 11245 + }, + { + "epoch": 1.8358842496224645, + "grad_norm": 1.4615000486373901, + "learning_rate": 1.947606341958414e-05, + "loss": 0.5616, + "step": 11246 + }, + { + "epoch": 1.8360475082649688, + "grad_norm": 1.7870616912841797, + "learning_rate": 1.9475962045937477e-05, + "loss": 0.6902, + "step": 11247 + }, + { + "epoch": 1.8362107669074732, + "grad_norm": 1.646500825881958, + "learning_rate": 1.9475860662748524e-05, + "loss": 0.6654, + "step": 11248 + }, + { + "epoch": 1.8363740255499774, + "grad_norm": 1.6858272552490234, + "learning_rate": 1.9475759270017382e-05, + "loss": 0.643, + "step": 11249 + }, + { + "epoch": 1.8365372841924819, + "grad_norm": 2.185067653656006, + "learning_rate": 1.9475657867744153e-05, + "loss": 0.8663, + "step": 11250 + }, + { + "epoch": 1.8367005428349863, + "grad_norm": 1.560416579246521, + "learning_rate": 1.9475556455928938e-05, + "loss": 0.6258, + "step": 11251 + }, + { + "epoch": 1.8368638014774907, + "grad_norm": 1.3716140985488892, + "learning_rate": 1.947545503457184e-05, + "loss": 0.5354, + "step": 11252 + }, + { + "epoch": 1.8370270601199952, + "grad_norm": 1.3749982118606567, + "learning_rate": 1.947535360367296e-05, + "loss": 0.5336, + "step": 11253 + }, + { + "epoch": 1.8371903187624996, + "grad_norm": 1.6696398258209229, + "learning_rate": 1.9475252163232402e-05, + "loss": 0.5648, + "step": 11254 + }, + { + "epoch": 1.8373535774050038, + "grad_norm": 1.7056740522384644, + "learning_rate": 1.947515071325027e-05, + "loss": 0.6821, + "step": 11255 + }, + { + "epoch": 1.8375168360475083, + "grad_norm": 1.3943918943405151, + "learning_rate": 1.9475049253726663e-05, + "loss": 0.5998, + "step": 11256 + }, + { + "epoch": 1.8376800946900127, + "grad_norm": 1.6272138357162476, + "learning_rate": 1.947494778466168e-05, + "loss": 0.5563, + "step": 11257 + }, + { + "epoch": 1.837843353332517, + "grad_norm": 2.0819168090820312, + "learning_rate": 1.947484630605543e-05, + "loss": 0.6057, + "step": 11258 + }, + { + "epoch": 1.8380066119750214, + "grad_norm": 2.0557568073272705, + "learning_rate": 1.9474744817908013e-05, + "loss": 0.7944, + "step": 11259 + }, + { + "epoch": 1.8381698706175258, + "grad_norm": 2.0977513790130615, + "learning_rate": 1.9474643320219534e-05, + "loss": 0.6226, + "step": 11260 + }, + { + "epoch": 1.8383331292600302, + "grad_norm": 1.7435396909713745, + "learning_rate": 1.9474541812990083e-05, + "loss": 0.7028, + "step": 11261 + }, + { + "epoch": 1.8384963879025347, + "grad_norm": 1.5857936143875122, + "learning_rate": 1.947444029621978e-05, + "loss": 0.602, + "step": 11262 + }, + { + "epoch": 1.838659646545039, + "grad_norm": 1.8107887506484985, + "learning_rate": 1.9474338769908712e-05, + "loss": 0.6955, + "step": 11263 + }, + { + "epoch": 1.8388229051875433, + "grad_norm": 1.413750171661377, + "learning_rate": 1.9474237234056988e-05, + "loss": 0.53, + "step": 11264 + }, + { + "epoch": 1.8389861638300478, + "grad_norm": 1.4980103969573975, + "learning_rate": 1.947413568866471e-05, + "loss": 0.5441, + "step": 11265 + }, + { + "epoch": 1.839149422472552, + "grad_norm": 1.5717209577560425, + "learning_rate": 1.947403413373198e-05, + "loss": 0.6761, + "step": 11266 + }, + { + "epoch": 1.8393126811150564, + "grad_norm": 1.5957504510879517, + "learning_rate": 1.94739325692589e-05, + "loss": 0.5391, + "step": 11267 + }, + { + "epoch": 1.8394759397575609, + "grad_norm": 1.6413758993148804, + "learning_rate": 1.9473830995245575e-05, + "loss": 0.6432, + "step": 11268 + }, + { + "epoch": 1.8396391984000653, + "grad_norm": 1.969042420387268, + "learning_rate": 1.9473729411692103e-05, + "loss": 0.6869, + "step": 11269 + }, + { + "epoch": 1.8398024570425697, + "grad_norm": 1.914711833000183, + "learning_rate": 1.947362781859859e-05, + "loss": 0.7038, + "step": 11270 + }, + { + "epoch": 1.8399657156850742, + "grad_norm": 1.6980578899383545, + "learning_rate": 1.947352621596513e-05, + "loss": 0.6639, + "step": 11271 + }, + { + "epoch": 1.8401289743275786, + "grad_norm": 2.0990400314331055, + "learning_rate": 1.947342460379184e-05, + "loss": 0.8484, + "step": 11272 + }, + { + "epoch": 1.8402922329700828, + "grad_norm": 1.8922178745269775, + "learning_rate": 1.9473322982078807e-05, + "loss": 0.6568, + "step": 11273 + }, + { + "epoch": 1.8404554916125873, + "grad_norm": 1.7580965757369995, + "learning_rate": 1.9473221350826145e-05, + "loss": 0.716, + "step": 11274 + }, + { + "epoch": 1.8406187502550915, + "grad_norm": 1.3946014642715454, + "learning_rate": 1.9473119710033947e-05, + "loss": 0.5538, + "step": 11275 + }, + { + "epoch": 1.840782008897596, + "grad_norm": 1.6833205223083496, + "learning_rate": 1.9473018059702325e-05, + "loss": 0.6515, + "step": 11276 + }, + { + "epoch": 1.8409452675401003, + "grad_norm": 1.980355143547058, + "learning_rate": 1.947291639983137e-05, + "loss": 0.6445, + "step": 11277 + }, + { + "epoch": 1.8411085261826048, + "grad_norm": 1.5229523181915283, + "learning_rate": 1.9472814730421196e-05, + "loss": 0.5309, + "step": 11278 + }, + { + "epoch": 1.8412717848251092, + "grad_norm": 1.792183518409729, + "learning_rate": 1.94727130514719e-05, + "loss": 0.6402, + "step": 11279 + }, + { + "epoch": 1.8414350434676137, + "grad_norm": 1.711656928062439, + "learning_rate": 1.9472611362983583e-05, + "loss": 0.6272, + "step": 11280 + }, + { + "epoch": 1.841598302110118, + "grad_norm": 1.8281450271606445, + "learning_rate": 1.947250966495635e-05, + "loss": 0.7348, + "step": 11281 + }, + { + "epoch": 1.8417615607526223, + "grad_norm": 1.6806683540344238, + "learning_rate": 1.94724079573903e-05, + "loss": 0.6091, + "step": 11282 + }, + { + "epoch": 1.8419248193951268, + "grad_norm": 1.8940225839614868, + "learning_rate": 1.947230624028554e-05, + "loss": 0.6362, + "step": 11283 + }, + { + "epoch": 1.842088078037631, + "grad_norm": 1.767269492149353, + "learning_rate": 1.947220451364217e-05, + "loss": 0.5912, + "step": 11284 + }, + { + "epoch": 1.8422513366801354, + "grad_norm": 1.6992754936218262, + "learning_rate": 1.9472102777460292e-05, + "loss": 0.7633, + "step": 11285 + }, + { + "epoch": 1.8424145953226398, + "grad_norm": 1.3356091976165771, + "learning_rate": 1.9472001031740007e-05, + "loss": 0.5407, + "step": 11286 + }, + { + "epoch": 1.8425778539651443, + "grad_norm": 1.6204113960266113, + "learning_rate": 1.947189927648142e-05, + "loss": 0.6633, + "step": 11287 + }, + { + "epoch": 1.8427411126076487, + "grad_norm": 1.5495291948318481, + "learning_rate": 1.9471797511684635e-05, + "loss": 0.5875, + "step": 11288 + }, + { + "epoch": 1.8429043712501532, + "grad_norm": 1.8579989671707153, + "learning_rate": 1.9471695737349748e-05, + "loss": 0.7206, + "step": 11289 + }, + { + "epoch": 1.8430676298926576, + "grad_norm": 1.920098900794983, + "learning_rate": 1.9471593953476873e-05, + "loss": 1.2492, + "step": 11290 + }, + { + "epoch": 1.8432308885351618, + "grad_norm": 1.461244821548462, + "learning_rate": 1.9471492160066103e-05, + "loss": 0.5794, + "step": 11291 + }, + { + "epoch": 1.8433941471776663, + "grad_norm": 1.7477149963378906, + "learning_rate": 1.947139035711754e-05, + "loss": 0.6681, + "step": 11292 + }, + { + "epoch": 1.8435574058201705, + "grad_norm": 1.699796199798584, + "learning_rate": 1.947128854463129e-05, + "loss": 0.7775, + "step": 11293 + }, + { + "epoch": 1.843720664462675, + "grad_norm": 1.916603684425354, + "learning_rate": 1.9471186722607456e-05, + "loss": 0.6829, + "step": 11294 + }, + { + "epoch": 1.8438839231051793, + "grad_norm": 1.749789834022522, + "learning_rate": 1.9471084891046138e-05, + "loss": 0.713, + "step": 11295 + }, + { + "epoch": 1.8440471817476838, + "grad_norm": 1.7580369710922241, + "learning_rate": 1.9470983049947446e-05, + "loss": 0.704, + "step": 11296 + }, + { + "epoch": 1.8442104403901882, + "grad_norm": 1.5205944776535034, + "learning_rate": 1.9470881199311472e-05, + "loss": 0.5947, + "step": 11297 + }, + { + "epoch": 1.8443736990326927, + "grad_norm": 1.9895873069763184, + "learning_rate": 1.947077933913832e-05, + "loss": 0.7648, + "step": 11298 + }, + { + "epoch": 1.8445369576751969, + "grad_norm": 1.4842942953109741, + "learning_rate": 1.9470677469428102e-05, + "loss": 0.5077, + "step": 11299 + }, + { + "epoch": 1.8447002163177013, + "grad_norm": 2.0025477409362793, + "learning_rate": 1.947057559018091e-05, + "loss": 0.8958, + "step": 11300 + }, + { + "epoch": 1.8448634749602058, + "grad_norm": 1.8291630744934082, + "learning_rate": 1.9470473701396852e-05, + "loss": 0.6888, + "step": 11301 + }, + { + "epoch": 1.84502673360271, + "grad_norm": 1.8233606815338135, + "learning_rate": 1.947037180307603e-05, + "loss": 0.7836, + "step": 11302 + }, + { + "epoch": 1.8451899922452144, + "grad_norm": 1.5006870031356812, + "learning_rate": 1.9470269895218545e-05, + "loss": 0.7262, + "step": 11303 + }, + { + "epoch": 1.8453532508877188, + "grad_norm": 1.8186402320861816, + "learning_rate": 1.9470167977824502e-05, + "loss": 0.6472, + "step": 11304 + }, + { + "epoch": 1.8455165095302233, + "grad_norm": 1.984135389328003, + "learning_rate": 1.9470066050894e-05, + "loss": 0.7281, + "step": 11305 + }, + { + "epoch": 1.8456797681727277, + "grad_norm": 1.3637615442276, + "learning_rate": 1.9469964114427148e-05, + "loss": 0.555, + "step": 11306 + }, + { + "epoch": 1.8458430268152322, + "grad_norm": 1.862672209739685, + "learning_rate": 1.9469862168424042e-05, + "loss": 0.7133, + "step": 11307 + }, + { + "epoch": 1.8460062854577364, + "grad_norm": 1.6852246522903442, + "learning_rate": 1.946976021288479e-05, + "loss": 0.6778, + "step": 11308 + }, + { + "epoch": 1.8461695441002408, + "grad_norm": 1.6572946310043335, + "learning_rate": 1.946965824780949e-05, + "loss": 0.6531, + "step": 11309 + }, + { + "epoch": 1.8463328027427452, + "grad_norm": 2.092381000518799, + "learning_rate": 1.9469556273198246e-05, + "loss": 0.8154, + "step": 11310 + }, + { + "epoch": 1.8464960613852495, + "grad_norm": 1.7718756198883057, + "learning_rate": 1.946945428905116e-05, + "loss": 0.703, + "step": 11311 + }, + { + "epoch": 1.846659320027754, + "grad_norm": 1.5157105922698975, + "learning_rate": 1.946935229536834e-05, + "loss": 0.5882, + "step": 11312 + }, + { + "epoch": 1.8468225786702583, + "grad_norm": 1.5778361558914185, + "learning_rate": 1.9469250292149883e-05, + "loss": 0.6684, + "step": 11313 + }, + { + "epoch": 1.8469858373127628, + "grad_norm": 1.5157126188278198, + "learning_rate": 1.9469148279395892e-05, + "loss": 0.5557, + "step": 11314 + }, + { + "epoch": 1.8471490959552672, + "grad_norm": 1.479537844657898, + "learning_rate": 1.9469046257106472e-05, + "loss": 0.6536, + "step": 11315 + }, + { + "epoch": 1.8473123545977717, + "grad_norm": 1.673499584197998, + "learning_rate": 1.9468944225281724e-05, + "loss": 0.717, + "step": 11316 + }, + { + "epoch": 1.8474756132402759, + "grad_norm": 1.4209645986557007, + "learning_rate": 1.9468842183921753e-05, + "loss": 0.5635, + "step": 11317 + }, + { + "epoch": 1.8476388718827803, + "grad_norm": 1.4930143356323242, + "learning_rate": 1.946874013302666e-05, + "loss": 0.611, + "step": 11318 + }, + { + "epoch": 1.8478021305252845, + "grad_norm": 1.8762633800506592, + "learning_rate": 1.9468638072596547e-05, + "loss": 0.7428, + "step": 11319 + }, + { + "epoch": 1.847965389167789, + "grad_norm": 1.442726492881775, + "learning_rate": 1.946853600263152e-05, + "loss": 0.5552, + "step": 11320 + }, + { + "epoch": 1.8481286478102934, + "grad_norm": 1.7356958389282227, + "learning_rate": 1.9468433923131683e-05, + "loss": 0.6051, + "step": 11321 + }, + { + "epoch": 1.8482919064527978, + "grad_norm": 1.5646950006484985, + "learning_rate": 1.946833183409713e-05, + "loss": 0.6417, + "step": 11322 + }, + { + "epoch": 1.8484551650953023, + "grad_norm": 1.790474534034729, + "learning_rate": 1.9468229735527972e-05, + "loss": 0.7646, + "step": 11323 + }, + { + "epoch": 1.8486184237378067, + "grad_norm": 1.8446646928787231, + "learning_rate": 1.9468127627424305e-05, + "loss": 0.6614, + "step": 11324 + }, + { + "epoch": 1.8487816823803112, + "grad_norm": 1.4836266040802002, + "learning_rate": 1.9468025509786243e-05, + "loss": 0.6952, + "step": 11325 + }, + { + "epoch": 1.8489449410228154, + "grad_norm": 1.7344393730163574, + "learning_rate": 1.946792338261388e-05, + "loss": 0.6527, + "step": 11326 + }, + { + "epoch": 1.8491081996653198, + "grad_norm": 1.6935209035873413, + "learning_rate": 1.9467821245907316e-05, + "loss": 0.7319, + "step": 11327 + }, + { + "epoch": 1.849271458307824, + "grad_norm": 1.5915470123291016, + "learning_rate": 1.9467719099666662e-05, + "loss": 0.6877, + "step": 11328 + }, + { + "epoch": 1.8494347169503285, + "grad_norm": 1.5453569889068604, + "learning_rate": 1.946761694389202e-05, + "loss": 0.5632, + "step": 11329 + }, + { + "epoch": 1.849597975592833, + "grad_norm": 1.5011414289474487, + "learning_rate": 1.9467514778583484e-05, + "loss": 0.5964, + "step": 11330 + }, + { + "epoch": 1.8497612342353373, + "grad_norm": 1.698552131652832, + "learning_rate": 1.9467412603741163e-05, + "loss": 0.6291, + "step": 11331 + }, + { + "epoch": 1.8499244928778418, + "grad_norm": 1.8297092914581299, + "learning_rate": 1.9467310419365165e-05, + "loss": 0.7323, + "step": 11332 + }, + { + "epoch": 1.8500877515203462, + "grad_norm": 1.8055306673049927, + "learning_rate": 1.9467208225455587e-05, + "loss": 0.7204, + "step": 11333 + }, + { + "epoch": 1.8502510101628507, + "grad_norm": 1.5056910514831543, + "learning_rate": 1.946710602201253e-05, + "loss": 0.5595, + "step": 11334 + }, + { + "epoch": 1.8504142688053549, + "grad_norm": 1.6088930368423462, + "learning_rate": 1.9467003809036106e-05, + "loss": 0.5525, + "step": 11335 + }, + { + "epoch": 1.8505775274478593, + "grad_norm": 1.2224750518798828, + "learning_rate": 1.9466901586526403e-05, + "loss": 0.4546, + "step": 11336 + }, + { + "epoch": 1.8507407860903635, + "grad_norm": 1.5027700662612915, + "learning_rate": 1.9466799354483538e-05, + "loss": 0.6142, + "step": 11337 + }, + { + "epoch": 1.850904044732868, + "grad_norm": 1.7277427911758423, + "learning_rate": 1.9466697112907608e-05, + "loss": 0.6825, + "step": 11338 + }, + { + "epoch": 1.8510673033753724, + "grad_norm": 1.7092294692993164, + "learning_rate": 1.9466594861798715e-05, + "loss": 0.6285, + "step": 11339 + }, + { + "epoch": 1.8512305620178768, + "grad_norm": 1.2341948747634888, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.4894, + "step": 11340 + }, + { + "epoch": 1.8513938206603813, + "grad_norm": 1.4450682401657104, + "learning_rate": 1.9466390330982457e-05, + "loss": 0.6001, + "step": 11341 + }, + { + "epoch": 1.8515570793028857, + "grad_norm": 1.8335946798324585, + "learning_rate": 1.9466288051275303e-05, + "loss": 0.7955, + "step": 11342 + }, + { + "epoch": 1.8517203379453901, + "grad_norm": 1.6008360385894775, + "learning_rate": 1.9466185762035592e-05, + "loss": 0.5293, + "step": 11343 + }, + { + "epoch": 1.8518835965878944, + "grad_norm": 1.6721000671386719, + "learning_rate": 1.9466083463263437e-05, + "loss": 0.6269, + "step": 11344 + }, + { + "epoch": 1.8520468552303988, + "grad_norm": 1.7251214981079102, + "learning_rate": 1.946598115495894e-05, + "loss": 0.7318, + "step": 11345 + }, + { + "epoch": 1.852210113872903, + "grad_norm": 1.382997989654541, + "learning_rate": 1.94658788371222e-05, + "loss": 0.5113, + "step": 11346 + }, + { + "epoch": 1.8523733725154075, + "grad_norm": 1.6176731586456299, + "learning_rate": 1.9465776509753325e-05, + "loss": 0.5884, + "step": 11347 + }, + { + "epoch": 1.852536631157912, + "grad_norm": 1.3850488662719727, + "learning_rate": 1.9465674172852412e-05, + "loss": 0.5591, + "step": 11348 + }, + { + "epoch": 1.8526998898004163, + "grad_norm": 1.716545820236206, + "learning_rate": 1.946557182641957e-05, + "loss": 0.5858, + "step": 11349 + }, + { + "epoch": 1.8528631484429208, + "grad_norm": 1.6958024501800537, + "learning_rate": 1.94654694704549e-05, + "loss": 0.7041, + "step": 11350 + }, + { + "epoch": 1.8530264070854252, + "grad_norm": 1.573082685470581, + "learning_rate": 1.9465367104958507e-05, + "loss": 0.723, + "step": 11351 + }, + { + "epoch": 1.8531896657279294, + "grad_norm": 1.9408619403839111, + "learning_rate": 1.946526472993049e-05, + "loss": 0.6485, + "step": 11352 + }, + { + "epoch": 1.8533529243704339, + "grad_norm": 1.4201043844223022, + "learning_rate": 1.9465162345370953e-05, + "loss": 0.5132, + "step": 11353 + }, + { + "epoch": 1.8535161830129383, + "grad_norm": 1.8337892293930054, + "learning_rate": 1.9465059951279998e-05, + "loss": 0.6698, + "step": 11354 + }, + { + "epoch": 1.8536794416554425, + "grad_norm": 1.4373219013214111, + "learning_rate": 1.9464957547657734e-05, + "loss": 0.5897, + "step": 11355 + }, + { + "epoch": 1.853842700297947, + "grad_norm": 2.0871987342834473, + "learning_rate": 1.9464855134504256e-05, + "loss": 0.5719, + "step": 11356 + }, + { + "epoch": 1.8540059589404514, + "grad_norm": 1.658308506011963, + "learning_rate": 1.9464752711819674e-05, + "loss": 0.6588, + "step": 11357 + }, + { + "epoch": 1.8541692175829558, + "grad_norm": 2.4047887325286865, + "learning_rate": 1.946465027960409e-05, + "loss": 0.835, + "step": 11358 + }, + { + "epoch": 1.8543324762254603, + "grad_norm": 1.6464979648590088, + "learning_rate": 1.9464547837857603e-05, + "loss": 0.6283, + "step": 11359 + }, + { + "epoch": 1.8544957348679647, + "grad_norm": 1.4339265823364258, + "learning_rate": 1.946444538658032e-05, + "loss": 0.5472, + "step": 11360 + }, + { + "epoch": 1.854658993510469, + "grad_norm": 1.6774463653564453, + "learning_rate": 1.9464342925772345e-05, + "loss": 0.6762, + "step": 11361 + }, + { + "epoch": 1.8548222521529734, + "grad_norm": 1.815497875213623, + "learning_rate": 1.9464240455433775e-05, + "loss": 0.643, + "step": 11362 + }, + { + "epoch": 1.8549855107954776, + "grad_norm": 2.0059027671813965, + "learning_rate": 1.946413797556472e-05, + "loss": 0.677, + "step": 11363 + }, + { + "epoch": 1.855148769437982, + "grad_norm": 1.7404001951217651, + "learning_rate": 1.946403548616528e-05, + "loss": 0.6993, + "step": 11364 + }, + { + "epoch": 1.8553120280804865, + "grad_norm": 1.6878925561904907, + "learning_rate": 1.9463932987235558e-05, + "loss": 0.6627, + "step": 11365 + }, + { + "epoch": 1.855475286722991, + "grad_norm": 2.3901617527008057, + "learning_rate": 1.946383047877566e-05, + "loss": 0.8456, + "step": 11366 + }, + { + "epoch": 1.8556385453654953, + "grad_norm": 1.7893768548965454, + "learning_rate": 1.9463727960785686e-05, + "loss": 0.6815, + "step": 11367 + }, + { + "epoch": 1.8558018040079998, + "grad_norm": 1.5406551361083984, + "learning_rate": 1.946362543326574e-05, + "loss": 0.6584, + "step": 11368 + }, + { + "epoch": 1.8559650626505042, + "grad_norm": 1.621347188949585, + "learning_rate": 1.9463522896215927e-05, + "loss": 0.7255, + "step": 11369 + }, + { + "epoch": 1.8561283212930084, + "grad_norm": 1.5381826162338257, + "learning_rate": 1.9463420349636348e-05, + "loss": 0.7185, + "step": 11370 + }, + { + "epoch": 1.8562915799355129, + "grad_norm": 1.6084988117218018, + "learning_rate": 1.9463317793527106e-05, + "loss": 0.6015, + "step": 11371 + }, + { + "epoch": 1.856454838578017, + "grad_norm": 1.6313977241516113, + "learning_rate": 1.9463215227888307e-05, + "loss": 0.6538, + "step": 11372 + }, + { + "epoch": 1.8566180972205215, + "grad_norm": 1.4991780519485474, + "learning_rate": 1.9463112652720055e-05, + "loss": 0.7267, + "step": 11373 + }, + { + "epoch": 1.856781355863026, + "grad_norm": 1.4805326461791992, + "learning_rate": 1.946301006802245e-05, + "loss": 0.6189, + "step": 11374 + }, + { + "epoch": 1.8569446145055304, + "grad_norm": 1.5228383541107178, + "learning_rate": 1.9462907473795593e-05, + "loss": 0.634, + "step": 11375 + }, + { + "epoch": 1.8571078731480348, + "grad_norm": 1.6474980115890503, + "learning_rate": 1.9462804870039592e-05, + "loss": 0.7198, + "step": 11376 + }, + { + "epoch": 1.8572711317905393, + "grad_norm": 1.8675572872161865, + "learning_rate": 1.946270225675455e-05, + "loss": 0.6364, + "step": 11377 + }, + { + "epoch": 1.8574343904330437, + "grad_norm": 1.5638259649276733, + "learning_rate": 1.9462599633940572e-05, + "loss": 0.6255, + "step": 11378 + }, + { + "epoch": 1.857597649075548, + "grad_norm": 1.4250820875167847, + "learning_rate": 1.9462497001597752e-05, + "loss": 0.53, + "step": 11379 + }, + { + "epoch": 1.8577609077180524, + "grad_norm": 1.7427653074264526, + "learning_rate": 1.9462394359726208e-05, + "loss": 0.7597, + "step": 11380 + }, + { + "epoch": 1.8579241663605566, + "grad_norm": 1.4496889114379883, + "learning_rate": 1.9462291708326027e-05, + "loss": 0.5825, + "step": 11381 + }, + { + "epoch": 1.858087425003061, + "grad_norm": 2.135394334793091, + "learning_rate": 1.9462189047397326e-05, + "loss": 0.85, + "step": 11382 + }, + { + "epoch": 1.8582506836455654, + "grad_norm": 1.6018152236938477, + "learning_rate": 1.9462086376940205e-05, + "loss": 0.7263, + "step": 11383 + }, + { + "epoch": 1.8584139422880699, + "grad_norm": 1.8383878469467163, + "learning_rate": 1.946198369695476e-05, + "loss": 0.6741, + "step": 11384 + }, + { + "epoch": 1.8585772009305743, + "grad_norm": 1.8410464525222778, + "learning_rate": 1.9461881007441102e-05, + "loss": 0.6622, + "step": 11385 + }, + { + "epoch": 1.8587404595730788, + "grad_norm": 1.4778528213500977, + "learning_rate": 1.9461778308399336e-05, + "loss": 0.6273, + "step": 11386 + }, + { + "epoch": 1.8589037182155832, + "grad_norm": 1.4650120735168457, + "learning_rate": 1.9461675599829558e-05, + "loss": 0.5434, + "step": 11387 + }, + { + "epoch": 1.8590669768580874, + "grad_norm": 1.691633939743042, + "learning_rate": 1.9461572881731876e-05, + "loss": 0.7043, + "step": 11388 + }, + { + "epoch": 1.8592302355005919, + "grad_norm": 1.9041624069213867, + "learning_rate": 1.946147015410639e-05, + "loss": 0.6537, + "step": 11389 + }, + { + "epoch": 1.859393494143096, + "grad_norm": 1.782978892326355, + "learning_rate": 1.946136741695321e-05, + "loss": 0.7683, + "step": 11390 + }, + { + "epoch": 1.8595567527856005, + "grad_norm": 1.7229278087615967, + "learning_rate": 1.9461264670272432e-05, + "loss": 0.6608, + "step": 11391 + }, + { + "epoch": 1.859720011428105, + "grad_norm": 1.489294171333313, + "learning_rate": 1.9461161914064166e-05, + "loss": 0.7129, + "step": 11392 + }, + { + "epoch": 1.8598832700706094, + "grad_norm": 1.7740106582641602, + "learning_rate": 1.946105914832851e-05, + "loss": 0.6981, + "step": 11393 + }, + { + "epoch": 1.8600465287131138, + "grad_norm": 1.646751046180725, + "learning_rate": 1.9460956373065574e-05, + "loss": 0.5959, + "step": 11394 + }, + { + "epoch": 1.8602097873556183, + "grad_norm": 1.8456655740737915, + "learning_rate": 1.9460853588275454e-05, + "loss": 0.7989, + "step": 11395 + }, + { + "epoch": 1.8603730459981225, + "grad_norm": 1.670685052871704, + "learning_rate": 1.9460750793958257e-05, + "loss": 0.6349, + "step": 11396 + }, + { + "epoch": 1.860536304640627, + "grad_norm": 1.4241327047348022, + "learning_rate": 1.946064799011409e-05, + "loss": 0.4604, + "step": 11397 + }, + { + "epoch": 1.8606995632831314, + "grad_norm": 1.3810228109359741, + "learning_rate": 1.946054517674305e-05, + "loss": 0.5067, + "step": 11398 + }, + { + "epoch": 1.8608628219256356, + "grad_norm": 1.9902907609939575, + "learning_rate": 1.9460442353845245e-05, + "loss": 0.7067, + "step": 11399 + }, + { + "epoch": 1.86102608056814, + "grad_norm": 1.5193673372268677, + "learning_rate": 1.946033952142077e-05, + "loss": 0.5881, + "step": 11400 + }, + { + "epoch": 1.8611893392106444, + "grad_norm": 1.539060115814209, + "learning_rate": 1.9460236679469748e-05, + "loss": 0.5943, + "step": 11401 + }, + { + "epoch": 1.8613525978531489, + "grad_norm": 2.008824586868286, + "learning_rate": 1.946013382799226e-05, + "loss": 0.9271, + "step": 11402 + }, + { + "epoch": 1.8615158564956533, + "grad_norm": 1.7220029830932617, + "learning_rate": 1.9460030966988427e-05, + "loss": 0.6266, + "step": 11403 + }, + { + "epoch": 1.8616791151381578, + "grad_norm": 1.668290376663208, + "learning_rate": 1.945992809645834e-05, + "loss": 0.6703, + "step": 11404 + }, + { + "epoch": 1.861842373780662, + "grad_norm": 1.9252721071243286, + "learning_rate": 1.945982521640211e-05, + "loss": 0.6598, + "step": 11405 + }, + { + "epoch": 1.8620056324231664, + "grad_norm": 1.6196459531784058, + "learning_rate": 1.945972232681984e-05, + "loss": 0.6037, + "step": 11406 + }, + { + "epoch": 1.8621688910656706, + "grad_norm": 1.6449024677276611, + "learning_rate": 1.9459619427711626e-05, + "loss": 0.5798, + "step": 11407 + }, + { + "epoch": 1.862332149708175, + "grad_norm": 1.7002391815185547, + "learning_rate": 1.9459516519077585e-05, + "loss": 0.7592, + "step": 11408 + }, + { + "epoch": 1.8624954083506795, + "grad_norm": 1.6330289840698242, + "learning_rate": 1.9459413600917808e-05, + "loss": 0.6487, + "step": 11409 + }, + { + "epoch": 1.862658666993184, + "grad_norm": 1.6738574504852295, + "learning_rate": 1.9459310673232407e-05, + "loss": 0.7145, + "step": 11410 + }, + { + "epoch": 1.8628219256356884, + "grad_norm": 1.447174310684204, + "learning_rate": 1.9459207736021484e-05, + "loss": 0.5083, + "step": 11411 + }, + { + "epoch": 1.8629851842781928, + "grad_norm": 1.7610563039779663, + "learning_rate": 1.945910478928514e-05, + "loss": 0.7341, + "step": 11412 + }, + { + "epoch": 1.8631484429206973, + "grad_norm": 1.4928058385849, + "learning_rate": 1.9459001833023477e-05, + "loss": 0.5383, + "step": 11413 + }, + { + "epoch": 1.8633117015632015, + "grad_norm": 1.521897792816162, + "learning_rate": 1.9458898867236603e-05, + "loss": 0.5651, + "step": 11414 + }, + { + "epoch": 1.863474960205706, + "grad_norm": 1.6357287168502808, + "learning_rate": 1.9458795891924623e-05, + "loss": 0.607, + "step": 11415 + }, + { + "epoch": 1.8636382188482101, + "grad_norm": 1.4678351879119873, + "learning_rate": 1.9458692907087636e-05, + "loss": 0.496, + "step": 11416 + }, + { + "epoch": 1.8638014774907146, + "grad_norm": 1.6538318395614624, + "learning_rate": 1.9458589912725746e-05, + "loss": 0.5518, + "step": 11417 + }, + { + "epoch": 1.863964736133219, + "grad_norm": 1.44560706615448, + "learning_rate": 1.9458486908839063e-05, + "loss": 0.6461, + "step": 11418 + }, + { + "epoch": 1.8641279947757234, + "grad_norm": 1.5160542726516724, + "learning_rate": 1.9458383895427683e-05, + "loss": 0.5504, + "step": 11419 + }, + { + "epoch": 1.8642912534182279, + "grad_norm": 1.6698386669158936, + "learning_rate": 1.9458280872491713e-05, + "loss": 0.6928, + "step": 11420 + }, + { + "epoch": 1.8644545120607323, + "grad_norm": 1.6586346626281738, + "learning_rate": 1.945817784003126e-05, + "loss": 0.6696, + "step": 11421 + }, + { + "epoch": 1.8646177707032368, + "grad_norm": 1.803773283958435, + "learning_rate": 1.945807479804642e-05, + "loss": 0.7711, + "step": 11422 + }, + { + "epoch": 1.864781029345741, + "grad_norm": 1.61565363407135, + "learning_rate": 1.9457971746537306e-05, + "loss": 0.6441, + "step": 11423 + }, + { + "epoch": 1.8649442879882454, + "grad_norm": 1.5894356966018677, + "learning_rate": 1.945786868550401e-05, + "loss": 0.7086, + "step": 11424 + }, + { + "epoch": 1.8651075466307496, + "grad_norm": 1.698496699333191, + "learning_rate": 1.9457765614946648e-05, + "loss": 0.6764, + "step": 11425 + }, + { + "epoch": 1.865270805273254, + "grad_norm": 1.6154731512069702, + "learning_rate": 1.945766253486532e-05, + "loss": 0.6144, + "step": 11426 + }, + { + "epoch": 1.8654340639157585, + "grad_norm": 1.530824065208435, + "learning_rate": 1.9457559445260124e-05, + "loss": 0.5731, + "step": 11427 + }, + { + "epoch": 1.865597322558263, + "grad_norm": 1.7381176948547363, + "learning_rate": 1.945745634613117e-05, + "loss": 0.6502, + "step": 11428 + }, + { + "epoch": 1.8657605812007674, + "grad_norm": 1.4016667604446411, + "learning_rate": 1.9457353237478557e-05, + "loss": 0.5898, + "step": 11429 + }, + { + "epoch": 1.8659238398432718, + "grad_norm": 1.791313886642456, + "learning_rate": 1.9457250119302396e-05, + "loss": 0.7827, + "step": 11430 + }, + { + "epoch": 1.8660870984857763, + "grad_norm": 1.9328656196594238, + "learning_rate": 1.9457146991602785e-05, + "loss": 0.7082, + "step": 11431 + }, + { + "epoch": 1.8662503571282805, + "grad_norm": 1.9362692832946777, + "learning_rate": 1.945704385437983e-05, + "loss": 0.703, + "step": 11432 + }, + { + "epoch": 1.866413615770785, + "grad_norm": 1.9214754104614258, + "learning_rate": 1.9456940707633634e-05, + "loss": 0.6218, + "step": 11433 + }, + { + "epoch": 1.8665768744132891, + "grad_norm": 1.62966787815094, + "learning_rate": 1.9456837551364304e-05, + "loss": 0.6984, + "step": 11434 + }, + { + "epoch": 1.8667401330557936, + "grad_norm": 1.689095377922058, + "learning_rate": 1.9456734385571935e-05, + "loss": 0.7576, + "step": 11435 + }, + { + "epoch": 1.866903391698298, + "grad_norm": 1.4133380651474, + "learning_rate": 1.945663121025664e-05, + "loss": 0.5262, + "step": 11436 + }, + { + "epoch": 1.8670666503408024, + "grad_norm": 1.6677546501159668, + "learning_rate": 1.9456528025418523e-05, + "loss": 0.657, + "step": 11437 + }, + { + "epoch": 1.8672299089833069, + "grad_norm": 1.6481825113296509, + "learning_rate": 1.945642483105768e-05, + "loss": 0.7048, + "step": 11438 + }, + { + "epoch": 1.8673931676258113, + "grad_norm": 1.8044381141662598, + "learning_rate": 1.9456321627174222e-05, + "loss": 0.6084, + "step": 11439 + }, + { + "epoch": 1.8675564262683155, + "grad_norm": 1.6847569942474365, + "learning_rate": 1.945621841376825e-05, + "loss": 0.6369, + "step": 11440 + }, + { + "epoch": 1.86771968491082, + "grad_norm": 1.6886372566223145, + "learning_rate": 1.945611519083987e-05, + "loss": 0.6501, + "step": 11441 + }, + { + "epoch": 1.8678829435533244, + "grad_norm": 1.530936360359192, + "learning_rate": 1.9456011958389186e-05, + "loss": 0.5738, + "step": 11442 + }, + { + "epoch": 1.8680462021958286, + "grad_norm": 1.961145043373108, + "learning_rate": 1.9455908716416296e-05, + "loss": 0.7787, + "step": 11443 + }, + { + "epoch": 1.868209460838333, + "grad_norm": 1.7782002687454224, + "learning_rate": 1.945580546492131e-05, + "loss": 0.6667, + "step": 11444 + }, + { + "epoch": 1.8683727194808375, + "grad_norm": 1.4123544692993164, + "learning_rate": 1.9455702203904333e-05, + "loss": 0.5282, + "step": 11445 + }, + { + "epoch": 1.868535978123342, + "grad_norm": 2.269317626953125, + "learning_rate": 1.945559893336546e-05, + "loss": 0.5888, + "step": 11446 + }, + { + "epoch": 1.8686992367658464, + "grad_norm": 2.1223304271698, + "learning_rate": 1.9455495653304806e-05, + "loss": 0.5977, + "step": 11447 + }, + { + "epoch": 1.8688624954083508, + "grad_norm": 1.689969539642334, + "learning_rate": 1.9455392363722468e-05, + "loss": 0.6713, + "step": 11448 + }, + { + "epoch": 1.869025754050855, + "grad_norm": 1.640174388885498, + "learning_rate": 1.9455289064618558e-05, + "loss": 0.6213, + "step": 11449 + }, + { + "epoch": 1.8691890126933595, + "grad_norm": 2.321896553039551, + "learning_rate": 1.945518575599317e-05, + "loss": 0.7272, + "step": 11450 + }, + { + "epoch": 1.8693522713358637, + "grad_norm": 1.5930275917053223, + "learning_rate": 1.9455082437846415e-05, + "loss": 0.594, + "step": 11451 + }, + { + "epoch": 1.8695155299783681, + "grad_norm": 1.987666368484497, + "learning_rate": 1.9454979110178392e-05, + "loss": 0.8378, + "step": 11452 + }, + { + "epoch": 1.8696787886208726, + "grad_norm": 1.6309988498687744, + "learning_rate": 1.9454875772989207e-05, + "loss": 0.6281, + "step": 11453 + }, + { + "epoch": 1.869842047263377, + "grad_norm": 1.577735185623169, + "learning_rate": 1.9454772426278965e-05, + "loss": 0.6775, + "step": 11454 + }, + { + "epoch": 1.8700053059058814, + "grad_norm": 1.7238805294036865, + "learning_rate": 1.9454669070047772e-05, + "loss": 0.6788, + "step": 11455 + }, + { + "epoch": 1.8701685645483859, + "grad_norm": 1.6100451946258545, + "learning_rate": 1.9454565704295728e-05, + "loss": 0.5719, + "step": 11456 + }, + { + "epoch": 1.8703318231908903, + "grad_norm": 2.0530707836151123, + "learning_rate": 1.945446232902294e-05, + "loss": 0.8713, + "step": 11457 + }, + { + "epoch": 1.8704950818333945, + "grad_norm": 1.7088276147842407, + "learning_rate": 1.9454358944229507e-05, + "loss": 0.6137, + "step": 11458 + }, + { + "epoch": 1.870658340475899, + "grad_norm": 1.5064231157302856, + "learning_rate": 1.9454255549915542e-05, + "loss": 0.5664, + "step": 11459 + }, + { + "epoch": 1.8708215991184032, + "grad_norm": 2.5388591289520264, + "learning_rate": 1.945415214608114e-05, + "loss": 0.6976, + "step": 11460 + }, + { + "epoch": 1.8709848577609076, + "grad_norm": 1.5529669523239136, + "learning_rate": 1.9454048732726415e-05, + "loss": 0.6593, + "step": 11461 + }, + { + "epoch": 1.871148116403412, + "grad_norm": 1.5846813917160034, + "learning_rate": 1.9453945309851462e-05, + "loss": 0.6595, + "step": 11462 + }, + { + "epoch": 1.8713113750459165, + "grad_norm": 1.4811205863952637, + "learning_rate": 1.9453841877456384e-05, + "loss": 0.6129, + "step": 11463 + }, + { + "epoch": 1.871474633688421, + "grad_norm": 1.5899109840393066, + "learning_rate": 1.9453738435541296e-05, + "loss": 0.7079, + "step": 11464 + }, + { + "epoch": 1.8716378923309254, + "grad_norm": 2.0411670207977295, + "learning_rate": 1.9453634984106293e-05, + "loss": 0.8161, + "step": 11465 + }, + { + "epoch": 1.8718011509734298, + "grad_norm": 1.5072439908981323, + "learning_rate": 1.9453531523151486e-05, + "loss": 0.4786, + "step": 11466 + }, + { + "epoch": 1.871964409615934, + "grad_norm": 1.7958481311798096, + "learning_rate": 1.945342805267697e-05, + "loss": 0.8287, + "step": 11467 + }, + { + "epoch": 1.8721276682584385, + "grad_norm": 1.8937709331512451, + "learning_rate": 1.9453324572682856e-05, + "loss": 0.7355, + "step": 11468 + }, + { + "epoch": 1.8722909269009427, + "grad_norm": 1.47244131565094, + "learning_rate": 1.945322108316925e-05, + "loss": 0.6431, + "step": 11469 + }, + { + "epoch": 1.8724541855434471, + "grad_norm": 2.0092735290527344, + "learning_rate": 1.9453117584136253e-05, + "loss": 0.6744, + "step": 11470 + }, + { + "epoch": 1.8726174441859516, + "grad_norm": 1.7591285705566406, + "learning_rate": 1.9453014075583962e-05, + "loss": 0.6495, + "step": 11471 + }, + { + "epoch": 1.872780702828456, + "grad_norm": 1.8152241706848145, + "learning_rate": 1.9452910557512497e-05, + "loss": 0.7554, + "step": 11472 + }, + { + "epoch": 1.8729439614709604, + "grad_norm": 1.8670258522033691, + "learning_rate": 1.9452807029921947e-05, + "loss": 0.7125, + "step": 11473 + }, + { + "epoch": 1.8731072201134649, + "grad_norm": 1.7216867208480835, + "learning_rate": 1.9452703492812425e-05, + "loss": 0.6457, + "step": 11474 + }, + { + "epoch": 1.8732704787559693, + "grad_norm": 1.6917376518249512, + "learning_rate": 1.9452599946184033e-05, + "loss": 0.6457, + "step": 11475 + }, + { + "epoch": 1.8734337373984735, + "grad_norm": 1.7641962766647339, + "learning_rate": 1.9452496390036876e-05, + "loss": 0.6971, + "step": 11476 + }, + { + "epoch": 1.873596996040978, + "grad_norm": 1.8004573583602905, + "learning_rate": 1.9452392824371057e-05, + "loss": 0.6141, + "step": 11477 + }, + { + "epoch": 1.8737602546834822, + "grad_norm": 1.8828840255737305, + "learning_rate": 1.945228924918668e-05, + "loss": 0.7949, + "step": 11478 + }, + { + "epoch": 1.8739235133259866, + "grad_norm": 1.5240252017974854, + "learning_rate": 1.9452185664483854e-05, + "loss": 0.5971, + "step": 11479 + }, + { + "epoch": 1.874086771968491, + "grad_norm": 1.577991008758545, + "learning_rate": 1.9452082070262678e-05, + "loss": 0.6906, + "step": 11480 + }, + { + "epoch": 1.8742500306109955, + "grad_norm": 1.663020372390747, + "learning_rate": 1.9451978466523256e-05, + "loss": 0.6785, + "step": 11481 + }, + { + "epoch": 1.8744132892535, + "grad_norm": 1.7203855514526367, + "learning_rate": 1.9451874853265695e-05, + "loss": 0.7135, + "step": 11482 + }, + { + "epoch": 1.8745765478960044, + "grad_norm": 1.2431669235229492, + "learning_rate": 1.94517712304901e-05, + "loss": 0.4815, + "step": 11483 + }, + { + "epoch": 1.8747398065385086, + "grad_norm": 1.6367541551589966, + "learning_rate": 1.945166759819657e-05, + "loss": 0.671, + "step": 11484 + }, + { + "epoch": 1.874903065181013, + "grad_norm": 1.5486866235733032, + "learning_rate": 1.9451563956385217e-05, + "loss": 0.5137, + "step": 11485 + }, + { + "epoch": 1.8750663238235175, + "grad_norm": 1.619064450263977, + "learning_rate": 1.945146030505614e-05, + "loss": 0.6845, + "step": 11486 + }, + { + "epoch": 1.8752295824660217, + "grad_norm": 1.8517924547195435, + "learning_rate": 1.9451356644209445e-05, + "loss": 0.6046, + "step": 11487 + }, + { + "epoch": 1.875392841108526, + "grad_norm": 1.6384507417678833, + "learning_rate": 1.9451252973845238e-05, + "loss": 0.6861, + "step": 11488 + }, + { + "epoch": 1.8755560997510305, + "grad_norm": 1.7289226055145264, + "learning_rate": 1.945114929396362e-05, + "loss": 0.7351, + "step": 11489 + }, + { + "epoch": 1.875719358393535, + "grad_norm": 1.6299560070037842, + "learning_rate": 1.94510456045647e-05, + "loss": 0.6677, + "step": 11490 + }, + { + "epoch": 1.8758826170360394, + "grad_norm": 1.5504000186920166, + "learning_rate": 1.9450941905648575e-05, + "loss": 0.6804, + "step": 11491 + }, + { + "epoch": 1.8760458756785439, + "grad_norm": 1.9602934122085571, + "learning_rate": 1.945083819721536e-05, + "loss": 0.7653, + "step": 11492 + }, + { + "epoch": 1.876209134321048, + "grad_norm": 1.7588677406311035, + "learning_rate": 1.9450734479265146e-05, + "loss": 0.7337, + "step": 11493 + }, + { + "epoch": 1.8763723929635525, + "grad_norm": 1.8978039026260376, + "learning_rate": 1.945063075179805e-05, + "loss": 0.6901, + "step": 11494 + }, + { + "epoch": 1.8765356516060567, + "grad_norm": 1.5178502798080444, + "learning_rate": 1.9450527014814173e-05, + "loss": 0.5586, + "step": 11495 + }, + { + "epoch": 1.8766989102485612, + "grad_norm": 1.5865373611450195, + "learning_rate": 1.9450423268313613e-05, + "loss": 0.6402, + "step": 11496 + }, + { + "epoch": 1.8768621688910656, + "grad_norm": 1.5291099548339844, + "learning_rate": 1.9450319512296478e-05, + "loss": 0.6351, + "step": 11497 + }, + { + "epoch": 1.87702542753357, + "grad_norm": 1.416226863861084, + "learning_rate": 1.9450215746762878e-05, + "loss": 0.6322, + "step": 11498 + }, + { + "epoch": 1.8771886861760745, + "grad_norm": 1.5664267539978027, + "learning_rate": 1.9450111971712913e-05, + "loss": 0.7232, + "step": 11499 + }, + { + "epoch": 1.877351944818579, + "grad_norm": 1.60057532787323, + "learning_rate": 1.9450008187146685e-05, + "loss": 0.6844, + "step": 11500 + }, + { + "epoch": 1.8775152034610834, + "grad_norm": 1.940841794013977, + "learning_rate": 1.9449904393064303e-05, + "loss": 0.7875, + "step": 11501 + }, + { + "epoch": 1.8776784621035876, + "grad_norm": 1.4768779277801514, + "learning_rate": 1.944980058946587e-05, + "loss": 0.5877, + "step": 11502 + }, + { + "epoch": 1.877841720746092, + "grad_norm": 1.4608166217803955, + "learning_rate": 1.944969677635149e-05, + "loss": 0.5693, + "step": 11503 + }, + { + "epoch": 1.8780049793885962, + "grad_norm": 1.460681676864624, + "learning_rate": 1.944959295372127e-05, + "loss": 0.5551, + "step": 11504 + }, + { + "epoch": 1.8781682380311007, + "grad_norm": 1.990973949432373, + "learning_rate": 1.944948912157531e-05, + "loss": 0.7081, + "step": 11505 + }, + { + "epoch": 1.878331496673605, + "grad_norm": 1.2026201486587524, + "learning_rate": 1.9449385279913716e-05, + "loss": 0.4524, + "step": 11506 + }, + { + "epoch": 1.8784947553161095, + "grad_norm": 1.856552004814148, + "learning_rate": 1.9449281428736597e-05, + "loss": 0.7422, + "step": 11507 + }, + { + "epoch": 1.878658013958614, + "grad_norm": 1.6628409624099731, + "learning_rate": 1.9449177568044052e-05, + "loss": 0.7235, + "step": 11508 + }, + { + "epoch": 1.8788212726011184, + "grad_norm": 1.6752853393554688, + "learning_rate": 1.9449073697836187e-05, + "loss": 0.6086, + "step": 11509 + }, + { + "epoch": 1.8789845312436229, + "grad_norm": 1.7337779998779297, + "learning_rate": 1.944896981811311e-05, + "loss": 0.596, + "step": 11510 + }, + { + "epoch": 1.879147789886127, + "grad_norm": 1.7454215288162231, + "learning_rate": 1.944886592887492e-05, + "loss": 0.6668, + "step": 11511 + }, + { + "epoch": 1.8793110485286315, + "grad_norm": 1.6474817991256714, + "learning_rate": 1.9448762030121723e-05, + "loss": 0.6368, + "step": 11512 + }, + { + "epoch": 1.8794743071711357, + "grad_norm": 1.966313123703003, + "learning_rate": 1.944865812185363e-05, + "loss": 0.6665, + "step": 11513 + }, + { + "epoch": 1.8796375658136402, + "grad_norm": 1.5946966409683228, + "learning_rate": 1.9448554204070738e-05, + "loss": 0.6616, + "step": 11514 + }, + { + "epoch": 1.8798008244561446, + "grad_norm": 1.8290305137634277, + "learning_rate": 1.9448450276773154e-05, + "loss": 0.7034, + "step": 11515 + }, + { + "epoch": 1.879964083098649, + "grad_norm": 1.5924561023712158, + "learning_rate": 1.9448346339960984e-05, + "loss": 0.5389, + "step": 11516 + }, + { + "epoch": 1.8801273417411535, + "grad_norm": 1.5353035926818848, + "learning_rate": 1.944824239363433e-05, + "loss": 0.6068, + "step": 11517 + }, + { + "epoch": 1.880290600383658, + "grad_norm": 1.996812343597412, + "learning_rate": 1.94481384377933e-05, + "loss": 0.8247, + "step": 11518 + }, + { + "epoch": 1.8804538590261624, + "grad_norm": 1.9649841785430908, + "learning_rate": 1.9448034472437997e-05, + "loss": 0.7063, + "step": 11519 + }, + { + "epoch": 1.8806171176686666, + "grad_norm": 1.616782307624817, + "learning_rate": 1.9447930497568528e-05, + "loss": 0.6559, + "step": 11520 + }, + { + "epoch": 1.880780376311171, + "grad_norm": 1.795293927192688, + "learning_rate": 1.9447826513184992e-05, + "loss": 0.688, + "step": 11521 + }, + { + "epoch": 1.8809436349536752, + "grad_norm": 1.559537649154663, + "learning_rate": 1.94477225192875e-05, + "loss": 0.6232, + "step": 11522 + }, + { + "epoch": 1.8811068935961797, + "grad_norm": 1.8149657249450684, + "learning_rate": 1.9447618515876153e-05, + "loss": 0.8542, + "step": 11523 + }, + { + "epoch": 1.881270152238684, + "grad_norm": 1.6024699211120605, + "learning_rate": 1.9447514502951055e-05, + "loss": 0.7056, + "step": 11524 + }, + { + "epoch": 1.8814334108811885, + "grad_norm": 1.786564826965332, + "learning_rate": 1.9447410480512315e-05, + "loss": 0.6538, + "step": 11525 + }, + { + "epoch": 1.881596669523693, + "grad_norm": 1.922845721244812, + "learning_rate": 1.9447306448560032e-05, + "loss": 0.7443, + "step": 11526 + }, + { + "epoch": 1.8817599281661974, + "grad_norm": 1.6194251775741577, + "learning_rate": 1.9447202407094315e-05, + "loss": 0.6882, + "step": 11527 + }, + { + "epoch": 1.8819231868087016, + "grad_norm": 1.949500560760498, + "learning_rate": 1.9447098356115267e-05, + "loss": 0.7363, + "step": 11528 + }, + { + "epoch": 1.882086445451206, + "grad_norm": 1.908187747001648, + "learning_rate": 1.9446994295622995e-05, + "loss": 0.6803, + "step": 11529 + }, + { + "epoch": 1.8822497040937105, + "grad_norm": 1.6194300651550293, + "learning_rate": 1.9446890225617604e-05, + "loss": 0.6803, + "step": 11530 + }, + { + "epoch": 1.8824129627362147, + "grad_norm": 1.683687448501587, + "learning_rate": 1.9446786146099197e-05, + "loss": 0.6183, + "step": 11531 + }, + { + "epoch": 1.8825762213787192, + "grad_norm": 1.522481083869934, + "learning_rate": 1.9446682057067875e-05, + "loss": 0.5375, + "step": 11532 + }, + { + "epoch": 1.8827394800212236, + "grad_norm": 1.6315289735794067, + "learning_rate": 1.944657795852375e-05, + "loss": 0.7022, + "step": 11533 + }, + { + "epoch": 1.882902738663728, + "grad_norm": 1.6781351566314697, + "learning_rate": 1.9446473850466924e-05, + "loss": 0.6486, + "step": 11534 + }, + { + "epoch": 1.8830659973062325, + "grad_norm": 1.5806336402893066, + "learning_rate": 1.9446369732897496e-05, + "loss": 0.6879, + "step": 11535 + }, + { + "epoch": 1.883229255948737, + "grad_norm": 1.7709660530090332, + "learning_rate": 1.944626560581558e-05, + "loss": 0.6956, + "step": 11536 + }, + { + "epoch": 1.8833925145912411, + "grad_norm": 1.4701541662216187, + "learning_rate": 1.9446161469221277e-05, + "loss": 0.6622, + "step": 11537 + }, + { + "epoch": 1.8835557732337456, + "grad_norm": 1.5637449026107788, + "learning_rate": 1.944605732311469e-05, + "loss": 0.6178, + "step": 11538 + }, + { + "epoch": 1.8837190318762498, + "grad_norm": 1.6318939924240112, + "learning_rate": 1.9445953167495932e-05, + "loss": 0.6781, + "step": 11539 + }, + { + "epoch": 1.8838822905187542, + "grad_norm": 1.536285638809204, + "learning_rate": 1.9445849002365092e-05, + "loss": 0.654, + "step": 11540 + }, + { + "epoch": 1.8840455491612587, + "grad_norm": 1.452775001525879, + "learning_rate": 1.9445744827722293e-05, + "loss": 0.6759, + "step": 11541 + }, + { + "epoch": 1.884208807803763, + "grad_norm": 1.6871135234832764, + "learning_rate": 1.9445640643567626e-05, + "loss": 0.6061, + "step": 11542 + }, + { + "epoch": 1.8843720664462675, + "grad_norm": 1.548416256904602, + "learning_rate": 1.9445536449901205e-05, + "loss": 0.6378, + "step": 11543 + }, + { + "epoch": 1.884535325088772, + "grad_norm": 1.4469001293182373, + "learning_rate": 1.944543224672313e-05, + "loss": 0.6106, + "step": 11544 + }, + { + "epoch": 1.8846985837312764, + "grad_norm": 1.4708023071289062, + "learning_rate": 1.9445328034033508e-05, + "loss": 0.5439, + "step": 11545 + }, + { + "epoch": 1.8848618423737806, + "grad_norm": 1.399923324584961, + "learning_rate": 1.9445223811832438e-05, + "loss": 0.6261, + "step": 11546 + }, + { + "epoch": 1.885025101016285, + "grad_norm": 1.474892497062683, + "learning_rate": 1.9445119580120035e-05, + "loss": 0.5223, + "step": 11547 + }, + { + "epoch": 1.8851883596587893, + "grad_norm": 1.8021674156188965, + "learning_rate": 1.94450153388964e-05, + "loss": 0.5803, + "step": 11548 + }, + { + "epoch": 1.8853516183012937, + "grad_norm": 1.28997004032135, + "learning_rate": 1.9444911088161636e-05, + "loss": 0.4958, + "step": 11549 + }, + { + "epoch": 1.8855148769437982, + "grad_norm": 1.5397969484329224, + "learning_rate": 1.9444806827915848e-05, + "loss": 0.6634, + "step": 11550 + }, + { + "epoch": 1.8856781355863026, + "grad_norm": 1.5749480724334717, + "learning_rate": 1.9444702558159143e-05, + "loss": 0.6273, + "step": 11551 + }, + { + "epoch": 1.885841394228807, + "grad_norm": 1.5941251516342163, + "learning_rate": 1.9444598278891623e-05, + "loss": 0.7622, + "step": 11552 + }, + { + "epoch": 1.8860046528713115, + "grad_norm": 2.005549907684326, + "learning_rate": 1.9444493990113398e-05, + "loss": 0.8363, + "step": 11553 + }, + { + "epoch": 1.886167911513816, + "grad_norm": 1.4441014528274536, + "learning_rate": 1.9444389691824568e-05, + "loss": 0.5922, + "step": 11554 + }, + { + "epoch": 1.8863311701563201, + "grad_norm": 1.627755880355835, + "learning_rate": 1.944428538402524e-05, + "loss": 0.6353, + "step": 11555 + }, + { + "epoch": 1.8864944287988246, + "grad_norm": 1.711790919303894, + "learning_rate": 1.944418106671552e-05, + "loss": 0.7931, + "step": 11556 + }, + { + "epoch": 1.8866576874413288, + "grad_norm": 1.3947298526763916, + "learning_rate": 1.944407673989551e-05, + "loss": 0.6314, + "step": 11557 + }, + { + "epoch": 1.8868209460838332, + "grad_norm": 1.5337296724319458, + "learning_rate": 1.944397240356532e-05, + "loss": 0.6828, + "step": 11558 + }, + { + "epoch": 1.8869842047263377, + "grad_norm": 1.3683524131774902, + "learning_rate": 1.9443868057725056e-05, + "loss": 0.5719, + "step": 11559 + }, + { + "epoch": 1.887147463368842, + "grad_norm": 1.47541344165802, + "learning_rate": 1.944376370237481e-05, + "loss": 0.604, + "step": 11560 + }, + { + "epoch": 1.8873107220113465, + "grad_norm": 1.732155680656433, + "learning_rate": 1.9443659337514704e-05, + "loss": 0.6959, + "step": 11561 + }, + { + "epoch": 1.887473980653851, + "grad_norm": 1.7050541639328003, + "learning_rate": 1.9443554963144832e-05, + "loss": 0.6815, + "step": 11562 + }, + { + "epoch": 1.8876372392963554, + "grad_norm": 1.4815789461135864, + "learning_rate": 1.9443450579265305e-05, + "loss": 0.6102, + "step": 11563 + }, + { + "epoch": 1.8878004979388596, + "grad_norm": 1.3421001434326172, + "learning_rate": 1.9443346185876223e-05, + "loss": 0.5227, + "step": 11564 + }, + { + "epoch": 1.887963756581364, + "grad_norm": 1.4962694644927979, + "learning_rate": 1.9443241782977696e-05, + "loss": 0.5994, + "step": 11565 + }, + { + "epoch": 1.8881270152238683, + "grad_norm": 1.3127574920654297, + "learning_rate": 1.9443137370569825e-05, + "loss": 0.4911, + "step": 11566 + }, + { + "epoch": 1.8882902738663727, + "grad_norm": 1.929032325744629, + "learning_rate": 1.944303294865272e-05, + "loss": 0.5909, + "step": 11567 + }, + { + "epoch": 1.8884535325088772, + "grad_norm": 1.7947477102279663, + "learning_rate": 1.944292851722648e-05, + "loss": 0.8034, + "step": 11568 + }, + { + "epoch": 1.8886167911513816, + "grad_norm": 1.4883267879486084, + "learning_rate": 1.9442824076291216e-05, + "loss": 0.6011, + "step": 11569 + }, + { + "epoch": 1.888780049793886, + "grad_norm": 1.64246666431427, + "learning_rate": 1.9442719625847032e-05, + "loss": 0.5931, + "step": 11570 + }, + { + "epoch": 1.8889433084363905, + "grad_norm": 1.6180378198623657, + "learning_rate": 1.944261516589403e-05, + "loss": 0.5594, + "step": 11571 + }, + { + "epoch": 1.889106567078895, + "grad_norm": 1.7483789920806885, + "learning_rate": 1.9442510696432315e-05, + "loss": 0.6597, + "step": 11572 + }, + { + "epoch": 1.8892698257213991, + "grad_norm": 1.9996604919433594, + "learning_rate": 1.9442406217461996e-05, + "loss": 0.7814, + "step": 11573 + }, + { + "epoch": 1.8894330843639036, + "grad_norm": 1.5139063596725464, + "learning_rate": 1.9442301728983176e-05, + "loss": 0.6141, + "step": 11574 + }, + { + "epoch": 1.8895963430064078, + "grad_norm": 1.480607032775879, + "learning_rate": 1.944219723099596e-05, + "loss": 0.5782, + "step": 11575 + }, + { + "epoch": 1.8897596016489122, + "grad_norm": 1.8566299676895142, + "learning_rate": 1.9442092723500456e-05, + "loss": 0.7321, + "step": 11576 + }, + { + "epoch": 1.8899228602914167, + "grad_norm": 1.8643804788589478, + "learning_rate": 1.9441988206496768e-05, + "loss": 0.6995, + "step": 11577 + }, + { + "epoch": 1.890086118933921, + "grad_norm": 1.9485164880752563, + "learning_rate": 1.9441883679985e-05, + "loss": 0.6794, + "step": 11578 + }, + { + "epoch": 1.8902493775764255, + "grad_norm": 1.7295310497283936, + "learning_rate": 1.9441779143965254e-05, + "loss": 0.7492, + "step": 11579 + }, + { + "epoch": 1.89041263621893, + "grad_norm": 1.6741697788238525, + "learning_rate": 1.944167459843764e-05, + "loss": 0.6818, + "step": 11580 + }, + { + "epoch": 1.8905758948614342, + "grad_norm": 1.6923526525497437, + "learning_rate": 1.944157004340226e-05, + "loss": 0.5761, + "step": 11581 + }, + { + "epoch": 1.8907391535039386, + "grad_norm": 1.593882441520691, + "learning_rate": 1.944146547885923e-05, + "loss": 0.7182, + "step": 11582 + }, + { + "epoch": 1.890902412146443, + "grad_norm": 1.4845552444458008, + "learning_rate": 1.9441360904808638e-05, + "loss": 0.6187, + "step": 11583 + }, + { + "epoch": 1.8910656707889473, + "grad_norm": 1.5105583667755127, + "learning_rate": 1.9441256321250604e-05, + "loss": 0.5492, + "step": 11584 + }, + { + "epoch": 1.8912289294314517, + "grad_norm": 1.5043299198150635, + "learning_rate": 1.9441151728185225e-05, + "loss": 0.6411, + "step": 11585 + }, + { + "epoch": 1.8913921880739561, + "grad_norm": 1.834070086479187, + "learning_rate": 1.9441047125612605e-05, + "loss": 0.7349, + "step": 11586 + }, + { + "epoch": 1.8915554467164606, + "grad_norm": 1.4392379522323608, + "learning_rate": 1.944094251353286e-05, + "loss": 0.6161, + "step": 11587 + }, + { + "epoch": 1.891718705358965, + "grad_norm": 1.7833870649337769, + "learning_rate": 1.9440837891946086e-05, + "loss": 0.6482, + "step": 11588 + }, + { + "epoch": 1.8918819640014695, + "grad_norm": 1.638486385345459, + "learning_rate": 1.944073326085239e-05, + "loss": 0.6438, + "step": 11589 + }, + { + "epoch": 1.8920452226439737, + "grad_norm": 1.870792031288147, + "learning_rate": 1.9440628620251874e-05, + "loss": 0.8086, + "step": 11590 + }, + { + "epoch": 1.8922084812864781, + "grad_norm": 1.9518942832946777, + "learning_rate": 1.9440523970144654e-05, + "loss": 0.7676, + "step": 11591 + }, + { + "epoch": 1.8923717399289823, + "grad_norm": 1.6262729167938232, + "learning_rate": 1.9440419310530826e-05, + "loss": 0.7074, + "step": 11592 + }, + { + "epoch": 1.8925349985714868, + "grad_norm": 1.8143446445465088, + "learning_rate": 1.94403146414105e-05, + "loss": 0.7346, + "step": 11593 + }, + { + "epoch": 1.8926982572139912, + "grad_norm": 1.6360920667648315, + "learning_rate": 1.944020996278378e-05, + "loss": 0.7078, + "step": 11594 + }, + { + "epoch": 1.8928615158564956, + "grad_norm": 1.5741231441497803, + "learning_rate": 1.9440105274650766e-05, + "loss": 0.6758, + "step": 11595 + }, + { + "epoch": 1.893024774499, + "grad_norm": 1.2854944467544556, + "learning_rate": 1.9440000577011573e-05, + "loss": 0.5697, + "step": 11596 + }, + { + "epoch": 1.8931880331415045, + "grad_norm": 1.6311179399490356, + "learning_rate": 1.94398958698663e-05, + "loss": 0.8062, + "step": 11597 + }, + { + "epoch": 1.893351291784009, + "grad_norm": 1.479883074760437, + "learning_rate": 1.9439791153215055e-05, + "loss": 0.7036, + "step": 11598 + }, + { + "epoch": 1.8935145504265132, + "grad_norm": 1.4560647010803223, + "learning_rate": 1.9439686427057943e-05, + "loss": 0.5759, + "step": 11599 + }, + { + "epoch": 1.8936778090690176, + "grad_norm": 1.7028098106384277, + "learning_rate": 1.943958169139507e-05, + "loss": 0.737, + "step": 11600 + }, + { + "epoch": 1.8938410677115218, + "grad_norm": 1.5455909967422485, + "learning_rate": 1.943947694622654e-05, + "loss": 0.5382, + "step": 11601 + }, + { + "epoch": 1.8940043263540263, + "grad_norm": 1.5311299562454224, + "learning_rate": 1.9439372191552458e-05, + "loss": 0.7234, + "step": 11602 + }, + { + "epoch": 1.8941675849965307, + "grad_norm": 1.8698904514312744, + "learning_rate": 1.9439267427372932e-05, + "loss": 0.7324, + "step": 11603 + }, + { + "epoch": 1.8943308436390351, + "grad_norm": 1.7710331678390503, + "learning_rate": 1.9439162653688066e-05, + "loss": 0.6969, + "step": 11604 + }, + { + "epoch": 1.8944941022815396, + "grad_norm": 1.4616144895553589, + "learning_rate": 1.9439057870497966e-05, + "loss": 0.6117, + "step": 11605 + }, + { + "epoch": 1.894657360924044, + "grad_norm": 1.641790509223938, + "learning_rate": 1.9438953077802737e-05, + "loss": 0.6662, + "step": 11606 + }, + { + "epoch": 1.8948206195665485, + "grad_norm": 1.6729071140289307, + "learning_rate": 1.9438848275602484e-05, + "loss": 0.744, + "step": 11607 + }, + { + "epoch": 1.8949838782090527, + "grad_norm": 1.5238224267959595, + "learning_rate": 1.9438743463897314e-05, + "loss": 0.6175, + "step": 11608 + }, + { + "epoch": 1.8951471368515571, + "grad_norm": 1.5480231046676636, + "learning_rate": 1.943863864268733e-05, + "loss": 0.714, + "step": 11609 + }, + { + "epoch": 1.8953103954940613, + "grad_norm": 1.777389645576477, + "learning_rate": 1.9438533811972645e-05, + "loss": 0.7388, + "step": 11610 + }, + { + "epoch": 1.8954736541365658, + "grad_norm": 1.6796879768371582, + "learning_rate": 1.9438428971753355e-05, + "loss": 0.7166, + "step": 11611 + }, + { + "epoch": 1.8956369127790702, + "grad_norm": 1.7598503828048706, + "learning_rate": 1.943832412202957e-05, + "loss": 0.7949, + "step": 11612 + }, + { + "epoch": 1.8958001714215746, + "grad_norm": 1.5808384418487549, + "learning_rate": 1.9438219262801393e-05, + "loss": 0.6187, + "step": 11613 + }, + { + "epoch": 1.895963430064079, + "grad_norm": 1.736824631690979, + "learning_rate": 1.9438114394068934e-05, + "loss": 0.7397, + "step": 11614 + }, + { + "epoch": 1.8961266887065835, + "grad_norm": 1.594459056854248, + "learning_rate": 1.9438009515832298e-05, + "loss": 0.6311, + "step": 11615 + }, + { + "epoch": 1.896289947349088, + "grad_norm": 1.4716886281967163, + "learning_rate": 1.9437904628091586e-05, + "loss": 0.5776, + "step": 11616 + }, + { + "epoch": 1.8964532059915922, + "grad_norm": 1.8191249370574951, + "learning_rate": 1.9437799730846904e-05, + "loss": 0.6652, + "step": 11617 + }, + { + "epoch": 1.8966164646340966, + "grad_norm": 1.7769027948379517, + "learning_rate": 1.9437694824098367e-05, + "loss": 0.8172, + "step": 11618 + }, + { + "epoch": 1.8967797232766008, + "grad_norm": 1.6638751029968262, + "learning_rate": 1.943758990784607e-05, + "loss": 0.677, + "step": 11619 + }, + { + "epoch": 1.8969429819191053, + "grad_norm": 1.6761482954025269, + "learning_rate": 1.9437484982090122e-05, + "loss": 0.7367, + "step": 11620 + }, + { + "epoch": 1.8971062405616097, + "grad_norm": 1.449048399925232, + "learning_rate": 1.943738004683063e-05, + "loss": 0.6001, + "step": 11621 + }, + { + "epoch": 1.8972694992041141, + "grad_norm": 1.8349754810333252, + "learning_rate": 1.94372751020677e-05, + "loss": 0.9094, + "step": 11622 + }, + { + "epoch": 1.8974327578466186, + "grad_norm": 1.5482133626937866, + "learning_rate": 1.9437170147801434e-05, + "loss": 0.5745, + "step": 11623 + }, + { + "epoch": 1.897596016489123, + "grad_norm": 1.5425126552581787, + "learning_rate": 1.943706518403194e-05, + "loss": 0.5705, + "step": 11624 + }, + { + "epoch": 1.8977592751316272, + "grad_norm": 1.6129480600357056, + "learning_rate": 1.9436960210759325e-05, + "loss": 0.7195, + "step": 11625 + }, + { + "epoch": 1.8979225337741317, + "grad_norm": 1.5146262645721436, + "learning_rate": 1.9436855227983695e-05, + "loss": 0.6008, + "step": 11626 + }, + { + "epoch": 1.898085792416636, + "grad_norm": 1.348144292831421, + "learning_rate": 1.9436750235705152e-05, + "loss": 0.5853, + "step": 11627 + }, + { + "epoch": 1.8982490510591403, + "grad_norm": 1.6056288480758667, + "learning_rate": 1.9436645233923804e-05, + "loss": 0.628, + "step": 11628 + }, + { + "epoch": 1.8984123097016448, + "grad_norm": 1.5268828868865967, + "learning_rate": 1.943654022263976e-05, + "loss": 0.5902, + "step": 11629 + }, + { + "epoch": 1.8985755683441492, + "grad_norm": 1.5219417810440063, + "learning_rate": 1.943643520185312e-05, + "loss": 0.653, + "step": 11630 + }, + { + "epoch": 1.8987388269866536, + "grad_norm": 1.4454269409179688, + "learning_rate": 1.9436330171563994e-05, + "loss": 0.6454, + "step": 11631 + }, + { + "epoch": 1.898902085629158, + "grad_norm": 1.7486697435379028, + "learning_rate": 1.9436225131772482e-05, + "loss": 0.7218, + "step": 11632 + }, + { + "epoch": 1.8990653442716625, + "grad_norm": 1.4002004861831665, + "learning_rate": 1.9436120082478698e-05, + "loss": 0.5311, + "step": 11633 + }, + { + "epoch": 1.8992286029141667, + "grad_norm": 1.637635350227356, + "learning_rate": 1.9436015023682742e-05, + "loss": 0.6077, + "step": 11634 + }, + { + "epoch": 1.8993918615566712, + "grad_norm": 1.6334716081619263, + "learning_rate": 1.9435909955384724e-05, + "loss": 0.6824, + "step": 11635 + }, + { + "epoch": 1.8995551201991754, + "grad_norm": 1.6521251201629639, + "learning_rate": 1.943580487758474e-05, + "loss": 0.6652, + "step": 11636 + }, + { + "epoch": 1.8997183788416798, + "grad_norm": 1.570641279220581, + "learning_rate": 1.943569979028291e-05, + "loss": 0.6456, + "step": 11637 + }, + { + "epoch": 1.8998816374841843, + "grad_norm": 1.7872321605682373, + "learning_rate": 1.943559469347933e-05, + "loss": 0.6423, + "step": 11638 + }, + { + "epoch": 1.9000448961266887, + "grad_norm": 1.7164885997772217, + "learning_rate": 1.943548958717411e-05, + "loss": 0.6656, + "step": 11639 + }, + { + "epoch": 1.9002081547691931, + "grad_norm": 1.7776987552642822, + "learning_rate": 1.943538447136735e-05, + "loss": 0.7419, + "step": 11640 + }, + { + "epoch": 1.9003714134116976, + "grad_norm": 1.6292767524719238, + "learning_rate": 1.9435279346059166e-05, + "loss": 0.6836, + "step": 11641 + }, + { + "epoch": 1.900534672054202, + "grad_norm": 1.8345587253570557, + "learning_rate": 1.9435174211249657e-05, + "loss": 0.6446, + "step": 11642 + }, + { + "epoch": 1.9006979306967062, + "grad_norm": 1.5271564722061157, + "learning_rate": 1.9435069066938928e-05, + "loss": 0.6153, + "step": 11643 + }, + { + "epoch": 1.9008611893392107, + "grad_norm": 1.6989418268203735, + "learning_rate": 1.943496391312709e-05, + "loss": 0.6341, + "step": 11644 + }, + { + "epoch": 1.9010244479817149, + "grad_norm": 1.8898526430130005, + "learning_rate": 1.9434858749814244e-05, + "loss": 0.7082, + "step": 11645 + }, + { + "epoch": 1.9011877066242193, + "grad_norm": 1.5955911874771118, + "learning_rate": 1.9434753577000494e-05, + "loss": 0.6325, + "step": 11646 + }, + { + "epoch": 1.9013509652667238, + "grad_norm": 1.9128661155700684, + "learning_rate": 1.9434648394685952e-05, + "loss": 0.6834, + "step": 11647 + }, + { + "epoch": 1.9015142239092282, + "grad_norm": 1.6466622352600098, + "learning_rate": 1.9434543202870726e-05, + "loss": 0.6598, + "step": 11648 + }, + { + "epoch": 1.9016774825517326, + "grad_norm": 1.4612789154052734, + "learning_rate": 1.9434438001554914e-05, + "loss": 0.5859, + "step": 11649 + }, + { + "epoch": 1.901840741194237, + "grad_norm": 1.63814115524292, + "learning_rate": 1.9434332790738625e-05, + "loss": 0.6301, + "step": 11650 + }, + { + "epoch": 1.9020039998367415, + "grad_norm": 1.784630298614502, + "learning_rate": 1.9434227570421966e-05, + "loss": 0.6766, + "step": 11651 + }, + { + "epoch": 1.9021672584792457, + "grad_norm": 1.4790159463882446, + "learning_rate": 1.9434122340605044e-05, + "loss": 0.5161, + "step": 11652 + }, + { + "epoch": 1.9023305171217502, + "grad_norm": 1.7346559762954712, + "learning_rate": 1.943401710128796e-05, + "loss": 0.6128, + "step": 11653 + }, + { + "epoch": 1.9024937757642544, + "grad_norm": 1.7448608875274658, + "learning_rate": 1.9433911852470825e-05, + "loss": 0.6304, + "step": 11654 + }, + { + "epoch": 1.9026570344067588, + "grad_norm": 1.5681493282318115, + "learning_rate": 1.9433806594153744e-05, + "loss": 0.5843, + "step": 11655 + }, + { + "epoch": 1.9028202930492633, + "grad_norm": 2.0009074211120605, + "learning_rate": 1.9433701326336823e-05, + "loss": 0.8535, + "step": 11656 + }, + { + "epoch": 1.9029835516917677, + "grad_norm": 1.7074265480041504, + "learning_rate": 1.9433596049020164e-05, + "loss": 0.6463, + "step": 11657 + }, + { + "epoch": 1.9031468103342721, + "grad_norm": 1.8673193454742432, + "learning_rate": 1.943349076220388e-05, + "loss": 0.7439, + "step": 11658 + }, + { + "epoch": 1.9033100689767766, + "grad_norm": 1.4835978746414185, + "learning_rate": 1.9433385465888072e-05, + "loss": 0.5772, + "step": 11659 + }, + { + "epoch": 1.903473327619281, + "grad_norm": 1.829437017440796, + "learning_rate": 1.9433280160072846e-05, + "loss": 0.6159, + "step": 11660 + }, + { + "epoch": 1.9036365862617852, + "grad_norm": 1.6510778665542603, + "learning_rate": 1.9433174844758313e-05, + "loss": 0.7643, + "step": 11661 + }, + { + "epoch": 1.9037998449042897, + "grad_norm": 1.6902453899383545, + "learning_rate": 1.943306951994457e-05, + "loss": 0.6069, + "step": 11662 + }, + { + "epoch": 1.9039631035467939, + "grad_norm": 1.6211744546890259, + "learning_rate": 1.943296418563173e-05, + "loss": 0.7689, + "step": 11663 + }, + { + "epoch": 1.9041263621892983, + "grad_norm": 1.6205840110778809, + "learning_rate": 1.94328588418199e-05, + "loss": 0.6317, + "step": 11664 + }, + { + "epoch": 1.9042896208318028, + "grad_norm": 1.683172345161438, + "learning_rate": 1.9432753488509182e-05, + "loss": 0.6475, + "step": 11665 + }, + { + "epoch": 1.9044528794743072, + "grad_norm": 1.7775675058364868, + "learning_rate": 1.9432648125699685e-05, + "loss": 0.6487, + "step": 11666 + }, + { + "epoch": 1.9046161381168116, + "grad_norm": 2.018979787826538, + "learning_rate": 1.9432542753391512e-05, + "loss": 0.796, + "step": 11667 + }, + { + "epoch": 1.904779396759316, + "grad_norm": 1.7016969919204712, + "learning_rate": 1.943243737158477e-05, + "loss": 0.6707, + "step": 11668 + }, + { + "epoch": 1.9049426554018203, + "grad_norm": 1.751571536064148, + "learning_rate": 1.943233198027957e-05, + "loss": 0.5765, + "step": 11669 + }, + { + "epoch": 1.9051059140443247, + "grad_norm": 1.912782907485962, + "learning_rate": 1.943222657947601e-05, + "loss": 0.6805, + "step": 11670 + }, + { + "epoch": 1.9052691726868292, + "grad_norm": 1.5132701396942139, + "learning_rate": 1.94321211691742e-05, + "loss": 0.6466, + "step": 11671 + }, + { + "epoch": 1.9054324313293334, + "grad_norm": 2.2362430095672607, + "learning_rate": 1.9432015749374254e-05, + "loss": 0.712, + "step": 11672 + }, + { + "epoch": 1.9055956899718378, + "grad_norm": 1.6058048009872437, + "learning_rate": 1.9431910320076265e-05, + "loss": 0.6173, + "step": 11673 + }, + { + "epoch": 1.9057589486143423, + "grad_norm": 1.3507256507873535, + "learning_rate": 1.9431804881280346e-05, + "loss": 0.5629, + "step": 11674 + }, + { + "epoch": 1.9059222072568467, + "grad_norm": 2.0930521488189697, + "learning_rate": 1.94316994329866e-05, + "loss": 0.8736, + "step": 11675 + }, + { + "epoch": 1.9060854658993511, + "grad_norm": 1.5851383209228516, + "learning_rate": 1.9431593975195134e-05, + "loss": 0.6051, + "step": 11676 + }, + { + "epoch": 1.9062487245418556, + "grad_norm": 2.222877264022827, + "learning_rate": 1.943148850790606e-05, + "loss": 0.7083, + "step": 11677 + }, + { + "epoch": 1.9064119831843598, + "grad_norm": 1.633547067642212, + "learning_rate": 1.943138303111948e-05, + "loss": 0.7152, + "step": 11678 + }, + { + "epoch": 1.9065752418268642, + "grad_norm": 1.6204349994659424, + "learning_rate": 1.9431277544835497e-05, + "loss": 0.6645, + "step": 11679 + }, + { + "epoch": 1.9067385004693684, + "grad_norm": 1.8937236070632935, + "learning_rate": 1.943117204905422e-05, + "loss": 0.7346, + "step": 11680 + }, + { + "epoch": 1.9069017591118729, + "grad_norm": 1.3777647018432617, + "learning_rate": 1.9431066543775753e-05, + "loss": 0.6646, + "step": 11681 + }, + { + "epoch": 1.9070650177543773, + "grad_norm": 1.5879884958267212, + "learning_rate": 1.9430961029000207e-05, + "loss": 0.6131, + "step": 11682 + }, + { + "epoch": 1.9072282763968817, + "grad_norm": 1.517434000968933, + "learning_rate": 1.943085550472769e-05, + "loss": 0.6133, + "step": 11683 + }, + { + "epoch": 1.9073915350393862, + "grad_norm": 1.3848624229431152, + "learning_rate": 1.9430749970958297e-05, + "loss": 0.5924, + "step": 11684 + }, + { + "epoch": 1.9075547936818906, + "grad_norm": 1.649513840675354, + "learning_rate": 1.9430644427692143e-05, + "loss": 0.7212, + "step": 11685 + }, + { + "epoch": 1.907718052324395, + "grad_norm": 1.3362343311309814, + "learning_rate": 1.9430538874929334e-05, + "loss": 0.6522, + "step": 11686 + }, + { + "epoch": 1.9078813109668993, + "grad_norm": 1.3183152675628662, + "learning_rate": 1.9430433312669974e-05, + "loss": 0.5194, + "step": 11687 + }, + { + "epoch": 1.9080445696094037, + "grad_norm": 1.4779022932052612, + "learning_rate": 1.9430327740914173e-05, + "loss": 0.5552, + "step": 11688 + }, + { + "epoch": 1.908207828251908, + "grad_norm": 1.6023788452148438, + "learning_rate": 1.943022215966203e-05, + "loss": 0.6728, + "step": 11689 + }, + { + "epoch": 1.9083710868944124, + "grad_norm": 2.2967560291290283, + "learning_rate": 1.9430116568913656e-05, + "loss": 0.7415, + "step": 11690 + }, + { + "epoch": 1.9085343455369168, + "grad_norm": 1.6845508813858032, + "learning_rate": 1.943001096866916e-05, + "loss": 0.605, + "step": 11691 + }, + { + "epoch": 1.9086976041794212, + "grad_norm": 1.5847089290618896, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.596, + "step": 11692 + }, + { + "epoch": 1.9088608628219257, + "grad_norm": 1.711670994758606, + "learning_rate": 1.9429799739692218e-05, + "loss": 0.6932, + "step": 11693 + }, + { + "epoch": 1.9090241214644301, + "grad_norm": 1.7254462242126465, + "learning_rate": 1.9429694110959986e-05, + "loss": 0.7109, + "step": 11694 + }, + { + "epoch": 1.9091873801069346, + "grad_norm": 1.537634015083313, + "learning_rate": 1.942958847273205e-05, + "loss": 0.6431, + "step": 11695 + }, + { + "epoch": 1.9093506387494388, + "grad_norm": 1.6208806037902832, + "learning_rate": 1.9429482825008527e-05, + "loss": 0.7688, + "step": 11696 + }, + { + "epoch": 1.9095138973919432, + "grad_norm": 1.7726614475250244, + "learning_rate": 1.9429377167789513e-05, + "loss": 0.7927, + "step": 11697 + }, + { + "epoch": 1.9096771560344474, + "grad_norm": 1.5612629652023315, + "learning_rate": 1.942927150107512e-05, + "loss": 0.6202, + "step": 11698 + }, + { + "epoch": 1.9098404146769519, + "grad_norm": 1.593611240386963, + "learning_rate": 1.9429165824865452e-05, + "loss": 0.6984, + "step": 11699 + }, + { + "epoch": 1.9100036733194563, + "grad_norm": 1.4913840293884277, + "learning_rate": 1.942906013916062e-05, + "loss": 0.7002, + "step": 11700 + }, + { + "epoch": 1.9101669319619607, + "grad_norm": 1.7763315439224243, + "learning_rate": 1.9428954443960722e-05, + "loss": 0.6482, + "step": 11701 + }, + { + "epoch": 1.9103301906044652, + "grad_norm": 1.5564844608306885, + "learning_rate": 1.9428848739265874e-05, + "loss": 0.6101, + "step": 11702 + }, + { + "epoch": 1.9104934492469696, + "grad_norm": 1.5636448860168457, + "learning_rate": 1.9428743025076177e-05, + "loss": 0.581, + "step": 11703 + }, + { + "epoch": 1.910656707889474, + "grad_norm": 1.5162380933761597, + "learning_rate": 1.9428637301391734e-05, + "loss": 0.539, + "step": 11704 + }, + { + "epoch": 1.9108199665319783, + "grad_norm": 1.760331153869629, + "learning_rate": 1.942853156821266e-05, + "loss": 0.6884, + "step": 11705 + }, + { + "epoch": 1.9109832251744827, + "grad_norm": 1.3705326318740845, + "learning_rate": 1.9428425825539056e-05, + "loss": 0.5517, + "step": 11706 + }, + { + "epoch": 1.911146483816987, + "grad_norm": 1.3694878816604614, + "learning_rate": 1.9428320073371027e-05, + "loss": 0.576, + "step": 11707 + }, + { + "epoch": 1.9113097424594914, + "grad_norm": 1.7955211400985718, + "learning_rate": 1.9428214311708687e-05, + "loss": 0.8505, + "step": 11708 + }, + { + "epoch": 1.9114730011019958, + "grad_norm": 1.8304907083511353, + "learning_rate": 1.9428108540552134e-05, + "loss": 0.7603, + "step": 11709 + }, + { + "epoch": 1.9116362597445002, + "grad_norm": 1.327850580215454, + "learning_rate": 1.942800275990148e-05, + "loss": 0.5691, + "step": 11710 + }, + { + "epoch": 1.9117995183870047, + "grad_norm": 1.3578276634216309, + "learning_rate": 1.942789696975683e-05, + "loss": 0.5837, + "step": 11711 + }, + { + "epoch": 1.9119627770295091, + "grad_norm": 1.7404872179031372, + "learning_rate": 1.942779117011829e-05, + "loss": 0.8727, + "step": 11712 + }, + { + "epoch": 1.9121260356720133, + "grad_norm": 1.5426862239837646, + "learning_rate": 1.9427685360985963e-05, + "loss": 0.6383, + "step": 11713 + }, + { + "epoch": 1.9122892943145178, + "grad_norm": 1.4933158159255981, + "learning_rate": 1.9427579542359966e-05, + "loss": 0.5457, + "step": 11714 + }, + { + "epoch": 1.9124525529570222, + "grad_norm": 1.4081796407699585, + "learning_rate": 1.9427473714240393e-05, + "loss": 0.615, + "step": 11715 + }, + { + "epoch": 1.9126158115995264, + "grad_norm": 1.9119536876678467, + "learning_rate": 1.942736787662736e-05, + "loss": 0.8581, + "step": 11716 + }, + { + "epoch": 1.9127790702420309, + "grad_norm": 1.7762911319732666, + "learning_rate": 1.942726202952097e-05, + "loss": 0.691, + "step": 11717 + }, + { + "epoch": 1.9129423288845353, + "grad_norm": 2.015127182006836, + "learning_rate": 1.9427156172921328e-05, + "loss": 1.461, + "step": 11718 + }, + { + "epoch": 1.9131055875270397, + "grad_norm": 1.500795841217041, + "learning_rate": 1.9427050306828543e-05, + "loss": 0.6182, + "step": 11719 + }, + { + "epoch": 1.9132688461695442, + "grad_norm": 1.5139065980911255, + "learning_rate": 1.942694443124272e-05, + "loss": 0.5902, + "step": 11720 + }, + { + "epoch": 1.9134321048120486, + "grad_norm": 1.9228928089141846, + "learning_rate": 1.942683854616397e-05, + "loss": 0.622, + "step": 11721 + }, + { + "epoch": 1.9135953634545528, + "grad_norm": 1.8428676128387451, + "learning_rate": 1.9426732651592393e-05, + "loss": 0.7201, + "step": 11722 + }, + { + "epoch": 1.9137586220970573, + "grad_norm": 1.5700511932373047, + "learning_rate": 1.94266267475281e-05, + "loss": 0.5663, + "step": 11723 + }, + { + "epoch": 1.9139218807395615, + "grad_norm": 1.761622667312622, + "learning_rate": 1.9426520833971194e-05, + "loss": 0.8199, + "step": 11724 + }, + { + "epoch": 1.914085139382066, + "grad_norm": 1.6457796096801758, + "learning_rate": 1.9426414910921785e-05, + "loss": 0.6472, + "step": 11725 + }, + { + "epoch": 1.9142483980245704, + "grad_norm": 1.7948859930038452, + "learning_rate": 1.942630897837998e-05, + "loss": 0.6941, + "step": 11726 + }, + { + "epoch": 1.9144116566670748, + "grad_norm": 1.5503339767456055, + "learning_rate": 1.9426203036345884e-05, + "loss": 0.5245, + "step": 11727 + }, + { + "epoch": 1.9145749153095792, + "grad_norm": 1.5167207717895508, + "learning_rate": 1.9426097084819605e-05, + "loss": 0.5713, + "step": 11728 + }, + { + "epoch": 1.9147381739520837, + "grad_norm": 2.0027010440826416, + "learning_rate": 1.9425991123801243e-05, + "loss": 0.8289, + "step": 11729 + }, + { + "epoch": 1.9149014325945881, + "grad_norm": 1.962835431098938, + "learning_rate": 1.942588515329092e-05, + "loss": 0.7441, + "step": 11730 + }, + { + "epoch": 1.9150646912370923, + "grad_norm": 1.800398349761963, + "learning_rate": 1.9425779173288727e-05, + "loss": 0.8642, + "step": 11731 + }, + { + "epoch": 1.9152279498795968, + "grad_norm": 1.8643889427185059, + "learning_rate": 1.9425673183794774e-05, + "loss": 0.6075, + "step": 11732 + }, + { + "epoch": 1.915391208522101, + "grad_norm": 1.6960248947143555, + "learning_rate": 1.9425567184809178e-05, + "loss": 0.767, + "step": 11733 + }, + { + "epoch": 1.9155544671646054, + "grad_norm": 1.8338183164596558, + "learning_rate": 1.9425461176332035e-05, + "loss": 0.5357, + "step": 11734 + }, + { + "epoch": 1.9157177258071099, + "grad_norm": 1.7097474336624146, + "learning_rate": 1.9425355158363454e-05, + "loss": 0.7099, + "step": 11735 + }, + { + "epoch": 1.9158809844496143, + "grad_norm": 1.8905770778656006, + "learning_rate": 1.9425249130903544e-05, + "loss": 0.7037, + "step": 11736 + }, + { + "epoch": 1.9160442430921187, + "grad_norm": 1.4940587282180786, + "learning_rate": 1.942514309395241e-05, + "loss": 0.654, + "step": 11737 + }, + { + "epoch": 1.9162075017346232, + "grad_norm": 1.5319099426269531, + "learning_rate": 1.942503704751016e-05, + "loss": 0.6736, + "step": 11738 + }, + { + "epoch": 1.9163707603771276, + "grad_norm": 2.107459545135498, + "learning_rate": 1.9424930991576897e-05, + "loss": 0.8462, + "step": 11739 + }, + { + "epoch": 1.9165340190196318, + "grad_norm": 1.6465506553649902, + "learning_rate": 1.9424824926152736e-05, + "loss": 0.6625, + "step": 11740 + }, + { + "epoch": 1.9166972776621363, + "grad_norm": 1.8361597061157227, + "learning_rate": 1.9424718851237774e-05, + "loss": 0.7942, + "step": 11741 + }, + { + "epoch": 1.9168605363046405, + "grad_norm": 1.6612377166748047, + "learning_rate": 1.9424612766832127e-05, + "loss": 0.723, + "step": 11742 + }, + { + "epoch": 1.917023794947145, + "grad_norm": 1.5709806680679321, + "learning_rate": 1.9424506672935896e-05, + "loss": 0.6978, + "step": 11743 + }, + { + "epoch": 1.9171870535896494, + "grad_norm": 1.406995415687561, + "learning_rate": 1.942440056954919e-05, + "loss": 0.5861, + "step": 11744 + }, + { + "epoch": 1.9173503122321538, + "grad_norm": 1.3830050230026245, + "learning_rate": 1.9424294456672115e-05, + "loss": 0.5819, + "step": 11745 + }, + { + "epoch": 1.9175135708746582, + "grad_norm": 1.52486252784729, + "learning_rate": 1.9424188334304778e-05, + "loss": 0.5934, + "step": 11746 + }, + { + "epoch": 1.9176768295171627, + "grad_norm": 1.3594576120376587, + "learning_rate": 1.942408220244728e-05, + "loss": 0.5656, + "step": 11747 + }, + { + "epoch": 1.9178400881596671, + "grad_norm": 1.503672480583191, + "learning_rate": 1.942397606109974e-05, + "loss": 0.508, + "step": 11748 + }, + { + "epoch": 1.9180033468021713, + "grad_norm": 1.5605601072311401, + "learning_rate": 1.942386991026226e-05, + "loss": 0.6336, + "step": 11749 + }, + { + "epoch": 1.9181666054446758, + "grad_norm": 1.738492727279663, + "learning_rate": 1.9423763749934942e-05, + "loss": 0.6391, + "step": 11750 + }, + { + "epoch": 1.91832986408718, + "grad_norm": 1.7043906450271606, + "learning_rate": 1.9423657580117898e-05, + "loss": 0.6923, + "step": 11751 + }, + { + "epoch": 1.9184931227296844, + "grad_norm": 2.0062167644500732, + "learning_rate": 1.942355140081123e-05, + "loss": 0.8504, + "step": 11752 + }, + { + "epoch": 1.9186563813721889, + "grad_norm": 1.654372215270996, + "learning_rate": 1.942344521201505e-05, + "loss": 0.7043, + "step": 11753 + }, + { + "epoch": 1.9188196400146933, + "grad_norm": 1.4305592775344849, + "learning_rate": 1.9423339013729466e-05, + "loss": 0.4818, + "step": 11754 + }, + { + "epoch": 1.9189828986571977, + "grad_norm": 1.3753505945205688, + "learning_rate": 1.942323280595458e-05, + "loss": 0.6179, + "step": 11755 + }, + { + "epoch": 1.9191461572997022, + "grad_norm": 1.6097384691238403, + "learning_rate": 1.9423126588690502e-05, + "loss": 0.6896, + "step": 11756 + }, + { + "epoch": 1.9193094159422064, + "grad_norm": 1.5493942499160767, + "learning_rate": 1.9423020361937336e-05, + "loss": 0.7047, + "step": 11757 + }, + { + "epoch": 1.9194726745847108, + "grad_norm": 1.7837023735046387, + "learning_rate": 1.942291412569519e-05, + "loss": 0.8157, + "step": 11758 + }, + { + "epoch": 1.9196359332272153, + "grad_norm": 1.3748022317886353, + "learning_rate": 1.9422807879964178e-05, + "loss": 0.5697, + "step": 11759 + }, + { + "epoch": 1.9197991918697195, + "grad_norm": 1.8129335641860962, + "learning_rate": 1.9422701624744396e-05, + "loss": 0.8158, + "step": 11760 + }, + { + "epoch": 1.919962450512224, + "grad_norm": 1.8936262130737305, + "learning_rate": 1.9422595360035958e-05, + "loss": 0.6479, + "step": 11761 + }, + { + "epoch": 1.9201257091547284, + "grad_norm": 1.962675929069519, + "learning_rate": 1.942248908583897e-05, + "loss": 0.6825, + "step": 11762 + }, + { + "epoch": 1.9202889677972328, + "grad_norm": 1.7639846801757812, + "learning_rate": 1.9422382802153533e-05, + "loss": 0.8522, + "step": 11763 + }, + { + "epoch": 1.9204522264397372, + "grad_norm": 1.6288803815841675, + "learning_rate": 1.9422276508979763e-05, + "loss": 0.6157, + "step": 11764 + }, + { + "epoch": 1.9206154850822417, + "grad_norm": 2.018666982650757, + "learning_rate": 1.942217020631776e-05, + "loss": 0.7896, + "step": 11765 + }, + { + "epoch": 1.9207787437247459, + "grad_norm": 1.672440528869629, + "learning_rate": 1.9422063894167638e-05, + "loss": 0.7105, + "step": 11766 + }, + { + "epoch": 1.9209420023672503, + "grad_norm": 1.580072283744812, + "learning_rate": 1.94219575725295e-05, + "loss": 0.5938, + "step": 11767 + }, + { + "epoch": 1.9211052610097545, + "grad_norm": 1.575501799583435, + "learning_rate": 1.9421851241403452e-05, + "loss": 0.6529, + "step": 11768 + }, + { + "epoch": 1.921268519652259, + "grad_norm": 1.7104986906051636, + "learning_rate": 1.94217449007896e-05, + "loss": 0.6936, + "step": 11769 + }, + { + "epoch": 1.9214317782947634, + "grad_norm": 1.6789088249206543, + "learning_rate": 1.9421638550688057e-05, + "loss": 0.6829, + "step": 11770 + }, + { + "epoch": 1.9215950369372679, + "grad_norm": 1.5894490480422974, + "learning_rate": 1.942153219109892e-05, + "loss": 0.6955, + "step": 11771 + }, + { + "epoch": 1.9217582955797723, + "grad_norm": 1.3835574388504028, + "learning_rate": 1.942142582202231e-05, + "loss": 0.5821, + "step": 11772 + }, + { + "epoch": 1.9219215542222767, + "grad_norm": 1.5929983854293823, + "learning_rate": 1.9421319443458325e-05, + "loss": 0.6847, + "step": 11773 + }, + { + "epoch": 1.9220848128647812, + "grad_norm": 1.5981950759887695, + "learning_rate": 1.942121305540707e-05, + "loss": 0.5845, + "step": 11774 + }, + { + "epoch": 1.9222480715072854, + "grad_norm": 1.4469727277755737, + "learning_rate": 1.942110665786866e-05, + "loss": 0.661, + "step": 11775 + }, + { + "epoch": 1.9224113301497898, + "grad_norm": 1.5446134805679321, + "learning_rate": 1.9421000250843196e-05, + "loss": 0.6548, + "step": 11776 + }, + { + "epoch": 1.922574588792294, + "grad_norm": 1.4657834768295288, + "learning_rate": 1.942089383433079e-05, + "loss": 0.564, + "step": 11777 + }, + { + "epoch": 1.9227378474347985, + "grad_norm": 1.421641230583191, + "learning_rate": 1.9420787408331544e-05, + "loss": 0.5926, + "step": 11778 + }, + { + "epoch": 1.922901106077303, + "grad_norm": 1.7069646120071411, + "learning_rate": 1.942068097284557e-05, + "loss": 0.7318, + "step": 11779 + }, + { + "epoch": 1.9230643647198074, + "grad_norm": 1.5186021327972412, + "learning_rate": 1.942057452787297e-05, + "loss": 0.6431, + "step": 11780 + }, + { + "epoch": 1.9232276233623118, + "grad_norm": 1.4159257411956787, + "learning_rate": 1.9420468073413854e-05, + "loss": 0.5645, + "step": 11781 + }, + { + "epoch": 1.9233908820048162, + "grad_norm": 1.5883588790893555, + "learning_rate": 1.9420361609468332e-05, + "loss": 0.6889, + "step": 11782 + }, + { + "epoch": 1.9235541406473207, + "grad_norm": 1.6045573949813843, + "learning_rate": 1.9420255136036505e-05, + "loss": 0.6279, + "step": 11783 + }, + { + "epoch": 1.9237173992898249, + "grad_norm": 1.5444530248641968, + "learning_rate": 1.9420148653118485e-05, + "loss": 0.6642, + "step": 11784 + }, + { + "epoch": 1.9238806579323293, + "grad_norm": 1.7348703145980835, + "learning_rate": 1.9420042160714377e-05, + "loss": 0.6852, + "step": 11785 + }, + { + "epoch": 1.9240439165748335, + "grad_norm": 2.358764886856079, + "learning_rate": 1.941993565882429e-05, + "loss": 0.7529, + "step": 11786 + }, + { + "epoch": 1.924207175217338, + "grad_norm": 1.7129441499710083, + "learning_rate": 1.9419829147448327e-05, + "loss": 0.7849, + "step": 11787 + }, + { + "epoch": 1.9243704338598424, + "grad_norm": 1.803831696510315, + "learning_rate": 1.9419722626586605e-05, + "loss": 0.7874, + "step": 11788 + }, + { + "epoch": 1.9245336925023468, + "grad_norm": 1.5189048051834106, + "learning_rate": 1.941961609623922e-05, + "loss": 0.5925, + "step": 11789 + }, + { + "epoch": 1.9246969511448513, + "grad_norm": 1.806084394454956, + "learning_rate": 1.9419509556406287e-05, + "loss": 0.7054, + "step": 11790 + }, + { + "epoch": 1.9248602097873557, + "grad_norm": 2.151031732559204, + "learning_rate": 1.9419403007087908e-05, + "loss": 0.7264, + "step": 11791 + }, + { + "epoch": 1.9250234684298602, + "grad_norm": 1.8768466711044312, + "learning_rate": 1.9419296448284193e-05, + "loss": 0.7594, + "step": 11792 + }, + { + "epoch": 1.9251867270723644, + "grad_norm": 1.6631461381912231, + "learning_rate": 1.941918987999525e-05, + "loss": 0.7077, + "step": 11793 + }, + { + "epoch": 1.9253499857148688, + "grad_norm": 2.1028552055358887, + "learning_rate": 1.9419083302221185e-05, + "loss": 0.8045, + "step": 11794 + }, + { + "epoch": 1.925513244357373, + "grad_norm": 1.4776742458343506, + "learning_rate": 1.9418976714962106e-05, + "loss": 0.591, + "step": 11795 + }, + { + "epoch": 1.9256765029998775, + "grad_norm": 1.4929262399673462, + "learning_rate": 1.9418870118218118e-05, + "loss": 0.6179, + "step": 11796 + }, + { + "epoch": 1.925839761642382, + "grad_norm": 1.5088690519332886, + "learning_rate": 1.9418763511989333e-05, + "loss": 0.6251, + "step": 11797 + }, + { + "epoch": 1.9260030202848863, + "grad_norm": 1.5284916162490845, + "learning_rate": 1.9418656896275855e-05, + "loss": 0.5944, + "step": 11798 + }, + { + "epoch": 1.9261662789273908, + "grad_norm": 1.7609362602233887, + "learning_rate": 1.941855027107779e-05, + "loss": 0.7023, + "step": 11799 + }, + { + "epoch": 1.9263295375698952, + "grad_norm": 1.5750141143798828, + "learning_rate": 1.941844363639525e-05, + "loss": 0.6773, + "step": 11800 + }, + { + "epoch": 1.9264927962123994, + "grad_norm": 1.6417129039764404, + "learning_rate": 1.941833699222834e-05, + "loss": 0.7012, + "step": 11801 + }, + { + "epoch": 1.9266560548549039, + "grad_norm": 1.643852710723877, + "learning_rate": 1.9418230338577164e-05, + "loss": 0.6879, + "step": 11802 + }, + { + "epoch": 1.9268193134974083, + "grad_norm": 1.7371487617492676, + "learning_rate": 1.9418123675441832e-05, + "loss": 0.7445, + "step": 11803 + }, + { + "epoch": 1.9269825721399125, + "grad_norm": 1.7611974477767944, + "learning_rate": 1.9418017002822455e-05, + "loss": 0.6205, + "step": 11804 + }, + { + "epoch": 1.927145830782417, + "grad_norm": 1.5743842124938965, + "learning_rate": 1.9417910320719135e-05, + "loss": 0.6265, + "step": 11805 + }, + { + "epoch": 1.9273090894249214, + "grad_norm": 1.7411830425262451, + "learning_rate": 1.9417803629131984e-05, + "loss": 0.6682, + "step": 11806 + }, + { + "epoch": 1.9274723480674258, + "grad_norm": 1.7101434469223022, + "learning_rate": 1.9417696928061106e-05, + "loss": 0.6739, + "step": 11807 + }, + { + "epoch": 1.9276356067099303, + "grad_norm": 1.4828531742095947, + "learning_rate": 1.9417590217506614e-05, + "loss": 0.6076, + "step": 11808 + }, + { + "epoch": 1.9277988653524347, + "grad_norm": 1.9070714712142944, + "learning_rate": 1.9417483497468605e-05, + "loss": 0.7921, + "step": 11809 + }, + { + "epoch": 1.927962123994939, + "grad_norm": 1.7900432348251343, + "learning_rate": 1.9417376767947194e-05, + "loss": 0.7557, + "step": 11810 + }, + { + "epoch": 1.9281253826374434, + "grad_norm": 1.9095922708511353, + "learning_rate": 1.9417270028942488e-05, + "loss": 0.795, + "step": 11811 + }, + { + "epoch": 1.9282886412799478, + "grad_norm": 1.9155068397521973, + "learning_rate": 1.9417163280454594e-05, + "loss": 0.712, + "step": 11812 + }, + { + "epoch": 1.928451899922452, + "grad_norm": 1.8026355504989624, + "learning_rate": 1.941705652248362e-05, + "loss": 0.7098, + "step": 11813 + }, + { + "epoch": 1.9286151585649565, + "grad_norm": 1.7470694780349731, + "learning_rate": 1.941694975502967e-05, + "loss": 0.6595, + "step": 11814 + }, + { + "epoch": 1.928778417207461, + "grad_norm": 1.728690266609192, + "learning_rate": 1.9416842978092856e-05, + "loss": 0.6592, + "step": 11815 + }, + { + "epoch": 1.9289416758499653, + "grad_norm": 1.8231247663497925, + "learning_rate": 1.9416736191673285e-05, + "loss": 0.6629, + "step": 11816 + }, + { + "epoch": 1.9291049344924698, + "grad_norm": 1.4220443964004517, + "learning_rate": 1.9416629395771058e-05, + "loss": 0.5997, + "step": 11817 + }, + { + "epoch": 1.9292681931349742, + "grad_norm": 1.7691798210144043, + "learning_rate": 1.9416522590386294e-05, + "loss": 0.7333, + "step": 11818 + }, + { + "epoch": 1.9294314517774784, + "grad_norm": 1.833841323852539, + "learning_rate": 1.9416415775519087e-05, + "loss": 0.7356, + "step": 11819 + }, + { + "epoch": 1.9295947104199829, + "grad_norm": 1.625185489654541, + "learning_rate": 1.9416308951169557e-05, + "loss": 0.6786, + "step": 11820 + }, + { + "epoch": 1.929757969062487, + "grad_norm": 1.5889050960540771, + "learning_rate": 1.9416202117337805e-05, + "loss": 0.7031, + "step": 11821 + }, + { + "epoch": 1.9299212277049915, + "grad_norm": 1.4354424476623535, + "learning_rate": 1.9416095274023942e-05, + "loss": 0.5267, + "step": 11822 + }, + { + "epoch": 1.930084486347496, + "grad_norm": 1.6790525913238525, + "learning_rate": 1.941598842122807e-05, + "loss": 0.6366, + "step": 11823 + }, + { + "epoch": 1.9302477449900004, + "grad_norm": 1.3715513944625854, + "learning_rate": 1.9415881558950302e-05, + "loss": 0.5502, + "step": 11824 + }, + { + "epoch": 1.9304110036325048, + "grad_norm": 1.9907474517822266, + "learning_rate": 1.9415774687190746e-05, + "loss": 0.7156, + "step": 11825 + }, + { + "epoch": 1.9305742622750093, + "grad_norm": 1.5361100435256958, + "learning_rate": 1.9415667805949506e-05, + "loss": 0.6144, + "step": 11826 + }, + { + "epoch": 1.9307375209175137, + "grad_norm": 1.6464711427688599, + "learning_rate": 1.941556091522669e-05, + "loss": 0.6942, + "step": 11827 + }, + { + "epoch": 1.930900779560018, + "grad_norm": 1.4327415227890015, + "learning_rate": 1.9415454015022405e-05, + "loss": 0.5497, + "step": 11828 + }, + { + "epoch": 1.9310640382025224, + "grad_norm": 1.3958780765533447, + "learning_rate": 1.9415347105336762e-05, + "loss": 0.5283, + "step": 11829 + }, + { + "epoch": 1.9312272968450266, + "grad_norm": 1.3692985773086548, + "learning_rate": 1.9415240186169866e-05, + "loss": 0.5165, + "step": 11830 + }, + { + "epoch": 1.931390555487531, + "grad_norm": 1.6751124858856201, + "learning_rate": 1.9415133257521828e-05, + "loss": 0.6799, + "step": 11831 + }, + { + "epoch": 1.9315538141300355, + "grad_norm": 1.6611381769180298, + "learning_rate": 1.941502631939275e-05, + "loss": 0.6025, + "step": 11832 + }, + { + "epoch": 1.93171707277254, + "grad_norm": 1.6147339344024658, + "learning_rate": 1.941491937178275e-05, + "loss": 0.5768, + "step": 11833 + }, + { + "epoch": 1.9318803314150443, + "grad_norm": 1.6821485757827759, + "learning_rate": 1.941481241469192e-05, + "loss": 0.682, + "step": 11834 + }, + { + "epoch": 1.9320435900575488, + "grad_norm": 1.6029694080352783, + "learning_rate": 1.941470544812038e-05, + "loss": 0.6723, + "step": 11835 + }, + { + "epoch": 1.9322068487000532, + "grad_norm": 1.5421282052993774, + "learning_rate": 1.9414598472068236e-05, + "loss": 0.5409, + "step": 11836 + }, + { + "epoch": 1.9323701073425574, + "grad_norm": 1.5731903314590454, + "learning_rate": 1.941449148653559e-05, + "loss": 0.5826, + "step": 11837 + }, + { + "epoch": 1.9325333659850619, + "grad_norm": 1.4583938121795654, + "learning_rate": 1.9414384491522558e-05, + "loss": 0.6523, + "step": 11838 + }, + { + "epoch": 1.932696624627566, + "grad_norm": 1.7036464214324951, + "learning_rate": 1.941427748702924e-05, + "loss": 0.7176, + "step": 11839 + }, + { + "epoch": 1.9328598832700705, + "grad_norm": 2.0888636112213135, + "learning_rate": 1.9414170473055747e-05, + "loss": 0.7365, + "step": 11840 + }, + { + "epoch": 1.933023141912575, + "grad_norm": 1.5556607246398926, + "learning_rate": 1.9414063449602188e-05, + "loss": 0.5836, + "step": 11841 + }, + { + "epoch": 1.9331864005550794, + "grad_norm": 1.3760249614715576, + "learning_rate": 1.941395641666867e-05, + "loss": 0.5117, + "step": 11842 + }, + { + "epoch": 1.9333496591975838, + "grad_norm": 1.9275193214416504, + "learning_rate": 1.9413849374255302e-05, + "loss": 0.6407, + "step": 11843 + }, + { + "epoch": 1.9335129178400883, + "grad_norm": 1.7549632787704468, + "learning_rate": 1.9413742322362185e-05, + "loss": 0.5962, + "step": 11844 + }, + { + "epoch": 1.9336761764825927, + "grad_norm": 1.7436634302139282, + "learning_rate": 1.9413635260989437e-05, + "loss": 0.6884, + "step": 11845 + }, + { + "epoch": 1.933839435125097, + "grad_norm": 1.8717013597488403, + "learning_rate": 1.9413528190137158e-05, + "loss": 0.701, + "step": 11846 + }, + { + "epoch": 1.9340026937676014, + "grad_norm": 1.5096312761306763, + "learning_rate": 1.941342110980546e-05, + "loss": 0.6416, + "step": 11847 + }, + { + "epoch": 1.9341659524101056, + "grad_norm": 1.8403706550598145, + "learning_rate": 1.941331401999445e-05, + "loss": 0.7761, + "step": 11848 + }, + { + "epoch": 1.93432921105261, + "grad_norm": 1.6784875392913818, + "learning_rate": 1.941320692070423e-05, + "loss": 0.6734, + "step": 11849 + }, + { + "epoch": 1.9344924696951145, + "grad_norm": 1.7998906373977661, + "learning_rate": 1.941309981193492e-05, + "loss": 0.7674, + "step": 11850 + }, + { + "epoch": 1.934655728337619, + "grad_norm": 1.6041111946105957, + "learning_rate": 1.9412992693686618e-05, + "loss": 0.5633, + "step": 11851 + }, + { + "epoch": 1.9348189869801233, + "grad_norm": 1.6012353897094727, + "learning_rate": 1.9412885565959434e-05, + "loss": 0.6075, + "step": 11852 + }, + { + "epoch": 1.9349822456226278, + "grad_norm": 1.3093972206115723, + "learning_rate": 1.941277842875348e-05, + "loss": 0.5503, + "step": 11853 + }, + { + "epoch": 1.935145504265132, + "grad_norm": 2.0107674598693848, + "learning_rate": 1.9412671282068855e-05, + "loss": 0.6156, + "step": 11854 + }, + { + "epoch": 1.9353087629076364, + "grad_norm": 1.5864704847335815, + "learning_rate": 1.9412564125905677e-05, + "loss": 0.6223, + "step": 11855 + }, + { + "epoch": 1.9354720215501409, + "grad_norm": 1.8109952211380005, + "learning_rate": 1.941245696026405e-05, + "loss": 0.7665, + "step": 11856 + }, + { + "epoch": 1.935635280192645, + "grad_norm": 1.72359037399292, + "learning_rate": 1.9412349785144076e-05, + "loss": 0.5815, + "step": 11857 + }, + { + "epoch": 1.9357985388351495, + "grad_norm": 1.60701322555542, + "learning_rate": 1.9412242600545874e-05, + "loss": 0.6508, + "step": 11858 + }, + { + "epoch": 1.935961797477654, + "grad_norm": 1.7096107006072998, + "learning_rate": 1.941213540646954e-05, + "loss": 0.7131, + "step": 11859 + }, + { + "epoch": 1.9361250561201584, + "grad_norm": 2.1457674503326416, + "learning_rate": 1.94120282029152e-05, + "loss": 0.8556, + "step": 11860 + }, + { + "epoch": 1.9362883147626628, + "grad_norm": 1.6849135160446167, + "learning_rate": 1.941192098988294e-05, + "loss": 0.6628, + "step": 11861 + }, + { + "epoch": 1.9364515734051673, + "grad_norm": 1.505239725112915, + "learning_rate": 1.941181376737288e-05, + "loss": 0.6133, + "step": 11862 + }, + { + "epoch": 1.9366148320476715, + "grad_norm": 1.8007752895355225, + "learning_rate": 1.9411706535385127e-05, + "loss": 0.7739, + "step": 11863 + }, + { + "epoch": 1.936778090690176, + "grad_norm": 1.5837368965148926, + "learning_rate": 1.9411599293919785e-05, + "loss": 0.5988, + "step": 11864 + }, + { + "epoch": 1.9369413493326801, + "grad_norm": 1.5620118379592896, + "learning_rate": 1.9411492042976968e-05, + "loss": 0.5642, + "step": 11865 + }, + { + "epoch": 1.9371046079751846, + "grad_norm": 1.2763797044754028, + "learning_rate": 1.941138478255678e-05, + "loss": 0.5131, + "step": 11866 + }, + { + "epoch": 1.937267866617689, + "grad_norm": 1.4895514249801636, + "learning_rate": 1.941127751265933e-05, + "loss": 0.5883, + "step": 11867 + }, + { + "epoch": 1.9374311252601935, + "grad_norm": 1.436539888381958, + "learning_rate": 1.9411170233284728e-05, + "loss": 0.641, + "step": 11868 + }, + { + "epoch": 1.937594383902698, + "grad_norm": 1.6441245079040527, + "learning_rate": 1.941106294443308e-05, + "loss": 0.666, + "step": 11869 + }, + { + "epoch": 1.9377576425452023, + "grad_norm": 1.547269344329834, + "learning_rate": 1.9410955646104492e-05, + "loss": 0.6635, + "step": 11870 + }, + { + "epoch": 1.9379209011877068, + "grad_norm": 1.745683193206787, + "learning_rate": 1.9410848338299076e-05, + "loss": 0.6699, + "step": 11871 + }, + { + "epoch": 1.938084159830211, + "grad_norm": 1.8507052659988403, + "learning_rate": 1.9410741021016936e-05, + "loss": 0.8055, + "step": 11872 + }, + { + "epoch": 1.9382474184727154, + "grad_norm": 1.5772100687026978, + "learning_rate": 1.9410633694258182e-05, + "loss": 0.6043, + "step": 11873 + }, + { + "epoch": 1.9384106771152196, + "grad_norm": 1.426726222038269, + "learning_rate": 1.9410526358022925e-05, + "loss": 0.5194, + "step": 11874 + }, + { + "epoch": 1.938573935757724, + "grad_norm": 1.352931261062622, + "learning_rate": 1.941041901231127e-05, + "loss": 0.5508, + "step": 11875 + }, + { + "epoch": 1.9387371944002285, + "grad_norm": 1.5917284488677979, + "learning_rate": 1.9410311657123325e-05, + "loss": 0.6306, + "step": 11876 + }, + { + "epoch": 1.938900453042733, + "grad_norm": 1.7282652854919434, + "learning_rate": 1.94102042924592e-05, + "loss": 0.6895, + "step": 11877 + }, + { + "epoch": 1.9390637116852374, + "grad_norm": 1.617829442024231, + "learning_rate": 1.9410096918318998e-05, + "loss": 0.6538, + "step": 11878 + }, + { + "epoch": 1.9392269703277418, + "grad_norm": 1.391880989074707, + "learning_rate": 1.9409989534702835e-05, + "loss": 0.6143, + "step": 11879 + }, + { + "epoch": 1.9393902289702463, + "grad_norm": 1.6176685094833374, + "learning_rate": 1.9409882141610813e-05, + "loss": 0.6173, + "step": 11880 + }, + { + "epoch": 1.9395534876127505, + "grad_norm": 1.3883602619171143, + "learning_rate": 1.940977473904304e-05, + "loss": 0.5834, + "step": 11881 + }, + { + "epoch": 1.939716746255255, + "grad_norm": 1.914481520652771, + "learning_rate": 1.9409667326999632e-05, + "loss": 0.6893, + "step": 11882 + }, + { + "epoch": 1.9398800048977591, + "grad_norm": 1.6219054460525513, + "learning_rate": 1.9409559905480683e-05, + "loss": 0.6163, + "step": 11883 + }, + { + "epoch": 1.9400432635402636, + "grad_norm": 1.702161192893982, + "learning_rate": 1.9409452474486315e-05, + "loss": 0.6481, + "step": 11884 + }, + { + "epoch": 1.940206522182768, + "grad_norm": 1.6728965044021606, + "learning_rate": 1.9409345034016634e-05, + "loss": 0.6089, + "step": 11885 + }, + { + "epoch": 1.9403697808252724, + "grad_norm": 1.6810117959976196, + "learning_rate": 1.940923758407174e-05, + "loss": 0.7011, + "step": 11886 + }, + { + "epoch": 1.9405330394677769, + "grad_norm": 1.6566284894943237, + "learning_rate": 1.940913012465175e-05, + "loss": 0.7046, + "step": 11887 + }, + { + "epoch": 1.9406962981102813, + "grad_norm": 1.9351093769073486, + "learning_rate": 1.9409022655756767e-05, + "loss": 0.6988, + "step": 11888 + }, + { + "epoch": 1.9408595567527858, + "grad_norm": 1.8713605403900146, + "learning_rate": 1.94089151773869e-05, + "loss": 0.7677, + "step": 11889 + }, + { + "epoch": 1.94102281539529, + "grad_norm": 1.6238309144973755, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.5783, + "step": 11890 + }, + { + "epoch": 1.9411860740377944, + "grad_norm": 1.6282639503479004, + "learning_rate": 1.940870019222295e-05, + "loss": 0.7235, + "step": 11891 + }, + { + "epoch": 1.9413493326802986, + "grad_norm": 1.4680988788604736, + "learning_rate": 1.940859268542908e-05, + "loss": 0.6437, + "step": 11892 + }, + { + "epoch": 1.941512591322803, + "grad_norm": 1.4780998229980469, + "learning_rate": 1.940848516916076e-05, + "loss": 0.5537, + "step": 11893 + }, + { + "epoch": 1.9416758499653075, + "grad_norm": 1.5085655450820923, + "learning_rate": 1.94083776434181e-05, + "loss": 0.5834, + "step": 11894 + }, + { + "epoch": 1.941839108607812, + "grad_norm": 1.606579065322876, + "learning_rate": 1.940827010820121e-05, + "loss": 0.5493, + "step": 11895 + }, + { + "epoch": 1.9420023672503164, + "grad_norm": 1.626221776008606, + "learning_rate": 1.940816256351019e-05, + "loss": 0.6152, + "step": 11896 + }, + { + "epoch": 1.9421656258928208, + "grad_norm": 1.639137625694275, + "learning_rate": 1.9408055009345154e-05, + "loss": 0.7171, + "step": 11897 + }, + { + "epoch": 1.942328884535325, + "grad_norm": 1.8815704584121704, + "learning_rate": 1.940794744570621e-05, + "loss": 0.6404, + "step": 11898 + }, + { + "epoch": 1.9424921431778295, + "grad_norm": 1.4051271677017212, + "learning_rate": 1.940783987259346e-05, + "loss": 0.669, + "step": 11899 + }, + { + "epoch": 1.942655401820334, + "grad_norm": 1.8181684017181396, + "learning_rate": 1.9407732290007023e-05, + "loss": 0.8261, + "step": 11900 + }, + { + "epoch": 1.9428186604628381, + "grad_norm": 1.7623094320297241, + "learning_rate": 1.9407624697947003e-05, + "loss": 0.6683, + "step": 11901 + }, + { + "epoch": 1.9429819191053426, + "grad_norm": 1.4922568798065186, + "learning_rate": 1.9407517096413505e-05, + "loss": 0.5939, + "step": 11902 + }, + { + "epoch": 1.943145177747847, + "grad_norm": 1.7313512563705444, + "learning_rate": 1.9407409485406638e-05, + "loss": 0.7464, + "step": 11903 + }, + { + "epoch": 1.9433084363903514, + "grad_norm": 1.9489524364471436, + "learning_rate": 1.9407301864926514e-05, + "loss": 0.77, + "step": 11904 + }, + { + "epoch": 1.9434716950328559, + "grad_norm": 1.9707510471343994, + "learning_rate": 1.940719423497324e-05, + "loss": 0.6625, + "step": 11905 + }, + { + "epoch": 1.9436349536753603, + "grad_norm": 1.5538692474365234, + "learning_rate": 1.9407086595546928e-05, + "loss": 0.6655, + "step": 11906 + }, + { + "epoch": 1.9437982123178645, + "grad_norm": 1.7071088552474976, + "learning_rate": 1.9406978946647676e-05, + "loss": 0.6007, + "step": 11907 + }, + { + "epoch": 1.943961470960369, + "grad_norm": 1.5549540519714355, + "learning_rate": 1.94068712882756e-05, + "loss": 0.525, + "step": 11908 + }, + { + "epoch": 1.9441247296028732, + "grad_norm": 1.5966929197311401, + "learning_rate": 1.9406763620430808e-05, + "loss": 0.5729, + "step": 11909 + }, + { + "epoch": 1.9442879882453776, + "grad_norm": 1.6473256349563599, + "learning_rate": 1.9406655943113408e-05, + "loss": 0.7152, + "step": 11910 + }, + { + "epoch": 1.944451246887882, + "grad_norm": 1.61025071144104, + "learning_rate": 1.940654825632351e-05, + "loss": 0.5859, + "step": 11911 + }, + { + "epoch": 1.9446145055303865, + "grad_norm": 1.5468478202819824, + "learning_rate": 1.9406440560061214e-05, + "loss": 0.6572, + "step": 11912 + }, + { + "epoch": 1.944777764172891, + "grad_norm": 1.7527316808700562, + "learning_rate": 1.9406332854326638e-05, + "loss": 0.8893, + "step": 11913 + }, + { + "epoch": 1.9449410228153954, + "grad_norm": 1.5590773820877075, + "learning_rate": 1.940622513911989e-05, + "loss": 0.6309, + "step": 11914 + }, + { + "epoch": 1.9451042814578998, + "grad_norm": 1.507565975189209, + "learning_rate": 1.940611741444107e-05, + "loss": 0.612, + "step": 11915 + }, + { + "epoch": 1.945267540100404, + "grad_norm": 1.5965625047683716, + "learning_rate": 1.9406009680290298e-05, + "loss": 0.6378, + "step": 11916 + }, + { + "epoch": 1.9454307987429085, + "grad_norm": 1.2325236797332764, + "learning_rate": 1.9405901936667677e-05, + "loss": 0.5645, + "step": 11917 + }, + { + "epoch": 1.9455940573854127, + "grad_norm": 1.8313679695129395, + "learning_rate": 1.9405794183573314e-05, + "loss": 0.654, + "step": 11918 + }, + { + "epoch": 1.9457573160279171, + "grad_norm": 1.9758769273757935, + "learning_rate": 1.9405686421007316e-05, + "loss": 0.8476, + "step": 11919 + }, + { + "epoch": 1.9459205746704216, + "grad_norm": 1.7152938842773438, + "learning_rate": 1.9405578648969796e-05, + "loss": 0.6631, + "step": 11920 + }, + { + "epoch": 1.946083833312926, + "grad_norm": 1.5823169946670532, + "learning_rate": 1.940547086746086e-05, + "loss": 0.6815, + "step": 11921 + }, + { + "epoch": 1.9462470919554304, + "grad_norm": 1.6402860879898071, + "learning_rate": 1.940536307648062e-05, + "loss": 0.6395, + "step": 11922 + }, + { + "epoch": 1.9464103505979349, + "grad_norm": 2.1109843254089355, + "learning_rate": 1.940525527602918e-05, + "loss": 0.8161, + "step": 11923 + }, + { + "epoch": 1.9465736092404393, + "grad_norm": 1.7880148887634277, + "learning_rate": 1.940514746610665e-05, + "loss": 0.7565, + "step": 11924 + }, + { + "epoch": 1.9467368678829435, + "grad_norm": 1.4932868480682373, + "learning_rate": 1.9405039646713136e-05, + "loss": 0.548, + "step": 11925 + }, + { + "epoch": 1.946900126525448, + "grad_norm": 1.547722578048706, + "learning_rate": 1.9404931817848755e-05, + "loss": 0.6436, + "step": 11926 + }, + { + "epoch": 1.9470633851679522, + "grad_norm": 1.4842886924743652, + "learning_rate": 1.940482397951361e-05, + "loss": 0.7105, + "step": 11927 + }, + { + "epoch": 1.9472266438104566, + "grad_norm": 1.9733026027679443, + "learning_rate": 1.9404716131707807e-05, + "loss": 0.816, + "step": 11928 + }, + { + "epoch": 1.947389902452961, + "grad_norm": 1.324195384979248, + "learning_rate": 1.940460827443146e-05, + "loss": 0.5185, + "step": 11929 + }, + { + "epoch": 1.9475531610954655, + "grad_norm": 1.5936919450759888, + "learning_rate": 1.9404500407684673e-05, + "loss": 0.6647, + "step": 11930 + }, + { + "epoch": 1.94771641973797, + "grad_norm": 1.255367398262024, + "learning_rate": 1.9404392531467555e-05, + "loss": 0.5246, + "step": 11931 + }, + { + "epoch": 1.9478796783804744, + "grad_norm": 1.396157145500183, + "learning_rate": 1.940428464578022e-05, + "loss": 0.5967, + "step": 11932 + }, + { + "epoch": 1.9480429370229788, + "grad_norm": 1.4370572566986084, + "learning_rate": 1.940417675062277e-05, + "loss": 0.5791, + "step": 11933 + }, + { + "epoch": 1.948206195665483, + "grad_norm": 1.703366756439209, + "learning_rate": 1.9404068845995317e-05, + "loss": 0.7141, + "step": 11934 + }, + { + "epoch": 1.9483694543079875, + "grad_norm": 1.7182344198226929, + "learning_rate": 1.940396093189797e-05, + "loss": 0.6485, + "step": 11935 + }, + { + "epoch": 1.9485327129504917, + "grad_norm": 1.6210448741912842, + "learning_rate": 1.9403853008330834e-05, + "loss": 0.7121, + "step": 11936 + }, + { + "epoch": 1.9486959715929961, + "grad_norm": 1.9727495908737183, + "learning_rate": 1.9403745075294024e-05, + "loss": 0.6171, + "step": 11937 + }, + { + "epoch": 1.9488592302355006, + "grad_norm": 1.6028192043304443, + "learning_rate": 1.9403637132787644e-05, + "loss": 0.6185, + "step": 11938 + }, + { + "epoch": 1.949022488878005, + "grad_norm": 1.730743646621704, + "learning_rate": 1.9403529180811804e-05, + "loss": 0.6883, + "step": 11939 + }, + { + "epoch": 1.9491857475205094, + "grad_norm": 1.8238142728805542, + "learning_rate": 1.940342121936661e-05, + "loss": 0.67, + "step": 11940 + }, + { + "epoch": 1.9493490061630139, + "grad_norm": 1.81600022315979, + "learning_rate": 1.9403313248452182e-05, + "loss": 0.6244, + "step": 11941 + }, + { + "epoch": 1.949512264805518, + "grad_norm": 1.6514146327972412, + "learning_rate": 1.9403205268068612e-05, + "loss": 0.6431, + "step": 11942 + }, + { + "epoch": 1.9496755234480225, + "grad_norm": 1.6588079929351807, + "learning_rate": 1.9403097278216017e-05, + "loss": 0.7106, + "step": 11943 + }, + { + "epoch": 1.949838782090527, + "grad_norm": 1.656566858291626, + "learning_rate": 1.940298927889451e-05, + "loss": 0.6997, + "step": 11944 + }, + { + "epoch": 1.9500020407330312, + "grad_norm": 1.9318604469299316, + "learning_rate": 1.940288127010419e-05, + "loss": 0.5375, + "step": 11945 + }, + { + "epoch": 1.9501652993755356, + "grad_norm": 1.740936040878296, + "learning_rate": 1.9402773251845175e-05, + "loss": 0.6618, + "step": 11946 + }, + { + "epoch": 1.95032855801804, + "grad_norm": 1.7183349132537842, + "learning_rate": 1.940266522411757e-05, + "loss": 0.7356, + "step": 11947 + }, + { + "epoch": 1.9504918166605445, + "grad_norm": 1.8850504159927368, + "learning_rate": 1.940255718692148e-05, + "loss": 0.7671, + "step": 11948 + }, + { + "epoch": 1.950655075303049, + "grad_norm": 1.2519774436950684, + "learning_rate": 1.9402449140257018e-05, + "loss": 0.4942, + "step": 11949 + }, + { + "epoch": 1.9508183339455534, + "grad_norm": 1.7590252161026, + "learning_rate": 1.9402341084124298e-05, + "loss": 0.6554, + "step": 11950 + }, + { + "epoch": 1.9509815925880576, + "grad_norm": 1.5754247903823853, + "learning_rate": 1.9402233018523417e-05, + "loss": 0.612, + "step": 11951 + }, + { + "epoch": 1.951144851230562, + "grad_norm": 1.5095124244689941, + "learning_rate": 1.940212494345449e-05, + "loss": 0.66, + "step": 11952 + }, + { + "epoch": 1.9513081098730662, + "grad_norm": 1.978344202041626, + "learning_rate": 1.9402016858917628e-05, + "loss": 0.7207, + "step": 11953 + }, + { + "epoch": 1.9514713685155707, + "grad_norm": 2.0964579582214355, + "learning_rate": 1.9401908764912937e-05, + "loss": 0.7026, + "step": 11954 + }, + { + "epoch": 1.9516346271580751, + "grad_norm": 1.8792282342910767, + "learning_rate": 1.9401800661440525e-05, + "loss": 0.802, + "step": 11955 + }, + { + "epoch": 1.9517978858005796, + "grad_norm": 1.7344902753829956, + "learning_rate": 1.9401692548500504e-05, + "loss": 0.6642, + "step": 11956 + }, + { + "epoch": 1.951961144443084, + "grad_norm": 1.9206347465515137, + "learning_rate": 1.9401584426092982e-05, + "loss": 0.8014, + "step": 11957 + }, + { + "epoch": 1.9521244030855884, + "grad_norm": 1.6475178003311157, + "learning_rate": 1.9401476294218065e-05, + "loss": 0.6549, + "step": 11958 + }, + { + "epoch": 1.9522876617280929, + "grad_norm": 1.8357062339782715, + "learning_rate": 1.9401368152875863e-05, + "loss": 0.7624, + "step": 11959 + }, + { + "epoch": 1.952450920370597, + "grad_norm": 1.6990190744400024, + "learning_rate": 1.940126000206649e-05, + "loss": 0.6021, + "step": 11960 + }, + { + "epoch": 1.9526141790131015, + "grad_norm": 1.7167134284973145, + "learning_rate": 1.9401151841790046e-05, + "loss": 0.7366, + "step": 11961 + }, + { + "epoch": 1.9527774376556057, + "grad_norm": 1.4468706846237183, + "learning_rate": 1.9401043672046644e-05, + "loss": 0.6386, + "step": 11962 + }, + { + "epoch": 1.9529406962981102, + "grad_norm": 1.8635231256484985, + "learning_rate": 1.9400935492836398e-05, + "loss": 0.6615, + "step": 11963 + }, + { + "epoch": 1.9531039549406146, + "grad_norm": 1.3481049537658691, + "learning_rate": 1.940082730415941e-05, + "loss": 0.5214, + "step": 11964 + }, + { + "epoch": 1.953267213583119, + "grad_norm": 1.5708295106887817, + "learning_rate": 1.9400719106015793e-05, + "loss": 0.7488, + "step": 11965 + }, + { + "epoch": 1.9534304722256235, + "grad_norm": 1.6124669313430786, + "learning_rate": 1.940061089840565e-05, + "loss": 0.6604, + "step": 11966 + }, + { + "epoch": 1.953593730868128, + "grad_norm": 1.6876554489135742, + "learning_rate": 1.94005026813291e-05, + "loss": 0.6591, + "step": 11967 + }, + { + "epoch": 1.9537569895106324, + "grad_norm": 1.6444077491760254, + "learning_rate": 1.940039445478624e-05, + "loss": 0.6898, + "step": 11968 + }, + { + "epoch": 1.9539202481531366, + "grad_norm": 1.8075302839279175, + "learning_rate": 1.9400286218777192e-05, + "loss": 0.7073, + "step": 11969 + }, + { + "epoch": 1.954083506795641, + "grad_norm": 1.7064119577407837, + "learning_rate": 1.9400177973302054e-05, + "loss": 0.6558, + "step": 11970 + }, + { + "epoch": 1.9542467654381452, + "grad_norm": 1.7711230516433716, + "learning_rate": 1.940006971836094e-05, + "loss": 0.7402, + "step": 11971 + }, + { + "epoch": 1.9544100240806497, + "grad_norm": 1.6719141006469727, + "learning_rate": 1.939996145395396e-05, + "loss": 0.6116, + "step": 11972 + }, + { + "epoch": 1.9545732827231541, + "grad_norm": 1.4994275569915771, + "learning_rate": 1.939985318008122e-05, + "loss": 0.5857, + "step": 11973 + }, + { + "epoch": 1.9547365413656586, + "grad_norm": 1.4007822275161743, + "learning_rate": 1.9399744896742828e-05, + "loss": 0.548, + "step": 11974 + }, + { + "epoch": 1.954899800008163, + "grad_norm": 1.6796579360961914, + "learning_rate": 1.93996366039389e-05, + "loss": 0.6458, + "step": 11975 + }, + { + "epoch": 1.9550630586506674, + "grad_norm": 1.861620545387268, + "learning_rate": 1.939952830166954e-05, + "loss": 0.5731, + "step": 11976 + }, + { + "epoch": 1.9552263172931719, + "grad_norm": 1.6025257110595703, + "learning_rate": 1.9399419989934857e-05, + "loss": 0.7297, + "step": 11977 + }, + { + "epoch": 1.955389575935676, + "grad_norm": 1.6865417957305908, + "learning_rate": 1.9399311668734957e-05, + "loss": 0.6633, + "step": 11978 + }, + { + "epoch": 1.9555528345781805, + "grad_norm": 1.510353922843933, + "learning_rate": 1.9399203338069955e-05, + "loss": 0.5982, + "step": 11979 + }, + { + "epoch": 1.9557160932206847, + "grad_norm": 1.5171585083007812, + "learning_rate": 1.9399094997939957e-05, + "loss": 0.5717, + "step": 11980 + }, + { + "epoch": 1.9558793518631892, + "grad_norm": 1.6111290454864502, + "learning_rate": 1.9398986648345076e-05, + "loss": 0.6715, + "step": 11981 + }, + { + "epoch": 1.9560426105056936, + "grad_norm": 1.4529098272323608, + "learning_rate": 1.9398878289285412e-05, + "loss": 0.5377, + "step": 11982 + }, + { + "epoch": 1.956205869148198, + "grad_norm": 1.8047800064086914, + "learning_rate": 1.9398769920761083e-05, + "loss": 0.5593, + "step": 11983 + }, + { + "epoch": 1.9563691277907025, + "grad_norm": 1.4388797283172607, + "learning_rate": 1.93986615427722e-05, + "loss": 0.5896, + "step": 11984 + }, + { + "epoch": 1.956532386433207, + "grad_norm": 1.6703429222106934, + "learning_rate": 1.939855315531886e-05, + "loss": 0.6144, + "step": 11985 + }, + { + "epoch": 1.9566956450757111, + "grad_norm": 1.8608746528625488, + "learning_rate": 1.9398444758401182e-05, + "loss": 0.7315, + "step": 11986 + }, + { + "epoch": 1.9568589037182156, + "grad_norm": 1.593725323677063, + "learning_rate": 1.9398336352019275e-05, + "loss": 0.6422, + "step": 11987 + }, + { + "epoch": 1.95702216236072, + "grad_norm": 1.5424983501434326, + "learning_rate": 1.9398227936173245e-05, + "loss": 0.6332, + "step": 11988 + }, + { + "epoch": 1.9571854210032242, + "grad_norm": 1.6589657068252563, + "learning_rate": 1.9398119510863197e-05, + "loss": 0.5421, + "step": 11989 + }, + { + "epoch": 1.9573486796457287, + "grad_norm": 1.4812122583389282, + "learning_rate": 1.939801107608925e-05, + "loss": 0.691, + "step": 11990 + }, + { + "epoch": 1.957511938288233, + "grad_norm": 1.9188082218170166, + "learning_rate": 1.9397902631851506e-05, + "loss": 0.7515, + "step": 11991 + }, + { + "epoch": 1.9576751969307375, + "grad_norm": 1.9120335578918457, + "learning_rate": 1.939779417815008e-05, + "loss": 0.8544, + "step": 11992 + }, + { + "epoch": 1.957838455573242, + "grad_norm": 1.6071158647537231, + "learning_rate": 1.9397685714985076e-05, + "loss": 0.69, + "step": 11993 + }, + { + "epoch": 1.9580017142157464, + "grad_norm": 1.5111114978790283, + "learning_rate": 1.9397577242356603e-05, + "loss": 0.6307, + "step": 11994 + }, + { + "epoch": 1.9581649728582506, + "grad_norm": 1.7906697988510132, + "learning_rate": 1.9397468760264774e-05, + "loss": 0.6417, + "step": 11995 + }, + { + "epoch": 1.958328231500755, + "grad_norm": 1.601850152015686, + "learning_rate": 1.93973602687097e-05, + "loss": 0.6505, + "step": 11996 + }, + { + "epoch": 1.9584914901432593, + "grad_norm": 1.5569384098052979, + "learning_rate": 1.939725176769148e-05, + "loss": 0.5183, + "step": 11997 + }, + { + "epoch": 1.9586547487857637, + "grad_norm": 1.4771968126296997, + "learning_rate": 1.939714325721023e-05, + "loss": 0.5916, + "step": 11998 + }, + { + "epoch": 1.9588180074282682, + "grad_norm": 1.8051952123641968, + "learning_rate": 1.9397034737266067e-05, + "loss": 0.7851, + "step": 11999 + }, + { + "epoch": 1.9589812660707726, + "grad_norm": 1.4665353298187256, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.579, + "step": 12000 + }, + { + "epoch": 1.959144524713277, + "grad_norm": 1.7448382377624512, + "learning_rate": 1.9396817668989404e-05, + "loss": 0.6175, + "step": 12001 + }, + { + "epoch": 1.9593077833557815, + "grad_norm": 2.0720083713531494, + "learning_rate": 1.9396709120657128e-05, + "loss": 0.771, + "step": 12002 + }, + { + "epoch": 1.959471041998286, + "grad_norm": 1.5964202880859375, + "learning_rate": 1.9396600562862372e-05, + "loss": 0.608, + "step": 12003 + }, + { + "epoch": 1.9596343006407901, + "grad_norm": 1.5067237615585327, + "learning_rate": 1.939649199560524e-05, + "loss": 0.5785, + "step": 12004 + }, + { + "epoch": 1.9597975592832946, + "grad_norm": 1.605988621711731, + "learning_rate": 1.9396383418885842e-05, + "loss": 0.6657, + "step": 12005 + }, + { + "epoch": 1.9599608179257988, + "grad_norm": 1.1931746006011963, + "learning_rate": 1.939627483270429e-05, + "loss": 0.497, + "step": 12006 + }, + { + "epoch": 1.9601240765683032, + "grad_norm": 1.7894536256790161, + "learning_rate": 1.939616623706069e-05, + "loss": 0.733, + "step": 12007 + }, + { + "epoch": 1.9602873352108077, + "grad_norm": 1.8156068325042725, + "learning_rate": 1.9396057631955156e-05, + "loss": 0.8219, + "step": 12008 + }, + { + "epoch": 1.960450593853312, + "grad_norm": 1.7503235340118408, + "learning_rate": 1.939594901738779e-05, + "loss": 0.6707, + "step": 12009 + }, + { + "epoch": 1.9606138524958165, + "grad_norm": 1.8041573762893677, + "learning_rate": 1.9395840393358707e-05, + "loss": 0.643, + "step": 12010 + }, + { + "epoch": 1.960777111138321, + "grad_norm": 2.050607919692993, + "learning_rate": 1.939573175986802e-05, + "loss": 0.8475, + "step": 12011 + }, + { + "epoch": 1.9609403697808254, + "grad_norm": 1.6523118019104004, + "learning_rate": 1.939562311691583e-05, + "loss": 0.6726, + "step": 12012 + }, + { + "epoch": 1.9611036284233296, + "grad_norm": 1.946329951286316, + "learning_rate": 1.939551446450225e-05, + "loss": 0.6729, + "step": 12013 + }, + { + "epoch": 1.961266887065834, + "grad_norm": 1.5787888765335083, + "learning_rate": 1.939540580262739e-05, + "loss": 0.6382, + "step": 12014 + }, + { + "epoch": 1.9614301457083383, + "grad_norm": 1.4827346801757812, + "learning_rate": 1.9395297131291356e-05, + "loss": 0.5127, + "step": 12015 + }, + { + "epoch": 1.9615934043508427, + "grad_norm": 1.6452802419662476, + "learning_rate": 1.9395188450494264e-05, + "loss": 0.5958, + "step": 12016 + }, + { + "epoch": 1.9617566629933472, + "grad_norm": 1.5317522287368774, + "learning_rate": 1.9395079760236217e-05, + "loss": 0.5678, + "step": 12017 + }, + { + "epoch": 1.9619199216358516, + "grad_norm": 1.7947872877120972, + "learning_rate": 1.939497106051733e-05, + "loss": 0.7603, + "step": 12018 + }, + { + "epoch": 1.962083180278356, + "grad_norm": 1.741202473640442, + "learning_rate": 1.9394862351337705e-05, + "loss": 0.7124, + "step": 12019 + }, + { + "epoch": 1.9622464389208605, + "grad_norm": 1.6219642162322998, + "learning_rate": 1.9394753632697464e-05, + "loss": 0.67, + "step": 12020 + }, + { + "epoch": 1.962409697563365, + "grad_norm": 1.2691242694854736, + "learning_rate": 1.93946449045967e-05, + "loss": 0.5101, + "step": 12021 + }, + { + "epoch": 1.9625729562058691, + "grad_norm": 2.352670907974243, + "learning_rate": 1.9394536167035535e-05, + "loss": 0.8106, + "step": 12022 + }, + { + "epoch": 1.9627362148483736, + "grad_norm": 1.6988601684570312, + "learning_rate": 1.9394427420014076e-05, + "loss": 0.687, + "step": 12023 + }, + { + "epoch": 1.9628994734908778, + "grad_norm": 1.5797284841537476, + "learning_rate": 1.939431866353243e-05, + "loss": 0.678, + "step": 12024 + }, + { + "epoch": 1.9630627321333822, + "grad_norm": 1.591332197189331, + "learning_rate": 1.9394209897590707e-05, + "loss": 0.6662, + "step": 12025 + }, + { + "epoch": 1.9632259907758867, + "grad_norm": 1.634377360343933, + "learning_rate": 1.9394101122189016e-05, + "loss": 0.6491, + "step": 12026 + }, + { + "epoch": 1.963389249418391, + "grad_norm": 1.498908281326294, + "learning_rate": 1.939399233732747e-05, + "loss": 0.5941, + "step": 12027 + }, + { + "epoch": 1.9635525080608955, + "grad_norm": 1.9735352993011475, + "learning_rate": 1.9393883543006176e-05, + "loss": 0.7243, + "step": 12028 + }, + { + "epoch": 1.9637157667034, + "grad_norm": 1.6846818923950195, + "learning_rate": 1.9393774739225243e-05, + "loss": 0.6727, + "step": 12029 + }, + { + "epoch": 1.9638790253459042, + "grad_norm": 1.8270232677459717, + "learning_rate": 1.939366592598478e-05, + "loss": 0.7377, + "step": 12030 + }, + { + "epoch": 1.9640422839884086, + "grad_norm": 1.4123948812484741, + "learning_rate": 1.93935571032849e-05, + "loss": 0.5128, + "step": 12031 + }, + { + "epoch": 1.964205542630913, + "grad_norm": 1.7303087711334229, + "learning_rate": 1.9393448271125706e-05, + "loss": 0.7214, + "step": 12032 + }, + { + "epoch": 1.9643688012734173, + "grad_norm": 1.463138461112976, + "learning_rate": 1.9393339429507317e-05, + "loss": 0.5781, + "step": 12033 + }, + { + "epoch": 1.9645320599159217, + "grad_norm": 1.850738286972046, + "learning_rate": 1.939323057842984e-05, + "loss": 0.8161, + "step": 12034 + }, + { + "epoch": 1.9646953185584262, + "grad_norm": 1.6018259525299072, + "learning_rate": 1.9393121717893376e-05, + "loss": 0.5641, + "step": 12035 + }, + { + "epoch": 1.9648585772009306, + "grad_norm": 1.6531469821929932, + "learning_rate": 1.9393012847898044e-05, + "loss": 0.6972, + "step": 12036 + }, + { + "epoch": 1.965021835843435, + "grad_norm": 1.5803368091583252, + "learning_rate": 1.939290396844395e-05, + "loss": 0.5991, + "step": 12037 + }, + { + "epoch": 1.9651850944859395, + "grad_norm": 1.4351414442062378, + "learning_rate": 1.9392795079531205e-05, + "loss": 0.5981, + "step": 12038 + }, + { + "epoch": 1.9653483531284437, + "grad_norm": 1.5181951522827148, + "learning_rate": 1.9392686181159917e-05, + "loss": 0.7562, + "step": 12039 + }, + { + "epoch": 1.9655116117709481, + "grad_norm": 1.3229573965072632, + "learning_rate": 1.93925772733302e-05, + "loss": 0.5342, + "step": 12040 + }, + { + "epoch": 1.9656748704134523, + "grad_norm": 1.5528271198272705, + "learning_rate": 1.9392468356042155e-05, + "loss": 0.6365, + "step": 12041 + }, + { + "epoch": 1.9658381290559568, + "grad_norm": 1.7845171689987183, + "learning_rate": 1.9392359429295897e-05, + "loss": 0.7221, + "step": 12042 + }, + { + "epoch": 1.9660013876984612, + "grad_norm": 1.6315809488296509, + "learning_rate": 1.9392250493091537e-05, + "loss": 0.622, + "step": 12043 + }, + { + "epoch": 1.9661646463409657, + "grad_norm": 1.6713262796401978, + "learning_rate": 1.9392141547429183e-05, + "loss": 0.6051, + "step": 12044 + }, + { + "epoch": 1.96632790498347, + "grad_norm": 1.6520575284957886, + "learning_rate": 1.9392032592308948e-05, + "loss": 0.5678, + "step": 12045 + }, + { + "epoch": 1.9664911636259745, + "grad_norm": 1.4269263744354248, + "learning_rate": 1.9391923627730936e-05, + "loss": 0.5601, + "step": 12046 + }, + { + "epoch": 1.966654422268479, + "grad_norm": 1.8615847826004028, + "learning_rate": 1.939181465369526e-05, + "loss": 0.7337, + "step": 12047 + }, + { + "epoch": 1.9668176809109832, + "grad_norm": 1.6449984312057495, + "learning_rate": 1.939170567020203e-05, + "loss": 0.6296, + "step": 12048 + }, + { + "epoch": 1.9669809395534876, + "grad_norm": 1.7710739374160767, + "learning_rate": 1.939159667725135e-05, + "loss": 0.5948, + "step": 12049 + }, + { + "epoch": 1.9671441981959918, + "grad_norm": 1.583418369293213, + "learning_rate": 1.939148767484334e-05, + "loss": 0.6202, + "step": 12050 + }, + { + "epoch": 1.9673074568384963, + "grad_norm": 1.7176554203033447, + "learning_rate": 1.93913786629781e-05, + "loss": 0.6994, + "step": 12051 + }, + { + "epoch": 1.9674707154810007, + "grad_norm": 1.4113578796386719, + "learning_rate": 1.939126964165575e-05, + "loss": 0.6283, + "step": 12052 + }, + { + "epoch": 1.9676339741235052, + "grad_norm": 1.5644431114196777, + "learning_rate": 1.939116061087639e-05, + "loss": 0.5194, + "step": 12053 + }, + { + "epoch": 1.9677972327660096, + "grad_norm": 1.4710571765899658, + "learning_rate": 1.9391051570640138e-05, + "loss": 0.687, + "step": 12054 + }, + { + "epoch": 1.967960491408514, + "grad_norm": 1.864527702331543, + "learning_rate": 1.939094252094709e-05, + "loss": 0.6359, + "step": 12055 + }, + { + "epoch": 1.9681237500510185, + "grad_norm": 2.0226376056671143, + "learning_rate": 1.9390833461797377e-05, + "loss": 0.688, + "step": 12056 + }, + { + "epoch": 1.9682870086935227, + "grad_norm": 1.5084041357040405, + "learning_rate": 1.939072439319109e-05, + "loss": 0.5828, + "step": 12057 + }, + { + "epoch": 1.9684502673360271, + "grad_norm": 1.6363697052001953, + "learning_rate": 1.9390615315128347e-05, + "loss": 0.6178, + "step": 12058 + }, + { + "epoch": 1.9686135259785313, + "grad_norm": 1.7761976718902588, + "learning_rate": 1.9390506227609255e-05, + "loss": 0.5727, + "step": 12059 + }, + { + "epoch": 1.9687767846210358, + "grad_norm": 1.7256630659103394, + "learning_rate": 1.9390397130633932e-05, + "loss": 0.5695, + "step": 12060 + }, + { + "epoch": 1.9689400432635402, + "grad_norm": 1.5373591184616089, + "learning_rate": 1.9390288024202476e-05, + "loss": 0.66, + "step": 12061 + }, + { + "epoch": 1.9691033019060447, + "grad_norm": 1.5092006921768188, + "learning_rate": 1.9390178908315004e-05, + "loss": 0.5797, + "step": 12062 + }, + { + "epoch": 1.969266560548549, + "grad_norm": 1.4031505584716797, + "learning_rate": 1.9390069782971626e-05, + "loss": 0.5978, + "step": 12063 + }, + { + "epoch": 1.9694298191910535, + "grad_norm": 1.6241410970687866, + "learning_rate": 1.9389960648172447e-05, + "loss": 0.6566, + "step": 12064 + }, + { + "epoch": 1.969593077833558, + "grad_norm": 1.5984833240509033, + "learning_rate": 1.9389851503917584e-05, + "loss": 0.5981, + "step": 12065 + }, + { + "epoch": 1.9697563364760622, + "grad_norm": 1.8714030981063843, + "learning_rate": 1.938974235020714e-05, + "loss": 0.7991, + "step": 12066 + }, + { + "epoch": 1.9699195951185666, + "grad_norm": 1.4397425651550293, + "learning_rate": 1.938963318704123e-05, + "loss": 0.6206, + "step": 12067 + }, + { + "epoch": 1.9700828537610708, + "grad_norm": 1.7518298625946045, + "learning_rate": 1.938952401441996e-05, + "loss": 0.6507, + "step": 12068 + }, + { + "epoch": 1.9702461124035753, + "grad_norm": 1.352191686630249, + "learning_rate": 1.9389414832343444e-05, + "loss": 0.5693, + "step": 12069 + }, + { + "epoch": 1.9704093710460797, + "grad_norm": 1.6283620595932007, + "learning_rate": 1.938930564081179e-05, + "loss": 0.5533, + "step": 12070 + }, + { + "epoch": 1.9705726296885842, + "grad_norm": 1.182111382484436, + "learning_rate": 1.9389196439825103e-05, + "loss": 0.5118, + "step": 12071 + }, + { + "epoch": 1.9707358883310886, + "grad_norm": 1.5730998516082764, + "learning_rate": 1.93890872293835e-05, + "loss": 0.6207, + "step": 12072 + }, + { + "epoch": 1.970899146973593, + "grad_norm": 1.605098009109497, + "learning_rate": 1.938897800948709e-05, + "loss": 0.7108, + "step": 12073 + }, + { + "epoch": 1.9710624056160975, + "grad_norm": 1.8476513624191284, + "learning_rate": 1.938886878013598e-05, + "loss": 0.8175, + "step": 12074 + }, + { + "epoch": 1.9712256642586017, + "grad_norm": 1.972294569015503, + "learning_rate": 1.9388759541330284e-05, + "loss": 0.7987, + "step": 12075 + }, + { + "epoch": 1.9713889229011061, + "grad_norm": 1.495556116104126, + "learning_rate": 1.938865029307011e-05, + "loss": 0.6187, + "step": 12076 + }, + { + "epoch": 1.9715521815436103, + "grad_norm": 2.112205743789673, + "learning_rate": 1.9388541035355563e-05, + "loss": 0.7068, + "step": 12077 + }, + { + "epoch": 1.9717154401861148, + "grad_norm": 1.800990343093872, + "learning_rate": 1.938843176818676e-05, + "loss": 0.6639, + "step": 12078 + }, + { + "epoch": 1.9718786988286192, + "grad_norm": 1.5970319509506226, + "learning_rate": 1.9388322491563813e-05, + "loss": 0.5971, + "step": 12079 + }, + { + "epoch": 1.9720419574711237, + "grad_norm": 1.7734706401824951, + "learning_rate": 1.9388213205486823e-05, + "loss": 0.7537, + "step": 12080 + }, + { + "epoch": 1.972205216113628, + "grad_norm": 1.7885246276855469, + "learning_rate": 1.9388103909955904e-05, + "loss": 0.6601, + "step": 12081 + }, + { + "epoch": 1.9723684747561325, + "grad_norm": 2.0376458168029785, + "learning_rate": 1.938799460497117e-05, + "loss": 0.7621, + "step": 12082 + }, + { + "epoch": 1.9725317333986367, + "grad_norm": 1.8590543270111084, + "learning_rate": 1.9387885290532728e-05, + "loss": 0.7521, + "step": 12083 + }, + { + "epoch": 1.9726949920411412, + "grad_norm": 2.142286777496338, + "learning_rate": 1.9387775966640683e-05, + "loss": 0.8508, + "step": 12084 + }, + { + "epoch": 1.9728582506836456, + "grad_norm": 1.8114097118377686, + "learning_rate": 1.9387666633295158e-05, + "loss": 0.6396, + "step": 12085 + }, + { + "epoch": 1.9730215093261498, + "grad_norm": 1.8248679637908936, + "learning_rate": 1.9387557290496247e-05, + "loss": 0.6903, + "step": 12086 + }, + { + "epoch": 1.9731847679686543, + "grad_norm": 1.7347304821014404, + "learning_rate": 1.9387447938244076e-05, + "loss": 0.5835, + "step": 12087 + }, + { + "epoch": 1.9733480266111587, + "grad_norm": 1.798732876777649, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.704, + "step": 12088 + }, + { + "epoch": 1.9735112852536632, + "grad_norm": 1.6307047605514526, + "learning_rate": 1.9387229205380362e-05, + "loss": 0.7075, + "step": 12089 + }, + { + "epoch": 1.9736745438961676, + "grad_norm": 1.6454919576644897, + "learning_rate": 1.9387119824769043e-05, + "loss": 0.6971, + "step": 12090 + }, + { + "epoch": 1.973837802538672, + "grad_norm": 1.5941905975341797, + "learning_rate": 1.93870104347049e-05, + "loss": 0.8269, + "step": 12091 + }, + { + "epoch": 1.9740010611811762, + "grad_norm": 2.082308292388916, + "learning_rate": 1.938690103518804e-05, + "loss": 1.4123, + "step": 12092 + }, + { + "epoch": 1.9741643198236807, + "grad_norm": 1.4190151691436768, + "learning_rate": 1.938679162621857e-05, + "loss": 0.5935, + "step": 12093 + }, + { + "epoch": 1.974327578466185, + "grad_norm": 1.720835566520691, + "learning_rate": 1.938668220779661e-05, + "loss": 0.8161, + "step": 12094 + }, + { + "epoch": 1.9744908371086893, + "grad_norm": 1.4237022399902344, + "learning_rate": 1.9386572779922254e-05, + "loss": 0.6986, + "step": 12095 + }, + { + "epoch": 1.9746540957511938, + "grad_norm": 1.5185050964355469, + "learning_rate": 1.938646334259563e-05, + "loss": 0.6821, + "step": 12096 + }, + { + "epoch": 1.9748173543936982, + "grad_norm": 1.6775425672531128, + "learning_rate": 1.9386353895816833e-05, + "loss": 0.6664, + "step": 12097 + }, + { + "epoch": 1.9749806130362026, + "grad_norm": 1.555795431137085, + "learning_rate": 1.9386244439585984e-05, + "loss": 0.6958, + "step": 12098 + }, + { + "epoch": 1.975143871678707, + "grad_norm": 1.7459584474563599, + "learning_rate": 1.938613497390319e-05, + "loss": 0.6423, + "step": 12099 + }, + { + "epoch": 1.9753071303212115, + "grad_norm": 1.8588008880615234, + "learning_rate": 1.938602549876856e-05, + "loss": 0.7026, + "step": 12100 + }, + { + "epoch": 1.9754703889637157, + "grad_norm": 2.1107265949249268, + "learning_rate": 1.9385916014182202e-05, + "loss": 1.4162, + "step": 12101 + }, + { + "epoch": 1.9756336476062202, + "grad_norm": 1.789146900177002, + "learning_rate": 1.9385806520144234e-05, + "loss": 0.6781, + "step": 12102 + }, + { + "epoch": 1.9757969062487244, + "grad_norm": 1.7236100435256958, + "learning_rate": 1.9385697016654756e-05, + "loss": 0.7142, + "step": 12103 + }, + { + "epoch": 1.9759601648912288, + "grad_norm": 1.5753521919250488, + "learning_rate": 1.9385587503713886e-05, + "loss": 0.589, + "step": 12104 + }, + { + "epoch": 1.9761234235337333, + "grad_norm": 1.7589904069900513, + "learning_rate": 1.9385477981321733e-05, + "loss": 0.6734, + "step": 12105 + }, + { + "epoch": 1.9762866821762377, + "grad_norm": 1.6662052869796753, + "learning_rate": 1.9385368449478404e-05, + "loss": 0.6203, + "step": 12106 + }, + { + "epoch": 1.9764499408187421, + "grad_norm": 1.709804892539978, + "learning_rate": 1.9385258908184014e-05, + "loss": 0.6899, + "step": 12107 + }, + { + "epoch": 1.9766131994612466, + "grad_norm": 1.7481664419174194, + "learning_rate": 1.9385149357438667e-05, + "loss": 0.6542, + "step": 12108 + }, + { + "epoch": 1.976776458103751, + "grad_norm": 1.5527619123458862, + "learning_rate": 1.938503979724248e-05, + "loss": 0.6715, + "step": 12109 + }, + { + "epoch": 1.9769397167462552, + "grad_norm": 1.7784616947174072, + "learning_rate": 1.938493022759556e-05, + "loss": 0.6524, + "step": 12110 + }, + { + "epoch": 1.9771029753887597, + "grad_norm": 1.630524754524231, + "learning_rate": 1.938482064849802e-05, + "loss": 0.6704, + "step": 12111 + }, + { + "epoch": 1.977266234031264, + "grad_norm": 1.3653780221939087, + "learning_rate": 1.9384711059949964e-05, + "loss": 0.5405, + "step": 12112 + }, + { + "epoch": 1.9774294926737683, + "grad_norm": 1.605733871459961, + "learning_rate": 1.938460146195151e-05, + "loss": 0.6783, + "step": 12113 + }, + { + "epoch": 1.9775927513162728, + "grad_norm": 1.586165189743042, + "learning_rate": 1.938449185450276e-05, + "loss": 0.6009, + "step": 12114 + }, + { + "epoch": 1.9777560099587772, + "grad_norm": 1.5338736772537231, + "learning_rate": 1.9384382237603832e-05, + "loss": 0.5675, + "step": 12115 + }, + { + "epoch": 1.9779192686012816, + "grad_norm": 1.5550426244735718, + "learning_rate": 1.9384272611254835e-05, + "loss": 0.6254, + "step": 12116 + }, + { + "epoch": 1.978082527243786, + "grad_norm": 1.650942325592041, + "learning_rate": 1.9384162975455877e-05, + "loss": 0.7351, + "step": 12117 + }, + { + "epoch": 1.9782457858862905, + "grad_norm": 1.7282867431640625, + "learning_rate": 1.9384053330207068e-05, + "loss": 0.693, + "step": 12118 + }, + { + "epoch": 1.9784090445287947, + "grad_norm": 1.9156718254089355, + "learning_rate": 1.938394367550852e-05, + "loss": 0.6331, + "step": 12119 + }, + { + "epoch": 1.9785723031712992, + "grad_norm": 1.570723533630371, + "learning_rate": 1.9383834011360347e-05, + "loss": 0.5542, + "step": 12120 + }, + { + "epoch": 1.9787355618138034, + "grad_norm": 1.5698423385620117, + "learning_rate": 1.938372433776265e-05, + "loss": 0.6428, + "step": 12121 + }, + { + "epoch": 1.9788988204563078, + "grad_norm": 1.5610013008117676, + "learning_rate": 1.9383614654715547e-05, + "loss": 0.6275, + "step": 12122 + }, + { + "epoch": 1.9790620790988123, + "grad_norm": 1.7522907257080078, + "learning_rate": 1.938350496221915e-05, + "loss": 0.6258, + "step": 12123 + }, + { + "epoch": 1.9792253377413167, + "grad_norm": 1.4422379732131958, + "learning_rate": 1.9383395260273564e-05, + "loss": 0.7006, + "step": 12124 + }, + { + "epoch": 1.9793885963838211, + "grad_norm": 1.5264736413955688, + "learning_rate": 1.93832855488789e-05, + "loss": 0.5363, + "step": 12125 + }, + { + "epoch": 1.9795518550263256, + "grad_norm": 1.7550451755523682, + "learning_rate": 1.938317582803527e-05, + "loss": 0.6404, + "step": 12126 + }, + { + "epoch": 1.9797151136688298, + "grad_norm": 1.7797822952270508, + "learning_rate": 1.9383066097742785e-05, + "loss": 0.5658, + "step": 12127 + }, + { + "epoch": 1.9798783723113342, + "grad_norm": 1.6075304746627808, + "learning_rate": 1.9382956358001555e-05, + "loss": 0.7103, + "step": 12128 + }, + { + "epoch": 1.9800416309538387, + "grad_norm": 1.686180830001831, + "learning_rate": 1.9382846608811693e-05, + "loss": 0.6425, + "step": 12129 + }, + { + "epoch": 1.9802048895963429, + "grad_norm": 2.1154184341430664, + "learning_rate": 1.9382736850173303e-05, + "loss": 0.7659, + "step": 12130 + }, + { + "epoch": 1.9803681482388473, + "grad_norm": 1.4713122844696045, + "learning_rate": 1.9382627082086502e-05, + "loss": 0.6419, + "step": 12131 + }, + { + "epoch": 1.9805314068813518, + "grad_norm": 1.5829592943191528, + "learning_rate": 1.9382517304551397e-05, + "loss": 0.5693, + "step": 12132 + }, + { + "epoch": 1.9806946655238562, + "grad_norm": 1.9809218645095825, + "learning_rate": 1.9382407517568097e-05, + "loss": 0.8544, + "step": 12133 + }, + { + "epoch": 1.9808579241663606, + "grad_norm": 1.4909061193466187, + "learning_rate": 1.938229772113672e-05, + "loss": 0.5676, + "step": 12134 + }, + { + "epoch": 1.981021182808865, + "grad_norm": 1.5948035717010498, + "learning_rate": 1.938218791525737e-05, + "loss": 0.6318, + "step": 12135 + }, + { + "epoch": 1.9811844414513693, + "grad_norm": 1.7518815994262695, + "learning_rate": 1.9382078099930157e-05, + "loss": 0.805, + "step": 12136 + }, + { + "epoch": 1.9813477000938737, + "grad_norm": 1.9464157819747925, + "learning_rate": 1.9381968275155195e-05, + "loss": 0.6807, + "step": 12137 + }, + { + "epoch": 1.981510958736378, + "grad_norm": 1.6631522178649902, + "learning_rate": 1.9381858440932592e-05, + "loss": 0.6681, + "step": 12138 + }, + { + "epoch": 1.9816742173788824, + "grad_norm": 1.3863495588302612, + "learning_rate": 1.9381748597262463e-05, + "loss": 0.512, + "step": 12139 + }, + { + "epoch": 1.9818374760213868, + "grad_norm": 1.447217583656311, + "learning_rate": 1.9381638744144914e-05, + "loss": 0.5365, + "step": 12140 + }, + { + "epoch": 1.9820007346638913, + "grad_norm": 1.1956762075424194, + "learning_rate": 1.938152888158006e-05, + "loss": 0.492, + "step": 12141 + }, + { + "epoch": 1.9821639933063957, + "grad_norm": 1.561561107635498, + "learning_rate": 1.938141900956801e-05, + "loss": 0.6441, + "step": 12142 + }, + { + "epoch": 1.9823272519489001, + "grad_norm": 1.634190320968628, + "learning_rate": 1.9381309128108866e-05, + "loss": 0.6629, + "step": 12143 + }, + { + "epoch": 1.9824905105914046, + "grad_norm": 1.5614255666732788, + "learning_rate": 1.938119923720275e-05, + "loss": 0.6071, + "step": 12144 + }, + { + "epoch": 1.9826537692339088, + "grad_norm": 1.8012393712997437, + "learning_rate": 1.9381089336849773e-05, + "loss": 0.5887, + "step": 12145 + }, + { + "epoch": 1.9828170278764132, + "grad_norm": 1.8545680046081543, + "learning_rate": 1.9380979427050036e-05, + "loss": 0.6725, + "step": 12146 + }, + { + "epoch": 1.9829802865189174, + "grad_norm": 1.6923454999923706, + "learning_rate": 1.9380869507803656e-05, + "loss": 0.7273, + "step": 12147 + }, + { + "epoch": 1.9831435451614219, + "grad_norm": 1.7317448854446411, + "learning_rate": 1.9380759579110745e-05, + "loss": 0.7501, + "step": 12148 + }, + { + "epoch": 1.9833068038039263, + "grad_norm": 2.078296184539795, + "learning_rate": 1.938064964097141e-05, + "loss": 0.9509, + "step": 12149 + }, + { + "epoch": 1.9834700624464308, + "grad_norm": 1.6791608333587646, + "learning_rate": 1.9380539693385763e-05, + "loss": 0.5995, + "step": 12150 + }, + { + "epoch": 1.9836333210889352, + "grad_norm": 1.5884119272232056, + "learning_rate": 1.9380429736353918e-05, + "loss": 0.6409, + "step": 12151 + }, + { + "epoch": 1.9837965797314396, + "grad_norm": 1.6735621690750122, + "learning_rate": 1.938031976987598e-05, + "loss": 0.7408, + "step": 12152 + }, + { + "epoch": 1.983959838373944, + "grad_norm": 1.7484724521636963, + "learning_rate": 1.938020979395206e-05, + "loss": 0.7344, + "step": 12153 + }, + { + "epoch": 1.9841230970164483, + "grad_norm": 1.7733862400054932, + "learning_rate": 1.9380099808582278e-05, + "loss": 0.5507, + "step": 12154 + }, + { + "epoch": 1.9842863556589527, + "grad_norm": 1.4923404455184937, + "learning_rate": 1.9379989813766732e-05, + "loss": 0.5817, + "step": 12155 + }, + { + "epoch": 1.984449614301457, + "grad_norm": 1.462174415588379, + "learning_rate": 1.9379879809505542e-05, + "loss": 0.5973, + "step": 12156 + }, + { + "epoch": 1.9846128729439614, + "grad_norm": 1.5134918689727783, + "learning_rate": 1.9379769795798817e-05, + "loss": 0.6221, + "step": 12157 + }, + { + "epoch": 1.9847761315864658, + "grad_norm": 1.8579589128494263, + "learning_rate": 1.937965977264666e-05, + "loss": 0.5809, + "step": 12158 + }, + { + "epoch": 1.9849393902289703, + "grad_norm": 1.865466833114624, + "learning_rate": 1.9379549740049197e-05, + "loss": 0.7381, + "step": 12159 + }, + { + "epoch": 1.9851026488714747, + "grad_norm": 1.8792990446090698, + "learning_rate": 1.9379439698006522e-05, + "loss": 0.6755, + "step": 12160 + }, + { + "epoch": 1.9852659075139791, + "grad_norm": 1.6961243152618408, + "learning_rate": 1.9379329646518756e-05, + "loss": 0.6004, + "step": 12161 + }, + { + "epoch": 1.9854291661564836, + "grad_norm": 1.8712011575698853, + "learning_rate": 1.9379219585586007e-05, + "loss": 0.6596, + "step": 12162 + }, + { + "epoch": 1.9855924247989878, + "grad_norm": 1.4411910772323608, + "learning_rate": 1.9379109515208387e-05, + "loss": 0.5271, + "step": 12163 + }, + { + "epoch": 1.9857556834414922, + "grad_norm": 1.598284125328064, + "learning_rate": 1.937899943538601e-05, + "loss": 0.6606, + "step": 12164 + }, + { + "epoch": 1.9859189420839964, + "grad_norm": 1.925343632698059, + "learning_rate": 1.937888934611898e-05, + "loss": 0.7625, + "step": 12165 + }, + { + "epoch": 1.9860822007265009, + "grad_norm": 1.6749473810195923, + "learning_rate": 1.9378779247407408e-05, + "loss": 0.6631, + "step": 12166 + }, + { + "epoch": 1.9862454593690053, + "grad_norm": 1.9570813179016113, + "learning_rate": 1.937866913925141e-05, + "loss": 0.6842, + "step": 12167 + }, + { + "epoch": 1.9864087180115098, + "grad_norm": 1.6535437107086182, + "learning_rate": 1.9378559021651096e-05, + "loss": 0.6573, + "step": 12168 + }, + { + "epoch": 1.9865719766540142, + "grad_norm": 1.5824257135391235, + "learning_rate": 1.9378448894606575e-05, + "loss": 0.7661, + "step": 12169 + }, + { + "epoch": 1.9867352352965186, + "grad_norm": 1.619402527809143, + "learning_rate": 1.937833875811796e-05, + "loss": 0.6753, + "step": 12170 + }, + { + "epoch": 1.9868984939390228, + "grad_norm": 1.6263985633850098, + "learning_rate": 1.9378228612185358e-05, + "loss": 0.6029, + "step": 12171 + }, + { + "epoch": 1.9870617525815273, + "grad_norm": 1.5036473274230957, + "learning_rate": 1.937811845680888e-05, + "loss": 0.6215, + "step": 12172 + }, + { + "epoch": 1.9872250112240317, + "grad_norm": 1.5280168056488037, + "learning_rate": 1.9378008291988644e-05, + "loss": 0.578, + "step": 12173 + }, + { + "epoch": 1.987388269866536, + "grad_norm": 2.04551362991333, + "learning_rate": 1.9377898117724753e-05, + "loss": 0.8163, + "step": 12174 + }, + { + "epoch": 1.9875515285090404, + "grad_norm": 1.5004938840866089, + "learning_rate": 1.9377787934017323e-05, + "loss": 0.661, + "step": 12175 + }, + { + "epoch": 1.9877147871515448, + "grad_norm": 1.6754908561706543, + "learning_rate": 1.937767774086646e-05, + "loss": 0.6899, + "step": 12176 + }, + { + "epoch": 1.9878780457940493, + "grad_norm": 2.010058879852295, + "learning_rate": 1.9377567538272283e-05, + "loss": 0.7004, + "step": 12177 + }, + { + "epoch": 1.9880413044365537, + "grad_norm": 2.1062633991241455, + "learning_rate": 1.9377457326234893e-05, + "loss": 0.8205, + "step": 12178 + }, + { + "epoch": 1.9882045630790581, + "grad_norm": 1.7984371185302734, + "learning_rate": 1.9377347104754408e-05, + "loss": 0.735, + "step": 12179 + }, + { + "epoch": 1.9883678217215623, + "grad_norm": 1.6382217407226562, + "learning_rate": 1.937723687383094e-05, + "loss": 0.5169, + "step": 12180 + }, + { + "epoch": 1.9885310803640668, + "grad_norm": 1.5597251653671265, + "learning_rate": 1.937712663346459e-05, + "loss": 0.6359, + "step": 12181 + }, + { + "epoch": 1.988694339006571, + "grad_norm": 1.7302318811416626, + "learning_rate": 1.9377016383655484e-05, + "loss": 0.6939, + "step": 12182 + }, + { + "epoch": 1.9888575976490754, + "grad_norm": 1.4742039442062378, + "learning_rate": 1.937690612440372e-05, + "loss": 0.7445, + "step": 12183 + }, + { + "epoch": 1.9890208562915799, + "grad_norm": 1.6477793455123901, + "learning_rate": 1.9376795855709414e-05, + "loss": 0.674, + "step": 12184 + }, + { + "epoch": 1.9891841149340843, + "grad_norm": 1.4963077306747437, + "learning_rate": 1.9376685577572677e-05, + "loss": 0.6402, + "step": 12185 + }, + { + "epoch": 1.9893473735765888, + "grad_norm": 1.5093870162963867, + "learning_rate": 1.9376575289993624e-05, + "loss": 0.5624, + "step": 12186 + }, + { + "epoch": 1.9895106322190932, + "grad_norm": 1.3610533475875854, + "learning_rate": 1.9376464992972358e-05, + "loss": 0.5352, + "step": 12187 + }, + { + "epoch": 1.9896738908615976, + "grad_norm": 1.4411801099777222, + "learning_rate": 1.9376354686508996e-05, + "loss": 0.5623, + "step": 12188 + }, + { + "epoch": 1.9898371495041018, + "grad_norm": 1.5642800331115723, + "learning_rate": 1.9376244370603646e-05, + "loss": 0.6226, + "step": 12189 + }, + { + "epoch": 1.9900004081466063, + "grad_norm": 1.593117594718933, + "learning_rate": 1.9376134045256424e-05, + "loss": 0.5637, + "step": 12190 + }, + { + "epoch": 1.9901636667891105, + "grad_norm": 1.8389207124710083, + "learning_rate": 1.9376023710467433e-05, + "loss": 0.7843, + "step": 12191 + }, + { + "epoch": 1.990326925431615, + "grad_norm": 1.5381425619125366, + "learning_rate": 1.9375913366236793e-05, + "loss": 0.5866, + "step": 12192 + }, + { + "epoch": 1.9904901840741194, + "grad_norm": 1.5661377906799316, + "learning_rate": 1.9375803012564605e-05, + "loss": 0.4959, + "step": 12193 + }, + { + "epoch": 1.9906534427166238, + "grad_norm": 1.8130446672439575, + "learning_rate": 1.937569264945099e-05, + "loss": 0.8026, + "step": 12194 + }, + { + "epoch": 1.9908167013591282, + "grad_norm": 1.7404340505599976, + "learning_rate": 1.9375582276896052e-05, + "loss": 0.6427, + "step": 12195 + }, + { + "epoch": 1.9909799600016327, + "grad_norm": 1.7790809869766235, + "learning_rate": 1.937547189489991e-05, + "loss": 0.6589, + "step": 12196 + }, + { + "epoch": 1.9911432186441371, + "grad_norm": 1.2181130647659302, + "learning_rate": 1.9375361503462666e-05, + "loss": 0.4636, + "step": 12197 + }, + { + "epoch": 1.9913064772866413, + "grad_norm": 1.6261563301086426, + "learning_rate": 1.9375251102584438e-05, + "loss": 0.6624, + "step": 12198 + }, + { + "epoch": 1.9914697359291458, + "grad_norm": 1.4645826816558838, + "learning_rate": 1.9375140692265333e-05, + "loss": 0.6294, + "step": 12199 + }, + { + "epoch": 1.99163299457165, + "grad_norm": 1.4746330976486206, + "learning_rate": 1.9375030272505463e-05, + "loss": 0.6459, + "step": 12200 + }, + { + "epoch": 1.9917962532141544, + "grad_norm": 1.9358316659927368, + "learning_rate": 1.9374919843304944e-05, + "loss": 0.6549, + "step": 12201 + }, + { + "epoch": 1.9919595118566589, + "grad_norm": 1.4916775226593018, + "learning_rate": 1.937480940466388e-05, + "loss": 0.588, + "step": 12202 + }, + { + "epoch": 1.9921227704991633, + "grad_norm": 1.6753145456314087, + "learning_rate": 1.9374698956582385e-05, + "loss": 0.6789, + "step": 12203 + }, + { + "epoch": 1.9922860291416677, + "grad_norm": 1.6334155797958374, + "learning_rate": 1.937458849906057e-05, + "loss": 0.581, + "step": 12204 + }, + { + "epoch": 1.9924492877841722, + "grad_norm": 2.046504259109497, + "learning_rate": 1.937447803209855e-05, + "loss": 0.7244, + "step": 12205 + }, + { + "epoch": 1.9926125464266766, + "grad_norm": 1.638554334640503, + "learning_rate": 1.937436755569643e-05, + "loss": 0.749, + "step": 12206 + }, + { + "epoch": 1.9927758050691808, + "grad_norm": 1.4775389432907104, + "learning_rate": 1.9374257069854327e-05, + "loss": 0.5999, + "step": 12207 + }, + { + "epoch": 1.9929390637116853, + "grad_norm": 1.6128982305526733, + "learning_rate": 1.937414657457235e-05, + "loss": 0.5861, + "step": 12208 + }, + { + "epoch": 1.9931023223541895, + "grad_norm": 1.7144010066986084, + "learning_rate": 1.937403606985061e-05, + "loss": 0.7138, + "step": 12209 + }, + { + "epoch": 1.993265580996694, + "grad_norm": 1.6843595504760742, + "learning_rate": 1.9373925555689212e-05, + "loss": 0.6638, + "step": 12210 + }, + { + "epoch": 1.9934288396391984, + "grad_norm": 1.7137644290924072, + "learning_rate": 1.937381503208828e-05, + "loss": 0.6537, + "step": 12211 + }, + { + "epoch": 1.9935920982817028, + "grad_norm": 2.0475170612335205, + "learning_rate": 1.9373704499047918e-05, + "loss": 0.8825, + "step": 12212 + }, + { + "epoch": 1.9937553569242072, + "grad_norm": 1.7538343667984009, + "learning_rate": 1.9373593956568234e-05, + "loss": 0.697, + "step": 12213 + }, + { + "epoch": 1.9939186155667117, + "grad_norm": 1.7188959121704102, + "learning_rate": 1.9373483404649347e-05, + "loss": 0.5923, + "step": 12214 + }, + { + "epoch": 1.994081874209216, + "grad_norm": 1.9927308559417725, + "learning_rate": 1.9373372843291366e-05, + "loss": 0.7649, + "step": 12215 + }, + { + "epoch": 1.9942451328517203, + "grad_norm": 1.2580665349960327, + "learning_rate": 1.9373262272494398e-05, + "loss": 0.4279, + "step": 12216 + }, + { + "epoch": 1.9944083914942248, + "grad_norm": 1.657604455947876, + "learning_rate": 1.9373151692258556e-05, + "loss": 0.7364, + "step": 12217 + }, + { + "epoch": 1.994571650136729, + "grad_norm": 1.3979759216308594, + "learning_rate": 1.9373041102583955e-05, + "loss": 0.5351, + "step": 12218 + }, + { + "epoch": 1.9947349087792334, + "grad_norm": 1.6789911985397339, + "learning_rate": 1.9372930503470706e-05, + "loss": 0.5724, + "step": 12219 + }, + { + "epoch": 1.9948981674217379, + "grad_norm": 1.469632863998413, + "learning_rate": 1.937281989491892e-05, + "loss": 0.6318, + "step": 12220 + }, + { + "epoch": 1.9950614260642423, + "grad_norm": 1.9700437784194946, + "learning_rate": 1.93727092769287e-05, + "loss": 0.6045, + "step": 12221 + }, + { + "epoch": 1.9952246847067467, + "grad_norm": 1.65242338180542, + "learning_rate": 1.9372598649500167e-05, + "loss": 0.5749, + "step": 12222 + }, + { + "epoch": 1.9953879433492512, + "grad_norm": 1.6727186441421509, + "learning_rate": 1.9372488012633434e-05, + "loss": 0.6501, + "step": 12223 + }, + { + "epoch": 1.9955512019917554, + "grad_norm": 1.6649644374847412, + "learning_rate": 1.9372377366328602e-05, + "loss": 0.5341, + "step": 12224 + }, + { + "epoch": 1.9957144606342598, + "grad_norm": 1.7199417352676392, + "learning_rate": 1.937226671058579e-05, + "loss": 0.6127, + "step": 12225 + }, + { + "epoch": 1.995877719276764, + "grad_norm": 1.600156307220459, + "learning_rate": 1.937215604540511e-05, + "loss": 0.6201, + "step": 12226 + }, + { + "epoch": 1.9960409779192685, + "grad_norm": 1.7136313915252686, + "learning_rate": 1.937204537078667e-05, + "loss": 0.6862, + "step": 12227 + }, + { + "epoch": 1.996204236561773, + "grad_norm": 1.6513633728027344, + "learning_rate": 1.9371934686730583e-05, + "loss": 0.6691, + "step": 12228 + }, + { + "epoch": 1.9963674952042774, + "grad_norm": 1.8737003803253174, + "learning_rate": 1.937182399323696e-05, + "loss": 0.6682, + "step": 12229 + }, + { + "epoch": 1.9965307538467818, + "grad_norm": 1.7241592407226562, + "learning_rate": 1.937171329030591e-05, + "loss": 0.8136, + "step": 12230 + }, + { + "epoch": 1.9966940124892862, + "grad_norm": 1.7646976709365845, + "learning_rate": 1.9371602577937554e-05, + "loss": 0.7151, + "step": 12231 + }, + { + "epoch": 1.9968572711317907, + "grad_norm": 1.985079288482666, + "learning_rate": 1.9371491856131993e-05, + "loss": 0.7016, + "step": 12232 + }, + { + "epoch": 1.997020529774295, + "grad_norm": 1.5202877521514893, + "learning_rate": 1.937138112488934e-05, + "loss": 0.6558, + "step": 12233 + }, + { + "epoch": 1.9971837884167993, + "grad_norm": 1.837958812713623, + "learning_rate": 1.9371270384209713e-05, + "loss": 0.7943, + "step": 12234 + }, + { + "epoch": 1.9973470470593035, + "grad_norm": 1.968964695930481, + "learning_rate": 1.9371159634093214e-05, + "loss": 0.7701, + "step": 12235 + }, + { + "epoch": 1.997510305701808, + "grad_norm": 1.9323967695236206, + "learning_rate": 1.9371048874539965e-05, + "loss": 0.6578, + "step": 12236 + }, + { + "epoch": 1.9976735643443124, + "grad_norm": 1.8352723121643066, + "learning_rate": 1.9370938105550068e-05, + "loss": 1.06, + "step": 12237 + }, + { + "epoch": 1.9978368229868169, + "grad_norm": 1.5047938823699951, + "learning_rate": 1.937082732712364e-05, + "loss": 0.5885, + "step": 12238 + }, + { + "epoch": 1.9980000816293213, + "grad_norm": 1.8986798524856567, + "learning_rate": 1.9370716539260793e-05, + "loss": 0.8025, + "step": 12239 + }, + { + "epoch": 1.9981633402718257, + "grad_norm": 1.7441338300704956, + "learning_rate": 1.9370605741961638e-05, + "loss": 0.7996, + "step": 12240 + }, + { + "epoch": 1.9983265989143302, + "grad_norm": 1.7328318357467651, + "learning_rate": 1.9370494935226283e-05, + "loss": 0.7662, + "step": 12241 + }, + { + "epoch": 1.9984898575568344, + "grad_norm": 1.4956060647964478, + "learning_rate": 1.937038411905484e-05, + "loss": 0.7831, + "step": 12242 + }, + { + "epoch": 1.9986531161993388, + "grad_norm": 1.570144772529602, + "learning_rate": 1.937027329344743e-05, + "loss": 0.5667, + "step": 12243 + }, + { + "epoch": 1.998816374841843, + "grad_norm": 1.3083720207214355, + "learning_rate": 1.937016245840415e-05, + "loss": 0.4757, + "step": 12244 + }, + { + "epoch": 1.9989796334843475, + "grad_norm": 1.7528181076049805, + "learning_rate": 1.9370051613925122e-05, + "loss": 0.8001, + "step": 12245 + }, + { + "epoch": 1.999142892126852, + "grad_norm": 1.518257737159729, + "learning_rate": 1.9369940760010454e-05, + "loss": 0.5588, + "step": 12246 + }, + { + "epoch": 1.9993061507693564, + "grad_norm": 1.6943553686141968, + "learning_rate": 1.9369829896660257e-05, + "loss": 0.7421, + "step": 12247 + }, + { + "epoch": 1.9994694094118608, + "grad_norm": 1.6327831745147705, + "learning_rate": 1.9369719023874644e-05, + "loss": 0.6137, + "step": 12248 + }, + { + "epoch": 1.9996326680543652, + "grad_norm": 1.5350393056869507, + "learning_rate": 1.9369608141653728e-05, + "loss": 0.6702, + "step": 12249 + }, + { + "epoch": 1.9997959266968697, + "grad_norm": 1.4994316101074219, + "learning_rate": 1.936949724999762e-05, + "loss": 0.6092, + "step": 12250 + }, + { + "epoch": 1.999959185339374, + "grad_norm": 1.688586950302124, + "learning_rate": 1.936938634890643e-05, + "loss": 0.6434, + "step": 12251 + }, + { + "epoch": 2.0, + "grad_norm": 3.8284318447113037, + "learning_rate": 1.9369275438380268e-05, + "loss": 1.0179, + "step": 12252 + }, + { + "epoch": 2.0001632586425044, + "grad_norm": 1.525017499923706, + "learning_rate": 1.936916451841925e-05, + "loss": 0.6397, + "step": 12253 + }, + { + "epoch": 2.000326517285009, + "grad_norm": 1.4315767288208008, + "learning_rate": 1.9369053589023485e-05, + "loss": 0.617, + "step": 12254 + }, + { + "epoch": 2.0004897759275133, + "grad_norm": 1.3437190055847168, + "learning_rate": 1.9368942650193087e-05, + "loss": 0.5643, + "step": 12255 + }, + { + "epoch": 2.0006530345700178, + "grad_norm": 1.2458715438842773, + "learning_rate": 1.9368831701928165e-05, + "loss": 0.5059, + "step": 12256 + }, + { + "epoch": 2.0008162932125217, + "grad_norm": 1.2643648386001587, + "learning_rate": 1.9368720744228834e-05, + "loss": 0.5238, + "step": 12257 + }, + { + "epoch": 2.000979551855026, + "grad_norm": 1.5440683364868164, + "learning_rate": 1.93686097770952e-05, + "loss": 0.7249, + "step": 12258 + }, + { + "epoch": 2.0011428104975306, + "grad_norm": 1.7303239107131958, + "learning_rate": 1.936849880052738e-05, + "loss": 0.7757, + "step": 12259 + }, + { + "epoch": 2.001306069140035, + "grad_norm": 1.387161135673523, + "learning_rate": 1.9368387814525483e-05, + "loss": 0.6196, + "step": 12260 + }, + { + "epoch": 2.0014693277825395, + "grad_norm": 1.632778286933899, + "learning_rate": 1.9368276819089625e-05, + "loss": 0.6434, + "step": 12261 + }, + { + "epoch": 2.001632586425044, + "grad_norm": 1.4366050958633423, + "learning_rate": 1.9368165814219914e-05, + "loss": 0.6531, + "step": 12262 + }, + { + "epoch": 2.0017958450675484, + "grad_norm": 1.8104180097579956, + "learning_rate": 1.936805479991646e-05, + "loss": 0.7032, + "step": 12263 + }, + { + "epoch": 2.001959103710053, + "grad_norm": 1.5994017124176025, + "learning_rate": 1.936794377617938e-05, + "loss": 0.6576, + "step": 12264 + }, + { + "epoch": 2.0021223623525572, + "grad_norm": 1.330633521080017, + "learning_rate": 1.9367832743008782e-05, + "loss": 0.5244, + "step": 12265 + }, + { + "epoch": 2.0022856209950612, + "grad_norm": 1.5582550764083862, + "learning_rate": 1.9367721700404776e-05, + "loss": 0.5558, + "step": 12266 + }, + { + "epoch": 2.0024488796375657, + "grad_norm": 1.4332902431488037, + "learning_rate": 1.9367610648367483e-05, + "loss": 0.4751, + "step": 12267 + }, + { + "epoch": 2.00261213828007, + "grad_norm": 1.9560657739639282, + "learning_rate": 1.9367499586897004e-05, + "loss": 0.5831, + "step": 12268 + }, + { + "epoch": 2.0027753969225746, + "grad_norm": 2.0147335529327393, + "learning_rate": 1.9367388515993458e-05, + "loss": 0.7942, + "step": 12269 + }, + { + "epoch": 2.002938655565079, + "grad_norm": 1.9275023937225342, + "learning_rate": 1.9367277435656953e-05, + "loss": 0.6588, + "step": 12270 + }, + { + "epoch": 2.0031019142075834, + "grad_norm": 1.6925292015075684, + "learning_rate": 1.93671663458876e-05, + "loss": 0.6191, + "step": 12271 + }, + { + "epoch": 2.003265172850088, + "grad_norm": 1.3208918571472168, + "learning_rate": 1.9367055246685518e-05, + "loss": 0.55, + "step": 12272 + }, + { + "epoch": 2.0034284314925923, + "grad_norm": 1.4352906942367554, + "learning_rate": 1.936694413805081e-05, + "loss": 0.5203, + "step": 12273 + }, + { + "epoch": 2.0035916901350967, + "grad_norm": 1.5033046007156372, + "learning_rate": 1.9366833019983594e-05, + "loss": 0.6316, + "step": 12274 + }, + { + "epoch": 2.0037549487776007, + "grad_norm": 1.9667242765426636, + "learning_rate": 1.9366721892483976e-05, + "loss": 0.673, + "step": 12275 + }, + { + "epoch": 2.003918207420105, + "grad_norm": 1.6786645650863647, + "learning_rate": 1.9366610755552077e-05, + "loss": 0.5257, + "step": 12276 + }, + { + "epoch": 2.0040814660626096, + "grad_norm": 1.7024633884429932, + "learning_rate": 1.9366499609188e-05, + "loss": 0.6784, + "step": 12277 + }, + { + "epoch": 2.004244724705114, + "grad_norm": 1.5858862400054932, + "learning_rate": 1.936638845339186e-05, + "loss": 0.6209, + "step": 12278 + }, + { + "epoch": 2.0044079833476185, + "grad_norm": 1.7487678527832031, + "learning_rate": 1.9366277288163768e-05, + "loss": 0.5504, + "step": 12279 + }, + { + "epoch": 2.004571241990123, + "grad_norm": 1.7262192964553833, + "learning_rate": 1.9366166113503843e-05, + "loss": 0.5333, + "step": 12280 + }, + { + "epoch": 2.0047345006326274, + "grad_norm": 1.573883295059204, + "learning_rate": 1.9366054929412185e-05, + "loss": 0.558, + "step": 12281 + }, + { + "epoch": 2.004897759275132, + "grad_norm": 1.6428101062774658, + "learning_rate": 1.9365943735888914e-05, + "loss": 0.6364, + "step": 12282 + }, + { + "epoch": 2.005061017917636, + "grad_norm": 1.5487192869186401, + "learning_rate": 1.936583253293414e-05, + "loss": 0.5481, + "step": 12283 + }, + { + "epoch": 2.0052242765601402, + "grad_norm": 1.6665948629379272, + "learning_rate": 1.9365721320547978e-05, + "loss": 0.5613, + "step": 12284 + }, + { + "epoch": 2.0053875352026447, + "grad_norm": 1.5535932779312134, + "learning_rate": 1.9365610098730533e-05, + "loss": 0.5116, + "step": 12285 + }, + { + "epoch": 2.005550793845149, + "grad_norm": 1.7239588499069214, + "learning_rate": 1.9365498867481926e-05, + "loss": 0.5803, + "step": 12286 + }, + { + "epoch": 2.0057140524876536, + "grad_norm": 1.7922890186309814, + "learning_rate": 1.936538762680226e-05, + "loss": 0.5568, + "step": 12287 + }, + { + "epoch": 2.005877311130158, + "grad_norm": 1.456236481666565, + "learning_rate": 1.9365276376691652e-05, + "loss": 0.5147, + "step": 12288 + }, + { + "epoch": 2.0060405697726624, + "grad_norm": 2.087960958480835, + "learning_rate": 1.9365165117150213e-05, + "loss": 0.647, + "step": 12289 + }, + { + "epoch": 2.006203828415167, + "grad_norm": 1.4387080669403076, + "learning_rate": 1.9365053848178058e-05, + "loss": 0.5161, + "step": 12290 + }, + { + "epoch": 2.0063670870576713, + "grad_norm": 1.8212391138076782, + "learning_rate": 1.9364942569775292e-05, + "loss": 0.5775, + "step": 12291 + }, + { + "epoch": 2.0065303457001753, + "grad_norm": 1.7119109630584717, + "learning_rate": 1.9364831281942034e-05, + "loss": 0.6184, + "step": 12292 + }, + { + "epoch": 2.0066936043426797, + "grad_norm": 1.444809913635254, + "learning_rate": 1.9364719984678393e-05, + "loss": 0.5046, + "step": 12293 + }, + { + "epoch": 2.006856862985184, + "grad_norm": 1.689087152481079, + "learning_rate": 1.9364608677984483e-05, + "loss": 0.6092, + "step": 12294 + }, + { + "epoch": 2.0070201216276886, + "grad_norm": 1.5682522058486938, + "learning_rate": 1.9364497361860413e-05, + "loss": 0.5588, + "step": 12295 + }, + { + "epoch": 2.007183380270193, + "grad_norm": 1.5811374187469482, + "learning_rate": 1.9364386036306294e-05, + "loss": 0.6533, + "step": 12296 + }, + { + "epoch": 2.0073466389126975, + "grad_norm": 1.5118218660354614, + "learning_rate": 1.9364274701322246e-05, + "loss": 0.5512, + "step": 12297 + }, + { + "epoch": 2.007509897555202, + "grad_norm": 1.7992173433303833, + "learning_rate": 1.936416335690837e-05, + "loss": 0.5553, + "step": 12298 + }, + { + "epoch": 2.0076731561977064, + "grad_norm": 1.8048683404922485, + "learning_rate": 1.9364052003064787e-05, + "loss": 0.5243, + "step": 12299 + }, + { + "epoch": 2.007836414840211, + "grad_norm": 1.7227141857147217, + "learning_rate": 1.9363940639791607e-05, + "loss": 0.5865, + "step": 12300 + }, + { + "epoch": 2.007999673482715, + "grad_norm": 1.9699747562408447, + "learning_rate": 1.936382926708894e-05, + "loss": 0.9084, + "step": 12301 + }, + { + "epoch": 2.0081629321252192, + "grad_norm": 1.9576135873794556, + "learning_rate": 1.93637178849569e-05, + "loss": 0.5981, + "step": 12302 + }, + { + "epoch": 2.0083261907677237, + "grad_norm": 1.8755521774291992, + "learning_rate": 1.9363606493395596e-05, + "loss": 0.5671, + "step": 12303 + }, + { + "epoch": 2.008489449410228, + "grad_norm": 1.4214897155761719, + "learning_rate": 1.9363495092405147e-05, + "loss": 0.5102, + "step": 12304 + }, + { + "epoch": 2.0086527080527325, + "grad_norm": 1.9460335969924927, + "learning_rate": 1.936338368198566e-05, + "loss": 0.7092, + "step": 12305 + }, + { + "epoch": 2.008815966695237, + "grad_norm": 1.6770280599594116, + "learning_rate": 1.9363272262137247e-05, + "loss": 0.6075, + "step": 12306 + }, + { + "epoch": 2.0089792253377414, + "grad_norm": 1.5190061330795288, + "learning_rate": 1.936316083286002e-05, + "loss": 0.4775, + "step": 12307 + }, + { + "epoch": 2.009142483980246, + "grad_norm": 1.4265611171722412, + "learning_rate": 1.9363049394154095e-05, + "loss": 0.4223, + "step": 12308 + }, + { + "epoch": 2.0093057426227503, + "grad_norm": 2.0123677253723145, + "learning_rate": 1.936293794601958e-05, + "loss": 0.6592, + "step": 12309 + }, + { + "epoch": 2.0094690012652543, + "grad_norm": 1.6626603603363037, + "learning_rate": 1.936282648845659e-05, + "loss": 0.5924, + "step": 12310 + }, + { + "epoch": 2.0096322599077587, + "grad_norm": 1.5935218334197998, + "learning_rate": 1.9362715021465236e-05, + "loss": 0.5072, + "step": 12311 + }, + { + "epoch": 2.009795518550263, + "grad_norm": 1.7734816074371338, + "learning_rate": 1.9362603545045632e-05, + "loss": 0.569, + "step": 12312 + }, + { + "epoch": 2.0099587771927676, + "grad_norm": 1.55368971824646, + "learning_rate": 1.9362492059197887e-05, + "loss": 0.5343, + "step": 12313 + }, + { + "epoch": 2.010122035835272, + "grad_norm": 2.029787540435791, + "learning_rate": 1.9362380563922113e-05, + "loss": 0.6977, + "step": 12314 + }, + { + "epoch": 2.0102852944777765, + "grad_norm": 2.0183026790618896, + "learning_rate": 1.9362269059218426e-05, + "loss": 0.7911, + "step": 12315 + }, + { + "epoch": 2.010448553120281, + "grad_norm": 1.7496566772460938, + "learning_rate": 1.9362157545086936e-05, + "loss": 0.5947, + "step": 12316 + }, + { + "epoch": 2.0106118117627854, + "grad_norm": 1.8365520238876343, + "learning_rate": 1.9362046021527757e-05, + "loss": 0.663, + "step": 12317 + }, + { + "epoch": 2.01077507040529, + "grad_norm": 1.6675431728363037, + "learning_rate": 1.9361934488541002e-05, + "loss": 0.5584, + "step": 12318 + }, + { + "epoch": 2.010938329047794, + "grad_norm": 1.592437744140625, + "learning_rate": 1.936182294612678e-05, + "loss": 0.549, + "step": 12319 + }, + { + "epoch": 2.0111015876902982, + "grad_norm": 1.6111193895339966, + "learning_rate": 1.9361711394285202e-05, + "loss": 0.5623, + "step": 12320 + }, + { + "epoch": 2.0112648463328027, + "grad_norm": 1.9162113666534424, + "learning_rate": 1.9361599833016387e-05, + "loss": 0.673, + "step": 12321 + }, + { + "epoch": 2.011428104975307, + "grad_norm": 1.6031928062438965, + "learning_rate": 1.936148826232044e-05, + "loss": 0.5496, + "step": 12322 + }, + { + "epoch": 2.0115913636178115, + "grad_norm": 1.9766395092010498, + "learning_rate": 1.9361376682197478e-05, + "loss": 0.5971, + "step": 12323 + }, + { + "epoch": 2.011754622260316, + "grad_norm": 1.4820067882537842, + "learning_rate": 1.9361265092647615e-05, + "loss": 0.6003, + "step": 12324 + }, + { + "epoch": 2.0119178809028204, + "grad_norm": 1.613274335861206, + "learning_rate": 1.936115349367096e-05, + "loss": 0.5027, + "step": 12325 + }, + { + "epoch": 2.012081139545325, + "grad_norm": 1.7153983116149902, + "learning_rate": 1.9361041885267623e-05, + "loss": 0.5366, + "step": 12326 + }, + { + "epoch": 2.012244398187829, + "grad_norm": 1.82578706741333, + "learning_rate": 1.936093026743772e-05, + "loss": 0.6272, + "step": 12327 + }, + { + "epoch": 2.0124076568303333, + "grad_norm": 1.8054364919662476, + "learning_rate": 1.936081864018136e-05, + "loss": 0.5157, + "step": 12328 + }, + { + "epoch": 2.0125709154728377, + "grad_norm": 1.8185817003250122, + "learning_rate": 1.936070700349866e-05, + "loss": 0.5669, + "step": 12329 + }, + { + "epoch": 2.012734174115342, + "grad_norm": 1.687887191772461, + "learning_rate": 1.9360595357389735e-05, + "loss": 0.6349, + "step": 12330 + }, + { + "epoch": 2.0128974327578466, + "grad_norm": 2.1315226554870605, + "learning_rate": 1.9360483701854687e-05, + "loss": 0.7192, + "step": 12331 + }, + { + "epoch": 2.013060691400351, + "grad_norm": 1.6780625581741333, + "learning_rate": 1.936037203689364e-05, + "loss": 0.5634, + "step": 12332 + }, + { + "epoch": 2.0132239500428555, + "grad_norm": 1.7298437356948853, + "learning_rate": 1.93602603625067e-05, + "loss": 0.586, + "step": 12333 + }, + { + "epoch": 2.01338720868536, + "grad_norm": 1.8218159675598145, + "learning_rate": 1.9360148678693974e-05, + "loss": 0.7182, + "step": 12334 + }, + { + "epoch": 2.0135504673278644, + "grad_norm": 1.6506799459457397, + "learning_rate": 1.9360036985455586e-05, + "loss": 0.6337, + "step": 12335 + }, + { + "epoch": 2.0137137259703684, + "grad_norm": 2.09773588180542, + "learning_rate": 1.9359925282791642e-05, + "loss": 0.5782, + "step": 12336 + }, + { + "epoch": 2.013876984612873, + "grad_norm": 1.899759292602539, + "learning_rate": 1.9359813570702254e-05, + "loss": 0.6659, + "step": 12337 + }, + { + "epoch": 2.0140402432553772, + "grad_norm": 2.1470119953155518, + "learning_rate": 1.935970184918754e-05, + "loss": 0.6836, + "step": 12338 + }, + { + "epoch": 2.0142035018978817, + "grad_norm": 1.4537296295166016, + "learning_rate": 1.9359590118247608e-05, + "loss": 0.4847, + "step": 12339 + }, + { + "epoch": 2.014366760540386, + "grad_norm": 2.1467232704162598, + "learning_rate": 1.9359478377882567e-05, + "loss": 0.6309, + "step": 12340 + }, + { + "epoch": 2.0145300191828905, + "grad_norm": 1.8243601322174072, + "learning_rate": 1.935936662809254e-05, + "loss": 0.5927, + "step": 12341 + }, + { + "epoch": 2.014693277825395, + "grad_norm": 1.7992668151855469, + "learning_rate": 1.935925486887763e-05, + "loss": 0.6132, + "step": 12342 + }, + { + "epoch": 2.0148565364678994, + "grad_norm": 1.5309432744979858, + "learning_rate": 1.935914310023795e-05, + "loss": 0.4946, + "step": 12343 + }, + { + "epoch": 2.015019795110404, + "grad_norm": 1.3196916580200195, + "learning_rate": 1.935903132217362e-05, + "loss": 0.484, + "step": 12344 + }, + { + "epoch": 2.015183053752908, + "grad_norm": 1.5638799667358398, + "learning_rate": 1.9358919534684748e-05, + "loss": 0.5677, + "step": 12345 + }, + { + "epoch": 2.0153463123954123, + "grad_norm": 1.9837442636489868, + "learning_rate": 1.9358807737771444e-05, + "loss": 0.5411, + "step": 12346 + }, + { + "epoch": 2.0155095710379167, + "grad_norm": 1.7685649394989014, + "learning_rate": 1.9358695931433825e-05, + "loss": 0.5711, + "step": 12347 + }, + { + "epoch": 2.015672829680421, + "grad_norm": 1.9924731254577637, + "learning_rate": 1.9358584115672e-05, + "loss": 0.561, + "step": 12348 + }, + { + "epoch": 2.0158360883229256, + "grad_norm": 1.5001784563064575, + "learning_rate": 1.9358472290486085e-05, + "loss": 0.5066, + "step": 12349 + }, + { + "epoch": 2.01599934696543, + "grad_norm": 1.7629691362380981, + "learning_rate": 1.935836045587619e-05, + "loss": 0.5885, + "step": 12350 + }, + { + "epoch": 2.0161626056079345, + "grad_norm": 1.4926044940948486, + "learning_rate": 1.9358248611842427e-05, + "loss": 0.5029, + "step": 12351 + }, + { + "epoch": 2.016325864250439, + "grad_norm": 1.8621244430541992, + "learning_rate": 1.935813675838491e-05, + "loss": 0.5142, + "step": 12352 + }, + { + "epoch": 2.0164891228929434, + "grad_norm": 1.6747856140136719, + "learning_rate": 1.9358024895503753e-05, + "loss": 0.5897, + "step": 12353 + }, + { + "epoch": 2.0166523815354473, + "grad_norm": 1.4368631839752197, + "learning_rate": 1.9357913023199066e-05, + "loss": 0.5021, + "step": 12354 + }, + { + "epoch": 2.016815640177952, + "grad_norm": 2.0751402378082275, + "learning_rate": 1.9357801141470967e-05, + "loss": 0.604, + "step": 12355 + }, + { + "epoch": 2.0169788988204562, + "grad_norm": 1.9507529735565186, + "learning_rate": 1.9357689250319563e-05, + "loss": 0.6411, + "step": 12356 + }, + { + "epoch": 2.0171421574629607, + "grad_norm": 2.1072123050689697, + "learning_rate": 1.935757734974497e-05, + "loss": 0.6936, + "step": 12357 + }, + { + "epoch": 2.017305416105465, + "grad_norm": 1.6697245836257935, + "learning_rate": 1.9357465439747295e-05, + "loss": 0.5252, + "step": 12358 + }, + { + "epoch": 2.0174686747479695, + "grad_norm": 1.679677128791809, + "learning_rate": 1.9357353520326658e-05, + "loss": 0.5411, + "step": 12359 + }, + { + "epoch": 2.017631933390474, + "grad_norm": 1.3646066188812256, + "learning_rate": 1.9357241591483165e-05, + "loss": 0.4612, + "step": 12360 + }, + { + "epoch": 2.0177951920329784, + "grad_norm": 1.9299532175064087, + "learning_rate": 1.9357129653216932e-05, + "loss": 1.161, + "step": 12361 + }, + { + "epoch": 2.017958450675483, + "grad_norm": 1.958855152130127, + "learning_rate": 1.9357017705528075e-05, + "loss": 0.8137, + "step": 12362 + }, + { + "epoch": 2.018121709317987, + "grad_norm": 1.9061126708984375, + "learning_rate": 1.9356905748416704e-05, + "loss": 0.5766, + "step": 12363 + }, + { + "epoch": 2.0182849679604913, + "grad_norm": 1.6292773485183716, + "learning_rate": 1.935679378188293e-05, + "loss": 0.5095, + "step": 12364 + }, + { + "epoch": 2.0184482266029957, + "grad_norm": 1.476036787033081, + "learning_rate": 1.9356681805926867e-05, + "loss": 0.4676, + "step": 12365 + }, + { + "epoch": 2.0186114852455, + "grad_norm": 1.4244312047958374, + "learning_rate": 1.9356569820548628e-05, + "loss": 0.5021, + "step": 12366 + }, + { + "epoch": 2.0187747438880046, + "grad_norm": 1.6099629402160645, + "learning_rate": 1.9356457825748326e-05, + "loss": 0.5215, + "step": 12367 + }, + { + "epoch": 2.018938002530509, + "grad_norm": 1.7246471643447876, + "learning_rate": 1.9356345821526074e-05, + "loss": 0.6251, + "step": 12368 + }, + { + "epoch": 2.0191012611730135, + "grad_norm": 1.8226594924926758, + "learning_rate": 1.9356233807881983e-05, + "loss": 0.5616, + "step": 12369 + }, + { + "epoch": 2.019264519815518, + "grad_norm": 1.8126475811004639, + "learning_rate": 1.9356121784816165e-05, + "loss": 0.6473, + "step": 12370 + }, + { + "epoch": 2.019427778458022, + "grad_norm": 1.3992434740066528, + "learning_rate": 1.935600975232874e-05, + "loss": 0.5093, + "step": 12371 + }, + { + "epoch": 2.0195910371005263, + "grad_norm": 1.6207711696624756, + "learning_rate": 1.935589771041981e-05, + "loss": 0.4469, + "step": 12372 + }, + { + "epoch": 2.019754295743031, + "grad_norm": 1.5531736612319946, + "learning_rate": 1.93557856590895e-05, + "loss": 0.5149, + "step": 12373 + }, + { + "epoch": 2.019917554385535, + "grad_norm": 1.960896372795105, + "learning_rate": 1.9355673598337916e-05, + "loss": 0.6126, + "step": 12374 + }, + { + "epoch": 2.0200808130280397, + "grad_norm": 2.001063823699951, + "learning_rate": 1.9355561528165166e-05, + "loss": 0.6533, + "step": 12375 + }, + { + "epoch": 2.020244071670544, + "grad_norm": 1.4003616571426392, + "learning_rate": 1.935544944857137e-05, + "loss": 0.4508, + "step": 12376 + }, + { + "epoch": 2.0204073303130485, + "grad_norm": 2.0550425052642822, + "learning_rate": 1.9355337359556642e-05, + "loss": 0.6309, + "step": 12377 + }, + { + "epoch": 2.020570588955553, + "grad_norm": 1.7690050601959229, + "learning_rate": 1.9355225261121087e-05, + "loss": 0.6835, + "step": 12378 + }, + { + "epoch": 2.0207338475980574, + "grad_norm": 1.7926242351531982, + "learning_rate": 1.9355113153264824e-05, + "loss": 0.5439, + "step": 12379 + }, + { + "epoch": 2.0208971062405614, + "grad_norm": 1.500222086906433, + "learning_rate": 1.9355001035987966e-05, + "loss": 0.5466, + "step": 12380 + }, + { + "epoch": 2.021060364883066, + "grad_norm": 1.769160509109497, + "learning_rate": 1.935488890929062e-05, + "loss": 0.5374, + "step": 12381 + }, + { + "epoch": 2.0212236235255703, + "grad_norm": 1.8263441324234009, + "learning_rate": 1.935477677317291e-05, + "loss": 0.532, + "step": 12382 + }, + { + "epoch": 2.0213868821680747, + "grad_norm": 2.171236753463745, + "learning_rate": 1.935466462763494e-05, + "loss": 0.5571, + "step": 12383 + }, + { + "epoch": 2.021550140810579, + "grad_norm": 1.4735336303710938, + "learning_rate": 1.935455247267682e-05, + "loss": 0.4806, + "step": 12384 + }, + { + "epoch": 2.0217133994530836, + "grad_norm": 1.8918373584747314, + "learning_rate": 1.9354440308298676e-05, + "loss": 0.5282, + "step": 12385 + }, + { + "epoch": 2.021876658095588, + "grad_norm": 1.8940712213516235, + "learning_rate": 1.9354328134500608e-05, + "loss": 1.325, + "step": 12386 + }, + { + "epoch": 2.0220399167380925, + "grad_norm": 1.9090542793273926, + "learning_rate": 1.9354215951282736e-05, + "loss": 0.5491, + "step": 12387 + }, + { + "epoch": 2.022203175380597, + "grad_norm": 1.726650357246399, + "learning_rate": 1.935410375864517e-05, + "loss": 0.5842, + "step": 12388 + }, + { + "epoch": 2.022366434023101, + "grad_norm": 1.8594613075256348, + "learning_rate": 1.9353991556588026e-05, + "loss": 0.6461, + "step": 12389 + }, + { + "epoch": 2.0225296926656053, + "grad_norm": 1.581127643585205, + "learning_rate": 1.935387934511141e-05, + "loss": 0.5789, + "step": 12390 + }, + { + "epoch": 2.0226929513081098, + "grad_norm": 1.7179625034332275, + "learning_rate": 1.9353767124215448e-05, + "loss": 0.5809, + "step": 12391 + }, + { + "epoch": 2.022856209950614, + "grad_norm": 1.5810734033584595, + "learning_rate": 1.9353654893900237e-05, + "loss": 0.5242, + "step": 12392 + }, + { + "epoch": 2.0230194685931187, + "grad_norm": 1.9803438186645508, + "learning_rate": 1.9353542654165905e-05, + "loss": 0.6232, + "step": 12393 + }, + { + "epoch": 2.023182727235623, + "grad_norm": 1.6825182437896729, + "learning_rate": 1.9353430405012553e-05, + "loss": 0.5652, + "step": 12394 + }, + { + "epoch": 2.0233459858781275, + "grad_norm": 1.779036521911621, + "learning_rate": 1.93533181464403e-05, + "loss": 0.5515, + "step": 12395 + }, + { + "epoch": 2.023509244520632, + "grad_norm": 1.5682705640792847, + "learning_rate": 1.935320587844926e-05, + "loss": 0.497, + "step": 12396 + }, + { + "epoch": 2.0236725031631364, + "grad_norm": 1.9355415105819702, + "learning_rate": 1.935309360103954e-05, + "loss": 0.6478, + "step": 12397 + }, + { + "epoch": 2.0238357618056404, + "grad_norm": 2.005685329437256, + "learning_rate": 1.935298131421126e-05, + "loss": 0.645, + "step": 12398 + }, + { + "epoch": 2.023999020448145, + "grad_norm": 1.915358543395996, + "learning_rate": 1.9352869017964533e-05, + "loss": 0.6761, + "step": 12399 + }, + { + "epoch": 2.0241622790906493, + "grad_norm": 1.7697430849075317, + "learning_rate": 1.9352756712299467e-05, + "loss": 0.5733, + "step": 12400 + }, + { + "epoch": 2.0243255377331537, + "grad_norm": 1.916601538658142, + "learning_rate": 1.9352644397216177e-05, + "loss": 0.6181, + "step": 12401 + }, + { + "epoch": 2.024488796375658, + "grad_norm": 1.694218635559082, + "learning_rate": 1.935253207271478e-05, + "loss": 0.5112, + "step": 12402 + }, + { + "epoch": 2.0246520550181626, + "grad_norm": 1.7563910484313965, + "learning_rate": 1.935241973879538e-05, + "loss": 0.5661, + "step": 12403 + }, + { + "epoch": 2.024815313660667, + "grad_norm": 1.9390674829483032, + "learning_rate": 1.9352307395458097e-05, + "loss": 0.6035, + "step": 12404 + }, + { + "epoch": 2.0249785723031715, + "grad_norm": 1.9004501104354858, + "learning_rate": 1.9352195042703045e-05, + "loss": 0.5084, + "step": 12405 + }, + { + "epoch": 2.025141830945676, + "grad_norm": 1.617708444595337, + "learning_rate": 1.9352082680530334e-05, + "loss": 0.6307, + "step": 12406 + }, + { + "epoch": 2.02530508958818, + "grad_norm": 1.418436050415039, + "learning_rate": 1.935197030894008e-05, + "loss": 0.4534, + "step": 12407 + }, + { + "epoch": 2.0254683482306843, + "grad_norm": 1.4722530841827393, + "learning_rate": 1.9351857927932392e-05, + "loss": 0.4057, + "step": 12408 + }, + { + "epoch": 2.0256316068731888, + "grad_norm": 1.5121897459030151, + "learning_rate": 1.9351745537507386e-05, + "loss": 0.4033, + "step": 12409 + }, + { + "epoch": 2.025794865515693, + "grad_norm": 1.6805142164230347, + "learning_rate": 1.9351633137665175e-05, + "loss": 0.516, + "step": 12410 + }, + { + "epoch": 2.0259581241581976, + "grad_norm": 1.7234915494918823, + "learning_rate": 1.9351520728405872e-05, + "loss": 0.5185, + "step": 12411 + }, + { + "epoch": 2.026121382800702, + "grad_norm": 1.5997729301452637, + "learning_rate": 1.9351408309729592e-05, + "loss": 0.5622, + "step": 12412 + }, + { + "epoch": 2.0262846414432065, + "grad_norm": 2.1638271808624268, + "learning_rate": 1.9351295881636444e-05, + "loss": 0.5176, + "step": 12413 + }, + { + "epoch": 2.026447900085711, + "grad_norm": 1.865201711654663, + "learning_rate": 1.9351183444126542e-05, + "loss": 0.6058, + "step": 12414 + }, + { + "epoch": 2.026611158728215, + "grad_norm": 1.8605846166610718, + "learning_rate": 1.9351070997200003e-05, + "loss": 0.5994, + "step": 12415 + }, + { + "epoch": 2.0267744173707194, + "grad_norm": 2.8188579082489014, + "learning_rate": 1.935095854085694e-05, + "loss": 0.5067, + "step": 12416 + }, + { + "epoch": 2.026937676013224, + "grad_norm": 1.77742338180542, + "learning_rate": 1.935084607509746e-05, + "loss": 0.5478, + "step": 12417 + }, + { + "epoch": 2.0271009346557283, + "grad_norm": 1.7860585451126099, + "learning_rate": 1.9350733599921684e-05, + "loss": 0.6017, + "step": 12418 + }, + { + "epoch": 2.0272641932982327, + "grad_norm": 1.9925343990325928, + "learning_rate": 1.935062111532972e-05, + "loss": 0.6638, + "step": 12419 + }, + { + "epoch": 2.027427451940737, + "grad_norm": 1.6430071592330933, + "learning_rate": 1.9350508621321685e-05, + "loss": 0.5226, + "step": 12420 + }, + { + "epoch": 2.0275907105832416, + "grad_norm": 1.5705299377441406, + "learning_rate": 1.935039611789769e-05, + "loss": 0.4806, + "step": 12421 + }, + { + "epoch": 2.027753969225746, + "grad_norm": 1.5908104181289673, + "learning_rate": 1.9350283605057844e-05, + "loss": 0.5549, + "step": 12422 + }, + { + "epoch": 2.0279172278682505, + "grad_norm": 1.6370774507522583, + "learning_rate": 1.935017108280227e-05, + "loss": 0.5892, + "step": 12423 + }, + { + "epoch": 2.0280804865107545, + "grad_norm": 1.6377973556518555, + "learning_rate": 1.9350058551131072e-05, + "loss": 0.5636, + "step": 12424 + }, + { + "epoch": 2.028243745153259, + "grad_norm": 1.7274359464645386, + "learning_rate": 1.9349946010044373e-05, + "loss": 0.61, + "step": 12425 + }, + { + "epoch": 2.0284070037957633, + "grad_norm": 1.643977403640747, + "learning_rate": 1.9349833459542275e-05, + "loss": 0.5339, + "step": 12426 + }, + { + "epoch": 2.0285702624382678, + "grad_norm": 1.6858141422271729, + "learning_rate": 1.93497208996249e-05, + "loss": 0.5399, + "step": 12427 + }, + { + "epoch": 2.028733521080772, + "grad_norm": 1.6895197629928589, + "learning_rate": 1.9349608330292357e-05, + "loss": 0.5947, + "step": 12428 + }, + { + "epoch": 2.0288967797232766, + "grad_norm": 1.646095633506775, + "learning_rate": 1.9349495751544763e-05, + "loss": 0.6307, + "step": 12429 + }, + { + "epoch": 2.029060038365781, + "grad_norm": 1.4837992191314697, + "learning_rate": 1.934938316338223e-05, + "loss": 0.4354, + "step": 12430 + }, + { + "epoch": 2.0292232970082855, + "grad_norm": 1.675665020942688, + "learning_rate": 1.934927056580487e-05, + "loss": 0.5829, + "step": 12431 + }, + { + "epoch": 2.02938655565079, + "grad_norm": 1.5921735763549805, + "learning_rate": 1.9349157958812795e-05, + "loss": 0.4982, + "step": 12432 + }, + { + "epoch": 2.029549814293294, + "grad_norm": 1.9002126455307007, + "learning_rate": 1.934904534240612e-05, + "loss": 0.6137, + "step": 12433 + }, + { + "epoch": 2.0297130729357984, + "grad_norm": 1.4445796012878418, + "learning_rate": 1.9348932716584962e-05, + "loss": 0.4812, + "step": 12434 + }, + { + "epoch": 2.029876331578303, + "grad_norm": 1.7333933115005493, + "learning_rate": 1.9348820081349432e-05, + "loss": 0.6302, + "step": 12435 + }, + { + "epoch": 2.0300395902208073, + "grad_norm": 1.585367202758789, + "learning_rate": 1.9348707436699636e-05, + "loss": 0.4908, + "step": 12436 + }, + { + "epoch": 2.0302028488633117, + "grad_norm": 1.7089742422103882, + "learning_rate": 1.9348594782635702e-05, + "loss": 0.6236, + "step": 12437 + }, + { + "epoch": 2.030366107505816, + "grad_norm": 1.989702820777893, + "learning_rate": 1.934848211915773e-05, + "loss": 0.6611, + "step": 12438 + }, + { + "epoch": 2.0305293661483206, + "grad_norm": 1.8191251754760742, + "learning_rate": 1.934836944626584e-05, + "loss": 0.5858, + "step": 12439 + }, + { + "epoch": 2.030692624790825, + "grad_norm": 1.9148839712142944, + "learning_rate": 1.9348256763960146e-05, + "loss": 0.5899, + "step": 12440 + }, + { + "epoch": 2.0308558834333295, + "grad_norm": 1.5678642988204956, + "learning_rate": 1.934814407224076e-05, + "loss": 0.5242, + "step": 12441 + }, + { + "epoch": 2.0310191420758335, + "grad_norm": 1.4896069765090942, + "learning_rate": 1.9348031371107794e-05, + "loss": 0.5328, + "step": 12442 + }, + { + "epoch": 2.031182400718338, + "grad_norm": 1.612774133682251, + "learning_rate": 1.9347918660561365e-05, + "loss": 0.4938, + "step": 12443 + }, + { + "epoch": 2.0313456593608423, + "grad_norm": 2.08781361579895, + "learning_rate": 1.9347805940601582e-05, + "loss": 0.5837, + "step": 12444 + }, + { + "epoch": 2.0315089180033468, + "grad_norm": 1.7026292085647583, + "learning_rate": 1.9347693211228562e-05, + "loss": 0.586, + "step": 12445 + }, + { + "epoch": 2.031672176645851, + "grad_norm": 1.6097503900527954, + "learning_rate": 1.9347580472442414e-05, + "loss": 0.5045, + "step": 12446 + }, + { + "epoch": 2.0318354352883556, + "grad_norm": 1.8581963777542114, + "learning_rate": 1.934746772424326e-05, + "loss": 0.5824, + "step": 12447 + }, + { + "epoch": 2.03199869393086, + "grad_norm": 1.5170527696609497, + "learning_rate": 1.934735496663121e-05, + "loss": 0.547, + "step": 12448 + }, + { + "epoch": 2.0321619525733645, + "grad_norm": 1.7561402320861816, + "learning_rate": 1.9347242199606372e-05, + "loss": 0.5077, + "step": 12449 + }, + { + "epoch": 2.032325211215869, + "grad_norm": 1.9444552659988403, + "learning_rate": 1.934712942316886e-05, + "loss": 0.7042, + "step": 12450 + }, + { + "epoch": 2.032488469858373, + "grad_norm": 1.7026249170303345, + "learning_rate": 1.9347016637318797e-05, + "loss": 0.5485, + "step": 12451 + }, + { + "epoch": 2.0326517285008774, + "grad_norm": 1.5949026346206665, + "learning_rate": 1.934690384205629e-05, + "loss": 0.44, + "step": 12452 + }, + { + "epoch": 2.032814987143382, + "grad_norm": 2.2150423526763916, + "learning_rate": 1.9346791037381452e-05, + "loss": 0.641, + "step": 12453 + }, + { + "epoch": 2.0329782457858863, + "grad_norm": 1.4780746698379517, + "learning_rate": 1.93466782232944e-05, + "loss": 0.5431, + "step": 12454 + }, + { + "epoch": 2.0331415044283907, + "grad_norm": 2.0856263637542725, + "learning_rate": 1.9346565399795244e-05, + "loss": 0.5978, + "step": 12455 + }, + { + "epoch": 2.033304763070895, + "grad_norm": 2.01977801322937, + "learning_rate": 1.93464525668841e-05, + "loss": 0.5665, + "step": 12456 + }, + { + "epoch": 2.0334680217133996, + "grad_norm": 1.4879562854766846, + "learning_rate": 1.9346339724561078e-05, + "loss": 0.4708, + "step": 12457 + }, + { + "epoch": 2.033631280355904, + "grad_norm": 1.9953008890151978, + "learning_rate": 1.9346226872826295e-05, + "loss": 0.6554, + "step": 12458 + }, + { + "epoch": 2.033794538998408, + "grad_norm": 1.7277441024780273, + "learning_rate": 1.9346114011679865e-05, + "loss": 0.4859, + "step": 12459 + }, + { + "epoch": 2.0339577976409124, + "grad_norm": 1.5084885358810425, + "learning_rate": 1.9346001141121903e-05, + "loss": 0.5421, + "step": 12460 + }, + { + "epoch": 2.034121056283417, + "grad_norm": 1.8652210235595703, + "learning_rate": 1.9345888261152517e-05, + "loss": 0.5531, + "step": 12461 + }, + { + "epoch": 2.0342843149259213, + "grad_norm": 1.6558841466903687, + "learning_rate": 1.9345775371771826e-05, + "loss": 0.5458, + "step": 12462 + }, + { + "epoch": 2.0344475735684258, + "grad_norm": 2.0411274433135986, + "learning_rate": 1.934566247297994e-05, + "loss": 0.7397, + "step": 12463 + }, + { + "epoch": 2.03461083221093, + "grad_norm": 2.1410293579101562, + "learning_rate": 1.9345549564776975e-05, + "loss": 0.6801, + "step": 12464 + }, + { + "epoch": 2.0347740908534346, + "grad_norm": 1.7940441370010376, + "learning_rate": 1.9345436647163046e-05, + "loss": 0.5766, + "step": 12465 + }, + { + "epoch": 2.034937349495939, + "grad_norm": 1.463208794593811, + "learning_rate": 1.934532372013826e-05, + "loss": 0.452, + "step": 12466 + }, + { + "epoch": 2.0351006081384435, + "grad_norm": 2.064758062362671, + "learning_rate": 1.934521078370274e-05, + "loss": 0.6166, + "step": 12467 + }, + { + "epoch": 2.0352638667809475, + "grad_norm": 1.3015520572662354, + "learning_rate": 1.9345097837856596e-05, + "loss": 0.4459, + "step": 12468 + }, + { + "epoch": 2.035427125423452, + "grad_norm": 1.7860995531082153, + "learning_rate": 1.9344984882599937e-05, + "loss": 0.6001, + "step": 12469 + }, + { + "epoch": 2.0355903840659564, + "grad_norm": 1.982786774635315, + "learning_rate": 1.9344871917932884e-05, + "loss": 0.6464, + "step": 12470 + }, + { + "epoch": 2.035753642708461, + "grad_norm": 2.14611554145813, + "learning_rate": 1.9344758943855545e-05, + "loss": 0.5493, + "step": 12471 + }, + { + "epoch": 2.0359169013509653, + "grad_norm": 1.7070122957229614, + "learning_rate": 1.9344645960368037e-05, + "loss": 0.4881, + "step": 12472 + }, + { + "epoch": 2.0360801599934697, + "grad_norm": 1.6498547792434692, + "learning_rate": 1.934453296747047e-05, + "loss": 0.5667, + "step": 12473 + }, + { + "epoch": 2.036243418635974, + "grad_norm": 1.8670216798782349, + "learning_rate": 1.9344419965162967e-05, + "loss": 0.6098, + "step": 12474 + }, + { + "epoch": 2.0364066772784786, + "grad_norm": 1.786757469177246, + "learning_rate": 1.9344306953445632e-05, + "loss": 0.5724, + "step": 12475 + }, + { + "epoch": 2.036569935920983, + "grad_norm": 2.0428555011749268, + "learning_rate": 1.934419393231858e-05, + "loss": 0.6509, + "step": 12476 + }, + { + "epoch": 2.036733194563487, + "grad_norm": 1.5577988624572754, + "learning_rate": 1.9344080901781933e-05, + "loss": 0.5211, + "step": 12477 + }, + { + "epoch": 2.0368964532059914, + "grad_norm": 1.6693497896194458, + "learning_rate": 1.9343967861835796e-05, + "loss": 0.6094, + "step": 12478 + }, + { + "epoch": 2.037059711848496, + "grad_norm": 2.1763880252838135, + "learning_rate": 1.9343854812480285e-05, + "loss": 0.7541, + "step": 12479 + }, + { + "epoch": 2.0372229704910003, + "grad_norm": 1.8230410814285278, + "learning_rate": 1.934374175371552e-05, + "loss": 0.5326, + "step": 12480 + }, + { + "epoch": 2.0373862291335048, + "grad_norm": 1.7685227394104004, + "learning_rate": 1.93436286855416e-05, + "loss": 0.678, + "step": 12481 + }, + { + "epoch": 2.037549487776009, + "grad_norm": 1.429332971572876, + "learning_rate": 1.9343515607958653e-05, + "loss": 0.5011, + "step": 12482 + }, + { + "epoch": 2.0377127464185136, + "grad_norm": 1.814310073852539, + "learning_rate": 1.9343402520966788e-05, + "loss": 0.5916, + "step": 12483 + }, + { + "epoch": 2.037876005061018, + "grad_norm": 1.6616448163986206, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.6497, + "step": 12484 + }, + { + "epoch": 2.0380392637035225, + "grad_norm": 1.8351829051971436, + "learning_rate": 1.9343176318756766e-05, + "loss": 0.5515, + "step": 12485 + }, + { + "epoch": 2.0382025223460265, + "grad_norm": 1.3728736639022827, + "learning_rate": 1.934306320353883e-05, + "loss": 0.4055, + "step": 12486 + }, + { + "epoch": 2.038365780988531, + "grad_norm": 1.5474668741226196, + "learning_rate": 1.9342950078912436e-05, + "loss": 0.5315, + "step": 12487 + }, + { + "epoch": 2.0385290396310354, + "grad_norm": 1.3752912282943726, + "learning_rate": 1.934283694487769e-05, + "loss": 0.488, + "step": 12488 + }, + { + "epoch": 2.03869229827354, + "grad_norm": 2.0109174251556396, + "learning_rate": 1.934272380143471e-05, + "loss": 0.5553, + "step": 12489 + }, + { + "epoch": 2.0388555569160443, + "grad_norm": 1.4402562379837036, + "learning_rate": 1.934261064858361e-05, + "loss": 0.5427, + "step": 12490 + }, + { + "epoch": 2.0390188155585487, + "grad_norm": 1.9702736139297485, + "learning_rate": 1.9342497486324504e-05, + "loss": 0.6181, + "step": 12491 + }, + { + "epoch": 2.039182074201053, + "grad_norm": 1.692156434059143, + "learning_rate": 1.9342384314657506e-05, + "loss": 0.5389, + "step": 12492 + }, + { + "epoch": 2.0393453328435576, + "grad_norm": 1.6527729034423828, + "learning_rate": 1.934227113358273e-05, + "loss": 0.5567, + "step": 12493 + }, + { + "epoch": 2.039508591486062, + "grad_norm": 1.7281928062438965, + "learning_rate": 1.934215794310029e-05, + "loss": 0.5414, + "step": 12494 + }, + { + "epoch": 2.039671850128566, + "grad_norm": 1.7402275800704956, + "learning_rate": 1.9342044743210295e-05, + "loss": 0.5785, + "step": 12495 + }, + { + "epoch": 2.0398351087710704, + "grad_norm": 1.8919745683670044, + "learning_rate": 1.934193153391287e-05, + "loss": 0.5979, + "step": 12496 + }, + { + "epoch": 2.039998367413575, + "grad_norm": 1.5552284717559814, + "learning_rate": 1.9341818315208117e-05, + "loss": 0.5182, + "step": 12497 + }, + { + "epoch": 2.0401616260560793, + "grad_norm": 1.9799880981445312, + "learning_rate": 1.9341705087096158e-05, + "loss": 0.608, + "step": 12498 + }, + { + "epoch": 2.0403248846985838, + "grad_norm": 1.6899117231369019, + "learning_rate": 1.9341591849577102e-05, + "loss": 0.5076, + "step": 12499 + }, + { + "epoch": 2.040488143341088, + "grad_norm": 2.5051069259643555, + "learning_rate": 1.9341478602651068e-05, + "loss": 0.5333, + "step": 12500 + }, + { + "epoch": 2.0406514019835926, + "grad_norm": 1.6199774742126465, + "learning_rate": 1.9341365346318167e-05, + "loss": 0.5833, + "step": 12501 + }, + { + "epoch": 2.040814660626097, + "grad_norm": 1.7962948083877563, + "learning_rate": 1.9341252080578515e-05, + "loss": 0.6494, + "step": 12502 + }, + { + "epoch": 2.0409779192686015, + "grad_norm": 1.9111764430999756, + "learning_rate": 1.9341138805432223e-05, + "loss": 0.5882, + "step": 12503 + }, + { + "epoch": 2.0411411779111055, + "grad_norm": 2.0144999027252197, + "learning_rate": 1.9341025520879406e-05, + "loss": 0.6614, + "step": 12504 + }, + { + "epoch": 2.04130443655361, + "grad_norm": 1.5839784145355225, + "learning_rate": 1.934091222692018e-05, + "loss": 0.5596, + "step": 12505 + }, + { + "epoch": 2.0414676951961144, + "grad_norm": 1.8219949007034302, + "learning_rate": 1.9340798923554657e-05, + "loss": 0.621, + "step": 12506 + }, + { + "epoch": 2.041630953838619, + "grad_norm": 1.5695778131484985, + "learning_rate": 1.934068561078295e-05, + "loss": 0.5569, + "step": 12507 + }, + { + "epoch": 2.0417942124811232, + "grad_norm": 1.4816148281097412, + "learning_rate": 1.9340572288605178e-05, + "loss": 0.5715, + "step": 12508 + }, + { + "epoch": 2.0419574711236277, + "grad_norm": 1.5732719898223877, + "learning_rate": 1.9340458957021453e-05, + "loss": 0.4809, + "step": 12509 + }, + { + "epoch": 2.042120729766132, + "grad_norm": 2.169745445251465, + "learning_rate": 1.9340345616031885e-05, + "loss": 0.613, + "step": 12510 + }, + { + "epoch": 2.0422839884086366, + "grad_norm": 1.6917805671691895, + "learning_rate": 1.9340232265636596e-05, + "loss": 0.6739, + "step": 12511 + }, + { + "epoch": 2.0424472470511406, + "grad_norm": 1.7692861557006836, + "learning_rate": 1.9340118905835693e-05, + "loss": 0.5478, + "step": 12512 + }, + { + "epoch": 2.042610505693645, + "grad_norm": 1.4979761838912964, + "learning_rate": 1.9340005536629292e-05, + "loss": 0.4662, + "step": 12513 + }, + { + "epoch": 2.0427737643361494, + "grad_norm": 1.532591700553894, + "learning_rate": 1.933989215801751e-05, + "loss": 0.4955, + "step": 12514 + }, + { + "epoch": 2.042937022978654, + "grad_norm": 1.9427249431610107, + "learning_rate": 1.9339778770000458e-05, + "loss": 0.5909, + "step": 12515 + }, + { + "epoch": 2.0431002816211583, + "grad_norm": 1.8342094421386719, + "learning_rate": 1.9339665372578248e-05, + "loss": 0.8122, + "step": 12516 + }, + { + "epoch": 2.0432635402636627, + "grad_norm": 1.7162177562713623, + "learning_rate": 1.9339551965751e-05, + "loss": 0.4843, + "step": 12517 + }, + { + "epoch": 2.043426798906167, + "grad_norm": 1.5974608659744263, + "learning_rate": 1.933943854951883e-05, + "loss": 0.4792, + "step": 12518 + }, + { + "epoch": 2.0435900575486716, + "grad_norm": 1.6007624864578247, + "learning_rate": 1.933932512388184e-05, + "loss": 0.5269, + "step": 12519 + }, + { + "epoch": 2.043753316191176, + "grad_norm": 1.6031157970428467, + "learning_rate": 1.933921168884016e-05, + "loss": 0.616, + "step": 12520 + }, + { + "epoch": 2.04391657483368, + "grad_norm": 1.3850005865097046, + "learning_rate": 1.933909824439389e-05, + "loss": 0.3902, + "step": 12521 + }, + { + "epoch": 2.0440798334761845, + "grad_norm": 1.3859142065048218, + "learning_rate": 1.9338984790543153e-05, + "loss": 0.5398, + "step": 12522 + }, + { + "epoch": 2.044243092118689, + "grad_norm": 1.790244460105896, + "learning_rate": 1.9338871327288062e-05, + "loss": 0.6564, + "step": 12523 + }, + { + "epoch": 2.0444063507611934, + "grad_norm": 1.4430124759674072, + "learning_rate": 1.9338757854628726e-05, + "loss": 0.4684, + "step": 12524 + }, + { + "epoch": 2.044569609403698, + "grad_norm": 1.8428969383239746, + "learning_rate": 1.933864437256527e-05, + "loss": 0.5077, + "step": 12525 + }, + { + "epoch": 2.0447328680462022, + "grad_norm": 1.6809026002883911, + "learning_rate": 1.9338530881097793e-05, + "loss": 0.5707, + "step": 12526 + }, + { + "epoch": 2.0448961266887067, + "grad_norm": 2.2282097339630127, + "learning_rate": 1.9338417380226423e-05, + "loss": 0.6724, + "step": 12527 + }, + { + "epoch": 2.045059385331211, + "grad_norm": 1.7579717636108398, + "learning_rate": 1.933830386995127e-05, + "loss": 0.5445, + "step": 12528 + }, + { + "epoch": 2.0452226439737156, + "grad_norm": 1.9988796710968018, + "learning_rate": 1.9338190350272447e-05, + "loss": 0.6335, + "step": 12529 + }, + { + "epoch": 2.0453859026162196, + "grad_norm": 1.5773271322250366, + "learning_rate": 1.9338076821190065e-05, + "loss": 0.555, + "step": 12530 + }, + { + "epoch": 2.045549161258724, + "grad_norm": 1.8481450080871582, + "learning_rate": 1.9337963282704245e-05, + "loss": 0.647, + "step": 12531 + }, + { + "epoch": 2.0457124199012284, + "grad_norm": 1.876672387123108, + "learning_rate": 1.93378497348151e-05, + "loss": 0.6028, + "step": 12532 + }, + { + "epoch": 2.045875678543733, + "grad_norm": 2.0604612827301025, + "learning_rate": 1.9337736177522742e-05, + "loss": 0.68, + "step": 12533 + }, + { + "epoch": 2.0460389371862373, + "grad_norm": 1.5987591743469238, + "learning_rate": 1.9337622610827286e-05, + "loss": 0.5647, + "step": 12534 + }, + { + "epoch": 2.0462021958287417, + "grad_norm": 2.2853779792785645, + "learning_rate": 1.9337509034728846e-05, + "loss": 0.6818, + "step": 12535 + }, + { + "epoch": 2.046365454471246, + "grad_norm": 1.705328106880188, + "learning_rate": 1.9337395449227534e-05, + "loss": 0.6408, + "step": 12536 + }, + { + "epoch": 2.0465287131137506, + "grad_norm": 1.797687292098999, + "learning_rate": 1.933728185432347e-05, + "loss": 0.5666, + "step": 12537 + }, + { + "epoch": 2.046691971756255, + "grad_norm": 1.8294883966445923, + "learning_rate": 1.9337168250016768e-05, + "loss": 0.5539, + "step": 12538 + }, + { + "epoch": 2.046855230398759, + "grad_norm": 1.8282926082611084, + "learning_rate": 1.9337054636307537e-05, + "loss": 0.5123, + "step": 12539 + }, + { + "epoch": 2.0470184890412635, + "grad_norm": 2.043750286102295, + "learning_rate": 1.9336941013195892e-05, + "loss": 0.5285, + "step": 12540 + }, + { + "epoch": 2.047181747683768, + "grad_norm": 1.9946180582046509, + "learning_rate": 1.9336827380681958e-05, + "loss": 0.5975, + "step": 12541 + }, + { + "epoch": 2.0473450063262724, + "grad_norm": 1.500851035118103, + "learning_rate": 1.9336713738765836e-05, + "loss": 0.5388, + "step": 12542 + }, + { + "epoch": 2.047508264968777, + "grad_norm": 2.602703332901001, + "learning_rate": 1.9336600087447645e-05, + "loss": 0.5361, + "step": 12543 + }, + { + "epoch": 2.0476715236112812, + "grad_norm": 1.8026779890060425, + "learning_rate": 1.93364864267275e-05, + "loss": 0.6221, + "step": 12544 + }, + { + "epoch": 2.0478347822537857, + "grad_norm": 1.512753963470459, + "learning_rate": 1.933637275660552e-05, + "loss": 0.4695, + "step": 12545 + }, + { + "epoch": 2.04799804089629, + "grad_norm": 1.9472894668579102, + "learning_rate": 1.933625907708181e-05, + "loss": 0.5246, + "step": 12546 + }, + { + "epoch": 2.048161299538794, + "grad_norm": 1.7606332302093506, + "learning_rate": 1.933614538815649e-05, + "loss": 0.6655, + "step": 12547 + }, + { + "epoch": 2.0483245581812985, + "grad_norm": 1.8288816213607788, + "learning_rate": 1.9336031689829677e-05, + "loss": 0.5786, + "step": 12548 + }, + { + "epoch": 2.048487816823803, + "grad_norm": 1.6529552936553955, + "learning_rate": 1.933591798210148e-05, + "loss": 0.6159, + "step": 12549 + }, + { + "epoch": 2.0486510754663074, + "grad_norm": 1.7312804460525513, + "learning_rate": 1.9335804264972018e-05, + "loss": 0.5779, + "step": 12550 + }, + { + "epoch": 2.048814334108812, + "grad_norm": 1.4990112781524658, + "learning_rate": 1.9335690538441404e-05, + "loss": 0.4872, + "step": 12551 + }, + { + "epoch": 2.0489775927513163, + "grad_norm": 1.7711176872253418, + "learning_rate": 1.933557680250975e-05, + "loss": 0.5639, + "step": 12552 + }, + { + "epoch": 2.0491408513938207, + "grad_norm": 2.063809394836426, + "learning_rate": 1.9335463057177174e-05, + "loss": 0.6599, + "step": 12553 + }, + { + "epoch": 2.049304110036325, + "grad_norm": 2.1126773357391357, + "learning_rate": 1.9335349302443788e-05, + "loss": 0.7738, + "step": 12554 + }, + { + "epoch": 2.0494673686788296, + "grad_norm": 1.9001386165618896, + "learning_rate": 1.9335235538309712e-05, + "loss": 0.6021, + "step": 12555 + }, + { + "epoch": 2.0496306273213336, + "grad_norm": 1.6554162502288818, + "learning_rate": 1.933512176477505e-05, + "loss": 0.5893, + "step": 12556 + }, + { + "epoch": 2.049793885963838, + "grad_norm": 2.0169413089752197, + "learning_rate": 1.9335007981839928e-05, + "loss": 0.603, + "step": 12557 + }, + { + "epoch": 2.0499571446063425, + "grad_norm": 1.7343255281448364, + "learning_rate": 1.9334894189504452e-05, + "loss": 0.5544, + "step": 12558 + }, + { + "epoch": 2.050120403248847, + "grad_norm": 2.0387775897979736, + "learning_rate": 1.933478038776874e-05, + "loss": 0.647, + "step": 12559 + }, + { + "epoch": 2.0502836618913514, + "grad_norm": 1.725394368171692, + "learning_rate": 1.933466657663291e-05, + "loss": 0.5494, + "step": 12560 + }, + { + "epoch": 2.050446920533856, + "grad_norm": 2.0786938667297363, + "learning_rate": 1.933455275609707e-05, + "loss": 0.664, + "step": 12561 + }, + { + "epoch": 2.0506101791763602, + "grad_norm": 1.697468638420105, + "learning_rate": 1.933443892616134e-05, + "loss": 0.4705, + "step": 12562 + }, + { + "epoch": 2.0507734378188647, + "grad_norm": 1.6789138317108154, + "learning_rate": 1.933432508682583e-05, + "loss": 0.5362, + "step": 12563 + }, + { + "epoch": 2.050936696461369, + "grad_norm": 1.6684296131134033, + "learning_rate": 1.933421123809066e-05, + "loss": 0.6823, + "step": 12564 + }, + { + "epoch": 2.051099955103873, + "grad_norm": 1.9454288482666016, + "learning_rate": 1.9334097379955938e-05, + "loss": 0.6874, + "step": 12565 + }, + { + "epoch": 2.0512632137463775, + "grad_norm": 1.6074955463409424, + "learning_rate": 1.9333983512421785e-05, + "loss": 0.5413, + "step": 12566 + }, + { + "epoch": 2.051426472388882, + "grad_norm": 1.3516676425933838, + "learning_rate": 1.9333869635488315e-05, + "loss": 0.4735, + "step": 12567 + }, + { + "epoch": 2.0515897310313864, + "grad_norm": 1.6292965412139893, + "learning_rate": 1.9333755749155635e-05, + "loss": 0.5705, + "step": 12568 + }, + { + "epoch": 2.051752989673891, + "grad_norm": 1.8955661058425903, + "learning_rate": 1.933364185342387e-05, + "loss": 0.6148, + "step": 12569 + }, + { + "epoch": 2.0519162483163953, + "grad_norm": 1.6887623071670532, + "learning_rate": 1.9333527948293128e-05, + "loss": 0.5179, + "step": 12570 + }, + { + "epoch": 2.0520795069588997, + "grad_norm": 1.8155368566513062, + "learning_rate": 1.9333414033763528e-05, + "loss": 0.5887, + "step": 12571 + }, + { + "epoch": 2.052242765601404, + "grad_norm": 1.8958420753479004, + "learning_rate": 1.9333300109835182e-05, + "loss": 0.579, + "step": 12572 + }, + { + "epoch": 2.0524060242439086, + "grad_norm": 1.960214376449585, + "learning_rate": 1.9333186176508207e-05, + "loss": 0.6479, + "step": 12573 + }, + { + "epoch": 2.0525692828864126, + "grad_norm": 1.780833125114441, + "learning_rate": 1.9333072233782713e-05, + "loss": 0.4443, + "step": 12574 + }, + { + "epoch": 2.052732541528917, + "grad_norm": 1.7124276161193848, + "learning_rate": 1.9332958281658815e-05, + "loss": 0.7264, + "step": 12575 + }, + { + "epoch": 2.0528958001714215, + "grad_norm": 1.6373205184936523, + "learning_rate": 1.9332844320136637e-05, + "loss": 0.4935, + "step": 12576 + }, + { + "epoch": 2.053059058813926, + "grad_norm": 1.945823073387146, + "learning_rate": 1.9332730349216283e-05, + "loss": 0.6495, + "step": 12577 + }, + { + "epoch": 2.0532223174564304, + "grad_norm": 1.7909729480743408, + "learning_rate": 1.9332616368897874e-05, + "loss": 0.5061, + "step": 12578 + }, + { + "epoch": 2.053385576098935, + "grad_norm": 1.9581807851791382, + "learning_rate": 1.9332502379181523e-05, + "loss": 0.4968, + "step": 12579 + }, + { + "epoch": 2.0535488347414392, + "grad_norm": 1.9305682182312012, + "learning_rate": 1.933238838006734e-05, + "loss": 0.5987, + "step": 12580 + }, + { + "epoch": 2.0537120933839437, + "grad_norm": 1.540681004524231, + "learning_rate": 1.933227437155545e-05, + "loss": 0.4776, + "step": 12581 + }, + { + "epoch": 2.053875352026448, + "grad_norm": 1.877291202545166, + "learning_rate": 1.933216035364596e-05, + "loss": 0.5797, + "step": 12582 + }, + { + "epoch": 2.054038610668952, + "grad_norm": 1.8473215103149414, + "learning_rate": 1.9332046326338985e-05, + "loss": 0.6114, + "step": 12583 + }, + { + "epoch": 2.0542018693114565, + "grad_norm": 1.7840884923934937, + "learning_rate": 1.9331932289634644e-05, + "loss": 0.658, + "step": 12584 + }, + { + "epoch": 2.054365127953961, + "grad_norm": 1.6521497964859009, + "learning_rate": 1.933181824353305e-05, + "loss": 0.5061, + "step": 12585 + }, + { + "epoch": 2.0545283865964654, + "grad_norm": 1.8632780313491821, + "learning_rate": 1.933170418803432e-05, + "loss": 0.6222, + "step": 12586 + }, + { + "epoch": 2.05469164523897, + "grad_norm": 1.6111079454421997, + "learning_rate": 1.9331590123138562e-05, + "loss": 0.435, + "step": 12587 + }, + { + "epoch": 2.0548549038814743, + "grad_norm": 1.8012115955352783, + "learning_rate": 1.9331476048845897e-05, + "loss": 0.5256, + "step": 12588 + }, + { + "epoch": 2.0550181625239787, + "grad_norm": 1.7738654613494873, + "learning_rate": 1.9331361965156438e-05, + "loss": 0.5197, + "step": 12589 + }, + { + "epoch": 2.055181421166483, + "grad_norm": 1.762214183807373, + "learning_rate": 1.93312478720703e-05, + "loss": 0.5451, + "step": 12590 + }, + { + "epoch": 2.0553446798089876, + "grad_norm": 1.6847076416015625, + "learning_rate": 1.93311337695876e-05, + "loss": 0.4882, + "step": 12591 + }, + { + "epoch": 2.0555079384514916, + "grad_norm": 1.8761510848999023, + "learning_rate": 1.9331019657708446e-05, + "loss": 0.5782, + "step": 12592 + }, + { + "epoch": 2.055671197093996, + "grad_norm": 1.5720994472503662, + "learning_rate": 1.933090553643296e-05, + "loss": 0.5495, + "step": 12593 + }, + { + "epoch": 2.0558344557365005, + "grad_norm": 1.8600852489471436, + "learning_rate": 1.9330791405761254e-05, + "loss": 0.5643, + "step": 12594 + }, + { + "epoch": 2.055997714379005, + "grad_norm": 1.7638306617736816, + "learning_rate": 1.9330677265693444e-05, + "loss": 0.6003, + "step": 12595 + }, + { + "epoch": 2.0561609730215094, + "grad_norm": 1.5942293405532837, + "learning_rate": 1.9330563116229647e-05, + "loss": 0.5426, + "step": 12596 + }, + { + "epoch": 2.056324231664014, + "grad_norm": 1.6974012851715088, + "learning_rate": 1.933044895736997e-05, + "loss": 0.5384, + "step": 12597 + }, + { + "epoch": 2.0564874903065182, + "grad_norm": 1.6993228197097778, + "learning_rate": 1.933033478911454e-05, + "loss": 0.5113, + "step": 12598 + }, + { + "epoch": 2.0566507489490227, + "grad_norm": 1.7888575792312622, + "learning_rate": 1.933022061146346e-05, + "loss": 0.7109, + "step": 12599 + }, + { + "epoch": 2.0568140075915267, + "grad_norm": 1.951226830482483, + "learning_rate": 1.9330106424416852e-05, + "loss": 0.6587, + "step": 12600 + }, + { + "epoch": 2.056977266234031, + "grad_norm": 1.4595797061920166, + "learning_rate": 1.932999222797483e-05, + "loss": 0.4788, + "step": 12601 + }, + { + "epoch": 2.0571405248765355, + "grad_norm": 1.8984992504119873, + "learning_rate": 1.9329878022137507e-05, + "loss": 0.6361, + "step": 12602 + }, + { + "epoch": 2.05730378351904, + "grad_norm": 1.605939269065857, + "learning_rate": 1.9329763806905e-05, + "loss": 0.5431, + "step": 12603 + }, + { + "epoch": 2.0574670421615444, + "grad_norm": 1.856075644493103, + "learning_rate": 1.9329649582277424e-05, + "loss": 0.4899, + "step": 12604 + }, + { + "epoch": 2.057630300804049, + "grad_norm": 2.3160364627838135, + "learning_rate": 1.9329535348254893e-05, + "loss": 0.6828, + "step": 12605 + }, + { + "epoch": 2.0577935594465533, + "grad_norm": 1.8986384868621826, + "learning_rate": 1.932942110483752e-05, + "loss": 0.5909, + "step": 12606 + }, + { + "epoch": 2.0579568180890577, + "grad_norm": 2.1761670112609863, + "learning_rate": 1.932930685202543e-05, + "loss": 0.7399, + "step": 12607 + }, + { + "epoch": 2.058120076731562, + "grad_norm": 1.7712815999984741, + "learning_rate": 1.932919258981872e-05, + "loss": 0.6028, + "step": 12608 + }, + { + "epoch": 2.058283335374066, + "grad_norm": 1.920531988143921, + "learning_rate": 1.9329078318217523e-05, + "loss": 0.5751, + "step": 12609 + }, + { + "epoch": 2.0584465940165706, + "grad_norm": 1.7769887447357178, + "learning_rate": 1.9328964037221944e-05, + "loss": 0.5855, + "step": 12610 + }, + { + "epoch": 2.058609852659075, + "grad_norm": 1.8933099508285522, + "learning_rate": 1.9328849746832098e-05, + "loss": 0.5895, + "step": 12611 + }, + { + "epoch": 2.0587731113015795, + "grad_norm": 1.7800958156585693, + "learning_rate": 1.9328735447048105e-05, + "loss": 0.5172, + "step": 12612 + }, + { + "epoch": 2.058936369944084, + "grad_norm": 1.8484512567520142, + "learning_rate": 1.932862113787008e-05, + "loss": 0.5871, + "step": 12613 + }, + { + "epoch": 2.0590996285865883, + "grad_norm": 2.0675179958343506, + "learning_rate": 1.9328506819298134e-05, + "loss": 0.573, + "step": 12614 + }, + { + "epoch": 2.059262887229093, + "grad_norm": 1.616305947303772, + "learning_rate": 1.9328392491332385e-05, + "loss": 0.5152, + "step": 12615 + }, + { + "epoch": 2.0594261458715972, + "grad_norm": 1.860044240951538, + "learning_rate": 1.9328278153972947e-05, + "loss": 0.6316, + "step": 12616 + }, + { + "epoch": 2.0595894045141017, + "grad_norm": 1.9057366847991943, + "learning_rate": 1.9328163807219937e-05, + "loss": 0.542, + "step": 12617 + }, + { + "epoch": 2.0597526631566057, + "grad_norm": 1.7970547676086426, + "learning_rate": 1.9328049451073467e-05, + "loss": 0.6663, + "step": 12618 + }, + { + "epoch": 2.05991592179911, + "grad_norm": 2.053895950317383, + "learning_rate": 1.9327935085533652e-05, + "loss": 0.4785, + "step": 12619 + }, + { + "epoch": 2.0600791804416145, + "grad_norm": 1.5729748010635376, + "learning_rate": 1.932782071060061e-05, + "loss": 0.4702, + "step": 12620 + }, + { + "epoch": 2.060242439084119, + "grad_norm": 1.899421215057373, + "learning_rate": 1.9327706326274453e-05, + "loss": 0.5488, + "step": 12621 + }, + { + "epoch": 2.0604056977266234, + "grad_norm": 1.6139971017837524, + "learning_rate": 1.9327591932555302e-05, + "loss": 0.5064, + "step": 12622 + }, + { + "epoch": 2.060568956369128, + "grad_norm": 2.0870306491851807, + "learning_rate": 1.9327477529443264e-05, + "loss": 0.5855, + "step": 12623 + }, + { + "epoch": 2.0607322150116323, + "grad_norm": 1.9327545166015625, + "learning_rate": 1.9327363116938464e-05, + "loss": 0.5225, + "step": 12624 + }, + { + "epoch": 2.0608954736541367, + "grad_norm": 1.848079800605774, + "learning_rate": 1.932724869504101e-05, + "loss": 0.6117, + "step": 12625 + }, + { + "epoch": 2.061058732296641, + "grad_norm": 1.5856224298477173, + "learning_rate": 1.9327134263751016e-05, + "loss": 0.5516, + "step": 12626 + }, + { + "epoch": 2.061221990939145, + "grad_norm": 1.5233838558197021, + "learning_rate": 1.9327019823068605e-05, + "loss": 0.4796, + "step": 12627 + }, + { + "epoch": 2.0613852495816496, + "grad_norm": 1.8427157402038574, + "learning_rate": 1.9326905372993886e-05, + "loss": 0.6382, + "step": 12628 + }, + { + "epoch": 2.061548508224154, + "grad_norm": 1.8353664875030518, + "learning_rate": 1.9326790913526974e-05, + "loss": 0.5899, + "step": 12629 + }, + { + "epoch": 2.0617117668666585, + "grad_norm": 1.675552487373352, + "learning_rate": 1.9326676444667988e-05, + "loss": 0.5601, + "step": 12630 + }, + { + "epoch": 2.061875025509163, + "grad_norm": 1.7763067483901978, + "learning_rate": 1.932656196641704e-05, + "loss": 0.5633, + "step": 12631 + }, + { + "epoch": 2.0620382841516673, + "grad_norm": 2.267127513885498, + "learning_rate": 1.9326447478774244e-05, + "loss": 1.1416, + "step": 12632 + }, + { + "epoch": 2.062201542794172, + "grad_norm": 1.8018039464950562, + "learning_rate": 1.9326332981739723e-05, + "loss": 0.5626, + "step": 12633 + }, + { + "epoch": 2.062364801436676, + "grad_norm": 1.7337849140167236, + "learning_rate": 1.9326218475313583e-05, + "loss": 0.5809, + "step": 12634 + }, + { + "epoch": 2.06252806007918, + "grad_norm": 1.8127861022949219, + "learning_rate": 1.9326103959495947e-05, + "loss": 0.588, + "step": 12635 + }, + { + "epoch": 2.0626913187216847, + "grad_norm": 1.949481725692749, + "learning_rate": 1.9325989434286922e-05, + "loss": 0.5974, + "step": 12636 + }, + { + "epoch": 2.062854577364189, + "grad_norm": 1.8584425449371338, + "learning_rate": 1.9325874899686632e-05, + "loss": 0.6104, + "step": 12637 + }, + { + "epoch": 2.0630178360066935, + "grad_norm": 1.9630550146102905, + "learning_rate": 1.932576035569519e-05, + "loss": 0.5538, + "step": 12638 + }, + { + "epoch": 2.063181094649198, + "grad_norm": 1.6239497661590576, + "learning_rate": 1.9325645802312704e-05, + "loss": 0.587, + "step": 12639 + }, + { + "epoch": 2.0633443532917024, + "grad_norm": 2.3025307655334473, + "learning_rate": 1.9325531239539303e-05, + "loss": 0.6148, + "step": 12640 + }, + { + "epoch": 2.063507611934207, + "grad_norm": 1.8608815670013428, + "learning_rate": 1.9325416667375087e-05, + "loss": 0.6554, + "step": 12641 + }, + { + "epoch": 2.0636708705767113, + "grad_norm": 1.7780479192733765, + "learning_rate": 1.9325302085820184e-05, + "loss": 0.667, + "step": 12642 + }, + { + "epoch": 2.0638341292192157, + "grad_norm": 2.0076966285705566, + "learning_rate": 1.93251874948747e-05, + "loss": 0.5612, + "step": 12643 + }, + { + "epoch": 2.06399738786172, + "grad_norm": 1.4718210697174072, + "learning_rate": 1.932507289453876e-05, + "loss": 0.5589, + "step": 12644 + }, + { + "epoch": 2.064160646504224, + "grad_norm": 1.916379451751709, + "learning_rate": 1.9324958284812468e-05, + "loss": 0.5264, + "step": 12645 + }, + { + "epoch": 2.0643239051467286, + "grad_norm": 1.9040310382843018, + "learning_rate": 1.932484366569595e-05, + "loss": 0.5728, + "step": 12646 + }, + { + "epoch": 2.064487163789233, + "grad_norm": 1.8359315395355225, + "learning_rate": 1.9324729037189314e-05, + "loss": 0.542, + "step": 12647 + }, + { + "epoch": 2.0646504224317375, + "grad_norm": 1.491902470588684, + "learning_rate": 1.932461439929268e-05, + "loss": 0.5532, + "step": 12648 + }, + { + "epoch": 2.064813681074242, + "grad_norm": 1.7316187620162964, + "learning_rate": 1.932449975200616e-05, + "loss": 0.5702, + "step": 12649 + }, + { + "epoch": 2.0649769397167463, + "grad_norm": 1.763700246810913, + "learning_rate": 1.9324385095329875e-05, + "loss": 0.5254, + "step": 12650 + }, + { + "epoch": 2.065140198359251, + "grad_norm": 2.0634474754333496, + "learning_rate": 1.9324270429263933e-05, + "loss": 0.5955, + "step": 12651 + }, + { + "epoch": 2.065303457001755, + "grad_norm": 1.8930258750915527, + "learning_rate": 1.9324155753808454e-05, + "loss": 0.5506, + "step": 12652 + }, + { + "epoch": 2.065466715644259, + "grad_norm": 1.843095302581787, + "learning_rate": 1.9324041068963554e-05, + "loss": 0.7134, + "step": 12653 + }, + { + "epoch": 2.0656299742867636, + "grad_norm": 1.580033302307129, + "learning_rate": 1.932392637472935e-05, + "loss": 0.5362, + "step": 12654 + }, + { + "epoch": 2.065793232929268, + "grad_norm": 1.6660610437393188, + "learning_rate": 1.932381167110595e-05, + "loss": 0.5189, + "step": 12655 + }, + { + "epoch": 2.0659564915717725, + "grad_norm": 1.7034409046173096, + "learning_rate": 1.9323696958093473e-05, + "loss": 0.6897, + "step": 12656 + }, + { + "epoch": 2.066119750214277, + "grad_norm": 1.7669264078140259, + "learning_rate": 1.9323582235692037e-05, + "loss": 0.5185, + "step": 12657 + }, + { + "epoch": 2.0662830088567814, + "grad_norm": 1.8125616312026978, + "learning_rate": 1.9323467503901756e-05, + "loss": 0.6183, + "step": 12658 + }, + { + "epoch": 2.066446267499286, + "grad_norm": 1.959320068359375, + "learning_rate": 1.9323352762722748e-05, + "loss": 0.6294, + "step": 12659 + }, + { + "epoch": 2.0666095261417903, + "grad_norm": 1.6433420181274414, + "learning_rate": 1.9323238012155125e-05, + "loss": 0.5146, + "step": 12660 + }, + { + "epoch": 2.0667727847842947, + "grad_norm": 1.512661337852478, + "learning_rate": 1.9323123252199003e-05, + "loss": 0.5128, + "step": 12661 + }, + { + "epoch": 2.0669360434267987, + "grad_norm": 1.5553524494171143, + "learning_rate": 1.9323008482854496e-05, + "loss": 0.4267, + "step": 12662 + }, + { + "epoch": 2.067099302069303, + "grad_norm": 1.6307882070541382, + "learning_rate": 1.9322893704121726e-05, + "loss": 0.5761, + "step": 12663 + }, + { + "epoch": 2.0672625607118076, + "grad_norm": 1.757677674293518, + "learning_rate": 1.9322778916000803e-05, + "loss": 0.5991, + "step": 12664 + }, + { + "epoch": 2.067425819354312, + "grad_norm": 1.939069390296936, + "learning_rate": 1.9322664118491844e-05, + "loss": 0.5635, + "step": 12665 + }, + { + "epoch": 2.0675890779968165, + "grad_norm": 1.7910054922103882, + "learning_rate": 1.932254931159497e-05, + "loss": 0.5479, + "step": 12666 + }, + { + "epoch": 2.067752336639321, + "grad_norm": 1.8593946695327759, + "learning_rate": 1.932243449531028e-05, + "loss": 0.5603, + "step": 12667 + }, + { + "epoch": 2.0679155952818253, + "grad_norm": 1.5384607315063477, + "learning_rate": 1.932231966963791e-05, + "loss": 0.4445, + "step": 12668 + }, + { + "epoch": 2.0680788539243298, + "grad_norm": 1.777850866317749, + "learning_rate": 1.9322204834577965e-05, + "loss": 0.611, + "step": 12669 + }, + { + "epoch": 2.068242112566834, + "grad_norm": 1.8123424053192139, + "learning_rate": 1.932208999013056e-05, + "loss": 0.5212, + "step": 12670 + }, + { + "epoch": 2.068405371209338, + "grad_norm": 1.6231980323791504, + "learning_rate": 1.9321975136295815e-05, + "loss": 0.5443, + "step": 12671 + }, + { + "epoch": 2.0685686298518426, + "grad_norm": 1.8646429777145386, + "learning_rate": 1.9321860273073843e-05, + "loss": 0.5765, + "step": 12672 + }, + { + "epoch": 2.068731888494347, + "grad_norm": 1.7683428525924683, + "learning_rate": 1.932174540046476e-05, + "loss": 0.6043, + "step": 12673 + }, + { + "epoch": 2.0688951471368515, + "grad_norm": 1.8449351787567139, + "learning_rate": 1.932163051846868e-05, + "loss": 0.6074, + "step": 12674 + }, + { + "epoch": 2.069058405779356, + "grad_norm": 2.0063798427581787, + "learning_rate": 1.932151562708572e-05, + "loss": 0.6125, + "step": 12675 + }, + { + "epoch": 2.0692216644218604, + "grad_norm": 1.6139687299728394, + "learning_rate": 1.9321400726316e-05, + "loss": 0.4728, + "step": 12676 + }, + { + "epoch": 2.069384923064365, + "grad_norm": 1.8459407091140747, + "learning_rate": 1.9321285816159633e-05, + "loss": 0.5456, + "step": 12677 + }, + { + "epoch": 2.0695481817068693, + "grad_norm": 1.8801714181900024, + "learning_rate": 1.9321170896616726e-05, + "loss": 0.7617, + "step": 12678 + }, + { + "epoch": 2.0697114403493737, + "grad_norm": 1.7647229433059692, + "learning_rate": 1.932105596768741e-05, + "loss": 0.6596, + "step": 12679 + }, + { + "epoch": 2.0698746989918777, + "grad_norm": 1.5715336799621582, + "learning_rate": 1.932094102937179e-05, + "loss": 0.4616, + "step": 12680 + }, + { + "epoch": 2.070037957634382, + "grad_norm": 1.7332555055618286, + "learning_rate": 1.9320826081669986e-05, + "loss": 0.6306, + "step": 12681 + }, + { + "epoch": 2.0702012162768866, + "grad_norm": 1.8285822868347168, + "learning_rate": 1.932071112458211e-05, + "loss": 0.5698, + "step": 12682 + }, + { + "epoch": 2.070364474919391, + "grad_norm": 1.7740310430526733, + "learning_rate": 1.9320596158108283e-05, + "loss": 0.5412, + "step": 12683 + }, + { + "epoch": 2.0705277335618955, + "grad_norm": 1.881882905960083, + "learning_rate": 1.932048118224862e-05, + "loss": 0.656, + "step": 12684 + }, + { + "epoch": 2.0706909922044, + "grad_norm": 1.8201229572296143, + "learning_rate": 1.9320366197003227e-05, + "loss": 0.6844, + "step": 12685 + }, + { + "epoch": 2.0708542508469043, + "grad_norm": 1.67357337474823, + "learning_rate": 1.932025120237224e-05, + "loss": 0.5128, + "step": 12686 + }, + { + "epoch": 2.0710175094894088, + "grad_norm": 1.7050341367721558, + "learning_rate": 1.9320136198355753e-05, + "loss": 0.5759, + "step": 12687 + }, + { + "epoch": 2.0711807681319128, + "grad_norm": 1.924009084701538, + "learning_rate": 1.932002118495389e-05, + "loss": 0.6153, + "step": 12688 + }, + { + "epoch": 2.071344026774417, + "grad_norm": 1.8531798124313354, + "learning_rate": 1.9319906162166776e-05, + "loss": 0.6291, + "step": 12689 + }, + { + "epoch": 2.0715072854169216, + "grad_norm": 1.772476077079773, + "learning_rate": 1.9319791129994515e-05, + "loss": 0.5986, + "step": 12690 + }, + { + "epoch": 2.071670544059426, + "grad_norm": 1.6120312213897705, + "learning_rate": 1.9319676088437224e-05, + "loss": 0.538, + "step": 12691 + }, + { + "epoch": 2.0718338027019305, + "grad_norm": 1.7034331560134888, + "learning_rate": 1.9319561037495025e-05, + "loss": 0.593, + "step": 12692 + }, + { + "epoch": 2.071997061344435, + "grad_norm": 1.7978113889694214, + "learning_rate": 1.931944597716803e-05, + "loss": 0.6591, + "step": 12693 + }, + { + "epoch": 2.0721603199869394, + "grad_norm": 1.4312381744384766, + "learning_rate": 1.9319330907456356e-05, + "loss": 0.5417, + "step": 12694 + }, + { + "epoch": 2.072323578629444, + "grad_norm": 1.745811104774475, + "learning_rate": 1.9319215828360117e-05, + "loss": 0.542, + "step": 12695 + }, + { + "epoch": 2.0724868372719483, + "grad_norm": 1.7988154888153076, + "learning_rate": 1.931910073987943e-05, + "loss": 0.5179, + "step": 12696 + }, + { + "epoch": 2.0726500959144523, + "grad_norm": 2.2833430767059326, + "learning_rate": 1.931898564201441e-05, + "loss": 0.5247, + "step": 12697 + }, + { + "epoch": 2.0728133545569567, + "grad_norm": 1.4173334836959839, + "learning_rate": 1.9318870534765178e-05, + "loss": 0.4447, + "step": 12698 + }, + { + "epoch": 2.072976613199461, + "grad_norm": 1.5551263093948364, + "learning_rate": 1.9318755418131844e-05, + "loss": 0.4973, + "step": 12699 + }, + { + "epoch": 2.0731398718419656, + "grad_norm": 1.6092066764831543, + "learning_rate": 1.9318640292114526e-05, + "loss": 0.565, + "step": 12700 + }, + { + "epoch": 2.07330313048447, + "grad_norm": 1.602735161781311, + "learning_rate": 1.931852515671334e-05, + "loss": 0.5195, + "step": 12701 + }, + { + "epoch": 2.0734663891269745, + "grad_norm": 1.7686277627944946, + "learning_rate": 1.9318410011928398e-05, + "loss": 0.5639, + "step": 12702 + }, + { + "epoch": 2.073629647769479, + "grad_norm": 1.7777825593948364, + "learning_rate": 1.931829485775982e-05, + "loss": 0.547, + "step": 12703 + }, + { + "epoch": 2.0737929064119833, + "grad_norm": 2.0533862113952637, + "learning_rate": 1.9318179694207726e-05, + "loss": 0.6574, + "step": 12704 + }, + { + "epoch": 2.0739561650544878, + "grad_norm": 1.7774242162704468, + "learning_rate": 1.9318064521272223e-05, + "loss": 0.6212, + "step": 12705 + }, + { + "epoch": 2.0741194236969918, + "grad_norm": 1.6633682250976562, + "learning_rate": 1.9317949338953435e-05, + "loss": 0.5105, + "step": 12706 + }, + { + "epoch": 2.074282682339496, + "grad_norm": 1.607882022857666, + "learning_rate": 1.9317834147251477e-05, + "loss": 0.5881, + "step": 12707 + }, + { + "epoch": 2.0744459409820006, + "grad_norm": 2.4702701568603516, + "learning_rate": 1.9317718946166457e-05, + "loss": 1.0495, + "step": 12708 + }, + { + "epoch": 2.074609199624505, + "grad_norm": 2.163066864013672, + "learning_rate": 1.9317603735698497e-05, + "loss": 0.7426, + "step": 12709 + }, + { + "epoch": 2.0747724582670095, + "grad_norm": 1.581940770149231, + "learning_rate": 1.9317488515847717e-05, + "loss": 0.4888, + "step": 12710 + }, + { + "epoch": 2.074935716909514, + "grad_norm": 2.1653759479522705, + "learning_rate": 1.9317373286614223e-05, + "loss": 0.6081, + "step": 12711 + }, + { + "epoch": 2.0750989755520184, + "grad_norm": 1.4381012916564941, + "learning_rate": 1.931725804799814e-05, + "loss": 0.4, + "step": 12712 + }, + { + "epoch": 2.075262234194523, + "grad_norm": 1.4134166240692139, + "learning_rate": 1.9317142799999576e-05, + "loss": 0.4162, + "step": 12713 + }, + { + "epoch": 2.0754254928370273, + "grad_norm": 1.5760836601257324, + "learning_rate": 1.9317027542618656e-05, + "loss": 0.5043, + "step": 12714 + }, + { + "epoch": 2.0755887514795313, + "grad_norm": 1.5688426494598389, + "learning_rate": 1.931691227585549e-05, + "loss": 0.5073, + "step": 12715 + }, + { + "epoch": 2.0757520101220357, + "grad_norm": 1.4302319288253784, + "learning_rate": 1.93167969997102e-05, + "loss": 0.4648, + "step": 12716 + }, + { + "epoch": 2.07591526876454, + "grad_norm": 1.7129186391830444, + "learning_rate": 1.9316681714182893e-05, + "loss": 0.5514, + "step": 12717 + }, + { + "epoch": 2.0760785274070446, + "grad_norm": 2.374008893966675, + "learning_rate": 1.931656641927369e-05, + "loss": 0.6021, + "step": 12718 + }, + { + "epoch": 2.076241786049549, + "grad_norm": 1.6718158721923828, + "learning_rate": 1.931645111498271e-05, + "loss": 0.5376, + "step": 12719 + }, + { + "epoch": 2.0764050446920534, + "grad_norm": 1.8470888137817383, + "learning_rate": 1.9316335801310064e-05, + "loss": 0.5037, + "step": 12720 + }, + { + "epoch": 2.076568303334558, + "grad_norm": 1.7440561056137085, + "learning_rate": 1.931622047825587e-05, + "loss": 0.5293, + "step": 12721 + }, + { + "epoch": 2.0767315619770623, + "grad_norm": 1.5683214664459229, + "learning_rate": 1.931610514582025e-05, + "loss": 0.5407, + "step": 12722 + }, + { + "epoch": 2.0768948206195668, + "grad_norm": 1.9340459108352661, + "learning_rate": 1.9315989804003307e-05, + "loss": 0.5647, + "step": 12723 + }, + { + "epoch": 2.0770580792620708, + "grad_norm": 1.668990135192871, + "learning_rate": 1.9315874452805167e-05, + "loss": 0.5884, + "step": 12724 + }, + { + "epoch": 2.077221337904575, + "grad_norm": 1.6062568426132202, + "learning_rate": 1.9315759092225947e-05, + "loss": 0.5312, + "step": 12725 + }, + { + "epoch": 2.0773845965470796, + "grad_norm": 1.3526465892791748, + "learning_rate": 1.931564372226576e-05, + "loss": 0.4637, + "step": 12726 + }, + { + "epoch": 2.077547855189584, + "grad_norm": 1.8592619895935059, + "learning_rate": 1.9315528342924716e-05, + "loss": 0.6208, + "step": 12727 + }, + { + "epoch": 2.0777111138320885, + "grad_norm": 1.641052007675171, + "learning_rate": 1.9315412954202945e-05, + "loss": 0.5761, + "step": 12728 + }, + { + "epoch": 2.077874372474593, + "grad_norm": 1.8448584079742432, + "learning_rate": 1.9315297556100548e-05, + "loss": 0.5195, + "step": 12729 + }, + { + "epoch": 2.0780376311170974, + "grad_norm": 1.907055139541626, + "learning_rate": 1.9315182148617655e-05, + "loss": 0.5774, + "step": 12730 + }, + { + "epoch": 2.078200889759602, + "grad_norm": 1.950094223022461, + "learning_rate": 1.9315066731754373e-05, + "loss": 0.5712, + "step": 12731 + }, + { + "epoch": 2.0783641484021063, + "grad_norm": 1.4499282836914062, + "learning_rate": 1.931495130551082e-05, + "loss": 0.4431, + "step": 12732 + }, + { + "epoch": 2.0785274070446103, + "grad_norm": 2.6697895526885986, + "learning_rate": 1.931483586988712e-05, + "loss": 0.5708, + "step": 12733 + }, + { + "epoch": 2.0786906656871147, + "grad_norm": 2.078718662261963, + "learning_rate": 1.9314720424883376e-05, + "loss": 0.7159, + "step": 12734 + }, + { + "epoch": 2.078853924329619, + "grad_norm": 1.626994013786316, + "learning_rate": 1.9314604970499716e-05, + "loss": 0.46, + "step": 12735 + }, + { + "epoch": 2.0790171829721236, + "grad_norm": 1.812000036239624, + "learning_rate": 1.9314489506736247e-05, + "loss": 0.579, + "step": 12736 + }, + { + "epoch": 2.079180441614628, + "grad_norm": 1.9553996324539185, + "learning_rate": 1.931437403359309e-05, + "loss": 0.6228, + "step": 12737 + }, + { + "epoch": 2.0793437002571324, + "grad_norm": 1.7759252786636353, + "learning_rate": 1.9314258551070363e-05, + "loss": 0.5596, + "step": 12738 + }, + { + "epoch": 2.079506958899637, + "grad_norm": 2.067488431930542, + "learning_rate": 1.931414305916818e-05, + "loss": 0.5906, + "step": 12739 + }, + { + "epoch": 2.0796702175421413, + "grad_norm": 1.7963440418243408, + "learning_rate": 1.9314027557886655e-05, + "loss": 0.6192, + "step": 12740 + }, + { + "epoch": 2.0798334761846453, + "grad_norm": 1.6041064262390137, + "learning_rate": 1.931391204722591e-05, + "loss": 0.5709, + "step": 12741 + }, + { + "epoch": 2.0799967348271498, + "grad_norm": 1.804365634918213, + "learning_rate": 1.9313796527186056e-05, + "loss": 0.5542, + "step": 12742 + }, + { + "epoch": 2.080159993469654, + "grad_norm": 1.6745089292526245, + "learning_rate": 1.9313680997767213e-05, + "loss": 0.5464, + "step": 12743 + }, + { + "epoch": 2.0803232521121586, + "grad_norm": 1.7077293395996094, + "learning_rate": 1.9313565458969493e-05, + "loss": 0.6145, + "step": 12744 + }, + { + "epoch": 2.080486510754663, + "grad_norm": 1.7200086116790771, + "learning_rate": 1.9313449910793015e-05, + "loss": 0.5089, + "step": 12745 + }, + { + "epoch": 2.0806497693971675, + "grad_norm": 2.014143705368042, + "learning_rate": 1.93133343532379e-05, + "loss": 0.6371, + "step": 12746 + }, + { + "epoch": 2.080813028039672, + "grad_norm": 1.9303611516952515, + "learning_rate": 1.9313218786304255e-05, + "loss": 0.6676, + "step": 12747 + }, + { + "epoch": 2.0809762866821764, + "grad_norm": 1.9105255603790283, + "learning_rate": 1.9313103209992205e-05, + "loss": 0.5546, + "step": 12748 + }, + { + "epoch": 2.081139545324681, + "grad_norm": 1.661882996559143, + "learning_rate": 1.931298762430186e-05, + "loss": 0.4664, + "step": 12749 + }, + { + "epoch": 2.081302803967185, + "grad_norm": 1.94900381565094, + "learning_rate": 1.931287202923334e-05, + "loss": 0.6287, + "step": 12750 + }, + { + "epoch": 2.0814660626096892, + "grad_norm": 1.7645976543426514, + "learning_rate": 1.9312756424786758e-05, + "loss": 0.616, + "step": 12751 + }, + { + "epoch": 2.0816293212521937, + "grad_norm": 1.8442646265029907, + "learning_rate": 1.9312640810962237e-05, + "loss": 0.5621, + "step": 12752 + }, + { + "epoch": 2.081792579894698, + "grad_norm": 1.9327770471572876, + "learning_rate": 1.9312525187759886e-05, + "loss": 0.5189, + "step": 12753 + }, + { + "epoch": 2.0819558385372026, + "grad_norm": 1.5647932291030884, + "learning_rate": 1.9312409555179827e-05, + "loss": 0.5038, + "step": 12754 + }, + { + "epoch": 2.082119097179707, + "grad_norm": 2.269789218902588, + "learning_rate": 1.931229391322217e-05, + "loss": 0.6673, + "step": 12755 + }, + { + "epoch": 2.0822823558222114, + "grad_norm": 1.7380080223083496, + "learning_rate": 1.9312178261887037e-05, + "loss": 0.6179, + "step": 12756 + }, + { + "epoch": 2.082445614464716, + "grad_norm": 1.6851855516433716, + "learning_rate": 1.9312062601174543e-05, + "loss": 0.5069, + "step": 12757 + }, + { + "epoch": 2.0826088731072203, + "grad_norm": 1.553062915802002, + "learning_rate": 1.9311946931084806e-05, + "loss": 0.5146, + "step": 12758 + }, + { + "epoch": 2.0827721317497243, + "grad_norm": 1.7488071918487549, + "learning_rate": 1.9311831251617942e-05, + "loss": 0.6017, + "step": 12759 + }, + { + "epoch": 2.0829353903922287, + "grad_norm": 2.0195295810699463, + "learning_rate": 1.931171556277406e-05, + "loss": 0.5826, + "step": 12760 + }, + { + "epoch": 2.083098649034733, + "grad_norm": 1.4833500385284424, + "learning_rate": 1.9311599864553292e-05, + "loss": 0.4804, + "step": 12761 + }, + { + "epoch": 2.0832619076772376, + "grad_norm": 1.6721388101577759, + "learning_rate": 1.931148415695574e-05, + "loss": 0.5314, + "step": 12762 + }, + { + "epoch": 2.083425166319742, + "grad_norm": 1.6811213493347168, + "learning_rate": 1.9311368439981526e-05, + "loss": 0.5465, + "step": 12763 + }, + { + "epoch": 2.0835884249622465, + "grad_norm": 2.176949977874756, + "learning_rate": 1.931125271363077e-05, + "loss": 0.6339, + "step": 12764 + }, + { + "epoch": 2.083751683604751, + "grad_norm": 1.8629239797592163, + "learning_rate": 1.931113697790358e-05, + "loss": 0.596, + "step": 12765 + }, + { + "epoch": 2.0839149422472554, + "grad_norm": 1.9455536603927612, + "learning_rate": 1.931102123280008e-05, + "loss": 0.78, + "step": 12766 + }, + { + "epoch": 2.08407820088976, + "grad_norm": 1.7788071632385254, + "learning_rate": 1.931090547832038e-05, + "loss": 0.555, + "step": 12767 + }, + { + "epoch": 2.084241459532264, + "grad_norm": 1.8842626810073853, + "learning_rate": 1.9310789714464605e-05, + "loss": 0.6504, + "step": 12768 + }, + { + "epoch": 2.0844047181747682, + "grad_norm": 1.8657252788543701, + "learning_rate": 1.9310673941232868e-05, + "loss": 0.4955, + "step": 12769 + }, + { + "epoch": 2.0845679768172727, + "grad_norm": 1.6530393362045288, + "learning_rate": 1.9310558158625286e-05, + "loss": 0.4669, + "step": 12770 + }, + { + "epoch": 2.084731235459777, + "grad_norm": 2.150618553161621, + "learning_rate": 1.931044236664197e-05, + "loss": 0.5823, + "step": 12771 + }, + { + "epoch": 2.0848944941022816, + "grad_norm": 2.16544771194458, + "learning_rate": 1.9310326565283045e-05, + "loss": 0.7343, + "step": 12772 + }, + { + "epoch": 2.085057752744786, + "grad_norm": 1.8436336517333984, + "learning_rate": 1.931021075454862e-05, + "loss": 0.6213, + "step": 12773 + }, + { + "epoch": 2.0852210113872904, + "grad_norm": 2.0119311809539795, + "learning_rate": 1.9310094934438816e-05, + "loss": 0.7431, + "step": 12774 + }, + { + "epoch": 2.085384270029795, + "grad_norm": 1.5817744731903076, + "learning_rate": 1.9309979104953747e-05, + "loss": 0.4391, + "step": 12775 + }, + { + "epoch": 2.085547528672299, + "grad_norm": 1.7501106262207031, + "learning_rate": 1.930986326609354e-05, + "loss": 0.5823, + "step": 12776 + }, + { + "epoch": 2.0857107873148033, + "grad_norm": 1.8551634550094604, + "learning_rate": 1.9309747417858295e-05, + "loss": 0.6388, + "step": 12777 + }, + { + "epoch": 2.0858740459573077, + "grad_norm": 2.226353883743286, + "learning_rate": 1.9309631560248137e-05, + "loss": 0.6789, + "step": 12778 + }, + { + "epoch": 2.086037304599812, + "grad_norm": 2.0928761959075928, + "learning_rate": 1.9309515693263185e-05, + "loss": 0.6382, + "step": 12779 + }, + { + "epoch": 2.0862005632423166, + "grad_norm": 1.9373363256454468, + "learning_rate": 1.9309399816903554e-05, + "loss": 0.5996, + "step": 12780 + }, + { + "epoch": 2.086363821884821, + "grad_norm": 1.7545485496520996, + "learning_rate": 1.930928393116936e-05, + "loss": 0.4969, + "step": 12781 + }, + { + "epoch": 2.0865270805273255, + "grad_norm": 1.7791355848312378, + "learning_rate": 1.9309168036060717e-05, + "loss": 0.5554, + "step": 12782 + }, + { + "epoch": 2.08669033916983, + "grad_norm": 1.6384872198104858, + "learning_rate": 1.9309052131577746e-05, + "loss": 0.4806, + "step": 12783 + }, + { + "epoch": 2.0868535978123344, + "grad_norm": 1.8091706037521362, + "learning_rate": 1.930893621772056e-05, + "loss": 0.5309, + "step": 12784 + }, + { + "epoch": 2.0870168564548384, + "grad_norm": 1.9727658033370972, + "learning_rate": 1.9308820294489278e-05, + "loss": 0.5911, + "step": 12785 + }, + { + "epoch": 2.087180115097343, + "grad_norm": 1.785212755203247, + "learning_rate": 1.930870436188402e-05, + "loss": 0.5195, + "step": 12786 + }, + { + "epoch": 2.0873433737398472, + "grad_norm": 1.5724595785140991, + "learning_rate": 1.9308588419904895e-05, + "loss": 0.5063, + "step": 12787 + }, + { + "epoch": 2.0875066323823517, + "grad_norm": 1.9323153495788574, + "learning_rate": 1.9308472468552026e-05, + "loss": 0.6168, + "step": 12788 + }, + { + "epoch": 2.087669891024856, + "grad_norm": 1.6479941606521606, + "learning_rate": 1.930835650782553e-05, + "loss": 0.5812, + "step": 12789 + }, + { + "epoch": 2.0878331496673606, + "grad_norm": 1.8004562854766846, + "learning_rate": 1.930824053772552e-05, + "loss": 0.6025, + "step": 12790 + }, + { + "epoch": 2.087996408309865, + "grad_norm": 1.6458659172058105, + "learning_rate": 1.9308124558252112e-05, + "loss": 0.4834, + "step": 12791 + }, + { + "epoch": 2.0881596669523694, + "grad_norm": 2.229768753051758, + "learning_rate": 1.9308008569405424e-05, + "loss": 0.7925, + "step": 12792 + }, + { + "epoch": 2.088322925594874, + "grad_norm": 1.835206389427185, + "learning_rate": 1.9307892571185575e-05, + "loss": 0.5327, + "step": 12793 + }, + { + "epoch": 2.088486184237378, + "grad_norm": 1.8722730875015259, + "learning_rate": 1.9307776563592683e-05, + "loss": 0.5966, + "step": 12794 + }, + { + "epoch": 2.0886494428798823, + "grad_norm": 1.6240946054458618, + "learning_rate": 1.930766054662686e-05, + "loss": 0.5512, + "step": 12795 + }, + { + "epoch": 2.0888127015223867, + "grad_norm": 1.616065502166748, + "learning_rate": 1.9307544520288227e-05, + "loss": 0.5672, + "step": 12796 + }, + { + "epoch": 2.088975960164891, + "grad_norm": 1.912476658821106, + "learning_rate": 1.93074284845769e-05, + "loss": 0.6024, + "step": 12797 + }, + { + "epoch": 2.0891392188073956, + "grad_norm": 1.7547523975372314, + "learning_rate": 1.9307312439492992e-05, + "loss": 0.6347, + "step": 12798 + }, + { + "epoch": 2.0893024774499, + "grad_norm": 1.6140174865722656, + "learning_rate": 1.9307196385036623e-05, + "loss": 0.6588, + "step": 12799 + }, + { + "epoch": 2.0894657360924045, + "grad_norm": 2.0069427490234375, + "learning_rate": 1.9307080321207913e-05, + "loss": 0.5967, + "step": 12800 + }, + { + "epoch": 2.089628994734909, + "grad_norm": 1.824831485748291, + "learning_rate": 1.9306964248006973e-05, + "loss": 0.5794, + "step": 12801 + }, + { + "epoch": 2.0897922533774134, + "grad_norm": 1.7463593482971191, + "learning_rate": 1.9306848165433924e-05, + "loss": 0.5485, + "step": 12802 + }, + { + "epoch": 2.0899555120199174, + "grad_norm": 1.568799376487732, + "learning_rate": 1.930673207348888e-05, + "loss": 0.5203, + "step": 12803 + }, + { + "epoch": 2.090118770662422, + "grad_norm": 1.9064339399337769, + "learning_rate": 1.9306615972171962e-05, + "loss": 0.6937, + "step": 12804 + }, + { + "epoch": 2.0902820293049262, + "grad_norm": 1.4719138145446777, + "learning_rate": 1.9306499861483278e-05, + "loss": 0.549, + "step": 12805 + }, + { + "epoch": 2.0904452879474307, + "grad_norm": 1.8527660369873047, + "learning_rate": 1.9306383741422957e-05, + "loss": 0.6776, + "step": 12806 + }, + { + "epoch": 2.090608546589935, + "grad_norm": 1.8649863004684448, + "learning_rate": 1.9306267611991108e-05, + "loss": 0.5637, + "step": 12807 + }, + { + "epoch": 2.0907718052324396, + "grad_norm": 1.808889627456665, + "learning_rate": 1.930615147318785e-05, + "loss": 0.6557, + "step": 12808 + }, + { + "epoch": 2.090935063874944, + "grad_norm": 1.9260441064834595, + "learning_rate": 1.9306035325013298e-05, + "loss": 0.5877, + "step": 12809 + }, + { + "epoch": 2.0910983225174484, + "grad_norm": 1.9437551498413086, + "learning_rate": 1.9305919167467575e-05, + "loss": 0.6026, + "step": 12810 + }, + { + "epoch": 2.091261581159953, + "grad_norm": 1.8337372541427612, + "learning_rate": 1.930580300055079e-05, + "loss": 0.6183, + "step": 12811 + }, + { + "epoch": 2.091424839802457, + "grad_norm": 2.004124879837036, + "learning_rate": 1.9305686824263067e-05, + "loss": 0.6373, + "step": 12812 + }, + { + "epoch": 2.0915880984449613, + "grad_norm": 1.938170313835144, + "learning_rate": 1.9305570638604517e-05, + "loss": 0.6469, + "step": 12813 + }, + { + "epoch": 2.0917513570874657, + "grad_norm": 1.8495286703109741, + "learning_rate": 1.930545444357526e-05, + "loss": 0.6266, + "step": 12814 + }, + { + "epoch": 2.09191461572997, + "grad_norm": 1.817981481552124, + "learning_rate": 1.9305338239175416e-05, + "loss": 0.581, + "step": 12815 + }, + { + "epoch": 2.0920778743724746, + "grad_norm": 1.730657935142517, + "learning_rate": 1.9305222025405096e-05, + "loss": 0.529, + "step": 12816 + }, + { + "epoch": 2.092241133014979, + "grad_norm": 1.6864612102508545, + "learning_rate": 1.930510580226442e-05, + "loss": 0.6214, + "step": 12817 + }, + { + "epoch": 2.0924043916574835, + "grad_norm": 1.9470428228378296, + "learning_rate": 1.9304989569753506e-05, + "loss": 0.6677, + "step": 12818 + }, + { + "epoch": 2.092567650299988, + "grad_norm": 2.1851179599761963, + "learning_rate": 1.930487332787247e-05, + "loss": 0.6372, + "step": 12819 + }, + { + "epoch": 2.0927309089424924, + "grad_norm": 1.6719392538070679, + "learning_rate": 1.930475707662143e-05, + "loss": 0.5754, + "step": 12820 + }, + { + "epoch": 2.0928941675849964, + "grad_norm": 2.1996285915374756, + "learning_rate": 1.93046408160005e-05, + "loss": 0.6308, + "step": 12821 + }, + { + "epoch": 2.093057426227501, + "grad_norm": 1.7618937492370605, + "learning_rate": 1.93045245460098e-05, + "loss": 0.6064, + "step": 12822 + }, + { + "epoch": 2.0932206848700052, + "grad_norm": 1.930561900138855, + "learning_rate": 1.9304408266649444e-05, + "loss": 0.5459, + "step": 12823 + }, + { + "epoch": 2.0933839435125097, + "grad_norm": 1.7970324754714966, + "learning_rate": 1.9304291977919554e-05, + "loss": 0.6112, + "step": 12824 + }, + { + "epoch": 2.093547202155014, + "grad_norm": 1.4806278944015503, + "learning_rate": 1.9304175679820247e-05, + "loss": 0.5414, + "step": 12825 + }, + { + "epoch": 2.0937104607975185, + "grad_norm": 1.8206032514572144, + "learning_rate": 1.9304059372351633e-05, + "loss": 0.6972, + "step": 12826 + }, + { + "epoch": 2.093873719440023, + "grad_norm": 1.4162654876708984, + "learning_rate": 1.9303943055513836e-05, + "loss": 0.4925, + "step": 12827 + }, + { + "epoch": 2.0940369780825274, + "grad_norm": 1.4881614446640015, + "learning_rate": 1.9303826729306973e-05, + "loss": 0.4947, + "step": 12828 + }, + { + "epoch": 2.0942002367250314, + "grad_norm": 1.645896077156067, + "learning_rate": 1.9303710393731153e-05, + "loss": 0.571, + "step": 12829 + }, + { + "epoch": 2.094363495367536, + "grad_norm": 2.346682071685791, + "learning_rate": 1.9303594048786505e-05, + "loss": 0.6869, + "step": 12830 + }, + { + "epoch": 2.0945267540100403, + "grad_norm": 1.619586706161499, + "learning_rate": 1.9303477694473135e-05, + "loss": 0.5221, + "step": 12831 + }, + { + "epoch": 2.0946900126525447, + "grad_norm": 1.8854387998580933, + "learning_rate": 1.930336133079117e-05, + "loss": 0.5595, + "step": 12832 + }, + { + "epoch": 2.094853271295049, + "grad_norm": 1.6953614950180054, + "learning_rate": 1.9303244957740718e-05, + "loss": 1.0982, + "step": 12833 + }, + { + "epoch": 2.0950165299375536, + "grad_norm": 1.6432044506072998, + "learning_rate": 1.9303128575321906e-05, + "loss": 0.5428, + "step": 12834 + }, + { + "epoch": 2.095179788580058, + "grad_norm": 1.9454517364501953, + "learning_rate": 1.9303012183534842e-05, + "loss": 0.5796, + "step": 12835 + }, + { + "epoch": 2.0953430472225625, + "grad_norm": 1.6888647079467773, + "learning_rate": 1.9302895782379648e-05, + "loss": 0.5245, + "step": 12836 + }, + { + "epoch": 2.095506305865067, + "grad_norm": 2.2397663593292236, + "learning_rate": 1.9302779371856443e-05, + "loss": 0.719, + "step": 12837 + }, + { + "epoch": 2.095669564507571, + "grad_norm": 2.0789718627929688, + "learning_rate": 1.9302662951965337e-05, + "loss": 0.7743, + "step": 12838 + }, + { + "epoch": 2.0958328231500754, + "grad_norm": 1.7546110153198242, + "learning_rate": 1.9302546522706454e-05, + "loss": 0.5634, + "step": 12839 + }, + { + "epoch": 2.09599608179258, + "grad_norm": 1.5024144649505615, + "learning_rate": 1.930243008407991e-05, + "loss": 0.4643, + "step": 12840 + }, + { + "epoch": 2.0961593404350842, + "grad_norm": 1.6136242151260376, + "learning_rate": 1.930231363608582e-05, + "loss": 0.5467, + "step": 12841 + }, + { + "epoch": 2.0963225990775887, + "grad_norm": 1.9911699295043945, + "learning_rate": 1.9302197178724306e-05, + "loss": 0.6726, + "step": 12842 + }, + { + "epoch": 2.096485857720093, + "grad_norm": 1.9862892627716064, + "learning_rate": 1.9302080711995477e-05, + "loss": 0.7915, + "step": 12843 + }, + { + "epoch": 2.0966491163625975, + "grad_norm": 1.5588868856430054, + "learning_rate": 1.930196423589946e-05, + "loss": 0.4626, + "step": 12844 + }, + { + "epoch": 2.096812375005102, + "grad_norm": 1.8360767364501953, + "learning_rate": 1.9301847750436362e-05, + "loss": 0.4991, + "step": 12845 + }, + { + "epoch": 2.0969756336476064, + "grad_norm": 1.7421984672546387, + "learning_rate": 1.930173125560631e-05, + "loss": 0.6701, + "step": 12846 + }, + { + "epoch": 2.0971388922901104, + "grad_norm": 1.665395736694336, + "learning_rate": 1.9301614751409416e-05, + "loss": 0.546, + "step": 12847 + }, + { + "epoch": 2.097302150932615, + "grad_norm": 1.9602199792861938, + "learning_rate": 1.93014982378458e-05, + "loss": 0.578, + "step": 12848 + }, + { + "epoch": 2.0974654095751193, + "grad_norm": 1.969148874282837, + "learning_rate": 1.9301381714915573e-05, + "loss": 0.6749, + "step": 12849 + }, + { + "epoch": 2.0976286682176237, + "grad_norm": 1.9804785251617432, + "learning_rate": 1.9301265182618862e-05, + "loss": 0.5903, + "step": 12850 + }, + { + "epoch": 2.097791926860128, + "grad_norm": 1.754127025604248, + "learning_rate": 1.930114864095578e-05, + "loss": 0.4807, + "step": 12851 + }, + { + "epoch": 2.0979551855026326, + "grad_norm": 2.0960655212402344, + "learning_rate": 1.930103208992644e-05, + "loss": 0.6274, + "step": 12852 + }, + { + "epoch": 2.098118444145137, + "grad_norm": 1.4841028451919556, + "learning_rate": 1.9300915529530963e-05, + "loss": 0.5314, + "step": 12853 + }, + { + "epoch": 2.0982817027876415, + "grad_norm": 1.9858944416046143, + "learning_rate": 1.930079895976947e-05, + "loss": 0.6127, + "step": 12854 + }, + { + "epoch": 2.098444961430146, + "grad_norm": 2.44119930267334, + "learning_rate": 1.9300682380642072e-05, + "loss": 0.6278, + "step": 12855 + }, + { + "epoch": 2.09860822007265, + "grad_norm": 1.89239501953125, + "learning_rate": 1.9300565792148892e-05, + "loss": 0.5974, + "step": 12856 + }, + { + "epoch": 2.0987714787151543, + "grad_norm": 1.5920915603637695, + "learning_rate": 1.9300449194290042e-05, + "loss": 0.506, + "step": 12857 + }, + { + "epoch": 2.098934737357659, + "grad_norm": 1.789681315422058, + "learning_rate": 1.9300332587065644e-05, + "loss": 0.5882, + "step": 12858 + }, + { + "epoch": 2.0990979960001632, + "grad_norm": 2.000105857849121, + "learning_rate": 1.9300215970475812e-05, + "loss": 0.5771, + "step": 12859 + }, + { + "epoch": 2.0992612546426677, + "grad_norm": 1.5265791416168213, + "learning_rate": 1.9300099344520665e-05, + "loss": 0.5304, + "step": 12860 + }, + { + "epoch": 2.099424513285172, + "grad_norm": 2.1608119010925293, + "learning_rate": 1.9299982709200323e-05, + "loss": 0.697, + "step": 12861 + }, + { + "epoch": 2.0995877719276765, + "grad_norm": 1.6674336194992065, + "learning_rate": 1.9299866064514896e-05, + "loss": 0.5152, + "step": 12862 + }, + { + "epoch": 2.099751030570181, + "grad_norm": 2.232393264770508, + "learning_rate": 1.929974941046451e-05, + "loss": 0.6372, + "step": 12863 + }, + { + "epoch": 2.099914289212685, + "grad_norm": 1.7510950565338135, + "learning_rate": 1.9299632747049278e-05, + "loss": 0.6015, + "step": 12864 + }, + { + "epoch": 2.1000775478551894, + "grad_norm": 1.6037365198135376, + "learning_rate": 1.929951607426932e-05, + "loss": 0.5706, + "step": 12865 + }, + { + "epoch": 2.100240806497694, + "grad_norm": 1.9804211854934692, + "learning_rate": 1.929939939212475e-05, + "loss": 0.6736, + "step": 12866 + }, + { + "epoch": 2.1004040651401983, + "grad_norm": 2.003833770751953, + "learning_rate": 1.9299282700615687e-05, + "loss": 0.6562, + "step": 12867 + }, + { + "epoch": 2.1005673237827027, + "grad_norm": 2.0173146724700928, + "learning_rate": 1.9299165999742248e-05, + "loss": 0.6591, + "step": 12868 + }, + { + "epoch": 2.100730582425207, + "grad_norm": 1.878605842590332, + "learning_rate": 1.9299049289504555e-05, + "loss": 0.5936, + "step": 12869 + }, + { + "epoch": 2.1008938410677116, + "grad_norm": 1.564061164855957, + "learning_rate": 1.9298932569902717e-05, + "loss": 0.6235, + "step": 12870 + }, + { + "epoch": 2.101057099710216, + "grad_norm": 1.8321222066879272, + "learning_rate": 1.929881584093686e-05, + "loss": 0.5163, + "step": 12871 + }, + { + "epoch": 2.1012203583527205, + "grad_norm": 1.5386427640914917, + "learning_rate": 1.9298699102607097e-05, + "loss": 0.4882, + "step": 12872 + }, + { + "epoch": 2.101383616995225, + "grad_norm": 1.909063458442688, + "learning_rate": 1.9298582354913543e-05, + "loss": 0.5134, + "step": 12873 + }, + { + "epoch": 2.101546875637729, + "grad_norm": 1.92169189453125, + "learning_rate": 1.9298465597856328e-05, + "loss": 0.5799, + "step": 12874 + }, + { + "epoch": 2.1017101342802333, + "grad_norm": 2.3973891735076904, + "learning_rate": 1.9298348831435553e-05, + "loss": 0.504, + "step": 12875 + }, + { + "epoch": 2.101873392922738, + "grad_norm": 1.7647136449813843, + "learning_rate": 1.9298232055651344e-05, + "loss": 0.5765, + "step": 12876 + }, + { + "epoch": 2.102036651565242, + "grad_norm": 2.097569227218628, + "learning_rate": 1.929811527050382e-05, + "loss": 0.5802, + "step": 12877 + }, + { + "epoch": 2.1021999102077467, + "grad_norm": 1.5814440250396729, + "learning_rate": 1.9297998475993094e-05, + "loss": 0.5572, + "step": 12878 + }, + { + "epoch": 2.102363168850251, + "grad_norm": 1.9188121557235718, + "learning_rate": 1.9297881672119287e-05, + "loss": 0.6783, + "step": 12879 + }, + { + "epoch": 2.1025264274927555, + "grad_norm": 1.8461754322052002, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.6291, + "step": 12880 + }, + { + "epoch": 2.10268968613526, + "grad_norm": 1.695420742034912, + "learning_rate": 1.9297648036282898e-05, + "loss": 0.5469, + "step": 12881 + }, + { + "epoch": 2.102852944777764, + "grad_norm": 1.5807846784591675, + "learning_rate": 1.929753120432055e-05, + "loss": 0.5666, + "step": 12882 + }, + { + "epoch": 2.1030162034202684, + "grad_norm": 2.0511667728424072, + "learning_rate": 1.929741436299559e-05, + "loss": 0.7042, + "step": 12883 + }, + { + "epoch": 2.103179462062773, + "grad_norm": 1.9001705646514893, + "learning_rate": 1.929729751230814e-05, + "loss": 0.5536, + "step": 12884 + }, + { + "epoch": 2.1033427207052773, + "grad_norm": 1.7495828866958618, + "learning_rate": 1.929718065225831e-05, + "loss": 0.5722, + "step": 12885 + }, + { + "epoch": 2.1035059793477817, + "grad_norm": 2.039482831954956, + "learning_rate": 1.9297063782846224e-05, + "loss": 0.6866, + "step": 12886 + }, + { + "epoch": 2.103669237990286, + "grad_norm": 1.6972050666809082, + "learning_rate": 1.9296946904071998e-05, + "loss": 0.545, + "step": 12887 + }, + { + "epoch": 2.1038324966327906, + "grad_norm": 1.448533535003662, + "learning_rate": 1.9296830015935746e-05, + "loss": 0.5019, + "step": 12888 + }, + { + "epoch": 2.103995755275295, + "grad_norm": 1.7679989337921143, + "learning_rate": 1.9296713118437588e-05, + "loss": 0.6086, + "step": 12889 + }, + { + "epoch": 2.1041590139177995, + "grad_norm": 1.6184943914413452, + "learning_rate": 1.9296596211577646e-05, + "loss": 0.5443, + "step": 12890 + }, + { + "epoch": 2.1043222725603035, + "grad_norm": 1.554587483406067, + "learning_rate": 1.9296479295356035e-05, + "loss": 0.505, + "step": 12891 + }, + { + "epoch": 2.104485531202808, + "grad_norm": 1.683470368385315, + "learning_rate": 1.9296362369772867e-05, + "loss": 0.5609, + "step": 12892 + }, + { + "epoch": 2.1046487898453123, + "grad_norm": 1.8868921995162964, + "learning_rate": 1.9296245434828266e-05, + "loss": 0.5713, + "step": 12893 + }, + { + "epoch": 2.104812048487817, + "grad_norm": 1.7698732614517212, + "learning_rate": 1.929612849052235e-05, + "loss": 0.5312, + "step": 12894 + }, + { + "epoch": 2.104975307130321, + "grad_norm": 1.8295979499816895, + "learning_rate": 1.9296011536855235e-05, + "loss": 0.5849, + "step": 12895 + }, + { + "epoch": 2.1051385657728257, + "grad_norm": 2.1352109909057617, + "learning_rate": 1.929589457382704e-05, + "loss": 0.6152, + "step": 12896 + }, + { + "epoch": 2.10530182441533, + "grad_norm": 2.173583507537842, + "learning_rate": 1.9295777601437878e-05, + "loss": 0.6982, + "step": 12897 + }, + { + "epoch": 2.1054650830578345, + "grad_norm": 1.594587802886963, + "learning_rate": 1.929566061968787e-05, + "loss": 0.6216, + "step": 12898 + }, + { + "epoch": 2.105628341700339, + "grad_norm": 1.94664466381073, + "learning_rate": 1.929554362857714e-05, + "loss": 0.6313, + "step": 12899 + }, + { + "epoch": 2.105791600342843, + "grad_norm": 1.4956679344177246, + "learning_rate": 1.9295426628105792e-05, + "loss": 0.4595, + "step": 12900 + }, + { + "epoch": 2.1059548589853474, + "grad_norm": 1.496766448020935, + "learning_rate": 1.929530961827396e-05, + "loss": 0.5557, + "step": 12901 + }, + { + "epoch": 2.106118117627852, + "grad_norm": 2.3608617782592773, + "learning_rate": 1.9295192599081747e-05, + "loss": 0.6138, + "step": 12902 + }, + { + "epoch": 2.1062813762703563, + "grad_norm": 1.79420804977417, + "learning_rate": 1.929507557052928e-05, + "loss": 0.6272, + "step": 12903 + }, + { + "epoch": 2.1064446349128607, + "grad_norm": 1.6578106880187988, + "learning_rate": 1.9294958532616675e-05, + "loss": 0.4415, + "step": 12904 + }, + { + "epoch": 2.106607893555365, + "grad_norm": 1.9367291927337646, + "learning_rate": 1.929484148534405e-05, + "loss": 0.6192, + "step": 12905 + }, + { + "epoch": 2.1067711521978696, + "grad_norm": 1.8938994407653809, + "learning_rate": 1.929472442871152e-05, + "loss": 0.5565, + "step": 12906 + }, + { + "epoch": 2.106934410840374, + "grad_norm": 1.9660028219223022, + "learning_rate": 1.9294607362719206e-05, + "loss": 0.6419, + "step": 12907 + }, + { + "epoch": 2.1070976694828785, + "grad_norm": 1.880381464958191, + "learning_rate": 1.9294490287367226e-05, + "loss": 0.5692, + "step": 12908 + }, + { + "epoch": 2.1072609281253825, + "grad_norm": 1.6451698541641235, + "learning_rate": 1.9294373202655694e-05, + "loss": 0.5156, + "step": 12909 + }, + { + "epoch": 2.107424186767887, + "grad_norm": 1.4751999378204346, + "learning_rate": 1.9294256108584734e-05, + "loss": 0.4374, + "step": 12910 + }, + { + "epoch": 2.1075874454103913, + "grad_norm": 2.071787118911743, + "learning_rate": 1.929413900515446e-05, + "loss": 0.6966, + "step": 12911 + }, + { + "epoch": 2.1077507040528958, + "grad_norm": 1.7881094217300415, + "learning_rate": 1.9294021892364988e-05, + "loss": 0.6526, + "step": 12912 + }, + { + "epoch": 2.1079139626954, + "grad_norm": 2.0441176891326904, + "learning_rate": 1.929390477021644e-05, + "loss": 0.5501, + "step": 12913 + }, + { + "epoch": 2.1080772213379046, + "grad_norm": 1.5230073928833008, + "learning_rate": 1.929378763870893e-05, + "loss": 0.4732, + "step": 12914 + }, + { + "epoch": 2.108240479980409, + "grad_norm": 1.7061731815338135, + "learning_rate": 1.9293670497842584e-05, + "loss": 0.5125, + "step": 12915 + }, + { + "epoch": 2.1084037386229135, + "grad_norm": 2.0876922607421875, + "learning_rate": 1.9293553347617506e-05, + "loss": 0.6393, + "step": 12916 + }, + { + "epoch": 2.1085669972654175, + "grad_norm": 1.8641828298568726, + "learning_rate": 1.9293436188033826e-05, + "loss": 0.5623, + "step": 12917 + }, + { + "epoch": 2.108730255907922, + "grad_norm": 1.638543963432312, + "learning_rate": 1.929331901909166e-05, + "loss": 0.5653, + "step": 12918 + }, + { + "epoch": 2.1088935145504264, + "grad_norm": 2.119798183441162, + "learning_rate": 1.9293201840791124e-05, + "loss": 0.7147, + "step": 12919 + }, + { + "epoch": 2.109056773192931, + "grad_norm": 2.2072949409484863, + "learning_rate": 1.9293084653132336e-05, + "loss": 0.6604, + "step": 12920 + }, + { + "epoch": 2.1092200318354353, + "grad_norm": 1.8547977209091187, + "learning_rate": 1.9292967456115414e-05, + "loss": 0.5887, + "step": 12921 + }, + { + "epoch": 2.1093832904779397, + "grad_norm": 2.3774960041046143, + "learning_rate": 1.9292850249740474e-05, + "loss": 0.577, + "step": 12922 + }, + { + "epoch": 2.109546549120444, + "grad_norm": 1.7576109170913696, + "learning_rate": 1.929273303400764e-05, + "loss": 0.707, + "step": 12923 + }, + { + "epoch": 2.1097098077629486, + "grad_norm": 1.6765239238739014, + "learning_rate": 1.9292615808917027e-05, + "loss": 0.5163, + "step": 12924 + }, + { + "epoch": 2.109873066405453, + "grad_norm": 1.9253649711608887, + "learning_rate": 1.929249857446875e-05, + "loss": 0.5313, + "step": 12925 + }, + { + "epoch": 2.110036325047957, + "grad_norm": 1.8469806909561157, + "learning_rate": 1.9292381330662926e-05, + "loss": 0.6545, + "step": 12926 + }, + { + "epoch": 2.1101995836904615, + "grad_norm": 1.8150173425674438, + "learning_rate": 1.929226407749968e-05, + "loss": 0.6059, + "step": 12927 + }, + { + "epoch": 2.110362842332966, + "grad_norm": 1.2963793277740479, + "learning_rate": 1.929214681497913e-05, + "loss": 0.528, + "step": 12928 + }, + { + "epoch": 2.1105261009754703, + "grad_norm": 1.7331339120864868, + "learning_rate": 1.9292029543101385e-05, + "loss": 0.5856, + "step": 12929 + }, + { + "epoch": 2.1106893596179748, + "grad_norm": 1.7204519510269165, + "learning_rate": 1.9291912261866568e-05, + "loss": 0.6, + "step": 12930 + }, + { + "epoch": 2.110852618260479, + "grad_norm": 1.9969788789749146, + "learning_rate": 1.9291794971274802e-05, + "loss": 0.6849, + "step": 12931 + }, + { + "epoch": 2.1110158769029836, + "grad_norm": 1.696711778640747, + "learning_rate": 1.92916776713262e-05, + "loss": 0.5093, + "step": 12932 + }, + { + "epoch": 2.111179135545488, + "grad_norm": 2.358063220977783, + "learning_rate": 1.929156036202088e-05, + "loss": 0.6199, + "step": 12933 + }, + { + "epoch": 2.1113423941879925, + "grad_norm": 1.5103769302368164, + "learning_rate": 1.9291443043358963e-05, + "loss": 0.5052, + "step": 12934 + }, + { + "epoch": 2.1115056528304965, + "grad_norm": 1.847495675086975, + "learning_rate": 1.9291325715340562e-05, + "loss": 0.6394, + "step": 12935 + }, + { + "epoch": 2.111668911473001, + "grad_norm": 1.923357605934143, + "learning_rate": 1.92912083779658e-05, + "loss": 0.4998, + "step": 12936 + }, + { + "epoch": 2.1118321701155054, + "grad_norm": 2.1979894638061523, + "learning_rate": 1.9291091031234795e-05, + "loss": 0.5148, + "step": 12937 + }, + { + "epoch": 2.11199542875801, + "grad_norm": 1.7269877195358276, + "learning_rate": 1.9290973675147663e-05, + "loss": 0.6196, + "step": 12938 + }, + { + "epoch": 2.1121586874005143, + "grad_norm": 2.001535177230835, + "learning_rate": 1.929085630970452e-05, + "loss": 0.5706, + "step": 12939 + }, + { + "epoch": 2.1123219460430187, + "grad_norm": 1.6670235395431519, + "learning_rate": 1.9290738934905492e-05, + "loss": 0.5754, + "step": 12940 + }, + { + "epoch": 2.112485204685523, + "grad_norm": 1.1788421869277954, + "learning_rate": 1.929062155075069e-05, + "loss": 0.3482, + "step": 12941 + }, + { + "epoch": 2.1126484633280276, + "grad_norm": 1.6506545543670654, + "learning_rate": 1.9290504157240234e-05, + "loss": 0.561, + "step": 12942 + }, + { + "epoch": 2.112811721970532, + "grad_norm": 1.6213057041168213, + "learning_rate": 1.9290386754374247e-05, + "loss": 0.5665, + "step": 12943 + }, + { + "epoch": 2.112974980613036, + "grad_norm": 1.6599394083023071, + "learning_rate": 1.929026934215284e-05, + "loss": 0.5726, + "step": 12944 + }, + { + "epoch": 2.1131382392555405, + "grad_norm": 1.4732892513275146, + "learning_rate": 1.9290151920576132e-05, + "loss": 0.5081, + "step": 12945 + }, + { + "epoch": 2.113301497898045, + "grad_norm": 1.743784785270691, + "learning_rate": 1.9290034489644247e-05, + "loss": 0.551, + "step": 12946 + }, + { + "epoch": 2.1134647565405493, + "grad_norm": 2.028871774673462, + "learning_rate": 1.9289917049357297e-05, + "loss": 0.6447, + "step": 12947 + }, + { + "epoch": 2.1136280151830538, + "grad_norm": 1.8813812732696533, + "learning_rate": 1.9289799599715403e-05, + "loss": 0.6653, + "step": 12948 + }, + { + "epoch": 2.113791273825558, + "grad_norm": 1.7372645139694214, + "learning_rate": 1.9289682140718685e-05, + "loss": 0.5415, + "step": 12949 + }, + { + "epoch": 2.1139545324680626, + "grad_norm": 1.7192362546920776, + "learning_rate": 1.928956467236726e-05, + "loss": 0.5472, + "step": 12950 + }, + { + "epoch": 2.114117791110567, + "grad_norm": 1.5034383535385132, + "learning_rate": 1.9289447194661243e-05, + "loss": 0.5089, + "step": 12951 + }, + { + "epoch": 2.1142810497530715, + "grad_norm": 1.4364275932312012, + "learning_rate": 1.9289329707600758e-05, + "loss": 0.4967, + "step": 12952 + }, + { + "epoch": 2.1144443083955755, + "grad_norm": 1.6427183151245117, + "learning_rate": 1.9289212211185918e-05, + "loss": 0.5045, + "step": 12953 + }, + { + "epoch": 2.11460756703808, + "grad_norm": 1.8998587131500244, + "learning_rate": 1.9289094705416846e-05, + "loss": 0.5558, + "step": 12954 + }, + { + "epoch": 2.1147708256805844, + "grad_norm": 1.8749254941940308, + "learning_rate": 1.9288977190293658e-05, + "loss": 0.5944, + "step": 12955 + }, + { + "epoch": 2.114934084323089, + "grad_norm": 1.710814118385315, + "learning_rate": 1.928885966581647e-05, + "loss": 0.5232, + "step": 12956 + }, + { + "epoch": 2.1150973429655933, + "grad_norm": 1.962756872177124, + "learning_rate": 1.9288742131985408e-05, + "loss": 0.6692, + "step": 12957 + }, + { + "epoch": 2.1152606016080977, + "grad_norm": 2.016226291656494, + "learning_rate": 1.928862458880058e-05, + "loss": 0.5967, + "step": 12958 + }, + { + "epoch": 2.115423860250602, + "grad_norm": 1.709696650505066, + "learning_rate": 1.928850703626211e-05, + "loss": 0.566, + "step": 12959 + }, + { + "epoch": 2.1155871188931066, + "grad_norm": 1.9613150358200073, + "learning_rate": 1.9288389474370116e-05, + "loss": 0.6301, + "step": 12960 + }, + { + "epoch": 2.115750377535611, + "grad_norm": 1.7462822198867798, + "learning_rate": 1.928827190312472e-05, + "loss": 0.4582, + "step": 12961 + }, + { + "epoch": 2.115913636178115, + "grad_norm": 1.916809320449829, + "learning_rate": 1.9288154322526033e-05, + "loss": 0.5294, + "step": 12962 + }, + { + "epoch": 2.1160768948206194, + "grad_norm": 1.6629599332809448, + "learning_rate": 1.928803673257418e-05, + "loss": 0.5439, + "step": 12963 + }, + { + "epoch": 2.116240153463124, + "grad_norm": 2.01239275932312, + "learning_rate": 1.928791913326927e-05, + "loss": 0.5594, + "step": 12964 + }, + { + "epoch": 2.1164034121056283, + "grad_norm": 2.9051990509033203, + "learning_rate": 1.9287801524611436e-05, + "loss": 0.5719, + "step": 12965 + }, + { + "epoch": 2.1165666707481328, + "grad_norm": 2.0981526374816895, + "learning_rate": 1.928768390660078e-05, + "loss": 0.6224, + "step": 12966 + }, + { + "epoch": 2.116729929390637, + "grad_norm": 1.9037827253341675, + "learning_rate": 1.9287566279237437e-05, + "loss": 0.4912, + "step": 12967 + }, + { + "epoch": 2.1168931880331416, + "grad_norm": 1.8557428121566772, + "learning_rate": 1.9287448642521513e-05, + "loss": 0.6067, + "step": 12968 + }, + { + "epoch": 2.117056446675646, + "grad_norm": 2.0474653244018555, + "learning_rate": 1.9287330996453133e-05, + "loss": 0.5754, + "step": 12969 + }, + { + "epoch": 2.11721970531815, + "grad_norm": 1.8348212242126465, + "learning_rate": 1.928721334103241e-05, + "loss": 0.5261, + "step": 12970 + }, + { + "epoch": 2.1173829639606545, + "grad_norm": 1.9003703594207764, + "learning_rate": 1.9287095676259467e-05, + "loss": 0.5738, + "step": 12971 + }, + { + "epoch": 2.117546222603159, + "grad_norm": 1.6596264839172363, + "learning_rate": 1.928697800213442e-05, + "loss": 0.563, + "step": 12972 + }, + { + "epoch": 2.1177094812456634, + "grad_norm": 1.8570271730422974, + "learning_rate": 1.928686031865739e-05, + "loss": 0.5492, + "step": 12973 + }, + { + "epoch": 2.117872739888168, + "grad_norm": 1.8317410945892334, + "learning_rate": 1.9286742625828495e-05, + "loss": 0.4928, + "step": 12974 + }, + { + "epoch": 2.1180359985306723, + "grad_norm": 1.449820876121521, + "learning_rate": 1.928662492364785e-05, + "loss": 0.4936, + "step": 12975 + }, + { + "epoch": 2.1181992571731767, + "grad_norm": 1.746020793914795, + "learning_rate": 1.9286507212115578e-05, + "loss": 0.5303, + "step": 12976 + }, + { + "epoch": 2.118362515815681, + "grad_norm": 2.1669554710388184, + "learning_rate": 1.9286389491231796e-05, + "loss": 1.0245, + "step": 12977 + }, + { + "epoch": 2.1185257744581856, + "grad_norm": 1.7132627964019775, + "learning_rate": 1.9286271760996622e-05, + "loss": 0.5513, + "step": 12978 + }, + { + "epoch": 2.1186890331006896, + "grad_norm": 1.8609387874603271, + "learning_rate": 1.9286154021410177e-05, + "loss": 0.6054, + "step": 12979 + }, + { + "epoch": 2.118852291743194, + "grad_norm": 1.9595474004745483, + "learning_rate": 1.9286036272472572e-05, + "loss": 0.59, + "step": 12980 + }, + { + "epoch": 2.1190155503856984, + "grad_norm": 1.9528062343597412, + "learning_rate": 1.9285918514183934e-05, + "loss": 0.4918, + "step": 12981 + }, + { + "epoch": 2.119178809028203, + "grad_norm": 1.806640863418579, + "learning_rate": 1.9285800746544378e-05, + "loss": 0.512, + "step": 12982 + }, + { + "epoch": 2.1193420676707073, + "grad_norm": 1.9533607959747314, + "learning_rate": 1.9285682969554025e-05, + "loss": 0.5615, + "step": 12983 + }, + { + "epoch": 2.1195053263132118, + "grad_norm": 1.6812686920166016, + "learning_rate": 1.9285565183212987e-05, + "loss": 0.5775, + "step": 12984 + }, + { + "epoch": 2.119668584955716, + "grad_norm": 2.073803663253784, + "learning_rate": 1.9285447387521394e-05, + "loss": 0.7774, + "step": 12985 + }, + { + "epoch": 2.1198318435982206, + "grad_norm": 1.6792383193969727, + "learning_rate": 1.9285329582479353e-05, + "loss": 0.5182, + "step": 12986 + }, + { + "epoch": 2.119995102240725, + "grad_norm": 1.5999168157577515, + "learning_rate": 1.928521176808699e-05, + "loss": 0.5586, + "step": 12987 + }, + { + "epoch": 2.120158360883229, + "grad_norm": 1.8149020671844482, + "learning_rate": 1.928509394434442e-05, + "loss": 0.5401, + "step": 12988 + }, + { + "epoch": 2.1203216195257335, + "grad_norm": 1.7051281929016113, + "learning_rate": 1.928497611125176e-05, + "loss": 0.5132, + "step": 12989 + }, + { + "epoch": 2.120484878168238, + "grad_norm": 1.6804461479187012, + "learning_rate": 1.9284858268809135e-05, + "loss": 0.5076, + "step": 12990 + }, + { + "epoch": 2.1206481368107424, + "grad_norm": 1.8424662351608276, + "learning_rate": 1.9284740417016663e-05, + "loss": 0.5278, + "step": 12991 + }, + { + "epoch": 2.120811395453247, + "grad_norm": 1.6963834762573242, + "learning_rate": 1.9284622555874457e-05, + "loss": 0.6294, + "step": 12992 + }, + { + "epoch": 2.1209746540957513, + "grad_norm": 1.7007827758789062, + "learning_rate": 1.9284504685382638e-05, + "loss": 0.4885, + "step": 12993 + }, + { + "epoch": 2.1211379127382557, + "grad_norm": 1.9952865839004517, + "learning_rate": 1.9284386805541323e-05, + "loss": 0.5565, + "step": 12994 + }, + { + "epoch": 2.12130117138076, + "grad_norm": 1.8487213850021362, + "learning_rate": 1.9284268916350637e-05, + "loss": 0.6852, + "step": 12995 + }, + { + "epoch": 2.1214644300232646, + "grad_norm": 1.9019463062286377, + "learning_rate": 1.9284151017810694e-05, + "loss": 0.5997, + "step": 12996 + }, + { + "epoch": 2.1216276886657686, + "grad_norm": 1.7040404081344604, + "learning_rate": 1.928403310992161e-05, + "loss": 0.5941, + "step": 12997 + }, + { + "epoch": 2.121790947308273, + "grad_norm": 1.6581823825836182, + "learning_rate": 1.928391519268351e-05, + "loss": 0.4844, + "step": 12998 + }, + { + "epoch": 2.1219542059507774, + "grad_norm": 1.5634639263153076, + "learning_rate": 1.928379726609651e-05, + "loss": 0.5327, + "step": 12999 + }, + { + "epoch": 2.122117464593282, + "grad_norm": 1.648926019668579, + "learning_rate": 1.9283679330160726e-05, + "loss": 0.5486, + "step": 13000 + }, + { + "epoch": 2.1222807232357863, + "grad_norm": 1.9633926153182983, + "learning_rate": 1.9283561384876284e-05, + "loss": 0.6063, + "step": 13001 + }, + { + "epoch": 2.1224439818782908, + "grad_norm": 1.9149971008300781, + "learning_rate": 1.9283443430243298e-05, + "loss": 0.5935, + "step": 13002 + }, + { + "epoch": 2.122607240520795, + "grad_norm": 1.838114619255066, + "learning_rate": 1.9283325466261883e-05, + "loss": 0.5686, + "step": 13003 + }, + { + "epoch": 2.1227704991632996, + "grad_norm": 1.6921919584274292, + "learning_rate": 1.928320749293216e-05, + "loss": 0.553, + "step": 13004 + }, + { + "epoch": 2.1229337578058036, + "grad_norm": 1.5522513389587402, + "learning_rate": 1.9283089510254255e-05, + "loss": 0.4657, + "step": 13005 + }, + { + "epoch": 2.123097016448308, + "grad_norm": 1.7602649927139282, + "learning_rate": 1.9282971518228278e-05, + "loss": 0.6145, + "step": 13006 + }, + { + "epoch": 2.1232602750908125, + "grad_norm": 1.9693505764007568, + "learning_rate": 1.928285351685435e-05, + "loss": 0.5497, + "step": 13007 + }, + { + "epoch": 2.123423533733317, + "grad_norm": 1.6961129903793335, + "learning_rate": 1.9282735506132594e-05, + "loss": 0.4944, + "step": 13008 + }, + { + "epoch": 2.1235867923758214, + "grad_norm": 1.7691706418991089, + "learning_rate": 1.9282617486063125e-05, + "loss": 0.5929, + "step": 13009 + }, + { + "epoch": 2.123750051018326, + "grad_norm": 1.7694048881530762, + "learning_rate": 1.9282499456646064e-05, + "loss": 0.5965, + "step": 13010 + }, + { + "epoch": 2.1239133096608303, + "grad_norm": 1.679465651512146, + "learning_rate": 1.9282381417881528e-05, + "loss": 0.5286, + "step": 13011 + }, + { + "epoch": 2.1240765683033347, + "grad_norm": 2.418351888656616, + "learning_rate": 1.9282263369769633e-05, + "loss": 0.6385, + "step": 13012 + }, + { + "epoch": 2.124239826945839, + "grad_norm": 1.5249971151351929, + "learning_rate": 1.9282145312310507e-05, + "loss": 0.4818, + "step": 13013 + }, + { + "epoch": 2.124403085588343, + "grad_norm": 1.717676043510437, + "learning_rate": 1.9282027245504256e-05, + "loss": 0.5133, + "step": 13014 + }, + { + "epoch": 2.1245663442308476, + "grad_norm": 1.8878817558288574, + "learning_rate": 1.928190916935101e-05, + "loss": 0.5087, + "step": 13015 + }, + { + "epoch": 2.124729602873352, + "grad_norm": 1.6088862419128418, + "learning_rate": 1.9281791083850886e-05, + "loss": 0.4869, + "step": 13016 + }, + { + "epoch": 2.1248928615158564, + "grad_norm": 1.5224419832229614, + "learning_rate": 1.9281672989004e-05, + "loss": 0.5759, + "step": 13017 + }, + { + "epoch": 2.125056120158361, + "grad_norm": 1.6167081594467163, + "learning_rate": 1.928155488481047e-05, + "loss": 0.5962, + "step": 13018 + }, + { + "epoch": 2.1252193788008653, + "grad_norm": 1.6737920045852661, + "learning_rate": 1.9281436771270417e-05, + "loss": 0.5134, + "step": 13019 + }, + { + "epoch": 2.1253826374433697, + "grad_norm": 1.5267287492752075, + "learning_rate": 1.928131864838396e-05, + "loss": 0.5508, + "step": 13020 + }, + { + "epoch": 2.125545896085874, + "grad_norm": 1.6343398094177246, + "learning_rate": 1.9281200516151216e-05, + "loss": 0.5567, + "step": 13021 + }, + { + "epoch": 2.1257091547283786, + "grad_norm": 1.6703914403915405, + "learning_rate": 1.928108237457231e-05, + "loss": 0.6666, + "step": 13022 + }, + { + "epoch": 2.1258724133708826, + "grad_norm": 1.598329782485962, + "learning_rate": 1.928096422364735e-05, + "loss": 0.5472, + "step": 13023 + }, + { + "epoch": 2.126035672013387, + "grad_norm": 1.754770040512085, + "learning_rate": 1.9280846063376465e-05, + "loss": 0.6124, + "step": 13024 + }, + { + "epoch": 2.1261989306558915, + "grad_norm": 1.6449315547943115, + "learning_rate": 1.928072789375977e-05, + "loss": 0.4821, + "step": 13025 + }, + { + "epoch": 2.126362189298396, + "grad_norm": 1.987069845199585, + "learning_rate": 1.9280609714797383e-05, + "loss": 0.5595, + "step": 13026 + }, + { + "epoch": 2.1265254479409004, + "grad_norm": 1.9565759897232056, + "learning_rate": 1.928049152648943e-05, + "loss": 0.6535, + "step": 13027 + }, + { + "epoch": 2.126688706583405, + "grad_norm": 1.4868226051330566, + "learning_rate": 1.928037332883602e-05, + "loss": 0.5597, + "step": 13028 + }, + { + "epoch": 2.1268519652259092, + "grad_norm": 1.940415620803833, + "learning_rate": 1.9280255121837276e-05, + "loss": 0.5797, + "step": 13029 + }, + { + "epoch": 2.1270152238684137, + "grad_norm": 1.524004340171814, + "learning_rate": 1.9280136905493322e-05, + "loss": 0.5554, + "step": 13030 + }, + { + "epoch": 2.127178482510918, + "grad_norm": 1.7778284549713135, + "learning_rate": 1.928001867980427e-05, + "loss": 0.5625, + "step": 13031 + }, + { + "epoch": 2.127341741153422, + "grad_norm": 1.687001347541809, + "learning_rate": 1.927990044477024e-05, + "loss": 0.6283, + "step": 13032 + }, + { + "epoch": 2.1275049997959266, + "grad_norm": 2.341477155685425, + "learning_rate": 1.927978220039135e-05, + "loss": 0.6157, + "step": 13033 + }, + { + "epoch": 2.127668258438431, + "grad_norm": 2.1837122440338135, + "learning_rate": 1.927966394666773e-05, + "loss": 0.6189, + "step": 13034 + }, + { + "epoch": 2.1278315170809354, + "grad_norm": 1.8442182540893555, + "learning_rate": 1.9279545683599482e-05, + "loss": 0.4427, + "step": 13035 + }, + { + "epoch": 2.12799477572344, + "grad_norm": 1.9116489887237549, + "learning_rate": 1.927942741118674e-05, + "loss": 0.6565, + "step": 13036 + }, + { + "epoch": 2.1281580343659443, + "grad_norm": 2.3225440979003906, + "learning_rate": 1.9279309129429617e-05, + "loss": 0.8159, + "step": 13037 + }, + { + "epoch": 2.1283212930084487, + "grad_norm": 1.9097423553466797, + "learning_rate": 1.927919083832823e-05, + "loss": 0.5791, + "step": 13038 + }, + { + "epoch": 2.128484551650953, + "grad_norm": 1.9382257461547852, + "learning_rate": 1.92790725378827e-05, + "loss": 0.5696, + "step": 13039 + }, + { + "epoch": 2.128647810293457, + "grad_norm": 1.5638095140457153, + "learning_rate": 1.927895422809315e-05, + "loss": 0.5278, + "step": 13040 + }, + { + "epoch": 2.1288110689359616, + "grad_norm": 1.467095136642456, + "learning_rate": 1.9278835908959694e-05, + "loss": 0.5028, + "step": 13041 + }, + { + "epoch": 2.128974327578466, + "grad_norm": 1.5715941190719604, + "learning_rate": 1.927871758048245e-05, + "loss": 0.5404, + "step": 13042 + }, + { + "epoch": 2.1291375862209705, + "grad_norm": 1.6392290592193604, + "learning_rate": 1.927859924266154e-05, + "loss": 0.6445, + "step": 13043 + }, + { + "epoch": 2.129300844863475, + "grad_norm": 2.0634756088256836, + "learning_rate": 1.9278480895497086e-05, + "loss": 0.6739, + "step": 13044 + }, + { + "epoch": 2.1294641035059794, + "grad_norm": 1.6276482343673706, + "learning_rate": 1.92783625389892e-05, + "loss": 0.4989, + "step": 13045 + }, + { + "epoch": 2.129627362148484, + "grad_norm": 1.6199721097946167, + "learning_rate": 1.927824417313801e-05, + "loss": 0.5221, + "step": 13046 + }, + { + "epoch": 2.1297906207909882, + "grad_norm": 1.7544598579406738, + "learning_rate": 1.9278125797943626e-05, + "loss": 0.6904, + "step": 13047 + }, + { + "epoch": 2.1299538794334927, + "grad_norm": 1.6774381399154663, + "learning_rate": 1.9278007413406176e-05, + "loss": 0.5101, + "step": 13048 + }, + { + "epoch": 2.130117138075997, + "grad_norm": 1.7682301998138428, + "learning_rate": 1.9277889019525773e-05, + "loss": 0.5897, + "step": 13049 + }, + { + "epoch": 2.130280396718501, + "grad_norm": 1.5831581354141235, + "learning_rate": 1.927777061630254e-05, + "loss": 0.5085, + "step": 13050 + }, + { + "epoch": 2.1304436553610056, + "grad_norm": 2.016889810562134, + "learning_rate": 1.9277652203736593e-05, + "loss": 0.5934, + "step": 13051 + }, + { + "epoch": 2.13060691400351, + "grad_norm": 1.8319611549377441, + "learning_rate": 1.9277533781828053e-05, + "loss": 0.7135, + "step": 13052 + }, + { + "epoch": 2.1307701726460144, + "grad_norm": 1.475691795349121, + "learning_rate": 1.9277415350577037e-05, + "loss": 0.4819, + "step": 13053 + }, + { + "epoch": 2.130933431288519, + "grad_norm": 1.8789970874786377, + "learning_rate": 1.927729690998367e-05, + "loss": 0.6174, + "step": 13054 + }, + { + "epoch": 2.1310966899310233, + "grad_norm": 1.7319424152374268, + "learning_rate": 1.9277178460048065e-05, + "loss": 0.5609, + "step": 13055 + }, + { + "epoch": 2.1312599485735277, + "grad_norm": 1.6316282749176025, + "learning_rate": 1.9277060000770342e-05, + "loss": 0.4712, + "step": 13056 + }, + { + "epoch": 2.131423207216032, + "grad_norm": 1.7025171518325806, + "learning_rate": 1.9276941532150625e-05, + "loss": 0.5782, + "step": 13057 + }, + { + "epoch": 2.131586465858536, + "grad_norm": 1.869409203529358, + "learning_rate": 1.9276823054189032e-05, + "loss": 0.5458, + "step": 13058 + }, + { + "epoch": 2.1317497245010406, + "grad_norm": 1.722122073173523, + "learning_rate": 1.9276704566885676e-05, + "loss": 0.6209, + "step": 13059 + }, + { + "epoch": 2.131912983143545, + "grad_norm": 1.9541490077972412, + "learning_rate": 1.9276586070240684e-05, + "loss": 0.6257, + "step": 13060 + }, + { + "epoch": 2.1320762417860495, + "grad_norm": 1.6492030620574951, + "learning_rate": 1.9276467564254173e-05, + "loss": 0.5336, + "step": 13061 + }, + { + "epoch": 2.132239500428554, + "grad_norm": 1.7177084684371948, + "learning_rate": 1.9276349048926257e-05, + "loss": 0.5809, + "step": 13062 + }, + { + "epoch": 2.1324027590710584, + "grad_norm": 1.829740047454834, + "learning_rate": 1.9276230524257066e-05, + "loss": 0.6908, + "step": 13063 + }, + { + "epoch": 2.132566017713563, + "grad_norm": 1.8301079273223877, + "learning_rate": 1.927611199024671e-05, + "loss": 0.5622, + "step": 13064 + }, + { + "epoch": 2.1327292763560672, + "grad_norm": 1.6498534679412842, + "learning_rate": 1.9275993446895312e-05, + "loss": 0.5438, + "step": 13065 + }, + { + "epoch": 2.1328925349985717, + "grad_norm": 1.8239853382110596, + "learning_rate": 1.927587489420299e-05, + "loss": 0.5542, + "step": 13066 + }, + { + "epoch": 2.1330557936410757, + "grad_norm": 1.7599238157272339, + "learning_rate": 1.9275756332169865e-05, + "loss": 0.5621, + "step": 13067 + }, + { + "epoch": 2.13321905228358, + "grad_norm": 1.4714633226394653, + "learning_rate": 1.927563776079606e-05, + "loss": 0.5733, + "step": 13068 + }, + { + "epoch": 2.1333823109260845, + "grad_norm": 1.7328193187713623, + "learning_rate": 1.9275519180081682e-05, + "loss": 0.5929, + "step": 13069 + }, + { + "epoch": 2.133545569568589, + "grad_norm": 1.851211428642273, + "learning_rate": 1.9275400590026866e-05, + "loss": 0.5593, + "step": 13070 + }, + { + "epoch": 2.1337088282110934, + "grad_norm": 1.7401132583618164, + "learning_rate": 1.927528199063172e-05, + "loss": 0.6928, + "step": 13071 + }, + { + "epoch": 2.133872086853598, + "grad_norm": 1.8318625688552856, + "learning_rate": 1.9275163381896368e-05, + "loss": 0.6206, + "step": 13072 + }, + { + "epoch": 2.1340353454961023, + "grad_norm": 1.8565585613250732, + "learning_rate": 1.927504476382093e-05, + "loss": 0.6266, + "step": 13073 + }, + { + "epoch": 2.1341986041386067, + "grad_norm": 1.7599871158599854, + "learning_rate": 1.9274926136405524e-05, + "loss": 0.566, + "step": 13074 + }, + { + "epoch": 2.134361862781111, + "grad_norm": 1.7907156944274902, + "learning_rate": 1.927480749965027e-05, + "loss": 0.6216, + "step": 13075 + }, + { + "epoch": 2.134525121423615, + "grad_norm": 1.6799432039260864, + "learning_rate": 1.9274688853555288e-05, + "loss": 0.5685, + "step": 13076 + }, + { + "epoch": 2.1346883800661196, + "grad_norm": 1.7050100564956665, + "learning_rate": 1.9274570198120696e-05, + "loss": 0.5368, + "step": 13077 + }, + { + "epoch": 2.134851638708624, + "grad_norm": 1.5879122018814087, + "learning_rate": 1.9274451533346617e-05, + "loss": 0.5423, + "step": 13078 + }, + { + "epoch": 2.1350148973511285, + "grad_norm": 1.7706385850906372, + "learning_rate": 1.9274332859233163e-05, + "loss": 0.5228, + "step": 13079 + }, + { + "epoch": 2.135178155993633, + "grad_norm": 1.5726711750030518, + "learning_rate": 1.927421417578046e-05, + "loss": 0.518, + "step": 13080 + }, + { + "epoch": 2.1353414146361374, + "grad_norm": 1.7866709232330322, + "learning_rate": 1.9274095482988627e-05, + "loss": 0.595, + "step": 13081 + }, + { + "epoch": 2.135504673278642, + "grad_norm": 1.7645374536514282, + "learning_rate": 1.927397678085778e-05, + "loss": 0.574, + "step": 13082 + }, + { + "epoch": 2.1356679319211462, + "grad_norm": 1.7819381952285767, + "learning_rate": 1.927385806938804e-05, + "loss": 0.7238, + "step": 13083 + }, + { + "epoch": 2.1358311905636507, + "grad_norm": 1.8026423454284668, + "learning_rate": 1.927373934857953e-05, + "loss": 0.674, + "step": 13084 + }, + { + "epoch": 2.1359944492061547, + "grad_norm": 2.2905080318450928, + "learning_rate": 1.927362061843237e-05, + "loss": 0.7526, + "step": 13085 + }, + { + "epoch": 2.136157707848659, + "grad_norm": 1.7917044162750244, + "learning_rate": 1.9273501878946672e-05, + "loss": 0.6069, + "step": 13086 + }, + { + "epoch": 2.1363209664911635, + "grad_norm": 1.6695772409439087, + "learning_rate": 1.927338313012256e-05, + "loss": 0.5842, + "step": 13087 + }, + { + "epoch": 2.136484225133668, + "grad_norm": 2.1657235622406006, + "learning_rate": 1.9273264371960155e-05, + "loss": 0.7708, + "step": 13088 + }, + { + "epoch": 2.1366474837761724, + "grad_norm": 1.5874484777450562, + "learning_rate": 1.9273145604459577e-05, + "loss": 0.4867, + "step": 13089 + }, + { + "epoch": 2.136810742418677, + "grad_norm": 2.5058553218841553, + "learning_rate": 1.9273026827620942e-05, + "loss": 0.6271, + "step": 13090 + }, + { + "epoch": 2.1369740010611813, + "grad_norm": 1.798949122428894, + "learning_rate": 1.9272908041444372e-05, + "loss": 0.6136, + "step": 13091 + }, + { + "epoch": 2.1371372597036857, + "grad_norm": 1.543386697769165, + "learning_rate": 1.9272789245929985e-05, + "loss": 0.4961, + "step": 13092 + }, + { + "epoch": 2.1373005183461897, + "grad_norm": 1.8983169794082642, + "learning_rate": 1.92726704410779e-05, + "loss": 0.615, + "step": 13093 + }, + { + "epoch": 2.137463776988694, + "grad_norm": 1.5488721132278442, + "learning_rate": 1.927255162688824e-05, + "loss": 0.4531, + "step": 13094 + }, + { + "epoch": 2.1376270356311986, + "grad_norm": 1.9322967529296875, + "learning_rate": 1.9272432803361124e-05, + "loss": 0.5782, + "step": 13095 + }, + { + "epoch": 2.137790294273703, + "grad_norm": 1.7699416875839233, + "learning_rate": 1.9272313970496674e-05, + "loss": 0.5419, + "step": 13096 + }, + { + "epoch": 2.1379535529162075, + "grad_norm": 1.9889681339263916, + "learning_rate": 1.9272195128295e-05, + "loss": 0.5764, + "step": 13097 + }, + { + "epoch": 2.138116811558712, + "grad_norm": 1.914892315864563, + "learning_rate": 1.927207627675623e-05, + "loss": 0.5599, + "step": 13098 + }, + { + "epoch": 2.1382800702012164, + "grad_norm": 1.8267251253128052, + "learning_rate": 1.9271957415880482e-05, + "loss": 0.6368, + "step": 13099 + }, + { + "epoch": 2.138443328843721, + "grad_norm": 1.7069894075393677, + "learning_rate": 1.9271838545667876e-05, + "loss": 0.5844, + "step": 13100 + }, + { + "epoch": 2.1386065874862252, + "grad_norm": 2.155895709991455, + "learning_rate": 1.927171966611853e-05, + "loss": 0.64, + "step": 13101 + }, + { + "epoch": 2.1387698461287297, + "grad_norm": 1.8097240924835205, + "learning_rate": 1.9271600777232567e-05, + "loss": 0.5209, + "step": 13102 + }, + { + "epoch": 2.1389331047712337, + "grad_norm": 1.8916829824447632, + "learning_rate": 1.9271481879010103e-05, + "loss": 0.5459, + "step": 13103 + }, + { + "epoch": 2.139096363413738, + "grad_norm": 1.7086131572723389, + "learning_rate": 1.9271362971451255e-05, + "loss": 0.5533, + "step": 13104 + }, + { + "epoch": 2.1392596220562425, + "grad_norm": 1.851006031036377, + "learning_rate": 1.9271244054556152e-05, + "loss": 0.6274, + "step": 13105 + }, + { + "epoch": 2.139422880698747, + "grad_norm": 1.6000010967254639, + "learning_rate": 1.9271125128324906e-05, + "loss": 0.5284, + "step": 13106 + }, + { + "epoch": 2.1395861393412514, + "grad_norm": 1.5376876592636108, + "learning_rate": 1.9271006192757643e-05, + "loss": 0.4894, + "step": 13107 + }, + { + "epoch": 2.139749397983756, + "grad_norm": 1.7992032766342163, + "learning_rate": 1.9270887247854478e-05, + "loss": 0.5119, + "step": 13108 + }, + { + "epoch": 2.1399126566262603, + "grad_norm": 1.824082374572754, + "learning_rate": 1.927076829361553e-05, + "loss": 0.5893, + "step": 13109 + }, + { + "epoch": 2.1400759152687647, + "grad_norm": 1.5587283372879028, + "learning_rate": 1.927064933004092e-05, + "loss": 0.5201, + "step": 13110 + }, + { + "epoch": 2.1402391739112687, + "grad_norm": 1.8371176719665527, + "learning_rate": 1.927053035713077e-05, + "loss": 0.6409, + "step": 13111 + }, + { + "epoch": 2.140402432553773, + "grad_norm": 1.7079236507415771, + "learning_rate": 1.92704113748852e-05, + "loss": 0.5633, + "step": 13112 + }, + { + "epoch": 2.1405656911962776, + "grad_norm": 2.1155457496643066, + "learning_rate": 1.927029238330433e-05, + "loss": 0.6251, + "step": 13113 + }, + { + "epoch": 2.140728949838782, + "grad_norm": 1.4213157892227173, + "learning_rate": 1.9270173382388274e-05, + "loss": 0.4607, + "step": 13114 + }, + { + "epoch": 2.1408922084812865, + "grad_norm": 1.6777281761169434, + "learning_rate": 1.9270054372137154e-05, + "loss": 0.5148, + "step": 13115 + }, + { + "epoch": 2.141055467123791, + "grad_norm": 1.7544119358062744, + "learning_rate": 1.9269935352551097e-05, + "loss": 0.5974, + "step": 13116 + }, + { + "epoch": 2.1412187257662953, + "grad_norm": 1.826151728630066, + "learning_rate": 1.9269816323630215e-05, + "loss": 0.5302, + "step": 13117 + }, + { + "epoch": 2.1413819844088, + "grad_norm": 1.797020435333252, + "learning_rate": 1.926969728537463e-05, + "loss": 0.5746, + "step": 13118 + }, + { + "epoch": 2.1415452430513042, + "grad_norm": 1.9144299030303955, + "learning_rate": 1.926957823778446e-05, + "loss": 0.583, + "step": 13119 + }, + { + "epoch": 2.141708501693808, + "grad_norm": 1.6890424489974976, + "learning_rate": 1.9269459180859834e-05, + "loss": 0.5882, + "step": 13120 + }, + { + "epoch": 2.1418717603363127, + "grad_norm": 1.6111280918121338, + "learning_rate": 1.9269340114600862e-05, + "loss": 0.4042, + "step": 13121 + }, + { + "epoch": 2.142035018978817, + "grad_norm": 2.025073766708374, + "learning_rate": 1.9269221039007666e-05, + "loss": 0.6863, + "step": 13122 + }, + { + "epoch": 2.1421982776213215, + "grad_norm": 1.650171160697937, + "learning_rate": 1.9269101954080366e-05, + "loss": 0.5196, + "step": 13123 + }, + { + "epoch": 2.142361536263826, + "grad_norm": 1.6853817701339722, + "learning_rate": 1.9268982859819085e-05, + "loss": 0.5148, + "step": 13124 + }, + { + "epoch": 2.1425247949063304, + "grad_norm": 1.796764612197876, + "learning_rate": 1.9268863756223937e-05, + "loss": 0.5443, + "step": 13125 + }, + { + "epoch": 2.142688053548835, + "grad_norm": 1.546323299407959, + "learning_rate": 1.926874464329505e-05, + "loss": 0.5152, + "step": 13126 + }, + { + "epoch": 2.1428513121913393, + "grad_norm": 1.8875670433044434, + "learning_rate": 1.9268625521032536e-05, + "loss": 0.5158, + "step": 13127 + }, + { + "epoch": 2.1430145708338437, + "grad_norm": 2.0400378704071045, + "learning_rate": 1.926850638943652e-05, + "loss": 0.6719, + "step": 13128 + }, + { + "epoch": 2.1431778294763477, + "grad_norm": 1.8072713613510132, + "learning_rate": 1.926838724850712e-05, + "loss": 0.5579, + "step": 13129 + }, + { + "epoch": 2.143341088118852, + "grad_norm": 1.831193208694458, + "learning_rate": 1.926826809824446e-05, + "loss": 0.6237, + "step": 13130 + }, + { + "epoch": 2.1435043467613566, + "grad_norm": 1.7746719121932983, + "learning_rate": 1.926814893864865e-05, + "loss": 0.5449, + "step": 13131 + }, + { + "epoch": 2.143667605403861, + "grad_norm": 1.7644745111465454, + "learning_rate": 1.9268029769719824e-05, + "loss": 0.5521, + "step": 13132 + }, + { + "epoch": 2.1438308640463655, + "grad_norm": 1.879843831062317, + "learning_rate": 1.926791059145809e-05, + "loss": 0.6075, + "step": 13133 + }, + { + "epoch": 2.14399412268887, + "grad_norm": 1.881650447845459, + "learning_rate": 1.9267791403863575e-05, + "loss": 0.6647, + "step": 13134 + }, + { + "epoch": 2.1441573813313743, + "grad_norm": 1.6046339273452759, + "learning_rate": 1.9267672206936395e-05, + "loss": 0.4598, + "step": 13135 + }, + { + "epoch": 2.144320639973879, + "grad_norm": 2.1067168712615967, + "learning_rate": 1.9267553000676667e-05, + "loss": 0.6689, + "step": 13136 + }, + { + "epoch": 2.144483898616383, + "grad_norm": 1.7307701110839844, + "learning_rate": 1.9267433785084523e-05, + "loss": 0.4428, + "step": 13137 + }, + { + "epoch": 2.144647157258887, + "grad_norm": 1.6352276802062988, + "learning_rate": 1.9267314560160072e-05, + "loss": 0.5099, + "step": 13138 + }, + { + "epoch": 2.1448104159013917, + "grad_norm": 2.214813232421875, + "learning_rate": 1.926719532590344e-05, + "loss": 0.6402, + "step": 13139 + }, + { + "epoch": 2.144973674543896, + "grad_norm": 1.831155776977539, + "learning_rate": 1.926707608231474e-05, + "loss": 0.5639, + "step": 13140 + }, + { + "epoch": 2.1451369331864005, + "grad_norm": 1.5921908617019653, + "learning_rate": 1.9266956829394103e-05, + "loss": 0.5579, + "step": 13141 + }, + { + "epoch": 2.145300191828905, + "grad_norm": 2.1882338523864746, + "learning_rate": 1.9266837567141638e-05, + "loss": 0.691, + "step": 13142 + }, + { + "epoch": 2.1454634504714094, + "grad_norm": 1.9765212535858154, + "learning_rate": 1.9266718295557472e-05, + "loss": 0.6538, + "step": 13143 + }, + { + "epoch": 2.145626709113914, + "grad_norm": 1.86708402633667, + "learning_rate": 1.9266599014641724e-05, + "loss": 0.5008, + "step": 13144 + }, + { + "epoch": 2.1457899677564183, + "grad_norm": 1.6254773139953613, + "learning_rate": 1.926647972439451e-05, + "loss": 0.5784, + "step": 13145 + }, + { + "epoch": 2.1459532263989223, + "grad_norm": 1.7189005613327026, + "learning_rate": 1.9266360424815957e-05, + "loss": 0.5603, + "step": 13146 + }, + { + "epoch": 2.1461164850414267, + "grad_norm": 1.7170103788375854, + "learning_rate": 1.926624111590618e-05, + "loss": 0.5072, + "step": 13147 + }, + { + "epoch": 2.146279743683931, + "grad_norm": 1.7181285619735718, + "learning_rate": 1.92661217976653e-05, + "loss": 0.5086, + "step": 13148 + }, + { + "epoch": 2.1464430023264356, + "grad_norm": 2.139894485473633, + "learning_rate": 1.926600247009344e-05, + "loss": 0.657, + "step": 13149 + }, + { + "epoch": 2.14660626096894, + "grad_norm": 1.9918763637542725, + "learning_rate": 1.9265883133190715e-05, + "loss": 0.6001, + "step": 13150 + }, + { + "epoch": 2.1467695196114445, + "grad_norm": 1.7548608779907227, + "learning_rate": 1.926576378695725e-05, + "loss": 0.6883, + "step": 13151 + }, + { + "epoch": 2.146932778253949, + "grad_norm": 1.5560482740402222, + "learning_rate": 1.9265644431393166e-05, + "loss": 0.4754, + "step": 13152 + }, + { + "epoch": 2.1470960368964533, + "grad_norm": 1.2967911958694458, + "learning_rate": 1.9265525066498577e-05, + "loss": 0.4751, + "step": 13153 + }, + { + "epoch": 2.147259295538958, + "grad_norm": 2.20422625541687, + "learning_rate": 1.9265405692273608e-05, + "loss": 0.6701, + "step": 13154 + }, + { + "epoch": 2.1474225541814618, + "grad_norm": 1.3159680366516113, + "learning_rate": 1.9265286308718374e-05, + "loss": 0.4389, + "step": 13155 + }, + { + "epoch": 2.147585812823966, + "grad_norm": 2.35994815826416, + "learning_rate": 1.9265166915833005e-05, + "loss": 0.64, + "step": 13156 + }, + { + "epoch": 2.1477490714664706, + "grad_norm": 1.6138525009155273, + "learning_rate": 1.926504751361761e-05, + "loss": 0.4502, + "step": 13157 + }, + { + "epoch": 2.147912330108975, + "grad_norm": 1.8167921304702759, + "learning_rate": 1.9264928102072318e-05, + "loss": 0.5672, + "step": 13158 + }, + { + "epoch": 2.1480755887514795, + "grad_norm": 1.911700963973999, + "learning_rate": 1.9264808681197246e-05, + "loss": 0.5956, + "step": 13159 + }, + { + "epoch": 2.148238847393984, + "grad_norm": 1.746031403541565, + "learning_rate": 1.9264689250992514e-05, + "loss": 0.6001, + "step": 13160 + }, + { + "epoch": 2.1484021060364884, + "grad_norm": 1.4843804836273193, + "learning_rate": 1.926456981145824e-05, + "loss": 0.4442, + "step": 13161 + }, + { + "epoch": 2.148565364678993, + "grad_norm": 2.0404913425445557, + "learning_rate": 1.926445036259455e-05, + "loss": 0.5969, + "step": 13162 + }, + { + "epoch": 2.1487286233214973, + "grad_norm": 1.5024774074554443, + "learning_rate": 1.9264330904401557e-05, + "loss": 0.4421, + "step": 13163 + }, + { + "epoch": 2.1488918819640013, + "grad_norm": 1.955682635307312, + "learning_rate": 1.926421143687939e-05, + "loss": 0.596, + "step": 13164 + }, + { + "epoch": 2.1490551406065057, + "grad_norm": 1.6358774900436401, + "learning_rate": 1.926409196002816e-05, + "loss": 0.5451, + "step": 13165 + }, + { + "epoch": 2.14921839924901, + "grad_norm": 2.0252084732055664, + "learning_rate": 1.9263972473847995e-05, + "loss": 0.5922, + "step": 13166 + }, + { + "epoch": 2.1493816578915146, + "grad_norm": 1.8088293075561523, + "learning_rate": 1.926385297833901e-05, + "loss": 0.5594, + "step": 13167 + }, + { + "epoch": 2.149544916534019, + "grad_norm": 1.9169856309890747, + "learning_rate": 1.9263733473501328e-05, + "loss": 0.6319, + "step": 13168 + }, + { + "epoch": 2.1497081751765235, + "grad_norm": 2.1439688205718994, + "learning_rate": 1.926361395933507e-05, + "loss": 0.6, + "step": 13169 + }, + { + "epoch": 2.149871433819028, + "grad_norm": 2.30246639251709, + "learning_rate": 1.9263494435840355e-05, + "loss": 0.7072, + "step": 13170 + }, + { + "epoch": 2.1500346924615323, + "grad_norm": 1.481549620628357, + "learning_rate": 1.9263374903017303e-05, + "loss": 0.5321, + "step": 13171 + }, + { + "epoch": 2.1501979511040368, + "grad_norm": 1.6236931085586548, + "learning_rate": 1.9263255360866037e-05, + "loss": 0.4655, + "step": 13172 + }, + { + "epoch": 2.1503612097465408, + "grad_norm": 1.913177728652954, + "learning_rate": 1.926313580938667e-05, + "loss": 0.6358, + "step": 13173 + }, + { + "epoch": 2.150524468389045, + "grad_norm": 1.6828123331069946, + "learning_rate": 1.926301624857933e-05, + "loss": 0.5827, + "step": 13174 + }, + { + "epoch": 2.1506877270315496, + "grad_norm": 1.8531734943389893, + "learning_rate": 1.9262896678444138e-05, + "loss": 0.6032, + "step": 13175 + }, + { + "epoch": 2.150850985674054, + "grad_norm": 2.0760715007781982, + "learning_rate": 1.9262777098981212e-05, + "loss": 0.6962, + "step": 13176 + }, + { + "epoch": 2.1510142443165585, + "grad_norm": 1.9740575551986694, + "learning_rate": 1.926265751019067e-05, + "loss": 0.6351, + "step": 13177 + }, + { + "epoch": 2.151177502959063, + "grad_norm": 1.6746083498001099, + "learning_rate": 1.926253791207263e-05, + "loss": 0.6021, + "step": 13178 + }, + { + "epoch": 2.1513407616015674, + "grad_norm": 1.5780713558197021, + "learning_rate": 1.9262418304627224e-05, + "loss": 0.4793, + "step": 13179 + }, + { + "epoch": 2.151504020244072, + "grad_norm": 2.1342222690582275, + "learning_rate": 1.926229868785456e-05, + "loss": 0.7751, + "step": 13180 + }, + { + "epoch": 2.151667278886576, + "grad_norm": 1.9073501825332642, + "learning_rate": 1.9262179061754766e-05, + "loss": 0.621, + "step": 13181 + }, + { + "epoch": 2.1518305375290803, + "grad_norm": 1.872809886932373, + "learning_rate": 1.926205942632796e-05, + "loss": 0.613, + "step": 13182 + }, + { + "epoch": 2.1519937961715847, + "grad_norm": 1.8838062286376953, + "learning_rate": 1.9261939781574264e-05, + "loss": 0.5195, + "step": 13183 + }, + { + "epoch": 2.152157054814089, + "grad_norm": 1.78023099899292, + "learning_rate": 1.9261820127493794e-05, + "loss": 0.6679, + "step": 13184 + }, + { + "epoch": 2.1523203134565936, + "grad_norm": 1.7167141437530518, + "learning_rate": 1.9261700464086677e-05, + "loss": 0.6751, + "step": 13185 + }, + { + "epoch": 2.152483572099098, + "grad_norm": 2.216970205307007, + "learning_rate": 1.9261580791353024e-05, + "loss": 0.5846, + "step": 13186 + }, + { + "epoch": 2.1526468307416025, + "grad_norm": 1.689347267150879, + "learning_rate": 1.9261461109292968e-05, + "loss": 0.5116, + "step": 13187 + }, + { + "epoch": 2.152810089384107, + "grad_norm": 1.6530646085739136, + "learning_rate": 1.9261341417906622e-05, + "loss": 0.5166, + "step": 13188 + }, + { + "epoch": 2.1529733480266113, + "grad_norm": 1.799710750579834, + "learning_rate": 1.9261221717194105e-05, + "loss": 0.588, + "step": 13189 + }, + { + "epoch": 2.1531366066691158, + "grad_norm": 1.815379023551941, + "learning_rate": 1.926110200715554e-05, + "loss": 0.5929, + "step": 13190 + }, + { + "epoch": 2.1532998653116198, + "grad_norm": 2.0140457153320312, + "learning_rate": 1.9260982287791053e-05, + "loss": 0.6932, + "step": 13191 + }, + { + "epoch": 2.153463123954124, + "grad_norm": 1.8705366849899292, + "learning_rate": 1.9260862559100756e-05, + "loss": 0.642, + "step": 13192 + }, + { + "epoch": 2.1536263825966286, + "grad_norm": 1.9946731328964233, + "learning_rate": 1.926074282108477e-05, + "loss": 0.6607, + "step": 13193 + }, + { + "epoch": 2.153789641239133, + "grad_norm": 1.6843860149383545, + "learning_rate": 1.926062307374322e-05, + "loss": 0.5892, + "step": 13194 + }, + { + "epoch": 2.1539528998816375, + "grad_norm": 1.8102991580963135, + "learning_rate": 1.9260503317076228e-05, + "loss": 0.5409, + "step": 13195 + }, + { + "epoch": 2.154116158524142, + "grad_norm": 1.634926199913025, + "learning_rate": 1.926038355108391e-05, + "loss": 0.5554, + "step": 13196 + }, + { + "epoch": 2.1542794171666464, + "grad_norm": 1.5880622863769531, + "learning_rate": 1.9260263775766388e-05, + "loss": 0.5064, + "step": 13197 + }, + { + "epoch": 2.154442675809151, + "grad_norm": 1.5944206714630127, + "learning_rate": 1.926014399112378e-05, + "loss": 0.6022, + "step": 13198 + }, + { + "epoch": 2.154605934451655, + "grad_norm": 1.6530132293701172, + "learning_rate": 1.9260024197156216e-05, + "loss": 0.5597, + "step": 13199 + }, + { + "epoch": 2.1547691930941593, + "grad_norm": 1.8063910007476807, + "learning_rate": 1.9259904393863804e-05, + "loss": 0.5044, + "step": 13200 + }, + { + "epoch": 2.1549324517366637, + "grad_norm": 1.698850393295288, + "learning_rate": 1.9259784581246674e-05, + "loss": 0.5267, + "step": 13201 + }, + { + "epoch": 2.155095710379168, + "grad_norm": 1.6063040494918823, + "learning_rate": 1.925966475930494e-05, + "loss": 0.5123, + "step": 13202 + }, + { + "epoch": 2.1552589690216726, + "grad_norm": 1.8648347854614258, + "learning_rate": 1.925954492803873e-05, + "loss": 0.5763, + "step": 13203 + }, + { + "epoch": 2.155422227664177, + "grad_norm": 1.5986442565917969, + "learning_rate": 1.925942508744816e-05, + "loss": 0.5386, + "step": 13204 + }, + { + "epoch": 2.1555854863066815, + "grad_norm": 1.562756061553955, + "learning_rate": 1.9259305237533352e-05, + "loss": 0.5908, + "step": 13205 + }, + { + "epoch": 2.155748744949186, + "grad_norm": 1.5717968940734863, + "learning_rate": 1.9259185378294424e-05, + "loss": 0.4781, + "step": 13206 + }, + { + "epoch": 2.1559120035916903, + "grad_norm": 1.7932546138763428, + "learning_rate": 1.92590655097315e-05, + "loss": 0.573, + "step": 13207 + }, + { + "epoch": 2.1560752622341943, + "grad_norm": 1.8231627941131592, + "learning_rate": 1.9258945631844697e-05, + "loss": 0.5617, + "step": 13208 + }, + { + "epoch": 2.1562385208766988, + "grad_norm": 2.121133804321289, + "learning_rate": 1.925882574463414e-05, + "loss": 0.6376, + "step": 13209 + }, + { + "epoch": 2.156401779519203, + "grad_norm": 1.7184325456619263, + "learning_rate": 1.925870584809995e-05, + "loss": 0.5598, + "step": 13210 + }, + { + "epoch": 2.1565650381617076, + "grad_norm": 2.033714532852173, + "learning_rate": 1.9258585942242244e-05, + "loss": 0.6907, + "step": 13211 + }, + { + "epoch": 2.156728296804212, + "grad_norm": 1.9756031036376953, + "learning_rate": 1.9258466027061143e-05, + "loss": 0.6249, + "step": 13212 + }, + { + "epoch": 2.1568915554467165, + "grad_norm": 1.8345876932144165, + "learning_rate": 1.9258346102556768e-05, + "loss": 0.5917, + "step": 13213 + }, + { + "epoch": 2.157054814089221, + "grad_norm": 2.030219316482544, + "learning_rate": 1.9258226168729247e-05, + "loss": 0.6446, + "step": 13214 + }, + { + "epoch": 2.1572180727317254, + "grad_norm": 1.7602249383926392, + "learning_rate": 1.9258106225578688e-05, + "loss": 0.5641, + "step": 13215 + }, + { + "epoch": 2.15738133137423, + "grad_norm": 2.131706476211548, + "learning_rate": 1.9257986273105224e-05, + "loss": 0.6314, + "step": 13216 + }, + { + "epoch": 2.157544590016734, + "grad_norm": 1.7575105428695679, + "learning_rate": 1.9257866311308966e-05, + "loss": 0.5612, + "step": 13217 + }, + { + "epoch": 2.1577078486592383, + "grad_norm": 1.7343469858169556, + "learning_rate": 1.925774634019004e-05, + "loss": 0.5473, + "step": 13218 + }, + { + "epoch": 2.1578711073017427, + "grad_norm": 1.8957282304763794, + "learning_rate": 1.925762635974857e-05, + "loss": 0.572, + "step": 13219 + }, + { + "epoch": 2.158034365944247, + "grad_norm": 1.7767362594604492, + "learning_rate": 1.925750636998467e-05, + "loss": 0.5033, + "step": 13220 + }, + { + "epoch": 2.1581976245867516, + "grad_norm": 1.8607797622680664, + "learning_rate": 1.925738637089846e-05, + "loss": 0.7595, + "step": 13221 + }, + { + "epoch": 2.158360883229256, + "grad_norm": 1.633574366569519, + "learning_rate": 1.9257266362490067e-05, + "loss": 0.4543, + "step": 13222 + }, + { + "epoch": 2.1585241418717604, + "grad_norm": 1.7657698392868042, + "learning_rate": 1.925714634475961e-05, + "loss": 0.5089, + "step": 13223 + }, + { + "epoch": 2.158687400514265, + "grad_norm": 1.797600269317627, + "learning_rate": 1.925702631770721e-05, + "loss": 0.5885, + "step": 13224 + }, + { + "epoch": 2.1588506591567693, + "grad_norm": 2.0702497959136963, + "learning_rate": 1.9256906281332983e-05, + "loss": 0.5202, + "step": 13225 + }, + { + "epoch": 2.1590139177992733, + "grad_norm": 1.7848105430603027, + "learning_rate": 1.9256786235637058e-05, + "loss": 0.5817, + "step": 13226 + }, + { + "epoch": 2.1591771764417778, + "grad_norm": 1.9321907758712769, + "learning_rate": 1.925666618061955e-05, + "loss": 0.6298, + "step": 13227 + }, + { + "epoch": 2.159340435084282, + "grad_norm": 1.898276925086975, + "learning_rate": 1.925654611628058e-05, + "loss": 0.5927, + "step": 13228 + }, + { + "epoch": 2.1595036937267866, + "grad_norm": 1.6375820636749268, + "learning_rate": 1.9256426042620274e-05, + "loss": 0.4826, + "step": 13229 + }, + { + "epoch": 2.159666952369291, + "grad_norm": 1.6119370460510254, + "learning_rate": 1.9256305959638748e-05, + "loss": 0.4631, + "step": 13230 + }, + { + "epoch": 2.1598302110117955, + "grad_norm": 1.9640681743621826, + "learning_rate": 1.9256185867336123e-05, + "loss": 0.6279, + "step": 13231 + }, + { + "epoch": 2.1599934696543, + "grad_norm": 1.7074779272079468, + "learning_rate": 1.9256065765712524e-05, + "loss": 0.5932, + "step": 13232 + }, + { + "epoch": 2.1601567282968044, + "grad_norm": 1.7291450500488281, + "learning_rate": 1.9255945654768065e-05, + "loss": 0.4429, + "step": 13233 + }, + { + "epoch": 2.1603199869393084, + "grad_norm": 1.8338356018066406, + "learning_rate": 1.9255825534502873e-05, + "loss": 0.5627, + "step": 13234 + }, + { + "epoch": 2.160483245581813, + "grad_norm": 1.673629879951477, + "learning_rate": 1.925570540491707e-05, + "loss": 0.5329, + "step": 13235 + }, + { + "epoch": 2.1606465042243173, + "grad_norm": 1.596173882484436, + "learning_rate": 1.9255585266010773e-05, + "loss": 0.5372, + "step": 13236 + }, + { + "epoch": 2.1608097628668217, + "grad_norm": 1.7379825115203857, + "learning_rate": 1.9255465117784102e-05, + "loss": 0.5289, + "step": 13237 + }, + { + "epoch": 2.160973021509326, + "grad_norm": 2.1120975017547607, + "learning_rate": 1.925534496023718e-05, + "loss": 0.6401, + "step": 13238 + }, + { + "epoch": 2.1611362801518306, + "grad_norm": 1.7900390625, + "learning_rate": 1.925522479337013e-05, + "loss": 0.5491, + "step": 13239 + }, + { + "epoch": 2.161299538794335, + "grad_norm": 1.7088052034378052, + "learning_rate": 1.9255104617183068e-05, + "loss": 0.5099, + "step": 13240 + }, + { + "epoch": 2.1614627974368394, + "grad_norm": 2.258354663848877, + "learning_rate": 1.9254984431676122e-05, + "loss": 0.5915, + "step": 13241 + }, + { + "epoch": 2.161626056079344, + "grad_norm": 1.8004249334335327, + "learning_rate": 1.925486423684941e-05, + "loss": 0.6788, + "step": 13242 + }, + { + "epoch": 2.1617893147218483, + "grad_norm": 1.8633472919464111, + "learning_rate": 1.925474403270305e-05, + "loss": 0.6255, + "step": 13243 + }, + { + "epoch": 2.1619525733643523, + "grad_norm": 1.8590691089630127, + "learning_rate": 1.9254623819237165e-05, + "loss": 0.6347, + "step": 13244 + }, + { + "epoch": 2.1621158320068568, + "grad_norm": 1.8551139831542969, + "learning_rate": 1.9254503596451875e-05, + "loss": 0.696, + "step": 13245 + }, + { + "epoch": 2.162279090649361, + "grad_norm": 1.7692632675170898, + "learning_rate": 1.9254383364347302e-05, + "loss": 0.6011, + "step": 13246 + }, + { + "epoch": 2.1624423492918656, + "grad_norm": 1.6236116886138916, + "learning_rate": 1.9254263122923568e-05, + "loss": 0.5363, + "step": 13247 + }, + { + "epoch": 2.16260560793437, + "grad_norm": 1.745067834854126, + "learning_rate": 1.9254142872180797e-05, + "loss": 0.6588, + "step": 13248 + }, + { + "epoch": 2.1627688665768745, + "grad_norm": 1.8074969053268433, + "learning_rate": 1.9254022612119102e-05, + "loss": 0.6116, + "step": 13249 + }, + { + "epoch": 2.162932125219379, + "grad_norm": 1.5905216932296753, + "learning_rate": 1.9253902342738612e-05, + "loss": 0.5311, + "step": 13250 + }, + { + "epoch": 2.1630953838618834, + "grad_norm": 1.7934671640396118, + "learning_rate": 1.9253782064039444e-05, + "loss": 0.4831, + "step": 13251 + }, + { + "epoch": 2.1632586425043874, + "grad_norm": 1.6281468868255615, + "learning_rate": 1.9253661776021718e-05, + "loss": 0.5404, + "step": 13252 + }, + { + "epoch": 2.163421901146892, + "grad_norm": 1.7358895540237427, + "learning_rate": 1.925354147868556e-05, + "loss": 0.5318, + "step": 13253 + }, + { + "epoch": 2.1635851597893963, + "grad_norm": 1.7757686376571655, + "learning_rate": 1.9253421172031086e-05, + "loss": 0.524, + "step": 13254 + }, + { + "epoch": 2.1637484184319007, + "grad_norm": 1.7743455171585083, + "learning_rate": 1.925330085605842e-05, + "loss": 0.5903, + "step": 13255 + }, + { + "epoch": 2.163911677074405, + "grad_norm": 1.8542088270187378, + "learning_rate": 1.9253180530767683e-05, + "loss": 0.631, + "step": 13256 + }, + { + "epoch": 2.1640749357169096, + "grad_norm": 1.8892920017242432, + "learning_rate": 1.9253060196158994e-05, + "loss": 0.6456, + "step": 13257 + }, + { + "epoch": 2.164238194359414, + "grad_norm": 2.0208425521850586, + "learning_rate": 1.9252939852232476e-05, + "loss": 0.8602, + "step": 13258 + }, + { + "epoch": 2.1644014530019184, + "grad_norm": 1.846900463104248, + "learning_rate": 1.9252819498988253e-05, + "loss": 0.6158, + "step": 13259 + }, + { + "epoch": 2.164564711644423, + "grad_norm": 1.6417129039764404, + "learning_rate": 1.925269913642644e-05, + "loss": 0.5304, + "step": 13260 + }, + { + "epoch": 2.164727970286927, + "grad_norm": 1.9658843278884888, + "learning_rate": 1.9252578764547164e-05, + "loss": 0.6263, + "step": 13261 + }, + { + "epoch": 2.1648912289294313, + "grad_norm": 1.8743219375610352, + "learning_rate": 1.925245838335054e-05, + "loss": 0.5888, + "step": 13262 + }, + { + "epoch": 2.1650544875719357, + "grad_norm": 2.058260917663574, + "learning_rate": 1.9252337992836696e-05, + "loss": 0.6393, + "step": 13263 + }, + { + "epoch": 2.16521774621444, + "grad_norm": 1.5320395231246948, + "learning_rate": 1.9252217593005752e-05, + "loss": 0.4925, + "step": 13264 + }, + { + "epoch": 2.1653810048569446, + "grad_norm": 1.7881627082824707, + "learning_rate": 1.9252097183857822e-05, + "loss": 0.5587, + "step": 13265 + }, + { + "epoch": 2.165544263499449, + "grad_norm": 1.7892240285873413, + "learning_rate": 1.9251976765393038e-05, + "loss": 0.5129, + "step": 13266 + }, + { + "epoch": 2.1657075221419535, + "grad_norm": 1.7081775665283203, + "learning_rate": 1.925185633761151e-05, + "loss": 0.5148, + "step": 13267 + }, + { + "epoch": 2.165870780784458, + "grad_norm": 1.7175195217132568, + "learning_rate": 1.9251735900513367e-05, + "loss": 0.5027, + "step": 13268 + }, + { + "epoch": 2.166034039426962, + "grad_norm": 1.7063868045806885, + "learning_rate": 1.9251615454098732e-05, + "loss": 0.6208, + "step": 13269 + }, + { + "epoch": 2.1661972980694664, + "grad_norm": 1.3373922109603882, + "learning_rate": 1.925149499836772e-05, + "loss": 0.4396, + "step": 13270 + }, + { + "epoch": 2.166360556711971, + "grad_norm": 1.4201699495315552, + "learning_rate": 1.9251374533320454e-05, + "loss": 0.4165, + "step": 13271 + }, + { + "epoch": 2.1665238153544752, + "grad_norm": 1.8484610319137573, + "learning_rate": 1.9251254058957058e-05, + "loss": 0.5992, + "step": 13272 + }, + { + "epoch": 2.1666870739969797, + "grad_norm": 1.8344168663024902, + "learning_rate": 1.9251133575277652e-05, + "loss": 0.6136, + "step": 13273 + }, + { + "epoch": 2.166850332639484, + "grad_norm": 1.8022921085357666, + "learning_rate": 1.9251013082282357e-05, + "loss": 0.5364, + "step": 13274 + }, + { + "epoch": 2.1670135912819886, + "grad_norm": 1.5622553825378418, + "learning_rate": 1.9250892579971293e-05, + "loss": 0.4909, + "step": 13275 + }, + { + "epoch": 2.167176849924493, + "grad_norm": 2.1457207202911377, + "learning_rate": 1.925077206834458e-05, + "loss": 0.8582, + "step": 13276 + }, + { + "epoch": 2.1673401085669974, + "grad_norm": 1.829325556755066, + "learning_rate": 1.9250651547402345e-05, + "loss": 0.5328, + "step": 13277 + }, + { + "epoch": 2.167503367209502, + "grad_norm": 1.7083674669265747, + "learning_rate": 1.925053101714471e-05, + "loss": 0.5332, + "step": 13278 + }, + { + "epoch": 2.167666625852006, + "grad_norm": 1.7561875581741333, + "learning_rate": 1.9250410477571787e-05, + "loss": 0.5152, + "step": 13279 + }, + { + "epoch": 2.1678298844945103, + "grad_norm": 2.0389132499694824, + "learning_rate": 1.9250289928683706e-05, + "loss": 0.6242, + "step": 13280 + }, + { + "epoch": 2.1679931431370147, + "grad_norm": 1.6548242568969727, + "learning_rate": 1.9250169370480582e-05, + "loss": 0.5099, + "step": 13281 + }, + { + "epoch": 2.168156401779519, + "grad_norm": 1.7876954078674316, + "learning_rate": 1.9250048802962543e-05, + "loss": 0.6512, + "step": 13282 + }, + { + "epoch": 2.1683196604220236, + "grad_norm": 1.6596119403839111, + "learning_rate": 1.924992822612971e-05, + "loss": 0.5427, + "step": 13283 + }, + { + "epoch": 2.168482919064528, + "grad_norm": 1.7031524181365967, + "learning_rate": 1.9249807639982197e-05, + "loss": 0.5989, + "step": 13284 + }, + { + "epoch": 2.1686461777070325, + "grad_norm": 1.5483020544052124, + "learning_rate": 1.924968704452013e-05, + "loss": 0.4282, + "step": 13285 + }, + { + "epoch": 2.168809436349537, + "grad_norm": 1.8957362174987793, + "learning_rate": 1.9249566439743636e-05, + "loss": 0.6492, + "step": 13286 + }, + { + "epoch": 2.168972694992041, + "grad_norm": 1.879695177078247, + "learning_rate": 1.9249445825652825e-05, + "loss": 0.5606, + "step": 13287 + }, + { + "epoch": 2.1691359536345454, + "grad_norm": 1.5606549978256226, + "learning_rate": 1.9249325202247826e-05, + "loss": 0.5832, + "step": 13288 + }, + { + "epoch": 2.16929921227705, + "grad_norm": 1.8519388437271118, + "learning_rate": 1.924920456952876e-05, + "loss": 0.5079, + "step": 13289 + }, + { + "epoch": 2.1694624709195542, + "grad_norm": 1.4684165716171265, + "learning_rate": 1.924908392749575e-05, + "loss": 0.4344, + "step": 13290 + }, + { + "epoch": 2.1696257295620587, + "grad_norm": 1.822198748588562, + "learning_rate": 1.924896327614891e-05, + "loss": 0.6252, + "step": 13291 + }, + { + "epoch": 2.169788988204563, + "grad_norm": 1.4850627183914185, + "learning_rate": 1.924884261548837e-05, + "loss": 0.4389, + "step": 13292 + }, + { + "epoch": 2.1699522468470676, + "grad_norm": 1.7573785781860352, + "learning_rate": 1.9248721945514248e-05, + "loss": 0.58, + "step": 13293 + }, + { + "epoch": 2.170115505489572, + "grad_norm": 1.7373608350753784, + "learning_rate": 1.924860126622666e-05, + "loss": 0.5564, + "step": 13294 + }, + { + "epoch": 2.1702787641320764, + "grad_norm": 2.0441455841064453, + "learning_rate": 1.924848057762574e-05, + "loss": 0.6987, + "step": 13295 + }, + { + "epoch": 2.1704420227745804, + "grad_norm": 1.7932547330856323, + "learning_rate": 1.92483598797116e-05, + "loss": 0.5548, + "step": 13296 + }, + { + "epoch": 2.170605281417085, + "grad_norm": 1.781400203704834, + "learning_rate": 1.924823917248436e-05, + "loss": 0.5621, + "step": 13297 + }, + { + "epoch": 2.1707685400595893, + "grad_norm": 1.7107067108154297, + "learning_rate": 1.9248118455944153e-05, + "loss": 0.5102, + "step": 13298 + }, + { + "epoch": 2.1709317987020937, + "grad_norm": 1.8702898025512695, + "learning_rate": 1.924799773009109e-05, + "loss": 0.5524, + "step": 13299 + }, + { + "epoch": 2.171095057344598, + "grad_norm": 1.6808828115463257, + "learning_rate": 1.9247876994925293e-05, + "loss": 0.627, + "step": 13300 + }, + { + "epoch": 2.1712583159871026, + "grad_norm": 2.002847194671631, + "learning_rate": 1.924775625044689e-05, + "loss": 0.5551, + "step": 13301 + }, + { + "epoch": 2.171421574629607, + "grad_norm": 2.0416860580444336, + "learning_rate": 1.9247635496655994e-05, + "loss": 0.6609, + "step": 13302 + }, + { + "epoch": 2.1715848332721115, + "grad_norm": 2.192258596420288, + "learning_rate": 1.9247514733552738e-05, + "loss": 0.677, + "step": 13303 + }, + { + "epoch": 2.171748091914616, + "grad_norm": 2.152134895324707, + "learning_rate": 1.9247393961137232e-05, + "loss": 0.7819, + "step": 13304 + }, + { + "epoch": 2.17191135055712, + "grad_norm": 1.9223251342773438, + "learning_rate": 1.9247273179409605e-05, + "loss": 0.6318, + "step": 13305 + }, + { + "epoch": 2.1720746091996244, + "grad_norm": 1.882735013961792, + "learning_rate": 1.9247152388369976e-05, + "loss": 0.5903, + "step": 13306 + }, + { + "epoch": 2.172237867842129, + "grad_norm": 1.9254473447799683, + "learning_rate": 1.9247031588018467e-05, + "loss": 0.6631, + "step": 13307 + }, + { + "epoch": 2.1724011264846332, + "grad_norm": 1.8240844011306763, + "learning_rate": 1.9246910778355202e-05, + "loss": 0.6525, + "step": 13308 + }, + { + "epoch": 2.1725643851271377, + "grad_norm": 1.6929633617401123, + "learning_rate": 1.9246789959380297e-05, + "loss": 0.4961, + "step": 13309 + }, + { + "epoch": 2.172727643769642, + "grad_norm": 1.691746473312378, + "learning_rate": 1.9246669131093875e-05, + "loss": 0.5295, + "step": 13310 + }, + { + "epoch": 2.1728909024121466, + "grad_norm": 1.863141417503357, + "learning_rate": 1.9246548293496063e-05, + "loss": 0.6106, + "step": 13311 + }, + { + "epoch": 2.173054161054651, + "grad_norm": 1.856151819229126, + "learning_rate": 1.924642744658698e-05, + "loss": 0.5864, + "step": 13312 + }, + { + "epoch": 2.1732174196971554, + "grad_norm": 1.7903735637664795, + "learning_rate": 1.9246306590366747e-05, + "loss": 0.6202, + "step": 13313 + }, + { + "epoch": 2.1733806783396594, + "grad_norm": 1.6766685247421265, + "learning_rate": 1.9246185724835483e-05, + "loss": 0.5529, + "step": 13314 + }, + { + "epoch": 2.173543936982164, + "grad_norm": 1.7331205606460571, + "learning_rate": 1.9246064849993314e-05, + "loss": 0.6332, + "step": 13315 + }, + { + "epoch": 2.1737071956246683, + "grad_norm": 1.8539085388183594, + "learning_rate": 1.9245943965840363e-05, + "loss": 0.6159, + "step": 13316 + }, + { + "epoch": 2.1738704542671727, + "grad_norm": 1.651315450668335, + "learning_rate": 1.9245823072376747e-05, + "loss": 0.5413, + "step": 13317 + }, + { + "epoch": 2.174033712909677, + "grad_norm": 1.856483817100525, + "learning_rate": 1.9245702169602586e-05, + "loss": 0.6459, + "step": 13318 + }, + { + "epoch": 2.1741969715521816, + "grad_norm": 2.1413705348968506, + "learning_rate": 1.9245581257518008e-05, + "loss": 0.6285, + "step": 13319 + }, + { + "epoch": 2.174360230194686, + "grad_norm": 1.7569209337234497, + "learning_rate": 1.9245460336123136e-05, + "loss": 0.5657, + "step": 13320 + }, + { + "epoch": 2.1745234888371905, + "grad_norm": 1.9354357719421387, + "learning_rate": 1.924533940541808e-05, + "loss": 0.6508, + "step": 13321 + }, + { + "epoch": 2.1746867474796945, + "grad_norm": 2.2263355255126953, + "learning_rate": 1.9245218465402974e-05, + "loss": 0.6645, + "step": 13322 + }, + { + "epoch": 2.174850006122199, + "grad_norm": 1.7292553186416626, + "learning_rate": 1.9245097516077935e-05, + "loss": 0.5695, + "step": 13323 + }, + { + "epoch": 2.1750132647647034, + "grad_norm": 1.8626881837844849, + "learning_rate": 1.9244976557443086e-05, + "loss": 0.6079, + "step": 13324 + }, + { + "epoch": 2.175176523407208, + "grad_norm": 1.5982189178466797, + "learning_rate": 1.924485558949855e-05, + "loss": 0.4996, + "step": 13325 + }, + { + "epoch": 2.1753397820497122, + "grad_norm": 1.5233722925186157, + "learning_rate": 1.9244734612244442e-05, + "loss": 0.474, + "step": 13326 + }, + { + "epoch": 2.1755030406922167, + "grad_norm": 2.3430371284484863, + "learning_rate": 1.924461362568089e-05, + "loss": 0.7936, + "step": 13327 + }, + { + "epoch": 2.175666299334721, + "grad_norm": 1.9748408794403076, + "learning_rate": 1.9244492629808017e-05, + "loss": 0.6437, + "step": 13328 + }, + { + "epoch": 2.1758295579772255, + "grad_norm": 1.8745267391204834, + "learning_rate": 1.924437162462594e-05, + "loss": 0.6342, + "step": 13329 + }, + { + "epoch": 2.17599281661973, + "grad_norm": 1.8029619455337524, + "learning_rate": 1.9244250610134787e-05, + "loss": 0.5692, + "step": 13330 + }, + { + "epoch": 2.1761560752622344, + "grad_norm": 1.5643092393875122, + "learning_rate": 1.924412958633467e-05, + "loss": 0.4394, + "step": 13331 + }, + { + "epoch": 2.1763193339047384, + "grad_norm": 1.904140830039978, + "learning_rate": 1.9244008553225725e-05, + "loss": 0.5619, + "step": 13332 + }, + { + "epoch": 2.176482592547243, + "grad_norm": 1.8070687055587769, + "learning_rate": 1.924388751080806e-05, + "loss": 0.6492, + "step": 13333 + }, + { + "epoch": 2.1766458511897473, + "grad_norm": 2.0452511310577393, + "learning_rate": 1.9243766459081802e-05, + "loss": 0.6885, + "step": 13334 + }, + { + "epoch": 2.1768091098322517, + "grad_norm": 2.0648157596588135, + "learning_rate": 1.9243645398047073e-05, + "loss": 0.6516, + "step": 13335 + }, + { + "epoch": 2.176972368474756, + "grad_norm": 2.163074254989624, + "learning_rate": 1.9243524327703998e-05, + "loss": 1.0042, + "step": 13336 + }, + { + "epoch": 2.1771356271172606, + "grad_norm": 1.9026952981948853, + "learning_rate": 1.92434032480527e-05, + "loss": 0.5626, + "step": 13337 + }, + { + "epoch": 2.177298885759765, + "grad_norm": 2.0409882068634033, + "learning_rate": 1.9243282159093292e-05, + "loss": 0.5259, + "step": 13338 + }, + { + "epoch": 2.1774621444022695, + "grad_norm": 1.584869146347046, + "learning_rate": 1.9243161060825906e-05, + "loss": 0.5877, + "step": 13339 + }, + { + "epoch": 2.1776254030447735, + "grad_norm": 1.7438567876815796, + "learning_rate": 1.9243039953250654e-05, + "loss": 0.6189, + "step": 13340 + }, + { + "epoch": 2.177788661687278, + "grad_norm": 1.9519093036651611, + "learning_rate": 1.924291883636767e-05, + "loss": 0.6082, + "step": 13341 + }, + { + "epoch": 2.1779519203297824, + "grad_norm": 1.9211331605911255, + "learning_rate": 1.924279771017706e-05, + "loss": 0.5631, + "step": 13342 + }, + { + "epoch": 2.178115178972287, + "grad_norm": 1.781083345413208, + "learning_rate": 1.924267657467896e-05, + "loss": 0.5093, + "step": 13343 + }, + { + "epoch": 2.1782784376147912, + "grad_norm": 1.6289771795272827, + "learning_rate": 1.9242555429873488e-05, + "loss": 0.4486, + "step": 13344 + }, + { + "epoch": 2.1784416962572957, + "grad_norm": 1.8659967184066772, + "learning_rate": 1.9242434275760765e-05, + "loss": 0.5297, + "step": 13345 + }, + { + "epoch": 2.1786049548998, + "grad_norm": 1.5624890327453613, + "learning_rate": 1.9242313112340912e-05, + "loss": 0.5076, + "step": 13346 + }, + { + "epoch": 2.1787682135423045, + "grad_norm": 1.9392064809799194, + "learning_rate": 1.9242191939614054e-05, + "loss": 0.5934, + "step": 13347 + }, + { + "epoch": 2.178931472184809, + "grad_norm": 2.0035412311553955, + "learning_rate": 1.924207075758031e-05, + "loss": 0.5403, + "step": 13348 + }, + { + "epoch": 2.179094730827313, + "grad_norm": 1.8310558795928955, + "learning_rate": 1.92419495662398e-05, + "loss": 0.5981, + "step": 13349 + }, + { + "epoch": 2.1792579894698174, + "grad_norm": 1.8200416564941406, + "learning_rate": 1.9241828365592653e-05, + "loss": 0.5224, + "step": 13350 + }, + { + "epoch": 2.179421248112322, + "grad_norm": 1.7179251909255981, + "learning_rate": 1.9241707155638985e-05, + "loss": 0.5132, + "step": 13351 + }, + { + "epoch": 2.1795845067548263, + "grad_norm": 1.9483088254928589, + "learning_rate": 1.9241585936378926e-05, + "loss": 0.6387, + "step": 13352 + }, + { + "epoch": 2.1797477653973307, + "grad_norm": 1.8727397918701172, + "learning_rate": 1.9241464707812586e-05, + "loss": 0.6614, + "step": 13353 + }, + { + "epoch": 2.179911024039835, + "grad_norm": 1.6678638458251953, + "learning_rate": 1.9241343469940096e-05, + "loss": 0.5025, + "step": 13354 + }, + { + "epoch": 2.1800742826823396, + "grad_norm": 1.7355008125305176, + "learning_rate": 1.9241222222761576e-05, + "loss": 0.5095, + "step": 13355 + }, + { + "epoch": 2.180237541324844, + "grad_norm": 2.2528584003448486, + "learning_rate": 1.9241100966277146e-05, + "loss": 0.7494, + "step": 13356 + }, + { + "epoch": 2.180400799967348, + "grad_norm": 1.528761625289917, + "learning_rate": 1.9240979700486934e-05, + "loss": 0.513, + "step": 13357 + }, + { + "epoch": 2.1805640586098525, + "grad_norm": 1.707395076751709, + "learning_rate": 1.924085842539105e-05, + "loss": 0.504, + "step": 13358 + }, + { + "epoch": 2.180727317252357, + "grad_norm": 1.741479516029358, + "learning_rate": 1.9240737140989632e-05, + "loss": 0.566, + "step": 13359 + }, + { + "epoch": 2.1808905758948613, + "grad_norm": 2.0772225856781006, + "learning_rate": 1.924061584728279e-05, + "loss": 0.6394, + "step": 13360 + }, + { + "epoch": 2.181053834537366, + "grad_norm": 1.7293297052383423, + "learning_rate": 1.9240494544270653e-05, + "loss": 0.5311, + "step": 13361 + }, + { + "epoch": 2.1812170931798702, + "grad_norm": 2.201112985610962, + "learning_rate": 1.9240373231953334e-05, + "loss": 0.5863, + "step": 13362 + }, + { + "epoch": 2.1813803518223747, + "grad_norm": 1.8002632856369019, + "learning_rate": 1.924025191033097e-05, + "loss": 0.6145, + "step": 13363 + }, + { + "epoch": 2.181543610464879, + "grad_norm": 1.7468950748443604, + "learning_rate": 1.924013057940367e-05, + "loss": 0.5685, + "step": 13364 + }, + { + "epoch": 2.1817068691073835, + "grad_norm": 1.7704704999923706, + "learning_rate": 1.9240009239171564e-05, + "loss": 0.6277, + "step": 13365 + }, + { + "epoch": 2.181870127749888, + "grad_norm": 2.0964515209198, + "learning_rate": 1.9239887889634764e-05, + "loss": 0.6714, + "step": 13366 + }, + { + "epoch": 2.182033386392392, + "grad_norm": 1.9130288362503052, + "learning_rate": 1.9239766530793405e-05, + "loss": 0.6312, + "step": 13367 + }, + { + "epoch": 2.1821966450348964, + "grad_norm": 1.6292823553085327, + "learning_rate": 1.9239645162647603e-05, + "loss": 0.5424, + "step": 13368 + }, + { + "epoch": 2.182359903677401, + "grad_norm": 1.9070968627929688, + "learning_rate": 1.9239523785197483e-05, + "loss": 0.5828, + "step": 13369 + }, + { + "epoch": 2.1825231623199053, + "grad_norm": 1.6151317358016968, + "learning_rate": 1.923940239844316e-05, + "loss": 0.5367, + "step": 13370 + }, + { + "epoch": 2.1826864209624097, + "grad_norm": 1.9887224435806274, + "learning_rate": 1.9239281002384766e-05, + "loss": 0.5508, + "step": 13371 + }, + { + "epoch": 2.182849679604914, + "grad_norm": 1.6799800395965576, + "learning_rate": 1.9239159597022416e-05, + "loss": 0.5255, + "step": 13372 + }, + { + "epoch": 2.1830129382474186, + "grad_norm": 1.644047737121582, + "learning_rate": 1.9239038182356236e-05, + "loss": 0.5753, + "step": 13373 + }, + { + "epoch": 2.183176196889923, + "grad_norm": 1.7875139713287354, + "learning_rate": 1.9238916758386345e-05, + "loss": 0.6355, + "step": 13374 + }, + { + "epoch": 2.183339455532427, + "grad_norm": 2.339210271835327, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.7761, + "step": 13375 + }, + { + "epoch": 2.1835027141749315, + "grad_norm": 1.7501087188720703, + "learning_rate": 1.923867388253593e-05, + "loss": 0.5876, + "step": 13376 + }, + { + "epoch": 2.183665972817436, + "grad_norm": 1.6933979988098145, + "learning_rate": 1.9238552430655645e-05, + "loss": 0.56, + "step": 13377 + }, + { + "epoch": 2.1838292314599403, + "grad_norm": 1.9278876781463623, + "learning_rate": 1.9238430969472143e-05, + "loss": 0.6243, + "step": 13378 + }, + { + "epoch": 2.183992490102445, + "grad_norm": 2.16371488571167, + "learning_rate": 1.923830949898554e-05, + "loss": 0.5745, + "step": 13379 + }, + { + "epoch": 2.184155748744949, + "grad_norm": 1.969250202178955, + "learning_rate": 1.9238188019195964e-05, + "loss": 0.5714, + "step": 13380 + }, + { + "epoch": 2.1843190073874537, + "grad_norm": 1.540621280670166, + "learning_rate": 1.9238066530103537e-05, + "loss": 0.5353, + "step": 13381 + }, + { + "epoch": 2.184482266029958, + "grad_norm": 1.8438284397125244, + "learning_rate": 1.9237945031708378e-05, + "loss": 0.6587, + "step": 13382 + }, + { + "epoch": 2.1846455246724625, + "grad_norm": 1.608363151550293, + "learning_rate": 1.923782352401061e-05, + "loss": 0.4409, + "step": 13383 + }, + { + "epoch": 2.1848087833149665, + "grad_norm": 1.6656283140182495, + "learning_rate": 1.9237702007010356e-05, + "loss": 0.5441, + "step": 13384 + }, + { + "epoch": 2.184972041957471, + "grad_norm": 1.4580650329589844, + "learning_rate": 1.923758048070774e-05, + "loss": 0.5641, + "step": 13385 + }, + { + "epoch": 2.1851353005999754, + "grad_norm": 1.6788207292556763, + "learning_rate": 1.923745894510288e-05, + "loss": 0.5347, + "step": 13386 + }, + { + "epoch": 2.18529855924248, + "grad_norm": 1.7024714946746826, + "learning_rate": 1.9237337400195906e-05, + "loss": 0.5595, + "step": 13387 + }, + { + "epoch": 2.1854618178849843, + "grad_norm": 1.7942980527877808, + "learning_rate": 1.9237215845986933e-05, + "loss": 0.6056, + "step": 13388 + }, + { + "epoch": 2.1856250765274887, + "grad_norm": 1.6418932676315308, + "learning_rate": 1.923709428247609e-05, + "loss": 0.4852, + "step": 13389 + }, + { + "epoch": 2.185788335169993, + "grad_norm": 1.5173143148422241, + "learning_rate": 1.9236972709663487e-05, + "loss": 0.4785, + "step": 13390 + }, + { + "epoch": 2.1859515938124976, + "grad_norm": 2.240436553955078, + "learning_rate": 1.9236851127549262e-05, + "loss": 0.6179, + "step": 13391 + }, + { + "epoch": 2.186114852455002, + "grad_norm": 1.967621088027954, + "learning_rate": 1.9236729536133527e-05, + "loss": 0.5886, + "step": 13392 + }, + { + "epoch": 2.186278111097506, + "grad_norm": 1.6209501028060913, + "learning_rate": 1.9236607935416408e-05, + "loss": 0.4573, + "step": 13393 + }, + { + "epoch": 2.1864413697400105, + "grad_norm": 1.7111046314239502, + "learning_rate": 1.923648632539803e-05, + "loss": 0.5861, + "step": 13394 + }, + { + "epoch": 2.186604628382515, + "grad_norm": 2.1182708740234375, + "learning_rate": 1.9236364706078512e-05, + "loss": 0.6681, + "step": 13395 + }, + { + "epoch": 2.1867678870250193, + "grad_norm": 1.714556336402893, + "learning_rate": 1.9236243077457973e-05, + "loss": 0.5123, + "step": 13396 + }, + { + "epoch": 2.186931145667524, + "grad_norm": 1.823424220085144, + "learning_rate": 1.9236121439536544e-05, + "loss": 0.566, + "step": 13397 + }, + { + "epoch": 2.187094404310028, + "grad_norm": 1.9098000526428223, + "learning_rate": 1.9235999792314342e-05, + "loss": 0.5211, + "step": 13398 + }, + { + "epoch": 2.1872576629525327, + "grad_norm": 1.5521291494369507, + "learning_rate": 1.923587813579149e-05, + "loss": 0.5918, + "step": 13399 + }, + { + "epoch": 2.187420921595037, + "grad_norm": 1.7459940910339355, + "learning_rate": 1.9235756469968112e-05, + "loss": 0.6029, + "step": 13400 + }, + { + "epoch": 2.1875841802375415, + "grad_norm": 2.00018048286438, + "learning_rate": 1.923563479484433e-05, + "loss": 0.9114, + "step": 13401 + }, + { + "epoch": 2.1877474388800455, + "grad_norm": 1.6527129411697388, + "learning_rate": 1.9235513110420267e-05, + "loss": 0.5736, + "step": 13402 + }, + { + "epoch": 2.18791069752255, + "grad_norm": 1.4924031496047974, + "learning_rate": 1.923539141669604e-05, + "loss": 0.5062, + "step": 13403 + }, + { + "epoch": 2.1880739561650544, + "grad_norm": 1.8272820711135864, + "learning_rate": 1.923526971367178e-05, + "loss": 0.5093, + "step": 13404 + }, + { + "epoch": 2.188237214807559, + "grad_norm": 1.8849310874938965, + "learning_rate": 1.923514800134761e-05, + "loss": 0.6635, + "step": 13405 + }, + { + "epoch": 2.1884004734500633, + "grad_norm": 1.7769842147827148, + "learning_rate": 1.923502627972364e-05, + "loss": 0.5692, + "step": 13406 + }, + { + "epoch": 2.1885637320925677, + "grad_norm": 1.963986873626709, + "learning_rate": 1.9234904548800008e-05, + "loss": 0.5406, + "step": 13407 + }, + { + "epoch": 2.188726990735072, + "grad_norm": 1.981879711151123, + "learning_rate": 1.9234782808576823e-05, + "loss": 0.5592, + "step": 13408 + }, + { + "epoch": 2.1888902493775766, + "grad_norm": 1.6330256462097168, + "learning_rate": 1.923466105905422e-05, + "loss": 0.5981, + "step": 13409 + }, + { + "epoch": 2.1890535080200806, + "grad_norm": 1.7463089227676392, + "learning_rate": 1.9234539300232312e-05, + "loss": 0.5694, + "step": 13410 + }, + { + "epoch": 2.189216766662585, + "grad_norm": 1.9125310182571411, + "learning_rate": 1.9234417532111227e-05, + "loss": 0.5796, + "step": 13411 + }, + { + "epoch": 2.1893800253050895, + "grad_norm": 1.978279948234558, + "learning_rate": 1.9234295754691085e-05, + "loss": 0.5798, + "step": 13412 + }, + { + "epoch": 2.189543283947594, + "grad_norm": 1.6654285192489624, + "learning_rate": 1.9234173967972012e-05, + "loss": 0.5525, + "step": 13413 + }, + { + "epoch": 2.1897065425900983, + "grad_norm": 1.899915337562561, + "learning_rate": 1.9234052171954127e-05, + "loss": 0.635, + "step": 13414 + }, + { + "epoch": 2.1898698012326028, + "grad_norm": 1.8322410583496094, + "learning_rate": 1.923393036663755e-05, + "loss": 0.6394, + "step": 13415 + }, + { + "epoch": 2.190033059875107, + "grad_norm": 1.889222264289856, + "learning_rate": 1.9233808552022414e-05, + "loss": 0.559, + "step": 13416 + }, + { + "epoch": 2.1901963185176117, + "grad_norm": 1.769713044166565, + "learning_rate": 1.923368672810883e-05, + "loss": 0.4811, + "step": 13417 + }, + { + "epoch": 2.190359577160116, + "grad_norm": 2.021545886993408, + "learning_rate": 1.923356489489693e-05, + "loss": 0.6794, + "step": 13418 + }, + { + "epoch": 2.1905228358026205, + "grad_norm": 2.222835063934326, + "learning_rate": 1.9233443052386832e-05, + "loss": 0.6443, + "step": 13419 + }, + { + "epoch": 2.1906860944451245, + "grad_norm": 1.3782037496566772, + "learning_rate": 1.9233321200578657e-05, + "loss": 0.5168, + "step": 13420 + }, + { + "epoch": 2.190849353087629, + "grad_norm": 1.6466208696365356, + "learning_rate": 1.923319933947253e-05, + "loss": 0.5173, + "step": 13421 + }, + { + "epoch": 2.1910126117301334, + "grad_norm": 1.6654553413391113, + "learning_rate": 1.923307746906858e-05, + "loss": 0.6111, + "step": 13422 + }, + { + "epoch": 2.191175870372638, + "grad_norm": 1.6676487922668457, + "learning_rate": 1.9232955589366914e-05, + "loss": 0.5175, + "step": 13423 + }, + { + "epoch": 2.1913391290151423, + "grad_norm": 1.8205662965774536, + "learning_rate": 1.923283370036767e-05, + "loss": 0.6621, + "step": 13424 + }, + { + "epoch": 2.1915023876576467, + "grad_norm": 1.918403148651123, + "learning_rate": 1.923271180207096e-05, + "loss": 0.5198, + "step": 13425 + }, + { + "epoch": 2.191665646300151, + "grad_norm": 1.8093727827072144, + "learning_rate": 1.923258989447692e-05, + "loss": 0.5758, + "step": 13426 + }, + { + "epoch": 2.1918289049426556, + "grad_norm": 1.6721630096435547, + "learning_rate": 1.9232467977585657e-05, + "loss": 0.5093, + "step": 13427 + }, + { + "epoch": 2.1919921635851596, + "grad_norm": 2.337087392807007, + "learning_rate": 1.9232346051397303e-05, + "loss": 0.6548, + "step": 13428 + }, + { + "epoch": 2.192155422227664, + "grad_norm": 1.796854853630066, + "learning_rate": 1.923222411591198e-05, + "loss": 0.597, + "step": 13429 + }, + { + "epoch": 2.1923186808701685, + "grad_norm": 2.088977575302124, + "learning_rate": 1.923210217112981e-05, + "loss": 0.7017, + "step": 13430 + }, + { + "epoch": 2.192481939512673, + "grad_norm": 1.794472098350525, + "learning_rate": 1.9231980217050916e-05, + "loss": 0.5673, + "step": 13431 + }, + { + "epoch": 2.1926451981551773, + "grad_norm": 1.7347118854522705, + "learning_rate": 1.923185825367542e-05, + "loss": 0.5805, + "step": 13432 + }, + { + "epoch": 2.1928084567976818, + "grad_norm": 1.9131489992141724, + "learning_rate": 1.9231736281003444e-05, + "loss": 0.6746, + "step": 13433 + }, + { + "epoch": 2.192971715440186, + "grad_norm": 2.022796154022217, + "learning_rate": 1.923161429903511e-05, + "loss": 0.7153, + "step": 13434 + }, + { + "epoch": 2.1931349740826906, + "grad_norm": 1.7130711078643799, + "learning_rate": 1.9231492307770548e-05, + "loss": 0.5937, + "step": 13435 + }, + { + "epoch": 2.193298232725195, + "grad_norm": 1.5773271322250366, + "learning_rate": 1.9231370307209873e-05, + "loss": 0.4632, + "step": 13436 + }, + { + "epoch": 2.193461491367699, + "grad_norm": 2.2565109729766846, + "learning_rate": 1.923124829735321e-05, + "loss": 0.6851, + "step": 13437 + }, + { + "epoch": 2.1936247500102035, + "grad_norm": 1.5666379928588867, + "learning_rate": 1.923112627820068e-05, + "loss": 0.4837, + "step": 13438 + }, + { + "epoch": 2.193788008652708, + "grad_norm": 1.834142804145813, + "learning_rate": 1.9231004249752415e-05, + "loss": 0.6526, + "step": 13439 + }, + { + "epoch": 2.1939512672952124, + "grad_norm": 1.8211784362792969, + "learning_rate": 1.9230882212008528e-05, + "loss": 0.5908, + "step": 13440 + }, + { + "epoch": 2.194114525937717, + "grad_norm": 1.8099595308303833, + "learning_rate": 1.9230760164969146e-05, + "loss": 0.5487, + "step": 13441 + }, + { + "epoch": 2.1942777845802213, + "grad_norm": 1.7351914644241333, + "learning_rate": 1.9230638108634387e-05, + "loss": 0.5711, + "step": 13442 + }, + { + "epoch": 2.1944410432227257, + "grad_norm": 1.9830671548843384, + "learning_rate": 1.923051604300438e-05, + "loss": 0.5731, + "step": 13443 + }, + { + "epoch": 2.19460430186523, + "grad_norm": 1.9587249755859375, + "learning_rate": 1.9230393968079247e-05, + "loss": 0.545, + "step": 13444 + }, + { + "epoch": 2.1947675605077346, + "grad_norm": 1.9675657749176025, + "learning_rate": 1.923027188385911e-05, + "loss": 0.6309, + "step": 13445 + }, + { + "epoch": 2.1949308191502386, + "grad_norm": 1.7375847101211548, + "learning_rate": 1.923014979034409e-05, + "loss": 0.6055, + "step": 13446 + }, + { + "epoch": 2.195094077792743, + "grad_norm": 1.9729317426681519, + "learning_rate": 1.9230027687534313e-05, + "loss": 0.6547, + "step": 13447 + }, + { + "epoch": 2.1952573364352475, + "grad_norm": 2.4664926528930664, + "learning_rate": 1.92299055754299e-05, + "loss": 0.6233, + "step": 13448 + }, + { + "epoch": 2.195420595077752, + "grad_norm": 1.6607191562652588, + "learning_rate": 1.9229783454030975e-05, + "loss": 0.6075, + "step": 13449 + }, + { + "epoch": 2.1955838537202563, + "grad_norm": 1.4437894821166992, + "learning_rate": 1.922966132333766e-05, + "loss": 0.5114, + "step": 13450 + }, + { + "epoch": 2.1957471123627608, + "grad_norm": 1.8462218046188354, + "learning_rate": 1.9229539183350076e-05, + "loss": 0.6588, + "step": 13451 + }, + { + "epoch": 2.195910371005265, + "grad_norm": 1.777529001235962, + "learning_rate": 1.9229417034068352e-05, + "loss": 0.5058, + "step": 13452 + }, + { + "epoch": 2.1960736296477696, + "grad_norm": 1.6091852188110352, + "learning_rate": 1.9229294875492606e-05, + "loss": 0.5175, + "step": 13453 + }, + { + "epoch": 2.196236888290274, + "grad_norm": 2.086238384246826, + "learning_rate": 1.922917270762296e-05, + "loss": 0.5934, + "step": 13454 + }, + { + "epoch": 2.196400146932778, + "grad_norm": 1.6014565229415894, + "learning_rate": 1.9229050530459545e-05, + "loss": 0.5169, + "step": 13455 + }, + { + "epoch": 2.1965634055752825, + "grad_norm": 1.6017178297042847, + "learning_rate": 1.9228928344002477e-05, + "loss": 0.4643, + "step": 13456 + }, + { + "epoch": 2.196726664217787, + "grad_norm": 1.7577970027923584, + "learning_rate": 1.9228806148251878e-05, + "loss": 0.5202, + "step": 13457 + }, + { + "epoch": 2.1968899228602914, + "grad_norm": 1.6741358041763306, + "learning_rate": 1.9228683943207875e-05, + "loss": 0.562, + "step": 13458 + }, + { + "epoch": 2.197053181502796, + "grad_norm": 1.987088680267334, + "learning_rate": 1.9228561728870588e-05, + "loss": 0.628, + "step": 13459 + }, + { + "epoch": 2.1972164401453003, + "grad_norm": 1.5931546688079834, + "learning_rate": 1.9228439505240147e-05, + "loss": 0.4406, + "step": 13460 + }, + { + "epoch": 2.1973796987878047, + "grad_norm": 1.8192857503890991, + "learning_rate": 1.9228317272316664e-05, + "loss": 0.5608, + "step": 13461 + }, + { + "epoch": 2.197542957430309, + "grad_norm": 1.5892902612686157, + "learning_rate": 1.922819503010027e-05, + "loss": 0.5075, + "step": 13462 + }, + { + "epoch": 2.197706216072813, + "grad_norm": 1.7999153137207031, + "learning_rate": 1.922807277859109e-05, + "loss": 0.5398, + "step": 13463 + }, + { + "epoch": 2.1978694747153176, + "grad_norm": 1.64551842212677, + "learning_rate": 1.9227950517789238e-05, + "loss": 0.5547, + "step": 13464 + }, + { + "epoch": 2.198032733357822, + "grad_norm": 1.8345314264297485, + "learning_rate": 1.9227828247694845e-05, + "loss": 0.6431, + "step": 13465 + }, + { + "epoch": 2.1981959920003264, + "grad_norm": 1.669814944267273, + "learning_rate": 1.922770596830803e-05, + "loss": 0.5025, + "step": 13466 + }, + { + "epoch": 2.198359250642831, + "grad_norm": 1.76397705078125, + "learning_rate": 1.922758367962892e-05, + "loss": 0.5114, + "step": 13467 + }, + { + "epoch": 2.1985225092853353, + "grad_norm": 1.790084719657898, + "learning_rate": 1.9227461381657632e-05, + "loss": 0.475, + "step": 13468 + }, + { + "epoch": 2.1986857679278398, + "grad_norm": 1.4444540739059448, + "learning_rate": 1.9227339074394295e-05, + "loss": 0.469, + "step": 13469 + }, + { + "epoch": 2.198849026570344, + "grad_norm": 1.3645896911621094, + "learning_rate": 1.9227216757839028e-05, + "loss": 0.5128, + "step": 13470 + }, + { + "epoch": 2.1990122852128486, + "grad_norm": 1.8578639030456543, + "learning_rate": 1.922709443199196e-05, + "loss": 0.6214, + "step": 13471 + }, + { + "epoch": 2.199175543855353, + "grad_norm": 1.8554776906967163, + "learning_rate": 1.9226972096853207e-05, + "loss": 0.647, + "step": 13472 + }, + { + "epoch": 2.199338802497857, + "grad_norm": 1.5406322479248047, + "learning_rate": 1.9226849752422895e-05, + "loss": 0.5386, + "step": 13473 + }, + { + "epoch": 2.1995020611403615, + "grad_norm": 2.119314670562744, + "learning_rate": 1.922672739870115e-05, + "loss": 0.6352, + "step": 13474 + }, + { + "epoch": 2.199665319782866, + "grad_norm": 1.2740099430084229, + "learning_rate": 1.922660503568809e-05, + "loss": 0.4432, + "step": 13475 + }, + { + "epoch": 2.1998285784253704, + "grad_norm": 1.9824882745742798, + "learning_rate": 1.9226482663383845e-05, + "loss": 0.5975, + "step": 13476 + }, + { + "epoch": 2.199991837067875, + "grad_norm": 1.7407184839248657, + "learning_rate": 1.9226360281788536e-05, + "loss": 0.5823, + "step": 13477 + }, + { + "epoch": 2.2001550957103793, + "grad_norm": 1.8667157888412476, + "learning_rate": 1.9226237890902278e-05, + "loss": 0.5907, + "step": 13478 + }, + { + "epoch": 2.2003183543528837, + "grad_norm": 1.4921947717666626, + "learning_rate": 1.9226115490725207e-05, + "loss": 0.4381, + "step": 13479 + }, + { + "epoch": 2.200481612995388, + "grad_norm": 2.118964433670044, + "learning_rate": 1.9225993081257435e-05, + "loss": 0.6268, + "step": 13480 + }, + { + "epoch": 2.200644871637892, + "grad_norm": 1.8599441051483154, + "learning_rate": 1.9225870662499094e-05, + "loss": 0.5398, + "step": 13481 + }, + { + "epoch": 2.2008081302803966, + "grad_norm": 1.5314562320709229, + "learning_rate": 1.92257482344503e-05, + "loss": 0.465, + "step": 13482 + }, + { + "epoch": 2.200971388922901, + "grad_norm": 1.7340726852416992, + "learning_rate": 1.9225625797111186e-05, + "loss": 0.4932, + "step": 13483 + }, + { + "epoch": 2.2011346475654054, + "grad_norm": 1.5865488052368164, + "learning_rate": 1.9225503350481863e-05, + "loss": 0.403, + "step": 13484 + }, + { + "epoch": 2.20129790620791, + "grad_norm": 1.6666765213012695, + "learning_rate": 1.9225380894562466e-05, + "loss": 0.462, + "step": 13485 + }, + { + "epoch": 2.2014611648504143, + "grad_norm": 1.6721558570861816, + "learning_rate": 1.922525842935311e-05, + "loss": 0.5228, + "step": 13486 + }, + { + "epoch": 2.2016244234929188, + "grad_norm": 1.8734030723571777, + "learning_rate": 1.922513595485392e-05, + "loss": 0.5889, + "step": 13487 + }, + { + "epoch": 2.201787682135423, + "grad_norm": 1.8705997467041016, + "learning_rate": 1.9225013471065022e-05, + "loss": 0.5843, + "step": 13488 + }, + { + "epoch": 2.2019509407779276, + "grad_norm": 1.887967586517334, + "learning_rate": 1.922489097798654e-05, + "loss": 0.5936, + "step": 13489 + }, + { + "epoch": 2.2021141994204316, + "grad_norm": 2.0482609272003174, + "learning_rate": 1.922476847561859e-05, + "loss": 0.565, + "step": 13490 + }, + { + "epoch": 2.202277458062936, + "grad_norm": 1.8741295337677002, + "learning_rate": 1.9224645963961308e-05, + "loss": 0.5712, + "step": 13491 + }, + { + "epoch": 2.2024407167054405, + "grad_norm": 2.0207552909851074, + "learning_rate": 1.9224523443014804e-05, + "loss": 0.6382, + "step": 13492 + }, + { + "epoch": 2.202603975347945, + "grad_norm": 1.6354000568389893, + "learning_rate": 1.922440091277921e-05, + "loss": 0.4657, + "step": 13493 + }, + { + "epoch": 2.2027672339904494, + "grad_norm": 1.8365674018859863, + "learning_rate": 1.9224278373254644e-05, + "loss": 0.5391, + "step": 13494 + }, + { + "epoch": 2.202930492632954, + "grad_norm": 2.068016529083252, + "learning_rate": 1.922415582444123e-05, + "loss": 0.709, + "step": 13495 + }, + { + "epoch": 2.2030937512754583, + "grad_norm": 1.5636731386184692, + "learning_rate": 1.9224033266339103e-05, + "loss": 0.499, + "step": 13496 + }, + { + "epoch": 2.2032570099179627, + "grad_norm": 1.9505335092544556, + "learning_rate": 1.922391069894837e-05, + "loss": 0.6586, + "step": 13497 + }, + { + "epoch": 2.2034202685604667, + "grad_norm": 1.7289849519729614, + "learning_rate": 1.9223788122269163e-05, + "loss": 0.6301, + "step": 13498 + }, + { + "epoch": 2.203583527202971, + "grad_norm": 2.1119890213012695, + "learning_rate": 1.9223665536301602e-05, + "loss": 0.6882, + "step": 13499 + }, + { + "epoch": 2.2037467858454756, + "grad_norm": 1.773877739906311, + "learning_rate": 1.9223542941045817e-05, + "loss": 0.5544, + "step": 13500 + }, + { + "epoch": 2.20391004448798, + "grad_norm": 1.6729321479797363, + "learning_rate": 1.9223420336501922e-05, + "loss": 0.5634, + "step": 13501 + }, + { + "epoch": 2.2040733031304844, + "grad_norm": 1.437172532081604, + "learning_rate": 1.9223297722670047e-05, + "loss": 0.4753, + "step": 13502 + }, + { + "epoch": 2.204236561772989, + "grad_norm": 1.5925832986831665, + "learning_rate": 1.9223175099550313e-05, + "loss": 0.5886, + "step": 13503 + }, + { + "epoch": 2.2043998204154933, + "grad_norm": 1.923754096031189, + "learning_rate": 1.9223052467142846e-05, + "loss": 0.5966, + "step": 13504 + }, + { + "epoch": 2.2045630790579978, + "grad_norm": 1.6006555557250977, + "learning_rate": 1.9222929825447764e-05, + "loss": 0.612, + "step": 13505 + }, + { + "epoch": 2.204726337700502, + "grad_norm": 1.749432921409607, + "learning_rate": 1.9222807174465196e-05, + "loss": 0.6333, + "step": 13506 + }, + { + "epoch": 2.2048895963430066, + "grad_norm": 1.9583512544631958, + "learning_rate": 1.9222684514195265e-05, + "loss": 0.6154, + "step": 13507 + }, + { + "epoch": 2.2050528549855106, + "grad_norm": 1.8669524192810059, + "learning_rate": 1.922256184463809e-05, + "loss": 0.5986, + "step": 13508 + }, + { + "epoch": 2.205216113628015, + "grad_norm": 1.385091781616211, + "learning_rate": 1.92224391657938e-05, + "loss": 0.4544, + "step": 13509 + }, + { + "epoch": 2.2053793722705195, + "grad_norm": 1.8015996217727661, + "learning_rate": 1.9222316477662517e-05, + "loss": 0.604, + "step": 13510 + }, + { + "epoch": 2.205542630913024, + "grad_norm": 1.626786708831787, + "learning_rate": 1.9222193780244363e-05, + "loss": 0.5264, + "step": 13511 + }, + { + "epoch": 2.2057058895555284, + "grad_norm": 2.028411626815796, + "learning_rate": 1.9222071073539462e-05, + "loss": 0.591, + "step": 13512 + }, + { + "epoch": 2.205869148198033, + "grad_norm": 1.7009451389312744, + "learning_rate": 1.9221948357547936e-05, + "loss": 0.671, + "step": 13513 + }, + { + "epoch": 2.2060324068405373, + "grad_norm": 1.5573978424072266, + "learning_rate": 1.9221825632269913e-05, + "loss": 0.4957, + "step": 13514 + }, + { + "epoch": 2.2061956654830417, + "grad_norm": 1.6353881359100342, + "learning_rate": 1.9221702897705516e-05, + "loss": 0.4891, + "step": 13515 + }, + { + "epoch": 2.2063589241255457, + "grad_norm": 1.687212586402893, + "learning_rate": 1.9221580153854862e-05, + "loss": 0.5559, + "step": 13516 + }, + { + "epoch": 2.20652218276805, + "grad_norm": 2.0059144496917725, + "learning_rate": 1.9221457400718078e-05, + "loss": 0.6417, + "step": 13517 + }, + { + "epoch": 2.2066854414105546, + "grad_norm": 1.771941900253296, + "learning_rate": 1.9221334638295296e-05, + "loss": 0.5782, + "step": 13518 + }, + { + "epoch": 2.206848700053059, + "grad_norm": 1.8332910537719727, + "learning_rate": 1.9221211866586627e-05, + "loss": 0.6157, + "step": 13519 + }, + { + "epoch": 2.2070119586955634, + "grad_norm": 1.806488037109375, + "learning_rate": 1.9221089085592203e-05, + "loss": 0.5286, + "step": 13520 + }, + { + "epoch": 2.207175217338068, + "grad_norm": 1.764085292816162, + "learning_rate": 1.9220966295312143e-05, + "loss": 0.5275, + "step": 13521 + }, + { + "epoch": 2.2073384759805723, + "grad_norm": 1.7545183897018433, + "learning_rate": 1.9220843495746573e-05, + "loss": 0.5164, + "step": 13522 + }, + { + "epoch": 2.2075017346230768, + "grad_norm": 1.907439112663269, + "learning_rate": 1.9220720686895614e-05, + "loss": 0.577, + "step": 13523 + }, + { + "epoch": 2.207664993265581, + "grad_norm": 1.778931736946106, + "learning_rate": 1.9220597868759395e-05, + "loss": 0.5667, + "step": 13524 + }, + { + "epoch": 2.207828251908085, + "grad_norm": 2.207648515701294, + "learning_rate": 1.9220475041338035e-05, + "loss": 0.6942, + "step": 13525 + }, + { + "epoch": 2.2079915105505896, + "grad_norm": 1.6599267721176147, + "learning_rate": 1.922035220463166e-05, + "loss": 0.5335, + "step": 13526 + }, + { + "epoch": 2.208154769193094, + "grad_norm": 1.9255611896514893, + "learning_rate": 1.922022935864039e-05, + "loss": 0.5976, + "step": 13527 + }, + { + "epoch": 2.2083180278355985, + "grad_norm": 1.6730433702468872, + "learning_rate": 1.9220106503364354e-05, + "loss": 0.557, + "step": 13528 + }, + { + "epoch": 2.208481286478103, + "grad_norm": 1.9317359924316406, + "learning_rate": 1.9219983638803672e-05, + "loss": 0.6123, + "step": 13529 + }, + { + "epoch": 2.2086445451206074, + "grad_norm": 1.6502612829208374, + "learning_rate": 1.9219860764958466e-05, + "loss": 0.5258, + "step": 13530 + }, + { + "epoch": 2.208807803763112, + "grad_norm": 1.9240596294403076, + "learning_rate": 1.9219737881828867e-05, + "loss": 0.562, + "step": 13531 + }, + { + "epoch": 2.2089710624056162, + "grad_norm": 2.038191795349121, + "learning_rate": 1.9219614989414994e-05, + "loss": 0.6322, + "step": 13532 + }, + { + "epoch": 2.2091343210481207, + "grad_norm": 1.7270886898040771, + "learning_rate": 1.921949208771697e-05, + "loss": 0.6074, + "step": 13533 + }, + { + "epoch": 2.2092975796906247, + "grad_norm": 1.8884261846542358, + "learning_rate": 1.921936917673492e-05, + "loss": 0.583, + "step": 13534 + }, + { + "epoch": 2.209460838333129, + "grad_norm": 1.6123483180999756, + "learning_rate": 1.921924625646897e-05, + "loss": 0.5285, + "step": 13535 + }, + { + "epoch": 2.2096240969756336, + "grad_norm": 1.6566444635391235, + "learning_rate": 1.9219123326919237e-05, + "loss": 0.6298, + "step": 13536 + }, + { + "epoch": 2.209787355618138, + "grad_norm": 1.7489402294158936, + "learning_rate": 1.9219000388085855e-05, + "loss": 0.5581, + "step": 13537 + }, + { + "epoch": 2.2099506142606424, + "grad_norm": 1.5402017831802368, + "learning_rate": 1.9218877439968937e-05, + "loss": 0.4795, + "step": 13538 + }, + { + "epoch": 2.210113872903147, + "grad_norm": 1.8335278034210205, + "learning_rate": 1.9218754482568613e-05, + "loss": 0.5245, + "step": 13539 + }, + { + "epoch": 2.2102771315456513, + "grad_norm": 1.7608600854873657, + "learning_rate": 1.9218631515885007e-05, + "loss": 0.5421, + "step": 13540 + }, + { + "epoch": 2.2104403901881557, + "grad_norm": 1.7252782583236694, + "learning_rate": 1.9218508539918243e-05, + "loss": 0.6085, + "step": 13541 + }, + { + "epoch": 2.21060364883066, + "grad_norm": 1.7853487730026245, + "learning_rate": 1.921838555466844e-05, + "loss": 0.5746, + "step": 13542 + }, + { + "epoch": 2.210766907473164, + "grad_norm": 1.7860963344573975, + "learning_rate": 1.9218262560135727e-05, + "loss": 0.5188, + "step": 13543 + }, + { + "epoch": 2.2109301661156686, + "grad_norm": 1.9549198150634766, + "learning_rate": 1.9218139556320223e-05, + "loss": 0.6255, + "step": 13544 + }, + { + "epoch": 2.211093424758173, + "grad_norm": 1.5937292575836182, + "learning_rate": 1.9218016543222058e-05, + "loss": 0.5692, + "step": 13545 + }, + { + "epoch": 2.2112566834006775, + "grad_norm": 1.8778589963912964, + "learning_rate": 1.9217893520841354e-05, + "loss": 0.5808, + "step": 13546 + }, + { + "epoch": 2.211419942043182, + "grad_norm": 1.771462321281433, + "learning_rate": 1.921777048917823e-05, + "loss": 0.6087, + "step": 13547 + }, + { + "epoch": 2.2115832006856864, + "grad_norm": 1.6931540966033936, + "learning_rate": 1.9217647448232816e-05, + "loss": 0.51, + "step": 13548 + }, + { + "epoch": 2.211746459328191, + "grad_norm": 2.114635944366455, + "learning_rate": 1.9217524398005233e-05, + "loss": 0.6827, + "step": 13549 + }, + { + "epoch": 2.2119097179706952, + "grad_norm": 1.8352621793746948, + "learning_rate": 1.9217401338495605e-05, + "loss": 0.6093, + "step": 13550 + }, + { + "epoch": 2.2120729766131992, + "grad_norm": 1.6217172145843506, + "learning_rate": 1.9217278269704055e-05, + "loss": 0.501, + "step": 13551 + }, + { + "epoch": 2.2122362352557037, + "grad_norm": 1.7867323160171509, + "learning_rate": 1.921715519163071e-05, + "loss": 0.6063, + "step": 13552 + }, + { + "epoch": 2.212399493898208, + "grad_norm": 1.7343645095825195, + "learning_rate": 1.9217032104275692e-05, + "loss": 0.5364, + "step": 13553 + }, + { + "epoch": 2.2125627525407126, + "grad_norm": 1.8269546031951904, + "learning_rate": 1.9216909007639126e-05, + "loss": 0.5633, + "step": 13554 + }, + { + "epoch": 2.212726011183217, + "grad_norm": 1.51897132396698, + "learning_rate": 1.9216785901721136e-05, + "loss": 0.5591, + "step": 13555 + }, + { + "epoch": 2.2128892698257214, + "grad_norm": 1.6662660837173462, + "learning_rate": 1.9216662786521843e-05, + "loss": 0.5975, + "step": 13556 + }, + { + "epoch": 2.213052528468226, + "grad_norm": 1.9194362163543701, + "learning_rate": 1.921653966204137e-05, + "loss": 0.6315, + "step": 13557 + }, + { + "epoch": 2.2132157871107303, + "grad_norm": 2.01161527633667, + "learning_rate": 1.9216416528279848e-05, + "loss": 0.6387, + "step": 13558 + }, + { + "epoch": 2.2133790457532347, + "grad_norm": 1.324353575706482, + "learning_rate": 1.9216293385237396e-05, + "loss": 0.4356, + "step": 13559 + }, + { + "epoch": 2.213542304395739, + "grad_norm": 2.0111591815948486, + "learning_rate": 1.921617023291414e-05, + "loss": 0.7215, + "step": 13560 + }, + { + "epoch": 2.213705563038243, + "grad_norm": 1.8432612419128418, + "learning_rate": 1.9216047071310202e-05, + "loss": 0.54, + "step": 13561 + }, + { + "epoch": 2.2138688216807476, + "grad_norm": 1.6190499067306519, + "learning_rate": 1.921592390042571e-05, + "loss": 0.533, + "step": 13562 + }, + { + "epoch": 2.214032080323252, + "grad_norm": 1.7931301593780518, + "learning_rate": 1.921580072026078e-05, + "loss": 0.6241, + "step": 13563 + }, + { + "epoch": 2.2141953389657565, + "grad_norm": 1.547144889831543, + "learning_rate": 1.921567753081554e-05, + "loss": 0.5226, + "step": 13564 + }, + { + "epoch": 2.214358597608261, + "grad_norm": 1.7574440240859985, + "learning_rate": 1.921555433209012e-05, + "loss": 0.6014, + "step": 13565 + }, + { + "epoch": 2.2145218562507654, + "grad_norm": 1.9926255941390991, + "learning_rate": 1.921543112408464e-05, + "loss": 0.5248, + "step": 13566 + }, + { + "epoch": 2.21468511489327, + "grad_norm": 1.995849609375, + "learning_rate": 1.921530790679922e-05, + "loss": 0.6959, + "step": 13567 + }, + { + "epoch": 2.2148483735357742, + "grad_norm": 1.853157877922058, + "learning_rate": 1.9215184680233988e-05, + "loss": 0.4849, + "step": 13568 + }, + { + "epoch": 2.2150116321782782, + "grad_norm": 1.7080700397491455, + "learning_rate": 1.9215061444389068e-05, + "loss": 0.5376, + "step": 13569 + }, + { + "epoch": 2.2151748908207827, + "grad_norm": 2.378742218017578, + "learning_rate": 1.9214938199264584e-05, + "loss": 0.5555, + "step": 13570 + }, + { + "epoch": 2.215338149463287, + "grad_norm": 1.8883365392684937, + "learning_rate": 1.921481494486066e-05, + "loss": 0.6269, + "step": 13571 + }, + { + "epoch": 2.2155014081057915, + "grad_norm": 1.768269419670105, + "learning_rate": 1.921469168117742e-05, + "loss": 0.5501, + "step": 13572 + }, + { + "epoch": 2.215664666748296, + "grad_norm": 1.536367416381836, + "learning_rate": 1.9214568408214986e-05, + "loss": 0.5422, + "step": 13573 + }, + { + "epoch": 2.2158279253908004, + "grad_norm": 1.4148045778274536, + "learning_rate": 1.9214445125973484e-05, + "loss": 0.4636, + "step": 13574 + }, + { + "epoch": 2.215991184033305, + "grad_norm": 1.997836709022522, + "learning_rate": 1.9214321834453042e-05, + "loss": 0.5852, + "step": 13575 + }, + { + "epoch": 2.2161544426758093, + "grad_norm": 1.787115216255188, + "learning_rate": 1.9214198533653777e-05, + "loss": 0.6068, + "step": 13576 + }, + { + "epoch": 2.2163177013183137, + "grad_norm": 1.6745878458023071, + "learning_rate": 1.9214075223575818e-05, + "loss": 0.5591, + "step": 13577 + }, + { + "epoch": 2.2164809599608177, + "grad_norm": 2.0698964595794678, + "learning_rate": 1.9213951904219284e-05, + "loss": 0.6532, + "step": 13578 + }, + { + "epoch": 2.216644218603322, + "grad_norm": 1.9353781938552856, + "learning_rate": 1.9213828575584304e-05, + "loss": 0.6464, + "step": 13579 + }, + { + "epoch": 2.2168074772458266, + "grad_norm": 2.049626588821411, + "learning_rate": 1.9213705237671007e-05, + "loss": 0.483, + "step": 13580 + }, + { + "epoch": 2.216970735888331, + "grad_norm": 1.7843250036239624, + "learning_rate": 1.9213581890479503e-05, + "loss": 0.4853, + "step": 13581 + }, + { + "epoch": 2.2171339945308355, + "grad_norm": 1.8463362455368042, + "learning_rate": 1.9213458534009933e-05, + "loss": 0.5713, + "step": 13582 + }, + { + "epoch": 2.21729725317334, + "grad_norm": 1.8813565969467163, + "learning_rate": 1.9213335168262407e-05, + "loss": 0.6072, + "step": 13583 + }, + { + "epoch": 2.2174605118158444, + "grad_norm": 2.41782283782959, + "learning_rate": 1.9213211793237056e-05, + "loss": 0.6551, + "step": 13584 + }, + { + "epoch": 2.217623770458349, + "grad_norm": 1.6749900579452515, + "learning_rate": 1.9213088408934003e-05, + "loss": 0.5389, + "step": 13585 + }, + { + "epoch": 2.217787029100853, + "grad_norm": 1.705018401145935, + "learning_rate": 1.921296501535337e-05, + "loss": 0.5046, + "step": 13586 + }, + { + "epoch": 2.2179502877433572, + "grad_norm": 1.6513890027999878, + "learning_rate": 1.921284161249529e-05, + "loss": 0.5133, + "step": 13587 + }, + { + "epoch": 2.2181135463858617, + "grad_norm": 1.7584933042526245, + "learning_rate": 1.9212718200359876e-05, + "loss": 0.5469, + "step": 13588 + }, + { + "epoch": 2.218276805028366, + "grad_norm": 1.9311105012893677, + "learning_rate": 1.9212594778947256e-05, + "loss": 0.6027, + "step": 13589 + }, + { + "epoch": 2.2184400636708705, + "grad_norm": 1.5126897096633911, + "learning_rate": 1.9212471348257562e-05, + "loss": 0.4942, + "step": 13590 + }, + { + "epoch": 2.218603322313375, + "grad_norm": 1.4252760410308838, + "learning_rate": 1.9212347908290906e-05, + "loss": 0.4826, + "step": 13591 + }, + { + "epoch": 2.2187665809558794, + "grad_norm": 1.8107327222824097, + "learning_rate": 1.921222445904742e-05, + "loss": 0.5526, + "step": 13592 + }, + { + "epoch": 2.218929839598384, + "grad_norm": 2.0095150470733643, + "learning_rate": 1.9212101000527225e-05, + "loss": 0.5954, + "step": 13593 + }, + { + "epoch": 2.2190930982408883, + "grad_norm": 1.8584787845611572, + "learning_rate": 1.9211977532730448e-05, + "loss": 0.5531, + "step": 13594 + }, + { + "epoch": 2.2192563568833927, + "grad_norm": 1.8032480478286743, + "learning_rate": 1.9211854055657216e-05, + "loss": 0.5584, + "step": 13595 + }, + { + "epoch": 2.2194196155258967, + "grad_norm": 1.5795607566833496, + "learning_rate": 1.9211730569307642e-05, + "loss": 0.5806, + "step": 13596 + }, + { + "epoch": 2.219582874168401, + "grad_norm": 1.893045425415039, + "learning_rate": 1.9211607073681865e-05, + "loss": 0.6217, + "step": 13597 + }, + { + "epoch": 2.2197461328109056, + "grad_norm": 1.8879095315933228, + "learning_rate": 1.9211483568779996e-05, + "loss": 0.6045, + "step": 13598 + }, + { + "epoch": 2.21990939145341, + "grad_norm": 3.2096142768859863, + "learning_rate": 1.9211360054602167e-05, + "loss": 0.6962, + "step": 13599 + }, + { + "epoch": 2.2200726500959145, + "grad_norm": 1.289203405380249, + "learning_rate": 1.92112365311485e-05, + "loss": 0.453, + "step": 13600 + }, + { + "epoch": 2.220235908738419, + "grad_norm": 1.6069953441619873, + "learning_rate": 1.9211112998419127e-05, + "loss": 0.5033, + "step": 13601 + }, + { + "epoch": 2.2203991673809234, + "grad_norm": 1.496275782585144, + "learning_rate": 1.921098945641416e-05, + "loss": 0.5011, + "step": 13602 + }, + { + "epoch": 2.220562426023428, + "grad_norm": 1.8447808027267456, + "learning_rate": 1.921086590513373e-05, + "loss": 0.5403, + "step": 13603 + }, + { + "epoch": 2.220725684665932, + "grad_norm": 1.7715197801589966, + "learning_rate": 1.921074234457796e-05, + "loss": 0.5345, + "step": 13604 + }, + { + "epoch": 2.2208889433084362, + "grad_norm": 1.9201494455337524, + "learning_rate": 1.9210618774746974e-05, + "loss": 0.5244, + "step": 13605 + }, + { + "epoch": 2.2210522019509407, + "grad_norm": 1.7460635900497437, + "learning_rate": 1.9210495195640895e-05, + "loss": 0.6641, + "step": 13606 + }, + { + "epoch": 2.221215460593445, + "grad_norm": 1.421812653541565, + "learning_rate": 1.9210371607259857e-05, + "loss": 0.5238, + "step": 13607 + }, + { + "epoch": 2.2213787192359495, + "grad_norm": 1.7292649745941162, + "learning_rate": 1.9210248009603974e-05, + "loss": 0.4437, + "step": 13608 + }, + { + "epoch": 2.221541977878454, + "grad_norm": 1.6033774614334106, + "learning_rate": 1.921012440267337e-05, + "loss": 0.5585, + "step": 13609 + }, + { + "epoch": 2.2217052365209584, + "grad_norm": 1.7057805061340332, + "learning_rate": 1.9210000786468178e-05, + "loss": 0.5226, + "step": 13610 + }, + { + "epoch": 2.221868495163463, + "grad_norm": 1.9785815477371216, + "learning_rate": 1.9209877160988516e-05, + "loss": 0.5909, + "step": 13611 + }, + { + "epoch": 2.2220317538059673, + "grad_norm": 1.5307905673980713, + "learning_rate": 1.920975352623451e-05, + "loss": 0.4785, + "step": 13612 + }, + { + "epoch": 2.2221950124484713, + "grad_norm": 1.7326740026474, + "learning_rate": 1.9209629882206284e-05, + "loss": 0.5041, + "step": 13613 + }, + { + "epoch": 2.2223582710909757, + "grad_norm": 2.1607391834259033, + "learning_rate": 1.9209506228903965e-05, + "loss": 0.632, + "step": 13614 + }, + { + "epoch": 2.22252152973348, + "grad_norm": 2.011892795562744, + "learning_rate": 1.9209382566327675e-05, + "loss": 0.6038, + "step": 13615 + }, + { + "epoch": 2.2226847883759846, + "grad_norm": 1.4998561143875122, + "learning_rate": 1.9209258894477537e-05, + "loss": 0.5252, + "step": 13616 + }, + { + "epoch": 2.222848047018489, + "grad_norm": 1.5509207248687744, + "learning_rate": 1.920913521335368e-05, + "loss": 0.5204, + "step": 13617 + }, + { + "epoch": 2.2230113056609935, + "grad_norm": 2.1628544330596924, + "learning_rate": 1.9209011522956226e-05, + "loss": 0.7085, + "step": 13618 + }, + { + "epoch": 2.223174564303498, + "grad_norm": 1.7155131101608276, + "learning_rate": 1.92088878232853e-05, + "loss": 0.5317, + "step": 13619 + }, + { + "epoch": 2.2233378229460024, + "grad_norm": 1.5279570817947388, + "learning_rate": 1.9208764114341028e-05, + "loss": 0.4553, + "step": 13620 + }, + { + "epoch": 2.223501081588507, + "grad_norm": 1.885162591934204, + "learning_rate": 1.920864039612353e-05, + "loss": 0.6281, + "step": 13621 + }, + { + "epoch": 2.223664340231011, + "grad_norm": 2.0555436611175537, + "learning_rate": 1.9208516668632936e-05, + "loss": 0.6174, + "step": 13622 + }, + { + "epoch": 2.223827598873515, + "grad_norm": 1.6779931783676147, + "learning_rate": 1.9208392931869367e-05, + "loss": 0.5731, + "step": 13623 + }, + { + "epoch": 2.2239908575160197, + "grad_norm": 1.7192808389663696, + "learning_rate": 1.920826918583295e-05, + "loss": 0.6876, + "step": 13624 + }, + { + "epoch": 2.224154116158524, + "grad_norm": 2.0194222927093506, + "learning_rate": 1.9208145430523804e-05, + "loss": 0.6239, + "step": 13625 + }, + { + "epoch": 2.2243173748010285, + "grad_norm": 1.5343436002731323, + "learning_rate": 1.920802166594206e-05, + "loss": 0.5947, + "step": 13626 + }, + { + "epoch": 2.224480633443533, + "grad_norm": 1.7890980243682861, + "learning_rate": 1.9207897892087844e-05, + "loss": 0.5279, + "step": 13627 + }, + { + "epoch": 2.2246438920860374, + "grad_norm": 1.6333937644958496, + "learning_rate": 1.9207774108961273e-05, + "loss": 0.5045, + "step": 13628 + }, + { + "epoch": 2.224807150728542, + "grad_norm": 1.6821261644363403, + "learning_rate": 1.920765031656248e-05, + "loss": 0.5682, + "step": 13629 + }, + { + "epoch": 2.2249704093710463, + "grad_norm": 2.1920340061187744, + "learning_rate": 1.9207526514891582e-05, + "loss": 0.9176, + "step": 13630 + }, + { + "epoch": 2.2251336680135503, + "grad_norm": 2.1574747562408447, + "learning_rate": 1.9207402703948707e-05, + "loss": 0.5489, + "step": 13631 + }, + { + "epoch": 2.2252969266560547, + "grad_norm": 1.9292341470718384, + "learning_rate": 1.920727888373398e-05, + "loss": 0.6237, + "step": 13632 + }, + { + "epoch": 2.225460185298559, + "grad_norm": 1.6960676908493042, + "learning_rate": 1.920715505424753e-05, + "loss": 0.541, + "step": 13633 + }, + { + "epoch": 2.2256234439410636, + "grad_norm": 1.442441463470459, + "learning_rate": 1.9207031215489474e-05, + "loss": 0.4037, + "step": 13634 + }, + { + "epoch": 2.225786702583568, + "grad_norm": 1.270574927330017, + "learning_rate": 1.920690736745994e-05, + "loss": 0.3262, + "step": 13635 + }, + { + "epoch": 2.2259499612260725, + "grad_norm": 2.1209495067596436, + "learning_rate": 1.9206783510159054e-05, + "loss": 0.7687, + "step": 13636 + }, + { + "epoch": 2.226113219868577, + "grad_norm": 2.033233880996704, + "learning_rate": 1.9206659643586938e-05, + "loss": 0.6396, + "step": 13637 + }, + { + "epoch": 2.2262764785110813, + "grad_norm": 1.7493888139724731, + "learning_rate": 1.9206535767743717e-05, + "loss": 0.6214, + "step": 13638 + }, + { + "epoch": 2.2264397371535853, + "grad_norm": 1.8327149152755737, + "learning_rate": 1.920641188262952e-05, + "loss": 0.5637, + "step": 13639 + }, + { + "epoch": 2.22660299579609, + "grad_norm": 2.0350399017333984, + "learning_rate": 1.9206287988244467e-05, + "loss": 0.7454, + "step": 13640 + }, + { + "epoch": 2.226766254438594, + "grad_norm": 1.6738038063049316, + "learning_rate": 1.9206164084588685e-05, + "loss": 0.5479, + "step": 13641 + }, + { + "epoch": 2.2269295130810987, + "grad_norm": 1.7716044187545776, + "learning_rate": 1.92060401716623e-05, + "loss": 0.5822, + "step": 13642 + }, + { + "epoch": 2.227092771723603, + "grad_norm": 1.9412842988967896, + "learning_rate": 1.9205916249465432e-05, + "loss": 0.6407, + "step": 13643 + }, + { + "epoch": 2.2272560303661075, + "grad_norm": 1.6795233488082886, + "learning_rate": 1.9205792317998208e-05, + "loss": 0.4843, + "step": 13644 + }, + { + "epoch": 2.227419289008612, + "grad_norm": 1.7944285869598389, + "learning_rate": 1.9205668377260757e-05, + "loss": 0.5733, + "step": 13645 + }, + { + "epoch": 2.2275825476511164, + "grad_norm": 1.9152007102966309, + "learning_rate": 1.9205544427253198e-05, + "loss": 0.6878, + "step": 13646 + }, + { + "epoch": 2.227745806293621, + "grad_norm": 1.965161919593811, + "learning_rate": 1.9205420467975656e-05, + "loss": 0.7353, + "step": 13647 + }, + { + "epoch": 2.2279090649361253, + "grad_norm": 1.8432848453521729, + "learning_rate": 1.9205296499428264e-05, + "loss": 0.4868, + "step": 13648 + }, + { + "epoch": 2.2280723235786293, + "grad_norm": 1.7385543584823608, + "learning_rate": 1.9205172521611136e-05, + "loss": 0.5237, + "step": 13649 + }, + { + "epoch": 2.2282355822211337, + "grad_norm": 1.6774332523345947, + "learning_rate": 1.9205048534524405e-05, + "loss": 0.5438, + "step": 13650 + }, + { + "epoch": 2.228398840863638, + "grad_norm": 1.697908878326416, + "learning_rate": 1.9204924538168192e-05, + "loss": 0.5761, + "step": 13651 + }, + { + "epoch": 2.2285620995061426, + "grad_norm": 1.8424638509750366, + "learning_rate": 1.920480053254262e-05, + "loss": 0.5592, + "step": 13652 + }, + { + "epoch": 2.228725358148647, + "grad_norm": 1.926652193069458, + "learning_rate": 1.9204676517647818e-05, + "loss": 0.6077, + "step": 13653 + }, + { + "epoch": 2.2288886167911515, + "grad_norm": 1.8519768714904785, + "learning_rate": 1.920455249348391e-05, + "loss": 0.6069, + "step": 13654 + }, + { + "epoch": 2.229051875433656, + "grad_norm": 1.6059880256652832, + "learning_rate": 1.920442846005102e-05, + "loss": 0.6116, + "step": 13655 + }, + { + "epoch": 2.2292151340761603, + "grad_norm": 2.3965744972229004, + "learning_rate": 1.920430441734927e-05, + "loss": 0.6136, + "step": 13656 + }, + { + "epoch": 2.2293783927186643, + "grad_norm": 1.9318472146987915, + "learning_rate": 1.9204180365378792e-05, + "loss": 0.6887, + "step": 13657 + }, + { + "epoch": 2.2295416513611688, + "grad_norm": 1.564235806465149, + "learning_rate": 1.9204056304139703e-05, + "loss": 0.5458, + "step": 13658 + }, + { + "epoch": 2.229704910003673, + "grad_norm": 2.087393283843994, + "learning_rate": 1.9203932233632133e-05, + "loss": 0.6895, + "step": 13659 + }, + { + "epoch": 2.2298681686461777, + "grad_norm": 1.6476703882217407, + "learning_rate": 1.9203808153856206e-05, + "loss": 0.5532, + "step": 13660 + }, + { + "epoch": 2.230031427288682, + "grad_norm": 1.919629454612732, + "learning_rate": 1.9203684064812047e-05, + "loss": 0.5048, + "step": 13661 + }, + { + "epoch": 2.2301946859311865, + "grad_norm": 2.098191022872925, + "learning_rate": 1.920355996649978e-05, + "loss": 0.6111, + "step": 13662 + }, + { + "epoch": 2.230357944573691, + "grad_norm": 1.801698088645935, + "learning_rate": 1.9203435858919532e-05, + "loss": 0.5761, + "step": 13663 + }, + { + "epoch": 2.2305212032161954, + "grad_norm": 2.014416456222534, + "learning_rate": 1.9203311742071426e-05, + "loss": 0.7002, + "step": 13664 + }, + { + "epoch": 2.2306844618587, + "grad_norm": 1.6290326118469238, + "learning_rate": 1.9203187615955587e-05, + "loss": 0.5589, + "step": 13665 + }, + { + "epoch": 2.230847720501204, + "grad_norm": 1.9970515966415405, + "learning_rate": 1.920306348057214e-05, + "loss": 0.6231, + "step": 13666 + }, + { + "epoch": 2.2310109791437083, + "grad_norm": 1.9401311874389648, + "learning_rate": 1.920293933592121e-05, + "loss": 0.6371, + "step": 13667 + }, + { + "epoch": 2.2311742377862127, + "grad_norm": 2.0539445877075195, + "learning_rate": 1.9202815182002924e-05, + "loss": 0.652, + "step": 13668 + }, + { + "epoch": 2.231337496428717, + "grad_norm": 1.8345069885253906, + "learning_rate": 1.9202691018817406e-05, + "loss": 0.5604, + "step": 13669 + }, + { + "epoch": 2.2315007550712216, + "grad_norm": 1.569280743598938, + "learning_rate": 1.9202566846364782e-05, + "loss": 0.4602, + "step": 13670 + }, + { + "epoch": 2.231664013713726, + "grad_norm": 1.7116658687591553, + "learning_rate": 1.920244266464517e-05, + "loss": 0.5133, + "step": 13671 + }, + { + "epoch": 2.2318272723562305, + "grad_norm": 2.007737398147583, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.622, + "step": 13672 + }, + { + "epoch": 2.231990530998735, + "grad_norm": 1.6424188613891602, + "learning_rate": 1.9202194273405506e-05, + "loss": 0.546, + "step": 13673 + }, + { + "epoch": 2.2321537896412393, + "grad_norm": 1.7496731281280518, + "learning_rate": 1.9202070063885703e-05, + "loss": 0.5046, + "step": 13674 + }, + { + "epoch": 2.2323170482837433, + "grad_norm": 1.5988408327102661, + "learning_rate": 1.9201945845099415e-05, + "loss": 0.5005, + "step": 13675 + }, + { + "epoch": 2.2324803069262478, + "grad_norm": 1.7019726037979126, + "learning_rate": 1.920182161704677e-05, + "loss": 0.519, + "step": 13676 + }, + { + "epoch": 2.232643565568752, + "grad_norm": 1.8633251190185547, + "learning_rate": 1.9201697379727894e-05, + "loss": 0.5243, + "step": 13677 + }, + { + "epoch": 2.2328068242112566, + "grad_norm": 1.7601333856582642, + "learning_rate": 1.920157313314291e-05, + "loss": 0.5426, + "step": 13678 + }, + { + "epoch": 2.232970082853761, + "grad_norm": 1.72750723361969, + "learning_rate": 1.9201448877291942e-05, + "loss": 0.5703, + "step": 13679 + }, + { + "epoch": 2.2331333414962655, + "grad_norm": 1.9131797552108765, + "learning_rate": 1.9201324612175123e-05, + "loss": 0.5821, + "step": 13680 + }, + { + "epoch": 2.23329660013877, + "grad_norm": 1.756535530090332, + "learning_rate": 1.920120033779257e-05, + "loss": 0.5365, + "step": 13681 + }, + { + "epoch": 2.2334598587812744, + "grad_norm": 1.9451779127120972, + "learning_rate": 1.9201076054144412e-05, + "loss": 0.5343, + "step": 13682 + }, + { + "epoch": 2.233623117423779, + "grad_norm": 1.742173433303833, + "learning_rate": 1.920095176123077e-05, + "loss": 0.5621, + "step": 13683 + }, + { + "epoch": 2.233786376066283, + "grad_norm": 1.9369580745697021, + "learning_rate": 1.9200827459051774e-05, + "loss": 0.5254, + "step": 13684 + }, + { + "epoch": 2.2339496347087873, + "grad_norm": 1.525497317314148, + "learning_rate": 1.9200703147607545e-05, + "loss": 0.5334, + "step": 13685 + }, + { + "epoch": 2.2341128933512917, + "grad_norm": 1.8263508081436157, + "learning_rate": 1.9200578826898212e-05, + "loss": 0.5596, + "step": 13686 + }, + { + "epoch": 2.234276151993796, + "grad_norm": 1.809680461883545, + "learning_rate": 1.92004544969239e-05, + "loss": 0.5233, + "step": 13687 + }, + { + "epoch": 2.2344394106363006, + "grad_norm": 1.5661643743515015, + "learning_rate": 1.920033015768473e-05, + "loss": 0.4759, + "step": 13688 + }, + { + "epoch": 2.234602669278805, + "grad_norm": 2.0662524700164795, + "learning_rate": 1.920020580918083e-05, + "loss": 0.6404, + "step": 13689 + }, + { + "epoch": 2.2347659279213095, + "grad_norm": 1.9582611322402954, + "learning_rate": 1.920008145141233e-05, + "loss": 0.5272, + "step": 13690 + }, + { + "epoch": 2.234929186563814, + "grad_norm": 1.7234505414962769, + "learning_rate": 1.9199957084379347e-05, + "loss": 0.5547, + "step": 13691 + }, + { + "epoch": 2.235092445206318, + "grad_norm": 1.6556750535964966, + "learning_rate": 1.919983270808201e-05, + "loss": 0.5056, + "step": 13692 + }, + { + "epoch": 2.2352557038488223, + "grad_norm": 1.8328620195388794, + "learning_rate": 1.9199708322520443e-05, + "loss": 0.5382, + "step": 13693 + }, + { + "epoch": 2.2354189624913268, + "grad_norm": 1.713192343711853, + "learning_rate": 1.9199583927694775e-05, + "loss": 0.4894, + "step": 13694 + }, + { + "epoch": 2.235582221133831, + "grad_norm": 1.8599199056625366, + "learning_rate": 1.919945952360512e-05, + "loss": 0.5441, + "step": 13695 + }, + { + "epoch": 2.2357454797763356, + "grad_norm": 1.7368930578231812, + "learning_rate": 1.919933511025162e-05, + "loss": 0.4791, + "step": 13696 + }, + { + "epoch": 2.23590873841884, + "grad_norm": 1.895760416984558, + "learning_rate": 1.9199210687634392e-05, + "loss": 0.6363, + "step": 13697 + }, + { + "epoch": 2.2360719970613445, + "grad_norm": 2.0075573921203613, + "learning_rate": 1.9199086255753557e-05, + "loss": 0.58, + "step": 13698 + }, + { + "epoch": 2.236235255703849, + "grad_norm": 1.6400787830352783, + "learning_rate": 1.9198961814609248e-05, + "loss": 0.5148, + "step": 13699 + }, + { + "epoch": 2.2363985143463534, + "grad_norm": 1.8108351230621338, + "learning_rate": 1.9198837364201587e-05, + "loss": 0.5497, + "step": 13700 + }, + { + "epoch": 2.236561772988858, + "grad_norm": 1.849120020866394, + "learning_rate": 1.9198712904530695e-05, + "loss": 0.6571, + "step": 13701 + }, + { + "epoch": 2.236725031631362, + "grad_norm": 1.6454564332962036, + "learning_rate": 1.9198588435596705e-05, + "loss": 0.4953, + "step": 13702 + }, + { + "epoch": 2.2368882902738663, + "grad_norm": 1.6178202629089355, + "learning_rate": 1.919846395739974e-05, + "loss": 0.5045, + "step": 13703 + }, + { + "epoch": 2.2370515489163707, + "grad_norm": 1.6802425384521484, + "learning_rate": 1.919833946993992e-05, + "loss": 0.5456, + "step": 13704 + }, + { + "epoch": 2.237214807558875, + "grad_norm": 1.8576278686523438, + "learning_rate": 1.919821497321738e-05, + "loss": 0.6614, + "step": 13705 + }, + { + "epoch": 2.2373780662013796, + "grad_norm": 1.723831295967102, + "learning_rate": 1.9198090467232235e-05, + "loss": 0.4974, + "step": 13706 + }, + { + "epoch": 2.237541324843884, + "grad_norm": 1.556442141532898, + "learning_rate": 1.9197965951984618e-05, + "loss": 0.5439, + "step": 13707 + }, + { + "epoch": 2.2377045834863885, + "grad_norm": 1.8674726486206055, + "learning_rate": 1.9197841427474652e-05, + "loss": 0.5716, + "step": 13708 + }, + { + "epoch": 2.237867842128893, + "grad_norm": 1.748146414756775, + "learning_rate": 1.9197716893702458e-05, + "loss": 0.6134, + "step": 13709 + }, + { + "epoch": 2.238031100771397, + "grad_norm": 1.9865103960037231, + "learning_rate": 1.919759235066817e-05, + "loss": 0.5648, + "step": 13710 + }, + { + "epoch": 2.2381943594139013, + "grad_norm": 1.5851380825042725, + "learning_rate": 1.919746779837191e-05, + "loss": 0.5418, + "step": 13711 + }, + { + "epoch": 2.2383576180564058, + "grad_norm": 1.774580717086792, + "learning_rate": 1.9197343236813798e-05, + "loss": 0.5809, + "step": 13712 + }, + { + "epoch": 2.23852087669891, + "grad_norm": 2.014620304107666, + "learning_rate": 1.9197218665993965e-05, + "loss": 0.5974, + "step": 13713 + }, + { + "epoch": 2.2386841353414146, + "grad_norm": 1.7066744565963745, + "learning_rate": 1.9197094085912536e-05, + "loss": 0.5592, + "step": 13714 + }, + { + "epoch": 2.238847393983919, + "grad_norm": 1.8475141525268555, + "learning_rate": 1.9196969496569638e-05, + "loss": 0.6696, + "step": 13715 + }, + { + "epoch": 2.2390106526264235, + "grad_norm": 2.2379250526428223, + "learning_rate": 1.9196844897965393e-05, + "loss": 0.7083, + "step": 13716 + }, + { + "epoch": 2.239173911268928, + "grad_norm": 1.96945059299469, + "learning_rate": 1.9196720290099925e-05, + "loss": 0.616, + "step": 13717 + }, + { + "epoch": 2.2393371699114324, + "grad_norm": 1.854867696762085, + "learning_rate": 1.919659567297336e-05, + "loss": 0.677, + "step": 13718 + }, + { + "epoch": 2.2395004285539364, + "grad_norm": 1.9322553873062134, + "learning_rate": 1.9196471046585832e-05, + "loss": 0.5433, + "step": 13719 + }, + { + "epoch": 2.239663687196441, + "grad_norm": 1.7704932689666748, + "learning_rate": 1.9196346410937455e-05, + "loss": 0.4946, + "step": 13720 + }, + { + "epoch": 2.2398269458389453, + "grad_norm": 2.099966526031494, + "learning_rate": 1.9196221766028366e-05, + "loss": 0.6185, + "step": 13721 + }, + { + "epoch": 2.2399902044814497, + "grad_norm": 1.8084121942520142, + "learning_rate": 1.919609711185868e-05, + "loss": 0.661, + "step": 13722 + }, + { + "epoch": 2.240153463123954, + "grad_norm": 1.846971869468689, + "learning_rate": 1.9195972448428523e-05, + "loss": 0.5587, + "step": 13723 + }, + { + "epoch": 2.2403167217664586, + "grad_norm": 1.7566790580749512, + "learning_rate": 1.919584777573803e-05, + "loss": 0.6137, + "step": 13724 + }, + { + "epoch": 2.240479980408963, + "grad_norm": 1.6030325889587402, + "learning_rate": 1.9195723093787316e-05, + "loss": 0.527, + "step": 13725 + }, + { + "epoch": 2.2406432390514675, + "grad_norm": 2.250749111175537, + "learning_rate": 1.9195598402576516e-05, + "loss": 0.6198, + "step": 13726 + }, + { + "epoch": 2.2408064976939714, + "grad_norm": 1.690442442893982, + "learning_rate": 1.9195473702105748e-05, + "loss": 0.5035, + "step": 13727 + }, + { + "epoch": 2.240969756336476, + "grad_norm": 2.0565600395202637, + "learning_rate": 1.919534899237514e-05, + "loss": 0.7277, + "step": 13728 + }, + { + "epoch": 2.2411330149789803, + "grad_norm": 1.8652559518814087, + "learning_rate": 1.919522427338482e-05, + "loss": 0.4829, + "step": 13729 + }, + { + "epoch": 2.2412962736214848, + "grad_norm": 1.6995346546173096, + "learning_rate": 1.9195099545134913e-05, + "loss": 0.5189, + "step": 13730 + }, + { + "epoch": 2.241459532263989, + "grad_norm": 2.6613330841064453, + "learning_rate": 1.9194974807625543e-05, + "loss": 0.7003, + "step": 13731 + }, + { + "epoch": 2.2416227909064936, + "grad_norm": 2.0136265754699707, + "learning_rate": 1.9194850060856832e-05, + "loss": 0.6499, + "step": 13732 + }, + { + "epoch": 2.241786049548998, + "grad_norm": 2.1207540035247803, + "learning_rate": 1.919472530482891e-05, + "loss": 0.6356, + "step": 13733 + }, + { + "epoch": 2.2419493081915025, + "grad_norm": 1.6578316688537598, + "learning_rate": 1.9194600539541906e-05, + "loss": 0.4922, + "step": 13734 + }, + { + "epoch": 2.242112566834007, + "grad_norm": 1.7077155113220215, + "learning_rate": 1.919447576499594e-05, + "loss": 0.6153, + "step": 13735 + }, + { + "epoch": 2.2422758254765114, + "grad_norm": 2.079155445098877, + "learning_rate": 1.9194350981191135e-05, + "loss": 0.6113, + "step": 13736 + }, + { + "epoch": 2.2424390841190154, + "grad_norm": 1.8367098569869995, + "learning_rate": 1.9194226188127625e-05, + "loss": 0.6027, + "step": 13737 + }, + { + "epoch": 2.24260234276152, + "grad_norm": 2.134187698364258, + "learning_rate": 1.919410138580553e-05, + "loss": 0.594, + "step": 13738 + }, + { + "epoch": 2.2427656014040243, + "grad_norm": 1.9266157150268555, + "learning_rate": 1.919397657422498e-05, + "loss": 0.5071, + "step": 13739 + }, + { + "epoch": 2.2429288600465287, + "grad_norm": 1.9382504224777222, + "learning_rate": 1.9193851753386095e-05, + "loss": 0.661, + "step": 13740 + }, + { + "epoch": 2.243092118689033, + "grad_norm": 1.9716973304748535, + "learning_rate": 1.9193726923289006e-05, + "loss": 0.6356, + "step": 13741 + }, + { + "epoch": 2.2432553773315376, + "grad_norm": 1.9010920524597168, + "learning_rate": 1.9193602083933834e-05, + "loss": 0.7011, + "step": 13742 + }, + { + "epoch": 2.243418635974042, + "grad_norm": 2.4176313877105713, + "learning_rate": 1.919347723532071e-05, + "loss": 0.7279, + "step": 13743 + }, + { + "epoch": 2.2435818946165464, + "grad_norm": 1.9597762823104858, + "learning_rate": 1.9193352377449757e-05, + "loss": 0.591, + "step": 13744 + }, + { + "epoch": 2.2437451532590504, + "grad_norm": 2.0583512783050537, + "learning_rate": 1.9193227510321096e-05, + "loss": 0.6118, + "step": 13745 + }, + { + "epoch": 2.243908411901555, + "grad_norm": 1.846514344215393, + "learning_rate": 1.919310263393486e-05, + "loss": 0.5783, + "step": 13746 + }, + { + "epoch": 2.2440716705440593, + "grad_norm": 1.6857624053955078, + "learning_rate": 1.9192977748291174e-05, + "loss": 0.5163, + "step": 13747 + }, + { + "epoch": 2.2442349291865638, + "grad_norm": 1.8853702545166016, + "learning_rate": 1.919285285339016e-05, + "loss": 0.5414, + "step": 13748 + }, + { + "epoch": 2.244398187829068, + "grad_norm": 1.7761893272399902, + "learning_rate": 1.9192727949231945e-05, + "loss": 0.5489, + "step": 13749 + }, + { + "epoch": 2.2445614464715726, + "grad_norm": 1.9203208684921265, + "learning_rate": 1.9192603035816657e-05, + "loss": 0.5609, + "step": 13750 + }, + { + "epoch": 2.244724705114077, + "grad_norm": 1.6524074077606201, + "learning_rate": 1.919247811314442e-05, + "loss": 0.5297, + "step": 13751 + }, + { + "epoch": 2.2448879637565815, + "grad_norm": 1.7700055837631226, + "learning_rate": 1.919235318121536e-05, + "loss": 0.5477, + "step": 13752 + }, + { + "epoch": 2.245051222399086, + "grad_norm": 1.578218936920166, + "learning_rate": 1.9192228240029604e-05, + "loss": 0.5708, + "step": 13753 + }, + { + "epoch": 2.24521448104159, + "grad_norm": 1.9086763858795166, + "learning_rate": 1.9192103289587273e-05, + "loss": 0.523, + "step": 13754 + }, + { + "epoch": 2.2453777396840944, + "grad_norm": 1.6069018840789795, + "learning_rate": 1.91919783298885e-05, + "loss": 0.4637, + "step": 13755 + }, + { + "epoch": 2.245540998326599, + "grad_norm": 1.522386908531189, + "learning_rate": 1.9191853360933403e-05, + "loss": 0.4893, + "step": 13756 + }, + { + "epoch": 2.2457042569691033, + "grad_norm": 1.5854510068893433, + "learning_rate": 1.9191728382722114e-05, + "loss": 0.5097, + "step": 13757 + }, + { + "epoch": 2.2458675156116077, + "grad_norm": 1.8278493881225586, + "learning_rate": 1.9191603395254758e-05, + "loss": 0.623, + "step": 13758 + }, + { + "epoch": 2.246030774254112, + "grad_norm": 1.8326702117919922, + "learning_rate": 1.919147839853146e-05, + "loss": 0.5889, + "step": 13759 + }, + { + "epoch": 2.2461940328966166, + "grad_norm": 2.199416160583496, + "learning_rate": 1.9191353392552346e-05, + "loss": 0.7342, + "step": 13760 + }, + { + "epoch": 2.246357291539121, + "grad_norm": 2.0428128242492676, + "learning_rate": 1.9191228377317542e-05, + "loss": 0.8464, + "step": 13761 + }, + { + "epoch": 2.2465205501816254, + "grad_norm": 1.8300420045852661, + "learning_rate": 1.919110335282717e-05, + "loss": 0.5663, + "step": 13762 + }, + { + "epoch": 2.2466838088241294, + "grad_norm": 1.7930339574813843, + "learning_rate": 1.9190978319081363e-05, + "loss": 0.5249, + "step": 13763 + }, + { + "epoch": 2.246847067466634, + "grad_norm": 1.9573358297348022, + "learning_rate": 1.9190853276080243e-05, + "loss": 0.6438, + "step": 13764 + }, + { + "epoch": 2.2470103261091383, + "grad_norm": 2.268261671066284, + "learning_rate": 1.9190728223823932e-05, + "loss": 0.7009, + "step": 13765 + }, + { + "epoch": 2.2471735847516427, + "grad_norm": 1.532370686531067, + "learning_rate": 1.9190603162312564e-05, + "loss": 0.4785, + "step": 13766 + }, + { + "epoch": 2.247336843394147, + "grad_norm": 2.237332582473755, + "learning_rate": 1.9190478091546262e-05, + "loss": 0.6256, + "step": 13767 + }, + { + "epoch": 2.2475001020366516, + "grad_norm": 2.086373805999756, + "learning_rate": 1.9190353011525147e-05, + "loss": 0.6136, + "step": 13768 + }, + { + "epoch": 2.247663360679156, + "grad_norm": 2.136364698410034, + "learning_rate": 1.9190227922249353e-05, + "loss": 0.6996, + "step": 13769 + }, + { + "epoch": 2.2478266193216605, + "grad_norm": 1.9089447259902954, + "learning_rate": 1.9190102823719e-05, + "loss": 0.6719, + "step": 13770 + }, + { + "epoch": 2.247989877964165, + "grad_norm": 1.6479437351226807, + "learning_rate": 1.9189977715934214e-05, + "loss": 0.5249, + "step": 13771 + }, + { + "epoch": 2.248153136606669, + "grad_norm": 1.7013919353485107, + "learning_rate": 1.9189852598895126e-05, + "loss": 0.5759, + "step": 13772 + }, + { + "epoch": 2.2483163952491734, + "grad_norm": 2.2088284492492676, + "learning_rate": 1.9189727472601858e-05, + "loss": 0.6854, + "step": 13773 + }, + { + "epoch": 2.248479653891678, + "grad_norm": 1.5300652980804443, + "learning_rate": 1.9189602337054537e-05, + "loss": 0.4941, + "step": 13774 + }, + { + "epoch": 2.2486429125341822, + "grad_norm": 2.220555067062378, + "learning_rate": 1.9189477192253288e-05, + "loss": 0.7048, + "step": 13775 + }, + { + "epoch": 2.2488061711766867, + "grad_norm": 1.9535768032073975, + "learning_rate": 1.9189352038198237e-05, + "loss": 0.5705, + "step": 13776 + }, + { + "epoch": 2.248969429819191, + "grad_norm": 2.086860179901123, + "learning_rate": 1.9189226874889513e-05, + "loss": 0.5631, + "step": 13777 + }, + { + "epoch": 2.2491326884616956, + "grad_norm": 1.9180946350097656, + "learning_rate": 1.918910170232724e-05, + "loss": 0.6258, + "step": 13778 + }, + { + "epoch": 2.2492959471042, + "grad_norm": 1.7438100576400757, + "learning_rate": 1.9188976520511544e-05, + "loss": 0.5196, + "step": 13779 + }, + { + "epoch": 2.249459205746704, + "grad_norm": 1.5394387245178223, + "learning_rate": 1.918885132944255e-05, + "loss": 0.524, + "step": 13780 + }, + { + "epoch": 2.2496224643892084, + "grad_norm": 1.583945870399475, + "learning_rate": 1.9188726129120384e-05, + "loss": 0.5357, + "step": 13781 + }, + { + "epoch": 2.249785723031713, + "grad_norm": 1.5881071090698242, + "learning_rate": 1.9188600919545176e-05, + "loss": 0.4859, + "step": 13782 + }, + { + "epoch": 2.2499489816742173, + "grad_norm": 1.777314305305481, + "learning_rate": 1.9188475700717048e-05, + "loss": 0.5029, + "step": 13783 + }, + { + "epoch": 2.2501122403167217, + "grad_norm": 1.783778190612793, + "learning_rate": 1.918835047263613e-05, + "loss": 0.6128, + "step": 13784 + }, + { + "epoch": 2.250275498959226, + "grad_norm": 1.4925121068954468, + "learning_rate": 1.918822523530254e-05, + "loss": 0.492, + "step": 13785 + }, + { + "epoch": 2.2504387576017306, + "grad_norm": 2.04084849357605, + "learning_rate": 1.9188099988716413e-05, + "loss": 0.7082, + "step": 13786 + }, + { + "epoch": 2.250602016244235, + "grad_norm": 2.0846080780029297, + "learning_rate": 1.918797473287787e-05, + "loss": 0.6181, + "step": 13787 + }, + { + "epoch": 2.2507652748867395, + "grad_norm": 1.808350682258606, + "learning_rate": 1.918784946778704e-05, + "loss": 0.5681, + "step": 13788 + }, + { + "epoch": 2.250928533529244, + "grad_norm": 1.5540213584899902, + "learning_rate": 1.9187724193444048e-05, + "loss": 0.4644, + "step": 13789 + }, + { + "epoch": 2.251091792171748, + "grad_norm": 1.8823357820510864, + "learning_rate": 1.9187598909849023e-05, + "loss": 0.5541, + "step": 13790 + }, + { + "epoch": 2.2512550508142524, + "grad_norm": 1.7802637815475464, + "learning_rate": 1.9187473617002084e-05, + "loss": 0.5994, + "step": 13791 + }, + { + "epoch": 2.251418309456757, + "grad_norm": 1.6765565872192383, + "learning_rate": 1.9187348314903363e-05, + "loss": 0.4512, + "step": 13792 + }, + { + "epoch": 2.2515815680992612, + "grad_norm": 1.6791355609893799, + "learning_rate": 1.9187223003552986e-05, + "loss": 0.5165, + "step": 13793 + }, + { + "epoch": 2.2517448267417657, + "grad_norm": 1.7535674571990967, + "learning_rate": 1.9187097682951078e-05, + "loss": 0.6803, + "step": 13794 + }, + { + "epoch": 2.25190808538427, + "grad_norm": 1.520835041999817, + "learning_rate": 1.9186972353097764e-05, + "loss": 0.5403, + "step": 13795 + }, + { + "epoch": 2.2520713440267746, + "grad_norm": 2.1422929763793945, + "learning_rate": 1.918684701399317e-05, + "loss": 0.7642, + "step": 13796 + }, + { + "epoch": 2.252234602669279, + "grad_norm": 2.0887372493743896, + "learning_rate": 1.9186721665637424e-05, + "loss": 0.668, + "step": 13797 + }, + { + "epoch": 2.252397861311783, + "grad_norm": 1.7719815969467163, + "learning_rate": 1.9186596308030652e-05, + "loss": 0.577, + "step": 13798 + }, + { + "epoch": 2.2525611199542874, + "grad_norm": 2.3719582557678223, + "learning_rate": 1.918647094117298e-05, + "loss": 0.6861, + "step": 13799 + }, + { + "epoch": 2.252724378596792, + "grad_norm": 1.7096679210662842, + "learning_rate": 1.918634556506454e-05, + "loss": 0.5144, + "step": 13800 + }, + { + "epoch": 2.2528876372392963, + "grad_norm": 1.9554489850997925, + "learning_rate": 1.9186220179705444e-05, + "loss": 0.6177, + "step": 13801 + }, + { + "epoch": 2.2530508958818007, + "grad_norm": 1.6707007884979248, + "learning_rate": 1.9186094785095827e-05, + "loss": 0.5346, + "step": 13802 + }, + { + "epoch": 2.253214154524305, + "grad_norm": 1.5249888896942139, + "learning_rate": 1.9185969381235822e-05, + "loss": 0.4965, + "step": 13803 + }, + { + "epoch": 2.2533774131668096, + "grad_norm": 1.7888262271881104, + "learning_rate": 1.9185843968125543e-05, + "loss": 0.5457, + "step": 13804 + }, + { + "epoch": 2.253540671809314, + "grad_norm": 1.7828142642974854, + "learning_rate": 1.918571854576512e-05, + "loss": 0.5668, + "step": 13805 + }, + { + "epoch": 2.2537039304518185, + "grad_norm": 1.419290542602539, + "learning_rate": 1.9185593114154683e-05, + "loss": 0.5243, + "step": 13806 + }, + { + "epoch": 2.2538671890943225, + "grad_norm": 1.5581082105636597, + "learning_rate": 1.9185467673294358e-05, + "loss": 0.5372, + "step": 13807 + }, + { + "epoch": 2.254030447736827, + "grad_norm": 1.8153951168060303, + "learning_rate": 1.918534222318427e-05, + "loss": 0.5529, + "step": 13808 + }, + { + "epoch": 2.2541937063793314, + "grad_norm": 2.0038979053497314, + "learning_rate": 1.918521676382454e-05, + "loss": 0.7703, + "step": 13809 + }, + { + "epoch": 2.254356965021836, + "grad_norm": 1.8248510360717773, + "learning_rate": 1.9185091295215305e-05, + "loss": 0.5376, + "step": 13810 + }, + { + "epoch": 2.2545202236643402, + "grad_norm": 1.948110818862915, + "learning_rate": 1.918496581735668e-05, + "loss": 0.5838, + "step": 13811 + }, + { + "epoch": 2.2546834823068447, + "grad_norm": 1.3878388404846191, + "learning_rate": 1.91848403302488e-05, + "loss": 0.4536, + "step": 13812 + }, + { + "epoch": 2.254846740949349, + "grad_norm": 1.7024873495101929, + "learning_rate": 1.9184714833891788e-05, + "loss": 0.5511, + "step": 13813 + }, + { + "epoch": 2.2550099995918536, + "grad_norm": 1.5209946632385254, + "learning_rate": 1.918458932828577e-05, + "loss": 0.4905, + "step": 13814 + }, + { + "epoch": 2.2551732582343575, + "grad_norm": 2.1684179306030273, + "learning_rate": 1.9184463813430874e-05, + "loss": 0.6477, + "step": 13815 + }, + { + "epoch": 2.255336516876862, + "grad_norm": 1.8219887018203735, + "learning_rate": 1.9184338289327223e-05, + "loss": 0.4933, + "step": 13816 + }, + { + "epoch": 2.2554997755193664, + "grad_norm": 2.1536450386047363, + "learning_rate": 1.918421275597495e-05, + "loss": 0.6975, + "step": 13817 + }, + { + "epoch": 2.255663034161871, + "grad_norm": 1.6983513832092285, + "learning_rate": 1.9184087213374175e-05, + "loss": 0.541, + "step": 13818 + }, + { + "epoch": 2.2558262928043753, + "grad_norm": 2.055955648422241, + "learning_rate": 1.9183961661525025e-05, + "loss": 0.6257, + "step": 13819 + }, + { + "epoch": 2.2559895514468797, + "grad_norm": 1.859616994857788, + "learning_rate": 1.9183836100427627e-05, + "loss": 0.5175, + "step": 13820 + }, + { + "epoch": 2.256152810089384, + "grad_norm": 1.765570878982544, + "learning_rate": 1.918371053008211e-05, + "loss": 0.5113, + "step": 13821 + }, + { + "epoch": 2.2563160687318886, + "grad_norm": 1.5935189723968506, + "learning_rate": 1.9183584950488603e-05, + "loss": 0.468, + "step": 13822 + }, + { + "epoch": 2.256479327374393, + "grad_norm": 2.0314278602600098, + "learning_rate": 1.9183459361647223e-05, + "loss": 0.7029, + "step": 13823 + }, + { + "epoch": 2.2566425860168975, + "grad_norm": 1.4410511255264282, + "learning_rate": 1.9183333763558104e-05, + "loss": 0.4078, + "step": 13824 + }, + { + "epoch": 2.2568058446594015, + "grad_norm": 1.612371802330017, + "learning_rate": 1.918320815622137e-05, + "loss": 0.4749, + "step": 13825 + }, + { + "epoch": 2.256969103301906, + "grad_norm": 1.6539796590805054, + "learning_rate": 1.918308253963715e-05, + "loss": 0.6211, + "step": 13826 + }, + { + "epoch": 2.2571323619444104, + "grad_norm": 1.6949870586395264, + "learning_rate": 1.9182956913805566e-05, + "loss": 0.5949, + "step": 13827 + }, + { + "epoch": 2.257295620586915, + "grad_norm": 1.9051918983459473, + "learning_rate": 1.9182831278726746e-05, + "loss": 0.6761, + "step": 13828 + }, + { + "epoch": 2.2574588792294192, + "grad_norm": 1.7296584844589233, + "learning_rate": 1.918270563440082e-05, + "loss": 0.5487, + "step": 13829 + }, + { + "epoch": 2.2576221378719237, + "grad_norm": 1.65151846408844, + "learning_rate": 1.9182579980827908e-05, + "loss": 0.6251, + "step": 13830 + }, + { + "epoch": 2.257785396514428, + "grad_norm": 1.6073336601257324, + "learning_rate": 1.9182454318008144e-05, + "loss": 0.5555, + "step": 13831 + }, + { + "epoch": 2.2579486551569325, + "grad_norm": 1.6467797756195068, + "learning_rate": 1.918232864594165e-05, + "loss": 0.563, + "step": 13832 + }, + { + "epoch": 2.2581119137994365, + "grad_norm": 1.6051732301712036, + "learning_rate": 1.918220296462855e-05, + "loss": 0.4862, + "step": 13833 + }, + { + "epoch": 2.258275172441941, + "grad_norm": 1.9760619401931763, + "learning_rate": 1.918207727406898e-05, + "loss": 0.681, + "step": 13834 + }, + { + "epoch": 2.2584384310844454, + "grad_norm": 1.5145838260650635, + "learning_rate": 1.918195157426306e-05, + "loss": 0.4378, + "step": 13835 + }, + { + "epoch": 2.25860168972695, + "grad_norm": 1.745262622833252, + "learning_rate": 1.9181825865210913e-05, + "loss": 0.5508, + "step": 13836 + }, + { + "epoch": 2.2587649483694543, + "grad_norm": 1.4948962926864624, + "learning_rate": 1.918170014691267e-05, + "loss": 0.51, + "step": 13837 + }, + { + "epoch": 2.2589282070119587, + "grad_norm": 1.5989919900894165, + "learning_rate": 1.9181574419368463e-05, + "loss": 0.5212, + "step": 13838 + }, + { + "epoch": 2.259091465654463, + "grad_norm": 1.9504706859588623, + "learning_rate": 1.9181448682578408e-05, + "loss": 0.6199, + "step": 13839 + }, + { + "epoch": 2.2592547242969676, + "grad_norm": 1.9903374910354614, + "learning_rate": 1.9181322936542638e-05, + "loss": 0.6058, + "step": 13840 + }, + { + "epoch": 2.259417982939472, + "grad_norm": 1.735977053642273, + "learning_rate": 1.9181197181261277e-05, + "loss": 0.5242, + "step": 13841 + }, + { + "epoch": 2.2595812415819765, + "grad_norm": 1.8422589302062988, + "learning_rate": 1.9181071416734453e-05, + "loss": 0.5314, + "step": 13842 + }, + { + "epoch": 2.2597445002244805, + "grad_norm": 1.844992995262146, + "learning_rate": 1.9180945642962294e-05, + "loss": 0.5738, + "step": 13843 + }, + { + "epoch": 2.259907758866985, + "grad_norm": 1.797642469406128, + "learning_rate": 1.9180819859944927e-05, + "loss": 0.5951, + "step": 13844 + }, + { + "epoch": 2.2600710175094894, + "grad_norm": 1.7090747356414795, + "learning_rate": 1.9180694067682474e-05, + "loss": 0.5398, + "step": 13845 + }, + { + "epoch": 2.260234276151994, + "grad_norm": 1.628329873085022, + "learning_rate": 1.9180568266175065e-05, + "loss": 0.5362, + "step": 13846 + }, + { + "epoch": 2.2603975347944982, + "grad_norm": 1.6464911699295044, + "learning_rate": 1.9180442455422824e-05, + "loss": 0.58, + "step": 13847 + }, + { + "epoch": 2.2605607934370027, + "grad_norm": 1.969878911972046, + "learning_rate": 1.9180316635425883e-05, + "loss": 0.7173, + "step": 13848 + }, + { + "epoch": 2.260724052079507, + "grad_norm": 1.8124635219573975, + "learning_rate": 1.9180190806184366e-05, + "loss": 0.525, + "step": 13849 + }, + { + "epoch": 2.260887310722011, + "grad_norm": 2.1061315536499023, + "learning_rate": 1.91800649676984e-05, + "loss": 0.6079, + "step": 13850 + }, + { + "epoch": 2.2610505693645155, + "grad_norm": 1.582314372062683, + "learning_rate": 1.9179939119968105e-05, + "loss": 0.5472, + "step": 13851 + }, + { + "epoch": 2.26121382800702, + "grad_norm": 1.6782244443893433, + "learning_rate": 1.9179813262993618e-05, + "loss": 0.5281, + "step": 13852 + }, + { + "epoch": 2.2613770866495244, + "grad_norm": 1.934603214263916, + "learning_rate": 1.917968739677506e-05, + "loss": 0.6356, + "step": 13853 + }, + { + "epoch": 2.261540345292029, + "grad_norm": 1.730089783668518, + "learning_rate": 1.9179561521312562e-05, + "loss": 0.5705, + "step": 13854 + }, + { + "epoch": 2.2617036039345333, + "grad_norm": 1.7121391296386719, + "learning_rate": 1.9179435636606247e-05, + "loss": 0.5205, + "step": 13855 + }, + { + "epoch": 2.2618668625770377, + "grad_norm": 1.9675350189208984, + "learning_rate": 1.917930974265624e-05, + "loss": 0.5935, + "step": 13856 + }, + { + "epoch": 2.262030121219542, + "grad_norm": 1.6442006826400757, + "learning_rate": 1.9179183839462673e-05, + "loss": 0.4458, + "step": 13857 + }, + { + "epoch": 2.2621933798620466, + "grad_norm": 2.1272647380828857, + "learning_rate": 1.917905792702567e-05, + "loss": 0.7081, + "step": 13858 + }, + { + "epoch": 2.262356638504551, + "grad_norm": 1.631751537322998, + "learning_rate": 1.917893200534536e-05, + "loss": 0.4702, + "step": 13859 + }, + { + "epoch": 2.262519897147055, + "grad_norm": 1.8436928987503052, + "learning_rate": 1.9178806074421866e-05, + "loss": 0.5533, + "step": 13860 + }, + { + "epoch": 2.2626831557895595, + "grad_norm": 1.3239208459854126, + "learning_rate": 1.9178680134255314e-05, + "loss": 0.5252, + "step": 13861 + }, + { + "epoch": 2.262846414432064, + "grad_norm": 1.4567985534667969, + "learning_rate": 1.917855418484584e-05, + "loss": 0.4545, + "step": 13862 + }, + { + "epoch": 2.2630096730745684, + "grad_norm": 1.751530647277832, + "learning_rate": 1.9178428226193558e-05, + "loss": 0.6543, + "step": 13863 + }, + { + "epoch": 2.263172931717073, + "grad_norm": 1.664368748664856, + "learning_rate": 1.9178302258298606e-05, + "loss": 0.5076, + "step": 13864 + }, + { + "epoch": 2.2633361903595772, + "grad_norm": 2.0118162631988525, + "learning_rate": 1.9178176281161104e-05, + "loss": 0.5549, + "step": 13865 + }, + { + "epoch": 2.2634994490020817, + "grad_norm": 1.7365078926086426, + "learning_rate": 1.917805029478118e-05, + "loss": 0.5534, + "step": 13866 + }, + { + "epoch": 2.263662707644586, + "grad_norm": 1.823733925819397, + "learning_rate": 1.9177924299158963e-05, + "loss": 0.5633, + "step": 13867 + }, + { + "epoch": 2.26382596628709, + "grad_norm": 1.671911358833313, + "learning_rate": 1.9177798294294576e-05, + "loss": 0.5678, + "step": 13868 + }, + { + "epoch": 2.2639892249295945, + "grad_norm": 1.678178310394287, + "learning_rate": 1.917767228018815e-05, + "loss": 0.5135, + "step": 13869 + }, + { + "epoch": 2.264152483572099, + "grad_norm": 1.7610039710998535, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.5588, + "step": 13870 + }, + { + "epoch": 2.2643157422146034, + "grad_norm": 1.5994195938110352, + "learning_rate": 1.9177420224249688e-05, + "loss": 0.5346, + "step": 13871 + }, + { + "epoch": 2.264479000857108, + "grad_norm": 1.4797704219818115, + "learning_rate": 1.9177294182417904e-05, + "loss": 0.5345, + "step": 13872 + }, + { + "epoch": 2.2646422594996123, + "grad_norm": 1.9207110404968262, + "learning_rate": 1.9177168131344587e-05, + "loss": 0.6362, + "step": 13873 + }, + { + "epoch": 2.2648055181421167, + "grad_norm": 1.6864339113235474, + "learning_rate": 1.917704207102986e-05, + "loss": 0.4922, + "step": 13874 + }, + { + "epoch": 2.264968776784621, + "grad_norm": 2.0660033226013184, + "learning_rate": 1.9176916001473857e-05, + "loss": 0.753, + "step": 13875 + }, + { + "epoch": 2.2651320354271256, + "grad_norm": 2.0036473274230957, + "learning_rate": 1.9176789922676705e-05, + "loss": 0.6404, + "step": 13876 + }, + { + "epoch": 2.26529529406963, + "grad_norm": 1.9450055360794067, + "learning_rate": 1.9176663834638525e-05, + "loss": 0.6422, + "step": 13877 + }, + { + "epoch": 2.265458552712134, + "grad_norm": 1.956730604171753, + "learning_rate": 1.9176537737359446e-05, + "loss": 0.5563, + "step": 13878 + }, + { + "epoch": 2.2656218113546385, + "grad_norm": 1.7608140707015991, + "learning_rate": 1.9176411630839597e-05, + "loss": 0.6208, + "step": 13879 + }, + { + "epoch": 2.265785069997143, + "grad_norm": 1.691562533378601, + "learning_rate": 1.9176285515079102e-05, + "loss": 0.6006, + "step": 13880 + }, + { + "epoch": 2.2659483286396473, + "grad_norm": 1.8563393354415894, + "learning_rate": 1.9176159390078095e-05, + "loss": 0.5589, + "step": 13881 + }, + { + "epoch": 2.266111587282152, + "grad_norm": 1.8398960828781128, + "learning_rate": 1.9176033255836694e-05, + "loss": 0.641, + "step": 13882 + }, + { + "epoch": 2.2662748459246562, + "grad_norm": 1.6929503679275513, + "learning_rate": 1.9175907112355034e-05, + "loss": 0.4726, + "step": 13883 + }, + { + "epoch": 2.2664381045671607, + "grad_norm": 1.6380404233932495, + "learning_rate": 1.9175780959633234e-05, + "loss": 0.4983, + "step": 13884 + }, + { + "epoch": 2.266601363209665, + "grad_norm": 2.0752813816070557, + "learning_rate": 1.9175654797671422e-05, + "loss": 0.5857, + "step": 13885 + }, + { + "epoch": 2.266764621852169, + "grad_norm": 1.9312282800674438, + "learning_rate": 1.917552862646973e-05, + "loss": 0.5801, + "step": 13886 + }, + { + "epoch": 2.2669278804946735, + "grad_norm": 1.9264646768569946, + "learning_rate": 1.917540244602829e-05, + "loss": 0.5609, + "step": 13887 + }, + { + "epoch": 2.267091139137178, + "grad_norm": 1.870976448059082, + "learning_rate": 1.9175276256347217e-05, + "loss": 0.6939, + "step": 13888 + }, + { + "epoch": 2.2672543977796824, + "grad_norm": 1.9260129928588867, + "learning_rate": 1.917515005742664e-05, + "loss": 0.5571, + "step": 13889 + }, + { + "epoch": 2.267417656422187, + "grad_norm": 1.7759521007537842, + "learning_rate": 1.9175023849266697e-05, + "loss": 0.5906, + "step": 13890 + }, + { + "epoch": 2.2675809150646913, + "grad_norm": 2.107051134109497, + "learning_rate": 1.91748976318675e-05, + "loss": 0.5653, + "step": 13891 + }, + { + "epoch": 2.2677441737071957, + "grad_norm": 1.763473391532898, + "learning_rate": 1.9174771405229187e-05, + "loss": 0.555, + "step": 13892 + }, + { + "epoch": 2.2679074323497, + "grad_norm": 1.7075201272964478, + "learning_rate": 1.9174645169351882e-05, + "loss": 0.5686, + "step": 13893 + }, + { + "epoch": 2.2680706909922046, + "grad_norm": 1.8023390769958496, + "learning_rate": 1.917451892423571e-05, + "loss": 0.5845, + "step": 13894 + }, + { + "epoch": 2.268233949634709, + "grad_norm": 1.7866251468658447, + "learning_rate": 1.9174392669880803e-05, + "loss": 0.5897, + "step": 13895 + }, + { + "epoch": 2.268397208277213, + "grad_norm": 1.7991306781768799, + "learning_rate": 1.9174266406287282e-05, + "loss": 0.6519, + "step": 13896 + }, + { + "epoch": 2.2685604669197175, + "grad_norm": 1.4861586093902588, + "learning_rate": 1.9174140133455278e-05, + "loss": 0.4436, + "step": 13897 + }, + { + "epoch": 2.268723725562222, + "grad_norm": 1.8733670711517334, + "learning_rate": 1.917401385138492e-05, + "loss": 0.7434, + "step": 13898 + }, + { + "epoch": 2.2688869842047263, + "grad_norm": 1.8632951974868774, + "learning_rate": 1.917388756007633e-05, + "loss": 0.4958, + "step": 13899 + }, + { + "epoch": 2.269050242847231, + "grad_norm": 1.8713417053222656, + "learning_rate": 1.9173761259529634e-05, + "loss": 0.5695, + "step": 13900 + }, + { + "epoch": 2.269213501489735, + "grad_norm": 1.8278506994247437, + "learning_rate": 1.9173634949744967e-05, + "loss": 0.5632, + "step": 13901 + }, + { + "epoch": 2.2693767601322397, + "grad_norm": 2.0684211254119873, + "learning_rate": 1.9173508630722454e-05, + "loss": 0.6358, + "step": 13902 + }, + { + "epoch": 2.2695400187747437, + "grad_norm": 1.6291264295578003, + "learning_rate": 1.9173382302462217e-05, + "loss": 0.5608, + "step": 13903 + }, + { + "epoch": 2.269703277417248, + "grad_norm": 1.6471129655838013, + "learning_rate": 1.9173255964964384e-05, + "loss": 0.5843, + "step": 13904 + }, + { + "epoch": 2.2698665360597525, + "grad_norm": 1.525292158126831, + "learning_rate": 1.917312961822909e-05, + "loss": 0.5178, + "step": 13905 + }, + { + "epoch": 2.270029794702257, + "grad_norm": 1.6771358251571655, + "learning_rate": 1.9173003262256453e-05, + "loss": 0.5718, + "step": 13906 + }, + { + "epoch": 2.2701930533447614, + "grad_norm": 1.5734871625900269, + "learning_rate": 1.9172876897046606e-05, + "loss": 0.5007, + "step": 13907 + }, + { + "epoch": 2.270356311987266, + "grad_norm": 1.906890630722046, + "learning_rate": 1.9172750522599678e-05, + "loss": 0.6257, + "step": 13908 + }, + { + "epoch": 2.2705195706297703, + "grad_norm": 2.0448012351989746, + "learning_rate": 1.9172624138915784e-05, + "loss": 0.7494, + "step": 13909 + }, + { + "epoch": 2.2706828292722747, + "grad_norm": 1.5969666242599487, + "learning_rate": 1.9172497745995068e-05, + "loss": 0.5192, + "step": 13910 + }, + { + "epoch": 2.270846087914779, + "grad_norm": 1.8770338296890259, + "learning_rate": 1.9172371343837643e-05, + "loss": 0.702, + "step": 13911 + }, + { + "epoch": 2.2710093465572836, + "grad_norm": 1.6171174049377441, + "learning_rate": 1.9172244932443646e-05, + "loss": 0.6126, + "step": 13912 + }, + { + "epoch": 2.2711726051997876, + "grad_norm": 1.7274426221847534, + "learning_rate": 1.9172118511813202e-05, + "loss": 0.562, + "step": 13913 + }, + { + "epoch": 2.271335863842292, + "grad_norm": 1.8385990858078003, + "learning_rate": 1.9171992081946436e-05, + "loss": 0.5759, + "step": 13914 + }, + { + "epoch": 2.2714991224847965, + "grad_norm": 1.3460861444473267, + "learning_rate": 1.9171865642843474e-05, + "loss": 0.5354, + "step": 13915 + }, + { + "epoch": 2.271662381127301, + "grad_norm": 2.1049082279205322, + "learning_rate": 1.9171739194504448e-05, + "loss": 0.579, + "step": 13916 + }, + { + "epoch": 2.2718256397698053, + "grad_norm": 1.9142627716064453, + "learning_rate": 1.9171612736929483e-05, + "loss": 0.5955, + "step": 13917 + }, + { + "epoch": 2.2719888984123098, + "grad_norm": 1.6016350984573364, + "learning_rate": 1.9171486270118708e-05, + "loss": 0.4039, + "step": 13918 + }, + { + "epoch": 2.272152157054814, + "grad_norm": 1.8482050895690918, + "learning_rate": 1.9171359794072245e-05, + "loss": 0.5555, + "step": 13919 + }, + { + "epoch": 2.2723154156973187, + "grad_norm": 1.7666816711425781, + "learning_rate": 1.9171233308790225e-05, + "loss": 0.6542, + "step": 13920 + }, + { + "epoch": 2.2724786743398226, + "grad_norm": 1.8585255146026611, + "learning_rate": 1.917110681427278e-05, + "loss": 0.5775, + "step": 13921 + }, + { + "epoch": 2.272641932982327, + "grad_norm": 1.8754884004592896, + "learning_rate": 1.917098031052003e-05, + "loss": 0.7501, + "step": 13922 + }, + { + "epoch": 2.2728051916248315, + "grad_norm": 1.4371553659439087, + "learning_rate": 1.9170853797532106e-05, + "loss": 0.4502, + "step": 13923 + }, + { + "epoch": 2.272968450267336, + "grad_norm": 1.476637363433838, + "learning_rate": 1.9170727275309133e-05, + "loss": 0.5945, + "step": 13924 + }, + { + "epoch": 2.2731317089098404, + "grad_norm": 1.504349946975708, + "learning_rate": 1.917060074385124e-05, + "loss": 0.4571, + "step": 13925 + }, + { + "epoch": 2.273294967552345, + "grad_norm": 1.3802858591079712, + "learning_rate": 1.9170474203158556e-05, + "loss": 0.4233, + "step": 13926 + }, + { + "epoch": 2.2734582261948493, + "grad_norm": 1.9984899759292603, + "learning_rate": 1.9170347653231206e-05, + "loss": 0.6941, + "step": 13927 + }, + { + "epoch": 2.2736214848373537, + "grad_norm": 1.4699420928955078, + "learning_rate": 1.917022109406932e-05, + "loss": 0.523, + "step": 13928 + }, + { + "epoch": 2.273784743479858, + "grad_norm": 1.716710090637207, + "learning_rate": 1.9170094525673023e-05, + "loss": 0.5732, + "step": 13929 + }, + { + "epoch": 2.2739480021223626, + "grad_norm": 2.2451701164245605, + "learning_rate": 1.9169967948042444e-05, + "loss": 0.6466, + "step": 13930 + }, + { + "epoch": 2.2741112607648666, + "grad_norm": 1.795754075050354, + "learning_rate": 1.9169841361177708e-05, + "loss": 0.5759, + "step": 13931 + }, + { + "epoch": 2.274274519407371, + "grad_norm": 1.625292181968689, + "learning_rate": 1.9169714765078947e-05, + "loss": 0.5369, + "step": 13932 + }, + { + "epoch": 2.2744377780498755, + "grad_norm": 1.8310788869857788, + "learning_rate": 1.916958815974628e-05, + "loss": 0.6478, + "step": 13933 + }, + { + "epoch": 2.27460103669238, + "grad_norm": 1.5734957456588745, + "learning_rate": 1.9169461545179848e-05, + "loss": 0.52, + "step": 13934 + }, + { + "epoch": 2.2747642953348843, + "grad_norm": 1.4443728923797607, + "learning_rate": 1.9169334921379766e-05, + "loss": 0.4994, + "step": 13935 + }, + { + "epoch": 2.2749275539773888, + "grad_norm": 1.7485495805740356, + "learning_rate": 1.9169208288346168e-05, + "loss": 0.5886, + "step": 13936 + }, + { + "epoch": 2.275090812619893, + "grad_norm": 1.7580534219741821, + "learning_rate": 1.9169081646079175e-05, + "loss": 0.5276, + "step": 13937 + }, + { + "epoch": 2.275254071262397, + "grad_norm": 1.6626437902450562, + "learning_rate": 1.9168954994578924e-05, + "loss": 0.5694, + "step": 13938 + }, + { + "epoch": 2.2754173299049016, + "grad_norm": 1.5509529113769531, + "learning_rate": 1.9168828333845536e-05, + "loss": 0.475, + "step": 13939 + }, + { + "epoch": 2.275580588547406, + "grad_norm": 1.7139511108398438, + "learning_rate": 1.9168701663879143e-05, + "loss": 0.4913, + "step": 13940 + }, + { + "epoch": 2.2757438471899105, + "grad_norm": 1.8559212684631348, + "learning_rate": 1.9168574984679864e-05, + "loss": 0.527, + "step": 13941 + }, + { + "epoch": 2.275907105832415, + "grad_norm": 1.9713969230651855, + "learning_rate": 1.9168448296247834e-05, + "loss": 0.568, + "step": 13942 + }, + { + "epoch": 2.2760703644749194, + "grad_norm": 1.736005187034607, + "learning_rate": 1.9168321598583183e-05, + "loss": 0.6284, + "step": 13943 + }, + { + "epoch": 2.276233623117424, + "grad_norm": 1.8689674139022827, + "learning_rate": 1.9168194891686035e-05, + "loss": 0.5551, + "step": 13944 + }, + { + "epoch": 2.2763968817599283, + "grad_norm": 1.9291325807571411, + "learning_rate": 1.9168068175556512e-05, + "loss": 0.5484, + "step": 13945 + }, + { + "epoch": 2.2765601404024327, + "grad_norm": 1.8394428491592407, + "learning_rate": 1.916794145019475e-05, + "loss": 0.5823, + "step": 13946 + }, + { + "epoch": 2.276723399044937, + "grad_norm": 1.834821105003357, + "learning_rate": 1.9167814715600872e-05, + "loss": 0.6726, + "step": 13947 + }, + { + "epoch": 2.276886657687441, + "grad_norm": 1.7654497623443604, + "learning_rate": 1.916768797177501e-05, + "loss": 0.5592, + "step": 13948 + }, + { + "epoch": 2.2770499163299456, + "grad_norm": 1.7998294830322266, + "learning_rate": 1.9167561218717283e-05, + "loss": 0.5517, + "step": 13949 + }, + { + "epoch": 2.27721317497245, + "grad_norm": 1.5590392351150513, + "learning_rate": 1.916743445642783e-05, + "loss": 0.4697, + "step": 13950 + }, + { + "epoch": 2.2773764336149545, + "grad_norm": 1.7349083423614502, + "learning_rate": 1.916730768490677e-05, + "loss": 0.5409, + "step": 13951 + }, + { + "epoch": 2.277539692257459, + "grad_norm": 2.072094202041626, + "learning_rate": 1.9167180904154234e-05, + "loss": 0.5964, + "step": 13952 + }, + { + "epoch": 2.2777029508999633, + "grad_norm": 1.7526817321777344, + "learning_rate": 1.916705411417035e-05, + "loss": 0.5473, + "step": 13953 + }, + { + "epoch": 2.2778662095424678, + "grad_norm": 1.4860246181488037, + "learning_rate": 1.9166927314955244e-05, + "loss": 0.495, + "step": 13954 + }, + { + "epoch": 2.278029468184972, + "grad_norm": 1.520379662513733, + "learning_rate": 1.9166800506509044e-05, + "loss": 0.4581, + "step": 13955 + }, + { + "epoch": 2.278192726827476, + "grad_norm": 2.184462785720825, + "learning_rate": 1.9166673688831882e-05, + "loss": 0.7387, + "step": 13956 + }, + { + "epoch": 2.2783559854699806, + "grad_norm": 2.0433216094970703, + "learning_rate": 1.9166546861923877e-05, + "loss": 0.6148, + "step": 13957 + }, + { + "epoch": 2.278519244112485, + "grad_norm": 1.6449540853500366, + "learning_rate": 1.9166420025785165e-05, + "loss": 0.5342, + "step": 13958 + }, + { + "epoch": 2.2786825027549895, + "grad_norm": 2.091167449951172, + "learning_rate": 1.916629318041587e-05, + "loss": 0.686, + "step": 13959 + }, + { + "epoch": 2.278845761397494, + "grad_norm": 2.1705434322357178, + "learning_rate": 1.916616632581612e-05, + "loss": 0.5623, + "step": 13960 + }, + { + "epoch": 2.2790090200399984, + "grad_norm": 1.7093569040298462, + "learning_rate": 1.9166039461986043e-05, + "loss": 0.5136, + "step": 13961 + }, + { + "epoch": 2.279172278682503, + "grad_norm": 1.945244312286377, + "learning_rate": 1.9165912588925764e-05, + "loss": 0.6018, + "step": 13962 + }, + { + "epoch": 2.2793355373250073, + "grad_norm": 1.7293232679367065, + "learning_rate": 1.9165785706635418e-05, + "loss": 0.6074, + "step": 13963 + }, + { + "epoch": 2.2794987959675117, + "grad_norm": 2.224622964859009, + "learning_rate": 1.9165658815115123e-05, + "loss": 0.54, + "step": 13964 + }, + { + "epoch": 2.279662054610016, + "grad_norm": 1.6428642272949219, + "learning_rate": 1.9165531914365015e-05, + "loss": 0.5152, + "step": 13965 + }, + { + "epoch": 2.27982531325252, + "grad_norm": 1.4884840250015259, + "learning_rate": 1.9165405004385223e-05, + "loss": 0.4862, + "step": 13966 + }, + { + "epoch": 2.2799885718950246, + "grad_norm": 1.6254823207855225, + "learning_rate": 1.9165278085175865e-05, + "loss": 0.5543, + "step": 13967 + }, + { + "epoch": 2.280151830537529, + "grad_norm": 1.8934619426727295, + "learning_rate": 1.9165151156737077e-05, + "loss": 0.6022, + "step": 13968 + }, + { + "epoch": 2.2803150891800334, + "grad_norm": 2.0586204528808594, + "learning_rate": 1.916502421906898e-05, + "loss": 0.6284, + "step": 13969 + }, + { + "epoch": 2.280478347822538, + "grad_norm": 1.8739511966705322, + "learning_rate": 1.916489727217171e-05, + "loss": 0.6291, + "step": 13970 + }, + { + "epoch": 2.2806416064650423, + "grad_norm": 1.8632862567901611, + "learning_rate": 1.9164770316045392e-05, + "loss": 0.5492, + "step": 13971 + }, + { + "epoch": 2.2808048651075468, + "grad_norm": 1.848002314567566, + "learning_rate": 1.916464335069015e-05, + "loss": 0.6224, + "step": 13972 + }, + { + "epoch": 2.280968123750051, + "grad_norm": 2.01834774017334, + "learning_rate": 1.9164516376106115e-05, + "loss": 0.6815, + "step": 13973 + }, + { + "epoch": 2.281131382392555, + "grad_norm": 2.079613447189331, + "learning_rate": 1.9164389392293415e-05, + "loss": 0.5925, + "step": 13974 + }, + { + "epoch": 2.2812946410350596, + "grad_norm": 1.7629497051239014, + "learning_rate": 1.9164262399252176e-05, + "loss": 0.5652, + "step": 13975 + }, + { + "epoch": 2.281457899677564, + "grad_norm": 1.8439525365829468, + "learning_rate": 1.9164135396982527e-05, + "loss": 0.5682, + "step": 13976 + }, + { + "epoch": 2.2816211583200685, + "grad_norm": 1.7546031475067139, + "learning_rate": 1.91640083854846e-05, + "loss": 0.5225, + "step": 13977 + }, + { + "epoch": 2.281784416962573, + "grad_norm": 1.9836018085479736, + "learning_rate": 1.9163881364758516e-05, + "loss": 0.5627, + "step": 13978 + }, + { + "epoch": 2.2819476756050774, + "grad_norm": 1.817509412765503, + "learning_rate": 1.9163754334804404e-05, + "loss": 0.5638, + "step": 13979 + }, + { + "epoch": 2.282110934247582, + "grad_norm": 1.97105073928833, + "learning_rate": 1.9163627295622397e-05, + "loss": 0.5787, + "step": 13980 + }, + { + "epoch": 2.2822741928900863, + "grad_norm": 1.803890585899353, + "learning_rate": 1.916350024721262e-05, + "loss": 0.6098, + "step": 13981 + }, + { + "epoch": 2.2824374515325907, + "grad_norm": 1.6055262088775635, + "learning_rate": 1.91633731895752e-05, + "loss": 0.5156, + "step": 13982 + }, + { + "epoch": 2.282600710175095, + "grad_norm": 1.7024269104003906, + "learning_rate": 1.9163246122710265e-05, + "loss": 0.5403, + "step": 13983 + }, + { + "epoch": 2.282763968817599, + "grad_norm": 1.7593562602996826, + "learning_rate": 1.9163119046617944e-05, + "loss": 0.6545, + "step": 13984 + }, + { + "epoch": 2.2829272274601036, + "grad_norm": 1.645037055015564, + "learning_rate": 1.9162991961298364e-05, + "loss": 0.5592, + "step": 13985 + }, + { + "epoch": 2.283090486102608, + "grad_norm": 2.214200019836426, + "learning_rate": 1.916286486675165e-05, + "loss": 0.7321, + "step": 13986 + }, + { + "epoch": 2.2832537447451124, + "grad_norm": 1.9174823760986328, + "learning_rate": 1.916273776297794e-05, + "loss": 0.6998, + "step": 13987 + }, + { + "epoch": 2.283417003387617, + "grad_norm": 1.563968539237976, + "learning_rate": 1.9162610649977355e-05, + "loss": 0.5357, + "step": 13988 + }, + { + "epoch": 2.2835802620301213, + "grad_norm": 1.6708457469940186, + "learning_rate": 1.916248352775002e-05, + "loss": 0.5448, + "step": 13989 + }, + { + "epoch": 2.2837435206726258, + "grad_norm": 1.9790898561477661, + "learning_rate": 1.9162356396296068e-05, + "loss": 0.635, + "step": 13990 + }, + { + "epoch": 2.2839067793151298, + "grad_norm": 1.683519721031189, + "learning_rate": 1.9162229255615624e-05, + "loss": 0.5981, + "step": 13991 + }, + { + "epoch": 2.284070037957634, + "grad_norm": 1.5720479488372803, + "learning_rate": 1.916210210570882e-05, + "loss": 0.5401, + "step": 13992 + }, + { + "epoch": 2.2842332966001386, + "grad_norm": 1.8959178924560547, + "learning_rate": 1.916197494657578e-05, + "loss": 0.56, + "step": 13993 + }, + { + "epoch": 2.284396555242643, + "grad_norm": 1.7567049264907837, + "learning_rate": 1.9161847778216635e-05, + "loss": 0.5012, + "step": 13994 + }, + { + "epoch": 2.2845598138851475, + "grad_norm": 1.9192942380905151, + "learning_rate": 1.916172060063151e-05, + "loss": 0.6129, + "step": 13995 + }, + { + "epoch": 2.284723072527652, + "grad_norm": 1.5325020551681519, + "learning_rate": 1.9161593413820535e-05, + "loss": 0.4445, + "step": 13996 + }, + { + "epoch": 2.2848863311701564, + "grad_norm": 1.5968296527862549, + "learning_rate": 1.916146621778384e-05, + "loss": 0.4667, + "step": 13997 + }, + { + "epoch": 2.285049589812661, + "grad_norm": 1.6320220232009888, + "learning_rate": 1.9161339012521548e-05, + "loss": 0.5763, + "step": 13998 + }, + { + "epoch": 2.2852128484551653, + "grad_norm": 2.0445926189422607, + "learning_rate": 1.916121179803379e-05, + "loss": 0.6495, + "step": 13999 + }, + { + "epoch": 2.2853761070976697, + "grad_norm": 1.9737379550933838, + "learning_rate": 1.9161084574320696e-05, + "loss": 0.6285, + "step": 14000 + }, + { + "epoch": 2.2855393657401737, + "grad_norm": 2.0363075733184814, + "learning_rate": 1.9160957341382393e-05, + "loss": 0.7316, + "step": 14001 + }, + { + "epoch": 2.285702624382678, + "grad_norm": 1.5856351852416992, + "learning_rate": 1.9160830099219007e-05, + "loss": 0.5161, + "step": 14002 + }, + { + "epoch": 2.2858658830251826, + "grad_norm": 1.8207496404647827, + "learning_rate": 1.916070284783067e-05, + "loss": 0.5728, + "step": 14003 + }, + { + "epoch": 2.286029141667687, + "grad_norm": 1.6594120264053345, + "learning_rate": 1.9160575587217506e-05, + "loss": 0.6058, + "step": 14004 + }, + { + "epoch": 2.2861924003101914, + "grad_norm": 1.884810447692871, + "learning_rate": 1.9160448317379642e-05, + "loss": 0.6148, + "step": 14005 + }, + { + "epoch": 2.286355658952696, + "grad_norm": 1.9922125339508057, + "learning_rate": 1.916032103831721e-05, + "loss": 0.656, + "step": 14006 + }, + { + "epoch": 2.2865189175952003, + "grad_norm": 1.8830081224441528, + "learning_rate": 1.9160193750030342e-05, + "loss": 0.5808, + "step": 14007 + }, + { + "epoch": 2.2866821762377048, + "grad_norm": 1.6651763916015625, + "learning_rate": 1.9160066452519156e-05, + "loss": 0.4672, + "step": 14008 + }, + { + "epoch": 2.2868454348802087, + "grad_norm": 2.015568494796753, + "learning_rate": 1.915993914578379e-05, + "loss": 0.6941, + "step": 14009 + }, + { + "epoch": 2.287008693522713, + "grad_norm": 1.7918131351470947, + "learning_rate": 1.9159811829824364e-05, + "loss": 0.6605, + "step": 14010 + }, + { + "epoch": 2.2871719521652176, + "grad_norm": 1.711225986480713, + "learning_rate": 1.915968450464101e-05, + "loss": 0.6412, + "step": 14011 + }, + { + "epoch": 2.287335210807722, + "grad_norm": 1.6190986633300781, + "learning_rate": 1.9159557170233857e-05, + "loss": 0.5695, + "step": 14012 + }, + { + "epoch": 2.2874984694502265, + "grad_norm": 1.6358613967895508, + "learning_rate": 1.9159429826603032e-05, + "loss": 0.6199, + "step": 14013 + }, + { + "epoch": 2.287661728092731, + "grad_norm": 1.7216846942901611, + "learning_rate": 1.9159302473748665e-05, + "loss": 0.5687, + "step": 14014 + }, + { + "epoch": 2.2878249867352354, + "grad_norm": 1.7303502559661865, + "learning_rate": 1.9159175111670882e-05, + "loss": 0.514, + "step": 14015 + }, + { + "epoch": 2.28798824537774, + "grad_norm": 1.743077039718628, + "learning_rate": 1.915904774036981e-05, + "loss": 0.4675, + "step": 14016 + }, + { + "epoch": 2.2881515040202443, + "grad_norm": 1.7576959133148193, + "learning_rate": 1.915892035984558e-05, + "loss": 0.5728, + "step": 14017 + }, + { + "epoch": 2.2883147626627487, + "grad_norm": 1.7040982246398926, + "learning_rate": 1.9158792970098322e-05, + "loss": 0.5756, + "step": 14018 + }, + { + "epoch": 2.2884780213052527, + "grad_norm": 1.888571858406067, + "learning_rate": 1.9158665571128164e-05, + "loss": 0.6309, + "step": 14019 + }, + { + "epoch": 2.288641279947757, + "grad_norm": 1.9732773303985596, + "learning_rate": 1.9158538162935227e-05, + "loss": 0.583, + "step": 14020 + }, + { + "epoch": 2.2888045385902616, + "grad_norm": 1.9240121841430664, + "learning_rate": 1.9158410745519647e-05, + "loss": 0.6326, + "step": 14021 + }, + { + "epoch": 2.288967797232766, + "grad_norm": 2.009688138961792, + "learning_rate": 1.9158283318881548e-05, + "loss": 0.6401, + "step": 14022 + }, + { + "epoch": 2.2891310558752704, + "grad_norm": 1.5068720579147339, + "learning_rate": 1.9158155883021062e-05, + "loss": 0.4659, + "step": 14023 + }, + { + "epoch": 2.289294314517775, + "grad_norm": 1.6102581024169922, + "learning_rate": 1.9158028437938316e-05, + "loss": 0.5174, + "step": 14024 + }, + { + "epoch": 2.2894575731602793, + "grad_norm": 1.8191735744476318, + "learning_rate": 1.9157900983633437e-05, + "loss": 0.5618, + "step": 14025 + }, + { + "epoch": 2.2896208318027838, + "grad_norm": 1.8497214317321777, + "learning_rate": 1.9157773520106552e-05, + "loss": 0.5995, + "step": 14026 + }, + { + "epoch": 2.2897840904452877, + "grad_norm": 1.8703248500823975, + "learning_rate": 1.9157646047357795e-05, + "loss": 0.5425, + "step": 14027 + }, + { + "epoch": 2.289947349087792, + "grad_norm": 1.9407916069030762, + "learning_rate": 1.915751856538729e-05, + "loss": 0.669, + "step": 14028 + }, + { + "epoch": 2.2901106077302966, + "grad_norm": 1.5875822305679321, + "learning_rate": 1.9157391074195163e-05, + "loss": 0.4915, + "step": 14029 + }, + { + "epoch": 2.290273866372801, + "grad_norm": 2.180553674697876, + "learning_rate": 1.915726357378155e-05, + "loss": 0.7204, + "step": 14030 + }, + { + "epoch": 2.2904371250153055, + "grad_norm": 1.9645739793777466, + "learning_rate": 1.9157136064146573e-05, + "loss": 0.4869, + "step": 14031 + }, + { + "epoch": 2.29060038365781, + "grad_norm": 1.8563637733459473, + "learning_rate": 1.915700854529036e-05, + "loss": 0.551, + "step": 14032 + }, + { + "epoch": 2.2907636423003144, + "grad_norm": 1.72524094581604, + "learning_rate": 1.9156881017213045e-05, + "loss": 0.542, + "step": 14033 + }, + { + "epoch": 2.290926900942819, + "grad_norm": 1.5695867538452148, + "learning_rate": 1.915675347991475e-05, + "loss": 0.5227, + "step": 14034 + }, + { + "epoch": 2.2910901595853232, + "grad_norm": 1.8060929775238037, + "learning_rate": 1.9156625933395614e-05, + "loss": 0.6802, + "step": 14035 + }, + { + "epoch": 2.2912534182278272, + "grad_norm": 1.7625160217285156, + "learning_rate": 1.915649837765575e-05, + "loss": 0.6382, + "step": 14036 + }, + { + "epoch": 2.2914166768703317, + "grad_norm": 1.6379420757293701, + "learning_rate": 1.91563708126953e-05, + "loss": 0.5415, + "step": 14037 + }, + { + "epoch": 2.291579935512836, + "grad_norm": 2.006383180618286, + "learning_rate": 1.9156243238514384e-05, + "loss": 0.6792, + "step": 14038 + }, + { + "epoch": 2.2917431941553406, + "grad_norm": 1.5719441175460815, + "learning_rate": 1.9156115655113136e-05, + "loss": 0.5896, + "step": 14039 + }, + { + "epoch": 2.291906452797845, + "grad_norm": 1.9235527515411377, + "learning_rate": 1.915598806249168e-05, + "loss": 0.6069, + "step": 14040 + }, + { + "epoch": 2.2920697114403494, + "grad_norm": 1.7831084728240967, + "learning_rate": 1.9155860460650145e-05, + "loss": 0.5955, + "step": 14041 + }, + { + "epoch": 2.292232970082854, + "grad_norm": 1.7408894300460815, + "learning_rate": 1.9155732849588664e-05, + "loss": 0.514, + "step": 14042 + }, + { + "epoch": 2.2923962287253583, + "grad_norm": 2.004639148712158, + "learning_rate": 1.915560522930736e-05, + "loss": 0.758, + "step": 14043 + }, + { + "epoch": 2.2925594873678623, + "grad_norm": 1.3885266780853271, + "learning_rate": 1.9155477599806365e-05, + "loss": 0.3964, + "step": 14044 + }, + { + "epoch": 2.2927227460103667, + "grad_norm": 1.8664318323135376, + "learning_rate": 1.915534996108581e-05, + "loss": 0.6531, + "step": 14045 + }, + { + "epoch": 2.292886004652871, + "grad_norm": 1.8279571533203125, + "learning_rate": 1.9155222313145817e-05, + "loss": 0.5377, + "step": 14046 + }, + { + "epoch": 2.2930492632953756, + "grad_norm": 1.7790271043777466, + "learning_rate": 1.9155094655986515e-05, + "loss": 0.5529, + "step": 14047 + }, + { + "epoch": 2.29321252193788, + "grad_norm": 1.754335641860962, + "learning_rate": 1.9154966989608036e-05, + "loss": 0.6506, + "step": 14048 + }, + { + "epoch": 2.2933757805803845, + "grad_norm": 1.8760497570037842, + "learning_rate": 1.915483931401051e-05, + "loss": 0.7208, + "step": 14049 + }, + { + "epoch": 2.293539039222889, + "grad_norm": 1.6028623580932617, + "learning_rate": 1.9154711629194062e-05, + "loss": 0.5117, + "step": 14050 + }, + { + "epoch": 2.2937022978653934, + "grad_norm": 1.6003228425979614, + "learning_rate": 1.9154583935158822e-05, + "loss": 0.5233, + "step": 14051 + }, + { + "epoch": 2.293865556507898, + "grad_norm": 1.939603328704834, + "learning_rate": 1.915445623190492e-05, + "loss": 0.6351, + "step": 14052 + }, + { + "epoch": 2.2940288151504022, + "grad_norm": 1.49372136592865, + "learning_rate": 1.9154328519432483e-05, + "loss": 0.5051, + "step": 14053 + }, + { + "epoch": 2.2941920737929062, + "grad_norm": 1.8941140174865723, + "learning_rate": 1.9154200797741637e-05, + "loss": 0.6131, + "step": 14054 + }, + { + "epoch": 2.2943553324354107, + "grad_norm": 1.7595065832138062, + "learning_rate": 1.9154073066832512e-05, + "loss": 0.5542, + "step": 14055 + }, + { + "epoch": 2.294518591077915, + "grad_norm": 1.6753801107406616, + "learning_rate": 1.9153945326705242e-05, + "loss": 0.4709, + "step": 14056 + }, + { + "epoch": 2.2946818497204196, + "grad_norm": 1.8019970655441284, + "learning_rate": 1.915381757735995e-05, + "loss": 0.5084, + "step": 14057 + }, + { + "epoch": 2.294845108362924, + "grad_norm": 1.8557394742965698, + "learning_rate": 1.9153689818796764e-05, + "loss": 0.578, + "step": 14058 + }, + { + "epoch": 2.2950083670054284, + "grad_norm": 1.7790182828903198, + "learning_rate": 1.9153562051015817e-05, + "loss": 0.5592, + "step": 14059 + }, + { + "epoch": 2.295171625647933, + "grad_norm": 1.8044377565383911, + "learning_rate": 1.9153434274017234e-05, + "loss": 0.5492, + "step": 14060 + }, + { + "epoch": 2.2953348842904373, + "grad_norm": 1.6618716716766357, + "learning_rate": 1.9153306487801146e-05, + "loss": 0.5, + "step": 14061 + }, + { + "epoch": 2.2954981429329413, + "grad_norm": 1.8789710998535156, + "learning_rate": 1.9153178692367684e-05, + "loss": 0.6739, + "step": 14062 + }, + { + "epoch": 2.2956614015754457, + "grad_norm": 1.8143547773361206, + "learning_rate": 1.9153050887716968e-05, + "loss": 0.491, + "step": 14063 + }, + { + "epoch": 2.29582466021795, + "grad_norm": 2.187600612640381, + "learning_rate": 1.9152923073849134e-05, + "loss": 0.6486, + "step": 14064 + }, + { + "epoch": 2.2959879188604546, + "grad_norm": 1.7732521295547485, + "learning_rate": 1.9152795250764308e-05, + "loss": 0.5036, + "step": 14065 + }, + { + "epoch": 2.296151177502959, + "grad_norm": 1.6295660734176636, + "learning_rate": 1.915266741846262e-05, + "loss": 0.5818, + "step": 14066 + }, + { + "epoch": 2.2963144361454635, + "grad_norm": 1.7453280687332153, + "learning_rate": 1.9152539576944203e-05, + "loss": 0.5373, + "step": 14067 + }, + { + "epoch": 2.296477694787968, + "grad_norm": 2.1421189308166504, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.6456, + "step": 14068 + }, + { + "epoch": 2.2966409534304724, + "grad_norm": 1.9632409811019897, + "learning_rate": 1.9152283866257674e-05, + "loss": 0.5898, + "step": 14069 + }, + { + "epoch": 2.296804212072977, + "grad_norm": 1.945724606513977, + "learning_rate": 1.9152155997089824e-05, + "loss": 0.6625, + "step": 14070 + }, + { + "epoch": 2.2969674707154812, + "grad_norm": 2.2041547298431396, + "learning_rate": 1.9152028118705757e-05, + "loss": 0.6348, + "step": 14071 + }, + { + "epoch": 2.2971307293579852, + "grad_norm": 1.9027799367904663, + "learning_rate": 1.9151900231105603e-05, + "loss": 0.6368, + "step": 14072 + }, + { + "epoch": 2.2972939880004897, + "grad_norm": 1.7713886499404907, + "learning_rate": 1.9151772334289483e-05, + "loss": 0.5514, + "step": 14073 + }, + { + "epoch": 2.297457246642994, + "grad_norm": 1.8441851139068604, + "learning_rate": 1.915164442825753e-05, + "loss": 0.5982, + "step": 14074 + }, + { + "epoch": 2.2976205052854985, + "grad_norm": 1.478020191192627, + "learning_rate": 1.9151516513009877e-05, + "loss": 0.4825, + "step": 14075 + }, + { + "epoch": 2.297783763928003, + "grad_norm": 1.828572154045105, + "learning_rate": 1.9151388588546645e-05, + "loss": 0.6346, + "step": 14076 + }, + { + "epoch": 2.2979470225705074, + "grad_norm": 2.098597288131714, + "learning_rate": 1.9151260654867973e-05, + "loss": 0.6596, + "step": 14077 + }, + { + "epoch": 2.298110281213012, + "grad_norm": 1.7956889867782593, + "learning_rate": 1.915113271197398e-05, + "loss": 0.586, + "step": 14078 + }, + { + "epoch": 2.298273539855516, + "grad_norm": 1.848307490348816, + "learning_rate": 1.91510047598648e-05, + "loss": 0.6178, + "step": 14079 + }, + { + "epoch": 2.2984367984980203, + "grad_norm": 1.7283990383148193, + "learning_rate": 1.915087679854056e-05, + "loss": 0.5527, + "step": 14080 + }, + { + "epoch": 2.2986000571405247, + "grad_norm": 1.5191999673843384, + "learning_rate": 1.915074882800139e-05, + "loss": 0.5149, + "step": 14081 + }, + { + "epoch": 2.298763315783029, + "grad_norm": 1.6771363019943237, + "learning_rate": 1.915062084824742e-05, + "loss": 0.5611, + "step": 14082 + }, + { + "epoch": 2.2989265744255336, + "grad_norm": 1.9180035591125488, + "learning_rate": 1.9150492859278774e-05, + "loss": 0.5653, + "step": 14083 + }, + { + "epoch": 2.299089833068038, + "grad_norm": 1.9641889333724976, + "learning_rate": 1.9150364861095582e-05, + "loss": 0.6864, + "step": 14084 + }, + { + "epoch": 2.2992530917105425, + "grad_norm": 1.5821303129196167, + "learning_rate": 1.9150236853697982e-05, + "loss": 0.5575, + "step": 14085 + }, + { + "epoch": 2.299416350353047, + "grad_norm": 1.8300634622573853, + "learning_rate": 1.9150108837086094e-05, + "loss": 0.5832, + "step": 14086 + }, + { + "epoch": 2.2995796089955514, + "grad_norm": 1.8990364074707031, + "learning_rate": 1.9149980811260045e-05, + "loss": 0.5506, + "step": 14087 + }, + { + "epoch": 2.299742867638056, + "grad_norm": 1.5598406791687012, + "learning_rate": 1.914985277621997e-05, + "loss": 0.5713, + "step": 14088 + }, + { + "epoch": 2.29990612628056, + "grad_norm": 2.1838178634643555, + "learning_rate": 1.9149724731965994e-05, + "loss": 0.5592, + "step": 14089 + }, + { + "epoch": 2.3000693849230642, + "grad_norm": 1.5228079557418823, + "learning_rate": 1.914959667849825e-05, + "loss": 0.4385, + "step": 14090 + }, + { + "epoch": 2.3002326435655687, + "grad_norm": 1.4911350011825562, + "learning_rate": 1.9149468615816862e-05, + "loss": 0.5544, + "step": 14091 + }, + { + "epoch": 2.300395902208073, + "grad_norm": 2.0439836978912354, + "learning_rate": 1.9149340543921962e-05, + "loss": 0.687, + "step": 14092 + }, + { + "epoch": 2.3005591608505775, + "grad_norm": 1.4747322797775269, + "learning_rate": 1.9149212462813682e-05, + "loss": 0.5479, + "step": 14093 + }, + { + "epoch": 2.300722419493082, + "grad_norm": 1.6484969854354858, + "learning_rate": 1.9149084372492146e-05, + "loss": 0.4587, + "step": 14094 + }, + { + "epoch": 2.3008856781355864, + "grad_norm": 1.674045443534851, + "learning_rate": 1.9148956272957484e-05, + "loss": 0.6373, + "step": 14095 + }, + { + "epoch": 2.301048936778091, + "grad_norm": 1.902550220489502, + "learning_rate": 1.9148828164209824e-05, + "loss": 0.6373, + "step": 14096 + }, + { + "epoch": 2.301212195420595, + "grad_norm": 2.0768749713897705, + "learning_rate": 1.91487000462493e-05, + "loss": 0.6566, + "step": 14097 + }, + { + "epoch": 2.3013754540630993, + "grad_norm": 1.6310875415802002, + "learning_rate": 1.9148571919076032e-05, + "loss": 0.5571, + "step": 14098 + }, + { + "epoch": 2.3015387127056037, + "grad_norm": 1.6493828296661377, + "learning_rate": 1.914844378269016e-05, + "loss": 0.4933, + "step": 14099 + }, + { + "epoch": 2.301701971348108, + "grad_norm": 1.9574527740478516, + "learning_rate": 1.9148315637091805e-05, + "loss": 0.6504, + "step": 14100 + }, + { + "epoch": 2.3018652299906126, + "grad_norm": 1.8924041986465454, + "learning_rate": 1.9148187482281097e-05, + "loss": 0.5752, + "step": 14101 + }, + { + "epoch": 2.302028488633117, + "grad_norm": 1.6946848630905151, + "learning_rate": 1.914805931825817e-05, + "loss": 0.5395, + "step": 14102 + }, + { + "epoch": 2.3021917472756215, + "grad_norm": 1.5627840757369995, + "learning_rate": 1.914793114502315e-05, + "loss": 0.4926, + "step": 14103 + }, + { + "epoch": 2.302355005918126, + "grad_norm": 1.950657844543457, + "learning_rate": 1.914780296257616e-05, + "loss": 0.5717, + "step": 14104 + }, + { + "epoch": 2.3025182645606304, + "grad_norm": 2.0718977451324463, + "learning_rate": 1.9147674770917342e-05, + "loss": 0.6216, + "step": 14105 + }, + { + "epoch": 2.302681523203135, + "grad_norm": 1.5791618824005127, + "learning_rate": 1.9147546570046812e-05, + "loss": 0.4456, + "step": 14106 + }, + { + "epoch": 2.302844781845639, + "grad_norm": 1.7787023782730103, + "learning_rate": 1.9147418359964708e-05, + "loss": 0.5692, + "step": 14107 + }, + { + "epoch": 2.3030080404881432, + "grad_norm": 1.9981205463409424, + "learning_rate": 1.9147290140671154e-05, + "loss": 0.5548, + "step": 14108 + }, + { + "epoch": 2.3031712991306477, + "grad_norm": 1.9247280359268188, + "learning_rate": 1.9147161912166286e-05, + "loss": 0.655, + "step": 14109 + }, + { + "epoch": 2.303334557773152, + "grad_norm": 1.9564175605773926, + "learning_rate": 1.9147033674450223e-05, + "loss": 0.5562, + "step": 14110 + }, + { + "epoch": 2.3034978164156565, + "grad_norm": 2.096853494644165, + "learning_rate": 1.9146905427523104e-05, + "loss": 0.687, + "step": 14111 + }, + { + "epoch": 2.303661075058161, + "grad_norm": 2.0932681560516357, + "learning_rate": 1.914677717138505e-05, + "loss": 0.6163, + "step": 14112 + }, + { + "epoch": 2.3038243337006654, + "grad_norm": 2.260066270828247, + "learning_rate": 1.9146648906036197e-05, + "loss": 0.638, + "step": 14113 + }, + { + "epoch": 2.30398759234317, + "grad_norm": 1.5339609384536743, + "learning_rate": 1.914652063147667e-05, + "loss": 0.4889, + "step": 14114 + }, + { + "epoch": 2.304150850985674, + "grad_norm": 1.5624340772628784, + "learning_rate": 1.9146392347706596e-05, + "loss": 0.516, + "step": 14115 + }, + { + "epoch": 2.3043141096281783, + "grad_norm": 2.204514503479004, + "learning_rate": 1.9146264054726112e-05, + "loss": 0.6039, + "step": 14116 + }, + { + "epoch": 2.3044773682706827, + "grad_norm": 1.8411991596221924, + "learning_rate": 1.914613575253534e-05, + "loss": 0.4855, + "step": 14117 + }, + { + "epoch": 2.304640626913187, + "grad_norm": 1.8212238550186157, + "learning_rate": 1.9146007441134412e-05, + "loss": 0.5599, + "step": 14118 + }, + { + "epoch": 2.3048038855556916, + "grad_norm": 1.7575026750564575, + "learning_rate": 1.914587912052346e-05, + "loss": 0.6144, + "step": 14119 + }, + { + "epoch": 2.304967144198196, + "grad_norm": 1.8553175926208496, + "learning_rate": 1.9145750790702606e-05, + "loss": 0.6296, + "step": 14120 + }, + { + "epoch": 2.3051304028407005, + "grad_norm": 2.024660587310791, + "learning_rate": 1.9145622451671983e-05, + "loss": 0.6517, + "step": 14121 + }, + { + "epoch": 2.305293661483205, + "grad_norm": 2.256958484649658, + "learning_rate": 1.914549410343172e-05, + "loss": 0.6391, + "step": 14122 + }, + { + "epoch": 2.3054569201257094, + "grad_norm": 1.6476469039916992, + "learning_rate": 1.914536574598195e-05, + "loss": 0.5352, + "step": 14123 + }, + { + "epoch": 2.305620178768214, + "grad_norm": 1.9945690631866455, + "learning_rate": 1.91452373793228e-05, + "loss": 0.638, + "step": 14124 + }, + { + "epoch": 2.305783437410718, + "grad_norm": 1.8745495080947876, + "learning_rate": 1.9145109003454396e-05, + "loss": 0.5366, + "step": 14125 + }, + { + "epoch": 2.3059466960532222, + "grad_norm": 2.0358731746673584, + "learning_rate": 1.9144980618376873e-05, + "loss": 0.583, + "step": 14126 + }, + { + "epoch": 2.3061099546957267, + "grad_norm": 1.4960285425186157, + "learning_rate": 1.914485222409035e-05, + "loss": 0.4974, + "step": 14127 + }, + { + "epoch": 2.306273213338231, + "grad_norm": 1.9147897958755493, + "learning_rate": 1.9144723820594968e-05, + "loss": 0.5022, + "step": 14128 + }, + { + "epoch": 2.3064364719807355, + "grad_norm": 1.941955327987671, + "learning_rate": 1.9144595407890853e-05, + "loss": 0.7099, + "step": 14129 + }, + { + "epoch": 2.30659973062324, + "grad_norm": 1.6317088603973389, + "learning_rate": 1.914446698597813e-05, + "loss": 0.523, + "step": 14130 + }, + { + "epoch": 2.3067629892657444, + "grad_norm": 1.5471748113632202, + "learning_rate": 1.914433855485693e-05, + "loss": 0.492, + "step": 14131 + }, + { + "epoch": 2.3069262479082484, + "grad_norm": 1.6434290409088135, + "learning_rate": 1.914421011452739e-05, + "loss": 0.5822, + "step": 14132 + }, + { + "epoch": 2.307089506550753, + "grad_norm": 1.5649094581604004, + "learning_rate": 1.9144081664989626e-05, + "loss": 0.5127, + "step": 14133 + }, + { + "epoch": 2.3072527651932573, + "grad_norm": 1.8652970790863037, + "learning_rate": 1.9143953206243778e-05, + "loss": 0.5234, + "step": 14134 + }, + { + "epoch": 2.3074160238357617, + "grad_norm": 1.956431269645691, + "learning_rate": 1.914382473828997e-05, + "loss": 0.5477, + "step": 14135 + }, + { + "epoch": 2.307579282478266, + "grad_norm": 2.069138765335083, + "learning_rate": 1.914369626112833e-05, + "loss": 0.604, + "step": 14136 + }, + { + "epoch": 2.3077425411207706, + "grad_norm": 1.891721248626709, + "learning_rate": 1.9143567774758994e-05, + "loss": 0.6642, + "step": 14137 + }, + { + "epoch": 2.307905799763275, + "grad_norm": 1.613600254058838, + "learning_rate": 1.9143439279182087e-05, + "loss": 0.5552, + "step": 14138 + }, + { + "epoch": 2.3080690584057795, + "grad_norm": 1.5516656637191772, + "learning_rate": 1.914331077439774e-05, + "loss": 0.5555, + "step": 14139 + }, + { + "epoch": 2.308232317048284, + "grad_norm": 1.6903424263000488, + "learning_rate": 1.914318226040608e-05, + "loss": 0.4903, + "step": 14140 + }, + { + "epoch": 2.3083955756907883, + "grad_norm": 1.584118127822876, + "learning_rate": 1.914305373720724e-05, + "loss": 0.54, + "step": 14141 + }, + { + "epoch": 2.3085588343332923, + "grad_norm": 1.6930195093154907, + "learning_rate": 1.9142925204801343e-05, + "loss": 0.5386, + "step": 14142 + }, + { + "epoch": 2.308722092975797, + "grad_norm": 1.9773590564727783, + "learning_rate": 1.9142796663188527e-05, + "loss": 0.6823, + "step": 14143 + }, + { + "epoch": 2.308885351618301, + "grad_norm": 1.552107334136963, + "learning_rate": 1.9142668112368917e-05, + "loss": 0.5271, + "step": 14144 + }, + { + "epoch": 2.3090486102608057, + "grad_norm": 1.7365341186523438, + "learning_rate": 1.9142539552342638e-05, + "loss": 0.4922, + "step": 14145 + }, + { + "epoch": 2.30921186890331, + "grad_norm": 2.3511784076690674, + "learning_rate": 1.9142410983109826e-05, + "loss": 0.992, + "step": 14146 + }, + { + "epoch": 2.3093751275458145, + "grad_norm": 1.538337230682373, + "learning_rate": 1.9142282404670613e-05, + "loss": 0.502, + "step": 14147 + }, + { + "epoch": 2.309538386188319, + "grad_norm": 1.6977447271347046, + "learning_rate": 1.914215381702512e-05, + "loss": 0.5336, + "step": 14148 + }, + { + "epoch": 2.3097016448308234, + "grad_norm": 1.9761089086532593, + "learning_rate": 1.914202522017348e-05, + "loss": 0.6506, + "step": 14149 + }, + { + "epoch": 2.3098649034733274, + "grad_norm": 1.6668850183486938, + "learning_rate": 1.9141896614115824e-05, + "loss": 0.5473, + "step": 14150 + }, + { + "epoch": 2.310028162115832, + "grad_norm": 1.6683109998703003, + "learning_rate": 1.9141767998852283e-05, + "loss": 0.5639, + "step": 14151 + }, + { + "epoch": 2.3101914207583363, + "grad_norm": 1.8965120315551758, + "learning_rate": 1.9141639374382982e-05, + "loss": 0.6094, + "step": 14152 + }, + { + "epoch": 2.3103546794008407, + "grad_norm": 1.781065821647644, + "learning_rate": 1.9141510740708053e-05, + "loss": 0.5865, + "step": 14153 + }, + { + "epoch": 2.310517938043345, + "grad_norm": 1.341018795967102, + "learning_rate": 1.9141382097827623e-05, + "loss": 0.4409, + "step": 14154 + }, + { + "epoch": 2.3106811966858496, + "grad_norm": 1.6383600234985352, + "learning_rate": 1.9141253445741826e-05, + "loss": 0.5083, + "step": 14155 + }, + { + "epoch": 2.310844455328354, + "grad_norm": 1.5889445543289185, + "learning_rate": 1.914112478445079e-05, + "loss": 0.502, + "step": 14156 + }, + { + "epoch": 2.3110077139708585, + "grad_norm": 1.8370862007141113, + "learning_rate": 1.9140996113954643e-05, + "loss": 0.6326, + "step": 14157 + }, + { + "epoch": 2.311170972613363, + "grad_norm": 2.0889790058135986, + "learning_rate": 1.9140867434253515e-05, + "loss": 0.6836, + "step": 14158 + }, + { + "epoch": 2.3113342312558673, + "grad_norm": 2.2921338081359863, + "learning_rate": 1.9140738745347535e-05, + "loss": 0.6075, + "step": 14159 + }, + { + "epoch": 2.3114974898983713, + "grad_norm": 1.751992106437683, + "learning_rate": 1.9140610047236834e-05, + "loss": 0.513, + "step": 14160 + }, + { + "epoch": 2.3116607485408758, + "grad_norm": 1.9034423828125, + "learning_rate": 1.914048133992154e-05, + "loss": 0.5331, + "step": 14161 + }, + { + "epoch": 2.31182400718338, + "grad_norm": 1.8179794549942017, + "learning_rate": 1.9140352623401785e-05, + "loss": 0.6295, + "step": 14162 + }, + { + "epoch": 2.3119872658258847, + "grad_norm": 1.812585473060608, + "learning_rate": 1.9140223897677697e-05, + "loss": 0.4853, + "step": 14163 + }, + { + "epoch": 2.312150524468389, + "grad_norm": 1.7048133611679077, + "learning_rate": 1.9140095162749408e-05, + "loss": 0.524, + "step": 14164 + }, + { + "epoch": 2.3123137831108935, + "grad_norm": 1.662611722946167, + "learning_rate": 1.9139966418617045e-05, + "loss": 0.5854, + "step": 14165 + }, + { + "epoch": 2.312477041753398, + "grad_norm": 2.0646820068359375, + "learning_rate": 1.9139837665280735e-05, + "loss": 0.6356, + "step": 14166 + }, + { + "epoch": 2.312640300395902, + "grad_norm": 1.9403454065322876, + "learning_rate": 1.913970890274061e-05, + "loss": 0.6402, + "step": 14167 + }, + { + "epoch": 2.3128035590384064, + "grad_norm": 1.8528285026550293, + "learning_rate": 1.9139580130996807e-05, + "loss": 0.586, + "step": 14168 + }, + { + "epoch": 2.312966817680911, + "grad_norm": 2.365898370742798, + "learning_rate": 1.913945135004945e-05, + "loss": 0.66, + "step": 14169 + }, + { + "epoch": 2.3131300763234153, + "grad_norm": 1.7872637510299683, + "learning_rate": 1.913932255989866e-05, + "loss": 0.5508, + "step": 14170 + }, + { + "epoch": 2.3132933349659197, + "grad_norm": 1.7277421951293945, + "learning_rate": 1.913919376054458e-05, + "loss": 0.5171, + "step": 14171 + }, + { + "epoch": 2.313456593608424, + "grad_norm": 1.4254661798477173, + "learning_rate": 1.9139064951987332e-05, + "loss": 0.5041, + "step": 14172 + }, + { + "epoch": 2.3136198522509286, + "grad_norm": 1.7253443002700806, + "learning_rate": 1.9138936134227047e-05, + "loss": 0.601, + "step": 14173 + }, + { + "epoch": 2.313783110893433, + "grad_norm": 1.595473051071167, + "learning_rate": 1.9138807307263858e-05, + "loss": 0.4817, + "step": 14174 + }, + { + "epoch": 2.3139463695359375, + "grad_norm": 2.0962610244750977, + "learning_rate": 1.9138678471097892e-05, + "loss": 0.6712, + "step": 14175 + }, + { + "epoch": 2.314109628178442, + "grad_norm": 1.4534828662872314, + "learning_rate": 1.913854962572928e-05, + "loss": 0.4836, + "step": 14176 + }, + { + "epoch": 2.314272886820946, + "grad_norm": 1.6335846185684204, + "learning_rate": 1.913842077115815e-05, + "loss": 0.5543, + "step": 14177 + }, + { + "epoch": 2.3144361454634503, + "grad_norm": 1.7109801769256592, + "learning_rate": 1.9138291907384632e-05, + "loss": 0.4767, + "step": 14178 + }, + { + "epoch": 2.3145994041059548, + "grad_norm": 1.7814111709594727, + "learning_rate": 1.9138163034408858e-05, + "loss": 0.5595, + "step": 14179 + }, + { + "epoch": 2.314762662748459, + "grad_norm": 1.659364938735962, + "learning_rate": 1.9138034152230952e-05, + "loss": 0.5119, + "step": 14180 + }, + { + "epoch": 2.3149259213909636, + "grad_norm": 1.566986083984375, + "learning_rate": 1.9137905260851056e-05, + "loss": 0.5273, + "step": 14181 + }, + { + "epoch": 2.315089180033468, + "grad_norm": 2.0784876346588135, + "learning_rate": 1.9137776360269285e-05, + "loss": 0.7071, + "step": 14182 + }, + { + "epoch": 2.3152524386759725, + "grad_norm": 1.762555718421936, + "learning_rate": 1.9137647450485775e-05, + "loss": 0.6498, + "step": 14183 + }, + { + "epoch": 2.315415697318477, + "grad_norm": 1.6993416547775269, + "learning_rate": 1.9137518531500662e-05, + "loss": 0.5871, + "step": 14184 + }, + { + "epoch": 2.315578955960981, + "grad_norm": 1.7905813455581665, + "learning_rate": 1.913738960331407e-05, + "loss": 0.6284, + "step": 14185 + }, + { + "epoch": 2.3157422146034854, + "grad_norm": 1.7983088493347168, + "learning_rate": 1.9137260665926123e-05, + "loss": 0.5997, + "step": 14186 + }, + { + "epoch": 2.31590547324599, + "grad_norm": 1.7449491024017334, + "learning_rate": 1.9137131719336962e-05, + "loss": 0.5514, + "step": 14187 + }, + { + "epoch": 2.3160687318884943, + "grad_norm": 1.5621578693389893, + "learning_rate": 1.913700276354671e-05, + "loss": 0.4959, + "step": 14188 + }, + { + "epoch": 2.3162319905309987, + "grad_norm": 1.7015128135681152, + "learning_rate": 1.91368737985555e-05, + "loss": 0.5658, + "step": 14189 + }, + { + "epoch": 2.316395249173503, + "grad_norm": 1.746320366859436, + "learning_rate": 1.913674482436346e-05, + "loss": 0.5591, + "step": 14190 + }, + { + "epoch": 2.3165585078160076, + "grad_norm": 1.8337692022323608, + "learning_rate": 1.913661584097072e-05, + "loss": 0.5585, + "step": 14191 + }, + { + "epoch": 2.316721766458512, + "grad_norm": 1.8575688600540161, + "learning_rate": 1.913648684837741e-05, + "loss": 0.5424, + "step": 14192 + }, + { + "epoch": 2.3168850251010165, + "grad_norm": 1.9168716669082642, + "learning_rate": 1.9136357846583658e-05, + "loss": 0.6102, + "step": 14193 + }, + { + "epoch": 2.317048283743521, + "grad_norm": 1.627753496170044, + "learning_rate": 1.91362288355896e-05, + "loss": 0.5121, + "step": 14194 + }, + { + "epoch": 2.317211542386025, + "grad_norm": 1.8297207355499268, + "learning_rate": 1.913609981539536e-05, + "loss": 0.5474, + "step": 14195 + }, + { + "epoch": 2.3173748010285293, + "grad_norm": 1.5398311614990234, + "learning_rate": 1.913597078600107e-05, + "loss": 0.478, + "step": 14196 + }, + { + "epoch": 2.3175380596710338, + "grad_norm": 1.639058232307434, + "learning_rate": 1.9135841747406863e-05, + "loss": 0.5408, + "step": 14197 + }, + { + "epoch": 2.317701318313538, + "grad_norm": 1.6240248680114746, + "learning_rate": 1.913571269961286e-05, + "loss": 0.5214, + "step": 14198 + }, + { + "epoch": 2.3178645769560426, + "grad_norm": 1.5397125482559204, + "learning_rate": 1.91355836426192e-05, + "loss": 0.5547, + "step": 14199 + }, + { + "epoch": 2.318027835598547, + "grad_norm": 2.00486159324646, + "learning_rate": 1.913545457642601e-05, + "loss": 0.6436, + "step": 14200 + }, + { + "epoch": 2.3181910942410515, + "grad_norm": 1.72153639793396, + "learning_rate": 1.913532550103342e-05, + "loss": 0.4965, + "step": 14201 + }, + { + "epoch": 2.318354352883556, + "grad_norm": 1.600995659828186, + "learning_rate": 1.9135196416441558e-05, + "loss": 0.5547, + "step": 14202 + }, + { + "epoch": 2.31851761152606, + "grad_norm": 1.9395190477371216, + "learning_rate": 1.913506732265056e-05, + "loss": 0.7194, + "step": 14203 + }, + { + "epoch": 2.3186808701685644, + "grad_norm": 2.1096675395965576, + "learning_rate": 1.9134938219660545e-05, + "loss": 0.6371, + "step": 14204 + }, + { + "epoch": 2.318844128811069, + "grad_norm": 1.8336228132247925, + "learning_rate": 1.9134809107471652e-05, + "loss": 0.5803, + "step": 14205 + }, + { + "epoch": 2.3190073874535733, + "grad_norm": 2.0344910621643066, + "learning_rate": 1.9134679986084008e-05, + "loss": 0.6741, + "step": 14206 + }, + { + "epoch": 2.3191706460960777, + "grad_norm": 1.856908917427063, + "learning_rate": 1.9134550855497745e-05, + "loss": 0.6134, + "step": 14207 + }, + { + "epoch": 2.319333904738582, + "grad_norm": 1.5771139860153198, + "learning_rate": 1.9134421715712993e-05, + "loss": 0.4426, + "step": 14208 + }, + { + "epoch": 2.3194971633810866, + "grad_norm": 1.9810947179794312, + "learning_rate": 1.913429256672988e-05, + "loss": 0.5809, + "step": 14209 + }, + { + "epoch": 2.319660422023591, + "grad_norm": 1.7833067178726196, + "learning_rate": 1.9134163408548537e-05, + "loss": 0.5816, + "step": 14210 + }, + { + "epoch": 2.3198236806660955, + "grad_norm": 1.6517349481582642, + "learning_rate": 1.913403424116909e-05, + "loss": 0.4788, + "step": 14211 + }, + { + "epoch": 2.3199869393086, + "grad_norm": 1.6973838806152344, + "learning_rate": 1.9133905064591677e-05, + "loss": 0.5069, + "step": 14212 + }, + { + "epoch": 2.320150197951104, + "grad_norm": 1.4741166830062866, + "learning_rate": 1.9133775878816423e-05, + "loss": 0.5158, + "step": 14213 + }, + { + "epoch": 2.3203134565936083, + "grad_norm": 1.7736214399337769, + "learning_rate": 1.913364668384346e-05, + "loss": 0.5577, + "step": 14214 + }, + { + "epoch": 2.3204767152361128, + "grad_norm": 1.7309151887893677, + "learning_rate": 1.9133517479672917e-05, + "loss": 0.5824, + "step": 14215 + }, + { + "epoch": 2.320639973878617, + "grad_norm": 1.7922288179397583, + "learning_rate": 1.9133388266304925e-05, + "loss": 0.6659, + "step": 14216 + }, + { + "epoch": 2.3208032325211216, + "grad_norm": 1.6768302917480469, + "learning_rate": 1.9133259043739612e-05, + "loss": 0.5371, + "step": 14217 + }, + { + "epoch": 2.320966491163626, + "grad_norm": 2.1058316230773926, + "learning_rate": 1.913312981197711e-05, + "loss": 0.6223, + "step": 14218 + }, + { + "epoch": 2.3211297498061305, + "grad_norm": 2.086566925048828, + "learning_rate": 1.9133000571017546e-05, + "loss": 0.6546, + "step": 14219 + }, + { + "epoch": 2.3212930084486345, + "grad_norm": 1.7139840126037598, + "learning_rate": 1.9132871320861056e-05, + "loss": 0.6002, + "step": 14220 + }, + { + "epoch": 2.321456267091139, + "grad_norm": 1.8418947458267212, + "learning_rate": 1.9132742061507764e-05, + "loss": 0.5489, + "step": 14221 + }, + { + "epoch": 2.3216195257336434, + "grad_norm": 1.9892988204956055, + "learning_rate": 1.9132612792957808e-05, + "loss": 0.6459, + "step": 14222 + }, + { + "epoch": 2.321782784376148, + "grad_norm": 1.4540761709213257, + "learning_rate": 1.913248351521131e-05, + "loss": 0.4882, + "step": 14223 + }, + { + "epoch": 2.3219460430186523, + "grad_norm": 2.1280593872070312, + "learning_rate": 1.9132354228268407e-05, + "loss": 0.6558, + "step": 14224 + }, + { + "epoch": 2.3221093016611567, + "grad_norm": 1.5609755516052246, + "learning_rate": 1.913222493212922e-05, + "loss": 0.4896, + "step": 14225 + }, + { + "epoch": 2.322272560303661, + "grad_norm": 2.295138120651245, + "learning_rate": 1.913209562679389e-05, + "loss": 0.5907, + "step": 14226 + }, + { + "epoch": 2.3224358189461656, + "grad_norm": 1.9804073572158813, + "learning_rate": 1.9131966312262538e-05, + "loss": 0.6443, + "step": 14227 + }, + { + "epoch": 2.32259907758867, + "grad_norm": 1.8519036769866943, + "learning_rate": 1.91318369885353e-05, + "loss": 0.5636, + "step": 14228 + }, + { + "epoch": 2.3227623362311745, + "grad_norm": 1.998543620109558, + "learning_rate": 1.9131707655612305e-05, + "loss": 0.6263, + "step": 14229 + }, + { + "epoch": 2.3229255948736784, + "grad_norm": 1.9605021476745605, + "learning_rate": 1.9131578313493685e-05, + "loss": 0.6959, + "step": 14230 + }, + { + "epoch": 2.323088853516183, + "grad_norm": 1.8227728605270386, + "learning_rate": 1.9131448962179564e-05, + "loss": 0.6289, + "step": 14231 + }, + { + "epoch": 2.3232521121586873, + "grad_norm": 1.7512179613113403, + "learning_rate": 1.9131319601670077e-05, + "loss": 0.5964, + "step": 14232 + }, + { + "epoch": 2.3234153708011918, + "grad_norm": 1.6004233360290527, + "learning_rate": 1.9131190231965356e-05, + "loss": 0.5552, + "step": 14233 + }, + { + "epoch": 2.323578629443696, + "grad_norm": 1.7075024843215942, + "learning_rate": 1.9131060853065527e-05, + "loss": 0.4941, + "step": 14234 + }, + { + "epoch": 2.3237418880862006, + "grad_norm": 1.756809115409851, + "learning_rate": 1.9130931464970725e-05, + "loss": 0.4692, + "step": 14235 + }, + { + "epoch": 2.323905146728705, + "grad_norm": 2.066211223602295, + "learning_rate": 1.9130802067681075e-05, + "loss": 0.6146, + "step": 14236 + }, + { + "epoch": 2.3240684053712095, + "grad_norm": 1.6379774808883667, + "learning_rate": 1.913067266119671e-05, + "loss": 0.5181, + "step": 14237 + }, + { + "epoch": 2.3242316640137135, + "grad_norm": 1.8124704360961914, + "learning_rate": 1.9130543245517762e-05, + "loss": 0.4951, + "step": 14238 + }, + { + "epoch": 2.324394922656218, + "grad_norm": 2.002039909362793, + "learning_rate": 1.9130413820644355e-05, + "loss": 0.5836, + "step": 14239 + }, + { + "epoch": 2.3245581812987224, + "grad_norm": 1.8837475776672363, + "learning_rate": 1.9130284386576627e-05, + "loss": 0.5724, + "step": 14240 + }, + { + "epoch": 2.324721439941227, + "grad_norm": 1.7485705614089966, + "learning_rate": 1.9130154943314704e-05, + "loss": 0.5909, + "step": 14241 + }, + { + "epoch": 2.3248846985837313, + "grad_norm": 1.8324661254882812, + "learning_rate": 1.9130025490858717e-05, + "loss": 0.5851, + "step": 14242 + }, + { + "epoch": 2.3250479572262357, + "grad_norm": 1.813321828842163, + "learning_rate": 1.9129896029208796e-05, + "loss": 0.6307, + "step": 14243 + }, + { + "epoch": 2.32521121586874, + "grad_norm": 1.611656665802002, + "learning_rate": 1.9129766558365076e-05, + "loss": 0.5568, + "step": 14244 + }, + { + "epoch": 2.3253744745112446, + "grad_norm": 2.147583246231079, + "learning_rate": 1.912963707832768e-05, + "loss": 0.7279, + "step": 14245 + }, + { + "epoch": 2.325537733153749, + "grad_norm": 1.8664380311965942, + "learning_rate": 1.9129507589096744e-05, + "loss": 0.6078, + "step": 14246 + }, + { + "epoch": 2.3257009917962534, + "grad_norm": 1.6883580684661865, + "learning_rate": 1.9129378090672393e-05, + "loss": 0.5709, + "step": 14247 + }, + { + "epoch": 2.3258642504387574, + "grad_norm": 1.8360906839370728, + "learning_rate": 1.912924858305476e-05, + "loss": 0.5584, + "step": 14248 + }, + { + "epoch": 2.326027509081262, + "grad_norm": 1.8159902095794678, + "learning_rate": 1.9129119066243982e-05, + "loss": 0.5408, + "step": 14249 + }, + { + "epoch": 2.3261907677237663, + "grad_norm": 1.7340307235717773, + "learning_rate": 1.9128989540240178e-05, + "loss": 0.5493, + "step": 14250 + }, + { + "epoch": 2.3263540263662708, + "grad_norm": 1.8884925842285156, + "learning_rate": 1.9128860005043488e-05, + "loss": 0.6084, + "step": 14251 + }, + { + "epoch": 2.326517285008775, + "grad_norm": 1.6388918161392212, + "learning_rate": 1.9128730460654038e-05, + "loss": 0.5564, + "step": 14252 + }, + { + "epoch": 2.3266805436512796, + "grad_norm": 2.0416810512542725, + "learning_rate": 1.9128600907071956e-05, + "loss": 0.6814, + "step": 14253 + }, + { + "epoch": 2.326843802293784, + "grad_norm": 2.058668613433838, + "learning_rate": 1.9128471344297377e-05, + "loss": 0.6637, + "step": 14254 + }, + { + "epoch": 2.3270070609362885, + "grad_norm": 1.6928290128707886, + "learning_rate": 1.9128341772330428e-05, + "loss": 0.5371, + "step": 14255 + }, + { + "epoch": 2.3271703195787925, + "grad_norm": 2.1240878105163574, + "learning_rate": 1.9128212191171244e-05, + "loss": 0.6247, + "step": 14256 + }, + { + "epoch": 2.327333578221297, + "grad_norm": 1.7627924680709839, + "learning_rate": 1.9128082600819953e-05, + "loss": 0.5704, + "step": 14257 + }, + { + "epoch": 2.3274968368638014, + "grad_norm": 1.7222764492034912, + "learning_rate": 1.9127953001276684e-05, + "loss": 0.6094, + "step": 14258 + }, + { + "epoch": 2.327660095506306, + "grad_norm": 1.4306830167770386, + "learning_rate": 1.912782339254157e-05, + "loss": 0.4233, + "step": 14259 + }, + { + "epoch": 2.3278233541488103, + "grad_norm": 1.4827498197555542, + "learning_rate": 1.9127693774614738e-05, + "loss": 0.5053, + "step": 14260 + }, + { + "epoch": 2.3279866127913147, + "grad_norm": 1.9849332571029663, + "learning_rate": 1.912756414749632e-05, + "loss": 0.626, + "step": 14261 + }, + { + "epoch": 2.328149871433819, + "grad_norm": 1.6904468536376953, + "learning_rate": 1.912743451118645e-05, + "loss": 0.5314, + "step": 14262 + }, + { + "epoch": 2.3283131300763236, + "grad_norm": 1.6485413312911987, + "learning_rate": 1.9127304865685253e-05, + "loss": 0.498, + "step": 14263 + }, + { + "epoch": 2.328476388718828, + "grad_norm": 1.930696725845337, + "learning_rate": 1.9127175210992865e-05, + "loss": 0.6132, + "step": 14264 + }, + { + "epoch": 2.328639647361332, + "grad_norm": 1.8501136302947998, + "learning_rate": 1.9127045547109414e-05, + "loss": 0.6081, + "step": 14265 + }, + { + "epoch": 2.3288029060038364, + "grad_norm": 2.0912435054779053, + "learning_rate": 1.912691587403503e-05, + "loss": 0.7638, + "step": 14266 + }, + { + "epoch": 2.328966164646341, + "grad_norm": 1.9085915088653564, + "learning_rate": 1.9126786191769845e-05, + "loss": 0.654, + "step": 14267 + }, + { + "epoch": 2.3291294232888453, + "grad_norm": 1.686488151550293, + "learning_rate": 1.9126656500313987e-05, + "loss": 0.5002, + "step": 14268 + }, + { + "epoch": 2.3292926819313498, + "grad_norm": 2.228661060333252, + "learning_rate": 1.9126526799667588e-05, + "loss": 0.592, + "step": 14269 + }, + { + "epoch": 2.329455940573854, + "grad_norm": 2.001030445098877, + "learning_rate": 1.912639708983078e-05, + "loss": 0.567, + "step": 14270 + }, + { + "epoch": 2.3296191992163586, + "grad_norm": 2.0792925357818604, + "learning_rate": 1.9126267370803692e-05, + "loss": 0.6462, + "step": 14271 + }, + { + "epoch": 2.329782457858863, + "grad_norm": 1.6004130840301514, + "learning_rate": 1.912613764258646e-05, + "loss": 0.4556, + "step": 14272 + }, + { + "epoch": 2.329945716501367, + "grad_norm": 1.4718183279037476, + "learning_rate": 1.9126007905179203e-05, + "loss": 0.4712, + "step": 14273 + }, + { + "epoch": 2.3301089751438715, + "grad_norm": 1.7839319705963135, + "learning_rate": 1.912587815858206e-05, + "loss": 0.6048, + "step": 14274 + }, + { + "epoch": 2.330272233786376, + "grad_norm": 1.712062954902649, + "learning_rate": 1.912574840279516e-05, + "loss": 0.5209, + "step": 14275 + }, + { + "epoch": 2.3304354924288804, + "grad_norm": 1.8125019073486328, + "learning_rate": 1.9125618637818636e-05, + "loss": 0.5509, + "step": 14276 + }, + { + "epoch": 2.330598751071385, + "grad_norm": 1.8435832262039185, + "learning_rate": 1.9125488863652614e-05, + "loss": 0.5736, + "step": 14277 + }, + { + "epoch": 2.3307620097138892, + "grad_norm": 1.7678258419036865, + "learning_rate": 1.912535908029723e-05, + "loss": 0.5895, + "step": 14278 + }, + { + "epoch": 2.3309252683563937, + "grad_norm": 1.8242915868759155, + "learning_rate": 1.912522928775261e-05, + "loss": 0.6025, + "step": 14279 + }, + { + "epoch": 2.331088526998898, + "grad_norm": 1.5700633525848389, + "learning_rate": 1.9125099486018887e-05, + "loss": 0.5432, + "step": 14280 + }, + { + "epoch": 2.3312517856414026, + "grad_norm": 1.8027968406677246, + "learning_rate": 1.9124969675096188e-05, + "loss": 0.5087, + "step": 14281 + }, + { + "epoch": 2.331415044283907, + "grad_norm": 2.140265941619873, + "learning_rate": 1.9124839854984652e-05, + "loss": 0.5949, + "step": 14282 + }, + { + "epoch": 2.331578302926411, + "grad_norm": 2.099541187286377, + "learning_rate": 1.9124710025684405e-05, + "loss": 0.7018, + "step": 14283 + }, + { + "epoch": 2.3317415615689154, + "grad_norm": 1.5682073831558228, + "learning_rate": 1.9124580187195574e-05, + "loss": 0.4786, + "step": 14284 + }, + { + "epoch": 2.33190482021142, + "grad_norm": 1.7691290378570557, + "learning_rate": 1.9124450339518296e-05, + "loss": 0.5646, + "step": 14285 + }, + { + "epoch": 2.3320680788539243, + "grad_norm": 1.543060064315796, + "learning_rate": 1.9124320482652696e-05, + "loss": 0.5621, + "step": 14286 + }, + { + "epoch": 2.3322313374964287, + "grad_norm": 1.9433298110961914, + "learning_rate": 1.9124190616598906e-05, + "loss": 0.53, + "step": 14287 + }, + { + "epoch": 2.332394596138933, + "grad_norm": 1.4831067323684692, + "learning_rate": 1.9124060741357065e-05, + "loss": 0.4647, + "step": 14288 + }, + { + "epoch": 2.3325578547814376, + "grad_norm": 1.80315363407135, + "learning_rate": 1.9123930856927292e-05, + "loss": 0.544, + "step": 14289 + }, + { + "epoch": 2.332721113423942, + "grad_norm": 1.7740825414657593, + "learning_rate": 1.9123800963309725e-05, + "loss": 0.5292, + "step": 14290 + }, + { + "epoch": 2.332884372066446, + "grad_norm": 1.4382270574569702, + "learning_rate": 1.912367106050449e-05, + "loss": 0.4602, + "step": 14291 + }, + { + "epoch": 2.3330476307089505, + "grad_norm": 1.7135275602340698, + "learning_rate": 1.9123541148511723e-05, + "loss": 0.518, + "step": 14292 + }, + { + "epoch": 2.333210889351455, + "grad_norm": 1.728021502494812, + "learning_rate": 1.9123411227331555e-05, + "loss": 0.4977, + "step": 14293 + }, + { + "epoch": 2.3333741479939594, + "grad_norm": 1.7978817224502563, + "learning_rate": 1.9123281296964112e-05, + "loss": 0.5328, + "step": 14294 + }, + { + "epoch": 2.333537406636464, + "grad_norm": 2.020028829574585, + "learning_rate": 1.912315135740953e-05, + "loss": 0.6713, + "step": 14295 + }, + { + "epoch": 2.3337006652789682, + "grad_norm": 1.782649278640747, + "learning_rate": 1.912302140866793e-05, + "loss": 0.5856, + "step": 14296 + }, + { + "epoch": 2.3338639239214727, + "grad_norm": 1.75487220287323, + "learning_rate": 1.9122891450739453e-05, + "loss": 0.5032, + "step": 14297 + }, + { + "epoch": 2.334027182563977, + "grad_norm": 1.9601994752883911, + "learning_rate": 1.912276148362423e-05, + "loss": 0.6021, + "step": 14298 + }, + { + "epoch": 2.3341904412064816, + "grad_norm": 2.075417995452881, + "learning_rate": 1.9122631507322388e-05, + "loss": 0.6338, + "step": 14299 + }, + { + "epoch": 2.334353699848986, + "grad_norm": 1.8739588260650635, + "learning_rate": 1.9122501521834052e-05, + "loss": 0.4959, + "step": 14300 + }, + { + "epoch": 2.33451695849149, + "grad_norm": 1.7790136337280273, + "learning_rate": 1.9122371527159367e-05, + "loss": 0.5995, + "step": 14301 + }, + { + "epoch": 2.3346802171339944, + "grad_norm": 1.8788526058197021, + "learning_rate": 1.9122241523298456e-05, + "loss": 0.5384, + "step": 14302 + }, + { + "epoch": 2.334843475776499, + "grad_norm": 1.642607569694519, + "learning_rate": 1.9122111510251446e-05, + "loss": 0.5308, + "step": 14303 + }, + { + "epoch": 2.3350067344190033, + "grad_norm": 1.7622816562652588, + "learning_rate": 1.9121981488018472e-05, + "loss": 0.6074, + "step": 14304 + }, + { + "epoch": 2.3351699930615077, + "grad_norm": 1.9012219905853271, + "learning_rate": 1.912185145659967e-05, + "loss": 0.5567, + "step": 14305 + }, + { + "epoch": 2.335333251704012, + "grad_norm": 1.618207335472107, + "learning_rate": 1.912172141599516e-05, + "loss": 0.5222, + "step": 14306 + }, + { + "epoch": 2.3354965103465166, + "grad_norm": 1.7620047330856323, + "learning_rate": 1.9121591366205085e-05, + "loss": 0.5918, + "step": 14307 + }, + { + "epoch": 2.3356597689890206, + "grad_norm": 1.486535668373108, + "learning_rate": 1.9121461307229566e-05, + "loss": 0.5289, + "step": 14308 + }, + { + "epoch": 2.335823027631525, + "grad_norm": 2.0269103050231934, + "learning_rate": 1.9121331239068738e-05, + "loss": 0.6905, + "step": 14309 + }, + { + "epoch": 2.3359862862740295, + "grad_norm": 1.6623226404190063, + "learning_rate": 1.9121201161722732e-05, + "loss": 0.5609, + "step": 14310 + }, + { + "epoch": 2.336149544916534, + "grad_norm": 1.5255972146987915, + "learning_rate": 1.912107107519168e-05, + "loss": 0.483, + "step": 14311 + }, + { + "epoch": 2.3363128035590384, + "grad_norm": 1.5768585205078125, + "learning_rate": 1.912094097947571e-05, + "loss": 0.4628, + "step": 14312 + }, + { + "epoch": 2.336476062201543, + "grad_norm": 1.6449605226516724, + "learning_rate": 1.9120810874574958e-05, + "loss": 0.5334, + "step": 14313 + }, + { + "epoch": 2.3366393208440472, + "grad_norm": 1.7894138097763062, + "learning_rate": 1.9120680760489548e-05, + "loss": 0.6191, + "step": 14314 + }, + { + "epoch": 2.3368025794865517, + "grad_norm": 2.2405810356140137, + "learning_rate": 1.9120550637219618e-05, + "loss": 0.661, + "step": 14315 + }, + { + "epoch": 2.336965838129056, + "grad_norm": 1.530287265777588, + "learning_rate": 1.9120420504765292e-05, + "loss": 0.4335, + "step": 14316 + }, + { + "epoch": 2.3371290967715606, + "grad_norm": 2.3999314308166504, + "learning_rate": 1.912029036312671e-05, + "loss": 0.6515, + "step": 14317 + }, + { + "epoch": 2.3372923554140645, + "grad_norm": 1.6913529634475708, + "learning_rate": 1.9120160212303995e-05, + "loss": 0.5503, + "step": 14318 + }, + { + "epoch": 2.337455614056569, + "grad_norm": 1.6279637813568115, + "learning_rate": 1.9120030052297283e-05, + "loss": 0.5677, + "step": 14319 + }, + { + "epoch": 2.3376188726990734, + "grad_norm": 1.6507736444473267, + "learning_rate": 1.9119899883106702e-05, + "loss": 0.5793, + "step": 14320 + }, + { + "epoch": 2.337782131341578, + "grad_norm": 1.957861304283142, + "learning_rate": 1.9119769704732382e-05, + "loss": 0.6089, + "step": 14321 + }, + { + "epoch": 2.3379453899840823, + "grad_norm": 1.6451245546340942, + "learning_rate": 1.911963951717446e-05, + "loss": 0.5335, + "step": 14322 + }, + { + "epoch": 2.3381086486265867, + "grad_norm": 2.011702537536621, + "learning_rate": 1.9119509320433062e-05, + "loss": 0.6383, + "step": 14323 + }, + { + "epoch": 2.338271907269091, + "grad_norm": 1.5543642044067383, + "learning_rate": 1.9119379114508322e-05, + "loss": 0.5177, + "step": 14324 + }, + { + "epoch": 2.3384351659115956, + "grad_norm": 1.8404370546340942, + "learning_rate": 1.9119248899400366e-05, + "loss": 0.6137, + "step": 14325 + }, + { + "epoch": 2.3385984245540996, + "grad_norm": 2.0368075370788574, + "learning_rate": 1.9119118675109332e-05, + "loss": 0.559, + "step": 14326 + }, + { + "epoch": 2.338761683196604, + "grad_norm": 2.5275943279266357, + "learning_rate": 1.9118988441635347e-05, + "loss": 0.6466, + "step": 14327 + }, + { + "epoch": 2.3389249418391085, + "grad_norm": 2.061138153076172, + "learning_rate": 1.9118858198978542e-05, + "loss": 0.6275, + "step": 14328 + }, + { + "epoch": 2.339088200481613, + "grad_norm": 1.6408828496932983, + "learning_rate": 1.911872794713905e-05, + "loss": 0.5534, + "step": 14329 + }, + { + "epoch": 2.3392514591241174, + "grad_norm": 1.9304453134536743, + "learning_rate": 1.9118597686117e-05, + "loss": 0.62, + "step": 14330 + }, + { + "epoch": 2.339414717766622, + "grad_norm": 1.8575464487075806, + "learning_rate": 1.9118467415912526e-05, + "loss": 0.6228, + "step": 14331 + }, + { + "epoch": 2.3395779764091262, + "grad_norm": 2.1903560161590576, + "learning_rate": 1.911833713652576e-05, + "loss": 0.6428, + "step": 14332 + }, + { + "epoch": 2.3397412350516307, + "grad_norm": 1.8867651224136353, + "learning_rate": 1.911820684795683e-05, + "loss": 0.6746, + "step": 14333 + }, + { + "epoch": 2.339904493694135, + "grad_norm": 1.8304061889648438, + "learning_rate": 1.9118076550205864e-05, + "loss": 0.5753, + "step": 14334 + }, + { + "epoch": 2.3400677523366396, + "grad_norm": 2.2735514640808105, + "learning_rate": 1.9117946243273003e-05, + "loss": 0.5904, + "step": 14335 + }, + { + "epoch": 2.3402310109791435, + "grad_norm": 1.6983143091201782, + "learning_rate": 1.9117815927158367e-05, + "loss": 0.5738, + "step": 14336 + }, + { + "epoch": 2.340394269621648, + "grad_norm": 1.6759073734283447, + "learning_rate": 1.9117685601862094e-05, + "loss": 0.6015, + "step": 14337 + }, + { + "epoch": 2.3405575282641524, + "grad_norm": 2.011970043182373, + "learning_rate": 1.911755526738432e-05, + "loss": 0.6494, + "step": 14338 + }, + { + "epoch": 2.340720786906657, + "grad_norm": 2.054267168045044, + "learning_rate": 1.9117424923725164e-05, + "loss": 0.6877, + "step": 14339 + }, + { + "epoch": 2.3408840455491613, + "grad_norm": 2.092139959335327, + "learning_rate": 1.9117294570884764e-05, + "loss": 0.7091, + "step": 14340 + }, + { + "epoch": 2.3410473041916657, + "grad_norm": 2.2219839096069336, + "learning_rate": 1.9117164208863253e-05, + "loss": 0.7297, + "step": 14341 + }, + { + "epoch": 2.34121056283417, + "grad_norm": 1.974481225013733, + "learning_rate": 1.911703383766076e-05, + "loss": 0.5656, + "step": 14342 + }, + { + "epoch": 2.3413738214766746, + "grad_norm": 1.9667754173278809, + "learning_rate": 1.9116903457277413e-05, + "loss": 0.5152, + "step": 14343 + }, + { + "epoch": 2.3415370801191786, + "grad_norm": 2.1050865650177, + "learning_rate": 1.911677306771335e-05, + "loss": 0.5313, + "step": 14344 + }, + { + "epoch": 2.341700338761683, + "grad_norm": 2.0704214572906494, + "learning_rate": 1.91166426689687e-05, + "loss": 0.7297, + "step": 14345 + }, + { + "epoch": 2.3418635974041875, + "grad_norm": 1.6567240953445435, + "learning_rate": 1.911651226104359e-05, + "loss": 0.6524, + "step": 14346 + }, + { + "epoch": 2.342026856046692, + "grad_norm": 1.7096531391143799, + "learning_rate": 1.9116381843938153e-05, + "loss": 0.5473, + "step": 14347 + }, + { + "epoch": 2.3421901146891964, + "grad_norm": 1.4323474168777466, + "learning_rate": 1.9116251417652527e-05, + "loss": 0.4942, + "step": 14348 + }, + { + "epoch": 2.342353373331701, + "grad_norm": 1.7105293273925781, + "learning_rate": 1.9116120982186835e-05, + "loss": 0.674, + "step": 14349 + }, + { + "epoch": 2.3425166319742052, + "grad_norm": 1.4878153800964355, + "learning_rate": 1.9115990537541217e-05, + "loss": 0.4937, + "step": 14350 + }, + { + "epoch": 2.3426798906167097, + "grad_norm": 1.825160026550293, + "learning_rate": 1.911586008371579e-05, + "loss": 0.5951, + "step": 14351 + }, + { + "epoch": 2.342843149259214, + "grad_norm": 1.8508250713348389, + "learning_rate": 1.91157296207107e-05, + "loss": 0.5814, + "step": 14352 + }, + { + "epoch": 2.3430064079017185, + "grad_norm": 1.5965656042099, + "learning_rate": 1.9115599148526073e-05, + "loss": 0.4927, + "step": 14353 + }, + { + "epoch": 2.3431696665442225, + "grad_norm": 1.9318604469299316, + "learning_rate": 1.9115468667162038e-05, + "loss": 0.5331, + "step": 14354 + }, + { + "epoch": 2.343332925186727, + "grad_norm": 1.62886381149292, + "learning_rate": 1.911533817661873e-05, + "loss": 0.5146, + "step": 14355 + }, + { + "epoch": 2.3434961838292314, + "grad_norm": 1.7920275926589966, + "learning_rate": 1.911520767689628e-05, + "loss": 0.5501, + "step": 14356 + }, + { + "epoch": 2.343659442471736, + "grad_norm": 1.8077408075332642, + "learning_rate": 1.9115077167994812e-05, + "loss": 0.5271, + "step": 14357 + }, + { + "epoch": 2.3438227011142403, + "grad_norm": 1.6737130880355835, + "learning_rate": 1.9114946649914467e-05, + "loss": 0.519, + "step": 14358 + }, + { + "epoch": 2.3439859597567447, + "grad_norm": 1.6862773895263672, + "learning_rate": 1.9114816122655378e-05, + "loss": 0.4691, + "step": 14359 + }, + { + "epoch": 2.344149218399249, + "grad_norm": 1.6092745065689087, + "learning_rate": 1.9114685586217666e-05, + "loss": 0.5611, + "step": 14360 + }, + { + "epoch": 2.344312477041753, + "grad_norm": 1.782630205154419, + "learning_rate": 1.911455504060147e-05, + "loss": 0.5303, + "step": 14361 + }, + { + "epoch": 2.3444757356842576, + "grad_norm": 1.6904139518737793, + "learning_rate": 1.911442448580692e-05, + "loss": 0.4815, + "step": 14362 + }, + { + "epoch": 2.344638994326762, + "grad_norm": 2.0363881587982178, + "learning_rate": 1.9114293921834144e-05, + "loss": 0.4933, + "step": 14363 + }, + { + "epoch": 2.3448022529692665, + "grad_norm": 1.9386905431747437, + "learning_rate": 1.9114163348683277e-05, + "loss": 0.6176, + "step": 14364 + }, + { + "epoch": 2.344965511611771, + "grad_norm": 1.7429319620132446, + "learning_rate": 1.9114032766354453e-05, + "loss": 0.5071, + "step": 14365 + }, + { + "epoch": 2.3451287702542754, + "grad_norm": 1.5838344097137451, + "learning_rate": 1.91139021748478e-05, + "loss": 0.5488, + "step": 14366 + }, + { + "epoch": 2.34529202889678, + "grad_norm": 2.0674376487731934, + "learning_rate": 1.911377157416345e-05, + "loss": 0.6119, + "step": 14367 + }, + { + "epoch": 2.3454552875392842, + "grad_norm": 1.8318235874176025, + "learning_rate": 1.911364096430153e-05, + "loss": 0.5395, + "step": 14368 + }, + { + "epoch": 2.3456185461817887, + "grad_norm": 1.494236707687378, + "learning_rate": 1.9113510345262183e-05, + "loss": 0.4589, + "step": 14369 + }, + { + "epoch": 2.345781804824293, + "grad_norm": 1.6821712255477905, + "learning_rate": 1.9113379717045528e-05, + "loss": 0.5136, + "step": 14370 + }, + { + "epoch": 2.345945063466797, + "grad_norm": 1.8862628936767578, + "learning_rate": 1.9113249079651705e-05, + "loss": 0.5677, + "step": 14371 + }, + { + "epoch": 2.3461083221093015, + "grad_norm": 1.7608035802841187, + "learning_rate": 1.911311843308084e-05, + "loss": 0.5561, + "step": 14372 + }, + { + "epoch": 2.346271580751806, + "grad_norm": 1.6609976291656494, + "learning_rate": 1.911298777733307e-05, + "loss": 0.5127, + "step": 14373 + }, + { + "epoch": 2.3464348393943104, + "grad_norm": 1.4505976438522339, + "learning_rate": 1.911285711240852e-05, + "loss": 0.4316, + "step": 14374 + }, + { + "epoch": 2.346598098036815, + "grad_norm": 1.6782935857772827, + "learning_rate": 1.9112726438307328e-05, + "loss": 0.5032, + "step": 14375 + }, + { + "epoch": 2.3467613566793193, + "grad_norm": 1.8453587293624878, + "learning_rate": 1.9112595755029625e-05, + "loss": 0.5802, + "step": 14376 + }, + { + "epoch": 2.3469246153218237, + "grad_norm": 1.7005547285079956, + "learning_rate": 1.9112465062575536e-05, + "loss": 0.6038, + "step": 14377 + }, + { + "epoch": 2.347087873964328, + "grad_norm": 2.008389472961426, + "learning_rate": 1.91123343609452e-05, + "loss": 0.6202, + "step": 14378 + }, + { + "epoch": 2.347251132606832, + "grad_norm": 1.6299771070480347, + "learning_rate": 1.9112203650138744e-05, + "loss": 0.5347, + "step": 14379 + }, + { + "epoch": 2.3474143912493366, + "grad_norm": 1.9562451839447021, + "learning_rate": 1.91120729301563e-05, + "loss": 0.5753, + "step": 14380 + }, + { + "epoch": 2.347577649891841, + "grad_norm": 1.6553243398666382, + "learning_rate": 1.9111942200998005e-05, + "loss": 0.5225, + "step": 14381 + }, + { + "epoch": 2.3477409085343455, + "grad_norm": 1.9632201194763184, + "learning_rate": 1.9111811462663987e-05, + "loss": 0.7094, + "step": 14382 + }, + { + "epoch": 2.34790416717685, + "grad_norm": 1.9656965732574463, + "learning_rate": 1.9111680715154373e-05, + "loss": 0.6056, + "step": 14383 + }, + { + "epoch": 2.3480674258193543, + "grad_norm": 1.7770682573318481, + "learning_rate": 1.9111549958469302e-05, + "loss": 0.4968, + "step": 14384 + }, + { + "epoch": 2.348230684461859, + "grad_norm": 1.6981698274612427, + "learning_rate": 1.91114191926089e-05, + "loss": 0.5233, + "step": 14385 + }, + { + "epoch": 2.3483939431043632, + "grad_norm": 2.042553186416626, + "learning_rate": 1.9111288417573302e-05, + "loss": 0.6057, + "step": 14386 + }, + { + "epoch": 2.3485572017468677, + "grad_norm": 1.9041385650634766, + "learning_rate": 1.9111157633362642e-05, + "loss": 0.6243, + "step": 14387 + }, + { + "epoch": 2.348720460389372, + "grad_norm": 1.7111514806747437, + "learning_rate": 1.9111026839977046e-05, + "loss": 0.5369, + "step": 14388 + }, + { + "epoch": 2.348883719031876, + "grad_norm": 1.933585286140442, + "learning_rate": 1.911089603741665e-05, + "loss": 0.6223, + "step": 14389 + }, + { + "epoch": 2.3490469776743805, + "grad_norm": 1.7815806865692139, + "learning_rate": 1.9110765225681582e-05, + "loss": 0.5755, + "step": 14390 + }, + { + "epoch": 2.349210236316885, + "grad_norm": 1.6987594366073608, + "learning_rate": 1.9110634404771976e-05, + "loss": 0.5437, + "step": 14391 + }, + { + "epoch": 2.3493734949593894, + "grad_norm": 1.580695390701294, + "learning_rate": 1.9110503574687963e-05, + "loss": 0.5305, + "step": 14392 + }, + { + "epoch": 2.349536753601894, + "grad_norm": 1.701859712600708, + "learning_rate": 1.9110372735429678e-05, + "loss": 0.5509, + "step": 14393 + }, + { + "epoch": 2.3497000122443983, + "grad_norm": 1.5803920030593872, + "learning_rate": 1.911024188699725e-05, + "loss": 0.5826, + "step": 14394 + }, + { + "epoch": 2.3498632708869027, + "grad_norm": 1.7201087474822998, + "learning_rate": 1.911011102939081e-05, + "loss": 0.5015, + "step": 14395 + }, + { + "epoch": 2.3500265295294067, + "grad_norm": 1.5218333005905151, + "learning_rate": 1.910998016261049e-05, + "loss": 0.5102, + "step": 14396 + }, + { + "epoch": 2.350189788171911, + "grad_norm": 1.6910895109176636, + "learning_rate": 1.9109849286656422e-05, + "loss": 0.5927, + "step": 14397 + }, + { + "epoch": 2.3503530468144156, + "grad_norm": 1.8621644973754883, + "learning_rate": 1.9109718401528742e-05, + "loss": 0.6189, + "step": 14398 + }, + { + "epoch": 2.35051630545692, + "grad_norm": 1.8819994926452637, + "learning_rate": 1.9109587507227573e-05, + "loss": 0.5461, + "step": 14399 + }, + { + "epoch": 2.3506795640994245, + "grad_norm": 1.9560438394546509, + "learning_rate": 1.910945660375305e-05, + "loss": 0.6483, + "step": 14400 + }, + { + "epoch": 2.350842822741929, + "grad_norm": 1.5872201919555664, + "learning_rate": 1.9109325691105314e-05, + "loss": 0.5485, + "step": 14401 + }, + { + "epoch": 2.3510060813844333, + "grad_norm": 1.6689941883087158, + "learning_rate": 1.9109194769284484e-05, + "loss": 0.5875, + "step": 14402 + }, + { + "epoch": 2.351169340026938, + "grad_norm": 1.8009836673736572, + "learning_rate": 1.9109063838290702e-05, + "loss": 0.651, + "step": 14403 + }, + { + "epoch": 2.351332598669442, + "grad_norm": 1.8500860929489136, + "learning_rate": 1.910893289812409e-05, + "loss": 0.6987, + "step": 14404 + }, + { + "epoch": 2.3514958573119467, + "grad_norm": 1.9255222082138062, + "learning_rate": 1.9108801948784788e-05, + "loss": 0.5873, + "step": 14405 + }, + { + "epoch": 2.3516591159544507, + "grad_norm": 1.519940733909607, + "learning_rate": 1.9108670990272924e-05, + "loss": 0.5779, + "step": 14406 + }, + { + "epoch": 2.351822374596955, + "grad_norm": 1.5802947282791138, + "learning_rate": 1.9108540022588632e-05, + "loss": 0.494, + "step": 14407 + }, + { + "epoch": 2.3519856332394595, + "grad_norm": 2.043674945831299, + "learning_rate": 1.910840904573204e-05, + "loss": 0.5983, + "step": 14408 + }, + { + "epoch": 2.352148891881964, + "grad_norm": 1.8385940790176392, + "learning_rate": 1.910827805970328e-05, + "loss": 0.6363, + "step": 14409 + }, + { + "epoch": 2.3523121505244684, + "grad_norm": 1.7474274635314941, + "learning_rate": 1.9108147064502494e-05, + "loss": 0.6258, + "step": 14410 + }, + { + "epoch": 2.352475409166973, + "grad_norm": 1.6245766878128052, + "learning_rate": 1.9108016060129803e-05, + "loss": 0.5469, + "step": 14411 + }, + { + "epoch": 2.3526386678094773, + "grad_norm": 1.6365611553192139, + "learning_rate": 1.910788504658534e-05, + "loss": 0.5482, + "step": 14412 + }, + { + "epoch": 2.3528019264519817, + "grad_norm": 1.7441145181655884, + "learning_rate": 1.910775402386924e-05, + "loss": 0.5945, + "step": 14413 + }, + { + "epoch": 2.3529651850944857, + "grad_norm": 1.8089488744735718, + "learning_rate": 1.9107622991981637e-05, + "loss": 0.5811, + "step": 14414 + }, + { + "epoch": 2.35312844373699, + "grad_norm": 1.6562858819961548, + "learning_rate": 1.9107491950922658e-05, + "loss": 0.534, + "step": 14415 + }, + { + "epoch": 2.3532917023794946, + "grad_norm": 1.5324574708938599, + "learning_rate": 1.9107360900692437e-05, + "loss": 0.4788, + "step": 14416 + }, + { + "epoch": 2.353454961021999, + "grad_norm": 1.6852049827575684, + "learning_rate": 1.9107229841291106e-05, + "loss": 0.5934, + "step": 14417 + }, + { + "epoch": 2.3536182196645035, + "grad_norm": 1.945752501487732, + "learning_rate": 1.9107098772718796e-05, + "loss": 0.6156, + "step": 14418 + }, + { + "epoch": 2.353781478307008, + "grad_norm": 1.561339259147644, + "learning_rate": 1.910696769497564e-05, + "loss": 0.5102, + "step": 14419 + }, + { + "epoch": 2.3539447369495123, + "grad_norm": 1.7985044717788696, + "learning_rate": 1.910683660806177e-05, + "loss": 0.6483, + "step": 14420 + }, + { + "epoch": 2.3541079955920168, + "grad_norm": 1.8315913677215576, + "learning_rate": 1.910670551197732e-05, + "loss": 0.6206, + "step": 14421 + }, + { + "epoch": 2.354271254234521, + "grad_norm": 1.9382907152175903, + "learning_rate": 1.910657440672242e-05, + "loss": 0.575, + "step": 14422 + }, + { + "epoch": 2.3544345128770257, + "grad_norm": 1.43495512008667, + "learning_rate": 1.91064432922972e-05, + "loss": 0.5026, + "step": 14423 + }, + { + "epoch": 2.3545977715195296, + "grad_norm": 2.062392473220825, + "learning_rate": 1.9106312168701797e-05, + "loss": 0.6579, + "step": 14424 + }, + { + "epoch": 2.354761030162034, + "grad_norm": 1.9551305770874023, + "learning_rate": 1.9106181035936337e-05, + "loss": 0.6234, + "step": 14425 + }, + { + "epoch": 2.3549242888045385, + "grad_norm": 1.7322885990142822, + "learning_rate": 1.9106049894000955e-05, + "loss": 0.5475, + "step": 14426 + }, + { + "epoch": 2.355087547447043, + "grad_norm": 1.791019320487976, + "learning_rate": 1.9105918742895785e-05, + "loss": 0.6421, + "step": 14427 + }, + { + "epoch": 2.3552508060895474, + "grad_norm": 1.533620834350586, + "learning_rate": 1.9105787582620957e-05, + "loss": 0.5514, + "step": 14428 + }, + { + "epoch": 2.355414064732052, + "grad_norm": 1.7709577083587646, + "learning_rate": 1.9105656413176602e-05, + "loss": 0.5494, + "step": 14429 + }, + { + "epoch": 2.3555773233745563, + "grad_norm": 1.6765358448028564, + "learning_rate": 1.9105525234562858e-05, + "loss": 0.5618, + "step": 14430 + }, + { + "epoch": 2.3557405820170607, + "grad_norm": 1.8906395435333252, + "learning_rate": 1.9105394046779846e-05, + "loss": 0.5855, + "step": 14431 + }, + { + "epoch": 2.3559038406595647, + "grad_norm": 1.9154672622680664, + "learning_rate": 1.910526284982771e-05, + "loss": 0.6469, + "step": 14432 + }, + { + "epoch": 2.356067099302069, + "grad_norm": 2.0492875576019287, + "learning_rate": 1.910513164370657e-05, + "loss": 0.6619, + "step": 14433 + }, + { + "epoch": 2.3562303579445736, + "grad_norm": 2.400404930114746, + "learning_rate": 1.910500042841657e-05, + "loss": 0.7213, + "step": 14434 + }, + { + "epoch": 2.356393616587078, + "grad_norm": 2.19319748878479, + "learning_rate": 1.9104869203957835e-05, + "loss": 0.7499, + "step": 14435 + }, + { + "epoch": 2.3565568752295825, + "grad_norm": 1.7461258172988892, + "learning_rate": 1.91047379703305e-05, + "loss": 0.5462, + "step": 14436 + }, + { + "epoch": 2.356720133872087, + "grad_norm": 1.8114628791809082, + "learning_rate": 1.9104606727534698e-05, + "loss": 0.5827, + "step": 14437 + }, + { + "epoch": 2.3568833925145913, + "grad_norm": 2.1949691772460938, + "learning_rate": 1.910447547557056e-05, + "loss": 0.6456, + "step": 14438 + }, + { + "epoch": 2.3570466511570958, + "grad_norm": 1.6933521032333374, + "learning_rate": 1.910434421443821e-05, + "loss": 0.5158, + "step": 14439 + }, + { + "epoch": 2.3572099097996, + "grad_norm": 1.886391043663025, + "learning_rate": 1.9104212944137796e-05, + "loss": 0.5743, + "step": 14440 + }, + { + "epoch": 2.3573731684421046, + "grad_norm": 1.5729193687438965, + "learning_rate": 1.9104081664669437e-05, + "loss": 0.51, + "step": 14441 + }, + { + "epoch": 2.3575364270846086, + "grad_norm": 1.7811365127563477, + "learning_rate": 1.9103950376033276e-05, + "loss": 0.5074, + "step": 14442 + }, + { + "epoch": 2.357699685727113, + "grad_norm": 1.8287787437438965, + "learning_rate": 1.9103819078229432e-05, + "loss": 0.6224, + "step": 14443 + }, + { + "epoch": 2.3578629443696175, + "grad_norm": 1.912573218345642, + "learning_rate": 1.910368777125805e-05, + "loss": 0.5924, + "step": 14444 + }, + { + "epoch": 2.358026203012122, + "grad_norm": 1.4859349727630615, + "learning_rate": 1.9103556455119253e-05, + "loss": 0.4939, + "step": 14445 + }, + { + "epoch": 2.3581894616546264, + "grad_norm": 1.7044672966003418, + "learning_rate": 1.910342512981318e-05, + "loss": 0.5543, + "step": 14446 + }, + { + "epoch": 2.358352720297131, + "grad_norm": 1.663388729095459, + "learning_rate": 1.910329379533996e-05, + "loss": 0.5162, + "step": 14447 + }, + { + "epoch": 2.3585159789396353, + "grad_norm": 1.9986932277679443, + "learning_rate": 1.9103162451699726e-05, + "loss": 0.6826, + "step": 14448 + }, + { + "epoch": 2.3586792375821393, + "grad_norm": 1.6204969882965088, + "learning_rate": 1.910303109889261e-05, + "loss": 0.5178, + "step": 14449 + }, + { + "epoch": 2.3588424962246437, + "grad_norm": 1.6501067876815796, + "learning_rate": 1.9102899736918742e-05, + "loss": 0.5335, + "step": 14450 + }, + { + "epoch": 2.359005754867148, + "grad_norm": 1.5919413566589355, + "learning_rate": 1.9102768365778258e-05, + "loss": 0.5026, + "step": 14451 + }, + { + "epoch": 2.3591690135096526, + "grad_norm": 1.7240034341812134, + "learning_rate": 1.9102636985471288e-05, + "loss": 0.5139, + "step": 14452 + }, + { + "epoch": 2.359332272152157, + "grad_norm": 1.749560832977295, + "learning_rate": 1.9102505595997965e-05, + "loss": 0.6168, + "step": 14453 + }, + { + "epoch": 2.3594955307946615, + "grad_norm": 1.7278269529342651, + "learning_rate": 1.9102374197358423e-05, + "loss": 0.5518, + "step": 14454 + }, + { + "epoch": 2.359658789437166, + "grad_norm": 2.265058994293213, + "learning_rate": 1.910224278955279e-05, + "loss": 0.736, + "step": 14455 + }, + { + "epoch": 2.3598220480796703, + "grad_norm": 1.7528774738311768, + "learning_rate": 1.91021113725812e-05, + "loss": 0.5364, + "step": 14456 + }, + { + "epoch": 2.3599853067221748, + "grad_norm": 1.5723092555999756, + "learning_rate": 1.9101979946443787e-05, + "loss": 0.5318, + "step": 14457 + }, + { + "epoch": 2.360148565364679, + "grad_norm": 1.6260379552841187, + "learning_rate": 1.9101848511140682e-05, + "loss": 0.52, + "step": 14458 + }, + { + "epoch": 2.360311824007183, + "grad_norm": 2.1655828952789307, + "learning_rate": 1.9101717066672024e-05, + "loss": 0.6565, + "step": 14459 + }, + { + "epoch": 2.3604750826496876, + "grad_norm": 1.7303217649459839, + "learning_rate": 1.9101585613037933e-05, + "loss": 0.6312, + "step": 14460 + }, + { + "epoch": 2.360638341292192, + "grad_norm": 2.1012842655181885, + "learning_rate": 1.910145415023855e-05, + "loss": 0.6315, + "step": 14461 + }, + { + "epoch": 2.3608015999346965, + "grad_norm": 1.7669142484664917, + "learning_rate": 1.9101322678274002e-05, + "loss": 0.5596, + "step": 14462 + }, + { + "epoch": 2.360964858577201, + "grad_norm": 1.7769362926483154, + "learning_rate": 1.910119119714443e-05, + "loss": 0.5343, + "step": 14463 + }, + { + "epoch": 2.3611281172197054, + "grad_norm": 1.7003995180130005, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.5863, + "step": 14464 + }, + { + "epoch": 2.36129137586221, + "grad_norm": 1.8442423343658447, + "learning_rate": 1.910092820739072e-05, + "loss": 0.5672, + "step": 14465 + }, + { + "epoch": 2.3614546345047143, + "grad_norm": 1.6677004098892212, + "learning_rate": 1.9100796698766854e-05, + "loss": 0.5233, + "step": 14466 + }, + { + "epoch": 2.3616178931472183, + "grad_norm": 1.6248165369033813, + "learning_rate": 1.9100665180978485e-05, + "loss": 0.4656, + "step": 14467 + }, + { + "epoch": 2.3617811517897227, + "grad_norm": 1.4707319736480713, + "learning_rate": 1.9100533654025748e-05, + "loss": 0.4492, + "step": 14468 + }, + { + "epoch": 2.361944410432227, + "grad_norm": 1.9002872705459595, + "learning_rate": 1.9100402117908775e-05, + "loss": 0.5751, + "step": 14469 + }, + { + "epoch": 2.3621076690747316, + "grad_norm": 1.5169641971588135, + "learning_rate": 1.9100270572627705e-05, + "loss": 0.535, + "step": 14470 + }, + { + "epoch": 2.362270927717236, + "grad_norm": 2.03393292427063, + "learning_rate": 1.910013901818266e-05, + "loss": 0.6056, + "step": 14471 + }, + { + "epoch": 2.3624341863597405, + "grad_norm": 2.0755615234375, + "learning_rate": 1.910000745457378e-05, + "loss": 0.6416, + "step": 14472 + }, + { + "epoch": 2.362597445002245, + "grad_norm": 1.8453636169433594, + "learning_rate": 1.909987588180119e-05, + "loss": 0.5917, + "step": 14473 + }, + { + "epoch": 2.3627607036447493, + "grad_norm": 1.7990072965621948, + "learning_rate": 1.909974429986503e-05, + "loss": 0.584, + "step": 14474 + }, + { + "epoch": 2.3629239622872538, + "grad_norm": 1.7349976301193237, + "learning_rate": 1.9099612708765432e-05, + "loss": 0.5147, + "step": 14475 + }, + { + "epoch": 2.363087220929758, + "grad_norm": 2.2915091514587402, + "learning_rate": 1.9099481108502526e-05, + "loss": 0.9908, + "step": 14476 + }, + { + "epoch": 2.363250479572262, + "grad_norm": 1.7935649156570435, + "learning_rate": 1.9099349499076443e-05, + "loss": 0.5623, + "step": 14477 + }, + { + "epoch": 2.3634137382147666, + "grad_norm": 2.100945472717285, + "learning_rate": 1.9099217880487318e-05, + "loss": 0.7619, + "step": 14478 + }, + { + "epoch": 2.363576996857271, + "grad_norm": 1.7584258317947388, + "learning_rate": 1.9099086252735285e-05, + "loss": 0.496, + "step": 14479 + }, + { + "epoch": 2.3637402554997755, + "grad_norm": 1.7087547779083252, + "learning_rate": 1.9098954615820477e-05, + "loss": 0.4775, + "step": 14480 + }, + { + "epoch": 2.36390351414228, + "grad_norm": 2.043039321899414, + "learning_rate": 1.9098822969743018e-05, + "loss": 0.6354, + "step": 14481 + }, + { + "epoch": 2.3640667727847844, + "grad_norm": 1.8453885316848755, + "learning_rate": 1.909869131450305e-05, + "loss": 0.6038, + "step": 14482 + }, + { + "epoch": 2.364230031427289, + "grad_norm": 1.8179603815078735, + "learning_rate": 1.9098559650100702e-05, + "loss": 0.5857, + "step": 14483 + }, + { + "epoch": 2.3643932900697933, + "grad_norm": 1.8247332572937012, + "learning_rate": 1.9098427976536103e-05, + "loss": 0.6672, + "step": 14484 + }, + { + "epoch": 2.3645565487122973, + "grad_norm": 1.4729063510894775, + "learning_rate": 1.9098296293809396e-05, + "loss": 0.4669, + "step": 14485 + }, + { + "epoch": 2.3647198073548017, + "grad_norm": 1.6245359182357788, + "learning_rate": 1.9098164601920702e-05, + "loss": 0.5561, + "step": 14486 + }, + { + "epoch": 2.364883065997306, + "grad_norm": 1.9141895771026611, + "learning_rate": 1.909803290087016e-05, + "loss": 0.6332, + "step": 14487 + }, + { + "epoch": 2.3650463246398106, + "grad_norm": 1.5456106662750244, + "learning_rate": 1.9097901190657902e-05, + "loss": 0.5045, + "step": 14488 + }, + { + "epoch": 2.365209583282315, + "grad_norm": 1.813405990600586, + "learning_rate": 1.909776947128406e-05, + "loss": 0.529, + "step": 14489 + }, + { + "epoch": 2.3653728419248194, + "grad_norm": 1.9589474201202393, + "learning_rate": 1.9097637742748768e-05, + "loss": 0.663, + "step": 14490 + }, + { + "epoch": 2.365536100567324, + "grad_norm": 1.8977588415145874, + "learning_rate": 1.9097506005052153e-05, + "loss": 0.5374, + "step": 14491 + }, + { + "epoch": 2.3656993592098283, + "grad_norm": 1.9282033443450928, + "learning_rate": 1.9097374258194355e-05, + "loss": 0.665, + "step": 14492 + }, + { + "epoch": 2.3658626178523328, + "grad_norm": 1.8801733255386353, + "learning_rate": 1.9097242502175503e-05, + "loss": 0.512, + "step": 14493 + }, + { + "epoch": 2.3660258764948368, + "grad_norm": 1.8180233240127563, + "learning_rate": 1.909711073699573e-05, + "loss": 0.6267, + "step": 14494 + }, + { + "epoch": 2.366189135137341, + "grad_norm": 1.6442453861236572, + "learning_rate": 1.9096978962655167e-05, + "loss": 0.5371, + "step": 14495 + }, + { + "epoch": 2.3663523937798456, + "grad_norm": 2.0334718227386475, + "learning_rate": 1.909684717915395e-05, + "loss": 0.5985, + "step": 14496 + }, + { + "epoch": 2.36651565242235, + "grad_norm": 1.63556706905365, + "learning_rate": 1.9096715386492214e-05, + "loss": 0.5117, + "step": 14497 + }, + { + "epoch": 2.3666789110648545, + "grad_norm": 2.0091745853424072, + "learning_rate": 1.9096583584670082e-05, + "loss": 0.6487, + "step": 14498 + }, + { + "epoch": 2.366842169707359, + "grad_norm": 1.5969957113265991, + "learning_rate": 1.9096451773687695e-05, + "loss": 0.5736, + "step": 14499 + }, + { + "epoch": 2.3670054283498634, + "grad_norm": 1.6788409948349, + "learning_rate": 1.9096319953545186e-05, + "loss": 0.6032, + "step": 14500 + }, + { + "epoch": 2.367168686992368, + "grad_norm": 1.6106635332107544, + "learning_rate": 1.9096188124242683e-05, + "loss": 0.5147, + "step": 14501 + }, + { + "epoch": 2.367331945634872, + "grad_norm": 1.7689419984817505, + "learning_rate": 1.909605628578032e-05, + "loss": 0.6423, + "step": 14502 + }, + { + "epoch": 2.3674952042773763, + "grad_norm": 1.7424798011779785, + "learning_rate": 1.9095924438158235e-05, + "loss": 0.539, + "step": 14503 + }, + { + "epoch": 2.3676584629198807, + "grad_norm": 1.6533763408660889, + "learning_rate": 1.909579258137655e-05, + "loss": 0.5102, + "step": 14504 + }, + { + "epoch": 2.367821721562385, + "grad_norm": 1.7636202573776245, + "learning_rate": 1.909566071543541e-05, + "loss": 0.6133, + "step": 14505 + }, + { + "epoch": 2.3679849802048896, + "grad_norm": 1.7142084836959839, + "learning_rate": 1.909552884033494e-05, + "loss": 0.587, + "step": 14506 + }, + { + "epoch": 2.368148238847394, + "grad_norm": 1.8404573202133179, + "learning_rate": 1.9095396956075276e-05, + "loss": 0.4405, + "step": 14507 + }, + { + "epoch": 2.3683114974898984, + "grad_norm": 1.5434514284133911, + "learning_rate": 1.9095265062656546e-05, + "loss": 0.5483, + "step": 14508 + }, + { + "epoch": 2.368474756132403, + "grad_norm": 2.1177189350128174, + "learning_rate": 1.909513316007889e-05, + "loss": 0.6488, + "step": 14509 + }, + { + "epoch": 2.3686380147749073, + "grad_norm": 1.8193331956863403, + "learning_rate": 1.9095001248342436e-05, + "loss": 0.6492, + "step": 14510 + }, + { + "epoch": 2.3688012734174118, + "grad_norm": 1.6815204620361328, + "learning_rate": 1.9094869327447316e-05, + "loss": 0.4814, + "step": 14511 + }, + { + "epoch": 2.3689645320599158, + "grad_norm": 1.64766263961792, + "learning_rate": 1.909473739739367e-05, + "loss": 0.5084, + "step": 14512 + }, + { + "epoch": 2.36912779070242, + "grad_norm": 2.028972625732422, + "learning_rate": 1.909460545818162e-05, + "loss": 0.5884, + "step": 14513 + }, + { + "epoch": 2.3692910493449246, + "grad_norm": 1.8911207914352417, + "learning_rate": 1.909447350981131e-05, + "loss": 0.5427, + "step": 14514 + }, + { + "epoch": 2.369454307987429, + "grad_norm": 2.079075813293457, + "learning_rate": 1.9094341552282866e-05, + "loss": 0.6702, + "step": 14515 + }, + { + "epoch": 2.3696175666299335, + "grad_norm": 1.9665350914001465, + "learning_rate": 1.909420958559642e-05, + "loss": 0.702, + "step": 14516 + }, + { + "epoch": 2.369780825272438, + "grad_norm": 2.0632164478302, + "learning_rate": 1.9094077609752108e-05, + "loss": 0.6227, + "step": 14517 + }, + { + "epoch": 2.3699440839149424, + "grad_norm": 1.4412235021591187, + "learning_rate": 1.9093945624750065e-05, + "loss": 0.4507, + "step": 14518 + }, + { + "epoch": 2.370107342557447, + "grad_norm": 1.6485024690628052, + "learning_rate": 1.9093813630590417e-05, + "loss": 0.4806, + "step": 14519 + }, + { + "epoch": 2.370270601199951, + "grad_norm": 1.6636178493499756, + "learning_rate": 1.9093681627273306e-05, + "loss": 0.5798, + "step": 14520 + }, + { + "epoch": 2.3704338598424552, + "grad_norm": 1.6252580881118774, + "learning_rate": 1.9093549614798858e-05, + "loss": 0.5549, + "step": 14521 + }, + { + "epoch": 2.3705971184849597, + "grad_norm": 1.8036388158798218, + "learning_rate": 1.9093417593167207e-05, + "loss": 0.5645, + "step": 14522 + }, + { + "epoch": 2.370760377127464, + "grad_norm": 1.6162010431289673, + "learning_rate": 1.9093285562378487e-05, + "loss": 0.5239, + "step": 14523 + }, + { + "epoch": 2.3709236357699686, + "grad_norm": 2.0794332027435303, + "learning_rate": 1.9093153522432832e-05, + "loss": 0.5933, + "step": 14524 + }, + { + "epoch": 2.371086894412473, + "grad_norm": 1.7037785053253174, + "learning_rate": 1.9093021473330372e-05, + "loss": 0.5221, + "step": 14525 + }, + { + "epoch": 2.3712501530549774, + "grad_norm": 1.7603610754013062, + "learning_rate": 1.9092889415071245e-05, + "loss": 0.5339, + "step": 14526 + }, + { + "epoch": 2.371413411697482, + "grad_norm": 1.7453241348266602, + "learning_rate": 1.909275734765558e-05, + "loss": 0.6, + "step": 14527 + }, + { + "epoch": 2.3715766703399863, + "grad_norm": 1.862191081047058, + "learning_rate": 1.909262527108351e-05, + "loss": 0.6564, + "step": 14528 + }, + { + "epoch": 2.3717399289824908, + "grad_norm": 1.6965841054916382, + "learning_rate": 1.9092493185355168e-05, + "loss": 0.6068, + "step": 14529 + }, + { + "epoch": 2.3719031876249947, + "grad_norm": 1.6654671430587769, + "learning_rate": 1.9092361090470688e-05, + "loss": 0.6282, + "step": 14530 + }, + { + "epoch": 2.372066446267499, + "grad_norm": 1.637475609779358, + "learning_rate": 1.9092228986430203e-05, + "loss": 0.5804, + "step": 14531 + }, + { + "epoch": 2.3722297049100036, + "grad_norm": 1.7522648572921753, + "learning_rate": 1.9092096873233847e-05, + "loss": 0.5775, + "step": 14532 + }, + { + "epoch": 2.372392963552508, + "grad_norm": 1.7098793983459473, + "learning_rate": 1.909196475088175e-05, + "loss": 0.5717, + "step": 14533 + }, + { + "epoch": 2.3725562221950125, + "grad_norm": 1.4197559356689453, + "learning_rate": 1.9091832619374045e-05, + "loss": 0.5261, + "step": 14534 + }, + { + "epoch": 2.372719480837517, + "grad_norm": 1.7026376724243164, + "learning_rate": 1.909170047871087e-05, + "loss": 0.5689, + "step": 14535 + }, + { + "epoch": 2.3728827394800214, + "grad_norm": 2.0012881755828857, + "learning_rate": 1.9091568328892354e-05, + "loss": 0.5652, + "step": 14536 + }, + { + "epoch": 2.3730459981225254, + "grad_norm": 1.7350362539291382, + "learning_rate": 1.9091436169918634e-05, + "loss": 0.5033, + "step": 14537 + }, + { + "epoch": 2.37320925676503, + "grad_norm": 1.8039628267288208, + "learning_rate": 1.9091304001789837e-05, + "loss": 0.5297, + "step": 14538 + }, + { + "epoch": 2.3733725154075342, + "grad_norm": 1.4128865003585815, + "learning_rate": 1.90911718245061e-05, + "loss": 0.5091, + "step": 14539 + }, + { + "epoch": 2.3735357740500387, + "grad_norm": 1.803348183631897, + "learning_rate": 1.9091039638067555e-05, + "loss": 0.5167, + "step": 14540 + }, + { + "epoch": 2.373699032692543, + "grad_norm": 1.8317461013793945, + "learning_rate": 1.9090907442474334e-05, + "loss": 0.4822, + "step": 14541 + }, + { + "epoch": 2.3738622913350476, + "grad_norm": 1.6216344833374023, + "learning_rate": 1.9090775237726575e-05, + "loss": 0.465, + "step": 14542 + }, + { + "epoch": 2.374025549977552, + "grad_norm": 1.6102354526519775, + "learning_rate": 1.909064302382441e-05, + "loss": 0.5395, + "step": 14543 + }, + { + "epoch": 2.3741888086200564, + "grad_norm": 1.7881611585617065, + "learning_rate": 1.9090510800767964e-05, + "loss": 0.5165, + "step": 14544 + }, + { + "epoch": 2.374352067262561, + "grad_norm": 1.8531694412231445, + "learning_rate": 1.9090378568557377e-05, + "loss": 0.5406, + "step": 14545 + }, + { + "epoch": 2.3745153259050653, + "grad_norm": 1.7482800483703613, + "learning_rate": 1.9090246327192783e-05, + "loss": 0.5487, + "step": 14546 + }, + { + "epoch": 2.3746785845475693, + "grad_norm": 1.7140703201293945, + "learning_rate": 1.909011407667431e-05, + "loss": 0.5155, + "step": 14547 + }, + { + "epoch": 2.3748418431900737, + "grad_norm": 1.9508895874023438, + "learning_rate": 1.9089981817002102e-05, + "loss": 0.6524, + "step": 14548 + }, + { + "epoch": 2.375005101832578, + "grad_norm": 1.6928495168685913, + "learning_rate": 1.9089849548176276e-05, + "loss": 0.6571, + "step": 14549 + }, + { + "epoch": 2.3751683604750826, + "grad_norm": 1.7684071063995361, + "learning_rate": 1.9089717270196982e-05, + "loss": 0.5536, + "step": 14550 + }, + { + "epoch": 2.375331619117587, + "grad_norm": 1.759058952331543, + "learning_rate": 1.908958498306434e-05, + "loss": 0.5448, + "step": 14551 + }, + { + "epoch": 2.3754948777600915, + "grad_norm": 1.5509284734725952, + "learning_rate": 1.908945268677849e-05, + "loss": 0.4936, + "step": 14552 + }, + { + "epoch": 2.375658136402596, + "grad_norm": 1.928565502166748, + "learning_rate": 1.908932038133956e-05, + "loss": 0.612, + "step": 14553 + }, + { + "epoch": 2.3758213950451004, + "grad_norm": 1.989676833152771, + "learning_rate": 1.908918806674769e-05, + "loss": 0.5859, + "step": 14554 + }, + { + "epoch": 2.3759846536876044, + "grad_norm": 1.637134313583374, + "learning_rate": 1.908905574300301e-05, + "loss": 0.5531, + "step": 14555 + }, + { + "epoch": 2.376147912330109, + "grad_norm": 1.8181407451629639, + "learning_rate": 1.9088923410105654e-05, + "loss": 0.6026, + "step": 14556 + }, + { + "epoch": 2.3763111709726132, + "grad_norm": 1.7707496881484985, + "learning_rate": 1.9088791068055755e-05, + "loss": 0.5673, + "step": 14557 + }, + { + "epoch": 2.3764744296151177, + "grad_norm": 1.8202983140945435, + "learning_rate": 1.9088658716853444e-05, + "loss": 0.5302, + "step": 14558 + }, + { + "epoch": 2.376637688257622, + "grad_norm": 1.8425580263137817, + "learning_rate": 1.9088526356498854e-05, + "loss": 0.6265, + "step": 14559 + }, + { + "epoch": 2.3768009469001266, + "grad_norm": 1.8676517009735107, + "learning_rate": 1.9088393986992124e-05, + "loss": 0.5433, + "step": 14560 + }, + { + "epoch": 2.376964205542631, + "grad_norm": 1.8974515199661255, + "learning_rate": 1.9088261608333382e-05, + "loss": 0.5695, + "step": 14561 + }, + { + "epoch": 2.3771274641851354, + "grad_norm": 1.7107239961624146, + "learning_rate": 1.9088129220522765e-05, + "loss": 0.5522, + "step": 14562 + }, + { + "epoch": 2.37729072282764, + "grad_norm": 2.1135764122009277, + "learning_rate": 1.9087996823560404e-05, + "loss": 0.6722, + "step": 14563 + }, + { + "epoch": 2.3774539814701443, + "grad_norm": 1.9616618156433105, + "learning_rate": 1.9087864417446428e-05, + "loss": 0.6052, + "step": 14564 + }, + { + "epoch": 2.3776172401126483, + "grad_norm": 1.6884421110153198, + "learning_rate": 1.9087732002180982e-05, + "loss": 0.5562, + "step": 14565 + }, + { + "epoch": 2.3777804987551527, + "grad_norm": 1.6799652576446533, + "learning_rate": 1.9087599577764186e-05, + "loss": 0.489, + "step": 14566 + }, + { + "epoch": 2.377943757397657, + "grad_norm": 1.6229826211929321, + "learning_rate": 1.9087467144196185e-05, + "loss": 0.567, + "step": 14567 + }, + { + "epoch": 2.3781070160401616, + "grad_norm": 1.6193634271621704, + "learning_rate": 1.9087334701477104e-05, + "loss": 0.5388, + "step": 14568 + }, + { + "epoch": 2.378270274682666, + "grad_norm": 1.818680763244629, + "learning_rate": 1.908720224960708e-05, + "loss": 0.5694, + "step": 14569 + }, + { + "epoch": 2.3784335333251705, + "grad_norm": 1.7210971117019653, + "learning_rate": 1.9087069788586245e-05, + "loss": 0.5807, + "step": 14570 + }, + { + "epoch": 2.378596791967675, + "grad_norm": 2.1259872913360596, + "learning_rate": 1.9086937318414735e-05, + "loss": 0.7285, + "step": 14571 + }, + { + "epoch": 2.3787600506101794, + "grad_norm": 1.6096590757369995, + "learning_rate": 1.908680483909268e-05, + "loss": 0.5941, + "step": 14572 + }, + { + "epoch": 2.3789233092526834, + "grad_norm": 1.6167224645614624, + "learning_rate": 1.9086672350620213e-05, + "loss": 0.5888, + "step": 14573 + }, + { + "epoch": 2.379086567895188, + "grad_norm": 1.708115577697754, + "learning_rate": 1.908653985299747e-05, + "loss": 0.5102, + "step": 14574 + }, + { + "epoch": 2.3792498265376922, + "grad_norm": 2.078360080718994, + "learning_rate": 1.908640734622459e-05, + "loss": 0.7995, + "step": 14575 + }, + { + "epoch": 2.3794130851801967, + "grad_norm": 1.5572773218154907, + "learning_rate": 1.908627483030169e-05, + "loss": 0.4823, + "step": 14576 + }, + { + "epoch": 2.379576343822701, + "grad_norm": 2.022814989089966, + "learning_rate": 1.908614230522892e-05, + "loss": 0.6716, + "step": 14577 + }, + { + "epoch": 2.3797396024652056, + "grad_norm": 1.779279112815857, + "learning_rate": 1.9086009771006405e-05, + "loss": 0.6936, + "step": 14578 + }, + { + "epoch": 2.37990286110771, + "grad_norm": 1.8015373945236206, + "learning_rate": 1.908587722763428e-05, + "loss": 0.6473, + "step": 14579 + }, + { + "epoch": 2.3800661197502144, + "grad_norm": 1.9395558834075928, + "learning_rate": 1.9085744675112682e-05, + "loss": 0.6401, + "step": 14580 + }, + { + "epoch": 2.380229378392719, + "grad_norm": 1.5874874591827393, + "learning_rate": 1.9085612113441742e-05, + "loss": 0.5598, + "step": 14581 + }, + { + "epoch": 2.380392637035223, + "grad_norm": 2.0969114303588867, + "learning_rate": 1.9085479542621593e-05, + "loss": 0.6516, + "step": 14582 + }, + { + "epoch": 2.3805558956777273, + "grad_norm": 1.743098497390747, + "learning_rate": 1.9085346962652366e-05, + "loss": 0.6153, + "step": 14583 + }, + { + "epoch": 2.3807191543202317, + "grad_norm": 1.717746615409851, + "learning_rate": 1.9085214373534198e-05, + "loss": 0.5523, + "step": 14584 + }, + { + "epoch": 2.380882412962736, + "grad_norm": 1.7856336832046509, + "learning_rate": 1.908508177526722e-05, + "loss": 0.5338, + "step": 14585 + }, + { + "epoch": 2.3810456716052406, + "grad_norm": 1.3245549201965332, + "learning_rate": 1.9084949167851567e-05, + "loss": 0.4829, + "step": 14586 + }, + { + "epoch": 2.381208930247745, + "grad_norm": 1.897556185722351, + "learning_rate": 1.9084816551287376e-05, + "loss": 0.5764, + "step": 14587 + }, + { + "epoch": 2.3813721888902495, + "grad_norm": 1.4993222951889038, + "learning_rate": 1.9084683925574772e-05, + "loss": 0.509, + "step": 14588 + }, + { + "epoch": 2.381535447532754, + "grad_norm": 1.6357264518737793, + "learning_rate": 1.90845512907139e-05, + "loss": 0.5312, + "step": 14589 + }, + { + "epoch": 2.381698706175258, + "grad_norm": 1.6717731952667236, + "learning_rate": 1.9084418646704884e-05, + "loss": 0.5115, + "step": 14590 + }, + { + "epoch": 2.3818619648177624, + "grad_norm": 1.9612261056900024, + "learning_rate": 1.908428599354786e-05, + "loss": 0.5858, + "step": 14591 + }, + { + "epoch": 2.382025223460267, + "grad_norm": 1.5940357446670532, + "learning_rate": 1.908415333124296e-05, + "loss": 0.5609, + "step": 14592 + }, + { + "epoch": 2.3821884821027712, + "grad_norm": 1.9829597473144531, + "learning_rate": 1.9084020659790328e-05, + "loss": 0.5778, + "step": 14593 + }, + { + "epoch": 2.3823517407452757, + "grad_norm": 1.8137401342391968, + "learning_rate": 1.9083887979190084e-05, + "loss": 0.6343, + "step": 14594 + }, + { + "epoch": 2.38251499938778, + "grad_norm": 1.647303581237793, + "learning_rate": 1.9083755289442368e-05, + "loss": 0.5873, + "step": 14595 + }, + { + "epoch": 2.3826782580302845, + "grad_norm": 1.4914467334747314, + "learning_rate": 1.9083622590547313e-05, + "loss": 0.4778, + "step": 14596 + }, + { + "epoch": 2.382841516672789, + "grad_norm": 1.6948738098144531, + "learning_rate": 1.9083489882505052e-05, + "loss": 0.6132, + "step": 14597 + }, + { + "epoch": 2.3830047753152934, + "grad_norm": 2.0593371391296387, + "learning_rate": 1.908335716531572e-05, + "loss": 0.6427, + "step": 14598 + }, + { + "epoch": 2.383168033957798, + "grad_norm": 2.097874164581299, + "learning_rate": 1.908322443897945e-05, + "loss": 0.5341, + "step": 14599 + }, + { + "epoch": 2.383331292600302, + "grad_norm": 1.9144996404647827, + "learning_rate": 1.9083091703496373e-05, + "loss": 0.5461, + "step": 14600 + }, + { + "epoch": 2.3834945512428063, + "grad_norm": 2.0969812870025635, + "learning_rate": 1.9082958958866628e-05, + "loss": 0.5864, + "step": 14601 + }, + { + "epoch": 2.3836578098853107, + "grad_norm": 1.8062586784362793, + "learning_rate": 1.9082826205090343e-05, + "loss": 0.6145, + "step": 14602 + }, + { + "epoch": 2.383821068527815, + "grad_norm": 1.794722080230713, + "learning_rate": 1.9082693442167658e-05, + "loss": 0.5225, + "step": 14603 + }, + { + "epoch": 2.3839843271703196, + "grad_norm": 1.8394451141357422, + "learning_rate": 1.90825606700987e-05, + "loss": 0.5762, + "step": 14604 + }, + { + "epoch": 2.384147585812824, + "grad_norm": 1.8057453632354736, + "learning_rate": 1.9082427888883604e-05, + "loss": 0.6581, + "step": 14605 + }, + { + "epoch": 2.3843108444553285, + "grad_norm": 1.522560715675354, + "learning_rate": 1.9082295098522513e-05, + "loss": 0.4934, + "step": 14606 + }, + { + "epoch": 2.384474103097833, + "grad_norm": 1.5829105377197266, + "learning_rate": 1.9082162299015547e-05, + "loss": 0.503, + "step": 14607 + }, + { + "epoch": 2.384637361740337, + "grad_norm": 2.348994731903076, + "learning_rate": 1.9082029490362844e-05, + "loss": 0.6501, + "step": 14608 + }, + { + "epoch": 2.3848006203828414, + "grad_norm": 1.94192636013031, + "learning_rate": 1.9081896672564547e-05, + "loss": 0.5549, + "step": 14609 + }, + { + "epoch": 2.384963879025346, + "grad_norm": 1.696832537651062, + "learning_rate": 1.9081763845620777e-05, + "loss": 0.517, + "step": 14610 + }, + { + "epoch": 2.3851271376678502, + "grad_norm": 1.5302447080612183, + "learning_rate": 1.9081631009531677e-05, + "loss": 0.5225, + "step": 14611 + }, + { + "epoch": 2.3852903963103547, + "grad_norm": 1.8243528604507446, + "learning_rate": 1.9081498164297373e-05, + "loss": 0.6125, + "step": 14612 + }, + { + "epoch": 2.385453654952859, + "grad_norm": 1.79802668094635, + "learning_rate": 1.9081365309918006e-05, + "loss": 0.5831, + "step": 14613 + }, + { + "epoch": 2.3856169135953635, + "grad_norm": 1.8712985515594482, + "learning_rate": 1.9081232446393706e-05, + "loss": 0.5618, + "step": 14614 + }, + { + "epoch": 2.385780172237868, + "grad_norm": 1.8246898651123047, + "learning_rate": 1.9081099573724607e-05, + "loss": 0.5415, + "step": 14615 + }, + { + "epoch": 2.3859434308803724, + "grad_norm": 1.6117380857467651, + "learning_rate": 1.908096669191084e-05, + "loss": 0.4432, + "step": 14616 + }, + { + "epoch": 2.386106689522877, + "grad_norm": 1.6681281328201294, + "learning_rate": 1.9080833800952545e-05, + "loss": 0.5841, + "step": 14617 + }, + { + "epoch": 2.386269948165381, + "grad_norm": 1.7787386178970337, + "learning_rate": 1.9080700900849855e-05, + "loss": 0.6334, + "step": 14618 + }, + { + "epoch": 2.3864332068078853, + "grad_norm": 2.475806713104248, + "learning_rate": 1.90805679916029e-05, + "loss": 0.8241, + "step": 14619 + }, + { + "epoch": 2.3865964654503897, + "grad_norm": 1.8686087131500244, + "learning_rate": 1.9080435073211812e-05, + "loss": 0.5138, + "step": 14620 + }, + { + "epoch": 2.386759724092894, + "grad_norm": 2.107147216796875, + "learning_rate": 1.908030214567673e-05, + "loss": 0.7122, + "step": 14621 + }, + { + "epoch": 2.3869229827353986, + "grad_norm": 1.6094859838485718, + "learning_rate": 1.9080169208997786e-05, + "loss": 0.4895, + "step": 14622 + }, + { + "epoch": 2.387086241377903, + "grad_norm": 1.7905776500701904, + "learning_rate": 1.9080036263175118e-05, + "loss": 0.6505, + "step": 14623 + }, + { + "epoch": 2.3872495000204075, + "grad_norm": 1.4778149127960205, + "learning_rate": 1.907990330820885e-05, + "loss": 0.5193, + "step": 14624 + }, + { + "epoch": 2.3874127586629115, + "grad_norm": 1.8297946453094482, + "learning_rate": 1.9079770344099126e-05, + "loss": 0.5953, + "step": 14625 + }, + { + "epoch": 2.387576017305416, + "grad_norm": 1.744399070739746, + "learning_rate": 1.9079637370846075e-05, + "loss": 0.6365, + "step": 14626 + }, + { + "epoch": 2.3877392759479203, + "grad_norm": 1.756588101387024, + "learning_rate": 1.907950438844983e-05, + "loss": 0.4671, + "step": 14627 + }, + { + "epoch": 2.387902534590425, + "grad_norm": 1.6498795747756958, + "learning_rate": 1.9079371396910528e-05, + "loss": 0.5224, + "step": 14628 + }, + { + "epoch": 2.3880657932329292, + "grad_norm": 1.7140398025512695, + "learning_rate": 1.90792383962283e-05, + "loss": 0.5012, + "step": 14629 + }, + { + "epoch": 2.3882290518754337, + "grad_norm": 2.0745060443878174, + "learning_rate": 1.9079105386403283e-05, + "loss": 0.6148, + "step": 14630 + }, + { + "epoch": 2.388392310517938, + "grad_norm": 1.9971262216567993, + "learning_rate": 1.9078972367435606e-05, + "loss": 0.5599, + "step": 14631 + }, + { + "epoch": 2.3885555691604425, + "grad_norm": 1.9585063457489014, + "learning_rate": 1.907883933932541e-05, + "loss": 0.6518, + "step": 14632 + }, + { + "epoch": 2.388718827802947, + "grad_norm": 1.6268223524093628, + "learning_rate": 1.9078706302072824e-05, + "loss": 0.5663, + "step": 14633 + }, + { + "epoch": 2.3888820864454514, + "grad_norm": 1.8286161422729492, + "learning_rate": 1.9078573255677983e-05, + "loss": 0.5789, + "step": 14634 + }, + { + "epoch": 2.3890453450879554, + "grad_norm": 1.783400297164917, + "learning_rate": 1.907844020014102e-05, + "loss": 0.5839, + "step": 14635 + }, + { + "epoch": 2.38920860373046, + "grad_norm": 1.869644284248352, + "learning_rate": 1.9078307135462072e-05, + "loss": 0.5291, + "step": 14636 + }, + { + "epoch": 2.3893718623729643, + "grad_norm": 1.6742433309555054, + "learning_rate": 1.907817406164127e-05, + "loss": 0.5154, + "step": 14637 + }, + { + "epoch": 2.3895351210154687, + "grad_norm": 1.7840083837509155, + "learning_rate": 1.907804097867875e-05, + "loss": 0.5395, + "step": 14638 + }, + { + "epoch": 2.389698379657973, + "grad_norm": 1.7052687406539917, + "learning_rate": 1.9077907886574646e-05, + "loss": 0.5652, + "step": 14639 + }, + { + "epoch": 2.3898616383004776, + "grad_norm": 2.168511152267456, + "learning_rate": 1.907777478532909e-05, + "loss": 0.559, + "step": 14640 + }, + { + "epoch": 2.390024896942982, + "grad_norm": 1.8107311725616455, + "learning_rate": 1.9077641674942216e-05, + "loss": 0.6268, + "step": 14641 + }, + { + "epoch": 2.3901881555854865, + "grad_norm": 1.5295219421386719, + "learning_rate": 1.907750855541416e-05, + "loss": 0.4994, + "step": 14642 + }, + { + "epoch": 2.3903514142279905, + "grad_norm": 1.6677207946777344, + "learning_rate": 1.9077375426745055e-05, + "loss": 0.5429, + "step": 14643 + }, + { + "epoch": 2.390514672870495, + "grad_norm": 1.7574526071548462, + "learning_rate": 1.9077242288935036e-05, + "loss": 0.5775, + "step": 14644 + }, + { + "epoch": 2.3906779315129993, + "grad_norm": 1.4158931970596313, + "learning_rate": 1.9077109141984235e-05, + "loss": 0.4598, + "step": 14645 + }, + { + "epoch": 2.390841190155504, + "grad_norm": 1.8125561475753784, + "learning_rate": 1.907697598589279e-05, + "loss": 0.5748, + "step": 14646 + }, + { + "epoch": 2.391004448798008, + "grad_norm": 1.7812215089797974, + "learning_rate": 1.9076842820660832e-05, + "loss": 0.5778, + "step": 14647 + }, + { + "epoch": 2.3911677074405127, + "grad_norm": 2.1403253078460693, + "learning_rate": 1.9076709646288495e-05, + "loss": 0.6951, + "step": 14648 + }, + { + "epoch": 2.391330966083017, + "grad_norm": 1.9364113807678223, + "learning_rate": 1.9076576462775913e-05, + "loss": 0.5481, + "step": 14649 + }, + { + "epoch": 2.3914942247255215, + "grad_norm": 1.9292993545532227, + "learning_rate": 1.9076443270123222e-05, + "loss": 0.6017, + "step": 14650 + }, + { + "epoch": 2.391657483368026, + "grad_norm": 1.7391222715377808, + "learning_rate": 1.9076310068330554e-05, + "loss": 0.5621, + "step": 14651 + }, + { + "epoch": 2.3918207420105304, + "grad_norm": 1.964460849761963, + "learning_rate": 1.9076176857398045e-05, + "loss": 0.6621, + "step": 14652 + }, + { + "epoch": 2.3919840006530344, + "grad_norm": 1.7468537092208862, + "learning_rate": 1.907604363732583e-05, + "loss": 0.5074, + "step": 14653 + }, + { + "epoch": 2.392147259295539, + "grad_norm": 1.6747630834579468, + "learning_rate": 1.907591040811404e-05, + "loss": 0.5047, + "step": 14654 + }, + { + "epoch": 2.3923105179380433, + "grad_norm": 1.6960432529449463, + "learning_rate": 1.907577716976281e-05, + "loss": 0.4634, + "step": 14655 + }, + { + "epoch": 2.3924737765805477, + "grad_norm": 1.7186912298202515, + "learning_rate": 1.9075643922272277e-05, + "loss": 0.5235, + "step": 14656 + }, + { + "epoch": 2.392637035223052, + "grad_norm": 1.6262362003326416, + "learning_rate": 1.9075510665642567e-05, + "loss": 0.4903, + "step": 14657 + }, + { + "epoch": 2.3928002938655566, + "grad_norm": 1.9649072885513306, + "learning_rate": 1.9075377399873827e-05, + "loss": 0.6927, + "step": 14658 + }, + { + "epoch": 2.392963552508061, + "grad_norm": 2.14729380607605, + "learning_rate": 1.907524412496618e-05, + "loss": 0.7139, + "step": 14659 + }, + { + "epoch": 2.3931268111505655, + "grad_norm": 1.8604710102081299, + "learning_rate": 1.9075110840919765e-05, + "loss": 0.5795, + "step": 14660 + }, + { + "epoch": 2.3932900697930695, + "grad_norm": 1.7697038650512695, + "learning_rate": 1.907497754773472e-05, + "loss": 0.693, + "step": 14661 + }, + { + "epoch": 2.393453328435574, + "grad_norm": 2.3853352069854736, + "learning_rate": 1.907484424541117e-05, + "loss": 0.7036, + "step": 14662 + }, + { + "epoch": 2.3936165870780783, + "grad_norm": 1.9073824882507324, + "learning_rate": 1.9074710933949257e-05, + "loss": 0.5918, + "step": 14663 + }, + { + "epoch": 2.3937798457205828, + "grad_norm": 1.7636950016021729, + "learning_rate": 1.9074577613349113e-05, + "loss": 0.5373, + "step": 14664 + }, + { + "epoch": 2.393943104363087, + "grad_norm": 1.7081987857818604, + "learning_rate": 1.907444428361087e-05, + "loss": 0.6219, + "step": 14665 + }, + { + "epoch": 2.3941063630055917, + "grad_norm": 1.7382813692092896, + "learning_rate": 1.9074310944734663e-05, + "loss": 0.5226, + "step": 14666 + }, + { + "epoch": 2.394269621648096, + "grad_norm": 1.7222046852111816, + "learning_rate": 1.907417759672063e-05, + "loss": 0.5302, + "step": 14667 + }, + { + "epoch": 2.3944328802906005, + "grad_norm": 1.750404953956604, + "learning_rate": 1.9074044239568904e-05, + "loss": 0.5986, + "step": 14668 + }, + { + "epoch": 2.394596138933105, + "grad_norm": 1.6623457670211792, + "learning_rate": 1.9073910873279613e-05, + "loss": 0.5085, + "step": 14669 + }, + { + "epoch": 2.3947593975756094, + "grad_norm": 2.331394672393799, + "learning_rate": 1.90737774978529e-05, + "loss": 0.7563, + "step": 14670 + }, + { + "epoch": 2.3949226562181134, + "grad_norm": 1.7024861574172974, + "learning_rate": 1.907364411328889e-05, + "loss": 0.7184, + "step": 14671 + }, + { + "epoch": 2.395085914860618, + "grad_norm": 1.8200258016586304, + "learning_rate": 1.907351071958773e-05, + "loss": 0.5787, + "step": 14672 + }, + { + "epoch": 2.3952491735031223, + "grad_norm": 1.5663678646087646, + "learning_rate": 1.9073377316749543e-05, + "loss": 0.5011, + "step": 14673 + }, + { + "epoch": 2.3954124321456267, + "grad_norm": 2.1348984241485596, + "learning_rate": 1.9073243904774468e-05, + "loss": 0.6318, + "step": 14674 + }, + { + "epoch": 2.395575690788131, + "grad_norm": 1.499284029006958, + "learning_rate": 1.907311048366264e-05, + "loss": 0.4455, + "step": 14675 + }, + { + "epoch": 2.3957389494306356, + "grad_norm": 1.3987929821014404, + "learning_rate": 1.907297705341419e-05, + "loss": 0.5374, + "step": 14676 + }, + { + "epoch": 2.39590220807314, + "grad_norm": 1.3970788717269897, + "learning_rate": 1.9072843614029256e-05, + "loss": 0.5357, + "step": 14677 + }, + { + "epoch": 2.396065466715644, + "grad_norm": 1.793351411819458, + "learning_rate": 1.907271016550797e-05, + "loss": 0.5541, + "step": 14678 + }, + { + "epoch": 2.3962287253581485, + "grad_norm": 1.7402982711791992, + "learning_rate": 1.9072576707850467e-05, + "loss": 0.4924, + "step": 14679 + }, + { + "epoch": 2.396391984000653, + "grad_norm": 2.1115381717681885, + "learning_rate": 1.9072443241056884e-05, + "loss": 0.686, + "step": 14680 + }, + { + "epoch": 2.3965552426431573, + "grad_norm": 1.940934419631958, + "learning_rate": 1.907230976512735e-05, + "loss": 0.5689, + "step": 14681 + }, + { + "epoch": 2.3967185012856618, + "grad_norm": 1.57938551902771, + "learning_rate": 1.9072176280062006e-05, + "loss": 0.5357, + "step": 14682 + }, + { + "epoch": 2.396881759928166, + "grad_norm": 1.8515774011611938, + "learning_rate": 1.907204278586098e-05, + "loss": 0.6576, + "step": 14683 + }, + { + "epoch": 2.3970450185706706, + "grad_norm": 1.6698349714279175, + "learning_rate": 1.907190928252441e-05, + "loss": 0.5405, + "step": 14684 + }, + { + "epoch": 2.397208277213175, + "grad_norm": 2.2046680450439453, + "learning_rate": 1.9071775770052432e-05, + "loss": 0.6873, + "step": 14685 + }, + { + "epoch": 2.3973715358556795, + "grad_norm": 1.9014753103256226, + "learning_rate": 1.9071642248445176e-05, + "loss": 0.6229, + "step": 14686 + }, + { + "epoch": 2.397534794498184, + "grad_norm": 1.6532944440841675, + "learning_rate": 1.9071508717702777e-05, + "loss": 0.5228, + "step": 14687 + }, + { + "epoch": 2.397698053140688, + "grad_norm": 1.8697896003723145, + "learning_rate": 1.9071375177825375e-05, + "loss": 0.5669, + "step": 14688 + }, + { + "epoch": 2.3978613117831924, + "grad_norm": 1.5795090198516846, + "learning_rate": 1.9071241628813096e-05, + "loss": 0.4954, + "step": 14689 + }, + { + "epoch": 2.398024570425697, + "grad_norm": 1.8542009592056274, + "learning_rate": 1.907110807066608e-05, + "loss": 0.6128, + "step": 14690 + }, + { + "epoch": 2.3981878290682013, + "grad_norm": 1.5209999084472656, + "learning_rate": 1.9070974503384464e-05, + "loss": 0.5493, + "step": 14691 + }, + { + "epoch": 2.3983510877107057, + "grad_norm": 1.774599552154541, + "learning_rate": 1.9070840926968378e-05, + "loss": 0.5764, + "step": 14692 + }, + { + "epoch": 2.39851434635321, + "grad_norm": 1.7918599843978882, + "learning_rate": 1.907070734141796e-05, + "loss": 0.6262, + "step": 14693 + }, + { + "epoch": 2.3986776049957146, + "grad_norm": 1.5264027118682861, + "learning_rate": 1.907057374673334e-05, + "loss": 0.4734, + "step": 14694 + }, + { + "epoch": 2.398840863638219, + "grad_norm": 2.0934627056121826, + "learning_rate": 1.907044014291465e-05, + "loss": 0.7255, + "step": 14695 + }, + { + "epoch": 2.399004122280723, + "grad_norm": 1.6206717491149902, + "learning_rate": 1.9070306529962033e-05, + "loss": 0.5541, + "step": 14696 + }, + { + "epoch": 2.3991673809232275, + "grad_norm": 1.9858759641647339, + "learning_rate": 1.9070172907875618e-05, + "loss": 0.645, + "step": 14697 + }, + { + "epoch": 2.399330639565732, + "grad_norm": 1.8839404582977295, + "learning_rate": 1.9070039276655545e-05, + "loss": 0.661, + "step": 14698 + }, + { + "epoch": 2.3994938982082363, + "grad_norm": 1.7998151779174805, + "learning_rate": 1.9069905636301943e-05, + "loss": 0.6131, + "step": 14699 + }, + { + "epoch": 2.3996571568507408, + "grad_norm": 1.6512155532836914, + "learning_rate": 1.9069771986814948e-05, + "loss": 0.5403, + "step": 14700 + }, + { + "epoch": 2.399820415493245, + "grad_norm": 1.6819300651550293, + "learning_rate": 1.9069638328194698e-05, + "loss": 0.5892, + "step": 14701 + }, + { + "epoch": 2.3999836741357496, + "grad_norm": 1.523626685142517, + "learning_rate": 1.906950466044132e-05, + "loss": 0.5304, + "step": 14702 + }, + { + "epoch": 2.400146932778254, + "grad_norm": 1.759896993637085, + "learning_rate": 1.9069370983554955e-05, + "loss": 0.7048, + "step": 14703 + }, + { + "epoch": 2.4003101914207585, + "grad_norm": 1.4748098850250244, + "learning_rate": 1.9069237297535737e-05, + "loss": 0.4646, + "step": 14704 + }, + { + "epoch": 2.400473450063263, + "grad_norm": 1.749779462814331, + "learning_rate": 1.9069103602383796e-05, + "loss": 0.6186, + "step": 14705 + }, + { + "epoch": 2.400636708705767, + "grad_norm": 1.6567038297653198, + "learning_rate": 1.906896989809927e-05, + "loss": 0.5227, + "step": 14706 + }, + { + "epoch": 2.4007999673482714, + "grad_norm": 1.6152395009994507, + "learning_rate": 1.90688361846823e-05, + "loss": 0.6302, + "step": 14707 + }, + { + "epoch": 2.400963225990776, + "grad_norm": 1.5224673748016357, + "learning_rate": 1.9068702462133012e-05, + "loss": 0.5862, + "step": 14708 + }, + { + "epoch": 2.4011264846332803, + "grad_norm": 1.981552004814148, + "learning_rate": 1.906856873045154e-05, + "loss": 0.6635, + "step": 14709 + }, + { + "epoch": 2.4012897432757847, + "grad_norm": 1.6401937007904053, + "learning_rate": 1.9068434989638023e-05, + "loss": 0.5192, + "step": 14710 + }, + { + "epoch": 2.401453001918289, + "grad_norm": 1.7433654069900513, + "learning_rate": 1.9068301239692595e-05, + "loss": 0.5727, + "step": 14711 + }, + { + "epoch": 2.4016162605607936, + "grad_norm": 1.8013827800750732, + "learning_rate": 1.906816748061539e-05, + "loss": 0.5871, + "step": 14712 + }, + { + "epoch": 2.4017795192032976, + "grad_norm": 1.7176638841629028, + "learning_rate": 1.906803371240654e-05, + "loss": 0.5521, + "step": 14713 + }, + { + "epoch": 2.401942777845802, + "grad_norm": 1.7259089946746826, + "learning_rate": 1.9067899935066187e-05, + "loss": 0.5915, + "step": 14714 + }, + { + "epoch": 2.4021060364883065, + "grad_norm": 1.5743516683578491, + "learning_rate": 1.906776614859446e-05, + "loss": 0.4919, + "step": 14715 + }, + { + "epoch": 2.402269295130811, + "grad_norm": 2.2616286277770996, + "learning_rate": 1.906763235299149e-05, + "loss": 0.706, + "step": 14716 + }, + { + "epoch": 2.4024325537733153, + "grad_norm": 2.226349353790283, + "learning_rate": 1.9067498548257425e-05, + "loss": 0.7012, + "step": 14717 + }, + { + "epoch": 2.4025958124158198, + "grad_norm": 2.07782244682312, + "learning_rate": 1.9067364734392386e-05, + "loss": 0.6988, + "step": 14718 + }, + { + "epoch": 2.402759071058324, + "grad_norm": 1.454687476158142, + "learning_rate": 1.9067230911396512e-05, + "loss": 0.444, + "step": 14719 + }, + { + "epoch": 2.4029223297008286, + "grad_norm": 2.0480470657348633, + "learning_rate": 1.9067097079269942e-05, + "loss": 0.6606, + "step": 14720 + }, + { + "epoch": 2.403085588343333, + "grad_norm": 1.7591325044631958, + "learning_rate": 1.9066963238012807e-05, + "loss": 0.4727, + "step": 14721 + }, + { + "epoch": 2.4032488469858375, + "grad_norm": 1.9034820795059204, + "learning_rate": 1.9066829387625243e-05, + "loss": 0.577, + "step": 14722 + }, + { + "epoch": 2.4034121056283415, + "grad_norm": 1.6604607105255127, + "learning_rate": 1.906669552810738e-05, + "loss": 0.5901, + "step": 14723 + }, + { + "epoch": 2.403575364270846, + "grad_norm": 1.922224998474121, + "learning_rate": 1.9066561659459363e-05, + "loss": 0.6552, + "step": 14724 + }, + { + "epoch": 2.4037386229133504, + "grad_norm": 1.7556606531143188, + "learning_rate": 1.9066427781681314e-05, + "loss": 0.6136, + "step": 14725 + }, + { + "epoch": 2.403901881555855, + "grad_norm": 1.7586984634399414, + "learning_rate": 1.9066293894773383e-05, + "loss": 0.5203, + "step": 14726 + }, + { + "epoch": 2.4040651401983593, + "grad_norm": 1.5935351848602295, + "learning_rate": 1.906615999873569e-05, + "loss": 0.4635, + "step": 14727 + }, + { + "epoch": 2.4042283988408637, + "grad_norm": 1.7105528116226196, + "learning_rate": 1.906602609356838e-05, + "loss": 0.5311, + "step": 14728 + }, + { + "epoch": 2.404391657483368, + "grad_norm": 1.5983600616455078, + "learning_rate": 1.906589217927158e-05, + "loss": 0.4762, + "step": 14729 + }, + { + "epoch": 2.4045549161258726, + "grad_norm": 1.4015930891036987, + "learning_rate": 1.9065758255845432e-05, + "loss": 0.4643, + "step": 14730 + }, + { + "epoch": 2.4047181747683766, + "grad_norm": 1.5947763919830322, + "learning_rate": 1.906562432329007e-05, + "loss": 0.5042, + "step": 14731 + }, + { + "epoch": 2.404881433410881, + "grad_norm": 1.8998064994812012, + "learning_rate": 1.9065490381605624e-05, + "loss": 0.5755, + "step": 14732 + }, + { + "epoch": 2.4050446920533854, + "grad_norm": 1.704728126525879, + "learning_rate": 1.906535643079223e-05, + "loss": 0.4731, + "step": 14733 + }, + { + "epoch": 2.40520795069589, + "grad_norm": 1.8300803899765015, + "learning_rate": 1.9065222470850025e-05, + "loss": 0.63, + "step": 14734 + }, + { + "epoch": 2.4053712093383943, + "grad_norm": 1.6360726356506348, + "learning_rate": 1.9065088501779145e-05, + "loss": 0.5193, + "step": 14735 + }, + { + "epoch": 2.4055344679808988, + "grad_norm": 1.6334954500198364, + "learning_rate": 1.906495452357972e-05, + "loss": 0.5124, + "step": 14736 + }, + { + "epoch": 2.405697726623403, + "grad_norm": 1.6755024194717407, + "learning_rate": 1.9064820536251892e-05, + "loss": 0.4698, + "step": 14737 + }, + { + "epoch": 2.4058609852659076, + "grad_norm": 1.803622841835022, + "learning_rate": 1.9064686539795794e-05, + "loss": 0.5628, + "step": 14738 + }, + { + "epoch": 2.406024243908412, + "grad_norm": 1.3860976696014404, + "learning_rate": 1.9064552534211556e-05, + "loss": 0.4295, + "step": 14739 + }, + { + "epoch": 2.4061875025509165, + "grad_norm": 2.0035171508789062, + "learning_rate": 1.9064418519499316e-05, + "loss": 0.6024, + "step": 14740 + }, + { + "epoch": 2.4063507611934205, + "grad_norm": 1.7655789852142334, + "learning_rate": 1.9064284495659208e-05, + "loss": 0.5239, + "step": 14741 + }, + { + "epoch": 2.406514019835925, + "grad_norm": 1.8731989860534668, + "learning_rate": 1.906415046269137e-05, + "loss": 0.5741, + "step": 14742 + }, + { + "epoch": 2.4066772784784294, + "grad_norm": 1.5570639371871948, + "learning_rate": 1.9064016420595934e-05, + "loss": 0.521, + "step": 14743 + }, + { + "epoch": 2.406840537120934, + "grad_norm": 1.9177916049957275, + "learning_rate": 1.9063882369373036e-05, + "loss": 0.6063, + "step": 14744 + }, + { + "epoch": 2.4070037957634383, + "grad_norm": 1.9931893348693848, + "learning_rate": 1.906374830902281e-05, + "loss": 0.6578, + "step": 14745 + }, + { + "epoch": 2.4071670544059427, + "grad_norm": 2.3861536979675293, + "learning_rate": 1.9063614239545393e-05, + "loss": 0.4748, + "step": 14746 + }, + { + "epoch": 2.407330313048447, + "grad_norm": 2.101872444152832, + "learning_rate": 1.906348016094092e-05, + "loss": 0.632, + "step": 14747 + }, + { + "epoch": 2.4074935716909516, + "grad_norm": 1.6305328607559204, + "learning_rate": 1.9063346073209522e-05, + "loss": 0.475, + "step": 14748 + }, + { + "epoch": 2.4076568303334556, + "grad_norm": 1.6323062181472778, + "learning_rate": 1.906321197635134e-05, + "loss": 0.5701, + "step": 14749 + }, + { + "epoch": 2.40782008897596, + "grad_norm": 2.069180727005005, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.5457, + "step": 14750 + }, + { + "epoch": 2.4079833476184644, + "grad_norm": 1.7335231304168701, + "learning_rate": 1.9062943755255146e-05, + "loss": 0.6336, + "step": 14751 + }, + { + "epoch": 2.408146606260969, + "grad_norm": 1.6940103769302368, + "learning_rate": 1.9062809631017412e-05, + "loss": 0.5197, + "step": 14752 + }, + { + "epoch": 2.4083098649034733, + "grad_norm": 1.7385491132736206, + "learning_rate": 1.9062675497653433e-05, + "loss": 0.545, + "step": 14753 + }, + { + "epoch": 2.4084731235459778, + "grad_norm": 1.8014689683914185, + "learning_rate": 1.906254135516334e-05, + "loss": 0.6122, + "step": 14754 + }, + { + "epoch": 2.408636382188482, + "grad_norm": 1.9219131469726562, + "learning_rate": 1.9062407203547267e-05, + "loss": 0.6147, + "step": 14755 + }, + { + "epoch": 2.4087996408309866, + "grad_norm": 1.8238698244094849, + "learning_rate": 1.9062273042805354e-05, + "loss": 0.6405, + "step": 14756 + }, + { + "epoch": 2.408962899473491, + "grad_norm": 1.8655375242233276, + "learning_rate": 1.9062138872937738e-05, + "loss": 0.5127, + "step": 14757 + }, + { + "epoch": 2.4091261581159955, + "grad_norm": 1.5455180406570435, + "learning_rate": 1.9062004693944548e-05, + "loss": 0.5114, + "step": 14758 + }, + { + "epoch": 2.4092894167584995, + "grad_norm": 2.050560235977173, + "learning_rate": 1.906187050582592e-05, + "loss": 0.6829, + "step": 14759 + }, + { + "epoch": 2.409452675401004, + "grad_norm": 1.7802586555480957, + "learning_rate": 1.9061736308581996e-05, + "loss": 0.545, + "step": 14760 + }, + { + "epoch": 2.4096159340435084, + "grad_norm": 1.9317597150802612, + "learning_rate": 1.9061602102212898e-05, + "loss": 0.643, + "step": 14761 + }, + { + "epoch": 2.409779192686013, + "grad_norm": 1.6242008209228516, + "learning_rate": 1.9061467886718775e-05, + "loss": 0.5194, + "step": 14762 + }, + { + "epoch": 2.4099424513285173, + "grad_norm": 2.0996603965759277, + "learning_rate": 1.9061333662099756e-05, + "loss": 0.652, + "step": 14763 + }, + { + "epoch": 2.4101057099710217, + "grad_norm": 2.262483596801758, + "learning_rate": 1.9061199428355973e-05, + "loss": 0.7058, + "step": 14764 + }, + { + "epoch": 2.410268968613526, + "grad_norm": 1.7398741245269775, + "learning_rate": 1.9061065185487568e-05, + "loss": 0.5245, + "step": 14765 + }, + { + "epoch": 2.41043222725603, + "grad_norm": 1.7028193473815918, + "learning_rate": 1.9060930933494673e-05, + "loss": 0.4912, + "step": 14766 + }, + { + "epoch": 2.4105954858985346, + "grad_norm": 1.6147454977035522, + "learning_rate": 1.906079667237742e-05, + "loss": 0.4913, + "step": 14767 + }, + { + "epoch": 2.410758744541039, + "grad_norm": 2.5599560737609863, + "learning_rate": 1.906066240213595e-05, + "loss": 0.6997, + "step": 14768 + }, + { + "epoch": 2.4109220031835434, + "grad_norm": 2.267305374145508, + "learning_rate": 1.9060528122770393e-05, + "loss": 0.6033, + "step": 14769 + }, + { + "epoch": 2.411085261826048, + "grad_norm": 1.720969796180725, + "learning_rate": 1.906039383428089e-05, + "loss": 0.5541, + "step": 14770 + }, + { + "epoch": 2.4112485204685523, + "grad_norm": 1.8384428024291992, + "learning_rate": 1.906025953666757e-05, + "loss": 0.5723, + "step": 14771 + }, + { + "epoch": 2.4114117791110568, + "grad_norm": 1.7477805614471436, + "learning_rate": 1.9060125229930572e-05, + "loss": 0.5779, + "step": 14772 + }, + { + "epoch": 2.411575037753561, + "grad_norm": 1.7447000741958618, + "learning_rate": 1.9059990914070025e-05, + "loss": 0.5695, + "step": 14773 + }, + { + "epoch": 2.4117382963960656, + "grad_norm": 1.604271650314331, + "learning_rate": 1.9059856589086075e-05, + "loss": 0.5674, + "step": 14774 + }, + { + "epoch": 2.41190155503857, + "grad_norm": 1.7622084617614746, + "learning_rate": 1.9059722254978852e-05, + "loss": 0.5518, + "step": 14775 + }, + { + "epoch": 2.412064813681074, + "grad_norm": 1.586521863937378, + "learning_rate": 1.9059587911748488e-05, + "loss": 0.5407, + "step": 14776 + }, + { + "epoch": 2.4122280723235785, + "grad_norm": 1.6929539442062378, + "learning_rate": 1.9059453559395128e-05, + "loss": 0.5309, + "step": 14777 + }, + { + "epoch": 2.412391330966083, + "grad_norm": 2.2214932441711426, + "learning_rate": 1.9059319197918895e-05, + "loss": 0.8017, + "step": 14778 + }, + { + "epoch": 2.4125545896085874, + "grad_norm": 1.7591204643249512, + "learning_rate": 1.905918482731993e-05, + "loss": 0.5062, + "step": 14779 + }, + { + "epoch": 2.412717848251092, + "grad_norm": 1.8807835578918457, + "learning_rate": 1.905905044759837e-05, + "loss": 0.7022, + "step": 14780 + }, + { + "epoch": 2.4128811068935963, + "grad_norm": 1.6877732276916504, + "learning_rate": 1.9058916058754347e-05, + "loss": 0.5284, + "step": 14781 + }, + { + "epoch": 2.4130443655361007, + "grad_norm": 1.597824215888977, + "learning_rate": 1.9058781660788003e-05, + "loss": 0.5613, + "step": 14782 + }, + { + "epoch": 2.413207624178605, + "grad_norm": 1.8140716552734375, + "learning_rate": 1.9058647253699462e-05, + "loss": 0.5816, + "step": 14783 + }, + { + "epoch": 2.413370882821109, + "grad_norm": 1.638429045677185, + "learning_rate": 1.9058512837488868e-05, + "loss": 0.5773, + "step": 14784 + }, + { + "epoch": 2.4135341414636136, + "grad_norm": 1.5304937362670898, + "learning_rate": 1.9058378412156353e-05, + "loss": 0.4968, + "step": 14785 + }, + { + "epoch": 2.413697400106118, + "grad_norm": 1.8113245964050293, + "learning_rate": 1.9058243977702054e-05, + "loss": 0.5965, + "step": 14786 + }, + { + "epoch": 2.4138606587486224, + "grad_norm": 1.7824804782867432, + "learning_rate": 1.9058109534126106e-05, + "loss": 0.5699, + "step": 14787 + }, + { + "epoch": 2.414023917391127, + "grad_norm": 1.809175968170166, + "learning_rate": 1.9057975081428646e-05, + "loss": 0.5645, + "step": 14788 + }, + { + "epoch": 2.4141871760336313, + "grad_norm": 1.492437720298767, + "learning_rate": 1.9057840619609804e-05, + "loss": 0.4711, + "step": 14789 + }, + { + "epoch": 2.4143504346761357, + "grad_norm": 1.4843319654464722, + "learning_rate": 1.905770614866972e-05, + "loss": 0.4885, + "step": 14790 + }, + { + "epoch": 2.41451369331864, + "grad_norm": 1.9791122674942017, + "learning_rate": 1.905757166860853e-05, + "loss": 0.6397, + "step": 14791 + }, + { + "epoch": 2.4146769519611446, + "grad_norm": 1.8094377517700195, + "learning_rate": 1.9057437179426365e-05, + "loss": 0.5596, + "step": 14792 + }, + { + "epoch": 2.414840210603649, + "grad_norm": 1.8248212337493896, + "learning_rate": 1.9057302681123367e-05, + "loss": 0.5714, + "step": 14793 + }, + { + "epoch": 2.415003469246153, + "grad_norm": 1.7966430187225342, + "learning_rate": 1.9057168173699664e-05, + "loss": 0.6608, + "step": 14794 + }, + { + "epoch": 2.4151667278886575, + "grad_norm": 1.5687203407287598, + "learning_rate": 1.90570336571554e-05, + "loss": 0.5298, + "step": 14795 + }, + { + "epoch": 2.415329986531162, + "grad_norm": 2.063647508621216, + "learning_rate": 1.90568991314907e-05, + "loss": 0.656, + "step": 14796 + }, + { + "epoch": 2.4154932451736664, + "grad_norm": 1.8128690719604492, + "learning_rate": 1.9056764596705704e-05, + "loss": 0.546, + "step": 14797 + }, + { + "epoch": 2.415656503816171, + "grad_norm": 1.4784066677093506, + "learning_rate": 1.9056630052800553e-05, + "loss": 0.4318, + "step": 14798 + }, + { + "epoch": 2.4158197624586752, + "grad_norm": 1.7448246479034424, + "learning_rate": 1.9056495499775374e-05, + "loss": 0.5502, + "step": 14799 + }, + { + "epoch": 2.4159830211011797, + "grad_norm": 1.9303172826766968, + "learning_rate": 1.905636093763031e-05, + "loss": 0.5205, + "step": 14800 + }, + { + "epoch": 2.416146279743684, + "grad_norm": 1.5416854619979858, + "learning_rate": 1.905622636636549e-05, + "loss": 0.5259, + "step": 14801 + }, + { + "epoch": 2.416309538386188, + "grad_norm": 2.1024954319000244, + "learning_rate": 1.9056091785981056e-05, + "loss": 0.6584, + "step": 14802 + }, + { + "epoch": 2.4164727970286926, + "grad_norm": 1.7921574115753174, + "learning_rate": 1.9055957196477137e-05, + "loss": 0.6165, + "step": 14803 + }, + { + "epoch": 2.416636055671197, + "grad_norm": 2.083899736404419, + "learning_rate": 1.9055822597853874e-05, + "loss": 0.6694, + "step": 14804 + }, + { + "epoch": 2.4167993143137014, + "grad_norm": 1.9700617790222168, + "learning_rate": 1.9055687990111397e-05, + "loss": 0.6902, + "step": 14805 + }, + { + "epoch": 2.416962572956206, + "grad_norm": 1.5287399291992188, + "learning_rate": 1.9055553373249848e-05, + "loss": 0.5092, + "step": 14806 + }, + { + "epoch": 2.4171258315987103, + "grad_norm": 1.9218041896820068, + "learning_rate": 1.9055418747269356e-05, + "loss": 0.5973, + "step": 14807 + }, + { + "epoch": 2.4172890902412147, + "grad_norm": 1.8346599340438843, + "learning_rate": 1.9055284112170062e-05, + "loss": 0.4647, + "step": 14808 + }, + { + "epoch": 2.417452348883719, + "grad_norm": 1.8453583717346191, + "learning_rate": 1.9055149467952097e-05, + "loss": 0.5577, + "step": 14809 + }, + { + "epoch": 2.4176156075262236, + "grad_norm": 1.9742813110351562, + "learning_rate": 1.90550148146156e-05, + "loss": 0.5864, + "step": 14810 + }, + { + "epoch": 2.4177788661687276, + "grad_norm": 1.7055878639221191, + "learning_rate": 1.9054880152160705e-05, + "loss": 0.5656, + "step": 14811 + }, + { + "epoch": 2.417942124811232, + "grad_norm": 2.0273289680480957, + "learning_rate": 1.905474548058755e-05, + "loss": 0.6992, + "step": 14812 + }, + { + "epoch": 2.4181053834537365, + "grad_norm": 1.8495516777038574, + "learning_rate": 1.9054610799896268e-05, + "loss": 0.6063, + "step": 14813 + }, + { + "epoch": 2.418268642096241, + "grad_norm": 2.204801082611084, + "learning_rate": 1.9054476110086995e-05, + "loss": 0.5926, + "step": 14814 + }, + { + "epoch": 2.4184319007387454, + "grad_norm": 1.861804485321045, + "learning_rate": 1.9054341411159866e-05, + "loss": 0.6655, + "step": 14815 + }, + { + "epoch": 2.41859515938125, + "grad_norm": 1.7758255004882812, + "learning_rate": 1.905420670311502e-05, + "loss": 0.5918, + "step": 14816 + }, + { + "epoch": 2.4187584180237542, + "grad_norm": 1.5058903694152832, + "learning_rate": 1.9054071985952587e-05, + "loss": 0.4629, + "step": 14817 + }, + { + "epoch": 2.4189216766662587, + "grad_norm": 1.7889235019683838, + "learning_rate": 1.9053937259672707e-05, + "loss": 0.5187, + "step": 14818 + }, + { + "epoch": 2.4190849353087627, + "grad_norm": 1.6276010274887085, + "learning_rate": 1.9053802524275514e-05, + "loss": 0.6057, + "step": 14819 + }, + { + "epoch": 2.419248193951267, + "grad_norm": 1.5704641342163086, + "learning_rate": 1.9053667779761147e-05, + "loss": 0.6078, + "step": 14820 + }, + { + "epoch": 2.4194114525937715, + "grad_norm": 1.8271715641021729, + "learning_rate": 1.9053533026129737e-05, + "loss": 0.5453, + "step": 14821 + }, + { + "epoch": 2.419574711236276, + "grad_norm": 1.6790430545806885, + "learning_rate": 1.9053398263381423e-05, + "loss": 0.589, + "step": 14822 + }, + { + "epoch": 2.4197379698787804, + "grad_norm": 1.749173879623413, + "learning_rate": 1.9053263491516338e-05, + "loss": 0.4726, + "step": 14823 + }, + { + "epoch": 2.419901228521285, + "grad_norm": 1.710088849067688, + "learning_rate": 1.905312871053462e-05, + "loss": 0.549, + "step": 14824 + }, + { + "epoch": 2.4200644871637893, + "grad_norm": 2.0475499629974365, + "learning_rate": 1.9052993920436402e-05, + "loss": 0.7673, + "step": 14825 + }, + { + "epoch": 2.4202277458062937, + "grad_norm": 1.6885813474655151, + "learning_rate": 1.9052859121221822e-05, + "loss": 0.5865, + "step": 14826 + }, + { + "epoch": 2.420391004448798, + "grad_norm": 1.534018635749817, + "learning_rate": 1.9052724312891017e-05, + "loss": 0.5714, + "step": 14827 + }, + { + "epoch": 2.4205542630913026, + "grad_norm": 2.4394185543060303, + "learning_rate": 1.905258949544412e-05, + "loss": 0.5351, + "step": 14828 + }, + { + "epoch": 2.4207175217338066, + "grad_norm": 1.395139455795288, + "learning_rate": 1.905245466888127e-05, + "loss": 0.4477, + "step": 14829 + }, + { + "epoch": 2.420880780376311, + "grad_norm": 1.9183334112167358, + "learning_rate": 1.9052319833202596e-05, + "loss": 0.6233, + "step": 14830 + }, + { + "epoch": 2.4210440390188155, + "grad_norm": 1.765745759010315, + "learning_rate": 1.905218498840824e-05, + "loss": 0.5411, + "step": 14831 + }, + { + "epoch": 2.42120729766132, + "grad_norm": 1.7402782440185547, + "learning_rate": 1.905205013449834e-05, + "loss": 0.5483, + "step": 14832 + }, + { + "epoch": 2.4213705563038244, + "grad_norm": 1.7447539567947388, + "learning_rate": 1.9051915271473024e-05, + "loss": 0.5911, + "step": 14833 + }, + { + "epoch": 2.421533814946329, + "grad_norm": 1.9866255521774292, + "learning_rate": 1.9051780399332427e-05, + "loss": 0.6231, + "step": 14834 + }, + { + "epoch": 2.4216970735888332, + "grad_norm": 1.777695894241333, + "learning_rate": 1.9051645518076696e-05, + "loss": 0.5308, + "step": 14835 + }, + { + "epoch": 2.4218603322313377, + "grad_norm": 1.5730823278427124, + "learning_rate": 1.905151062770596e-05, + "loss": 0.5256, + "step": 14836 + }, + { + "epoch": 2.4220235908738417, + "grad_norm": 1.7182598114013672, + "learning_rate": 1.9051375728220358e-05, + "loss": 0.5752, + "step": 14837 + }, + { + "epoch": 2.422186849516346, + "grad_norm": 2.0204169750213623, + "learning_rate": 1.9051240819620018e-05, + "loss": 0.7307, + "step": 14838 + }, + { + "epoch": 2.4223501081588505, + "grad_norm": 1.667989730834961, + "learning_rate": 1.905110590190508e-05, + "loss": 0.5753, + "step": 14839 + }, + { + "epoch": 2.422513366801355, + "grad_norm": 1.7466654777526855, + "learning_rate": 1.9050970975075685e-05, + "loss": 0.5848, + "step": 14840 + }, + { + "epoch": 2.4226766254438594, + "grad_norm": 1.8040430545806885, + "learning_rate": 1.9050836039131962e-05, + "loss": 0.5613, + "step": 14841 + }, + { + "epoch": 2.422839884086364, + "grad_norm": 1.7952693700790405, + "learning_rate": 1.905070109407405e-05, + "loss": 0.5722, + "step": 14842 + }, + { + "epoch": 2.4230031427288683, + "grad_norm": 1.8103914260864258, + "learning_rate": 1.9050566139902088e-05, + "loss": 0.5457, + "step": 14843 + }, + { + "epoch": 2.4231664013713727, + "grad_norm": 2.134258985519409, + "learning_rate": 1.9050431176616203e-05, + "loss": 0.6451, + "step": 14844 + }, + { + "epoch": 2.423329660013877, + "grad_norm": 1.7964415550231934, + "learning_rate": 1.905029620421654e-05, + "loss": 0.5329, + "step": 14845 + }, + { + "epoch": 2.4234929186563816, + "grad_norm": 1.764803171157837, + "learning_rate": 1.9050161222703227e-05, + "loss": 0.6333, + "step": 14846 + }, + { + "epoch": 2.4236561772988856, + "grad_norm": 1.7330634593963623, + "learning_rate": 1.905002623207641e-05, + "loss": 0.5338, + "step": 14847 + }, + { + "epoch": 2.42381943594139, + "grad_norm": 2.092863082885742, + "learning_rate": 1.9049891232336212e-05, + "loss": 0.62, + "step": 14848 + }, + { + "epoch": 2.4239826945838945, + "grad_norm": 1.8405756950378418, + "learning_rate": 1.9049756223482777e-05, + "loss": 0.6262, + "step": 14849 + }, + { + "epoch": 2.424145953226399, + "grad_norm": 1.886425256729126, + "learning_rate": 1.9049621205516243e-05, + "loss": 0.6508, + "step": 14850 + }, + { + "epoch": 2.4243092118689034, + "grad_norm": 1.5993865728378296, + "learning_rate": 1.9049486178436743e-05, + "loss": 0.5966, + "step": 14851 + }, + { + "epoch": 2.424472470511408, + "grad_norm": 1.4431703090667725, + "learning_rate": 1.904935114224441e-05, + "loss": 0.5062, + "step": 14852 + }, + { + "epoch": 2.4246357291539122, + "grad_norm": 1.6261473894119263, + "learning_rate": 1.9049216096939388e-05, + "loss": 0.5105, + "step": 14853 + }, + { + "epoch": 2.4247989877964162, + "grad_norm": 1.8378595113754272, + "learning_rate": 1.9049081042521804e-05, + "loss": 0.5607, + "step": 14854 + }, + { + "epoch": 2.4249622464389207, + "grad_norm": 1.5016839504241943, + "learning_rate": 1.9048945978991796e-05, + "loss": 0.4837, + "step": 14855 + }, + { + "epoch": 2.425125505081425, + "grad_norm": 1.5996778011322021, + "learning_rate": 1.9048810906349506e-05, + "loss": 0.5098, + "step": 14856 + }, + { + "epoch": 2.4252887637239295, + "grad_norm": 1.747170090675354, + "learning_rate": 1.9048675824595066e-05, + "loss": 0.5879, + "step": 14857 + }, + { + "epoch": 2.425452022366434, + "grad_norm": 1.7887680530548096, + "learning_rate": 1.9048540733728608e-05, + "loss": 0.5943, + "step": 14858 + }, + { + "epoch": 2.4256152810089384, + "grad_norm": 1.8542648553848267, + "learning_rate": 1.9048405633750274e-05, + "loss": 0.5916, + "step": 14859 + }, + { + "epoch": 2.425778539651443, + "grad_norm": 2.09249210357666, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.5938, + "step": 14860 + }, + { + "epoch": 2.4259417982939473, + "grad_norm": 2.090928316116333, + "learning_rate": 1.9048135406458515e-05, + "loss": 0.6596, + "step": 14861 + }, + { + "epoch": 2.4261050569364517, + "grad_norm": 1.7580206394195557, + "learning_rate": 1.9048000279145364e-05, + "loss": 0.5455, + "step": 14862 + }, + { + "epoch": 2.426268315578956, + "grad_norm": 1.9046690464019775, + "learning_rate": 1.9047865142720876e-05, + "loss": 0.5714, + "step": 14863 + }, + { + "epoch": 2.42643157422146, + "grad_norm": 2.2106893062591553, + "learning_rate": 1.904772999718519e-05, + "loss": 0.6831, + "step": 14864 + }, + { + "epoch": 2.4265948328639646, + "grad_norm": 1.7953835725784302, + "learning_rate": 1.9047594842538445e-05, + "loss": 0.5246, + "step": 14865 + }, + { + "epoch": 2.426758091506469, + "grad_norm": 1.7825596332550049, + "learning_rate": 1.9047459678780774e-05, + "loss": 0.5379, + "step": 14866 + }, + { + "epoch": 2.4269213501489735, + "grad_norm": 1.8099297285079956, + "learning_rate": 1.9047324505912314e-05, + "loss": 0.514, + "step": 14867 + }, + { + "epoch": 2.427084608791478, + "grad_norm": 1.9682555198669434, + "learning_rate": 1.9047189323933198e-05, + "loss": 0.6363, + "step": 14868 + }, + { + "epoch": 2.4272478674339824, + "grad_norm": 1.9289470911026, + "learning_rate": 1.904705413284357e-05, + "loss": 0.6305, + "step": 14869 + }, + { + "epoch": 2.427411126076487, + "grad_norm": 1.7994563579559326, + "learning_rate": 1.9046918932643555e-05, + "loss": 0.5964, + "step": 14870 + }, + { + "epoch": 2.4275743847189912, + "grad_norm": 2.031937599182129, + "learning_rate": 1.9046783723333298e-05, + "loss": 0.5701, + "step": 14871 + }, + { + "epoch": 2.4277376433614952, + "grad_norm": 1.6080982685089111, + "learning_rate": 1.904664850491293e-05, + "loss": 0.5139, + "step": 14872 + }, + { + "epoch": 2.4279009020039997, + "grad_norm": 1.848858118057251, + "learning_rate": 1.9046513277382592e-05, + "loss": 0.5961, + "step": 14873 + }, + { + "epoch": 2.428064160646504, + "grad_norm": 1.8274022340774536, + "learning_rate": 1.9046378040742418e-05, + "loss": 0.5792, + "step": 14874 + }, + { + "epoch": 2.4282274192890085, + "grad_norm": 1.9679005146026611, + "learning_rate": 1.904624279499254e-05, + "loss": 0.5385, + "step": 14875 + }, + { + "epoch": 2.428390677931513, + "grad_norm": 2.0087180137634277, + "learning_rate": 1.90461075401331e-05, + "loss": 0.5514, + "step": 14876 + }, + { + "epoch": 2.4285539365740174, + "grad_norm": 1.6504930257797241, + "learning_rate": 1.904597227616423e-05, + "loss": 0.5953, + "step": 14877 + }, + { + "epoch": 2.428717195216522, + "grad_norm": 2.1415722370147705, + "learning_rate": 1.9045837003086074e-05, + "loss": 0.596, + "step": 14878 + }, + { + "epoch": 2.4288804538590263, + "grad_norm": 1.9976701736450195, + "learning_rate": 1.9045701720898756e-05, + "loss": 0.5605, + "step": 14879 + }, + { + "epoch": 2.4290437125015307, + "grad_norm": 1.840915322303772, + "learning_rate": 1.9045566429602425e-05, + "loss": 0.6397, + "step": 14880 + }, + { + "epoch": 2.429206971144035, + "grad_norm": 2.1169936656951904, + "learning_rate": 1.9045431129197207e-05, + "loss": 0.5956, + "step": 14881 + }, + { + "epoch": 2.429370229786539, + "grad_norm": 2.191061496734619, + "learning_rate": 1.904529581968324e-05, + "loss": 0.6675, + "step": 14882 + }, + { + "epoch": 2.4295334884290436, + "grad_norm": 1.7287367582321167, + "learning_rate": 1.9045160501060665e-05, + "loss": 0.4869, + "step": 14883 + }, + { + "epoch": 2.429696747071548, + "grad_norm": 1.813077449798584, + "learning_rate": 1.904502517332962e-05, + "loss": 0.5146, + "step": 14884 + }, + { + "epoch": 2.4298600057140525, + "grad_norm": 2.002856731414795, + "learning_rate": 1.904488983649023e-05, + "loss": 0.5986, + "step": 14885 + }, + { + "epoch": 2.430023264356557, + "grad_norm": 1.5827481746673584, + "learning_rate": 1.9044754490542643e-05, + "loss": 0.5903, + "step": 14886 + }, + { + "epoch": 2.4301865229990613, + "grad_norm": 1.7373671531677246, + "learning_rate": 1.904461913548699e-05, + "loss": 0.5271, + "step": 14887 + }, + { + "epoch": 2.430349781641566, + "grad_norm": 1.7727181911468506, + "learning_rate": 1.9044483771323408e-05, + "loss": 0.5905, + "step": 14888 + }, + { + "epoch": 2.4305130402840702, + "grad_norm": 1.6190342903137207, + "learning_rate": 1.9044348398052032e-05, + "loss": 0.5697, + "step": 14889 + }, + { + "epoch": 2.430676298926574, + "grad_norm": 1.8190252780914307, + "learning_rate": 1.9044213015672998e-05, + "loss": 0.5705, + "step": 14890 + }, + { + "epoch": 2.4308395575690787, + "grad_norm": 1.8055258989334106, + "learning_rate": 1.904407762418645e-05, + "loss": 0.6185, + "step": 14891 + }, + { + "epoch": 2.431002816211583, + "grad_norm": 1.4095808267593384, + "learning_rate": 1.904394222359251e-05, + "loss": 0.4806, + "step": 14892 + }, + { + "epoch": 2.4311660748540875, + "grad_norm": 1.7190310955047607, + "learning_rate": 1.904380681389133e-05, + "loss": 0.6455, + "step": 14893 + }, + { + "epoch": 2.431329333496592, + "grad_norm": 1.48473060131073, + "learning_rate": 1.9043671395083034e-05, + "loss": 0.5749, + "step": 14894 + }, + { + "epoch": 2.4314925921390964, + "grad_norm": 1.797882318496704, + "learning_rate": 1.9043535967167766e-05, + "loss": 0.5967, + "step": 14895 + }, + { + "epoch": 2.431655850781601, + "grad_norm": 1.6980783939361572, + "learning_rate": 1.9043400530145658e-05, + "loss": 0.523, + "step": 14896 + }, + { + "epoch": 2.4318191094241053, + "grad_norm": 1.9749912023544312, + "learning_rate": 1.9043265084016848e-05, + "loss": 0.6604, + "step": 14897 + }, + { + "epoch": 2.4319823680666097, + "grad_norm": 1.783868432044983, + "learning_rate": 1.9043129628781474e-05, + "loss": 0.5797, + "step": 14898 + }, + { + "epoch": 2.432145626709114, + "grad_norm": 1.9679243564605713, + "learning_rate": 1.904299416443967e-05, + "loss": 0.6334, + "step": 14899 + }, + { + "epoch": 2.432308885351618, + "grad_norm": 2.1388704776763916, + "learning_rate": 1.9042858690991574e-05, + "loss": 0.6447, + "step": 14900 + }, + { + "epoch": 2.4324721439941226, + "grad_norm": 1.7700965404510498, + "learning_rate": 1.9042723208437318e-05, + "loss": 0.6459, + "step": 14901 + }, + { + "epoch": 2.432635402636627, + "grad_norm": 1.8787976503372192, + "learning_rate": 1.9042587716777048e-05, + "loss": 0.5728, + "step": 14902 + }, + { + "epoch": 2.4327986612791315, + "grad_norm": 2.136075496673584, + "learning_rate": 1.9042452216010893e-05, + "loss": 0.627, + "step": 14903 + }, + { + "epoch": 2.432961919921636, + "grad_norm": 1.864426851272583, + "learning_rate": 1.9042316706138987e-05, + "loss": 0.6036, + "step": 14904 + }, + { + "epoch": 2.4331251785641403, + "grad_norm": 1.6448875665664673, + "learning_rate": 1.9042181187161474e-05, + "loss": 0.5031, + "step": 14905 + }, + { + "epoch": 2.433288437206645, + "grad_norm": 1.9625319242477417, + "learning_rate": 1.904204565907849e-05, + "loss": 0.5304, + "step": 14906 + }, + { + "epoch": 2.4334516958491488, + "grad_norm": 2.187103271484375, + "learning_rate": 1.904191012189016e-05, + "loss": 0.8059, + "step": 14907 + }, + { + "epoch": 2.433614954491653, + "grad_norm": 1.5603947639465332, + "learning_rate": 1.9041774575596635e-05, + "loss": 0.4557, + "step": 14908 + }, + { + "epoch": 2.4337782131341577, + "grad_norm": 2.010899782180786, + "learning_rate": 1.9041639020198044e-05, + "loss": 0.5878, + "step": 14909 + }, + { + "epoch": 2.433941471776662, + "grad_norm": 2.4543817043304443, + "learning_rate": 1.9041503455694522e-05, + "loss": 0.6061, + "step": 14910 + }, + { + "epoch": 2.4341047304191665, + "grad_norm": 1.7556078433990479, + "learning_rate": 1.9041367882086213e-05, + "loss": 0.6362, + "step": 14911 + }, + { + "epoch": 2.434267989061671, + "grad_norm": 1.9327290058135986, + "learning_rate": 1.9041232299373245e-05, + "loss": 0.6683, + "step": 14912 + }, + { + "epoch": 2.4344312477041754, + "grad_norm": 1.7654426097869873, + "learning_rate": 1.9041096707555764e-05, + "loss": 0.6395, + "step": 14913 + }, + { + "epoch": 2.43459450634668, + "grad_norm": 1.897874355316162, + "learning_rate": 1.90409611066339e-05, + "loss": 0.6443, + "step": 14914 + }, + { + "epoch": 2.4347577649891843, + "grad_norm": 1.5646510124206543, + "learning_rate": 1.9040825496607788e-05, + "loss": 0.5478, + "step": 14915 + }, + { + "epoch": 2.4349210236316887, + "grad_norm": 1.668532371520996, + "learning_rate": 1.9040689877477567e-05, + "loss": 0.5525, + "step": 14916 + }, + { + "epoch": 2.4350842822741927, + "grad_norm": 1.7304482460021973, + "learning_rate": 1.904055424924337e-05, + "loss": 0.5968, + "step": 14917 + }, + { + "epoch": 2.435247540916697, + "grad_norm": 1.4615495204925537, + "learning_rate": 1.9040418611905343e-05, + "loss": 0.4702, + "step": 14918 + }, + { + "epoch": 2.4354107995592016, + "grad_norm": 1.5160919427871704, + "learning_rate": 1.9040282965463615e-05, + "loss": 0.4733, + "step": 14919 + }, + { + "epoch": 2.435574058201706, + "grad_norm": 1.9439952373504639, + "learning_rate": 1.9040147309918326e-05, + "loss": 0.5766, + "step": 14920 + }, + { + "epoch": 2.4357373168442105, + "grad_norm": 1.8062443733215332, + "learning_rate": 1.904001164526961e-05, + "loss": 0.5706, + "step": 14921 + }, + { + "epoch": 2.435900575486715, + "grad_norm": 2.034292697906494, + "learning_rate": 1.9039875971517604e-05, + "loss": 0.6815, + "step": 14922 + }, + { + "epoch": 2.4360638341292193, + "grad_norm": 1.5407143831253052, + "learning_rate": 1.9039740288662445e-05, + "loss": 0.4941, + "step": 14923 + }, + { + "epoch": 2.436227092771724, + "grad_norm": 1.4836732149124146, + "learning_rate": 1.903960459670427e-05, + "loss": 0.5115, + "step": 14924 + }, + { + "epoch": 2.4363903514142278, + "grad_norm": 1.9302396774291992, + "learning_rate": 1.9039468895643215e-05, + "loss": 0.7025, + "step": 14925 + }, + { + "epoch": 2.436553610056732, + "grad_norm": 1.770150065422058, + "learning_rate": 1.903933318547942e-05, + "loss": 0.5931, + "step": 14926 + }, + { + "epoch": 2.4367168686992366, + "grad_norm": 1.4910515546798706, + "learning_rate": 1.9039197466213017e-05, + "loss": 0.4523, + "step": 14927 + }, + { + "epoch": 2.436880127341741, + "grad_norm": 1.749051809310913, + "learning_rate": 1.9039061737844145e-05, + "loss": 0.4911, + "step": 14928 + }, + { + "epoch": 2.4370433859842455, + "grad_norm": 1.8939744234085083, + "learning_rate": 1.9038926000372938e-05, + "loss": 0.5764, + "step": 14929 + }, + { + "epoch": 2.43720664462675, + "grad_norm": 1.5359153747558594, + "learning_rate": 1.903879025379954e-05, + "loss": 0.5389, + "step": 14930 + }, + { + "epoch": 2.4373699032692544, + "grad_norm": 1.6750926971435547, + "learning_rate": 1.903865449812408e-05, + "loss": 0.5044, + "step": 14931 + }, + { + "epoch": 2.437533161911759, + "grad_norm": 1.5458006858825684, + "learning_rate": 1.9038518733346696e-05, + "loss": 0.4845, + "step": 14932 + }, + { + "epoch": 2.4376964205542633, + "grad_norm": 1.6655356884002686, + "learning_rate": 1.9038382959467526e-05, + "loss": 0.4993, + "step": 14933 + }, + { + "epoch": 2.4378596791967677, + "grad_norm": 1.591012716293335, + "learning_rate": 1.9038247176486706e-05, + "loss": 0.5029, + "step": 14934 + }, + { + "epoch": 2.4380229378392717, + "grad_norm": 1.887017846107483, + "learning_rate": 1.9038111384404375e-05, + "loss": 0.5149, + "step": 14935 + }, + { + "epoch": 2.438186196481776, + "grad_norm": 1.9391728639602661, + "learning_rate": 1.9037975583220668e-05, + "loss": 0.6618, + "step": 14936 + }, + { + "epoch": 2.4383494551242806, + "grad_norm": 1.9103147983551025, + "learning_rate": 1.903783977293572e-05, + "loss": 0.5757, + "step": 14937 + }, + { + "epoch": 2.438512713766785, + "grad_norm": 1.8828321695327759, + "learning_rate": 1.9037703953549675e-05, + "loss": 0.6099, + "step": 14938 + }, + { + "epoch": 2.4386759724092895, + "grad_norm": 2.043605089187622, + "learning_rate": 1.903756812506266e-05, + "loss": 0.6691, + "step": 14939 + }, + { + "epoch": 2.438839231051794, + "grad_norm": 2.107741355895996, + "learning_rate": 1.903743228747482e-05, + "loss": 0.5656, + "step": 14940 + }, + { + "epoch": 2.4390024896942983, + "grad_norm": 1.573472499847412, + "learning_rate": 1.9037296440786287e-05, + "loss": 0.5112, + "step": 14941 + }, + { + "epoch": 2.4391657483368023, + "grad_norm": 1.7890576124191284, + "learning_rate": 1.9037160584997195e-05, + "loss": 0.5929, + "step": 14942 + }, + { + "epoch": 2.4393290069793068, + "grad_norm": 1.8422499895095825, + "learning_rate": 1.903702472010769e-05, + "loss": 0.6321, + "step": 14943 + }, + { + "epoch": 2.439492265621811, + "grad_norm": 1.359668254852295, + "learning_rate": 1.90368888461179e-05, + "loss": 0.4669, + "step": 14944 + }, + { + "epoch": 2.4396555242643156, + "grad_norm": 1.8932963609695435, + "learning_rate": 1.903675296302797e-05, + "loss": 0.6175, + "step": 14945 + }, + { + "epoch": 2.43981878290682, + "grad_norm": 1.9303944110870361, + "learning_rate": 1.9036617070838026e-05, + "loss": 0.5268, + "step": 14946 + }, + { + "epoch": 2.4399820415493245, + "grad_norm": 1.621153712272644, + "learning_rate": 1.9036481169548215e-05, + "loss": 0.4906, + "step": 14947 + }, + { + "epoch": 2.440145300191829, + "grad_norm": 1.9538962841033936, + "learning_rate": 1.9036345259158667e-05, + "loss": 0.6298, + "step": 14948 + }, + { + "epoch": 2.4403085588343334, + "grad_norm": 1.7463122606277466, + "learning_rate": 1.9036209339669523e-05, + "loss": 0.6015, + "step": 14949 + }, + { + "epoch": 2.440471817476838, + "grad_norm": 1.76893949508667, + "learning_rate": 1.9036073411080917e-05, + "loss": 0.5653, + "step": 14950 + }, + { + "epoch": 2.4406350761193423, + "grad_norm": 1.826669692993164, + "learning_rate": 1.9035937473392992e-05, + "loss": 0.5307, + "step": 14951 + }, + { + "epoch": 2.4407983347618463, + "grad_norm": 1.7388789653778076, + "learning_rate": 1.9035801526605876e-05, + "loss": 0.5117, + "step": 14952 + }, + { + "epoch": 2.4409615934043507, + "grad_norm": 1.8112494945526123, + "learning_rate": 1.9035665570719713e-05, + "loss": 0.5061, + "step": 14953 + }, + { + "epoch": 2.441124852046855, + "grad_norm": 1.9270402193069458, + "learning_rate": 1.9035529605734637e-05, + "loss": 0.615, + "step": 14954 + }, + { + "epoch": 2.4412881106893596, + "grad_norm": 2.0388882160186768, + "learning_rate": 1.9035393631650783e-05, + "loss": 0.5641, + "step": 14955 + }, + { + "epoch": 2.441451369331864, + "grad_norm": 1.9731343984603882, + "learning_rate": 1.903525764846829e-05, + "loss": 0.6547, + "step": 14956 + }, + { + "epoch": 2.4416146279743685, + "grad_norm": 1.829236626625061, + "learning_rate": 1.9035121656187297e-05, + "loss": 0.5669, + "step": 14957 + }, + { + "epoch": 2.441777886616873, + "grad_norm": 1.9590879678726196, + "learning_rate": 1.903498565480794e-05, + "loss": 0.6146, + "step": 14958 + }, + { + "epoch": 2.4419411452593773, + "grad_norm": 1.857578992843628, + "learning_rate": 1.903484964433035e-05, + "loss": 0.5316, + "step": 14959 + }, + { + "epoch": 2.4421044039018813, + "grad_norm": 2.309530735015869, + "learning_rate": 1.903471362475467e-05, + "loss": 0.6668, + "step": 14960 + }, + { + "epoch": 2.4422676625443858, + "grad_norm": 1.5466243028640747, + "learning_rate": 1.903457759608104e-05, + "loss": 0.4598, + "step": 14961 + }, + { + "epoch": 2.44243092118689, + "grad_norm": 1.8385168313980103, + "learning_rate": 1.9034441558309588e-05, + "loss": 0.5117, + "step": 14962 + }, + { + "epoch": 2.4425941798293946, + "grad_norm": 1.7189327478408813, + "learning_rate": 1.9034305511440457e-05, + "loss": 0.5935, + "step": 14963 + }, + { + "epoch": 2.442757438471899, + "grad_norm": 1.6266891956329346, + "learning_rate": 1.9034169455473787e-05, + "loss": 0.5592, + "step": 14964 + }, + { + "epoch": 2.4429206971144035, + "grad_norm": 1.8783230781555176, + "learning_rate": 1.9034033390409704e-05, + "loss": 0.5342, + "step": 14965 + }, + { + "epoch": 2.443083955756908, + "grad_norm": 1.6742109060287476, + "learning_rate": 1.9033897316248356e-05, + "loss": 0.5205, + "step": 14966 + }, + { + "epoch": 2.4432472143994124, + "grad_norm": 2.421536922454834, + "learning_rate": 1.9033761232989877e-05, + "loss": 0.607, + "step": 14967 + }, + { + "epoch": 2.443410473041917, + "grad_norm": 1.8752896785736084, + "learning_rate": 1.90336251406344e-05, + "loss": 0.5878, + "step": 14968 + }, + { + "epoch": 2.4435737316844213, + "grad_norm": 1.5289314985275269, + "learning_rate": 1.9033489039182063e-05, + "loss": 0.4939, + "step": 14969 + }, + { + "epoch": 2.4437369903269253, + "grad_norm": 1.9914402961730957, + "learning_rate": 1.903335292863301e-05, + "loss": 0.5923, + "step": 14970 + }, + { + "epoch": 2.4439002489694297, + "grad_norm": 1.845375657081604, + "learning_rate": 1.903321680898737e-05, + "loss": 0.5593, + "step": 14971 + }, + { + "epoch": 2.444063507611934, + "grad_norm": 1.656806230545044, + "learning_rate": 1.9033080680245283e-05, + "loss": 0.5411, + "step": 14972 + }, + { + "epoch": 2.4442267662544386, + "grad_norm": 1.857143521308899, + "learning_rate": 1.9032944542406884e-05, + "loss": 0.5469, + "step": 14973 + }, + { + "epoch": 2.444390024896943, + "grad_norm": 1.881013035774231, + "learning_rate": 1.903280839547232e-05, + "loss": 0.5027, + "step": 14974 + }, + { + "epoch": 2.4445532835394475, + "grad_norm": 1.9495882987976074, + "learning_rate": 1.9032672239441715e-05, + "loss": 0.6646, + "step": 14975 + }, + { + "epoch": 2.444716542181952, + "grad_norm": 1.4398552179336548, + "learning_rate": 1.903253607431521e-05, + "loss": 0.4165, + "step": 14976 + }, + { + "epoch": 2.4448798008244563, + "grad_norm": 1.7296417951583862, + "learning_rate": 1.9032399900092946e-05, + "loss": 0.6142, + "step": 14977 + }, + { + "epoch": 2.4450430594669603, + "grad_norm": 1.6350985765457153, + "learning_rate": 1.9032263716775058e-05, + "loss": 0.5574, + "step": 14978 + }, + { + "epoch": 2.4452063181094648, + "grad_norm": 1.6692779064178467, + "learning_rate": 1.903212752436168e-05, + "loss": 0.4892, + "step": 14979 + }, + { + "epoch": 2.445369576751969, + "grad_norm": 2.1317837238311768, + "learning_rate": 1.9031991322852956e-05, + "loss": 0.5617, + "step": 14980 + }, + { + "epoch": 2.4455328353944736, + "grad_norm": 1.774700403213501, + "learning_rate": 1.9031855112249016e-05, + "loss": 0.5384, + "step": 14981 + }, + { + "epoch": 2.445696094036978, + "grad_norm": 1.7266255617141724, + "learning_rate": 1.9031718892550003e-05, + "loss": 0.5124, + "step": 14982 + }, + { + "epoch": 2.4458593526794825, + "grad_norm": 2.062485933303833, + "learning_rate": 1.9031582663756048e-05, + "loss": 0.6594, + "step": 14983 + }, + { + "epoch": 2.446022611321987, + "grad_norm": 2.1809816360473633, + "learning_rate": 1.9031446425867296e-05, + "loss": 0.7766, + "step": 14984 + }, + { + "epoch": 2.4461858699644914, + "grad_norm": 1.9755696058273315, + "learning_rate": 1.9031310178883874e-05, + "loss": 0.5768, + "step": 14985 + }, + { + "epoch": 2.446349128606996, + "grad_norm": 1.7174519300460815, + "learning_rate": 1.903117392280593e-05, + "loss": 0.527, + "step": 14986 + }, + { + "epoch": 2.4465123872495003, + "grad_norm": 1.7275046110153198, + "learning_rate": 1.9031037657633594e-05, + "loss": 0.4424, + "step": 14987 + }, + { + "epoch": 2.4466756458920043, + "grad_norm": 1.404295563697815, + "learning_rate": 1.9030901383367007e-05, + "loss": 0.4538, + "step": 14988 + }, + { + "epoch": 2.4468389045345087, + "grad_norm": 1.7492846250534058, + "learning_rate": 1.9030765100006302e-05, + "loss": 0.6055, + "step": 14989 + }, + { + "epoch": 2.447002163177013, + "grad_norm": 1.8098983764648438, + "learning_rate": 1.903062880755162e-05, + "loss": 0.6113, + "step": 14990 + }, + { + "epoch": 2.4471654218195176, + "grad_norm": 2.1297707557678223, + "learning_rate": 1.90304925060031e-05, + "loss": 0.6535, + "step": 14991 + }, + { + "epoch": 2.447328680462022, + "grad_norm": 2.0867292881011963, + "learning_rate": 1.9030356195360875e-05, + "loss": 0.6314, + "step": 14992 + }, + { + "epoch": 2.4474919391045264, + "grad_norm": 1.5578144788742065, + "learning_rate": 1.903021987562508e-05, + "loss": 0.5092, + "step": 14993 + }, + { + "epoch": 2.447655197747031, + "grad_norm": 1.9064526557922363, + "learning_rate": 1.903008354679586e-05, + "loss": 0.6382, + "step": 14994 + }, + { + "epoch": 2.447818456389535, + "grad_norm": 2.0602874755859375, + "learning_rate": 1.902994720887335e-05, + "loss": 0.6535, + "step": 14995 + }, + { + "epoch": 2.4479817150320393, + "grad_norm": 1.568360686302185, + "learning_rate": 1.902981086185768e-05, + "loss": 0.6401, + "step": 14996 + }, + { + "epoch": 2.4481449736745438, + "grad_norm": 1.6843199729919434, + "learning_rate": 1.9029674505748998e-05, + "loss": 0.5393, + "step": 14997 + }, + { + "epoch": 2.448308232317048, + "grad_norm": 1.482163667678833, + "learning_rate": 1.9029538140547434e-05, + "loss": 0.4759, + "step": 14998 + }, + { + "epoch": 2.4484714909595526, + "grad_norm": 1.6809319257736206, + "learning_rate": 1.9029401766253127e-05, + "loss": 0.492, + "step": 14999 + }, + { + "epoch": 2.448634749602057, + "grad_norm": 1.6218905448913574, + "learning_rate": 1.9029265382866216e-05, + "loss": 0.4917, + "step": 15000 + }, + { + "epoch": 2.4487980082445615, + "grad_norm": 1.687637209892273, + "learning_rate": 1.9029128990386833e-05, + "loss": 0.5065, + "step": 15001 + }, + { + "epoch": 2.448961266887066, + "grad_norm": 2.033961057662964, + "learning_rate": 1.9028992588815124e-05, + "loss": 0.6625, + "step": 15002 + }, + { + "epoch": 2.4491245255295704, + "grad_norm": 1.5695745944976807, + "learning_rate": 1.9028856178151222e-05, + "loss": 0.4937, + "step": 15003 + }, + { + "epoch": 2.449287784172075, + "grad_norm": 1.7977287769317627, + "learning_rate": 1.902871975839526e-05, + "loss": 0.5313, + "step": 15004 + }, + { + "epoch": 2.449451042814579, + "grad_norm": 1.871834397315979, + "learning_rate": 1.9028583329547383e-05, + "loss": 0.5812, + "step": 15005 + }, + { + "epoch": 2.4496143014570833, + "grad_norm": 1.6361526250839233, + "learning_rate": 1.9028446891607726e-05, + "loss": 0.591, + "step": 15006 + }, + { + "epoch": 2.4497775600995877, + "grad_norm": 1.6358771324157715, + "learning_rate": 1.9028310444576423e-05, + "loss": 0.5985, + "step": 15007 + }, + { + "epoch": 2.449940818742092, + "grad_norm": 1.9050759077072144, + "learning_rate": 1.9028173988453617e-05, + "loss": 0.6309, + "step": 15008 + }, + { + "epoch": 2.4501040773845966, + "grad_norm": 1.615654706954956, + "learning_rate": 1.9028037523239437e-05, + "loss": 0.5583, + "step": 15009 + }, + { + "epoch": 2.450267336027101, + "grad_norm": 1.4890351295471191, + "learning_rate": 1.902790104893403e-05, + "loss": 0.5137, + "step": 15010 + }, + { + "epoch": 2.4504305946696054, + "grad_norm": 2.1243813037872314, + "learning_rate": 1.9027764565537525e-05, + "loss": 0.6493, + "step": 15011 + }, + { + "epoch": 2.45059385331211, + "grad_norm": 1.8206160068511963, + "learning_rate": 1.9027628073050067e-05, + "loss": 0.6266, + "step": 15012 + }, + { + "epoch": 2.450757111954614, + "grad_norm": 1.9584447145462036, + "learning_rate": 1.902749157147179e-05, + "loss": 0.5887, + "step": 15013 + }, + { + "epoch": 2.4509203705971183, + "grad_norm": 1.7082204818725586, + "learning_rate": 1.902735506080283e-05, + "loss": 0.5551, + "step": 15014 + }, + { + "epoch": 2.4510836292396228, + "grad_norm": 1.491708517074585, + "learning_rate": 1.9027218541043327e-05, + "loss": 0.4827, + "step": 15015 + }, + { + "epoch": 2.451246887882127, + "grad_norm": 1.8928724527359009, + "learning_rate": 1.9027082012193416e-05, + "loss": 0.5443, + "step": 15016 + }, + { + "epoch": 2.4514101465246316, + "grad_norm": 1.6438833475112915, + "learning_rate": 1.9026945474253234e-05, + "loss": 0.6198, + "step": 15017 + }, + { + "epoch": 2.451573405167136, + "grad_norm": 1.7827366590499878, + "learning_rate": 1.9026808927222923e-05, + "loss": 0.6596, + "step": 15018 + }, + { + "epoch": 2.4517366638096405, + "grad_norm": 1.6143018007278442, + "learning_rate": 1.902667237110262e-05, + "loss": 0.5806, + "step": 15019 + }, + { + "epoch": 2.451899922452145, + "grad_norm": 1.8547422885894775, + "learning_rate": 1.9026535805892456e-05, + "loss": 0.5297, + "step": 15020 + }, + { + "epoch": 2.4520631810946494, + "grad_norm": 1.5370044708251953, + "learning_rate": 1.9026399231592572e-05, + "loss": 0.5152, + "step": 15021 + }, + { + "epoch": 2.452226439737154, + "grad_norm": 1.8321579694747925, + "learning_rate": 1.902626264820311e-05, + "loss": 0.5173, + "step": 15022 + }, + { + "epoch": 2.452389698379658, + "grad_norm": 1.5255614519119263, + "learning_rate": 1.9026126055724202e-05, + "loss": 0.5068, + "step": 15023 + }, + { + "epoch": 2.4525529570221622, + "grad_norm": 1.5533872842788696, + "learning_rate": 1.902598945415599e-05, + "loss": 0.5514, + "step": 15024 + }, + { + "epoch": 2.4527162156646667, + "grad_norm": 1.9138580560684204, + "learning_rate": 1.902585284349861e-05, + "loss": 0.6053, + "step": 15025 + }, + { + "epoch": 2.452879474307171, + "grad_norm": 2.018965244293213, + "learning_rate": 1.9025716223752193e-05, + "loss": 0.6203, + "step": 15026 + }, + { + "epoch": 2.4530427329496756, + "grad_norm": 1.786868929862976, + "learning_rate": 1.9025579594916886e-05, + "loss": 0.5624, + "step": 15027 + }, + { + "epoch": 2.45320599159218, + "grad_norm": 1.641600489616394, + "learning_rate": 1.902544295699282e-05, + "loss": 0.5317, + "step": 15028 + }, + { + "epoch": 2.4533692502346844, + "grad_norm": 1.9949982166290283, + "learning_rate": 1.902530630998014e-05, + "loss": 0.5588, + "step": 15029 + }, + { + "epoch": 2.453532508877189, + "grad_norm": 1.7500920295715332, + "learning_rate": 1.9025169653878973e-05, + "loss": 0.6008, + "step": 15030 + }, + { + "epoch": 2.453695767519693, + "grad_norm": 2.1440834999084473, + "learning_rate": 1.9025032988689466e-05, + "loss": 0.6528, + "step": 15031 + }, + { + "epoch": 2.4538590261621973, + "grad_norm": 1.8911616802215576, + "learning_rate": 1.9024896314411753e-05, + "loss": 0.5395, + "step": 15032 + }, + { + "epoch": 2.4540222848047017, + "grad_norm": 1.6172834634780884, + "learning_rate": 1.9024759631045972e-05, + "loss": 0.5412, + "step": 15033 + }, + { + "epoch": 2.454185543447206, + "grad_norm": 1.9009448289871216, + "learning_rate": 1.902462293859226e-05, + "loss": 0.5641, + "step": 15034 + }, + { + "epoch": 2.4543488020897106, + "grad_norm": 1.596114993095398, + "learning_rate": 1.9024486237050755e-05, + "loss": 0.4583, + "step": 15035 + }, + { + "epoch": 2.454512060732215, + "grad_norm": 1.8259263038635254, + "learning_rate": 1.9024349526421596e-05, + "loss": 0.587, + "step": 15036 + }, + { + "epoch": 2.4546753193747195, + "grad_norm": 2.1422815322875977, + "learning_rate": 1.902421280670492e-05, + "loss": 0.5244, + "step": 15037 + }, + { + "epoch": 2.454838578017224, + "grad_norm": 1.8045785427093506, + "learning_rate": 1.902407607790086e-05, + "loss": 0.5979, + "step": 15038 + }, + { + "epoch": 2.4550018366597284, + "grad_norm": 1.9214386940002441, + "learning_rate": 1.9023939340009558e-05, + "loss": 0.5193, + "step": 15039 + }, + { + "epoch": 2.4551650953022324, + "grad_norm": 1.7562785148620605, + "learning_rate": 1.9023802593031156e-05, + "loss": 0.5341, + "step": 15040 + }, + { + "epoch": 2.455328353944737, + "grad_norm": 1.5503790378570557, + "learning_rate": 1.9023665836965784e-05, + "loss": 0.5894, + "step": 15041 + }, + { + "epoch": 2.4554916125872412, + "grad_norm": 2.0818519592285156, + "learning_rate": 1.9023529071813582e-05, + "loss": 0.53, + "step": 15042 + }, + { + "epoch": 2.4556548712297457, + "grad_norm": 2.168790578842163, + "learning_rate": 1.902339229757469e-05, + "loss": 0.6292, + "step": 15043 + }, + { + "epoch": 2.45581812987225, + "grad_norm": 1.5458272695541382, + "learning_rate": 1.902325551424925e-05, + "loss": 0.4547, + "step": 15044 + }, + { + "epoch": 2.4559813885147546, + "grad_norm": 2.122361183166504, + "learning_rate": 1.9023118721837385e-05, + "loss": 0.6254, + "step": 15045 + }, + { + "epoch": 2.456144647157259, + "grad_norm": 1.8119949102401733, + "learning_rate": 1.9022981920339246e-05, + "loss": 0.5938, + "step": 15046 + }, + { + "epoch": 2.4563079057997634, + "grad_norm": 1.815977692604065, + "learning_rate": 1.9022845109754965e-05, + "loss": 0.4857, + "step": 15047 + }, + { + "epoch": 2.4564711644422674, + "grad_norm": 1.611280083656311, + "learning_rate": 1.9022708290084683e-05, + "loss": 0.4708, + "step": 15048 + }, + { + "epoch": 2.456634423084772, + "grad_norm": 2.2484607696533203, + "learning_rate": 1.9022571461328536e-05, + "loss": 0.6244, + "step": 15049 + }, + { + "epoch": 2.4567976817272763, + "grad_norm": 1.7807224988937378, + "learning_rate": 1.902243462348666e-05, + "loss": 0.5949, + "step": 15050 + }, + { + "epoch": 2.4569609403697807, + "grad_norm": 1.8739080429077148, + "learning_rate": 1.90222977765592e-05, + "loss": 0.5853, + "step": 15051 + }, + { + "epoch": 2.457124199012285, + "grad_norm": 1.6931546926498413, + "learning_rate": 1.9022160920546282e-05, + "loss": 0.609, + "step": 15052 + }, + { + "epoch": 2.4572874576547896, + "grad_norm": 1.7945717573165894, + "learning_rate": 1.9022024055448055e-05, + "loss": 0.6308, + "step": 15053 + }, + { + "epoch": 2.457450716297294, + "grad_norm": 1.745080828666687, + "learning_rate": 1.902188718126465e-05, + "loss": 0.5564, + "step": 15054 + }, + { + "epoch": 2.4576139749397985, + "grad_norm": 1.9419431686401367, + "learning_rate": 1.9021750297996207e-05, + "loss": 0.6249, + "step": 15055 + }, + { + "epoch": 2.457777233582303, + "grad_norm": 1.8568979501724243, + "learning_rate": 1.9021613405642865e-05, + "loss": 0.6111, + "step": 15056 + }, + { + "epoch": 2.4579404922248074, + "grad_norm": 1.5583370923995972, + "learning_rate": 1.902147650420476e-05, + "loss": 0.4446, + "step": 15057 + }, + { + "epoch": 2.4581037508673114, + "grad_norm": 1.7760329246520996, + "learning_rate": 1.902133959368203e-05, + "loss": 0.5152, + "step": 15058 + }, + { + "epoch": 2.458267009509816, + "grad_norm": 2.060270071029663, + "learning_rate": 1.9021202674074812e-05, + "loss": 0.6859, + "step": 15059 + }, + { + "epoch": 2.4584302681523202, + "grad_norm": 2.0132858753204346, + "learning_rate": 1.9021065745383248e-05, + "loss": 0.6735, + "step": 15060 + }, + { + "epoch": 2.4585935267948247, + "grad_norm": 1.5735855102539062, + "learning_rate": 1.9020928807607473e-05, + "loss": 0.5342, + "step": 15061 + }, + { + "epoch": 2.458756785437329, + "grad_norm": 1.746121883392334, + "learning_rate": 1.9020791860747625e-05, + "loss": 0.4461, + "step": 15062 + }, + { + "epoch": 2.4589200440798336, + "grad_norm": 2.1104133129119873, + "learning_rate": 1.902065490480384e-05, + "loss": 0.6795, + "step": 15063 + }, + { + "epoch": 2.459083302722338, + "grad_norm": 2.395312547683716, + "learning_rate": 1.902051793977626e-05, + "loss": 0.7587, + "step": 15064 + }, + { + "epoch": 2.4592465613648424, + "grad_norm": 1.2433826923370361, + "learning_rate": 1.902038096566502e-05, + "loss": 0.4197, + "step": 15065 + }, + { + "epoch": 2.4594098200073464, + "grad_norm": 2.1077568531036377, + "learning_rate": 1.9020243982470262e-05, + "loss": 0.6682, + "step": 15066 + }, + { + "epoch": 2.459573078649851, + "grad_norm": 1.9799845218658447, + "learning_rate": 1.9020106990192114e-05, + "loss": 0.6173, + "step": 15067 + }, + { + "epoch": 2.4597363372923553, + "grad_norm": 1.9480339288711548, + "learning_rate": 1.9019969988830728e-05, + "loss": 0.5817, + "step": 15068 + }, + { + "epoch": 2.4598995959348597, + "grad_norm": 2.01764178276062, + "learning_rate": 1.9019832978386227e-05, + "loss": 0.6017, + "step": 15069 + }, + { + "epoch": 2.460062854577364, + "grad_norm": 1.6369704008102417, + "learning_rate": 1.9019695958858762e-05, + "loss": 0.5295, + "step": 15070 + }, + { + "epoch": 2.4602261132198686, + "grad_norm": 1.9070568084716797, + "learning_rate": 1.9019558930248464e-05, + "loss": 0.4758, + "step": 15071 + }, + { + "epoch": 2.460389371862373, + "grad_norm": 1.7836257219314575, + "learning_rate": 1.9019421892555473e-05, + "loss": 0.5111, + "step": 15072 + }, + { + "epoch": 2.4605526305048775, + "grad_norm": 1.7310724258422852, + "learning_rate": 1.9019284845779927e-05, + "loss": 0.5942, + "step": 15073 + }, + { + "epoch": 2.460715889147382, + "grad_norm": 2.0403265953063965, + "learning_rate": 1.9019147789921965e-05, + "loss": 0.6915, + "step": 15074 + }, + { + "epoch": 2.4608791477898864, + "grad_norm": 2.352276086807251, + "learning_rate": 1.9019010724981716e-05, + "loss": 0.737, + "step": 15075 + }, + { + "epoch": 2.4610424064323904, + "grad_norm": 1.679337978363037, + "learning_rate": 1.9018873650959333e-05, + "loss": 0.4336, + "step": 15076 + }, + { + "epoch": 2.461205665074895, + "grad_norm": 2.0560288429260254, + "learning_rate": 1.9018736567854943e-05, + "loss": 0.7122, + "step": 15077 + }, + { + "epoch": 2.4613689237173992, + "grad_norm": 1.8166484832763672, + "learning_rate": 1.901859947566869e-05, + "loss": 0.6747, + "step": 15078 + }, + { + "epoch": 2.4615321823599037, + "grad_norm": 1.947636604309082, + "learning_rate": 1.901846237440071e-05, + "loss": 0.5478, + "step": 15079 + }, + { + "epoch": 2.461695441002408, + "grad_norm": 1.780179500579834, + "learning_rate": 1.901832526405114e-05, + "loss": 0.5968, + "step": 15080 + }, + { + "epoch": 2.4618586996449126, + "grad_norm": 1.8683199882507324, + "learning_rate": 1.901818814462012e-05, + "loss": 0.6298, + "step": 15081 + }, + { + "epoch": 2.462021958287417, + "grad_norm": 1.784684658050537, + "learning_rate": 1.9018051016107784e-05, + "loss": 0.6407, + "step": 15082 + }, + { + "epoch": 2.462185216929921, + "grad_norm": 1.8813763856887817, + "learning_rate": 1.9017913878514274e-05, + "loss": 0.6266, + "step": 15083 + }, + { + "epoch": 2.4623484755724254, + "grad_norm": 1.8458768129348755, + "learning_rate": 1.9017776731839726e-05, + "loss": 0.6466, + "step": 15084 + }, + { + "epoch": 2.46251173421493, + "grad_norm": 2.0297114849090576, + "learning_rate": 1.901763957608428e-05, + "loss": 0.6608, + "step": 15085 + }, + { + "epoch": 2.4626749928574343, + "grad_norm": 1.710324764251709, + "learning_rate": 1.9017502411248076e-05, + "loss": 0.5125, + "step": 15086 + }, + { + "epoch": 2.4628382514999387, + "grad_norm": 1.6096725463867188, + "learning_rate": 1.9017365237331245e-05, + "loss": 0.5301, + "step": 15087 + }, + { + "epoch": 2.463001510142443, + "grad_norm": 1.76421320438385, + "learning_rate": 1.9017228054333936e-05, + "loss": 0.5355, + "step": 15088 + }, + { + "epoch": 2.4631647687849476, + "grad_norm": 1.6556580066680908, + "learning_rate": 1.9017090862256275e-05, + "loss": 0.5233, + "step": 15089 + }, + { + "epoch": 2.463328027427452, + "grad_norm": 1.7011058330535889, + "learning_rate": 1.901695366109841e-05, + "loss": 0.4933, + "step": 15090 + }, + { + "epoch": 2.4634912860699565, + "grad_norm": 1.8047064542770386, + "learning_rate": 1.9016816450860474e-05, + "loss": 0.5453, + "step": 15091 + }, + { + "epoch": 2.463654544712461, + "grad_norm": 1.6429277658462524, + "learning_rate": 1.9016679231542602e-05, + "loss": 0.5101, + "step": 15092 + }, + { + "epoch": 2.463817803354965, + "grad_norm": 1.814264178276062, + "learning_rate": 1.9016542003144943e-05, + "loss": 0.611, + "step": 15093 + }, + { + "epoch": 2.4639810619974694, + "grad_norm": 1.573014736175537, + "learning_rate": 1.9016404765667624e-05, + "loss": 0.4746, + "step": 15094 + }, + { + "epoch": 2.464144320639974, + "grad_norm": 1.8303940296173096, + "learning_rate": 1.901626751911079e-05, + "loss": 0.6112, + "step": 15095 + }, + { + "epoch": 2.4643075792824782, + "grad_norm": 1.7938103675842285, + "learning_rate": 1.9016130263474573e-05, + "loss": 0.613, + "step": 15096 + }, + { + "epoch": 2.4644708379249827, + "grad_norm": 1.808402419090271, + "learning_rate": 1.901599299875912e-05, + "loss": 0.5745, + "step": 15097 + }, + { + "epoch": 2.464634096567487, + "grad_norm": 1.6637253761291504, + "learning_rate": 1.901585572496456e-05, + "loss": 0.6278, + "step": 15098 + }, + { + "epoch": 2.4647973552099915, + "grad_norm": 1.875270128250122, + "learning_rate": 1.901571844209104e-05, + "loss": 0.5814, + "step": 15099 + }, + { + "epoch": 2.464960613852496, + "grad_norm": 1.7218828201293945, + "learning_rate": 1.9015581150138693e-05, + "loss": 0.5247, + "step": 15100 + }, + { + "epoch": 2.465123872495, + "grad_norm": 1.9707529544830322, + "learning_rate": 1.9015443849107655e-05, + "loss": 0.6169, + "step": 15101 + }, + { + "epoch": 2.4652871311375044, + "grad_norm": 1.5089385509490967, + "learning_rate": 1.901530653899807e-05, + "loss": 0.4602, + "step": 15102 + }, + { + "epoch": 2.465450389780009, + "grad_norm": 1.6014118194580078, + "learning_rate": 1.9015169219810073e-05, + "loss": 0.4803, + "step": 15103 + }, + { + "epoch": 2.4656136484225133, + "grad_norm": 1.8710399866104126, + "learning_rate": 1.9015031891543805e-05, + "loss": 0.6258, + "step": 15104 + }, + { + "epoch": 2.4657769070650177, + "grad_norm": 2.395296335220337, + "learning_rate": 1.90148945541994e-05, + "loss": 0.4889, + "step": 15105 + }, + { + "epoch": 2.465940165707522, + "grad_norm": 1.8598685264587402, + "learning_rate": 1.9014757207776998e-05, + "loss": 0.6156, + "step": 15106 + }, + { + "epoch": 2.4661034243500266, + "grad_norm": 2.0251171588897705, + "learning_rate": 1.901461985227674e-05, + "loss": 0.7113, + "step": 15107 + }, + { + "epoch": 2.466266682992531, + "grad_norm": 1.9036893844604492, + "learning_rate": 1.9014482487698762e-05, + "loss": 0.6309, + "step": 15108 + }, + { + "epoch": 2.4664299416350355, + "grad_norm": 1.8295382261276245, + "learning_rate": 1.9014345114043203e-05, + "loss": 0.6087, + "step": 15109 + }, + { + "epoch": 2.46659320027754, + "grad_norm": 1.6802847385406494, + "learning_rate": 1.9014207731310202e-05, + "loss": 0.5688, + "step": 15110 + }, + { + "epoch": 2.466756458920044, + "grad_norm": 1.7458312511444092, + "learning_rate": 1.901407033949989e-05, + "loss": 0.4958, + "step": 15111 + }, + { + "epoch": 2.4669197175625484, + "grad_norm": 2.0647404193878174, + "learning_rate": 1.9013932938612417e-05, + "loss": 0.6858, + "step": 15112 + }, + { + "epoch": 2.467082976205053, + "grad_norm": 1.7774913311004639, + "learning_rate": 1.9013795528647913e-05, + "loss": 0.6144, + "step": 15113 + }, + { + "epoch": 2.4672462348475572, + "grad_norm": 1.5004442930221558, + "learning_rate": 1.9013658109606523e-05, + "loss": 0.4796, + "step": 15114 + }, + { + "epoch": 2.4674094934900617, + "grad_norm": 1.7838020324707031, + "learning_rate": 1.901352068148838e-05, + "loss": 0.5532, + "step": 15115 + }, + { + "epoch": 2.467572752132566, + "grad_norm": 1.670720100402832, + "learning_rate": 1.9013383244293623e-05, + "loss": 0.5051, + "step": 15116 + }, + { + "epoch": 2.4677360107750705, + "grad_norm": 1.9448769092559814, + "learning_rate": 1.901324579802239e-05, + "loss": 0.6159, + "step": 15117 + }, + { + "epoch": 2.467899269417575, + "grad_norm": 1.898565411567688, + "learning_rate": 1.9013108342674824e-05, + "loss": 0.6134, + "step": 15118 + }, + { + "epoch": 2.468062528060079, + "grad_norm": 1.9035621881484985, + "learning_rate": 1.9012970878251062e-05, + "loss": 0.5897, + "step": 15119 + }, + { + "epoch": 2.4682257867025834, + "grad_norm": 1.769464135169983, + "learning_rate": 1.9012833404751237e-05, + "loss": 0.511, + "step": 15120 + }, + { + "epoch": 2.468389045345088, + "grad_norm": 1.6830134391784668, + "learning_rate": 1.901269592217549e-05, + "loss": 0.5636, + "step": 15121 + }, + { + "epoch": 2.4685523039875923, + "grad_norm": 1.6824496984481812, + "learning_rate": 1.9012558430523964e-05, + "loss": 0.6103, + "step": 15122 + }, + { + "epoch": 2.4687155626300967, + "grad_norm": 1.6571964025497437, + "learning_rate": 1.901242092979679e-05, + "loss": 0.4837, + "step": 15123 + }, + { + "epoch": 2.468878821272601, + "grad_norm": 1.5567094087600708, + "learning_rate": 1.9012283419994115e-05, + "loss": 0.478, + "step": 15124 + }, + { + "epoch": 2.4690420799151056, + "grad_norm": 1.8253682851791382, + "learning_rate": 1.9012145901116072e-05, + "loss": 0.5947, + "step": 15125 + }, + { + "epoch": 2.46920533855761, + "grad_norm": 1.6572198867797852, + "learning_rate": 1.9012008373162796e-05, + "loss": 0.5015, + "step": 15126 + }, + { + "epoch": 2.4693685972001145, + "grad_norm": 1.8171839714050293, + "learning_rate": 1.9011870836134437e-05, + "loss": 0.5597, + "step": 15127 + }, + { + "epoch": 2.469531855842619, + "grad_norm": 1.7802621126174927, + "learning_rate": 1.901173329003112e-05, + "loss": 0.5633, + "step": 15128 + }, + { + "epoch": 2.469695114485123, + "grad_norm": 1.8197767734527588, + "learning_rate": 1.9011595734852997e-05, + "loss": 0.5921, + "step": 15129 + }, + { + "epoch": 2.4698583731276273, + "grad_norm": 1.727607011795044, + "learning_rate": 1.9011458170600195e-05, + "loss": 0.4911, + "step": 15130 + }, + { + "epoch": 2.470021631770132, + "grad_norm": 1.8526906967163086, + "learning_rate": 1.9011320597272855e-05, + "loss": 0.6218, + "step": 15131 + }, + { + "epoch": 2.4701848904126362, + "grad_norm": 1.9520831108093262, + "learning_rate": 1.9011183014871122e-05, + "loss": 0.5625, + "step": 15132 + }, + { + "epoch": 2.4703481490551407, + "grad_norm": 1.952811598777771, + "learning_rate": 1.9011045423395126e-05, + "loss": 0.6225, + "step": 15133 + }, + { + "epoch": 2.470511407697645, + "grad_norm": 1.8954098224639893, + "learning_rate": 1.9010907822845014e-05, + "loss": 0.6284, + "step": 15134 + }, + { + "epoch": 2.4706746663401495, + "grad_norm": 2.031651258468628, + "learning_rate": 1.9010770213220916e-05, + "loss": 0.7121, + "step": 15135 + }, + { + "epoch": 2.4708379249826535, + "grad_norm": 1.8550819158554077, + "learning_rate": 1.9010632594522978e-05, + "loss": 0.5702, + "step": 15136 + }, + { + "epoch": 2.471001183625158, + "grad_norm": 1.689990758895874, + "learning_rate": 1.9010494966751334e-05, + "loss": 0.5347, + "step": 15137 + }, + { + "epoch": 2.4711644422676624, + "grad_norm": 1.8603922128677368, + "learning_rate": 1.9010357329906125e-05, + "loss": 0.5533, + "step": 15138 + }, + { + "epoch": 2.471327700910167, + "grad_norm": 1.677438497543335, + "learning_rate": 1.901021968398749e-05, + "loss": 0.5382, + "step": 15139 + }, + { + "epoch": 2.4714909595526713, + "grad_norm": 1.8486850261688232, + "learning_rate": 1.901008202899556e-05, + "loss": 0.5667, + "step": 15140 + }, + { + "epoch": 2.4716542181951757, + "grad_norm": 1.883485198020935, + "learning_rate": 1.9009944364930484e-05, + "loss": 0.6736, + "step": 15141 + }, + { + "epoch": 2.47181747683768, + "grad_norm": 1.6772841215133667, + "learning_rate": 1.90098066917924e-05, + "loss": 0.522, + "step": 15142 + }, + { + "epoch": 2.4719807354801846, + "grad_norm": 1.6935551166534424, + "learning_rate": 1.9009669009581437e-05, + "loss": 0.5793, + "step": 15143 + }, + { + "epoch": 2.472143994122689, + "grad_norm": 1.7868235111236572, + "learning_rate": 1.900953131829774e-05, + "loss": 0.4982, + "step": 15144 + }, + { + "epoch": 2.4723072527651935, + "grad_norm": 2.2121779918670654, + "learning_rate": 1.900939361794145e-05, + "loss": 0.5456, + "step": 15145 + }, + { + "epoch": 2.4724705114076975, + "grad_norm": 1.795838475227356, + "learning_rate": 1.9009255908512704e-05, + "loss": 0.7068, + "step": 15146 + }, + { + "epoch": 2.472633770050202, + "grad_norm": 1.62517511844635, + "learning_rate": 1.9009118190011638e-05, + "loss": 0.57, + "step": 15147 + }, + { + "epoch": 2.4727970286927063, + "grad_norm": 1.8500266075134277, + "learning_rate": 1.900898046243839e-05, + "loss": 0.5699, + "step": 15148 + }, + { + "epoch": 2.472960287335211, + "grad_norm": 1.863038420677185, + "learning_rate": 1.9008842725793105e-05, + "loss": 0.6256, + "step": 15149 + }, + { + "epoch": 2.473123545977715, + "grad_norm": 1.8703656196594238, + "learning_rate": 1.9008704980075915e-05, + "loss": 0.6495, + "step": 15150 + }, + { + "epoch": 2.4732868046202197, + "grad_norm": 1.809712529182434, + "learning_rate": 1.900856722528696e-05, + "loss": 0.6218, + "step": 15151 + }, + { + "epoch": 2.473450063262724, + "grad_norm": 1.9566702842712402, + "learning_rate": 1.9008429461426384e-05, + "loss": 0.56, + "step": 15152 + }, + { + "epoch": 2.4736133219052285, + "grad_norm": 1.5516674518585205, + "learning_rate": 1.9008291688494323e-05, + "loss": 0.5836, + "step": 15153 + }, + { + "epoch": 2.4737765805477325, + "grad_norm": 1.9827086925506592, + "learning_rate": 1.9008153906490913e-05, + "loss": 0.6463, + "step": 15154 + }, + { + "epoch": 2.473939839190237, + "grad_norm": 1.7164757251739502, + "learning_rate": 1.900801611541629e-05, + "loss": 0.4705, + "step": 15155 + }, + { + "epoch": 2.4741030978327414, + "grad_norm": 1.7821171283721924, + "learning_rate": 1.9007878315270604e-05, + "loss": 0.478, + "step": 15156 + }, + { + "epoch": 2.474266356475246, + "grad_norm": 2.3209540843963623, + "learning_rate": 1.9007740506053983e-05, + "loss": 0.6818, + "step": 15157 + }, + { + "epoch": 2.4744296151177503, + "grad_norm": 1.5763410329818726, + "learning_rate": 1.900760268776657e-05, + "loss": 0.5306, + "step": 15158 + }, + { + "epoch": 2.4745928737602547, + "grad_norm": 2.298036813735962, + "learning_rate": 1.9007464860408504e-05, + "loss": 0.6126, + "step": 15159 + }, + { + "epoch": 2.474756132402759, + "grad_norm": 2.6756398677825928, + "learning_rate": 1.9007327023979924e-05, + "loss": 0.6266, + "step": 15160 + }, + { + "epoch": 2.4749193910452636, + "grad_norm": 1.3603636026382446, + "learning_rate": 1.900718917848097e-05, + "loss": 0.4576, + "step": 15161 + }, + { + "epoch": 2.475082649687768, + "grad_norm": 1.667629361152649, + "learning_rate": 1.900705132391177e-05, + "loss": 0.5269, + "step": 15162 + }, + { + "epoch": 2.4752459083302725, + "grad_norm": 1.5204596519470215, + "learning_rate": 1.900691346027248e-05, + "loss": 0.475, + "step": 15163 + }, + { + "epoch": 2.4754091669727765, + "grad_norm": 1.5564417839050293, + "learning_rate": 1.900677558756323e-05, + "loss": 0.5074, + "step": 15164 + }, + { + "epoch": 2.475572425615281, + "grad_norm": 1.821802020072937, + "learning_rate": 1.9006637705784155e-05, + "loss": 0.5144, + "step": 15165 + }, + { + "epoch": 2.4757356842577853, + "grad_norm": 2.4843497276306152, + "learning_rate": 1.9006499814935405e-05, + "loss": 0.6783, + "step": 15166 + }, + { + "epoch": 2.47589894290029, + "grad_norm": 1.717063546180725, + "learning_rate": 1.9006361915017107e-05, + "loss": 0.4608, + "step": 15167 + }, + { + "epoch": 2.476062201542794, + "grad_norm": 1.9928948879241943, + "learning_rate": 1.9006224006029404e-05, + "loss": 0.662, + "step": 15168 + }, + { + "epoch": 2.4762254601852987, + "grad_norm": 2.0073435306549072, + "learning_rate": 1.9006086087972438e-05, + "loss": 0.4893, + "step": 15169 + }, + { + "epoch": 2.476388718827803, + "grad_norm": 2.0309934616088867, + "learning_rate": 1.9005948160846347e-05, + "loss": 0.5092, + "step": 15170 + }, + { + "epoch": 2.476551977470307, + "grad_norm": 1.8023977279663086, + "learning_rate": 1.9005810224651265e-05, + "loss": 0.5427, + "step": 15171 + }, + { + "epoch": 2.4767152361128115, + "grad_norm": 1.6927844285964966, + "learning_rate": 1.900567227938734e-05, + "loss": 0.486, + "step": 15172 + }, + { + "epoch": 2.476878494755316, + "grad_norm": 1.8854275941848755, + "learning_rate": 1.9005534325054703e-05, + "loss": 0.5394, + "step": 15173 + }, + { + "epoch": 2.4770417533978204, + "grad_norm": 1.5753405094146729, + "learning_rate": 1.9005396361653492e-05, + "loss": 0.5319, + "step": 15174 + }, + { + "epoch": 2.477205012040325, + "grad_norm": 1.9788743257522583, + "learning_rate": 1.900525838918385e-05, + "loss": 0.6757, + "step": 15175 + }, + { + "epoch": 2.4773682706828293, + "grad_norm": 1.8647326231002808, + "learning_rate": 1.900512040764592e-05, + "loss": 0.5689, + "step": 15176 + }, + { + "epoch": 2.4775315293253337, + "grad_norm": 1.7844104766845703, + "learning_rate": 1.9004982417039832e-05, + "loss": 0.6098, + "step": 15177 + }, + { + "epoch": 2.477694787967838, + "grad_norm": 2.419398069381714, + "learning_rate": 1.900484441736573e-05, + "loss": 0.6842, + "step": 15178 + }, + { + "epoch": 2.4778580466103426, + "grad_norm": 2.2698049545288086, + "learning_rate": 1.900470640862375e-05, + "loss": 0.6117, + "step": 15179 + }, + { + "epoch": 2.478021305252847, + "grad_norm": 1.9199270009994507, + "learning_rate": 1.900456839081404e-05, + "loss": 0.5222, + "step": 15180 + }, + { + "epoch": 2.478184563895351, + "grad_norm": 1.8950188159942627, + "learning_rate": 1.9004430363936724e-05, + "loss": 0.5419, + "step": 15181 + }, + { + "epoch": 2.4783478225378555, + "grad_norm": 1.6892343759536743, + "learning_rate": 1.9004292327991952e-05, + "loss": 0.5608, + "step": 15182 + }, + { + "epoch": 2.47851108118036, + "grad_norm": 1.599321961402893, + "learning_rate": 1.900415428297986e-05, + "loss": 0.5418, + "step": 15183 + }, + { + "epoch": 2.4786743398228643, + "grad_norm": 1.99397611618042, + "learning_rate": 1.9004016228900588e-05, + "loss": 0.6084, + "step": 15184 + }, + { + "epoch": 2.4788375984653688, + "grad_norm": 1.6892507076263428, + "learning_rate": 1.9003878165754274e-05, + "loss": 0.6074, + "step": 15185 + }, + { + "epoch": 2.479000857107873, + "grad_norm": 1.6449456214904785, + "learning_rate": 1.9003740093541055e-05, + "loss": 0.5475, + "step": 15186 + }, + { + "epoch": 2.4791641157503777, + "grad_norm": 2.0181963443756104, + "learning_rate": 1.9003602012261074e-05, + "loss": 0.6803, + "step": 15187 + }, + { + "epoch": 2.479327374392882, + "grad_norm": 1.7729406356811523, + "learning_rate": 1.900346392191447e-05, + "loss": 0.575, + "step": 15188 + }, + { + "epoch": 2.479490633035386, + "grad_norm": 2.0336103439331055, + "learning_rate": 1.900332582250138e-05, + "loss": 0.6859, + "step": 15189 + }, + { + "epoch": 2.4796538916778905, + "grad_norm": 1.8749920129776, + "learning_rate": 1.9003187714021936e-05, + "loss": 0.5831, + "step": 15190 + }, + { + "epoch": 2.479817150320395, + "grad_norm": 1.7840384244918823, + "learning_rate": 1.900304959647629e-05, + "loss": 0.5962, + "step": 15191 + }, + { + "epoch": 2.4799804089628994, + "grad_norm": 1.561258316040039, + "learning_rate": 1.9002911469864576e-05, + "loss": 0.5054, + "step": 15192 + }, + { + "epoch": 2.480143667605404, + "grad_norm": 1.976454496383667, + "learning_rate": 1.900277333418693e-05, + "loss": 0.7426, + "step": 15193 + }, + { + "epoch": 2.4803069262479083, + "grad_norm": 1.9106090068817139, + "learning_rate": 1.90026351894435e-05, + "loss": 0.6512, + "step": 15194 + }, + { + "epoch": 2.4804701848904127, + "grad_norm": 1.602236270904541, + "learning_rate": 1.900249703563441e-05, + "loss": 0.581, + "step": 15195 + }, + { + "epoch": 2.480633443532917, + "grad_norm": 1.7835651636123657, + "learning_rate": 1.9002358872759813e-05, + "loss": 0.5008, + "step": 15196 + }, + { + "epoch": 2.4807967021754216, + "grad_norm": 1.5464705228805542, + "learning_rate": 1.900222070081984e-05, + "loss": 0.4333, + "step": 15197 + }, + { + "epoch": 2.480959960817926, + "grad_norm": 2.0875637531280518, + "learning_rate": 1.9002082519814636e-05, + "loss": 0.5532, + "step": 15198 + }, + { + "epoch": 2.48112321946043, + "grad_norm": 1.9468439817428589, + "learning_rate": 1.9001944329744335e-05, + "loss": 0.5839, + "step": 15199 + }, + { + "epoch": 2.4812864781029345, + "grad_norm": 1.879669189453125, + "learning_rate": 1.900180613060908e-05, + "loss": 0.5882, + "step": 15200 + }, + { + "epoch": 2.481449736745439, + "grad_norm": 1.751906394958496, + "learning_rate": 1.9001667922409008e-05, + "loss": 0.5233, + "step": 15201 + }, + { + "epoch": 2.4816129953879433, + "grad_norm": 1.9696546792984009, + "learning_rate": 1.900152970514426e-05, + "loss": 0.5081, + "step": 15202 + }, + { + "epoch": 2.4817762540304478, + "grad_norm": 1.6874808073043823, + "learning_rate": 1.900139147881497e-05, + "loss": 0.5831, + "step": 15203 + }, + { + "epoch": 2.481939512672952, + "grad_norm": 1.6555726528167725, + "learning_rate": 1.9001253243421286e-05, + "loss": 0.537, + "step": 15204 + }, + { + "epoch": 2.4821027713154566, + "grad_norm": 1.921563982963562, + "learning_rate": 1.900111499896334e-05, + "loss": 0.5775, + "step": 15205 + }, + { + "epoch": 2.482266029957961, + "grad_norm": 1.6779253482818604, + "learning_rate": 1.9000976745441277e-05, + "loss": 0.6254, + "step": 15206 + }, + { + "epoch": 2.482429288600465, + "grad_norm": 1.539422631263733, + "learning_rate": 1.9000838482855228e-05, + "loss": 0.5699, + "step": 15207 + }, + { + "epoch": 2.4825925472429695, + "grad_norm": 1.6548807621002197, + "learning_rate": 1.9000700211205337e-05, + "loss": 0.5166, + "step": 15208 + }, + { + "epoch": 2.482755805885474, + "grad_norm": 1.6303443908691406, + "learning_rate": 1.9000561930491746e-05, + "loss": 0.5526, + "step": 15209 + }, + { + "epoch": 2.4829190645279784, + "grad_norm": 2.0284645557403564, + "learning_rate": 1.9000423640714595e-05, + "loss": 0.5512, + "step": 15210 + }, + { + "epoch": 2.483082323170483, + "grad_norm": 1.4915478229522705, + "learning_rate": 1.9000285341874014e-05, + "loss": 0.4477, + "step": 15211 + }, + { + "epoch": 2.4832455818129873, + "grad_norm": 1.590324878692627, + "learning_rate": 1.9000147033970148e-05, + "loss": 0.5298, + "step": 15212 + }, + { + "epoch": 2.4834088404554917, + "grad_norm": 1.695208191871643, + "learning_rate": 1.9000008717003137e-05, + "loss": 0.515, + "step": 15213 + }, + { + "epoch": 2.483572099097996, + "grad_norm": 1.4615570306777954, + "learning_rate": 1.899987039097312e-05, + "loss": 0.4946, + "step": 15214 + }, + { + "epoch": 2.4837353577405006, + "grad_norm": 1.7002546787261963, + "learning_rate": 1.899973205588024e-05, + "loss": 0.5536, + "step": 15215 + }, + { + "epoch": 2.483898616383005, + "grad_norm": 1.7983248233795166, + "learning_rate": 1.8999593711724626e-05, + "loss": 0.5377, + "step": 15216 + }, + { + "epoch": 2.484061875025509, + "grad_norm": 1.9844273328781128, + "learning_rate": 1.8999455358506427e-05, + "loss": 0.6496, + "step": 15217 + }, + { + "epoch": 2.4842251336680135, + "grad_norm": 1.8883588314056396, + "learning_rate": 1.899931699622578e-05, + "loss": 0.6048, + "step": 15218 + }, + { + "epoch": 2.484388392310518, + "grad_norm": 1.7291942834854126, + "learning_rate": 1.8999178624882818e-05, + "loss": 0.5524, + "step": 15219 + }, + { + "epoch": 2.4845516509530223, + "grad_norm": 1.7158102989196777, + "learning_rate": 1.899904024447769e-05, + "loss": 0.5377, + "step": 15220 + }, + { + "epoch": 2.4847149095955268, + "grad_norm": 2.1454694271087646, + "learning_rate": 1.899890185501053e-05, + "loss": 0.6982, + "step": 15221 + }, + { + "epoch": 2.484878168238031, + "grad_norm": 1.630250096321106, + "learning_rate": 1.899876345648148e-05, + "loss": 0.5719, + "step": 15222 + }, + { + "epoch": 2.4850414268805356, + "grad_norm": 1.9126834869384766, + "learning_rate": 1.8998625048890674e-05, + "loss": 0.6892, + "step": 15223 + }, + { + "epoch": 2.4852046855230396, + "grad_norm": 1.623711347579956, + "learning_rate": 1.8998486632238256e-05, + "loss": 0.5224, + "step": 15224 + }, + { + "epoch": 2.485367944165544, + "grad_norm": 1.9641931056976318, + "learning_rate": 1.8998348206524365e-05, + "loss": 0.6337, + "step": 15225 + }, + { + "epoch": 2.4855312028080485, + "grad_norm": 1.6714107990264893, + "learning_rate": 1.899820977174914e-05, + "loss": 0.4543, + "step": 15226 + }, + { + "epoch": 2.485694461450553, + "grad_norm": 2.3408260345458984, + "learning_rate": 1.899807132791272e-05, + "loss": 0.7726, + "step": 15227 + }, + { + "epoch": 2.4858577200930574, + "grad_norm": 1.7402749061584473, + "learning_rate": 1.899793287501524e-05, + "loss": 0.5057, + "step": 15228 + }, + { + "epoch": 2.486020978735562, + "grad_norm": 1.8257228136062622, + "learning_rate": 1.899779441305685e-05, + "loss": 0.5099, + "step": 15229 + }, + { + "epoch": 2.4861842373780663, + "grad_norm": 1.818774700164795, + "learning_rate": 1.899765594203768e-05, + "loss": 0.5561, + "step": 15230 + }, + { + "epoch": 2.4863474960205707, + "grad_norm": 2.010359764099121, + "learning_rate": 1.8997517461957877e-05, + "loss": 0.6246, + "step": 15231 + }, + { + "epoch": 2.486510754663075, + "grad_norm": 1.5944195985794067, + "learning_rate": 1.8997378972817572e-05, + "loss": 0.5231, + "step": 15232 + }, + { + "epoch": 2.4866740133055796, + "grad_norm": 1.987410545349121, + "learning_rate": 1.8997240474616912e-05, + "loss": 0.7832, + "step": 15233 + }, + { + "epoch": 2.4868372719480836, + "grad_norm": 1.8529878854751587, + "learning_rate": 1.899710196735603e-05, + "loss": 0.6229, + "step": 15234 + }, + { + "epoch": 2.487000530590588, + "grad_norm": 1.608444333076477, + "learning_rate": 1.899696345103507e-05, + "loss": 0.4981, + "step": 15235 + }, + { + "epoch": 2.4871637892330924, + "grad_norm": 1.7475303411483765, + "learning_rate": 1.8996824925654172e-05, + "loss": 0.5922, + "step": 15236 + }, + { + "epoch": 2.487327047875597, + "grad_norm": 1.798302173614502, + "learning_rate": 1.8996686391213472e-05, + "loss": 0.5363, + "step": 15237 + }, + { + "epoch": 2.4874903065181013, + "grad_norm": 2.215925693511963, + "learning_rate": 1.899654784771311e-05, + "loss": 0.6162, + "step": 15238 + }, + { + "epoch": 2.4876535651606058, + "grad_norm": 1.8501368761062622, + "learning_rate": 1.899640929515323e-05, + "loss": 0.5736, + "step": 15239 + }, + { + "epoch": 2.48781682380311, + "grad_norm": 1.763005018234253, + "learning_rate": 1.8996270733533965e-05, + "loss": 0.6036, + "step": 15240 + }, + { + "epoch": 2.4879800824456146, + "grad_norm": 1.5302706956863403, + "learning_rate": 1.899613216285546e-05, + "loss": 0.5074, + "step": 15241 + }, + { + "epoch": 2.4881433410881186, + "grad_norm": 1.887856364250183, + "learning_rate": 1.899599358311785e-05, + "loss": 0.615, + "step": 15242 + }, + { + "epoch": 2.488306599730623, + "grad_norm": 2.036198616027832, + "learning_rate": 1.899585499432128e-05, + "loss": 0.631, + "step": 15243 + }, + { + "epoch": 2.4884698583731275, + "grad_norm": 1.6444265842437744, + "learning_rate": 1.8995716396465886e-05, + "loss": 0.5222, + "step": 15244 + }, + { + "epoch": 2.488633117015632, + "grad_norm": 1.5952168703079224, + "learning_rate": 1.8995577789551806e-05, + "loss": 0.4757, + "step": 15245 + }, + { + "epoch": 2.4887963756581364, + "grad_norm": 1.681412935256958, + "learning_rate": 1.8995439173579183e-05, + "loss": 0.5971, + "step": 15246 + }, + { + "epoch": 2.488959634300641, + "grad_norm": 1.9150434732437134, + "learning_rate": 1.8995300548548153e-05, + "loss": 0.622, + "step": 15247 + }, + { + "epoch": 2.4891228929431453, + "grad_norm": 1.6267591714859009, + "learning_rate": 1.8995161914458858e-05, + "loss": 0.5587, + "step": 15248 + }, + { + "epoch": 2.4892861515856497, + "grad_norm": 1.592461347579956, + "learning_rate": 1.899502327131144e-05, + "loss": 0.4647, + "step": 15249 + }, + { + "epoch": 2.489449410228154, + "grad_norm": 2.1651246547698975, + "learning_rate": 1.8994884619106034e-05, + "loss": 0.503, + "step": 15250 + }, + { + "epoch": 2.4896126688706586, + "grad_norm": 1.6857830286026, + "learning_rate": 1.8994745957842783e-05, + "loss": 0.5857, + "step": 15251 + }, + { + "epoch": 2.4897759275131626, + "grad_norm": 2.2691988945007324, + "learning_rate": 1.8994607287521822e-05, + "loss": 0.7148, + "step": 15252 + }, + { + "epoch": 2.489939186155667, + "grad_norm": 1.685960054397583, + "learning_rate": 1.8994468608143295e-05, + "loss": 0.5004, + "step": 15253 + }, + { + "epoch": 2.4901024447981714, + "grad_norm": 2.007960557937622, + "learning_rate": 1.8994329919707342e-05, + "loss": 0.5989, + "step": 15254 + }, + { + "epoch": 2.490265703440676, + "grad_norm": 1.7944884300231934, + "learning_rate": 1.89941912222141e-05, + "loss": 0.6293, + "step": 15255 + }, + { + "epoch": 2.4904289620831803, + "grad_norm": 1.9970473051071167, + "learning_rate": 1.899405251566371e-05, + "loss": 0.571, + "step": 15256 + }, + { + "epoch": 2.4905922207256848, + "grad_norm": 1.4883344173431396, + "learning_rate": 1.8993913800056314e-05, + "loss": 0.468, + "step": 15257 + }, + { + "epoch": 2.490755479368189, + "grad_norm": 2.1223599910736084, + "learning_rate": 1.8993775075392047e-05, + "loss": 0.7051, + "step": 15258 + }, + { + "epoch": 2.4909187380106936, + "grad_norm": 1.7278823852539062, + "learning_rate": 1.899363634167105e-05, + "loss": 0.5523, + "step": 15259 + }, + { + "epoch": 2.4910819966531976, + "grad_norm": 1.8149518966674805, + "learning_rate": 1.8993497598893465e-05, + "loss": 0.5478, + "step": 15260 + }, + { + "epoch": 2.491245255295702, + "grad_norm": 1.7363868951797485, + "learning_rate": 1.899335884705943e-05, + "loss": 0.4921, + "step": 15261 + }, + { + "epoch": 2.4914085139382065, + "grad_norm": 1.8275532722473145, + "learning_rate": 1.8993220086169083e-05, + "loss": 0.5882, + "step": 15262 + }, + { + "epoch": 2.491571772580711, + "grad_norm": 1.6526087522506714, + "learning_rate": 1.899308131622257e-05, + "loss": 0.5515, + "step": 15263 + }, + { + "epoch": 2.4917350312232154, + "grad_norm": 1.7846477031707764, + "learning_rate": 1.8992942537220026e-05, + "loss": 0.6172, + "step": 15264 + }, + { + "epoch": 2.49189828986572, + "grad_norm": 1.6031692028045654, + "learning_rate": 1.8992803749161587e-05, + "loss": 0.5059, + "step": 15265 + }, + { + "epoch": 2.4920615485082243, + "grad_norm": 1.8497732877731323, + "learning_rate": 1.89926649520474e-05, + "loss": 0.6631, + "step": 15266 + }, + { + "epoch": 2.4922248071507287, + "grad_norm": 1.8849421739578247, + "learning_rate": 1.8992526145877603e-05, + "loss": 0.6304, + "step": 15267 + }, + { + "epoch": 2.492388065793233, + "grad_norm": 1.8591523170471191, + "learning_rate": 1.899238733065233e-05, + "loss": 0.5659, + "step": 15268 + }, + { + "epoch": 2.492551324435737, + "grad_norm": 1.6644717454910278, + "learning_rate": 1.899224850637173e-05, + "loss": 0.4635, + "step": 15269 + }, + { + "epoch": 2.4927145830782416, + "grad_norm": 1.6555181741714478, + "learning_rate": 1.8992109673035936e-05, + "loss": 0.6116, + "step": 15270 + }, + { + "epoch": 2.492877841720746, + "grad_norm": 1.6697585582733154, + "learning_rate": 1.899197083064509e-05, + "loss": 0.5205, + "step": 15271 + }, + { + "epoch": 2.4930411003632504, + "grad_norm": 1.789363145828247, + "learning_rate": 1.8991831979199335e-05, + "loss": 0.5597, + "step": 15272 + }, + { + "epoch": 2.493204359005755, + "grad_norm": 2.005833148956299, + "learning_rate": 1.8991693118698803e-05, + "loss": 0.5309, + "step": 15273 + }, + { + "epoch": 2.4933676176482593, + "grad_norm": 1.5895578861236572, + "learning_rate": 1.899155424914364e-05, + "loss": 0.5387, + "step": 15274 + }, + { + "epoch": 2.4935308762907638, + "grad_norm": 1.6257562637329102, + "learning_rate": 1.8991415370533984e-05, + "loss": 0.4927, + "step": 15275 + }, + { + "epoch": 2.493694134933268, + "grad_norm": 1.803202748298645, + "learning_rate": 1.8991276482869976e-05, + "loss": 0.5953, + "step": 15276 + }, + { + "epoch": 2.493857393575772, + "grad_norm": 1.58625328540802, + "learning_rate": 1.8991137586151753e-05, + "loss": 0.4788, + "step": 15277 + }, + { + "epoch": 2.4940206522182766, + "grad_norm": 2.1425929069519043, + "learning_rate": 1.8990998680379458e-05, + "loss": 0.6907, + "step": 15278 + }, + { + "epoch": 2.494183910860781, + "grad_norm": 1.7461820840835571, + "learning_rate": 1.899085976555323e-05, + "loss": 0.6068, + "step": 15279 + }, + { + "epoch": 2.4943471695032855, + "grad_norm": 1.4905539751052856, + "learning_rate": 1.8990720841673206e-05, + "loss": 0.4801, + "step": 15280 + }, + { + "epoch": 2.49451042814579, + "grad_norm": 1.4298346042633057, + "learning_rate": 1.8990581908739533e-05, + "loss": 0.4701, + "step": 15281 + }, + { + "epoch": 2.4946736867882944, + "grad_norm": 1.7368241548538208, + "learning_rate": 1.8990442966752345e-05, + "loss": 0.5527, + "step": 15282 + }, + { + "epoch": 2.494836945430799, + "grad_norm": 1.7311266660690308, + "learning_rate": 1.899030401571178e-05, + "loss": 0.5393, + "step": 15283 + }, + { + "epoch": 2.4950002040733033, + "grad_norm": 2.0342414379119873, + "learning_rate": 1.8990165055617987e-05, + "loss": 0.7085, + "step": 15284 + }, + { + "epoch": 2.4951634627158077, + "grad_norm": 1.289905309677124, + "learning_rate": 1.8990026086471097e-05, + "loss": 0.4247, + "step": 15285 + }, + { + "epoch": 2.495326721358312, + "grad_norm": 1.7010374069213867, + "learning_rate": 1.8989887108271255e-05, + "loss": 0.5278, + "step": 15286 + }, + { + "epoch": 2.495489980000816, + "grad_norm": 1.576651930809021, + "learning_rate": 1.8989748121018596e-05, + "loss": 0.4877, + "step": 15287 + }, + { + "epoch": 2.4956532386433206, + "grad_norm": 4.0991716384887695, + "learning_rate": 1.8989609124713265e-05, + "loss": 0.5511, + "step": 15288 + }, + { + "epoch": 2.495816497285825, + "grad_norm": 1.574345350265503, + "learning_rate": 1.89894701193554e-05, + "loss": 0.4738, + "step": 15289 + }, + { + "epoch": 2.4959797559283294, + "grad_norm": 1.840479850769043, + "learning_rate": 1.8989331104945135e-05, + "loss": 0.5494, + "step": 15290 + }, + { + "epoch": 2.496143014570834, + "grad_norm": 1.7220674753189087, + "learning_rate": 1.8989192081482627e-05, + "loss": 0.5814, + "step": 15291 + }, + { + "epoch": 2.4963062732133383, + "grad_norm": 1.5821717977523804, + "learning_rate": 1.8989053048967997e-05, + "loss": 0.5492, + "step": 15292 + }, + { + "epoch": 2.4964695318558427, + "grad_norm": 1.813076376914978, + "learning_rate": 1.8988914007401393e-05, + "loss": 0.4876, + "step": 15293 + }, + { + "epoch": 2.496632790498347, + "grad_norm": 1.4857866764068604, + "learning_rate": 1.898877495678296e-05, + "loss": 0.4909, + "step": 15294 + }, + { + "epoch": 2.496796049140851, + "grad_norm": 1.3186414241790771, + "learning_rate": 1.898863589711283e-05, + "loss": 0.4719, + "step": 15295 + }, + { + "epoch": 2.4969593077833556, + "grad_norm": 2.0170066356658936, + "learning_rate": 1.8988496828391147e-05, + "loss": 0.5521, + "step": 15296 + }, + { + "epoch": 2.49712256642586, + "grad_norm": 1.9184571504592896, + "learning_rate": 1.8988357750618048e-05, + "loss": 0.5693, + "step": 15297 + }, + { + "epoch": 2.4972858250683645, + "grad_norm": 1.8366007804870605, + "learning_rate": 1.8988218663793678e-05, + "loss": 0.5973, + "step": 15298 + }, + { + "epoch": 2.497449083710869, + "grad_norm": 1.6250534057617188, + "learning_rate": 1.898807956791817e-05, + "loss": 0.4282, + "step": 15299 + }, + { + "epoch": 2.4976123423533734, + "grad_norm": 2.224246025085449, + "learning_rate": 1.8987940462991673e-05, + "loss": 0.6863, + "step": 15300 + }, + { + "epoch": 2.497775600995878, + "grad_norm": 1.6314862966537476, + "learning_rate": 1.898780134901432e-05, + "loss": 0.5315, + "step": 15301 + }, + { + "epoch": 2.4979388596383822, + "grad_norm": 1.4989501237869263, + "learning_rate": 1.8987662225986253e-05, + "loss": 0.4616, + "step": 15302 + }, + { + "epoch": 2.4981021182808867, + "grad_norm": 2.0461878776550293, + "learning_rate": 1.8987523093907612e-05, + "loss": 0.6376, + "step": 15303 + }, + { + "epoch": 2.498265376923391, + "grad_norm": 1.9393723011016846, + "learning_rate": 1.898738395277854e-05, + "loss": 0.679, + "step": 15304 + }, + { + "epoch": 2.498428635565895, + "grad_norm": 2.1441428661346436, + "learning_rate": 1.8987244802599175e-05, + "loss": 0.5527, + "step": 15305 + }, + { + "epoch": 2.4985918942083996, + "grad_norm": 2.1467645168304443, + "learning_rate": 1.8987105643369652e-05, + "loss": 0.762, + "step": 15306 + }, + { + "epoch": 2.498755152850904, + "grad_norm": 1.7199220657348633, + "learning_rate": 1.898696647509012e-05, + "loss": 0.5999, + "step": 15307 + }, + { + "epoch": 2.4989184114934084, + "grad_norm": 1.9264482259750366, + "learning_rate": 1.8986827297760714e-05, + "loss": 0.6411, + "step": 15308 + }, + { + "epoch": 2.499081670135913, + "grad_norm": 1.6712530851364136, + "learning_rate": 1.8986688111381576e-05, + "loss": 0.5775, + "step": 15309 + }, + { + "epoch": 2.4992449287784173, + "grad_norm": 1.4681607484817505, + "learning_rate": 1.8986548915952843e-05, + "loss": 0.4458, + "step": 15310 + }, + { + "epoch": 2.4994081874209217, + "grad_norm": 1.9930959939956665, + "learning_rate": 1.8986409711474665e-05, + "loss": 0.5564, + "step": 15311 + }, + { + "epoch": 2.4995714460634257, + "grad_norm": 1.7959718704223633, + "learning_rate": 1.898627049794717e-05, + "loss": 0.5402, + "step": 15312 + }, + { + "epoch": 2.49973470470593, + "grad_norm": 1.711512804031372, + "learning_rate": 1.8986131275370502e-05, + "loss": 0.5727, + "step": 15313 + }, + { + "epoch": 2.4998979633484346, + "grad_norm": 1.5735743045806885, + "learning_rate": 1.8985992043744802e-05, + "loss": 0.5444, + "step": 15314 + }, + { + "epoch": 2.500061221990939, + "grad_norm": 1.7886338233947754, + "learning_rate": 1.8985852803070212e-05, + "loss": 0.567, + "step": 15315 + }, + { + "epoch": 2.5002244806334435, + "grad_norm": 1.5958024263381958, + "learning_rate": 1.8985713553346867e-05, + "loss": 0.5049, + "step": 15316 + }, + { + "epoch": 2.500387739275948, + "grad_norm": 1.9063758850097656, + "learning_rate": 1.8985574294574917e-05, + "loss": 0.553, + "step": 15317 + }, + { + "epoch": 2.5005509979184524, + "grad_norm": 1.4029306173324585, + "learning_rate": 1.898543502675449e-05, + "loss": 0.4095, + "step": 15318 + }, + { + "epoch": 2.500714256560957, + "grad_norm": 1.8453274965286255, + "learning_rate": 1.8985295749885738e-05, + "loss": 0.6256, + "step": 15319 + }, + { + "epoch": 2.5008775152034612, + "grad_norm": 1.4727449417114258, + "learning_rate": 1.898515646396879e-05, + "loss": 0.5519, + "step": 15320 + }, + { + "epoch": 2.5010407738459657, + "grad_norm": 2.1756129264831543, + "learning_rate": 1.8985017169003797e-05, + "loss": 0.7058, + "step": 15321 + }, + { + "epoch": 2.50120403248847, + "grad_norm": 1.8086318969726562, + "learning_rate": 1.8984877864990888e-05, + "loss": 0.4992, + "step": 15322 + }, + { + "epoch": 2.501367291130974, + "grad_norm": 1.976905345916748, + "learning_rate": 1.8984738551930214e-05, + "loss": 0.6073, + "step": 15323 + }, + { + "epoch": 2.5015305497734786, + "grad_norm": 1.7100523710250854, + "learning_rate": 1.898459922982191e-05, + "loss": 0.4957, + "step": 15324 + }, + { + "epoch": 2.501693808415983, + "grad_norm": 2.042928695678711, + "learning_rate": 1.8984459898666116e-05, + "loss": 0.6483, + "step": 15325 + }, + { + "epoch": 2.5018570670584874, + "grad_norm": 2.088162422180176, + "learning_rate": 1.8984320558462976e-05, + "loss": 0.6257, + "step": 15326 + }, + { + "epoch": 2.502020325700992, + "grad_norm": 1.781080961227417, + "learning_rate": 1.8984181209212623e-05, + "loss": 0.5795, + "step": 15327 + }, + { + "epoch": 2.5021835843434963, + "grad_norm": 2.4284236431121826, + "learning_rate": 1.8984041850915206e-05, + "loss": 0.687, + "step": 15328 + }, + { + "epoch": 2.5023468429860003, + "grad_norm": 1.6588951349258423, + "learning_rate": 1.898390248357086e-05, + "loss": 0.4768, + "step": 15329 + }, + { + "epoch": 2.5025101016285047, + "grad_norm": 1.8674017190933228, + "learning_rate": 1.8983763107179726e-05, + "loss": 0.6167, + "step": 15330 + }, + { + "epoch": 2.502673360271009, + "grad_norm": 1.909936547279358, + "learning_rate": 1.8983623721741943e-05, + "loss": 0.5807, + "step": 15331 + }, + { + "epoch": 2.5028366189135136, + "grad_norm": 1.4796059131622314, + "learning_rate": 1.898348432725766e-05, + "loss": 0.5099, + "step": 15332 + }, + { + "epoch": 2.502999877556018, + "grad_norm": 1.7317863702774048, + "learning_rate": 1.8983344923727002e-05, + "loss": 0.5628, + "step": 15333 + }, + { + "epoch": 2.5031631361985225, + "grad_norm": 1.5917835235595703, + "learning_rate": 1.898320551115012e-05, + "loss": 0.5213, + "step": 15334 + }, + { + "epoch": 2.503326394841027, + "grad_norm": 1.8672573566436768, + "learning_rate": 1.898306608952716e-05, + "loss": 0.5514, + "step": 15335 + }, + { + "epoch": 2.5034896534835314, + "grad_norm": 2.1263020038604736, + "learning_rate": 1.8982926658858248e-05, + "loss": 0.5521, + "step": 15336 + }, + { + "epoch": 2.503652912126036, + "grad_norm": 1.7098876237869263, + "learning_rate": 1.8982787219143532e-05, + "loss": 0.5029, + "step": 15337 + }, + { + "epoch": 2.5038161707685402, + "grad_norm": 1.8353744745254517, + "learning_rate": 1.8982647770383152e-05, + "loss": 0.5267, + "step": 15338 + }, + { + "epoch": 2.5039794294110447, + "grad_norm": 2.0243725776672363, + "learning_rate": 1.8982508312577247e-05, + "loss": 0.7436, + "step": 15339 + }, + { + "epoch": 2.5041426880535487, + "grad_norm": 1.8983629941940308, + "learning_rate": 1.898236884572596e-05, + "loss": 0.5658, + "step": 15340 + }, + { + "epoch": 2.504305946696053, + "grad_norm": 1.69605553150177, + "learning_rate": 1.898222936982943e-05, + "loss": 0.5125, + "step": 15341 + }, + { + "epoch": 2.5044692053385575, + "grad_norm": 1.5911725759506226, + "learning_rate": 1.8982089884887797e-05, + "loss": 0.4691, + "step": 15342 + }, + { + "epoch": 2.504632463981062, + "grad_norm": 1.9559563398361206, + "learning_rate": 1.89819503909012e-05, + "loss": 0.6164, + "step": 15343 + }, + { + "epoch": 2.5047957226235664, + "grad_norm": 1.741892695426941, + "learning_rate": 1.8981810887869784e-05, + "loss": 0.5469, + "step": 15344 + }, + { + "epoch": 2.504958981266071, + "grad_norm": 1.7450881004333496, + "learning_rate": 1.8981671375793688e-05, + "loss": 0.5759, + "step": 15345 + }, + { + "epoch": 2.5051222399085753, + "grad_norm": 1.718074917793274, + "learning_rate": 1.8981531854673048e-05, + "loss": 0.5642, + "step": 15346 + }, + { + "epoch": 2.5052854985510793, + "grad_norm": 1.8150519132614136, + "learning_rate": 1.898139232450801e-05, + "loss": 0.554, + "step": 15347 + }, + { + "epoch": 2.5054487571935837, + "grad_norm": 1.5116610527038574, + "learning_rate": 1.8981252785298712e-05, + "loss": 0.4169, + "step": 15348 + }, + { + "epoch": 2.505612015836088, + "grad_norm": 1.7404770851135254, + "learning_rate": 1.8981113237045297e-05, + "loss": 0.5721, + "step": 15349 + }, + { + "epoch": 2.5057752744785926, + "grad_norm": 2.4761667251586914, + "learning_rate": 1.8980973679747897e-05, + "loss": 0.6561, + "step": 15350 + }, + { + "epoch": 2.505938533121097, + "grad_norm": 1.6084812879562378, + "learning_rate": 1.8980834113406666e-05, + "loss": 0.4764, + "step": 15351 + }, + { + "epoch": 2.5061017917636015, + "grad_norm": 1.7925325632095337, + "learning_rate": 1.8980694538021735e-05, + "loss": 0.6335, + "step": 15352 + }, + { + "epoch": 2.506265050406106, + "grad_norm": 1.8174947500228882, + "learning_rate": 1.8980554953593243e-05, + "loss": 0.5407, + "step": 15353 + }, + { + "epoch": 2.5064283090486104, + "grad_norm": 2.1591715812683105, + "learning_rate": 1.898041536012134e-05, + "loss": 0.6912, + "step": 15354 + }, + { + "epoch": 2.506591567691115, + "grad_norm": 1.8550106287002563, + "learning_rate": 1.8980275757606157e-05, + "loss": 0.5265, + "step": 15355 + }, + { + "epoch": 2.5067548263336192, + "grad_norm": 1.8440481424331665, + "learning_rate": 1.8980136146047843e-05, + "loss": 0.6039, + "step": 15356 + }, + { + "epoch": 2.5069180849761237, + "grad_norm": 1.8630213737487793, + "learning_rate": 1.897999652544653e-05, + "loss": 0.5567, + "step": 15357 + }, + { + "epoch": 2.5070813436186277, + "grad_norm": 1.8569464683532715, + "learning_rate": 1.8979856895802364e-05, + "loss": 0.6151, + "step": 15358 + }, + { + "epoch": 2.507244602261132, + "grad_norm": 1.953184962272644, + "learning_rate": 1.8979717257115483e-05, + "loss": 0.5485, + "step": 15359 + }, + { + "epoch": 2.5074078609036365, + "grad_norm": 1.8059511184692383, + "learning_rate": 1.8979577609386033e-05, + "loss": 0.6601, + "step": 15360 + }, + { + "epoch": 2.507571119546141, + "grad_norm": 2.0469164848327637, + "learning_rate": 1.8979437952614152e-05, + "loss": 0.5971, + "step": 15361 + }, + { + "epoch": 2.5077343781886454, + "grad_norm": 2.0267255306243896, + "learning_rate": 1.8979298286799973e-05, + "loss": 0.5241, + "step": 15362 + }, + { + "epoch": 2.50789763683115, + "grad_norm": 1.7859468460083008, + "learning_rate": 1.8979158611943647e-05, + "loss": 0.5398, + "step": 15363 + }, + { + "epoch": 2.5080608954736543, + "grad_norm": 1.7014647722244263, + "learning_rate": 1.897901892804531e-05, + "loss": 0.5599, + "step": 15364 + }, + { + "epoch": 2.5082241541161583, + "grad_norm": 1.8629844188690186, + "learning_rate": 1.8978879235105102e-05, + "loss": 0.523, + "step": 15365 + }, + { + "epoch": 2.5083874127586627, + "grad_norm": 1.5623042583465576, + "learning_rate": 1.897873953312317e-05, + "loss": 0.559, + "step": 15366 + }, + { + "epoch": 2.508550671401167, + "grad_norm": 1.8575050830841064, + "learning_rate": 1.8978599822099647e-05, + "loss": 0.5539, + "step": 15367 + }, + { + "epoch": 2.5087139300436716, + "grad_norm": 1.934207797050476, + "learning_rate": 1.897846010203467e-05, + "loss": 0.6537, + "step": 15368 + }, + { + "epoch": 2.508877188686176, + "grad_norm": 2.8371829986572266, + "learning_rate": 1.8978320372928395e-05, + "loss": 0.7919, + "step": 15369 + }, + { + "epoch": 2.5090404473286805, + "grad_norm": 1.6814024448394775, + "learning_rate": 1.897818063478095e-05, + "loss": 0.5888, + "step": 15370 + }, + { + "epoch": 2.509203705971185, + "grad_norm": 1.7262859344482422, + "learning_rate": 1.897804088759248e-05, + "loss": 0.5028, + "step": 15371 + }, + { + "epoch": 2.5093669646136894, + "grad_norm": 1.9233598709106445, + "learning_rate": 1.8977901131363124e-05, + "loss": 0.7353, + "step": 15372 + }, + { + "epoch": 2.509530223256194, + "grad_norm": 1.8921241760253906, + "learning_rate": 1.8977761366093027e-05, + "loss": 0.6059, + "step": 15373 + }, + { + "epoch": 2.5096934818986982, + "grad_norm": 1.6143262386322021, + "learning_rate": 1.8977621591782323e-05, + "loss": 0.6245, + "step": 15374 + }, + { + "epoch": 2.5098567405412027, + "grad_norm": 1.716952919960022, + "learning_rate": 1.8977481808431156e-05, + "loss": 0.5108, + "step": 15375 + }, + { + "epoch": 2.5100199991837067, + "grad_norm": 1.907759666442871, + "learning_rate": 1.897734201603967e-05, + "loss": 0.5285, + "step": 15376 + }, + { + "epoch": 2.510183257826211, + "grad_norm": 1.6511770486831665, + "learning_rate": 1.8977202214608002e-05, + "loss": 0.5196, + "step": 15377 + }, + { + "epoch": 2.5103465164687155, + "grad_norm": 1.6389825344085693, + "learning_rate": 1.8977062404136296e-05, + "loss": 0.559, + "step": 15378 + }, + { + "epoch": 2.51050977511122, + "grad_norm": 1.9579825401306152, + "learning_rate": 1.8976922584624686e-05, + "loss": 0.5964, + "step": 15379 + }, + { + "epoch": 2.5106730337537244, + "grad_norm": 1.9952925443649292, + "learning_rate": 1.8976782756073323e-05, + "loss": 0.6116, + "step": 15380 + }, + { + "epoch": 2.510836292396229, + "grad_norm": 1.7638075351715088, + "learning_rate": 1.8976642918482337e-05, + "loss": 0.6135, + "step": 15381 + }, + { + "epoch": 2.510999551038733, + "grad_norm": 1.7285293340682983, + "learning_rate": 1.8976503071851876e-05, + "loss": 0.5496, + "step": 15382 + }, + { + "epoch": 2.5111628096812373, + "grad_norm": 2.0081753730773926, + "learning_rate": 1.8976363216182075e-05, + "loss": 0.6473, + "step": 15383 + }, + { + "epoch": 2.5113260683237417, + "grad_norm": 1.5133947134017944, + "learning_rate": 1.8976223351473085e-05, + "loss": 0.5066, + "step": 15384 + }, + { + "epoch": 2.511489326966246, + "grad_norm": 1.7987388372421265, + "learning_rate": 1.8976083477725037e-05, + "loss": 0.6885, + "step": 15385 + }, + { + "epoch": 2.5116525856087506, + "grad_norm": 1.9150711297988892, + "learning_rate": 1.8975943594938077e-05, + "loss": 0.7141, + "step": 15386 + }, + { + "epoch": 2.511815844251255, + "grad_norm": 1.8509345054626465, + "learning_rate": 1.8975803703112343e-05, + "loss": 0.609, + "step": 15387 + }, + { + "epoch": 2.5119791028937595, + "grad_norm": 1.4671988487243652, + "learning_rate": 1.8975663802247978e-05, + "loss": 0.5441, + "step": 15388 + }, + { + "epoch": 2.512142361536264, + "grad_norm": 1.507286787033081, + "learning_rate": 1.897552389234512e-05, + "loss": 0.4773, + "step": 15389 + }, + { + "epoch": 2.5123056201787684, + "grad_norm": 1.7129830121994019, + "learning_rate": 1.8975383973403915e-05, + "loss": 0.4857, + "step": 15390 + }, + { + "epoch": 2.512468878821273, + "grad_norm": 1.5951813459396362, + "learning_rate": 1.8975244045424498e-05, + "loss": 0.6345, + "step": 15391 + }, + { + "epoch": 2.5126321374637772, + "grad_norm": 1.9841580390930176, + "learning_rate": 1.8975104108407012e-05, + "loss": 0.7037, + "step": 15392 + }, + { + "epoch": 2.512795396106281, + "grad_norm": 1.7159720659255981, + "learning_rate": 1.89749641623516e-05, + "loss": 0.503, + "step": 15393 + }, + { + "epoch": 2.5129586547487857, + "grad_norm": 1.8520512580871582, + "learning_rate": 1.8974824207258404e-05, + "loss": 0.5101, + "step": 15394 + }, + { + "epoch": 2.51312191339129, + "grad_norm": 1.7864210605621338, + "learning_rate": 1.8974684243127556e-05, + "loss": 0.666, + "step": 15395 + }, + { + "epoch": 2.5132851720337945, + "grad_norm": 1.454616904258728, + "learning_rate": 1.897454426995921e-05, + "loss": 0.486, + "step": 15396 + }, + { + "epoch": 2.513448430676299, + "grad_norm": 1.5657826662063599, + "learning_rate": 1.8974404287753498e-05, + "loss": 0.5622, + "step": 15397 + }, + { + "epoch": 2.5136116893188034, + "grad_norm": 1.9600003957748413, + "learning_rate": 1.897426429651056e-05, + "loss": 0.6929, + "step": 15398 + }, + { + "epoch": 2.513774947961308, + "grad_norm": 2.154644012451172, + "learning_rate": 1.8974124296230543e-05, + "loss": 0.6483, + "step": 15399 + }, + { + "epoch": 2.513938206603812, + "grad_norm": 1.8486080169677734, + "learning_rate": 1.8973984286913584e-05, + "loss": 0.5765, + "step": 15400 + }, + { + "epoch": 2.5141014652463163, + "grad_norm": 1.4991530179977417, + "learning_rate": 1.8973844268559827e-05, + "loss": 0.5031, + "step": 15401 + }, + { + "epoch": 2.5142647238888207, + "grad_norm": 1.601968765258789, + "learning_rate": 1.897370424116941e-05, + "loss": 0.5304, + "step": 15402 + }, + { + "epoch": 2.514427982531325, + "grad_norm": 1.7877416610717773, + "learning_rate": 1.897356420474248e-05, + "loss": 0.5772, + "step": 15403 + }, + { + "epoch": 2.5145912411738296, + "grad_norm": 1.8593254089355469, + "learning_rate": 1.8973424159279168e-05, + "loss": 0.4647, + "step": 15404 + }, + { + "epoch": 2.514754499816334, + "grad_norm": 1.4628194570541382, + "learning_rate": 1.897328410477962e-05, + "loss": 0.4336, + "step": 15405 + }, + { + "epoch": 2.5149177584588385, + "grad_norm": 1.9044932126998901, + "learning_rate": 1.897314404124398e-05, + "loss": 0.5589, + "step": 15406 + }, + { + "epoch": 2.515081017101343, + "grad_norm": 1.550415277481079, + "learning_rate": 1.8973003968672382e-05, + "loss": 0.5043, + "step": 15407 + }, + { + "epoch": 2.5152442757438473, + "grad_norm": 1.948367714881897, + "learning_rate": 1.8972863887064976e-05, + "loss": 0.4931, + "step": 15408 + }, + { + "epoch": 2.515407534386352, + "grad_norm": 1.710620641708374, + "learning_rate": 1.8972723796421894e-05, + "loss": 0.5084, + "step": 15409 + }, + { + "epoch": 2.5155707930288562, + "grad_norm": 1.8493553400039673, + "learning_rate": 1.8972583696743284e-05, + "loss": 0.6129, + "step": 15410 + }, + { + "epoch": 2.51573405167136, + "grad_norm": 1.5985106229782104, + "learning_rate": 1.8972443588029285e-05, + "loss": 0.5095, + "step": 15411 + }, + { + "epoch": 2.5158973103138647, + "grad_norm": 1.7483115196228027, + "learning_rate": 1.8972303470280037e-05, + "loss": 0.5175, + "step": 15412 + }, + { + "epoch": 2.516060568956369, + "grad_norm": 1.9234007596969604, + "learning_rate": 1.8972163343495685e-05, + "loss": 0.5876, + "step": 15413 + }, + { + "epoch": 2.5162238275988735, + "grad_norm": 1.8653463125228882, + "learning_rate": 1.897202320767636e-05, + "loss": 0.5263, + "step": 15414 + }, + { + "epoch": 2.516387086241378, + "grad_norm": 2.034271001815796, + "learning_rate": 1.897188306282222e-05, + "loss": 0.6845, + "step": 15415 + }, + { + "epoch": 2.5165503448838824, + "grad_norm": 2.0700554847717285, + "learning_rate": 1.8971742908933388e-05, + "loss": 0.6745, + "step": 15416 + }, + { + "epoch": 2.516713603526387, + "grad_norm": 1.6086506843566895, + "learning_rate": 1.8971602746010016e-05, + "loss": 0.5404, + "step": 15417 + }, + { + "epoch": 2.516876862168891, + "grad_norm": 1.7305883169174194, + "learning_rate": 1.8971462574052242e-05, + "loss": 0.5603, + "step": 15418 + }, + { + "epoch": 2.5170401208113953, + "grad_norm": 1.7362991571426392, + "learning_rate": 1.8971322393060207e-05, + "loss": 0.4697, + "step": 15419 + }, + { + "epoch": 2.5172033794538997, + "grad_norm": 2.006256580352783, + "learning_rate": 1.8971182203034055e-05, + "loss": 0.574, + "step": 15420 + }, + { + "epoch": 2.517366638096404, + "grad_norm": 1.6268655061721802, + "learning_rate": 1.8971042003973923e-05, + "loss": 0.4831, + "step": 15421 + }, + { + "epoch": 2.5175298967389086, + "grad_norm": 2.052827835083008, + "learning_rate": 1.897090179587995e-05, + "loss": 0.6425, + "step": 15422 + }, + { + "epoch": 2.517693155381413, + "grad_norm": 1.8361598253250122, + "learning_rate": 1.8970761578752288e-05, + "loss": 0.6706, + "step": 15423 + }, + { + "epoch": 2.5178564140239175, + "grad_norm": 2.0285630226135254, + "learning_rate": 1.897062135259107e-05, + "loss": 0.674, + "step": 15424 + }, + { + "epoch": 2.518019672666422, + "grad_norm": 1.724624752998352, + "learning_rate": 1.8970481117396438e-05, + "loss": 0.4434, + "step": 15425 + }, + { + "epoch": 2.5181829313089263, + "grad_norm": 2.0234262943267822, + "learning_rate": 1.897034087316853e-05, + "loss": 0.5974, + "step": 15426 + }, + { + "epoch": 2.518346189951431, + "grad_norm": 2.1521477699279785, + "learning_rate": 1.8970200619907497e-05, + "loss": 0.6329, + "step": 15427 + }, + { + "epoch": 2.518509448593935, + "grad_norm": 2.5792434215545654, + "learning_rate": 1.8970060357613472e-05, + "loss": 0.6436, + "step": 15428 + }, + { + "epoch": 2.518672707236439, + "grad_norm": 1.5217803716659546, + "learning_rate": 1.89699200862866e-05, + "loss": 0.4623, + "step": 15429 + }, + { + "epoch": 2.5188359658789437, + "grad_norm": 1.6253303289413452, + "learning_rate": 1.896977980592702e-05, + "loss": 0.5392, + "step": 15430 + }, + { + "epoch": 2.518999224521448, + "grad_norm": 1.8891643285751343, + "learning_rate": 1.8969639516534873e-05, + "loss": 0.5863, + "step": 15431 + }, + { + "epoch": 2.5191624831639525, + "grad_norm": 1.9075859785079956, + "learning_rate": 1.8969499218110302e-05, + "loss": 0.5881, + "step": 15432 + }, + { + "epoch": 2.519325741806457, + "grad_norm": 1.914431095123291, + "learning_rate": 1.896935891065345e-05, + "loss": 0.5814, + "step": 15433 + }, + { + "epoch": 2.5194890004489614, + "grad_norm": 1.6938759088516235, + "learning_rate": 1.896921859416445e-05, + "loss": 0.5835, + "step": 15434 + }, + { + "epoch": 2.5196522590914654, + "grad_norm": 1.9179179668426514, + "learning_rate": 1.8969078268643453e-05, + "loss": 0.5383, + "step": 15435 + }, + { + "epoch": 2.51981551773397, + "grad_norm": 2.0753955841064453, + "learning_rate": 1.8968937934090596e-05, + "loss": 0.5461, + "step": 15436 + }, + { + "epoch": 2.5199787763764743, + "grad_norm": 1.4770301580429077, + "learning_rate": 1.8968797590506023e-05, + "loss": 0.5085, + "step": 15437 + }, + { + "epoch": 2.5201420350189787, + "grad_norm": 1.8185725212097168, + "learning_rate": 1.896865723788987e-05, + "loss": 0.5712, + "step": 15438 + }, + { + "epoch": 2.520305293661483, + "grad_norm": 1.9534437656402588, + "learning_rate": 1.8968516876242282e-05, + "loss": 0.5489, + "step": 15439 + }, + { + "epoch": 2.5204685523039876, + "grad_norm": 1.800718069076538, + "learning_rate": 1.8968376505563402e-05, + "loss": 0.5623, + "step": 15440 + }, + { + "epoch": 2.520631810946492, + "grad_norm": 1.7886732816696167, + "learning_rate": 1.896823612585337e-05, + "loss": 0.5806, + "step": 15441 + }, + { + "epoch": 2.5207950695889965, + "grad_norm": 3.0370326042175293, + "learning_rate": 1.8968095737112325e-05, + "loss": 0.52, + "step": 15442 + }, + { + "epoch": 2.520958328231501, + "grad_norm": 1.6140779256820679, + "learning_rate": 1.896795533934041e-05, + "loss": 0.5862, + "step": 15443 + }, + { + "epoch": 2.5211215868740053, + "grad_norm": 1.8379931449890137, + "learning_rate": 1.8967814932537765e-05, + "loss": 0.5331, + "step": 15444 + }, + { + "epoch": 2.5212848455165098, + "grad_norm": 1.5922820568084717, + "learning_rate": 1.896767451670453e-05, + "loss": 0.5421, + "step": 15445 + }, + { + "epoch": 2.5214481041590138, + "grad_norm": 1.796104073524475, + "learning_rate": 1.896753409184086e-05, + "loss": 0.5935, + "step": 15446 + }, + { + "epoch": 2.521611362801518, + "grad_norm": 1.5361642837524414, + "learning_rate": 1.8967393657946877e-05, + "loss": 0.553, + "step": 15447 + }, + { + "epoch": 2.5217746214440226, + "grad_norm": 1.5535531044006348, + "learning_rate": 1.896725321502273e-05, + "loss": 0.5761, + "step": 15448 + }, + { + "epoch": 2.521937880086527, + "grad_norm": 2.1837503910064697, + "learning_rate": 1.8967112763068566e-05, + "loss": 0.5466, + "step": 15449 + }, + { + "epoch": 2.5221011387290315, + "grad_norm": 1.7846993207931519, + "learning_rate": 1.8966972302084516e-05, + "loss": 0.4383, + "step": 15450 + }, + { + "epoch": 2.522264397371536, + "grad_norm": 1.4909768104553223, + "learning_rate": 1.8966831832070735e-05, + "loss": 0.5374, + "step": 15451 + }, + { + "epoch": 2.5224276560140404, + "grad_norm": 1.7795583009719849, + "learning_rate": 1.8966691353027352e-05, + "loss": 0.528, + "step": 15452 + }, + { + "epoch": 2.5225909146565444, + "grad_norm": 1.8623428344726562, + "learning_rate": 1.8966550864954513e-05, + "loss": 0.6029, + "step": 15453 + }, + { + "epoch": 2.522754173299049, + "grad_norm": 1.6299068927764893, + "learning_rate": 1.896641036785236e-05, + "loss": 0.5059, + "step": 15454 + }, + { + "epoch": 2.5229174319415533, + "grad_norm": 1.6741482019424438, + "learning_rate": 1.8966269861721037e-05, + "loss": 0.4828, + "step": 15455 + }, + { + "epoch": 2.5230806905840577, + "grad_norm": 1.8065836429595947, + "learning_rate": 1.896612934656068e-05, + "loss": 0.5465, + "step": 15456 + }, + { + "epoch": 2.523243949226562, + "grad_norm": 2.06701922416687, + "learning_rate": 1.8965988822371432e-05, + "loss": 0.6215, + "step": 15457 + }, + { + "epoch": 2.5234072078690666, + "grad_norm": 2.1135871410369873, + "learning_rate": 1.8965848289153438e-05, + "loss": 0.7192, + "step": 15458 + }, + { + "epoch": 2.523570466511571, + "grad_norm": 1.7681896686553955, + "learning_rate": 1.8965707746906835e-05, + "loss": 0.525, + "step": 15459 + }, + { + "epoch": 2.5237337251540755, + "grad_norm": 1.8735759258270264, + "learning_rate": 1.8965567195631766e-05, + "loss": 0.5356, + "step": 15460 + }, + { + "epoch": 2.52389698379658, + "grad_norm": 1.756192922592163, + "learning_rate": 1.8965426635328376e-05, + "loss": 0.517, + "step": 15461 + }, + { + "epoch": 2.5240602424390843, + "grad_norm": 1.905102252960205, + "learning_rate": 1.8965286065996804e-05, + "loss": 0.5389, + "step": 15462 + }, + { + "epoch": 2.5242235010815888, + "grad_norm": 1.9529906511306763, + "learning_rate": 1.896514548763719e-05, + "loss": 0.7071, + "step": 15463 + }, + { + "epoch": 2.5243867597240928, + "grad_norm": 1.8024744987487793, + "learning_rate": 1.8965004900249676e-05, + "loss": 0.5975, + "step": 15464 + }, + { + "epoch": 2.524550018366597, + "grad_norm": 1.928567886352539, + "learning_rate": 1.8964864303834408e-05, + "loss": 0.6711, + "step": 15465 + }, + { + "epoch": 2.5247132770091016, + "grad_norm": 1.501198172569275, + "learning_rate": 1.896472369839152e-05, + "loss": 0.5708, + "step": 15466 + }, + { + "epoch": 2.524876535651606, + "grad_norm": 1.8302795886993408, + "learning_rate": 1.896458308392116e-05, + "loss": 0.546, + "step": 15467 + }, + { + "epoch": 2.5250397942941105, + "grad_norm": 1.9496045112609863, + "learning_rate": 1.8964442460423463e-05, + "loss": 0.5482, + "step": 15468 + }, + { + "epoch": 2.525203052936615, + "grad_norm": 1.6023030281066895, + "learning_rate": 1.896430182789858e-05, + "loss": 0.4724, + "step": 15469 + }, + { + "epoch": 2.525366311579119, + "grad_norm": 1.7500455379486084, + "learning_rate": 1.896416118634665e-05, + "loss": 0.4877, + "step": 15470 + }, + { + "epoch": 2.5255295702216234, + "grad_norm": 1.707309603691101, + "learning_rate": 1.8964020535767805e-05, + "loss": 0.509, + "step": 15471 + }, + { + "epoch": 2.525692828864128, + "grad_norm": 1.752016305923462, + "learning_rate": 1.8963879876162196e-05, + "loss": 0.5319, + "step": 15472 + }, + { + "epoch": 2.5258560875066323, + "grad_norm": 1.6784148216247559, + "learning_rate": 1.8963739207529963e-05, + "loss": 0.4793, + "step": 15473 + }, + { + "epoch": 2.5260193461491367, + "grad_norm": 2.0695927143096924, + "learning_rate": 1.8963598529871245e-05, + "loss": 0.6274, + "step": 15474 + }, + { + "epoch": 2.526182604791641, + "grad_norm": 1.80709707736969, + "learning_rate": 1.896345784318619e-05, + "loss": 0.5136, + "step": 15475 + }, + { + "epoch": 2.5263458634341456, + "grad_norm": 1.9935286045074463, + "learning_rate": 1.896331714747493e-05, + "loss": 0.5119, + "step": 15476 + }, + { + "epoch": 2.52650912207665, + "grad_norm": 1.9305263757705688, + "learning_rate": 1.8963176442737616e-05, + "loss": 0.5051, + "step": 15477 + }, + { + "epoch": 2.5266723807191545, + "grad_norm": 1.718499779701233, + "learning_rate": 1.8963035728974386e-05, + "loss": 0.6092, + "step": 15478 + }, + { + "epoch": 2.526835639361659, + "grad_norm": 1.955952763557434, + "learning_rate": 1.8962895006185377e-05, + "loss": 0.6377, + "step": 15479 + }, + { + "epoch": 2.5269988980041633, + "grad_norm": 1.9952985048294067, + "learning_rate": 1.896275427437074e-05, + "loss": 0.5392, + "step": 15480 + }, + { + "epoch": 2.5271621566466673, + "grad_norm": 1.9005556106567383, + "learning_rate": 1.896261353353061e-05, + "loss": 0.6465, + "step": 15481 + }, + { + "epoch": 2.5273254152891718, + "grad_norm": 1.6104600429534912, + "learning_rate": 1.896247278366513e-05, + "loss": 0.4685, + "step": 15482 + }, + { + "epoch": 2.527488673931676, + "grad_norm": 1.663163185119629, + "learning_rate": 1.8962332024774444e-05, + "loss": 0.5659, + "step": 15483 + }, + { + "epoch": 2.5276519325741806, + "grad_norm": 1.6410837173461914, + "learning_rate": 1.8962191256858687e-05, + "loss": 0.5055, + "step": 15484 + }, + { + "epoch": 2.527815191216685, + "grad_norm": 1.904684066772461, + "learning_rate": 1.896205047991801e-05, + "loss": 0.6281, + "step": 15485 + }, + { + "epoch": 2.5279784498591895, + "grad_norm": 1.7176446914672852, + "learning_rate": 1.8961909693952552e-05, + "loss": 0.5319, + "step": 15486 + }, + { + "epoch": 2.528141708501694, + "grad_norm": 1.823876976966858, + "learning_rate": 1.8961768898962448e-05, + "loss": 0.595, + "step": 15487 + }, + { + "epoch": 2.528304967144198, + "grad_norm": 2.0251004695892334, + "learning_rate": 1.896162809494785e-05, + "loss": 0.6194, + "step": 15488 + }, + { + "epoch": 2.5284682257867024, + "grad_norm": 1.6831303834915161, + "learning_rate": 1.896148728190889e-05, + "loss": 0.5433, + "step": 15489 + }, + { + "epoch": 2.528631484429207, + "grad_norm": 1.906517744064331, + "learning_rate": 1.8961346459845724e-05, + "loss": 0.6347, + "step": 15490 + }, + { + "epoch": 2.5287947430717113, + "grad_norm": 1.420283555984497, + "learning_rate": 1.8961205628758477e-05, + "loss": 0.5645, + "step": 15491 + }, + { + "epoch": 2.5289580017142157, + "grad_norm": 1.9725983142852783, + "learning_rate": 1.89610647886473e-05, + "loss": 0.6512, + "step": 15492 + }, + { + "epoch": 2.52912126035672, + "grad_norm": 1.9901896715164185, + "learning_rate": 1.896092393951233e-05, + "loss": 0.579, + "step": 15493 + }, + { + "epoch": 2.5292845189992246, + "grad_norm": 1.5632842779159546, + "learning_rate": 1.8960783081353716e-05, + "loss": 0.4979, + "step": 15494 + }, + { + "epoch": 2.529447777641729, + "grad_norm": 1.9501749277114868, + "learning_rate": 1.8960642214171594e-05, + "loss": 0.6494, + "step": 15495 + }, + { + "epoch": 2.5296110362842334, + "grad_norm": 1.4787616729736328, + "learning_rate": 1.8960501337966107e-05, + "loss": 0.5249, + "step": 15496 + }, + { + "epoch": 2.529774294926738, + "grad_norm": 1.8069329261779785, + "learning_rate": 1.8960360452737402e-05, + "loss": 0.4489, + "step": 15497 + }, + { + "epoch": 2.5299375535692423, + "grad_norm": 2.054622173309326, + "learning_rate": 1.896021955848561e-05, + "loss": 0.7116, + "step": 15498 + }, + { + "epoch": 2.5301008122117463, + "grad_norm": 2.12978196144104, + "learning_rate": 1.8960078655210886e-05, + "loss": 0.6414, + "step": 15499 + }, + { + "epoch": 2.5302640708542508, + "grad_norm": 2.1424694061279297, + "learning_rate": 1.895993774291336e-05, + "loss": 0.6949, + "step": 15500 + }, + { + "epoch": 2.530427329496755, + "grad_norm": 1.7027549743652344, + "learning_rate": 1.895979682159318e-05, + "loss": 0.599, + "step": 15501 + }, + { + "epoch": 2.5305905881392596, + "grad_norm": 1.7298223972320557, + "learning_rate": 1.8959655891250487e-05, + "loss": 0.5505, + "step": 15502 + }, + { + "epoch": 2.530753846781764, + "grad_norm": 1.855724811553955, + "learning_rate": 1.8959514951885426e-05, + "loss": 0.6658, + "step": 15503 + }, + { + "epoch": 2.5309171054242685, + "grad_norm": 1.6589604616165161, + "learning_rate": 1.8959374003498133e-05, + "loss": 0.4898, + "step": 15504 + }, + { + "epoch": 2.531080364066773, + "grad_norm": 1.9370160102844238, + "learning_rate": 1.8959233046088753e-05, + "loss": 0.59, + "step": 15505 + }, + { + "epoch": 2.531243622709277, + "grad_norm": 1.7273080348968506, + "learning_rate": 1.8959092079657426e-05, + "loss": 0.4995, + "step": 15506 + }, + { + "epoch": 2.5314068813517814, + "grad_norm": 1.750929594039917, + "learning_rate": 1.89589511042043e-05, + "loss": 0.5758, + "step": 15507 + }, + { + "epoch": 2.531570139994286, + "grad_norm": 1.652711033821106, + "learning_rate": 1.895881011972951e-05, + "loss": 0.4901, + "step": 15508 + }, + { + "epoch": 2.5317333986367903, + "grad_norm": 1.6591339111328125, + "learning_rate": 1.89586691262332e-05, + "loss": 0.5729, + "step": 15509 + }, + { + "epoch": 2.5318966572792947, + "grad_norm": 1.7995052337646484, + "learning_rate": 1.8958528123715513e-05, + "loss": 0.4821, + "step": 15510 + }, + { + "epoch": 2.532059915921799, + "grad_norm": 1.6878199577331543, + "learning_rate": 1.895838711217659e-05, + "loss": 0.5776, + "step": 15511 + }, + { + "epoch": 2.5322231745643036, + "grad_norm": 1.5728720426559448, + "learning_rate": 1.8958246091616574e-05, + "loss": 0.4677, + "step": 15512 + }, + { + "epoch": 2.532386433206808, + "grad_norm": 1.6997599601745605, + "learning_rate": 1.895810506203561e-05, + "loss": 0.5363, + "step": 15513 + }, + { + "epoch": 2.5325496918493124, + "grad_norm": 1.8433016538619995, + "learning_rate": 1.8957964023433833e-05, + "loss": 0.592, + "step": 15514 + }, + { + "epoch": 2.532712950491817, + "grad_norm": 1.5629569292068481, + "learning_rate": 1.8957822975811388e-05, + "loss": 0.4375, + "step": 15515 + }, + { + "epoch": 2.5328762091343213, + "grad_norm": 1.7639583349227905, + "learning_rate": 1.895768191916842e-05, + "loss": 0.5267, + "step": 15516 + }, + { + "epoch": 2.5330394677768253, + "grad_norm": 1.476737141609192, + "learning_rate": 1.8957540853505067e-05, + "loss": 0.5005, + "step": 15517 + }, + { + "epoch": 2.5332027264193298, + "grad_norm": 1.859265923500061, + "learning_rate": 1.8957399778821472e-05, + "loss": 0.5351, + "step": 15518 + }, + { + "epoch": 2.533365985061834, + "grad_norm": 1.825221061706543, + "learning_rate": 1.895725869511778e-05, + "loss": 0.5414, + "step": 15519 + }, + { + "epoch": 2.5335292437043386, + "grad_norm": 1.7404991388320923, + "learning_rate": 1.895711760239413e-05, + "loss": 0.5549, + "step": 15520 + }, + { + "epoch": 2.533692502346843, + "grad_norm": 1.8609822988510132, + "learning_rate": 1.8956976500650664e-05, + "loss": 0.5512, + "step": 15521 + }, + { + "epoch": 2.5338557609893475, + "grad_norm": 2.158123731613159, + "learning_rate": 1.8956835389887528e-05, + "loss": 0.5479, + "step": 15522 + }, + { + "epoch": 2.5340190196318515, + "grad_norm": 1.8159396648406982, + "learning_rate": 1.895669427010486e-05, + "loss": 0.6136, + "step": 15523 + }, + { + "epoch": 2.534182278274356, + "grad_norm": 1.5652967691421509, + "learning_rate": 1.89565531413028e-05, + "loss": 0.4917, + "step": 15524 + }, + { + "epoch": 2.5343455369168604, + "grad_norm": 1.7781562805175781, + "learning_rate": 1.89564120034815e-05, + "loss": 0.5603, + "step": 15525 + }, + { + "epoch": 2.534508795559365, + "grad_norm": 1.5345760583877563, + "learning_rate": 1.8956270856641087e-05, + "loss": 0.5152, + "step": 15526 + }, + { + "epoch": 2.5346720542018693, + "grad_norm": 1.6790302991867065, + "learning_rate": 1.8956129700781717e-05, + "loss": 0.4991, + "step": 15527 + }, + { + "epoch": 2.5348353128443737, + "grad_norm": 1.9713993072509766, + "learning_rate": 1.8955988535903526e-05, + "loss": 0.6142, + "step": 15528 + }, + { + "epoch": 2.534998571486878, + "grad_norm": 2.530395746231079, + "learning_rate": 1.8955847362006653e-05, + "loss": 0.5279, + "step": 15529 + }, + { + "epoch": 2.5351618301293826, + "grad_norm": 1.7072526216506958, + "learning_rate": 1.8955706179091248e-05, + "loss": 0.526, + "step": 15530 + }, + { + "epoch": 2.535325088771887, + "grad_norm": 1.8654084205627441, + "learning_rate": 1.895556498715745e-05, + "loss": 0.5038, + "step": 15531 + }, + { + "epoch": 2.5354883474143914, + "grad_norm": 1.7975820302963257, + "learning_rate": 1.8955423786205398e-05, + "loss": 0.5622, + "step": 15532 + }, + { + "epoch": 2.535651606056896, + "grad_norm": 1.7557883262634277, + "learning_rate": 1.895528257623524e-05, + "loss": 0.5618, + "step": 15533 + }, + { + "epoch": 2.5358148646994, + "grad_norm": 1.7216062545776367, + "learning_rate": 1.8955141357247113e-05, + "loss": 0.5782, + "step": 15534 + }, + { + "epoch": 2.5359781233419043, + "grad_norm": 1.8765580654144287, + "learning_rate": 1.895500012924116e-05, + "loss": 0.6088, + "step": 15535 + }, + { + "epoch": 2.5361413819844087, + "grad_norm": 1.914957880973816, + "learning_rate": 1.8954858892217522e-05, + "loss": 0.6056, + "step": 15536 + }, + { + "epoch": 2.536304640626913, + "grad_norm": 1.5626541376113892, + "learning_rate": 1.895471764617635e-05, + "loss": 0.4733, + "step": 15537 + }, + { + "epoch": 2.5364678992694176, + "grad_norm": 1.852906584739685, + "learning_rate": 1.8954576391117772e-05, + "loss": 0.5996, + "step": 15538 + }, + { + "epoch": 2.536631157911922, + "grad_norm": 1.8698798418045044, + "learning_rate": 1.8954435127041947e-05, + "loss": 0.5192, + "step": 15539 + }, + { + "epoch": 2.5367944165544265, + "grad_norm": 1.8380168676376343, + "learning_rate": 1.8954293853949002e-05, + "loss": 0.5901, + "step": 15540 + }, + { + "epoch": 2.5369576751969305, + "grad_norm": 1.6121821403503418, + "learning_rate": 1.8954152571839082e-05, + "loss": 0.4956, + "step": 15541 + }, + { + "epoch": 2.537120933839435, + "grad_norm": 1.842129111289978, + "learning_rate": 1.895401128071234e-05, + "loss": 0.5084, + "step": 15542 + }, + { + "epoch": 2.5372841924819394, + "grad_norm": 1.707373857498169, + "learning_rate": 1.8953869980568908e-05, + "loss": 0.6056, + "step": 15543 + }, + { + "epoch": 2.537447451124444, + "grad_norm": 1.626772403717041, + "learning_rate": 1.8953728671408934e-05, + "loss": 0.537, + "step": 15544 + }, + { + "epoch": 2.5376107097669482, + "grad_norm": 1.9726788997650146, + "learning_rate": 1.8953587353232555e-05, + "loss": 0.6284, + "step": 15545 + }, + { + "epoch": 2.5377739684094527, + "grad_norm": 1.9556183815002441, + "learning_rate": 1.8953446026039913e-05, + "loss": 0.6291, + "step": 15546 + }, + { + "epoch": 2.537937227051957, + "grad_norm": 1.956395149230957, + "learning_rate": 1.895330468983116e-05, + "loss": 0.7261, + "step": 15547 + }, + { + "epoch": 2.5381004856944616, + "grad_norm": 1.6748870611190796, + "learning_rate": 1.8953163344606426e-05, + "loss": 0.6116, + "step": 15548 + }, + { + "epoch": 2.538263744336966, + "grad_norm": 1.7852253913879395, + "learning_rate": 1.8953021990365864e-05, + "loss": 0.6092, + "step": 15549 + }, + { + "epoch": 2.5384270029794704, + "grad_norm": 1.5583792924880981, + "learning_rate": 1.8952880627109606e-05, + "loss": 0.551, + "step": 15550 + }, + { + "epoch": 2.538590261621975, + "grad_norm": 1.6606791019439697, + "learning_rate": 1.8952739254837803e-05, + "loss": 0.563, + "step": 15551 + }, + { + "epoch": 2.538753520264479, + "grad_norm": 1.7670856714248657, + "learning_rate": 1.8952597873550595e-05, + "loss": 0.593, + "step": 15552 + }, + { + "epoch": 2.5389167789069833, + "grad_norm": 1.5748131275177002, + "learning_rate": 1.8952456483248117e-05, + "loss": 0.5256, + "step": 15553 + }, + { + "epoch": 2.5390800375494877, + "grad_norm": 1.585320234298706, + "learning_rate": 1.8952315083930522e-05, + "loss": 0.5012, + "step": 15554 + }, + { + "epoch": 2.539243296191992, + "grad_norm": 1.7997065782546997, + "learning_rate": 1.895217367559795e-05, + "loss": 0.6637, + "step": 15555 + }, + { + "epoch": 2.5394065548344966, + "grad_norm": 1.8043428659439087, + "learning_rate": 1.895203225825054e-05, + "loss": 0.5643, + "step": 15556 + }, + { + "epoch": 2.539569813477001, + "grad_norm": 2.0066006183624268, + "learning_rate": 1.8951890831888432e-05, + "loss": 0.622, + "step": 15557 + }, + { + "epoch": 2.539733072119505, + "grad_norm": 2.0161194801330566, + "learning_rate": 1.8951749396511777e-05, + "loss": 0.5959, + "step": 15558 + }, + { + "epoch": 2.5398963307620095, + "grad_norm": 1.3443061113357544, + "learning_rate": 1.8951607952120712e-05, + "loss": 0.4416, + "step": 15559 + }, + { + "epoch": 2.540059589404514, + "grad_norm": 1.6278437376022339, + "learning_rate": 1.8951466498715378e-05, + "loss": 0.5545, + "step": 15560 + }, + { + "epoch": 2.5402228480470184, + "grad_norm": 1.6980572938919067, + "learning_rate": 1.8951325036295922e-05, + "loss": 0.5355, + "step": 15561 + }, + { + "epoch": 2.540386106689523, + "grad_norm": 1.6848294734954834, + "learning_rate": 1.8951183564862484e-05, + "loss": 0.5526, + "step": 15562 + }, + { + "epoch": 2.5405493653320272, + "grad_norm": 1.9395794868469238, + "learning_rate": 1.8951042084415205e-05, + "loss": 0.4949, + "step": 15563 + }, + { + "epoch": 2.5407126239745317, + "grad_norm": 1.9922313690185547, + "learning_rate": 1.8950900594954226e-05, + "loss": 0.6193, + "step": 15564 + }, + { + "epoch": 2.540875882617036, + "grad_norm": 1.7556859254837036, + "learning_rate": 1.8950759096479698e-05, + "loss": 0.5547, + "step": 15565 + }, + { + "epoch": 2.5410391412595406, + "grad_norm": 1.9881080389022827, + "learning_rate": 1.8950617588991757e-05, + "loss": 0.6136, + "step": 15566 + }, + { + "epoch": 2.541202399902045, + "grad_norm": 1.8095468282699585, + "learning_rate": 1.8950476072490545e-05, + "loss": 0.5569, + "step": 15567 + }, + { + "epoch": 2.5413656585445494, + "grad_norm": 1.8556568622589111, + "learning_rate": 1.8950334546976208e-05, + "loss": 0.5566, + "step": 15568 + }, + { + "epoch": 2.5415289171870534, + "grad_norm": 1.5052355527877808, + "learning_rate": 1.895019301244888e-05, + "loss": 0.4917, + "step": 15569 + }, + { + "epoch": 2.541692175829558, + "grad_norm": 1.7763417959213257, + "learning_rate": 1.8950051468908718e-05, + "loss": 0.5765, + "step": 15570 + }, + { + "epoch": 2.5418554344720623, + "grad_norm": 1.7401331663131714, + "learning_rate": 1.8949909916355853e-05, + "loss": 0.5738, + "step": 15571 + }, + { + "epoch": 2.5420186931145667, + "grad_norm": 1.7547327280044556, + "learning_rate": 1.894976835479043e-05, + "loss": 0.5577, + "step": 15572 + }, + { + "epoch": 2.542181951757071, + "grad_norm": 1.6878411769866943, + "learning_rate": 1.89496267842126e-05, + "loss": 0.5177, + "step": 15573 + }, + { + "epoch": 2.5423452103995756, + "grad_norm": 1.9020100831985474, + "learning_rate": 1.894948520462249e-05, + "loss": 0.6596, + "step": 15574 + }, + { + "epoch": 2.54250846904208, + "grad_norm": 2.1679320335388184, + "learning_rate": 1.894934361602025e-05, + "loss": 0.6023, + "step": 15575 + }, + { + "epoch": 2.542671727684584, + "grad_norm": 1.7715985774993896, + "learning_rate": 1.894920201840603e-05, + "loss": 0.5435, + "step": 15576 + }, + { + "epoch": 2.5428349863270885, + "grad_norm": 1.6473362445831299, + "learning_rate": 1.894906041177996e-05, + "loss": 0.5801, + "step": 15577 + }, + { + "epoch": 2.542998244969593, + "grad_norm": 1.842539668083191, + "learning_rate": 1.894891879614219e-05, + "loss": 0.5775, + "step": 15578 + }, + { + "epoch": 2.5431615036120974, + "grad_norm": 1.7441186904907227, + "learning_rate": 1.8948777171492863e-05, + "loss": 0.617, + "step": 15579 + }, + { + "epoch": 2.543324762254602, + "grad_norm": 1.5394890308380127, + "learning_rate": 1.894863553783212e-05, + "loss": 0.4797, + "step": 15580 + }, + { + "epoch": 2.5434880208971062, + "grad_norm": 1.685868501663208, + "learning_rate": 1.8948493895160103e-05, + "loss": 0.4872, + "step": 15581 + }, + { + "epoch": 2.5436512795396107, + "grad_norm": 2.037733316421509, + "learning_rate": 1.8948352243476953e-05, + "loss": 0.6049, + "step": 15582 + }, + { + "epoch": 2.543814538182115, + "grad_norm": 2.066713333129883, + "learning_rate": 1.8948210582782818e-05, + "loss": 0.6357, + "step": 15583 + }, + { + "epoch": 2.5439777968246196, + "grad_norm": 2.1460177898406982, + "learning_rate": 1.8948068913077832e-05, + "loss": 0.5165, + "step": 15584 + }, + { + "epoch": 2.544141055467124, + "grad_norm": 1.7273250818252563, + "learning_rate": 1.894792723436215e-05, + "loss": 0.5814, + "step": 15585 + }, + { + "epoch": 2.5443043141096284, + "grad_norm": 1.7541208267211914, + "learning_rate": 1.8947785546635905e-05, + "loss": 0.5993, + "step": 15586 + }, + { + "epoch": 2.5444675727521324, + "grad_norm": 1.8623584508895874, + "learning_rate": 1.894764384989924e-05, + "loss": 0.6044, + "step": 15587 + }, + { + "epoch": 2.544630831394637, + "grad_norm": 1.8277976512908936, + "learning_rate": 1.89475021441523e-05, + "loss": 0.5522, + "step": 15588 + }, + { + "epoch": 2.5447940900371413, + "grad_norm": 1.9964991807937622, + "learning_rate": 1.894736042939523e-05, + "loss": 0.5847, + "step": 15589 + }, + { + "epoch": 2.5449573486796457, + "grad_norm": 1.558369755744934, + "learning_rate": 1.894721870562817e-05, + "loss": 0.5862, + "step": 15590 + }, + { + "epoch": 2.54512060732215, + "grad_norm": 1.9518016576766968, + "learning_rate": 1.8947076972851263e-05, + "loss": 0.504, + "step": 15591 + }, + { + "epoch": 2.5452838659646546, + "grad_norm": 1.5962103605270386, + "learning_rate": 1.894693523106465e-05, + "loss": 0.5302, + "step": 15592 + }, + { + "epoch": 2.545447124607159, + "grad_norm": 1.7716047763824463, + "learning_rate": 1.8946793480268474e-05, + "loss": 0.4306, + "step": 15593 + }, + { + "epoch": 2.545610383249663, + "grad_norm": 1.7575047016143799, + "learning_rate": 1.8946651720462886e-05, + "loss": 0.5178, + "step": 15594 + }, + { + "epoch": 2.5457736418921675, + "grad_norm": 2.043139934539795, + "learning_rate": 1.8946509951648018e-05, + "loss": 0.705, + "step": 15595 + }, + { + "epoch": 2.545936900534672, + "grad_norm": 1.8266173601150513, + "learning_rate": 1.8946368173824017e-05, + "loss": 0.6087, + "step": 15596 + }, + { + "epoch": 2.5461001591771764, + "grad_norm": 1.7963411808013916, + "learning_rate": 1.8946226386991027e-05, + "loss": 0.5569, + "step": 15597 + }, + { + "epoch": 2.546263417819681, + "grad_norm": 2.164484739303589, + "learning_rate": 1.8946084591149187e-05, + "loss": 0.663, + "step": 15598 + }, + { + "epoch": 2.5464266764621852, + "grad_norm": 1.7659807205200195, + "learning_rate": 1.8945942786298644e-05, + "loss": 0.5304, + "step": 15599 + }, + { + "epoch": 2.5465899351046897, + "grad_norm": 1.973142147064209, + "learning_rate": 1.894580097243954e-05, + "loss": 0.5616, + "step": 15600 + }, + { + "epoch": 2.546753193747194, + "grad_norm": 1.531906008720398, + "learning_rate": 1.8945659149572014e-05, + "loss": 0.4796, + "step": 15601 + }, + { + "epoch": 2.5469164523896985, + "grad_norm": 1.6793532371520996, + "learning_rate": 1.8945517317696214e-05, + "loss": 0.4864, + "step": 15602 + }, + { + "epoch": 2.547079711032203, + "grad_norm": 1.5959155559539795, + "learning_rate": 1.8945375476812276e-05, + "loss": 0.4431, + "step": 15603 + }, + { + "epoch": 2.5472429696747074, + "grad_norm": 2.0457799434661865, + "learning_rate": 1.8945233626920353e-05, + "loss": 0.5376, + "step": 15604 + }, + { + "epoch": 2.5474062283172114, + "grad_norm": 1.5931330919265747, + "learning_rate": 1.8945091768020577e-05, + "loss": 0.5122, + "step": 15605 + }, + { + "epoch": 2.547569486959716, + "grad_norm": 1.6402108669281006, + "learning_rate": 1.89449499001131e-05, + "loss": 0.4923, + "step": 15606 + }, + { + "epoch": 2.5477327456022203, + "grad_norm": 1.8780555725097656, + "learning_rate": 1.894480802319806e-05, + "loss": 0.6072, + "step": 15607 + }, + { + "epoch": 2.5478960042447247, + "grad_norm": 2.0728824138641357, + "learning_rate": 1.89446661372756e-05, + "loss": 1.2433, + "step": 15608 + }, + { + "epoch": 2.548059262887229, + "grad_norm": 1.8120286464691162, + "learning_rate": 1.8944524242345864e-05, + "loss": 0.535, + "step": 15609 + }, + { + "epoch": 2.5482225215297336, + "grad_norm": 1.6752923727035522, + "learning_rate": 1.8944382338408994e-05, + "loss": 0.4965, + "step": 15610 + }, + { + "epoch": 2.5483857801722376, + "grad_norm": 1.6241793632507324, + "learning_rate": 1.8944240425465134e-05, + "loss": 0.6201, + "step": 15611 + }, + { + "epoch": 2.548549038814742, + "grad_norm": 1.8830856084823608, + "learning_rate": 1.8944098503514424e-05, + "loss": 0.5452, + "step": 15612 + }, + { + "epoch": 2.5487122974572465, + "grad_norm": 1.4826581478118896, + "learning_rate": 1.8943956572557013e-05, + "loss": 0.4514, + "step": 15613 + }, + { + "epoch": 2.548875556099751, + "grad_norm": 1.3465677499771118, + "learning_rate": 1.894381463259304e-05, + "loss": 0.4039, + "step": 15614 + }, + { + "epoch": 2.5490388147422554, + "grad_norm": 2.042473316192627, + "learning_rate": 1.8943672683622646e-05, + "loss": 0.5517, + "step": 15615 + }, + { + "epoch": 2.54920207338476, + "grad_norm": 1.9467122554779053, + "learning_rate": 1.8943530725645975e-05, + "loss": 0.6433, + "step": 15616 + }, + { + "epoch": 2.5493653320272642, + "grad_norm": 1.8233556747436523, + "learning_rate": 1.894338875866317e-05, + "loss": 0.6141, + "step": 15617 + }, + { + "epoch": 2.5495285906697687, + "grad_norm": 2.173415184020996, + "learning_rate": 1.894324678267438e-05, + "loss": 0.7738, + "step": 15618 + }, + { + "epoch": 2.549691849312273, + "grad_norm": 1.7372469902038574, + "learning_rate": 1.894310479767974e-05, + "loss": 0.5739, + "step": 15619 + }, + { + "epoch": 2.5498551079547775, + "grad_norm": 1.7950176000595093, + "learning_rate": 1.8942962803679393e-05, + "loss": 0.5903, + "step": 15620 + }, + { + "epoch": 2.550018366597282, + "grad_norm": 1.7515422105789185, + "learning_rate": 1.8942820800673488e-05, + "loss": 0.5195, + "step": 15621 + }, + { + "epoch": 2.550181625239786, + "grad_norm": 1.5905182361602783, + "learning_rate": 1.8942678788662166e-05, + "loss": 0.4967, + "step": 15622 + }, + { + "epoch": 2.5503448838822904, + "grad_norm": 1.6982676982879639, + "learning_rate": 1.8942536767645566e-05, + "loss": 0.5904, + "step": 15623 + }, + { + "epoch": 2.550508142524795, + "grad_norm": 1.6693389415740967, + "learning_rate": 1.8942394737623836e-05, + "loss": 0.4844, + "step": 15624 + }, + { + "epoch": 2.5506714011672993, + "grad_norm": 1.9208927154541016, + "learning_rate": 1.8942252698597113e-05, + "loss": 0.7133, + "step": 15625 + }, + { + "epoch": 2.5508346598098037, + "grad_norm": 2.012354850769043, + "learning_rate": 1.894211065056555e-05, + "loss": 0.5379, + "step": 15626 + }, + { + "epoch": 2.550997918452308, + "grad_norm": 1.8031601905822754, + "learning_rate": 1.894196859352928e-05, + "loss": 0.5064, + "step": 15627 + }, + { + "epoch": 2.5511611770948126, + "grad_norm": 1.643278956413269, + "learning_rate": 1.894182652748845e-05, + "loss": 0.5472, + "step": 15628 + }, + { + "epoch": 2.5513244357373166, + "grad_norm": 1.4583779573440552, + "learning_rate": 1.8941684452443203e-05, + "loss": 0.5828, + "step": 15629 + }, + { + "epoch": 2.551487694379821, + "grad_norm": 1.809288740158081, + "learning_rate": 1.8941542368393683e-05, + "loss": 0.591, + "step": 15630 + }, + { + "epoch": 2.5516509530223255, + "grad_norm": 1.8054394721984863, + "learning_rate": 1.8941400275340034e-05, + "loss": 0.6059, + "step": 15631 + }, + { + "epoch": 2.55181421166483, + "grad_norm": 1.4790408611297607, + "learning_rate": 1.894125817328239e-05, + "loss": 0.4831, + "step": 15632 + }, + { + "epoch": 2.5519774703073344, + "grad_norm": 1.5264103412628174, + "learning_rate": 1.894111606222091e-05, + "loss": 0.4468, + "step": 15633 + }, + { + "epoch": 2.552140728949839, + "grad_norm": 1.927931308746338, + "learning_rate": 1.8940973942155726e-05, + "loss": 0.6412, + "step": 15634 + }, + { + "epoch": 2.5523039875923432, + "grad_norm": 1.6289501190185547, + "learning_rate": 1.8940831813086982e-05, + "loss": 0.6249, + "step": 15635 + }, + { + "epoch": 2.5524672462348477, + "grad_norm": 1.8751503229141235, + "learning_rate": 1.8940689675014826e-05, + "loss": 0.6228, + "step": 15636 + }, + { + "epoch": 2.552630504877352, + "grad_norm": 1.7840107679367065, + "learning_rate": 1.8940547527939395e-05, + "loss": 0.6338, + "step": 15637 + }, + { + "epoch": 2.5527937635198565, + "grad_norm": 2.008552074432373, + "learning_rate": 1.8940405371860836e-05, + "loss": 0.6181, + "step": 15638 + }, + { + "epoch": 2.552957022162361, + "grad_norm": 1.4462943077087402, + "learning_rate": 1.894026320677929e-05, + "loss": 0.4949, + "step": 15639 + }, + { + "epoch": 2.553120280804865, + "grad_norm": 1.5256214141845703, + "learning_rate": 1.8940121032694902e-05, + "loss": 0.4637, + "step": 15640 + }, + { + "epoch": 2.5532835394473694, + "grad_norm": 1.7905299663543701, + "learning_rate": 1.8939978849607814e-05, + "loss": 0.578, + "step": 15641 + }, + { + "epoch": 2.553446798089874, + "grad_norm": 1.6936005353927612, + "learning_rate": 1.8939836657518168e-05, + "loss": 0.4902, + "step": 15642 + }, + { + "epoch": 2.5536100567323783, + "grad_norm": 2.2653205394744873, + "learning_rate": 1.893969445642611e-05, + "loss": 0.5904, + "step": 15643 + }, + { + "epoch": 2.5537733153748827, + "grad_norm": 1.896876573562622, + "learning_rate": 1.8939552246331783e-05, + "loss": 0.6364, + "step": 15644 + }, + { + "epoch": 2.553936574017387, + "grad_norm": 1.8304800987243652, + "learning_rate": 1.8939410027235332e-05, + "loss": 0.6611, + "step": 15645 + }, + { + "epoch": 2.554099832659891, + "grad_norm": 2.0894553661346436, + "learning_rate": 1.8939267799136896e-05, + "loss": 0.6249, + "step": 15646 + }, + { + "epoch": 2.5542630913023956, + "grad_norm": 1.9280693531036377, + "learning_rate": 1.8939125562036618e-05, + "loss": 0.5937, + "step": 15647 + }, + { + "epoch": 2.5544263499449, + "grad_norm": 1.822906732559204, + "learning_rate": 1.8938983315934646e-05, + "loss": 0.5671, + "step": 15648 + }, + { + "epoch": 2.5545896085874045, + "grad_norm": 2.0976510047912598, + "learning_rate": 1.8938841060831116e-05, + "loss": 0.7772, + "step": 15649 + }, + { + "epoch": 2.554752867229909, + "grad_norm": 1.9113601446151733, + "learning_rate": 1.8938698796726177e-05, + "loss": 0.5702, + "step": 15650 + }, + { + "epoch": 2.5549161258724133, + "grad_norm": 1.9257571697235107, + "learning_rate": 1.8938556523619967e-05, + "loss": 0.6007, + "step": 15651 + }, + { + "epoch": 2.555079384514918, + "grad_norm": 1.712743878364563, + "learning_rate": 1.893841424151264e-05, + "loss": 0.566, + "step": 15652 + }, + { + "epoch": 2.5552426431574222, + "grad_norm": 1.7775379419326782, + "learning_rate": 1.893827195040433e-05, + "loss": 0.5243, + "step": 15653 + }, + { + "epoch": 2.5554059017999267, + "grad_norm": 1.8614720106124878, + "learning_rate": 1.893812965029518e-05, + "loss": 0.6168, + "step": 15654 + }, + { + "epoch": 2.555569160442431, + "grad_norm": 1.7139643430709839, + "learning_rate": 1.8937987341185337e-05, + "loss": 0.5732, + "step": 15655 + }, + { + "epoch": 2.5557324190849355, + "grad_norm": 1.6032263040542603, + "learning_rate": 1.8937845023074943e-05, + "loss": 0.5256, + "step": 15656 + }, + { + "epoch": 2.5558956777274395, + "grad_norm": 1.7292115688323975, + "learning_rate": 1.8937702695964142e-05, + "loss": 0.5963, + "step": 15657 + }, + { + "epoch": 2.556058936369944, + "grad_norm": 1.4581458568572998, + "learning_rate": 1.8937560359853074e-05, + "loss": 0.4983, + "step": 15658 + }, + { + "epoch": 2.5562221950124484, + "grad_norm": 1.5375405550003052, + "learning_rate": 1.8937418014741892e-05, + "loss": 0.4256, + "step": 15659 + }, + { + "epoch": 2.556385453654953, + "grad_norm": 1.7295705080032349, + "learning_rate": 1.8937275660630727e-05, + "loss": 0.5194, + "step": 15660 + }, + { + "epoch": 2.5565487122974573, + "grad_norm": 1.4985508918762207, + "learning_rate": 1.893713329751973e-05, + "loss": 0.4739, + "step": 15661 + }, + { + "epoch": 2.5567119709399617, + "grad_norm": 1.9848827123641968, + "learning_rate": 1.893699092540904e-05, + "loss": 0.6396, + "step": 15662 + }, + { + "epoch": 2.556875229582466, + "grad_norm": 1.4446439743041992, + "learning_rate": 1.8936848544298804e-05, + "loss": 0.458, + "step": 15663 + }, + { + "epoch": 2.55703848822497, + "grad_norm": 1.621735692024231, + "learning_rate": 1.8936706154189162e-05, + "loss": 0.551, + "step": 15664 + }, + { + "epoch": 2.5572017468674746, + "grad_norm": 1.9785785675048828, + "learning_rate": 1.8936563755080263e-05, + "loss": 0.5974, + "step": 15665 + }, + { + "epoch": 2.557365005509979, + "grad_norm": 1.8769173622131348, + "learning_rate": 1.8936421346972242e-05, + "loss": 0.561, + "step": 15666 + }, + { + "epoch": 2.5575282641524835, + "grad_norm": 1.7787165641784668, + "learning_rate": 1.8936278929865248e-05, + "loss": 0.5403, + "step": 15667 + }, + { + "epoch": 2.557691522794988, + "grad_norm": 1.7975636720657349, + "learning_rate": 1.8936136503759427e-05, + "loss": 0.5953, + "step": 15668 + }, + { + "epoch": 2.5578547814374923, + "grad_norm": 1.7871112823486328, + "learning_rate": 1.8935994068654917e-05, + "loss": 0.6264, + "step": 15669 + }, + { + "epoch": 2.558018040079997, + "grad_norm": 2.076077938079834, + "learning_rate": 1.893585162455186e-05, + "loss": 0.7105, + "step": 15670 + }, + { + "epoch": 2.558181298722501, + "grad_norm": 1.8031333684921265, + "learning_rate": 1.893570917145041e-05, + "loss": 0.5939, + "step": 15671 + }, + { + "epoch": 2.5583445573650057, + "grad_norm": 1.9196693897247314, + "learning_rate": 1.8935566709350695e-05, + "loss": 0.6698, + "step": 15672 + }, + { + "epoch": 2.55850781600751, + "grad_norm": 2.058677911758423, + "learning_rate": 1.8935424238252872e-05, + "loss": 0.7123, + "step": 15673 + }, + { + "epoch": 2.5586710746500145, + "grad_norm": 1.5348143577575684, + "learning_rate": 1.893528175815708e-05, + "loss": 0.4883, + "step": 15674 + }, + { + "epoch": 2.5588343332925185, + "grad_norm": 1.6589624881744385, + "learning_rate": 1.893513926906346e-05, + "loss": 0.5548, + "step": 15675 + }, + { + "epoch": 2.558997591935023, + "grad_norm": 1.827877402305603, + "learning_rate": 1.8934996770972157e-05, + "loss": 0.703, + "step": 15676 + }, + { + "epoch": 2.5591608505775274, + "grad_norm": 1.819804310798645, + "learning_rate": 1.8934854263883314e-05, + "loss": 0.5253, + "step": 15677 + }, + { + "epoch": 2.559324109220032, + "grad_norm": 2.0691587924957275, + "learning_rate": 1.8934711747797075e-05, + "loss": 0.6284, + "step": 15678 + }, + { + "epoch": 2.5594873678625363, + "grad_norm": 1.7142539024353027, + "learning_rate": 1.8934569222713583e-05, + "loss": 0.5859, + "step": 15679 + }, + { + "epoch": 2.5596506265050407, + "grad_norm": 2.1480462551116943, + "learning_rate": 1.8934426688632986e-05, + "loss": 0.6399, + "step": 15680 + }, + { + "epoch": 2.559813885147545, + "grad_norm": 1.5304440259933472, + "learning_rate": 1.8934284145555418e-05, + "loss": 0.5315, + "step": 15681 + }, + { + "epoch": 2.559977143790049, + "grad_norm": 1.8432364463806152, + "learning_rate": 1.8934141593481032e-05, + "loss": 0.6141, + "step": 15682 + }, + { + "epoch": 2.5601404024325536, + "grad_norm": 1.966802954673767, + "learning_rate": 1.893399903240997e-05, + "loss": 0.5964, + "step": 15683 + }, + { + "epoch": 2.560303661075058, + "grad_norm": 1.6444041728973389, + "learning_rate": 1.8933856462342368e-05, + "loss": 0.5788, + "step": 15684 + }, + { + "epoch": 2.5604669197175625, + "grad_norm": 1.7112160921096802, + "learning_rate": 1.893371388327838e-05, + "loss": 0.6076, + "step": 15685 + }, + { + "epoch": 2.560630178360067, + "grad_norm": 1.9970309734344482, + "learning_rate": 1.893357129521814e-05, + "loss": 0.6931, + "step": 15686 + }, + { + "epoch": 2.5607934370025713, + "grad_norm": 1.9249969720840454, + "learning_rate": 1.8933428698161798e-05, + "loss": 0.6374, + "step": 15687 + }, + { + "epoch": 2.5609566956450758, + "grad_norm": 1.7529265880584717, + "learning_rate": 1.8933286092109492e-05, + "loss": 0.5781, + "step": 15688 + }, + { + "epoch": 2.56111995428758, + "grad_norm": 1.374541997909546, + "learning_rate": 1.8933143477061374e-05, + "loss": 0.486, + "step": 15689 + }, + { + "epoch": 2.5612832129300847, + "grad_norm": 1.8761184215545654, + "learning_rate": 1.8933000853017584e-05, + "loss": 0.6065, + "step": 15690 + }, + { + "epoch": 2.561446471572589, + "grad_norm": 1.9811145067214966, + "learning_rate": 1.893285821997826e-05, + "loss": 0.6722, + "step": 15691 + }, + { + "epoch": 2.5616097302150935, + "grad_norm": 2.0062708854675293, + "learning_rate": 1.8932715577943553e-05, + "loss": 0.7309, + "step": 15692 + }, + { + "epoch": 2.5617729888575975, + "grad_norm": 1.9147820472717285, + "learning_rate": 1.8932572926913605e-05, + "loss": 0.5837, + "step": 15693 + }, + { + "epoch": 2.561936247500102, + "grad_norm": 1.5340783596038818, + "learning_rate": 1.8932430266888556e-05, + "loss": 0.5541, + "step": 15694 + }, + { + "epoch": 2.5620995061426064, + "grad_norm": 1.5542755126953125, + "learning_rate": 1.893228759786855e-05, + "loss": 0.5202, + "step": 15695 + }, + { + "epoch": 2.562262764785111, + "grad_norm": 1.6196397542953491, + "learning_rate": 1.893214491985374e-05, + "loss": 0.4854, + "step": 15696 + }, + { + "epoch": 2.5624260234276153, + "grad_norm": 1.7390884160995483, + "learning_rate": 1.8932002232844254e-05, + "loss": 0.567, + "step": 15697 + }, + { + "epoch": 2.5625892820701197, + "grad_norm": 1.4687983989715576, + "learning_rate": 1.8931859536840248e-05, + "loss": 0.4662, + "step": 15698 + }, + { + "epoch": 2.5627525407126237, + "grad_norm": 1.6067798137664795, + "learning_rate": 1.8931716831841863e-05, + "loss": 0.5649, + "step": 15699 + }, + { + "epoch": 2.562915799355128, + "grad_norm": 1.681220531463623, + "learning_rate": 1.893157411784924e-05, + "loss": 0.519, + "step": 15700 + }, + { + "epoch": 2.5630790579976326, + "grad_norm": 1.347490906715393, + "learning_rate": 1.8931431394862526e-05, + "loss": 0.4217, + "step": 15701 + }, + { + "epoch": 2.563242316640137, + "grad_norm": 2.174295663833618, + "learning_rate": 1.8931288662881862e-05, + "loss": 0.6446, + "step": 15702 + }, + { + "epoch": 2.5634055752826415, + "grad_norm": 1.7396668195724487, + "learning_rate": 1.893114592190739e-05, + "loss": 0.5685, + "step": 15703 + }, + { + "epoch": 2.563568833925146, + "grad_norm": 1.9732019901275635, + "learning_rate": 1.893100317193926e-05, + "loss": 0.6087, + "step": 15704 + }, + { + "epoch": 2.5637320925676503, + "grad_norm": 1.8301602602005005, + "learning_rate": 1.893086041297761e-05, + "loss": 0.6439, + "step": 15705 + }, + { + "epoch": 2.5638953512101548, + "grad_norm": 1.9886420965194702, + "learning_rate": 1.8930717645022584e-05, + "loss": 0.6108, + "step": 15706 + }, + { + "epoch": 2.564058609852659, + "grad_norm": 1.9181303977966309, + "learning_rate": 1.8930574868074333e-05, + "loss": 0.6773, + "step": 15707 + }, + { + "epoch": 2.5642218684951636, + "grad_norm": 1.5719108581542969, + "learning_rate": 1.8930432082132992e-05, + "loss": 0.5201, + "step": 15708 + }, + { + "epoch": 2.564385127137668, + "grad_norm": 1.9036204814910889, + "learning_rate": 1.8930289287198712e-05, + "loss": 0.6377, + "step": 15709 + }, + { + "epoch": 2.564548385780172, + "grad_norm": 1.7010380029678345, + "learning_rate": 1.893014648327163e-05, + "loss": 0.5555, + "step": 15710 + }, + { + "epoch": 2.5647116444226765, + "grad_norm": 1.7698684930801392, + "learning_rate": 1.893000367035189e-05, + "loss": 0.6181, + "step": 15711 + }, + { + "epoch": 2.564874903065181, + "grad_norm": 1.8683429956436157, + "learning_rate": 1.8929860848439642e-05, + "loss": 0.5407, + "step": 15712 + }, + { + "epoch": 2.5650381617076854, + "grad_norm": 1.6565359830856323, + "learning_rate": 1.8929718017535027e-05, + "loss": 0.4555, + "step": 15713 + }, + { + "epoch": 2.56520142035019, + "grad_norm": 1.6794636249542236, + "learning_rate": 1.8929575177638185e-05, + "loss": 0.5679, + "step": 15714 + }, + { + "epoch": 2.5653646789926943, + "grad_norm": 1.561374545097351, + "learning_rate": 1.892943232874927e-05, + "loss": 0.5297, + "step": 15715 + }, + { + "epoch": 2.5655279376351987, + "grad_norm": 1.7824444770812988, + "learning_rate": 1.8929289470868412e-05, + "loss": 0.5657, + "step": 15716 + }, + { + "epoch": 2.5656911962777027, + "grad_norm": 1.5982351303100586, + "learning_rate": 1.8929146603995766e-05, + "loss": 0.5162, + "step": 15717 + }, + { + "epoch": 2.565854454920207, + "grad_norm": 1.9045556783676147, + "learning_rate": 1.892900372813147e-05, + "loss": 0.5869, + "step": 15718 + }, + { + "epoch": 2.5660177135627116, + "grad_norm": 1.7640644311904907, + "learning_rate": 1.892886084327567e-05, + "loss": 0.5462, + "step": 15719 + }, + { + "epoch": 2.566180972205216, + "grad_norm": 1.9124782085418701, + "learning_rate": 1.8928717949428508e-05, + "loss": 0.6403, + "step": 15720 + }, + { + "epoch": 2.5663442308477205, + "grad_norm": 1.7458771467208862, + "learning_rate": 1.892857504659013e-05, + "loss": 0.4842, + "step": 15721 + }, + { + "epoch": 2.566507489490225, + "grad_norm": 1.74147367477417, + "learning_rate": 1.8928432134760683e-05, + "loss": 0.6693, + "step": 15722 + }, + { + "epoch": 2.5666707481327293, + "grad_norm": 1.7702559232711792, + "learning_rate": 1.8928289213940302e-05, + "loss": 0.6016, + "step": 15723 + }, + { + "epoch": 2.5668340067752338, + "grad_norm": 1.966869831085205, + "learning_rate": 1.892814628412914e-05, + "loss": 0.7009, + "step": 15724 + }, + { + "epoch": 2.566997265417738, + "grad_norm": 2.0722568035125732, + "learning_rate": 1.8928003345327334e-05, + "loss": 0.6248, + "step": 15725 + }, + { + "epoch": 2.5671605240602426, + "grad_norm": 1.7482815980911255, + "learning_rate": 1.8927860397535035e-05, + "loss": 0.5573, + "step": 15726 + }, + { + "epoch": 2.567323782702747, + "grad_norm": 1.6302883625030518, + "learning_rate": 1.892771744075238e-05, + "loss": 0.5371, + "step": 15727 + }, + { + "epoch": 2.567487041345251, + "grad_norm": 1.843042254447937, + "learning_rate": 1.892757447497952e-05, + "loss": 0.6029, + "step": 15728 + }, + { + "epoch": 2.5676502999877555, + "grad_norm": 1.5761293172836304, + "learning_rate": 1.8927431500216587e-05, + "loss": 0.5286, + "step": 15729 + }, + { + "epoch": 2.56781355863026, + "grad_norm": 1.8380481004714966, + "learning_rate": 1.8927288516463738e-05, + "loss": 0.6033, + "step": 15730 + }, + { + "epoch": 2.5679768172727644, + "grad_norm": 1.7294728755950928, + "learning_rate": 1.8927145523721112e-05, + "loss": 0.5382, + "step": 15731 + }, + { + "epoch": 2.568140075915269, + "grad_norm": 1.7725814580917358, + "learning_rate": 1.892700252198885e-05, + "loss": 0.575, + "step": 15732 + }, + { + "epoch": 2.5683033345577733, + "grad_norm": 1.9153327941894531, + "learning_rate": 1.8926859511267103e-05, + "loss": 0.5352, + "step": 15733 + }, + { + "epoch": 2.5684665932002777, + "grad_norm": 1.6355431079864502, + "learning_rate": 1.8926716491556004e-05, + "loss": 0.5071, + "step": 15734 + }, + { + "epoch": 2.5686298518427817, + "grad_norm": 2.0125465393066406, + "learning_rate": 1.892657346285571e-05, + "loss": 0.5367, + "step": 15735 + }, + { + "epoch": 2.568793110485286, + "grad_norm": 2.0137040615081787, + "learning_rate": 1.892643042516636e-05, + "loss": 0.6779, + "step": 15736 + }, + { + "epoch": 2.5689563691277906, + "grad_norm": 2.2872300148010254, + "learning_rate": 1.892628737848809e-05, + "loss": 0.7026, + "step": 15737 + }, + { + "epoch": 2.569119627770295, + "grad_norm": 1.7693305015563965, + "learning_rate": 1.892614432282106e-05, + "loss": 0.591, + "step": 15738 + }, + { + "epoch": 2.5692828864127994, + "grad_norm": 1.7878426313400269, + "learning_rate": 1.8926001258165397e-05, + "loss": 0.6154, + "step": 15739 + }, + { + "epoch": 2.569446145055304, + "grad_norm": 1.8393193483352661, + "learning_rate": 1.892585818452126e-05, + "loss": 0.5839, + "step": 15740 + }, + { + "epoch": 2.5696094036978083, + "grad_norm": 1.7133667469024658, + "learning_rate": 1.892571510188878e-05, + "loss": 0.5093, + "step": 15741 + }, + { + "epoch": 2.5697726623403128, + "grad_norm": 1.9416930675506592, + "learning_rate": 1.892557201026811e-05, + "loss": 0.665, + "step": 15742 + }, + { + "epoch": 2.569935920982817, + "grad_norm": 1.9800057411193848, + "learning_rate": 1.892542890965939e-05, + "loss": 0.5711, + "step": 15743 + }, + { + "epoch": 2.5700991796253216, + "grad_norm": 2.033428907394409, + "learning_rate": 1.8925285800062763e-05, + "loss": 0.5595, + "step": 15744 + }, + { + "epoch": 2.570262438267826, + "grad_norm": 1.861674189567566, + "learning_rate": 1.892514268147838e-05, + "loss": 0.5523, + "step": 15745 + }, + { + "epoch": 2.57042569691033, + "grad_norm": 1.7300775051116943, + "learning_rate": 1.892499955390638e-05, + "loss": 0.5899, + "step": 15746 + }, + { + "epoch": 2.5705889555528345, + "grad_norm": 1.8629488945007324, + "learning_rate": 1.892485641734691e-05, + "loss": 0.5597, + "step": 15747 + }, + { + "epoch": 2.570752214195339, + "grad_norm": 1.7867040634155273, + "learning_rate": 1.8924713271800107e-05, + "loss": 0.5897, + "step": 15748 + }, + { + "epoch": 2.5709154728378434, + "grad_norm": 1.5962499380111694, + "learning_rate": 1.8924570117266124e-05, + "loss": 0.5598, + "step": 15749 + }, + { + "epoch": 2.571078731480348, + "grad_norm": 1.5919663906097412, + "learning_rate": 1.89244269537451e-05, + "loss": 0.5145, + "step": 15750 + }, + { + "epoch": 2.5712419901228523, + "grad_norm": 1.7783151865005493, + "learning_rate": 1.892428378123718e-05, + "loss": 0.5371, + "step": 15751 + }, + { + "epoch": 2.5714052487653563, + "grad_norm": 1.3907052278518677, + "learning_rate": 1.8924140599742512e-05, + "loss": 0.4358, + "step": 15752 + }, + { + "epoch": 2.5715685074078607, + "grad_norm": 2.055863618850708, + "learning_rate": 1.892399740926123e-05, + "loss": 0.5541, + "step": 15753 + }, + { + "epoch": 2.571731766050365, + "grad_norm": 1.675504207611084, + "learning_rate": 1.8923854209793487e-05, + "loss": 0.5513, + "step": 15754 + }, + { + "epoch": 2.5718950246928696, + "grad_norm": 2.0403356552124023, + "learning_rate": 1.892371100133943e-05, + "loss": 0.6035, + "step": 15755 + }, + { + "epoch": 2.572058283335374, + "grad_norm": 1.5648068189620972, + "learning_rate": 1.8923567783899193e-05, + "loss": 0.6304, + "step": 15756 + }, + { + "epoch": 2.5722215419778784, + "grad_norm": 1.6164612770080566, + "learning_rate": 1.892342455747293e-05, + "loss": 0.547, + "step": 15757 + }, + { + "epoch": 2.572384800620383, + "grad_norm": 1.578420639038086, + "learning_rate": 1.892328132206078e-05, + "loss": 0.5086, + "step": 15758 + }, + { + "epoch": 2.5725480592628873, + "grad_norm": 1.7190542221069336, + "learning_rate": 1.8923138077662885e-05, + "loss": 0.4716, + "step": 15759 + }, + { + "epoch": 2.5727113179053918, + "grad_norm": 1.7485283613204956, + "learning_rate": 1.8922994824279394e-05, + "loss": 0.5989, + "step": 15760 + }, + { + "epoch": 2.572874576547896, + "grad_norm": 1.7738200426101685, + "learning_rate": 1.892285156191045e-05, + "loss": 0.565, + "step": 15761 + }, + { + "epoch": 2.5730378351904006, + "grad_norm": 1.8992691040039062, + "learning_rate": 1.8922708290556197e-05, + "loss": 0.5937, + "step": 15762 + }, + { + "epoch": 2.5732010938329046, + "grad_norm": 1.6902225017547607, + "learning_rate": 1.8922565010216778e-05, + "loss": 0.4885, + "step": 15763 + }, + { + "epoch": 2.573364352475409, + "grad_norm": 1.6345921754837036, + "learning_rate": 1.8922421720892338e-05, + "loss": 0.5224, + "step": 15764 + }, + { + "epoch": 2.5735276111179135, + "grad_norm": 1.5695724487304688, + "learning_rate": 1.8922278422583026e-05, + "loss": 0.5419, + "step": 15765 + }, + { + "epoch": 2.573690869760418, + "grad_norm": 1.8224223852157593, + "learning_rate": 1.8922135115288976e-05, + "loss": 0.4616, + "step": 15766 + }, + { + "epoch": 2.5738541284029224, + "grad_norm": 1.7406085729599, + "learning_rate": 1.892199179901034e-05, + "loss": 0.5693, + "step": 15767 + }, + { + "epoch": 2.574017387045427, + "grad_norm": 2.2137234210968018, + "learning_rate": 1.8921848473747262e-05, + "loss": 0.6735, + "step": 15768 + }, + { + "epoch": 2.5741806456879313, + "grad_norm": 2.037755012512207, + "learning_rate": 1.8921705139499885e-05, + "loss": 0.5338, + "step": 15769 + }, + { + "epoch": 2.5743439043304353, + "grad_norm": 2.189572334289551, + "learning_rate": 1.8921561796268354e-05, + "loss": 0.5991, + "step": 15770 + }, + { + "epoch": 2.5745071629729397, + "grad_norm": 1.9346760511398315, + "learning_rate": 1.8921418444052812e-05, + "loss": 0.6172, + "step": 15771 + }, + { + "epoch": 2.574670421615444, + "grad_norm": 2.050011396408081, + "learning_rate": 1.89212750828534e-05, + "loss": 0.6562, + "step": 15772 + }, + { + "epoch": 2.5748336802579486, + "grad_norm": 1.7066612243652344, + "learning_rate": 1.892113171267027e-05, + "loss": 0.5855, + "step": 15773 + }, + { + "epoch": 2.574996938900453, + "grad_norm": 1.7254384756088257, + "learning_rate": 1.8920988333503564e-05, + "loss": 0.4837, + "step": 15774 + }, + { + "epoch": 2.5751601975429574, + "grad_norm": 1.6464064121246338, + "learning_rate": 1.8920844945353425e-05, + "loss": 0.5361, + "step": 15775 + }, + { + "epoch": 2.575323456185462, + "grad_norm": 1.5578243732452393, + "learning_rate": 1.8920701548219997e-05, + "loss": 0.4661, + "step": 15776 + }, + { + "epoch": 2.5754867148279663, + "grad_norm": 2.0118672847747803, + "learning_rate": 1.8920558142103426e-05, + "loss": 0.6217, + "step": 15777 + }, + { + "epoch": 2.5756499734704708, + "grad_norm": 1.501422643661499, + "learning_rate": 1.892041472700385e-05, + "loss": 0.5418, + "step": 15778 + }, + { + "epoch": 2.575813232112975, + "grad_norm": 1.822941780090332, + "learning_rate": 1.892027130292142e-05, + "loss": 0.572, + "step": 15779 + }, + { + "epoch": 2.5759764907554796, + "grad_norm": 1.895145297050476, + "learning_rate": 1.8920127869856283e-05, + "loss": 0.6293, + "step": 15780 + }, + { + "epoch": 2.5761397493979836, + "grad_norm": 1.561397671699524, + "learning_rate": 1.891998442780858e-05, + "loss": 0.4845, + "step": 15781 + }, + { + "epoch": 2.576303008040488, + "grad_norm": 1.6171908378601074, + "learning_rate": 1.8919840976778453e-05, + "loss": 0.5607, + "step": 15782 + }, + { + "epoch": 2.5764662666829925, + "grad_norm": 2.2037105560302734, + "learning_rate": 1.8919697516766046e-05, + "loss": 0.6366, + "step": 15783 + }, + { + "epoch": 2.576629525325497, + "grad_norm": 1.674755573272705, + "learning_rate": 1.8919554047771508e-05, + "loss": 0.5069, + "step": 15784 + }, + { + "epoch": 2.5767927839680014, + "grad_norm": 1.725561261177063, + "learning_rate": 1.8919410569794982e-05, + "loss": 0.547, + "step": 15785 + }, + { + "epoch": 2.576956042610506, + "grad_norm": 1.6767146587371826, + "learning_rate": 1.8919267082836614e-05, + "loss": 0.5358, + "step": 15786 + }, + { + "epoch": 2.57711930125301, + "grad_norm": 1.635693073272705, + "learning_rate": 1.8919123586896545e-05, + "loss": 0.4985, + "step": 15787 + }, + { + "epoch": 2.5772825598955142, + "grad_norm": 2.0647835731506348, + "learning_rate": 1.8918980081974922e-05, + "loss": 0.6539, + "step": 15788 + }, + { + "epoch": 2.5774458185380187, + "grad_norm": 1.804109811782837, + "learning_rate": 1.8918836568071883e-05, + "loss": 0.6401, + "step": 15789 + }, + { + "epoch": 2.577609077180523, + "grad_norm": 1.7453705072402954, + "learning_rate": 1.891869304518758e-05, + "loss": 0.5421, + "step": 15790 + }, + { + "epoch": 2.5777723358230276, + "grad_norm": 1.8223216533660889, + "learning_rate": 1.891854951332216e-05, + "loss": 0.5115, + "step": 15791 + }, + { + "epoch": 2.577935594465532, + "grad_norm": 1.7414997816085815, + "learning_rate": 1.8918405972475757e-05, + "loss": 0.4422, + "step": 15792 + }, + { + "epoch": 2.5780988531080364, + "grad_norm": 1.6820363998413086, + "learning_rate": 1.8918262422648527e-05, + "loss": 0.5098, + "step": 15793 + }, + { + "epoch": 2.578262111750541, + "grad_norm": 1.5713824033737183, + "learning_rate": 1.8918118863840605e-05, + "loss": 0.574, + "step": 15794 + }, + { + "epoch": 2.5784253703930453, + "grad_norm": 1.8226797580718994, + "learning_rate": 1.8917975296052143e-05, + "loss": 0.6404, + "step": 15795 + }, + { + "epoch": 2.5785886290355498, + "grad_norm": 1.5766535997390747, + "learning_rate": 1.891783171928328e-05, + "loss": 0.4927, + "step": 15796 + }, + { + "epoch": 2.578751887678054, + "grad_norm": 1.5046097040176392, + "learning_rate": 1.8917688133534162e-05, + "loss": 0.4988, + "step": 15797 + }, + { + "epoch": 2.578915146320558, + "grad_norm": 1.7505607604980469, + "learning_rate": 1.891754453880494e-05, + "loss": 0.5072, + "step": 15798 + }, + { + "epoch": 2.5790784049630626, + "grad_norm": 1.5969103574752808, + "learning_rate": 1.8917400935095744e-05, + "loss": 0.4862, + "step": 15799 + }, + { + "epoch": 2.579241663605567, + "grad_norm": 2.0600175857543945, + "learning_rate": 1.8917257322406735e-05, + "loss": 0.6632, + "step": 15800 + }, + { + "epoch": 2.5794049222480715, + "grad_norm": 1.8838386535644531, + "learning_rate": 1.8917113700738046e-05, + "loss": 0.5599, + "step": 15801 + }, + { + "epoch": 2.579568180890576, + "grad_norm": 2.030688524246216, + "learning_rate": 1.8916970070089828e-05, + "loss": 0.62, + "step": 15802 + }, + { + "epoch": 2.5797314395330804, + "grad_norm": 2.0646302700042725, + "learning_rate": 1.8916826430462224e-05, + "loss": 0.6633, + "step": 15803 + }, + { + "epoch": 2.579894698175585, + "grad_norm": 1.6400400400161743, + "learning_rate": 1.8916682781855377e-05, + "loss": 0.5426, + "step": 15804 + }, + { + "epoch": 2.580057956818089, + "grad_norm": 2.012605667114258, + "learning_rate": 1.8916539124269435e-05, + "loss": 0.535, + "step": 15805 + }, + { + "epoch": 2.5802212154605932, + "grad_norm": 1.821671724319458, + "learning_rate": 1.8916395457704536e-05, + "loss": 0.8128, + "step": 15806 + }, + { + "epoch": 2.5803844741030977, + "grad_norm": 1.6671395301818848, + "learning_rate": 1.891625178216083e-05, + "loss": 0.4903, + "step": 15807 + }, + { + "epoch": 2.580547732745602, + "grad_norm": 1.6975494623184204, + "learning_rate": 1.8916108097638462e-05, + "loss": 0.5366, + "step": 15808 + }, + { + "epoch": 2.5807109913881066, + "grad_norm": 1.6413053274154663, + "learning_rate": 1.8915964404137577e-05, + "loss": 0.5377, + "step": 15809 + }, + { + "epoch": 2.580874250030611, + "grad_norm": 1.5170762538909912, + "learning_rate": 1.891582070165832e-05, + "loss": 0.4717, + "step": 15810 + }, + { + "epoch": 2.5810375086731154, + "grad_norm": 1.4555954933166504, + "learning_rate": 1.891567699020083e-05, + "loss": 0.5238, + "step": 15811 + }, + { + "epoch": 2.58120076731562, + "grad_norm": 1.7267125844955444, + "learning_rate": 1.8915533269765258e-05, + "loss": 0.5178, + "step": 15812 + }, + { + "epoch": 2.5813640259581243, + "grad_norm": 1.5560095310211182, + "learning_rate": 1.891538954035174e-05, + "loss": 0.5476, + "step": 15813 + }, + { + "epoch": 2.5815272846006287, + "grad_norm": 1.7520489692687988, + "learning_rate": 1.8915245801960435e-05, + "loss": 0.5535, + "step": 15814 + }, + { + "epoch": 2.581690543243133, + "grad_norm": 1.7954952716827393, + "learning_rate": 1.891510205459148e-05, + "loss": 0.585, + "step": 15815 + }, + { + "epoch": 2.581853801885637, + "grad_norm": 1.7255955934524536, + "learning_rate": 1.8914958298245013e-05, + "loss": 0.5262, + "step": 15816 + }, + { + "epoch": 2.5820170605281416, + "grad_norm": 1.8265472650527954, + "learning_rate": 1.891481453292119e-05, + "loss": 0.5358, + "step": 15817 + }, + { + "epoch": 2.582180319170646, + "grad_norm": 1.7701958417892456, + "learning_rate": 1.891467075862015e-05, + "loss": 0.5671, + "step": 15818 + }, + { + "epoch": 2.5823435778131505, + "grad_norm": 1.921331524848938, + "learning_rate": 1.891452697534204e-05, + "loss": 0.6144, + "step": 15819 + }, + { + "epoch": 2.582506836455655, + "grad_norm": 1.575853943824768, + "learning_rate": 1.8914383183087e-05, + "loss": 0.4378, + "step": 15820 + }, + { + "epoch": 2.5826700950981594, + "grad_norm": 1.6944061517715454, + "learning_rate": 1.8914239381855184e-05, + "loss": 0.5377, + "step": 15821 + }, + { + "epoch": 2.582833353740664, + "grad_norm": 1.879197597503662, + "learning_rate": 1.8914095571646726e-05, + "loss": 0.6653, + "step": 15822 + }, + { + "epoch": 2.582996612383168, + "grad_norm": 1.566739797592163, + "learning_rate": 1.8913951752461778e-05, + "loss": 0.4407, + "step": 15823 + }, + { + "epoch": 2.5831598710256722, + "grad_norm": 2.046302556991577, + "learning_rate": 1.8913807924300486e-05, + "loss": 0.6234, + "step": 15824 + }, + { + "epoch": 2.5833231296681767, + "grad_norm": 1.9719228744506836, + "learning_rate": 1.891366408716299e-05, + "loss": 0.6525, + "step": 15825 + }, + { + "epoch": 2.583486388310681, + "grad_norm": 1.6049559116363525, + "learning_rate": 1.8913520241049435e-05, + "loss": 0.5155, + "step": 15826 + }, + { + "epoch": 2.5836496469531856, + "grad_norm": 1.7825756072998047, + "learning_rate": 1.8913376385959966e-05, + "loss": 0.5804, + "step": 15827 + }, + { + "epoch": 2.58381290559569, + "grad_norm": 1.9748889207839966, + "learning_rate": 1.8913232521894734e-05, + "loss": 0.5435, + "step": 15828 + }, + { + "epoch": 2.5839761642381944, + "grad_norm": 1.9985871315002441, + "learning_rate": 1.891308864885388e-05, + "loss": 0.6174, + "step": 15829 + }, + { + "epoch": 2.584139422880699, + "grad_norm": 1.878721833229065, + "learning_rate": 1.8912944766837542e-05, + "loss": 0.5449, + "step": 15830 + }, + { + "epoch": 2.5843026815232033, + "grad_norm": 1.6558092832565308, + "learning_rate": 1.8912800875845874e-05, + "loss": 0.5568, + "step": 15831 + }, + { + "epoch": 2.5844659401657077, + "grad_norm": 1.6641840934753418, + "learning_rate": 1.891265697587902e-05, + "loss": 0.5293, + "step": 15832 + }, + { + "epoch": 2.584629198808212, + "grad_norm": 1.6958576440811157, + "learning_rate": 1.891251306693712e-05, + "loss": 0.4627, + "step": 15833 + }, + { + "epoch": 2.584792457450716, + "grad_norm": 1.9580596685409546, + "learning_rate": 1.8912369149020323e-05, + "loss": 0.6556, + "step": 15834 + }, + { + "epoch": 2.5849557160932206, + "grad_norm": 1.9817659854888916, + "learning_rate": 1.891222522212877e-05, + "loss": 0.6182, + "step": 15835 + }, + { + "epoch": 2.585118974735725, + "grad_norm": 1.73812735080719, + "learning_rate": 1.891208128626261e-05, + "loss": 0.5295, + "step": 15836 + }, + { + "epoch": 2.5852822333782295, + "grad_norm": 1.6987968683242798, + "learning_rate": 1.891193734142199e-05, + "loss": 0.484, + "step": 15837 + }, + { + "epoch": 2.585445492020734, + "grad_norm": 1.8195581436157227, + "learning_rate": 1.8911793387607045e-05, + "loss": 0.5954, + "step": 15838 + }, + { + "epoch": 2.5856087506632384, + "grad_norm": 1.6944156885147095, + "learning_rate": 1.8911649424817934e-05, + "loss": 0.5161, + "step": 15839 + }, + { + "epoch": 2.5857720093057424, + "grad_norm": 1.5771658420562744, + "learning_rate": 1.891150545305479e-05, + "loss": 0.5152, + "step": 15840 + }, + { + "epoch": 2.585935267948247, + "grad_norm": 1.7848204374313354, + "learning_rate": 1.8911361472317762e-05, + "loss": 0.5362, + "step": 15841 + }, + { + "epoch": 2.5860985265907512, + "grad_norm": 2.2011070251464844, + "learning_rate": 1.8911217482606996e-05, + "loss": 0.6322, + "step": 15842 + }, + { + "epoch": 2.5862617852332557, + "grad_norm": 1.771855115890503, + "learning_rate": 1.8911073483922634e-05, + "loss": 0.6122, + "step": 15843 + }, + { + "epoch": 2.58642504387576, + "grad_norm": 1.7070363759994507, + "learning_rate": 1.891092947626483e-05, + "loss": 0.5577, + "step": 15844 + }, + { + "epoch": 2.5865883025182645, + "grad_norm": 1.6505470275878906, + "learning_rate": 1.891078545963372e-05, + "loss": 0.5344, + "step": 15845 + }, + { + "epoch": 2.586751561160769, + "grad_norm": 1.484018325805664, + "learning_rate": 1.8910641434029447e-05, + "loss": 0.511, + "step": 15846 + }, + { + "epoch": 2.5869148198032734, + "grad_norm": 1.7940258979797363, + "learning_rate": 1.8910497399452162e-05, + "loss": 0.5868, + "step": 15847 + }, + { + "epoch": 2.587078078445778, + "grad_norm": 1.9618513584136963, + "learning_rate": 1.891035335590201e-05, + "loss": 0.5922, + "step": 15848 + }, + { + "epoch": 2.5872413370882823, + "grad_norm": 1.543418049812317, + "learning_rate": 1.8910209303379133e-05, + "loss": 0.5329, + "step": 15849 + }, + { + "epoch": 2.5874045957307867, + "grad_norm": 1.6834832429885864, + "learning_rate": 1.891006524188368e-05, + "loss": 0.5519, + "step": 15850 + }, + { + "epoch": 2.5875678543732907, + "grad_norm": 1.712114691734314, + "learning_rate": 1.8909921171415793e-05, + "loss": 0.5092, + "step": 15851 + }, + { + "epoch": 2.587731113015795, + "grad_norm": 1.7542380094528198, + "learning_rate": 1.8909777091975615e-05, + "loss": 0.6018, + "step": 15852 + }, + { + "epoch": 2.5878943716582996, + "grad_norm": 1.6798354387283325, + "learning_rate": 1.8909633003563298e-05, + "loss": 0.5403, + "step": 15853 + }, + { + "epoch": 2.588057630300804, + "grad_norm": 1.9015182256698608, + "learning_rate": 1.890948890617898e-05, + "loss": 0.5781, + "step": 15854 + }, + { + "epoch": 2.5882208889433085, + "grad_norm": 1.5162180662155151, + "learning_rate": 1.890934479982281e-05, + "loss": 0.4704, + "step": 15855 + }, + { + "epoch": 2.588384147585813, + "grad_norm": 1.6700900793075562, + "learning_rate": 1.8909200684494933e-05, + "loss": 0.526, + "step": 15856 + }, + { + "epoch": 2.5885474062283174, + "grad_norm": 1.513726830482483, + "learning_rate": 1.890905656019549e-05, + "loss": 0.4144, + "step": 15857 + }, + { + "epoch": 2.5887106648708214, + "grad_norm": 1.5931607484817505, + "learning_rate": 1.8908912426924633e-05, + "loss": 0.4991, + "step": 15858 + }, + { + "epoch": 2.588873923513326, + "grad_norm": 1.903397798538208, + "learning_rate": 1.8908768284682505e-05, + "loss": 0.6478, + "step": 15859 + }, + { + "epoch": 2.5890371821558302, + "grad_norm": 1.803954005241394, + "learning_rate": 1.8908624133469248e-05, + "loss": 0.6067, + "step": 15860 + }, + { + "epoch": 2.5892004407983347, + "grad_norm": 1.8710887432098389, + "learning_rate": 1.8908479973285007e-05, + "loss": 0.647, + "step": 15861 + }, + { + "epoch": 2.589363699440839, + "grad_norm": 1.6811472177505493, + "learning_rate": 1.8908335804129928e-05, + "loss": 0.4872, + "step": 15862 + }, + { + "epoch": 2.5895269580833435, + "grad_norm": 1.5404911041259766, + "learning_rate": 1.890819162600416e-05, + "loss": 0.4736, + "step": 15863 + }, + { + "epoch": 2.589690216725848, + "grad_norm": 1.798147439956665, + "learning_rate": 1.8908047438907843e-05, + "loss": 0.5745, + "step": 15864 + }, + { + "epoch": 2.5898534753683524, + "grad_norm": 1.382952332496643, + "learning_rate": 1.8907903242841124e-05, + "loss": 0.4495, + "step": 15865 + }, + { + "epoch": 2.590016734010857, + "grad_norm": 1.7136355638504028, + "learning_rate": 1.8907759037804154e-05, + "loss": 0.5679, + "step": 15866 + }, + { + "epoch": 2.5901799926533613, + "grad_norm": 2.2534217834472656, + "learning_rate": 1.8907614823797068e-05, + "loss": 0.6794, + "step": 15867 + }, + { + "epoch": 2.5903432512958657, + "grad_norm": 1.7388484477996826, + "learning_rate": 1.890747060082002e-05, + "loss": 0.5581, + "step": 15868 + }, + { + "epoch": 2.5905065099383697, + "grad_norm": 1.9411038160324097, + "learning_rate": 1.890732636887315e-05, + "loss": 0.5773, + "step": 15869 + }, + { + "epoch": 2.590669768580874, + "grad_norm": 1.9532725811004639, + "learning_rate": 1.8907182127956604e-05, + "loss": 0.6277, + "step": 15870 + }, + { + "epoch": 2.5908330272233786, + "grad_norm": 1.7046955823898315, + "learning_rate": 1.890703787807053e-05, + "loss": 0.5807, + "step": 15871 + }, + { + "epoch": 2.590996285865883, + "grad_norm": 1.6634621620178223, + "learning_rate": 1.890689361921507e-05, + "loss": 0.5699, + "step": 15872 + }, + { + "epoch": 2.5911595445083875, + "grad_norm": 1.79916250705719, + "learning_rate": 1.890674935139037e-05, + "loss": 0.5174, + "step": 15873 + }, + { + "epoch": 2.591322803150892, + "grad_norm": 1.8918606042861938, + "learning_rate": 1.8906605074596575e-05, + "loss": 0.6279, + "step": 15874 + }, + { + "epoch": 2.591486061793396, + "grad_norm": 1.6218488216400146, + "learning_rate": 1.890646078883383e-05, + "loss": 0.5551, + "step": 15875 + }, + { + "epoch": 2.5916493204359003, + "grad_norm": 2.553119659423828, + "learning_rate": 1.8906316494102283e-05, + "loss": 0.86, + "step": 15876 + }, + { + "epoch": 2.591812579078405, + "grad_norm": 1.9147651195526123, + "learning_rate": 1.890617219040208e-05, + "loss": 0.6002, + "step": 15877 + }, + { + "epoch": 2.5919758377209092, + "grad_norm": 2.0101771354675293, + "learning_rate": 1.890602787773336e-05, + "loss": 0.5656, + "step": 15878 + }, + { + "epoch": 2.5921390963634137, + "grad_norm": 2.0387187004089355, + "learning_rate": 1.8905883556096274e-05, + "loss": 0.6124, + "step": 15879 + }, + { + "epoch": 2.592302355005918, + "grad_norm": 1.7414919137954712, + "learning_rate": 1.8905739225490966e-05, + "loss": 0.5055, + "step": 15880 + }, + { + "epoch": 2.5924656136484225, + "grad_norm": 1.8036211729049683, + "learning_rate": 1.890559488591758e-05, + "loss": 0.6259, + "step": 15881 + }, + { + "epoch": 2.592628872290927, + "grad_norm": 1.829856514930725, + "learning_rate": 1.8905450537376266e-05, + "loss": 0.4774, + "step": 15882 + }, + { + "epoch": 2.5927921309334314, + "grad_norm": 1.883961796760559, + "learning_rate": 1.890530617986716e-05, + "loss": 0.5315, + "step": 15883 + }, + { + "epoch": 2.592955389575936, + "grad_norm": 1.7925584316253662, + "learning_rate": 1.8905161813390415e-05, + "loss": 0.4918, + "step": 15884 + }, + { + "epoch": 2.5931186482184403, + "grad_norm": 2.084967613220215, + "learning_rate": 1.8905017437946177e-05, + "loss": 0.6604, + "step": 15885 + }, + { + "epoch": 2.5932819068609443, + "grad_norm": 1.6382275819778442, + "learning_rate": 1.8904873053534585e-05, + "loss": 0.5518, + "step": 15886 + }, + { + "epoch": 2.5934451655034487, + "grad_norm": 1.723232388496399, + "learning_rate": 1.890472866015579e-05, + "loss": 0.5846, + "step": 15887 + }, + { + "epoch": 2.593608424145953, + "grad_norm": 1.9139111042022705, + "learning_rate": 1.8904584257809936e-05, + "loss": 0.5708, + "step": 15888 + }, + { + "epoch": 2.5937716827884576, + "grad_norm": 1.7365732192993164, + "learning_rate": 1.890443984649717e-05, + "loss": 0.6446, + "step": 15889 + }, + { + "epoch": 2.593934941430962, + "grad_norm": 1.9514857530593872, + "learning_rate": 1.8904295426217632e-05, + "loss": 0.5815, + "step": 15890 + }, + { + "epoch": 2.5940982000734665, + "grad_norm": 1.9302114248275757, + "learning_rate": 1.890415099697147e-05, + "loss": 0.653, + "step": 15891 + }, + { + "epoch": 2.594261458715971, + "grad_norm": 2.1644363403320312, + "learning_rate": 1.8904006558758833e-05, + "loss": 0.6152, + "step": 15892 + }, + { + "epoch": 2.594424717358475, + "grad_norm": 2.203773021697998, + "learning_rate": 1.8903862111579863e-05, + "loss": 0.6157, + "step": 15893 + }, + { + "epoch": 2.5945879760009793, + "grad_norm": 1.5776147842407227, + "learning_rate": 1.8903717655434708e-05, + "loss": 0.503, + "step": 15894 + }, + { + "epoch": 2.594751234643484, + "grad_norm": 1.9565842151641846, + "learning_rate": 1.8903573190323508e-05, + "loss": 0.6873, + "step": 15895 + }, + { + "epoch": 2.594914493285988, + "grad_norm": 1.9383671283721924, + "learning_rate": 1.8903428716246414e-05, + "loss": 0.5725, + "step": 15896 + }, + { + "epoch": 2.5950777519284927, + "grad_norm": 1.7007063627243042, + "learning_rate": 1.890328423320357e-05, + "loss": 0.6438, + "step": 15897 + }, + { + "epoch": 2.595241010570997, + "grad_norm": 1.829313039779663, + "learning_rate": 1.8903139741195122e-05, + "loss": 0.5856, + "step": 15898 + }, + { + "epoch": 2.5954042692135015, + "grad_norm": 1.7949849367141724, + "learning_rate": 1.8902995240221215e-05, + "loss": 0.5748, + "step": 15899 + }, + { + "epoch": 2.595567527856006, + "grad_norm": 1.934328317642212, + "learning_rate": 1.8902850730281993e-05, + "loss": 0.544, + "step": 15900 + }, + { + "epoch": 2.5957307864985104, + "grad_norm": 1.6825941801071167, + "learning_rate": 1.89027062113776e-05, + "loss": 0.583, + "step": 15901 + }, + { + "epoch": 2.595894045141015, + "grad_norm": 1.6888014078140259, + "learning_rate": 1.8902561683508186e-05, + "loss": 0.56, + "step": 15902 + }, + { + "epoch": 2.5960573037835193, + "grad_norm": 1.4530231952667236, + "learning_rate": 1.89024171466739e-05, + "loss": 0.4513, + "step": 15903 + }, + { + "epoch": 2.5962205624260233, + "grad_norm": 1.7298177480697632, + "learning_rate": 1.8902272600874877e-05, + "loss": 0.6552, + "step": 15904 + }, + { + "epoch": 2.5963838210685277, + "grad_norm": 1.6382168531417847, + "learning_rate": 1.8902128046111267e-05, + "loss": 0.5871, + "step": 15905 + }, + { + "epoch": 2.596547079711032, + "grad_norm": 1.8850725889205933, + "learning_rate": 1.8901983482383217e-05, + "loss": 0.5957, + "step": 15906 + }, + { + "epoch": 2.5967103383535366, + "grad_norm": 2.123091697692871, + "learning_rate": 1.8901838909690874e-05, + "loss": 0.7189, + "step": 15907 + }, + { + "epoch": 2.596873596996041, + "grad_norm": 1.7301981449127197, + "learning_rate": 1.890169432803438e-05, + "loss": 0.5927, + "step": 15908 + }, + { + "epoch": 2.5970368556385455, + "grad_norm": 2.0424020290374756, + "learning_rate": 1.8901549737413883e-05, + "loss": 0.6074, + "step": 15909 + }, + { + "epoch": 2.59720011428105, + "grad_norm": 1.440181016921997, + "learning_rate": 1.890140513782953e-05, + "loss": 0.4932, + "step": 15910 + }, + { + "epoch": 2.597363372923554, + "grad_norm": 1.7189973592758179, + "learning_rate": 1.890126052928146e-05, + "loss": 0.5087, + "step": 15911 + }, + { + "epoch": 2.5975266315660583, + "grad_norm": 1.9165375232696533, + "learning_rate": 1.8901115911769824e-05, + "loss": 0.6578, + "step": 15912 + }, + { + "epoch": 2.597689890208563, + "grad_norm": 1.7366178035736084, + "learning_rate": 1.890097128529477e-05, + "loss": 0.5166, + "step": 15913 + }, + { + "epoch": 2.597853148851067, + "grad_norm": 1.8777168989181519, + "learning_rate": 1.8900826649856435e-05, + "loss": 0.5918, + "step": 15914 + }, + { + "epoch": 2.5980164074935717, + "grad_norm": 1.648166537284851, + "learning_rate": 1.8900682005454974e-05, + "loss": 0.5175, + "step": 15915 + }, + { + "epoch": 2.598179666136076, + "grad_norm": 1.4606493711471558, + "learning_rate": 1.8900537352090523e-05, + "loss": 0.4997, + "step": 15916 + }, + { + "epoch": 2.5983429247785805, + "grad_norm": 1.9237103462219238, + "learning_rate": 1.890039268976324e-05, + "loss": 0.6462, + "step": 15917 + }, + { + "epoch": 2.598506183421085, + "grad_norm": 1.84309983253479, + "learning_rate": 1.890024801847326e-05, + "loss": 0.5394, + "step": 15918 + }, + { + "epoch": 2.5986694420635894, + "grad_norm": 1.6959056854248047, + "learning_rate": 1.8900103338220737e-05, + "loss": 0.5698, + "step": 15919 + }, + { + "epoch": 2.598832700706094, + "grad_norm": 1.764296054840088, + "learning_rate": 1.889995864900581e-05, + "loss": 0.5898, + "step": 15920 + }, + { + "epoch": 2.5989959593485983, + "grad_norm": 1.6734576225280762, + "learning_rate": 1.8899813950828626e-05, + "loss": 0.5124, + "step": 15921 + }, + { + "epoch": 2.5991592179911023, + "grad_norm": 1.6070983409881592, + "learning_rate": 1.889966924368933e-05, + "loss": 0.5523, + "step": 15922 + }, + { + "epoch": 2.5993224766336067, + "grad_norm": 1.7419391870498657, + "learning_rate": 1.889952452758807e-05, + "loss": 0.6179, + "step": 15923 + }, + { + "epoch": 2.599485735276111, + "grad_norm": 2.025076150894165, + "learning_rate": 1.8899379802524994e-05, + "loss": 0.5504, + "step": 15924 + }, + { + "epoch": 2.5996489939186156, + "grad_norm": 1.7113566398620605, + "learning_rate": 1.8899235068500245e-05, + "loss": 0.615, + "step": 15925 + }, + { + "epoch": 2.59981225256112, + "grad_norm": 1.3928203582763672, + "learning_rate": 1.8899090325513968e-05, + "loss": 0.4944, + "step": 15926 + }, + { + "epoch": 2.5999755112036245, + "grad_norm": 1.744742751121521, + "learning_rate": 1.8898945573566306e-05, + "loss": 0.5388, + "step": 15927 + }, + { + "epoch": 2.6001387698461285, + "grad_norm": 1.4217644929885864, + "learning_rate": 1.8898800812657412e-05, + "loss": 0.5056, + "step": 15928 + }, + { + "epoch": 2.600302028488633, + "grad_norm": 2.317075490951538, + "learning_rate": 1.8898656042787428e-05, + "loss": 0.6882, + "step": 15929 + }, + { + "epoch": 2.6004652871311373, + "grad_norm": 1.8306225538253784, + "learning_rate": 1.88985112639565e-05, + "loss": 0.5602, + "step": 15930 + }, + { + "epoch": 2.6006285457736418, + "grad_norm": 1.4872678518295288, + "learning_rate": 1.8898366476164768e-05, + "loss": 0.5519, + "step": 15931 + }, + { + "epoch": 2.600791804416146, + "grad_norm": 2.0384857654571533, + "learning_rate": 1.889822167941239e-05, + "loss": 0.5188, + "step": 15932 + }, + { + "epoch": 2.6009550630586507, + "grad_norm": 1.783871054649353, + "learning_rate": 1.88980768736995e-05, + "loss": 0.5524, + "step": 15933 + }, + { + "epoch": 2.601118321701155, + "grad_norm": 1.6538704633712769, + "learning_rate": 1.8897932059026255e-05, + "loss": 0.5456, + "step": 15934 + }, + { + "epoch": 2.6012815803436595, + "grad_norm": 1.7779598236083984, + "learning_rate": 1.889778723539279e-05, + "loss": 0.6299, + "step": 15935 + }, + { + "epoch": 2.601444838986164, + "grad_norm": 1.7048213481903076, + "learning_rate": 1.8897642402799256e-05, + "loss": 0.5981, + "step": 15936 + }, + { + "epoch": 2.6016080976286684, + "grad_norm": 1.8793658018112183, + "learning_rate": 1.8897497561245798e-05, + "loss": 0.5167, + "step": 15937 + }, + { + "epoch": 2.601771356271173, + "grad_norm": 1.89106285572052, + "learning_rate": 1.8897352710732564e-05, + "loss": 0.6911, + "step": 15938 + }, + { + "epoch": 2.601934614913677, + "grad_norm": 1.621195912361145, + "learning_rate": 1.8897207851259698e-05, + "loss": 0.5344, + "step": 15939 + }, + { + "epoch": 2.6020978735561813, + "grad_norm": 2.2734928131103516, + "learning_rate": 1.8897062982827347e-05, + "loss": 0.688, + "step": 15940 + }, + { + "epoch": 2.6022611321986857, + "grad_norm": 1.7665585279464722, + "learning_rate": 1.8896918105435654e-05, + "loss": 0.5693, + "step": 15941 + }, + { + "epoch": 2.60242439084119, + "grad_norm": 1.818091630935669, + "learning_rate": 1.8896773219084768e-05, + "loss": 0.5532, + "step": 15942 + }, + { + "epoch": 2.6025876494836946, + "grad_norm": 1.6954485177993774, + "learning_rate": 1.8896628323774832e-05, + "loss": 0.5275, + "step": 15943 + }, + { + "epoch": 2.602750908126199, + "grad_norm": 1.8419591188430786, + "learning_rate": 1.8896483419505994e-05, + "loss": 0.5672, + "step": 15944 + }, + { + "epoch": 2.6029141667687035, + "grad_norm": 1.5640428066253662, + "learning_rate": 1.88963385062784e-05, + "loss": 0.5541, + "step": 15945 + }, + { + "epoch": 2.6030774254112075, + "grad_norm": 2.0099897384643555, + "learning_rate": 1.8896193584092197e-05, + "loss": 0.6267, + "step": 15946 + }, + { + "epoch": 2.603240684053712, + "grad_norm": 1.71855628490448, + "learning_rate": 1.8896048652947526e-05, + "loss": 0.5864, + "step": 15947 + }, + { + "epoch": 2.6034039426962163, + "grad_norm": 1.6069624423980713, + "learning_rate": 1.8895903712844542e-05, + "loss": 0.5484, + "step": 15948 + }, + { + "epoch": 2.6035672013387208, + "grad_norm": 1.8283424377441406, + "learning_rate": 1.8895758763783383e-05, + "loss": 0.5475, + "step": 15949 + }, + { + "epoch": 2.603730459981225, + "grad_norm": 2.099742889404297, + "learning_rate": 1.8895613805764196e-05, + "loss": 0.6749, + "step": 15950 + }, + { + "epoch": 2.6038937186237296, + "grad_norm": 1.3237226009368896, + "learning_rate": 1.8895468838787127e-05, + "loss": 0.4767, + "step": 15951 + }, + { + "epoch": 2.604056977266234, + "grad_norm": 1.611938714981079, + "learning_rate": 1.8895323862852322e-05, + "loss": 0.5198, + "step": 15952 + }, + { + "epoch": 2.6042202359087385, + "grad_norm": 1.5646326541900635, + "learning_rate": 1.8895178877959934e-05, + "loss": 0.4804, + "step": 15953 + }, + { + "epoch": 2.604383494551243, + "grad_norm": 1.726051688194275, + "learning_rate": 1.88950338841101e-05, + "loss": 0.5261, + "step": 15954 + }, + { + "epoch": 2.6045467531937474, + "grad_norm": 1.7808393239974976, + "learning_rate": 1.889488888130297e-05, + "loss": 0.5571, + "step": 15955 + }, + { + "epoch": 2.604710011836252, + "grad_norm": 1.513533115386963, + "learning_rate": 1.889474386953869e-05, + "loss": 0.4762, + "step": 15956 + }, + { + "epoch": 2.604873270478756, + "grad_norm": 1.9746071100234985, + "learning_rate": 1.8894598848817403e-05, + "loss": 0.6804, + "step": 15957 + }, + { + "epoch": 2.6050365291212603, + "grad_norm": 1.8099535703659058, + "learning_rate": 1.889445381913926e-05, + "loss": 0.5938, + "step": 15958 + }, + { + "epoch": 2.6051997877637647, + "grad_norm": 1.952694296836853, + "learning_rate": 1.88943087805044e-05, + "loss": 0.5928, + "step": 15959 + }, + { + "epoch": 2.605363046406269, + "grad_norm": 1.9216512441635132, + "learning_rate": 1.889416373291298e-05, + "loss": 0.5502, + "step": 15960 + }, + { + "epoch": 2.6055263050487736, + "grad_norm": 1.5623756647109985, + "learning_rate": 1.8894018676365134e-05, + "loss": 0.4726, + "step": 15961 + }, + { + "epoch": 2.605689563691278, + "grad_norm": 1.5827032327651978, + "learning_rate": 1.8893873610861013e-05, + "loss": 0.577, + "step": 15962 + }, + { + "epoch": 2.6058528223337825, + "grad_norm": 1.7254403829574585, + "learning_rate": 1.8893728536400766e-05, + "loss": 0.5333, + "step": 15963 + }, + { + "epoch": 2.6060160809762865, + "grad_norm": 1.6910972595214844, + "learning_rate": 1.8893583452984535e-05, + "loss": 0.5862, + "step": 15964 + }, + { + "epoch": 2.606179339618791, + "grad_norm": 1.5573484897613525, + "learning_rate": 1.889343836061247e-05, + "loss": 0.4901, + "step": 15965 + }, + { + "epoch": 2.6063425982612953, + "grad_norm": 1.8216878175735474, + "learning_rate": 1.8893293259284715e-05, + "loss": 0.5334, + "step": 15966 + }, + { + "epoch": 2.6065058569037998, + "grad_norm": 1.7856532335281372, + "learning_rate": 1.8893148149001415e-05, + "loss": 0.5492, + "step": 15967 + }, + { + "epoch": 2.606669115546304, + "grad_norm": 1.9734309911727905, + "learning_rate": 1.8893003029762717e-05, + "loss": 0.6394, + "step": 15968 + }, + { + "epoch": 2.6068323741888086, + "grad_norm": 2.0858616828918457, + "learning_rate": 1.889285790156877e-05, + "loss": 0.6078, + "step": 15969 + }, + { + "epoch": 2.606995632831313, + "grad_norm": 2.152456760406494, + "learning_rate": 1.8892712764419716e-05, + "loss": 0.6943, + "step": 15970 + }, + { + "epoch": 2.6071588914738175, + "grad_norm": 1.6914440393447876, + "learning_rate": 1.88925676183157e-05, + "loss": 0.5183, + "step": 15971 + }, + { + "epoch": 2.607322150116322, + "grad_norm": 1.440643548965454, + "learning_rate": 1.8892422463256873e-05, + "loss": 0.3959, + "step": 15972 + }, + { + "epoch": 2.6074854087588264, + "grad_norm": 1.933098554611206, + "learning_rate": 1.889227729924338e-05, + "loss": 0.6145, + "step": 15973 + }, + { + "epoch": 2.607648667401331, + "grad_norm": 1.8300108909606934, + "learning_rate": 1.8892132126275366e-05, + "loss": 0.5894, + "step": 15974 + }, + { + "epoch": 2.607811926043835, + "grad_norm": 1.7827945947647095, + "learning_rate": 1.8891986944352974e-05, + "loss": 0.5237, + "step": 15975 + }, + { + "epoch": 2.6079751846863393, + "grad_norm": 2.116607189178467, + "learning_rate": 1.8891841753476356e-05, + "loss": 0.7397, + "step": 15976 + }, + { + "epoch": 2.6081384433288437, + "grad_norm": 1.7372130155563354, + "learning_rate": 1.8891696553645654e-05, + "loss": 0.5921, + "step": 15977 + }, + { + "epoch": 2.608301701971348, + "grad_norm": 1.721508264541626, + "learning_rate": 1.8891551344861018e-05, + "loss": 0.6362, + "step": 15978 + }, + { + "epoch": 2.6084649606138526, + "grad_norm": 1.8527566194534302, + "learning_rate": 1.8891406127122593e-05, + "loss": 0.6013, + "step": 15979 + }, + { + "epoch": 2.608628219256357, + "grad_norm": 1.9926713705062866, + "learning_rate": 1.8891260900430518e-05, + "loss": 0.6905, + "step": 15980 + }, + { + "epoch": 2.608791477898861, + "grad_norm": 1.7163844108581543, + "learning_rate": 1.8891115664784953e-05, + "loss": 0.6407, + "step": 15981 + }, + { + "epoch": 2.6089547365413654, + "grad_norm": 1.8728090524673462, + "learning_rate": 1.8890970420186035e-05, + "loss": 0.617, + "step": 15982 + }, + { + "epoch": 2.60911799518387, + "grad_norm": 2.281320095062256, + "learning_rate": 1.889082516663391e-05, + "loss": 0.7085, + "step": 15983 + }, + { + "epoch": 2.6092812538263743, + "grad_norm": 1.5436761379241943, + "learning_rate": 1.889067990412873e-05, + "loss": 0.5235, + "step": 15984 + }, + { + "epoch": 2.6094445124688788, + "grad_norm": 1.6117883920669556, + "learning_rate": 1.889053463267063e-05, + "loss": 0.5525, + "step": 15985 + }, + { + "epoch": 2.609607771111383, + "grad_norm": 1.634171485900879, + "learning_rate": 1.8890389352259774e-05, + "loss": 0.518, + "step": 15986 + }, + { + "epoch": 2.6097710297538876, + "grad_norm": 1.7762027978897095, + "learning_rate": 1.8890244062896294e-05, + "loss": 0.6243, + "step": 15987 + }, + { + "epoch": 2.609934288396392, + "grad_norm": 1.839897632598877, + "learning_rate": 1.889009876458034e-05, + "loss": 0.6744, + "step": 15988 + }, + { + "epoch": 2.6100975470388965, + "grad_norm": 1.4397517442703247, + "learning_rate": 1.8889953457312057e-05, + "loss": 0.505, + "step": 15989 + }, + { + "epoch": 2.610260805681401, + "grad_norm": 1.5810065269470215, + "learning_rate": 1.8889808141091598e-05, + "loss": 0.5364, + "step": 15990 + }, + { + "epoch": 2.6104240643239054, + "grad_norm": 1.7853951454162598, + "learning_rate": 1.8889662815919102e-05, + "loss": 0.6085, + "step": 15991 + }, + { + "epoch": 2.6105873229664094, + "grad_norm": 1.9415342807769775, + "learning_rate": 1.8889517481794718e-05, + "loss": 0.6611, + "step": 15992 + }, + { + "epoch": 2.610750581608914, + "grad_norm": 1.524259090423584, + "learning_rate": 1.888937213871859e-05, + "loss": 0.5362, + "step": 15993 + }, + { + "epoch": 2.6109138402514183, + "grad_norm": 1.7251135110855103, + "learning_rate": 1.888922678669087e-05, + "loss": 0.5772, + "step": 15994 + }, + { + "epoch": 2.6110770988939227, + "grad_norm": 1.9065886735916138, + "learning_rate": 1.8889081425711698e-05, + "loss": 0.6064, + "step": 15995 + }, + { + "epoch": 2.611240357536427, + "grad_norm": 1.726266860961914, + "learning_rate": 1.8888936055781226e-05, + "loss": 0.5668, + "step": 15996 + }, + { + "epoch": 2.6114036161789316, + "grad_norm": 1.6911771297454834, + "learning_rate": 1.8888790676899597e-05, + "loss": 0.5419, + "step": 15997 + }, + { + "epoch": 2.611566874821436, + "grad_norm": 1.5842859745025635, + "learning_rate": 1.8888645289066958e-05, + "loss": 0.5437, + "step": 15998 + }, + { + "epoch": 2.61173013346394, + "grad_norm": 1.443833827972412, + "learning_rate": 1.8888499892283458e-05, + "loss": 0.4901, + "step": 15999 + }, + { + "epoch": 2.6118933921064444, + "grad_norm": 1.9202293157577515, + "learning_rate": 1.8888354486549238e-05, + "loss": 0.73, + "step": 16000 + }, + { + "epoch": 2.612056650748949, + "grad_norm": 2.20485782623291, + "learning_rate": 1.8888209071864448e-05, + "loss": 0.7008, + "step": 16001 + }, + { + "epoch": 2.6122199093914533, + "grad_norm": 1.5755879878997803, + "learning_rate": 1.8888063648229234e-05, + "loss": 0.4696, + "step": 16002 + }, + { + "epoch": 2.6123831680339578, + "grad_norm": 1.660231590270996, + "learning_rate": 1.8887918215643738e-05, + "loss": 0.479, + "step": 16003 + }, + { + "epoch": 2.612546426676462, + "grad_norm": 1.9225854873657227, + "learning_rate": 1.8887772774108116e-05, + "loss": 0.674, + "step": 16004 + }, + { + "epoch": 2.6127096853189666, + "grad_norm": 1.6600604057312012, + "learning_rate": 1.888762732362251e-05, + "loss": 0.5113, + "step": 16005 + }, + { + "epoch": 2.612872943961471, + "grad_norm": 1.730082392692566, + "learning_rate": 1.888748186418706e-05, + "loss": 0.5264, + "step": 16006 + }, + { + "epoch": 2.6130362026039755, + "grad_norm": 1.6567931175231934, + "learning_rate": 1.888733639580192e-05, + "loss": 0.5475, + "step": 16007 + }, + { + "epoch": 2.61319946124648, + "grad_norm": 1.7413051128387451, + "learning_rate": 1.888719091846724e-05, + "loss": 0.6082, + "step": 16008 + }, + { + "epoch": 2.6133627198889844, + "grad_norm": 1.8411896228790283, + "learning_rate": 1.8887045432183156e-05, + "loss": 0.5347, + "step": 16009 + }, + { + "epoch": 2.6135259785314884, + "grad_norm": 1.6246861219406128, + "learning_rate": 1.8886899936949822e-05, + "loss": 0.5789, + "step": 16010 + }, + { + "epoch": 2.613689237173993, + "grad_norm": 1.7005807161331177, + "learning_rate": 1.8886754432767382e-05, + "loss": 0.5952, + "step": 16011 + }, + { + "epoch": 2.6138524958164973, + "grad_norm": 1.8778045177459717, + "learning_rate": 1.888660891963598e-05, + "loss": 0.6395, + "step": 16012 + }, + { + "epoch": 2.6140157544590017, + "grad_norm": 1.590225100517273, + "learning_rate": 1.888646339755577e-05, + "loss": 0.4592, + "step": 16013 + }, + { + "epoch": 2.614179013101506, + "grad_norm": 1.8885276317596436, + "learning_rate": 1.8886317866526888e-05, + "loss": 0.5128, + "step": 16014 + }, + { + "epoch": 2.6143422717440106, + "grad_norm": 1.7249761819839478, + "learning_rate": 1.888617232654949e-05, + "loss": 0.5781, + "step": 16015 + }, + { + "epoch": 2.6145055303865146, + "grad_norm": 1.9317954778671265, + "learning_rate": 1.888602677762372e-05, + "loss": 0.6543, + "step": 16016 + }, + { + "epoch": 2.614668789029019, + "grad_norm": 1.8166687488555908, + "learning_rate": 1.888588121974972e-05, + "loss": 0.5995, + "step": 16017 + }, + { + "epoch": 2.6148320476715234, + "grad_norm": 1.4822840690612793, + "learning_rate": 1.888573565292764e-05, + "loss": 0.5089, + "step": 16018 + }, + { + "epoch": 2.614995306314028, + "grad_norm": 1.746072769165039, + "learning_rate": 1.8885590077157627e-05, + "loss": 0.576, + "step": 16019 + }, + { + "epoch": 2.6151585649565323, + "grad_norm": 1.7825593948364258, + "learning_rate": 1.888544449243983e-05, + "loss": 0.5585, + "step": 16020 + }, + { + "epoch": 2.6153218235990368, + "grad_norm": 1.7522176504135132, + "learning_rate": 1.888529889877439e-05, + "loss": 0.509, + "step": 16021 + }, + { + "epoch": 2.615485082241541, + "grad_norm": 1.8622236251831055, + "learning_rate": 1.8885153296161456e-05, + "loss": 0.4859, + "step": 16022 + }, + { + "epoch": 2.6156483408840456, + "grad_norm": 1.5628907680511475, + "learning_rate": 1.8885007684601175e-05, + "loss": 0.5187, + "step": 16023 + }, + { + "epoch": 2.61581159952655, + "grad_norm": 1.306572675704956, + "learning_rate": 1.8884862064093698e-05, + "loss": 0.4316, + "step": 16024 + }, + { + "epoch": 2.6159748581690545, + "grad_norm": 2.157076358795166, + "learning_rate": 1.888471643463916e-05, + "loss": 0.6938, + "step": 16025 + }, + { + "epoch": 2.616138116811559, + "grad_norm": 1.960085153579712, + "learning_rate": 1.888457079623772e-05, + "loss": 0.5651, + "step": 16026 + }, + { + "epoch": 2.616301375454063, + "grad_norm": 1.660498023033142, + "learning_rate": 1.8884425148889517e-05, + "loss": 0.5688, + "step": 16027 + }, + { + "epoch": 2.6164646340965674, + "grad_norm": 1.741895079612732, + "learning_rate": 1.8884279492594705e-05, + "loss": 0.5044, + "step": 16028 + }, + { + "epoch": 2.616627892739072, + "grad_norm": 1.8037664890289307, + "learning_rate": 1.888413382735342e-05, + "loss": 0.5938, + "step": 16029 + }, + { + "epoch": 2.6167911513815763, + "grad_norm": 2.1374611854553223, + "learning_rate": 1.8883988153165815e-05, + "loss": 0.5723, + "step": 16030 + }, + { + "epoch": 2.6169544100240807, + "grad_norm": 1.8800091743469238, + "learning_rate": 1.8883842470032042e-05, + "loss": 0.5698, + "step": 16031 + }, + { + "epoch": 2.617117668666585, + "grad_norm": 2.136476755142212, + "learning_rate": 1.8883696777952236e-05, + "loss": 0.6524, + "step": 16032 + }, + { + "epoch": 2.6172809273090896, + "grad_norm": 1.968754529953003, + "learning_rate": 1.8883551076926552e-05, + "loss": 0.535, + "step": 16033 + }, + { + "epoch": 2.6174441859515936, + "grad_norm": 1.8754411935806274, + "learning_rate": 1.8883405366955134e-05, + "loss": 0.578, + "step": 16034 + }, + { + "epoch": 2.617607444594098, + "grad_norm": 1.8510242700576782, + "learning_rate": 1.888325964803813e-05, + "loss": 0.574, + "step": 16035 + }, + { + "epoch": 2.6177707032366024, + "grad_norm": 1.6279468536376953, + "learning_rate": 1.8883113920175687e-05, + "loss": 0.5077, + "step": 16036 + }, + { + "epoch": 2.617933961879107, + "grad_norm": 1.721198320388794, + "learning_rate": 1.888296818336795e-05, + "loss": 0.5512, + "step": 16037 + }, + { + "epoch": 2.6180972205216113, + "grad_norm": 1.9198247194290161, + "learning_rate": 1.888282243761506e-05, + "loss": 0.6996, + "step": 16038 + }, + { + "epoch": 2.6182604791641158, + "grad_norm": 1.865804672241211, + "learning_rate": 1.888267668291718e-05, + "loss": 0.5449, + "step": 16039 + }, + { + "epoch": 2.61842373780662, + "grad_norm": 1.718827247619629, + "learning_rate": 1.8882530919274442e-05, + "loss": 0.525, + "step": 16040 + }, + { + "epoch": 2.6185869964491246, + "grad_norm": 1.5678411722183228, + "learning_rate": 1.8882385146686997e-05, + "loss": 0.5419, + "step": 16041 + }, + { + "epoch": 2.618750255091629, + "grad_norm": 1.8673676252365112, + "learning_rate": 1.8882239365154996e-05, + "loss": 0.5889, + "step": 16042 + }, + { + "epoch": 2.6189135137341335, + "grad_norm": 1.54221773147583, + "learning_rate": 1.888209357467858e-05, + "loss": 0.5131, + "step": 16043 + }, + { + "epoch": 2.619076772376638, + "grad_norm": 1.9522465467453003, + "learning_rate": 1.8881947775257898e-05, + "loss": 0.6681, + "step": 16044 + }, + { + "epoch": 2.619240031019142, + "grad_norm": 1.5264219045639038, + "learning_rate": 1.8881801966893095e-05, + "loss": 0.5317, + "step": 16045 + }, + { + "epoch": 2.6194032896616464, + "grad_norm": 1.9213430881500244, + "learning_rate": 1.8881656149584323e-05, + "loss": 0.5448, + "step": 16046 + }, + { + "epoch": 2.619566548304151, + "grad_norm": 2.132582664489746, + "learning_rate": 1.8881510323331723e-05, + "loss": 0.7566, + "step": 16047 + }, + { + "epoch": 2.6197298069466552, + "grad_norm": 2.031709909439087, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.5826, + "step": 16048 + }, + { + "epoch": 2.6198930655891597, + "grad_norm": 1.8390910625457764, + "learning_rate": 1.8881218643995637e-05, + "loss": 0.6175, + "step": 16049 + }, + { + "epoch": 2.620056324231664, + "grad_norm": 1.7144988775253296, + "learning_rate": 1.8881072790912445e-05, + "loss": 0.5647, + "step": 16050 + }, + { + "epoch": 2.6202195828741686, + "grad_norm": 1.6184475421905518, + "learning_rate": 1.8880926928886012e-05, + "loss": 0.4858, + "step": 16051 + }, + { + "epoch": 2.6203828415166726, + "grad_norm": 2.1351308822631836, + "learning_rate": 1.888078105791649e-05, + "loss": 0.6459, + "step": 16052 + }, + { + "epoch": 2.620546100159177, + "grad_norm": 1.8463491201400757, + "learning_rate": 1.8880635178004024e-05, + "loss": 0.6501, + "step": 16053 + }, + { + "epoch": 2.6207093588016814, + "grad_norm": 1.6715669631958008, + "learning_rate": 1.888048928914876e-05, + "loss": 0.5513, + "step": 16054 + }, + { + "epoch": 2.620872617444186, + "grad_norm": 1.9554920196533203, + "learning_rate": 1.8880343391350845e-05, + "loss": 0.7249, + "step": 16055 + }, + { + "epoch": 2.6210358760866903, + "grad_norm": 1.59914231300354, + "learning_rate": 1.8880197484610427e-05, + "loss": 0.5215, + "step": 16056 + }, + { + "epoch": 2.6211991347291947, + "grad_norm": 1.7899322509765625, + "learning_rate": 1.8880051568927655e-05, + "loss": 0.6089, + "step": 16057 + }, + { + "epoch": 2.621362393371699, + "grad_norm": 1.810563087463379, + "learning_rate": 1.887990564430267e-05, + "loss": 0.6498, + "step": 16058 + }, + { + "epoch": 2.6215256520142036, + "grad_norm": 1.8868913650512695, + "learning_rate": 1.8879759710735625e-05, + "loss": 0.6287, + "step": 16059 + }, + { + "epoch": 2.621688910656708, + "grad_norm": 1.8164985179901123, + "learning_rate": 1.8879613768226662e-05, + "loss": 0.621, + "step": 16060 + }, + { + "epoch": 2.6218521692992125, + "grad_norm": 1.9097576141357422, + "learning_rate": 1.887946781677593e-05, + "loss": 0.5875, + "step": 16061 + }, + { + "epoch": 2.622015427941717, + "grad_norm": 1.8151062726974487, + "learning_rate": 1.887932185638358e-05, + "loss": 0.5762, + "step": 16062 + }, + { + "epoch": 2.622178686584221, + "grad_norm": 1.9798498153686523, + "learning_rate": 1.887917588704975e-05, + "loss": 0.6161, + "step": 16063 + }, + { + "epoch": 2.6223419452267254, + "grad_norm": 1.6471327543258667, + "learning_rate": 1.8879029908774594e-05, + "loss": 0.5381, + "step": 16064 + }, + { + "epoch": 2.62250520386923, + "grad_norm": 1.6532760858535767, + "learning_rate": 1.887888392155826e-05, + "loss": 0.5, + "step": 16065 + }, + { + "epoch": 2.6226684625117342, + "grad_norm": 1.4578909873962402, + "learning_rate": 1.887873792540089e-05, + "loss": 0.4605, + "step": 16066 + }, + { + "epoch": 2.6228317211542387, + "grad_norm": 1.70945143699646, + "learning_rate": 1.8878591920302637e-05, + "loss": 0.5323, + "step": 16067 + }, + { + "epoch": 2.622994979796743, + "grad_norm": 1.795084834098816, + "learning_rate": 1.887844590626364e-05, + "loss": 0.6355, + "step": 16068 + }, + { + "epoch": 2.623158238439247, + "grad_norm": 1.9677430391311646, + "learning_rate": 1.887829988328405e-05, + "loss": 0.5515, + "step": 16069 + }, + { + "epoch": 2.6233214970817516, + "grad_norm": 1.7874765396118164, + "learning_rate": 1.8878153851364013e-05, + "loss": 0.5875, + "step": 16070 + }, + { + "epoch": 2.623484755724256, + "grad_norm": 1.8282233476638794, + "learning_rate": 1.8878007810503683e-05, + "loss": 0.6815, + "step": 16071 + }, + { + "epoch": 2.6236480143667604, + "grad_norm": 1.4319413900375366, + "learning_rate": 1.8877861760703198e-05, + "loss": 0.5353, + "step": 16072 + }, + { + "epoch": 2.623811273009265, + "grad_norm": 1.6155452728271484, + "learning_rate": 1.887771570196271e-05, + "loss": 0.5258, + "step": 16073 + }, + { + "epoch": 2.6239745316517693, + "grad_norm": 1.999325156211853, + "learning_rate": 1.8877569634282363e-05, + "loss": 0.6271, + "step": 16074 + }, + { + "epoch": 2.6241377902942737, + "grad_norm": 1.917514443397522, + "learning_rate": 1.8877423557662307e-05, + "loss": 0.5758, + "step": 16075 + }, + { + "epoch": 2.624301048936778, + "grad_norm": 1.7548868656158447, + "learning_rate": 1.8877277472102687e-05, + "loss": 0.5443, + "step": 16076 + }, + { + "epoch": 2.6244643075792826, + "grad_norm": 1.596685528755188, + "learning_rate": 1.887713137760365e-05, + "loss": 0.5489, + "step": 16077 + }, + { + "epoch": 2.624627566221787, + "grad_norm": 1.501184344291687, + "learning_rate": 1.8876985274165345e-05, + "loss": 0.4903, + "step": 16078 + }, + { + "epoch": 2.6247908248642915, + "grad_norm": 2.070462226867676, + "learning_rate": 1.887683916178792e-05, + "loss": 0.594, + "step": 16079 + }, + { + "epoch": 2.6249540835067955, + "grad_norm": 1.7820243835449219, + "learning_rate": 1.8876693040471518e-05, + "loss": 0.5281, + "step": 16080 + }, + { + "epoch": 2.6251173421493, + "grad_norm": 2.274573802947998, + "learning_rate": 1.887654691021629e-05, + "loss": 0.6145, + "step": 16081 + }, + { + "epoch": 2.6252806007918044, + "grad_norm": 1.9335205554962158, + "learning_rate": 1.887640077102238e-05, + "loss": 0.5572, + "step": 16082 + }, + { + "epoch": 2.625443859434309, + "grad_norm": 1.6394728422164917, + "learning_rate": 1.887625462288994e-05, + "loss": 0.5724, + "step": 16083 + }, + { + "epoch": 2.6256071180768132, + "grad_norm": 1.9937756061553955, + "learning_rate": 1.887610846581911e-05, + "loss": 0.6163, + "step": 16084 + }, + { + "epoch": 2.6257703767193177, + "grad_norm": 1.7527790069580078, + "learning_rate": 1.8875962299810042e-05, + "loss": 0.507, + "step": 16085 + }, + { + "epoch": 2.625933635361822, + "grad_norm": 1.701773762702942, + "learning_rate": 1.8875816124862885e-05, + "loss": 0.5451, + "step": 16086 + }, + { + "epoch": 2.626096894004326, + "grad_norm": 1.7611727714538574, + "learning_rate": 1.887566994097778e-05, + "loss": 0.5088, + "step": 16087 + }, + { + "epoch": 2.6262601526468305, + "grad_norm": 1.3634651899337769, + "learning_rate": 1.887552374815488e-05, + "loss": 0.4625, + "step": 16088 + }, + { + "epoch": 2.626423411289335, + "grad_norm": 1.5296481847763062, + "learning_rate": 1.887537754639433e-05, + "loss": 0.4768, + "step": 16089 + }, + { + "epoch": 2.6265866699318394, + "grad_norm": 1.5896655321121216, + "learning_rate": 1.8875231335696277e-05, + "loss": 0.5251, + "step": 16090 + }, + { + "epoch": 2.626749928574344, + "grad_norm": 1.8863362073898315, + "learning_rate": 1.8875085116060865e-05, + "loss": 0.5552, + "step": 16091 + }, + { + "epoch": 2.6269131872168483, + "grad_norm": 1.7911053895950317, + "learning_rate": 1.887493888748825e-05, + "loss": 0.625, + "step": 16092 + }, + { + "epoch": 2.6270764458593527, + "grad_norm": 1.4910928010940552, + "learning_rate": 1.887479264997857e-05, + "loss": 0.459, + "step": 16093 + }, + { + "epoch": 2.627239704501857, + "grad_norm": 1.7951254844665527, + "learning_rate": 1.8874646403531978e-05, + "loss": 0.6646, + "step": 16094 + }, + { + "epoch": 2.6274029631443616, + "grad_norm": 1.73624849319458, + "learning_rate": 1.8874500148148617e-05, + "loss": 0.535, + "step": 16095 + }, + { + "epoch": 2.627566221786866, + "grad_norm": 1.7552388906478882, + "learning_rate": 1.8874353883828643e-05, + "loss": 0.5179, + "step": 16096 + }, + { + "epoch": 2.6277294804293705, + "grad_norm": 1.8437241315841675, + "learning_rate": 1.887420761057219e-05, + "loss": 0.5855, + "step": 16097 + }, + { + "epoch": 2.6278927390718745, + "grad_norm": 1.7313461303710938, + "learning_rate": 1.8874061328379416e-05, + "loss": 0.6025, + "step": 16098 + }, + { + "epoch": 2.628055997714379, + "grad_norm": 1.8763130903244019, + "learning_rate": 1.8873915037250463e-05, + "loss": 0.5781, + "step": 16099 + }, + { + "epoch": 2.6282192563568834, + "grad_norm": 1.7636576890945435, + "learning_rate": 1.887376873718548e-05, + "loss": 0.5688, + "step": 16100 + }, + { + "epoch": 2.628382514999388, + "grad_norm": 1.8299024105072021, + "learning_rate": 1.8873622428184616e-05, + "loss": 0.5631, + "step": 16101 + }, + { + "epoch": 2.6285457736418922, + "grad_norm": 1.7506914138793945, + "learning_rate": 1.8873476110248015e-05, + "loss": 0.5106, + "step": 16102 + }, + { + "epoch": 2.6287090322843967, + "grad_norm": 1.6863343715667725, + "learning_rate": 1.8873329783375823e-05, + "loss": 0.5548, + "step": 16103 + }, + { + "epoch": 2.6288722909269007, + "grad_norm": 1.896637201309204, + "learning_rate": 1.8873183447568195e-05, + "loss": 0.6055, + "step": 16104 + }, + { + "epoch": 2.629035549569405, + "grad_norm": 1.4419078826904297, + "learning_rate": 1.8873037102825275e-05, + "loss": 0.4617, + "step": 16105 + }, + { + "epoch": 2.6291988082119095, + "grad_norm": 2.1398956775665283, + "learning_rate": 1.8872890749147204e-05, + "loss": 0.655, + "step": 16106 + }, + { + "epoch": 2.629362066854414, + "grad_norm": 2.2492048740386963, + "learning_rate": 1.8872744386534138e-05, + "loss": 0.5584, + "step": 16107 + }, + { + "epoch": 2.6295253254969184, + "grad_norm": 1.6752861738204956, + "learning_rate": 1.8872598014986216e-05, + "loss": 0.5328, + "step": 16108 + }, + { + "epoch": 2.629688584139423, + "grad_norm": 1.6917341947555542, + "learning_rate": 1.8872451634503594e-05, + "loss": 0.5753, + "step": 16109 + }, + { + "epoch": 2.6298518427819273, + "grad_norm": 2.009004592895508, + "learning_rate": 1.8872305245086414e-05, + "loss": 0.5951, + "step": 16110 + }, + { + "epoch": 2.6300151014244317, + "grad_norm": 1.724295973777771, + "learning_rate": 1.8872158846734826e-05, + "loss": 0.6016, + "step": 16111 + }, + { + "epoch": 2.630178360066936, + "grad_norm": 1.5764682292938232, + "learning_rate": 1.8872012439448978e-05, + "loss": 0.4724, + "step": 16112 + }, + { + "epoch": 2.6303416187094406, + "grad_norm": 1.9193719625473022, + "learning_rate": 1.8871866023229013e-05, + "loss": 0.5805, + "step": 16113 + }, + { + "epoch": 2.630504877351945, + "grad_norm": 1.9446511268615723, + "learning_rate": 1.8871719598075083e-05, + "loss": 0.638, + "step": 16114 + }, + { + "epoch": 2.630668135994449, + "grad_norm": 1.6109675168991089, + "learning_rate": 1.8871573163987334e-05, + "loss": 0.5152, + "step": 16115 + }, + { + "epoch": 2.6308313946369535, + "grad_norm": 1.8202100992202759, + "learning_rate": 1.8871426720965915e-05, + "loss": 0.637, + "step": 16116 + }, + { + "epoch": 2.630994653279458, + "grad_norm": 1.6964645385742188, + "learning_rate": 1.8871280269010964e-05, + "loss": 0.5944, + "step": 16117 + }, + { + "epoch": 2.6311579119219624, + "grad_norm": 1.760259985923767, + "learning_rate": 1.8871133808122642e-05, + "loss": 0.5621, + "step": 16118 + }, + { + "epoch": 2.631321170564467, + "grad_norm": 1.672680139541626, + "learning_rate": 1.887098733830109e-05, + "loss": 0.5452, + "step": 16119 + }, + { + "epoch": 2.6314844292069712, + "grad_norm": 1.8114062547683716, + "learning_rate": 1.8870840859546455e-05, + "loss": 0.4889, + "step": 16120 + }, + { + "epoch": 2.6316476878494757, + "grad_norm": 1.7800145149230957, + "learning_rate": 1.8870694371858888e-05, + "loss": 0.6913, + "step": 16121 + }, + { + "epoch": 2.6318109464919797, + "grad_norm": 1.6153721809387207, + "learning_rate": 1.887054787523853e-05, + "loss": 0.5644, + "step": 16122 + }, + { + "epoch": 2.631974205134484, + "grad_norm": 1.689859390258789, + "learning_rate": 1.8870401369685535e-05, + "loss": 0.5183, + "step": 16123 + }, + { + "epoch": 2.6321374637769885, + "grad_norm": 1.8619061708450317, + "learning_rate": 1.887025485520005e-05, + "loss": 0.5424, + "step": 16124 + }, + { + "epoch": 2.632300722419493, + "grad_norm": 1.7084953784942627, + "learning_rate": 1.887010833178222e-05, + "loss": 0.5882, + "step": 16125 + }, + { + "epoch": 2.6324639810619974, + "grad_norm": 1.764589786529541, + "learning_rate": 1.886996179943219e-05, + "loss": 0.62, + "step": 16126 + }, + { + "epoch": 2.632627239704502, + "grad_norm": 2.0343589782714844, + "learning_rate": 1.8869815258150114e-05, + "loss": 0.6857, + "step": 16127 + }, + { + "epoch": 2.6327904983470063, + "grad_norm": 1.246065616607666, + "learning_rate": 1.8869668707936137e-05, + "loss": 0.3738, + "step": 16128 + }, + { + "epoch": 2.6329537569895107, + "grad_norm": 1.7290823459625244, + "learning_rate": 1.8869522148790404e-05, + "loss": 0.5381, + "step": 16129 + }, + { + "epoch": 2.633117015632015, + "grad_norm": 1.8423837423324585, + "learning_rate": 1.8869375580713064e-05, + "loss": 0.6456, + "step": 16130 + }, + { + "epoch": 2.6332802742745196, + "grad_norm": 1.8904454708099365, + "learning_rate": 1.8869229003704266e-05, + "loss": 0.6426, + "step": 16131 + }, + { + "epoch": 2.633443532917024, + "grad_norm": 1.729357123374939, + "learning_rate": 1.8869082417764154e-05, + "loss": 0.5179, + "step": 16132 + }, + { + "epoch": 2.633606791559528, + "grad_norm": 1.7497581243515015, + "learning_rate": 1.8868935822892885e-05, + "loss": 0.6118, + "step": 16133 + }, + { + "epoch": 2.6337700502020325, + "grad_norm": 1.662306785583496, + "learning_rate": 1.8868789219090596e-05, + "loss": 0.5083, + "step": 16134 + }, + { + "epoch": 2.633933308844537, + "grad_norm": 2.0248637199401855, + "learning_rate": 1.886864260635744e-05, + "loss": 0.7194, + "step": 16135 + }, + { + "epoch": 2.6340965674870414, + "grad_norm": 1.7927619218826294, + "learning_rate": 1.886849598469356e-05, + "loss": 0.655, + "step": 16136 + }, + { + "epoch": 2.634259826129546, + "grad_norm": 1.553969144821167, + "learning_rate": 1.886834935409911e-05, + "loss": 0.5198, + "step": 16137 + }, + { + "epoch": 2.6344230847720502, + "grad_norm": 1.825742483139038, + "learning_rate": 1.8868202714574232e-05, + "loss": 0.5596, + "step": 16138 + }, + { + "epoch": 2.6345863434145547, + "grad_norm": 1.827074408531189, + "learning_rate": 1.886805606611908e-05, + "loss": 0.5739, + "step": 16139 + }, + { + "epoch": 2.6347496020570587, + "grad_norm": 1.9935625791549683, + "learning_rate": 1.8867909408733794e-05, + "loss": 0.6928, + "step": 16140 + }, + { + "epoch": 2.634912860699563, + "grad_norm": 1.7250573635101318, + "learning_rate": 1.886776274241853e-05, + "loss": 0.5739, + "step": 16141 + }, + { + "epoch": 2.6350761193420675, + "grad_norm": 1.816119909286499, + "learning_rate": 1.8867616067173425e-05, + "loss": 0.5935, + "step": 16142 + }, + { + "epoch": 2.635239377984572, + "grad_norm": 1.7341365814208984, + "learning_rate": 1.8867469382998636e-05, + "loss": 0.4416, + "step": 16143 + }, + { + "epoch": 2.6354026366270764, + "grad_norm": 2.303443431854248, + "learning_rate": 1.8867322689894307e-05, + "loss": 0.6315, + "step": 16144 + }, + { + "epoch": 2.635565895269581, + "grad_norm": 2.281344175338745, + "learning_rate": 1.886717598786059e-05, + "loss": 0.7293, + "step": 16145 + }, + { + "epoch": 2.6357291539120853, + "grad_norm": 1.8779067993164062, + "learning_rate": 1.8867029276897625e-05, + "loss": 0.5825, + "step": 16146 + }, + { + "epoch": 2.6358924125545897, + "grad_norm": 1.6128805875778198, + "learning_rate": 1.8866882557005567e-05, + "loss": 0.5189, + "step": 16147 + }, + { + "epoch": 2.636055671197094, + "grad_norm": 1.7921674251556396, + "learning_rate": 1.886673582818456e-05, + "loss": 0.514, + "step": 16148 + }, + { + "epoch": 2.6362189298395986, + "grad_norm": 1.586660385131836, + "learning_rate": 1.886658909043475e-05, + "loss": 0.5449, + "step": 16149 + }, + { + "epoch": 2.636382188482103, + "grad_norm": 1.739637851715088, + "learning_rate": 1.8866442343756288e-05, + "loss": 0.5927, + "step": 16150 + }, + { + "epoch": 2.636545447124607, + "grad_norm": 1.9291633367538452, + "learning_rate": 1.8866295588149323e-05, + "loss": 0.6249, + "step": 16151 + }, + { + "epoch": 2.6367087057671115, + "grad_norm": 1.7400315999984741, + "learning_rate": 1.8866148823613998e-05, + "loss": 0.5373, + "step": 16152 + }, + { + "epoch": 2.636871964409616, + "grad_norm": 2.225931167602539, + "learning_rate": 1.8866002050150463e-05, + "loss": 0.596, + "step": 16153 + }, + { + "epoch": 2.6370352230521203, + "grad_norm": 2.1829609870910645, + "learning_rate": 1.886585526775887e-05, + "loss": 0.7269, + "step": 16154 + }, + { + "epoch": 2.637198481694625, + "grad_norm": 1.969670057296753, + "learning_rate": 1.8865708476439363e-05, + "loss": 0.5938, + "step": 16155 + }, + { + "epoch": 2.6373617403371292, + "grad_norm": 1.846751093864441, + "learning_rate": 1.8865561676192085e-05, + "loss": 0.5662, + "step": 16156 + }, + { + "epoch": 2.637524998979633, + "grad_norm": 2.0010201930999756, + "learning_rate": 1.8865414867017194e-05, + "loss": 0.6319, + "step": 16157 + }, + { + "epoch": 2.6376882576221377, + "grad_norm": 1.7987765073776245, + "learning_rate": 1.8865268048914828e-05, + "loss": 0.6771, + "step": 16158 + }, + { + "epoch": 2.637851516264642, + "grad_norm": 2.124239683151245, + "learning_rate": 1.8865121221885143e-05, + "loss": 0.62, + "step": 16159 + }, + { + "epoch": 2.6380147749071465, + "grad_norm": 1.831083059310913, + "learning_rate": 1.8864974385928284e-05, + "loss": 0.5243, + "step": 16160 + }, + { + "epoch": 2.638178033549651, + "grad_norm": 1.8009288311004639, + "learning_rate": 1.8864827541044396e-05, + "loss": 0.5931, + "step": 16161 + }, + { + "epoch": 2.6383412921921554, + "grad_norm": 1.7230037450790405, + "learning_rate": 1.8864680687233627e-05, + "loss": 0.5868, + "step": 16162 + }, + { + "epoch": 2.63850455083466, + "grad_norm": 1.6651865243911743, + "learning_rate": 1.886453382449613e-05, + "loss": 0.5005, + "step": 16163 + }, + { + "epoch": 2.6386678094771643, + "grad_norm": 1.809056043624878, + "learning_rate": 1.886438695283205e-05, + "loss": 0.5999, + "step": 16164 + }, + { + "epoch": 2.6388310681196687, + "grad_norm": 1.9765888452529907, + "learning_rate": 1.8864240072241533e-05, + "loss": 0.6539, + "step": 16165 + }, + { + "epoch": 2.638994326762173, + "grad_norm": 1.644740104675293, + "learning_rate": 1.886409318272473e-05, + "loss": 0.4967, + "step": 16166 + }, + { + "epoch": 2.6391575854046776, + "grad_norm": 1.8078258037567139, + "learning_rate": 1.8863946284281787e-05, + "loss": 0.5985, + "step": 16167 + }, + { + "epoch": 2.6393208440471816, + "grad_norm": 2.0044445991516113, + "learning_rate": 1.886379937691285e-05, + "loss": 0.609, + "step": 16168 + }, + { + "epoch": 2.639484102689686, + "grad_norm": 1.7423810958862305, + "learning_rate": 1.886365246061807e-05, + "loss": 0.5789, + "step": 16169 + }, + { + "epoch": 2.6396473613321905, + "grad_norm": 1.798461675643921, + "learning_rate": 1.8863505535397597e-05, + "loss": 0.6076, + "step": 16170 + }, + { + "epoch": 2.639810619974695, + "grad_norm": 1.9436919689178467, + "learning_rate": 1.8863358601251577e-05, + "loss": 0.6663, + "step": 16171 + }, + { + "epoch": 2.6399738786171993, + "grad_norm": 1.5544646978378296, + "learning_rate": 1.8863211658180154e-05, + "loss": 0.5383, + "step": 16172 + }, + { + "epoch": 2.640137137259704, + "grad_norm": 1.7842367887496948, + "learning_rate": 1.886306470618348e-05, + "loss": 0.5444, + "step": 16173 + }, + { + "epoch": 2.640300395902208, + "grad_norm": 1.795445442199707, + "learning_rate": 1.8862917745261703e-05, + "loss": 0.5532, + "step": 16174 + }, + { + "epoch": 2.640463654544712, + "grad_norm": 1.665205478668213, + "learning_rate": 1.8862770775414973e-05, + "loss": 0.5093, + "step": 16175 + }, + { + "epoch": 2.6406269131872167, + "grad_norm": 1.816694736480713, + "learning_rate": 1.8862623796643432e-05, + "loss": 0.56, + "step": 16176 + }, + { + "epoch": 2.640790171829721, + "grad_norm": 2.035830020904541, + "learning_rate": 1.886247680894723e-05, + "loss": 0.7155, + "step": 16177 + }, + { + "epoch": 2.6409534304722255, + "grad_norm": 1.5601911544799805, + "learning_rate": 1.8862329812326518e-05, + "loss": 0.5756, + "step": 16178 + }, + { + "epoch": 2.64111668911473, + "grad_norm": 1.8580330610275269, + "learning_rate": 1.8862182806781438e-05, + "loss": 0.5196, + "step": 16179 + }, + { + "epoch": 2.6412799477572344, + "grad_norm": 1.6392097473144531, + "learning_rate": 1.8862035792312148e-05, + "loss": 0.5782, + "step": 16180 + }, + { + "epoch": 2.641443206399739, + "grad_norm": 1.7013221979141235, + "learning_rate": 1.886188876891879e-05, + "loss": 0.5522, + "step": 16181 + }, + { + "epoch": 2.6416064650422433, + "grad_norm": 1.7511489391326904, + "learning_rate": 1.886174173660151e-05, + "loss": 0.4866, + "step": 16182 + }, + { + "epoch": 2.6417697236847477, + "grad_norm": 1.7694509029388428, + "learning_rate": 1.886159469536046e-05, + "loss": 0.5365, + "step": 16183 + }, + { + "epoch": 2.641932982327252, + "grad_norm": 1.851187825202942, + "learning_rate": 1.8861447645195784e-05, + "loss": 0.5835, + "step": 16184 + }, + { + "epoch": 2.6420962409697566, + "grad_norm": 2.531167507171631, + "learning_rate": 1.8861300586107635e-05, + "loss": 0.7188, + "step": 16185 + }, + { + "epoch": 2.6422594996122606, + "grad_norm": 2.0404052734375, + "learning_rate": 1.886115351809616e-05, + "loss": 0.6644, + "step": 16186 + }, + { + "epoch": 2.642422758254765, + "grad_norm": 1.6433522701263428, + "learning_rate": 1.8861006441161502e-05, + "loss": 0.5687, + "step": 16187 + }, + { + "epoch": 2.6425860168972695, + "grad_norm": 1.792381763458252, + "learning_rate": 1.8860859355303815e-05, + "loss": 0.5123, + "step": 16188 + }, + { + "epoch": 2.642749275539774, + "grad_norm": 1.9141826629638672, + "learning_rate": 1.8860712260523245e-05, + "loss": 0.6684, + "step": 16189 + }, + { + "epoch": 2.6429125341822783, + "grad_norm": 1.4881551265716553, + "learning_rate": 1.8860565156819935e-05, + "loss": 0.4516, + "step": 16190 + }, + { + "epoch": 2.6430757928247828, + "grad_norm": 1.9429618120193481, + "learning_rate": 1.8860418044194048e-05, + "loss": 0.6601, + "step": 16191 + }, + { + "epoch": 2.643239051467287, + "grad_norm": 1.9214049577713013, + "learning_rate": 1.8860270922645716e-05, + "loss": 0.6405, + "step": 16192 + }, + { + "epoch": 2.643402310109791, + "grad_norm": 1.4729013442993164, + "learning_rate": 1.8860123792175094e-05, + "loss": 0.4605, + "step": 16193 + }, + { + "epoch": 2.6435655687522956, + "grad_norm": 1.8745107650756836, + "learning_rate": 1.885997665278233e-05, + "loss": 0.6432, + "step": 16194 + }, + { + "epoch": 2.6437288273948, + "grad_norm": 1.6613126993179321, + "learning_rate": 1.8859829504467573e-05, + "loss": 0.6369, + "step": 16195 + }, + { + "epoch": 2.6438920860373045, + "grad_norm": 1.7476122379302979, + "learning_rate": 1.8859682347230968e-05, + "loss": 0.5818, + "step": 16196 + }, + { + "epoch": 2.644055344679809, + "grad_norm": 1.8865282535552979, + "learning_rate": 1.885953518107267e-05, + "loss": 0.6126, + "step": 16197 + }, + { + "epoch": 2.6442186033223134, + "grad_norm": 1.722013235092163, + "learning_rate": 1.8859388005992817e-05, + "loss": 0.575, + "step": 16198 + }, + { + "epoch": 2.644381861964818, + "grad_norm": 2.230564594268799, + "learning_rate": 1.8859240821991563e-05, + "loss": 0.6261, + "step": 16199 + }, + { + "epoch": 2.6445451206073223, + "grad_norm": 1.7928712368011475, + "learning_rate": 1.8859093629069057e-05, + "loss": 0.608, + "step": 16200 + }, + { + "epoch": 2.6447083792498267, + "grad_norm": 1.8802642822265625, + "learning_rate": 1.8858946427225447e-05, + "loss": 0.5181, + "step": 16201 + }, + { + "epoch": 2.644871637892331, + "grad_norm": 1.7463246583938599, + "learning_rate": 1.8858799216460883e-05, + "loss": 0.5898, + "step": 16202 + }, + { + "epoch": 2.6450348965348356, + "grad_norm": 1.863582730293274, + "learning_rate": 1.8858651996775506e-05, + "loss": 0.5035, + "step": 16203 + }, + { + "epoch": 2.6451981551773396, + "grad_norm": 1.9661266803741455, + "learning_rate": 1.8858504768169467e-05, + "loss": 0.6467, + "step": 16204 + }, + { + "epoch": 2.645361413819844, + "grad_norm": 1.652485966682434, + "learning_rate": 1.885835753064292e-05, + "loss": 0.4884, + "step": 16205 + }, + { + "epoch": 2.6455246724623485, + "grad_norm": 1.7477139234542847, + "learning_rate": 1.885821028419601e-05, + "loss": 0.5954, + "step": 16206 + }, + { + "epoch": 2.645687931104853, + "grad_norm": 2.143411874771118, + "learning_rate": 1.8858063028828882e-05, + "loss": 0.8235, + "step": 16207 + }, + { + "epoch": 2.6458511897473573, + "grad_norm": 1.5023454427719116, + "learning_rate": 1.8857915764541688e-05, + "loss": 0.5487, + "step": 16208 + }, + { + "epoch": 2.6460144483898618, + "grad_norm": 1.6875169277191162, + "learning_rate": 1.8857768491334573e-05, + "loss": 0.5038, + "step": 16209 + }, + { + "epoch": 2.6461777070323658, + "grad_norm": 1.9951764345169067, + "learning_rate": 1.8857621209207693e-05, + "loss": 0.6007, + "step": 16210 + }, + { + "epoch": 2.64634096567487, + "grad_norm": 1.3140172958374023, + "learning_rate": 1.8857473918161187e-05, + "loss": 0.4011, + "step": 16211 + }, + { + "epoch": 2.6465042243173746, + "grad_norm": 1.5986403226852417, + "learning_rate": 1.8857326618195207e-05, + "loss": 0.6061, + "step": 16212 + }, + { + "epoch": 2.646667482959879, + "grad_norm": 1.9875656366348267, + "learning_rate": 1.8857179309309902e-05, + "loss": 0.5583, + "step": 16213 + }, + { + "epoch": 2.6468307416023835, + "grad_norm": 1.7265067100524902, + "learning_rate": 1.885703199150542e-05, + "loss": 0.5153, + "step": 16214 + }, + { + "epoch": 2.646994000244888, + "grad_norm": 1.6552222967147827, + "learning_rate": 1.8856884664781908e-05, + "loss": 0.5433, + "step": 16215 + }, + { + "epoch": 2.6471572588873924, + "grad_norm": 1.86235511302948, + "learning_rate": 1.8856737329139517e-05, + "loss": 0.587, + "step": 16216 + }, + { + "epoch": 2.647320517529897, + "grad_norm": 1.6712238788604736, + "learning_rate": 1.8856589984578394e-05, + "loss": 0.5225, + "step": 16217 + }, + { + "epoch": 2.6474837761724013, + "grad_norm": 1.5294249057769775, + "learning_rate": 1.8856442631098685e-05, + "loss": 0.5374, + "step": 16218 + }, + { + "epoch": 2.6476470348149057, + "grad_norm": 1.8831952810287476, + "learning_rate": 1.8856295268700542e-05, + "loss": 0.5528, + "step": 16219 + }, + { + "epoch": 2.64781029345741, + "grad_norm": 1.9194308519363403, + "learning_rate": 1.8856147897384112e-05, + "loss": 0.533, + "step": 16220 + }, + { + "epoch": 2.647973552099914, + "grad_norm": 1.6087665557861328, + "learning_rate": 1.8856000517149546e-05, + "loss": 0.4849, + "step": 16221 + }, + { + "epoch": 2.6481368107424186, + "grad_norm": 1.7546992301940918, + "learning_rate": 1.8855853127996987e-05, + "loss": 0.5249, + "step": 16222 + }, + { + "epoch": 2.648300069384923, + "grad_norm": 1.6486681699752808, + "learning_rate": 1.8855705729926583e-05, + "loss": 0.5105, + "step": 16223 + }, + { + "epoch": 2.6484633280274275, + "grad_norm": 1.7884869575500488, + "learning_rate": 1.8855558322938492e-05, + "loss": 0.558, + "step": 16224 + }, + { + "epoch": 2.648626586669932, + "grad_norm": 1.7307721376419067, + "learning_rate": 1.8855410907032854e-05, + "loss": 0.5633, + "step": 16225 + }, + { + "epoch": 2.6487898453124363, + "grad_norm": 1.789711356163025, + "learning_rate": 1.8855263482209817e-05, + "loss": 0.439, + "step": 16226 + }, + { + "epoch": 2.6489531039549408, + "grad_norm": 1.885918378829956, + "learning_rate": 1.885511604846953e-05, + "loss": 0.6275, + "step": 16227 + }, + { + "epoch": 2.6491163625974448, + "grad_norm": 1.5928608179092407, + "learning_rate": 1.885496860581215e-05, + "loss": 0.5639, + "step": 16228 + }, + { + "epoch": 2.649279621239949, + "grad_norm": 1.3987754583358765, + "learning_rate": 1.885482115423782e-05, + "loss": 0.4712, + "step": 16229 + }, + { + "epoch": 2.6494428798824536, + "grad_norm": 1.635408639907837, + "learning_rate": 1.8854673693746683e-05, + "loss": 0.529, + "step": 16230 + }, + { + "epoch": 2.649606138524958, + "grad_norm": 1.9827603101730347, + "learning_rate": 1.885452622433889e-05, + "loss": 0.6958, + "step": 16231 + }, + { + "epoch": 2.6497693971674625, + "grad_norm": 1.8621461391448975, + "learning_rate": 1.8854378746014595e-05, + "loss": 0.5694, + "step": 16232 + }, + { + "epoch": 2.649932655809967, + "grad_norm": 1.4278231859207153, + "learning_rate": 1.8854231258773944e-05, + "loss": 0.5078, + "step": 16233 + }, + { + "epoch": 2.6500959144524714, + "grad_norm": 1.6745015382766724, + "learning_rate": 1.885408376261708e-05, + "loss": 0.4491, + "step": 16234 + }, + { + "epoch": 2.650259173094976, + "grad_norm": 1.8383516073226929, + "learning_rate": 1.885393625754416e-05, + "loss": 0.5309, + "step": 16235 + }, + { + "epoch": 2.6504224317374803, + "grad_norm": 1.7920525074005127, + "learning_rate": 1.885378874355533e-05, + "loss": 0.5321, + "step": 16236 + }, + { + "epoch": 2.6505856903799847, + "grad_norm": 1.893619179725647, + "learning_rate": 1.885364122065073e-05, + "loss": 0.6308, + "step": 16237 + }, + { + "epoch": 2.650748949022489, + "grad_norm": 1.8555587530136108, + "learning_rate": 1.8853493688830523e-05, + "loss": 0.5851, + "step": 16238 + }, + { + "epoch": 2.650912207664993, + "grad_norm": 2.1000163555145264, + "learning_rate": 1.8853346148094848e-05, + "loss": 1.1442, + "step": 16239 + }, + { + "epoch": 2.6510754663074976, + "grad_norm": 2.2729551792144775, + "learning_rate": 1.885319859844385e-05, + "loss": 0.6307, + "step": 16240 + }, + { + "epoch": 2.651238724950002, + "grad_norm": 2.029829502105713, + "learning_rate": 1.885305103987769e-05, + "loss": 0.6656, + "step": 16241 + }, + { + "epoch": 2.6514019835925065, + "grad_norm": 1.7489749193191528, + "learning_rate": 1.8852903472396507e-05, + "loss": 0.5407, + "step": 16242 + }, + { + "epoch": 2.651565242235011, + "grad_norm": 1.7285611629486084, + "learning_rate": 1.8852755896000458e-05, + "loss": 0.5766, + "step": 16243 + }, + { + "epoch": 2.6517285008775153, + "grad_norm": 2.9301564693450928, + "learning_rate": 1.885260831068968e-05, + "loss": 0.6094, + "step": 16244 + }, + { + "epoch": 2.6518917595200193, + "grad_norm": 1.8312351703643799, + "learning_rate": 1.885246071646433e-05, + "loss": 0.6549, + "step": 16245 + }, + { + "epoch": 2.6520550181625238, + "grad_norm": 1.7664475440979004, + "learning_rate": 1.8852313113324553e-05, + "loss": 0.6417, + "step": 16246 + }, + { + "epoch": 2.652218276805028, + "grad_norm": 1.6098604202270508, + "learning_rate": 1.88521655012705e-05, + "loss": 0.5551, + "step": 16247 + }, + { + "epoch": 2.6523815354475326, + "grad_norm": 1.5812747478485107, + "learning_rate": 1.885201788030232e-05, + "loss": 0.4848, + "step": 16248 + }, + { + "epoch": 2.652544794090037, + "grad_norm": 1.7860987186431885, + "learning_rate": 1.8851870250420157e-05, + "loss": 0.6558, + "step": 16249 + }, + { + "epoch": 2.6527080527325415, + "grad_norm": 1.6866869926452637, + "learning_rate": 1.8851722611624166e-05, + "loss": 0.5354, + "step": 16250 + }, + { + "epoch": 2.652871311375046, + "grad_norm": 1.724359154701233, + "learning_rate": 1.8851574963914495e-05, + "loss": 0.6485, + "step": 16251 + }, + { + "epoch": 2.6530345700175504, + "grad_norm": 1.6165668964385986, + "learning_rate": 1.8851427307291286e-05, + "loss": 0.5707, + "step": 16252 + }, + { + "epoch": 2.653197828660055, + "grad_norm": 1.714489221572876, + "learning_rate": 1.885127964175469e-05, + "loss": 0.5052, + "step": 16253 + }, + { + "epoch": 2.6533610873025593, + "grad_norm": 1.3358865976333618, + "learning_rate": 1.8851131967304864e-05, + "loss": 0.4177, + "step": 16254 + }, + { + "epoch": 2.6535243459450637, + "grad_norm": 1.8005118370056152, + "learning_rate": 1.8850984283941947e-05, + "loss": 0.6258, + "step": 16255 + }, + { + "epoch": 2.6536876045875677, + "grad_norm": 1.8048179149627686, + "learning_rate": 1.8850836591666094e-05, + "loss": 0.5419, + "step": 16256 + }, + { + "epoch": 2.653850863230072, + "grad_norm": 1.573318362236023, + "learning_rate": 1.8850688890477446e-05, + "loss": 0.4881, + "step": 16257 + }, + { + "epoch": 2.6540141218725766, + "grad_norm": 1.639529824256897, + "learning_rate": 1.885054118037616e-05, + "loss": 0.5152, + "step": 16258 + }, + { + "epoch": 2.654177380515081, + "grad_norm": 1.6184738874435425, + "learning_rate": 1.885039346136238e-05, + "loss": 0.5439, + "step": 16259 + }, + { + "epoch": 2.6543406391575854, + "grad_norm": 2.0479931831359863, + "learning_rate": 1.8850245733436255e-05, + "loss": 0.5914, + "step": 16260 + }, + { + "epoch": 2.65450389780009, + "grad_norm": 1.4357616901397705, + "learning_rate": 1.885009799659794e-05, + "loss": 0.4732, + "step": 16261 + }, + { + "epoch": 2.6546671564425943, + "grad_norm": 1.6171537637710571, + "learning_rate": 1.8849950250847575e-05, + "loss": 0.5625, + "step": 16262 + }, + { + "epoch": 2.6548304150850983, + "grad_norm": 1.7594616413116455, + "learning_rate": 1.884980249618531e-05, + "loss": 0.581, + "step": 16263 + }, + { + "epoch": 2.6549936737276028, + "grad_norm": 1.5942631959915161, + "learning_rate": 1.88496547326113e-05, + "loss": 0.5962, + "step": 16264 + }, + { + "epoch": 2.655156932370107, + "grad_norm": 1.7833468914031982, + "learning_rate": 1.884950696012569e-05, + "loss": 0.5352, + "step": 16265 + }, + { + "epoch": 2.6553201910126116, + "grad_norm": 1.8331431150436401, + "learning_rate": 1.8849359178728628e-05, + "loss": 0.5857, + "step": 16266 + }, + { + "epoch": 2.655483449655116, + "grad_norm": 1.709747076034546, + "learning_rate": 1.8849211388420262e-05, + "loss": 0.5066, + "step": 16267 + }, + { + "epoch": 2.6556467082976205, + "grad_norm": 1.4912408590316772, + "learning_rate": 1.8849063589200744e-05, + "loss": 0.4892, + "step": 16268 + }, + { + "epoch": 2.655809966940125, + "grad_norm": 1.5374032258987427, + "learning_rate": 1.8848915781070222e-05, + "loss": 0.5589, + "step": 16269 + }, + { + "epoch": 2.6559732255826294, + "grad_norm": 2.048097610473633, + "learning_rate": 1.8848767964028846e-05, + "loss": 0.5935, + "step": 16270 + }, + { + "epoch": 2.656136484225134, + "grad_norm": 1.604047417640686, + "learning_rate": 1.8848620138076758e-05, + "loss": 0.5658, + "step": 16271 + }, + { + "epoch": 2.6562997428676383, + "grad_norm": 1.857582688331604, + "learning_rate": 1.8848472303214113e-05, + "loss": 0.6611, + "step": 16272 + }, + { + "epoch": 2.6564630015101427, + "grad_norm": 1.8161566257476807, + "learning_rate": 1.884832445944106e-05, + "loss": 0.5649, + "step": 16273 + }, + { + "epoch": 2.6566262601526467, + "grad_norm": 1.669310450553894, + "learning_rate": 1.8848176606757745e-05, + "loss": 0.6082, + "step": 16274 + }, + { + "epoch": 2.656789518795151, + "grad_norm": 2.02079176902771, + "learning_rate": 1.8848028745164323e-05, + "loss": 0.7283, + "step": 16275 + }, + { + "epoch": 2.6569527774376556, + "grad_norm": 1.805809497833252, + "learning_rate": 1.8847880874660934e-05, + "loss": 0.5378, + "step": 16276 + }, + { + "epoch": 2.65711603608016, + "grad_norm": 1.876278042793274, + "learning_rate": 1.8847732995247735e-05, + "loss": 0.6316, + "step": 16277 + }, + { + "epoch": 2.6572792947226644, + "grad_norm": 1.9347690343856812, + "learning_rate": 1.884758510692487e-05, + "loss": 0.662, + "step": 16278 + }, + { + "epoch": 2.657442553365169, + "grad_norm": 1.5389243364334106, + "learning_rate": 1.8847437209692486e-05, + "loss": 0.4492, + "step": 16279 + }, + { + "epoch": 2.6576058120076733, + "grad_norm": 1.623598575592041, + "learning_rate": 1.8847289303550738e-05, + "loss": 0.6392, + "step": 16280 + }, + { + "epoch": 2.6577690706501773, + "grad_norm": 1.7707563638687134, + "learning_rate": 1.8847141388499772e-05, + "loss": 0.479, + "step": 16281 + }, + { + "epoch": 2.6579323292926818, + "grad_norm": 1.6646294593811035, + "learning_rate": 1.8846993464539735e-05, + "loss": 0.5562, + "step": 16282 + }, + { + "epoch": 2.658095587935186, + "grad_norm": 1.5497221946716309, + "learning_rate": 1.884684553167078e-05, + "loss": 0.505, + "step": 16283 + }, + { + "epoch": 2.6582588465776906, + "grad_norm": 1.5549051761627197, + "learning_rate": 1.8846697589893052e-05, + "loss": 0.5114, + "step": 16284 + }, + { + "epoch": 2.658422105220195, + "grad_norm": 1.5753626823425293, + "learning_rate": 1.8846549639206702e-05, + "loss": 0.582, + "step": 16285 + }, + { + "epoch": 2.6585853638626995, + "grad_norm": 1.6876548528671265, + "learning_rate": 1.884640167961188e-05, + "loss": 0.6231, + "step": 16286 + }, + { + "epoch": 2.658748622505204, + "grad_norm": 1.7835813760757446, + "learning_rate": 1.8846253711108734e-05, + "loss": 0.6401, + "step": 16287 + }, + { + "epoch": 2.6589118811477084, + "grad_norm": 2.055309295654297, + "learning_rate": 1.8846105733697414e-05, + "loss": 0.6458, + "step": 16288 + }, + { + "epoch": 2.659075139790213, + "grad_norm": 1.672776222229004, + "learning_rate": 1.8845957747378065e-05, + "loss": 0.5901, + "step": 16289 + }, + { + "epoch": 2.6592383984327173, + "grad_norm": 1.7826297283172607, + "learning_rate": 1.884580975215084e-05, + "loss": 0.521, + "step": 16290 + }, + { + "epoch": 2.6594016570752217, + "grad_norm": 1.903480887413025, + "learning_rate": 1.8845661748015888e-05, + "loss": 0.6205, + "step": 16291 + }, + { + "epoch": 2.6595649157177257, + "grad_norm": 2.035736560821533, + "learning_rate": 1.8845513734973355e-05, + "loss": 0.6213, + "step": 16292 + }, + { + "epoch": 2.65972817436023, + "grad_norm": 1.8266675472259521, + "learning_rate": 1.8845365713023396e-05, + "loss": 0.5025, + "step": 16293 + }, + { + "epoch": 2.6598914330027346, + "grad_norm": 1.570272445678711, + "learning_rate": 1.8845217682166153e-05, + "loss": 0.5163, + "step": 16294 + }, + { + "epoch": 2.660054691645239, + "grad_norm": 1.885453224182129, + "learning_rate": 1.8845069642401777e-05, + "loss": 0.5661, + "step": 16295 + }, + { + "epoch": 2.6602179502877434, + "grad_norm": 1.5214983224868774, + "learning_rate": 1.8844921593730418e-05, + "loss": 0.479, + "step": 16296 + }, + { + "epoch": 2.660381208930248, + "grad_norm": 1.9252363443374634, + "learning_rate": 1.884477353615223e-05, + "loss": 0.6086, + "step": 16297 + }, + { + "epoch": 2.660544467572752, + "grad_norm": 2.2613422870635986, + "learning_rate": 1.8844625469667353e-05, + "loss": 0.6426, + "step": 16298 + }, + { + "epoch": 2.6607077262152563, + "grad_norm": 1.803234577178955, + "learning_rate": 1.884447739427594e-05, + "loss": 0.5315, + "step": 16299 + }, + { + "epoch": 2.6608709848577607, + "grad_norm": 1.7807650566101074, + "learning_rate": 1.8844329309978146e-05, + "loss": 0.6508, + "step": 16300 + }, + { + "epoch": 2.661034243500265, + "grad_norm": 1.4943678379058838, + "learning_rate": 1.884418121677411e-05, + "loss": 0.5016, + "step": 16301 + }, + { + "epoch": 2.6611975021427696, + "grad_norm": 1.9605904817581177, + "learning_rate": 1.8844033114663987e-05, + "loss": 0.6572, + "step": 16302 + }, + { + "epoch": 2.661360760785274, + "grad_norm": 1.72105872631073, + "learning_rate": 1.8843885003647923e-05, + "loss": 0.5905, + "step": 16303 + }, + { + "epoch": 2.6615240194277785, + "grad_norm": 1.8847185373306274, + "learning_rate": 1.8843736883726075e-05, + "loss": 0.6238, + "step": 16304 + }, + { + "epoch": 2.661687278070283, + "grad_norm": 1.846256971359253, + "learning_rate": 1.884358875489858e-05, + "loss": 0.5676, + "step": 16305 + }, + { + "epoch": 2.6618505367127874, + "grad_norm": 1.717538595199585, + "learning_rate": 1.8843440617165596e-05, + "loss": 0.5632, + "step": 16306 + }, + { + "epoch": 2.662013795355292, + "grad_norm": 1.641369104385376, + "learning_rate": 1.884329247052727e-05, + "loss": 0.5648, + "step": 16307 + }, + { + "epoch": 2.6621770539977962, + "grad_norm": 1.630169153213501, + "learning_rate": 1.8843144314983753e-05, + "loss": 0.6239, + "step": 16308 + }, + { + "epoch": 2.6623403126403002, + "grad_norm": 1.8606915473937988, + "learning_rate": 1.8842996150535188e-05, + "loss": 0.5993, + "step": 16309 + }, + { + "epoch": 2.6625035712828047, + "grad_norm": 1.7253917455673218, + "learning_rate": 1.8842847977181732e-05, + "loss": 0.5894, + "step": 16310 + }, + { + "epoch": 2.662666829925309, + "grad_norm": 1.6378947496414185, + "learning_rate": 1.8842699794923523e-05, + "loss": 0.5157, + "step": 16311 + }, + { + "epoch": 2.6628300885678136, + "grad_norm": 1.9302153587341309, + "learning_rate": 1.8842551603760725e-05, + "loss": 0.5701, + "step": 16312 + }, + { + "epoch": 2.662993347210318, + "grad_norm": 1.6023072004318237, + "learning_rate": 1.8842403403693476e-05, + "loss": 0.4996, + "step": 16313 + }, + { + "epoch": 2.6631566058528224, + "grad_norm": 1.376382827758789, + "learning_rate": 1.8842255194721932e-05, + "loss": 0.4813, + "step": 16314 + }, + { + "epoch": 2.663319864495327, + "grad_norm": 1.5550614595413208, + "learning_rate": 1.884210697684624e-05, + "loss": 0.5146, + "step": 16315 + }, + { + "epoch": 2.663483123137831, + "grad_norm": 1.8244661092758179, + "learning_rate": 1.8841958750066545e-05, + "loss": 0.6391, + "step": 16316 + }, + { + "epoch": 2.6636463817803353, + "grad_norm": 1.5459095239639282, + "learning_rate": 1.8841810514383004e-05, + "loss": 0.5313, + "step": 16317 + }, + { + "epoch": 2.6638096404228397, + "grad_norm": 1.9217784404754639, + "learning_rate": 1.8841662269795758e-05, + "loss": 0.6477, + "step": 16318 + }, + { + "epoch": 2.663972899065344, + "grad_norm": 2.1114747524261475, + "learning_rate": 1.884151401630496e-05, + "loss": 0.6372, + "step": 16319 + }, + { + "epoch": 2.6641361577078486, + "grad_norm": 1.9109761714935303, + "learning_rate": 1.8841365753910765e-05, + "loss": 0.5919, + "step": 16320 + }, + { + "epoch": 2.664299416350353, + "grad_norm": 1.7391923666000366, + "learning_rate": 1.8841217482613313e-05, + "loss": 0.6371, + "step": 16321 + }, + { + "epoch": 2.6644626749928575, + "grad_norm": 1.9180046319961548, + "learning_rate": 1.884106920241276e-05, + "loss": 0.5316, + "step": 16322 + }, + { + "epoch": 2.664625933635362, + "grad_norm": 1.6787445545196533, + "learning_rate": 1.884092091330925e-05, + "loss": 0.5097, + "step": 16323 + }, + { + "epoch": 2.6647891922778664, + "grad_norm": 1.766187310218811, + "learning_rate": 1.8840772615302935e-05, + "loss": 0.559, + "step": 16324 + }, + { + "epoch": 2.664952450920371, + "grad_norm": 2.1056883335113525, + "learning_rate": 1.8840624308393965e-05, + "loss": 0.5294, + "step": 16325 + }, + { + "epoch": 2.6651157095628752, + "grad_norm": 1.7649235725402832, + "learning_rate": 1.884047599258249e-05, + "loss": 0.5074, + "step": 16326 + }, + { + "epoch": 2.6652789682053792, + "grad_norm": 1.5243574380874634, + "learning_rate": 1.8840327667868657e-05, + "loss": 0.581, + "step": 16327 + }, + { + "epoch": 2.6654422268478837, + "grad_norm": 1.3903610706329346, + "learning_rate": 1.8840179334252617e-05, + "loss": 0.5232, + "step": 16328 + }, + { + "epoch": 2.665605485490388, + "grad_norm": 1.9203052520751953, + "learning_rate": 1.8840030991734518e-05, + "loss": 0.6296, + "step": 16329 + }, + { + "epoch": 2.6657687441328926, + "grad_norm": 1.6550929546356201, + "learning_rate": 1.8839882640314512e-05, + "loss": 0.5643, + "step": 16330 + }, + { + "epoch": 2.665932002775397, + "grad_norm": 1.4799842834472656, + "learning_rate": 1.883973427999274e-05, + "loss": 0.4881, + "step": 16331 + }, + { + "epoch": 2.6660952614179014, + "grad_norm": 1.6732831001281738, + "learning_rate": 1.8839585910769365e-05, + "loss": 0.5598, + "step": 16332 + }, + { + "epoch": 2.6662585200604054, + "grad_norm": 1.7194199562072754, + "learning_rate": 1.883943753264453e-05, + "loss": 0.4957, + "step": 16333 + }, + { + "epoch": 2.66642177870291, + "grad_norm": 1.6308283805847168, + "learning_rate": 1.8839289145618378e-05, + "loss": 0.545, + "step": 16334 + }, + { + "epoch": 2.6665850373454143, + "grad_norm": 1.8110735416412354, + "learning_rate": 1.8839140749691064e-05, + "loss": 0.5534, + "step": 16335 + }, + { + "epoch": 2.6667482959879187, + "grad_norm": 1.539918303489685, + "learning_rate": 1.8838992344862744e-05, + "loss": 0.5283, + "step": 16336 + }, + { + "epoch": 2.666911554630423, + "grad_norm": 1.7568938732147217, + "learning_rate": 1.8838843931133555e-05, + "loss": 0.576, + "step": 16337 + }, + { + "epoch": 2.6670748132729276, + "grad_norm": 1.9397773742675781, + "learning_rate": 1.8838695508503656e-05, + "loss": 0.4885, + "step": 16338 + }, + { + "epoch": 2.667238071915432, + "grad_norm": 1.9094289541244507, + "learning_rate": 1.8838547076973192e-05, + "loss": 0.7238, + "step": 16339 + }, + { + "epoch": 2.6674013305579365, + "grad_norm": 1.950736165046692, + "learning_rate": 1.8838398636542316e-05, + "loss": 0.5922, + "step": 16340 + }, + { + "epoch": 2.667564589200441, + "grad_norm": 1.8103185892105103, + "learning_rate": 1.883825018721117e-05, + "loss": 0.5568, + "step": 16341 + }, + { + "epoch": 2.6677278478429454, + "grad_norm": 1.8735820055007935, + "learning_rate": 1.8838101728979913e-05, + "loss": 0.6055, + "step": 16342 + }, + { + "epoch": 2.66789110648545, + "grad_norm": 1.6604368686676025, + "learning_rate": 1.883795326184869e-05, + "loss": 0.5426, + "step": 16343 + }, + { + "epoch": 2.668054365127954, + "grad_norm": 1.6365835666656494, + "learning_rate": 1.8837804785817644e-05, + "loss": 0.5054, + "step": 16344 + }, + { + "epoch": 2.6682176237704582, + "grad_norm": 1.7236286401748657, + "learning_rate": 1.8837656300886937e-05, + "loss": 0.5484, + "step": 16345 + }, + { + "epoch": 2.6683808824129627, + "grad_norm": 1.8289629220962524, + "learning_rate": 1.883750780705671e-05, + "loss": 0.4917, + "step": 16346 + }, + { + "epoch": 2.668544141055467, + "grad_norm": 1.9440494775772095, + "learning_rate": 1.8837359304327115e-05, + "loss": 0.6301, + "step": 16347 + }, + { + "epoch": 2.6687073996979715, + "grad_norm": 1.7163053750991821, + "learning_rate": 1.8837210792698305e-05, + "loss": 0.5432, + "step": 16348 + }, + { + "epoch": 2.668870658340476, + "grad_norm": 2.062157154083252, + "learning_rate": 1.8837062272170418e-05, + "loss": 0.6044, + "step": 16349 + }, + { + "epoch": 2.6690339169829804, + "grad_norm": 1.9886558055877686, + "learning_rate": 1.883691374274362e-05, + "loss": 0.5685, + "step": 16350 + }, + { + "epoch": 2.6691971756254844, + "grad_norm": 1.8802244663238525, + "learning_rate": 1.883676520441805e-05, + "loss": 0.5886, + "step": 16351 + }, + { + "epoch": 2.669360434267989, + "grad_norm": 1.789820909500122, + "learning_rate": 1.8836616657193855e-05, + "loss": 0.6346, + "step": 16352 + }, + { + "epoch": 2.6695236929104933, + "grad_norm": 1.5130960941314697, + "learning_rate": 1.8836468101071194e-05, + "loss": 0.5325, + "step": 16353 + }, + { + "epoch": 2.6696869515529977, + "grad_norm": 1.7319607734680176, + "learning_rate": 1.883631953605021e-05, + "loss": 0.4436, + "step": 16354 + }, + { + "epoch": 2.669850210195502, + "grad_norm": 1.9163321256637573, + "learning_rate": 1.8836170962131056e-05, + "loss": 0.5967, + "step": 16355 + }, + { + "epoch": 2.6700134688380066, + "grad_norm": 2.0167734622955322, + "learning_rate": 1.8836022379313884e-05, + "loss": 0.6176, + "step": 16356 + }, + { + "epoch": 2.670176727480511, + "grad_norm": 1.5765957832336426, + "learning_rate": 1.8835873787598834e-05, + "loss": 0.5518, + "step": 16357 + }, + { + "epoch": 2.6703399861230155, + "grad_norm": 1.7647868394851685, + "learning_rate": 1.8835725186986062e-05, + "loss": 0.5673, + "step": 16358 + }, + { + "epoch": 2.67050324476552, + "grad_norm": 1.7314445972442627, + "learning_rate": 1.8835576577475717e-05, + "loss": 0.539, + "step": 16359 + }, + { + "epoch": 2.6706665034080244, + "grad_norm": 1.651344656944275, + "learning_rate": 1.8835427959067952e-05, + "loss": 0.6652, + "step": 16360 + }, + { + "epoch": 2.670829762050529, + "grad_norm": 2.3495843410491943, + "learning_rate": 1.883527933176291e-05, + "loss": 0.7014, + "step": 16361 + }, + { + "epoch": 2.670993020693033, + "grad_norm": 1.8624143600463867, + "learning_rate": 1.8835130695560746e-05, + "loss": 0.5578, + "step": 16362 + }, + { + "epoch": 2.6711562793355372, + "grad_norm": 1.9127405881881714, + "learning_rate": 1.8834982050461608e-05, + "loss": 0.4842, + "step": 16363 + }, + { + "epoch": 2.6713195379780417, + "grad_norm": 1.7331244945526123, + "learning_rate": 1.8834833396465645e-05, + "loss": 0.5711, + "step": 16364 + }, + { + "epoch": 2.671482796620546, + "grad_norm": 1.5312727689743042, + "learning_rate": 1.8834684733573007e-05, + "loss": 0.5238, + "step": 16365 + }, + { + "epoch": 2.6716460552630505, + "grad_norm": 1.526473045349121, + "learning_rate": 1.8834536061783843e-05, + "loss": 0.5668, + "step": 16366 + }, + { + "epoch": 2.671809313905555, + "grad_norm": 2.219524621963501, + "learning_rate": 1.8834387381098302e-05, + "loss": 0.6601, + "step": 16367 + }, + { + "epoch": 2.6719725725480594, + "grad_norm": 2.108445167541504, + "learning_rate": 1.8834238691516537e-05, + "loss": 0.6909, + "step": 16368 + }, + { + "epoch": 2.6721358311905634, + "grad_norm": 1.6152878999710083, + "learning_rate": 1.8834089993038696e-05, + "loss": 0.5854, + "step": 16369 + }, + { + "epoch": 2.672299089833068, + "grad_norm": 1.5773333311080933, + "learning_rate": 1.883394128566493e-05, + "loss": 0.5028, + "step": 16370 + }, + { + "epoch": 2.6724623484755723, + "grad_norm": 2.3028788566589355, + "learning_rate": 1.8833792569395385e-05, + "loss": 0.5882, + "step": 16371 + }, + { + "epoch": 2.6726256071180767, + "grad_norm": 1.9720379114151, + "learning_rate": 1.8833643844230217e-05, + "loss": 0.6176, + "step": 16372 + }, + { + "epoch": 2.672788865760581, + "grad_norm": 1.9768034219741821, + "learning_rate": 1.883349511016957e-05, + "loss": 0.6645, + "step": 16373 + }, + { + "epoch": 2.6729521244030856, + "grad_norm": 1.636730432510376, + "learning_rate": 1.8833346367213595e-05, + "loss": 0.6023, + "step": 16374 + }, + { + "epoch": 2.67311538304559, + "grad_norm": 1.9114065170288086, + "learning_rate": 1.8833197615362443e-05, + "loss": 0.5746, + "step": 16375 + }, + { + "epoch": 2.6732786416880945, + "grad_norm": 1.5204781293869019, + "learning_rate": 1.8833048854616263e-05, + "loss": 0.5572, + "step": 16376 + }, + { + "epoch": 2.673441900330599, + "grad_norm": 1.6491953134536743, + "learning_rate": 1.8832900084975203e-05, + "loss": 0.5639, + "step": 16377 + }, + { + "epoch": 2.6736051589731034, + "grad_norm": 2.0114426612854004, + "learning_rate": 1.883275130643942e-05, + "loss": 0.7387, + "step": 16378 + }, + { + "epoch": 2.673768417615608, + "grad_norm": 1.8034809827804565, + "learning_rate": 1.8832602519009056e-05, + "loss": 0.5551, + "step": 16379 + }, + { + "epoch": 2.673931676258112, + "grad_norm": 1.7541903257369995, + "learning_rate": 1.8832453722684263e-05, + "loss": 0.551, + "step": 16380 + }, + { + "epoch": 2.6740949349006162, + "grad_norm": 1.9957820177078247, + "learning_rate": 1.883230491746519e-05, + "loss": 0.6291, + "step": 16381 + }, + { + "epoch": 2.6742581935431207, + "grad_norm": 1.6463096141815186, + "learning_rate": 1.8832156103351994e-05, + "loss": 0.5892, + "step": 16382 + }, + { + "epoch": 2.674421452185625, + "grad_norm": 2.114961862564087, + "learning_rate": 1.8832007280344813e-05, + "loss": 0.6794, + "step": 16383 + }, + { + "epoch": 2.6745847108281295, + "grad_norm": 1.7838817834854126, + "learning_rate": 1.8831858448443806e-05, + "loss": 0.6838, + "step": 16384 + }, + { + "epoch": 2.674747969470634, + "grad_norm": 1.6413896083831787, + "learning_rate": 1.883170960764912e-05, + "loss": 0.5618, + "step": 16385 + }, + { + "epoch": 2.674911228113138, + "grad_norm": 2.0213186740875244, + "learning_rate": 1.8831560757960906e-05, + "loss": 0.6263, + "step": 16386 + }, + { + "epoch": 2.6750744867556424, + "grad_norm": 1.7649832963943481, + "learning_rate": 1.883141189937931e-05, + "loss": 0.6743, + "step": 16387 + }, + { + "epoch": 2.675237745398147, + "grad_norm": 1.4128681421279907, + "learning_rate": 1.8831263031904485e-05, + "loss": 0.5038, + "step": 16388 + }, + { + "epoch": 2.6754010040406513, + "grad_norm": 1.7522263526916504, + "learning_rate": 1.883111415553658e-05, + "loss": 0.5897, + "step": 16389 + }, + { + "epoch": 2.6755642626831557, + "grad_norm": 1.7551922798156738, + "learning_rate": 1.8830965270275746e-05, + "loss": 0.6048, + "step": 16390 + }, + { + "epoch": 2.67572752132566, + "grad_norm": 2.0264551639556885, + "learning_rate": 1.8830816376122134e-05, + "loss": 0.5616, + "step": 16391 + }, + { + "epoch": 2.6758907799681646, + "grad_norm": 1.5114728212356567, + "learning_rate": 1.8830667473075892e-05, + "loss": 0.4735, + "step": 16392 + }, + { + "epoch": 2.676054038610669, + "grad_norm": 1.6736599206924438, + "learning_rate": 1.883051856113717e-05, + "loss": 0.5702, + "step": 16393 + }, + { + "epoch": 2.6762172972531735, + "grad_norm": 2.0058517456054688, + "learning_rate": 1.8830369640306117e-05, + "loss": 0.6281, + "step": 16394 + }, + { + "epoch": 2.676380555895678, + "grad_norm": 1.7630877494812012, + "learning_rate": 1.883022071058288e-05, + "loss": 0.6108, + "step": 16395 + }, + { + "epoch": 2.6765438145381824, + "grad_norm": 1.886971116065979, + "learning_rate": 1.883007177196762e-05, + "loss": 0.5812, + "step": 16396 + }, + { + "epoch": 2.6767070731806863, + "grad_norm": 1.9686473608016968, + "learning_rate": 1.882992282446048e-05, + "loss": 0.6015, + "step": 16397 + }, + { + "epoch": 2.676870331823191, + "grad_norm": 1.6692005395889282, + "learning_rate": 1.8829773868061604e-05, + "loss": 0.5463, + "step": 16398 + }, + { + "epoch": 2.6770335904656952, + "grad_norm": 1.9452427625656128, + "learning_rate": 1.8829624902771153e-05, + "loss": 0.7179, + "step": 16399 + }, + { + "epoch": 2.6771968491081997, + "grad_norm": 1.4966535568237305, + "learning_rate": 1.8829475928589272e-05, + "loss": 0.4509, + "step": 16400 + }, + { + "epoch": 2.677360107750704, + "grad_norm": 1.7980726957321167, + "learning_rate": 1.882932694551611e-05, + "loss": 0.6445, + "step": 16401 + }, + { + "epoch": 2.6775233663932085, + "grad_norm": 1.9372528791427612, + "learning_rate": 1.882917795355182e-05, + "loss": 0.6071, + "step": 16402 + }, + { + "epoch": 2.677686625035713, + "grad_norm": 1.7761136293411255, + "learning_rate": 1.8829028952696545e-05, + "loss": 0.5456, + "step": 16403 + }, + { + "epoch": 2.677849883678217, + "grad_norm": 2.028926372528076, + "learning_rate": 1.8828879942950444e-05, + "loss": 0.5935, + "step": 16404 + }, + { + "epoch": 2.6780131423207214, + "grad_norm": 1.7885468006134033, + "learning_rate": 1.8828730924313662e-05, + "loss": 0.5598, + "step": 16405 + }, + { + "epoch": 2.678176400963226, + "grad_norm": 1.5184855461120605, + "learning_rate": 1.8828581896786347e-05, + "loss": 0.5006, + "step": 16406 + }, + { + "epoch": 2.6783396596057303, + "grad_norm": 1.7889596223831177, + "learning_rate": 1.882843286036866e-05, + "loss": 0.5949, + "step": 16407 + }, + { + "epoch": 2.6785029182482347, + "grad_norm": 1.5437264442443848, + "learning_rate": 1.882828381506074e-05, + "loss": 0.5261, + "step": 16408 + }, + { + "epoch": 2.678666176890739, + "grad_norm": 1.5310728549957275, + "learning_rate": 1.882813476086274e-05, + "loss": 0.553, + "step": 16409 + }, + { + "epoch": 2.6788294355332436, + "grad_norm": 1.9613131284713745, + "learning_rate": 1.882798569777481e-05, + "loss": 0.5911, + "step": 16410 + }, + { + "epoch": 2.678992694175748, + "grad_norm": 1.9702357053756714, + "learning_rate": 1.88278366257971e-05, + "loss": 0.4882, + "step": 16411 + }, + { + "epoch": 2.6791559528182525, + "grad_norm": 1.707716941833496, + "learning_rate": 1.8827687544929763e-05, + "loss": 0.5353, + "step": 16412 + }, + { + "epoch": 2.679319211460757, + "grad_norm": 2.0635123252868652, + "learning_rate": 1.8827538455172947e-05, + "loss": 0.7019, + "step": 16413 + }, + { + "epoch": 2.6794824701032613, + "grad_norm": 1.6630780696868896, + "learning_rate": 1.88273893565268e-05, + "loss": 0.5177, + "step": 16414 + }, + { + "epoch": 2.6796457287457653, + "grad_norm": 1.9748984575271606, + "learning_rate": 1.882724024899148e-05, + "loss": 0.5502, + "step": 16415 + }, + { + "epoch": 2.67980898738827, + "grad_norm": 1.7943633794784546, + "learning_rate": 1.8827091132567124e-05, + "loss": 0.6065, + "step": 16416 + }, + { + "epoch": 2.679972246030774, + "grad_norm": 2.2598495483398438, + "learning_rate": 1.8826942007253894e-05, + "loss": 0.7273, + "step": 16417 + }, + { + "epoch": 2.6801355046732787, + "grad_norm": 1.8314197063446045, + "learning_rate": 1.8826792873051935e-05, + "loss": 0.597, + "step": 16418 + }, + { + "epoch": 2.680298763315783, + "grad_norm": 1.4663925170898438, + "learning_rate": 1.8826643729961394e-05, + "loss": 0.4544, + "step": 16419 + }, + { + "epoch": 2.6804620219582875, + "grad_norm": 2.2930233478546143, + "learning_rate": 1.8826494577982432e-05, + "loss": 0.6963, + "step": 16420 + }, + { + "epoch": 2.680625280600792, + "grad_norm": 1.8547337055206299, + "learning_rate": 1.8826345417115188e-05, + "loss": 0.6509, + "step": 16421 + }, + { + "epoch": 2.680788539243296, + "grad_norm": 1.3991401195526123, + "learning_rate": 1.882619624735982e-05, + "loss": 0.4293, + "step": 16422 + }, + { + "epoch": 2.6809517978858004, + "grad_norm": 1.5716923475265503, + "learning_rate": 1.882604706871647e-05, + "loss": 0.5636, + "step": 16423 + }, + { + "epoch": 2.681115056528305, + "grad_norm": 1.9680172204971313, + "learning_rate": 1.8825897881185296e-05, + "loss": 0.5836, + "step": 16424 + }, + { + "epoch": 2.6812783151708093, + "grad_norm": 1.5834296941757202, + "learning_rate": 1.8825748684766442e-05, + "loss": 0.4677, + "step": 16425 + }, + { + "epoch": 2.6814415738133137, + "grad_norm": 1.818985104560852, + "learning_rate": 1.8825599479460064e-05, + "loss": 0.6159, + "step": 16426 + }, + { + "epoch": 2.681604832455818, + "grad_norm": 1.916646122932434, + "learning_rate": 1.882545026526631e-05, + "loss": 0.5339, + "step": 16427 + }, + { + "epoch": 2.6817680910983226, + "grad_norm": 1.809349536895752, + "learning_rate": 1.8825301042185328e-05, + "loss": 0.5584, + "step": 16428 + }, + { + "epoch": 2.681931349740827, + "grad_norm": 1.521785020828247, + "learning_rate": 1.8825151810217273e-05, + "loss": 0.5263, + "step": 16429 + }, + { + "epoch": 2.6820946083833315, + "grad_norm": 1.7768723964691162, + "learning_rate": 1.882500256936229e-05, + "loss": 0.6229, + "step": 16430 + }, + { + "epoch": 2.682257867025836, + "grad_norm": 1.848021149635315, + "learning_rate": 1.8824853319620532e-05, + "loss": 0.5617, + "step": 16431 + }, + { + "epoch": 2.6824211256683403, + "grad_norm": 1.4697195291519165, + "learning_rate": 1.882470406099215e-05, + "loss": 0.4573, + "step": 16432 + }, + { + "epoch": 2.6825843843108443, + "grad_norm": 1.5814824104309082, + "learning_rate": 1.8824554793477294e-05, + "loss": 0.6007, + "step": 16433 + }, + { + "epoch": 2.6827476429533488, + "grad_norm": 1.5659681558609009, + "learning_rate": 1.882440551707611e-05, + "loss": 0.4893, + "step": 16434 + }, + { + "epoch": 2.682910901595853, + "grad_norm": 1.6011825799942017, + "learning_rate": 1.8824256231788755e-05, + "loss": 0.5045, + "step": 16435 + }, + { + "epoch": 2.6830741602383577, + "grad_norm": 1.7143847942352295, + "learning_rate": 1.8824106937615377e-05, + "loss": 0.504, + "step": 16436 + }, + { + "epoch": 2.683237418880862, + "grad_norm": 1.6532143354415894, + "learning_rate": 1.8823957634556125e-05, + "loss": 0.4922, + "step": 16437 + }, + { + "epoch": 2.6834006775233665, + "grad_norm": 1.6793723106384277, + "learning_rate": 1.882380832261115e-05, + "loss": 0.5157, + "step": 16438 + }, + { + "epoch": 2.6835639361658705, + "grad_norm": 1.5882152318954468, + "learning_rate": 1.88236590017806e-05, + "loss": 0.5313, + "step": 16439 + }, + { + "epoch": 2.683727194808375, + "grad_norm": 2.365739583969116, + "learning_rate": 1.882350967206463e-05, + "loss": 0.652, + "step": 16440 + }, + { + "epoch": 2.6838904534508794, + "grad_norm": 1.7916666269302368, + "learning_rate": 1.8823360333463387e-05, + "loss": 0.6072, + "step": 16441 + }, + { + "epoch": 2.684053712093384, + "grad_norm": 1.6151963472366333, + "learning_rate": 1.8823210985977024e-05, + "loss": 0.5249, + "step": 16442 + }, + { + "epoch": 2.6842169707358883, + "grad_norm": 1.4359116554260254, + "learning_rate": 1.882306162960569e-05, + "loss": 0.4668, + "step": 16443 + }, + { + "epoch": 2.6843802293783927, + "grad_norm": 1.6685482263565063, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.6116, + "step": 16444 + }, + { + "epoch": 2.684543488020897, + "grad_norm": 1.4240553379058838, + "learning_rate": 1.8822762890208712e-05, + "loss": 0.5196, + "step": 16445 + }, + { + "epoch": 2.6847067466634016, + "grad_norm": 2.044994354248047, + "learning_rate": 1.8822613507183363e-05, + "loss": 0.6244, + "step": 16446 + }, + { + "epoch": 2.684870005305906, + "grad_norm": 1.5365080833435059, + "learning_rate": 1.8822464115273645e-05, + "loss": 0.5001, + "step": 16447 + }, + { + "epoch": 2.6850332639484105, + "grad_norm": 1.5827518701553345, + "learning_rate": 1.8822314714479714e-05, + "loss": 0.4836, + "step": 16448 + }, + { + "epoch": 2.685196522590915, + "grad_norm": 1.6679178476333618, + "learning_rate": 1.882216530480171e-05, + "loss": 0.5897, + "step": 16449 + }, + { + "epoch": 2.685359781233419, + "grad_norm": 1.7004810571670532, + "learning_rate": 1.882201588623979e-05, + "loss": 0.6024, + "step": 16450 + }, + { + "epoch": 2.6855230398759233, + "grad_norm": 1.6526628732681274, + "learning_rate": 1.88218664587941e-05, + "loss": 0.5092, + "step": 16451 + }, + { + "epoch": 2.6856862985184278, + "grad_norm": 1.6409322023391724, + "learning_rate": 1.8821717022464794e-05, + "loss": 0.5299, + "step": 16452 + }, + { + "epoch": 2.685849557160932, + "grad_norm": 1.5565569400787354, + "learning_rate": 1.882156757725202e-05, + "loss": 0.5748, + "step": 16453 + }, + { + "epoch": 2.6860128158034366, + "grad_norm": 1.7933627367019653, + "learning_rate": 1.8821418123155936e-05, + "loss": 0.5654, + "step": 16454 + }, + { + "epoch": 2.686176074445941, + "grad_norm": 1.9940145015716553, + "learning_rate": 1.882126866017668e-05, + "loss": 0.5634, + "step": 16455 + }, + { + "epoch": 2.6863393330884455, + "grad_norm": 1.8357791900634766, + "learning_rate": 1.8821119188314408e-05, + "loss": 0.617, + "step": 16456 + }, + { + "epoch": 2.6865025917309495, + "grad_norm": 1.6583068370819092, + "learning_rate": 1.8820969707569278e-05, + "loss": 0.5335, + "step": 16457 + }, + { + "epoch": 2.686665850373454, + "grad_norm": 2.2446439266204834, + "learning_rate": 1.8820820217941427e-05, + "loss": 0.6181, + "step": 16458 + }, + { + "epoch": 2.6868291090159584, + "grad_norm": 1.8129420280456543, + "learning_rate": 1.8820670719431017e-05, + "loss": 0.6143, + "step": 16459 + }, + { + "epoch": 2.686992367658463, + "grad_norm": 1.750885009765625, + "learning_rate": 1.882052121203819e-05, + "loss": 0.6359, + "step": 16460 + }, + { + "epoch": 2.6871556263009673, + "grad_norm": 1.7610363960266113, + "learning_rate": 1.8820371695763103e-05, + "loss": 0.5386, + "step": 16461 + }, + { + "epoch": 2.6873188849434717, + "grad_norm": 1.8827223777770996, + "learning_rate": 1.8820222170605903e-05, + "loss": 0.6642, + "step": 16462 + }, + { + "epoch": 2.687482143585976, + "grad_norm": 2.0353565216064453, + "learning_rate": 1.882007263656674e-05, + "loss": 0.6577, + "step": 16463 + }, + { + "epoch": 2.6876454022284806, + "grad_norm": 2.02829909324646, + "learning_rate": 1.8819923093645773e-05, + "loss": 0.6335, + "step": 16464 + }, + { + "epoch": 2.687808660870985, + "grad_norm": 1.4245282411575317, + "learning_rate": 1.8819773541843136e-05, + "loss": 0.4546, + "step": 16465 + }, + { + "epoch": 2.6879719195134895, + "grad_norm": 1.769622564315796, + "learning_rate": 1.8819623981158996e-05, + "loss": 0.5955, + "step": 16466 + }, + { + "epoch": 2.688135178155994, + "grad_norm": 1.7487099170684814, + "learning_rate": 1.8819474411593496e-05, + "loss": 0.5685, + "step": 16467 + }, + { + "epoch": 2.688298436798498, + "grad_norm": 1.7942384481430054, + "learning_rate": 1.8819324833146788e-05, + "loss": 0.6007, + "step": 16468 + }, + { + "epoch": 2.6884616954410023, + "grad_norm": 1.7670613527297974, + "learning_rate": 1.881917524581902e-05, + "loss": 0.6037, + "step": 16469 + }, + { + "epoch": 2.6886249540835068, + "grad_norm": 1.563623070716858, + "learning_rate": 1.8819025649610346e-05, + "loss": 0.5186, + "step": 16470 + }, + { + "epoch": 2.688788212726011, + "grad_norm": 1.5182452201843262, + "learning_rate": 1.8818876044520914e-05, + "loss": 0.5143, + "step": 16471 + }, + { + "epoch": 2.6889514713685156, + "grad_norm": 2.2427406311035156, + "learning_rate": 1.881872643055088e-05, + "loss": 0.6879, + "step": 16472 + }, + { + "epoch": 2.68911473001102, + "grad_norm": 1.8487319946289062, + "learning_rate": 1.8818576807700387e-05, + "loss": 0.5463, + "step": 16473 + }, + { + "epoch": 2.689277988653524, + "grad_norm": 1.7995370626449585, + "learning_rate": 1.881842717596959e-05, + "loss": 0.5885, + "step": 16474 + }, + { + "epoch": 2.6894412472960285, + "grad_norm": 1.601436972618103, + "learning_rate": 1.881827753535864e-05, + "loss": 0.5312, + "step": 16475 + }, + { + "epoch": 2.689604505938533, + "grad_norm": 1.5976933240890503, + "learning_rate": 1.881812788586769e-05, + "loss": 0.4825, + "step": 16476 + }, + { + "epoch": 2.6897677645810374, + "grad_norm": 1.3869500160217285, + "learning_rate": 1.8817978227496883e-05, + "loss": 0.4844, + "step": 16477 + }, + { + "epoch": 2.689931023223542, + "grad_norm": 1.9042530059814453, + "learning_rate": 1.8817828560246376e-05, + "loss": 0.5218, + "step": 16478 + }, + { + "epoch": 2.6900942818660463, + "grad_norm": 1.628491997718811, + "learning_rate": 1.8817678884116318e-05, + "loss": 0.5454, + "step": 16479 + }, + { + "epoch": 2.6902575405085507, + "grad_norm": 1.6562044620513916, + "learning_rate": 1.8817529199106858e-05, + "loss": 0.5306, + "step": 16480 + }, + { + "epoch": 2.690420799151055, + "grad_norm": 1.854171633720398, + "learning_rate": 1.881737950521815e-05, + "loss": 0.66, + "step": 16481 + }, + { + "epoch": 2.6905840577935596, + "grad_norm": 1.4614008665084839, + "learning_rate": 1.8817229802450347e-05, + "loss": 0.4772, + "step": 16482 + }, + { + "epoch": 2.690747316436064, + "grad_norm": 1.879970669746399, + "learning_rate": 1.881708009080359e-05, + "loss": 0.5882, + "step": 16483 + }, + { + "epoch": 2.6909105750785685, + "grad_norm": 1.737597107887268, + "learning_rate": 1.881693037027804e-05, + "loss": 0.5556, + "step": 16484 + }, + { + "epoch": 2.6910738337210725, + "grad_norm": 2.020453453063965, + "learning_rate": 1.881678064087384e-05, + "loss": 0.6428, + "step": 16485 + }, + { + "epoch": 2.691237092363577, + "grad_norm": 2.0557048320770264, + "learning_rate": 1.8816630902591143e-05, + "loss": 0.6984, + "step": 16486 + }, + { + "epoch": 2.6914003510060813, + "grad_norm": 1.9158618450164795, + "learning_rate": 1.8816481155430105e-05, + "loss": 0.5671, + "step": 16487 + }, + { + "epoch": 2.6915636096485858, + "grad_norm": 1.7840900421142578, + "learning_rate": 1.881633139939087e-05, + "loss": 0.5578, + "step": 16488 + }, + { + "epoch": 2.69172686829109, + "grad_norm": 1.990853190422058, + "learning_rate": 1.8816181634473593e-05, + "loss": 0.5474, + "step": 16489 + }, + { + "epoch": 2.6918901269335946, + "grad_norm": 1.889896273612976, + "learning_rate": 1.8816031860678423e-05, + "loss": 0.578, + "step": 16490 + }, + { + "epoch": 2.692053385576099, + "grad_norm": 1.517315149307251, + "learning_rate": 1.8815882078005515e-05, + "loss": 0.4624, + "step": 16491 + }, + { + "epoch": 2.692216644218603, + "grad_norm": 1.7731789350509644, + "learning_rate": 1.881573228645501e-05, + "loss": 0.5488, + "step": 16492 + }, + { + "epoch": 2.6923799028611075, + "grad_norm": 1.7199431657791138, + "learning_rate": 1.881558248602707e-05, + "loss": 0.5441, + "step": 16493 + }, + { + "epoch": 2.692543161503612, + "grad_norm": 1.9129719734191895, + "learning_rate": 1.8815432676721835e-05, + "loss": 0.658, + "step": 16494 + }, + { + "epoch": 2.6927064201461164, + "grad_norm": 1.8507459163665771, + "learning_rate": 1.8815282858539466e-05, + "loss": 0.6596, + "step": 16495 + }, + { + "epoch": 2.692869678788621, + "grad_norm": 1.8156758546829224, + "learning_rate": 1.8815133031480107e-05, + "loss": 0.5235, + "step": 16496 + }, + { + "epoch": 2.6930329374311253, + "grad_norm": 1.791508674621582, + "learning_rate": 1.8814983195543918e-05, + "loss": 0.6647, + "step": 16497 + }, + { + "epoch": 2.6931961960736297, + "grad_norm": 2.2884979248046875, + "learning_rate": 1.8814833350731036e-05, + "loss": 0.5802, + "step": 16498 + }, + { + "epoch": 2.693359454716134, + "grad_norm": 2.0803933143615723, + "learning_rate": 1.8814683497041622e-05, + "loss": 0.5026, + "step": 16499 + }, + { + "epoch": 2.6935227133586386, + "grad_norm": 1.817204236984253, + "learning_rate": 1.881453363447582e-05, + "loss": 0.6821, + "step": 16500 + }, + { + "epoch": 2.693685972001143, + "grad_norm": 1.7782094478607178, + "learning_rate": 1.881438376303379e-05, + "loss": 0.4712, + "step": 16501 + }, + { + "epoch": 2.6938492306436475, + "grad_norm": 1.5835224390029907, + "learning_rate": 1.8814233882715678e-05, + "loss": 0.5595, + "step": 16502 + }, + { + "epoch": 2.6940124892861514, + "grad_norm": 1.5017518997192383, + "learning_rate": 1.881408399352163e-05, + "loss": 0.5296, + "step": 16503 + }, + { + "epoch": 2.694175747928656, + "grad_norm": 1.5261878967285156, + "learning_rate": 1.8813934095451807e-05, + "loss": 0.5171, + "step": 16504 + }, + { + "epoch": 2.6943390065711603, + "grad_norm": 1.526418685913086, + "learning_rate": 1.881378418850635e-05, + "loss": 0.4374, + "step": 16505 + }, + { + "epoch": 2.6945022652136648, + "grad_norm": 1.4413286447525024, + "learning_rate": 1.881363427268542e-05, + "loss": 0.4282, + "step": 16506 + }, + { + "epoch": 2.694665523856169, + "grad_norm": 1.694679617881775, + "learning_rate": 1.881348434798916e-05, + "loss": 0.5875, + "step": 16507 + }, + { + "epoch": 2.6948287824986736, + "grad_norm": 1.9033890962600708, + "learning_rate": 1.881333441441772e-05, + "loss": 0.6268, + "step": 16508 + }, + { + "epoch": 2.694992041141178, + "grad_norm": 1.3697019815444946, + "learning_rate": 1.881318447197126e-05, + "loss": 0.4468, + "step": 16509 + }, + { + "epoch": 2.695155299783682, + "grad_norm": 1.806424617767334, + "learning_rate": 1.8813034520649923e-05, + "loss": 0.6355, + "step": 16510 + }, + { + "epoch": 2.6953185584261865, + "grad_norm": 1.8098347187042236, + "learning_rate": 1.8812884560453865e-05, + "loss": 0.5381, + "step": 16511 + }, + { + "epoch": 2.695481817068691, + "grad_norm": 1.7840319871902466, + "learning_rate": 1.8812734591383232e-05, + "loss": 0.5588, + "step": 16512 + }, + { + "epoch": 2.6956450757111954, + "grad_norm": 1.5228395462036133, + "learning_rate": 1.8812584613438177e-05, + "loss": 0.4827, + "step": 16513 + }, + { + "epoch": 2.6958083343537, + "grad_norm": 1.6747251749038696, + "learning_rate": 1.8812434626618853e-05, + "loss": 0.5112, + "step": 16514 + }, + { + "epoch": 2.6959715929962043, + "grad_norm": 1.563176155090332, + "learning_rate": 1.881228463092541e-05, + "loss": 0.4793, + "step": 16515 + }, + { + "epoch": 2.6961348516387087, + "grad_norm": 1.8001735210418701, + "learning_rate": 1.8812134626358e-05, + "loss": 0.6773, + "step": 16516 + }, + { + "epoch": 2.696298110281213, + "grad_norm": 2.195585012435913, + "learning_rate": 1.881198461291677e-05, + "loss": 0.6791, + "step": 16517 + }, + { + "epoch": 2.6964613689237176, + "grad_norm": 1.9948112964630127, + "learning_rate": 1.8811834590601872e-05, + "loss": 0.5377, + "step": 16518 + }, + { + "epoch": 2.696624627566222, + "grad_norm": 1.5578739643096924, + "learning_rate": 1.8811684559413465e-05, + "loss": 0.4841, + "step": 16519 + }, + { + "epoch": 2.6967878862087264, + "grad_norm": 1.7769899368286133, + "learning_rate": 1.881153451935169e-05, + "loss": 0.6268, + "step": 16520 + }, + { + "epoch": 2.6969511448512304, + "grad_norm": 1.8941421508789062, + "learning_rate": 1.8811384470416705e-05, + "loss": 0.6025, + "step": 16521 + }, + { + "epoch": 2.697114403493735, + "grad_norm": 1.4604666233062744, + "learning_rate": 1.8811234412608654e-05, + "loss": 0.482, + "step": 16522 + }, + { + "epoch": 2.6972776621362393, + "grad_norm": 1.617811679840088, + "learning_rate": 1.8811084345927696e-05, + "loss": 0.4978, + "step": 16523 + }, + { + "epoch": 2.6974409207787438, + "grad_norm": 1.727844476699829, + "learning_rate": 1.8810934270373977e-05, + "loss": 0.5606, + "step": 16524 + }, + { + "epoch": 2.697604179421248, + "grad_norm": 1.5739357471466064, + "learning_rate": 1.8810784185947648e-05, + "loss": 0.4965, + "step": 16525 + }, + { + "epoch": 2.6977674380637526, + "grad_norm": 1.7645779848098755, + "learning_rate": 1.8810634092648862e-05, + "loss": 0.532, + "step": 16526 + }, + { + "epoch": 2.6979306967062566, + "grad_norm": 2.123354196548462, + "learning_rate": 1.8810483990477773e-05, + "loss": 0.543, + "step": 16527 + }, + { + "epoch": 2.698093955348761, + "grad_norm": 1.8157551288604736, + "learning_rate": 1.8810333879434524e-05, + "loss": 0.6277, + "step": 16528 + }, + { + "epoch": 2.6982572139912655, + "grad_norm": 2.0122199058532715, + "learning_rate": 1.8810183759519277e-05, + "loss": 0.5866, + "step": 16529 + }, + { + "epoch": 2.69842047263377, + "grad_norm": 1.9084804058074951, + "learning_rate": 1.8810033630732172e-05, + "loss": 0.5862, + "step": 16530 + }, + { + "epoch": 2.6985837312762744, + "grad_norm": 1.8074448108673096, + "learning_rate": 1.880988349307337e-05, + "loss": 0.5745, + "step": 16531 + }, + { + "epoch": 2.698746989918779, + "grad_norm": 1.8627989292144775, + "learning_rate": 1.8809733346543013e-05, + "loss": 0.5792, + "step": 16532 + }, + { + "epoch": 2.6989102485612833, + "grad_norm": 2.193448305130005, + "learning_rate": 1.8809583191141262e-05, + "loss": 0.745, + "step": 16533 + }, + { + "epoch": 2.6990735072037877, + "grad_norm": 1.813879370689392, + "learning_rate": 1.8809433026868258e-05, + "loss": 0.6248, + "step": 16534 + }, + { + "epoch": 2.699236765846292, + "grad_norm": 1.8677297830581665, + "learning_rate": 1.880928285372416e-05, + "loss": 0.6189, + "step": 16535 + }, + { + "epoch": 2.6994000244887966, + "grad_norm": 1.7241209745407104, + "learning_rate": 1.8809132671709114e-05, + "loss": 0.5318, + "step": 16536 + }, + { + "epoch": 2.699563283131301, + "grad_norm": 1.7797558307647705, + "learning_rate": 1.8808982480823277e-05, + "loss": 0.5321, + "step": 16537 + }, + { + "epoch": 2.699726541773805, + "grad_norm": 1.8045668601989746, + "learning_rate": 1.8808832281066795e-05, + "loss": 0.6273, + "step": 16538 + }, + { + "epoch": 2.6998898004163094, + "grad_norm": 1.7744349241256714, + "learning_rate": 1.8808682072439822e-05, + "loss": 0.5851, + "step": 16539 + }, + { + "epoch": 2.700053059058814, + "grad_norm": 1.7285329103469849, + "learning_rate": 1.880853185494251e-05, + "loss": 0.632, + "step": 16540 + }, + { + "epoch": 2.7002163177013183, + "grad_norm": 2.214113712310791, + "learning_rate": 1.8808381628575008e-05, + "loss": 0.6095, + "step": 16541 + }, + { + "epoch": 2.7003795763438228, + "grad_norm": 1.640531063079834, + "learning_rate": 1.8808231393337464e-05, + "loss": 0.5605, + "step": 16542 + }, + { + "epoch": 2.700542834986327, + "grad_norm": 1.5705642700195312, + "learning_rate": 1.8808081149230036e-05, + "loss": 0.498, + "step": 16543 + }, + { + "epoch": 2.7007060936288316, + "grad_norm": 1.8231168985366821, + "learning_rate": 1.8807930896252875e-05, + "loss": 0.5351, + "step": 16544 + }, + { + "epoch": 2.7008693522713356, + "grad_norm": 1.7822043895721436, + "learning_rate": 1.8807780634406127e-05, + "loss": 0.5773, + "step": 16545 + }, + { + "epoch": 2.70103261091384, + "grad_norm": 1.804272174835205, + "learning_rate": 1.880763036368995e-05, + "loss": 0.5759, + "step": 16546 + }, + { + "epoch": 2.7011958695563445, + "grad_norm": 1.7340136766433716, + "learning_rate": 1.8807480084104484e-05, + "loss": 0.5341, + "step": 16547 + }, + { + "epoch": 2.701359128198849, + "grad_norm": 1.7237608432769775, + "learning_rate": 1.8807329795649897e-05, + "loss": 0.6114, + "step": 16548 + }, + { + "epoch": 2.7015223868413534, + "grad_norm": 1.5833439826965332, + "learning_rate": 1.8807179498326323e-05, + "loss": 0.539, + "step": 16549 + }, + { + "epoch": 2.701685645483858, + "grad_norm": 1.5692659616470337, + "learning_rate": 1.8807029192133927e-05, + "loss": 0.489, + "step": 16550 + }, + { + "epoch": 2.7018489041263622, + "grad_norm": 1.9736303091049194, + "learning_rate": 1.8806878877072856e-05, + "loss": 0.6699, + "step": 16551 + }, + { + "epoch": 2.7020121627688667, + "grad_norm": 1.5361465215682983, + "learning_rate": 1.8806728553143256e-05, + "loss": 0.516, + "step": 16552 + }, + { + "epoch": 2.702175421411371, + "grad_norm": 1.657161831855774, + "learning_rate": 1.8806578220345284e-05, + "loss": 0.5394, + "step": 16553 + }, + { + "epoch": 2.7023386800538756, + "grad_norm": 2.107928991317749, + "learning_rate": 1.880642787867909e-05, + "loss": 0.5911, + "step": 16554 + }, + { + "epoch": 2.70250193869638, + "grad_norm": 1.5459269285202026, + "learning_rate": 1.8806277528144826e-05, + "loss": 0.4879, + "step": 16555 + }, + { + "epoch": 2.702665197338884, + "grad_norm": 1.8966012001037598, + "learning_rate": 1.8806127168742644e-05, + "loss": 0.521, + "step": 16556 + }, + { + "epoch": 2.7028284559813884, + "grad_norm": 2.0177032947540283, + "learning_rate": 1.8805976800472695e-05, + "loss": 0.6944, + "step": 16557 + }, + { + "epoch": 2.702991714623893, + "grad_norm": 1.823409080505371, + "learning_rate": 1.8805826423335127e-05, + "loss": 0.5546, + "step": 16558 + }, + { + "epoch": 2.7031549732663973, + "grad_norm": 2.233720302581787, + "learning_rate": 1.8805676037330093e-05, + "loss": 0.572, + "step": 16559 + }, + { + "epoch": 2.7033182319089017, + "grad_norm": 2.1746251583099365, + "learning_rate": 1.880552564245775e-05, + "loss": 0.8631, + "step": 16560 + }, + { + "epoch": 2.703481490551406, + "grad_norm": 1.6702942848205566, + "learning_rate": 1.880537523871824e-05, + "loss": 0.5932, + "step": 16561 + }, + { + "epoch": 2.70364474919391, + "grad_norm": 1.9586342573165894, + "learning_rate": 1.8805224826111725e-05, + "loss": 0.6639, + "step": 16562 + }, + { + "epoch": 2.7038080078364146, + "grad_norm": 2.095334053039551, + "learning_rate": 1.8805074404638345e-05, + "loss": 0.6709, + "step": 16563 + }, + { + "epoch": 2.703971266478919, + "grad_norm": 1.852708339691162, + "learning_rate": 1.8804923974298265e-05, + "loss": 0.5594, + "step": 16564 + }, + { + "epoch": 2.7041345251214235, + "grad_norm": 1.661108374595642, + "learning_rate": 1.880477353509162e-05, + "loss": 0.4876, + "step": 16565 + }, + { + "epoch": 2.704297783763928, + "grad_norm": 1.7343050241470337, + "learning_rate": 1.8804623087018577e-05, + "loss": 0.6197, + "step": 16566 + }, + { + "epoch": 2.7044610424064324, + "grad_norm": 1.691110610961914, + "learning_rate": 1.8804472630079277e-05, + "loss": 0.613, + "step": 16567 + }, + { + "epoch": 2.704624301048937, + "grad_norm": 1.8345650434494019, + "learning_rate": 1.8804322164273877e-05, + "loss": 0.6519, + "step": 16568 + }, + { + "epoch": 2.7047875596914412, + "grad_norm": 1.3634482622146606, + "learning_rate": 1.8804171689602525e-05, + "loss": 0.4572, + "step": 16569 + }, + { + "epoch": 2.7049508183339457, + "grad_norm": 2.1294054985046387, + "learning_rate": 1.8804021206065378e-05, + "loss": 0.6143, + "step": 16570 + }, + { + "epoch": 2.70511407697645, + "grad_norm": 1.7674267292022705, + "learning_rate": 1.880387071366258e-05, + "loss": 0.5275, + "step": 16571 + }, + { + "epoch": 2.7052773356189546, + "grad_norm": 1.4791922569274902, + "learning_rate": 1.8803720212394293e-05, + "loss": 0.4905, + "step": 16572 + }, + { + "epoch": 2.7054405942614586, + "grad_norm": 1.7082043886184692, + "learning_rate": 1.8803569702260657e-05, + "loss": 0.5182, + "step": 16573 + }, + { + "epoch": 2.705603852903963, + "grad_norm": 1.9373130798339844, + "learning_rate": 1.8803419183261828e-05, + "loss": 0.5749, + "step": 16574 + }, + { + "epoch": 2.7057671115464674, + "grad_norm": 1.6909286975860596, + "learning_rate": 1.8803268655397963e-05, + "loss": 0.6454, + "step": 16575 + }, + { + "epoch": 2.705930370188972, + "grad_norm": 1.77646803855896, + "learning_rate": 1.8803118118669203e-05, + "loss": 0.5989, + "step": 16576 + }, + { + "epoch": 2.7060936288314763, + "grad_norm": 1.8345304727554321, + "learning_rate": 1.880296757307571e-05, + "loss": 0.4991, + "step": 16577 + }, + { + "epoch": 2.7062568874739807, + "grad_norm": 1.7071967124938965, + "learning_rate": 1.8802817018617627e-05, + "loss": 0.6025, + "step": 16578 + }, + { + "epoch": 2.706420146116485, + "grad_norm": 1.9636720418930054, + "learning_rate": 1.8802666455295113e-05, + "loss": 0.845, + "step": 16579 + }, + { + "epoch": 2.706583404758989, + "grad_norm": 1.5531482696533203, + "learning_rate": 1.8802515883108314e-05, + "loss": 0.4535, + "step": 16580 + }, + { + "epoch": 2.7067466634014936, + "grad_norm": 2.162168264389038, + "learning_rate": 1.8802365302057386e-05, + "loss": 0.6311, + "step": 16581 + }, + { + "epoch": 2.706909922043998, + "grad_norm": 2.0841856002807617, + "learning_rate": 1.8802214712142475e-05, + "loss": 0.6327, + "step": 16582 + }, + { + "epoch": 2.7070731806865025, + "grad_norm": 1.8149399757385254, + "learning_rate": 1.8802064113363738e-05, + "loss": 0.5667, + "step": 16583 + }, + { + "epoch": 2.707236439329007, + "grad_norm": 1.9262553453445435, + "learning_rate": 1.8801913505721325e-05, + "loss": 0.5071, + "step": 16584 + }, + { + "epoch": 2.7073996979715114, + "grad_norm": 1.4386838674545288, + "learning_rate": 1.880176288921539e-05, + "loss": 0.5018, + "step": 16585 + }, + { + "epoch": 2.707562956614016, + "grad_norm": 1.5238865613937378, + "learning_rate": 1.880161226384608e-05, + "loss": 0.5378, + "step": 16586 + }, + { + "epoch": 2.7077262152565202, + "grad_norm": 1.634042501449585, + "learning_rate": 1.8801461629613548e-05, + "loss": 0.4689, + "step": 16587 + }, + { + "epoch": 2.7078894738990247, + "grad_norm": 1.6673656702041626, + "learning_rate": 1.8801310986517945e-05, + "loss": 0.6194, + "step": 16588 + }, + { + "epoch": 2.708052732541529, + "grad_norm": 1.5786864757537842, + "learning_rate": 1.8801160334559426e-05, + "loss": 0.6182, + "step": 16589 + }, + { + "epoch": 2.7082159911840336, + "grad_norm": 1.3529037237167358, + "learning_rate": 1.880100967373814e-05, + "loss": 0.4933, + "step": 16590 + }, + { + "epoch": 2.7083792498265375, + "grad_norm": 1.7782104015350342, + "learning_rate": 1.8800859004054238e-05, + "loss": 0.6349, + "step": 16591 + }, + { + "epoch": 2.708542508469042, + "grad_norm": 1.918752670288086, + "learning_rate": 1.880070832550788e-05, + "loss": 0.6142, + "step": 16592 + }, + { + "epoch": 2.7087057671115464, + "grad_norm": 1.9540215730667114, + "learning_rate": 1.8800557638099203e-05, + "loss": 0.6851, + "step": 16593 + }, + { + "epoch": 2.708869025754051, + "grad_norm": 1.85176420211792, + "learning_rate": 1.8800406941828372e-05, + "loss": 0.6341, + "step": 16594 + }, + { + "epoch": 2.7090322843965553, + "grad_norm": 1.8179359436035156, + "learning_rate": 1.8800256236695534e-05, + "loss": 0.5486, + "step": 16595 + }, + { + "epoch": 2.7091955430390597, + "grad_norm": 1.8988083600997925, + "learning_rate": 1.8800105522700837e-05, + "loss": 0.5393, + "step": 16596 + }, + { + "epoch": 2.709358801681564, + "grad_norm": 1.580170750617981, + "learning_rate": 1.8799954799844437e-05, + "loss": 0.4873, + "step": 16597 + }, + { + "epoch": 2.709522060324068, + "grad_norm": 2.1389758586883545, + "learning_rate": 1.8799804068126487e-05, + "loss": 0.711, + "step": 16598 + }, + { + "epoch": 2.7096853189665726, + "grad_norm": 1.977710485458374, + "learning_rate": 1.8799653327547133e-05, + "loss": 0.6161, + "step": 16599 + }, + { + "epoch": 2.709848577609077, + "grad_norm": 1.7854108810424805, + "learning_rate": 1.8799502578106533e-05, + "loss": 0.5118, + "step": 16600 + }, + { + "epoch": 2.7100118362515815, + "grad_norm": 1.479286551475525, + "learning_rate": 1.8799351819804837e-05, + "loss": 0.4876, + "step": 16601 + }, + { + "epoch": 2.710175094894086, + "grad_norm": 1.6649599075317383, + "learning_rate": 1.8799201052642194e-05, + "loss": 0.4743, + "step": 16602 + }, + { + "epoch": 2.7103383535365904, + "grad_norm": 1.775039792060852, + "learning_rate": 1.879905027661876e-05, + "loss": 0.5546, + "step": 16603 + }, + { + "epoch": 2.710501612179095, + "grad_norm": 1.8469204902648926, + "learning_rate": 1.8798899491734682e-05, + "loss": 0.6379, + "step": 16604 + }, + { + "epoch": 2.7106648708215992, + "grad_norm": 1.8928533792495728, + "learning_rate": 1.8798748697990115e-05, + "loss": 0.5627, + "step": 16605 + }, + { + "epoch": 2.7108281294641037, + "grad_norm": 1.8749570846557617, + "learning_rate": 1.879859789538521e-05, + "loss": 0.5143, + "step": 16606 + }, + { + "epoch": 2.710991388106608, + "grad_norm": 1.9530638456344604, + "learning_rate": 1.8798447083920123e-05, + "loss": 0.6604, + "step": 16607 + }, + { + "epoch": 2.7111546467491126, + "grad_norm": 2.0698938369750977, + "learning_rate": 1.8798296263594998e-05, + "loss": 0.6282, + "step": 16608 + }, + { + "epoch": 2.7113179053916165, + "grad_norm": 1.7837302684783936, + "learning_rate": 1.879814543440999e-05, + "loss": 0.6364, + "step": 16609 + }, + { + "epoch": 2.711481164034121, + "grad_norm": 2.018670082092285, + "learning_rate": 1.8797994596365258e-05, + "loss": 0.5643, + "step": 16610 + }, + { + "epoch": 2.7116444226766254, + "grad_norm": 1.6347397565841675, + "learning_rate": 1.879784374946094e-05, + "loss": 0.4858, + "step": 16611 + }, + { + "epoch": 2.71180768131913, + "grad_norm": 1.7204593420028687, + "learning_rate": 1.8797692893697205e-05, + "loss": 0.5143, + "step": 16612 + }, + { + "epoch": 2.7119709399616343, + "grad_norm": 1.865268588066101, + "learning_rate": 1.8797542029074188e-05, + "loss": 0.5639, + "step": 16613 + }, + { + "epoch": 2.7121341986041387, + "grad_norm": 1.8346552848815918, + "learning_rate": 1.8797391155592054e-05, + "loss": 0.6255, + "step": 16614 + }, + { + "epoch": 2.7122974572466427, + "grad_norm": 1.6340967416763306, + "learning_rate": 1.8797240273250945e-05, + "loss": 0.5238, + "step": 16615 + }, + { + "epoch": 2.712460715889147, + "grad_norm": 1.6979809999465942, + "learning_rate": 1.8797089382051016e-05, + "loss": 0.5662, + "step": 16616 + }, + { + "epoch": 2.7126239745316516, + "grad_norm": 1.4810415506362915, + "learning_rate": 1.8796938481992427e-05, + "loss": 0.4577, + "step": 16617 + }, + { + "epoch": 2.712787233174156, + "grad_norm": 1.8998157978057861, + "learning_rate": 1.8796787573075316e-05, + "loss": 0.5308, + "step": 16618 + }, + { + "epoch": 2.7129504918166605, + "grad_norm": 2.0942726135253906, + "learning_rate": 1.8796636655299847e-05, + "loss": 0.5444, + "step": 16619 + }, + { + "epoch": 2.713113750459165, + "grad_norm": 1.8948887586593628, + "learning_rate": 1.879648572866617e-05, + "loss": 0.5389, + "step": 16620 + }, + { + "epoch": 2.7132770091016694, + "grad_norm": 1.6946784257888794, + "learning_rate": 1.8796334793174426e-05, + "loss": 0.5761, + "step": 16621 + }, + { + "epoch": 2.713440267744174, + "grad_norm": 1.5329535007476807, + "learning_rate": 1.879618384882478e-05, + "loss": 0.5013, + "step": 16622 + }, + { + "epoch": 2.7136035263866782, + "grad_norm": 1.8743880987167358, + "learning_rate": 1.8796032895617377e-05, + "loss": 0.6285, + "step": 16623 + }, + { + "epoch": 2.7137667850291827, + "grad_norm": 1.8764699697494507, + "learning_rate": 1.8795881933552374e-05, + "loss": 0.5677, + "step": 16624 + }, + { + "epoch": 2.713930043671687, + "grad_norm": 1.971200942993164, + "learning_rate": 1.8795730962629918e-05, + "loss": 0.632, + "step": 16625 + }, + { + "epoch": 2.714093302314191, + "grad_norm": 1.7965399026870728, + "learning_rate": 1.8795579982850167e-05, + "loss": 0.5302, + "step": 16626 + }, + { + "epoch": 2.7142565609566955, + "grad_norm": 2.2528023719787598, + "learning_rate": 1.8795428994213266e-05, + "loss": 0.6555, + "step": 16627 + }, + { + "epoch": 2.7144198195992, + "grad_norm": 1.658044695854187, + "learning_rate": 1.879527799671937e-05, + "loss": 0.5568, + "step": 16628 + }, + { + "epoch": 2.7145830782417044, + "grad_norm": 2.045572519302368, + "learning_rate": 1.8795126990368632e-05, + "loss": 0.5551, + "step": 16629 + }, + { + "epoch": 2.714746336884209, + "grad_norm": 1.8214598894119263, + "learning_rate": 1.8794975975161206e-05, + "loss": 0.5636, + "step": 16630 + }, + { + "epoch": 2.7149095955267133, + "grad_norm": 1.425578236579895, + "learning_rate": 1.8794824951097237e-05, + "loss": 0.4438, + "step": 16631 + }, + { + "epoch": 2.7150728541692177, + "grad_norm": 1.7139400243759155, + "learning_rate": 1.8794673918176882e-05, + "loss": 0.6041, + "step": 16632 + }, + { + "epoch": 2.7152361128117217, + "grad_norm": 2.133733034133911, + "learning_rate": 1.8794522876400296e-05, + "loss": 0.6604, + "step": 16633 + }, + { + "epoch": 2.715399371454226, + "grad_norm": 1.8545811176300049, + "learning_rate": 1.8794371825767624e-05, + "loss": 0.5866, + "step": 16634 + }, + { + "epoch": 2.7155626300967306, + "grad_norm": 1.8968883752822876, + "learning_rate": 1.8794220766279027e-05, + "loss": 0.6723, + "step": 16635 + }, + { + "epoch": 2.715725888739235, + "grad_norm": 1.88693106174469, + "learning_rate": 1.879406969793465e-05, + "loss": 0.6755, + "step": 16636 + }, + { + "epoch": 2.7158891473817395, + "grad_norm": 1.441852331161499, + "learning_rate": 1.8793918620734643e-05, + "loss": 0.497, + "step": 16637 + }, + { + "epoch": 2.716052406024244, + "grad_norm": 1.3071404695510864, + "learning_rate": 1.879376753467917e-05, + "loss": 0.4534, + "step": 16638 + }, + { + "epoch": 2.7162156646667484, + "grad_norm": 1.850710391998291, + "learning_rate": 1.879361643976837e-05, + "loss": 0.7048, + "step": 16639 + }, + { + "epoch": 2.716378923309253, + "grad_norm": 1.9601540565490723, + "learning_rate": 1.87934653360024e-05, + "loss": 0.6449, + "step": 16640 + }, + { + "epoch": 2.7165421819517572, + "grad_norm": 1.572615146636963, + "learning_rate": 1.8793314223381416e-05, + "loss": 0.5101, + "step": 16641 + }, + { + "epoch": 2.7167054405942617, + "grad_norm": 1.519018292427063, + "learning_rate": 1.8793163101905562e-05, + "loss": 0.5297, + "step": 16642 + }, + { + "epoch": 2.716868699236766, + "grad_norm": 1.9437828063964844, + "learning_rate": 1.8793011971575e-05, + "loss": 0.65, + "step": 16643 + }, + { + "epoch": 2.71703195787927, + "grad_norm": 1.7062525749206543, + "learning_rate": 1.8792860832389877e-05, + "loss": 0.4476, + "step": 16644 + }, + { + "epoch": 2.7171952165217745, + "grad_norm": 1.7000830173492432, + "learning_rate": 1.8792709684350344e-05, + "loss": 0.574, + "step": 16645 + }, + { + "epoch": 2.717358475164279, + "grad_norm": 1.6324061155319214, + "learning_rate": 1.8792558527456556e-05, + "loss": 0.5573, + "step": 16646 + }, + { + "epoch": 2.7175217338067834, + "grad_norm": 1.7867839336395264, + "learning_rate": 1.879240736170866e-05, + "loss": 0.5573, + "step": 16647 + }, + { + "epoch": 2.717684992449288, + "grad_norm": 2.056640863418579, + "learning_rate": 1.8792256187106816e-05, + "loss": 0.6017, + "step": 16648 + }, + { + "epoch": 2.7178482510917923, + "grad_norm": 1.9215099811553955, + "learning_rate": 1.8792105003651172e-05, + "loss": 0.6105, + "step": 16649 + }, + { + "epoch": 2.7180115097342963, + "grad_norm": 1.9160763025283813, + "learning_rate": 1.879195381134188e-05, + "loss": 0.6402, + "step": 16650 + }, + { + "epoch": 2.7181747683768007, + "grad_norm": 1.4280481338500977, + "learning_rate": 1.879180261017909e-05, + "loss": 0.4932, + "step": 16651 + }, + { + "epoch": 2.718338027019305, + "grad_norm": 1.6068733930587769, + "learning_rate": 1.879165140016296e-05, + "loss": 0.5188, + "step": 16652 + }, + { + "epoch": 2.7185012856618096, + "grad_norm": 1.447710633277893, + "learning_rate": 1.879150018129364e-05, + "loss": 0.4978, + "step": 16653 + }, + { + "epoch": 2.718664544304314, + "grad_norm": 1.8120697736740112, + "learning_rate": 1.879134895357128e-05, + "loss": 0.5563, + "step": 16654 + }, + { + "epoch": 2.7188278029468185, + "grad_norm": 2.2914133071899414, + "learning_rate": 1.8791197716996038e-05, + "loss": 0.5367, + "step": 16655 + }, + { + "epoch": 2.718991061589323, + "grad_norm": 1.7524776458740234, + "learning_rate": 1.8791046471568056e-05, + "loss": 0.5722, + "step": 16656 + }, + { + "epoch": 2.7191543202318273, + "grad_norm": 1.7793126106262207, + "learning_rate": 1.8790895217287498e-05, + "loss": 0.5813, + "step": 16657 + }, + { + "epoch": 2.719317578874332, + "grad_norm": 1.802713394165039, + "learning_rate": 1.8790743954154508e-05, + "loss": 0.4965, + "step": 16658 + }, + { + "epoch": 2.7194808375168362, + "grad_norm": 1.9872863292694092, + "learning_rate": 1.879059268216924e-05, + "loss": 0.6009, + "step": 16659 + }, + { + "epoch": 2.7196440961593407, + "grad_norm": 2.0681357383728027, + "learning_rate": 1.8790441401331848e-05, + "loss": 0.629, + "step": 16660 + }, + { + "epoch": 2.7198073548018447, + "grad_norm": 1.749673843383789, + "learning_rate": 1.8790290111642484e-05, + "loss": 0.5015, + "step": 16661 + }, + { + "epoch": 2.719970613444349, + "grad_norm": 1.8369203805923462, + "learning_rate": 1.87901388131013e-05, + "loss": 0.6317, + "step": 16662 + }, + { + "epoch": 2.7201338720868535, + "grad_norm": 1.3952044248580933, + "learning_rate": 1.878998750570845e-05, + "loss": 0.4118, + "step": 16663 + }, + { + "epoch": 2.720297130729358, + "grad_norm": 1.8993173837661743, + "learning_rate": 1.878983618946409e-05, + "loss": 0.5701, + "step": 16664 + }, + { + "epoch": 2.7204603893718624, + "grad_norm": 1.7989118099212646, + "learning_rate": 1.8789684864368358e-05, + "loss": 0.5316, + "step": 16665 + }, + { + "epoch": 2.720623648014367, + "grad_norm": 1.7507249116897583, + "learning_rate": 1.878953353042142e-05, + "loss": 0.485, + "step": 16666 + }, + { + "epoch": 2.7207869066568713, + "grad_norm": 1.9456791877746582, + "learning_rate": 1.8789382187623423e-05, + "loss": 0.6038, + "step": 16667 + }, + { + "epoch": 2.7209501652993753, + "grad_norm": 1.4723124504089355, + "learning_rate": 1.878923083597452e-05, + "loss": 0.4495, + "step": 16668 + }, + { + "epoch": 2.7211134239418797, + "grad_norm": 1.9290400743484497, + "learning_rate": 1.8789079475474866e-05, + "loss": 0.649, + "step": 16669 + }, + { + "epoch": 2.721276682584384, + "grad_norm": 1.994280219078064, + "learning_rate": 1.8788928106124608e-05, + "loss": 0.6313, + "step": 16670 + }, + { + "epoch": 2.7214399412268886, + "grad_norm": 2.1381192207336426, + "learning_rate": 1.8788776727923906e-05, + "loss": 0.6639, + "step": 16671 + }, + { + "epoch": 2.721603199869393, + "grad_norm": 1.627790093421936, + "learning_rate": 1.8788625340872906e-05, + "loss": 0.6853, + "step": 16672 + }, + { + "epoch": 2.7217664585118975, + "grad_norm": 1.889522671699524, + "learning_rate": 1.8788473944971763e-05, + "loss": 0.5948, + "step": 16673 + }, + { + "epoch": 2.721929717154402, + "grad_norm": 1.3115930557250977, + "learning_rate": 1.8788322540220627e-05, + "loss": 0.417, + "step": 16674 + }, + { + "epoch": 2.7220929757969063, + "grad_norm": 1.9176661968231201, + "learning_rate": 1.8788171126619653e-05, + "loss": 0.6588, + "step": 16675 + }, + { + "epoch": 2.722256234439411, + "grad_norm": 1.703707218170166, + "learning_rate": 1.8788019704168995e-05, + "loss": 0.5996, + "step": 16676 + }, + { + "epoch": 2.722419493081915, + "grad_norm": 1.491767406463623, + "learning_rate": 1.8787868272868804e-05, + "loss": 0.5016, + "step": 16677 + }, + { + "epoch": 2.7225827517244197, + "grad_norm": 1.7939825057983398, + "learning_rate": 1.8787716832719232e-05, + "loss": 0.635, + "step": 16678 + }, + { + "epoch": 2.7227460103669237, + "grad_norm": 1.7195117473602295, + "learning_rate": 1.878756538372043e-05, + "loss": 0.5065, + "step": 16679 + }, + { + "epoch": 2.722909269009428, + "grad_norm": 1.9369292259216309, + "learning_rate": 1.878741392587255e-05, + "loss": 0.7323, + "step": 16680 + }, + { + "epoch": 2.7230725276519325, + "grad_norm": 1.4369068145751953, + "learning_rate": 1.878726245917575e-05, + "loss": 0.4396, + "step": 16681 + }, + { + "epoch": 2.723235786294437, + "grad_norm": 1.8907595872879028, + "learning_rate": 1.878711098363018e-05, + "loss": 0.51, + "step": 16682 + }, + { + "epoch": 2.7233990449369414, + "grad_norm": 1.8821135759353638, + "learning_rate": 1.8786959499235987e-05, + "loss": 0.575, + "step": 16683 + }, + { + "epoch": 2.723562303579446, + "grad_norm": 1.588097095489502, + "learning_rate": 1.878680800599333e-05, + "loss": 0.4821, + "step": 16684 + }, + { + "epoch": 2.7237255622219503, + "grad_norm": 1.920490026473999, + "learning_rate": 1.878665650390236e-05, + "loss": 0.5614, + "step": 16685 + }, + { + "epoch": 2.7238888208644543, + "grad_norm": 1.8171836137771606, + "learning_rate": 1.878650499296323e-05, + "loss": 0.6028, + "step": 16686 + }, + { + "epoch": 2.7240520795069587, + "grad_norm": 1.9502956867218018, + "learning_rate": 1.8786353473176093e-05, + "loss": 0.6246, + "step": 16687 + }, + { + "epoch": 2.724215338149463, + "grad_norm": 1.5336875915527344, + "learning_rate": 1.8786201944541096e-05, + "loss": 0.5273, + "step": 16688 + }, + { + "epoch": 2.7243785967919676, + "grad_norm": 1.7241657972335815, + "learning_rate": 1.87860504070584e-05, + "loss": 0.5382, + "step": 16689 + }, + { + "epoch": 2.724541855434472, + "grad_norm": 2.3924672603607178, + "learning_rate": 1.8785898860728152e-05, + "loss": 0.8048, + "step": 16690 + }, + { + "epoch": 2.7247051140769765, + "grad_norm": 1.622147798538208, + "learning_rate": 1.8785747305550507e-05, + "loss": 0.5711, + "step": 16691 + }, + { + "epoch": 2.724868372719481, + "grad_norm": 2.042023181915283, + "learning_rate": 1.8785595741525613e-05, + "loss": 0.622, + "step": 16692 + }, + { + "epoch": 2.7250316313619853, + "grad_norm": 1.5831685066223145, + "learning_rate": 1.878544416865363e-05, + "loss": 0.479, + "step": 16693 + }, + { + "epoch": 2.72519489000449, + "grad_norm": 1.8269283771514893, + "learning_rate": 1.8785292586934705e-05, + "loss": 0.4946, + "step": 16694 + }, + { + "epoch": 2.725358148646994, + "grad_norm": 1.5305780172348022, + "learning_rate": 1.8785140996368996e-05, + "loss": 0.5501, + "step": 16695 + }, + { + "epoch": 2.7255214072894987, + "grad_norm": 1.732620120048523, + "learning_rate": 1.878498939695665e-05, + "loss": 0.5566, + "step": 16696 + }, + { + "epoch": 2.7256846659320026, + "grad_norm": 1.7710344791412354, + "learning_rate": 1.8784837788697823e-05, + "loss": 0.5024, + "step": 16697 + }, + { + "epoch": 2.725847924574507, + "grad_norm": 1.466112732887268, + "learning_rate": 1.8784686171592668e-05, + "loss": 0.4514, + "step": 16698 + }, + { + "epoch": 2.7260111832170115, + "grad_norm": 1.9642612934112549, + "learning_rate": 1.8784534545641334e-05, + "loss": 0.6457, + "step": 16699 + }, + { + "epoch": 2.726174441859516, + "grad_norm": 1.8240896463394165, + "learning_rate": 1.8784382910843978e-05, + "loss": 0.5198, + "step": 16700 + }, + { + "epoch": 2.7263377005020204, + "grad_norm": 1.9138877391815186, + "learning_rate": 1.8784231267200748e-05, + "loss": 0.5609, + "step": 16701 + }, + { + "epoch": 2.726500959144525, + "grad_norm": 1.8895182609558105, + "learning_rate": 1.8784079614711798e-05, + "loss": 0.6141, + "step": 16702 + }, + { + "epoch": 2.726664217787029, + "grad_norm": 1.8375157117843628, + "learning_rate": 1.8783927953377285e-05, + "loss": 0.5583, + "step": 16703 + }, + { + "epoch": 2.7268274764295333, + "grad_norm": 1.9944090843200684, + "learning_rate": 1.878377628319736e-05, + "loss": 0.5868, + "step": 16704 + }, + { + "epoch": 2.7269907350720377, + "grad_norm": 2.024186849594116, + "learning_rate": 1.8783624604172175e-05, + "loss": 0.6246, + "step": 16705 + }, + { + "epoch": 2.727153993714542, + "grad_norm": 1.732542872428894, + "learning_rate": 1.878347291630188e-05, + "loss": 0.6237, + "step": 16706 + }, + { + "epoch": 2.7273172523570466, + "grad_norm": 1.546504020690918, + "learning_rate": 1.878332121958663e-05, + "loss": 0.4664, + "step": 16707 + }, + { + "epoch": 2.727480510999551, + "grad_norm": 1.6097654104232788, + "learning_rate": 1.878316951402658e-05, + "loss": 0.5469, + "step": 16708 + }, + { + "epoch": 2.7276437696420555, + "grad_norm": 1.458191156387329, + "learning_rate": 1.878301779962188e-05, + "loss": 0.4594, + "step": 16709 + }, + { + "epoch": 2.72780702828456, + "grad_norm": 1.7755744457244873, + "learning_rate": 1.8782866076372685e-05, + "loss": 0.5836, + "step": 16710 + }, + { + "epoch": 2.7279702869270643, + "grad_norm": 1.8162027597427368, + "learning_rate": 1.878271434427914e-05, + "loss": 0.5578, + "step": 16711 + }, + { + "epoch": 2.7281335455695688, + "grad_norm": 1.7298256158828735, + "learning_rate": 1.878256260334141e-05, + "loss": 0.5701, + "step": 16712 + }, + { + "epoch": 2.728296804212073, + "grad_norm": 1.408016562461853, + "learning_rate": 1.878241085355964e-05, + "loss": 0.4861, + "step": 16713 + }, + { + "epoch": 2.728460062854577, + "grad_norm": 1.7586253881454468, + "learning_rate": 1.8782259094933985e-05, + "loss": 0.5908, + "step": 16714 + }, + { + "epoch": 2.7286233214970816, + "grad_norm": 1.6779379844665527, + "learning_rate": 1.8782107327464594e-05, + "loss": 0.5349, + "step": 16715 + }, + { + "epoch": 2.728786580139586, + "grad_norm": 1.6519625186920166, + "learning_rate": 1.8781955551151628e-05, + "loss": 0.4597, + "step": 16716 + }, + { + "epoch": 2.7289498387820905, + "grad_norm": 1.4247581958770752, + "learning_rate": 1.8781803765995234e-05, + "loss": 0.5584, + "step": 16717 + }, + { + "epoch": 2.729113097424595, + "grad_norm": 2.082822799682617, + "learning_rate": 1.878165197199556e-05, + "loss": 0.8922, + "step": 16718 + }, + { + "epoch": 2.7292763560670994, + "grad_norm": 2.524829387664795, + "learning_rate": 1.8781500169152774e-05, + "loss": 0.5864, + "step": 16719 + }, + { + "epoch": 2.729439614709604, + "grad_norm": 1.9011597633361816, + "learning_rate": 1.8781348357467017e-05, + "loss": 0.5581, + "step": 16720 + }, + { + "epoch": 2.729602873352108, + "grad_norm": 1.8007005453109741, + "learning_rate": 1.878119653693844e-05, + "loss": 0.5467, + "step": 16721 + }, + { + "epoch": 2.7297661319946123, + "grad_norm": 1.3887399435043335, + "learning_rate": 1.8781044707567203e-05, + "loss": 0.4357, + "step": 16722 + }, + { + "epoch": 2.7299293906371167, + "grad_norm": 1.507433533668518, + "learning_rate": 1.8780892869353458e-05, + "loss": 0.4686, + "step": 16723 + }, + { + "epoch": 2.730092649279621, + "grad_norm": 1.9193627834320068, + "learning_rate": 1.8780741022297353e-05, + "loss": 0.5365, + "step": 16724 + }, + { + "epoch": 2.7302559079221256, + "grad_norm": 1.6746186017990112, + "learning_rate": 1.8780589166399047e-05, + "loss": 0.5456, + "step": 16725 + }, + { + "epoch": 2.73041916656463, + "grad_norm": 1.877157211303711, + "learning_rate": 1.878043730165869e-05, + "loss": 0.6339, + "step": 16726 + }, + { + "epoch": 2.7305824252071345, + "grad_norm": 2.0553653240203857, + "learning_rate": 1.8780285428076434e-05, + "loss": 0.6164, + "step": 16727 + }, + { + "epoch": 2.730745683849639, + "grad_norm": 2.1768383979797363, + "learning_rate": 1.878013354565243e-05, + "loss": 0.6683, + "step": 16728 + }, + { + "epoch": 2.7309089424921433, + "grad_norm": 1.7574193477630615, + "learning_rate": 1.877998165438684e-05, + "loss": 0.5537, + "step": 16729 + }, + { + "epoch": 2.7310722011346478, + "grad_norm": 1.8012056350708008, + "learning_rate": 1.8779829754279806e-05, + "loss": 0.5602, + "step": 16730 + }, + { + "epoch": 2.731235459777152, + "grad_norm": 1.911354422569275, + "learning_rate": 1.8779677845331488e-05, + "loss": 0.6451, + "step": 16731 + }, + { + "epoch": 2.731398718419656, + "grad_norm": 1.9095596075057983, + "learning_rate": 1.8779525927542037e-05, + "loss": 0.6009, + "step": 16732 + }, + { + "epoch": 2.7315619770621606, + "grad_norm": 1.8988440036773682, + "learning_rate": 1.8779374000911605e-05, + "loss": 0.5955, + "step": 16733 + }, + { + "epoch": 2.731725235704665, + "grad_norm": 1.9598146677017212, + "learning_rate": 1.8779222065440344e-05, + "loss": 0.6229, + "step": 16734 + }, + { + "epoch": 2.7318884943471695, + "grad_norm": 1.763124942779541, + "learning_rate": 1.877907012112841e-05, + "loss": 0.5493, + "step": 16735 + }, + { + "epoch": 2.732051752989674, + "grad_norm": 1.951460361480713, + "learning_rate": 1.8778918167975958e-05, + "loss": 0.6425, + "step": 16736 + }, + { + "epoch": 2.7322150116321784, + "grad_norm": 2.147158622741699, + "learning_rate": 1.8778766205983133e-05, + "loss": 0.7509, + "step": 16737 + }, + { + "epoch": 2.732378270274683, + "grad_norm": 1.8510369062423706, + "learning_rate": 1.8778614235150094e-05, + "loss": 0.606, + "step": 16738 + }, + { + "epoch": 2.732541528917187, + "grad_norm": 1.7507165670394897, + "learning_rate": 1.8778462255476995e-05, + "loss": 0.6535, + "step": 16739 + }, + { + "epoch": 2.7327047875596913, + "grad_norm": 1.7132530212402344, + "learning_rate": 1.8778310266963985e-05, + "loss": 0.6151, + "step": 16740 + }, + { + "epoch": 2.7328680462021957, + "grad_norm": 1.9740732908248901, + "learning_rate": 1.877815826961122e-05, + "loss": 0.6662, + "step": 16741 + }, + { + "epoch": 2.7330313048447, + "grad_norm": 1.7233861684799194, + "learning_rate": 1.877800626341885e-05, + "loss": 0.5857, + "step": 16742 + }, + { + "epoch": 2.7331945634872046, + "grad_norm": 1.5028626918792725, + "learning_rate": 1.877785424838703e-05, + "loss": 0.4955, + "step": 16743 + }, + { + "epoch": 2.733357822129709, + "grad_norm": 1.602030634880066, + "learning_rate": 1.8777702224515915e-05, + "loss": 0.5553, + "step": 16744 + }, + { + "epoch": 2.7335210807722135, + "grad_norm": 1.7345693111419678, + "learning_rate": 1.8777550191805652e-05, + "loss": 0.513, + "step": 16745 + }, + { + "epoch": 2.733684339414718, + "grad_norm": 1.8383170366287231, + "learning_rate": 1.8777398150256406e-05, + "loss": 0.6283, + "step": 16746 + }, + { + "epoch": 2.7338475980572223, + "grad_norm": 1.5996431112289429, + "learning_rate": 1.8777246099868315e-05, + "loss": 0.5206, + "step": 16747 + }, + { + "epoch": 2.7340108566997268, + "grad_norm": 1.6426550149917603, + "learning_rate": 1.8777094040641542e-05, + "loss": 0.498, + "step": 16748 + }, + { + "epoch": 2.734174115342231, + "grad_norm": 1.6450637578964233, + "learning_rate": 1.8776941972576242e-05, + "loss": 0.5218, + "step": 16749 + }, + { + "epoch": 2.734337373984735, + "grad_norm": 1.64905846118927, + "learning_rate": 1.8776789895672557e-05, + "loss": 0.5135, + "step": 16750 + }, + { + "epoch": 2.7345006326272396, + "grad_norm": 1.8968874216079712, + "learning_rate": 1.877663780993065e-05, + "loss": 0.6455, + "step": 16751 + }, + { + "epoch": 2.734663891269744, + "grad_norm": 2.083348035812378, + "learning_rate": 1.8776485715350672e-05, + "loss": 0.5825, + "step": 16752 + }, + { + "epoch": 2.7348271499122485, + "grad_norm": 2.069751024246216, + "learning_rate": 1.8776333611932775e-05, + "loss": 0.6877, + "step": 16753 + }, + { + "epoch": 2.734990408554753, + "grad_norm": 1.7509840726852417, + "learning_rate": 1.877618149967711e-05, + "loss": 0.5187, + "step": 16754 + }, + { + "epoch": 2.7351536671972574, + "grad_norm": 1.5698353052139282, + "learning_rate": 1.8776029378583836e-05, + "loss": 0.5395, + "step": 16755 + }, + { + "epoch": 2.7353169258397614, + "grad_norm": 1.9295275211334229, + "learning_rate": 1.87758772486531e-05, + "loss": 0.5725, + "step": 16756 + }, + { + "epoch": 2.735480184482266, + "grad_norm": 1.5096006393432617, + "learning_rate": 1.8775725109885058e-05, + "loss": 0.4812, + "step": 16757 + }, + { + "epoch": 2.7356434431247703, + "grad_norm": 1.7118198871612549, + "learning_rate": 1.877557296227986e-05, + "loss": 0.5771, + "step": 16758 + }, + { + "epoch": 2.7358067017672747, + "grad_norm": 1.5132182836532593, + "learning_rate": 1.877542080583767e-05, + "loss": 0.5337, + "step": 16759 + }, + { + "epoch": 2.735969960409779, + "grad_norm": 1.5363600254058838, + "learning_rate": 1.877526864055863e-05, + "loss": 0.4846, + "step": 16760 + }, + { + "epoch": 2.7361332190522836, + "grad_norm": 1.838297963142395, + "learning_rate": 1.8775116466442897e-05, + "loss": 0.526, + "step": 16761 + }, + { + "epoch": 2.736296477694788, + "grad_norm": 1.6352523565292358, + "learning_rate": 1.8774964283490622e-05, + "loss": 0.5389, + "step": 16762 + }, + { + "epoch": 2.7364597363372924, + "grad_norm": 1.6770631074905396, + "learning_rate": 1.877481209170196e-05, + "loss": 0.4954, + "step": 16763 + }, + { + "epoch": 2.736622994979797, + "grad_norm": 1.5595831871032715, + "learning_rate": 1.877465989107707e-05, + "loss": 0.4965, + "step": 16764 + }, + { + "epoch": 2.7367862536223013, + "grad_norm": 1.7029294967651367, + "learning_rate": 1.8774507681616093e-05, + "loss": 0.5287, + "step": 16765 + }, + { + "epoch": 2.7369495122648058, + "grad_norm": 1.8645366430282593, + "learning_rate": 1.8774355463319193e-05, + "loss": 0.6893, + "step": 16766 + }, + { + "epoch": 2.7371127709073098, + "grad_norm": 1.6478272676467896, + "learning_rate": 1.877420323618652e-05, + "loss": 0.554, + "step": 16767 + }, + { + "epoch": 2.737276029549814, + "grad_norm": 1.992116093635559, + "learning_rate": 1.8774051000218226e-05, + "loss": 0.5876, + "step": 16768 + }, + { + "epoch": 2.7374392881923186, + "grad_norm": 1.8322604894638062, + "learning_rate": 1.8773898755414463e-05, + "loss": 0.5641, + "step": 16769 + }, + { + "epoch": 2.737602546834823, + "grad_norm": 1.4089696407318115, + "learning_rate": 1.8773746501775388e-05, + "loss": 0.3949, + "step": 16770 + }, + { + "epoch": 2.7377658054773275, + "grad_norm": 1.9402270317077637, + "learning_rate": 1.8773594239301152e-05, + "loss": 0.6209, + "step": 16771 + }, + { + "epoch": 2.737929064119832, + "grad_norm": 2.4936766624450684, + "learning_rate": 1.877344196799191e-05, + "loss": 0.5362, + "step": 16772 + }, + { + "epoch": 2.7380923227623364, + "grad_norm": 1.7907745838165283, + "learning_rate": 1.8773289687847815e-05, + "loss": 0.6112, + "step": 16773 + }, + { + "epoch": 2.7382555814048404, + "grad_norm": 1.9813156127929688, + "learning_rate": 1.8773137398869017e-05, + "loss": 0.5451, + "step": 16774 + }, + { + "epoch": 2.738418840047345, + "grad_norm": 1.6885403394699097, + "learning_rate": 1.8772985101055673e-05, + "loss": 0.52, + "step": 16775 + }, + { + "epoch": 2.7385820986898493, + "grad_norm": 1.7506283521652222, + "learning_rate": 1.8772832794407934e-05, + "loss": 0.5701, + "step": 16776 + }, + { + "epoch": 2.7387453573323537, + "grad_norm": 1.6005529165267944, + "learning_rate": 1.8772680478925956e-05, + "loss": 0.5207, + "step": 16777 + }, + { + "epoch": 2.738908615974858, + "grad_norm": 2.098233699798584, + "learning_rate": 1.877252815460989e-05, + "loss": 0.6144, + "step": 16778 + }, + { + "epoch": 2.7390718746173626, + "grad_norm": 1.5514872074127197, + "learning_rate": 1.8772375821459893e-05, + "loss": 0.5328, + "step": 16779 + }, + { + "epoch": 2.739235133259867, + "grad_norm": 2.032209634780884, + "learning_rate": 1.8772223479476114e-05, + "loss": 0.6277, + "step": 16780 + }, + { + "epoch": 2.7393983919023714, + "grad_norm": 1.6552964448928833, + "learning_rate": 1.8772071128658708e-05, + "loss": 0.6522, + "step": 16781 + }, + { + "epoch": 2.739561650544876, + "grad_norm": 1.6934640407562256, + "learning_rate": 1.877191876900783e-05, + "loss": 0.5072, + "step": 16782 + }, + { + "epoch": 2.7397249091873803, + "grad_norm": 1.8199266195297241, + "learning_rate": 1.877176640052363e-05, + "loss": 0.5343, + "step": 16783 + }, + { + "epoch": 2.7398881678298848, + "grad_norm": 1.571297287940979, + "learning_rate": 1.8771614023206267e-05, + "loss": 0.5129, + "step": 16784 + }, + { + "epoch": 2.7400514264723888, + "grad_norm": 2.4696574211120605, + "learning_rate": 1.877146163705589e-05, + "loss": 0.6305, + "step": 16785 + }, + { + "epoch": 2.740214685114893, + "grad_norm": 1.9648605585098267, + "learning_rate": 1.8771309242072654e-05, + "loss": 0.6302, + "step": 16786 + }, + { + "epoch": 2.7403779437573976, + "grad_norm": 1.6896098852157593, + "learning_rate": 1.8771156838256707e-05, + "loss": 0.557, + "step": 16787 + }, + { + "epoch": 2.740541202399902, + "grad_norm": 1.9716390371322632, + "learning_rate": 1.8771004425608213e-05, + "loss": 0.6711, + "step": 16788 + }, + { + "epoch": 2.7407044610424065, + "grad_norm": 2.393695592880249, + "learning_rate": 1.877085200412732e-05, + "loss": 0.6095, + "step": 16789 + }, + { + "epoch": 2.740867719684911, + "grad_norm": 1.632895827293396, + "learning_rate": 1.8770699573814176e-05, + "loss": 0.5757, + "step": 16790 + }, + { + "epoch": 2.741030978327415, + "grad_norm": 1.917678952217102, + "learning_rate": 1.8770547134668943e-05, + "loss": 0.5184, + "step": 16791 + }, + { + "epoch": 2.7411942369699194, + "grad_norm": 1.729008674621582, + "learning_rate": 1.8770394686691775e-05, + "loss": 0.5548, + "step": 16792 + }, + { + "epoch": 2.741357495612424, + "grad_norm": 1.8719189167022705, + "learning_rate": 1.8770242229882814e-05, + "loss": 0.4971, + "step": 16793 + }, + { + "epoch": 2.7415207542549282, + "grad_norm": 1.6740864515304565, + "learning_rate": 1.8770089764242224e-05, + "loss": 0.5417, + "step": 16794 + }, + { + "epoch": 2.7416840128974327, + "grad_norm": 1.6230462789535522, + "learning_rate": 1.876993728977016e-05, + "loss": 0.5379, + "step": 16795 + }, + { + "epoch": 2.741847271539937, + "grad_norm": 1.8141647577285767, + "learning_rate": 1.8769784806466768e-05, + "loss": 0.6516, + "step": 16796 + }, + { + "epoch": 2.7420105301824416, + "grad_norm": 2.278895616531372, + "learning_rate": 1.8769632314332207e-05, + "loss": 0.6947, + "step": 16797 + }, + { + "epoch": 2.742173788824946, + "grad_norm": 1.9790170192718506, + "learning_rate": 1.8769479813366626e-05, + "loss": 0.5816, + "step": 16798 + }, + { + "epoch": 2.7423370474674504, + "grad_norm": 1.7881243228912354, + "learning_rate": 1.8769327303570185e-05, + "loss": 0.6053, + "step": 16799 + }, + { + "epoch": 2.742500306109955, + "grad_norm": 1.897326946258545, + "learning_rate": 1.8769174784943032e-05, + "loss": 0.5671, + "step": 16800 + }, + { + "epoch": 2.7426635647524593, + "grad_norm": 1.8113373517990112, + "learning_rate": 1.876902225748532e-05, + "loss": 0.4837, + "step": 16801 + }, + { + "epoch": 2.7428268233949633, + "grad_norm": 1.5825766324996948, + "learning_rate": 1.876886972119721e-05, + "loss": 0.5429, + "step": 16802 + }, + { + "epoch": 2.7429900820374677, + "grad_norm": 1.944901704788208, + "learning_rate": 1.8768717176078846e-05, + "loss": 0.6478, + "step": 16803 + }, + { + "epoch": 2.743153340679972, + "grad_norm": 1.6745465993881226, + "learning_rate": 1.876856462213039e-05, + "loss": 0.5718, + "step": 16804 + }, + { + "epoch": 2.7433165993224766, + "grad_norm": 1.5432240962982178, + "learning_rate": 1.8768412059351986e-05, + "loss": 0.5526, + "step": 16805 + }, + { + "epoch": 2.743479857964981, + "grad_norm": 2.0743541717529297, + "learning_rate": 1.8768259487743796e-05, + "loss": 0.697, + "step": 16806 + }, + { + "epoch": 2.7436431166074855, + "grad_norm": 1.7069040536880493, + "learning_rate": 1.8768106907305973e-05, + "loss": 0.5564, + "step": 16807 + }, + { + "epoch": 2.74380637524999, + "grad_norm": 1.960318922996521, + "learning_rate": 1.8767954318038667e-05, + "loss": 0.6121, + "step": 16808 + }, + { + "epoch": 2.743969633892494, + "grad_norm": 1.8554294109344482, + "learning_rate": 1.8767801719942033e-05, + "loss": 0.5951, + "step": 16809 + }, + { + "epoch": 2.7441328925349984, + "grad_norm": 1.765335202217102, + "learning_rate": 1.8767649113016225e-05, + "loss": 0.5592, + "step": 16810 + }, + { + "epoch": 2.744296151177503, + "grad_norm": 1.6094262599945068, + "learning_rate": 1.8767496497261397e-05, + "loss": 0.4411, + "step": 16811 + }, + { + "epoch": 2.7444594098200072, + "grad_norm": 1.9136241674423218, + "learning_rate": 1.8767343872677703e-05, + "loss": 0.6591, + "step": 16812 + }, + { + "epoch": 2.7446226684625117, + "grad_norm": 1.9376603364944458, + "learning_rate": 1.8767191239265297e-05, + "loss": 0.6051, + "step": 16813 + }, + { + "epoch": 2.744785927105016, + "grad_norm": 1.4612798690795898, + "learning_rate": 1.876703859702433e-05, + "loss": 0.5468, + "step": 16814 + }, + { + "epoch": 2.7449491857475206, + "grad_norm": 2.2661523818969727, + "learning_rate": 1.8766885945954958e-05, + "loss": 0.7061, + "step": 16815 + }, + { + "epoch": 2.745112444390025, + "grad_norm": 1.6905848979949951, + "learning_rate": 1.8766733286057332e-05, + "loss": 0.598, + "step": 16816 + }, + { + "epoch": 2.7452757030325294, + "grad_norm": 1.8901417255401611, + "learning_rate": 1.8766580617331608e-05, + "loss": 0.6787, + "step": 16817 + }, + { + "epoch": 2.745438961675034, + "grad_norm": 1.3594547510147095, + "learning_rate": 1.8766427939777943e-05, + "loss": 0.4933, + "step": 16818 + }, + { + "epoch": 2.7456022203175383, + "grad_norm": 1.2098275423049927, + "learning_rate": 1.8766275253396488e-05, + "loss": 0.4076, + "step": 16819 + }, + { + "epoch": 2.7457654789600423, + "grad_norm": 1.7732212543487549, + "learning_rate": 1.8766122558187394e-05, + "loss": 0.7026, + "step": 16820 + }, + { + "epoch": 2.7459287376025467, + "grad_norm": 1.5285594463348389, + "learning_rate": 1.8765969854150817e-05, + "loss": 0.5351, + "step": 16821 + }, + { + "epoch": 2.746091996245051, + "grad_norm": 1.7328625917434692, + "learning_rate": 1.8765817141286907e-05, + "loss": 0.6164, + "step": 16822 + }, + { + "epoch": 2.7462552548875556, + "grad_norm": 1.8076311349868774, + "learning_rate": 1.876566441959583e-05, + "loss": 0.5931, + "step": 16823 + }, + { + "epoch": 2.74641851353006, + "grad_norm": 2.3309121131896973, + "learning_rate": 1.8765511689077723e-05, + "loss": 0.592, + "step": 16824 + }, + { + "epoch": 2.7465817721725645, + "grad_norm": 2.0019733905792236, + "learning_rate": 1.876535894973275e-05, + "loss": 0.6443, + "step": 16825 + }, + { + "epoch": 2.746745030815069, + "grad_norm": 1.7664947509765625, + "learning_rate": 1.876520620156107e-05, + "loss": 0.5447, + "step": 16826 + }, + { + "epoch": 2.746908289457573, + "grad_norm": 1.6529799699783325, + "learning_rate": 1.8765053444562822e-05, + "loss": 0.5613, + "step": 16827 + }, + { + "epoch": 2.7470715481000774, + "grad_norm": 1.8328337669372559, + "learning_rate": 1.876490067873817e-05, + "loss": 0.6489, + "step": 16828 + }, + { + "epoch": 2.747234806742582, + "grad_norm": 1.515157699584961, + "learning_rate": 1.8764747904087262e-05, + "loss": 0.5273, + "step": 16829 + }, + { + "epoch": 2.7473980653850862, + "grad_norm": 2.0224263668060303, + "learning_rate": 1.8764595120610258e-05, + "loss": 0.6283, + "step": 16830 + }, + { + "epoch": 2.7475613240275907, + "grad_norm": 1.8425991535186768, + "learning_rate": 1.8764442328307312e-05, + "loss": 0.6339, + "step": 16831 + }, + { + "epoch": 2.747724582670095, + "grad_norm": 1.9475247859954834, + "learning_rate": 1.876428952717857e-05, + "loss": 0.6193, + "step": 16832 + }, + { + "epoch": 2.7478878413125996, + "grad_norm": 1.8596323728561401, + "learning_rate": 1.8764136717224197e-05, + "loss": 0.5551, + "step": 16833 + }, + { + "epoch": 2.748051099955104, + "grad_norm": 1.417097806930542, + "learning_rate": 1.8763983898444332e-05, + "loss": 0.4611, + "step": 16834 + }, + { + "epoch": 2.7482143585976084, + "grad_norm": 1.6716039180755615, + "learning_rate": 1.8763831070839144e-05, + "loss": 0.5426, + "step": 16835 + }, + { + "epoch": 2.748377617240113, + "grad_norm": 1.7773022651672363, + "learning_rate": 1.8763678234408776e-05, + "loss": 0.5088, + "step": 16836 + }, + { + "epoch": 2.7485408758826173, + "grad_norm": 1.9376754760742188, + "learning_rate": 1.876352538915339e-05, + "loss": 0.6252, + "step": 16837 + }, + { + "epoch": 2.7487041345251213, + "grad_norm": 1.9052051305770874, + "learning_rate": 1.8763372535073136e-05, + "loss": 0.6604, + "step": 16838 + }, + { + "epoch": 2.7488673931676257, + "grad_norm": 1.6814298629760742, + "learning_rate": 1.8763219672168164e-05, + "loss": 0.6237, + "step": 16839 + }, + { + "epoch": 2.74903065181013, + "grad_norm": 1.789160132408142, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.5277, + "step": 16840 + }, + { + "epoch": 2.7491939104526346, + "grad_norm": 1.9426923990249634, + "learning_rate": 1.87629139198847e-05, + "loss": 0.5578, + "step": 16841 + }, + { + "epoch": 2.749357169095139, + "grad_norm": 1.9338479042053223, + "learning_rate": 1.8762761030506514e-05, + "loss": 0.5236, + "step": 16842 + }, + { + "epoch": 2.7495204277376435, + "grad_norm": 1.8518275022506714, + "learning_rate": 1.876260813230423e-05, + "loss": 0.5908, + "step": 16843 + }, + { + "epoch": 2.7496836863801475, + "grad_norm": 2.14155912399292, + "learning_rate": 1.8762455225278003e-05, + "loss": 0.6113, + "step": 16844 + }, + { + "epoch": 2.749846945022652, + "grad_norm": 1.7078956365585327, + "learning_rate": 1.8762302309427983e-05, + "loss": 0.6079, + "step": 16845 + }, + { + "epoch": 2.7500102036651564, + "grad_norm": 1.999302864074707, + "learning_rate": 1.876214938475433e-05, + "loss": 0.6705, + "step": 16846 + }, + { + "epoch": 2.750173462307661, + "grad_norm": 1.9432547092437744, + "learning_rate": 1.8761996451257192e-05, + "loss": 0.6325, + "step": 16847 + }, + { + "epoch": 2.7503367209501652, + "grad_norm": 1.8753087520599365, + "learning_rate": 1.876184350893673e-05, + "loss": 0.5627, + "step": 16848 + }, + { + "epoch": 2.7504999795926697, + "grad_norm": 2.045257091522217, + "learning_rate": 1.876169055779309e-05, + "loss": 0.5809, + "step": 16849 + }, + { + "epoch": 2.750663238235174, + "grad_norm": 1.702256679534912, + "learning_rate": 1.8761537597826426e-05, + "loss": 0.5033, + "step": 16850 + }, + { + "epoch": 2.7508264968776786, + "grad_norm": 1.5831900835037231, + "learning_rate": 1.87613846290369e-05, + "loss": 0.5716, + "step": 16851 + }, + { + "epoch": 2.750989755520183, + "grad_norm": 1.8476853370666504, + "learning_rate": 1.8761231651424668e-05, + "loss": 0.5678, + "step": 16852 + }, + { + "epoch": 2.7511530141626874, + "grad_norm": 1.6984878778457642, + "learning_rate": 1.8761078664989872e-05, + "loss": 0.5655, + "step": 16853 + }, + { + "epoch": 2.751316272805192, + "grad_norm": 1.846522569656372, + "learning_rate": 1.8760925669732672e-05, + "loss": 0.5436, + "step": 16854 + }, + { + "epoch": 2.751479531447696, + "grad_norm": 1.9222733974456787, + "learning_rate": 1.876077266565322e-05, + "loss": 0.5139, + "step": 16855 + }, + { + "epoch": 2.7516427900902003, + "grad_norm": 2.0479063987731934, + "learning_rate": 1.8760619652751677e-05, + "loss": 0.6918, + "step": 16856 + }, + { + "epoch": 2.7518060487327047, + "grad_norm": 1.9014701843261719, + "learning_rate": 1.876046663102819e-05, + "loss": 0.6567, + "step": 16857 + }, + { + "epoch": 2.751969307375209, + "grad_norm": 1.5004875659942627, + "learning_rate": 1.8760313600482916e-05, + "loss": 0.4818, + "step": 16858 + }, + { + "epoch": 2.7521325660177136, + "grad_norm": 1.6105849742889404, + "learning_rate": 1.8760160561116008e-05, + "loss": 0.4964, + "step": 16859 + }, + { + "epoch": 2.752295824660218, + "grad_norm": 1.8664335012435913, + "learning_rate": 1.8760007512927624e-05, + "loss": 0.5978, + "step": 16860 + }, + { + "epoch": 2.7524590833027225, + "grad_norm": 1.8365596532821655, + "learning_rate": 1.875985445591791e-05, + "loss": 0.5759, + "step": 16861 + }, + { + "epoch": 2.7526223419452265, + "grad_norm": 1.594326138496399, + "learning_rate": 1.8759701390087026e-05, + "loss": 0.47, + "step": 16862 + }, + { + "epoch": 2.752785600587731, + "grad_norm": 1.9349603652954102, + "learning_rate": 1.8759548315435124e-05, + "loss": 0.6154, + "step": 16863 + }, + { + "epoch": 2.7529488592302354, + "grad_norm": 1.6826462745666504, + "learning_rate": 1.8759395231962366e-05, + "loss": 0.5161, + "step": 16864 + }, + { + "epoch": 2.75311211787274, + "grad_norm": 1.9525145292282104, + "learning_rate": 1.8759242139668892e-05, + "loss": 0.6068, + "step": 16865 + }, + { + "epoch": 2.7532753765152442, + "grad_norm": 2.0448999404907227, + "learning_rate": 1.8759089038554864e-05, + "loss": 0.4898, + "step": 16866 + }, + { + "epoch": 2.7534386351577487, + "grad_norm": 1.835203766822815, + "learning_rate": 1.875893592862044e-05, + "loss": 0.567, + "step": 16867 + }, + { + "epoch": 2.753601893800253, + "grad_norm": 1.6607319116592407, + "learning_rate": 1.8758782809865766e-05, + "loss": 0.4964, + "step": 16868 + }, + { + "epoch": 2.7537651524427575, + "grad_norm": 1.672486424446106, + "learning_rate": 1.8758629682291002e-05, + "loss": 0.5313, + "step": 16869 + }, + { + "epoch": 2.753928411085262, + "grad_norm": 1.9060086011886597, + "learning_rate": 1.87584765458963e-05, + "loss": 0.6721, + "step": 16870 + }, + { + "epoch": 2.7540916697277664, + "grad_norm": 1.7637985944747925, + "learning_rate": 1.8758323400681813e-05, + "loss": 0.5677, + "step": 16871 + }, + { + "epoch": 2.754254928370271, + "grad_norm": 1.7966690063476562, + "learning_rate": 1.87581702466477e-05, + "loss": 0.5428, + "step": 16872 + }, + { + "epoch": 2.754418187012775, + "grad_norm": 2.068167209625244, + "learning_rate": 1.875801708379411e-05, + "loss": 0.5976, + "step": 16873 + }, + { + "epoch": 2.7545814456552793, + "grad_norm": 1.5054138898849487, + "learning_rate": 1.8757863912121196e-05, + "loss": 0.4965, + "step": 16874 + }, + { + "epoch": 2.7547447042977837, + "grad_norm": 1.5841615200042725, + "learning_rate": 1.875771073162912e-05, + "loss": 0.4663, + "step": 16875 + }, + { + "epoch": 2.754907962940288, + "grad_norm": 1.6457993984222412, + "learning_rate": 1.875755754231803e-05, + "loss": 0.5876, + "step": 16876 + }, + { + "epoch": 2.7550712215827926, + "grad_norm": 1.9504085779190063, + "learning_rate": 1.8757404344188083e-05, + "loss": 0.5744, + "step": 16877 + }, + { + "epoch": 2.755234480225297, + "grad_norm": 1.7165213823318481, + "learning_rate": 1.8757251137239426e-05, + "loss": 0.5207, + "step": 16878 + }, + { + "epoch": 2.755397738867801, + "grad_norm": 1.7890019416809082, + "learning_rate": 1.8757097921472228e-05, + "loss": 0.6171, + "step": 16879 + }, + { + "epoch": 2.7555609975103055, + "grad_norm": 2.007624626159668, + "learning_rate": 1.875694469688663e-05, + "loss": 0.6573, + "step": 16880 + }, + { + "epoch": 2.75572425615281, + "grad_norm": 1.8155338764190674, + "learning_rate": 1.875679146348279e-05, + "loss": 0.5984, + "step": 16881 + }, + { + "epoch": 2.7558875147953144, + "grad_norm": 1.9199455976486206, + "learning_rate": 1.8756638221260866e-05, + "loss": 0.6746, + "step": 16882 + }, + { + "epoch": 2.756050773437819, + "grad_norm": 1.8992630243301392, + "learning_rate": 1.875648497022101e-05, + "loss": 0.6742, + "step": 16883 + }, + { + "epoch": 2.7562140320803232, + "grad_norm": 1.6756820678710938, + "learning_rate": 1.8756331710363375e-05, + "loss": 0.5035, + "step": 16884 + }, + { + "epoch": 2.7563772907228277, + "grad_norm": 1.7867639064788818, + "learning_rate": 1.8756178441688117e-05, + "loss": 0.555, + "step": 16885 + }, + { + "epoch": 2.756540549365332, + "grad_norm": 1.5557234287261963, + "learning_rate": 1.8756025164195385e-05, + "loss": 0.5158, + "step": 16886 + }, + { + "epoch": 2.7567038080078365, + "grad_norm": 1.9616869688034058, + "learning_rate": 1.8755871877885345e-05, + "loss": 0.6255, + "step": 16887 + }, + { + "epoch": 2.756867066650341, + "grad_norm": 2.1582016944885254, + "learning_rate": 1.875571858275814e-05, + "loss": 0.7274, + "step": 16888 + }, + { + "epoch": 2.7570303252928454, + "grad_norm": 1.8098737001419067, + "learning_rate": 1.875556527881393e-05, + "loss": 0.7092, + "step": 16889 + }, + { + "epoch": 2.7571935839353494, + "grad_norm": 1.7838102579116821, + "learning_rate": 1.8755411966052867e-05, + "loss": 0.5013, + "step": 16890 + }, + { + "epoch": 2.757356842577854, + "grad_norm": 1.5255036354064941, + "learning_rate": 1.8755258644475106e-05, + "loss": 0.5188, + "step": 16891 + }, + { + "epoch": 2.7575201012203583, + "grad_norm": 1.5052714347839355, + "learning_rate": 1.8755105314080803e-05, + "loss": 0.4541, + "step": 16892 + }, + { + "epoch": 2.7576833598628627, + "grad_norm": 2.1033787727355957, + "learning_rate": 1.8754951974870113e-05, + "loss": 0.698, + "step": 16893 + }, + { + "epoch": 2.757846618505367, + "grad_norm": 1.7871756553649902, + "learning_rate": 1.875479862684319e-05, + "loss": 0.5934, + "step": 16894 + }, + { + "epoch": 2.7580098771478716, + "grad_norm": 2.0921781063079834, + "learning_rate": 1.875464527000018e-05, + "loss": 0.6287, + "step": 16895 + }, + { + "epoch": 2.758173135790376, + "grad_norm": 1.814234733581543, + "learning_rate": 1.875449190434125e-05, + "loss": 0.5955, + "step": 16896 + }, + { + "epoch": 2.75833639443288, + "grad_norm": 1.800386905670166, + "learning_rate": 1.8754338529866547e-05, + "loss": 0.5795, + "step": 16897 + }, + { + "epoch": 2.7584996530753845, + "grad_norm": 1.914287805557251, + "learning_rate": 1.8754185146576226e-05, + "loss": 0.5994, + "step": 16898 + }, + { + "epoch": 2.758662911717889, + "grad_norm": 1.8128507137298584, + "learning_rate": 1.8754031754470446e-05, + "loss": 0.5569, + "step": 16899 + }, + { + "epoch": 2.7588261703603933, + "grad_norm": 1.9445407390594482, + "learning_rate": 1.8753878353549357e-05, + "loss": 0.6656, + "step": 16900 + }, + { + "epoch": 2.758989429002898, + "grad_norm": 1.5113492012023926, + "learning_rate": 1.8753724943813118e-05, + "loss": 0.4717, + "step": 16901 + }, + { + "epoch": 2.7591526876454022, + "grad_norm": 1.8402085304260254, + "learning_rate": 1.8753571525261875e-05, + "loss": 0.5583, + "step": 16902 + }, + { + "epoch": 2.7593159462879067, + "grad_norm": 1.9338228702545166, + "learning_rate": 1.8753418097895794e-05, + "loss": 0.6869, + "step": 16903 + }, + { + "epoch": 2.759479204930411, + "grad_norm": 1.645857572555542, + "learning_rate": 1.8753264661715017e-05, + "loss": 0.59, + "step": 16904 + }, + { + "epoch": 2.7596424635729155, + "grad_norm": 1.68617582321167, + "learning_rate": 1.8753111216719707e-05, + "loss": 0.6373, + "step": 16905 + }, + { + "epoch": 2.75980572221542, + "grad_norm": 1.7602789402008057, + "learning_rate": 1.8752957762910016e-05, + "loss": 0.6192, + "step": 16906 + }, + { + "epoch": 2.7599689808579244, + "grad_norm": 1.7922712564468384, + "learning_rate": 1.87528043002861e-05, + "loss": 0.6352, + "step": 16907 + }, + { + "epoch": 2.7601322395004284, + "grad_norm": 2.0188441276550293, + "learning_rate": 1.8752650828848113e-05, + "loss": 0.7232, + "step": 16908 + }, + { + "epoch": 2.760295498142933, + "grad_norm": 1.8489463329315186, + "learning_rate": 1.8752497348596205e-05, + "loss": 0.6326, + "step": 16909 + }, + { + "epoch": 2.7604587567854373, + "grad_norm": 1.7247189283370972, + "learning_rate": 1.875234385953054e-05, + "loss": 0.5075, + "step": 16910 + }, + { + "epoch": 2.7606220154279417, + "grad_norm": 2.0305428504943848, + "learning_rate": 1.8752190361651263e-05, + "loss": 0.6302, + "step": 16911 + }, + { + "epoch": 2.760785274070446, + "grad_norm": 1.8630255460739136, + "learning_rate": 1.8752036854958534e-05, + "loss": 0.5643, + "step": 16912 + }, + { + "epoch": 2.7609485327129506, + "grad_norm": 1.9256080389022827, + "learning_rate": 1.875188333945251e-05, + "loss": 0.6233, + "step": 16913 + }, + { + "epoch": 2.761111791355455, + "grad_norm": 2.1069793701171875, + "learning_rate": 1.8751729815133335e-05, + "loss": 0.5763, + "step": 16914 + }, + { + "epoch": 2.761275049997959, + "grad_norm": 1.69350266456604, + "learning_rate": 1.8751576282001174e-05, + "loss": 0.5775, + "step": 16915 + }, + { + "epoch": 2.7614383086404635, + "grad_norm": 2.007568359375, + "learning_rate": 1.8751422740056177e-05, + "loss": 0.6677, + "step": 16916 + }, + { + "epoch": 2.761601567282968, + "grad_norm": 1.5601775646209717, + "learning_rate": 1.87512691892985e-05, + "loss": 0.5225, + "step": 16917 + }, + { + "epoch": 2.7617648259254723, + "grad_norm": 1.7741409540176392, + "learning_rate": 1.8751115629728296e-05, + "loss": 0.5656, + "step": 16918 + }, + { + "epoch": 2.761928084567977, + "grad_norm": 1.7115882635116577, + "learning_rate": 1.8750962061345724e-05, + "loss": 0.552, + "step": 16919 + }, + { + "epoch": 2.762091343210481, + "grad_norm": 1.8240242004394531, + "learning_rate": 1.8750808484150936e-05, + "loss": 0.514, + "step": 16920 + }, + { + "epoch": 2.7622546018529857, + "grad_norm": 1.8704540729522705, + "learning_rate": 1.875065489814408e-05, + "loss": 0.6002, + "step": 16921 + }, + { + "epoch": 2.76241786049549, + "grad_norm": 1.90280020236969, + "learning_rate": 1.8750501303325323e-05, + "loss": 0.5628, + "step": 16922 + }, + { + "epoch": 2.7625811191379945, + "grad_norm": 1.679436206817627, + "learning_rate": 1.875034769969481e-05, + "loss": 0.5531, + "step": 16923 + }, + { + "epoch": 2.762744377780499, + "grad_norm": 1.394688606262207, + "learning_rate": 1.8750194087252704e-05, + "loss": 0.4696, + "step": 16924 + }, + { + "epoch": 2.7629076364230034, + "grad_norm": 1.52779221534729, + "learning_rate": 1.8750040465999148e-05, + "loss": 0.5005, + "step": 16925 + }, + { + "epoch": 2.7630708950655074, + "grad_norm": 2.225602865219116, + "learning_rate": 1.8749886835934308e-05, + "loss": 0.6775, + "step": 16926 + }, + { + "epoch": 2.763234153708012, + "grad_norm": 1.5751527547836304, + "learning_rate": 1.8749733197058334e-05, + "loss": 0.5114, + "step": 16927 + }, + { + "epoch": 2.7633974123505163, + "grad_norm": 1.6829917430877686, + "learning_rate": 1.874957954937138e-05, + "loss": 0.6078, + "step": 16928 + }, + { + "epoch": 2.7635606709930207, + "grad_norm": 1.8106110095977783, + "learning_rate": 1.87494258928736e-05, + "loss": 0.6205, + "step": 16929 + }, + { + "epoch": 2.763723929635525, + "grad_norm": 1.9297690391540527, + "learning_rate": 1.8749272227565153e-05, + "loss": 0.6594, + "step": 16930 + }, + { + "epoch": 2.7638871882780296, + "grad_norm": 1.6832027435302734, + "learning_rate": 1.874911855344619e-05, + "loss": 0.5847, + "step": 16931 + }, + { + "epoch": 2.7640504469205336, + "grad_norm": 1.7857534885406494, + "learning_rate": 1.874896487051687e-05, + "loss": 0.5538, + "step": 16932 + }, + { + "epoch": 2.764213705563038, + "grad_norm": 1.753980040550232, + "learning_rate": 1.874881117877734e-05, + "loss": 0.4916, + "step": 16933 + }, + { + "epoch": 2.7643769642055425, + "grad_norm": 1.8775554895401, + "learning_rate": 1.874865747822776e-05, + "loss": 0.5821, + "step": 16934 + }, + { + "epoch": 2.764540222848047, + "grad_norm": 2.005479574203491, + "learning_rate": 1.8748503768868286e-05, + "loss": 0.6104, + "step": 16935 + }, + { + "epoch": 2.7647034814905513, + "grad_norm": 1.531949520111084, + "learning_rate": 1.8748350050699074e-05, + "loss": 0.5238, + "step": 16936 + }, + { + "epoch": 2.764866740133056, + "grad_norm": 1.9429813623428345, + "learning_rate": 1.8748196323720272e-05, + "loss": 0.6242, + "step": 16937 + }, + { + "epoch": 2.76502999877556, + "grad_norm": 2.273709297180176, + "learning_rate": 1.874804258793204e-05, + "loss": 0.5909, + "step": 16938 + }, + { + "epoch": 2.7651932574180647, + "grad_norm": 1.535709023475647, + "learning_rate": 1.8747888843334528e-05, + "loss": 0.4223, + "step": 16939 + }, + { + "epoch": 2.765356516060569, + "grad_norm": 1.8722307682037354, + "learning_rate": 1.8747735089927898e-05, + "loss": 0.5313, + "step": 16940 + }, + { + "epoch": 2.7655197747030735, + "grad_norm": 1.79629385471344, + "learning_rate": 1.8747581327712302e-05, + "loss": 0.5732, + "step": 16941 + }, + { + "epoch": 2.765683033345578, + "grad_norm": 1.6924244165420532, + "learning_rate": 1.8747427556687888e-05, + "loss": 0.6017, + "step": 16942 + }, + { + "epoch": 2.765846291988082, + "grad_norm": 1.3711211681365967, + "learning_rate": 1.8747273776854823e-05, + "loss": 0.4404, + "step": 16943 + }, + { + "epoch": 2.7660095506305864, + "grad_norm": 1.361436367034912, + "learning_rate": 1.8747119988213252e-05, + "loss": 0.4441, + "step": 16944 + }, + { + "epoch": 2.766172809273091, + "grad_norm": 2.1270649433135986, + "learning_rate": 1.8746966190763335e-05, + "loss": 0.6017, + "step": 16945 + }, + { + "epoch": 2.7663360679155953, + "grad_norm": 1.7463562488555908, + "learning_rate": 1.8746812384505227e-05, + "loss": 0.6078, + "step": 16946 + }, + { + "epoch": 2.7664993265580997, + "grad_norm": 1.4082316160202026, + "learning_rate": 1.874665856943908e-05, + "loss": 0.442, + "step": 16947 + }, + { + "epoch": 2.766662585200604, + "grad_norm": 1.8391592502593994, + "learning_rate": 1.874650474556505e-05, + "loss": 0.6308, + "step": 16948 + }, + { + "epoch": 2.7668258438431086, + "grad_norm": 1.8903462886810303, + "learning_rate": 1.874635091288329e-05, + "loss": 0.6096, + "step": 16949 + }, + { + "epoch": 2.7669891024856126, + "grad_norm": 1.5629572868347168, + "learning_rate": 1.874619707139396e-05, + "loss": 0.5136, + "step": 16950 + }, + { + "epoch": 2.767152361128117, + "grad_norm": 1.8885281085968018, + "learning_rate": 1.874604322109721e-05, + "loss": 0.6094, + "step": 16951 + }, + { + "epoch": 2.7673156197706215, + "grad_norm": 1.883802056312561, + "learning_rate": 1.8745889361993197e-05, + "loss": 0.6684, + "step": 16952 + }, + { + "epoch": 2.767478878413126, + "grad_norm": 1.8270710706710815, + "learning_rate": 1.874573549408208e-05, + "loss": 0.5847, + "step": 16953 + }, + { + "epoch": 2.7676421370556303, + "grad_norm": 1.7092247009277344, + "learning_rate": 1.8745581617364003e-05, + "loss": 0.5272, + "step": 16954 + }, + { + "epoch": 2.7678053956981348, + "grad_norm": 1.7039164304733276, + "learning_rate": 1.874542773183913e-05, + "loss": 0.5858, + "step": 16955 + }, + { + "epoch": 2.767968654340639, + "grad_norm": 2.011967182159424, + "learning_rate": 1.8745273837507612e-05, + "loss": 0.6049, + "step": 16956 + }, + { + "epoch": 2.7681319129831436, + "grad_norm": 1.7507271766662598, + "learning_rate": 1.874511993436961e-05, + "loss": 0.6013, + "step": 16957 + }, + { + "epoch": 2.768295171625648, + "grad_norm": 1.7641527652740479, + "learning_rate": 1.874496602242527e-05, + "loss": 0.6079, + "step": 16958 + }, + { + "epoch": 2.7684584302681525, + "grad_norm": 1.7927541732788086, + "learning_rate": 1.8744812101674755e-05, + "loss": 0.465, + "step": 16959 + }, + { + "epoch": 2.768621688910657, + "grad_norm": 1.8434326648712158, + "learning_rate": 1.8744658172118215e-05, + "loss": 0.5949, + "step": 16960 + }, + { + "epoch": 2.768784947553161, + "grad_norm": 1.7073403596878052, + "learning_rate": 1.874450423375581e-05, + "loss": 0.5581, + "step": 16961 + }, + { + "epoch": 2.7689482061956654, + "grad_norm": 2.008556842803955, + "learning_rate": 1.8744350286587685e-05, + "loss": 0.6232, + "step": 16962 + }, + { + "epoch": 2.76911146483817, + "grad_norm": 2.1378302574157715, + "learning_rate": 1.8744196330614003e-05, + "loss": 0.7641, + "step": 16963 + }, + { + "epoch": 2.7692747234806743, + "grad_norm": 1.867311716079712, + "learning_rate": 1.8744042365834923e-05, + "loss": 0.6203, + "step": 16964 + }, + { + "epoch": 2.7694379821231787, + "grad_norm": 1.5785242319107056, + "learning_rate": 1.874388839225059e-05, + "loss": 0.556, + "step": 16965 + }, + { + "epoch": 2.769601240765683, + "grad_norm": 1.8768928050994873, + "learning_rate": 1.8743734409861165e-05, + "loss": 0.6068, + "step": 16966 + }, + { + "epoch": 2.7697644994081876, + "grad_norm": 1.7226799726486206, + "learning_rate": 1.87435804186668e-05, + "loss": 0.5626, + "step": 16967 + }, + { + "epoch": 2.7699277580506916, + "grad_norm": 1.5295146703720093, + "learning_rate": 1.8743426418667652e-05, + "loss": 0.5435, + "step": 16968 + }, + { + "epoch": 2.770091016693196, + "grad_norm": 1.7757264375686646, + "learning_rate": 1.874327240986388e-05, + "loss": 0.5592, + "step": 16969 + }, + { + "epoch": 2.7702542753357005, + "grad_norm": 1.5258785486221313, + "learning_rate": 1.874311839225563e-05, + "loss": 0.5153, + "step": 16970 + }, + { + "epoch": 2.770417533978205, + "grad_norm": 1.4512251615524292, + "learning_rate": 1.8742964365843066e-05, + "loss": 0.4998, + "step": 16971 + }, + { + "epoch": 2.7705807926207093, + "grad_norm": 1.7135977745056152, + "learning_rate": 1.8742810330626338e-05, + "loss": 0.6472, + "step": 16972 + }, + { + "epoch": 2.7707440512632138, + "grad_norm": 1.8380926847457886, + "learning_rate": 1.87426562866056e-05, + "loss": 0.6191, + "step": 16973 + }, + { + "epoch": 2.770907309905718, + "grad_norm": 1.5992674827575684, + "learning_rate": 1.8742502233781013e-05, + "loss": 0.5958, + "step": 16974 + }, + { + "epoch": 2.7710705685482226, + "grad_norm": 2.077838659286499, + "learning_rate": 1.8742348172152728e-05, + "loss": 0.5966, + "step": 16975 + }, + { + "epoch": 2.771233827190727, + "grad_norm": 1.5644733905792236, + "learning_rate": 1.87421941017209e-05, + "loss": 0.5105, + "step": 16976 + }, + { + "epoch": 2.7713970858332315, + "grad_norm": 1.7718591690063477, + "learning_rate": 1.8742040022485683e-05, + "loss": 0.5661, + "step": 16977 + }, + { + "epoch": 2.771560344475736, + "grad_norm": 2.096832513809204, + "learning_rate": 1.8741885934447235e-05, + "loss": 0.7577, + "step": 16978 + }, + { + "epoch": 2.77172360311824, + "grad_norm": 1.4819700717926025, + "learning_rate": 1.8741731837605713e-05, + "loss": 0.5233, + "step": 16979 + }, + { + "epoch": 2.7718868617607444, + "grad_norm": 1.7022075653076172, + "learning_rate": 1.8741577731961266e-05, + "loss": 0.5784, + "step": 16980 + }, + { + "epoch": 2.772050120403249, + "grad_norm": 1.6016932725906372, + "learning_rate": 1.8741423617514055e-05, + "loss": 0.5713, + "step": 16981 + }, + { + "epoch": 2.7722133790457533, + "grad_norm": 1.6289968490600586, + "learning_rate": 1.8741269494264228e-05, + "loss": 0.5279, + "step": 16982 + }, + { + "epoch": 2.7723766376882577, + "grad_norm": 1.9414409399032593, + "learning_rate": 1.874111536221195e-05, + "loss": 0.6329, + "step": 16983 + }, + { + "epoch": 2.772539896330762, + "grad_norm": 1.9337849617004395, + "learning_rate": 1.874096122135737e-05, + "loss": 0.6237, + "step": 16984 + }, + { + "epoch": 2.772703154973266, + "grad_norm": 1.6559940576553345, + "learning_rate": 1.8740807071700642e-05, + "loss": 0.6515, + "step": 16985 + }, + { + "epoch": 2.7728664136157706, + "grad_norm": 2.0726354122161865, + "learning_rate": 1.874065291324193e-05, + "loss": 0.7788, + "step": 16986 + }, + { + "epoch": 2.773029672258275, + "grad_norm": 1.705725073814392, + "learning_rate": 1.8740498745981374e-05, + "loss": 0.4955, + "step": 16987 + }, + { + "epoch": 2.7731929309007795, + "grad_norm": 1.7953778505325317, + "learning_rate": 1.8740344569919144e-05, + "loss": 0.5691, + "step": 16988 + }, + { + "epoch": 2.773356189543284, + "grad_norm": 1.7822595834732056, + "learning_rate": 1.8740190385055384e-05, + "loss": 0.628, + "step": 16989 + }, + { + "epoch": 2.7735194481857883, + "grad_norm": 1.4764539003372192, + "learning_rate": 1.874003619139026e-05, + "loss": 0.5521, + "step": 16990 + }, + { + "epoch": 2.7736827068282928, + "grad_norm": 1.725106120109558, + "learning_rate": 1.8739881988923923e-05, + "loss": 0.6145, + "step": 16991 + }, + { + "epoch": 2.773845965470797, + "grad_norm": 1.8168293237686157, + "learning_rate": 1.873972777765652e-05, + "loss": 0.5614, + "step": 16992 + }, + { + "epoch": 2.7740092241133016, + "grad_norm": 1.907143473625183, + "learning_rate": 1.873957355758822e-05, + "loss": 0.5186, + "step": 16993 + }, + { + "epoch": 2.774172482755806, + "grad_norm": 1.8508812189102173, + "learning_rate": 1.873941932871917e-05, + "loss": 0.5601, + "step": 16994 + }, + { + "epoch": 2.7743357413983105, + "grad_norm": 1.9566173553466797, + "learning_rate": 1.8739265091049526e-05, + "loss": 0.5614, + "step": 16995 + }, + { + "epoch": 2.7744990000408145, + "grad_norm": 1.7837294340133667, + "learning_rate": 1.8739110844579444e-05, + "loss": 0.6121, + "step": 16996 + }, + { + "epoch": 2.774662258683319, + "grad_norm": 1.5795108079910278, + "learning_rate": 1.8738956589309083e-05, + "loss": 0.5244, + "step": 16997 + }, + { + "epoch": 2.7748255173258234, + "grad_norm": 1.720319390296936, + "learning_rate": 1.873880232523859e-05, + "loss": 0.5599, + "step": 16998 + }, + { + "epoch": 2.774988775968328, + "grad_norm": 1.6609724760055542, + "learning_rate": 1.873864805236813e-05, + "loss": 0.5471, + "step": 16999 + }, + { + "epoch": 2.7751520346108323, + "grad_norm": 1.7408483028411865, + "learning_rate": 1.873849377069785e-05, + "loss": 0.4542, + "step": 17000 + }, + { + "epoch": 2.7753152932533367, + "grad_norm": 1.9549657106399536, + "learning_rate": 1.8738339480227914e-05, + "loss": 0.5408, + "step": 17001 + }, + { + "epoch": 2.775478551895841, + "grad_norm": 2.4929585456848145, + "learning_rate": 1.8738185180958467e-05, + "loss": 0.6834, + "step": 17002 + }, + { + "epoch": 2.775641810538345, + "grad_norm": 1.3876250982284546, + "learning_rate": 1.8738030872889673e-05, + "loss": 0.4875, + "step": 17003 + }, + { + "epoch": 2.7758050691808496, + "grad_norm": 1.7589631080627441, + "learning_rate": 1.8737876556021683e-05, + "loss": 0.6145, + "step": 17004 + }, + { + "epoch": 2.775968327823354, + "grad_norm": 1.6112018823623657, + "learning_rate": 1.8737722230354654e-05, + "loss": 0.5336, + "step": 17005 + }, + { + "epoch": 2.7761315864658584, + "grad_norm": 2.0593907833099365, + "learning_rate": 1.8737567895888742e-05, + "loss": 0.6309, + "step": 17006 + }, + { + "epoch": 2.776294845108363, + "grad_norm": 2.0985512733459473, + "learning_rate": 1.87374135526241e-05, + "loss": 0.7184, + "step": 17007 + }, + { + "epoch": 2.7764581037508673, + "grad_norm": 1.8007304668426514, + "learning_rate": 1.8737259200560884e-05, + "loss": 0.5451, + "step": 17008 + }, + { + "epoch": 2.7766213623933718, + "grad_norm": 1.6653670072555542, + "learning_rate": 1.8737104839699253e-05, + "loss": 0.5857, + "step": 17009 + }, + { + "epoch": 2.776784621035876, + "grad_norm": 1.586082100868225, + "learning_rate": 1.8736950470039355e-05, + "loss": 0.5328, + "step": 17010 + }, + { + "epoch": 2.7769478796783806, + "grad_norm": 1.762214183807373, + "learning_rate": 1.8736796091581357e-05, + "loss": 0.5489, + "step": 17011 + }, + { + "epoch": 2.777111138320885, + "grad_norm": 1.9369316101074219, + "learning_rate": 1.8736641704325404e-05, + "loss": 0.6414, + "step": 17012 + }, + { + "epoch": 2.7772743969633895, + "grad_norm": 2.3751633167266846, + "learning_rate": 1.8736487308271655e-05, + "loss": 0.7247, + "step": 17013 + }, + { + "epoch": 2.7774376556058935, + "grad_norm": 1.6939313411712646, + "learning_rate": 1.8736332903420266e-05, + "loss": 0.5603, + "step": 17014 + }, + { + "epoch": 2.777600914248398, + "grad_norm": 1.9070448875427246, + "learning_rate": 1.8736178489771393e-05, + "loss": 0.5707, + "step": 17015 + }, + { + "epoch": 2.7777641728909024, + "grad_norm": 1.7823611497879028, + "learning_rate": 1.8736024067325188e-05, + "loss": 0.5066, + "step": 17016 + }, + { + "epoch": 2.777927431533407, + "grad_norm": 1.689481496810913, + "learning_rate": 1.873586963608181e-05, + "loss": 0.5182, + "step": 17017 + }, + { + "epoch": 2.7780906901759113, + "grad_norm": 1.8869339227676392, + "learning_rate": 1.8735715196041414e-05, + "loss": 0.6062, + "step": 17018 + }, + { + "epoch": 2.7782539488184157, + "grad_norm": 1.6360747814178467, + "learning_rate": 1.8735560747204156e-05, + "loss": 0.4913, + "step": 17019 + }, + { + "epoch": 2.7784172074609197, + "grad_norm": 1.7881152629852295, + "learning_rate": 1.8735406289570193e-05, + "loss": 0.6256, + "step": 17020 + }, + { + "epoch": 2.778580466103424, + "grad_norm": 1.7792365550994873, + "learning_rate": 1.8735251823139672e-05, + "loss": 0.5059, + "step": 17021 + }, + { + "epoch": 2.7787437247459286, + "grad_norm": 2.0360183715820312, + "learning_rate": 1.8735097347912757e-05, + "loss": 0.703, + "step": 17022 + }, + { + "epoch": 2.778906983388433, + "grad_norm": 1.7216336727142334, + "learning_rate": 1.8734942863889605e-05, + "loss": 0.5431, + "step": 17023 + }, + { + "epoch": 2.7790702420309374, + "grad_norm": 1.6142094135284424, + "learning_rate": 1.8734788371070365e-05, + "loss": 0.5498, + "step": 17024 + }, + { + "epoch": 2.779233500673442, + "grad_norm": 1.9985442161560059, + "learning_rate": 1.8734633869455196e-05, + "loss": 0.6746, + "step": 17025 + }, + { + "epoch": 2.7793967593159463, + "grad_norm": 1.6376078128814697, + "learning_rate": 1.8734479359044253e-05, + "loss": 0.5469, + "step": 17026 + }, + { + "epoch": 2.7795600179584508, + "grad_norm": 1.4445065259933472, + "learning_rate": 1.873432483983769e-05, + "loss": 0.4502, + "step": 17027 + }, + { + "epoch": 2.779723276600955, + "grad_norm": 1.7250950336456299, + "learning_rate": 1.8734170311835663e-05, + "loss": 0.5663, + "step": 17028 + }, + { + "epoch": 2.7798865352434596, + "grad_norm": 1.860705018043518, + "learning_rate": 1.8734015775038333e-05, + "loss": 0.577, + "step": 17029 + }, + { + "epoch": 2.780049793885964, + "grad_norm": 2.1266579627990723, + "learning_rate": 1.873386122944585e-05, + "loss": 0.5481, + "step": 17030 + }, + { + "epoch": 2.780213052528468, + "grad_norm": 1.8674778938293457, + "learning_rate": 1.873370667505837e-05, + "loss": 0.5041, + "step": 17031 + }, + { + "epoch": 2.7803763111709725, + "grad_norm": 1.5213466882705688, + "learning_rate": 1.873355211187605e-05, + "loss": 0.5444, + "step": 17032 + }, + { + "epoch": 2.780539569813477, + "grad_norm": 1.6364349126815796, + "learning_rate": 1.8733397539899046e-05, + "loss": 0.5316, + "step": 17033 + }, + { + "epoch": 2.7807028284559814, + "grad_norm": 1.8161762952804565, + "learning_rate": 1.8733242959127512e-05, + "loss": 0.6229, + "step": 17034 + }, + { + "epoch": 2.780866087098486, + "grad_norm": 1.9668946266174316, + "learning_rate": 1.8733088369561606e-05, + "loss": 0.489, + "step": 17035 + }, + { + "epoch": 2.7810293457409903, + "grad_norm": 1.5849436521530151, + "learning_rate": 1.873293377120148e-05, + "loss": 0.4852, + "step": 17036 + }, + { + "epoch": 2.7811926043834947, + "grad_norm": 2.0606863498687744, + "learning_rate": 1.8732779164047294e-05, + "loss": 0.5733, + "step": 17037 + }, + { + "epoch": 2.7813558630259987, + "grad_norm": 1.7260112762451172, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.563, + "step": 17038 + }, + { + "epoch": 2.781519121668503, + "grad_norm": 1.7873455286026, + "learning_rate": 1.873246992335736e-05, + "loss": 0.5471, + "step": 17039 + }, + { + "epoch": 2.7816823803110076, + "grad_norm": 1.614669919013977, + "learning_rate": 1.873231528982192e-05, + "loss": 0.4987, + "step": 17040 + }, + { + "epoch": 2.781845638953512, + "grad_norm": 1.6999400854110718, + "learning_rate": 1.8732160647493046e-05, + "loss": 0.5705, + "step": 17041 + }, + { + "epoch": 2.7820088975960164, + "grad_norm": 1.3909984827041626, + "learning_rate": 1.8732005996370883e-05, + "loss": 0.4141, + "step": 17042 + }, + { + "epoch": 2.782172156238521, + "grad_norm": 1.9530187845230103, + "learning_rate": 1.8731851336455597e-05, + "loss": 0.6404, + "step": 17043 + }, + { + "epoch": 2.7823354148810253, + "grad_norm": 1.9728834629058838, + "learning_rate": 1.8731696667747336e-05, + "loss": 0.5424, + "step": 17044 + }, + { + "epoch": 2.7824986735235298, + "grad_norm": 2.137962579727173, + "learning_rate": 1.8731541990246256e-05, + "loss": 0.701, + "step": 17045 + }, + { + "epoch": 2.782661932166034, + "grad_norm": 1.5402780771255493, + "learning_rate": 1.873138730395252e-05, + "loss": 0.4745, + "step": 17046 + }, + { + "epoch": 2.7828251908085386, + "grad_norm": 1.7344380617141724, + "learning_rate": 1.873123260886628e-05, + "loss": 0.5688, + "step": 17047 + }, + { + "epoch": 2.782988449451043, + "grad_norm": 2.059652328491211, + "learning_rate": 1.8731077904987688e-05, + "loss": 0.6012, + "step": 17048 + }, + { + "epoch": 2.783151708093547, + "grad_norm": 2.09590744972229, + "learning_rate": 1.8730923192316903e-05, + "loss": 0.6186, + "step": 17049 + }, + { + "epoch": 2.7833149667360515, + "grad_norm": 1.9133718013763428, + "learning_rate": 1.8730768470854085e-05, + "loss": 0.5351, + "step": 17050 + }, + { + "epoch": 2.783478225378556, + "grad_norm": 1.7149291038513184, + "learning_rate": 1.873061374059938e-05, + "loss": 0.5734, + "step": 17051 + }, + { + "epoch": 2.7836414840210604, + "grad_norm": 2.2559893131256104, + "learning_rate": 1.873045900155295e-05, + "loss": 0.7068, + "step": 17052 + }, + { + "epoch": 2.783804742663565, + "grad_norm": 1.9016553163528442, + "learning_rate": 1.8730304253714954e-05, + "loss": 0.6271, + "step": 17053 + }, + { + "epoch": 2.7839680013060693, + "grad_norm": 1.6053084135055542, + "learning_rate": 1.873014949708554e-05, + "loss": 0.5697, + "step": 17054 + }, + { + "epoch": 2.7841312599485737, + "grad_norm": 1.8142805099487305, + "learning_rate": 1.872999473166487e-05, + "loss": 0.532, + "step": 17055 + }, + { + "epoch": 2.7842945185910777, + "grad_norm": 1.6667978763580322, + "learning_rate": 1.87298399574531e-05, + "loss": 0.574, + "step": 17056 + }, + { + "epoch": 2.784457777233582, + "grad_norm": 1.54232656955719, + "learning_rate": 1.872968517445038e-05, + "loss": 0.5214, + "step": 17057 + }, + { + "epoch": 2.7846210358760866, + "grad_norm": 1.6067293882369995, + "learning_rate": 1.8729530382656868e-05, + "loss": 0.5584, + "step": 17058 + }, + { + "epoch": 2.784784294518591, + "grad_norm": 1.6169523000717163, + "learning_rate": 1.8729375582072722e-05, + "loss": 0.5651, + "step": 17059 + }, + { + "epoch": 2.7849475531610954, + "grad_norm": 1.9359793663024902, + "learning_rate": 1.8729220772698096e-05, + "loss": 0.4096, + "step": 17060 + }, + { + "epoch": 2.7851108118036, + "grad_norm": 1.6039377450942993, + "learning_rate": 1.8729065954533152e-05, + "loss": 0.502, + "step": 17061 + }, + { + "epoch": 2.7852740704461043, + "grad_norm": 1.6492377519607544, + "learning_rate": 1.872891112757804e-05, + "loss": 0.5062, + "step": 17062 + }, + { + "epoch": 2.7854373290886087, + "grad_norm": 1.6289323568344116, + "learning_rate": 1.8728756291832912e-05, + "loss": 0.5624, + "step": 17063 + }, + { + "epoch": 2.785600587731113, + "grad_norm": 1.7960495948791504, + "learning_rate": 1.872860144729793e-05, + "loss": 0.6016, + "step": 17064 + }, + { + "epoch": 2.7857638463736176, + "grad_norm": 1.5653235912322998, + "learning_rate": 1.872844659397325e-05, + "loss": 0.4977, + "step": 17065 + }, + { + "epoch": 2.785927105016122, + "grad_norm": 1.8588969707489014, + "learning_rate": 1.8728291731859028e-05, + "loss": 0.6093, + "step": 17066 + }, + { + "epoch": 2.786090363658626, + "grad_norm": 1.6995714902877808, + "learning_rate": 1.8728136860955417e-05, + "loss": 0.5664, + "step": 17067 + }, + { + "epoch": 2.7862536223011305, + "grad_norm": 2.384856700897217, + "learning_rate": 1.8727981981262576e-05, + "loss": 0.9222, + "step": 17068 + }, + { + "epoch": 2.786416880943635, + "grad_norm": 2.0632901191711426, + "learning_rate": 1.8727827092780656e-05, + "loss": 0.5646, + "step": 17069 + }, + { + "epoch": 2.7865801395861394, + "grad_norm": 1.7999776601791382, + "learning_rate": 1.872767219550982e-05, + "loss": 0.5127, + "step": 17070 + }, + { + "epoch": 2.786743398228644, + "grad_norm": 1.620798110961914, + "learning_rate": 1.872751728945022e-05, + "loss": 0.6278, + "step": 17071 + }, + { + "epoch": 2.7869066568711482, + "grad_norm": 1.822139024734497, + "learning_rate": 1.872736237460201e-05, + "loss": 0.5738, + "step": 17072 + }, + { + "epoch": 2.7870699155136522, + "grad_norm": 1.8186088800430298, + "learning_rate": 1.872720745096535e-05, + "loss": 0.5764, + "step": 17073 + }, + { + "epoch": 2.7872331741561567, + "grad_norm": 1.8539931774139404, + "learning_rate": 1.8727052518540395e-05, + "loss": 0.6174, + "step": 17074 + }, + { + "epoch": 2.787396432798661, + "grad_norm": 1.7007077932357788, + "learning_rate": 1.87268975773273e-05, + "loss": 0.5026, + "step": 17075 + }, + { + "epoch": 2.7875596914411656, + "grad_norm": 1.686411738395691, + "learning_rate": 1.872674262732622e-05, + "loss": 0.5571, + "step": 17076 + }, + { + "epoch": 2.78772295008367, + "grad_norm": 1.556025505065918, + "learning_rate": 1.8726587668537314e-05, + "loss": 0.5077, + "step": 17077 + }, + { + "epoch": 2.7878862087261744, + "grad_norm": 1.6237934827804565, + "learning_rate": 1.8726432700960738e-05, + "loss": 0.5509, + "step": 17078 + }, + { + "epoch": 2.788049467368679, + "grad_norm": 1.8861494064331055, + "learning_rate": 1.8726277724596643e-05, + "loss": 0.5755, + "step": 17079 + }, + { + "epoch": 2.7882127260111833, + "grad_norm": 1.9063094854354858, + "learning_rate": 1.872612273944519e-05, + "loss": 0.5177, + "step": 17080 + }, + { + "epoch": 2.7883759846536877, + "grad_norm": 1.7446725368499756, + "learning_rate": 1.8725967745506538e-05, + "loss": 0.5418, + "step": 17081 + }, + { + "epoch": 2.788539243296192, + "grad_norm": 2.0573136806488037, + "learning_rate": 1.8725812742780832e-05, + "loss": 0.6485, + "step": 17082 + }, + { + "epoch": 2.7887025019386966, + "grad_norm": 1.8344477415084839, + "learning_rate": 1.8725657731268238e-05, + "loss": 0.4873, + "step": 17083 + }, + { + "epoch": 2.7888657605812006, + "grad_norm": 1.7149672508239746, + "learning_rate": 1.872550271096891e-05, + "loss": 0.5654, + "step": 17084 + }, + { + "epoch": 2.789029019223705, + "grad_norm": 1.780392050743103, + "learning_rate": 1.8725347681883e-05, + "loss": 0.5549, + "step": 17085 + }, + { + "epoch": 2.7891922778662095, + "grad_norm": 1.728710412979126, + "learning_rate": 1.872519264401067e-05, + "loss": 0.5184, + "step": 17086 + }, + { + "epoch": 2.789355536508714, + "grad_norm": 1.779109239578247, + "learning_rate": 1.8725037597352075e-05, + "loss": 0.5754, + "step": 17087 + }, + { + "epoch": 2.7895187951512184, + "grad_norm": 1.7444349527359009, + "learning_rate": 1.8724882541907365e-05, + "loss": 0.6128, + "step": 17088 + }, + { + "epoch": 2.789682053793723, + "grad_norm": 1.7429448366165161, + "learning_rate": 1.8724727477676703e-05, + "loss": 0.5194, + "step": 17089 + }, + { + "epoch": 2.7898453124362272, + "grad_norm": 1.955499291419983, + "learning_rate": 1.8724572404660243e-05, + "loss": 0.5169, + "step": 17090 + }, + { + "epoch": 2.7900085710787312, + "grad_norm": 1.8044217824935913, + "learning_rate": 1.8724417322858137e-05, + "loss": 0.6319, + "step": 17091 + }, + { + "epoch": 2.7901718297212357, + "grad_norm": 2.0699164867401123, + "learning_rate": 1.872426223227055e-05, + "loss": 0.5836, + "step": 17092 + }, + { + "epoch": 2.79033508836374, + "grad_norm": 1.8543236255645752, + "learning_rate": 1.872410713289763e-05, + "loss": 0.6447, + "step": 17093 + }, + { + "epoch": 2.7904983470062446, + "grad_norm": 1.9564565420150757, + "learning_rate": 1.8723952024739536e-05, + "loss": 0.5402, + "step": 17094 + }, + { + "epoch": 2.790661605648749, + "grad_norm": 1.7094992399215698, + "learning_rate": 1.872379690779643e-05, + "loss": 0.6222, + "step": 17095 + }, + { + "epoch": 2.7908248642912534, + "grad_norm": 1.8307757377624512, + "learning_rate": 1.872364178206846e-05, + "loss": 0.5281, + "step": 17096 + }, + { + "epoch": 2.790988122933758, + "grad_norm": 1.814497709274292, + "learning_rate": 1.8723486647555783e-05, + "loss": 0.6575, + "step": 17097 + }, + { + "epoch": 2.7911513815762623, + "grad_norm": 2.0067806243896484, + "learning_rate": 1.8723331504258557e-05, + "loss": 0.6343, + "step": 17098 + }, + { + "epoch": 2.7913146402187667, + "grad_norm": 2.057681083679199, + "learning_rate": 1.8723176352176945e-05, + "loss": 0.5795, + "step": 17099 + }, + { + "epoch": 2.791477898861271, + "grad_norm": 1.8357090950012207, + "learning_rate": 1.872302119131109e-05, + "loss": 0.6084, + "step": 17100 + }, + { + "epoch": 2.7916411575037756, + "grad_norm": 1.8608717918395996, + "learning_rate": 1.8722866021661157e-05, + "loss": 0.5463, + "step": 17101 + }, + { + "epoch": 2.7918044161462796, + "grad_norm": 2.0890164375305176, + "learning_rate": 1.87227108432273e-05, + "loss": 0.7032, + "step": 17102 + }, + { + "epoch": 2.791967674788784, + "grad_norm": 1.7082842588424683, + "learning_rate": 1.8722555656009677e-05, + "loss": 0.5671, + "step": 17103 + }, + { + "epoch": 2.7921309334312885, + "grad_norm": 2.0605380535125732, + "learning_rate": 1.8722400460008437e-05, + "loss": 0.7509, + "step": 17104 + }, + { + "epoch": 2.792294192073793, + "grad_norm": 1.736609697341919, + "learning_rate": 1.8722245255223748e-05, + "loss": 0.5872, + "step": 17105 + }, + { + "epoch": 2.7924574507162974, + "grad_norm": 1.7840811014175415, + "learning_rate": 1.872209004165576e-05, + "loss": 0.6096, + "step": 17106 + }, + { + "epoch": 2.792620709358802, + "grad_norm": 2.0116987228393555, + "learning_rate": 1.8721934819304627e-05, + "loss": 0.7399, + "step": 17107 + }, + { + "epoch": 2.792783968001306, + "grad_norm": 1.9538835287094116, + "learning_rate": 1.872177958817051e-05, + "loss": 0.6343, + "step": 17108 + }, + { + "epoch": 2.7929472266438102, + "grad_norm": 1.5346919298171997, + "learning_rate": 1.872162434825356e-05, + "loss": 0.4795, + "step": 17109 + }, + { + "epoch": 2.7931104852863147, + "grad_norm": 1.6239686012268066, + "learning_rate": 1.8721469099553943e-05, + "loss": 0.4918, + "step": 17110 + }, + { + "epoch": 2.793273743928819, + "grad_norm": 1.5280585289001465, + "learning_rate": 1.8721313842071803e-05, + "loss": 0.574, + "step": 17111 + }, + { + "epoch": 2.7934370025713235, + "grad_norm": 1.464654803276062, + "learning_rate": 1.8721158575807307e-05, + "loss": 0.5057, + "step": 17112 + }, + { + "epoch": 2.793600261213828, + "grad_norm": 1.8277825117111206, + "learning_rate": 1.87210033007606e-05, + "loss": 0.5502, + "step": 17113 + }, + { + "epoch": 2.7937635198563324, + "grad_norm": 1.8506476879119873, + "learning_rate": 1.872084801693185e-05, + "loss": 0.618, + "step": 17114 + }, + { + "epoch": 2.793926778498837, + "grad_norm": 1.645798921585083, + "learning_rate": 1.8720692724321207e-05, + "loss": 0.6144, + "step": 17115 + }, + { + "epoch": 2.7940900371413413, + "grad_norm": 1.7858378887176514, + "learning_rate": 1.8720537422928832e-05, + "loss": 0.6424, + "step": 17116 + }, + { + "epoch": 2.7942532957838457, + "grad_norm": 1.481689453125, + "learning_rate": 1.8720382112754873e-05, + "loss": 0.4384, + "step": 17117 + }, + { + "epoch": 2.79441655442635, + "grad_norm": 1.6285743713378906, + "learning_rate": 1.8720226793799493e-05, + "loss": 0.4709, + "step": 17118 + }, + { + "epoch": 2.794579813068854, + "grad_norm": 1.6571630239486694, + "learning_rate": 1.8720071466062852e-05, + "loss": 0.5534, + "step": 17119 + }, + { + "epoch": 2.7947430717113586, + "grad_norm": 1.4141291379928589, + "learning_rate": 1.8719916129545094e-05, + "loss": 0.447, + "step": 17120 + }, + { + "epoch": 2.794906330353863, + "grad_norm": 1.3394076824188232, + "learning_rate": 1.8719760784246387e-05, + "loss": 0.4554, + "step": 17121 + }, + { + "epoch": 2.7950695889963675, + "grad_norm": 1.8764984607696533, + "learning_rate": 1.8719605430166878e-05, + "loss": 0.6091, + "step": 17122 + }, + { + "epoch": 2.795232847638872, + "grad_norm": 2.231739044189453, + "learning_rate": 1.8719450067306735e-05, + "loss": 0.7727, + "step": 17123 + }, + { + "epoch": 2.7953961062813764, + "grad_norm": 1.5209869146347046, + "learning_rate": 1.8719294695666105e-05, + "loss": 0.5474, + "step": 17124 + }, + { + "epoch": 2.795559364923881, + "grad_norm": 1.4742927551269531, + "learning_rate": 1.8719139315245146e-05, + "loss": 0.5118, + "step": 17125 + }, + { + "epoch": 2.795722623566385, + "grad_norm": 1.7178586721420288, + "learning_rate": 1.871898392604402e-05, + "loss": 0.4972, + "step": 17126 + }, + { + "epoch": 2.7958858822088892, + "grad_norm": 1.903619647026062, + "learning_rate": 1.8718828528062878e-05, + "loss": 0.5818, + "step": 17127 + }, + { + "epoch": 2.7960491408513937, + "grad_norm": 1.9767827987670898, + "learning_rate": 1.871867312130188e-05, + "loss": 0.6215, + "step": 17128 + }, + { + "epoch": 2.796212399493898, + "grad_norm": 1.5967093706130981, + "learning_rate": 1.8718517705761177e-05, + "loss": 0.5628, + "step": 17129 + }, + { + "epoch": 2.7963756581364025, + "grad_norm": 1.785354733467102, + "learning_rate": 1.871836228144093e-05, + "loss": 0.5652, + "step": 17130 + }, + { + "epoch": 2.796538916778907, + "grad_norm": 1.7153159379959106, + "learning_rate": 1.8718206848341298e-05, + "loss": 0.5984, + "step": 17131 + }, + { + "epoch": 2.7967021754214114, + "grad_norm": 1.9811500310897827, + "learning_rate": 1.8718051406462426e-05, + "loss": 0.5033, + "step": 17132 + }, + { + "epoch": 2.796865434063916, + "grad_norm": 1.675065279006958, + "learning_rate": 1.8717895955804482e-05, + "loss": 0.555, + "step": 17133 + }, + { + "epoch": 2.7970286927064203, + "grad_norm": 1.811787486076355, + "learning_rate": 1.8717740496367624e-05, + "loss": 0.5971, + "step": 17134 + }, + { + "epoch": 2.7971919513489247, + "grad_norm": 1.789440631866455, + "learning_rate": 1.8717585028152e-05, + "loss": 0.5585, + "step": 17135 + }, + { + "epoch": 2.797355209991429, + "grad_norm": 1.4940167665481567, + "learning_rate": 1.871742955115777e-05, + "loss": 0.4836, + "step": 17136 + }, + { + "epoch": 2.797518468633933, + "grad_norm": 1.6896947622299194, + "learning_rate": 1.8717274065385092e-05, + "loss": 0.5759, + "step": 17137 + }, + { + "epoch": 2.7976817272764376, + "grad_norm": 1.5029852390289307, + "learning_rate": 1.8717118570834118e-05, + "loss": 0.5062, + "step": 17138 + }, + { + "epoch": 2.797844985918942, + "grad_norm": 1.517225980758667, + "learning_rate": 1.8716963067505012e-05, + "loss": 0.4995, + "step": 17139 + }, + { + "epoch": 2.7980082445614465, + "grad_norm": 1.822387456893921, + "learning_rate": 1.8716807555397924e-05, + "loss": 0.7399, + "step": 17140 + }, + { + "epoch": 2.798171503203951, + "grad_norm": 1.838546872138977, + "learning_rate": 1.8716652034513013e-05, + "loss": 0.5645, + "step": 17141 + }, + { + "epoch": 2.7983347618464554, + "grad_norm": 1.6739639043807983, + "learning_rate": 1.8716496504850436e-05, + "loss": 0.5677, + "step": 17142 + }, + { + "epoch": 2.79849802048896, + "grad_norm": 1.655452013015747, + "learning_rate": 1.871634096641035e-05, + "loss": 0.5197, + "step": 17143 + }, + { + "epoch": 2.798661279131464, + "grad_norm": 2.1170830726623535, + "learning_rate": 1.871618541919291e-05, + "loss": 0.7637, + "step": 17144 + }, + { + "epoch": 2.7988245377739682, + "grad_norm": 1.7444794178009033, + "learning_rate": 1.8716029863198274e-05, + "loss": 0.5315, + "step": 17145 + }, + { + "epoch": 2.7989877964164727, + "grad_norm": 1.647229552268982, + "learning_rate": 1.87158742984266e-05, + "loss": 0.455, + "step": 17146 + }, + { + "epoch": 2.799151055058977, + "grad_norm": 1.9116860628128052, + "learning_rate": 1.8715718724878044e-05, + "loss": 0.674, + "step": 17147 + }, + { + "epoch": 2.7993143137014815, + "grad_norm": 1.8898736238479614, + "learning_rate": 1.8715563142552758e-05, + "loss": 0.5633, + "step": 17148 + }, + { + "epoch": 2.799477572343986, + "grad_norm": 1.8704588413238525, + "learning_rate": 1.8715407551450903e-05, + "loss": 0.6057, + "step": 17149 + }, + { + "epoch": 2.7996408309864904, + "grad_norm": 1.6999191045761108, + "learning_rate": 1.8715251951572635e-05, + "loss": 0.5372, + "step": 17150 + }, + { + "epoch": 2.799804089628995, + "grad_norm": 1.6348469257354736, + "learning_rate": 1.871509634291811e-05, + "loss": 0.5297, + "step": 17151 + }, + { + "epoch": 2.7999673482714993, + "grad_norm": 1.91736900806427, + "learning_rate": 1.8714940725487486e-05, + "loss": 0.5542, + "step": 17152 + }, + { + "epoch": 2.8001306069140037, + "grad_norm": 1.776286244392395, + "learning_rate": 1.871478509928092e-05, + "loss": 0.4571, + "step": 17153 + }, + { + "epoch": 2.800293865556508, + "grad_norm": 2.1735870838165283, + "learning_rate": 1.8714629464298567e-05, + "loss": 0.7549, + "step": 17154 + }, + { + "epoch": 2.800457124199012, + "grad_norm": 1.7920817136764526, + "learning_rate": 1.8714473820540584e-05, + "loss": 0.5412, + "step": 17155 + }, + { + "epoch": 2.8006203828415166, + "grad_norm": 1.8092846870422363, + "learning_rate": 1.871431816800713e-05, + "loss": 0.5587, + "step": 17156 + }, + { + "epoch": 2.800783641484021, + "grad_norm": 1.7333770990371704, + "learning_rate": 1.8714162506698355e-05, + "loss": 0.5478, + "step": 17157 + }, + { + "epoch": 2.8009469001265255, + "grad_norm": 2.304982900619507, + "learning_rate": 1.8714006836614426e-05, + "loss": 0.7371, + "step": 17158 + }, + { + "epoch": 2.80111015876903, + "grad_norm": 1.5867565870285034, + "learning_rate": 1.8713851157755495e-05, + "loss": 0.5156, + "step": 17159 + }, + { + "epoch": 2.8012734174115343, + "grad_norm": 1.616225004196167, + "learning_rate": 1.8713695470121714e-05, + "loss": 0.3999, + "step": 17160 + }, + { + "epoch": 2.8014366760540383, + "grad_norm": 2.1913020610809326, + "learning_rate": 1.8713539773713246e-05, + "loss": 0.6375, + "step": 17161 + }, + { + "epoch": 2.801599934696543, + "grad_norm": 1.5897860527038574, + "learning_rate": 1.8713384068530243e-05, + "loss": 0.4895, + "step": 17162 + }, + { + "epoch": 2.801763193339047, + "grad_norm": 1.4825299978256226, + "learning_rate": 1.8713228354572866e-05, + "loss": 0.4742, + "step": 17163 + }, + { + "epoch": 2.8019264519815517, + "grad_norm": 1.9240847826004028, + "learning_rate": 1.8713072631841275e-05, + "loss": 0.7003, + "step": 17164 + }, + { + "epoch": 2.802089710624056, + "grad_norm": 1.6458756923675537, + "learning_rate": 1.8712916900335618e-05, + "loss": 0.4961, + "step": 17165 + }, + { + "epoch": 2.8022529692665605, + "grad_norm": 1.6858245134353638, + "learning_rate": 1.8712761160056055e-05, + "loss": 0.5507, + "step": 17166 + }, + { + "epoch": 2.802416227909065, + "grad_norm": 1.6827044486999512, + "learning_rate": 1.8712605411002746e-05, + "loss": 0.5317, + "step": 17167 + }, + { + "epoch": 2.8025794865515694, + "grad_norm": 1.9226188659667969, + "learning_rate": 1.8712449653175845e-05, + "loss": 0.5778, + "step": 17168 + }, + { + "epoch": 2.802742745194074, + "grad_norm": 1.6174261569976807, + "learning_rate": 1.871229388657551e-05, + "loss": 0.4903, + "step": 17169 + }, + { + "epoch": 2.8029060038365783, + "grad_norm": 1.9690009355545044, + "learning_rate": 1.8712138111201898e-05, + "loss": 0.5587, + "step": 17170 + }, + { + "epoch": 2.8030692624790827, + "grad_norm": 2.0121724605560303, + "learning_rate": 1.8711982327055163e-05, + "loss": 0.648, + "step": 17171 + }, + { + "epoch": 2.8032325211215867, + "grad_norm": 1.773256778717041, + "learning_rate": 1.8711826534135463e-05, + "loss": 0.5565, + "step": 17172 + }, + { + "epoch": 2.803395779764091, + "grad_norm": 1.8513621091842651, + "learning_rate": 1.871167073244296e-05, + "loss": 0.597, + "step": 17173 + }, + { + "epoch": 2.8035590384065956, + "grad_norm": 2.258924722671509, + "learning_rate": 1.8711514921977806e-05, + "loss": 0.6837, + "step": 17174 + }, + { + "epoch": 2.8037222970491, + "grad_norm": 1.616266131401062, + "learning_rate": 1.8711359102740156e-05, + "loss": 0.5134, + "step": 17175 + }, + { + "epoch": 2.8038855556916045, + "grad_norm": 2.142273187637329, + "learning_rate": 1.8711203274730172e-05, + "loss": 0.6675, + "step": 17176 + }, + { + "epoch": 2.804048814334109, + "grad_norm": 1.697885513305664, + "learning_rate": 1.871104743794801e-05, + "loss": 0.6194, + "step": 17177 + }, + { + "epoch": 2.8042120729766133, + "grad_norm": 2.2029380798339844, + "learning_rate": 1.8710891592393823e-05, + "loss": 0.6885, + "step": 17178 + }, + { + "epoch": 2.8043753316191173, + "grad_norm": 1.9925802946090698, + "learning_rate": 1.871073573806777e-05, + "loss": 0.7104, + "step": 17179 + }, + { + "epoch": 2.8045385902616218, + "grad_norm": 1.9414154291152954, + "learning_rate": 1.871057987497001e-05, + "loss": 0.726, + "step": 17180 + }, + { + "epoch": 2.804701848904126, + "grad_norm": 1.7174521684646606, + "learning_rate": 1.8710424003100698e-05, + "loss": 0.5749, + "step": 17181 + }, + { + "epoch": 2.8048651075466307, + "grad_norm": 1.6031368970870972, + "learning_rate": 1.871026812245999e-05, + "loss": 0.5501, + "step": 17182 + }, + { + "epoch": 2.805028366189135, + "grad_norm": 1.9977513551712036, + "learning_rate": 1.871011223304805e-05, + "loss": 0.5638, + "step": 17183 + }, + { + "epoch": 2.8051916248316395, + "grad_norm": 1.5023610591888428, + "learning_rate": 1.870995633486502e-05, + "loss": 0.5202, + "step": 17184 + }, + { + "epoch": 2.805354883474144, + "grad_norm": 1.6566226482391357, + "learning_rate": 1.8709800427911072e-05, + "loss": 0.509, + "step": 17185 + }, + { + "epoch": 2.8055181421166484, + "grad_norm": 1.5050160884857178, + "learning_rate": 1.8709644512186358e-05, + "loss": 0.589, + "step": 17186 + }, + { + "epoch": 2.805681400759153, + "grad_norm": 2.4857699871063232, + "learning_rate": 1.870948858769103e-05, + "loss": 0.6379, + "step": 17187 + }, + { + "epoch": 2.8058446594016573, + "grad_norm": 1.5097664594650269, + "learning_rate": 1.8709332654425253e-05, + "loss": 0.5236, + "step": 17188 + }, + { + "epoch": 2.8060079180441617, + "grad_norm": 1.9204481840133667, + "learning_rate": 1.8709176712389178e-05, + "loss": 0.551, + "step": 17189 + }, + { + "epoch": 2.8061711766866657, + "grad_norm": 1.6182008981704712, + "learning_rate": 1.8709020761582967e-05, + "loss": 0.4532, + "step": 17190 + }, + { + "epoch": 2.80633443532917, + "grad_norm": 1.5383261442184448, + "learning_rate": 1.8708864802006774e-05, + "loss": 0.4872, + "step": 17191 + }, + { + "epoch": 2.8064976939716746, + "grad_norm": 1.7319843769073486, + "learning_rate": 1.8708708833660755e-05, + "loss": 0.6197, + "step": 17192 + }, + { + "epoch": 2.806660952614179, + "grad_norm": 1.7208744287490845, + "learning_rate": 1.870855285654507e-05, + "loss": 0.5299, + "step": 17193 + }, + { + "epoch": 2.8068242112566835, + "grad_norm": 1.6972123384475708, + "learning_rate": 1.8708396870659872e-05, + "loss": 0.4846, + "step": 17194 + }, + { + "epoch": 2.806987469899188, + "grad_norm": 1.3941377401351929, + "learning_rate": 1.8708240876005324e-05, + "loss": 0.4706, + "step": 17195 + }, + { + "epoch": 2.8071507285416923, + "grad_norm": 1.69076669216156, + "learning_rate": 1.870808487258158e-05, + "loss": 0.5484, + "step": 17196 + }, + { + "epoch": 2.8073139871841963, + "grad_norm": 1.6400057077407837, + "learning_rate": 1.870792886038879e-05, + "loss": 0.6573, + "step": 17197 + }, + { + "epoch": 2.8074772458267008, + "grad_norm": 1.5634244680404663, + "learning_rate": 1.8707772839427124e-05, + "loss": 0.4815, + "step": 17198 + }, + { + "epoch": 2.807640504469205, + "grad_norm": 1.7634202241897583, + "learning_rate": 1.8707616809696735e-05, + "loss": 0.6148, + "step": 17199 + }, + { + "epoch": 2.8078037631117096, + "grad_norm": 1.8735967874526978, + "learning_rate": 1.8707460771197773e-05, + "loss": 0.6037, + "step": 17200 + }, + { + "epoch": 2.807967021754214, + "grad_norm": 1.9749094247817993, + "learning_rate": 1.8707304723930404e-05, + "loss": 0.6898, + "step": 17201 + }, + { + "epoch": 2.8081302803967185, + "grad_norm": 1.5848931074142456, + "learning_rate": 1.870714866789478e-05, + "loss": 0.4809, + "step": 17202 + }, + { + "epoch": 2.808293539039223, + "grad_norm": 1.6882058382034302, + "learning_rate": 1.8706992603091057e-05, + "loss": 0.5379, + "step": 17203 + }, + { + "epoch": 2.8084567976817274, + "grad_norm": 1.6271591186523438, + "learning_rate": 1.8706836529519398e-05, + "loss": 0.5051, + "step": 17204 + }, + { + "epoch": 2.808620056324232, + "grad_norm": 1.7973142862319946, + "learning_rate": 1.8706680447179955e-05, + "loss": 0.5146, + "step": 17205 + }, + { + "epoch": 2.8087833149667363, + "grad_norm": 1.6381945610046387, + "learning_rate": 1.870652435607289e-05, + "loss": 0.5068, + "step": 17206 + }, + { + "epoch": 2.8089465736092407, + "grad_norm": 1.9906976222991943, + "learning_rate": 1.8706368256198356e-05, + "loss": 0.6096, + "step": 17207 + }, + { + "epoch": 2.8091098322517447, + "grad_norm": 2.055434226989746, + "learning_rate": 1.870621214755651e-05, + "loss": 0.5372, + "step": 17208 + }, + { + "epoch": 2.809273090894249, + "grad_norm": 1.377748727798462, + "learning_rate": 1.8706056030147512e-05, + "loss": 0.4449, + "step": 17209 + }, + { + "epoch": 2.8094363495367536, + "grad_norm": 2.1051223278045654, + "learning_rate": 1.870589990397152e-05, + "loss": 0.605, + "step": 17210 + }, + { + "epoch": 2.809599608179258, + "grad_norm": 2.3544068336486816, + "learning_rate": 1.8705743769028687e-05, + "loss": 0.7284, + "step": 17211 + }, + { + "epoch": 2.8097628668217625, + "grad_norm": 2.2355659008026123, + "learning_rate": 1.8705587625319167e-05, + "loss": 0.7012, + "step": 17212 + }, + { + "epoch": 2.809926125464267, + "grad_norm": 1.9192676544189453, + "learning_rate": 1.870543147284313e-05, + "loss": 0.581, + "step": 17213 + }, + { + "epoch": 2.810089384106771, + "grad_norm": 1.6217750310897827, + "learning_rate": 1.8705275311600724e-05, + "loss": 0.5392, + "step": 17214 + }, + { + "epoch": 2.8102526427492753, + "grad_norm": 1.7613965272903442, + "learning_rate": 1.8705119141592108e-05, + "loss": 0.5949, + "step": 17215 + }, + { + "epoch": 2.8104159013917798, + "grad_norm": 1.7473149299621582, + "learning_rate": 1.870496296281744e-05, + "loss": 0.5289, + "step": 17216 + }, + { + "epoch": 2.810579160034284, + "grad_norm": 1.8169668912887573, + "learning_rate": 1.8704806775276874e-05, + "loss": 0.6406, + "step": 17217 + }, + { + "epoch": 2.8107424186767886, + "grad_norm": 1.5891401767730713, + "learning_rate": 1.870465057897057e-05, + "loss": 0.4285, + "step": 17218 + }, + { + "epoch": 2.810905677319293, + "grad_norm": 1.9422376155853271, + "learning_rate": 1.8704494373898687e-05, + "loss": 0.6147, + "step": 17219 + }, + { + "epoch": 2.8110689359617975, + "grad_norm": 1.9877514839172363, + "learning_rate": 1.870433816006138e-05, + "loss": 0.592, + "step": 17220 + }, + { + "epoch": 2.811232194604302, + "grad_norm": 1.6401351690292358, + "learning_rate": 1.8704181937458808e-05, + "loss": 0.6111, + "step": 17221 + }, + { + "epoch": 2.8113954532468064, + "grad_norm": 1.542031168937683, + "learning_rate": 1.8704025706091124e-05, + "loss": 0.5188, + "step": 17222 + }, + { + "epoch": 2.811558711889311, + "grad_norm": 1.5012637376785278, + "learning_rate": 1.870386946595849e-05, + "loss": 0.5066, + "step": 17223 + }, + { + "epoch": 2.8117219705318153, + "grad_norm": 1.4282277822494507, + "learning_rate": 1.8703713217061064e-05, + "loss": 0.477, + "step": 17224 + }, + { + "epoch": 2.8118852291743193, + "grad_norm": 1.7379660606384277, + "learning_rate": 1.8703556959398998e-05, + "loss": 0.5908, + "step": 17225 + }, + { + "epoch": 2.8120484878168237, + "grad_norm": 1.5535730123519897, + "learning_rate": 1.8703400692972455e-05, + "loss": 0.4807, + "step": 17226 + }, + { + "epoch": 2.812211746459328, + "grad_norm": 2.0971555709838867, + "learning_rate": 1.8703244417781587e-05, + "loss": 0.629, + "step": 17227 + }, + { + "epoch": 2.8123750051018326, + "grad_norm": 1.9601941108703613, + "learning_rate": 1.8703088133826558e-05, + "loss": 0.5598, + "step": 17228 + }, + { + "epoch": 2.812538263744337, + "grad_norm": 1.6379350423812866, + "learning_rate": 1.870293184110752e-05, + "loss": 0.529, + "step": 17229 + }, + { + "epoch": 2.8127015223868415, + "grad_norm": 1.5349886417388916, + "learning_rate": 1.8702775539624628e-05, + "loss": 0.5102, + "step": 17230 + }, + { + "epoch": 2.812864781029346, + "grad_norm": 1.7630665302276611, + "learning_rate": 1.8702619229378048e-05, + "loss": 0.546, + "step": 17231 + }, + { + "epoch": 2.81302803967185, + "grad_norm": 1.6848056316375732, + "learning_rate": 1.870246291036793e-05, + "loss": 0.516, + "step": 17232 + }, + { + "epoch": 2.8131912983143543, + "grad_norm": 1.9701974391937256, + "learning_rate": 1.8702306582594433e-05, + "loss": 0.6317, + "step": 17233 + }, + { + "epoch": 2.8133545569568588, + "grad_norm": 1.8394863605499268, + "learning_rate": 1.870215024605772e-05, + "loss": 0.555, + "step": 17234 + }, + { + "epoch": 2.813517815599363, + "grad_norm": 1.7940844297409058, + "learning_rate": 1.8701993900757942e-05, + "loss": 0.556, + "step": 17235 + }, + { + "epoch": 2.8136810742418676, + "grad_norm": 1.4964839220046997, + "learning_rate": 1.870183754669526e-05, + "loss": 0.4725, + "step": 17236 + }, + { + "epoch": 2.813844332884372, + "grad_norm": 1.9736336469650269, + "learning_rate": 1.8701681183869825e-05, + "loss": 0.5773, + "step": 17237 + }, + { + "epoch": 2.8140075915268765, + "grad_norm": 1.6926928758621216, + "learning_rate": 1.8701524812281804e-05, + "loss": 0.5413, + "step": 17238 + }, + { + "epoch": 2.814170850169381, + "grad_norm": 1.8435646295547485, + "learning_rate": 1.870136843193135e-05, + "loss": 0.5857, + "step": 17239 + }, + { + "epoch": 2.8143341088118854, + "grad_norm": 1.6880252361297607, + "learning_rate": 1.8701212042818616e-05, + "loss": 0.4825, + "step": 17240 + }, + { + "epoch": 2.81449736745439, + "grad_norm": 1.8600562810897827, + "learning_rate": 1.8701055644943768e-05, + "loss": 0.6058, + "step": 17241 + }, + { + "epoch": 2.8146606260968943, + "grad_norm": 1.5524762868881226, + "learning_rate": 1.8700899238306956e-05, + "loss": 0.5135, + "step": 17242 + }, + { + "epoch": 2.8148238847393983, + "grad_norm": 1.9206198453903198, + "learning_rate": 1.8700742822908345e-05, + "loss": 0.5699, + "step": 17243 + }, + { + "epoch": 2.8149871433819027, + "grad_norm": 1.8925408124923706, + "learning_rate": 1.8700586398748085e-05, + "loss": 0.6105, + "step": 17244 + }, + { + "epoch": 2.815150402024407, + "grad_norm": 1.8709620237350464, + "learning_rate": 1.8700429965826337e-05, + "loss": 0.5302, + "step": 17245 + }, + { + "epoch": 2.8153136606669116, + "grad_norm": 1.6788350343704224, + "learning_rate": 1.870027352414326e-05, + "loss": 0.5345, + "step": 17246 + }, + { + "epoch": 2.815476919309416, + "grad_norm": 1.9488186836242676, + "learning_rate": 1.870011707369901e-05, + "loss": 0.619, + "step": 17247 + }, + { + "epoch": 2.8156401779519205, + "grad_norm": 1.7203797101974487, + "learning_rate": 1.869996061449374e-05, + "loss": 0.5373, + "step": 17248 + }, + { + "epoch": 2.8158034365944244, + "grad_norm": 1.9176677465438843, + "learning_rate": 1.8699804146527617e-05, + "loss": 0.5035, + "step": 17249 + }, + { + "epoch": 2.815966695236929, + "grad_norm": 1.9909449815750122, + "learning_rate": 1.869964766980079e-05, + "loss": 0.5778, + "step": 17250 + }, + { + "epoch": 2.8161299538794333, + "grad_norm": 1.7532309293746948, + "learning_rate": 1.8699491184313422e-05, + "loss": 0.5043, + "step": 17251 + }, + { + "epoch": 2.8162932125219378, + "grad_norm": 2.1183066368103027, + "learning_rate": 1.8699334690065672e-05, + "loss": 0.5595, + "step": 17252 + }, + { + "epoch": 2.816456471164442, + "grad_norm": 1.8930306434631348, + "learning_rate": 1.869917818705769e-05, + "loss": 0.5782, + "step": 17253 + }, + { + "epoch": 2.8166197298069466, + "grad_norm": 1.609371304512024, + "learning_rate": 1.869902167528964e-05, + "loss": 0.5401, + "step": 17254 + }, + { + "epoch": 2.816782988449451, + "grad_norm": 1.9157899618148804, + "learning_rate": 1.8698865154761673e-05, + "loss": 0.6099, + "step": 17255 + }, + { + "epoch": 2.8169462470919555, + "grad_norm": 1.6352628469467163, + "learning_rate": 1.8698708625473956e-05, + "loss": 0.5308, + "step": 17256 + }, + { + "epoch": 2.81710950573446, + "grad_norm": 1.9747720956802368, + "learning_rate": 1.869855208742664e-05, + "loss": 0.588, + "step": 17257 + }, + { + "epoch": 2.8172727643769644, + "grad_norm": 1.4987632036209106, + "learning_rate": 1.8698395540619883e-05, + "loss": 0.4975, + "step": 17258 + }, + { + "epoch": 2.817436023019469, + "grad_norm": 1.613078236579895, + "learning_rate": 1.8698238985053846e-05, + "loss": 0.5238, + "step": 17259 + }, + { + "epoch": 2.817599281661973, + "grad_norm": 1.5811842679977417, + "learning_rate": 1.8698082420728685e-05, + "loss": 0.5388, + "step": 17260 + }, + { + "epoch": 2.8177625403044773, + "grad_norm": 1.8937411308288574, + "learning_rate": 1.8697925847644557e-05, + "loss": 0.5406, + "step": 17261 + }, + { + "epoch": 2.8179257989469817, + "grad_norm": 1.880462646484375, + "learning_rate": 1.8697769265801616e-05, + "loss": 0.5855, + "step": 17262 + }, + { + "epoch": 2.818089057589486, + "grad_norm": 2.0337398052215576, + "learning_rate": 1.8697612675200027e-05, + "loss": 0.5994, + "step": 17263 + }, + { + "epoch": 2.8182523162319906, + "grad_norm": 1.9251532554626465, + "learning_rate": 1.8697456075839947e-05, + "loss": 0.5425, + "step": 17264 + }, + { + "epoch": 2.818415574874495, + "grad_norm": 1.8267496824264526, + "learning_rate": 1.869729946772153e-05, + "loss": 0.5301, + "step": 17265 + }, + { + "epoch": 2.8185788335169994, + "grad_norm": 1.497531771659851, + "learning_rate": 1.869714285084493e-05, + "loss": 0.5046, + "step": 17266 + }, + { + "epoch": 2.8187420921595034, + "grad_norm": 1.7988173961639404, + "learning_rate": 1.8696986225210315e-05, + "loss": 0.6142, + "step": 17267 + }, + { + "epoch": 2.818905350802008, + "grad_norm": 2.074739456176758, + "learning_rate": 1.8696829590817833e-05, + "loss": 0.6618, + "step": 17268 + }, + { + "epoch": 2.8190686094445123, + "grad_norm": 1.7981610298156738, + "learning_rate": 1.8696672947667648e-05, + "loss": 0.5993, + "step": 17269 + }, + { + "epoch": 2.8192318680870168, + "grad_norm": 1.7672735452651978, + "learning_rate": 1.8696516295759914e-05, + "loss": 0.5863, + "step": 17270 + }, + { + "epoch": 2.819395126729521, + "grad_norm": 1.9021672010421753, + "learning_rate": 1.869635963509479e-05, + "loss": 0.6537, + "step": 17271 + }, + { + "epoch": 2.8195583853720256, + "grad_norm": 1.8504589796066284, + "learning_rate": 1.8696202965672435e-05, + "loss": 0.6168, + "step": 17272 + }, + { + "epoch": 2.81972164401453, + "grad_norm": 1.7229737043380737, + "learning_rate": 1.869604628749301e-05, + "loss": 0.6462, + "step": 17273 + }, + { + "epoch": 2.8198849026570345, + "grad_norm": 1.9509785175323486, + "learning_rate": 1.8695889600556665e-05, + "loss": 0.6163, + "step": 17274 + }, + { + "epoch": 2.820048161299539, + "grad_norm": 2.0354652404785156, + "learning_rate": 1.869573290486356e-05, + "loss": 0.6104, + "step": 17275 + }, + { + "epoch": 2.8202114199420434, + "grad_norm": 1.6404008865356445, + "learning_rate": 1.869557620041386e-05, + "loss": 0.6008, + "step": 17276 + }, + { + "epoch": 2.820374678584548, + "grad_norm": 1.6229885816574097, + "learning_rate": 1.8695419487207713e-05, + "loss": 0.524, + "step": 17277 + }, + { + "epoch": 2.820537937227052, + "grad_norm": 1.7044950723648071, + "learning_rate": 1.869526276524528e-05, + "loss": 0.5732, + "step": 17278 + }, + { + "epoch": 2.8207011958695563, + "grad_norm": 1.6687562465667725, + "learning_rate": 1.869510603452672e-05, + "loss": 0.5452, + "step": 17279 + }, + { + "epoch": 2.8208644545120607, + "grad_norm": 1.6968309879302979, + "learning_rate": 1.869494929505219e-05, + "loss": 0.5012, + "step": 17280 + }, + { + "epoch": 2.821027713154565, + "grad_norm": 1.8802205324172974, + "learning_rate": 1.8694792546821852e-05, + "loss": 0.5975, + "step": 17281 + }, + { + "epoch": 2.8211909717970696, + "grad_norm": 1.5958646535873413, + "learning_rate": 1.8694635789835856e-05, + "loss": 0.5162, + "step": 17282 + }, + { + "epoch": 2.821354230439574, + "grad_norm": 1.7507811784744263, + "learning_rate": 1.869447902409437e-05, + "loss": 0.4816, + "step": 17283 + }, + { + "epoch": 2.8215174890820784, + "grad_norm": 2.0087223052978516, + "learning_rate": 1.8694322249597538e-05, + "loss": 0.6325, + "step": 17284 + }, + { + "epoch": 2.8216807477245824, + "grad_norm": 1.9035451412200928, + "learning_rate": 1.869416546634553e-05, + "loss": 0.6042, + "step": 17285 + }, + { + "epoch": 2.821844006367087, + "grad_norm": 1.5695096254348755, + "learning_rate": 1.86940086743385e-05, + "loss": 0.4772, + "step": 17286 + }, + { + "epoch": 2.8220072650095913, + "grad_norm": 1.8475184440612793, + "learning_rate": 1.8693851873576605e-05, + "loss": 0.5506, + "step": 17287 + }, + { + "epoch": 2.8221705236520958, + "grad_norm": 1.7161576747894287, + "learning_rate": 1.8693695064060004e-05, + "loss": 0.5508, + "step": 17288 + }, + { + "epoch": 2.8223337822946, + "grad_norm": 1.456768274307251, + "learning_rate": 1.8693538245788853e-05, + "loss": 0.5322, + "step": 17289 + }, + { + "epoch": 2.8224970409371046, + "grad_norm": 1.8262561559677124, + "learning_rate": 1.8693381418763312e-05, + "loss": 0.5345, + "step": 17290 + }, + { + "epoch": 2.822660299579609, + "grad_norm": 1.8271857500076294, + "learning_rate": 1.869322458298354e-05, + "loss": 0.5737, + "step": 17291 + }, + { + "epoch": 2.8228235582221135, + "grad_norm": 1.9797736406326294, + "learning_rate": 1.8693067738449692e-05, + "loss": 0.6836, + "step": 17292 + }, + { + "epoch": 2.822986816864618, + "grad_norm": 1.5971038341522217, + "learning_rate": 1.8692910885161925e-05, + "loss": 0.5506, + "step": 17293 + }, + { + "epoch": 2.8231500755071224, + "grad_norm": 1.6488807201385498, + "learning_rate": 1.86927540231204e-05, + "loss": 0.5509, + "step": 17294 + }, + { + "epoch": 2.823313334149627, + "grad_norm": 1.6878246068954468, + "learning_rate": 1.8692597152325275e-05, + "loss": 0.5592, + "step": 17295 + }, + { + "epoch": 2.823476592792131, + "grad_norm": 1.6148552894592285, + "learning_rate": 1.8692440272776705e-05, + "loss": 0.5445, + "step": 17296 + }, + { + "epoch": 2.8236398514346353, + "grad_norm": 1.648364782333374, + "learning_rate": 1.8692283384474856e-05, + "loss": 0.5099, + "step": 17297 + }, + { + "epoch": 2.8238031100771397, + "grad_norm": 1.793453574180603, + "learning_rate": 1.8692126487419876e-05, + "loss": 0.5706, + "step": 17298 + }, + { + "epoch": 2.823966368719644, + "grad_norm": 2.2620513439178467, + "learning_rate": 1.8691969581611922e-05, + "loss": 0.6555, + "step": 17299 + }, + { + "epoch": 2.8241296273621486, + "grad_norm": 2.074971914291382, + "learning_rate": 1.8691812667051164e-05, + "loss": 0.6315, + "step": 17300 + }, + { + "epoch": 2.824292886004653, + "grad_norm": 1.8155781030654907, + "learning_rate": 1.869165574373775e-05, + "loss": 0.583, + "step": 17301 + }, + { + "epoch": 2.824456144647157, + "grad_norm": 1.484980583190918, + "learning_rate": 1.869149881167184e-05, + "loss": 0.4651, + "step": 17302 + }, + { + "epoch": 2.8246194032896614, + "grad_norm": 1.712096929550171, + "learning_rate": 1.8691341870853598e-05, + "loss": 0.5567, + "step": 17303 + }, + { + "epoch": 2.824782661932166, + "grad_norm": 1.8923858404159546, + "learning_rate": 1.8691184921283172e-05, + "loss": 0.5617, + "step": 17304 + }, + { + "epoch": 2.8249459205746703, + "grad_norm": 1.701290488243103, + "learning_rate": 1.8691027962960727e-05, + "loss": 0.5175, + "step": 17305 + }, + { + "epoch": 2.8251091792171747, + "grad_norm": 1.4980497360229492, + "learning_rate": 1.8690870995886417e-05, + "loss": 0.4396, + "step": 17306 + }, + { + "epoch": 2.825272437859679, + "grad_norm": 1.5317814350128174, + "learning_rate": 1.8690714020060402e-05, + "loss": 0.5027, + "step": 17307 + }, + { + "epoch": 2.8254356965021836, + "grad_norm": 2.0754806995391846, + "learning_rate": 1.8690557035482843e-05, + "loss": 0.7177, + "step": 17308 + }, + { + "epoch": 2.825598955144688, + "grad_norm": 1.9141489267349243, + "learning_rate": 1.8690400042153895e-05, + "loss": 0.6023, + "step": 17309 + }, + { + "epoch": 2.8257622137871925, + "grad_norm": 1.7760319709777832, + "learning_rate": 1.8690243040073715e-05, + "loss": 0.5827, + "step": 17310 + }, + { + "epoch": 2.825925472429697, + "grad_norm": 1.6024688482284546, + "learning_rate": 1.869008602924246e-05, + "loss": 0.5342, + "step": 17311 + }, + { + "epoch": 2.8260887310722014, + "grad_norm": 1.8162543773651123, + "learning_rate": 1.86899290096603e-05, + "loss": 0.5916, + "step": 17312 + }, + { + "epoch": 2.8262519897147054, + "grad_norm": 1.550423502922058, + "learning_rate": 1.8689771981327377e-05, + "loss": 0.546, + "step": 17313 + }, + { + "epoch": 2.82641524835721, + "grad_norm": 2.0149052143096924, + "learning_rate": 1.8689614944243855e-05, + "loss": 0.5946, + "step": 17314 + }, + { + "epoch": 2.8265785069997142, + "grad_norm": 1.6024562120437622, + "learning_rate": 1.8689457898409897e-05, + "loss": 0.5705, + "step": 17315 + }, + { + "epoch": 2.8267417656422187, + "grad_norm": 1.7897839546203613, + "learning_rate": 1.8689300843825654e-05, + "loss": 0.5865, + "step": 17316 + }, + { + "epoch": 2.826905024284723, + "grad_norm": 1.6873918771743774, + "learning_rate": 1.8689143780491287e-05, + "loss": 0.5232, + "step": 17317 + }, + { + "epoch": 2.8270682829272276, + "grad_norm": 1.8137292861938477, + "learning_rate": 1.8688986708406958e-05, + "loss": 0.5364, + "step": 17318 + }, + { + "epoch": 2.827231541569732, + "grad_norm": 1.51260244846344, + "learning_rate": 1.868882962757282e-05, + "loss": 0.5956, + "step": 17319 + }, + { + "epoch": 2.827394800212236, + "grad_norm": 2.0289108753204346, + "learning_rate": 1.8688672537989032e-05, + "loss": 0.6131, + "step": 17320 + }, + { + "epoch": 2.8275580588547404, + "grad_norm": 1.393983006477356, + "learning_rate": 1.868851543965575e-05, + "loss": 0.4733, + "step": 17321 + }, + { + "epoch": 2.827721317497245, + "grad_norm": 1.6007542610168457, + "learning_rate": 1.8688358332573142e-05, + "loss": 0.5224, + "step": 17322 + }, + { + "epoch": 2.8278845761397493, + "grad_norm": 1.9269564151763916, + "learning_rate": 1.8688201216741357e-05, + "loss": 0.5624, + "step": 17323 + }, + { + "epoch": 2.8280478347822537, + "grad_norm": 1.8556692600250244, + "learning_rate": 1.8688044092160554e-05, + "loss": 0.6026, + "step": 17324 + }, + { + "epoch": 2.828211093424758, + "grad_norm": 1.9639966487884521, + "learning_rate": 1.8687886958830894e-05, + "loss": 0.4986, + "step": 17325 + }, + { + "epoch": 2.8283743520672626, + "grad_norm": 1.833815574645996, + "learning_rate": 1.8687729816752536e-05, + "loss": 0.5826, + "step": 17326 + }, + { + "epoch": 2.828537610709767, + "grad_norm": 1.7176768779754639, + "learning_rate": 1.868757266592563e-05, + "loss": 0.5888, + "step": 17327 + }, + { + "epoch": 2.8287008693522715, + "grad_norm": 1.6722604036331177, + "learning_rate": 1.8687415506350347e-05, + "loss": 0.5354, + "step": 17328 + }, + { + "epoch": 2.828864127994776, + "grad_norm": 1.935530185699463, + "learning_rate": 1.868725833802684e-05, + "loss": 0.7398, + "step": 17329 + }, + { + "epoch": 2.8290273866372804, + "grad_norm": 1.9576176404953003, + "learning_rate": 1.868710116095526e-05, + "loss": 0.5482, + "step": 17330 + }, + { + "epoch": 2.8291906452797844, + "grad_norm": 1.6977697610855103, + "learning_rate": 1.8686943975135774e-05, + "loss": 0.5549, + "step": 17331 + }, + { + "epoch": 2.829353903922289, + "grad_norm": 1.880038857460022, + "learning_rate": 1.868678678056854e-05, + "loss": 0.6695, + "step": 17332 + }, + { + "epoch": 2.8295171625647932, + "grad_norm": 1.537307858467102, + "learning_rate": 1.8686629577253713e-05, + "loss": 0.4679, + "step": 17333 + }, + { + "epoch": 2.8296804212072977, + "grad_norm": 1.343293309211731, + "learning_rate": 1.868647236519145e-05, + "loss": 0.4562, + "step": 17334 + }, + { + "epoch": 2.829843679849802, + "grad_norm": 1.4677590131759644, + "learning_rate": 1.8686315144381914e-05, + "loss": 0.5497, + "step": 17335 + }, + { + "epoch": 2.8300069384923066, + "grad_norm": 1.6494159698486328, + "learning_rate": 1.868615791482526e-05, + "loss": 0.5568, + "step": 17336 + }, + { + "epoch": 2.8301701971348106, + "grad_norm": 2.12111496925354, + "learning_rate": 1.8686000676521648e-05, + "loss": 0.6442, + "step": 17337 + }, + { + "epoch": 2.830333455777315, + "grad_norm": 2.138129234313965, + "learning_rate": 1.8685843429471235e-05, + "loss": 0.7177, + "step": 17338 + }, + { + "epoch": 2.8304967144198194, + "grad_norm": 2.2274515628814697, + "learning_rate": 1.868568617367418e-05, + "loss": 0.6653, + "step": 17339 + }, + { + "epoch": 2.830659973062324, + "grad_norm": 2.2251148223876953, + "learning_rate": 1.8685528909130643e-05, + "loss": 0.6858, + "step": 17340 + }, + { + "epoch": 2.8308232317048283, + "grad_norm": 1.7982655763626099, + "learning_rate": 1.868537163584078e-05, + "loss": 0.6309, + "step": 17341 + }, + { + "epoch": 2.8309864903473327, + "grad_norm": 1.6273930072784424, + "learning_rate": 1.8685214353804748e-05, + "loss": 0.4908, + "step": 17342 + }, + { + "epoch": 2.831149748989837, + "grad_norm": 1.8796451091766357, + "learning_rate": 1.8685057063022708e-05, + "loss": 0.6342, + "step": 17343 + }, + { + "epoch": 2.8313130076323416, + "grad_norm": 1.6495277881622314, + "learning_rate": 1.8684899763494816e-05, + "loss": 0.5511, + "step": 17344 + }, + { + "epoch": 2.831476266274846, + "grad_norm": 1.8027392625808716, + "learning_rate": 1.8684742455221238e-05, + "loss": 0.6454, + "step": 17345 + }, + { + "epoch": 2.8316395249173505, + "grad_norm": 1.4123716354370117, + "learning_rate": 1.8684585138202122e-05, + "loss": 0.4695, + "step": 17346 + }, + { + "epoch": 2.831802783559855, + "grad_norm": 1.8869311809539795, + "learning_rate": 1.8684427812437632e-05, + "loss": 0.6065, + "step": 17347 + }, + { + "epoch": 2.831966042202359, + "grad_norm": 1.5599076747894287, + "learning_rate": 1.8684270477927927e-05, + "loss": 0.4726, + "step": 17348 + }, + { + "epoch": 2.8321293008448634, + "grad_norm": 1.7237598896026611, + "learning_rate": 1.868411313467316e-05, + "loss": 0.5791, + "step": 17349 + }, + { + "epoch": 2.832292559487368, + "grad_norm": 1.591324806213379, + "learning_rate": 1.8683955782673496e-05, + "loss": 0.4902, + "step": 17350 + }, + { + "epoch": 2.8324558181298722, + "grad_norm": 1.777327060699463, + "learning_rate": 1.8683798421929093e-05, + "loss": 0.556, + "step": 17351 + }, + { + "epoch": 2.8326190767723767, + "grad_norm": 1.5322059392929077, + "learning_rate": 1.8683641052440105e-05, + "loss": 0.5479, + "step": 17352 + }, + { + "epoch": 2.832782335414881, + "grad_norm": 1.9605932235717773, + "learning_rate": 1.868348367420669e-05, + "loss": 0.602, + "step": 17353 + }, + { + "epoch": 2.8329455940573856, + "grad_norm": 2.215287685394287, + "learning_rate": 1.8683326287229017e-05, + "loss": 0.8022, + "step": 17354 + }, + { + "epoch": 2.8331088526998895, + "grad_norm": 1.7239046096801758, + "learning_rate": 1.8683168891507232e-05, + "loss": 0.5746, + "step": 17355 + }, + { + "epoch": 2.833272111342394, + "grad_norm": 2.1312901973724365, + "learning_rate": 1.86830114870415e-05, + "loss": 0.5981, + "step": 17356 + }, + { + "epoch": 2.8334353699848984, + "grad_norm": 1.6060967445373535, + "learning_rate": 1.8682854073831974e-05, + "loss": 0.5592, + "step": 17357 + }, + { + "epoch": 2.833598628627403, + "grad_norm": 2.1366775035858154, + "learning_rate": 1.868269665187882e-05, + "loss": 0.6748, + "step": 17358 + }, + { + "epoch": 2.8337618872699073, + "grad_norm": 1.7305536270141602, + "learning_rate": 1.868253922118219e-05, + "loss": 0.5244, + "step": 17359 + }, + { + "epoch": 2.8339251459124117, + "grad_norm": 1.7873256206512451, + "learning_rate": 1.8682381781742246e-05, + "loss": 0.6359, + "step": 17360 + }, + { + "epoch": 2.834088404554916, + "grad_norm": 1.8258682489395142, + "learning_rate": 1.8682224333559146e-05, + "loss": 0.6322, + "step": 17361 + }, + { + "epoch": 2.8342516631974206, + "grad_norm": 1.987223505973816, + "learning_rate": 1.8682066876633048e-05, + "loss": 0.5879, + "step": 17362 + }, + { + "epoch": 2.834414921839925, + "grad_norm": 1.4359761476516724, + "learning_rate": 1.868190941096411e-05, + "loss": 0.5234, + "step": 17363 + }, + { + "epoch": 2.8345781804824295, + "grad_norm": 1.655376672744751, + "learning_rate": 1.8681751936552496e-05, + "loss": 0.5049, + "step": 17364 + }, + { + "epoch": 2.834741439124934, + "grad_norm": 1.418330192565918, + "learning_rate": 1.8681594453398354e-05, + "loss": 0.5243, + "step": 17365 + }, + { + "epoch": 2.834904697767438, + "grad_norm": 1.6683681011199951, + "learning_rate": 1.8681436961501853e-05, + "loss": 0.5321, + "step": 17366 + }, + { + "epoch": 2.8350679564099424, + "grad_norm": 1.9677988290786743, + "learning_rate": 1.8681279460863145e-05, + "loss": 0.6173, + "step": 17367 + }, + { + "epoch": 2.835231215052447, + "grad_norm": 1.9433727264404297, + "learning_rate": 1.8681121951482397e-05, + "loss": 0.5813, + "step": 17368 + }, + { + "epoch": 2.8353944736949512, + "grad_norm": 1.6319072246551514, + "learning_rate": 1.8680964433359753e-05, + "loss": 0.5052, + "step": 17369 + }, + { + "epoch": 2.8355577323374557, + "grad_norm": 1.9237794876098633, + "learning_rate": 1.8680806906495384e-05, + "loss": 0.5363, + "step": 17370 + }, + { + "epoch": 2.83572099097996, + "grad_norm": 1.8750869035720825, + "learning_rate": 1.8680649370889442e-05, + "loss": 0.6142, + "step": 17371 + }, + { + "epoch": 2.8358842496224645, + "grad_norm": 1.906185507774353, + "learning_rate": 1.8680491826542093e-05, + "loss": 0.5762, + "step": 17372 + }, + { + "epoch": 2.8360475082649685, + "grad_norm": 1.9801242351531982, + "learning_rate": 1.868033427345349e-05, + "loss": 0.7093, + "step": 17373 + }, + { + "epoch": 2.836210766907473, + "grad_norm": 2.0190465450286865, + "learning_rate": 1.868017671162379e-05, + "loss": 0.5502, + "step": 17374 + }, + { + "epoch": 2.8363740255499774, + "grad_norm": 1.9609578847885132, + "learning_rate": 1.8680019141053157e-05, + "loss": 0.6459, + "step": 17375 + }, + { + "epoch": 2.836537284192482, + "grad_norm": 1.7368839979171753, + "learning_rate": 1.8679861561741745e-05, + "loss": 0.5822, + "step": 17376 + }, + { + "epoch": 2.8367005428349863, + "grad_norm": 1.8037288188934326, + "learning_rate": 1.8679703973689714e-05, + "loss": 0.6085, + "step": 17377 + }, + { + "epoch": 2.8368638014774907, + "grad_norm": 1.735209584236145, + "learning_rate": 1.8679546376897226e-05, + "loss": 0.5528, + "step": 17378 + }, + { + "epoch": 2.837027060119995, + "grad_norm": 1.6232365369796753, + "learning_rate": 1.8679388771364438e-05, + "loss": 0.5215, + "step": 17379 + }, + { + "epoch": 2.8371903187624996, + "grad_norm": 1.6668144464492798, + "learning_rate": 1.8679231157091507e-05, + "loss": 0.5751, + "step": 17380 + }, + { + "epoch": 2.837353577405004, + "grad_norm": 1.928308129310608, + "learning_rate": 1.8679073534078588e-05, + "loss": 0.5364, + "step": 17381 + }, + { + "epoch": 2.8375168360475085, + "grad_norm": 1.5737284421920776, + "learning_rate": 1.8678915902325848e-05, + "loss": 0.4611, + "step": 17382 + }, + { + "epoch": 2.837680094690013, + "grad_norm": 1.7641549110412598, + "learning_rate": 1.8678758261833442e-05, + "loss": 0.5171, + "step": 17383 + }, + { + "epoch": 2.837843353332517, + "grad_norm": 1.588366150856018, + "learning_rate": 1.867860061260153e-05, + "loss": 0.5185, + "step": 17384 + }, + { + "epoch": 2.8380066119750214, + "grad_norm": 1.7705422639846802, + "learning_rate": 1.8678442954630267e-05, + "loss": 0.5633, + "step": 17385 + }, + { + "epoch": 2.838169870617526, + "grad_norm": 1.7404228448867798, + "learning_rate": 1.8678285287919813e-05, + "loss": 0.4903, + "step": 17386 + }, + { + "epoch": 2.8383331292600302, + "grad_norm": 1.700900912284851, + "learning_rate": 1.8678127612470334e-05, + "loss": 0.5739, + "step": 17387 + }, + { + "epoch": 2.8384963879025347, + "grad_norm": 1.7212839126586914, + "learning_rate": 1.867796992828198e-05, + "loss": 0.5926, + "step": 17388 + }, + { + "epoch": 2.838659646545039, + "grad_norm": 1.9364198446273804, + "learning_rate": 1.867781223535491e-05, + "loss": 0.5376, + "step": 17389 + }, + { + "epoch": 2.838822905187543, + "grad_norm": 2.1636710166931152, + "learning_rate": 1.8677654533689287e-05, + "loss": 0.6149, + "step": 17390 + }, + { + "epoch": 2.8389861638300475, + "grad_norm": 1.5275121927261353, + "learning_rate": 1.8677496823285265e-05, + "loss": 0.5104, + "step": 17391 + }, + { + "epoch": 2.839149422472552, + "grad_norm": 1.6003931760787964, + "learning_rate": 1.867733910414301e-05, + "loss": 0.4403, + "step": 17392 + }, + { + "epoch": 2.8393126811150564, + "grad_norm": 1.9879560470581055, + "learning_rate": 1.867718137626268e-05, + "loss": 0.6583, + "step": 17393 + }, + { + "epoch": 2.839475939757561, + "grad_norm": 2.1571993827819824, + "learning_rate": 1.8677023639644422e-05, + "loss": 0.6243, + "step": 17394 + }, + { + "epoch": 2.8396391984000653, + "grad_norm": 1.4449546337127686, + "learning_rate": 1.867686589428841e-05, + "loss": 0.511, + "step": 17395 + }, + { + "epoch": 2.8398024570425697, + "grad_norm": 1.694981575012207, + "learning_rate": 1.8676708140194794e-05, + "loss": 0.554, + "step": 17396 + }, + { + "epoch": 2.839965715685074, + "grad_norm": 1.7250173091888428, + "learning_rate": 1.867655037736374e-05, + "loss": 0.5579, + "step": 17397 + }, + { + "epoch": 2.8401289743275786, + "grad_norm": 1.85979425907135, + "learning_rate": 1.8676392605795393e-05, + "loss": 0.5294, + "step": 17398 + }, + { + "epoch": 2.840292232970083, + "grad_norm": 1.8543678522109985, + "learning_rate": 1.8676234825489923e-05, + "loss": 0.6311, + "step": 17399 + }, + { + "epoch": 2.8404554916125875, + "grad_norm": 1.8117822408676147, + "learning_rate": 1.867607703644749e-05, + "loss": 0.5866, + "step": 17400 + }, + { + "epoch": 2.8406187502550915, + "grad_norm": 1.9270445108413696, + "learning_rate": 1.867591923866825e-05, + "loss": 0.6441, + "step": 17401 + }, + { + "epoch": 2.840782008897596, + "grad_norm": 1.664080023765564, + "learning_rate": 1.8675761432152362e-05, + "loss": 0.5101, + "step": 17402 + }, + { + "epoch": 2.8409452675401003, + "grad_norm": 1.8976043462753296, + "learning_rate": 1.8675603616899983e-05, + "loss": 0.6161, + "step": 17403 + }, + { + "epoch": 2.841108526182605, + "grad_norm": 1.9277417659759521, + "learning_rate": 1.8675445792911276e-05, + "loss": 0.588, + "step": 17404 + }, + { + "epoch": 2.8412717848251092, + "grad_norm": 1.645962119102478, + "learning_rate": 1.8675287960186393e-05, + "loss": 0.514, + "step": 17405 + }, + { + "epoch": 2.8414350434676137, + "grad_norm": 1.787660002708435, + "learning_rate": 1.86751301187255e-05, + "loss": 0.5516, + "step": 17406 + }, + { + "epoch": 2.841598302110118, + "grad_norm": 1.8849565982818604, + "learning_rate": 1.8674972268528753e-05, + "loss": 0.6826, + "step": 17407 + }, + { + "epoch": 2.841761560752622, + "grad_norm": 1.5399669408798218, + "learning_rate": 1.8674814409596315e-05, + "loss": 0.4394, + "step": 17408 + }, + { + "epoch": 2.8419248193951265, + "grad_norm": 1.6384273767471313, + "learning_rate": 1.8674656541928334e-05, + "loss": 0.5397, + "step": 17409 + }, + { + "epoch": 2.842088078037631, + "grad_norm": 1.8995320796966553, + "learning_rate": 1.867449866552498e-05, + "loss": 0.5777, + "step": 17410 + }, + { + "epoch": 2.8422513366801354, + "grad_norm": 1.6528632640838623, + "learning_rate": 1.8674340780386407e-05, + "loss": 0.5792, + "step": 17411 + }, + { + "epoch": 2.84241459532264, + "grad_norm": 2.0069093704223633, + "learning_rate": 1.8674182886512776e-05, + "loss": 0.7137, + "step": 17412 + }, + { + "epoch": 2.8425778539651443, + "grad_norm": 2.015998363494873, + "learning_rate": 1.8674024983904246e-05, + "loss": 0.627, + "step": 17413 + }, + { + "epoch": 2.8427411126076487, + "grad_norm": 1.7591241598129272, + "learning_rate": 1.8673867072560973e-05, + "loss": 0.5666, + "step": 17414 + }, + { + "epoch": 2.842904371250153, + "grad_norm": 1.5675101280212402, + "learning_rate": 1.867370915248312e-05, + "loss": 0.4807, + "step": 17415 + }, + { + "epoch": 2.8430676298926576, + "grad_norm": 2.0212364196777344, + "learning_rate": 1.8673551223670843e-05, + "loss": 0.6933, + "step": 17416 + }, + { + "epoch": 2.843230888535162, + "grad_norm": 2.004387140274048, + "learning_rate": 1.86733932861243e-05, + "loss": 0.6939, + "step": 17417 + }, + { + "epoch": 2.8433941471776665, + "grad_norm": 1.5747078657150269, + "learning_rate": 1.8673235339843657e-05, + "loss": 0.53, + "step": 17418 + }, + { + "epoch": 2.8435574058201705, + "grad_norm": 1.8123526573181152, + "learning_rate": 1.8673077384829067e-05, + "loss": 0.6437, + "step": 17419 + }, + { + "epoch": 2.843720664462675, + "grad_norm": 1.7192234992980957, + "learning_rate": 1.867291942108069e-05, + "loss": 0.5295, + "step": 17420 + }, + { + "epoch": 2.8438839231051793, + "grad_norm": 1.7792750597000122, + "learning_rate": 1.8672761448598683e-05, + "loss": 0.5714, + "step": 17421 + }, + { + "epoch": 2.844047181747684, + "grad_norm": 1.9339157342910767, + "learning_rate": 1.8672603467383212e-05, + "loss": 0.4951, + "step": 17422 + }, + { + "epoch": 2.844210440390188, + "grad_norm": 1.7594730854034424, + "learning_rate": 1.8672445477434428e-05, + "loss": 0.5997, + "step": 17423 + }, + { + "epoch": 2.8443736990326927, + "grad_norm": 1.863518238067627, + "learning_rate": 1.8672287478752494e-05, + "loss": 0.5586, + "step": 17424 + }, + { + "epoch": 2.8445369576751967, + "grad_norm": 1.8950573205947876, + "learning_rate": 1.8672129471337568e-05, + "loss": 0.6036, + "step": 17425 + }, + { + "epoch": 2.844700216317701, + "grad_norm": 1.7747693061828613, + "learning_rate": 1.867197145518981e-05, + "loss": 0.5618, + "step": 17426 + }, + { + "epoch": 2.8448634749602055, + "grad_norm": 2.0241665840148926, + "learning_rate": 1.8671813430309385e-05, + "loss": 0.6268, + "step": 17427 + }, + { + "epoch": 2.84502673360271, + "grad_norm": 1.4320847988128662, + "learning_rate": 1.867165539669644e-05, + "loss": 0.5063, + "step": 17428 + }, + { + "epoch": 2.8451899922452144, + "grad_norm": 1.6522828340530396, + "learning_rate": 1.8671497354351142e-05, + "loss": 0.5439, + "step": 17429 + }, + { + "epoch": 2.845353250887719, + "grad_norm": 1.6280933618545532, + "learning_rate": 1.8671339303273647e-05, + "loss": 0.4953, + "step": 17430 + }, + { + "epoch": 2.8455165095302233, + "grad_norm": 1.5003137588500977, + "learning_rate": 1.867118124346412e-05, + "loss": 0.484, + "step": 17431 + }, + { + "epoch": 2.8456797681727277, + "grad_norm": 1.9186122417449951, + "learning_rate": 1.867102317492271e-05, + "loss": 0.662, + "step": 17432 + }, + { + "epoch": 2.845843026815232, + "grad_norm": 1.7311291694641113, + "learning_rate": 1.8670865097649585e-05, + "loss": 0.5083, + "step": 17433 + }, + { + "epoch": 2.8460062854577366, + "grad_norm": 1.72804594039917, + "learning_rate": 1.86707070116449e-05, + "loss": 0.5171, + "step": 17434 + }, + { + "epoch": 2.846169544100241, + "grad_norm": 1.773207426071167, + "learning_rate": 1.8670548916908817e-05, + "loss": 0.5484, + "step": 17435 + }, + { + "epoch": 2.8463328027427455, + "grad_norm": 1.7377712726593018, + "learning_rate": 1.8670390813441494e-05, + "loss": 0.5337, + "step": 17436 + }, + { + "epoch": 2.8464960613852495, + "grad_norm": 2.0627620220184326, + "learning_rate": 1.867023270124309e-05, + "loss": 0.5933, + "step": 17437 + }, + { + "epoch": 2.846659320027754, + "grad_norm": 1.6090749502182007, + "learning_rate": 1.8670074580313763e-05, + "loss": 0.4719, + "step": 17438 + }, + { + "epoch": 2.8468225786702583, + "grad_norm": 1.9800165891647339, + "learning_rate": 1.866991645065367e-05, + "loss": 0.558, + "step": 17439 + }, + { + "epoch": 2.846985837312763, + "grad_norm": 1.7464150190353394, + "learning_rate": 1.866975831226298e-05, + "loss": 0.5441, + "step": 17440 + }, + { + "epoch": 2.847149095955267, + "grad_norm": 1.6675916910171509, + "learning_rate": 1.866960016514184e-05, + "loss": 0.4991, + "step": 17441 + }, + { + "epoch": 2.8473123545977717, + "grad_norm": 2.1955721378326416, + "learning_rate": 1.8669442009290416e-05, + "loss": 0.6648, + "step": 17442 + }, + { + "epoch": 2.8474756132402756, + "grad_norm": 1.6307456493377686, + "learning_rate": 1.8669283844708866e-05, + "loss": 0.523, + "step": 17443 + }, + { + "epoch": 2.84763887188278, + "grad_norm": 2.0129706859588623, + "learning_rate": 1.866912567139735e-05, + "loss": 0.6666, + "step": 17444 + }, + { + "epoch": 2.8478021305252845, + "grad_norm": 1.8485057353973389, + "learning_rate": 1.866896748935603e-05, + "loss": 0.5605, + "step": 17445 + }, + { + "epoch": 2.847965389167789, + "grad_norm": 1.8940174579620361, + "learning_rate": 1.866880929858506e-05, + "loss": 0.5711, + "step": 17446 + }, + { + "epoch": 2.8481286478102934, + "grad_norm": 1.8853673934936523, + "learning_rate": 1.8668651099084602e-05, + "loss": 0.5983, + "step": 17447 + }, + { + "epoch": 2.848291906452798, + "grad_norm": 1.8383829593658447, + "learning_rate": 1.866849289085481e-05, + "loss": 0.5288, + "step": 17448 + }, + { + "epoch": 2.8484551650953023, + "grad_norm": 1.6661547422409058, + "learning_rate": 1.866833467389585e-05, + "loss": 0.5, + "step": 17449 + }, + { + "epoch": 2.8486184237378067, + "grad_norm": 2.011430501937866, + "learning_rate": 1.8668176448207883e-05, + "loss": 0.6637, + "step": 17450 + }, + { + "epoch": 2.848781682380311, + "grad_norm": 1.3845654726028442, + "learning_rate": 1.8668018213791062e-05, + "loss": 0.4378, + "step": 17451 + }, + { + "epoch": 2.8489449410228156, + "grad_norm": 1.6427555084228516, + "learning_rate": 1.8667859970645547e-05, + "loss": 0.5432, + "step": 17452 + }, + { + "epoch": 2.84910819966532, + "grad_norm": 2.185577154159546, + "learning_rate": 1.8667701718771502e-05, + "loss": 0.6113, + "step": 17453 + }, + { + "epoch": 2.849271458307824, + "grad_norm": 1.5195000171661377, + "learning_rate": 1.8667543458169084e-05, + "loss": 0.5803, + "step": 17454 + }, + { + "epoch": 2.8494347169503285, + "grad_norm": 2.003730535507202, + "learning_rate": 1.8667385188838453e-05, + "loss": 0.6801, + "step": 17455 + }, + { + "epoch": 2.849597975592833, + "grad_norm": 2.300915241241455, + "learning_rate": 1.8667226910779767e-05, + "loss": 0.6326, + "step": 17456 + }, + { + "epoch": 2.8497612342353373, + "grad_norm": 1.5472261905670166, + "learning_rate": 1.8667068623993182e-05, + "loss": 0.6965, + "step": 17457 + }, + { + "epoch": 2.8499244928778418, + "grad_norm": 1.9418416023254395, + "learning_rate": 1.8666910328478862e-05, + "loss": 0.575, + "step": 17458 + }, + { + "epoch": 2.850087751520346, + "grad_norm": 1.6450414657592773, + "learning_rate": 1.8666752024236968e-05, + "loss": 0.5569, + "step": 17459 + }, + { + "epoch": 2.8502510101628507, + "grad_norm": 1.922398328781128, + "learning_rate": 1.8666593711267656e-05, + "loss": 0.6325, + "step": 17460 + }, + { + "epoch": 2.8504142688053546, + "grad_norm": 1.3919157981872559, + "learning_rate": 1.8666435389571087e-05, + "loss": 0.4254, + "step": 17461 + }, + { + "epoch": 2.850577527447859, + "grad_norm": 1.784253478050232, + "learning_rate": 1.8666277059147418e-05, + "loss": 0.6469, + "step": 17462 + }, + { + "epoch": 2.8507407860903635, + "grad_norm": 1.956870198249817, + "learning_rate": 1.8666118719996813e-05, + "loss": 0.5544, + "step": 17463 + }, + { + "epoch": 2.850904044732868, + "grad_norm": 2.0529632568359375, + "learning_rate": 1.8665960372119427e-05, + "loss": 0.7675, + "step": 17464 + }, + { + "epoch": 2.8510673033753724, + "grad_norm": 2.0510621070861816, + "learning_rate": 1.8665802015515423e-05, + "loss": 0.5761, + "step": 17465 + }, + { + "epoch": 2.851230562017877, + "grad_norm": 1.7224055528640747, + "learning_rate": 1.8665643650184954e-05, + "loss": 0.5832, + "step": 17466 + }, + { + "epoch": 2.8513938206603813, + "grad_norm": 2.0949854850769043, + "learning_rate": 1.866548527612819e-05, + "loss": 0.6459, + "step": 17467 + }, + { + "epoch": 2.8515570793028857, + "grad_norm": 2.1213436126708984, + "learning_rate": 1.866532689334528e-05, + "loss": 0.7041, + "step": 17468 + }, + { + "epoch": 2.85172033794539, + "grad_norm": 1.7411916255950928, + "learning_rate": 1.866516850183639e-05, + "loss": 0.6601, + "step": 17469 + }, + { + "epoch": 2.8518835965878946, + "grad_norm": 1.9790385961532593, + "learning_rate": 1.866501010160168e-05, + "loss": 0.5977, + "step": 17470 + }, + { + "epoch": 2.852046855230399, + "grad_norm": 1.84042489528656, + "learning_rate": 1.8664851692641305e-05, + "loss": 0.5726, + "step": 17471 + }, + { + "epoch": 2.852210113872903, + "grad_norm": 1.8415707349777222, + "learning_rate": 1.8664693274955428e-05, + "loss": 0.5649, + "step": 17472 + }, + { + "epoch": 2.8523733725154075, + "grad_norm": 1.610642671585083, + "learning_rate": 1.8664534848544203e-05, + "loss": 0.545, + "step": 17473 + }, + { + "epoch": 2.852536631157912, + "grad_norm": 1.9735251665115356, + "learning_rate": 1.8664376413407798e-05, + "loss": 0.6371, + "step": 17474 + }, + { + "epoch": 2.8526998898004163, + "grad_norm": 1.6927485466003418, + "learning_rate": 1.8664217969546366e-05, + "loss": 0.5617, + "step": 17475 + }, + { + "epoch": 2.8528631484429208, + "grad_norm": 1.5354328155517578, + "learning_rate": 1.866405951696007e-05, + "loss": 0.4784, + "step": 17476 + }, + { + "epoch": 2.853026407085425, + "grad_norm": 1.9953933954238892, + "learning_rate": 1.8663901055649067e-05, + "loss": 0.6408, + "step": 17477 + }, + { + "epoch": 2.853189665727929, + "grad_norm": 1.8826286792755127, + "learning_rate": 1.866374258561352e-05, + "loss": 0.6654, + "step": 17478 + }, + { + "epoch": 2.8533529243704336, + "grad_norm": 1.6042603254318237, + "learning_rate": 1.8663584106853584e-05, + "loss": 0.5381, + "step": 17479 + }, + { + "epoch": 2.853516183012938, + "grad_norm": 1.732203722000122, + "learning_rate": 1.8663425619369424e-05, + "loss": 0.5959, + "step": 17480 + }, + { + "epoch": 2.8536794416554425, + "grad_norm": 1.9894624948501587, + "learning_rate": 1.8663267123161194e-05, + "loss": 0.513, + "step": 17481 + }, + { + "epoch": 2.853842700297947, + "grad_norm": 1.67531418800354, + "learning_rate": 1.866310861822906e-05, + "loss": 0.5622, + "step": 17482 + }, + { + "epoch": 2.8540059589404514, + "grad_norm": 1.5791893005371094, + "learning_rate": 1.8662950104573174e-05, + "loss": 0.4944, + "step": 17483 + }, + { + "epoch": 2.854169217582956, + "grad_norm": 2.155879020690918, + "learning_rate": 1.86627915821937e-05, + "loss": 0.6619, + "step": 17484 + }, + { + "epoch": 2.8543324762254603, + "grad_norm": 1.9815895557403564, + "learning_rate": 1.8662633051090798e-05, + "loss": 0.6129, + "step": 17485 + }, + { + "epoch": 2.8544957348679647, + "grad_norm": 1.4639344215393066, + "learning_rate": 1.8662474511264627e-05, + "loss": 0.4944, + "step": 17486 + }, + { + "epoch": 2.854658993510469, + "grad_norm": 1.8373457193374634, + "learning_rate": 1.8662315962715347e-05, + "loss": 0.6026, + "step": 17487 + }, + { + "epoch": 2.8548222521529736, + "grad_norm": 1.7456088066101074, + "learning_rate": 1.8662157405443116e-05, + "loss": 0.5829, + "step": 17488 + }, + { + "epoch": 2.8549855107954776, + "grad_norm": 1.827806830406189, + "learning_rate": 1.8661998839448096e-05, + "loss": 0.5329, + "step": 17489 + }, + { + "epoch": 2.855148769437982, + "grad_norm": 1.616477370262146, + "learning_rate": 1.8661840264730445e-05, + "loss": 0.5581, + "step": 17490 + }, + { + "epoch": 2.8553120280804865, + "grad_norm": 1.620615005493164, + "learning_rate": 1.8661681681290323e-05, + "loss": 0.5168, + "step": 17491 + }, + { + "epoch": 2.855475286722991, + "grad_norm": 1.754102110862732, + "learning_rate": 1.866152308912789e-05, + "loss": 0.5461, + "step": 17492 + }, + { + "epoch": 2.8556385453654953, + "grad_norm": 1.7308567762374878, + "learning_rate": 1.8661364488243305e-05, + "loss": 0.5739, + "step": 17493 + }, + { + "epoch": 2.8558018040079998, + "grad_norm": 1.6517187356948853, + "learning_rate": 1.8661205878636726e-05, + "loss": 0.4752, + "step": 17494 + }, + { + "epoch": 2.855965062650504, + "grad_norm": 1.7421560287475586, + "learning_rate": 1.866104726030832e-05, + "loss": 0.5761, + "step": 17495 + }, + { + "epoch": 2.856128321293008, + "grad_norm": 1.6477164030075073, + "learning_rate": 1.8660888633258237e-05, + "loss": 0.5157, + "step": 17496 + }, + { + "epoch": 2.8562915799355126, + "grad_norm": 2.123060941696167, + "learning_rate": 1.8660729997486648e-05, + "loss": 0.6828, + "step": 17497 + }, + { + "epoch": 2.856454838578017, + "grad_norm": 1.7502100467681885, + "learning_rate": 1.86605713529937e-05, + "loss": 0.6177, + "step": 17498 + }, + { + "epoch": 2.8566180972205215, + "grad_norm": 1.6819252967834473, + "learning_rate": 1.866041269977956e-05, + "loss": 0.6148, + "step": 17499 + }, + { + "epoch": 2.856781355863026, + "grad_norm": 1.6932228803634644, + "learning_rate": 1.866025403784439e-05, + "loss": 0.3907, + "step": 17500 + }, + { + "epoch": 2.8569446145055304, + "grad_norm": 1.842013955116272, + "learning_rate": 1.8660095367188343e-05, + "loss": 0.5749, + "step": 17501 + }, + { + "epoch": 2.857107873148035, + "grad_norm": 2.0424630641937256, + "learning_rate": 1.8659936687811583e-05, + "loss": 0.6381, + "step": 17502 + }, + { + "epoch": 2.8572711317905393, + "grad_norm": 1.8658137321472168, + "learning_rate": 1.865977799971427e-05, + "loss": 0.582, + "step": 17503 + }, + { + "epoch": 2.8574343904330437, + "grad_norm": 1.716103434562683, + "learning_rate": 1.865961930289656e-05, + "loss": 0.5881, + "step": 17504 + }, + { + "epoch": 2.857597649075548, + "grad_norm": 1.4804948568344116, + "learning_rate": 1.865946059735862e-05, + "loss": 0.4856, + "step": 17505 + }, + { + "epoch": 2.8577609077180526, + "grad_norm": 1.758158564567566, + "learning_rate": 1.86593018831006e-05, + "loss": 0.5857, + "step": 17506 + }, + { + "epoch": 2.8579241663605566, + "grad_norm": 1.6069107055664062, + "learning_rate": 1.865914316012267e-05, + "loss": 0.6033, + "step": 17507 + }, + { + "epoch": 2.858087425003061, + "grad_norm": 1.6949272155761719, + "learning_rate": 1.8658984428424984e-05, + "loss": 0.4893, + "step": 17508 + }, + { + "epoch": 2.8582506836455654, + "grad_norm": 1.8007099628448486, + "learning_rate": 1.8658825688007702e-05, + "loss": 0.6568, + "step": 17509 + }, + { + "epoch": 2.85841394228807, + "grad_norm": 1.657939076423645, + "learning_rate": 1.8658666938870985e-05, + "loss": 0.5483, + "step": 17510 + }, + { + "epoch": 2.8585772009305743, + "grad_norm": 1.9038469791412354, + "learning_rate": 1.8658508181014996e-05, + "loss": 0.618, + "step": 17511 + }, + { + "epoch": 2.8587404595730788, + "grad_norm": 1.7034354209899902, + "learning_rate": 1.8658349414439887e-05, + "loss": 0.5781, + "step": 17512 + }, + { + "epoch": 2.858903718215583, + "grad_norm": 1.9488487243652344, + "learning_rate": 1.8658190639145827e-05, + "loss": 0.5316, + "step": 17513 + }, + { + "epoch": 2.859066976858087, + "grad_norm": 2.0149948596954346, + "learning_rate": 1.8658031855132965e-05, + "loss": 0.5645, + "step": 17514 + }, + { + "epoch": 2.8592302355005916, + "grad_norm": 1.8149365186691284, + "learning_rate": 1.8657873062401472e-05, + "loss": 0.5878, + "step": 17515 + }, + { + "epoch": 2.859393494143096, + "grad_norm": 1.7306607961654663, + "learning_rate": 1.8657714260951502e-05, + "loss": 0.608, + "step": 17516 + }, + { + "epoch": 2.8595567527856005, + "grad_norm": 1.5607361793518066, + "learning_rate": 1.865755545078322e-05, + "loss": 0.531, + "step": 17517 + }, + { + "epoch": 2.859720011428105, + "grad_norm": 1.9952592849731445, + "learning_rate": 1.8657396631896775e-05, + "loss": 0.5939, + "step": 17518 + }, + { + "epoch": 2.8598832700706094, + "grad_norm": 1.93032705783844, + "learning_rate": 1.8657237804292335e-05, + "loss": 0.6348, + "step": 17519 + }, + { + "epoch": 2.860046528713114, + "grad_norm": 1.628165602684021, + "learning_rate": 1.8657078967970063e-05, + "loss": 0.5479, + "step": 17520 + }, + { + "epoch": 2.8602097873556183, + "grad_norm": 1.8972247838974, + "learning_rate": 1.865692012293011e-05, + "loss": 0.6577, + "step": 17521 + }, + { + "epoch": 2.8603730459981227, + "grad_norm": 1.8272473812103271, + "learning_rate": 1.8656761269172645e-05, + "loss": 0.6382, + "step": 17522 + }, + { + "epoch": 2.860536304640627, + "grad_norm": 1.6367708444595337, + "learning_rate": 1.865660240669782e-05, + "loss": 0.5295, + "step": 17523 + }, + { + "epoch": 2.8606995632831316, + "grad_norm": 1.4242279529571533, + "learning_rate": 1.86564435355058e-05, + "loss": 0.4989, + "step": 17524 + }, + { + "epoch": 2.8608628219256356, + "grad_norm": 1.5849908590316772, + "learning_rate": 1.8656284655596744e-05, + "loss": 0.5418, + "step": 17525 + }, + { + "epoch": 2.86102608056814, + "grad_norm": 1.596906065940857, + "learning_rate": 1.8656125766970815e-05, + "loss": 0.4781, + "step": 17526 + }, + { + "epoch": 2.8611893392106444, + "grad_norm": 1.9862418174743652, + "learning_rate": 1.8655966869628167e-05, + "loss": 0.6936, + "step": 17527 + }, + { + "epoch": 2.861352597853149, + "grad_norm": 1.6952341794967651, + "learning_rate": 1.865580796356896e-05, + "loss": 0.5362, + "step": 17528 + }, + { + "epoch": 2.8615158564956533, + "grad_norm": 1.8117387294769287, + "learning_rate": 1.865564904879336e-05, + "loss": 0.6141, + "step": 17529 + }, + { + "epoch": 2.8616791151381578, + "grad_norm": 1.7546583414077759, + "learning_rate": 1.8655490125301523e-05, + "loss": 0.5842, + "step": 17530 + }, + { + "epoch": 2.8618423737806618, + "grad_norm": 1.698388934135437, + "learning_rate": 1.8655331193093607e-05, + "loss": 0.524, + "step": 17531 + }, + { + "epoch": 2.862005632423166, + "grad_norm": 1.8024442195892334, + "learning_rate": 1.8655172252169776e-05, + "loss": 0.657, + "step": 17532 + }, + { + "epoch": 2.8621688910656706, + "grad_norm": 1.6800333261489868, + "learning_rate": 1.8655013302530193e-05, + "loss": 0.5904, + "step": 17533 + }, + { + "epoch": 2.862332149708175, + "grad_norm": 1.5582960844039917, + "learning_rate": 1.865485434417501e-05, + "loss": 0.5344, + "step": 17534 + }, + { + "epoch": 2.8624954083506795, + "grad_norm": 1.8022692203521729, + "learning_rate": 1.865469537710439e-05, + "loss": 0.6043, + "step": 17535 + }, + { + "epoch": 2.862658666993184, + "grad_norm": 1.6592050790786743, + "learning_rate": 1.8654536401318495e-05, + "loss": 0.489, + "step": 17536 + }, + { + "epoch": 2.8628219256356884, + "grad_norm": 1.4314780235290527, + "learning_rate": 1.8654377416817486e-05, + "loss": 0.4792, + "step": 17537 + }, + { + "epoch": 2.862985184278193, + "grad_norm": 1.4500106573104858, + "learning_rate": 1.865421842360152e-05, + "loss": 0.4921, + "step": 17538 + }, + { + "epoch": 2.8631484429206973, + "grad_norm": 1.9490898847579956, + "learning_rate": 1.865405942167076e-05, + "loss": 0.6414, + "step": 17539 + }, + { + "epoch": 2.8633117015632017, + "grad_norm": 2.2474560737609863, + "learning_rate": 1.8653900411025365e-05, + "loss": 0.8462, + "step": 17540 + }, + { + "epoch": 2.863474960205706, + "grad_norm": 1.816515564918518, + "learning_rate": 1.865374139166549e-05, + "loss": 0.5417, + "step": 17541 + }, + { + "epoch": 2.86363821884821, + "grad_norm": 1.8097059726715088, + "learning_rate": 1.8653582363591303e-05, + "loss": 0.5553, + "step": 17542 + }, + { + "epoch": 2.8638014774907146, + "grad_norm": 1.8746459484100342, + "learning_rate": 1.865342332680296e-05, + "loss": 0.6117, + "step": 17543 + }, + { + "epoch": 2.863964736133219, + "grad_norm": 1.5990581512451172, + "learning_rate": 1.8653264281300622e-05, + "loss": 0.5709, + "step": 17544 + }, + { + "epoch": 2.8641279947757234, + "grad_norm": 1.3300626277923584, + "learning_rate": 1.8653105227084447e-05, + "loss": 0.4405, + "step": 17545 + }, + { + "epoch": 2.864291253418228, + "grad_norm": 1.8248804807662964, + "learning_rate": 1.86529461641546e-05, + "loss": 0.5835, + "step": 17546 + }, + { + "epoch": 2.8644545120607323, + "grad_norm": 1.4382096529006958, + "learning_rate": 1.865278709251124e-05, + "loss": 0.4535, + "step": 17547 + }, + { + "epoch": 2.8646177707032368, + "grad_norm": 1.7208261489868164, + "learning_rate": 1.8652628012154525e-05, + "loss": 0.6345, + "step": 17548 + }, + { + "epoch": 2.8647810293457407, + "grad_norm": 1.6773741245269775, + "learning_rate": 1.865246892308461e-05, + "loss": 0.5144, + "step": 17549 + }, + { + "epoch": 2.864944287988245, + "grad_norm": 1.7468316555023193, + "learning_rate": 1.865230982530167e-05, + "loss": 0.6554, + "step": 17550 + }, + { + "epoch": 2.8651075466307496, + "grad_norm": 1.5242927074432373, + "learning_rate": 1.865215071880585e-05, + "loss": 0.4758, + "step": 17551 + }, + { + "epoch": 2.865270805273254, + "grad_norm": 1.5083783864974976, + "learning_rate": 1.865199160359732e-05, + "loss": 0.526, + "step": 17552 + }, + { + "epoch": 2.8654340639157585, + "grad_norm": 1.577842116355896, + "learning_rate": 1.8651832479676236e-05, + "loss": 0.5423, + "step": 17553 + }, + { + "epoch": 2.865597322558263, + "grad_norm": 1.6997284889221191, + "learning_rate": 1.8651673347042757e-05, + "loss": 0.5465, + "step": 17554 + }, + { + "epoch": 2.8657605812007674, + "grad_norm": 1.5696581602096558, + "learning_rate": 1.8651514205697046e-05, + "loss": 0.4627, + "step": 17555 + }, + { + "epoch": 2.865923839843272, + "grad_norm": 1.7012516260147095, + "learning_rate": 1.8651355055639263e-05, + "loss": 0.4834, + "step": 17556 + }, + { + "epoch": 2.8660870984857763, + "grad_norm": 1.5914578437805176, + "learning_rate": 1.865119589686957e-05, + "loss": 0.5102, + "step": 17557 + }, + { + "epoch": 2.8662503571282807, + "grad_norm": 2.114042043685913, + "learning_rate": 1.865103672938812e-05, + "loss": 0.6213, + "step": 17558 + }, + { + "epoch": 2.866413615770785, + "grad_norm": 2.059462785720825, + "learning_rate": 1.8650877553195085e-05, + "loss": 0.6161, + "step": 17559 + }, + { + "epoch": 2.866576874413289, + "grad_norm": 1.84483003616333, + "learning_rate": 1.8650718368290613e-05, + "loss": 0.636, + "step": 17560 + }, + { + "epoch": 2.8667401330557936, + "grad_norm": 2.01324462890625, + "learning_rate": 1.865055917467487e-05, + "loss": 0.5591, + "step": 17561 + }, + { + "epoch": 2.866903391698298, + "grad_norm": 1.8647021055221558, + "learning_rate": 1.865039997234802e-05, + "loss": 0.5371, + "step": 17562 + }, + { + "epoch": 2.8670666503408024, + "grad_norm": 1.8802692890167236, + "learning_rate": 1.8650240761310217e-05, + "loss": 0.6158, + "step": 17563 + }, + { + "epoch": 2.867229908983307, + "grad_norm": 1.8047963380813599, + "learning_rate": 1.8650081541561622e-05, + "loss": 0.6092, + "step": 17564 + }, + { + "epoch": 2.8673931676258113, + "grad_norm": 2.3199312686920166, + "learning_rate": 1.86499223131024e-05, + "loss": 0.7841, + "step": 17565 + }, + { + "epoch": 2.8675564262683153, + "grad_norm": 1.8640300035476685, + "learning_rate": 1.864976307593271e-05, + "loss": 0.6003, + "step": 17566 + }, + { + "epoch": 2.8677196849108197, + "grad_norm": 1.6161607503890991, + "learning_rate": 1.8649603830052708e-05, + "loss": 0.4905, + "step": 17567 + }, + { + "epoch": 2.867882943553324, + "grad_norm": 1.6727256774902344, + "learning_rate": 1.8649444575462558e-05, + "loss": 0.4968, + "step": 17568 + }, + { + "epoch": 2.8680462021958286, + "grad_norm": 1.6471205949783325, + "learning_rate": 1.864928531216242e-05, + "loss": 0.4968, + "step": 17569 + }, + { + "epoch": 2.868209460838333, + "grad_norm": 1.935549259185791, + "learning_rate": 1.8649126040152454e-05, + "loss": 0.6276, + "step": 17570 + }, + { + "epoch": 2.8683727194808375, + "grad_norm": 1.6889914274215698, + "learning_rate": 1.8648966759432818e-05, + "loss": 0.5076, + "step": 17571 + }, + { + "epoch": 2.868535978123342, + "grad_norm": 1.7923798561096191, + "learning_rate": 1.8648807470003677e-05, + "loss": 0.5293, + "step": 17572 + }, + { + "epoch": 2.8686992367658464, + "grad_norm": 2.0820131301879883, + "learning_rate": 1.8648648171865188e-05, + "loss": 0.5377, + "step": 17573 + }, + { + "epoch": 2.868862495408351, + "grad_norm": 1.7815223932266235, + "learning_rate": 1.8648488865017517e-05, + "loss": 0.6062, + "step": 17574 + }, + { + "epoch": 2.8690257540508552, + "grad_norm": 2.004448652267456, + "learning_rate": 1.864832954946081e-05, + "loss": 0.5053, + "step": 17575 + }, + { + "epoch": 2.8691890126933597, + "grad_norm": 1.7800742387771606, + "learning_rate": 1.864817022519525e-05, + "loss": 0.6172, + "step": 17576 + }, + { + "epoch": 2.8693522713358637, + "grad_norm": 1.7194077968597412, + "learning_rate": 1.864801089222098e-05, + "loss": 0.5352, + "step": 17577 + }, + { + "epoch": 2.869515529978368, + "grad_norm": 1.6917822360992432, + "learning_rate": 1.8647851550538162e-05, + "loss": 0.6193, + "step": 17578 + }, + { + "epoch": 2.8696787886208726, + "grad_norm": 1.6163334846496582, + "learning_rate": 1.8647692200146963e-05, + "loss": 0.5251, + "step": 17579 + }, + { + "epoch": 2.869842047263377, + "grad_norm": 1.7542035579681396, + "learning_rate": 1.8647532841047536e-05, + "loss": 0.556, + "step": 17580 + }, + { + "epoch": 2.8700053059058814, + "grad_norm": 1.7970563173294067, + "learning_rate": 1.8647373473240052e-05, + "loss": 0.5309, + "step": 17581 + }, + { + "epoch": 2.870168564548386, + "grad_norm": 1.4615005254745483, + "learning_rate": 1.8647214096724662e-05, + "loss": 0.4686, + "step": 17582 + }, + { + "epoch": 2.8703318231908903, + "grad_norm": 1.7775514125823975, + "learning_rate": 1.8647054711501527e-05, + "loss": 0.5241, + "step": 17583 + }, + { + "epoch": 2.8704950818333943, + "grad_norm": 1.611187219619751, + "learning_rate": 1.8646895317570815e-05, + "loss": 0.5597, + "step": 17584 + }, + { + "epoch": 2.8706583404758987, + "grad_norm": 1.8891314268112183, + "learning_rate": 1.864673591493268e-05, + "loss": 0.6259, + "step": 17585 + }, + { + "epoch": 2.870821599118403, + "grad_norm": 1.3723366260528564, + "learning_rate": 1.8646576503587284e-05, + "loss": 0.5028, + "step": 17586 + }, + { + "epoch": 2.8709848577609076, + "grad_norm": 1.7788639068603516, + "learning_rate": 1.8646417083534787e-05, + "loss": 0.5962, + "step": 17587 + }, + { + "epoch": 2.871148116403412, + "grad_norm": 1.9227019548416138, + "learning_rate": 1.864625765477535e-05, + "loss": 0.6189, + "step": 17588 + }, + { + "epoch": 2.8713113750459165, + "grad_norm": 1.703065276145935, + "learning_rate": 1.8646098217309137e-05, + "loss": 0.4898, + "step": 17589 + }, + { + "epoch": 2.871474633688421, + "grad_norm": 1.681036353111267, + "learning_rate": 1.8645938771136303e-05, + "loss": 0.5607, + "step": 17590 + }, + { + "epoch": 2.8716378923309254, + "grad_norm": 1.6883313655853271, + "learning_rate": 1.864577931625701e-05, + "loss": 0.5561, + "step": 17591 + }, + { + "epoch": 2.87180115097343, + "grad_norm": 1.5422677993774414, + "learning_rate": 1.864561985267142e-05, + "loss": 0.4847, + "step": 17592 + }, + { + "epoch": 2.8719644096159342, + "grad_norm": 1.4655989408493042, + "learning_rate": 1.8645460380379695e-05, + "loss": 0.4964, + "step": 17593 + }, + { + "epoch": 2.8721276682584387, + "grad_norm": 1.9326354265213013, + "learning_rate": 1.864530089938199e-05, + "loss": 0.6187, + "step": 17594 + }, + { + "epoch": 2.8722909269009427, + "grad_norm": 1.7947176694869995, + "learning_rate": 1.8645141409678477e-05, + "loss": 0.4636, + "step": 17595 + }, + { + "epoch": 2.872454185543447, + "grad_norm": 1.9411805868148804, + "learning_rate": 1.8644981911269303e-05, + "loss": 0.6111, + "step": 17596 + }, + { + "epoch": 2.8726174441859516, + "grad_norm": 1.568211555480957, + "learning_rate": 1.8644822404154634e-05, + "loss": 0.4971, + "step": 17597 + }, + { + "epoch": 2.872780702828456, + "grad_norm": 2.163645029067993, + "learning_rate": 1.8644662888334634e-05, + "loss": 0.7066, + "step": 17598 + }, + { + "epoch": 2.8729439614709604, + "grad_norm": 1.703851580619812, + "learning_rate": 1.8644503363809456e-05, + "loss": 0.5285, + "step": 17599 + }, + { + "epoch": 2.873107220113465, + "grad_norm": 2.012359857559204, + "learning_rate": 1.864434383057927e-05, + "loss": 0.6198, + "step": 17600 + }, + { + "epoch": 2.8732704787559693, + "grad_norm": 1.4841735363006592, + "learning_rate": 1.864418428864423e-05, + "loss": 0.5174, + "step": 17601 + }, + { + "epoch": 2.8734337373984733, + "grad_norm": 1.7246414422988892, + "learning_rate": 1.8644024738004498e-05, + "loss": 0.5154, + "step": 17602 + }, + { + "epoch": 2.8735969960409777, + "grad_norm": 2.029541254043579, + "learning_rate": 1.8643865178660235e-05, + "loss": 0.6633, + "step": 17603 + }, + { + "epoch": 2.873760254683482, + "grad_norm": 1.8386707305908203, + "learning_rate": 1.8643705610611607e-05, + "loss": 0.5895, + "step": 17604 + }, + { + "epoch": 2.8739235133259866, + "grad_norm": 1.473022699356079, + "learning_rate": 1.8643546033858764e-05, + "loss": 0.4513, + "step": 17605 + }, + { + "epoch": 2.874086771968491, + "grad_norm": 1.6123994588851929, + "learning_rate": 1.8643386448401878e-05, + "loss": 0.4875, + "step": 17606 + }, + { + "epoch": 2.8742500306109955, + "grad_norm": 2.146212100982666, + "learning_rate": 1.8643226854241102e-05, + "loss": 0.6185, + "step": 17607 + }, + { + "epoch": 2.8744132892535, + "grad_norm": 1.9847946166992188, + "learning_rate": 1.8643067251376598e-05, + "loss": 0.645, + "step": 17608 + }, + { + "epoch": 2.8745765478960044, + "grad_norm": 1.6543450355529785, + "learning_rate": 1.8642907639808525e-05, + "loss": 0.4809, + "step": 17609 + }, + { + "epoch": 2.874739806538509, + "grad_norm": 1.9411708116531372, + "learning_rate": 1.864274801953705e-05, + "loss": 0.5918, + "step": 17610 + }, + { + "epoch": 2.8749030651810132, + "grad_norm": 2.1923749446868896, + "learning_rate": 1.8642588390562325e-05, + "loss": 0.6027, + "step": 17611 + }, + { + "epoch": 2.8750663238235177, + "grad_norm": 1.6383713483810425, + "learning_rate": 1.864242875288452e-05, + "loss": 0.5417, + "step": 17612 + }, + { + "epoch": 2.8752295824660217, + "grad_norm": 1.5123146772384644, + "learning_rate": 1.8642269106503793e-05, + "loss": 0.4899, + "step": 17613 + }, + { + "epoch": 2.875392841108526, + "grad_norm": 1.8014166355133057, + "learning_rate": 1.86421094514203e-05, + "loss": 0.545, + "step": 17614 + }, + { + "epoch": 2.8755560997510305, + "grad_norm": 1.7251265048980713, + "learning_rate": 1.864194978763421e-05, + "loss": 0.5634, + "step": 17615 + }, + { + "epoch": 2.875719358393535, + "grad_norm": 1.7460079193115234, + "learning_rate": 1.8641790115145673e-05, + "loss": 0.576, + "step": 17616 + }, + { + "epoch": 2.8758826170360394, + "grad_norm": 2.12522554397583, + "learning_rate": 1.8641630433954857e-05, + "loss": 0.757, + "step": 17617 + }, + { + "epoch": 2.876045875678544, + "grad_norm": 1.5534641742706299, + "learning_rate": 1.8641470744061926e-05, + "loss": 0.5338, + "step": 17618 + }, + { + "epoch": 2.876209134321048, + "grad_norm": 1.3147077560424805, + "learning_rate": 1.864131104546703e-05, + "loss": 0.4378, + "step": 17619 + }, + { + "epoch": 2.8763723929635523, + "grad_norm": 1.6701453924179077, + "learning_rate": 1.864115133817034e-05, + "loss": 0.6037, + "step": 17620 + }, + { + "epoch": 2.8765356516060567, + "grad_norm": 1.8124932050704956, + "learning_rate": 1.864099162217201e-05, + "loss": 0.4974, + "step": 17621 + }, + { + "epoch": 2.876698910248561, + "grad_norm": 2.1269047260284424, + "learning_rate": 1.864083189747221e-05, + "loss": 0.6311, + "step": 17622 + }, + { + "epoch": 2.8768621688910656, + "grad_norm": 1.794930100440979, + "learning_rate": 1.8640672164071088e-05, + "loss": 0.5275, + "step": 17623 + }, + { + "epoch": 2.87702542753357, + "grad_norm": 1.965524435043335, + "learning_rate": 1.8640512421968814e-05, + "loss": 0.7345, + "step": 17624 + }, + { + "epoch": 2.8771886861760745, + "grad_norm": 1.849182367324829, + "learning_rate": 1.8640352671165544e-05, + "loss": 0.6587, + "step": 17625 + }, + { + "epoch": 2.877351944818579, + "grad_norm": 1.9332880973815918, + "learning_rate": 1.8640192911661442e-05, + "loss": 0.6841, + "step": 17626 + }, + { + "epoch": 2.8775152034610834, + "grad_norm": 2.4250407218933105, + "learning_rate": 1.864003314345667e-05, + "loss": 0.5202, + "step": 17627 + }, + { + "epoch": 2.877678462103588, + "grad_norm": 1.6894888877868652, + "learning_rate": 1.8639873366551384e-05, + "loss": 0.4336, + "step": 17628 + }, + { + "epoch": 2.8778417207460922, + "grad_norm": 2.165438175201416, + "learning_rate": 1.8639713580945752e-05, + "loss": 0.679, + "step": 17629 + }, + { + "epoch": 2.8780049793885962, + "grad_norm": 1.720058560371399, + "learning_rate": 1.8639553786639927e-05, + "loss": 0.5485, + "step": 17630 + }, + { + "epoch": 2.8781682380311007, + "grad_norm": 1.6554187536239624, + "learning_rate": 1.8639393983634077e-05, + "loss": 0.5957, + "step": 17631 + }, + { + "epoch": 2.878331496673605, + "grad_norm": 1.7799054384231567, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.5486, + "step": 17632 + }, + { + "epoch": 2.8784947553161095, + "grad_norm": 1.8246937990188599, + "learning_rate": 1.8639074351522927e-05, + "loss": 0.6062, + "step": 17633 + }, + { + "epoch": 2.878658013958614, + "grad_norm": 1.7002384662628174, + "learning_rate": 1.8638914522417955e-05, + "loss": 0.5042, + "step": 17634 + }, + { + "epoch": 2.8788212726011184, + "grad_norm": 1.3789575099945068, + "learning_rate": 1.86387546846136e-05, + "loss": 0.4431, + "step": 17635 + }, + { + "epoch": 2.878984531243623, + "grad_norm": 1.8110694885253906, + "learning_rate": 1.8638594838110023e-05, + "loss": 0.5452, + "step": 17636 + }, + { + "epoch": 2.879147789886127, + "grad_norm": 2.0096914768218994, + "learning_rate": 1.863843498290738e-05, + "loss": 0.6245, + "step": 17637 + }, + { + "epoch": 2.8793110485286313, + "grad_norm": 1.7623430490493774, + "learning_rate": 1.8638275119005834e-05, + "loss": 0.5282, + "step": 17638 + }, + { + "epoch": 2.8794743071711357, + "grad_norm": 1.8232334852218628, + "learning_rate": 1.8638115246405548e-05, + "loss": 0.6149, + "step": 17639 + }, + { + "epoch": 2.87963756581364, + "grad_norm": 1.647824764251709, + "learning_rate": 1.8637955365106685e-05, + "loss": 0.5711, + "step": 17640 + }, + { + "epoch": 2.8798008244561446, + "grad_norm": 1.6714307069778442, + "learning_rate": 1.86377954751094e-05, + "loss": 0.5639, + "step": 17641 + }, + { + "epoch": 2.879964083098649, + "grad_norm": 1.7891960144042969, + "learning_rate": 1.8637635576413857e-05, + "loss": 0.5967, + "step": 17642 + }, + { + "epoch": 2.8801273417411535, + "grad_norm": 1.8879578113555908, + "learning_rate": 1.863747566902022e-05, + "loss": 0.5723, + "step": 17643 + }, + { + "epoch": 2.880290600383658, + "grad_norm": 2.963064193725586, + "learning_rate": 1.8637315752928647e-05, + "loss": 0.5239, + "step": 17644 + }, + { + "epoch": 2.8804538590261624, + "grad_norm": 2.192307233810425, + "learning_rate": 1.8637155828139297e-05, + "loss": 0.6599, + "step": 17645 + }, + { + "epoch": 2.880617117668667, + "grad_norm": 1.437577247619629, + "learning_rate": 1.8636995894652338e-05, + "loss": 0.5162, + "step": 17646 + }, + { + "epoch": 2.8807803763111712, + "grad_norm": 2.0516817569732666, + "learning_rate": 1.863683595246792e-05, + "loss": 0.5713, + "step": 17647 + }, + { + "epoch": 2.8809436349536752, + "grad_norm": 1.9263041019439697, + "learning_rate": 1.8636676001586217e-05, + "loss": 0.6355, + "step": 17648 + }, + { + "epoch": 2.8811068935961797, + "grad_norm": 1.6058074235916138, + "learning_rate": 1.863651604200738e-05, + "loss": 0.5382, + "step": 17649 + }, + { + "epoch": 2.881270152238684, + "grad_norm": 1.447536587715149, + "learning_rate": 1.863635607373157e-05, + "loss": 0.4426, + "step": 17650 + }, + { + "epoch": 2.8814334108811885, + "grad_norm": 1.8798907995224, + "learning_rate": 1.8636196096758957e-05, + "loss": 0.6299, + "step": 17651 + }, + { + "epoch": 2.881596669523693, + "grad_norm": 2.0809919834136963, + "learning_rate": 1.8636036111089697e-05, + "loss": 0.5507, + "step": 17652 + }, + { + "epoch": 2.8817599281661974, + "grad_norm": 1.619510293006897, + "learning_rate": 1.863587611672395e-05, + "loss": 0.5003, + "step": 17653 + }, + { + "epoch": 2.8819231868087014, + "grad_norm": 1.9441224336624146, + "learning_rate": 1.8635716113661876e-05, + "loss": 0.6438, + "step": 17654 + }, + { + "epoch": 2.882086445451206, + "grad_norm": 1.6985387802124023, + "learning_rate": 1.863555610190364e-05, + "loss": 0.6099, + "step": 17655 + }, + { + "epoch": 2.8822497040937103, + "grad_norm": 1.517426609992981, + "learning_rate": 1.86353960814494e-05, + "loss": 0.5653, + "step": 17656 + }, + { + "epoch": 2.8824129627362147, + "grad_norm": 1.7353768348693848, + "learning_rate": 1.863523605229932e-05, + "loss": 0.5119, + "step": 17657 + }, + { + "epoch": 2.882576221378719, + "grad_norm": 1.882068395614624, + "learning_rate": 1.863507601445356e-05, + "loss": 0.5962, + "step": 17658 + }, + { + "epoch": 2.8827394800212236, + "grad_norm": 1.4893999099731445, + "learning_rate": 1.8634915967912278e-05, + "loss": 0.509, + "step": 17659 + }, + { + "epoch": 2.882902738663728, + "grad_norm": 1.578779935836792, + "learning_rate": 1.863475591267564e-05, + "loss": 0.4792, + "step": 17660 + }, + { + "epoch": 2.8830659973062325, + "grad_norm": 1.6673457622528076, + "learning_rate": 1.8634595848743804e-05, + "loss": 0.5812, + "step": 17661 + }, + { + "epoch": 2.883229255948737, + "grad_norm": 1.8076609373092651, + "learning_rate": 1.8634435776116935e-05, + "loss": 0.5854, + "step": 17662 + }, + { + "epoch": 2.8833925145912414, + "grad_norm": 1.9446676969528198, + "learning_rate": 1.863427569479519e-05, + "loss": 0.5839, + "step": 17663 + }, + { + "epoch": 2.883555773233746, + "grad_norm": 1.836538314819336, + "learning_rate": 1.863411560477873e-05, + "loss": 0.4936, + "step": 17664 + }, + { + "epoch": 2.88371903187625, + "grad_norm": 1.7473241090774536, + "learning_rate": 1.8633955506067717e-05, + "loss": 0.5428, + "step": 17665 + }, + { + "epoch": 2.883882290518754, + "grad_norm": 1.9030282497406006, + "learning_rate": 1.8633795398662315e-05, + "loss": 0.5532, + "step": 17666 + }, + { + "epoch": 2.8840455491612587, + "grad_norm": 1.853783369064331, + "learning_rate": 1.8633635282562683e-05, + "loss": 0.5741, + "step": 17667 + }, + { + "epoch": 2.884208807803763, + "grad_norm": 1.7717732191085815, + "learning_rate": 1.8633475157768986e-05, + "loss": 0.5668, + "step": 17668 + }, + { + "epoch": 2.8843720664462675, + "grad_norm": 1.691122055053711, + "learning_rate": 1.8633315024281376e-05, + "loss": 0.5644, + "step": 17669 + }, + { + "epoch": 2.884535325088772, + "grad_norm": 1.674294352531433, + "learning_rate": 1.8633154882100024e-05, + "loss": 0.5883, + "step": 17670 + }, + { + "epoch": 2.8846985837312764, + "grad_norm": 1.6296192407608032, + "learning_rate": 1.8632994731225084e-05, + "loss": 0.5413, + "step": 17671 + }, + { + "epoch": 2.8848618423737804, + "grad_norm": 1.7110458612442017, + "learning_rate": 1.8632834571656727e-05, + "loss": 0.5481, + "step": 17672 + }, + { + "epoch": 2.885025101016285, + "grad_norm": 1.889492392539978, + "learning_rate": 1.8632674403395102e-05, + "loss": 0.5384, + "step": 17673 + }, + { + "epoch": 2.8851883596587893, + "grad_norm": 1.556234359741211, + "learning_rate": 1.8632514226440377e-05, + "loss": 0.5246, + "step": 17674 + }, + { + "epoch": 2.8853516183012937, + "grad_norm": 1.763159155845642, + "learning_rate": 1.8632354040792716e-05, + "loss": 0.5985, + "step": 17675 + }, + { + "epoch": 2.885514876943798, + "grad_norm": 1.8138935565948486, + "learning_rate": 1.863219384645227e-05, + "loss": 0.5705, + "step": 17676 + }, + { + "epoch": 2.8856781355863026, + "grad_norm": 1.7421720027923584, + "learning_rate": 1.863203364341921e-05, + "loss": 0.5802, + "step": 17677 + }, + { + "epoch": 2.885841394228807, + "grad_norm": 1.678167462348938, + "learning_rate": 1.86318734316937e-05, + "loss": 0.4673, + "step": 17678 + }, + { + "epoch": 2.8860046528713115, + "grad_norm": 1.6231141090393066, + "learning_rate": 1.863171321127589e-05, + "loss": 0.5437, + "step": 17679 + }, + { + "epoch": 2.886167911513816, + "grad_norm": 1.7522189617156982, + "learning_rate": 1.8631552982165946e-05, + "loss": 0.4901, + "step": 17680 + }, + { + "epoch": 2.8863311701563203, + "grad_norm": 1.8749809265136719, + "learning_rate": 1.8631392744364034e-05, + "loss": 0.5836, + "step": 17681 + }, + { + "epoch": 2.886494428798825, + "grad_norm": 1.7159922122955322, + "learning_rate": 1.863123249787031e-05, + "loss": 0.5992, + "step": 17682 + }, + { + "epoch": 2.886657687441329, + "grad_norm": 1.900062918663025, + "learning_rate": 1.863107224268494e-05, + "loss": 0.5491, + "step": 17683 + }, + { + "epoch": 2.886820946083833, + "grad_norm": 1.775831699371338, + "learning_rate": 1.863091197880808e-05, + "loss": 0.5851, + "step": 17684 + }, + { + "epoch": 2.8869842047263377, + "grad_norm": 1.7001153230667114, + "learning_rate": 1.8630751706239894e-05, + "loss": 0.6041, + "step": 17685 + }, + { + "epoch": 2.887147463368842, + "grad_norm": 1.7428109645843506, + "learning_rate": 1.8630591424980542e-05, + "loss": 0.5864, + "step": 17686 + }, + { + "epoch": 2.8873107220113465, + "grad_norm": 1.6521086692810059, + "learning_rate": 1.863043113503019e-05, + "loss": 0.5472, + "step": 17687 + }, + { + "epoch": 2.887473980653851, + "grad_norm": 1.3973227739334106, + "learning_rate": 1.8630270836388994e-05, + "loss": 0.505, + "step": 17688 + }, + { + "epoch": 2.8876372392963554, + "grad_norm": 1.7875254154205322, + "learning_rate": 1.8630110529057118e-05, + "loss": 0.605, + "step": 17689 + }, + { + "epoch": 2.8878004979388594, + "grad_norm": 1.7325220108032227, + "learning_rate": 1.862995021303472e-05, + "loss": 0.5034, + "step": 17690 + }, + { + "epoch": 2.887963756581364, + "grad_norm": 1.377974033355713, + "learning_rate": 1.862978988832197e-05, + "loss": 0.4334, + "step": 17691 + }, + { + "epoch": 2.8881270152238683, + "grad_norm": 1.6927976608276367, + "learning_rate": 1.8629629554919024e-05, + "loss": 0.5382, + "step": 17692 + }, + { + "epoch": 2.8882902738663727, + "grad_norm": 1.6076256036758423, + "learning_rate": 1.8629469212826037e-05, + "loss": 0.5347, + "step": 17693 + }, + { + "epoch": 2.888453532508877, + "grad_norm": 1.9172288179397583, + "learning_rate": 1.862930886204318e-05, + "loss": 0.5881, + "step": 17694 + }, + { + "epoch": 2.8886167911513816, + "grad_norm": 1.923653483390808, + "learning_rate": 1.8629148502570613e-05, + "loss": 0.4944, + "step": 17695 + }, + { + "epoch": 2.888780049793886, + "grad_norm": 1.6875557899475098, + "learning_rate": 1.8628988134408494e-05, + "loss": 0.5376, + "step": 17696 + }, + { + "epoch": 2.8889433084363905, + "grad_norm": 1.9606821537017822, + "learning_rate": 1.8628827757556985e-05, + "loss": 0.5411, + "step": 17697 + }, + { + "epoch": 2.889106567078895, + "grad_norm": 2.1482462882995605, + "learning_rate": 1.862866737201625e-05, + "loss": 0.597, + "step": 17698 + }, + { + "epoch": 2.8892698257213993, + "grad_norm": 2.1258907318115234, + "learning_rate": 1.862850697778645e-05, + "loss": 0.6809, + "step": 17699 + }, + { + "epoch": 2.889433084363904, + "grad_norm": 2.0534894466400146, + "learning_rate": 1.8628346574867748e-05, + "loss": 0.6572, + "step": 17700 + }, + { + "epoch": 2.8895963430064078, + "grad_norm": 1.606369972229004, + "learning_rate": 1.86281861632603e-05, + "loss": 0.5138, + "step": 17701 + }, + { + "epoch": 2.889759601648912, + "grad_norm": 1.9984064102172852, + "learning_rate": 1.862802574296427e-05, + "loss": 0.6654, + "step": 17702 + }, + { + "epoch": 2.8899228602914167, + "grad_norm": 2.041107416152954, + "learning_rate": 1.8627865313979822e-05, + "loss": 0.56, + "step": 17703 + }, + { + "epoch": 2.890086118933921, + "grad_norm": 1.9027751684188843, + "learning_rate": 1.8627704876307116e-05, + "loss": 0.5438, + "step": 17704 + }, + { + "epoch": 2.8902493775764255, + "grad_norm": 1.8078527450561523, + "learning_rate": 1.8627544429946312e-05, + "loss": 0.5399, + "step": 17705 + }, + { + "epoch": 2.89041263621893, + "grad_norm": 1.8444437980651855, + "learning_rate": 1.8627383974897576e-05, + "loss": 0.6477, + "step": 17706 + }, + { + "epoch": 2.890575894861434, + "grad_norm": 1.5590691566467285, + "learning_rate": 1.8627223511161063e-05, + "loss": 0.5031, + "step": 17707 + }, + { + "epoch": 2.8907391535039384, + "grad_norm": 1.8037335872650146, + "learning_rate": 1.862706303873694e-05, + "loss": 0.6616, + "step": 17708 + }, + { + "epoch": 2.890902412146443, + "grad_norm": 2.1408469676971436, + "learning_rate": 1.862690255762537e-05, + "loss": 0.6798, + "step": 17709 + }, + { + "epoch": 2.8910656707889473, + "grad_norm": 1.9356060028076172, + "learning_rate": 1.8626742067826506e-05, + "loss": 0.602, + "step": 17710 + }, + { + "epoch": 2.8912289294314517, + "grad_norm": 1.5979702472686768, + "learning_rate": 1.8626581569340517e-05, + "loss": 0.5092, + "step": 17711 + }, + { + "epoch": 2.891392188073956, + "grad_norm": 1.755308747291565, + "learning_rate": 1.8626421062167562e-05, + "loss": 0.5769, + "step": 17712 + }, + { + "epoch": 2.8915554467164606, + "grad_norm": 4.770992279052734, + "learning_rate": 1.8626260546307806e-05, + "loss": 0.569, + "step": 17713 + }, + { + "epoch": 2.891718705358965, + "grad_norm": 1.9905551671981812, + "learning_rate": 1.8626100021761406e-05, + "loss": 0.6763, + "step": 17714 + }, + { + "epoch": 2.8918819640014695, + "grad_norm": 1.5221818685531616, + "learning_rate": 1.8625939488528523e-05, + "loss": 0.6304, + "step": 17715 + }, + { + "epoch": 2.892045222643974, + "grad_norm": 1.5903034210205078, + "learning_rate": 1.8625778946609324e-05, + "loss": 0.5203, + "step": 17716 + }, + { + "epoch": 2.8922084812864783, + "grad_norm": 1.6642882823944092, + "learning_rate": 1.8625618396003972e-05, + "loss": 0.4773, + "step": 17717 + }, + { + "epoch": 2.8923717399289823, + "grad_norm": 1.7729309797286987, + "learning_rate": 1.862545783671262e-05, + "loss": 0.5629, + "step": 17718 + }, + { + "epoch": 2.8925349985714868, + "grad_norm": 2.0539376735687256, + "learning_rate": 1.8625297268735434e-05, + "loss": 0.6527, + "step": 17719 + }, + { + "epoch": 2.892698257213991, + "grad_norm": 1.753027081489563, + "learning_rate": 1.8625136692072577e-05, + "loss": 0.6177, + "step": 17720 + }, + { + "epoch": 2.8928615158564956, + "grad_norm": 2.245826244354248, + "learning_rate": 1.862497610672421e-05, + "loss": 0.6956, + "step": 17721 + }, + { + "epoch": 2.893024774499, + "grad_norm": 1.6018072366714478, + "learning_rate": 1.8624815512690492e-05, + "loss": 0.5798, + "step": 17722 + }, + { + "epoch": 2.8931880331415045, + "grad_norm": 1.6475608348846436, + "learning_rate": 1.862465490997159e-05, + "loss": 0.5919, + "step": 17723 + }, + { + "epoch": 2.893351291784009, + "grad_norm": 1.9945629835128784, + "learning_rate": 1.862449429856766e-05, + "loss": 0.6015, + "step": 17724 + }, + { + "epoch": 2.893514550426513, + "grad_norm": 1.9120267629623413, + "learning_rate": 1.8624333678478867e-05, + "loss": 0.5391, + "step": 17725 + }, + { + "epoch": 2.8936778090690174, + "grad_norm": 1.7489694356918335, + "learning_rate": 1.8624173049705374e-05, + "loss": 0.5834, + "step": 17726 + }, + { + "epoch": 2.893841067711522, + "grad_norm": 1.737308144569397, + "learning_rate": 1.862401241224734e-05, + "loss": 0.4917, + "step": 17727 + }, + { + "epoch": 2.8940043263540263, + "grad_norm": 1.523258924484253, + "learning_rate": 1.8623851766104928e-05, + "loss": 0.4855, + "step": 17728 + }, + { + "epoch": 2.8941675849965307, + "grad_norm": 1.5458741188049316, + "learning_rate": 1.86236911112783e-05, + "loss": 0.5464, + "step": 17729 + }, + { + "epoch": 2.894330843639035, + "grad_norm": 1.9457720518112183, + "learning_rate": 1.8623530447767617e-05, + "loss": 0.5481, + "step": 17730 + }, + { + "epoch": 2.8944941022815396, + "grad_norm": 1.8386073112487793, + "learning_rate": 1.862336977557304e-05, + "loss": 0.569, + "step": 17731 + }, + { + "epoch": 2.894657360924044, + "grad_norm": 1.4955552816390991, + "learning_rate": 1.8623209094694736e-05, + "loss": 0.4671, + "step": 17732 + }, + { + "epoch": 2.8948206195665485, + "grad_norm": 2.082251787185669, + "learning_rate": 1.8623048405132856e-05, + "loss": 0.5329, + "step": 17733 + }, + { + "epoch": 2.894983878209053, + "grad_norm": 1.9564498662948608, + "learning_rate": 1.8622887706887574e-05, + "loss": 0.6455, + "step": 17734 + }, + { + "epoch": 2.8951471368515573, + "grad_norm": 1.6800943613052368, + "learning_rate": 1.8622726999959045e-05, + "loss": 0.5891, + "step": 17735 + }, + { + "epoch": 2.8953103954940613, + "grad_norm": 1.9633570909500122, + "learning_rate": 1.8622566284347433e-05, + "loss": 0.5795, + "step": 17736 + }, + { + "epoch": 2.8954736541365658, + "grad_norm": 1.5250705480575562, + "learning_rate": 1.8622405560052896e-05, + "loss": 0.4851, + "step": 17737 + }, + { + "epoch": 2.89563691277907, + "grad_norm": 1.9209930896759033, + "learning_rate": 1.86222448270756e-05, + "loss": 0.6167, + "step": 17738 + }, + { + "epoch": 2.8958001714215746, + "grad_norm": 1.7274808883666992, + "learning_rate": 1.862208408541571e-05, + "loss": 0.5873, + "step": 17739 + }, + { + "epoch": 2.895963430064079, + "grad_norm": 1.9656296968460083, + "learning_rate": 1.8621923335073378e-05, + "loss": 0.5861, + "step": 17740 + }, + { + "epoch": 2.8961266887065835, + "grad_norm": 1.5797642469406128, + "learning_rate": 1.8621762576048775e-05, + "loss": 0.6379, + "step": 17741 + }, + { + "epoch": 2.896289947349088, + "grad_norm": 2.1134722232818604, + "learning_rate": 1.862160180834206e-05, + "loss": 0.7104, + "step": 17742 + }, + { + "epoch": 2.896453205991592, + "grad_norm": 1.6758716106414795, + "learning_rate": 1.862144103195339e-05, + "loss": 0.5438, + "step": 17743 + }, + { + "epoch": 2.8966164646340964, + "grad_norm": 2.0017921924591064, + "learning_rate": 1.862128024688293e-05, + "loss": 0.6798, + "step": 17744 + }, + { + "epoch": 2.896779723276601, + "grad_norm": 1.6614614725112915, + "learning_rate": 1.862111945313085e-05, + "loss": 0.5051, + "step": 17745 + }, + { + "epoch": 2.8969429819191053, + "grad_norm": 1.5223561525344849, + "learning_rate": 1.8620958650697302e-05, + "loss": 0.504, + "step": 17746 + }, + { + "epoch": 2.8971062405616097, + "grad_norm": 1.59871244430542, + "learning_rate": 1.8620797839582447e-05, + "loss": 0.5238, + "step": 17747 + }, + { + "epoch": 2.897269499204114, + "grad_norm": 1.645944356918335, + "learning_rate": 1.8620637019786456e-05, + "loss": 0.5571, + "step": 17748 + }, + { + "epoch": 2.8974327578466186, + "grad_norm": 1.8247694969177246, + "learning_rate": 1.8620476191309486e-05, + "loss": 0.6611, + "step": 17749 + }, + { + "epoch": 2.897596016489123, + "grad_norm": 1.7871205806732178, + "learning_rate": 1.8620315354151695e-05, + "loss": 0.473, + "step": 17750 + }, + { + "epoch": 2.8977592751316275, + "grad_norm": 1.4744175672531128, + "learning_rate": 1.862015450831325e-05, + "loss": 0.4579, + "step": 17751 + }, + { + "epoch": 2.897922533774132, + "grad_norm": 1.8065730333328247, + "learning_rate": 1.8619993653794312e-05, + "loss": 0.5721, + "step": 17752 + }, + { + "epoch": 2.8980857924166363, + "grad_norm": 2.013733386993408, + "learning_rate": 1.8619832790595045e-05, + "loss": 0.5852, + "step": 17753 + }, + { + "epoch": 2.8982490510591403, + "grad_norm": 1.8639310598373413, + "learning_rate": 1.8619671918715605e-05, + "loss": 0.6922, + "step": 17754 + }, + { + "epoch": 2.8984123097016448, + "grad_norm": 1.576262354850769, + "learning_rate": 1.8619511038156158e-05, + "loss": 0.5014, + "step": 17755 + }, + { + "epoch": 2.898575568344149, + "grad_norm": 1.6058295965194702, + "learning_rate": 1.861935014891687e-05, + "loss": 0.6141, + "step": 17756 + }, + { + "epoch": 2.8987388269866536, + "grad_norm": 1.9900966882705688, + "learning_rate": 1.8619189250997895e-05, + "loss": 0.6287, + "step": 17757 + }, + { + "epoch": 2.898902085629158, + "grad_norm": 1.9293241500854492, + "learning_rate": 1.8619028344399397e-05, + "loss": 0.5906, + "step": 17758 + }, + { + "epoch": 2.8990653442716625, + "grad_norm": 1.4846386909484863, + "learning_rate": 1.8618867429121543e-05, + "loss": 0.4792, + "step": 17759 + }, + { + "epoch": 2.8992286029141665, + "grad_norm": 1.2799679040908813, + "learning_rate": 1.861870650516449e-05, + "loss": 0.4613, + "step": 17760 + }, + { + "epoch": 2.899391861556671, + "grad_norm": 1.4729598760604858, + "learning_rate": 1.8618545572528405e-05, + "loss": 0.541, + "step": 17761 + }, + { + "epoch": 2.8995551201991754, + "grad_norm": 1.5994806289672852, + "learning_rate": 1.8618384631213446e-05, + "loss": 0.5127, + "step": 17762 + }, + { + "epoch": 2.89971837884168, + "grad_norm": 1.960179090499878, + "learning_rate": 1.861822368121977e-05, + "loss": 0.629, + "step": 17763 + }, + { + "epoch": 2.8998816374841843, + "grad_norm": 2.0977401733398438, + "learning_rate": 1.861806272254755e-05, + "loss": 0.5883, + "step": 17764 + }, + { + "epoch": 2.9000448961266887, + "grad_norm": 1.8194873332977295, + "learning_rate": 1.8617901755196947e-05, + "loss": 0.5983, + "step": 17765 + }, + { + "epoch": 2.900208154769193, + "grad_norm": 1.57864511013031, + "learning_rate": 1.8617740779168114e-05, + "loss": 0.5437, + "step": 17766 + }, + { + "epoch": 2.9003714134116976, + "grad_norm": 1.9332852363586426, + "learning_rate": 1.861757979446122e-05, + "loss": 0.7716, + "step": 17767 + }, + { + "epoch": 2.900534672054202, + "grad_norm": 1.50779128074646, + "learning_rate": 1.8617418801076424e-05, + "loss": 0.4807, + "step": 17768 + }, + { + "epoch": 2.9006979306967065, + "grad_norm": 1.7065582275390625, + "learning_rate": 1.861725779901389e-05, + "loss": 0.4919, + "step": 17769 + }, + { + "epoch": 2.900861189339211, + "grad_norm": 2.0082521438598633, + "learning_rate": 1.8617096788273778e-05, + "loss": 0.5676, + "step": 17770 + }, + { + "epoch": 2.901024447981715, + "grad_norm": 1.5769697427749634, + "learning_rate": 1.8616935768856255e-05, + "loss": 0.5039, + "step": 17771 + }, + { + "epoch": 2.9011877066242193, + "grad_norm": 1.821488380432129, + "learning_rate": 1.861677474076148e-05, + "loss": 0.5767, + "step": 17772 + }, + { + "epoch": 2.9013509652667238, + "grad_norm": 2.0135397911071777, + "learning_rate": 1.861661370398961e-05, + "loss": 0.713, + "step": 17773 + }, + { + "epoch": 2.901514223909228, + "grad_norm": 1.7651093006134033, + "learning_rate": 1.8616452658540817e-05, + "loss": 0.4902, + "step": 17774 + }, + { + "epoch": 2.9016774825517326, + "grad_norm": 1.6385222673416138, + "learning_rate": 1.861629160441526e-05, + "loss": 0.5747, + "step": 17775 + }, + { + "epoch": 2.901840741194237, + "grad_norm": 2.0526561737060547, + "learning_rate": 1.8616130541613095e-05, + "loss": 0.6435, + "step": 17776 + }, + { + "epoch": 2.9020039998367415, + "grad_norm": 1.7237627506256104, + "learning_rate": 1.861596947013449e-05, + "loss": 0.5383, + "step": 17777 + }, + { + "epoch": 2.9021672584792455, + "grad_norm": 2.245058059692383, + "learning_rate": 1.861580838997961e-05, + "loss": 0.7003, + "step": 17778 + }, + { + "epoch": 2.90233051712175, + "grad_norm": 1.9089449644088745, + "learning_rate": 1.861564730114861e-05, + "loss": 0.6323, + "step": 17779 + }, + { + "epoch": 2.9024937757642544, + "grad_norm": 1.781760573387146, + "learning_rate": 1.8615486203641655e-05, + "loss": 0.5843, + "step": 17780 + }, + { + "epoch": 2.902657034406759, + "grad_norm": 1.9279301166534424, + "learning_rate": 1.861532509745891e-05, + "loss": 0.5614, + "step": 17781 + }, + { + "epoch": 2.9028202930492633, + "grad_norm": 1.452147126197815, + "learning_rate": 1.8615163982600533e-05, + "loss": 0.4492, + "step": 17782 + }, + { + "epoch": 2.9029835516917677, + "grad_norm": 1.9928741455078125, + "learning_rate": 1.8615002859066686e-05, + "loss": 0.651, + "step": 17783 + }, + { + "epoch": 2.903146810334272, + "grad_norm": 1.8142743110656738, + "learning_rate": 1.8614841726857538e-05, + "loss": 0.5263, + "step": 17784 + }, + { + "epoch": 2.9033100689767766, + "grad_norm": 1.7655153274536133, + "learning_rate": 1.8614680585973244e-05, + "loss": 0.5634, + "step": 17785 + }, + { + "epoch": 2.903473327619281, + "grad_norm": 1.7965879440307617, + "learning_rate": 1.8614519436413968e-05, + "loss": 0.5544, + "step": 17786 + }, + { + "epoch": 2.9036365862617854, + "grad_norm": 1.8769704103469849, + "learning_rate": 1.8614358278179878e-05, + "loss": 0.6152, + "step": 17787 + }, + { + "epoch": 2.90379984490429, + "grad_norm": 1.6414679288864136, + "learning_rate": 1.8614197111271127e-05, + "loss": 0.54, + "step": 17788 + }, + { + "epoch": 2.903963103546794, + "grad_norm": 2.1783061027526855, + "learning_rate": 1.8614035935687882e-05, + "loss": 0.6508, + "step": 17789 + }, + { + "epoch": 2.9041263621892983, + "grad_norm": 1.7575427293777466, + "learning_rate": 1.8613874751430304e-05, + "loss": 0.5582, + "step": 17790 + }, + { + "epoch": 2.9042896208318028, + "grad_norm": 2.085942506790161, + "learning_rate": 1.861371355849856e-05, + "loss": 0.6602, + "step": 17791 + }, + { + "epoch": 2.904452879474307, + "grad_norm": 1.4218968152999878, + "learning_rate": 1.861355235689281e-05, + "loss": 0.474, + "step": 17792 + }, + { + "epoch": 2.9046161381168116, + "grad_norm": 1.9140852689743042, + "learning_rate": 1.861339114661321e-05, + "loss": 0.6796, + "step": 17793 + }, + { + "epoch": 2.904779396759316, + "grad_norm": 2.048856019973755, + "learning_rate": 1.861322992765993e-05, + "loss": 0.6875, + "step": 17794 + }, + { + "epoch": 2.90494265540182, + "grad_norm": 1.7481664419174194, + "learning_rate": 1.861306870003313e-05, + "loss": 0.5182, + "step": 17795 + }, + { + "epoch": 2.9051059140443245, + "grad_norm": 1.760698676109314, + "learning_rate": 1.8612907463732973e-05, + "loss": 0.5451, + "step": 17796 + }, + { + "epoch": 2.905269172686829, + "grad_norm": 1.713754415512085, + "learning_rate": 1.861274621875962e-05, + "loss": 0.5662, + "step": 17797 + }, + { + "epoch": 2.9054324313293334, + "grad_norm": 1.8880982398986816, + "learning_rate": 1.861258496511323e-05, + "loss": 0.4966, + "step": 17798 + }, + { + "epoch": 2.905595689971838, + "grad_norm": 1.7429906129837036, + "learning_rate": 1.8612423702793974e-05, + "loss": 0.5173, + "step": 17799 + }, + { + "epoch": 2.9057589486143423, + "grad_norm": 1.5395079851150513, + "learning_rate": 1.861226243180201e-05, + "loss": 0.5679, + "step": 17800 + }, + { + "epoch": 2.9059222072568467, + "grad_norm": 2.0469460487365723, + "learning_rate": 1.8612101152137495e-05, + "loss": 0.5671, + "step": 17801 + }, + { + "epoch": 2.906085465899351, + "grad_norm": 1.520018219947815, + "learning_rate": 1.86119398638006e-05, + "loss": 0.5258, + "step": 17802 + }, + { + "epoch": 2.9062487245418556, + "grad_norm": 1.7438335418701172, + "learning_rate": 1.8611778566791483e-05, + "loss": 0.6385, + "step": 17803 + }, + { + "epoch": 2.90641198318436, + "grad_norm": 1.69405198097229, + "learning_rate": 1.8611617261110306e-05, + "loss": 0.4847, + "step": 17804 + }, + { + "epoch": 2.9065752418268644, + "grad_norm": 2.163175106048584, + "learning_rate": 1.8611455946757236e-05, + "loss": 0.7049, + "step": 17805 + }, + { + "epoch": 2.9067385004693684, + "grad_norm": 2.064250946044922, + "learning_rate": 1.861129462373243e-05, + "loss": 0.5854, + "step": 17806 + }, + { + "epoch": 2.906901759111873, + "grad_norm": 1.7847532033920288, + "learning_rate": 1.861113329203605e-05, + "loss": 0.5038, + "step": 17807 + }, + { + "epoch": 2.9070650177543773, + "grad_norm": 1.8696763515472412, + "learning_rate": 1.8610971951668265e-05, + "loss": 0.589, + "step": 17808 + }, + { + "epoch": 2.9072282763968817, + "grad_norm": 2.001246690750122, + "learning_rate": 1.8610810602629233e-05, + "loss": 0.6272, + "step": 17809 + }, + { + "epoch": 2.907391535039386, + "grad_norm": 1.5194973945617676, + "learning_rate": 1.8610649244919114e-05, + "loss": 0.5316, + "step": 17810 + }, + { + "epoch": 2.9075547936818906, + "grad_norm": 1.7623050212860107, + "learning_rate": 1.861048787853808e-05, + "loss": 0.5045, + "step": 17811 + }, + { + "epoch": 2.907718052324395, + "grad_norm": 1.9318989515304565, + "learning_rate": 1.861032650348628e-05, + "loss": 0.6392, + "step": 17812 + }, + { + "epoch": 2.907881310966899, + "grad_norm": 1.8209224939346313, + "learning_rate": 1.8610165119763887e-05, + "loss": 0.5798, + "step": 17813 + }, + { + "epoch": 2.9080445696094035, + "grad_norm": 1.6228828430175781, + "learning_rate": 1.861000372737106e-05, + "loss": 0.5053, + "step": 17814 + }, + { + "epoch": 2.908207828251908, + "grad_norm": 1.9301124811172485, + "learning_rate": 1.8609842326307958e-05, + "loss": 0.5389, + "step": 17815 + }, + { + "epoch": 2.9083710868944124, + "grad_norm": 1.9047948122024536, + "learning_rate": 1.8609680916574753e-05, + "loss": 0.6502, + "step": 17816 + }, + { + "epoch": 2.908534345536917, + "grad_norm": 1.8809958696365356, + "learning_rate": 1.8609519498171597e-05, + "loss": 0.5529, + "step": 17817 + }, + { + "epoch": 2.9086976041794212, + "grad_norm": 2.090946674346924, + "learning_rate": 1.8609358071098658e-05, + "loss": 0.6526, + "step": 17818 + }, + { + "epoch": 2.9088608628219257, + "grad_norm": 1.687947154045105, + "learning_rate": 1.86091966353561e-05, + "loss": 0.5428, + "step": 17819 + }, + { + "epoch": 2.90902412146443, + "grad_norm": 1.7916088104248047, + "learning_rate": 1.860903519094408e-05, + "loss": 0.5875, + "step": 17820 + }, + { + "epoch": 2.9091873801069346, + "grad_norm": 1.8782737255096436, + "learning_rate": 1.8608873737862767e-05, + "loss": 0.5784, + "step": 17821 + }, + { + "epoch": 2.909350638749439, + "grad_norm": 2.060441255569458, + "learning_rate": 1.8608712276112317e-05, + "loss": 0.6317, + "step": 17822 + }, + { + "epoch": 2.9095138973919434, + "grad_norm": 1.7932711839675903, + "learning_rate": 1.86085508056929e-05, + "loss": 0.6074, + "step": 17823 + }, + { + "epoch": 2.9096771560344474, + "grad_norm": 1.8903188705444336, + "learning_rate": 1.860838932660467e-05, + "loss": 0.5922, + "step": 17824 + }, + { + "epoch": 2.909840414676952, + "grad_norm": 2.0575807094573975, + "learning_rate": 1.86082278388478e-05, + "loss": 0.6445, + "step": 17825 + }, + { + "epoch": 2.9100036733194563, + "grad_norm": 1.8959766626358032, + "learning_rate": 1.8608066342422443e-05, + "loss": 0.6189, + "step": 17826 + }, + { + "epoch": 2.9101669319619607, + "grad_norm": 1.5961552858352661, + "learning_rate": 1.8607904837328765e-05, + "loss": 0.4831, + "step": 17827 + }, + { + "epoch": 2.910330190604465, + "grad_norm": 1.5772615671157837, + "learning_rate": 1.860774332356693e-05, + "loss": 0.4707, + "step": 17828 + }, + { + "epoch": 2.9104934492469696, + "grad_norm": 2.126033306121826, + "learning_rate": 1.86075818011371e-05, + "loss": 0.6519, + "step": 17829 + }, + { + "epoch": 2.910656707889474, + "grad_norm": 2.1368894577026367, + "learning_rate": 1.860742027003944e-05, + "loss": 0.8552, + "step": 17830 + }, + { + "epoch": 2.910819966531978, + "grad_norm": 1.7530128955841064, + "learning_rate": 1.8607258730274106e-05, + "loss": 0.5154, + "step": 17831 + }, + { + "epoch": 2.9109832251744825, + "grad_norm": 1.7587404251098633, + "learning_rate": 1.8607097181841265e-05, + "loss": 0.5631, + "step": 17832 + }, + { + "epoch": 2.911146483816987, + "grad_norm": 1.6685981750488281, + "learning_rate": 1.8606935624741082e-05, + "loss": 0.5161, + "step": 17833 + }, + { + "epoch": 2.9113097424594914, + "grad_norm": 1.8164883852005005, + "learning_rate": 1.8606774058973715e-05, + "loss": 0.6677, + "step": 17834 + }, + { + "epoch": 2.911473001101996, + "grad_norm": 1.7784061431884766, + "learning_rate": 1.860661248453933e-05, + "loss": 0.5501, + "step": 17835 + }, + { + "epoch": 2.9116362597445002, + "grad_norm": 1.8102604150772095, + "learning_rate": 1.860645090143809e-05, + "loss": 0.6518, + "step": 17836 + }, + { + "epoch": 2.9117995183870047, + "grad_norm": 1.4981985092163086, + "learning_rate": 1.8606289309670155e-05, + "loss": 0.4396, + "step": 17837 + }, + { + "epoch": 2.911962777029509, + "grad_norm": 1.5951236486434937, + "learning_rate": 1.8606127709235684e-05, + "loss": 0.4779, + "step": 17838 + }, + { + "epoch": 2.9121260356720136, + "grad_norm": 1.7296514511108398, + "learning_rate": 1.860596610013485e-05, + "loss": 0.6179, + "step": 17839 + }, + { + "epoch": 2.912289294314518, + "grad_norm": 1.887043833732605, + "learning_rate": 1.860580448236781e-05, + "loss": 0.5534, + "step": 17840 + }, + { + "epoch": 2.9124525529570224, + "grad_norm": 1.630205750465393, + "learning_rate": 1.8605642855934727e-05, + "loss": 0.5271, + "step": 17841 + }, + { + "epoch": 2.9126158115995264, + "grad_norm": 2.0895328521728516, + "learning_rate": 1.8605481220835765e-05, + "loss": 0.6507, + "step": 17842 + }, + { + "epoch": 2.912779070242031, + "grad_norm": 1.5169239044189453, + "learning_rate": 1.860531957707108e-05, + "loss": 0.499, + "step": 17843 + }, + { + "epoch": 2.9129423288845353, + "grad_norm": 1.606407880783081, + "learning_rate": 1.8605157924640845e-05, + "loss": 0.496, + "step": 17844 + }, + { + "epoch": 2.9131055875270397, + "grad_norm": 1.734812617301941, + "learning_rate": 1.860499626354522e-05, + "loss": 0.5629, + "step": 17845 + }, + { + "epoch": 2.913268846169544, + "grad_norm": 1.682987093925476, + "learning_rate": 1.860483459378436e-05, + "loss": 0.5435, + "step": 17846 + }, + { + "epoch": 2.9134321048120486, + "grad_norm": 1.9356775283813477, + "learning_rate": 1.8604672915358438e-05, + "loss": 0.5537, + "step": 17847 + }, + { + "epoch": 2.9135953634545526, + "grad_norm": 1.9208205938339233, + "learning_rate": 1.8604511228267615e-05, + "loss": 0.5946, + "step": 17848 + }, + { + "epoch": 2.913758622097057, + "grad_norm": 1.682381272315979, + "learning_rate": 1.8604349532512048e-05, + "loss": 0.4915, + "step": 17849 + }, + { + "epoch": 2.9139218807395615, + "grad_norm": 1.750856637954712, + "learning_rate": 1.8604187828091906e-05, + "loss": 0.515, + "step": 17850 + }, + { + "epoch": 2.914085139382066, + "grad_norm": 1.7758454084396362, + "learning_rate": 1.8604026115007346e-05, + "loss": 0.6066, + "step": 17851 + }, + { + "epoch": 2.9142483980245704, + "grad_norm": 2.3196723461151123, + "learning_rate": 1.8603864393258534e-05, + "loss": 0.6044, + "step": 17852 + }, + { + "epoch": 2.914411656667075, + "grad_norm": 1.8440107107162476, + "learning_rate": 1.8603702662845634e-05, + "loss": 0.6408, + "step": 17853 + }, + { + "epoch": 2.9145749153095792, + "grad_norm": 1.9932106733322144, + "learning_rate": 1.860354092376881e-05, + "loss": 0.5884, + "step": 17854 + }, + { + "epoch": 2.9147381739520837, + "grad_norm": 1.6793954372406006, + "learning_rate": 1.860337917602822e-05, + "loss": 0.485, + "step": 17855 + }, + { + "epoch": 2.914901432594588, + "grad_norm": 1.557876706123352, + "learning_rate": 1.860321741962403e-05, + "loss": 0.4831, + "step": 17856 + }, + { + "epoch": 2.9150646912370926, + "grad_norm": 2.3762307167053223, + "learning_rate": 1.86030556545564e-05, + "loss": 0.6682, + "step": 17857 + }, + { + "epoch": 2.915227949879597, + "grad_norm": 1.608048439025879, + "learning_rate": 1.8602893880825498e-05, + "loss": 0.5423, + "step": 17858 + }, + { + "epoch": 2.915391208522101, + "grad_norm": 1.9455102682113647, + "learning_rate": 1.8602732098431488e-05, + "loss": 0.5327, + "step": 17859 + }, + { + "epoch": 2.9155544671646054, + "grad_norm": 1.6257545948028564, + "learning_rate": 1.8602570307374523e-05, + "loss": 0.534, + "step": 17860 + }, + { + "epoch": 2.91571772580711, + "grad_norm": 1.6405078172683716, + "learning_rate": 1.8602408507654772e-05, + "loss": 0.6237, + "step": 17861 + }, + { + "epoch": 2.9158809844496143, + "grad_norm": 1.5228326320648193, + "learning_rate": 1.86022466992724e-05, + "loss": 0.4651, + "step": 17862 + }, + { + "epoch": 2.9160442430921187, + "grad_norm": 1.7370457649230957, + "learning_rate": 1.8602084882227568e-05, + "loss": 0.5598, + "step": 17863 + }, + { + "epoch": 2.916207501734623, + "grad_norm": 1.5862135887145996, + "learning_rate": 1.8601923056520437e-05, + "loss": 0.5375, + "step": 17864 + }, + { + "epoch": 2.9163707603771276, + "grad_norm": 1.6465702056884766, + "learning_rate": 1.8601761222151174e-05, + "loss": 0.5697, + "step": 17865 + }, + { + "epoch": 2.9165340190196316, + "grad_norm": 1.7884713411331177, + "learning_rate": 1.860159937911994e-05, + "loss": 0.5668, + "step": 17866 + }, + { + "epoch": 2.916697277662136, + "grad_norm": 1.7640323638916016, + "learning_rate": 1.8601437527426897e-05, + "loss": 0.5043, + "step": 17867 + }, + { + "epoch": 2.9168605363046405, + "grad_norm": 1.7849290370941162, + "learning_rate": 1.8601275667072205e-05, + "loss": 0.5543, + "step": 17868 + }, + { + "epoch": 2.917023794947145, + "grad_norm": 1.8355060815811157, + "learning_rate": 1.8601113798056035e-05, + "loss": 0.6097, + "step": 17869 + }, + { + "epoch": 2.9171870535896494, + "grad_norm": 1.8711811304092407, + "learning_rate": 1.8600951920378546e-05, + "loss": 0.7097, + "step": 17870 + }, + { + "epoch": 2.917350312232154, + "grad_norm": 1.8175523281097412, + "learning_rate": 1.8600790034039895e-05, + "loss": 0.6085, + "step": 17871 + }, + { + "epoch": 2.9175135708746582, + "grad_norm": 1.5256733894348145, + "learning_rate": 1.8600628139040258e-05, + "loss": 0.5092, + "step": 17872 + }, + { + "epoch": 2.9176768295171627, + "grad_norm": 1.6591397523880005, + "learning_rate": 1.8600466235379786e-05, + "loss": 0.4625, + "step": 17873 + }, + { + "epoch": 2.917840088159667, + "grad_norm": 1.8574957847595215, + "learning_rate": 1.860030432305865e-05, + "loss": 0.6631, + "step": 17874 + }, + { + "epoch": 2.9180033468021715, + "grad_norm": 1.4704838991165161, + "learning_rate": 1.8600142402077006e-05, + "loss": 0.5011, + "step": 17875 + }, + { + "epoch": 2.918166605444676, + "grad_norm": 1.302383542060852, + "learning_rate": 1.8599980472435025e-05, + "loss": 0.464, + "step": 17876 + }, + { + "epoch": 2.91832986408718, + "grad_norm": 1.7129923105239868, + "learning_rate": 1.8599818534132865e-05, + "loss": 0.5799, + "step": 17877 + }, + { + "epoch": 2.9184931227296844, + "grad_norm": 1.7768864631652832, + "learning_rate": 1.8599656587170687e-05, + "loss": 0.6283, + "step": 17878 + }, + { + "epoch": 2.918656381372189, + "grad_norm": 1.7070026397705078, + "learning_rate": 1.859949463154866e-05, + "loss": 0.5783, + "step": 17879 + }, + { + "epoch": 2.9188196400146933, + "grad_norm": 1.6146975755691528, + "learning_rate": 1.8599332667266942e-05, + "loss": 0.586, + "step": 17880 + }, + { + "epoch": 2.9189828986571977, + "grad_norm": 1.8681390285491943, + "learning_rate": 1.8599170694325698e-05, + "loss": 0.6175, + "step": 17881 + }, + { + "epoch": 2.919146157299702, + "grad_norm": 1.8025680780410767, + "learning_rate": 1.8599008712725094e-05, + "loss": 0.6021, + "step": 17882 + }, + { + "epoch": 2.919309415942206, + "grad_norm": 1.4294878244400024, + "learning_rate": 1.859884672246529e-05, + "loss": 0.4391, + "step": 17883 + }, + { + "epoch": 2.9194726745847106, + "grad_norm": 2.021430730819702, + "learning_rate": 1.8598684723546448e-05, + "loss": 0.5947, + "step": 17884 + }, + { + "epoch": 2.919635933227215, + "grad_norm": 1.6895673274993896, + "learning_rate": 1.8598522715968736e-05, + "loss": 0.5434, + "step": 17885 + }, + { + "epoch": 2.9197991918697195, + "grad_norm": 1.4866735935211182, + "learning_rate": 1.859836069973231e-05, + "loss": 0.4737, + "step": 17886 + }, + { + "epoch": 2.919962450512224, + "grad_norm": 1.8909248113632202, + "learning_rate": 1.859819867483734e-05, + "loss": 0.5587, + "step": 17887 + }, + { + "epoch": 2.9201257091547284, + "grad_norm": 1.6870230436325073, + "learning_rate": 1.8598036641283985e-05, + "loss": 0.542, + "step": 17888 + }, + { + "epoch": 2.920288967797233, + "grad_norm": 1.384941577911377, + "learning_rate": 1.8597874599072413e-05, + "loss": 0.5075, + "step": 17889 + }, + { + "epoch": 2.9204522264397372, + "grad_norm": 1.696475863456726, + "learning_rate": 1.859771254820278e-05, + "loss": 0.4768, + "step": 17890 + }, + { + "epoch": 2.9206154850822417, + "grad_norm": 1.8641002178192139, + "learning_rate": 1.8597550488675252e-05, + "loss": 0.6883, + "step": 17891 + }, + { + "epoch": 2.920778743724746, + "grad_norm": 1.954168438911438, + "learning_rate": 1.859738842048999e-05, + "loss": 0.5905, + "step": 17892 + }, + { + "epoch": 2.9209420023672505, + "grad_norm": 1.6769558191299438, + "learning_rate": 1.8597226343647165e-05, + "loss": 0.5562, + "step": 17893 + }, + { + "epoch": 2.9211052610097545, + "grad_norm": 1.664737582206726, + "learning_rate": 1.8597064258146935e-05, + "loss": 0.4768, + "step": 17894 + }, + { + "epoch": 2.921268519652259, + "grad_norm": 2.2146146297454834, + "learning_rate": 1.859690216398946e-05, + "loss": 0.6291, + "step": 17895 + }, + { + "epoch": 2.9214317782947634, + "grad_norm": 1.862608551979065, + "learning_rate": 1.8596740061174912e-05, + "loss": 0.5417, + "step": 17896 + }, + { + "epoch": 2.921595036937268, + "grad_norm": 1.954742193222046, + "learning_rate": 1.859657794970345e-05, + "loss": 0.7383, + "step": 17897 + }, + { + "epoch": 2.9217582955797723, + "grad_norm": 2.046867609024048, + "learning_rate": 1.859641582957523e-05, + "loss": 0.6492, + "step": 17898 + }, + { + "epoch": 2.9219215542222767, + "grad_norm": 1.6088579893112183, + "learning_rate": 1.8596253700790427e-05, + "loss": 0.4839, + "step": 17899 + }, + { + "epoch": 2.922084812864781, + "grad_norm": 1.9849224090576172, + "learning_rate": 1.859609156334919e-05, + "loss": 0.609, + "step": 17900 + }, + { + "epoch": 2.922248071507285, + "grad_norm": 1.8670803308486938, + "learning_rate": 1.85959294172517e-05, + "loss": 0.6352, + "step": 17901 + }, + { + "epoch": 2.9224113301497896, + "grad_norm": 1.8991568088531494, + "learning_rate": 1.859576726249811e-05, + "loss": 0.5374, + "step": 17902 + }, + { + "epoch": 2.922574588792294, + "grad_norm": 2.0115537643432617, + "learning_rate": 1.8595605099088583e-05, + "loss": 0.5519, + "step": 17903 + }, + { + "epoch": 2.9227378474347985, + "grad_norm": 1.7305188179016113, + "learning_rate": 1.8595442927023283e-05, + "loss": 0.6648, + "step": 17904 + }, + { + "epoch": 2.922901106077303, + "grad_norm": 1.62247633934021, + "learning_rate": 1.8595280746302376e-05, + "loss": 0.4908, + "step": 17905 + }, + { + "epoch": 2.9230643647198074, + "grad_norm": 1.596406102180481, + "learning_rate": 1.8595118556926023e-05, + "loss": 0.5113, + "step": 17906 + }, + { + "epoch": 2.923227623362312, + "grad_norm": 1.992498517036438, + "learning_rate": 1.859495635889439e-05, + "loss": 0.5376, + "step": 17907 + }, + { + "epoch": 2.9233908820048162, + "grad_norm": 1.7793515920639038, + "learning_rate": 1.8594794152207637e-05, + "loss": 0.5525, + "step": 17908 + }, + { + "epoch": 2.9235541406473207, + "grad_norm": 1.8079369068145752, + "learning_rate": 1.8594631936865926e-05, + "loss": 0.611, + "step": 17909 + }, + { + "epoch": 2.923717399289825, + "grad_norm": 1.638043761253357, + "learning_rate": 1.8594469712869425e-05, + "loss": 0.5575, + "step": 17910 + }, + { + "epoch": 2.9238806579323295, + "grad_norm": 1.5246232748031616, + "learning_rate": 1.8594307480218297e-05, + "loss": 0.4679, + "step": 17911 + }, + { + "epoch": 2.9240439165748335, + "grad_norm": 1.7906066179275513, + "learning_rate": 1.85941452389127e-05, + "loss": 0.5215, + "step": 17912 + }, + { + "epoch": 2.924207175217338, + "grad_norm": 2.0487053394317627, + "learning_rate": 1.8593982988952802e-05, + "loss": 0.6439, + "step": 17913 + }, + { + "epoch": 2.9243704338598424, + "grad_norm": 1.7352861166000366, + "learning_rate": 1.8593820730338766e-05, + "loss": 0.5309, + "step": 17914 + }, + { + "epoch": 2.924533692502347, + "grad_norm": 1.704026699066162, + "learning_rate": 1.8593658463070757e-05, + "loss": 0.5457, + "step": 17915 + }, + { + "epoch": 2.9246969511448513, + "grad_norm": 1.8454428911209106, + "learning_rate": 1.859349618714893e-05, + "loss": 0.6157, + "step": 17916 + }, + { + "epoch": 2.9248602097873557, + "grad_norm": 1.523018717765808, + "learning_rate": 1.859333390257346e-05, + "loss": 0.5112, + "step": 17917 + }, + { + "epoch": 2.92502346842986, + "grad_norm": 1.625593900680542, + "learning_rate": 1.8593171609344505e-05, + "loss": 0.4952, + "step": 17918 + }, + { + "epoch": 2.925186727072364, + "grad_norm": 1.7568827867507935, + "learning_rate": 1.8593009307462227e-05, + "loss": 0.5347, + "step": 17919 + }, + { + "epoch": 2.9253499857148686, + "grad_norm": 1.7919505834579468, + "learning_rate": 1.8592846996926793e-05, + "loss": 0.6035, + "step": 17920 + }, + { + "epoch": 2.925513244357373, + "grad_norm": 2.013686418533325, + "learning_rate": 1.859268467773836e-05, + "loss": 0.5749, + "step": 17921 + }, + { + "epoch": 2.9256765029998775, + "grad_norm": 1.468985676765442, + "learning_rate": 1.85925223498971e-05, + "loss": 0.4519, + "step": 17922 + }, + { + "epoch": 2.925839761642382, + "grad_norm": 1.7490332126617432, + "learning_rate": 1.859236001340317e-05, + "loss": 0.6045, + "step": 17923 + }, + { + "epoch": 2.9260030202848863, + "grad_norm": 1.7930822372436523, + "learning_rate": 1.8592197668256737e-05, + "loss": 0.5351, + "step": 17924 + }, + { + "epoch": 2.926166278927391, + "grad_norm": 2.010746479034424, + "learning_rate": 1.8592035314457963e-05, + "loss": 0.6848, + "step": 17925 + }, + { + "epoch": 2.9263295375698952, + "grad_norm": 1.7743477821350098, + "learning_rate": 1.859187295200701e-05, + "loss": 0.5766, + "step": 17926 + }, + { + "epoch": 2.9264927962123997, + "grad_norm": 1.7078264951705933, + "learning_rate": 1.8591710580904043e-05, + "loss": 0.4881, + "step": 17927 + }, + { + "epoch": 2.926656054854904, + "grad_norm": 1.7915047407150269, + "learning_rate": 1.8591548201149228e-05, + "loss": 0.6081, + "step": 17928 + }, + { + "epoch": 2.9268193134974085, + "grad_norm": 1.8123133182525635, + "learning_rate": 1.8591385812742724e-05, + "loss": 0.5815, + "step": 17929 + }, + { + "epoch": 2.9269825721399125, + "grad_norm": 1.4957998991012573, + "learning_rate": 1.85912234156847e-05, + "loss": 0.4932, + "step": 17930 + }, + { + "epoch": 2.927145830782417, + "grad_norm": 1.822986364364624, + "learning_rate": 1.8591061009975316e-05, + "loss": 0.5924, + "step": 17931 + }, + { + "epoch": 2.9273090894249214, + "grad_norm": 1.765818476676941, + "learning_rate": 1.8590898595614734e-05, + "loss": 0.5113, + "step": 17932 + }, + { + "epoch": 2.927472348067426, + "grad_norm": 1.9998183250427246, + "learning_rate": 1.859073617260312e-05, + "loss": 0.6751, + "step": 17933 + }, + { + "epoch": 2.9276356067099303, + "grad_norm": 2.416313886642456, + "learning_rate": 1.8590573740940635e-05, + "loss": 0.6086, + "step": 17934 + }, + { + "epoch": 2.9277988653524347, + "grad_norm": 1.8034532070159912, + "learning_rate": 1.8590411300627447e-05, + "loss": 0.5486, + "step": 17935 + }, + { + "epoch": 2.9279621239949387, + "grad_norm": 1.6868922710418701, + "learning_rate": 1.8590248851663715e-05, + "loss": 0.515, + "step": 17936 + }, + { + "epoch": 2.928125382637443, + "grad_norm": 1.6406034231185913, + "learning_rate": 1.8590086394049605e-05, + "loss": 0.5817, + "step": 17937 + }, + { + "epoch": 2.9282886412799476, + "grad_norm": 1.8560587167739868, + "learning_rate": 1.8589923927785283e-05, + "loss": 0.6808, + "step": 17938 + }, + { + "epoch": 2.928451899922452, + "grad_norm": 1.895390272140503, + "learning_rate": 1.8589761452870908e-05, + "loss": 0.6702, + "step": 17939 + }, + { + "epoch": 2.9286151585649565, + "grad_norm": 1.541446328163147, + "learning_rate": 1.8589598969306646e-05, + "loss": 0.5023, + "step": 17940 + }, + { + "epoch": 2.928778417207461, + "grad_norm": 1.8904463052749634, + "learning_rate": 1.858943647709266e-05, + "loss": 0.723, + "step": 17941 + }, + { + "epoch": 2.9289416758499653, + "grad_norm": 1.7231184244155884, + "learning_rate": 1.8589273976229112e-05, + "loss": 0.6177, + "step": 17942 + }, + { + "epoch": 2.92910493449247, + "grad_norm": 1.6920517683029175, + "learning_rate": 1.858911146671617e-05, + "loss": 0.5386, + "step": 17943 + }, + { + "epoch": 2.929268193134974, + "grad_norm": 1.6130996942520142, + "learning_rate": 1.8588948948553995e-05, + "loss": 0.589, + "step": 17944 + }, + { + "epoch": 2.9294314517774787, + "grad_norm": 1.5637621879577637, + "learning_rate": 1.858878642174275e-05, + "loss": 0.5132, + "step": 17945 + }, + { + "epoch": 2.929594710419983, + "grad_norm": 1.3991285562515259, + "learning_rate": 1.8588623886282597e-05, + "loss": 0.4328, + "step": 17946 + }, + { + "epoch": 2.929757969062487, + "grad_norm": 2.408189296722412, + "learning_rate": 1.8588461342173704e-05, + "loss": 0.6257, + "step": 17947 + }, + { + "epoch": 2.9299212277049915, + "grad_norm": 1.7660748958587646, + "learning_rate": 1.8588298789416232e-05, + "loss": 0.6096, + "step": 17948 + }, + { + "epoch": 2.930084486347496, + "grad_norm": 2.064765691757202, + "learning_rate": 1.8588136228010347e-05, + "loss": 0.6139, + "step": 17949 + }, + { + "epoch": 2.9302477449900004, + "grad_norm": 1.5302923917770386, + "learning_rate": 1.858797365795621e-05, + "loss": 0.5366, + "step": 17950 + }, + { + "epoch": 2.930411003632505, + "grad_norm": 1.763407826423645, + "learning_rate": 1.8587811079253985e-05, + "loss": 0.447, + "step": 17951 + }, + { + "epoch": 2.9305742622750093, + "grad_norm": 1.7470853328704834, + "learning_rate": 1.858764849190384e-05, + "loss": 0.6152, + "step": 17952 + }, + { + "epoch": 2.9307375209175137, + "grad_norm": 1.69144868850708, + "learning_rate": 1.8587485895905932e-05, + "loss": 0.5614, + "step": 17953 + }, + { + "epoch": 2.9309007795600177, + "grad_norm": 1.6453455686569214, + "learning_rate": 1.8587323291260428e-05, + "loss": 0.4477, + "step": 17954 + }, + { + "epoch": 2.931064038202522, + "grad_norm": 1.5945335626602173, + "learning_rate": 1.8587160677967493e-05, + "loss": 0.5035, + "step": 17955 + }, + { + "epoch": 2.9312272968450266, + "grad_norm": 2.073347806930542, + "learning_rate": 1.8586998056027287e-05, + "loss": 0.625, + "step": 17956 + }, + { + "epoch": 2.931390555487531, + "grad_norm": 2.1483993530273438, + "learning_rate": 1.858683542543998e-05, + "loss": 0.5606, + "step": 17957 + }, + { + "epoch": 2.9315538141300355, + "grad_norm": 1.8882253170013428, + "learning_rate": 1.8586672786205732e-05, + "loss": 0.5927, + "step": 17958 + }, + { + "epoch": 2.93171707277254, + "grad_norm": 1.782860279083252, + "learning_rate": 1.8586510138324705e-05, + "loss": 0.5629, + "step": 17959 + }, + { + "epoch": 2.9318803314150443, + "grad_norm": 1.6959872245788574, + "learning_rate": 1.8586347481797064e-05, + "loss": 0.4937, + "step": 17960 + }, + { + "epoch": 2.9320435900575488, + "grad_norm": 1.6044689416885376, + "learning_rate": 1.858618481662297e-05, + "loss": 0.4609, + "step": 17961 + }, + { + "epoch": 2.932206848700053, + "grad_norm": 1.8220643997192383, + "learning_rate": 1.8586022142802597e-05, + "loss": 0.5837, + "step": 17962 + }, + { + "epoch": 2.9323701073425577, + "grad_norm": 1.689004898071289, + "learning_rate": 1.85858594603361e-05, + "loss": 0.497, + "step": 17963 + }, + { + "epoch": 2.932533365985062, + "grad_norm": 1.7420686483383179, + "learning_rate": 1.8585696769223643e-05, + "loss": 0.4904, + "step": 17964 + }, + { + "epoch": 2.932696624627566, + "grad_norm": 1.5595111846923828, + "learning_rate": 1.8585534069465392e-05, + "loss": 0.5031, + "step": 17965 + }, + { + "epoch": 2.9328598832700705, + "grad_norm": 1.7041480541229248, + "learning_rate": 1.858537136106151e-05, + "loss": 0.4777, + "step": 17966 + }, + { + "epoch": 2.933023141912575, + "grad_norm": 1.7258635759353638, + "learning_rate": 1.8585208644012165e-05, + "loss": 0.5676, + "step": 17967 + }, + { + "epoch": 2.9331864005550794, + "grad_norm": 1.5887751579284668, + "learning_rate": 1.858504591831751e-05, + "loss": 0.5002, + "step": 17968 + }, + { + "epoch": 2.933349659197584, + "grad_norm": 1.6990437507629395, + "learning_rate": 1.8584883183977724e-05, + "loss": 0.5318, + "step": 17969 + }, + { + "epoch": 2.9335129178400883, + "grad_norm": 1.8764703273773193, + "learning_rate": 1.858472044099296e-05, + "loss": 0.5213, + "step": 17970 + }, + { + "epoch": 2.9336761764825927, + "grad_norm": 1.4518674612045288, + "learning_rate": 1.8584557689363382e-05, + "loss": 0.4361, + "step": 17971 + }, + { + "epoch": 2.9338394351250967, + "grad_norm": 1.9052942991256714, + "learning_rate": 1.8584394929089157e-05, + "loss": 0.4964, + "step": 17972 + }, + { + "epoch": 2.934002693767601, + "grad_norm": 1.9771413803100586, + "learning_rate": 1.8584232160170452e-05, + "loss": 0.6091, + "step": 17973 + }, + { + "epoch": 2.9341659524101056, + "grad_norm": 1.7133971452713013, + "learning_rate": 1.8584069382607427e-05, + "loss": 0.5366, + "step": 17974 + }, + { + "epoch": 2.93432921105261, + "grad_norm": 1.5606110095977783, + "learning_rate": 1.8583906596400245e-05, + "loss": 0.471, + "step": 17975 + }, + { + "epoch": 2.9344924696951145, + "grad_norm": 2.0329689979553223, + "learning_rate": 1.858374380154907e-05, + "loss": 0.5409, + "step": 17976 + }, + { + "epoch": 2.934655728337619, + "grad_norm": 1.800714373588562, + "learning_rate": 1.858358099805407e-05, + "loss": 0.5801, + "step": 17977 + }, + { + "epoch": 2.9348189869801233, + "grad_norm": 2.2615914344787598, + "learning_rate": 1.858341818591541e-05, + "loss": 0.8652, + "step": 17978 + }, + { + "epoch": 2.9349822456226278, + "grad_norm": 1.9496499300003052, + "learning_rate": 1.8583255365133243e-05, + "loss": 0.5797, + "step": 17979 + }, + { + "epoch": 2.935145504265132, + "grad_norm": 1.879807949066162, + "learning_rate": 1.8583092535707742e-05, + "loss": 0.5565, + "step": 17980 + }, + { + "epoch": 2.9353087629076366, + "grad_norm": 1.8824173212051392, + "learning_rate": 1.8582929697639067e-05, + "loss": 0.5611, + "step": 17981 + }, + { + "epoch": 2.935472021550141, + "grad_norm": 2.0085232257843018, + "learning_rate": 1.8582766850927386e-05, + "loss": 0.5498, + "step": 17982 + }, + { + "epoch": 2.935635280192645, + "grad_norm": 1.787595510482788, + "learning_rate": 1.8582603995572862e-05, + "loss": 0.5682, + "step": 17983 + }, + { + "epoch": 2.9357985388351495, + "grad_norm": 1.5872801542282104, + "learning_rate": 1.8582441131575658e-05, + "loss": 0.5608, + "step": 17984 + }, + { + "epoch": 2.935961797477654, + "grad_norm": 1.4629100561141968, + "learning_rate": 1.858227825893594e-05, + "loss": 0.4931, + "step": 17985 + }, + { + "epoch": 2.9361250561201584, + "grad_norm": 1.7257094383239746, + "learning_rate": 1.858211537765387e-05, + "loss": 0.5257, + "step": 17986 + }, + { + "epoch": 2.936288314762663, + "grad_norm": 1.7356125116348267, + "learning_rate": 1.8581952487729606e-05, + "loss": 0.5594, + "step": 17987 + }, + { + "epoch": 2.9364515734051673, + "grad_norm": 1.5784704685211182, + "learning_rate": 1.8581789589163323e-05, + "loss": 0.6005, + "step": 17988 + }, + { + "epoch": 2.9366148320476713, + "grad_norm": 1.7964249849319458, + "learning_rate": 1.8581626681955177e-05, + "loss": 0.599, + "step": 17989 + }, + { + "epoch": 2.9367780906901757, + "grad_norm": 1.3930518627166748, + "learning_rate": 1.858146376610534e-05, + "loss": 0.4995, + "step": 17990 + }, + { + "epoch": 2.93694134933268, + "grad_norm": 2.039621591567993, + "learning_rate": 1.8581300841613967e-05, + "loss": 0.664, + "step": 17991 + }, + { + "epoch": 2.9371046079751846, + "grad_norm": 1.4649262428283691, + "learning_rate": 1.858113790848123e-05, + "loss": 0.4687, + "step": 17992 + }, + { + "epoch": 2.937267866617689, + "grad_norm": 1.7644776105880737, + "learning_rate": 1.8580974966707287e-05, + "loss": 0.55, + "step": 17993 + }, + { + "epoch": 2.9374311252601935, + "grad_norm": 1.818028211593628, + "learning_rate": 1.8580812016292306e-05, + "loss": 0.5402, + "step": 17994 + }, + { + "epoch": 2.937594383902698, + "grad_norm": 1.9628305435180664, + "learning_rate": 1.858064905723645e-05, + "loss": 0.6092, + "step": 17995 + }, + { + "epoch": 2.9377576425452023, + "grad_norm": 1.9067360162734985, + "learning_rate": 1.858048608953988e-05, + "loss": 0.6544, + "step": 17996 + }, + { + "epoch": 2.9379209011877068, + "grad_norm": 1.7021703720092773, + "learning_rate": 1.858032311320276e-05, + "loss": 0.5404, + "step": 17997 + }, + { + "epoch": 2.938084159830211, + "grad_norm": 2.026393413543701, + "learning_rate": 1.8580160128225263e-05, + "loss": 0.5885, + "step": 17998 + }, + { + "epoch": 2.9382474184727156, + "grad_norm": 1.8413658142089844, + "learning_rate": 1.8579997134607545e-05, + "loss": 0.6775, + "step": 17999 + }, + { + "epoch": 2.9384106771152196, + "grad_norm": 1.7751041650772095, + "learning_rate": 1.8579834132349773e-05, + "loss": 0.5849, + "step": 18000 + }, + { + "epoch": 2.938573935757724, + "grad_norm": 1.3367198705673218, + "learning_rate": 1.857967112145211e-05, + "loss": 0.4288, + "step": 18001 + }, + { + "epoch": 2.9387371944002285, + "grad_norm": 1.8593498468399048, + "learning_rate": 1.8579508101914715e-05, + "loss": 0.5744, + "step": 18002 + }, + { + "epoch": 2.938900453042733, + "grad_norm": 1.7493009567260742, + "learning_rate": 1.8579345073737765e-05, + "loss": 0.5311, + "step": 18003 + }, + { + "epoch": 2.9390637116852374, + "grad_norm": 1.6209200620651245, + "learning_rate": 1.8579182036921415e-05, + "loss": 0.5637, + "step": 18004 + }, + { + "epoch": 2.939226970327742, + "grad_norm": 1.7260329723358154, + "learning_rate": 1.8579018991465833e-05, + "loss": 0.553, + "step": 18005 + }, + { + "epoch": 2.9393902289702463, + "grad_norm": 2.1213278770446777, + "learning_rate": 1.8578855937371176e-05, + "loss": 0.5917, + "step": 18006 + }, + { + "epoch": 2.9395534876127503, + "grad_norm": 1.9250500202178955, + "learning_rate": 1.8578692874637612e-05, + "loss": 0.5612, + "step": 18007 + }, + { + "epoch": 2.9397167462552547, + "grad_norm": 1.6840766668319702, + "learning_rate": 1.8578529803265313e-05, + "loss": 0.608, + "step": 18008 + }, + { + "epoch": 2.939880004897759, + "grad_norm": 1.6709774732589722, + "learning_rate": 1.8578366723254432e-05, + "loss": 0.5747, + "step": 18009 + }, + { + "epoch": 2.9400432635402636, + "grad_norm": 1.8643375635147095, + "learning_rate": 1.857820363460514e-05, + "loss": 0.6407, + "step": 18010 + }, + { + "epoch": 2.940206522182768, + "grad_norm": 1.6746883392333984, + "learning_rate": 1.85780405373176e-05, + "loss": 0.5849, + "step": 18011 + }, + { + "epoch": 2.9403697808252724, + "grad_norm": 2.149601697921753, + "learning_rate": 1.8577877431391977e-05, + "loss": 0.6617, + "step": 18012 + }, + { + "epoch": 2.940533039467777, + "grad_norm": 1.569210410118103, + "learning_rate": 1.857771431682843e-05, + "loss": 0.5498, + "step": 18013 + }, + { + "epoch": 2.9406962981102813, + "grad_norm": 1.8874183893203735, + "learning_rate": 1.8577551193627126e-05, + "loss": 0.6005, + "step": 18014 + }, + { + "epoch": 2.9408595567527858, + "grad_norm": 1.7618495225906372, + "learning_rate": 1.8577388061788234e-05, + "loss": 0.5565, + "step": 18015 + }, + { + "epoch": 2.94102281539529, + "grad_norm": 2.0395314693450928, + "learning_rate": 1.8577224921311914e-05, + "loss": 0.5768, + "step": 18016 + }, + { + "epoch": 2.9411860740377946, + "grad_norm": 1.2721446752548218, + "learning_rate": 1.857706177219833e-05, + "loss": 0.4147, + "step": 18017 + }, + { + "epoch": 2.9413493326802986, + "grad_norm": 1.7796175479888916, + "learning_rate": 1.8576898614447647e-05, + "loss": 0.6386, + "step": 18018 + }, + { + "epoch": 2.941512591322803, + "grad_norm": 1.6364092826843262, + "learning_rate": 1.857673544806003e-05, + "loss": 0.5065, + "step": 18019 + }, + { + "epoch": 2.9416758499653075, + "grad_norm": 1.7773348093032837, + "learning_rate": 1.857657227303564e-05, + "loss": 0.539, + "step": 18020 + }, + { + "epoch": 2.941839108607812, + "grad_norm": 1.2756208181381226, + "learning_rate": 1.8576409089374646e-05, + "loss": 0.4374, + "step": 18021 + }, + { + "epoch": 2.9420023672503164, + "grad_norm": 1.7695531845092773, + "learning_rate": 1.857624589707721e-05, + "loss": 0.5744, + "step": 18022 + }, + { + "epoch": 2.942165625892821, + "grad_norm": 1.6499700546264648, + "learning_rate": 1.8576082696143498e-05, + "loss": 0.5485, + "step": 18023 + }, + { + "epoch": 2.942328884535325, + "grad_norm": 1.8487637042999268, + "learning_rate": 1.8575919486573673e-05, + "loss": 0.6077, + "step": 18024 + }, + { + "epoch": 2.9424921431778293, + "grad_norm": 1.4366466999053955, + "learning_rate": 1.8575756268367897e-05, + "loss": 0.4643, + "step": 18025 + }, + { + "epoch": 2.9426554018203337, + "grad_norm": 1.793678879737854, + "learning_rate": 1.8575593041526335e-05, + "loss": 0.5875, + "step": 18026 + }, + { + "epoch": 2.942818660462838, + "grad_norm": 1.9411144256591797, + "learning_rate": 1.8575429806049158e-05, + "loss": 0.7382, + "step": 18027 + }, + { + "epoch": 2.9429819191053426, + "grad_norm": 1.7894147634506226, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.5042, + "step": 18028 + }, + { + "epoch": 2.943145177747847, + "grad_norm": 2.075589418411255, + "learning_rate": 1.85751033091886e-05, + "loss": 0.553, + "step": 18029 + }, + { + "epoch": 2.9433084363903514, + "grad_norm": 1.6673574447631836, + "learning_rate": 1.8574940047805547e-05, + "loss": 0.5705, + "step": 18030 + }, + { + "epoch": 2.943471695032856, + "grad_norm": 1.6359843015670776, + "learning_rate": 1.857477677778753e-05, + "loss": 0.5245, + "step": 18031 + }, + { + "epoch": 2.9436349536753603, + "grad_norm": 1.4172242879867554, + "learning_rate": 1.857461349913472e-05, + "loss": 0.4026, + "step": 18032 + }, + { + "epoch": 2.9437982123178648, + "grad_norm": 1.8109841346740723, + "learning_rate": 1.8574450211847273e-05, + "loss": 0.5421, + "step": 18033 + }, + { + "epoch": 2.943961470960369, + "grad_norm": 1.6301275491714478, + "learning_rate": 1.8574286915925357e-05, + "loss": 0.536, + "step": 18034 + }, + { + "epoch": 2.944124729602873, + "grad_norm": 1.4539228677749634, + "learning_rate": 1.857412361136914e-05, + "loss": 0.5119, + "step": 18035 + }, + { + "epoch": 2.9442879882453776, + "grad_norm": 1.9372824430465698, + "learning_rate": 1.857396029817878e-05, + "loss": 0.66, + "step": 18036 + }, + { + "epoch": 2.944451246887882, + "grad_norm": 1.9071247577667236, + "learning_rate": 1.8573796976354442e-05, + "loss": 0.6924, + "step": 18037 + }, + { + "epoch": 2.9446145055303865, + "grad_norm": 1.7879050970077515, + "learning_rate": 1.8573633645896295e-05, + "loss": 0.5753, + "step": 18038 + }, + { + "epoch": 2.944777764172891, + "grad_norm": 1.6888853311538696, + "learning_rate": 1.85734703068045e-05, + "loss": 0.5408, + "step": 18039 + }, + { + "epoch": 2.9449410228153954, + "grad_norm": 1.857982873916626, + "learning_rate": 1.857330695907922e-05, + "loss": 0.646, + "step": 18040 + }, + { + "epoch": 2.9451042814579, + "grad_norm": 1.526651382446289, + "learning_rate": 1.857314360272063e-05, + "loss": 0.462, + "step": 18041 + }, + { + "epoch": 2.945267540100404, + "grad_norm": 1.4464304447174072, + "learning_rate": 1.8572980237728882e-05, + "loss": 0.517, + "step": 18042 + }, + { + "epoch": 2.9454307987429083, + "grad_norm": 1.7867621183395386, + "learning_rate": 1.8572816864104144e-05, + "loss": 0.5866, + "step": 18043 + }, + { + "epoch": 2.9455940573854127, + "grad_norm": 1.8136091232299805, + "learning_rate": 1.8572653481846585e-05, + "loss": 0.5311, + "step": 18044 + }, + { + "epoch": 2.945757316027917, + "grad_norm": 1.7979810237884521, + "learning_rate": 1.8572490090956364e-05, + "loss": 0.6313, + "step": 18045 + }, + { + "epoch": 2.9459205746704216, + "grad_norm": 1.9422125816345215, + "learning_rate": 1.8572326691433648e-05, + "loss": 0.6601, + "step": 18046 + }, + { + "epoch": 2.946083833312926, + "grad_norm": 1.77805495262146, + "learning_rate": 1.8572163283278602e-05, + "loss": 0.5668, + "step": 18047 + }, + { + "epoch": 2.9462470919554304, + "grad_norm": 1.6690195798873901, + "learning_rate": 1.857199986649139e-05, + "loss": 0.6081, + "step": 18048 + }, + { + "epoch": 2.946410350597935, + "grad_norm": 1.8916101455688477, + "learning_rate": 1.8571836441072176e-05, + "loss": 0.6188, + "step": 18049 + }, + { + "epoch": 2.9465736092404393, + "grad_norm": 1.9806627035140991, + "learning_rate": 1.8571673007021124e-05, + "loss": 0.5789, + "step": 18050 + }, + { + "epoch": 2.9467368678829438, + "grad_norm": 2.029191017150879, + "learning_rate": 1.8571509564338403e-05, + "loss": 0.7106, + "step": 18051 + }, + { + "epoch": 2.946900126525448, + "grad_norm": 1.5958247184753418, + "learning_rate": 1.857134611302417e-05, + "loss": 0.6575, + "step": 18052 + }, + { + "epoch": 2.947063385167952, + "grad_norm": 1.5890089273452759, + "learning_rate": 1.8571182653078598e-05, + "loss": 0.5529, + "step": 18053 + }, + { + "epoch": 2.9472266438104566, + "grad_norm": 1.7500520944595337, + "learning_rate": 1.8571019184501842e-05, + "loss": 0.566, + "step": 18054 + }, + { + "epoch": 2.947389902452961, + "grad_norm": 1.9082276821136475, + "learning_rate": 1.8570855707294075e-05, + "loss": 0.629, + "step": 18055 + }, + { + "epoch": 2.9475531610954655, + "grad_norm": 1.6358975172042847, + "learning_rate": 1.8570692221455456e-05, + "loss": 0.5543, + "step": 18056 + }, + { + "epoch": 2.94771641973797, + "grad_norm": 1.7338250875473022, + "learning_rate": 1.8570528726986157e-05, + "loss": 0.5883, + "step": 18057 + }, + { + "epoch": 2.9478796783804744, + "grad_norm": 1.7178035974502563, + "learning_rate": 1.8570365223886334e-05, + "loss": 0.6212, + "step": 18058 + }, + { + "epoch": 2.948042937022979, + "grad_norm": 1.795100212097168, + "learning_rate": 1.8570201712156154e-05, + "loss": 0.6119, + "step": 18059 + }, + { + "epoch": 2.948206195665483, + "grad_norm": 2.043907642364502, + "learning_rate": 1.857003819179579e-05, + "loss": 0.6823, + "step": 18060 + }, + { + "epoch": 2.9483694543079872, + "grad_norm": 1.9183822870254517, + "learning_rate": 1.8569874662805394e-05, + "loss": 0.5741, + "step": 18061 + }, + { + "epoch": 2.9485327129504917, + "grad_norm": 2.0951156616210938, + "learning_rate": 1.856971112518514e-05, + "loss": 0.7513, + "step": 18062 + }, + { + "epoch": 2.948695971592996, + "grad_norm": 1.830708384513855, + "learning_rate": 1.856954757893519e-05, + "loss": 0.6137, + "step": 18063 + }, + { + "epoch": 2.9488592302355006, + "grad_norm": 1.9934183359146118, + "learning_rate": 1.85693840240557e-05, + "loss": 0.5954, + "step": 18064 + }, + { + "epoch": 2.949022488878005, + "grad_norm": 1.6199222803115845, + "learning_rate": 1.856922046054685e-05, + "loss": 0.5232, + "step": 18065 + }, + { + "epoch": 2.9491857475205094, + "grad_norm": 1.6457316875457764, + "learning_rate": 1.8569056888408793e-05, + "loss": 0.5333, + "step": 18066 + }, + { + "epoch": 2.949349006163014, + "grad_norm": 1.4063727855682373, + "learning_rate": 1.85688933076417e-05, + "loss": 0.455, + "step": 18067 + }, + { + "epoch": 2.9495122648055183, + "grad_norm": 1.476234793663025, + "learning_rate": 1.8568729718245735e-05, + "loss": 0.5153, + "step": 18068 + }, + { + "epoch": 2.9496755234480228, + "grad_norm": 1.5507657527923584, + "learning_rate": 1.856856612022106e-05, + "loss": 0.5222, + "step": 18069 + }, + { + "epoch": 2.949838782090527, + "grad_norm": 1.4827975034713745, + "learning_rate": 1.856840251356784e-05, + "loss": 0.5185, + "step": 18070 + }, + { + "epoch": 2.950002040733031, + "grad_norm": 1.5673984289169312, + "learning_rate": 1.856823889828624e-05, + "loss": 0.5566, + "step": 18071 + }, + { + "epoch": 2.9501652993755356, + "grad_norm": 1.610834002494812, + "learning_rate": 1.856807527437643e-05, + "loss": 0.5722, + "step": 18072 + }, + { + "epoch": 2.95032855801804, + "grad_norm": 1.9638224840164185, + "learning_rate": 1.856791164183857e-05, + "loss": 0.6481, + "step": 18073 + }, + { + "epoch": 2.9504918166605445, + "grad_norm": 1.5823862552642822, + "learning_rate": 1.8567748000672822e-05, + "loss": 0.5291, + "step": 18074 + }, + { + "epoch": 2.950655075303049, + "grad_norm": 1.647438883781433, + "learning_rate": 1.8567584350879358e-05, + "loss": 0.5731, + "step": 18075 + }, + { + "epoch": 2.9508183339455534, + "grad_norm": 1.8748046159744263, + "learning_rate": 1.8567420692458334e-05, + "loss": 0.5919, + "step": 18076 + }, + { + "epoch": 2.9509815925880574, + "grad_norm": 1.7580033540725708, + "learning_rate": 1.8567257025409925e-05, + "loss": 0.5168, + "step": 18077 + }, + { + "epoch": 2.951144851230562, + "grad_norm": 1.644645094871521, + "learning_rate": 1.8567093349734288e-05, + "loss": 0.5712, + "step": 18078 + }, + { + "epoch": 2.9513081098730662, + "grad_norm": 1.9562546014785767, + "learning_rate": 1.8566929665431587e-05, + "loss": 0.6328, + "step": 18079 + }, + { + "epoch": 2.9514713685155707, + "grad_norm": 1.8055895566940308, + "learning_rate": 1.8566765972501994e-05, + "loss": 0.5823, + "step": 18080 + }, + { + "epoch": 2.951634627158075, + "grad_norm": 1.870413899421692, + "learning_rate": 1.8566602270945672e-05, + "loss": 0.4999, + "step": 18081 + }, + { + "epoch": 2.9517978858005796, + "grad_norm": 1.6711071729660034, + "learning_rate": 1.8566438560762777e-05, + "loss": 0.5351, + "step": 18082 + }, + { + "epoch": 2.951961144443084, + "grad_norm": 1.6578034162521362, + "learning_rate": 1.8566274841953485e-05, + "loss": 0.5221, + "step": 18083 + }, + { + "epoch": 2.9521244030855884, + "grad_norm": 1.6413918733596802, + "learning_rate": 1.856611111451796e-05, + "loss": 0.595, + "step": 18084 + }, + { + "epoch": 2.952287661728093, + "grad_norm": 1.5837206840515137, + "learning_rate": 1.8565947378456357e-05, + "loss": 0.4413, + "step": 18085 + }, + { + "epoch": 2.9524509203705973, + "grad_norm": 1.4865412712097168, + "learning_rate": 1.8565783633768846e-05, + "loss": 0.4726, + "step": 18086 + }, + { + "epoch": 2.9526141790131017, + "grad_norm": 1.542541742324829, + "learning_rate": 1.85656198804556e-05, + "loss": 0.4721, + "step": 18087 + }, + { + "epoch": 2.9527774376556057, + "grad_norm": 2.103179931640625, + "learning_rate": 1.8565456118516772e-05, + "loss": 0.6867, + "step": 18088 + }, + { + "epoch": 2.95294069629811, + "grad_norm": 2.0652265548706055, + "learning_rate": 1.8565292347952534e-05, + "loss": 0.7377, + "step": 18089 + }, + { + "epoch": 2.9531039549406146, + "grad_norm": 1.577345371246338, + "learning_rate": 1.856512856876305e-05, + "loss": 0.5152, + "step": 18090 + }, + { + "epoch": 2.953267213583119, + "grad_norm": 1.952079176902771, + "learning_rate": 1.856496478094848e-05, + "loss": 0.6658, + "step": 18091 + }, + { + "epoch": 2.9534304722256235, + "grad_norm": 1.9994937181472778, + "learning_rate": 1.8564800984508996e-05, + "loss": 0.6605, + "step": 18092 + }, + { + "epoch": 2.953593730868128, + "grad_norm": 1.8440040349960327, + "learning_rate": 1.856463717944476e-05, + "loss": 0.7243, + "step": 18093 + }, + { + "epoch": 2.9537569895106324, + "grad_norm": 1.5432770252227783, + "learning_rate": 1.8564473365755936e-05, + "loss": 0.4776, + "step": 18094 + }, + { + "epoch": 2.9539202481531364, + "grad_norm": 1.98477303981781, + "learning_rate": 1.8564309543442687e-05, + "loss": 0.6147, + "step": 18095 + }, + { + "epoch": 2.954083506795641, + "grad_norm": 1.8681087493896484, + "learning_rate": 1.8564145712505183e-05, + "loss": 0.5399, + "step": 18096 + }, + { + "epoch": 2.9542467654381452, + "grad_norm": 2.1236751079559326, + "learning_rate": 1.856398187294359e-05, + "loss": 0.5822, + "step": 18097 + }, + { + "epoch": 2.9544100240806497, + "grad_norm": 1.9147045612335205, + "learning_rate": 1.856381802475806e-05, + "loss": 0.6008, + "step": 18098 + }, + { + "epoch": 2.954573282723154, + "grad_norm": 1.976349949836731, + "learning_rate": 1.8563654167948775e-05, + "loss": 0.629, + "step": 18099 + }, + { + "epoch": 2.9547365413656586, + "grad_norm": 1.6939783096313477, + "learning_rate": 1.856349030251589e-05, + "loss": 0.5764, + "step": 18100 + }, + { + "epoch": 2.954899800008163, + "grad_norm": 1.8261750936508179, + "learning_rate": 1.8563326428459575e-05, + "loss": 0.6102, + "step": 18101 + }, + { + "epoch": 2.9550630586506674, + "grad_norm": 1.7089571952819824, + "learning_rate": 1.856316254577999e-05, + "loss": 0.4901, + "step": 18102 + }, + { + "epoch": 2.955226317293172, + "grad_norm": 2.123260974884033, + "learning_rate": 1.8562998654477306e-05, + "loss": 0.6927, + "step": 18103 + }, + { + "epoch": 2.9553895759356763, + "grad_norm": 2.152907609939575, + "learning_rate": 1.856283475455168e-05, + "loss": 0.5964, + "step": 18104 + }, + { + "epoch": 2.9555528345781807, + "grad_norm": 1.6917438507080078, + "learning_rate": 1.8562670846003283e-05, + "loss": 0.5632, + "step": 18105 + }, + { + "epoch": 2.9557160932206847, + "grad_norm": 1.9249720573425293, + "learning_rate": 1.856250692883228e-05, + "loss": 0.5946, + "step": 18106 + }, + { + "epoch": 2.955879351863189, + "grad_norm": 1.9186310768127441, + "learning_rate": 1.8562343003038833e-05, + "loss": 0.7153, + "step": 18107 + }, + { + "epoch": 2.9560426105056936, + "grad_norm": 1.4700809717178345, + "learning_rate": 1.8562179068623112e-05, + "loss": 0.5064, + "step": 18108 + }, + { + "epoch": 2.956205869148198, + "grad_norm": 2.276103973388672, + "learning_rate": 1.8562015125585276e-05, + "loss": 0.7563, + "step": 18109 + }, + { + "epoch": 2.9563691277907025, + "grad_norm": 1.390039324760437, + "learning_rate": 1.8561851173925495e-05, + "loss": 0.5032, + "step": 18110 + }, + { + "epoch": 2.956532386433207, + "grad_norm": 1.741662859916687, + "learning_rate": 1.8561687213643932e-05, + "loss": 0.5679, + "step": 18111 + }, + { + "epoch": 2.956695645075711, + "grad_norm": 1.7234525680541992, + "learning_rate": 1.8561523244740752e-05, + "loss": 0.6022, + "step": 18112 + }, + { + "epoch": 2.9568589037182154, + "grad_norm": 1.9708195924758911, + "learning_rate": 1.8561359267216116e-05, + "loss": 0.5327, + "step": 18113 + }, + { + "epoch": 2.95702216236072, + "grad_norm": 1.6738073825836182, + "learning_rate": 1.8561195281070198e-05, + "loss": 0.5085, + "step": 18114 + }, + { + "epoch": 2.9571854210032242, + "grad_norm": 1.5160729885101318, + "learning_rate": 1.856103128630316e-05, + "loss": 0.6, + "step": 18115 + }, + { + "epoch": 2.9573486796457287, + "grad_norm": 1.6154049634933472, + "learning_rate": 1.8560867282915164e-05, + "loss": 0.5292, + "step": 18116 + }, + { + "epoch": 2.957511938288233, + "grad_norm": 1.945219874382019, + "learning_rate": 1.8560703270906376e-05, + "loss": 0.6413, + "step": 18117 + }, + { + "epoch": 2.9576751969307375, + "grad_norm": 1.925022006034851, + "learning_rate": 1.856053925027696e-05, + "loss": 0.6491, + "step": 18118 + }, + { + "epoch": 2.957838455573242, + "grad_norm": 1.9037195444107056, + "learning_rate": 1.856037522102709e-05, + "loss": 0.5963, + "step": 18119 + }, + { + "epoch": 2.9580017142157464, + "grad_norm": 1.7455673217773438, + "learning_rate": 1.8560211183156918e-05, + "loss": 0.5511, + "step": 18120 + }, + { + "epoch": 2.958164972858251, + "grad_norm": 1.7736386060714722, + "learning_rate": 1.856004713666662e-05, + "loss": 0.5153, + "step": 18121 + }, + { + "epoch": 2.9583282315007553, + "grad_norm": 1.6166919469833374, + "learning_rate": 1.8559883081556352e-05, + "loss": 0.4859, + "step": 18122 + }, + { + "epoch": 2.9584914901432593, + "grad_norm": 1.6300199031829834, + "learning_rate": 1.855971901782629e-05, + "loss": 0.5079, + "step": 18123 + }, + { + "epoch": 2.9586547487857637, + "grad_norm": 2.0903406143188477, + "learning_rate": 1.855955494547659e-05, + "loss": 0.6673, + "step": 18124 + }, + { + "epoch": 2.958818007428268, + "grad_norm": 1.5611151456832886, + "learning_rate": 1.855939086450742e-05, + "loss": 0.6084, + "step": 18125 + }, + { + "epoch": 2.9589812660707726, + "grad_norm": 1.7498500347137451, + "learning_rate": 1.8559226774918945e-05, + "loss": 0.6196, + "step": 18126 + }, + { + "epoch": 2.959144524713277, + "grad_norm": 1.8391656875610352, + "learning_rate": 1.855906267671133e-05, + "loss": 0.6688, + "step": 18127 + }, + { + "epoch": 2.9593077833557815, + "grad_norm": 2.016286611557007, + "learning_rate": 1.8558898569884743e-05, + "loss": 0.6459, + "step": 18128 + }, + { + "epoch": 2.959471041998286, + "grad_norm": 1.7072981595993042, + "learning_rate": 1.8558734454439348e-05, + "loss": 0.5873, + "step": 18129 + }, + { + "epoch": 2.95963430064079, + "grad_norm": 1.8694050312042236, + "learning_rate": 1.855857033037531e-05, + "loss": 0.6056, + "step": 18130 + }, + { + "epoch": 2.9597975592832944, + "grad_norm": 1.919055461883545, + "learning_rate": 1.8558406197692792e-05, + "loss": 0.539, + "step": 18131 + }, + { + "epoch": 2.959960817925799, + "grad_norm": 1.7384508848190308, + "learning_rate": 1.8558242056391963e-05, + "loss": 0.5278, + "step": 18132 + }, + { + "epoch": 2.9601240765683032, + "grad_norm": 2.3119008541107178, + "learning_rate": 1.8558077906472988e-05, + "loss": 0.629, + "step": 18133 + }, + { + "epoch": 2.9602873352108077, + "grad_norm": 1.6334726810455322, + "learning_rate": 1.8557913747936028e-05, + "loss": 0.518, + "step": 18134 + }, + { + "epoch": 2.960450593853312, + "grad_norm": 1.7381631135940552, + "learning_rate": 1.8557749580781253e-05, + "loss": 0.4551, + "step": 18135 + }, + { + "epoch": 2.9606138524958165, + "grad_norm": 1.728139042854309, + "learning_rate": 1.8557585405008823e-05, + "loss": 0.55, + "step": 18136 + }, + { + "epoch": 2.960777111138321, + "grad_norm": 1.6108583211898804, + "learning_rate": 1.8557421220618913e-05, + "loss": 0.5034, + "step": 18137 + }, + { + "epoch": 2.9609403697808254, + "grad_norm": 1.554089903831482, + "learning_rate": 1.8557257027611677e-05, + "loss": 0.5475, + "step": 18138 + }, + { + "epoch": 2.96110362842333, + "grad_norm": 1.6109753847122192, + "learning_rate": 1.8557092825987286e-05, + "loss": 0.5055, + "step": 18139 + }, + { + "epoch": 2.9612668870658343, + "grad_norm": 1.6414663791656494, + "learning_rate": 1.8556928615745903e-05, + "loss": 0.5128, + "step": 18140 + }, + { + "epoch": 2.9614301457083383, + "grad_norm": 1.6871025562286377, + "learning_rate": 1.85567643968877e-05, + "loss": 0.5843, + "step": 18141 + }, + { + "epoch": 2.9615934043508427, + "grad_norm": 1.7328838109970093, + "learning_rate": 1.8556600169412835e-05, + "loss": 0.5508, + "step": 18142 + }, + { + "epoch": 2.961756662993347, + "grad_norm": 1.868011236190796, + "learning_rate": 1.8556435933321478e-05, + "loss": 0.6581, + "step": 18143 + }, + { + "epoch": 2.9619199216358516, + "grad_norm": 1.5218510627746582, + "learning_rate": 1.855627168861379e-05, + "loss": 0.4955, + "step": 18144 + }, + { + "epoch": 2.962083180278356, + "grad_norm": 1.6428921222686768, + "learning_rate": 1.8556107435289936e-05, + "loss": 0.5685, + "step": 18145 + }, + { + "epoch": 2.9622464389208605, + "grad_norm": 1.5416829586029053, + "learning_rate": 1.8555943173350087e-05, + "loss": 0.4708, + "step": 18146 + }, + { + "epoch": 2.962409697563365, + "grad_norm": 1.6917961835861206, + "learning_rate": 1.855577890279441e-05, + "loss": 0.5702, + "step": 18147 + }, + { + "epoch": 2.962572956205869, + "grad_norm": 1.7757213115692139, + "learning_rate": 1.8555614623623058e-05, + "loss": 0.5688, + "step": 18148 + }, + { + "epoch": 2.9627362148483734, + "grad_norm": 1.778321385383606, + "learning_rate": 1.8555450335836206e-05, + "loss": 0.5371, + "step": 18149 + }, + { + "epoch": 2.962899473490878, + "grad_norm": 1.8449740409851074, + "learning_rate": 1.8555286039434022e-05, + "loss": 0.5893, + "step": 18150 + }, + { + "epoch": 2.9630627321333822, + "grad_norm": 1.6250321865081787, + "learning_rate": 1.8555121734416663e-05, + "loss": 0.5416, + "step": 18151 + }, + { + "epoch": 2.9632259907758867, + "grad_norm": 1.7698135375976562, + "learning_rate": 1.8554957420784305e-05, + "loss": 0.6383, + "step": 18152 + }, + { + "epoch": 2.963389249418391, + "grad_norm": 1.7123297452926636, + "learning_rate": 1.85547930985371e-05, + "loss": 0.6039, + "step": 18153 + }, + { + "epoch": 2.9635525080608955, + "grad_norm": 1.734176754951477, + "learning_rate": 1.8554628767675223e-05, + "loss": 0.5882, + "step": 18154 + }, + { + "epoch": 2.9637157667034, + "grad_norm": 1.7789782285690308, + "learning_rate": 1.8554464428198836e-05, + "loss": 0.5864, + "step": 18155 + }, + { + "epoch": 2.9638790253459044, + "grad_norm": 1.7084827423095703, + "learning_rate": 1.855430008010811e-05, + "loss": 0.5902, + "step": 18156 + }, + { + "epoch": 2.964042283988409, + "grad_norm": 1.6308003664016724, + "learning_rate": 1.85541357234032e-05, + "loss": 0.4939, + "step": 18157 + }, + { + "epoch": 2.9642055426309133, + "grad_norm": 1.908738136291504, + "learning_rate": 1.8553971358084283e-05, + "loss": 0.6074, + "step": 18158 + }, + { + "epoch": 2.9643688012734173, + "grad_norm": 1.7047337293624878, + "learning_rate": 1.8553806984151513e-05, + "loss": 0.5408, + "step": 18159 + }, + { + "epoch": 2.9645320599159217, + "grad_norm": 1.6893696784973145, + "learning_rate": 1.855364260160507e-05, + "loss": 0.5852, + "step": 18160 + }, + { + "epoch": 2.964695318558426, + "grad_norm": 1.947135329246521, + "learning_rate": 1.8553478210445103e-05, + "loss": 0.6195, + "step": 18161 + }, + { + "epoch": 2.9648585772009306, + "grad_norm": 1.7147351503372192, + "learning_rate": 1.855331381067179e-05, + "loss": 0.5232, + "step": 18162 + }, + { + "epoch": 2.965021835843435, + "grad_norm": 1.4189355373382568, + "learning_rate": 1.8553149402285292e-05, + "loss": 0.409, + "step": 18163 + }, + { + "epoch": 2.9651850944859395, + "grad_norm": 1.6084070205688477, + "learning_rate": 1.8552984985285776e-05, + "loss": 0.5148, + "step": 18164 + }, + { + "epoch": 2.9653483531284435, + "grad_norm": 1.6608827114105225, + "learning_rate": 1.8552820559673402e-05, + "loss": 0.5415, + "step": 18165 + }, + { + "epoch": 2.965511611770948, + "grad_norm": 1.468374490737915, + "learning_rate": 1.8552656125448343e-05, + "loss": 0.45, + "step": 18166 + }, + { + "epoch": 2.9656748704134523, + "grad_norm": 1.904512882232666, + "learning_rate": 1.855249168261076e-05, + "loss": 0.5902, + "step": 18167 + }, + { + "epoch": 2.965838129055957, + "grad_norm": 1.8768353462219238, + "learning_rate": 1.8552327231160823e-05, + "loss": 0.5993, + "step": 18168 + }, + { + "epoch": 2.9660013876984612, + "grad_norm": 2.0201563835144043, + "learning_rate": 1.8552162771098694e-05, + "loss": 0.6215, + "step": 18169 + }, + { + "epoch": 2.9661646463409657, + "grad_norm": 1.4923202991485596, + "learning_rate": 1.8551998302424538e-05, + "loss": 0.475, + "step": 18170 + }, + { + "epoch": 2.96632790498347, + "grad_norm": 1.660489559173584, + "learning_rate": 1.8551833825138522e-05, + "loss": 0.5733, + "step": 18171 + }, + { + "epoch": 2.9664911636259745, + "grad_norm": 1.5600953102111816, + "learning_rate": 1.8551669339240814e-05, + "loss": 0.5897, + "step": 18172 + }, + { + "epoch": 2.966654422268479, + "grad_norm": 2.0842835903167725, + "learning_rate": 1.8551504844731573e-05, + "loss": 0.5818, + "step": 18173 + }, + { + "epoch": 2.9668176809109834, + "grad_norm": 1.7132611274719238, + "learning_rate": 1.8551340341610972e-05, + "loss": 0.5943, + "step": 18174 + }, + { + "epoch": 2.966980939553488, + "grad_norm": 1.6732656955718994, + "learning_rate": 1.8551175829879173e-05, + "loss": 0.5182, + "step": 18175 + }, + { + "epoch": 2.967144198195992, + "grad_norm": 2.1856913566589355, + "learning_rate": 1.855101130953634e-05, + "loss": 0.6668, + "step": 18176 + }, + { + "epoch": 2.9673074568384963, + "grad_norm": 1.6018717288970947, + "learning_rate": 1.8550846780582645e-05, + "loss": 0.4983, + "step": 18177 + }, + { + "epoch": 2.9674707154810007, + "grad_norm": 1.5558254718780518, + "learning_rate": 1.8550682243018248e-05, + "loss": 0.4592, + "step": 18178 + }, + { + "epoch": 2.967633974123505, + "grad_norm": 1.8557801246643066, + "learning_rate": 1.8550517696843314e-05, + "loss": 0.509, + "step": 18179 + }, + { + "epoch": 2.9677972327660096, + "grad_norm": 1.6679770946502686, + "learning_rate": 1.8550353142058016e-05, + "loss": 0.4933, + "step": 18180 + }, + { + "epoch": 2.967960491408514, + "grad_norm": 1.8816874027252197, + "learning_rate": 1.855018857866251e-05, + "loss": 0.6487, + "step": 18181 + }, + { + "epoch": 2.9681237500510185, + "grad_norm": 1.55352783203125, + "learning_rate": 1.8550024006656967e-05, + "loss": 0.4996, + "step": 18182 + }, + { + "epoch": 2.9682870086935225, + "grad_norm": 1.5650925636291504, + "learning_rate": 1.8549859426041555e-05, + "loss": 0.4752, + "step": 18183 + }, + { + "epoch": 2.968450267336027, + "grad_norm": 1.7042255401611328, + "learning_rate": 1.8549694836816432e-05, + "loss": 0.5518, + "step": 18184 + }, + { + "epoch": 2.9686135259785313, + "grad_norm": 1.9354888200759888, + "learning_rate": 1.854953023898177e-05, + "loss": 0.5262, + "step": 18185 + }, + { + "epoch": 2.968776784621036, + "grad_norm": 1.850725531578064, + "learning_rate": 1.8549365632537733e-05, + "loss": 0.5856, + "step": 18186 + }, + { + "epoch": 2.96894004326354, + "grad_norm": 1.823691725730896, + "learning_rate": 1.8549201017484493e-05, + "loss": 0.5268, + "step": 18187 + }, + { + "epoch": 2.9691033019060447, + "grad_norm": 2.0509369373321533, + "learning_rate": 1.8549036393822206e-05, + "loss": 0.6028, + "step": 18188 + }, + { + "epoch": 2.969266560548549, + "grad_norm": 1.6641638278961182, + "learning_rate": 1.8548871761551038e-05, + "loss": 0.5109, + "step": 18189 + }, + { + "epoch": 2.9694298191910535, + "grad_norm": 1.8646317720413208, + "learning_rate": 1.854870712067116e-05, + "loss": 0.632, + "step": 18190 + }, + { + "epoch": 2.969593077833558, + "grad_norm": 1.6161279678344727, + "learning_rate": 1.854854247118274e-05, + "loss": 0.5242, + "step": 18191 + }, + { + "epoch": 2.9697563364760624, + "grad_norm": 1.4998263120651245, + "learning_rate": 1.8548377813085937e-05, + "loss": 0.5047, + "step": 18192 + }, + { + "epoch": 2.969919595118567, + "grad_norm": 1.7434195280075073, + "learning_rate": 1.854821314638092e-05, + "loss": 0.5342, + "step": 18193 + }, + { + "epoch": 2.970082853761071, + "grad_norm": 1.4435157775878906, + "learning_rate": 1.8548048471067854e-05, + "loss": 0.5177, + "step": 18194 + }, + { + "epoch": 2.9702461124035753, + "grad_norm": 1.939494252204895, + "learning_rate": 1.854788378714691e-05, + "loss": 0.5957, + "step": 18195 + }, + { + "epoch": 2.9704093710460797, + "grad_norm": 1.326919674873352, + "learning_rate": 1.8547719094618243e-05, + "loss": 0.4524, + "step": 18196 + }, + { + "epoch": 2.970572629688584, + "grad_norm": 1.5881034135818481, + "learning_rate": 1.8547554393482026e-05, + "loss": 0.5211, + "step": 18197 + }, + { + "epoch": 2.9707358883310886, + "grad_norm": 2.0996592044830322, + "learning_rate": 1.8547389683738427e-05, + "loss": 0.6088, + "step": 18198 + }, + { + "epoch": 2.970899146973593, + "grad_norm": 1.5659735202789307, + "learning_rate": 1.854722496538761e-05, + "loss": 0.5665, + "step": 18199 + }, + { + "epoch": 2.9710624056160975, + "grad_norm": 1.637933611869812, + "learning_rate": 1.8547060238429737e-05, + "loss": 0.5386, + "step": 18200 + }, + { + "epoch": 2.9712256642586015, + "grad_norm": 1.8051273822784424, + "learning_rate": 1.854689550286498e-05, + "loss": 0.5541, + "step": 18201 + }, + { + "epoch": 2.971388922901106, + "grad_norm": 1.7232459783554077, + "learning_rate": 1.8546730758693498e-05, + "loss": 0.5401, + "step": 18202 + }, + { + "epoch": 2.9715521815436103, + "grad_norm": 1.9662402868270874, + "learning_rate": 1.8546566005915458e-05, + "loss": 0.6396, + "step": 18203 + }, + { + "epoch": 2.9717154401861148, + "grad_norm": 1.3359075784683228, + "learning_rate": 1.854640124453103e-05, + "loss": 0.502, + "step": 18204 + }, + { + "epoch": 2.971878698828619, + "grad_norm": 1.657192587852478, + "learning_rate": 1.8546236474540384e-05, + "loss": 0.4249, + "step": 18205 + }, + { + "epoch": 2.9720419574711237, + "grad_norm": 1.7939566373825073, + "learning_rate": 1.854607169594367e-05, + "loss": 0.5316, + "step": 18206 + }, + { + "epoch": 2.972205216113628, + "grad_norm": 1.7591578960418701, + "learning_rate": 1.8545906908741074e-05, + "loss": 0.4955, + "step": 18207 + }, + { + "epoch": 2.9723684747561325, + "grad_norm": 1.8536492586135864, + "learning_rate": 1.8545742112932744e-05, + "loss": 0.7046, + "step": 18208 + }, + { + "epoch": 2.972531733398637, + "grad_norm": 1.5745627880096436, + "learning_rate": 1.854557730851886e-05, + "loss": 0.4885, + "step": 18209 + }, + { + "epoch": 2.9726949920411414, + "grad_norm": 1.4578603506088257, + "learning_rate": 1.854541249549958e-05, + "loss": 0.5735, + "step": 18210 + }, + { + "epoch": 2.972858250683646, + "grad_norm": 1.9467918872833252, + "learning_rate": 1.854524767387507e-05, + "loss": 0.6246, + "step": 18211 + }, + { + "epoch": 2.97302150932615, + "grad_norm": 1.8222229480743408, + "learning_rate": 1.85450828436455e-05, + "loss": 0.6184, + "step": 18212 + }, + { + "epoch": 2.9731847679686543, + "grad_norm": 1.979440450668335, + "learning_rate": 1.8544918004811034e-05, + "loss": 0.6552, + "step": 18213 + }, + { + "epoch": 2.9733480266111587, + "grad_norm": 2.1005289554595947, + "learning_rate": 1.8544753157371837e-05, + "loss": 0.675, + "step": 18214 + }, + { + "epoch": 2.973511285253663, + "grad_norm": 1.5870260000228882, + "learning_rate": 1.8544588301328077e-05, + "loss": 0.5165, + "step": 18215 + }, + { + "epoch": 2.9736745438961676, + "grad_norm": 1.5060704946517944, + "learning_rate": 1.8544423436679916e-05, + "loss": 0.5701, + "step": 18216 + }, + { + "epoch": 2.973837802538672, + "grad_norm": 2.0405776500701904, + "learning_rate": 1.8544258563427526e-05, + "loss": 0.643, + "step": 18217 + }, + { + "epoch": 2.974001061181176, + "grad_norm": 2.1862711906433105, + "learning_rate": 1.8544093681571067e-05, + "loss": 0.7036, + "step": 18218 + }, + { + "epoch": 2.9741643198236805, + "grad_norm": 2.1261558532714844, + "learning_rate": 1.854392879111071e-05, + "loss": 0.741, + "step": 18219 + }, + { + "epoch": 2.974327578466185, + "grad_norm": 2.092999219894409, + "learning_rate": 1.8543763892046618e-05, + "loss": 0.7189, + "step": 18220 + }, + { + "epoch": 2.9744908371086893, + "grad_norm": 1.74619722366333, + "learning_rate": 1.8543598984378958e-05, + "loss": 0.5812, + "step": 18221 + }, + { + "epoch": 2.9746540957511938, + "grad_norm": 1.9988503456115723, + "learning_rate": 1.8543434068107896e-05, + "loss": 0.6584, + "step": 18222 + }, + { + "epoch": 2.974817354393698, + "grad_norm": 1.783095121383667, + "learning_rate": 1.85432691432336e-05, + "loss": 0.5017, + "step": 18223 + }, + { + "epoch": 2.9749806130362026, + "grad_norm": 1.7868540287017822, + "learning_rate": 1.8543104209756233e-05, + "loss": 0.5913, + "step": 18224 + }, + { + "epoch": 2.975143871678707, + "grad_norm": 1.7318741083145142, + "learning_rate": 1.8542939267675962e-05, + "loss": 0.5376, + "step": 18225 + }, + { + "epoch": 2.9753071303212115, + "grad_norm": 1.6010483503341675, + "learning_rate": 1.8542774316992953e-05, + "loss": 0.5271, + "step": 18226 + }, + { + "epoch": 2.975470388963716, + "grad_norm": 1.786377191543579, + "learning_rate": 1.8542609357707376e-05, + "loss": 0.667, + "step": 18227 + }, + { + "epoch": 2.9756336476062204, + "grad_norm": 1.8095815181732178, + "learning_rate": 1.854244438981939e-05, + "loss": 0.5791, + "step": 18228 + }, + { + "epoch": 2.9757969062487244, + "grad_norm": 2.135197162628174, + "learning_rate": 1.8542279413329164e-05, + "loss": 0.5476, + "step": 18229 + }, + { + "epoch": 2.975960164891229, + "grad_norm": 1.7838348150253296, + "learning_rate": 1.8542114428236864e-05, + "loss": 0.5907, + "step": 18230 + }, + { + "epoch": 2.9761234235337333, + "grad_norm": 1.727491855621338, + "learning_rate": 1.854194943454266e-05, + "loss": 0.5426, + "step": 18231 + }, + { + "epoch": 2.9762866821762377, + "grad_norm": 1.7341772317886353, + "learning_rate": 1.8541784432246716e-05, + "loss": 0.5738, + "step": 18232 + }, + { + "epoch": 2.976449940818742, + "grad_norm": 1.7566782236099243, + "learning_rate": 1.8541619421349195e-05, + "loss": 0.4872, + "step": 18233 + }, + { + "epoch": 2.9766131994612466, + "grad_norm": 1.8296979665756226, + "learning_rate": 1.8541454401850268e-05, + "loss": 0.5192, + "step": 18234 + }, + { + "epoch": 2.976776458103751, + "grad_norm": 1.8785728216171265, + "learning_rate": 1.8541289373750098e-05, + "loss": 0.552, + "step": 18235 + }, + { + "epoch": 2.976939716746255, + "grad_norm": 1.5980488061904907, + "learning_rate": 1.854112433704885e-05, + "loss": 0.5409, + "step": 18236 + }, + { + "epoch": 2.9771029753887595, + "grad_norm": 1.9695004224777222, + "learning_rate": 1.8540959291746694e-05, + "loss": 0.5728, + "step": 18237 + }, + { + "epoch": 2.977266234031264, + "grad_norm": 2.1932804584503174, + "learning_rate": 1.8540794237843793e-05, + "loss": 0.5602, + "step": 18238 + }, + { + "epoch": 2.9774294926737683, + "grad_norm": 1.9010404348373413, + "learning_rate": 1.8540629175340315e-05, + "loss": 0.6171, + "step": 18239 + }, + { + "epoch": 2.9775927513162728, + "grad_norm": 1.7625526189804077, + "learning_rate": 1.8540464104236428e-05, + "loss": 0.5091, + "step": 18240 + }, + { + "epoch": 2.977756009958777, + "grad_norm": 1.6501644849777222, + "learning_rate": 1.854029902453229e-05, + "loss": 0.5233, + "step": 18241 + }, + { + "epoch": 2.9779192686012816, + "grad_norm": 2.053384780883789, + "learning_rate": 1.8540133936228077e-05, + "loss": 0.7264, + "step": 18242 + }, + { + "epoch": 2.978082527243786, + "grad_norm": 2.166025400161743, + "learning_rate": 1.853996883932395e-05, + "loss": 0.7876, + "step": 18243 + }, + { + "epoch": 2.9782457858862905, + "grad_norm": 2.0199954509735107, + "learning_rate": 1.853980373382008e-05, + "loss": 0.6919, + "step": 18244 + }, + { + "epoch": 2.978409044528795, + "grad_norm": 1.955330729484558, + "learning_rate": 1.8539638619716628e-05, + "loss": 0.5512, + "step": 18245 + }, + { + "epoch": 2.9785723031712994, + "grad_norm": 1.5919924974441528, + "learning_rate": 1.853947349701376e-05, + "loss": 0.5603, + "step": 18246 + }, + { + "epoch": 2.9787355618138034, + "grad_norm": 1.9034037590026855, + "learning_rate": 1.8539308365711644e-05, + "loss": 0.55, + "step": 18247 + }, + { + "epoch": 2.978898820456308, + "grad_norm": 1.5993704795837402, + "learning_rate": 1.8539143225810453e-05, + "loss": 0.5454, + "step": 18248 + }, + { + "epoch": 2.9790620790988123, + "grad_norm": 1.7215934991836548, + "learning_rate": 1.853897807731034e-05, + "loss": 0.6026, + "step": 18249 + }, + { + "epoch": 2.9792253377413167, + "grad_norm": 1.543052077293396, + "learning_rate": 1.8538812920211484e-05, + "loss": 0.5468, + "step": 18250 + }, + { + "epoch": 2.979388596383821, + "grad_norm": 1.9889198541641235, + "learning_rate": 1.8538647754514043e-05, + "loss": 0.6638, + "step": 18251 + }, + { + "epoch": 2.9795518550263256, + "grad_norm": 1.6613562107086182, + "learning_rate": 1.8538482580218185e-05, + "loss": 0.6302, + "step": 18252 + }, + { + "epoch": 2.9797151136688296, + "grad_norm": 1.639471411705017, + "learning_rate": 1.853831739732408e-05, + "loss": 0.5198, + "step": 18253 + }, + { + "epoch": 2.979878372311334, + "grad_norm": 1.5015804767608643, + "learning_rate": 1.8538152205831886e-05, + "loss": 0.4911, + "step": 18254 + }, + { + "epoch": 2.9800416309538384, + "grad_norm": 1.7349450588226318, + "learning_rate": 1.853798700574178e-05, + "loss": 0.5028, + "step": 18255 + }, + { + "epoch": 2.980204889596343, + "grad_norm": 1.6073940992355347, + "learning_rate": 1.8537821797053922e-05, + "loss": 0.5306, + "step": 18256 + }, + { + "epoch": 2.9803681482388473, + "grad_norm": 1.8022133111953735, + "learning_rate": 1.853765657976848e-05, + "loss": 0.5918, + "step": 18257 + }, + { + "epoch": 2.9805314068813518, + "grad_norm": 2.0392026901245117, + "learning_rate": 1.853749135388562e-05, + "loss": 0.6616, + "step": 18258 + }, + { + "epoch": 2.980694665523856, + "grad_norm": 1.5212280750274658, + "learning_rate": 1.8537326119405507e-05, + "loss": 0.5349, + "step": 18259 + }, + { + "epoch": 2.9808579241663606, + "grad_norm": 2.0180530548095703, + "learning_rate": 1.8537160876328313e-05, + "loss": 0.5742, + "step": 18260 + }, + { + "epoch": 2.981021182808865, + "grad_norm": 1.9912445545196533, + "learning_rate": 1.8536995624654197e-05, + "loss": 0.5866, + "step": 18261 + }, + { + "epoch": 2.9811844414513695, + "grad_norm": 1.7667882442474365, + "learning_rate": 1.853683036438333e-05, + "loss": 0.4847, + "step": 18262 + }, + { + "epoch": 2.981347700093874, + "grad_norm": 1.3009551763534546, + "learning_rate": 1.8536665095515876e-05, + "loss": 0.4656, + "step": 18263 + }, + { + "epoch": 2.981510958736378, + "grad_norm": 1.502537727355957, + "learning_rate": 1.8536499818052e-05, + "loss": 0.5923, + "step": 18264 + }, + { + "epoch": 2.9816742173788824, + "grad_norm": 1.6453442573547363, + "learning_rate": 1.8536334531991874e-05, + "loss": 0.5022, + "step": 18265 + }, + { + "epoch": 2.981837476021387, + "grad_norm": 1.9266594648361206, + "learning_rate": 1.8536169237335663e-05, + "loss": 0.6209, + "step": 18266 + }, + { + "epoch": 2.9820007346638913, + "grad_norm": 4.259009838104248, + "learning_rate": 1.853600393408353e-05, + "loss": 1.13, + "step": 18267 + }, + { + "epoch": 2.9821639933063957, + "grad_norm": 1.8285460472106934, + "learning_rate": 1.853583862223564e-05, + "loss": 0.5543, + "step": 18268 + }, + { + "epoch": 2.9823272519489, + "grad_norm": 1.7889249324798584, + "learning_rate": 1.853567330179217e-05, + "loss": 0.6034, + "step": 18269 + }, + { + "epoch": 2.9824905105914046, + "grad_norm": 1.7460594177246094, + "learning_rate": 1.8535507972753275e-05, + "loss": 0.5562, + "step": 18270 + }, + { + "epoch": 2.9826537692339086, + "grad_norm": 1.6565920114517212, + "learning_rate": 1.8535342635119128e-05, + "loss": 0.5559, + "step": 18271 + }, + { + "epoch": 2.982817027876413, + "grad_norm": 1.907871127128601, + "learning_rate": 1.853517728888989e-05, + "loss": 0.6358, + "step": 18272 + }, + { + "epoch": 2.9829802865189174, + "grad_norm": 1.2140332460403442, + "learning_rate": 1.8535011934065733e-05, + "loss": 0.3917, + "step": 18273 + }, + { + "epoch": 2.983143545161422, + "grad_norm": 1.6336971521377563, + "learning_rate": 1.8534846570646818e-05, + "loss": 0.5465, + "step": 18274 + }, + { + "epoch": 2.9833068038039263, + "grad_norm": 1.8147526979446411, + "learning_rate": 1.8534681198633318e-05, + "loss": 0.576, + "step": 18275 + }, + { + "epoch": 2.9834700624464308, + "grad_norm": 1.6397618055343628, + "learning_rate": 1.8534515818025397e-05, + "loss": 0.5882, + "step": 18276 + }, + { + "epoch": 2.983633321088935, + "grad_norm": 1.8914878368377686, + "learning_rate": 1.853435042882322e-05, + "loss": 0.664, + "step": 18277 + }, + { + "epoch": 2.9837965797314396, + "grad_norm": 1.6835737228393555, + "learning_rate": 1.853418503102695e-05, + "loss": 0.5484, + "step": 18278 + }, + { + "epoch": 2.983959838373944, + "grad_norm": 1.7317651510238647, + "learning_rate": 1.8534019624636764e-05, + "loss": 0.6047, + "step": 18279 + }, + { + "epoch": 2.9841230970164485, + "grad_norm": 1.4397761821746826, + "learning_rate": 1.853385420965282e-05, + "loss": 0.4041, + "step": 18280 + }, + { + "epoch": 2.984286355658953, + "grad_norm": 2.173340320587158, + "learning_rate": 1.853368878607529e-05, + "loss": 0.612, + "step": 18281 + }, + { + "epoch": 2.984449614301457, + "grad_norm": 2.129903793334961, + "learning_rate": 1.853352335390433e-05, + "loss": 0.7016, + "step": 18282 + }, + { + "epoch": 2.9846128729439614, + "grad_norm": 1.980839729309082, + "learning_rate": 1.853335791314012e-05, + "loss": 0.6208, + "step": 18283 + }, + { + "epoch": 2.984776131586466, + "grad_norm": 1.7694593667984009, + "learning_rate": 1.853319246378282e-05, + "loss": 0.5816, + "step": 18284 + }, + { + "epoch": 2.9849393902289703, + "grad_norm": 1.8405898809432983, + "learning_rate": 1.85330270058326e-05, + "loss": 0.5882, + "step": 18285 + }, + { + "epoch": 2.9851026488714747, + "grad_norm": 2.228308916091919, + "learning_rate": 1.8532861539289618e-05, + "loss": 0.6802, + "step": 18286 + }, + { + "epoch": 2.985265907513979, + "grad_norm": 1.6935869455337524, + "learning_rate": 1.853269606415405e-05, + "loss": 0.4668, + "step": 18287 + }, + { + "epoch": 2.9854291661564836, + "grad_norm": 1.7907236814498901, + "learning_rate": 1.853253058042606e-05, + "loss": 0.5881, + "step": 18288 + }, + { + "epoch": 2.9855924247989876, + "grad_norm": 1.7836415767669678, + "learning_rate": 1.8532365088105816e-05, + "loss": 0.6486, + "step": 18289 + }, + { + "epoch": 2.985755683441492, + "grad_norm": 1.7960295677185059, + "learning_rate": 1.8532199587193477e-05, + "loss": 0.5853, + "step": 18290 + }, + { + "epoch": 2.9859189420839964, + "grad_norm": 1.8897862434387207, + "learning_rate": 1.853203407768922e-05, + "loss": 0.5507, + "step": 18291 + }, + { + "epoch": 2.986082200726501, + "grad_norm": 1.6077934503555298, + "learning_rate": 1.8531868559593205e-05, + "loss": 0.4891, + "step": 18292 + }, + { + "epoch": 2.9862454593690053, + "grad_norm": 1.5636487007141113, + "learning_rate": 1.8531703032905603e-05, + "loss": 0.5114, + "step": 18293 + }, + { + "epoch": 2.9864087180115098, + "grad_norm": 2.3060624599456787, + "learning_rate": 1.8531537497626573e-05, + "loss": 0.6119, + "step": 18294 + }, + { + "epoch": 2.986571976654014, + "grad_norm": 1.7930686473846436, + "learning_rate": 1.853137195375629e-05, + "loss": 0.5559, + "step": 18295 + }, + { + "epoch": 2.9867352352965186, + "grad_norm": 1.9402812719345093, + "learning_rate": 1.853120640129492e-05, + "loss": 0.6691, + "step": 18296 + }, + { + "epoch": 2.986898493939023, + "grad_norm": 2.1886978149414062, + "learning_rate": 1.8531040840242625e-05, + "loss": 0.5938, + "step": 18297 + }, + { + "epoch": 2.9870617525815275, + "grad_norm": 2.363053560256958, + "learning_rate": 1.8530875270599573e-05, + "loss": 0.7055, + "step": 18298 + }, + { + "epoch": 2.987225011224032, + "grad_norm": 1.7611863613128662, + "learning_rate": 1.8530709692365934e-05, + "loss": 0.5745, + "step": 18299 + }, + { + "epoch": 2.987388269866536, + "grad_norm": 1.7467700242996216, + "learning_rate": 1.8530544105541872e-05, + "loss": 0.576, + "step": 18300 + }, + { + "epoch": 2.9875515285090404, + "grad_norm": 1.7809275388717651, + "learning_rate": 1.8530378510127555e-05, + "loss": 0.5546, + "step": 18301 + }, + { + "epoch": 2.987714787151545, + "grad_norm": 1.7219352722167969, + "learning_rate": 1.853021290612315e-05, + "loss": 0.5489, + "step": 18302 + }, + { + "epoch": 2.9878780457940493, + "grad_norm": 1.3850469589233398, + "learning_rate": 1.8530047293528818e-05, + "loss": 0.4865, + "step": 18303 + }, + { + "epoch": 2.9880413044365537, + "grad_norm": 1.8991518020629883, + "learning_rate": 1.8529881672344733e-05, + "loss": 0.5345, + "step": 18304 + }, + { + "epoch": 2.988204563079058, + "grad_norm": 1.81593656539917, + "learning_rate": 1.8529716042571063e-05, + "loss": 0.4816, + "step": 18305 + }, + { + "epoch": 2.988367821721562, + "grad_norm": 1.994473934173584, + "learning_rate": 1.8529550404207967e-05, + "loss": 0.5592, + "step": 18306 + }, + { + "epoch": 2.9885310803640666, + "grad_norm": 1.86656653881073, + "learning_rate": 1.852938475725562e-05, + "loss": 0.5518, + "step": 18307 + }, + { + "epoch": 2.988694339006571, + "grad_norm": 1.5667191743850708, + "learning_rate": 1.852921910171418e-05, + "loss": 0.4968, + "step": 18308 + }, + { + "epoch": 2.9888575976490754, + "grad_norm": 1.873581886291504, + "learning_rate": 1.852905343758382e-05, + "loss": 0.5235, + "step": 18309 + }, + { + "epoch": 2.98902085629158, + "grad_norm": 1.5499571561813354, + "learning_rate": 1.852888776486471e-05, + "loss": 0.5077, + "step": 18310 + }, + { + "epoch": 2.9891841149340843, + "grad_norm": 1.6724574565887451, + "learning_rate": 1.8528722083557006e-05, + "loss": 0.5847, + "step": 18311 + }, + { + "epoch": 2.9893473735765888, + "grad_norm": 1.9302486181259155, + "learning_rate": 1.8528556393660884e-05, + "loss": 0.635, + "step": 18312 + }, + { + "epoch": 2.989510632219093, + "grad_norm": 1.619215726852417, + "learning_rate": 1.8528390695176507e-05, + "loss": 0.4803, + "step": 18313 + }, + { + "epoch": 2.9896738908615976, + "grad_norm": 2.012899398803711, + "learning_rate": 1.8528224988104044e-05, + "loss": 0.6497, + "step": 18314 + }, + { + "epoch": 2.989837149504102, + "grad_norm": 1.9265031814575195, + "learning_rate": 1.852805927244366e-05, + "loss": 0.5907, + "step": 18315 + }, + { + "epoch": 2.9900004081466065, + "grad_norm": 1.7280688285827637, + "learning_rate": 1.8527893548195522e-05, + "loss": 0.5175, + "step": 18316 + }, + { + "epoch": 2.9901636667891105, + "grad_norm": 1.9736720323562622, + "learning_rate": 1.85277278153598e-05, + "loss": 0.5953, + "step": 18317 + }, + { + "epoch": 2.990326925431615, + "grad_norm": 2.152324914932251, + "learning_rate": 1.8527562073936657e-05, + "loss": 0.7341, + "step": 18318 + }, + { + "epoch": 2.9904901840741194, + "grad_norm": 1.9676761627197266, + "learning_rate": 1.852739632392626e-05, + "loss": 0.6987, + "step": 18319 + }, + { + "epoch": 2.990653442716624, + "grad_norm": 2.0830650329589844, + "learning_rate": 1.852723056532878e-05, + "loss": 0.654, + "step": 18320 + }, + { + "epoch": 2.9908167013591282, + "grad_norm": 1.6638784408569336, + "learning_rate": 1.8527064798144376e-05, + "loss": 0.5867, + "step": 18321 + }, + { + "epoch": 2.9909799600016327, + "grad_norm": 1.7794435024261475, + "learning_rate": 1.8526899022373224e-05, + "loss": 0.5915, + "step": 18322 + }, + { + "epoch": 2.991143218644137, + "grad_norm": 1.8954675197601318, + "learning_rate": 1.8526733238015486e-05, + "loss": 0.646, + "step": 18323 + }, + { + "epoch": 2.991306477286641, + "grad_norm": 1.756690263748169, + "learning_rate": 1.852656744507133e-05, + "loss": 0.564, + "step": 18324 + }, + { + "epoch": 2.9914697359291456, + "grad_norm": 1.5294424295425415, + "learning_rate": 1.8526401643540924e-05, + "loss": 0.5258, + "step": 18325 + }, + { + "epoch": 2.99163299457165, + "grad_norm": 1.7507840394973755, + "learning_rate": 1.8526235833424433e-05, + "loss": 0.5683, + "step": 18326 + }, + { + "epoch": 2.9917962532141544, + "grad_norm": 1.7037434577941895, + "learning_rate": 1.8526070014722026e-05, + "loss": 0.5249, + "step": 18327 + }, + { + "epoch": 2.991959511856659, + "grad_norm": 1.7802932262420654, + "learning_rate": 1.8525904187433866e-05, + "loss": 0.5241, + "step": 18328 + }, + { + "epoch": 2.9921227704991633, + "grad_norm": 1.6140450239181519, + "learning_rate": 1.8525738351560122e-05, + "loss": 0.432, + "step": 18329 + }, + { + "epoch": 2.9922860291416677, + "grad_norm": 1.5623780488967896, + "learning_rate": 1.8525572507100964e-05, + "loss": 0.5267, + "step": 18330 + }, + { + "epoch": 2.992449287784172, + "grad_norm": 1.6189546585083008, + "learning_rate": 1.852540665405656e-05, + "loss": 0.4609, + "step": 18331 + }, + { + "epoch": 2.9926125464266766, + "grad_norm": 1.6717780828475952, + "learning_rate": 1.8525240792427067e-05, + "loss": 0.6022, + "step": 18332 + }, + { + "epoch": 2.992775805069181, + "grad_norm": 2.0648698806762695, + "learning_rate": 1.8525074922212663e-05, + "loss": 0.6479, + "step": 18333 + }, + { + "epoch": 2.9929390637116855, + "grad_norm": 1.910882830619812, + "learning_rate": 1.852490904341351e-05, + "loss": 0.5057, + "step": 18334 + }, + { + "epoch": 2.9931023223541895, + "grad_norm": 1.2483888864517212, + "learning_rate": 1.8524743156029778e-05, + "loss": 0.4227, + "step": 18335 + }, + { + "epoch": 2.993265580996694, + "grad_norm": 1.6565933227539062, + "learning_rate": 1.8524577260061628e-05, + "loss": 0.545, + "step": 18336 + }, + { + "epoch": 2.9934288396391984, + "grad_norm": 1.4916671514511108, + "learning_rate": 1.852441135550923e-05, + "loss": 0.4996, + "step": 18337 + }, + { + "epoch": 2.993592098281703, + "grad_norm": 1.7841724157333374, + "learning_rate": 1.8524245442372756e-05, + "loss": 0.5064, + "step": 18338 + }, + { + "epoch": 2.9937553569242072, + "grad_norm": 1.7434172630310059, + "learning_rate": 1.8524079520652366e-05, + "loss": 0.513, + "step": 18339 + }, + { + "epoch": 2.9939186155667117, + "grad_norm": 1.7766677141189575, + "learning_rate": 1.852391359034823e-05, + "loss": 0.5494, + "step": 18340 + }, + { + "epoch": 2.9940818742092157, + "grad_norm": 1.8250246047973633, + "learning_rate": 1.852374765146052e-05, + "loss": 0.6103, + "step": 18341 + }, + { + "epoch": 2.99424513285172, + "grad_norm": 2.1334476470947266, + "learning_rate": 1.8523581703989396e-05, + "loss": 0.5845, + "step": 18342 + }, + { + "epoch": 2.9944083914942246, + "grad_norm": 1.7793142795562744, + "learning_rate": 1.8523415747935026e-05, + "loss": 0.5538, + "step": 18343 + }, + { + "epoch": 2.994571650136729, + "grad_norm": 1.6337394714355469, + "learning_rate": 1.852324978329758e-05, + "loss": 0.5648, + "step": 18344 + }, + { + "epoch": 2.9947349087792334, + "grad_norm": 1.9101333618164062, + "learning_rate": 1.8523083810077224e-05, + "loss": 0.6265, + "step": 18345 + }, + { + "epoch": 2.994898167421738, + "grad_norm": 1.8142592906951904, + "learning_rate": 1.8522917828274124e-05, + "loss": 0.6193, + "step": 18346 + }, + { + "epoch": 2.9950614260642423, + "grad_norm": 1.7255738973617554, + "learning_rate": 1.8522751837888448e-05, + "loss": 0.5681, + "step": 18347 + }, + { + "epoch": 2.9952246847067467, + "grad_norm": 1.7363373041152954, + "learning_rate": 1.852258583892036e-05, + "loss": 0.5223, + "step": 18348 + }, + { + "epoch": 2.995387943349251, + "grad_norm": 1.9215550422668457, + "learning_rate": 1.8522419831370037e-05, + "loss": 0.5246, + "step": 18349 + }, + { + "epoch": 2.9955512019917556, + "grad_norm": 1.6817104816436768, + "learning_rate": 1.8522253815237636e-05, + "loss": 0.5568, + "step": 18350 + }, + { + "epoch": 2.99571446063426, + "grad_norm": 1.4248195886611938, + "learning_rate": 1.8522087790523325e-05, + "loss": 0.5433, + "step": 18351 + }, + { + "epoch": 2.995877719276764, + "grad_norm": 2.117276430130005, + "learning_rate": 1.852192175722728e-05, + "loss": 0.6512, + "step": 18352 + }, + { + "epoch": 2.9960409779192685, + "grad_norm": 1.483764410018921, + "learning_rate": 1.8521755715349658e-05, + "loss": 0.5178, + "step": 18353 + }, + { + "epoch": 2.996204236561773, + "grad_norm": 1.4881583452224731, + "learning_rate": 1.852158966489063e-05, + "loss": 0.4313, + "step": 18354 + }, + { + "epoch": 2.9963674952042774, + "grad_norm": 1.8912324905395508, + "learning_rate": 1.8521423605850366e-05, + "loss": 0.6533, + "step": 18355 + }, + { + "epoch": 2.996530753846782, + "grad_norm": 1.5512250661849976, + "learning_rate": 1.852125753822903e-05, + "loss": 0.5125, + "step": 18356 + }, + { + "epoch": 2.9966940124892862, + "grad_norm": 1.6535760164260864, + "learning_rate": 1.852109146202679e-05, + "loss": 0.513, + "step": 18357 + }, + { + "epoch": 2.9968572711317907, + "grad_norm": 1.5639076232910156, + "learning_rate": 1.8520925377243812e-05, + "loss": 0.512, + "step": 18358 + }, + { + "epoch": 2.9970205297742947, + "grad_norm": 1.808722734451294, + "learning_rate": 1.852075928388026e-05, + "loss": 0.5169, + "step": 18359 + }, + { + "epoch": 2.997183788416799, + "grad_norm": 1.6385999917984009, + "learning_rate": 1.8520593181936312e-05, + "loss": 0.5465, + "step": 18360 + }, + { + "epoch": 2.9973470470593035, + "grad_norm": 1.947251319885254, + "learning_rate": 1.852042707141213e-05, + "loss": 0.5627, + "step": 18361 + }, + { + "epoch": 2.997510305701808, + "grad_norm": 1.8388338088989258, + "learning_rate": 1.8520260952307874e-05, + "loss": 0.6269, + "step": 18362 + }, + { + "epoch": 2.9976735643443124, + "grad_norm": 1.9596970081329346, + "learning_rate": 1.852009482462372e-05, + "loss": 0.5418, + "step": 18363 + }, + { + "epoch": 2.997836822986817, + "grad_norm": 1.8457374572753906, + "learning_rate": 1.8519928688359836e-05, + "loss": 0.6199, + "step": 18364 + }, + { + "epoch": 2.9980000816293213, + "grad_norm": 1.4222497940063477, + "learning_rate": 1.851976254351638e-05, + "loss": 0.4654, + "step": 18365 + }, + { + "epoch": 2.9981633402718257, + "grad_norm": 1.7840983867645264, + "learning_rate": 1.8519596390093533e-05, + "loss": 0.6026, + "step": 18366 + }, + { + "epoch": 2.99832659891433, + "grad_norm": 1.6988296508789062, + "learning_rate": 1.851943022809145e-05, + "loss": 0.5198, + "step": 18367 + }, + { + "epoch": 2.9984898575568346, + "grad_norm": 1.8054527044296265, + "learning_rate": 1.8519264057510304e-05, + "loss": 0.6254, + "step": 18368 + }, + { + "epoch": 2.998653116199339, + "grad_norm": 1.8095499277114868, + "learning_rate": 1.851909787835026e-05, + "loss": 0.5138, + "step": 18369 + }, + { + "epoch": 2.998816374841843, + "grad_norm": 1.9422591924667358, + "learning_rate": 1.851893169061149e-05, + "loss": 0.5, + "step": 18370 + }, + { + "epoch": 2.9989796334843475, + "grad_norm": 1.724324107170105, + "learning_rate": 1.8518765494294154e-05, + "loss": 0.4788, + "step": 18371 + }, + { + "epoch": 2.999142892126852, + "grad_norm": 1.7294195890426636, + "learning_rate": 1.8518599289398425e-05, + "loss": 0.5146, + "step": 18372 + }, + { + "epoch": 2.9993061507693564, + "grad_norm": 1.775810718536377, + "learning_rate": 1.8518433075924468e-05, + "loss": 0.524, + "step": 18373 + }, + { + "epoch": 2.999469409411861, + "grad_norm": 2.0746102333068848, + "learning_rate": 1.8518266853872456e-05, + "loss": 0.6908, + "step": 18374 + }, + { + "epoch": 2.9996326680543652, + "grad_norm": 1.8121826648712158, + "learning_rate": 1.8518100623242548e-05, + "loss": 0.5378, + "step": 18375 + }, + { + "epoch": 2.9997959266968697, + "grad_norm": 1.7077902555465698, + "learning_rate": 1.8517934384034913e-05, + "loss": 0.5262, + "step": 18376 + }, + { + "epoch": 2.9999591853393737, + "grad_norm": 1.9575108289718628, + "learning_rate": 1.8517768136249722e-05, + "loss": 0.5616, + "step": 18377 + }, + { + "epoch": 3.0, + "grad_norm": 4.377547740936279, + "learning_rate": 1.8517601879887143e-05, + "loss": 0.9378, + "step": 18378 + }, + { + "epoch": 3.0001632586425044, + "grad_norm": 1.7345848083496094, + "learning_rate": 1.851743561494734e-05, + "loss": 0.7063, + "step": 18379 + }, + { + "epoch": 3.000326517285009, + "grad_norm": 1.059463381767273, + "learning_rate": 1.851726934143048e-05, + "loss": 0.3863, + "step": 18380 + }, + { + "epoch": 3.0004897759275133, + "grad_norm": 1.49441397190094, + "learning_rate": 1.851710305933673e-05, + "loss": 0.4362, + "step": 18381 + }, + { + "epoch": 3.0006530345700178, + "grad_norm": 1.6815259456634521, + "learning_rate": 1.8516936768666263e-05, + "loss": 0.5695, + "step": 18382 + }, + { + "epoch": 3.0008162932125217, + "grad_norm": 1.5109926462173462, + "learning_rate": 1.8516770469419242e-05, + "loss": 0.4678, + "step": 18383 + }, + { + "epoch": 3.000979551855026, + "grad_norm": 1.3794513940811157, + "learning_rate": 1.8516604161595834e-05, + "loss": 0.4301, + "step": 18384 + }, + { + "epoch": 3.0011428104975306, + "grad_norm": 1.2333649396896362, + "learning_rate": 1.8516437845196213e-05, + "loss": 0.377, + "step": 18385 + }, + { + "epoch": 3.001306069140035, + "grad_norm": 1.6946130990982056, + "learning_rate": 1.8516271520220536e-05, + "loss": 0.4909, + "step": 18386 + }, + { + "epoch": 3.0014693277825395, + "grad_norm": 1.5168777704238892, + "learning_rate": 1.8516105186668976e-05, + "loss": 0.5049, + "step": 18387 + }, + { + "epoch": 3.001632586425044, + "grad_norm": 1.5291268825531006, + "learning_rate": 1.8515938844541702e-05, + "loss": 0.5318, + "step": 18388 + }, + { + "epoch": 3.0017958450675484, + "grad_norm": 1.6677768230438232, + "learning_rate": 1.851577249383888e-05, + "loss": 0.4646, + "step": 18389 + }, + { + "epoch": 3.001959103710053, + "grad_norm": 1.6690900325775146, + "learning_rate": 1.8515606134560676e-05, + "loss": 0.4736, + "step": 18390 + }, + { + "epoch": 3.0021223623525572, + "grad_norm": 1.8276921510696411, + "learning_rate": 1.851543976670726e-05, + "loss": 0.5106, + "step": 18391 + }, + { + "epoch": 3.0022856209950612, + "grad_norm": 1.695304036140442, + "learning_rate": 1.8515273390278797e-05, + "loss": 0.4622, + "step": 18392 + }, + { + "epoch": 3.0024488796375657, + "grad_norm": 1.9219801425933838, + "learning_rate": 1.8515107005275458e-05, + "loss": 0.4804, + "step": 18393 + }, + { + "epoch": 3.00261213828007, + "grad_norm": 1.7109897136688232, + "learning_rate": 1.851494061169741e-05, + "loss": 0.453, + "step": 18394 + }, + { + "epoch": 3.0027753969225746, + "grad_norm": 1.718621850013733, + "learning_rate": 1.8514774209544818e-05, + "loss": 0.4625, + "step": 18395 + }, + { + "epoch": 3.002938655565079, + "grad_norm": 2.235752582550049, + "learning_rate": 1.8514607798817846e-05, + "loss": 0.7186, + "step": 18396 + }, + { + "epoch": 3.0031019142075834, + "grad_norm": 1.5759915113449097, + "learning_rate": 1.851444137951667e-05, + "loss": 0.4628, + "step": 18397 + }, + { + "epoch": 3.003265172850088, + "grad_norm": 1.9532945156097412, + "learning_rate": 1.8514274951641454e-05, + "loss": 0.4882, + "step": 18398 + }, + { + "epoch": 3.0034284314925923, + "grad_norm": 1.4866477251052856, + "learning_rate": 1.8514108515192365e-05, + "loss": 0.4009, + "step": 18399 + }, + { + "epoch": 3.0035916901350967, + "grad_norm": 1.5457483530044556, + "learning_rate": 1.8513942070169572e-05, + "loss": 0.4827, + "step": 18400 + }, + { + "epoch": 3.0037549487776007, + "grad_norm": 2.1415607929229736, + "learning_rate": 1.8513775616573236e-05, + "loss": 0.6014, + "step": 18401 + }, + { + "epoch": 3.003918207420105, + "grad_norm": 1.884354829788208, + "learning_rate": 1.8513609154403535e-05, + "loss": 0.4486, + "step": 18402 + }, + { + "epoch": 3.0040814660626096, + "grad_norm": 1.61143958568573, + "learning_rate": 1.8513442683660634e-05, + "loss": 0.4172, + "step": 18403 + }, + { + "epoch": 3.004244724705114, + "grad_norm": 1.571800947189331, + "learning_rate": 1.8513276204344698e-05, + "loss": 0.4168, + "step": 18404 + }, + { + "epoch": 3.0044079833476185, + "grad_norm": 1.936752438545227, + "learning_rate": 1.851310971645589e-05, + "loss": 0.5429, + "step": 18405 + }, + { + "epoch": 3.004571241990123, + "grad_norm": 1.502915859222412, + "learning_rate": 1.8512943219994387e-05, + "loss": 0.3992, + "step": 18406 + }, + { + "epoch": 3.0047345006326274, + "grad_norm": 1.626674771308899, + "learning_rate": 1.851277671496035e-05, + "loss": 0.3661, + "step": 18407 + }, + { + "epoch": 3.004897759275132, + "grad_norm": 2.057309150695801, + "learning_rate": 1.8512610201353952e-05, + "loss": 0.4154, + "step": 18408 + }, + { + "epoch": 3.005061017917636, + "grad_norm": 2.2861721515655518, + "learning_rate": 1.8512443679175358e-05, + "loss": 0.5508, + "step": 18409 + }, + { + "epoch": 3.0052242765601402, + "grad_norm": 1.8140822649002075, + "learning_rate": 1.8512277148424732e-05, + "loss": 0.4949, + "step": 18410 + }, + { + "epoch": 3.0053875352026447, + "grad_norm": 2.1315994262695312, + "learning_rate": 1.851211060910225e-05, + "loss": 0.4707, + "step": 18411 + }, + { + "epoch": 3.005550793845149, + "grad_norm": 1.9330172538757324, + "learning_rate": 1.851194406120807e-05, + "loss": 0.4731, + "step": 18412 + }, + { + "epoch": 3.0057140524876536, + "grad_norm": 1.7516013383865356, + "learning_rate": 1.8511777504742364e-05, + "loss": 0.445, + "step": 18413 + }, + { + "epoch": 3.005877311130158, + "grad_norm": 2.1389477252960205, + "learning_rate": 1.8511610939705302e-05, + "loss": 0.5006, + "step": 18414 + }, + { + "epoch": 3.0060405697726624, + "grad_norm": 1.7275699377059937, + "learning_rate": 1.8511444366097053e-05, + "loss": 0.435, + "step": 18415 + }, + { + "epoch": 3.006203828415167, + "grad_norm": 1.4510436058044434, + "learning_rate": 1.851127778391778e-05, + "loss": 0.3573, + "step": 18416 + }, + { + "epoch": 3.0063670870576713, + "grad_norm": 1.6458261013031006, + "learning_rate": 1.851111119316765e-05, + "loss": 0.436, + "step": 18417 + }, + { + "epoch": 3.0065303457001753, + "grad_norm": 1.960505485534668, + "learning_rate": 1.8510944593846837e-05, + "loss": 0.4778, + "step": 18418 + }, + { + "epoch": 3.0066936043426797, + "grad_norm": 1.7683360576629639, + "learning_rate": 1.8510777985955498e-05, + "loss": 0.367, + "step": 18419 + }, + { + "epoch": 3.006856862985184, + "grad_norm": 1.92803156375885, + "learning_rate": 1.8510611369493815e-05, + "loss": 0.4133, + "step": 18420 + }, + { + "epoch": 3.0070201216276886, + "grad_norm": 2.101102828979492, + "learning_rate": 1.851044474446195e-05, + "loss": 0.6209, + "step": 18421 + }, + { + "epoch": 3.007183380270193, + "grad_norm": 2.19883131980896, + "learning_rate": 1.8510278110860063e-05, + "loss": 0.5062, + "step": 18422 + }, + { + "epoch": 3.0073466389126975, + "grad_norm": 1.9199559688568115, + "learning_rate": 1.8510111468688332e-05, + "loss": 0.4686, + "step": 18423 + }, + { + "epoch": 3.007509897555202, + "grad_norm": 1.6093086004257202, + "learning_rate": 1.850994481794692e-05, + "loss": 0.4466, + "step": 18424 + }, + { + "epoch": 3.0076731561977064, + "grad_norm": 1.9249941110610962, + "learning_rate": 1.8509778158636e-05, + "loss": 0.4237, + "step": 18425 + }, + { + "epoch": 3.007836414840211, + "grad_norm": 1.8909653425216675, + "learning_rate": 1.8509611490755728e-05, + "loss": 0.5167, + "step": 18426 + }, + { + "epoch": 3.007999673482715, + "grad_norm": 1.720744013786316, + "learning_rate": 1.8509444814306284e-05, + "loss": 0.4099, + "step": 18427 + }, + { + "epoch": 3.0081629321252192, + "grad_norm": 1.8505769968032837, + "learning_rate": 1.850927812928783e-05, + "loss": 0.4641, + "step": 18428 + }, + { + "epoch": 3.0083261907677237, + "grad_norm": 2.2107269763946533, + "learning_rate": 1.8509111435700537e-05, + "loss": 0.6222, + "step": 18429 + }, + { + "epoch": 3.008489449410228, + "grad_norm": 2.2616117000579834, + "learning_rate": 1.8508944733544568e-05, + "loss": 0.4774, + "step": 18430 + }, + { + "epoch": 3.0086527080527325, + "grad_norm": 2.1400210857391357, + "learning_rate": 1.8508778022820095e-05, + "loss": 0.4967, + "step": 18431 + }, + { + "epoch": 3.008815966695237, + "grad_norm": 1.626513123512268, + "learning_rate": 1.8508611303527285e-05, + "loss": 0.4132, + "step": 18432 + }, + { + "epoch": 3.0089792253377414, + "grad_norm": 1.9320924282073975, + "learning_rate": 1.850844457566631e-05, + "loss": 0.4379, + "step": 18433 + }, + { + "epoch": 3.009142483980246, + "grad_norm": 1.9688199758529663, + "learning_rate": 1.8508277839237328e-05, + "loss": 0.4371, + "step": 18434 + }, + { + "epoch": 3.0093057426227503, + "grad_norm": 1.6453510522842407, + "learning_rate": 1.8508111094240516e-05, + "loss": 0.3973, + "step": 18435 + }, + { + "epoch": 3.0094690012652543, + "grad_norm": 1.8470560312271118, + "learning_rate": 1.8507944340676035e-05, + "loss": 0.4427, + "step": 18436 + }, + { + "epoch": 3.0096322599077587, + "grad_norm": 1.723944067955017, + "learning_rate": 1.850777757854406e-05, + "loss": 0.4095, + "step": 18437 + }, + { + "epoch": 3.009795518550263, + "grad_norm": 1.8001320362091064, + "learning_rate": 1.850761080784475e-05, + "loss": 0.4608, + "step": 18438 + }, + { + "epoch": 3.0099587771927676, + "grad_norm": 1.8901313543319702, + "learning_rate": 1.8507444028578284e-05, + "loss": 0.4632, + "step": 18439 + }, + { + "epoch": 3.010122035835272, + "grad_norm": 2.261765718460083, + "learning_rate": 1.8507277240744818e-05, + "loss": 0.5294, + "step": 18440 + }, + { + "epoch": 3.0102852944777765, + "grad_norm": 2.6511919498443604, + "learning_rate": 1.850711044434453e-05, + "loss": 0.514, + "step": 18441 + }, + { + "epoch": 3.010448553120281, + "grad_norm": 1.9696508646011353, + "learning_rate": 1.8506943639377587e-05, + "loss": 0.5448, + "step": 18442 + }, + { + "epoch": 3.0106118117627854, + "grad_norm": 1.9744733572006226, + "learning_rate": 1.850677682584415e-05, + "loss": 0.4818, + "step": 18443 + }, + { + "epoch": 3.01077507040529, + "grad_norm": 1.9812250137329102, + "learning_rate": 1.850661000374439e-05, + "loss": 0.4568, + "step": 18444 + }, + { + "epoch": 3.010938329047794, + "grad_norm": 1.9795281887054443, + "learning_rate": 1.8506443173078478e-05, + "loss": 0.4701, + "step": 18445 + }, + { + "epoch": 3.0111015876902982, + "grad_norm": 1.8404620885849, + "learning_rate": 1.850627633384658e-05, + "loss": 0.4487, + "step": 18446 + }, + { + "epoch": 3.0112648463328027, + "grad_norm": 1.8910188674926758, + "learning_rate": 1.850610948604886e-05, + "loss": 0.5046, + "step": 18447 + }, + { + "epoch": 3.011428104975307, + "grad_norm": 1.4994096755981445, + "learning_rate": 1.8505942629685492e-05, + "loss": 0.4319, + "step": 18448 + }, + { + "epoch": 3.0115913636178115, + "grad_norm": 1.7680944204330444, + "learning_rate": 1.8505775764756646e-05, + "loss": 0.4616, + "step": 18449 + }, + { + "epoch": 3.011754622260316, + "grad_norm": 2.065762996673584, + "learning_rate": 1.8505608891262487e-05, + "loss": 0.4625, + "step": 18450 + }, + { + "epoch": 3.0119178809028204, + "grad_norm": 1.9081531763076782, + "learning_rate": 1.8505442009203175e-05, + "loss": 0.4498, + "step": 18451 + }, + { + "epoch": 3.012081139545325, + "grad_norm": 1.8591570854187012, + "learning_rate": 1.8505275118578892e-05, + "loss": 0.4903, + "step": 18452 + }, + { + "epoch": 3.012244398187829, + "grad_norm": 1.6592947244644165, + "learning_rate": 1.8505108219389792e-05, + "loss": 0.3962, + "step": 18453 + }, + { + "epoch": 3.0124076568303333, + "grad_norm": 1.7315024137496948, + "learning_rate": 1.8504941311636057e-05, + "loss": 0.4425, + "step": 18454 + }, + { + "epoch": 3.0125709154728377, + "grad_norm": 1.9520494937896729, + "learning_rate": 1.8504774395317847e-05, + "loss": 0.5555, + "step": 18455 + }, + { + "epoch": 3.012734174115342, + "grad_norm": 2.23970890045166, + "learning_rate": 1.850460747043533e-05, + "loss": 0.5153, + "step": 18456 + }, + { + "epoch": 3.0128974327578466, + "grad_norm": 1.631711483001709, + "learning_rate": 1.850444053698867e-05, + "loss": 0.4295, + "step": 18457 + }, + { + "epoch": 3.013060691400351, + "grad_norm": 1.5388363599777222, + "learning_rate": 1.850427359497805e-05, + "loss": 0.4681, + "step": 18458 + }, + { + "epoch": 3.0132239500428555, + "grad_norm": 2.2095422744750977, + "learning_rate": 1.8504106644403626e-05, + "loss": 0.5605, + "step": 18459 + }, + { + "epoch": 3.01338720868536, + "grad_norm": 2.034498929977417, + "learning_rate": 1.850393968526557e-05, + "loss": 0.5697, + "step": 18460 + }, + { + "epoch": 3.0135504673278644, + "grad_norm": 1.707322120666504, + "learning_rate": 1.8503772717564047e-05, + "loss": 0.4521, + "step": 18461 + }, + { + "epoch": 3.0137137259703684, + "grad_norm": 1.648890733718872, + "learning_rate": 1.8503605741299224e-05, + "loss": 0.4749, + "step": 18462 + }, + { + "epoch": 3.013876984612873, + "grad_norm": 2.1041650772094727, + "learning_rate": 1.8503438756471275e-05, + "loss": 0.5311, + "step": 18463 + }, + { + "epoch": 3.0140402432553772, + "grad_norm": 1.8426687717437744, + "learning_rate": 1.8503271763080368e-05, + "loss": 0.4347, + "step": 18464 + }, + { + "epoch": 3.0142035018978817, + "grad_norm": 2.1467485427856445, + "learning_rate": 1.8503104761126666e-05, + "loss": 0.5149, + "step": 18465 + }, + { + "epoch": 3.014366760540386, + "grad_norm": 1.979142665863037, + "learning_rate": 1.850293775061034e-05, + "loss": 0.5072, + "step": 18466 + }, + { + "epoch": 3.0145300191828905, + "grad_norm": 2.097163438796997, + "learning_rate": 1.8502770731531558e-05, + "loss": 0.4964, + "step": 18467 + }, + { + "epoch": 3.014693277825395, + "grad_norm": 1.7263426780700684, + "learning_rate": 1.8502603703890488e-05, + "loss": 0.4558, + "step": 18468 + }, + { + "epoch": 3.0148565364678994, + "grad_norm": 1.9380106925964355, + "learning_rate": 1.8502436667687296e-05, + "loss": 0.4793, + "step": 18469 + }, + { + "epoch": 3.015019795110404, + "grad_norm": 1.991590142250061, + "learning_rate": 1.8502269622922157e-05, + "loss": 0.4445, + "step": 18470 + }, + { + "epoch": 3.015183053752908, + "grad_norm": 1.8317480087280273, + "learning_rate": 1.8502102569595233e-05, + "loss": 0.4735, + "step": 18471 + }, + { + "epoch": 3.0153463123954123, + "grad_norm": 1.4751231670379639, + "learning_rate": 1.8501935507706692e-05, + "loss": 0.4014, + "step": 18472 + }, + { + "epoch": 3.0155095710379167, + "grad_norm": 2.1528706550598145, + "learning_rate": 1.8501768437256705e-05, + "loss": 0.5282, + "step": 18473 + }, + { + "epoch": 3.015672829680421, + "grad_norm": 1.870710849761963, + "learning_rate": 1.850160135824544e-05, + "loss": 0.4667, + "step": 18474 + }, + { + "epoch": 3.0158360883229256, + "grad_norm": 1.8722426891326904, + "learning_rate": 1.8501434270673066e-05, + "loss": 0.4529, + "step": 18475 + }, + { + "epoch": 3.01599934696543, + "grad_norm": 2.156005620956421, + "learning_rate": 1.850126717453975e-05, + "loss": 0.4573, + "step": 18476 + }, + { + "epoch": 3.0161626056079345, + "grad_norm": 1.709607720375061, + "learning_rate": 1.8501100069845657e-05, + "loss": 0.4451, + "step": 18477 + }, + { + "epoch": 3.016325864250439, + "grad_norm": 1.8761217594146729, + "learning_rate": 1.8500932956590962e-05, + "loss": 0.4409, + "step": 18478 + }, + { + "epoch": 3.0164891228929434, + "grad_norm": 1.5948123931884766, + "learning_rate": 1.850076583477583e-05, + "loss": 0.46, + "step": 18479 + }, + { + "epoch": 3.0166523815354473, + "grad_norm": 2.171351671218872, + "learning_rate": 1.8500598704400427e-05, + "loss": 0.4992, + "step": 18480 + }, + { + "epoch": 3.016815640177952, + "grad_norm": 2.2236905097961426, + "learning_rate": 1.8500431565464923e-05, + "loss": 0.5097, + "step": 18481 + }, + { + "epoch": 3.0169788988204562, + "grad_norm": 1.736372709274292, + "learning_rate": 1.850026441796949e-05, + "loss": 0.4631, + "step": 18482 + }, + { + "epoch": 3.0171421574629607, + "grad_norm": 1.8046756982803345, + "learning_rate": 1.850009726191429e-05, + "loss": 0.4052, + "step": 18483 + }, + { + "epoch": 3.017305416105465, + "grad_norm": 1.6675002574920654, + "learning_rate": 1.8499930097299496e-05, + "loss": 0.4909, + "step": 18484 + }, + { + "epoch": 3.0174686747479695, + "grad_norm": 2.023834228515625, + "learning_rate": 1.849976292412527e-05, + "loss": 0.5026, + "step": 18485 + }, + { + "epoch": 3.017631933390474, + "grad_norm": 1.8564742803573608, + "learning_rate": 1.849959574239179e-05, + "loss": 0.4473, + "step": 18486 + }, + { + "epoch": 3.0177951920329784, + "grad_norm": 1.754656195640564, + "learning_rate": 1.8499428552099217e-05, + "loss": 0.4267, + "step": 18487 + }, + { + "epoch": 3.017958450675483, + "grad_norm": 1.8235825300216675, + "learning_rate": 1.8499261353247722e-05, + "loss": 0.4374, + "step": 18488 + }, + { + "epoch": 3.018121709317987, + "grad_norm": 2.7384119033813477, + "learning_rate": 1.8499094145837475e-05, + "loss": 0.5287, + "step": 18489 + }, + { + "epoch": 3.0182849679604913, + "grad_norm": 1.638433575630188, + "learning_rate": 1.849892692986864e-05, + "loss": 0.409, + "step": 18490 + }, + { + "epoch": 3.0184482266029957, + "grad_norm": 1.8186067342758179, + "learning_rate": 1.849875970534139e-05, + "loss": 0.4641, + "step": 18491 + }, + { + "epoch": 3.0186114852455, + "grad_norm": 1.594215750694275, + "learning_rate": 1.849859247225589e-05, + "loss": 0.422, + "step": 18492 + }, + { + "epoch": 3.0187747438880046, + "grad_norm": 2.2628183364868164, + "learning_rate": 1.849842523061231e-05, + "loss": 0.5316, + "step": 18493 + }, + { + "epoch": 3.018938002530509, + "grad_norm": 1.6670162677764893, + "learning_rate": 1.849825798041082e-05, + "loss": 0.4217, + "step": 18494 + }, + { + "epoch": 3.0191012611730135, + "grad_norm": 1.8213368654251099, + "learning_rate": 1.849809072165158e-05, + "loss": 0.5048, + "step": 18495 + }, + { + "epoch": 3.019264519815518, + "grad_norm": 2.0318217277526855, + "learning_rate": 1.849792345433477e-05, + "loss": 0.5022, + "step": 18496 + }, + { + "epoch": 3.019427778458022, + "grad_norm": 2.1603782176971436, + "learning_rate": 1.8497756178460552e-05, + "loss": 0.4841, + "step": 18497 + }, + { + "epoch": 3.0195910371005263, + "grad_norm": 1.3117237091064453, + "learning_rate": 1.8497588894029098e-05, + "loss": 0.3807, + "step": 18498 + }, + { + "epoch": 3.019754295743031, + "grad_norm": 2.1416077613830566, + "learning_rate": 1.8497421601040574e-05, + "loss": 0.456, + "step": 18499 + }, + { + "epoch": 3.019917554385535, + "grad_norm": 2.1815788745880127, + "learning_rate": 1.8497254299495147e-05, + "loss": 0.5268, + "step": 18500 + }, + { + "epoch": 3.0200808130280397, + "grad_norm": 2.2280588150024414, + "learning_rate": 1.8497086989392986e-05, + "loss": 0.4558, + "step": 18501 + }, + { + "epoch": 3.020244071670544, + "grad_norm": 2.328702211380005, + "learning_rate": 1.8496919670734262e-05, + "loss": 0.5123, + "step": 18502 + }, + { + "epoch": 3.0204073303130485, + "grad_norm": 1.6061946153640747, + "learning_rate": 1.8496752343519142e-05, + "loss": 0.4386, + "step": 18503 + }, + { + "epoch": 3.020570588955553, + "grad_norm": 2.2084386348724365, + "learning_rate": 1.8496585007747794e-05, + "loss": 0.5412, + "step": 18504 + }, + { + "epoch": 3.0207338475980574, + "grad_norm": 1.7044726610183716, + "learning_rate": 1.849641766342039e-05, + "loss": 0.4343, + "step": 18505 + }, + { + "epoch": 3.0208971062405614, + "grad_norm": 1.9971150159835815, + "learning_rate": 1.8496250310537092e-05, + "loss": 0.4921, + "step": 18506 + }, + { + "epoch": 3.021060364883066, + "grad_norm": 1.8884111642837524, + "learning_rate": 1.8496082949098074e-05, + "loss": 0.4961, + "step": 18507 + }, + { + "epoch": 3.0212236235255703, + "grad_norm": 1.8118231296539307, + "learning_rate": 1.84959155791035e-05, + "loss": 0.3893, + "step": 18508 + }, + { + "epoch": 3.0213868821680747, + "grad_norm": 1.7713574171066284, + "learning_rate": 1.8495748200553547e-05, + "loss": 0.4341, + "step": 18509 + }, + { + "epoch": 3.021550140810579, + "grad_norm": 1.8058075904846191, + "learning_rate": 1.8495580813448374e-05, + "loss": 0.4222, + "step": 18510 + }, + { + "epoch": 3.0217133994530836, + "grad_norm": 2.315885066986084, + "learning_rate": 1.8495413417788154e-05, + "loss": 0.6678, + "step": 18511 + }, + { + "epoch": 3.021876658095588, + "grad_norm": 1.6711541414260864, + "learning_rate": 1.8495246013573057e-05, + "loss": 0.4729, + "step": 18512 + }, + { + "epoch": 3.0220399167380925, + "grad_norm": 2.0752992630004883, + "learning_rate": 1.8495078600803246e-05, + "loss": 0.5668, + "step": 18513 + }, + { + "epoch": 3.022203175380597, + "grad_norm": 1.814719796180725, + "learning_rate": 1.8494911179478894e-05, + "loss": 0.561, + "step": 18514 + }, + { + "epoch": 3.022366434023101, + "grad_norm": 1.6292648315429688, + "learning_rate": 1.849474374960017e-05, + "loss": 0.4156, + "step": 18515 + }, + { + "epoch": 3.0225296926656053, + "grad_norm": 1.9059581756591797, + "learning_rate": 1.849457631116724e-05, + "loss": 0.5285, + "step": 18516 + }, + { + "epoch": 3.0226929513081098, + "grad_norm": 1.9030908346176147, + "learning_rate": 1.8494408864180274e-05, + "loss": 0.4194, + "step": 18517 + }, + { + "epoch": 3.022856209950614, + "grad_norm": 1.7944772243499756, + "learning_rate": 1.8494241408639443e-05, + "loss": 0.4485, + "step": 18518 + }, + { + "epoch": 3.0230194685931187, + "grad_norm": 1.6627840995788574, + "learning_rate": 1.849407394454491e-05, + "loss": 0.3895, + "step": 18519 + }, + { + "epoch": 3.023182727235623, + "grad_norm": 1.8362826108932495, + "learning_rate": 1.8493906471896846e-05, + "loss": 0.4182, + "step": 18520 + }, + { + "epoch": 3.0233459858781275, + "grad_norm": 1.477038860321045, + "learning_rate": 1.8493738990695423e-05, + "loss": 0.3698, + "step": 18521 + }, + { + "epoch": 3.023509244520632, + "grad_norm": 1.9110393524169922, + "learning_rate": 1.8493571500940807e-05, + "loss": 0.4457, + "step": 18522 + }, + { + "epoch": 3.0236725031631364, + "grad_norm": 1.6459077596664429, + "learning_rate": 1.8493404002633167e-05, + "loss": 0.4214, + "step": 18523 + }, + { + "epoch": 3.0238357618056404, + "grad_norm": 1.8744080066680908, + "learning_rate": 1.849323649577267e-05, + "loss": 0.437, + "step": 18524 + }, + { + "epoch": 3.023999020448145, + "grad_norm": 1.6930999755859375, + "learning_rate": 1.8493068980359487e-05, + "loss": 0.4452, + "step": 18525 + }, + { + "epoch": 3.0241622790906493, + "grad_norm": 1.89438796043396, + "learning_rate": 1.8492901456393786e-05, + "loss": 0.4909, + "step": 18526 + }, + { + "epoch": 3.0243255377331537, + "grad_norm": 1.8247227668762207, + "learning_rate": 1.8492733923875736e-05, + "loss": 0.4437, + "step": 18527 + }, + { + "epoch": 3.024488796375658, + "grad_norm": 1.966711401939392, + "learning_rate": 1.8492566382805502e-05, + "loss": 0.4763, + "step": 18528 + }, + { + "epoch": 3.0246520550181626, + "grad_norm": 2.03691029548645, + "learning_rate": 1.849239883318326e-05, + "loss": 0.4717, + "step": 18529 + }, + { + "epoch": 3.024815313660667, + "grad_norm": 2.0042803287506104, + "learning_rate": 1.8492231275009172e-05, + "loss": 0.4968, + "step": 18530 + }, + { + "epoch": 3.0249785723031715, + "grad_norm": 1.8122791051864624, + "learning_rate": 1.849206370828341e-05, + "loss": 0.4401, + "step": 18531 + }, + { + "epoch": 3.025141830945676, + "grad_norm": 1.7246408462524414, + "learning_rate": 1.8491896133006142e-05, + "loss": 0.4282, + "step": 18532 + }, + { + "epoch": 3.02530508958818, + "grad_norm": 2.2058522701263428, + "learning_rate": 1.849172854917754e-05, + "loss": 0.4704, + "step": 18533 + }, + { + "epoch": 3.0254683482306843, + "grad_norm": 1.9551091194152832, + "learning_rate": 1.8491560956797766e-05, + "loss": 0.5531, + "step": 18534 + }, + { + "epoch": 3.0256316068731888, + "grad_norm": 1.973941445350647, + "learning_rate": 1.8491393355866993e-05, + "loss": 0.5182, + "step": 18535 + }, + { + "epoch": 3.025794865515693, + "grad_norm": 1.7628107070922852, + "learning_rate": 1.8491225746385387e-05, + "loss": 0.5576, + "step": 18536 + }, + { + "epoch": 3.0259581241581976, + "grad_norm": 1.9157634973526, + "learning_rate": 1.8491058128353123e-05, + "loss": 0.481, + "step": 18537 + }, + { + "epoch": 3.026121382800702, + "grad_norm": 1.829927682876587, + "learning_rate": 1.8490890501770363e-05, + "loss": 0.4123, + "step": 18538 + }, + { + "epoch": 3.0262846414432065, + "grad_norm": 2.167896270751953, + "learning_rate": 1.849072286663728e-05, + "loss": 0.5436, + "step": 18539 + }, + { + "epoch": 3.026447900085711, + "grad_norm": 2.085259437561035, + "learning_rate": 1.849055522295404e-05, + "loss": 0.4671, + "step": 18540 + }, + { + "epoch": 3.026611158728215, + "grad_norm": 1.7202872037887573, + "learning_rate": 1.8490387570720814e-05, + "loss": 0.4366, + "step": 18541 + }, + { + "epoch": 3.0267744173707194, + "grad_norm": 1.7242106199264526, + "learning_rate": 1.849021990993777e-05, + "loss": 0.3921, + "step": 18542 + }, + { + "epoch": 3.026937676013224, + "grad_norm": 2.1002416610717773, + "learning_rate": 1.8490052240605075e-05, + "loss": 0.53, + "step": 18543 + }, + { + "epoch": 3.0271009346557283, + "grad_norm": 1.8681710958480835, + "learning_rate": 1.8489884562722903e-05, + "loss": 0.4748, + "step": 18544 + }, + { + "epoch": 3.0272641932982327, + "grad_norm": 1.7063722610473633, + "learning_rate": 1.8489716876291417e-05, + "loss": 0.4673, + "step": 18545 + }, + { + "epoch": 3.027427451940737, + "grad_norm": 1.9917008876800537, + "learning_rate": 1.8489549181310788e-05, + "loss": 0.4549, + "step": 18546 + }, + { + "epoch": 3.0275907105832416, + "grad_norm": 2.0555872917175293, + "learning_rate": 1.8489381477781186e-05, + "loss": 0.5769, + "step": 18547 + }, + { + "epoch": 3.027753969225746, + "grad_norm": 1.47267746925354, + "learning_rate": 1.848921376570278e-05, + "loss": 0.4048, + "step": 18548 + }, + { + "epoch": 3.0279172278682505, + "grad_norm": 1.8522158861160278, + "learning_rate": 1.8489046045075737e-05, + "loss": 0.4771, + "step": 18549 + }, + { + "epoch": 3.0280804865107545, + "grad_norm": 1.7550816535949707, + "learning_rate": 1.8488878315900228e-05, + "loss": 0.4646, + "step": 18550 + }, + { + "epoch": 3.028243745153259, + "grad_norm": 1.5968536138534546, + "learning_rate": 1.8488710578176418e-05, + "loss": 0.397, + "step": 18551 + }, + { + "epoch": 3.0284070037957633, + "grad_norm": 1.7823832035064697, + "learning_rate": 1.8488542831904484e-05, + "loss": 0.4974, + "step": 18552 + }, + { + "epoch": 3.0285702624382678, + "grad_norm": 1.803792119026184, + "learning_rate": 1.8488375077084585e-05, + "loss": 0.4489, + "step": 18553 + }, + { + "epoch": 3.028733521080772, + "grad_norm": 1.6587532758712769, + "learning_rate": 1.8488207313716897e-05, + "loss": 0.4614, + "step": 18554 + }, + { + "epoch": 3.0288967797232766, + "grad_norm": 2.115346670150757, + "learning_rate": 1.8488039541801582e-05, + "loss": 0.499, + "step": 18555 + }, + { + "epoch": 3.029060038365781, + "grad_norm": 2.0273098945617676, + "learning_rate": 1.848787176133882e-05, + "loss": 0.4592, + "step": 18556 + }, + { + "epoch": 3.0292232970082855, + "grad_norm": 1.8349318504333496, + "learning_rate": 1.848770397232877e-05, + "loss": 0.3986, + "step": 18557 + }, + { + "epoch": 3.02938655565079, + "grad_norm": 1.8474103212356567, + "learning_rate": 1.8487536174771605e-05, + "loss": 0.4635, + "step": 18558 + }, + { + "epoch": 3.029549814293294, + "grad_norm": 1.6848357915878296, + "learning_rate": 1.848736836866749e-05, + "loss": 0.4248, + "step": 18559 + }, + { + "epoch": 3.0297130729357984, + "grad_norm": 1.7958800792694092, + "learning_rate": 1.8487200554016602e-05, + "loss": 0.4214, + "step": 18560 + }, + { + "epoch": 3.029876331578303, + "grad_norm": 1.713108777999878, + "learning_rate": 1.8487032730819104e-05, + "loss": 0.4445, + "step": 18561 + }, + { + "epoch": 3.0300395902208073, + "grad_norm": 1.851967692375183, + "learning_rate": 1.848686489907517e-05, + "loss": 0.4409, + "step": 18562 + }, + { + "epoch": 3.0302028488633117, + "grad_norm": 1.8192331790924072, + "learning_rate": 1.8486697058784956e-05, + "loss": 0.4356, + "step": 18563 + }, + { + "epoch": 3.030366107505816, + "grad_norm": 1.8840584754943848, + "learning_rate": 1.8486529209948645e-05, + "loss": 0.4836, + "step": 18564 + }, + { + "epoch": 3.0305293661483206, + "grad_norm": 2.20465087890625, + "learning_rate": 1.8486361352566402e-05, + "loss": 0.6, + "step": 18565 + }, + { + "epoch": 3.030692624790825, + "grad_norm": 1.774479866027832, + "learning_rate": 1.8486193486638396e-05, + "loss": 0.4839, + "step": 18566 + }, + { + "epoch": 3.0308558834333295, + "grad_norm": 2.039196729660034, + "learning_rate": 1.8486025612164796e-05, + "loss": 0.4939, + "step": 18567 + }, + { + "epoch": 3.0310191420758335, + "grad_norm": 2.077199697494507, + "learning_rate": 1.848585772914577e-05, + "loss": 0.4285, + "step": 18568 + }, + { + "epoch": 3.031182400718338, + "grad_norm": 1.970241665840149, + "learning_rate": 1.8485689837581484e-05, + "loss": 0.5272, + "step": 18569 + }, + { + "epoch": 3.0313456593608423, + "grad_norm": 2.029618263244629, + "learning_rate": 1.8485521937472114e-05, + "loss": 0.4935, + "step": 18570 + }, + { + "epoch": 3.0315089180033468, + "grad_norm": 2.002202272415161, + "learning_rate": 1.8485354028817824e-05, + "loss": 0.4968, + "step": 18571 + }, + { + "epoch": 3.031672176645851, + "grad_norm": 2.049346923828125, + "learning_rate": 1.8485186111618785e-05, + "loss": 0.4867, + "step": 18572 + }, + { + "epoch": 3.0318354352883556, + "grad_norm": 2.336742639541626, + "learning_rate": 1.848501818587517e-05, + "loss": 0.5273, + "step": 18573 + }, + { + "epoch": 3.03199869393086, + "grad_norm": 2.1030962467193604, + "learning_rate": 1.8484850251587135e-05, + "loss": 0.5398, + "step": 18574 + }, + { + "epoch": 3.0321619525733645, + "grad_norm": 1.8739272356033325, + "learning_rate": 1.8484682308754863e-05, + "loss": 0.4358, + "step": 18575 + }, + { + "epoch": 3.032325211215869, + "grad_norm": 1.888171672821045, + "learning_rate": 1.848451435737852e-05, + "loss": 0.4631, + "step": 18576 + }, + { + "epoch": 3.032488469858373, + "grad_norm": 1.7720956802368164, + "learning_rate": 1.8484346397458272e-05, + "loss": 0.4313, + "step": 18577 + }, + { + "epoch": 3.0326517285008774, + "grad_norm": 1.9825297594070435, + "learning_rate": 1.848417842899429e-05, + "loss": 0.4135, + "step": 18578 + }, + { + "epoch": 3.032814987143382, + "grad_norm": 1.896290898323059, + "learning_rate": 1.8484010451986744e-05, + "loss": 0.3977, + "step": 18579 + }, + { + "epoch": 3.0329782457858863, + "grad_norm": 2.1111438274383545, + "learning_rate": 1.8483842466435798e-05, + "loss": 0.5671, + "step": 18580 + }, + { + "epoch": 3.0331415044283907, + "grad_norm": 1.9359071254730225, + "learning_rate": 1.8483674472341627e-05, + "loss": 0.4097, + "step": 18581 + }, + { + "epoch": 3.033304763070895, + "grad_norm": 1.9141757488250732, + "learning_rate": 1.8483506469704394e-05, + "loss": 0.3859, + "step": 18582 + }, + { + "epoch": 3.0334680217133996, + "grad_norm": 2.3721466064453125, + "learning_rate": 1.8483338458524278e-05, + "loss": 0.549, + "step": 18583 + }, + { + "epoch": 3.033631280355904, + "grad_norm": 1.817600131034851, + "learning_rate": 1.8483170438801438e-05, + "loss": 0.4819, + "step": 18584 + }, + { + "epoch": 3.033794538998408, + "grad_norm": 1.8866328001022339, + "learning_rate": 1.8483002410536054e-05, + "loss": 0.4885, + "step": 18585 + }, + { + "epoch": 3.0339577976409124, + "grad_norm": 1.5779774188995361, + "learning_rate": 1.8482834373728282e-05, + "loss": 0.3586, + "step": 18586 + }, + { + "epoch": 3.034121056283417, + "grad_norm": 1.6901535987854004, + "learning_rate": 1.84826663283783e-05, + "loss": 0.4271, + "step": 18587 + }, + { + "epoch": 3.0342843149259213, + "grad_norm": 2.003873825073242, + "learning_rate": 1.8482498274486277e-05, + "loss": 0.5517, + "step": 18588 + }, + { + "epoch": 3.0344475735684258, + "grad_norm": 1.8855799436569214, + "learning_rate": 1.8482330212052377e-05, + "loss": 0.451, + "step": 18589 + }, + { + "epoch": 3.03461083221093, + "grad_norm": 1.867447853088379, + "learning_rate": 1.848216214107678e-05, + "loss": 0.4744, + "step": 18590 + }, + { + "epoch": 3.0347740908534346, + "grad_norm": 2.0480082035064697, + "learning_rate": 1.8481994061559638e-05, + "loss": 0.4901, + "step": 18591 + }, + { + "epoch": 3.034937349495939, + "grad_norm": 1.8364367485046387, + "learning_rate": 1.8481825973501138e-05, + "loss": 0.4188, + "step": 18592 + }, + { + "epoch": 3.0351006081384435, + "grad_norm": 1.8925799131393433, + "learning_rate": 1.848165787690144e-05, + "loss": 0.4801, + "step": 18593 + }, + { + "epoch": 3.0352638667809475, + "grad_norm": 2.1837453842163086, + "learning_rate": 1.8481489771760713e-05, + "loss": 0.5154, + "step": 18594 + }, + { + "epoch": 3.035427125423452, + "grad_norm": 1.9367014169692993, + "learning_rate": 1.8481321658079127e-05, + "loss": 0.4743, + "step": 18595 + }, + { + "epoch": 3.0355903840659564, + "grad_norm": 1.9907019138336182, + "learning_rate": 1.8481153535856854e-05, + "loss": 0.4742, + "step": 18596 + }, + { + "epoch": 3.035753642708461, + "grad_norm": 1.9176371097564697, + "learning_rate": 1.848098540509406e-05, + "loss": 0.4582, + "step": 18597 + }, + { + "epoch": 3.0359169013509653, + "grad_norm": 1.6209602355957031, + "learning_rate": 1.8480817265790917e-05, + "loss": 0.4286, + "step": 18598 + }, + { + "epoch": 3.0360801599934697, + "grad_norm": 2.0434186458587646, + "learning_rate": 1.8480649117947594e-05, + "loss": 0.514, + "step": 18599 + }, + { + "epoch": 3.036243418635974, + "grad_norm": 2.1719791889190674, + "learning_rate": 1.848048096156426e-05, + "loss": 0.518, + "step": 18600 + }, + { + "epoch": 3.0364066772784786, + "grad_norm": 1.784977674484253, + "learning_rate": 1.8480312796641083e-05, + "loss": 0.4151, + "step": 18601 + }, + { + "epoch": 3.036569935920983, + "grad_norm": 1.579433798789978, + "learning_rate": 1.8480144623178236e-05, + "loss": 0.4783, + "step": 18602 + }, + { + "epoch": 3.036733194563487, + "grad_norm": 1.697459101676941, + "learning_rate": 1.847997644117588e-05, + "loss": 0.4369, + "step": 18603 + }, + { + "epoch": 3.0368964532059914, + "grad_norm": 1.9512717723846436, + "learning_rate": 1.8479808250634197e-05, + "loss": 0.496, + "step": 18604 + }, + { + "epoch": 3.037059711848496, + "grad_norm": 2.0308525562286377, + "learning_rate": 1.8479640051553344e-05, + "loss": 0.6, + "step": 18605 + }, + { + "epoch": 3.0372229704910003, + "grad_norm": 1.902160406112671, + "learning_rate": 1.8479471843933497e-05, + "loss": 0.4531, + "step": 18606 + }, + { + "epoch": 3.0373862291335048, + "grad_norm": 1.8032641410827637, + "learning_rate": 1.8479303627774823e-05, + "loss": 0.4474, + "step": 18607 + }, + { + "epoch": 3.037549487776009, + "grad_norm": 1.7131620645523071, + "learning_rate": 1.8479135403077494e-05, + "loss": 0.4841, + "step": 18608 + }, + { + "epoch": 3.0377127464185136, + "grad_norm": 1.6816978454589844, + "learning_rate": 1.8478967169841677e-05, + "loss": 0.4389, + "step": 18609 + }, + { + "epoch": 3.037876005061018, + "grad_norm": 1.8240941762924194, + "learning_rate": 1.8478798928067544e-05, + "loss": 0.4891, + "step": 18610 + }, + { + "epoch": 3.0380392637035225, + "grad_norm": 2.1447415351867676, + "learning_rate": 1.8478630677755264e-05, + "loss": 0.551, + "step": 18611 + }, + { + "epoch": 3.0382025223460265, + "grad_norm": 1.8508821725845337, + "learning_rate": 1.8478462418905e-05, + "loss": 0.4671, + "step": 18612 + }, + { + "epoch": 3.038365780988531, + "grad_norm": 1.8245741128921509, + "learning_rate": 1.847829415151693e-05, + "loss": 0.4491, + "step": 18613 + }, + { + "epoch": 3.0385290396310354, + "grad_norm": 1.7404614686965942, + "learning_rate": 1.8478125875591222e-05, + "loss": 0.4607, + "step": 18614 + }, + { + "epoch": 3.03869229827354, + "grad_norm": 1.8120719194412231, + "learning_rate": 1.8477957591128038e-05, + "loss": 0.4724, + "step": 18615 + }, + { + "epoch": 3.0388555569160443, + "grad_norm": 2.084932804107666, + "learning_rate": 1.847778929812756e-05, + "loss": 0.569, + "step": 18616 + }, + { + "epoch": 3.0390188155585487, + "grad_norm": 2.260119915008545, + "learning_rate": 1.8477620996589945e-05, + "loss": 0.499, + "step": 18617 + }, + { + "epoch": 3.039182074201053, + "grad_norm": 2.0077059268951416, + "learning_rate": 1.8477452686515368e-05, + "loss": 0.4535, + "step": 18618 + }, + { + "epoch": 3.0393453328435576, + "grad_norm": 1.46359384059906, + "learning_rate": 1.8477284367904e-05, + "loss": 0.3882, + "step": 18619 + }, + { + "epoch": 3.039508591486062, + "grad_norm": 2.3076517581939697, + "learning_rate": 1.8477116040756007e-05, + "loss": 0.5746, + "step": 18620 + }, + { + "epoch": 3.039671850128566, + "grad_norm": 3.8177757263183594, + "learning_rate": 1.847694770507156e-05, + "loss": 0.5317, + "step": 18621 + }, + { + "epoch": 3.0398351087710704, + "grad_norm": 1.7906885147094727, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.425, + "step": 18622 + }, + { + "epoch": 3.039998367413575, + "grad_norm": 1.717004418373108, + "learning_rate": 1.847661100809399e-05, + "loss": 0.4638, + "step": 18623 + }, + { + "epoch": 3.0401616260560793, + "grad_norm": 1.9060102701187134, + "learning_rate": 1.84764426468012e-05, + "loss": 0.4744, + "step": 18624 + }, + { + "epoch": 3.0403248846985838, + "grad_norm": 1.95914626121521, + "learning_rate": 1.8476274276972635e-05, + "loss": 0.4997, + "step": 18625 + }, + { + "epoch": 3.040488143341088, + "grad_norm": 1.9381455183029175, + "learning_rate": 1.8476105898608466e-05, + "loss": 0.4508, + "step": 18626 + }, + { + "epoch": 3.0406514019835926, + "grad_norm": 2.1354362964630127, + "learning_rate": 1.8475937511708858e-05, + "loss": 0.4924, + "step": 18627 + }, + { + "epoch": 3.040814660626097, + "grad_norm": 1.9223411083221436, + "learning_rate": 1.8475769116273987e-05, + "loss": 0.5058, + "step": 18628 + }, + { + "epoch": 3.0409779192686015, + "grad_norm": 1.8241260051727295, + "learning_rate": 1.8475600712304015e-05, + "loss": 0.443, + "step": 18629 + }, + { + "epoch": 3.0411411779111055, + "grad_norm": 2.0476441383361816, + "learning_rate": 1.847543229979912e-05, + "loss": 0.4688, + "step": 18630 + }, + { + "epoch": 3.04130443655361, + "grad_norm": 1.948378562927246, + "learning_rate": 1.8475263878759462e-05, + "loss": 0.4971, + "step": 18631 + }, + { + "epoch": 3.0414676951961144, + "grad_norm": 1.622545599937439, + "learning_rate": 1.8475095449185216e-05, + "loss": 0.4637, + "step": 18632 + }, + { + "epoch": 3.041630953838619, + "grad_norm": 2.034364938735962, + "learning_rate": 1.8474927011076554e-05, + "loss": 0.4819, + "step": 18633 + }, + { + "epoch": 3.0417942124811232, + "grad_norm": 1.6279072761535645, + "learning_rate": 1.847475856443364e-05, + "loss": 0.4325, + "step": 18634 + }, + { + "epoch": 3.0419574711236277, + "grad_norm": 1.8424049615859985, + "learning_rate": 1.8474590109256646e-05, + "loss": 0.4753, + "step": 18635 + }, + { + "epoch": 3.042120729766132, + "grad_norm": 1.6093389987945557, + "learning_rate": 1.8474421645545745e-05, + "loss": 0.4152, + "step": 18636 + }, + { + "epoch": 3.0422839884086366, + "grad_norm": 1.8137383460998535, + "learning_rate": 1.8474253173301103e-05, + "loss": 0.5798, + "step": 18637 + }, + { + "epoch": 3.0424472470511406, + "grad_norm": 1.683749794960022, + "learning_rate": 1.847408469252289e-05, + "loss": 0.4872, + "step": 18638 + }, + { + "epoch": 3.042610505693645, + "grad_norm": 1.756422758102417, + "learning_rate": 1.8473916203211274e-05, + "loss": 0.4472, + "step": 18639 + }, + { + "epoch": 3.0427737643361494, + "grad_norm": 2.014017343521118, + "learning_rate": 1.8473747705366427e-05, + "loss": 0.4421, + "step": 18640 + }, + { + "epoch": 3.042937022978654, + "grad_norm": 1.701934814453125, + "learning_rate": 1.8473579198988522e-05, + "loss": 0.4311, + "step": 18641 + }, + { + "epoch": 3.0431002816211583, + "grad_norm": 1.6692113876342773, + "learning_rate": 1.8473410684077722e-05, + "loss": 0.4299, + "step": 18642 + }, + { + "epoch": 3.0432635402636627, + "grad_norm": 2.0820791721343994, + "learning_rate": 1.8473242160634197e-05, + "loss": 0.5056, + "step": 18643 + }, + { + "epoch": 3.043426798906167, + "grad_norm": 1.9600498676300049, + "learning_rate": 1.8473073628658123e-05, + "loss": 0.5167, + "step": 18644 + }, + { + "epoch": 3.0435900575486716, + "grad_norm": 1.6431571245193481, + "learning_rate": 1.8472905088149663e-05, + "loss": 0.3961, + "step": 18645 + }, + { + "epoch": 3.043753316191176, + "grad_norm": 2.023959159851074, + "learning_rate": 1.8472736539108995e-05, + "loss": 0.4518, + "step": 18646 + }, + { + "epoch": 3.04391657483368, + "grad_norm": 1.7972930669784546, + "learning_rate": 1.847256798153628e-05, + "loss": 0.4175, + "step": 18647 + }, + { + "epoch": 3.0440798334761845, + "grad_norm": 2.0838162899017334, + "learning_rate": 1.8472399415431693e-05, + "loss": 0.469, + "step": 18648 + }, + { + "epoch": 3.044243092118689, + "grad_norm": 2.2357752323150635, + "learning_rate": 1.84722308407954e-05, + "loss": 0.4646, + "step": 18649 + }, + { + "epoch": 3.0444063507611934, + "grad_norm": 2.1866726875305176, + "learning_rate": 1.8472062257627573e-05, + "loss": 0.5837, + "step": 18650 + }, + { + "epoch": 3.044569609403698, + "grad_norm": 2.0656659603118896, + "learning_rate": 1.8471893665928385e-05, + "loss": 0.5301, + "step": 18651 + }, + { + "epoch": 3.0447328680462022, + "grad_norm": 1.5548664331436157, + "learning_rate": 1.8471725065698e-05, + "loss": 0.3986, + "step": 18652 + }, + { + "epoch": 3.0448961266887067, + "grad_norm": 1.7966485023498535, + "learning_rate": 1.847155645693659e-05, + "loss": 0.44, + "step": 18653 + }, + { + "epoch": 3.045059385331211, + "grad_norm": 2.2940239906311035, + "learning_rate": 1.8471387839644325e-05, + "loss": 0.5528, + "step": 18654 + }, + { + "epoch": 3.0452226439737156, + "grad_norm": 1.7163159847259521, + "learning_rate": 1.8471219213821374e-05, + "loss": 0.4789, + "step": 18655 + }, + { + "epoch": 3.0453859026162196, + "grad_norm": 1.8231674432754517, + "learning_rate": 1.8471050579467907e-05, + "loss": 0.4723, + "step": 18656 + }, + { + "epoch": 3.045549161258724, + "grad_norm": 2.1097187995910645, + "learning_rate": 1.8470881936584094e-05, + "loss": 0.5135, + "step": 18657 + }, + { + "epoch": 3.0457124199012284, + "grad_norm": 2.0402026176452637, + "learning_rate": 1.8470713285170106e-05, + "loss": 0.4732, + "step": 18658 + }, + { + "epoch": 3.045875678543733, + "grad_norm": 1.8236067295074463, + "learning_rate": 1.8470544625226114e-05, + "loss": 0.4354, + "step": 18659 + }, + { + "epoch": 3.0460389371862373, + "grad_norm": 2.174427032470703, + "learning_rate": 1.8470375956752283e-05, + "loss": 0.522, + "step": 18660 + }, + { + "epoch": 3.0462021958287417, + "grad_norm": 2.083010196685791, + "learning_rate": 1.847020727974879e-05, + "loss": 0.4755, + "step": 18661 + }, + { + "epoch": 3.046365454471246, + "grad_norm": 2.041193723678589, + "learning_rate": 1.8470038594215796e-05, + "loss": 0.6333, + "step": 18662 + }, + { + "epoch": 3.0465287131137506, + "grad_norm": 1.8220324516296387, + "learning_rate": 1.8469869900153476e-05, + "loss": 0.4567, + "step": 18663 + }, + { + "epoch": 3.046691971756255, + "grad_norm": 1.9396158456802368, + "learning_rate": 1.8469701197562e-05, + "loss": 0.4908, + "step": 18664 + }, + { + "epoch": 3.046855230398759, + "grad_norm": 2.0589895248413086, + "learning_rate": 1.8469532486441536e-05, + "loss": 0.5035, + "step": 18665 + }, + { + "epoch": 3.0470184890412635, + "grad_norm": 1.9262452125549316, + "learning_rate": 1.8469363766792258e-05, + "loss": 0.46, + "step": 18666 + }, + { + "epoch": 3.047181747683768, + "grad_norm": 1.7004395723342896, + "learning_rate": 1.846919503861433e-05, + "loss": 0.4467, + "step": 18667 + }, + { + "epoch": 3.0473450063262724, + "grad_norm": 1.8657355308532715, + "learning_rate": 1.8469026301907926e-05, + "loss": 0.4671, + "step": 18668 + }, + { + "epoch": 3.047508264968777, + "grad_norm": 1.83957839012146, + "learning_rate": 1.8468857556673215e-05, + "loss": 0.458, + "step": 18669 + }, + { + "epoch": 3.0476715236112812, + "grad_norm": 1.8485075235366821, + "learning_rate": 1.846868880291037e-05, + "loss": 0.5067, + "step": 18670 + }, + { + "epoch": 3.0478347822537857, + "grad_norm": 1.469110131263733, + "learning_rate": 1.8468520040619552e-05, + "loss": 0.3568, + "step": 18671 + }, + { + "epoch": 3.04799804089629, + "grad_norm": 1.8875027894973755, + "learning_rate": 1.8468351269800936e-05, + "loss": 0.4432, + "step": 18672 + }, + { + "epoch": 3.048161299538794, + "grad_norm": 1.9053432941436768, + "learning_rate": 1.8468182490454697e-05, + "loss": 0.4762, + "step": 18673 + }, + { + "epoch": 3.0483245581812985, + "grad_norm": 1.8398176431655884, + "learning_rate": 1.8468013702580998e-05, + "loss": 0.4967, + "step": 18674 + }, + { + "epoch": 3.048487816823803, + "grad_norm": 1.545680046081543, + "learning_rate": 1.846784490618001e-05, + "loss": 0.3862, + "step": 18675 + }, + { + "epoch": 3.0486510754663074, + "grad_norm": 1.8510953187942505, + "learning_rate": 1.8467676101251907e-05, + "loss": 0.4183, + "step": 18676 + }, + { + "epoch": 3.048814334108812, + "grad_norm": 1.663995623588562, + "learning_rate": 1.8467507287796857e-05, + "loss": 0.4587, + "step": 18677 + }, + { + "epoch": 3.0489775927513163, + "grad_norm": 1.6798077821731567, + "learning_rate": 1.8467338465815028e-05, + "loss": 0.37, + "step": 18678 + }, + { + "epoch": 3.0491408513938207, + "grad_norm": 1.8757520914077759, + "learning_rate": 1.8467169635306593e-05, + "loss": 0.4035, + "step": 18679 + }, + { + "epoch": 3.049304110036325, + "grad_norm": 2.2842392921447754, + "learning_rate": 1.8467000796271717e-05, + "loss": 0.5358, + "step": 18680 + }, + { + "epoch": 3.0494673686788296, + "grad_norm": 2.1493191719055176, + "learning_rate": 1.8466831948710578e-05, + "loss": 0.5548, + "step": 18681 + }, + { + "epoch": 3.0496306273213336, + "grad_norm": 1.4486808776855469, + "learning_rate": 1.8466663092623337e-05, + "loss": 0.3575, + "step": 18682 + }, + { + "epoch": 3.049793885963838, + "grad_norm": 1.7720052003860474, + "learning_rate": 1.8466494228010174e-05, + "loss": 0.4742, + "step": 18683 + }, + { + "epoch": 3.0499571446063425, + "grad_norm": 2.126699209213257, + "learning_rate": 1.8466325354871248e-05, + "loss": 0.4918, + "step": 18684 + }, + { + "epoch": 3.050120403248847, + "grad_norm": 2.2300221920013428, + "learning_rate": 1.8466156473206736e-05, + "loss": 0.5048, + "step": 18685 + }, + { + "epoch": 3.0502836618913514, + "grad_norm": 1.7653998136520386, + "learning_rate": 1.846598758301681e-05, + "loss": 0.4734, + "step": 18686 + }, + { + "epoch": 3.050446920533856, + "grad_norm": 1.882630705833435, + "learning_rate": 1.846581868430163e-05, + "loss": 0.4629, + "step": 18687 + }, + { + "epoch": 3.0506101791763602, + "grad_norm": 1.6190106868743896, + "learning_rate": 1.8465649777061377e-05, + "loss": 0.444, + "step": 18688 + }, + { + "epoch": 3.0507734378188647, + "grad_norm": 1.8425428867340088, + "learning_rate": 1.846548086129622e-05, + "loss": 0.4717, + "step": 18689 + }, + { + "epoch": 3.050936696461369, + "grad_norm": 1.9034438133239746, + "learning_rate": 1.8465311937006323e-05, + "loss": 0.4592, + "step": 18690 + }, + { + "epoch": 3.051099955103873, + "grad_norm": 1.9791196584701538, + "learning_rate": 1.846514300419186e-05, + "loss": 0.4961, + "step": 18691 + }, + { + "epoch": 3.0512632137463775, + "grad_norm": 2.077639579772949, + "learning_rate": 1.8464974062852998e-05, + "loss": 0.4694, + "step": 18692 + }, + { + "epoch": 3.051426472388882, + "grad_norm": 2.307971239089966, + "learning_rate": 1.846480511298991e-05, + "loss": 0.4715, + "step": 18693 + }, + { + "epoch": 3.0515897310313864, + "grad_norm": 1.7811779975891113, + "learning_rate": 1.8464636154602765e-05, + "loss": 0.4356, + "step": 18694 + }, + { + "epoch": 3.051752989673891, + "grad_norm": 1.6607669591903687, + "learning_rate": 1.8464467187691736e-05, + "loss": 0.43, + "step": 18695 + }, + { + "epoch": 3.0519162483163953, + "grad_norm": 1.7749162912368774, + "learning_rate": 1.846429821225699e-05, + "loss": 0.4621, + "step": 18696 + }, + { + "epoch": 3.0520795069588997, + "grad_norm": 1.5229171514511108, + "learning_rate": 1.8464129228298697e-05, + "loss": 0.3972, + "step": 18697 + }, + { + "epoch": 3.052242765601404, + "grad_norm": 1.8957631587982178, + "learning_rate": 1.8463960235817027e-05, + "loss": 0.5287, + "step": 18698 + }, + { + "epoch": 3.0524060242439086, + "grad_norm": 1.6193853616714478, + "learning_rate": 1.8463791234812152e-05, + "loss": 0.4142, + "step": 18699 + }, + { + "epoch": 3.0525692828864126, + "grad_norm": 1.9981498718261719, + "learning_rate": 1.8463622225284242e-05, + "loss": 0.5327, + "step": 18700 + }, + { + "epoch": 3.052732541528917, + "grad_norm": 2.1244075298309326, + "learning_rate": 1.846345320723347e-05, + "loss": 0.5118, + "step": 18701 + }, + { + "epoch": 3.0528958001714215, + "grad_norm": 2.209906578063965, + "learning_rate": 1.846328418066e-05, + "loss": 0.5901, + "step": 18702 + }, + { + "epoch": 3.053059058813926, + "grad_norm": 1.7870675325393677, + "learning_rate": 1.8463115145564005e-05, + "loss": 0.4418, + "step": 18703 + }, + { + "epoch": 3.0532223174564304, + "grad_norm": 1.9929282665252686, + "learning_rate": 1.8462946101945655e-05, + "loss": 0.5034, + "step": 18704 + }, + { + "epoch": 3.053385576098935, + "grad_norm": 1.7951833009719849, + "learning_rate": 1.846277704980512e-05, + "loss": 0.4761, + "step": 18705 + }, + { + "epoch": 3.0535488347414392, + "grad_norm": 2.0282347202301025, + "learning_rate": 1.8462607989142573e-05, + "loss": 0.5012, + "step": 18706 + }, + { + "epoch": 3.0537120933839437, + "grad_norm": 2.2750163078308105, + "learning_rate": 1.846243891995818e-05, + "loss": 0.5127, + "step": 18707 + }, + { + "epoch": 3.053875352026448, + "grad_norm": 1.7806729078292847, + "learning_rate": 1.8462269842252113e-05, + "loss": 0.4279, + "step": 18708 + }, + { + "epoch": 3.054038610668952, + "grad_norm": 1.699812412261963, + "learning_rate": 1.8462100756024543e-05, + "loss": 0.4584, + "step": 18709 + }, + { + "epoch": 3.0542018693114565, + "grad_norm": 2.0590999126434326, + "learning_rate": 1.8461931661275642e-05, + "loss": 0.5519, + "step": 18710 + }, + { + "epoch": 3.054365127953961, + "grad_norm": 1.780012845993042, + "learning_rate": 1.8461762558005576e-05, + "loss": 0.3916, + "step": 18711 + }, + { + "epoch": 3.0545283865964654, + "grad_norm": 1.5121303796768188, + "learning_rate": 1.8461593446214518e-05, + "loss": 0.3685, + "step": 18712 + }, + { + "epoch": 3.05469164523897, + "grad_norm": 1.917776107788086, + "learning_rate": 1.8461424325902636e-05, + "loss": 0.4162, + "step": 18713 + }, + { + "epoch": 3.0548549038814743, + "grad_norm": 1.8498414754867554, + "learning_rate": 1.8461255197070102e-05, + "loss": 0.4019, + "step": 18714 + }, + { + "epoch": 3.0550181625239787, + "grad_norm": 2.094975471496582, + "learning_rate": 1.846108605971709e-05, + "loss": 0.4913, + "step": 18715 + }, + { + "epoch": 3.055181421166483, + "grad_norm": 2.1996169090270996, + "learning_rate": 1.8460916913843764e-05, + "loss": 0.4946, + "step": 18716 + }, + { + "epoch": 3.0553446798089876, + "grad_norm": 2.022977113723755, + "learning_rate": 1.8460747759450296e-05, + "loss": 0.479, + "step": 18717 + }, + { + "epoch": 3.0555079384514916, + "grad_norm": 1.4077081680297852, + "learning_rate": 1.846057859653686e-05, + "loss": 0.3484, + "step": 18718 + }, + { + "epoch": 3.055671197093996, + "grad_norm": 1.941217064857483, + "learning_rate": 1.846040942510362e-05, + "loss": 0.414, + "step": 18719 + }, + { + "epoch": 3.0558344557365005, + "grad_norm": 1.6521804332733154, + "learning_rate": 1.846024024515075e-05, + "loss": 0.3995, + "step": 18720 + }, + { + "epoch": 3.055997714379005, + "grad_norm": 2.332625389099121, + "learning_rate": 1.8460071056678424e-05, + "loss": 0.5009, + "step": 18721 + }, + { + "epoch": 3.0561609730215094, + "grad_norm": 2.1248717308044434, + "learning_rate": 1.8459901859686805e-05, + "loss": 0.522, + "step": 18722 + }, + { + "epoch": 3.056324231664014, + "grad_norm": 1.986194372177124, + "learning_rate": 1.845973265417607e-05, + "loss": 0.3952, + "step": 18723 + }, + { + "epoch": 3.0564874903065182, + "grad_norm": 1.995153784751892, + "learning_rate": 1.8459563440146384e-05, + "loss": 0.4307, + "step": 18724 + }, + { + "epoch": 3.0566507489490227, + "grad_norm": 1.8827095031738281, + "learning_rate": 1.845939421759792e-05, + "loss": 0.4608, + "step": 18725 + }, + { + "epoch": 3.0568140075915267, + "grad_norm": 1.8095571994781494, + "learning_rate": 1.845922498653085e-05, + "loss": 0.4532, + "step": 18726 + }, + { + "epoch": 3.056977266234031, + "grad_norm": 1.9022883176803589, + "learning_rate": 1.8459055746945343e-05, + "loss": 0.4622, + "step": 18727 + }, + { + "epoch": 3.0571405248765355, + "grad_norm": 2.2726333141326904, + "learning_rate": 1.8458886498841567e-05, + "loss": 0.5058, + "step": 18728 + }, + { + "epoch": 3.05730378351904, + "grad_norm": 2.5704638957977295, + "learning_rate": 1.8458717242219696e-05, + "loss": 0.5837, + "step": 18729 + }, + { + "epoch": 3.0574670421615444, + "grad_norm": 2.027662754058838, + "learning_rate": 1.8458547977079903e-05, + "loss": 0.4227, + "step": 18730 + }, + { + "epoch": 3.057630300804049, + "grad_norm": 1.784295916557312, + "learning_rate": 1.845837870342235e-05, + "loss": 0.3965, + "step": 18731 + }, + { + "epoch": 3.0577935594465533, + "grad_norm": 2.308485269546509, + "learning_rate": 1.8458209421247208e-05, + "loss": 0.4805, + "step": 18732 + }, + { + "epoch": 3.0579568180890577, + "grad_norm": 1.6927231550216675, + "learning_rate": 1.8458040130554656e-05, + "loss": 0.4047, + "step": 18733 + }, + { + "epoch": 3.058120076731562, + "grad_norm": 1.718102216720581, + "learning_rate": 1.845787083134486e-05, + "loss": 0.428, + "step": 18734 + }, + { + "epoch": 3.058283335374066, + "grad_norm": 1.7090480327606201, + "learning_rate": 1.8457701523617988e-05, + "loss": 0.4096, + "step": 18735 + }, + { + "epoch": 3.0584465940165706, + "grad_norm": 1.8369098901748657, + "learning_rate": 1.8457532207374216e-05, + "loss": 0.4485, + "step": 18736 + }, + { + "epoch": 3.058609852659075, + "grad_norm": 1.6586722135543823, + "learning_rate": 1.845736288261371e-05, + "loss": 0.4041, + "step": 18737 + }, + { + "epoch": 3.0587731113015795, + "grad_norm": 2.2092535495758057, + "learning_rate": 1.845719354933664e-05, + "loss": 0.5125, + "step": 18738 + }, + { + "epoch": 3.058936369944084, + "grad_norm": 1.8877769708633423, + "learning_rate": 1.845702420754318e-05, + "loss": 0.401, + "step": 18739 + }, + { + "epoch": 3.0590996285865883, + "grad_norm": 2.047327756881714, + "learning_rate": 1.8456854857233498e-05, + "loss": 0.5154, + "step": 18740 + }, + { + "epoch": 3.059262887229093, + "grad_norm": 1.985632061958313, + "learning_rate": 1.8456685498407767e-05, + "loss": 0.4412, + "step": 18741 + }, + { + "epoch": 3.0594261458715972, + "grad_norm": 2.4681646823883057, + "learning_rate": 1.8456516131066157e-05, + "loss": 0.4895, + "step": 18742 + }, + { + "epoch": 3.0595894045141017, + "grad_norm": 2.090775728225708, + "learning_rate": 1.8456346755208834e-05, + "loss": 0.4538, + "step": 18743 + }, + { + "epoch": 3.0597526631566057, + "grad_norm": 2.1487197875976562, + "learning_rate": 1.8456177370835973e-05, + "loss": 0.5023, + "step": 18744 + }, + { + "epoch": 3.05991592179911, + "grad_norm": 1.869023084640503, + "learning_rate": 1.8456007977947744e-05, + "loss": 0.448, + "step": 18745 + }, + { + "epoch": 3.0600791804416145, + "grad_norm": 1.8030370473861694, + "learning_rate": 1.8455838576544317e-05, + "loss": 0.4313, + "step": 18746 + }, + { + "epoch": 3.060242439084119, + "grad_norm": 2.025587320327759, + "learning_rate": 1.8455669166625864e-05, + "loss": 0.4673, + "step": 18747 + }, + { + "epoch": 3.0604056977266234, + "grad_norm": 2.341188669204712, + "learning_rate": 1.8455499748192554e-05, + "loss": 0.5292, + "step": 18748 + }, + { + "epoch": 3.060568956369128, + "grad_norm": 1.8538835048675537, + "learning_rate": 1.8455330321244558e-05, + "loss": 0.4928, + "step": 18749 + }, + { + "epoch": 3.0607322150116323, + "grad_norm": 1.915796160697937, + "learning_rate": 1.8455160885782045e-05, + "loss": 0.4313, + "step": 18750 + }, + { + "epoch": 3.0608954736541367, + "grad_norm": 2.1587038040161133, + "learning_rate": 1.8454991441805186e-05, + "loss": 0.4954, + "step": 18751 + }, + { + "epoch": 3.061058732296641, + "grad_norm": 1.7202714681625366, + "learning_rate": 1.845482198931416e-05, + "loss": 0.448, + "step": 18752 + }, + { + "epoch": 3.061221990939145, + "grad_norm": 2.257978916168213, + "learning_rate": 1.8454652528309123e-05, + "loss": 0.5252, + "step": 18753 + }, + { + "epoch": 3.0613852495816496, + "grad_norm": 1.6192007064819336, + "learning_rate": 1.8454483058790254e-05, + "loss": 0.433, + "step": 18754 + }, + { + "epoch": 3.061548508224154, + "grad_norm": 2.0182294845581055, + "learning_rate": 1.8454313580757728e-05, + "loss": 0.4545, + "step": 18755 + }, + { + "epoch": 3.0617117668666585, + "grad_norm": 2.7299411296844482, + "learning_rate": 1.8454144094211704e-05, + "loss": 0.51, + "step": 18756 + }, + { + "epoch": 3.061875025509163, + "grad_norm": 2.188082218170166, + "learning_rate": 1.8453974599152366e-05, + "loss": 0.5215, + "step": 18757 + }, + { + "epoch": 3.0620382841516673, + "grad_norm": 2.0099289417266846, + "learning_rate": 1.8453805095579872e-05, + "loss": 0.5183, + "step": 18758 + }, + { + "epoch": 3.062201542794172, + "grad_norm": 1.8648685216903687, + "learning_rate": 1.8453635583494402e-05, + "loss": 0.4668, + "step": 18759 + }, + { + "epoch": 3.062364801436676, + "grad_norm": 2.0064539909362793, + "learning_rate": 1.8453466062896122e-05, + "loss": 0.4871, + "step": 18760 + }, + { + "epoch": 3.06252806007918, + "grad_norm": 1.9047431945800781, + "learning_rate": 1.8453296533785202e-05, + "loss": 0.4997, + "step": 18761 + }, + { + "epoch": 3.0626913187216847, + "grad_norm": 1.980421543121338, + "learning_rate": 1.8453126996161818e-05, + "loss": 0.5062, + "step": 18762 + }, + { + "epoch": 3.062854577364189, + "grad_norm": 1.6593960523605347, + "learning_rate": 1.8452957450026135e-05, + "loss": 0.3719, + "step": 18763 + }, + { + "epoch": 3.0630178360066935, + "grad_norm": 1.9482228755950928, + "learning_rate": 1.8452787895378327e-05, + "loss": 0.4308, + "step": 18764 + }, + { + "epoch": 3.063181094649198, + "grad_norm": 2.419581890106201, + "learning_rate": 1.8452618332218563e-05, + "loss": 0.5411, + "step": 18765 + }, + { + "epoch": 3.0633443532917024, + "grad_norm": 1.9124846458435059, + "learning_rate": 1.8452448760547015e-05, + "loss": 0.5385, + "step": 18766 + }, + { + "epoch": 3.063507611934207, + "grad_norm": 1.8167780637741089, + "learning_rate": 1.8452279180363854e-05, + "loss": 0.4426, + "step": 18767 + }, + { + "epoch": 3.0636708705767113, + "grad_norm": 1.699260950088501, + "learning_rate": 1.8452109591669248e-05, + "loss": 0.4387, + "step": 18768 + }, + { + "epoch": 3.0638341292192157, + "grad_norm": 2.080015182495117, + "learning_rate": 1.8451939994463374e-05, + "loss": 0.481, + "step": 18769 + }, + { + "epoch": 3.06399738786172, + "grad_norm": 1.9230217933654785, + "learning_rate": 1.8451770388746398e-05, + "loss": 0.4287, + "step": 18770 + }, + { + "epoch": 3.064160646504224, + "grad_norm": 1.827426791191101, + "learning_rate": 1.845160077451849e-05, + "loss": 0.465, + "step": 18771 + }, + { + "epoch": 3.0643239051467286, + "grad_norm": 1.5017127990722656, + "learning_rate": 1.845143115177982e-05, + "loss": 0.3663, + "step": 18772 + }, + { + "epoch": 3.064487163789233, + "grad_norm": 1.8771162033081055, + "learning_rate": 1.8451261520530563e-05, + "loss": 0.5101, + "step": 18773 + }, + { + "epoch": 3.0646504224317375, + "grad_norm": 1.3327879905700684, + "learning_rate": 1.8451091880770885e-05, + "loss": 0.3607, + "step": 18774 + }, + { + "epoch": 3.064813681074242, + "grad_norm": 1.596580982208252, + "learning_rate": 1.8450922232500966e-05, + "loss": 0.393, + "step": 18775 + }, + { + "epoch": 3.0649769397167463, + "grad_norm": 1.7358887195587158, + "learning_rate": 1.8450752575720967e-05, + "loss": 0.396, + "step": 18776 + }, + { + "epoch": 3.065140198359251, + "grad_norm": 2.254580020904541, + "learning_rate": 1.845058291043106e-05, + "loss": 0.4688, + "step": 18777 + }, + { + "epoch": 3.065303457001755, + "grad_norm": 1.796830654144287, + "learning_rate": 1.8450413236631425e-05, + "loss": 0.3779, + "step": 18778 + }, + { + "epoch": 3.065466715644259, + "grad_norm": 1.7433054447174072, + "learning_rate": 1.845024355432222e-05, + "loss": 0.4443, + "step": 18779 + }, + { + "epoch": 3.0656299742867636, + "grad_norm": 2.4573659896850586, + "learning_rate": 1.8450073863503622e-05, + "loss": 0.4966, + "step": 18780 + }, + { + "epoch": 3.065793232929268, + "grad_norm": 2.0889763832092285, + "learning_rate": 1.8449904164175804e-05, + "loss": 0.4811, + "step": 18781 + }, + { + "epoch": 3.0659564915717725, + "grad_norm": 2.18731951713562, + "learning_rate": 1.8449734456338936e-05, + "loss": 0.4349, + "step": 18782 + }, + { + "epoch": 3.066119750214277, + "grad_norm": 1.7849833965301514, + "learning_rate": 1.8449564739993186e-05, + "loss": 0.4856, + "step": 18783 + }, + { + "epoch": 3.0662830088567814, + "grad_norm": 2.3309128284454346, + "learning_rate": 1.8449395015138728e-05, + "loss": 0.5798, + "step": 18784 + }, + { + "epoch": 3.066446267499286, + "grad_norm": 1.9184921979904175, + "learning_rate": 1.844922528177573e-05, + "loss": 0.4527, + "step": 18785 + }, + { + "epoch": 3.0666095261417903, + "grad_norm": 1.634769082069397, + "learning_rate": 1.8449055539904363e-05, + "loss": 0.4139, + "step": 18786 + }, + { + "epoch": 3.0667727847842947, + "grad_norm": 1.9795173406600952, + "learning_rate": 1.8448885789524802e-05, + "loss": 0.4913, + "step": 18787 + }, + { + "epoch": 3.0669360434267987, + "grad_norm": 2.085861921310425, + "learning_rate": 1.844871603063721e-05, + "loss": 0.5435, + "step": 18788 + }, + { + "epoch": 3.067099302069303, + "grad_norm": 1.867795467376709, + "learning_rate": 1.844854626324177e-05, + "loss": 0.4135, + "step": 18789 + }, + { + "epoch": 3.0672625607118076, + "grad_norm": 1.920078158378601, + "learning_rate": 1.8448376487338647e-05, + "loss": 0.4524, + "step": 18790 + }, + { + "epoch": 3.067425819354312, + "grad_norm": 1.8178565502166748, + "learning_rate": 1.8448206702928005e-05, + "loss": 0.4533, + "step": 18791 + }, + { + "epoch": 3.0675890779968165, + "grad_norm": 1.8438527584075928, + "learning_rate": 1.8448036910010025e-05, + "loss": 0.4871, + "step": 18792 + }, + { + "epoch": 3.067752336639321, + "grad_norm": 1.9826704263687134, + "learning_rate": 1.844786710858487e-05, + "loss": 0.4577, + "step": 18793 + }, + { + "epoch": 3.0679155952818253, + "grad_norm": 1.754239559173584, + "learning_rate": 1.844769729865272e-05, + "loss": 0.4249, + "step": 18794 + }, + { + "epoch": 3.0680788539243298, + "grad_norm": 2.4487569332122803, + "learning_rate": 1.844752748021374e-05, + "loss": 0.6038, + "step": 18795 + }, + { + "epoch": 3.068242112566834, + "grad_norm": 1.706122875213623, + "learning_rate": 1.84473576532681e-05, + "loss": 0.4649, + "step": 18796 + }, + { + "epoch": 3.068405371209338, + "grad_norm": 2.041039228439331, + "learning_rate": 1.8447187817815972e-05, + "loss": 0.4782, + "step": 18797 + }, + { + "epoch": 3.0685686298518426, + "grad_norm": 1.55514395236969, + "learning_rate": 1.844701797385753e-05, + "loss": 0.3788, + "step": 18798 + }, + { + "epoch": 3.068731888494347, + "grad_norm": 1.7854281663894653, + "learning_rate": 1.8446848121392946e-05, + "loss": 0.4606, + "step": 18799 + }, + { + "epoch": 3.0688951471368515, + "grad_norm": 2.037917137145996, + "learning_rate": 1.8446678260422388e-05, + "loss": 0.53, + "step": 18800 + }, + { + "epoch": 3.069058405779356, + "grad_norm": 1.9133150577545166, + "learning_rate": 1.844650839094602e-05, + "loss": 0.4452, + "step": 18801 + }, + { + "epoch": 3.0692216644218604, + "grad_norm": 1.9216768741607666, + "learning_rate": 1.8446338512964028e-05, + "loss": 0.4917, + "step": 18802 + }, + { + "epoch": 3.069384923064365, + "grad_norm": 1.7884793281555176, + "learning_rate": 1.844616862647657e-05, + "loss": 0.4299, + "step": 18803 + }, + { + "epoch": 3.0695481817068693, + "grad_norm": 1.9165092706680298, + "learning_rate": 1.8445998731483827e-05, + "loss": 0.4008, + "step": 18804 + }, + { + "epoch": 3.0697114403493737, + "grad_norm": 1.9314677715301514, + "learning_rate": 1.844582882798596e-05, + "loss": 0.3942, + "step": 18805 + }, + { + "epoch": 3.0698746989918777, + "grad_norm": 2.2654433250427246, + "learning_rate": 1.844565891598315e-05, + "loss": 0.5436, + "step": 18806 + }, + { + "epoch": 3.070037957634382, + "grad_norm": 1.7380315065383911, + "learning_rate": 1.844548899547556e-05, + "loss": 0.4668, + "step": 18807 + }, + { + "epoch": 3.0702012162768866, + "grad_norm": 2.2576708793640137, + "learning_rate": 1.844531906646337e-05, + "loss": 0.5726, + "step": 18808 + }, + { + "epoch": 3.070364474919391, + "grad_norm": 1.8316447734832764, + "learning_rate": 1.8445149128946744e-05, + "loss": 0.4965, + "step": 18809 + }, + { + "epoch": 3.0705277335618955, + "grad_norm": 1.7351958751678467, + "learning_rate": 1.8444979182925855e-05, + "loss": 0.4402, + "step": 18810 + }, + { + "epoch": 3.0706909922044, + "grad_norm": 1.982672095298767, + "learning_rate": 1.8444809228400874e-05, + "loss": 0.4829, + "step": 18811 + }, + { + "epoch": 3.0708542508469043, + "grad_norm": 1.7118393182754517, + "learning_rate": 1.844463926537197e-05, + "loss": 0.445, + "step": 18812 + }, + { + "epoch": 3.0710175094894088, + "grad_norm": 1.8214925527572632, + "learning_rate": 1.844446929383932e-05, + "loss": 0.4403, + "step": 18813 + }, + { + "epoch": 3.0711807681319128, + "grad_norm": 2.1792144775390625, + "learning_rate": 1.844429931380309e-05, + "loss": 0.5553, + "step": 18814 + }, + { + "epoch": 3.071344026774417, + "grad_norm": 2.407731771469116, + "learning_rate": 1.8444129325263455e-05, + "loss": 0.51, + "step": 18815 + }, + { + "epoch": 3.0715072854169216, + "grad_norm": 2.4391098022460938, + "learning_rate": 1.844395932822058e-05, + "loss": 0.4763, + "step": 18816 + }, + { + "epoch": 3.071670544059426, + "grad_norm": 1.8661123514175415, + "learning_rate": 1.844378932267464e-05, + "loss": 0.3985, + "step": 18817 + }, + { + "epoch": 3.0718338027019305, + "grad_norm": 2.0415642261505127, + "learning_rate": 1.8443619308625812e-05, + "loss": 0.4733, + "step": 18818 + }, + { + "epoch": 3.071997061344435, + "grad_norm": 1.7580021619796753, + "learning_rate": 1.844344928607426e-05, + "loss": 0.4228, + "step": 18819 + }, + { + "epoch": 3.0721603199869394, + "grad_norm": 2.0802364349365234, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.5292, + "step": 18820 + }, + { + "epoch": 3.072323578629444, + "grad_norm": 2.2764739990234375, + "learning_rate": 1.8443109215463665e-05, + "loss": 0.5953, + "step": 18821 + }, + { + "epoch": 3.0724868372719483, + "grad_norm": 1.983020305633545, + "learning_rate": 1.8442939167404975e-05, + "loss": 0.4931, + "step": 18822 + }, + { + "epoch": 3.0726500959144523, + "grad_norm": 1.8773808479309082, + "learning_rate": 1.8442769110844243e-05, + "loss": 0.4431, + "step": 18823 + }, + { + "epoch": 3.0728133545569567, + "grad_norm": 1.8706331253051758, + "learning_rate": 1.8442599045781645e-05, + "loss": 0.4966, + "step": 18824 + }, + { + "epoch": 3.072976613199461, + "grad_norm": 1.8343526124954224, + "learning_rate": 1.844242897221735e-05, + "loss": 0.4187, + "step": 18825 + }, + { + "epoch": 3.0731398718419656, + "grad_norm": 1.703696846961975, + "learning_rate": 1.8442258890151535e-05, + "loss": 0.4495, + "step": 18826 + }, + { + "epoch": 3.07330313048447, + "grad_norm": 1.814351201057434, + "learning_rate": 1.844208879958437e-05, + "loss": 0.4412, + "step": 18827 + }, + { + "epoch": 3.0734663891269745, + "grad_norm": 1.8826587200164795, + "learning_rate": 1.844191870051602e-05, + "loss": 0.5261, + "step": 18828 + }, + { + "epoch": 3.073629647769479, + "grad_norm": 2.12418794631958, + "learning_rate": 1.844174859294666e-05, + "loss": 0.5303, + "step": 18829 + }, + { + "epoch": 3.0737929064119833, + "grad_norm": 1.4922559261322021, + "learning_rate": 1.844157847687646e-05, + "loss": 0.4084, + "step": 18830 + }, + { + "epoch": 3.0739561650544878, + "grad_norm": 1.7425146102905273, + "learning_rate": 1.8441408352305595e-05, + "loss": 0.4129, + "step": 18831 + }, + { + "epoch": 3.0741194236969918, + "grad_norm": 1.975193977355957, + "learning_rate": 1.8441238219234234e-05, + "loss": 0.5039, + "step": 18832 + }, + { + "epoch": 3.074282682339496, + "grad_norm": 2.271416425704956, + "learning_rate": 1.8441068077662545e-05, + "loss": 0.7564, + "step": 18833 + }, + { + "epoch": 3.0744459409820006, + "grad_norm": 2.3555190563201904, + "learning_rate": 1.8440897927590707e-05, + "loss": 0.599, + "step": 18834 + }, + { + "epoch": 3.074609199624505, + "grad_norm": 1.8905137777328491, + "learning_rate": 1.8440727769018883e-05, + "loss": 0.5006, + "step": 18835 + }, + { + "epoch": 3.0747724582670095, + "grad_norm": 2.0193545818328857, + "learning_rate": 1.8440557601947254e-05, + "loss": 0.5174, + "step": 18836 + }, + { + "epoch": 3.074935716909514, + "grad_norm": 1.962745189666748, + "learning_rate": 1.844038742637598e-05, + "loss": 0.4974, + "step": 18837 + }, + { + "epoch": 3.0750989755520184, + "grad_norm": 2.2239410877227783, + "learning_rate": 1.8440217242305243e-05, + "loss": 0.511, + "step": 18838 + }, + { + "epoch": 3.075262234194523, + "grad_norm": 2.051652431488037, + "learning_rate": 1.8440047049735205e-05, + "loss": 0.4376, + "step": 18839 + }, + { + "epoch": 3.0754254928370273, + "grad_norm": 1.8901581764221191, + "learning_rate": 1.8439876848666048e-05, + "loss": 0.476, + "step": 18840 + }, + { + "epoch": 3.0755887514795313, + "grad_norm": 1.6658117771148682, + "learning_rate": 1.843970663909793e-05, + "loss": 0.4183, + "step": 18841 + }, + { + "epoch": 3.0757520101220357, + "grad_norm": 2.0383148193359375, + "learning_rate": 1.8439536421031035e-05, + "loss": 0.5322, + "step": 18842 + }, + { + "epoch": 3.07591526876454, + "grad_norm": 1.9539990425109863, + "learning_rate": 1.8439366194465526e-05, + "loss": 0.4955, + "step": 18843 + }, + { + "epoch": 3.0760785274070446, + "grad_norm": 2.1011717319488525, + "learning_rate": 1.843919595940158e-05, + "loss": 0.5348, + "step": 18844 + }, + { + "epoch": 3.076241786049549, + "grad_norm": 2.296721935272217, + "learning_rate": 1.8439025715839364e-05, + "loss": 0.4709, + "step": 18845 + }, + { + "epoch": 3.0764050446920534, + "grad_norm": 2.160215139389038, + "learning_rate": 1.843885546377905e-05, + "loss": 0.4602, + "step": 18846 + }, + { + "epoch": 3.076568303334558, + "grad_norm": 1.9022434949874878, + "learning_rate": 1.843868520322081e-05, + "loss": 0.4855, + "step": 18847 + }, + { + "epoch": 3.0767315619770623, + "grad_norm": 1.6373975276947021, + "learning_rate": 1.8438514934164822e-05, + "loss": 0.4375, + "step": 18848 + }, + { + "epoch": 3.0768948206195668, + "grad_norm": 1.8800544738769531, + "learning_rate": 1.843834465661125e-05, + "loss": 0.4667, + "step": 18849 + }, + { + "epoch": 3.0770580792620708, + "grad_norm": 2.222196102142334, + "learning_rate": 1.8438174370560263e-05, + "loss": 0.5028, + "step": 18850 + }, + { + "epoch": 3.077221337904575, + "grad_norm": 1.7658442258834839, + "learning_rate": 1.843800407601204e-05, + "loss": 0.4475, + "step": 18851 + }, + { + "epoch": 3.0773845965470796, + "grad_norm": 1.8040783405303955, + "learning_rate": 1.843783377296675e-05, + "loss": 0.4788, + "step": 18852 + }, + { + "epoch": 3.077547855189584, + "grad_norm": 2.1723926067352295, + "learning_rate": 1.8437663461424563e-05, + "loss": 0.5326, + "step": 18853 + }, + { + "epoch": 3.0777111138320885, + "grad_norm": 2.088674783706665, + "learning_rate": 1.843749314138565e-05, + "loss": 0.4943, + "step": 18854 + }, + { + "epoch": 3.077874372474593, + "grad_norm": 1.7965309619903564, + "learning_rate": 1.843732281285018e-05, + "loss": 0.4555, + "step": 18855 + }, + { + "epoch": 3.0780376311170974, + "grad_norm": 2.2205846309661865, + "learning_rate": 1.8437152475818335e-05, + "loss": 0.5216, + "step": 18856 + }, + { + "epoch": 3.078200889759602, + "grad_norm": 1.6337807178497314, + "learning_rate": 1.8436982130290277e-05, + "loss": 0.4414, + "step": 18857 + }, + { + "epoch": 3.0783641484021063, + "grad_norm": 1.7559576034545898, + "learning_rate": 1.843681177626618e-05, + "loss": 0.4488, + "step": 18858 + }, + { + "epoch": 3.0785274070446103, + "grad_norm": 1.9173935651779175, + "learning_rate": 1.8436641413746214e-05, + "loss": 0.5182, + "step": 18859 + }, + { + "epoch": 3.0786906656871147, + "grad_norm": 1.780761480331421, + "learning_rate": 1.8436471042730555e-05, + "loss": 0.4129, + "step": 18860 + }, + { + "epoch": 3.078853924329619, + "grad_norm": 2.1013360023498535, + "learning_rate": 1.843630066321937e-05, + "loss": 0.4438, + "step": 18861 + }, + { + "epoch": 3.0790171829721236, + "grad_norm": 1.6075325012207031, + "learning_rate": 1.8436130275212832e-05, + "loss": 0.3591, + "step": 18862 + }, + { + "epoch": 3.079180441614628, + "grad_norm": 2.1101760864257812, + "learning_rate": 1.8435959878711114e-05, + "loss": 0.5978, + "step": 18863 + }, + { + "epoch": 3.0793437002571324, + "grad_norm": 1.859292984008789, + "learning_rate": 1.843578947371439e-05, + "loss": 0.48, + "step": 18864 + }, + { + "epoch": 3.079506958899637, + "grad_norm": 1.755092978477478, + "learning_rate": 1.8435619060222825e-05, + "loss": 0.4096, + "step": 18865 + }, + { + "epoch": 3.0796702175421413, + "grad_norm": 2.13969087600708, + "learning_rate": 1.8435448638236596e-05, + "loss": 0.5565, + "step": 18866 + }, + { + "epoch": 3.0798334761846453, + "grad_norm": 1.8613191843032837, + "learning_rate": 1.8435278207755867e-05, + "loss": 0.4395, + "step": 18867 + }, + { + "epoch": 3.0799967348271498, + "grad_norm": 1.708535075187683, + "learning_rate": 1.843510776878082e-05, + "loss": 0.4353, + "step": 18868 + }, + { + "epoch": 3.080159993469654, + "grad_norm": 2.192122220993042, + "learning_rate": 1.843493732131162e-05, + "loss": 0.5571, + "step": 18869 + }, + { + "epoch": 3.0803232521121586, + "grad_norm": 1.6817936897277832, + "learning_rate": 1.843476686534844e-05, + "loss": 0.4646, + "step": 18870 + }, + { + "epoch": 3.080486510754663, + "grad_norm": 1.857354998588562, + "learning_rate": 1.8434596400891455e-05, + "loss": 0.509, + "step": 18871 + }, + { + "epoch": 3.0806497693971675, + "grad_norm": 2.0006778240203857, + "learning_rate": 1.843442592794083e-05, + "loss": 0.4927, + "step": 18872 + }, + { + "epoch": 3.080813028039672, + "grad_norm": 2.2836649417877197, + "learning_rate": 1.8434255446496743e-05, + "loss": 0.4629, + "step": 18873 + }, + { + "epoch": 3.0809762866821764, + "grad_norm": 2.1072330474853516, + "learning_rate": 1.8434084956559362e-05, + "loss": 0.4744, + "step": 18874 + }, + { + "epoch": 3.081139545324681, + "grad_norm": 1.6394875049591064, + "learning_rate": 1.843391445812886e-05, + "loss": 0.4124, + "step": 18875 + }, + { + "epoch": 3.081302803967185, + "grad_norm": 1.7107243537902832, + "learning_rate": 1.8433743951205406e-05, + "loss": 0.4205, + "step": 18876 + }, + { + "epoch": 3.0814660626096892, + "grad_norm": 1.699691891670227, + "learning_rate": 1.8433573435789177e-05, + "loss": 0.3941, + "step": 18877 + }, + { + "epoch": 3.0816293212521937, + "grad_norm": 1.9793766736984253, + "learning_rate": 1.843340291188034e-05, + "loss": 0.4651, + "step": 18878 + }, + { + "epoch": 3.081792579894698, + "grad_norm": 2.0843029022216797, + "learning_rate": 1.843323237947907e-05, + "loss": 0.4999, + "step": 18879 + }, + { + "epoch": 3.0819558385372026, + "grad_norm": 1.8294036388397217, + "learning_rate": 1.8433061838585537e-05, + "loss": 0.5112, + "step": 18880 + }, + { + "epoch": 3.082119097179707, + "grad_norm": 1.8631725311279297, + "learning_rate": 1.843289128919991e-05, + "loss": 0.5116, + "step": 18881 + }, + { + "epoch": 3.0822823558222114, + "grad_norm": 2.056917667388916, + "learning_rate": 1.8432720731322367e-05, + "loss": 0.5024, + "step": 18882 + }, + { + "epoch": 3.082445614464716, + "grad_norm": 1.6460543870925903, + "learning_rate": 1.8432550164953077e-05, + "loss": 0.487, + "step": 18883 + }, + { + "epoch": 3.0826088731072203, + "grad_norm": 2.10288667678833, + "learning_rate": 1.843237959009221e-05, + "loss": 0.4726, + "step": 18884 + }, + { + "epoch": 3.0827721317497243, + "grad_norm": 2.0627365112304688, + "learning_rate": 1.8432209006739937e-05, + "loss": 0.5017, + "step": 18885 + }, + { + "epoch": 3.0829353903922287, + "grad_norm": 1.8946086168289185, + "learning_rate": 1.8432038414896432e-05, + "loss": 0.4988, + "step": 18886 + }, + { + "epoch": 3.083098649034733, + "grad_norm": 1.8317276239395142, + "learning_rate": 1.843186781456187e-05, + "loss": 0.4563, + "step": 18887 + }, + { + "epoch": 3.0832619076772376, + "grad_norm": 1.7498021125793457, + "learning_rate": 1.843169720573642e-05, + "loss": 0.4339, + "step": 18888 + }, + { + "epoch": 3.083425166319742, + "grad_norm": 1.6694129705429077, + "learning_rate": 1.843152658842025e-05, + "loss": 0.4455, + "step": 18889 + }, + { + "epoch": 3.0835884249622465, + "grad_norm": 1.9756728410720825, + "learning_rate": 1.8431355962613535e-05, + "loss": 0.4538, + "step": 18890 + }, + { + "epoch": 3.083751683604751, + "grad_norm": 1.7390265464782715, + "learning_rate": 1.8431185328316445e-05, + "loss": 0.4494, + "step": 18891 + }, + { + "epoch": 3.0839149422472554, + "grad_norm": 2.0222105979919434, + "learning_rate": 1.8431014685529157e-05, + "loss": 0.4632, + "step": 18892 + }, + { + "epoch": 3.08407820088976, + "grad_norm": 1.6990940570831299, + "learning_rate": 1.8430844034251837e-05, + "loss": 0.3818, + "step": 18893 + }, + { + "epoch": 3.084241459532264, + "grad_norm": 1.9534467458724976, + "learning_rate": 1.8430673374484663e-05, + "loss": 0.5114, + "step": 18894 + }, + { + "epoch": 3.0844047181747682, + "grad_norm": 1.7974034547805786, + "learning_rate": 1.84305027062278e-05, + "loss": 0.4201, + "step": 18895 + }, + { + "epoch": 3.0845679768172727, + "grad_norm": 2.0211541652679443, + "learning_rate": 1.8430332029481425e-05, + "loss": 0.4238, + "step": 18896 + }, + { + "epoch": 3.084731235459777, + "grad_norm": 2.3846843242645264, + "learning_rate": 1.8430161344245708e-05, + "loss": 0.5701, + "step": 18897 + }, + { + "epoch": 3.0848944941022816, + "grad_norm": 1.9929403066635132, + "learning_rate": 1.8429990650520816e-05, + "loss": 0.5305, + "step": 18898 + }, + { + "epoch": 3.085057752744786, + "grad_norm": 1.5572116374969482, + "learning_rate": 1.842981994830693e-05, + "loss": 0.3789, + "step": 18899 + }, + { + "epoch": 3.0852210113872904, + "grad_norm": 2.0137388706207275, + "learning_rate": 1.8429649237604215e-05, + "loss": 0.4663, + "step": 18900 + }, + { + "epoch": 3.085384270029795, + "grad_norm": 1.5921205282211304, + "learning_rate": 1.8429478518412848e-05, + "loss": 0.3886, + "step": 18901 + }, + { + "epoch": 3.085547528672299, + "grad_norm": 1.6845531463623047, + "learning_rate": 1.8429307790732997e-05, + "loss": 0.4146, + "step": 18902 + }, + { + "epoch": 3.0857107873148033, + "grad_norm": 1.9013365507125854, + "learning_rate": 1.842913705456484e-05, + "loss": 0.5406, + "step": 18903 + }, + { + "epoch": 3.0858740459573077, + "grad_norm": 1.9745876789093018, + "learning_rate": 1.8428966309908538e-05, + "loss": 0.4581, + "step": 18904 + }, + { + "epoch": 3.086037304599812, + "grad_norm": 1.698372721672058, + "learning_rate": 1.8428795556764272e-05, + "loss": 0.4509, + "step": 18905 + }, + { + "epoch": 3.0862005632423166, + "grad_norm": 1.7978824377059937, + "learning_rate": 1.8428624795132207e-05, + "loss": 0.3892, + "step": 18906 + }, + { + "epoch": 3.086363821884821, + "grad_norm": 2.1463847160339355, + "learning_rate": 1.8428454025012526e-05, + "loss": 0.5573, + "step": 18907 + }, + { + "epoch": 3.0865270805273255, + "grad_norm": 1.645027995109558, + "learning_rate": 1.842828324640539e-05, + "loss": 0.4348, + "step": 18908 + }, + { + "epoch": 3.08669033916983, + "grad_norm": 1.8199950456619263, + "learning_rate": 1.8428112459310975e-05, + "loss": 0.433, + "step": 18909 + }, + { + "epoch": 3.0868535978123344, + "grad_norm": 2.1963396072387695, + "learning_rate": 1.8427941663729453e-05, + "loss": 0.5172, + "step": 18910 + }, + { + "epoch": 3.0870168564548384, + "grad_norm": 2.0730042457580566, + "learning_rate": 1.8427770859660997e-05, + "loss": 0.4921, + "step": 18911 + }, + { + "epoch": 3.087180115097343, + "grad_norm": 1.5925207138061523, + "learning_rate": 1.8427600047105775e-05, + "loss": 0.3947, + "step": 18912 + }, + { + "epoch": 3.0873433737398472, + "grad_norm": 1.7242997884750366, + "learning_rate": 1.8427429226063968e-05, + "loss": 0.3305, + "step": 18913 + }, + { + "epoch": 3.0875066323823517, + "grad_norm": 1.914718508720398, + "learning_rate": 1.8427258396535737e-05, + "loss": 0.4455, + "step": 18914 + }, + { + "epoch": 3.087669891024856, + "grad_norm": 1.590900182723999, + "learning_rate": 1.842708755852126e-05, + "loss": 0.4086, + "step": 18915 + }, + { + "epoch": 3.0878331496673606, + "grad_norm": 1.9944499731063843, + "learning_rate": 1.842691671202071e-05, + "loss": 0.4396, + "step": 18916 + }, + { + "epoch": 3.087996408309865, + "grad_norm": 1.4331024885177612, + "learning_rate": 1.8426745857034252e-05, + "loss": 0.4256, + "step": 18917 + }, + { + "epoch": 3.0881596669523694, + "grad_norm": 1.8899720907211304, + "learning_rate": 1.8426574993562067e-05, + "loss": 0.4176, + "step": 18918 + }, + { + "epoch": 3.088322925594874, + "grad_norm": 1.6871520280838013, + "learning_rate": 1.8426404121604324e-05, + "loss": 0.3848, + "step": 18919 + }, + { + "epoch": 3.088486184237378, + "grad_norm": 2.4072718620300293, + "learning_rate": 1.8426233241161193e-05, + "loss": 0.4916, + "step": 18920 + }, + { + "epoch": 3.0886494428798823, + "grad_norm": 1.9007920026779175, + "learning_rate": 1.8426062352232846e-05, + "loss": 0.4391, + "step": 18921 + }, + { + "epoch": 3.0888127015223867, + "grad_norm": 2.083683729171753, + "learning_rate": 1.842589145481946e-05, + "loss": 0.5468, + "step": 18922 + }, + { + "epoch": 3.088975960164891, + "grad_norm": 1.8783966302871704, + "learning_rate": 1.8425720548921203e-05, + "loss": 0.4083, + "step": 18923 + }, + { + "epoch": 3.0891392188073956, + "grad_norm": 2.108386278152466, + "learning_rate": 1.8425549634538245e-05, + "loss": 0.5154, + "step": 18924 + }, + { + "epoch": 3.0893024774499, + "grad_norm": 1.8346065282821655, + "learning_rate": 1.842537871167076e-05, + "loss": 0.462, + "step": 18925 + }, + { + "epoch": 3.0894657360924045, + "grad_norm": 2.031768560409546, + "learning_rate": 1.8425207780318925e-05, + "loss": 0.4067, + "step": 18926 + }, + { + "epoch": 3.089628994734909, + "grad_norm": 1.9320361614227295, + "learning_rate": 1.8425036840482905e-05, + "loss": 0.4561, + "step": 18927 + }, + { + "epoch": 3.0897922533774134, + "grad_norm": 2.243340492248535, + "learning_rate": 1.8424865892162874e-05, + "loss": 0.5066, + "step": 18928 + }, + { + "epoch": 3.0899555120199174, + "grad_norm": 2.1496119499206543, + "learning_rate": 1.8424694935359012e-05, + "loss": 0.5382, + "step": 18929 + }, + { + "epoch": 3.090118770662422, + "grad_norm": 1.6834529638290405, + "learning_rate": 1.842452397007148e-05, + "loss": 0.4126, + "step": 18930 + }, + { + "epoch": 3.0902820293049262, + "grad_norm": 2.4878175258636475, + "learning_rate": 1.842435299630045e-05, + "loss": 0.4812, + "step": 18931 + }, + { + "epoch": 3.0904452879474307, + "grad_norm": 1.7239290475845337, + "learning_rate": 1.8424182014046103e-05, + "loss": 0.4579, + "step": 18932 + }, + { + "epoch": 3.090608546589935, + "grad_norm": 2.2292487621307373, + "learning_rate": 1.842401102330861e-05, + "loss": 0.5019, + "step": 18933 + }, + { + "epoch": 3.0907718052324396, + "grad_norm": 2.063720464706421, + "learning_rate": 1.8423840024088134e-05, + "loss": 0.5479, + "step": 18934 + }, + { + "epoch": 3.090935063874944, + "grad_norm": 1.7588616609573364, + "learning_rate": 1.8423669016384856e-05, + "loss": 0.406, + "step": 18935 + }, + { + "epoch": 3.0910983225174484, + "grad_norm": 1.8015446662902832, + "learning_rate": 1.842349800019895e-05, + "loss": 0.5022, + "step": 18936 + }, + { + "epoch": 3.091261581159953, + "grad_norm": 1.7532639503479004, + "learning_rate": 1.8423326975530578e-05, + "loss": 0.4642, + "step": 18937 + }, + { + "epoch": 3.091424839802457, + "grad_norm": 1.9718114137649536, + "learning_rate": 1.842315594237992e-05, + "loss": 0.4559, + "step": 18938 + }, + { + "epoch": 3.0915880984449613, + "grad_norm": 1.6580588817596436, + "learning_rate": 1.8422984900747148e-05, + "loss": 0.4424, + "step": 18939 + }, + { + "epoch": 3.0917513570874657, + "grad_norm": 1.4869608879089355, + "learning_rate": 1.842281385063243e-05, + "loss": 0.437, + "step": 18940 + }, + { + "epoch": 3.09191461572997, + "grad_norm": 1.817999005317688, + "learning_rate": 1.842264279203594e-05, + "loss": 0.4334, + "step": 18941 + }, + { + "epoch": 3.0920778743724746, + "grad_norm": 1.7412457466125488, + "learning_rate": 1.842247172495785e-05, + "loss": 0.4836, + "step": 18942 + }, + { + "epoch": 3.092241133014979, + "grad_norm": 2.382089138031006, + "learning_rate": 1.8422300649398336e-05, + "loss": 0.5351, + "step": 18943 + }, + { + "epoch": 3.0924043916574835, + "grad_norm": 1.657732605934143, + "learning_rate": 1.842212956535757e-05, + "loss": 0.4324, + "step": 18944 + }, + { + "epoch": 3.092567650299988, + "grad_norm": 1.6913809776306152, + "learning_rate": 1.8421958472835715e-05, + "loss": 0.4465, + "step": 18945 + }, + { + "epoch": 3.0927309089424924, + "grad_norm": 1.8423367738723755, + "learning_rate": 1.8421787371832954e-05, + "loss": 0.5009, + "step": 18946 + }, + { + "epoch": 3.0928941675849964, + "grad_norm": 2.097724676132202, + "learning_rate": 1.8421616262349452e-05, + "loss": 0.5387, + "step": 18947 + }, + { + "epoch": 3.093057426227501, + "grad_norm": 1.6578075885772705, + "learning_rate": 1.842144514438539e-05, + "loss": 0.3923, + "step": 18948 + }, + { + "epoch": 3.0932206848700052, + "grad_norm": 1.5458980798721313, + "learning_rate": 1.8421274017940932e-05, + "loss": 0.4114, + "step": 18949 + }, + { + "epoch": 3.0933839435125097, + "grad_norm": 2.228379011154175, + "learning_rate": 1.8421102883016253e-05, + "loss": 0.6016, + "step": 18950 + }, + { + "epoch": 3.093547202155014, + "grad_norm": 2.0353455543518066, + "learning_rate": 1.8420931739611527e-05, + "loss": 0.56, + "step": 18951 + }, + { + "epoch": 3.0937104607975185, + "grad_norm": 1.7086330652236938, + "learning_rate": 1.8420760587726925e-05, + "loss": 0.409, + "step": 18952 + }, + { + "epoch": 3.093873719440023, + "grad_norm": 1.891031265258789, + "learning_rate": 1.8420589427362618e-05, + "loss": 0.483, + "step": 18953 + }, + { + "epoch": 3.0940369780825274, + "grad_norm": 2.34728741645813, + "learning_rate": 1.8420418258518782e-05, + "loss": 0.5169, + "step": 18954 + }, + { + "epoch": 3.0942002367250314, + "grad_norm": 1.700730800628662, + "learning_rate": 1.8420247081195584e-05, + "loss": 0.4401, + "step": 18955 + }, + { + "epoch": 3.094363495367536, + "grad_norm": 1.9691925048828125, + "learning_rate": 1.84200758953932e-05, + "loss": 0.5734, + "step": 18956 + }, + { + "epoch": 3.0945267540100403, + "grad_norm": 1.9014405012130737, + "learning_rate": 1.8419904701111804e-05, + "loss": 0.4983, + "step": 18957 + }, + { + "epoch": 3.0946900126525447, + "grad_norm": 1.88710618019104, + "learning_rate": 1.8419733498351563e-05, + "loss": 0.4026, + "step": 18958 + }, + { + "epoch": 3.094853271295049, + "grad_norm": 2.0525238513946533, + "learning_rate": 1.8419562287112658e-05, + "loss": 0.4753, + "step": 18959 + }, + { + "epoch": 3.0950165299375536, + "grad_norm": 1.7597275972366333, + "learning_rate": 1.8419391067395248e-05, + "loss": 0.4625, + "step": 18960 + }, + { + "epoch": 3.095179788580058, + "grad_norm": 2.319255828857422, + "learning_rate": 1.841921983919952e-05, + "loss": 0.8395, + "step": 18961 + }, + { + "epoch": 3.0953430472225625, + "grad_norm": 1.6670342683792114, + "learning_rate": 1.8419048602525637e-05, + "loss": 0.3964, + "step": 18962 + }, + { + "epoch": 3.095506305865067, + "grad_norm": 1.7694436311721802, + "learning_rate": 1.8418877357373776e-05, + "loss": 0.4231, + "step": 18963 + }, + { + "epoch": 3.095669564507571, + "grad_norm": 2.0551061630249023, + "learning_rate": 1.8418706103744108e-05, + "loss": 0.5283, + "step": 18964 + }, + { + "epoch": 3.0958328231500754, + "grad_norm": 1.931770920753479, + "learning_rate": 1.8418534841636805e-05, + "loss": 0.4505, + "step": 18965 + }, + { + "epoch": 3.09599608179258, + "grad_norm": 2.1917319297790527, + "learning_rate": 1.8418363571052037e-05, + "loss": 0.5141, + "step": 18966 + }, + { + "epoch": 3.0961593404350842, + "grad_norm": 1.8727467060089111, + "learning_rate": 1.841819229198998e-05, + "loss": 0.4571, + "step": 18967 + }, + { + "epoch": 3.0963225990775887, + "grad_norm": 1.9298038482666016, + "learning_rate": 1.841802100445081e-05, + "loss": 0.4772, + "step": 18968 + }, + { + "epoch": 3.096485857720093, + "grad_norm": 1.8089710474014282, + "learning_rate": 1.8417849708434687e-05, + "loss": 0.473, + "step": 18969 + }, + { + "epoch": 3.0966491163625975, + "grad_norm": 2.198641777038574, + "learning_rate": 1.8417678403941798e-05, + "loss": 0.5159, + "step": 18970 + }, + { + "epoch": 3.096812375005102, + "grad_norm": 2.035943031311035, + "learning_rate": 1.8417507090972308e-05, + "loss": 0.5024, + "step": 18971 + }, + { + "epoch": 3.0969756336476064, + "grad_norm": 1.6451998949050903, + "learning_rate": 1.8417335769526386e-05, + "loss": 0.4529, + "step": 18972 + }, + { + "epoch": 3.0971388922901104, + "grad_norm": 1.8999505043029785, + "learning_rate": 1.8417164439604213e-05, + "loss": 0.4186, + "step": 18973 + }, + { + "epoch": 3.097302150932615, + "grad_norm": 1.8864094018936157, + "learning_rate": 1.8416993101205957e-05, + "loss": 0.4595, + "step": 18974 + }, + { + "epoch": 3.0974654095751193, + "grad_norm": 1.9778538942337036, + "learning_rate": 1.841682175433179e-05, + "loss": 0.5471, + "step": 18975 + }, + { + "epoch": 3.0976286682176237, + "grad_norm": 1.9574711322784424, + "learning_rate": 1.841665039898189e-05, + "loss": 0.5299, + "step": 18976 + }, + { + "epoch": 3.097791926860128, + "grad_norm": 1.8119025230407715, + "learning_rate": 1.8416479035156426e-05, + "loss": 0.4166, + "step": 18977 + }, + { + "epoch": 3.0979551855026326, + "grad_norm": 2.2681591510772705, + "learning_rate": 1.8416307662855564e-05, + "loss": 0.5185, + "step": 18978 + }, + { + "epoch": 3.098118444145137, + "grad_norm": 1.8263970613479614, + "learning_rate": 1.8416136282079485e-05, + "loss": 0.4327, + "step": 18979 + }, + { + "epoch": 3.0982817027876415, + "grad_norm": 1.79865300655365, + "learning_rate": 1.841596489282836e-05, + "loss": 0.4482, + "step": 18980 + }, + { + "epoch": 3.098444961430146, + "grad_norm": 1.669936180114746, + "learning_rate": 1.841579349510236e-05, + "loss": 0.4588, + "step": 18981 + }, + { + "epoch": 3.09860822007265, + "grad_norm": 1.8828682899475098, + "learning_rate": 1.8415622088901655e-05, + "loss": 0.5138, + "step": 18982 + }, + { + "epoch": 3.0987714787151543, + "grad_norm": 2.2013888359069824, + "learning_rate": 1.8415450674226422e-05, + "loss": 0.5297, + "step": 18983 + }, + { + "epoch": 3.098934737357659, + "grad_norm": 1.884238600730896, + "learning_rate": 1.8415279251076838e-05, + "loss": 0.4101, + "step": 18984 + }, + { + "epoch": 3.0990979960001632, + "grad_norm": 1.9794212579727173, + "learning_rate": 1.8415107819453065e-05, + "loss": 0.5112, + "step": 18985 + }, + { + "epoch": 3.0992612546426677, + "grad_norm": 1.663489818572998, + "learning_rate": 1.841493637935528e-05, + "loss": 0.4343, + "step": 18986 + }, + { + "epoch": 3.099424513285172, + "grad_norm": 1.7106904983520508, + "learning_rate": 1.841476493078366e-05, + "loss": 0.4139, + "step": 18987 + }, + { + "epoch": 3.0995877719276765, + "grad_norm": 1.7274110317230225, + "learning_rate": 1.841459347373837e-05, + "loss": 0.4746, + "step": 18988 + }, + { + "epoch": 3.099751030570181, + "grad_norm": 1.8329302072525024, + "learning_rate": 1.8414422008219585e-05, + "loss": 0.521, + "step": 18989 + }, + { + "epoch": 3.099914289212685, + "grad_norm": 2.0986664295196533, + "learning_rate": 1.8414250534227485e-05, + "loss": 0.5132, + "step": 18990 + }, + { + "epoch": 3.1000775478551894, + "grad_norm": 1.7751376628875732, + "learning_rate": 1.8414079051762234e-05, + "loss": 0.4676, + "step": 18991 + }, + { + "epoch": 3.100240806497694, + "grad_norm": 2.085333824157715, + "learning_rate": 1.841390756082401e-05, + "loss": 0.5709, + "step": 18992 + }, + { + "epoch": 3.1004040651401983, + "grad_norm": 1.7551745176315308, + "learning_rate": 1.841373606141298e-05, + "loss": 0.4877, + "step": 18993 + }, + { + "epoch": 3.1005673237827027, + "grad_norm": 1.6370567083358765, + "learning_rate": 1.841356455352932e-05, + "loss": 0.425, + "step": 18994 + }, + { + "epoch": 3.100730582425207, + "grad_norm": 1.8510997295379639, + "learning_rate": 1.8413393037173206e-05, + "loss": 0.4814, + "step": 18995 + }, + { + "epoch": 3.1008938410677116, + "grad_norm": 2.227151393890381, + "learning_rate": 1.8413221512344805e-05, + "loss": 0.5151, + "step": 18996 + }, + { + "epoch": 3.101057099710216, + "grad_norm": 2.0914809703826904, + "learning_rate": 1.8413049979044295e-05, + "loss": 0.6898, + "step": 18997 + }, + { + "epoch": 3.1012203583527205, + "grad_norm": 1.8980011940002441, + "learning_rate": 1.8412878437271842e-05, + "loss": 0.4302, + "step": 18998 + }, + { + "epoch": 3.101383616995225, + "grad_norm": 1.8548920154571533, + "learning_rate": 1.8412706887027624e-05, + "loss": 0.483, + "step": 18999 + }, + { + "epoch": 3.101546875637729, + "grad_norm": 1.592512607574463, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.4541, + "step": 19000 + }, + { + "epoch": 3.1017101342802333, + "grad_norm": 1.9632327556610107, + "learning_rate": 1.841236376112458e-05, + "loss": 0.4297, + "step": 19001 + }, + { + "epoch": 3.101873392922738, + "grad_norm": 1.8129053115844727, + "learning_rate": 1.84121921854661e-05, + "loss": 0.469, + "step": 19002 + }, + { + "epoch": 3.102036651565242, + "grad_norm": 1.8883506059646606, + "learning_rate": 1.8412020601336547e-05, + "loss": 0.4867, + "step": 19003 + }, + { + "epoch": 3.1021999102077467, + "grad_norm": 1.8723527193069458, + "learning_rate": 1.841184900873609e-05, + "loss": 0.4596, + "step": 19004 + }, + { + "epoch": 3.102363168850251, + "grad_norm": 1.977531909942627, + "learning_rate": 1.84116774076649e-05, + "loss": 0.4748, + "step": 19005 + }, + { + "epoch": 3.1025264274927555, + "grad_norm": 1.9879380464553833, + "learning_rate": 1.8411505798123156e-05, + "loss": 0.5143, + "step": 19006 + }, + { + "epoch": 3.10268968613526, + "grad_norm": 1.9427287578582764, + "learning_rate": 1.8411334180111027e-05, + "loss": 0.4879, + "step": 19007 + }, + { + "epoch": 3.102852944777764, + "grad_norm": 2.1207242012023926, + "learning_rate": 1.8411162553628687e-05, + "loss": 0.5459, + "step": 19008 + }, + { + "epoch": 3.1030162034202684, + "grad_norm": 1.6136043071746826, + "learning_rate": 1.8410990918676308e-05, + "loss": 0.4646, + "step": 19009 + }, + { + "epoch": 3.103179462062773, + "grad_norm": 1.7465434074401855, + "learning_rate": 1.8410819275254065e-05, + "loss": 0.4656, + "step": 19010 + }, + { + "epoch": 3.1033427207052773, + "grad_norm": 1.9319732189178467, + "learning_rate": 1.841064762336213e-05, + "loss": 0.4857, + "step": 19011 + }, + { + "epoch": 3.1035059793477817, + "grad_norm": 1.8380181789398193, + "learning_rate": 1.841047596300067e-05, + "loss": 0.4727, + "step": 19012 + }, + { + "epoch": 3.103669237990286, + "grad_norm": 2.288666248321533, + "learning_rate": 1.8410304294169867e-05, + "loss": 0.6244, + "step": 19013 + }, + { + "epoch": 3.1038324966327906, + "grad_norm": 1.9889397621154785, + "learning_rate": 1.841013261686989e-05, + "loss": 0.4609, + "step": 19014 + }, + { + "epoch": 3.103995755275295, + "grad_norm": 1.59856379032135, + "learning_rate": 1.840996093110091e-05, + "loss": 0.3995, + "step": 19015 + }, + { + "epoch": 3.1041590139177995, + "grad_norm": 2.0326247215270996, + "learning_rate": 1.8409789236863102e-05, + "loss": 0.4761, + "step": 19016 + }, + { + "epoch": 3.1043222725603035, + "grad_norm": 1.7710314989089966, + "learning_rate": 1.840961753415664e-05, + "loss": 0.4102, + "step": 19017 + }, + { + "epoch": 3.104485531202808, + "grad_norm": 1.7599267959594727, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.397, + "step": 19018 + }, + { + "epoch": 3.1046487898453123, + "grad_norm": 2.1912729740142822, + "learning_rate": 1.840927410333844e-05, + "loss": 0.5312, + "step": 19019 + }, + { + "epoch": 3.104812048487817, + "grad_norm": 2.0700430870056152, + "learning_rate": 1.8409102375227042e-05, + "loss": 0.4968, + "step": 19020 + }, + { + "epoch": 3.104975307130321, + "grad_norm": 1.51426100730896, + "learning_rate": 1.8408930638647685e-05, + "loss": 0.3942, + "step": 19021 + }, + { + "epoch": 3.1051385657728257, + "grad_norm": 2.265420436859131, + "learning_rate": 1.8408758893600543e-05, + "loss": 0.9754, + "step": 19022 + }, + { + "epoch": 3.10530182441533, + "grad_norm": 2.2239606380462646, + "learning_rate": 1.8408587140085778e-05, + "loss": 0.5372, + "step": 19023 + }, + { + "epoch": 3.1054650830578345, + "grad_norm": 1.9637805223464966, + "learning_rate": 1.8408415378103567e-05, + "loss": 0.4614, + "step": 19024 + }, + { + "epoch": 3.105628341700339, + "grad_norm": 2.1804585456848145, + "learning_rate": 1.8408243607654083e-05, + "loss": 0.5155, + "step": 19025 + }, + { + "epoch": 3.105791600342843, + "grad_norm": 1.5970202684402466, + "learning_rate": 1.84080718287375e-05, + "loss": 0.3552, + "step": 19026 + }, + { + "epoch": 3.1059548589853474, + "grad_norm": 1.8876723051071167, + "learning_rate": 1.8407900041353995e-05, + "loss": 0.4799, + "step": 19027 + }, + { + "epoch": 3.106118117627852, + "grad_norm": 2.122821807861328, + "learning_rate": 1.8407728245503735e-05, + "loss": 0.4868, + "step": 19028 + }, + { + "epoch": 3.1062813762703563, + "grad_norm": 1.557146668434143, + "learning_rate": 1.8407556441186895e-05, + "loss": 0.4189, + "step": 19029 + }, + { + "epoch": 3.1064446349128607, + "grad_norm": 1.7716423273086548, + "learning_rate": 1.8407384628403642e-05, + "loss": 0.4312, + "step": 19030 + }, + { + "epoch": 3.106607893555365, + "grad_norm": 2.0076303482055664, + "learning_rate": 1.8407212807154163e-05, + "loss": 0.4587, + "step": 19031 + }, + { + "epoch": 3.1067711521978696, + "grad_norm": 1.3556954860687256, + "learning_rate": 1.840704097743862e-05, + "loss": 0.3861, + "step": 19032 + }, + { + "epoch": 3.106934410840374, + "grad_norm": 1.8387558460235596, + "learning_rate": 1.840686913925719e-05, + "loss": 0.4134, + "step": 19033 + }, + { + "epoch": 3.1070976694828785, + "grad_norm": 2.5243215560913086, + "learning_rate": 1.8406697292610042e-05, + "loss": 0.5212, + "step": 19034 + }, + { + "epoch": 3.1072609281253825, + "grad_norm": 1.6554654836654663, + "learning_rate": 1.8406525437497355e-05, + "loss": 0.408, + "step": 19035 + }, + { + "epoch": 3.107424186767887, + "grad_norm": 2.1327080726623535, + "learning_rate": 1.8406353573919298e-05, + "loss": 0.5581, + "step": 19036 + }, + { + "epoch": 3.1075874454103913, + "grad_norm": 2.24916672706604, + "learning_rate": 1.840618170187605e-05, + "loss": 0.5185, + "step": 19037 + }, + { + "epoch": 3.1077507040528958, + "grad_norm": 1.8622065782546997, + "learning_rate": 1.8406009821367772e-05, + "loss": 0.4455, + "step": 19038 + }, + { + "epoch": 3.1079139626954, + "grad_norm": 1.6989214420318604, + "learning_rate": 1.8405837932394644e-05, + "loss": 0.4197, + "step": 19039 + }, + { + "epoch": 3.1080772213379046, + "grad_norm": 1.9272774457931519, + "learning_rate": 1.8405666034956842e-05, + "loss": 0.5568, + "step": 19040 + }, + { + "epoch": 3.108240479980409, + "grad_norm": 2.157724380493164, + "learning_rate": 1.840549412905454e-05, + "loss": 0.5357, + "step": 19041 + }, + { + "epoch": 3.1084037386229135, + "grad_norm": 1.891399621963501, + "learning_rate": 1.8405322214687905e-05, + "loss": 0.4769, + "step": 19042 + }, + { + "epoch": 3.1085669972654175, + "grad_norm": 2.1864571571350098, + "learning_rate": 1.8405150291857112e-05, + "loss": 0.5439, + "step": 19043 + }, + { + "epoch": 3.108730255907922, + "grad_norm": 2.113689661026001, + "learning_rate": 1.8404978360562333e-05, + "loss": 0.5032, + "step": 19044 + }, + { + "epoch": 3.1088935145504264, + "grad_norm": 1.9628092050552368, + "learning_rate": 1.8404806420803746e-05, + "loss": 0.4954, + "step": 19045 + }, + { + "epoch": 3.109056773192931, + "grad_norm": 2.2464334964752197, + "learning_rate": 1.840463447258152e-05, + "loss": 0.4893, + "step": 19046 + }, + { + "epoch": 3.1092200318354353, + "grad_norm": 2.155205726623535, + "learning_rate": 1.840446251589583e-05, + "loss": 0.4975, + "step": 19047 + }, + { + "epoch": 3.1093832904779397, + "grad_norm": 1.393471598625183, + "learning_rate": 1.840429055074685e-05, + "loss": 0.3534, + "step": 19048 + }, + { + "epoch": 3.109546549120444, + "grad_norm": 1.7957748174667358, + "learning_rate": 1.840411857713475e-05, + "loss": 0.384, + "step": 19049 + }, + { + "epoch": 3.1097098077629486, + "grad_norm": 1.7572323083877563, + "learning_rate": 1.8403946595059705e-05, + "loss": 0.4328, + "step": 19050 + }, + { + "epoch": 3.109873066405453, + "grad_norm": 1.9426058530807495, + "learning_rate": 1.8403774604521885e-05, + "loss": 0.4489, + "step": 19051 + }, + { + "epoch": 3.110036325047957, + "grad_norm": 2.6337037086486816, + "learning_rate": 1.8403602605521472e-05, + "loss": 0.5485, + "step": 19052 + }, + { + "epoch": 3.1101995836904615, + "grad_norm": 1.7630445957183838, + "learning_rate": 1.840343059805863e-05, + "loss": 0.3778, + "step": 19053 + }, + { + "epoch": 3.110362842332966, + "grad_norm": 2.7102644443511963, + "learning_rate": 1.8403258582133533e-05, + "loss": 0.5022, + "step": 19054 + }, + { + "epoch": 3.1105261009754703, + "grad_norm": 1.8413305282592773, + "learning_rate": 1.8403086557746363e-05, + "loss": 0.4705, + "step": 19055 + }, + { + "epoch": 3.1106893596179748, + "grad_norm": 1.684760570526123, + "learning_rate": 1.8402914524897283e-05, + "loss": 0.4031, + "step": 19056 + }, + { + "epoch": 3.110852618260479, + "grad_norm": 2.183561325073242, + "learning_rate": 1.8402742483586472e-05, + "loss": 0.5003, + "step": 19057 + }, + { + "epoch": 3.1110158769029836, + "grad_norm": 2.0258543491363525, + "learning_rate": 1.84025704338141e-05, + "loss": 0.4796, + "step": 19058 + }, + { + "epoch": 3.111179135545488, + "grad_norm": 2.1878154277801514, + "learning_rate": 1.840239837558034e-05, + "loss": 0.6106, + "step": 19059 + }, + { + "epoch": 3.1113423941879925, + "grad_norm": 2.485060691833496, + "learning_rate": 1.840222630888537e-05, + "loss": 0.5558, + "step": 19060 + }, + { + "epoch": 3.1115056528304965, + "grad_norm": 1.5685791969299316, + "learning_rate": 1.8402054233729362e-05, + "loss": 0.4098, + "step": 19061 + }, + { + "epoch": 3.111668911473001, + "grad_norm": 1.485779881477356, + "learning_rate": 1.8401882150112485e-05, + "loss": 0.4012, + "step": 19062 + }, + { + "epoch": 3.1118321701155054, + "grad_norm": 1.9207626581192017, + "learning_rate": 1.8401710058034914e-05, + "loss": 0.4681, + "step": 19063 + }, + { + "epoch": 3.11199542875801, + "grad_norm": 1.657449722290039, + "learning_rate": 1.8401537957496826e-05, + "loss": 0.4137, + "step": 19064 + }, + { + "epoch": 3.1121586874005143, + "grad_norm": 2.054091215133667, + "learning_rate": 1.8401365848498386e-05, + "loss": 0.5298, + "step": 19065 + }, + { + "epoch": 3.1123219460430187, + "grad_norm": 1.6379716396331787, + "learning_rate": 1.840119373103978e-05, + "loss": 0.3693, + "step": 19066 + }, + { + "epoch": 3.112485204685523, + "grad_norm": 1.9512654542922974, + "learning_rate": 1.840102160512117e-05, + "loss": 0.4904, + "step": 19067 + }, + { + "epoch": 3.1126484633280276, + "grad_norm": 1.6732302904129028, + "learning_rate": 1.8400849470742734e-05, + "loss": 0.4212, + "step": 19068 + }, + { + "epoch": 3.112811721970532, + "grad_norm": 1.7100154161453247, + "learning_rate": 1.8400677327904647e-05, + "loss": 0.4752, + "step": 19069 + }, + { + "epoch": 3.112974980613036, + "grad_norm": 1.778276801109314, + "learning_rate": 1.840050517660708e-05, + "loss": 0.4966, + "step": 19070 + }, + { + "epoch": 3.1131382392555405, + "grad_norm": 1.8129254579544067, + "learning_rate": 1.8400333016850204e-05, + "loss": 0.4414, + "step": 19071 + }, + { + "epoch": 3.113301497898045, + "grad_norm": 2.234964370727539, + "learning_rate": 1.8400160848634193e-05, + "loss": 0.5385, + "step": 19072 + }, + { + "epoch": 3.1134647565405493, + "grad_norm": 2.105086326599121, + "learning_rate": 1.8399988671959227e-05, + "loss": 0.5722, + "step": 19073 + }, + { + "epoch": 3.1136280151830538, + "grad_norm": 2.034006118774414, + "learning_rate": 1.839981648682547e-05, + "loss": 0.5017, + "step": 19074 + }, + { + "epoch": 3.113791273825558, + "grad_norm": 1.848780632019043, + "learning_rate": 1.8399644293233106e-05, + "loss": 0.4776, + "step": 19075 + }, + { + "epoch": 3.1139545324680626, + "grad_norm": 1.8085854053497314, + "learning_rate": 1.83994720911823e-05, + "loss": 0.6134, + "step": 19076 + }, + { + "epoch": 3.114117791110567, + "grad_norm": 2.073728084564209, + "learning_rate": 1.8399299880673226e-05, + "loss": 0.5313, + "step": 19077 + }, + { + "epoch": 3.1142810497530715, + "grad_norm": 1.965909481048584, + "learning_rate": 1.839912766170606e-05, + "loss": 0.4327, + "step": 19078 + }, + { + "epoch": 3.1144443083955755, + "grad_norm": 2.425568103790283, + "learning_rate": 1.8398955434280976e-05, + "loss": 0.4779, + "step": 19079 + }, + { + "epoch": 3.11460756703808, + "grad_norm": 2.033043622970581, + "learning_rate": 1.8398783198398146e-05, + "loss": 0.5236, + "step": 19080 + }, + { + "epoch": 3.1147708256805844, + "grad_norm": 1.7101037502288818, + "learning_rate": 1.839861095405774e-05, + "loss": 0.437, + "step": 19081 + }, + { + "epoch": 3.114934084323089, + "grad_norm": 2.190295696258545, + "learning_rate": 1.8398438701259936e-05, + "loss": 0.4815, + "step": 19082 + }, + { + "epoch": 3.1150973429655933, + "grad_norm": 1.9844053983688354, + "learning_rate": 1.8398266440004912e-05, + "loss": 0.475, + "step": 19083 + }, + { + "epoch": 3.1152606016080977, + "grad_norm": 1.772014856338501, + "learning_rate": 1.839809417029283e-05, + "loss": 0.3723, + "step": 19084 + }, + { + "epoch": 3.115423860250602, + "grad_norm": 2.0712497234344482, + "learning_rate": 1.839792189212387e-05, + "loss": 0.4782, + "step": 19085 + }, + { + "epoch": 3.1155871188931066, + "grad_norm": 1.3864095211029053, + "learning_rate": 1.8397749605498208e-05, + "loss": 0.341, + "step": 19086 + }, + { + "epoch": 3.115750377535611, + "grad_norm": 1.8471800088882446, + "learning_rate": 1.839757731041601e-05, + "loss": 0.4745, + "step": 19087 + }, + { + "epoch": 3.115913636178115, + "grad_norm": 2.1476757526397705, + "learning_rate": 1.8397405006877462e-05, + "loss": 0.5007, + "step": 19088 + }, + { + "epoch": 3.1160768948206194, + "grad_norm": 2.1842470169067383, + "learning_rate": 1.839723269488272e-05, + "loss": 0.4499, + "step": 19089 + }, + { + "epoch": 3.116240153463124, + "grad_norm": 1.9584110975265503, + "learning_rate": 1.8397060374431972e-05, + "loss": 0.5429, + "step": 19090 + }, + { + "epoch": 3.1164034121056283, + "grad_norm": 2.1511290073394775, + "learning_rate": 1.839688804552539e-05, + "loss": 0.5695, + "step": 19091 + }, + { + "epoch": 3.1165666707481328, + "grad_norm": 1.796772837638855, + "learning_rate": 1.8396715708163136e-05, + "loss": 0.457, + "step": 19092 + }, + { + "epoch": 3.116729929390637, + "grad_norm": 1.816171646118164, + "learning_rate": 1.8396543362345394e-05, + "loss": 0.463, + "step": 19093 + }, + { + "epoch": 3.1168931880331416, + "grad_norm": 1.6841988563537598, + "learning_rate": 1.8396371008072335e-05, + "loss": 0.4251, + "step": 19094 + }, + { + "epoch": 3.117056446675646, + "grad_norm": 1.9399113655090332, + "learning_rate": 1.8396198645344133e-05, + "loss": 0.4456, + "step": 19095 + }, + { + "epoch": 3.11721970531815, + "grad_norm": 1.592839002609253, + "learning_rate": 1.8396026274160962e-05, + "loss": 0.3955, + "step": 19096 + }, + { + "epoch": 3.1173829639606545, + "grad_norm": 1.8083409070968628, + "learning_rate": 1.8395853894522994e-05, + "loss": 0.4352, + "step": 19097 + }, + { + "epoch": 3.117546222603159, + "grad_norm": 1.6731728315353394, + "learning_rate": 1.8395681506430403e-05, + "loss": 0.3894, + "step": 19098 + }, + { + "epoch": 3.1177094812456634, + "grad_norm": 1.9719464778900146, + "learning_rate": 1.8395509109883363e-05, + "loss": 0.5043, + "step": 19099 + }, + { + "epoch": 3.117872739888168, + "grad_norm": 2.319307804107666, + "learning_rate": 1.839533670488205e-05, + "loss": 0.6036, + "step": 19100 + }, + { + "epoch": 3.1180359985306723, + "grad_norm": 1.9424128532409668, + "learning_rate": 1.8395164291426633e-05, + "loss": 0.4514, + "step": 19101 + }, + { + "epoch": 3.1181992571731767, + "grad_norm": 2.043370246887207, + "learning_rate": 1.839499186951729e-05, + "loss": 0.4579, + "step": 19102 + }, + { + "epoch": 3.118362515815681, + "grad_norm": 2.0569565296173096, + "learning_rate": 1.839481943915419e-05, + "loss": 0.4847, + "step": 19103 + }, + { + "epoch": 3.1185257744581856, + "grad_norm": 2.307811737060547, + "learning_rate": 1.839464700033751e-05, + "loss": 0.4882, + "step": 19104 + }, + { + "epoch": 3.1186890331006896, + "grad_norm": 2.0470194816589355, + "learning_rate": 1.8394474553067422e-05, + "loss": 0.5043, + "step": 19105 + }, + { + "epoch": 3.118852291743194, + "grad_norm": 1.8945519924163818, + "learning_rate": 1.8394302097344103e-05, + "loss": 0.3927, + "step": 19106 + }, + { + "epoch": 3.1190155503856984, + "grad_norm": 2.1111748218536377, + "learning_rate": 1.839412963316772e-05, + "loss": 0.5081, + "step": 19107 + }, + { + "epoch": 3.119178809028203, + "grad_norm": 1.8925790786743164, + "learning_rate": 1.8393957160538452e-05, + "loss": 0.4859, + "step": 19108 + }, + { + "epoch": 3.1193420676707073, + "grad_norm": 2.5407028198242188, + "learning_rate": 1.8393784679456474e-05, + "loss": 0.6213, + "step": 19109 + }, + { + "epoch": 3.1195053263132118, + "grad_norm": 2.533562421798706, + "learning_rate": 1.8393612189921953e-05, + "loss": 0.5074, + "step": 19110 + }, + { + "epoch": 3.119668584955716, + "grad_norm": 1.7451250553131104, + "learning_rate": 1.8393439691935068e-05, + "loss": 0.4117, + "step": 19111 + }, + { + "epoch": 3.1198318435982206, + "grad_norm": 1.9081515073776245, + "learning_rate": 1.8393267185495992e-05, + "loss": 0.5058, + "step": 19112 + }, + { + "epoch": 3.119995102240725, + "grad_norm": 2.1967012882232666, + "learning_rate": 1.8393094670604897e-05, + "loss": 0.4578, + "step": 19113 + }, + { + "epoch": 3.120158360883229, + "grad_norm": 2.27057147026062, + "learning_rate": 1.839292214726196e-05, + "loss": 0.4944, + "step": 19114 + }, + { + "epoch": 3.1203216195257335, + "grad_norm": 1.8932514190673828, + "learning_rate": 1.839274961546735e-05, + "loss": 0.4902, + "step": 19115 + }, + { + "epoch": 3.120484878168238, + "grad_norm": 1.9823964834213257, + "learning_rate": 1.8392577075221247e-05, + "loss": 0.458, + "step": 19116 + }, + { + "epoch": 3.1206481368107424, + "grad_norm": 2.023143768310547, + "learning_rate": 1.8392404526523816e-05, + "loss": 0.5258, + "step": 19117 + }, + { + "epoch": 3.120811395453247, + "grad_norm": 2.0249555110931396, + "learning_rate": 1.8392231969375243e-05, + "loss": 0.4742, + "step": 19118 + }, + { + "epoch": 3.1209746540957513, + "grad_norm": 1.9541535377502441, + "learning_rate": 1.839205940377569e-05, + "loss": 0.48, + "step": 19119 + }, + { + "epoch": 3.1211379127382557, + "grad_norm": 2.3015940189361572, + "learning_rate": 1.8391886829725334e-05, + "loss": 0.4685, + "step": 19120 + }, + { + "epoch": 3.12130117138076, + "grad_norm": 2.199803113937378, + "learning_rate": 1.8391714247224354e-05, + "loss": 0.5353, + "step": 19121 + }, + { + "epoch": 3.1214644300232646, + "grad_norm": 1.776964545249939, + "learning_rate": 1.839154165627292e-05, + "loss": 0.4345, + "step": 19122 + }, + { + "epoch": 3.1216276886657686, + "grad_norm": 1.9244810342788696, + "learning_rate": 1.83913690568712e-05, + "loss": 0.493, + "step": 19123 + }, + { + "epoch": 3.121790947308273, + "grad_norm": 1.6264513731002808, + "learning_rate": 1.839119644901938e-05, + "loss": 0.4377, + "step": 19124 + }, + { + "epoch": 3.1219542059507774, + "grad_norm": 1.7649201154708862, + "learning_rate": 1.8391023832717626e-05, + "loss": 0.4717, + "step": 19125 + }, + { + "epoch": 3.122117464593282, + "grad_norm": 2.316427230834961, + "learning_rate": 1.839085120796611e-05, + "loss": 0.432, + "step": 19126 + }, + { + "epoch": 3.1222807232357863, + "grad_norm": 1.7968899011611938, + "learning_rate": 1.8390678574765014e-05, + "loss": 0.5347, + "step": 19127 + }, + { + "epoch": 3.1224439818782908, + "grad_norm": 1.9190679788589478, + "learning_rate": 1.8390505933114503e-05, + "loss": 0.4928, + "step": 19128 + }, + { + "epoch": 3.122607240520795, + "grad_norm": 2.048699378967285, + "learning_rate": 1.8390333283014757e-05, + "loss": 0.4656, + "step": 19129 + }, + { + "epoch": 3.1227704991632996, + "grad_norm": 2.496537923812866, + "learning_rate": 1.8390160624465946e-05, + "loss": 0.4842, + "step": 19130 + }, + { + "epoch": 3.1229337578058036, + "grad_norm": 2.0629682540893555, + "learning_rate": 1.8389987957468245e-05, + "loss": 0.4627, + "step": 19131 + }, + { + "epoch": 3.123097016448308, + "grad_norm": 1.4336191415786743, + "learning_rate": 1.838981528202183e-05, + "loss": 0.3719, + "step": 19132 + }, + { + "epoch": 3.1232602750908125, + "grad_norm": 2.2204713821411133, + "learning_rate": 1.8389642598126873e-05, + "loss": 0.5625, + "step": 19133 + }, + { + "epoch": 3.123423533733317, + "grad_norm": 1.7413872480392456, + "learning_rate": 1.8389469905783545e-05, + "loss": 0.3909, + "step": 19134 + }, + { + "epoch": 3.1235867923758214, + "grad_norm": 2.054579496383667, + "learning_rate": 1.8389297204992028e-05, + "loss": 0.4642, + "step": 19135 + }, + { + "epoch": 3.123750051018326, + "grad_norm": 1.9909058809280396, + "learning_rate": 1.838912449575249e-05, + "loss": 0.4943, + "step": 19136 + }, + { + "epoch": 3.1239133096608303, + "grad_norm": 1.9790704250335693, + "learning_rate": 1.8388951778065104e-05, + "loss": 0.4935, + "step": 19137 + }, + { + "epoch": 3.1240765683033347, + "grad_norm": 2.2108731269836426, + "learning_rate": 1.8388779051930043e-05, + "loss": 0.4816, + "step": 19138 + }, + { + "epoch": 3.124239826945839, + "grad_norm": 2.0340092182159424, + "learning_rate": 1.838860631734749e-05, + "loss": 0.4371, + "step": 19139 + }, + { + "epoch": 3.124403085588343, + "grad_norm": 2.063110828399658, + "learning_rate": 1.838843357431761e-05, + "loss": 0.4498, + "step": 19140 + }, + { + "epoch": 3.1245663442308476, + "grad_norm": 2.3248705863952637, + "learning_rate": 1.838826082284058e-05, + "loss": 0.5777, + "step": 19141 + }, + { + "epoch": 3.124729602873352, + "grad_norm": 2.1884920597076416, + "learning_rate": 1.8388088062916572e-05, + "loss": 0.5249, + "step": 19142 + }, + { + "epoch": 3.1248928615158564, + "grad_norm": 2.0289318561553955, + "learning_rate": 1.838791529454576e-05, + "loss": 0.461, + "step": 19143 + }, + { + "epoch": 3.125056120158361, + "grad_norm": 2.0098142623901367, + "learning_rate": 1.8387742517728325e-05, + "loss": 0.4633, + "step": 19144 + }, + { + "epoch": 3.1252193788008653, + "grad_norm": 2.4782795906066895, + "learning_rate": 1.838756973246443e-05, + "loss": 0.4679, + "step": 19145 + }, + { + "epoch": 3.1253826374433697, + "grad_norm": 1.8772648572921753, + "learning_rate": 1.838739693875426e-05, + "loss": 0.4129, + "step": 19146 + }, + { + "epoch": 3.125545896085874, + "grad_norm": 2.33862566947937, + "learning_rate": 1.838722413659798e-05, + "loss": 0.5364, + "step": 19147 + }, + { + "epoch": 3.1257091547283786, + "grad_norm": 1.920239806175232, + "learning_rate": 1.838705132599577e-05, + "loss": 0.5043, + "step": 19148 + }, + { + "epoch": 3.1258724133708826, + "grad_norm": 2.282862901687622, + "learning_rate": 1.8386878506947798e-05, + "loss": 0.5933, + "step": 19149 + }, + { + "epoch": 3.126035672013387, + "grad_norm": 1.5184367895126343, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.3733, + "step": 19150 + }, + { + "epoch": 3.1261989306558915, + "grad_norm": 1.9671231508255005, + "learning_rate": 1.8386532843515275e-05, + "loss": 0.491, + "step": 19151 + }, + { + "epoch": 3.126362189298396, + "grad_norm": 1.79429292678833, + "learning_rate": 1.8386359999131078e-05, + "loss": 0.5035, + "step": 19152 + }, + { + "epoch": 3.1265254479409004, + "grad_norm": 2.1592330932617188, + "learning_rate": 1.8386187146301812e-05, + "loss": 0.5183, + "step": 19153 + }, + { + "epoch": 3.126688706583405, + "grad_norm": 1.8004266023635864, + "learning_rate": 1.838601428502766e-05, + "loss": 0.41, + "step": 19154 + }, + { + "epoch": 3.1268519652259092, + "grad_norm": 1.7408418655395508, + "learning_rate": 1.838584141530879e-05, + "loss": 0.5027, + "step": 19155 + }, + { + "epoch": 3.1270152238684137, + "grad_norm": 1.8845396041870117, + "learning_rate": 1.8385668537145386e-05, + "loss": 0.4756, + "step": 19156 + }, + { + "epoch": 3.127178482510918, + "grad_norm": 1.8021447658538818, + "learning_rate": 1.8385495650537612e-05, + "loss": 0.3872, + "step": 19157 + }, + { + "epoch": 3.127341741153422, + "grad_norm": 1.7435846328735352, + "learning_rate": 1.838532275548565e-05, + "loss": 0.4397, + "step": 19158 + }, + { + "epoch": 3.1275049997959266, + "grad_norm": 2.212862730026245, + "learning_rate": 1.8385149851989667e-05, + "loss": 0.5581, + "step": 19159 + }, + { + "epoch": 3.127668258438431, + "grad_norm": 1.983757495880127, + "learning_rate": 1.8384976940049842e-05, + "loss": 0.4955, + "step": 19160 + }, + { + "epoch": 3.1278315170809354, + "grad_norm": 2.0575027465820312, + "learning_rate": 1.8384804019666348e-05, + "loss": 0.4849, + "step": 19161 + }, + { + "epoch": 3.12799477572344, + "grad_norm": 1.9399220943450928, + "learning_rate": 1.838463109083936e-05, + "loss": 0.4896, + "step": 19162 + }, + { + "epoch": 3.1281580343659443, + "grad_norm": 2.2822883129119873, + "learning_rate": 1.8384458153569044e-05, + "loss": 0.4826, + "step": 19163 + }, + { + "epoch": 3.1283212930084487, + "grad_norm": 2.037858247756958, + "learning_rate": 1.8384285207855585e-05, + "loss": 0.4705, + "step": 19164 + }, + { + "epoch": 3.128484551650953, + "grad_norm": 1.8360326290130615, + "learning_rate": 1.8384112253699153e-05, + "loss": 0.4354, + "step": 19165 + }, + { + "epoch": 3.128647810293457, + "grad_norm": 1.881895899772644, + "learning_rate": 1.838393929109992e-05, + "loss": 0.4822, + "step": 19166 + }, + { + "epoch": 3.1288110689359616, + "grad_norm": 1.5092086791992188, + "learning_rate": 1.838376632005807e-05, + "loss": 0.3553, + "step": 19167 + }, + { + "epoch": 3.128974327578466, + "grad_norm": 2.107337713241577, + "learning_rate": 1.8383593340573763e-05, + "loss": 0.5285, + "step": 19168 + }, + { + "epoch": 3.1291375862209705, + "grad_norm": 2.0311777591705322, + "learning_rate": 1.8383420352647177e-05, + "loss": 0.5102, + "step": 19169 + }, + { + "epoch": 3.129300844863475, + "grad_norm": 1.9234910011291504, + "learning_rate": 1.8383247356278496e-05, + "loss": 0.473, + "step": 19170 + }, + { + "epoch": 3.1294641035059794, + "grad_norm": 2.5392792224884033, + "learning_rate": 1.838307435146788e-05, + "loss": 0.955, + "step": 19171 + }, + { + "epoch": 3.129627362148484, + "grad_norm": 1.9994525909423828, + "learning_rate": 1.8382901338215515e-05, + "loss": 0.4304, + "step": 19172 + }, + { + "epoch": 3.1297906207909882, + "grad_norm": 1.7872364521026611, + "learning_rate": 1.8382728316521568e-05, + "loss": 0.4255, + "step": 19173 + }, + { + "epoch": 3.1299538794334927, + "grad_norm": 1.704208254814148, + "learning_rate": 1.838255528638622e-05, + "loss": 0.45, + "step": 19174 + }, + { + "epoch": 3.130117138075997, + "grad_norm": 1.8586641550064087, + "learning_rate": 1.838238224780964e-05, + "loss": 0.4388, + "step": 19175 + }, + { + "epoch": 3.130280396718501, + "grad_norm": 2.0445377826690674, + "learning_rate": 1.8382209200792e-05, + "loss": 0.5095, + "step": 19176 + }, + { + "epoch": 3.1304436553610056, + "grad_norm": 2.1643171310424805, + "learning_rate": 1.838203614533348e-05, + "loss": 0.545, + "step": 19177 + }, + { + "epoch": 3.13060691400351, + "grad_norm": 2.1047680377960205, + "learning_rate": 1.838186308143425e-05, + "loss": 0.4825, + "step": 19178 + }, + { + "epoch": 3.1307701726460144, + "grad_norm": 2.1559841632843018, + "learning_rate": 1.8381690009094488e-05, + "loss": 0.6017, + "step": 19179 + }, + { + "epoch": 3.130933431288519, + "grad_norm": 1.629186987876892, + "learning_rate": 1.838151692831437e-05, + "loss": 0.4135, + "step": 19180 + }, + { + "epoch": 3.1310966899310233, + "grad_norm": 2.229228973388672, + "learning_rate": 1.838134383909406e-05, + "loss": 0.5778, + "step": 19181 + }, + { + "epoch": 3.1312599485735277, + "grad_norm": 1.8668835163116455, + "learning_rate": 1.838117074143374e-05, + "loss": 0.471, + "step": 19182 + }, + { + "epoch": 3.131423207216032, + "grad_norm": 1.9742660522460938, + "learning_rate": 1.8380997635333587e-05, + "loss": 0.4884, + "step": 19183 + }, + { + "epoch": 3.131586465858536, + "grad_norm": 1.5638824701309204, + "learning_rate": 1.8380824520793767e-05, + "loss": 0.3597, + "step": 19184 + }, + { + "epoch": 3.1317497245010406, + "grad_norm": 1.7046419382095337, + "learning_rate": 1.8380651397814463e-05, + "loss": 0.4369, + "step": 19185 + }, + { + "epoch": 3.131912983143545, + "grad_norm": 2.371206760406494, + "learning_rate": 1.8380478266395844e-05, + "loss": 0.5842, + "step": 19186 + }, + { + "epoch": 3.1320762417860495, + "grad_norm": 2.091435432434082, + "learning_rate": 1.8380305126538084e-05, + "loss": 0.4862, + "step": 19187 + }, + { + "epoch": 3.132239500428554, + "grad_norm": 2.00459885597229, + "learning_rate": 1.838013197824136e-05, + "loss": 0.5227, + "step": 19188 + }, + { + "epoch": 3.1324027590710584, + "grad_norm": 2.192333936691284, + "learning_rate": 1.8379958821505846e-05, + "loss": 0.4968, + "step": 19189 + }, + { + "epoch": 3.132566017713563, + "grad_norm": 2.337123155593872, + "learning_rate": 1.8379785656331714e-05, + "loss": 0.5236, + "step": 19190 + }, + { + "epoch": 3.1327292763560672, + "grad_norm": 1.9252872467041016, + "learning_rate": 1.8379612482719142e-05, + "loss": 0.4618, + "step": 19191 + }, + { + "epoch": 3.1328925349985717, + "grad_norm": 1.7911752462387085, + "learning_rate": 1.83794393006683e-05, + "loss": 0.4206, + "step": 19192 + }, + { + "epoch": 3.1330557936410757, + "grad_norm": 2.023324966430664, + "learning_rate": 1.8379266110179368e-05, + "loss": 0.4994, + "step": 19193 + }, + { + "epoch": 3.13321905228358, + "grad_norm": 1.7995480298995972, + "learning_rate": 1.8379092911252515e-05, + "loss": 0.4535, + "step": 19194 + }, + { + "epoch": 3.1333823109260845, + "grad_norm": 1.7781600952148438, + "learning_rate": 1.837891970388792e-05, + "loss": 0.412, + "step": 19195 + }, + { + "epoch": 3.133545569568589, + "grad_norm": 2.1254894733428955, + "learning_rate": 1.837874648808575e-05, + "loss": 0.5509, + "step": 19196 + }, + { + "epoch": 3.1337088282110934, + "grad_norm": 1.8739579916000366, + "learning_rate": 1.837857326384619e-05, + "loss": 0.4552, + "step": 19197 + }, + { + "epoch": 3.133872086853598, + "grad_norm": 2.3681814670562744, + "learning_rate": 1.8378400031169406e-05, + "loss": 0.4976, + "step": 19198 + }, + { + "epoch": 3.1340353454961023, + "grad_norm": 1.6057387590408325, + "learning_rate": 1.837822679005558e-05, + "loss": 0.4698, + "step": 19199 + }, + { + "epoch": 3.1341986041386067, + "grad_norm": 1.864689826965332, + "learning_rate": 1.8378053540504874e-05, + "loss": 0.4284, + "step": 19200 + }, + { + "epoch": 3.134361862781111, + "grad_norm": 1.9578101634979248, + "learning_rate": 1.8377880282517476e-05, + "loss": 0.4088, + "step": 19201 + }, + { + "epoch": 3.134525121423615, + "grad_norm": 2.199793577194214, + "learning_rate": 1.8377707016093553e-05, + "loss": 0.4343, + "step": 19202 + }, + { + "epoch": 3.1346883800661196, + "grad_norm": 1.900309681892395, + "learning_rate": 1.8377533741233282e-05, + "loss": 0.4381, + "step": 19203 + }, + { + "epoch": 3.134851638708624, + "grad_norm": 1.8463164567947388, + "learning_rate": 1.8377360457936835e-05, + "loss": 0.4556, + "step": 19204 + }, + { + "epoch": 3.1350148973511285, + "grad_norm": 1.971508264541626, + "learning_rate": 1.837718716620439e-05, + "loss": 0.4681, + "step": 19205 + }, + { + "epoch": 3.135178155993633, + "grad_norm": 2.0972816944122314, + "learning_rate": 1.837701386603612e-05, + "loss": 0.4622, + "step": 19206 + }, + { + "epoch": 3.1353414146361374, + "grad_norm": 1.7982628345489502, + "learning_rate": 1.83768405574322e-05, + "loss": 0.4539, + "step": 19207 + }, + { + "epoch": 3.135504673278642, + "grad_norm": 1.6723878383636475, + "learning_rate": 1.83766672403928e-05, + "loss": 0.4075, + "step": 19208 + }, + { + "epoch": 3.1356679319211462, + "grad_norm": 1.6772204637527466, + "learning_rate": 1.83764939149181e-05, + "loss": 0.4266, + "step": 19209 + }, + { + "epoch": 3.1358311905636507, + "grad_norm": 1.879230260848999, + "learning_rate": 1.8376320581008274e-05, + "loss": 0.5095, + "step": 19210 + }, + { + "epoch": 3.1359944492061547, + "grad_norm": 1.8299741744995117, + "learning_rate": 1.8376147238663497e-05, + "loss": 0.4843, + "step": 19211 + }, + { + "epoch": 3.136157707848659, + "grad_norm": 1.9370840787887573, + "learning_rate": 1.8375973887883938e-05, + "loss": 0.481, + "step": 19212 + }, + { + "epoch": 3.1363209664911635, + "grad_norm": 2.246957540512085, + "learning_rate": 1.8375800528669777e-05, + "loss": 0.5173, + "step": 19213 + }, + { + "epoch": 3.136484225133668, + "grad_norm": 2.25228214263916, + "learning_rate": 1.8375627161021185e-05, + "loss": 0.6105, + "step": 19214 + }, + { + "epoch": 3.1366474837761724, + "grad_norm": 1.6443427801132202, + "learning_rate": 1.8375453784938343e-05, + "loss": 0.4404, + "step": 19215 + }, + { + "epoch": 3.136810742418677, + "grad_norm": 2.1025502681732178, + "learning_rate": 1.837528040042142e-05, + "loss": 0.4891, + "step": 19216 + }, + { + "epoch": 3.1369740010611813, + "grad_norm": 1.6649075746536255, + "learning_rate": 1.837510700747059e-05, + "loss": 0.3995, + "step": 19217 + }, + { + "epoch": 3.1371372597036857, + "grad_norm": 1.9918556213378906, + "learning_rate": 1.837493360608603e-05, + "loss": 0.4375, + "step": 19218 + }, + { + "epoch": 3.1373005183461897, + "grad_norm": 2.0413553714752197, + "learning_rate": 1.8374760196267915e-05, + "loss": 0.49, + "step": 19219 + }, + { + "epoch": 3.137463776988694, + "grad_norm": 1.8786976337432861, + "learning_rate": 1.837458677801642e-05, + "loss": 0.4489, + "step": 19220 + }, + { + "epoch": 3.1376270356311986, + "grad_norm": 3.1672182083129883, + "learning_rate": 1.8374413351331716e-05, + "loss": 0.531, + "step": 19221 + }, + { + "epoch": 3.137790294273703, + "grad_norm": 2.4466147422790527, + "learning_rate": 1.837423991621398e-05, + "loss": 0.5085, + "step": 19222 + }, + { + "epoch": 3.1379535529162075, + "grad_norm": 1.9197770357131958, + "learning_rate": 1.8374066472663386e-05, + "loss": 0.4531, + "step": 19223 + }, + { + "epoch": 3.138116811558712, + "grad_norm": 1.8191171884536743, + "learning_rate": 1.837389302068011e-05, + "loss": 0.4815, + "step": 19224 + }, + { + "epoch": 3.1382800702012164, + "grad_norm": 2.0078275203704834, + "learning_rate": 1.837371956026433e-05, + "loss": 0.5365, + "step": 19225 + }, + { + "epoch": 3.138443328843721, + "grad_norm": 1.8167836666107178, + "learning_rate": 1.8373546091416212e-05, + "loss": 0.4047, + "step": 19226 + }, + { + "epoch": 3.1386065874862252, + "grad_norm": 2.078007698059082, + "learning_rate": 1.8373372614135935e-05, + "loss": 0.605, + "step": 19227 + }, + { + "epoch": 3.1387698461287297, + "grad_norm": 2.2191758155822754, + "learning_rate": 1.8373199128423676e-05, + "loss": 0.5396, + "step": 19228 + }, + { + "epoch": 3.1389331047712337, + "grad_norm": 1.4233413934707642, + "learning_rate": 1.8373025634279606e-05, + "loss": 0.3712, + "step": 19229 + }, + { + "epoch": 3.139096363413738, + "grad_norm": 1.903640627861023, + "learning_rate": 1.8372852131703903e-05, + "loss": 0.4744, + "step": 19230 + }, + { + "epoch": 3.1392596220562425, + "grad_norm": 2.3639211654663086, + "learning_rate": 1.8372678620696742e-05, + "loss": 0.4988, + "step": 19231 + }, + { + "epoch": 3.139422880698747, + "grad_norm": 1.9745310544967651, + "learning_rate": 1.8372505101258294e-05, + "loss": 0.5383, + "step": 19232 + }, + { + "epoch": 3.1395861393412514, + "grad_norm": 1.5489318370819092, + "learning_rate": 1.8372331573388733e-05, + "loss": 0.3644, + "step": 19233 + }, + { + "epoch": 3.139749397983756, + "grad_norm": 1.902844786643982, + "learning_rate": 1.837215803708824e-05, + "loss": 0.4831, + "step": 19234 + }, + { + "epoch": 3.1399126566262603, + "grad_norm": 1.9865195751190186, + "learning_rate": 1.8371984492356985e-05, + "loss": 0.4361, + "step": 19235 + }, + { + "epoch": 3.1400759152687647, + "grad_norm": 1.9308693408966064, + "learning_rate": 1.8371810939195143e-05, + "loss": 0.4323, + "step": 19236 + }, + { + "epoch": 3.1402391739112687, + "grad_norm": 2.0678958892822266, + "learning_rate": 1.837163737760289e-05, + "loss": 0.5162, + "step": 19237 + }, + { + "epoch": 3.140402432553773, + "grad_norm": 1.6506986618041992, + "learning_rate": 1.83714638075804e-05, + "loss": 0.3708, + "step": 19238 + }, + { + "epoch": 3.1405656911962776, + "grad_norm": 2.0123302936553955, + "learning_rate": 1.8371290229127848e-05, + "loss": 0.4802, + "step": 19239 + }, + { + "epoch": 3.140728949838782, + "grad_norm": 2.1602470874786377, + "learning_rate": 1.837111664224541e-05, + "loss": 0.5507, + "step": 19240 + }, + { + "epoch": 3.1408922084812865, + "grad_norm": 1.6565812826156616, + "learning_rate": 1.8370943046933257e-05, + "loss": 0.4923, + "step": 19241 + }, + { + "epoch": 3.141055467123791, + "grad_norm": 2.244677782058716, + "learning_rate": 1.837076944319157e-05, + "loss": 0.507, + "step": 19242 + }, + { + "epoch": 3.1412187257662953, + "grad_norm": 2.213373899459839, + "learning_rate": 1.837059583102052e-05, + "loss": 0.5763, + "step": 19243 + }, + { + "epoch": 3.1413819844088, + "grad_norm": 2.346959114074707, + "learning_rate": 1.8370422210420284e-05, + "loss": 0.5708, + "step": 19244 + }, + { + "epoch": 3.1415452430513042, + "grad_norm": 2.1421985626220703, + "learning_rate": 1.8370248581391032e-05, + "loss": 0.4957, + "step": 19245 + }, + { + "epoch": 3.141708501693808, + "grad_norm": 1.9464410543441772, + "learning_rate": 1.837007494393294e-05, + "loss": 0.4066, + "step": 19246 + }, + { + "epoch": 3.1418717603363127, + "grad_norm": 1.6493085622787476, + "learning_rate": 1.836990129804619e-05, + "loss": 0.3991, + "step": 19247 + }, + { + "epoch": 3.142035018978817, + "grad_norm": 2.5067317485809326, + "learning_rate": 1.8369727643730947e-05, + "loss": 0.5849, + "step": 19248 + }, + { + "epoch": 3.1421982776213215, + "grad_norm": 1.7200936079025269, + "learning_rate": 1.8369553980987392e-05, + "loss": 0.3872, + "step": 19249 + }, + { + "epoch": 3.142361536263826, + "grad_norm": 1.9086614847183228, + "learning_rate": 1.83693803098157e-05, + "loss": 0.4374, + "step": 19250 + }, + { + "epoch": 3.1425247949063304, + "grad_norm": 2.303955078125, + "learning_rate": 1.836920663021604e-05, + "loss": 0.5056, + "step": 19251 + }, + { + "epoch": 3.142688053548835, + "grad_norm": 2.0120105743408203, + "learning_rate": 1.8369032942188595e-05, + "loss": 0.4451, + "step": 19252 + }, + { + "epoch": 3.1428513121913393, + "grad_norm": 2.1218080520629883, + "learning_rate": 1.8368859245733535e-05, + "loss": 0.573, + "step": 19253 + }, + { + "epoch": 3.1430145708338437, + "grad_norm": 2.056225299835205, + "learning_rate": 1.8368685540851037e-05, + "loss": 0.4802, + "step": 19254 + }, + { + "epoch": 3.1431778294763477, + "grad_norm": 1.8847757577896118, + "learning_rate": 1.8368511827541275e-05, + "loss": 0.5328, + "step": 19255 + }, + { + "epoch": 3.143341088118852, + "grad_norm": 1.8967527151107788, + "learning_rate": 1.836833810580442e-05, + "loss": 0.4313, + "step": 19256 + }, + { + "epoch": 3.1435043467613566, + "grad_norm": 2.2437615394592285, + "learning_rate": 1.8368164375640653e-05, + "loss": 0.5625, + "step": 19257 + }, + { + "epoch": 3.143667605403861, + "grad_norm": 2.049694776535034, + "learning_rate": 1.836799063705015e-05, + "loss": 0.4975, + "step": 19258 + }, + { + "epoch": 3.1438308640463655, + "grad_norm": 1.9041457176208496, + "learning_rate": 1.8367816890033077e-05, + "loss": 0.5247, + "step": 19259 + }, + { + "epoch": 3.14399412268887, + "grad_norm": 2.6352407932281494, + "learning_rate": 1.836764313458962e-05, + "loss": 0.5815, + "step": 19260 + }, + { + "epoch": 3.1441573813313743, + "grad_norm": 1.9593379497528076, + "learning_rate": 1.8367469370719946e-05, + "loss": 0.4785, + "step": 19261 + }, + { + "epoch": 3.144320639973879, + "grad_norm": 1.6995769739151, + "learning_rate": 1.836729559842423e-05, + "loss": 0.3965, + "step": 19262 + }, + { + "epoch": 3.144483898616383, + "grad_norm": 1.720718264579773, + "learning_rate": 1.8367121817702653e-05, + "loss": 0.4318, + "step": 19263 + }, + { + "epoch": 3.144647157258887, + "grad_norm": 2.056804656982422, + "learning_rate": 1.8366948028555388e-05, + "loss": 0.4654, + "step": 19264 + }, + { + "epoch": 3.1448104159013917, + "grad_norm": 2.1128287315368652, + "learning_rate": 1.8366774230982606e-05, + "loss": 0.5836, + "step": 19265 + }, + { + "epoch": 3.144973674543896, + "grad_norm": 2.1343350410461426, + "learning_rate": 1.836660042498448e-05, + "loss": 0.5675, + "step": 19266 + }, + { + "epoch": 3.1451369331864005, + "grad_norm": 1.5325431823730469, + "learning_rate": 1.8366426610561198e-05, + "loss": 0.3773, + "step": 19267 + }, + { + "epoch": 3.145300191828905, + "grad_norm": 1.6370446681976318, + "learning_rate": 1.8366252787712922e-05, + "loss": 0.395, + "step": 19268 + }, + { + "epoch": 3.1454634504714094, + "grad_norm": 1.7818723917007446, + "learning_rate": 1.8366078956439833e-05, + "loss": 0.4305, + "step": 19269 + }, + { + "epoch": 3.145626709113914, + "grad_norm": 1.6227312088012695, + "learning_rate": 1.8365905116742103e-05, + "loss": 0.3878, + "step": 19270 + }, + { + "epoch": 3.1457899677564183, + "grad_norm": 1.6777211427688599, + "learning_rate": 1.8365731268619912e-05, + "loss": 0.3905, + "step": 19271 + }, + { + "epoch": 3.1459532263989223, + "grad_norm": 2.104206085205078, + "learning_rate": 1.836555741207343e-05, + "loss": 0.4908, + "step": 19272 + }, + { + "epoch": 3.1461164850414267, + "grad_norm": 1.3536583185195923, + "learning_rate": 1.836538354710283e-05, + "loss": 0.3294, + "step": 19273 + }, + { + "epoch": 3.146279743683931, + "grad_norm": 1.7271136045455933, + "learning_rate": 1.8365209673708298e-05, + "loss": 0.4074, + "step": 19274 + }, + { + "epoch": 3.1464430023264356, + "grad_norm": 1.8771289587020874, + "learning_rate": 1.836503579189e-05, + "loss": 0.3902, + "step": 19275 + }, + { + "epoch": 3.14660626096894, + "grad_norm": 1.784264326095581, + "learning_rate": 1.836486190164811e-05, + "loss": 0.4054, + "step": 19276 + }, + { + "epoch": 3.1467695196114445, + "grad_norm": 2.005175828933716, + "learning_rate": 1.836468800298281e-05, + "loss": 0.448, + "step": 19277 + }, + { + "epoch": 3.146932778253949, + "grad_norm": 2.6808018684387207, + "learning_rate": 1.836451409589427e-05, + "loss": 0.443, + "step": 19278 + }, + { + "epoch": 3.1470960368964533, + "grad_norm": 2.147965431213379, + "learning_rate": 1.836434018038267e-05, + "loss": 0.5516, + "step": 19279 + }, + { + "epoch": 3.147259295538958, + "grad_norm": 2.337078094482422, + "learning_rate": 1.8364166256448173e-05, + "loss": 0.4661, + "step": 19280 + }, + { + "epoch": 3.1474225541814618, + "grad_norm": 1.8231076002120972, + "learning_rate": 1.8363992324090967e-05, + "loss": 0.427, + "step": 19281 + }, + { + "epoch": 3.147585812823966, + "grad_norm": 1.8085187673568726, + "learning_rate": 1.8363818383311226e-05, + "loss": 0.4434, + "step": 19282 + }, + { + "epoch": 3.1477490714664706, + "grad_norm": 2.2313284873962402, + "learning_rate": 1.8363644434109117e-05, + "loss": 0.489, + "step": 19283 + }, + { + "epoch": 3.147912330108975, + "grad_norm": 2.2925872802734375, + "learning_rate": 1.8363470476484824e-05, + "loss": 0.505, + "step": 19284 + }, + { + "epoch": 3.1480755887514795, + "grad_norm": 2.1349267959594727, + "learning_rate": 1.836329651043852e-05, + "loss": 0.4629, + "step": 19285 + }, + { + "epoch": 3.148238847393984, + "grad_norm": 1.7754992246627808, + "learning_rate": 1.8363122535970374e-05, + "loss": 0.4495, + "step": 19286 + }, + { + "epoch": 3.1484021060364884, + "grad_norm": 1.8219929933547974, + "learning_rate": 1.836294855308057e-05, + "loss": 0.4787, + "step": 19287 + }, + { + "epoch": 3.148565364678993, + "grad_norm": 2.0113871097564697, + "learning_rate": 1.8362774561769275e-05, + "loss": 0.4827, + "step": 19288 + }, + { + "epoch": 3.1487286233214973, + "grad_norm": 1.9047794342041016, + "learning_rate": 1.836260056203667e-05, + "loss": 0.4786, + "step": 19289 + }, + { + "epoch": 3.1488918819640013, + "grad_norm": 1.744431734085083, + "learning_rate": 1.8362426553882932e-05, + "loss": 0.3713, + "step": 19290 + }, + { + "epoch": 3.1490551406065057, + "grad_norm": 2.1946518421173096, + "learning_rate": 1.8362252537308226e-05, + "loss": 0.4497, + "step": 19291 + }, + { + "epoch": 3.14921839924901, + "grad_norm": 1.848078727722168, + "learning_rate": 1.8362078512312738e-05, + "loss": 0.387, + "step": 19292 + }, + { + "epoch": 3.1493816578915146, + "grad_norm": 1.9390836954116821, + "learning_rate": 1.836190447889664e-05, + "loss": 0.4612, + "step": 19293 + }, + { + "epoch": 3.149544916534019, + "grad_norm": 1.9046239852905273, + "learning_rate": 1.8361730437060103e-05, + "loss": 0.4893, + "step": 19294 + }, + { + "epoch": 3.1497081751765235, + "grad_norm": 2.034959316253662, + "learning_rate": 1.8361556386803308e-05, + "loss": 0.5809, + "step": 19295 + }, + { + "epoch": 3.149871433819028, + "grad_norm": 1.7888551950454712, + "learning_rate": 1.8361382328126426e-05, + "loss": 0.3904, + "step": 19296 + }, + { + "epoch": 3.1500346924615323, + "grad_norm": 2.4024617671966553, + "learning_rate": 1.8361208261029637e-05, + "loss": 0.5734, + "step": 19297 + }, + { + "epoch": 3.1501979511040368, + "grad_norm": 1.9023979902267456, + "learning_rate": 1.8361034185513113e-05, + "loss": 0.4837, + "step": 19298 + }, + { + "epoch": 3.1503612097465408, + "grad_norm": 1.9971468448638916, + "learning_rate": 1.8360860101577027e-05, + "loss": 0.4859, + "step": 19299 + }, + { + "epoch": 3.150524468389045, + "grad_norm": 2.05781626701355, + "learning_rate": 1.836068600922156e-05, + "loss": 0.4001, + "step": 19300 + }, + { + "epoch": 3.1506877270315496, + "grad_norm": 1.6749922037124634, + "learning_rate": 1.8360511908446886e-05, + "loss": 0.4357, + "step": 19301 + }, + { + "epoch": 3.150850985674054, + "grad_norm": 1.9641056060791016, + "learning_rate": 1.8360337799253173e-05, + "loss": 0.4974, + "step": 19302 + }, + { + "epoch": 3.1510142443165585, + "grad_norm": 2.427427053451538, + "learning_rate": 1.8360163681640606e-05, + "loss": 0.6142, + "step": 19303 + }, + { + "epoch": 3.151177502959063, + "grad_norm": 2.235340118408203, + "learning_rate": 1.8359989555609355e-05, + "loss": 0.5166, + "step": 19304 + }, + { + "epoch": 3.1513407616015674, + "grad_norm": 1.9851443767547607, + "learning_rate": 1.8359815421159595e-05, + "loss": 0.5307, + "step": 19305 + }, + { + "epoch": 3.151504020244072, + "grad_norm": 1.9493770599365234, + "learning_rate": 1.8359641278291508e-05, + "loss": 0.4374, + "step": 19306 + }, + { + "epoch": 3.151667278886576, + "grad_norm": 1.9398936033248901, + "learning_rate": 1.835946712700526e-05, + "loss": 0.5207, + "step": 19307 + }, + { + "epoch": 3.1518305375290803, + "grad_norm": 2.019547700881958, + "learning_rate": 1.8359292967301035e-05, + "loss": 0.4508, + "step": 19308 + }, + { + "epoch": 3.1519937961715847, + "grad_norm": 1.7939473390579224, + "learning_rate": 1.8359118799179002e-05, + "loss": 0.4152, + "step": 19309 + }, + { + "epoch": 3.152157054814089, + "grad_norm": 2.4552643299102783, + "learning_rate": 1.835894462263934e-05, + "loss": 0.5484, + "step": 19310 + }, + { + "epoch": 3.1523203134565936, + "grad_norm": 1.7289093732833862, + "learning_rate": 1.835877043768222e-05, + "loss": 0.457, + "step": 19311 + }, + { + "epoch": 3.152483572099098, + "grad_norm": 2.103909969329834, + "learning_rate": 1.835859624430782e-05, + "loss": 0.4587, + "step": 19312 + }, + { + "epoch": 3.1526468307416025, + "grad_norm": 1.7311168909072876, + "learning_rate": 1.835842204251632e-05, + "loss": 0.483, + "step": 19313 + }, + { + "epoch": 3.152810089384107, + "grad_norm": 1.9446189403533936, + "learning_rate": 1.835824783230789e-05, + "loss": 0.4557, + "step": 19314 + }, + { + "epoch": 3.1529733480266113, + "grad_norm": 2.013918161392212, + "learning_rate": 1.8358073613682705e-05, + "loss": 0.5215, + "step": 19315 + }, + { + "epoch": 3.1531366066691158, + "grad_norm": 1.787811279296875, + "learning_rate": 1.835789938664094e-05, + "loss": 0.4518, + "step": 19316 + }, + { + "epoch": 3.1532998653116198, + "grad_norm": 1.4579166173934937, + "learning_rate": 1.8357725151182778e-05, + "loss": 0.345, + "step": 19317 + }, + { + "epoch": 3.153463123954124, + "grad_norm": 2.0798680782318115, + "learning_rate": 1.8357550907308386e-05, + "loss": 0.4737, + "step": 19318 + }, + { + "epoch": 3.1536263825966286, + "grad_norm": 2.091296911239624, + "learning_rate": 1.8357376655017943e-05, + "loss": 0.5089, + "step": 19319 + }, + { + "epoch": 3.153789641239133, + "grad_norm": 1.633711814880371, + "learning_rate": 1.8357202394311624e-05, + "loss": 0.4324, + "step": 19320 + }, + { + "epoch": 3.1539528998816375, + "grad_norm": 2.1252927780151367, + "learning_rate": 1.8357028125189603e-05, + "loss": 0.5399, + "step": 19321 + }, + { + "epoch": 3.154116158524142, + "grad_norm": 1.931122064590454, + "learning_rate": 1.835685384765206e-05, + "loss": 0.4268, + "step": 19322 + }, + { + "epoch": 3.1542794171666464, + "grad_norm": 1.7546980381011963, + "learning_rate": 1.835667956169916e-05, + "loss": 0.405, + "step": 19323 + }, + { + "epoch": 3.154442675809151, + "grad_norm": 1.7538822889328003, + "learning_rate": 1.8356505267331096e-05, + "loss": 0.4342, + "step": 19324 + }, + { + "epoch": 3.154605934451655, + "grad_norm": 1.7514036893844604, + "learning_rate": 1.8356330964548026e-05, + "loss": 0.4226, + "step": 19325 + }, + { + "epoch": 3.1547691930941593, + "grad_norm": 1.9059715270996094, + "learning_rate": 1.8356156653350138e-05, + "loss": 0.4446, + "step": 19326 + }, + { + "epoch": 3.1549324517366637, + "grad_norm": 1.9629360437393188, + "learning_rate": 1.8355982333737597e-05, + "loss": 0.4917, + "step": 19327 + }, + { + "epoch": 3.155095710379168, + "grad_norm": 1.8719786405563354, + "learning_rate": 1.8355808005710588e-05, + "loss": 0.4581, + "step": 19328 + }, + { + "epoch": 3.1552589690216726, + "grad_norm": 1.8576785326004028, + "learning_rate": 1.835563366926928e-05, + "loss": 0.4979, + "step": 19329 + }, + { + "epoch": 3.155422227664177, + "grad_norm": 2.124069929122925, + "learning_rate": 1.8355459324413854e-05, + "loss": 0.536, + "step": 19330 + }, + { + "epoch": 3.1555854863066815, + "grad_norm": 1.781045913696289, + "learning_rate": 1.835528497114448e-05, + "loss": 0.4719, + "step": 19331 + }, + { + "epoch": 3.155748744949186, + "grad_norm": 1.9589205980300903, + "learning_rate": 1.835511060946134e-05, + "loss": 0.4906, + "step": 19332 + }, + { + "epoch": 3.1559120035916903, + "grad_norm": 1.7335076332092285, + "learning_rate": 1.8354936239364604e-05, + "loss": 0.454, + "step": 19333 + }, + { + "epoch": 3.1560752622341943, + "grad_norm": 2.192348003387451, + "learning_rate": 1.8354761860854447e-05, + "loss": 0.4786, + "step": 19334 + }, + { + "epoch": 3.1562385208766988, + "grad_norm": 2.196835517883301, + "learning_rate": 1.8354587473931048e-05, + "loss": 0.5243, + "step": 19335 + }, + { + "epoch": 3.156401779519203, + "grad_norm": 2.360830307006836, + "learning_rate": 1.8354413078594585e-05, + "loss": 0.5309, + "step": 19336 + }, + { + "epoch": 3.1565650381617076, + "grad_norm": 1.7894150018692017, + "learning_rate": 1.8354238674845225e-05, + "loss": 0.3726, + "step": 19337 + }, + { + "epoch": 3.156728296804212, + "grad_norm": 1.797112226486206, + "learning_rate": 1.8354064262683154e-05, + "loss": 0.4906, + "step": 19338 + }, + { + "epoch": 3.1568915554467165, + "grad_norm": 1.765925645828247, + "learning_rate": 1.835388984210854e-05, + "loss": 0.5123, + "step": 19339 + }, + { + "epoch": 3.157054814089221, + "grad_norm": 1.8997020721435547, + "learning_rate": 1.835371541312156e-05, + "loss": 0.4119, + "step": 19340 + }, + { + "epoch": 3.1572180727317254, + "grad_norm": 1.7575836181640625, + "learning_rate": 1.835354097572239e-05, + "loss": 0.4649, + "step": 19341 + }, + { + "epoch": 3.15738133137423, + "grad_norm": 2.382176160812378, + "learning_rate": 1.8353366529911208e-05, + "loss": 0.4932, + "step": 19342 + }, + { + "epoch": 3.157544590016734, + "grad_norm": 1.9101983308792114, + "learning_rate": 1.835319207568819e-05, + "loss": 0.438, + "step": 19343 + }, + { + "epoch": 3.1577078486592383, + "grad_norm": 1.8974034786224365, + "learning_rate": 1.8353017613053508e-05, + "loss": 0.4579, + "step": 19344 + }, + { + "epoch": 3.1578711073017427, + "grad_norm": 1.7961353063583374, + "learning_rate": 1.8352843142007343e-05, + "loss": 0.4421, + "step": 19345 + }, + { + "epoch": 3.158034365944247, + "grad_norm": 1.9813495874404907, + "learning_rate": 1.8352668662549863e-05, + "loss": 0.3998, + "step": 19346 + }, + { + "epoch": 3.1581976245867516, + "grad_norm": 2.050842523574829, + "learning_rate": 1.8352494174681248e-05, + "loss": 0.426, + "step": 19347 + }, + { + "epoch": 3.158360883229256, + "grad_norm": 2.035122871398926, + "learning_rate": 1.8352319678401677e-05, + "loss": 0.5783, + "step": 19348 + }, + { + "epoch": 3.1585241418717604, + "grad_norm": 2.0063464641571045, + "learning_rate": 1.835214517371132e-05, + "loss": 0.4805, + "step": 19349 + }, + { + "epoch": 3.158687400514265, + "grad_norm": 2.2939789295196533, + "learning_rate": 1.835197066061035e-05, + "loss": 0.4675, + "step": 19350 + }, + { + "epoch": 3.1588506591567693, + "grad_norm": 2.0906171798706055, + "learning_rate": 1.8351796139098953e-05, + "loss": 0.5614, + "step": 19351 + }, + { + "epoch": 3.1590139177992733, + "grad_norm": 2.0543031692504883, + "learning_rate": 1.83516216091773e-05, + "loss": 0.5093, + "step": 19352 + }, + { + "epoch": 3.1591771764417778, + "grad_norm": 1.7534745931625366, + "learning_rate": 1.8351447070845565e-05, + "loss": 0.3829, + "step": 19353 + }, + { + "epoch": 3.159340435084282, + "grad_norm": 2.091531991958618, + "learning_rate": 1.8351272524103928e-05, + "loss": 0.5027, + "step": 19354 + }, + { + "epoch": 3.1595036937267866, + "grad_norm": 2.3798582553863525, + "learning_rate": 1.835109796895256e-05, + "loss": 0.4476, + "step": 19355 + }, + { + "epoch": 3.159666952369291, + "grad_norm": 1.9583569765090942, + "learning_rate": 1.8350923405391636e-05, + "loss": 0.4805, + "step": 19356 + }, + { + "epoch": 3.1598302110117955, + "grad_norm": 1.758935570716858, + "learning_rate": 1.8350748833421336e-05, + "loss": 0.429, + "step": 19357 + }, + { + "epoch": 3.1599934696543, + "grad_norm": 1.7307875156402588, + "learning_rate": 1.8350574253041836e-05, + "loss": 0.4061, + "step": 19358 + }, + { + "epoch": 3.1601567282968044, + "grad_norm": 1.9494054317474365, + "learning_rate": 1.8350399664253307e-05, + "loss": 0.4377, + "step": 19359 + }, + { + "epoch": 3.1603199869393084, + "grad_norm": 1.8879218101501465, + "learning_rate": 1.8350225067055927e-05, + "loss": 0.4229, + "step": 19360 + }, + { + "epoch": 3.160483245581813, + "grad_norm": 1.867623209953308, + "learning_rate": 1.8350050461449874e-05, + "loss": 0.4678, + "step": 19361 + }, + { + "epoch": 3.1606465042243173, + "grad_norm": 1.6050113439559937, + "learning_rate": 1.8349875847435322e-05, + "loss": 0.3863, + "step": 19362 + }, + { + "epoch": 3.1608097628668217, + "grad_norm": 1.8245062828063965, + "learning_rate": 1.834970122501245e-05, + "loss": 0.4506, + "step": 19363 + }, + { + "epoch": 3.160973021509326, + "grad_norm": 1.7713088989257812, + "learning_rate": 1.8349526594181428e-05, + "loss": 0.4477, + "step": 19364 + }, + { + "epoch": 3.1611362801518306, + "grad_norm": 2.07456636428833, + "learning_rate": 1.8349351954942438e-05, + "loss": 0.5156, + "step": 19365 + }, + { + "epoch": 3.161299538794335, + "grad_norm": 2.008727550506592, + "learning_rate": 1.834917730729565e-05, + "loss": 0.492, + "step": 19366 + }, + { + "epoch": 3.1614627974368394, + "grad_norm": 1.812483310699463, + "learning_rate": 1.8349002651241243e-05, + "loss": 0.4671, + "step": 19367 + }, + { + "epoch": 3.161626056079344, + "grad_norm": 1.8250105381011963, + "learning_rate": 1.8348827986779394e-05, + "loss": 0.4703, + "step": 19368 + }, + { + "epoch": 3.1617893147218483, + "grad_norm": 1.9629286527633667, + "learning_rate": 1.8348653313910276e-05, + "loss": 0.4598, + "step": 19369 + }, + { + "epoch": 3.1619525733643523, + "grad_norm": 1.9102827310562134, + "learning_rate": 1.8348478632634067e-05, + "loss": 0.487, + "step": 19370 + }, + { + "epoch": 3.1621158320068568, + "grad_norm": 2.093076467514038, + "learning_rate": 1.834830394295094e-05, + "loss": 0.4824, + "step": 19371 + }, + { + "epoch": 3.162279090649361, + "grad_norm": 1.7708357572555542, + "learning_rate": 1.8348129244861078e-05, + "loss": 0.4209, + "step": 19372 + }, + { + "epoch": 3.1624423492918656, + "grad_norm": 2.405855655670166, + "learning_rate": 1.8347954538364646e-05, + "loss": 0.4938, + "step": 19373 + }, + { + "epoch": 3.16260560793437, + "grad_norm": 2.4030673503875732, + "learning_rate": 1.834777982346183e-05, + "loss": 0.4744, + "step": 19374 + }, + { + "epoch": 3.1627688665768745, + "grad_norm": 1.9872937202453613, + "learning_rate": 1.8347605100152804e-05, + "loss": 0.5006, + "step": 19375 + }, + { + "epoch": 3.162932125219379, + "grad_norm": 2.140369176864624, + "learning_rate": 1.8347430368437738e-05, + "loss": 0.5117, + "step": 19376 + }, + { + "epoch": 3.1630953838618834, + "grad_norm": 2.0262787342071533, + "learning_rate": 1.8347255628316812e-05, + "loss": 0.4884, + "step": 19377 + }, + { + "epoch": 3.1632586425043874, + "grad_norm": 1.948663353919983, + "learning_rate": 1.8347080879790203e-05, + "loss": 0.4746, + "step": 19378 + }, + { + "epoch": 3.163421901146892, + "grad_norm": 1.9948992729187012, + "learning_rate": 1.8346906122858085e-05, + "loss": 0.4613, + "step": 19379 + }, + { + "epoch": 3.1635851597893963, + "grad_norm": 2.0289862155914307, + "learning_rate": 1.8346731357520637e-05, + "loss": 0.4789, + "step": 19380 + }, + { + "epoch": 3.1637484184319007, + "grad_norm": 1.9380401372909546, + "learning_rate": 1.8346556583778032e-05, + "loss": 0.4383, + "step": 19381 + }, + { + "epoch": 3.163911677074405, + "grad_norm": 2.375171422958374, + "learning_rate": 1.8346381801630446e-05, + "loss": 0.5323, + "step": 19382 + }, + { + "epoch": 3.1640749357169096, + "grad_norm": 2.0791704654693604, + "learning_rate": 1.8346207011078056e-05, + "loss": 0.5354, + "step": 19383 + }, + { + "epoch": 3.164238194359414, + "grad_norm": 1.989286184310913, + "learning_rate": 1.8346032212121036e-05, + "loss": 0.4513, + "step": 19384 + }, + { + "epoch": 3.1644014530019184, + "grad_norm": 2.0222158432006836, + "learning_rate": 1.8345857404759565e-05, + "loss": 0.4518, + "step": 19385 + }, + { + "epoch": 3.164564711644423, + "grad_norm": 1.9334638118743896, + "learning_rate": 1.8345682588993816e-05, + "loss": 0.427, + "step": 19386 + }, + { + "epoch": 3.164727970286927, + "grad_norm": 2.006775379180908, + "learning_rate": 1.834550776482397e-05, + "loss": 0.5372, + "step": 19387 + }, + { + "epoch": 3.1648912289294313, + "grad_norm": 1.694175124168396, + "learning_rate": 1.83453329322502e-05, + "loss": 0.4481, + "step": 19388 + }, + { + "epoch": 3.1650544875719357, + "grad_norm": 1.7279059886932373, + "learning_rate": 1.834515809127268e-05, + "loss": 0.4219, + "step": 19389 + }, + { + "epoch": 3.16521774621444, + "grad_norm": 1.769660234451294, + "learning_rate": 1.8344983241891588e-05, + "loss": 0.4353, + "step": 19390 + }, + { + "epoch": 3.1653810048569446, + "grad_norm": 2.053053140640259, + "learning_rate": 1.83448083841071e-05, + "loss": 0.527, + "step": 19391 + }, + { + "epoch": 3.165544263499449, + "grad_norm": 1.9787068367004395, + "learning_rate": 1.834463351791939e-05, + "loss": 0.4029, + "step": 19392 + }, + { + "epoch": 3.1657075221419535, + "grad_norm": 2.4926156997680664, + "learning_rate": 1.834445864332864e-05, + "loss": 0.5314, + "step": 19393 + }, + { + "epoch": 3.165870780784458, + "grad_norm": 2.2229695320129395, + "learning_rate": 1.8344283760335022e-05, + "loss": 0.5468, + "step": 19394 + }, + { + "epoch": 3.166034039426962, + "grad_norm": 1.8993566036224365, + "learning_rate": 1.8344108868938715e-05, + "loss": 0.546, + "step": 19395 + }, + { + "epoch": 3.1661972980694664, + "grad_norm": 1.824479341506958, + "learning_rate": 1.8343933969139888e-05, + "loss": 0.4781, + "step": 19396 + }, + { + "epoch": 3.166360556711971, + "grad_norm": 2.098909854888916, + "learning_rate": 1.834375906093872e-05, + "loss": 0.4905, + "step": 19397 + }, + { + "epoch": 3.1665238153544752, + "grad_norm": 2.050670623779297, + "learning_rate": 1.8343584144335395e-05, + "loss": 0.4797, + "step": 19398 + }, + { + "epoch": 3.1666870739969797, + "grad_norm": 1.9465349912643433, + "learning_rate": 1.834340921933008e-05, + "loss": 0.5158, + "step": 19399 + }, + { + "epoch": 3.166850332639484, + "grad_norm": 1.7363545894622803, + "learning_rate": 1.8343234285922955e-05, + "loss": 0.4277, + "step": 19400 + }, + { + "epoch": 3.1670135912819886, + "grad_norm": 1.7218409776687622, + "learning_rate": 1.834305934411419e-05, + "loss": 0.4401, + "step": 19401 + }, + { + "epoch": 3.167176849924493, + "grad_norm": 1.4308303594589233, + "learning_rate": 1.8342884393903975e-05, + "loss": 0.3443, + "step": 19402 + }, + { + "epoch": 3.1673401085669974, + "grad_norm": 1.7513666152954102, + "learning_rate": 1.8342709435292476e-05, + "loss": 0.4643, + "step": 19403 + }, + { + "epoch": 3.167503367209502, + "grad_norm": 2.188016176223755, + "learning_rate": 1.8342534468279863e-05, + "loss": 0.5133, + "step": 19404 + }, + { + "epoch": 3.167666625852006, + "grad_norm": 1.957112431526184, + "learning_rate": 1.8342359492866327e-05, + "loss": 0.4853, + "step": 19405 + }, + { + "epoch": 3.1678298844945103, + "grad_norm": 1.801254153251648, + "learning_rate": 1.8342184509052034e-05, + "loss": 0.5029, + "step": 19406 + }, + { + "epoch": 3.1679931431370147, + "grad_norm": 1.875631332397461, + "learning_rate": 1.8342009516837166e-05, + "loss": 0.5092, + "step": 19407 + }, + { + "epoch": 3.168156401779519, + "grad_norm": 1.6301218271255493, + "learning_rate": 1.8341834516221896e-05, + "loss": 0.3862, + "step": 19408 + }, + { + "epoch": 3.1683196604220236, + "grad_norm": 2.2237656116485596, + "learning_rate": 1.83416595072064e-05, + "loss": 0.5631, + "step": 19409 + }, + { + "epoch": 3.168482919064528, + "grad_norm": 1.911882996559143, + "learning_rate": 1.8341484489790856e-05, + "loss": 0.4739, + "step": 19410 + }, + { + "epoch": 3.1686461777070325, + "grad_norm": 2.035968065261841, + "learning_rate": 1.8341309463975437e-05, + "loss": 0.479, + "step": 19411 + }, + { + "epoch": 3.168809436349537, + "grad_norm": 2.013012409210205, + "learning_rate": 1.8341134429760322e-05, + "loss": 0.4654, + "step": 19412 + }, + { + "epoch": 3.168972694992041, + "grad_norm": 2.054798126220703, + "learning_rate": 1.8340959387145693e-05, + "loss": 0.4669, + "step": 19413 + }, + { + "epoch": 3.1691359536345454, + "grad_norm": 1.959506630897522, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.4978, + "step": 19414 + }, + { + "epoch": 3.16929921227705, + "grad_norm": 2.094292163848877, + "learning_rate": 1.834060927671857e-05, + "loss": 0.5704, + "step": 19415 + }, + { + "epoch": 3.1694624709195542, + "grad_norm": 1.8116670846939087, + "learning_rate": 1.8340434208906434e-05, + "loss": 0.4186, + "step": 19416 + }, + { + "epoch": 3.1696257295620587, + "grad_norm": 1.7341760396957397, + "learning_rate": 1.8340259132695483e-05, + "loss": 0.4461, + "step": 19417 + }, + { + "epoch": 3.169788988204563, + "grad_norm": 1.9142746925354004, + "learning_rate": 1.8340084048085888e-05, + "loss": 0.587, + "step": 19418 + }, + { + "epoch": 3.1699522468470676, + "grad_norm": 2.0725996494293213, + "learning_rate": 1.8339908955077836e-05, + "loss": 0.4087, + "step": 19419 + }, + { + "epoch": 3.170115505489572, + "grad_norm": 1.709054946899414, + "learning_rate": 1.8339733853671497e-05, + "loss": 0.4193, + "step": 19420 + }, + { + "epoch": 3.1702787641320764, + "grad_norm": 1.9195125102996826, + "learning_rate": 1.833955874386705e-05, + "loss": 0.462, + "step": 19421 + }, + { + "epoch": 3.1704420227745804, + "grad_norm": 1.7478309869766235, + "learning_rate": 1.8339383625664667e-05, + "loss": 0.4512, + "step": 19422 + }, + { + "epoch": 3.170605281417085, + "grad_norm": 2.0806820392608643, + "learning_rate": 1.833920849906453e-05, + "loss": 0.4162, + "step": 19423 + }, + { + "epoch": 3.1707685400595893, + "grad_norm": 1.8339508771896362, + "learning_rate": 1.8339033364066808e-05, + "loss": 0.4163, + "step": 19424 + }, + { + "epoch": 3.1709317987020937, + "grad_norm": 2.22298526763916, + "learning_rate": 1.8338858220671683e-05, + "loss": 0.5977, + "step": 19425 + }, + { + "epoch": 3.171095057344598, + "grad_norm": 1.776863932609558, + "learning_rate": 1.833868306887933e-05, + "loss": 0.4193, + "step": 19426 + }, + { + "epoch": 3.1712583159871026, + "grad_norm": 1.916347861289978, + "learning_rate": 1.8338507908689925e-05, + "loss": 0.4338, + "step": 19427 + }, + { + "epoch": 3.171421574629607, + "grad_norm": 2.3009960651397705, + "learning_rate": 1.8338332740103648e-05, + "loss": 0.5209, + "step": 19428 + }, + { + "epoch": 3.1715848332721115, + "grad_norm": 1.7458263635635376, + "learning_rate": 1.833815756312067e-05, + "loss": 0.4449, + "step": 19429 + }, + { + "epoch": 3.171748091914616, + "grad_norm": 1.730692744255066, + "learning_rate": 1.8337982377741167e-05, + "loss": 0.4525, + "step": 19430 + }, + { + "epoch": 3.17191135055712, + "grad_norm": 2.0963997840881348, + "learning_rate": 1.8337807183965322e-05, + "loss": 0.5663, + "step": 19431 + }, + { + "epoch": 3.1720746091996244, + "grad_norm": 1.4782755374908447, + "learning_rate": 1.8337631981793308e-05, + "loss": 0.3942, + "step": 19432 + }, + { + "epoch": 3.172237867842129, + "grad_norm": 1.969273567199707, + "learning_rate": 1.8337456771225296e-05, + "loss": 0.4845, + "step": 19433 + }, + { + "epoch": 3.1724011264846332, + "grad_norm": 1.8532772064208984, + "learning_rate": 1.833728155226147e-05, + "loss": 0.4689, + "step": 19434 + }, + { + "epoch": 3.1725643851271377, + "grad_norm": 2.19911527633667, + "learning_rate": 1.8337106324902002e-05, + "loss": 0.5301, + "step": 19435 + }, + { + "epoch": 3.172727643769642, + "grad_norm": 2.0296473503112793, + "learning_rate": 1.8336931089147076e-05, + "loss": 0.4283, + "step": 19436 + }, + { + "epoch": 3.1728909024121466, + "grad_norm": 2.0314407348632812, + "learning_rate": 1.8336755844996857e-05, + "loss": 0.4577, + "step": 19437 + }, + { + "epoch": 3.173054161054651, + "grad_norm": 1.9533050060272217, + "learning_rate": 1.833658059245153e-05, + "loss": 0.4761, + "step": 19438 + }, + { + "epoch": 3.1732174196971554, + "grad_norm": 2.100487232208252, + "learning_rate": 1.8336405331511263e-05, + "loss": 0.4898, + "step": 19439 + }, + { + "epoch": 3.1733806783396594, + "grad_norm": 1.3662142753601074, + "learning_rate": 1.8336230062176245e-05, + "loss": 0.3774, + "step": 19440 + }, + { + "epoch": 3.173543936982164, + "grad_norm": 1.8979792594909668, + "learning_rate": 1.833605478444664e-05, + "loss": 0.4527, + "step": 19441 + }, + { + "epoch": 3.1737071956246683, + "grad_norm": 1.8443915843963623, + "learning_rate": 1.8335879498322633e-05, + "loss": 0.3716, + "step": 19442 + }, + { + "epoch": 3.1738704542671727, + "grad_norm": 2.0308098793029785, + "learning_rate": 1.83357042038044e-05, + "loss": 0.4476, + "step": 19443 + }, + { + "epoch": 3.174033712909677, + "grad_norm": 2.054004192352295, + "learning_rate": 1.833552890089211e-05, + "loss": 0.4554, + "step": 19444 + }, + { + "epoch": 3.1741969715521816, + "grad_norm": 2.174063205718994, + "learning_rate": 1.8335353589585945e-05, + "loss": 0.4951, + "step": 19445 + }, + { + "epoch": 3.174360230194686, + "grad_norm": 2.3536453247070312, + "learning_rate": 1.8335178269886086e-05, + "loss": 0.5772, + "step": 19446 + }, + { + "epoch": 3.1745234888371905, + "grad_norm": 1.9505541324615479, + "learning_rate": 1.83350029417927e-05, + "loss": 0.5203, + "step": 19447 + }, + { + "epoch": 3.1746867474796945, + "grad_norm": 2.2560369968414307, + "learning_rate": 1.833482760530597e-05, + "loss": 0.6076, + "step": 19448 + }, + { + "epoch": 3.174850006122199, + "grad_norm": 1.7788325548171997, + "learning_rate": 1.8334652260426068e-05, + "loss": 0.4625, + "step": 19449 + }, + { + "epoch": 3.1750132647647034, + "grad_norm": 1.842991590499878, + "learning_rate": 1.8334476907153177e-05, + "loss": 0.3963, + "step": 19450 + }, + { + "epoch": 3.175176523407208, + "grad_norm": 2.0132415294647217, + "learning_rate": 1.833430154548747e-05, + "loss": 0.4696, + "step": 19451 + }, + { + "epoch": 3.1753397820497122, + "grad_norm": 1.7360085248947144, + "learning_rate": 1.8334126175429125e-05, + "loss": 0.39, + "step": 19452 + }, + { + "epoch": 3.1755030406922167, + "grad_norm": 1.7899138927459717, + "learning_rate": 1.8333950796978312e-05, + "loss": 0.4272, + "step": 19453 + }, + { + "epoch": 3.175666299334721, + "grad_norm": 2.2138829231262207, + "learning_rate": 1.833377541013522e-05, + "loss": 0.4903, + "step": 19454 + }, + { + "epoch": 3.1758295579772255, + "grad_norm": 2.0732901096343994, + "learning_rate": 1.833360001490001e-05, + "loss": 0.4976, + "step": 19455 + }, + { + "epoch": 3.17599281661973, + "grad_norm": 2.004549264907837, + "learning_rate": 1.833342461127287e-05, + "loss": 0.4824, + "step": 19456 + }, + { + "epoch": 3.1761560752622344, + "grad_norm": 1.7096878290176392, + "learning_rate": 1.8333249199253974e-05, + "loss": 0.4265, + "step": 19457 + }, + { + "epoch": 3.1763193339047384, + "grad_norm": 1.686898112297058, + "learning_rate": 1.83330737788435e-05, + "loss": 0.394, + "step": 19458 + }, + { + "epoch": 3.176482592547243, + "grad_norm": 1.7556111812591553, + "learning_rate": 1.833289835004162e-05, + "loss": 0.4616, + "step": 19459 + }, + { + "epoch": 3.1766458511897473, + "grad_norm": 1.6305060386657715, + "learning_rate": 1.8332722912848513e-05, + "loss": 0.3793, + "step": 19460 + }, + { + "epoch": 3.1768091098322517, + "grad_norm": 1.7666574716567993, + "learning_rate": 1.833254746726436e-05, + "loss": 0.4028, + "step": 19461 + }, + { + "epoch": 3.176972368474756, + "grad_norm": 2.3226239681243896, + "learning_rate": 1.8332372013289335e-05, + "loss": 0.542, + "step": 19462 + }, + { + "epoch": 3.1771356271172606, + "grad_norm": 2.1562864780426025, + "learning_rate": 1.833219655092361e-05, + "loss": 0.503, + "step": 19463 + }, + { + "epoch": 3.177298885759765, + "grad_norm": 1.6368584632873535, + "learning_rate": 1.8332021080167366e-05, + "loss": 0.4203, + "step": 19464 + }, + { + "epoch": 3.1774621444022695, + "grad_norm": 1.726203203201294, + "learning_rate": 1.833184560102078e-05, + "loss": 0.4656, + "step": 19465 + }, + { + "epoch": 3.1776254030447735, + "grad_norm": 1.5525686740875244, + "learning_rate": 1.8331670113484025e-05, + "loss": 0.3821, + "step": 19466 + }, + { + "epoch": 3.177788661687278, + "grad_norm": 2.148404836654663, + "learning_rate": 1.8331494617557284e-05, + "loss": 0.5092, + "step": 19467 + }, + { + "epoch": 3.1779519203297824, + "grad_norm": 1.8498390913009644, + "learning_rate": 1.833131911324073e-05, + "loss": 0.4472, + "step": 19468 + }, + { + "epoch": 3.178115178972287, + "grad_norm": 1.7343631982803345, + "learning_rate": 1.8331143600534534e-05, + "loss": 0.4446, + "step": 19469 + }, + { + "epoch": 3.1782784376147912, + "grad_norm": 1.6653110980987549, + "learning_rate": 1.8330968079438887e-05, + "loss": 0.4357, + "step": 19470 + }, + { + "epoch": 3.1784416962572957, + "grad_norm": 2.0536370277404785, + "learning_rate": 1.833079254995395e-05, + "loss": 0.447, + "step": 19471 + }, + { + "epoch": 3.1786049548998, + "grad_norm": 1.8495733737945557, + "learning_rate": 1.8330617012079913e-05, + "loss": 0.5094, + "step": 19472 + }, + { + "epoch": 3.1787682135423045, + "grad_norm": 1.853529930114746, + "learning_rate": 1.8330441465816942e-05, + "loss": 0.4171, + "step": 19473 + }, + { + "epoch": 3.178931472184809, + "grad_norm": 2.032954454421997, + "learning_rate": 1.833026591116522e-05, + "loss": 0.536, + "step": 19474 + }, + { + "epoch": 3.179094730827313, + "grad_norm": 1.8326293230056763, + "learning_rate": 1.8330090348124926e-05, + "loss": 0.4567, + "step": 19475 + }, + { + "epoch": 3.1792579894698174, + "grad_norm": 1.8864691257476807, + "learning_rate": 1.832991477669623e-05, + "loss": 0.5495, + "step": 19476 + }, + { + "epoch": 3.179421248112322, + "grad_norm": 1.8669357299804688, + "learning_rate": 1.8329739196879312e-05, + "loss": 0.4421, + "step": 19477 + }, + { + "epoch": 3.1795845067548263, + "grad_norm": 2.1007907390594482, + "learning_rate": 1.832956360867435e-05, + "loss": 0.4499, + "step": 19478 + }, + { + "epoch": 3.1797477653973307, + "grad_norm": 2.15738844871521, + "learning_rate": 1.832938801208152e-05, + "loss": 0.5399, + "step": 19479 + }, + { + "epoch": 3.179911024039835, + "grad_norm": 1.8570142984390259, + "learning_rate": 1.8329212407100996e-05, + "loss": 0.4516, + "step": 19480 + }, + { + "epoch": 3.1800742826823396, + "grad_norm": 1.9413913488388062, + "learning_rate": 1.832903679373296e-05, + "loss": 0.4317, + "step": 19481 + }, + { + "epoch": 3.180237541324844, + "grad_norm": 1.5999428033828735, + "learning_rate": 1.8328861171977585e-05, + "loss": 0.4162, + "step": 19482 + }, + { + "epoch": 3.180400799967348, + "grad_norm": 2.2204248905181885, + "learning_rate": 1.832868554183505e-05, + "loss": 0.5326, + "step": 19483 + }, + { + "epoch": 3.1805640586098525, + "grad_norm": 2.2970845699310303, + "learning_rate": 1.832850990330553e-05, + "loss": 0.5384, + "step": 19484 + }, + { + "epoch": 3.180727317252357, + "grad_norm": 1.6347428560256958, + "learning_rate": 1.83283342563892e-05, + "loss": 0.4168, + "step": 19485 + }, + { + "epoch": 3.1808905758948613, + "grad_norm": 1.9419910907745361, + "learning_rate": 1.8328158601086242e-05, + "loss": 0.5134, + "step": 19486 + }, + { + "epoch": 3.181053834537366, + "grad_norm": 1.5779368877410889, + "learning_rate": 1.8327982937396833e-05, + "loss": 0.3957, + "step": 19487 + }, + { + "epoch": 3.1812170931798702, + "grad_norm": 1.8582366704940796, + "learning_rate": 1.8327807265321145e-05, + "loss": 0.4905, + "step": 19488 + }, + { + "epoch": 3.1813803518223747, + "grad_norm": 1.5388875007629395, + "learning_rate": 1.8327631584859355e-05, + "loss": 0.4332, + "step": 19489 + }, + { + "epoch": 3.181543610464879, + "grad_norm": 1.710969090461731, + "learning_rate": 1.8327455896011645e-05, + "loss": 0.4068, + "step": 19490 + }, + { + "epoch": 3.1817068691073835, + "grad_norm": 2.0014355182647705, + "learning_rate": 1.832728019877819e-05, + "loss": 0.5252, + "step": 19491 + }, + { + "epoch": 3.181870127749888, + "grad_norm": 1.685555100440979, + "learning_rate": 1.8327104493159167e-05, + "loss": 0.38, + "step": 19492 + }, + { + "epoch": 3.182033386392392, + "grad_norm": 1.9415581226348877, + "learning_rate": 1.8326928779154748e-05, + "loss": 0.4434, + "step": 19493 + }, + { + "epoch": 3.1821966450348964, + "grad_norm": 1.9431551694869995, + "learning_rate": 1.8326753056765113e-05, + "loss": 0.4571, + "step": 19494 + }, + { + "epoch": 3.182359903677401, + "grad_norm": 2.0218188762664795, + "learning_rate": 1.8326577325990444e-05, + "loss": 0.4441, + "step": 19495 + }, + { + "epoch": 3.1825231623199053, + "grad_norm": 1.966469645500183, + "learning_rate": 1.832640158683091e-05, + "loss": 0.4783, + "step": 19496 + }, + { + "epoch": 3.1826864209624097, + "grad_norm": 1.6297578811645508, + "learning_rate": 1.8326225839286698e-05, + "loss": 0.3948, + "step": 19497 + }, + { + "epoch": 3.182849679604914, + "grad_norm": 1.9749455451965332, + "learning_rate": 1.8326050083357977e-05, + "loss": 0.4967, + "step": 19498 + }, + { + "epoch": 3.1830129382474186, + "grad_norm": 1.7173680067062378, + "learning_rate": 1.832587431904492e-05, + "loss": 0.4651, + "step": 19499 + }, + { + "epoch": 3.183176196889923, + "grad_norm": 1.996874451637268, + "learning_rate": 1.8325698546347714e-05, + "loss": 0.477, + "step": 19500 + }, + { + "epoch": 3.183339455532427, + "grad_norm": 2.2433865070343018, + "learning_rate": 1.8325522765266532e-05, + "loss": 0.5478, + "step": 19501 + }, + { + "epoch": 3.1835027141749315, + "grad_norm": 1.8912630081176758, + "learning_rate": 1.832534697580155e-05, + "loss": 0.4732, + "step": 19502 + }, + { + "epoch": 3.183665972817436, + "grad_norm": 2.1384878158569336, + "learning_rate": 1.8325171177952948e-05, + "loss": 0.5055, + "step": 19503 + }, + { + "epoch": 3.1838292314599403, + "grad_norm": 1.8180205821990967, + "learning_rate": 1.8324995371720898e-05, + "loss": 0.4894, + "step": 19504 + }, + { + "epoch": 3.183992490102445, + "grad_norm": 1.6100879907608032, + "learning_rate": 1.832481955710558e-05, + "loss": 0.4242, + "step": 19505 + }, + { + "epoch": 3.184155748744949, + "grad_norm": 2.200056791305542, + "learning_rate": 1.832464373410717e-05, + "loss": 0.4414, + "step": 19506 + }, + { + "epoch": 3.1843190073874537, + "grad_norm": 2.087191581726074, + "learning_rate": 1.8324467902725848e-05, + "loss": 0.4987, + "step": 19507 + }, + { + "epoch": 3.184482266029958, + "grad_norm": 1.9070347547531128, + "learning_rate": 1.832429206296179e-05, + "loss": 0.49, + "step": 19508 + }, + { + "epoch": 3.1846455246724625, + "grad_norm": 1.9199659824371338, + "learning_rate": 1.8324116214815172e-05, + "loss": 0.475, + "step": 19509 + }, + { + "epoch": 3.1848087833149665, + "grad_norm": 2.3945484161376953, + "learning_rate": 1.832394035828617e-05, + "loss": 0.4879, + "step": 19510 + }, + { + "epoch": 3.184972041957471, + "grad_norm": 1.7942672967910767, + "learning_rate": 1.8323764493374964e-05, + "loss": 0.443, + "step": 19511 + }, + { + "epoch": 3.1851353005999754, + "grad_norm": 1.758232593536377, + "learning_rate": 1.8323588620081723e-05, + "loss": 0.4461, + "step": 19512 + }, + { + "epoch": 3.18529855924248, + "grad_norm": 2.085678815841675, + "learning_rate": 1.8323412738406638e-05, + "loss": 0.5214, + "step": 19513 + }, + { + "epoch": 3.1854618178849843, + "grad_norm": 1.8132766485214233, + "learning_rate": 1.8323236848349873e-05, + "loss": 0.4397, + "step": 19514 + }, + { + "epoch": 3.1856250765274887, + "grad_norm": 1.7460663318634033, + "learning_rate": 1.8323060949911612e-05, + "loss": 0.4849, + "step": 19515 + }, + { + "epoch": 3.185788335169993, + "grad_norm": 2.100801944732666, + "learning_rate": 1.8322885043092035e-05, + "loss": 0.5102, + "step": 19516 + }, + { + "epoch": 3.1859515938124976, + "grad_norm": 1.6405383348464966, + "learning_rate": 1.832270912789131e-05, + "loss": 0.4022, + "step": 19517 + }, + { + "epoch": 3.186114852455002, + "grad_norm": 2.1602718830108643, + "learning_rate": 1.8322533204309622e-05, + "loss": 0.6193, + "step": 19518 + }, + { + "epoch": 3.186278111097506, + "grad_norm": 1.7573230266571045, + "learning_rate": 1.8322357272347146e-05, + "loss": 0.426, + "step": 19519 + }, + { + "epoch": 3.1864413697400105, + "grad_norm": 1.9749364852905273, + "learning_rate": 1.8322181332004057e-05, + "loss": 0.493, + "step": 19520 + }, + { + "epoch": 3.186604628382515, + "grad_norm": 1.8659029006958008, + "learning_rate": 1.8322005383280534e-05, + "loss": 0.4563, + "step": 19521 + }, + { + "epoch": 3.1867678870250193, + "grad_norm": 2.1972219944000244, + "learning_rate": 1.832182942617675e-05, + "loss": 0.5257, + "step": 19522 + }, + { + "epoch": 3.186931145667524, + "grad_norm": 1.6085740327835083, + "learning_rate": 1.832165346069289e-05, + "loss": 0.4155, + "step": 19523 + }, + { + "epoch": 3.187094404310028, + "grad_norm": 2.3197073936462402, + "learning_rate": 1.8321477486829128e-05, + "loss": 0.5119, + "step": 19524 + }, + { + "epoch": 3.1872576629525327, + "grad_norm": 1.645952820777893, + "learning_rate": 1.8321301504585638e-05, + "loss": 0.3581, + "step": 19525 + }, + { + "epoch": 3.187420921595037, + "grad_norm": 1.7364463806152344, + "learning_rate": 1.83211255139626e-05, + "loss": 0.4272, + "step": 19526 + }, + { + "epoch": 3.1875841802375415, + "grad_norm": 1.6503351926803589, + "learning_rate": 1.8320949514960192e-05, + "loss": 0.4427, + "step": 19527 + }, + { + "epoch": 3.1877474388800455, + "grad_norm": 1.811378836631775, + "learning_rate": 1.832077350757859e-05, + "loss": 0.4378, + "step": 19528 + }, + { + "epoch": 3.18791069752255, + "grad_norm": 1.887601613998413, + "learning_rate": 1.832059749181797e-05, + "loss": 0.4467, + "step": 19529 + }, + { + "epoch": 3.1880739561650544, + "grad_norm": 1.6549791097640991, + "learning_rate": 1.8320421467678507e-05, + "loss": 0.3898, + "step": 19530 + }, + { + "epoch": 3.188237214807559, + "grad_norm": 1.8880434036254883, + "learning_rate": 1.832024543516039e-05, + "loss": 0.5054, + "step": 19531 + }, + { + "epoch": 3.1884004734500633, + "grad_norm": 1.8308242559432983, + "learning_rate": 1.8320069394263784e-05, + "loss": 0.4075, + "step": 19532 + }, + { + "epoch": 3.1885637320925677, + "grad_norm": 2.060730457305908, + "learning_rate": 1.8319893344988867e-05, + "loss": 0.5441, + "step": 19533 + }, + { + "epoch": 3.188726990735072, + "grad_norm": 2.0667738914489746, + "learning_rate": 1.8319717287335822e-05, + "loss": 0.4611, + "step": 19534 + }, + { + "epoch": 3.1888902493775766, + "grad_norm": 2.1282100677490234, + "learning_rate": 1.8319541221304825e-05, + "loss": 0.5761, + "step": 19535 + }, + { + "epoch": 3.1890535080200806, + "grad_norm": 2.2473654747009277, + "learning_rate": 1.8319365146896056e-05, + "loss": 0.5521, + "step": 19536 + }, + { + "epoch": 3.189216766662585, + "grad_norm": 1.591321587562561, + "learning_rate": 1.831918906410968e-05, + "loss": 0.381, + "step": 19537 + }, + { + "epoch": 3.1893800253050895, + "grad_norm": 2.2890560626983643, + "learning_rate": 1.8319012972945887e-05, + "loss": 0.4908, + "step": 19538 + }, + { + "epoch": 3.189543283947594, + "grad_norm": 1.6805204153060913, + "learning_rate": 1.8318836873404854e-05, + "loss": 0.4955, + "step": 19539 + }, + { + "epoch": 3.1897065425900983, + "grad_norm": 1.6428909301757812, + "learning_rate": 1.831866076548675e-05, + "loss": 0.3978, + "step": 19540 + }, + { + "epoch": 3.1898698012326028, + "grad_norm": 1.9693000316619873, + "learning_rate": 1.8318484649191757e-05, + "loss": 0.3583, + "step": 19541 + }, + { + "epoch": 3.190033059875107, + "grad_norm": 2.095877170562744, + "learning_rate": 1.8318308524520052e-05, + "loss": 0.5064, + "step": 19542 + }, + { + "epoch": 3.1901963185176117, + "grad_norm": 1.9269578456878662, + "learning_rate": 1.8318132391471815e-05, + "loss": 0.4648, + "step": 19543 + }, + { + "epoch": 3.190359577160116, + "grad_norm": 1.7647509574890137, + "learning_rate": 1.831795625004722e-05, + "loss": 0.4636, + "step": 19544 + }, + { + "epoch": 3.1905228358026205, + "grad_norm": 1.8884388208389282, + "learning_rate": 1.8317780100246442e-05, + "loss": 0.4453, + "step": 19545 + }, + { + "epoch": 3.1906860944451245, + "grad_norm": 1.9400248527526855, + "learning_rate": 1.8317603942069665e-05, + "loss": 0.406, + "step": 19546 + }, + { + "epoch": 3.190849353087629, + "grad_norm": 2.1974709033966064, + "learning_rate": 1.8317427775517063e-05, + "loss": 0.583, + "step": 19547 + }, + { + "epoch": 3.1910126117301334, + "grad_norm": 1.8757987022399902, + "learning_rate": 1.8317251600588814e-05, + "loss": 0.4075, + "step": 19548 + }, + { + "epoch": 3.191175870372638, + "grad_norm": 2.389094352722168, + "learning_rate": 1.8317075417285092e-05, + "loss": 0.529, + "step": 19549 + }, + { + "epoch": 3.1913391290151423, + "grad_norm": 2.011375665664673, + "learning_rate": 1.8316899225606078e-05, + "loss": 0.4638, + "step": 19550 + }, + { + "epoch": 3.1915023876576467, + "grad_norm": 2.1239383220672607, + "learning_rate": 1.831672302555195e-05, + "loss": 0.5059, + "step": 19551 + }, + { + "epoch": 3.191665646300151, + "grad_norm": 2.140509843826294, + "learning_rate": 1.8316546817122885e-05, + "loss": 0.5226, + "step": 19552 + }, + { + "epoch": 3.1918289049426556, + "grad_norm": 2.5327160358428955, + "learning_rate": 1.8316370600319054e-05, + "loss": 0.4842, + "step": 19553 + }, + { + "epoch": 3.1919921635851596, + "grad_norm": 2.1440703868865967, + "learning_rate": 1.8316194375140646e-05, + "loss": 0.4715, + "step": 19554 + }, + { + "epoch": 3.192155422227664, + "grad_norm": 1.9341566562652588, + "learning_rate": 1.8316018141587833e-05, + "loss": 0.4911, + "step": 19555 + }, + { + "epoch": 3.1923186808701685, + "grad_norm": 1.7734322547912598, + "learning_rate": 1.831584189966079e-05, + "loss": 0.46, + "step": 19556 + }, + { + "epoch": 3.192481939512673, + "grad_norm": 2.2548458576202393, + "learning_rate": 1.8315665649359692e-05, + "loss": 0.4941, + "step": 19557 + }, + { + "epoch": 3.1926451981551773, + "grad_norm": 2.13787841796875, + "learning_rate": 1.8315489390684725e-05, + "loss": 0.4697, + "step": 19558 + }, + { + "epoch": 3.1928084567976818, + "grad_norm": 1.954977035522461, + "learning_rate": 1.8315313123636063e-05, + "loss": 0.4706, + "step": 19559 + }, + { + "epoch": 3.192971715440186, + "grad_norm": 2.226811647415161, + "learning_rate": 1.8315136848213883e-05, + "loss": 0.4919, + "step": 19560 + }, + { + "epoch": 3.1931349740826906, + "grad_norm": 1.9745374917984009, + "learning_rate": 1.8314960564418362e-05, + "loss": 0.4819, + "step": 19561 + }, + { + "epoch": 3.193298232725195, + "grad_norm": 1.7666473388671875, + "learning_rate": 1.8314784272249677e-05, + "loss": 0.432, + "step": 19562 + }, + { + "epoch": 3.193461491367699, + "grad_norm": 1.625012755393982, + "learning_rate": 1.8314607971708006e-05, + "loss": 0.4079, + "step": 19563 + }, + { + "epoch": 3.1936247500102035, + "grad_norm": 1.8283253908157349, + "learning_rate": 1.8314431662793527e-05, + "loss": 0.4816, + "step": 19564 + }, + { + "epoch": 3.193788008652708, + "grad_norm": 1.7323793172836304, + "learning_rate": 1.831425534550642e-05, + "loss": 0.434, + "step": 19565 + }, + { + "epoch": 3.1939512672952124, + "grad_norm": 1.746498942375183, + "learning_rate": 1.8314079019846857e-05, + "loss": 0.4281, + "step": 19566 + }, + { + "epoch": 3.194114525937717, + "grad_norm": 2.121636152267456, + "learning_rate": 1.831390268581502e-05, + "loss": 0.4876, + "step": 19567 + }, + { + "epoch": 3.1942777845802213, + "grad_norm": 1.8976359367370605, + "learning_rate": 1.8313726343411085e-05, + "loss": 0.4613, + "step": 19568 + }, + { + "epoch": 3.1944410432227257, + "grad_norm": 2.340796947479248, + "learning_rate": 1.831354999263523e-05, + "loss": 0.6362, + "step": 19569 + }, + { + "epoch": 3.19460430186523, + "grad_norm": 2.103839874267578, + "learning_rate": 1.8313373633487633e-05, + "loss": 0.4583, + "step": 19570 + }, + { + "epoch": 3.1947675605077346, + "grad_norm": 1.7732237577438354, + "learning_rate": 1.8313197265968472e-05, + "loss": 0.4216, + "step": 19571 + }, + { + "epoch": 3.1949308191502386, + "grad_norm": 2.10184907913208, + "learning_rate": 1.8313020890077922e-05, + "loss": 0.5817, + "step": 19572 + }, + { + "epoch": 3.195094077792743, + "grad_norm": 1.9710017442703247, + "learning_rate": 1.8312844505816162e-05, + "loss": 0.4811, + "step": 19573 + }, + { + "epoch": 3.1952573364352475, + "grad_norm": 2.1800854206085205, + "learning_rate": 1.831266811318337e-05, + "loss": 0.4888, + "step": 19574 + }, + { + "epoch": 3.195420595077752, + "grad_norm": 2.368826389312744, + "learning_rate": 1.8312491712179722e-05, + "loss": 0.5054, + "step": 19575 + }, + { + "epoch": 3.1955838537202563, + "grad_norm": 1.6319855451583862, + "learning_rate": 1.83123153028054e-05, + "loss": 0.3737, + "step": 19576 + }, + { + "epoch": 3.1957471123627608, + "grad_norm": 1.7007611989974976, + "learning_rate": 1.8312138885060577e-05, + "loss": 0.3902, + "step": 19577 + }, + { + "epoch": 3.195910371005265, + "grad_norm": 1.7965928316116333, + "learning_rate": 1.8311962458945432e-05, + "loss": 0.4303, + "step": 19578 + }, + { + "epoch": 3.1960736296477696, + "grad_norm": 1.6361415386199951, + "learning_rate": 1.8311786024460145e-05, + "loss": 0.4016, + "step": 19579 + }, + { + "epoch": 3.196236888290274, + "grad_norm": 1.7300794124603271, + "learning_rate": 1.8311609581604887e-05, + "loss": 0.4563, + "step": 19580 + }, + { + "epoch": 3.196400146932778, + "grad_norm": 1.7035677433013916, + "learning_rate": 1.8311433130379844e-05, + "loss": 0.3803, + "step": 19581 + }, + { + "epoch": 3.1965634055752825, + "grad_norm": 1.5410540103912354, + "learning_rate": 1.831125667078519e-05, + "loss": 0.3723, + "step": 19582 + }, + { + "epoch": 3.196726664217787, + "grad_norm": 2.016362190246582, + "learning_rate": 1.8311080202821104e-05, + "loss": 0.4857, + "step": 19583 + }, + { + "epoch": 3.1968899228602914, + "grad_norm": 2.1865170001983643, + "learning_rate": 1.8310903726487758e-05, + "loss": 0.5285, + "step": 19584 + }, + { + "epoch": 3.197053181502796, + "grad_norm": 2.0166268348693848, + "learning_rate": 1.831072724178534e-05, + "loss": 0.4948, + "step": 19585 + }, + { + "epoch": 3.1972164401453003, + "grad_norm": 2.032808303833008, + "learning_rate": 1.831055074871402e-05, + "loss": 0.5913, + "step": 19586 + }, + { + "epoch": 3.1973796987878047, + "grad_norm": 2.3696446418762207, + "learning_rate": 1.831037424727397e-05, + "loss": 0.5123, + "step": 19587 + }, + { + "epoch": 3.197542957430309, + "grad_norm": 1.4204304218292236, + "learning_rate": 1.8310197737465385e-05, + "loss": 0.3676, + "step": 19588 + }, + { + "epoch": 3.197706216072813, + "grad_norm": 1.952985405921936, + "learning_rate": 1.831002121928843e-05, + "loss": 0.4398, + "step": 19589 + }, + { + "epoch": 3.1978694747153176, + "grad_norm": 1.892081618309021, + "learning_rate": 1.8309844692743283e-05, + "loss": 0.5001, + "step": 19590 + }, + { + "epoch": 3.198032733357822, + "grad_norm": 2.40335750579834, + "learning_rate": 1.830966815783013e-05, + "loss": 0.4961, + "step": 19591 + }, + { + "epoch": 3.1981959920003264, + "grad_norm": 1.836540699005127, + "learning_rate": 1.830949161454914e-05, + "loss": 0.3972, + "step": 19592 + }, + { + "epoch": 3.198359250642831, + "grad_norm": 2.152526617050171, + "learning_rate": 1.8309315062900493e-05, + "loss": 0.5108, + "step": 19593 + }, + { + "epoch": 3.1985225092853353, + "grad_norm": 2.115694046020508, + "learning_rate": 1.830913850288437e-05, + "loss": 0.4778, + "step": 19594 + }, + { + "epoch": 3.1986857679278398, + "grad_norm": 2.135572910308838, + "learning_rate": 1.8308961934500948e-05, + "loss": 0.5462, + "step": 19595 + }, + { + "epoch": 3.198849026570344, + "grad_norm": 2.3206892013549805, + "learning_rate": 1.8308785357750402e-05, + "loss": 0.5042, + "step": 19596 + }, + { + "epoch": 3.1990122852128486, + "grad_norm": 2.015599012374878, + "learning_rate": 1.830860877263291e-05, + "loss": 0.4511, + "step": 19597 + }, + { + "epoch": 3.199175543855353, + "grad_norm": 2.0190742015838623, + "learning_rate": 1.830843217914865e-05, + "loss": 0.4611, + "step": 19598 + }, + { + "epoch": 3.199338802497857, + "grad_norm": 2.409735918045044, + "learning_rate": 1.8308255577297808e-05, + "loss": 0.5156, + "step": 19599 + }, + { + "epoch": 3.1995020611403615, + "grad_norm": 1.8327524662017822, + "learning_rate": 1.8308078967080547e-05, + "loss": 0.4334, + "step": 19600 + }, + { + "epoch": 3.199665319782866, + "grad_norm": 2.0062079429626465, + "learning_rate": 1.8307902348497056e-05, + "loss": 0.4888, + "step": 19601 + }, + { + "epoch": 3.1998285784253704, + "grad_norm": 1.6429908275604248, + "learning_rate": 1.830772572154751e-05, + "loss": 0.4028, + "step": 19602 + }, + { + "epoch": 3.199991837067875, + "grad_norm": 2.0927200317382812, + "learning_rate": 1.8307549086232085e-05, + "loss": 0.4997, + "step": 19603 + }, + { + "epoch": 3.2001550957103793, + "grad_norm": 1.7307814359664917, + "learning_rate": 1.830737244255096e-05, + "loss": 0.3514, + "step": 19604 + }, + { + "epoch": 3.2003183543528837, + "grad_norm": 1.7446619272232056, + "learning_rate": 1.8307195790504316e-05, + "loss": 0.4583, + "step": 19605 + }, + { + "epoch": 3.200481612995388, + "grad_norm": 1.620684027671814, + "learning_rate": 1.8307019130092326e-05, + "loss": 0.4172, + "step": 19606 + }, + { + "epoch": 3.200644871637892, + "grad_norm": 2.267345428466797, + "learning_rate": 1.830684246131517e-05, + "loss": 0.6104, + "step": 19607 + }, + { + "epoch": 3.2008081302803966, + "grad_norm": 1.794154405593872, + "learning_rate": 1.8306665784173026e-05, + "loss": 0.4777, + "step": 19608 + }, + { + "epoch": 3.200971388922901, + "grad_norm": 1.871884822845459, + "learning_rate": 1.8306489098666072e-05, + "loss": 0.4945, + "step": 19609 + }, + { + "epoch": 3.2011346475654054, + "grad_norm": 1.8022282123565674, + "learning_rate": 1.8306312404794485e-05, + "loss": 0.4907, + "step": 19610 + }, + { + "epoch": 3.20129790620791, + "grad_norm": 2.0368547439575195, + "learning_rate": 1.8306135702558444e-05, + "loss": 0.4317, + "step": 19611 + }, + { + "epoch": 3.2014611648504143, + "grad_norm": 2.0825605392456055, + "learning_rate": 1.830595899195813e-05, + "loss": 0.5336, + "step": 19612 + }, + { + "epoch": 3.2016244234929188, + "grad_norm": 1.2495461702346802, + "learning_rate": 1.8305782272993712e-05, + "loss": 0.3391, + "step": 19613 + }, + { + "epoch": 3.201787682135423, + "grad_norm": 1.90328848361969, + "learning_rate": 1.8305605545665374e-05, + "loss": 0.5042, + "step": 19614 + }, + { + "epoch": 3.2019509407779276, + "grad_norm": 1.9784331321716309, + "learning_rate": 1.8305428809973297e-05, + "loss": 0.5026, + "step": 19615 + }, + { + "epoch": 3.2021141994204316, + "grad_norm": 2.024881601333618, + "learning_rate": 1.8305252065917653e-05, + "loss": 0.4896, + "step": 19616 + }, + { + "epoch": 3.202277458062936, + "grad_norm": 2.0466530323028564, + "learning_rate": 1.8305075313498624e-05, + "loss": 0.5007, + "step": 19617 + }, + { + "epoch": 3.2024407167054405, + "grad_norm": 1.919000267982483, + "learning_rate": 1.8304898552716384e-05, + "loss": 0.5346, + "step": 19618 + }, + { + "epoch": 3.202603975347945, + "grad_norm": 1.8420442342758179, + "learning_rate": 1.8304721783571116e-05, + "loss": 0.4766, + "step": 19619 + }, + { + "epoch": 3.2027672339904494, + "grad_norm": 1.9834940433502197, + "learning_rate": 1.830454500606299e-05, + "loss": 0.4646, + "step": 19620 + }, + { + "epoch": 3.202930492632954, + "grad_norm": 1.5404096841812134, + "learning_rate": 1.8304368220192198e-05, + "loss": 0.3539, + "step": 19621 + }, + { + "epoch": 3.2030937512754583, + "grad_norm": 1.607448935508728, + "learning_rate": 1.8304191425958905e-05, + "loss": 0.3864, + "step": 19622 + }, + { + "epoch": 3.2032570099179627, + "grad_norm": 1.7967172861099243, + "learning_rate": 1.830401462336329e-05, + "loss": 0.5477, + "step": 19623 + }, + { + "epoch": 3.2034202685604667, + "grad_norm": 2.363156318664551, + "learning_rate": 1.830383781240554e-05, + "loss": 0.5192, + "step": 19624 + }, + { + "epoch": 3.203583527202971, + "grad_norm": 1.5355092287063599, + "learning_rate": 1.8303660993085825e-05, + "loss": 0.4007, + "step": 19625 + }, + { + "epoch": 3.2037467858454756, + "grad_norm": 2.018120765686035, + "learning_rate": 1.8303484165404323e-05, + "loss": 0.5341, + "step": 19626 + }, + { + "epoch": 3.20391004448798, + "grad_norm": 1.8691537380218506, + "learning_rate": 1.8303307329361217e-05, + "loss": 0.4226, + "step": 19627 + }, + { + "epoch": 3.2040733031304844, + "grad_norm": 1.774322271347046, + "learning_rate": 1.8303130484956682e-05, + "loss": 0.496, + "step": 19628 + }, + { + "epoch": 3.204236561772989, + "grad_norm": 1.967867374420166, + "learning_rate": 1.83029536321909e-05, + "loss": 0.4552, + "step": 19629 + }, + { + "epoch": 3.2043998204154933, + "grad_norm": 1.6074422597885132, + "learning_rate": 1.8302776771064044e-05, + "loss": 0.3821, + "step": 19630 + }, + { + "epoch": 3.2045630790579978, + "grad_norm": 1.8645256757736206, + "learning_rate": 1.8302599901576296e-05, + "loss": 0.4787, + "step": 19631 + }, + { + "epoch": 3.204726337700502, + "grad_norm": 1.9750136137008667, + "learning_rate": 1.8302423023727828e-05, + "loss": 0.5011, + "step": 19632 + }, + { + "epoch": 3.2048895963430066, + "grad_norm": 1.8905576467514038, + "learning_rate": 1.8302246137518823e-05, + "loss": 0.5192, + "step": 19633 + }, + { + "epoch": 3.2050528549855106, + "grad_norm": 1.7879860401153564, + "learning_rate": 1.830206924294946e-05, + "loss": 0.3874, + "step": 19634 + }, + { + "epoch": 3.205216113628015, + "grad_norm": 1.8421549797058105, + "learning_rate": 1.8301892340019916e-05, + "loss": 0.5504, + "step": 19635 + }, + { + "epoch": 3.2053793722705195, + "grad_norm": 1.7435662746429443, + "learning_rate": 1.8301715428730367e-05, + "loss": 0.4015, + "step": 19636 + }, + { + "epoch": 3.205542630913024, + "grad_norm": 2.0909814834594727, + "learning_rate": 1.8301538509080992e-05, + "loss": 0.4606, + "step": 19637 + }, + { + "epoch": 3.2057058895555284, + "grad_norm": 2.08389949798584, + "learning_rate": 1.830136158107197e-05, + "loss": 0.4943, + "step": 19638 + }, + { + "epoch": 3.205869148198033, + "grad_norm": 1.5654551982879639, + "learning_rate": 1.830118464470348e-05, + "loss": 0.3988, + "step": 19639 + }, + { + "epoch": 3.2060324068405373, + "grad_norm": 2.013561964035034, + "learning_rate": 1.8301007699975704e-05, + "loss": 0.5066, + "step": 19640 + }, + { + "epoch": 3.2061956654830417, + "grad_norm": 1.8626009225845337, + "learning_rate": 1.8300830746888813e-05, + "loss": 0.4459, + "step": 19641 + }, + { + "epoch": 3.2063589241255457, + "grad_norm": 1.7621209621429443, + "learning_rate": 1.830065378544298e-05, + "loss": 0.4215, + "step": 19642 + }, + { + "epoch": 3.20652218276805, + "grad_norm": 2.053036689758301, + "learning_rate": 1.83004768156384e-05, + "loss": 0.434, + "step": 19643 + }, + { + "epoch": 3.2066854414105546, + "grad_norm": 2.0912325382232666, + "learning_rate": 1.8300299837475236e-05, + "loss": 0.5095, + "step": 19644 + }, + { + "epoch": 3.206848700053059, + "grad_norm": 2.413562059402466, + "learning_rate": 1.8300122850953678e-05, + "loss": 0.5048, + "step": 19645 + }, + { + "epoch": 3.2070119586955634, + "grad_norm": 1.9266762733459473, + "learning_rate": 1.8299945856073896e-05, + "loss": 0.488, + "step": 19646 + }, + { + "epoch": 3.207175217338068, + "grad_norm": 1.9684224128723145, + "learning_rate": 1.8299768852836068e-05, + "loss": 0.4615, + "step": 19647 + }, + { + "epoch": 3.2073384759805723, + "grad_norm": 1.912516713142395, + "learning_rate": 1.8299591841240376e-05, + "loss": 0.4414, + "step": 19648 + }, + { + "epoch": 3.2075017346230768, + "grad_norm": 2.0965850353240967, + "learning_rate": 1.8299414821287e-05, + "loss": 0.4823, + "step": 19649 + }, + { + "epoch": 3.207664993265581, + "grad_norm": 1.710727572441101, + "learning_rate": 1.829923779297611e-05, + "loss": 0.4052, + "step": 19650 + }, + { + "epoch": 3.207828251908085, + "grad_norm": 1.8523346185684204, + "learning_rate": 1.8299060756307895e-05, + "loss": 0.4253, + "step": 19651 + }, + { + "epoch": 3.2079915105505896, + "grad_norm": 2.2285959720611572, + "learning_rate": 1.8298883711282526e-05, + "loss": 0.4865, + "step": 19652 + }, + { + "epoch": 3.208154769193094, + "grad_norm": 1.8871684074401855, + "learning_rate": 1.8298706657900185e-05, + "loss": 0.4901, + "step": 19653 + }, + { + "epoch": 3.2083180278355985, + "grad_norm": 2.412090539932251, + "learning_rate": 1.8298529596161047e-05, + "loss": 0.5558, + "step": 19654 + }, + { + "epoch": 3.208481286478103, + "grad_norm": 2.218461036682129, + "learning_rate": 1.8298352526065292e-05, + "loss": 0.5515, + "step": 19655 + }, + { + "epoch": 3.2086445451206074, + "grad_norm": 1.9370707273483276, + "learning_rate": 1.82981754476131e-05, + "loss": 0.5068, + "step": 19656 + }, + { + "epoch": 3.208807803763112, + "grad_norm": 1.9357099533081055, + "learning_rate": 1.8297998360804646e-05, + "loss": 0.5067, + "step": 19657 + }, + { + "epoch": 3.2089710624056162, + "grad_norm": 2.1195530891418457, + "learning_rate": 1.8297821265640107e-05, + "loss": 0.5169, + "step": 19658 + }, + { + "epoch": 3.2091343210481207, + "grad_norm": 1.760433554649353, + "learning_rate": 1.8297644162119666e-05, + "loss": 0.427, + "step": 19659 + }, + { + "epoch": 3.2092975796906247, + "grad_norm": 1.749983787536621, + "learning_rate": 1.8297467050243503e-05, + "loss": 0.4074, + "step": 19660 + }, + { + "epoch": 3.209460838333129, + "grad_norm": 1.8577622175216675, + "learning_rate": 1.829728993001179e-05, + "loss": 0.486, + "step": 19661 + }, + { + "epoch": 3.2096240969756336, + "grad_norm": 1.8855870962142944, + "learning_rate": 1.829711280142471e-05, + "loss": 0.4913, + "step": 19662 + }, + { + "epoch": 3.209787355618138, + "grad_norm": 2.039752960205078, + "learning_rate": 1.829693566448244e-05, + "loss": 0.4396, + "step": 19663 + }, + { + "epoch": 3.2099506142606424, + "grad_norm": 2.2638161182403564, + "learning_rate": 1.8296758519185154e-05, + "loss": 0.5574, + "step": 19664 + }, + { + "epoch": 3.210113872903147, + "grad_norm": 1.536924123764038, + "learning_rate": 1.8296581365533038e-05, + "loss": 0.358, + "step": 19665 + }, + { + "epoch": 3.2102771315456513, + "grad_norm": 2.4349405765533447, + "learning_rate": 1.8296404203526267e-05, + "loss": 0.6145, + "step": 19666 + }, + { + "epoch": 3.2104403901881557, + "grad_norm": 2.2324492931365967, + "learning_rate": 1.8296227033165016e-05, + "loss": 0.5554, + "step": 19667 + }, + { + "epoch": 3.21060364883066, + "grad_norm": 1.7308926582336426, + "learning_rate": 1.8296049854449466e-05, + "loss": 0.3396, + "step": 19668 + }, + { + "epoch": 3.210766907473164, + "grad_norm": 1.900835394859314, + "learning_rate": 1.82958726673798e-05, + "loss": 0.5244, + "step": 19669 + }, + { + "epoch": 3.2109301661156686, + "grad_norm": 2.0649027824401855, + "learning_rate": 1.829569547195619e-05, + "loss": 0.4192, + "step": 19670 + }, + { + "epoch": 3.211093424758173, + "grad_norm": 2.043198585510254, + "learning_rate": 1.8295518268178817e-05, + "loss": 0.5146, + "step": 19671 + }, + { + "epoch": 3.2112566834006775, + "grad_norm": 1.8824375867843628, + "learning_rate": 1.8295341056047858e-05, + "loss": 0.4837, + "step": 19672 + }, + { + "epoch": 3.211419942043182, + "grad_norm": 1.9548472166061401, + "learning_rate": 1.8295163835563497e-05, + "loss": 0.5435, + "step": 19673 + }, + { + "epoch": 3.2115832006856864, + "grad_norm": 1.9567254781723022, + "learning_rate": 1.8294986606725907e-05, + "loss": 0.5031, + "step": 19674 + }, + { + "epoch": 3.211746459328191, + "grad_norm": 1.617077350616455, + "learning_rate": 1.8294809369535265e-05, + "loss": 0.4347, + "step": 19675 + }, + { + "epoch": 3.2119097179706952, + "grad_norm": 1.6488335132598877, + "learning_rate": 1.8294632123991753e-05, + "loss": 0.4427, + "step": 19676 + }, + { + "epoch": 3.2120729766131992, + "grad_norm": 2.05963397026062, + "learning_rate": 1.8294454870095547e-05, + "loss": 0.489, + "step": 19677 + }, + { + "epoch": 3.2122362352557037, + "grad_norm": 2.299913167953491, + "learning_rate": 1.8294277607846834e-05, + "loss": 0.4816, + "step": 19678 + }, + { + "epoch": 3.212399493898208, + "grad_norm": 1.6785780191421509, + "learning_rate": 1.829410033724578e-05, + "loss": 0.4503, + "step": 19679 + }, + { + "epoch": 3.2125627525407126, + "grad_norm": 2.0155081748962402, + "learning_rate": 1.829392305829257e-05, + "loss": 0.4553, + "step": 19680 + }, + { + "epoch": 3.212726011183217, + "grad_norm": 2.4312777519226074, + "learning_rate": 1.829374577098738e-05, + "loss": 0.5325, + "step": 19681 + }, + { + "epoch": 3.2128892698257214, + "grad_norm": 2.511763095855713, + "learning_rate": 1.8293568475330393e-05, + "loss": 0.6153, + "step": 19682 + }, + { + "epoch": 3.213052528468226, + "grad_norm": 1.8054568767547607, + "learning_rate": 1.8293391171321784e-05, + "loss": 0.4848, + "step": 19683 + }, + { + "epoch": 3.2132157871107303, + "grad_norm": 2.1676671504974365, + "learning_rate": 1.829321385896173e-05, + "loss": 0.5622, + "step": 19684 + }, + { + "epoch": 3.2133790457532347, + "grad_norm": 2.304023265838623, + "learning_rate": 1.8293036538250418e-05, + "loss": 0.4822, + "step": 19685 + }, + { + "epoch": 3.213542304395739, + "grad_norm": 2.3942205905914307, + "learning_rate": 1.8292859209188014e-05, + "loss": 0.4138, + "step": 19686 + }, + { + "epoch": 3.213705563038243, + "grad_norm": 1.6554298400878906, + "learning_rate": 1.8292681871774705e-05, + "loss": 0.4151, + "step": 19687 + }, + { + "epoch": 3.2138688216807476, + "grad_norm": 2.010852336883545, + "learning_rate": 1.8292504526010668e-05, + "loss": 0.4804, + "step": 19688 + }, + { + "epoch": 3.214032080323252, + "grad_norm": 1.8285444974899292, + "learning_rate": 1.8292327171896082e-05, + "loss": 0.4671, + "step": 19689 + }, + { + "epoch": 3.2141953389657565, + "grad_norm": 2.0833446979522705, + "learning_rate": 1.8292149809431123e-05, + "loss": 0.5755, + "step": 19690 + }, + { + "epoch": 3.214358597608261, + "grad_norm": 1.8746492862701416, + "learning_rate": 1.829197243861597e-05, + "loss": 0.4718, + "step": 19691 + }, + { + "epoch": 3.2145218562507654, + "grad_norm": 2.0355634689331055, + "learning_rate": 1.8291795059450806e-05, + "loss": 0.4219, + "step": 19692 + }, + { + "epoch": 3.21468511489327, + "grad_norm": 1.8298709392547607, + "learning_rate": 1.8291617671935807e-05, + "loss": 0.4551, + "step": 19693 + }, + { + "epoch": 3.2148483735357742, + "grad_norm": 2.0532383918762207, + "learning_rate": 1.829144027607115e-05, + "loss": 0.4926, + "step": 19694 + }, + { + "epoch": 3.2150116321782782, + "grad_norm": 2.0638351440429688, + "learning_rate": 1.8291262871857015e-05, + "loss": 0.5067, + "step": 19695 + }, + { + "epoch": 3.2151748908207827, + "grad_norm": 1.7651591300964355, + "learning_rate": 1.829108545929358e-05, + "loss": 0.4423, + "step": 19696 + }, + { + "epoch": 3.215338149463287, + "grad_norm": 1.7539269924163818, + "learning_rate": 1.8290908038381024e-05, + "loss": 0.4264, + "step": 19697 + }, + { + "epoch": 3.2155014081057915, + "grad_norm": 2.208038806915283, + "learning_rate": 1.8290730609119525e-05, + "loss": 0.4573, + "step": 19698 + }, + { + "epoch": 3.215664666748296, + "grad_norm": 2.3940048217773438, + "learning_rate": 1.8290553171509263e-05, + "loss": 0.5779, + "step": 19699 + }, + { + "epoch": 3.2158279253908004, + "grad_norm": 1.7746416330337524, + "learning_rate": 1.8290375725550417e-05, + "loss": 0.4586, + "step": 19700 + }, + { + "epoch": 3.215991184033305, + "grad_norm": 2.1617279052734375, + "learning_rate": 1.8290198271243166e-05, + "loss": 0.5727, + "step": 19701 + }, + { + "epoch": 3.2161544426758093, + "grad_norm": 1.7708792686462402, + "learning_rate": 1.8290020808587688e-05, + "loss": 0.4228, + "step": 19702 + }, + { + "epoch": 3.2163177013183137, + "grad_norm": 2.3087081909179688, + "learning_rate": 1.8289843337584158e-05, + "loss": 0.4836, + "step": 19703 + }, + { + "epoch": 3.2164809599608177, + "grad_norm": 1.9485132694244385, + "learning_rate": 1.828966585823276e-05, + "loss": 0.469, + "step": 19704 + }, + { + "epoch": 3.216644218603322, + "grad_norm": 1.8208705186843872, + "learning_rate": 1.8289488370533667e-05, + "loss": 0.4662, + "step": 19705 + }, + { + "epoch": 3.2168074772458266, + "grad_norm": 1.7626985311508179, + "learning_rate": 1.8289310874487066e-05, + "loss": 0.4755, + "step": 19706 + }, + { + "epoch": 3.216970735888331, + "grad_norm": 1.6838027238845825, + "learning_rate": 1.828913337009313e-05, + "loss": 0.4455, + "step": 19707 + }, + { + "epoch": 3.2171339945308355, + "grad_norm": 1.997738003730774, + "learning_rate": 1.828895585735204e-05, + "loss": 0.5284, + "step": 19708 + }, + { + "epoch": 3.21729725317334, + "grad_norm": 2.015070676803589, + "learning_rate": 1.828877833626397e-05, + "loss": 0.4735, + "step": 19709 + }, + { + "epoch": 3.2174605118158444, + "grad_norm": 2.0451624393463135, + "learning_rate": 1.8288600806829104e-05, + "loss": 0.485, + "step": 19710 + }, + { + "epoch": 3.217623770458349, + "grad_norm": 1.8634099960327148, + "learning_rate": 1.828842326904762e-05, + "loss": 0.397, + "step": 19711 + }, + { + "epoch": 3.217787029100853, + "grad_norm": 1.762561559677124, + "learning_rate": 1.8288245722919695e-05, + "loss": 0.421, + "step": 19712 + }, + { + "epoch": 3.2179502877433572, + "grad_norm": 2.1464874744415283, + "learning_rate": 1.828806816844551e-05, + "loss": 0.4366, + "step": 19713 + }, + { + "epoch": 3.2181135463858617, + "grad_norm": 1.9309561252593994, + "learning_rate": 1.828789060562524e-05, + "loss": 0.4444, + "step": 19714 + }, + { + "epoch": 3.218276805028366, + "grad_norm": 1.9617294073104858, + "learning_rate": 1.8287713034459072e-05, + "loss": 0.44, + "step": 19715 + }, + { + "epoch": 3.2184400636708705, + "grad_norm": 1.8072595596313477, + "learning_rate": 1.8287535454947172e-05, + "loss": 0.4604, + "step": 19716 + }, + { + "epoch": 3.218603322313375, + "grad_norm": 2.129019021987915, + "learning_rate": 1.828735786708973e-05, + "loss": 0.485, + "step": 19717 + }, + { + "epoch": 3.2187665809558794, + "grad_norm": 1.891739010810852, + "learning_rate": 1.8287180270886922e-05, + "loss": 0.4389, + "step": 19718 + }, + { + "epoch": 3.218929839598384, + "grad_norm": 1.8310402631759644, + "learning_rate": 1.8287002666338924e-05, + "loss": 0.4676, + "step": 19719 + }, + { + "epoch": 3.2190930982408883, + "grad_norm": 1.7038880586624146, + "learning_rate": 1.8286825053445916e-05, + "loss": 0.3976, + "step": 19720 + }, + { + "epoch": 3.2192563568833927, + "grad_norm": 1.896985650062561, + "learning_rate": 1.828664743220808e-05, + "loss": 0.435, + "step": 19721 + }, + { + "epoch": 3.2194196155258967, + "grad_norm": 1.618194580078125, + "learning_rate": 1.828646980262559e-05, + "loss": 0.4321, + "step": 19722 + }, + { + "epoch": 3.219582874168401, + "grad_norm": 1.6995024681091309, + "learning_rate": 1.8286292164698624e-05, + "loss": 0.4352, + "step": 19723 + }, + { + "epoch": 3.2197461328109056, + "grad_norm": 1.9158746004104614, + "learning_rate": 1.8286114518427372e-05, + "loss": 0.5333, + "step": 19724 + }, + { + "epoch": 3.21990939145341, + "grad_norm": 1.955507755279541, + "learning_rate": 1.8285936863811998e-05, + "loss": 0.45, + "step": 19725 + }, + { + "epoch": 3.2200726500959145, + "grad_norm": 1.9535053968429565, + "learning_rate": 1.828575920085269e-05, + "loss": 0.4296, + "step": 19726 + }, + { + "epoch": 3.220235908738419, + "grad_norm": 1.7457698583602905, + "learning_rate": 1.8285581529549625e-05, + "loss": 0.4471, + "step": 19727 + }, + { + "epoch": 3.2203991673809234, + "grad_norm": 2.1091747283935547, + "learning_rate": 1.8285403849902977e-05, + "loss": 0.5092, + "step": 19728 + }, + { + "epoch": 3.220562426023428, + "grad_norm": 2.0415871143341064, + "learning_rate": 1.8285226161912937e-05, + "loss": 0.4395, + "step": 19729 + }, + { + "epoch": 3.220725684665932, + "grad_norm": 1.8372162580490112, + "learning_rate": 1.8285048465579672e-05, + "loss": 0.4393, + "step": 19730 + }, + { + "epoch": 3.2208889433084362, + "grad_norm": 1.7880946397781372, + "learning_rate": 1.8284870760903367e-05, + "loss": 0.4314, + "step": 19731 + }, + { + "epoch": 3.2210522019509407, + "grad_norm": 2.4104645252227783, + "learning_rate": 1.8284693047884198e-05, + "loss": 0.5813, + "step": 19732 + }, + { + "epoch": 3.221215460593445, + "grad_norm": 1.838129997253418, + "learning_rate": 1.8284515326522347e-05, + "loss": 0.464, + "step": 19733 + }, + { + "epoch": 3.2213787192359495, + "grad_norm": 1.9246076345443726, + "learning_rate": 1.828433759681799e-05, + "loss": 0.4236, + "step": 19734 + }, + { + "epoch": 3.221541977878454, + "grad_norm": 1.9546054601669312, + "learning_rate": 1.8284159858771307e-05, + "loss": 0.4643, + "step": 19735 + }, + { + "epoch": 3.2217052365209584, + "grad_norm": 1.8680888414382935, + "learning_rate": 1.828398211238248e-05, + "loss": 0.4354, + "step": 19736 + }, + { + "epoch": 3.221868495163463, + "grad_norm": 2.026540994644165, + "learning_rate": 1.8283804357651683e-05, + "loss": 0.4387, + "step": 19737 + }, + { + "epoch": 3.2220317538059673, + "grad_norm": 1.88138747215271, + "learning_rate": 1.8283626594579097e-05, + "loss": 0.5368, + "step": 19738 + }, + { + "epoch": 3.2221950124484713, + "grad_norm": 1.9321210384368896, + "learning_rate": 1.8283448823164904e-05, + "loss": 0.408, + "step": 19739 + }, + { + "epoch": 3.2223582710909757, + "grad_norm": 1.795000672340393, + "learning_rate": 1.8283271043409275e-05, + "loss": 0.4934, + "step": 19740 + }, + { + "epoch": 3.22252152973348, + "grad_norm": 1.9393657445907593, + "learning_rate": 1.82830932553124e-05, + "loss": 0.4698, + "step": 19741 + }, + { + "epoch": 3.2226847883759846, + "grad_norm": 1.8735644817352295, + "learning_rate": 1.8282915458874446e-05, + "loss": 0.5073, + "step": 19742 + }, + { + "epoch": 3.222848047018489, + "grad_norm": 2.1061511039733887, + "learning_rate": 1.82827376540956e-05, + "loss": 0.5575, + "step": 19743 + }, + { + "epoch": 3.2230113056609935, + "grad_norm": 1.9558589458465576, + "learning_rate": 1.8282559840976043e-05, + "loss": 0.4743, + "step": 19744 + }, + { + "epoch": 3.223174564303498, + "grad_norm": 2.253047466278076, + "learning_rate": 1.828238201951595e-05, + "loss": 0.5224, + "step": 19745 + }, + { + "epoch": 3.2233378229460024, + "grad_norm": 2.344491481781006, + "learning_rate": 1.82822041897155e-05, + "loss": 0.5029, + "step": 19746 + }, + { + "epoch": 3.223501081588507, + "grad_norm": 1.7785671949386597, + "learning_rate": 1.828202635157487e-05, + "loss": 0.4723, + "step": 19747 + }, + { + "epoch": 3.223664340231011, + "grad_norm": 1.803494930267334, + "learning_rate": 1.828184850509424e-05, + "loss": 0.4646, + "step": 19748 + }, + { + "epoch": 3.223827598873515, + "grad_norm": 1.9012837409973145, + "learning_rate": 1.8281670650273796e-05, + "loss": 0.4534, + "step": 19749 + }, + { + "epoch": 3.2239908575160197, + "grad_norm": 1.8963944911956787, + "learning_rate": 1.8281492787113707e-05, + "loss": 0.4754, + "step": 19750 + }, + { + "epoch": 3.224154116158524, + "grad_norm": 1.6369264125823975, + "learning_rate": 1.828131491561416e-05, + "loss": 0.3985, + "step": 19751 + }, + { + "epoch": 3.2243173748010285, + "grad_norm": 1.6247692108154297, + "learning_rate": 1.8281137035775332e-05, + "loss": 0.4061, + "step": 19752 + }, + { + "epoch": 3.224480633443533, + "grad_norm": 2.275331735610962, + "learning_rate": 1.82809591475974e-05, + "loss": 0.512, + "step": 19753 + }, + { + "epoch": 3.2246438920860374, + "grad_norm": 1.4983733892440796, + "learning_rate": 1.8280781251080546e-05, + "loss": 0.4203, + "step": 19754 + }, + { + "epoch": 3.224807150728542, + "grad_norm": 1.7650138139724731, + "learning_rate": 1.8280603346224945e-05, + "loss": 0.4613, + "step": 19755 + }, + { + "epoch": 3.2249704093710463, + "grad_norm": 1.9371355772018433, + "learning_rate": 1.8280425433030782e-05, + "loss": 0.4919, + "step": 19756 + }, + { + "epoch": 3.2251336680135503, + "grad_norm": 1.801053762435913, + "learning_rate": 1.8280247511498226e-05, + "loss": 0.4849, + "step": 19757 + }, + { + "epoch": 3.2252969266560547, + "grad_norm": 1.5591658353805542, + "learning_rate": 1.828006958162747e-05, + "loss": 0.3612, + "step": 19758 + }, + { + "epoch": 3.225460185298559, + "grad_norm": 1.814083218574524, + "learning_rate": 1.8279891643418685e-05, + "loss": 0.4261, + "step": 19759 + }, + { + "epoch": 3.2256234439410636, + "grad_norm": 1.860736608505249, + "learning_rate": 1.8279713696872047e-05, + "loss": 0.5, + "step": 19760 + }, + { + "epoch": 3.225786702583568, + "grad_norm": 2.0602238178253174, + "learning_rate": 1.8279535741987745e-05, + "loss": 0.5007, + "step": 19761 + }, + { + "epoch": 3.2259499612260725, + "grad_norm": 2.1616873741149902, + "learning_rate": 1.8279357778765946e-05, + "loss": 0.5197, + "step": 19762 + }, + { + "epoch": 3.226113219868577, + "grad_norm": 1.8931833505630493, + "learning_rate": 1.8279179807206842e-05, + "loss": 0.4347, + "step": 19763 + }, + { + "epoch": 3.2262764785110813, + "grad_norm": 2.472564697265625, + "learning_rate": 1.8279001827310603e-05, + "loss": 0.522, + "step": 19764 + }, + { + "epoch": 3.2264397371535853, + "grad_norm": 2.543645143508911, + "learning_rate": 1.8278823839077412e-05, + "loss": 0.5419, + "step": 19765 + }, + { + "epoch": 3.22660299579609, + "grad_norm": 2.10745906829834, + "learning_rate": 1.8278645842507448e-05, + "loss": 0.5593, + "step": 19766 + }, + { + "epoch": 3.226766254438594, + "grad_norm": 1.9902522563934326, + "learning_rate": 1.827846783760089e-05, + "loss": 0.4386, + "step": 19767 + }, + { + "epoch": 3.2269295130810987, + "grad_norm": 1.6609046459197998, + "learning_rate": 1.8278289824357917e-05, + "loss": 0.4359, + "step": 19768 + }, + { + "epoch": 3.227092771723603, + "grad_norm": 1.940859317779541, + "learning_rate": 1.8278111802778705e-05, + "loss": 0.4726, + "step": 19769 + }, + { + "epoch": 3.2272560303661075, + "grad_norm": 1.933000087738037, + "learning_rate": 1.8277933772863443e-05, + "loss": 0.4862, + "step": 19770 + }, + { + "epoch": 3.227419289008612, + "grad_norm": 1.6849687099456787, + "learning_rate": 1.8277755734612302e-05, + "loss": 0.4133, + "step": 19771 + }, + { + "epoch": 3.2275825476511164, + "grad_norm": 1.6419737339019775, + "learning_rate": 1.827757768802546e-05, + "loss": 0.4522, + "step": 19772 + }, + { + "epoch": 3.227745806293621, + "grad_norm": 1.9152719974517822, + "learning_rate": 1.82773996331031e-05, + "loss": 0.5032, + "step": 19773 + }, + { + "epoch": 3.2279090649361253, + "grad_norm": 2.004554033279419, + "learning_rate": 1.8277221569845402e-05, + "loss": 0.4549, + "step": 19774 + }, + { + "epoch": 3.2280723235786293, + "grad_norm": 2.447281837463379, + "learning_rate": 1.8277043498252544e-05, + "loss": 0.5247, + "step": 19775 + }, + { + "epoch": 3.2282355822211337, + "grad_norm": 2.0307157039642334, + "learning_rate": 1.8276865418324706e-05, + "loss": 0.4292, + "step": 19776 + }, + { + "epoch": 3.228398840863638, + "grad_norm": 2.053356647491455, + "learning_rate": 1.8276687330062067e-05, + "loss": 0.4738, + "step": 19777 + }, + { + "epoch": 3.2285620995061426, + "grad_norm": 2.159471035003662, + "learning_rate": 1.8276509233464806e-05, + "loss": 0.5348, + "step": 19778 + }, + { + "epoch": 3.228725358148647, + "grad_norm": 2.178358793258667, + "learning_rate": 1.82763311285331e-05, + "loss": 0.5651, + "step": 19779 + }, + { + "epoch": 3.2288886167911515, + "grad_norm": 1.943186640739441, + "learning_rate": 1.8276153015267133e-05, + "loss": 0.4359, + "step": 19780 + }, + { + "epoch": 3.229051875433656, + "grad_norm": 2.030911445617676, + "learning_rate": 1.827597489366708e-05, + "loss": 0.5649, + "step": 19781 + }, + { + "epoch": 3.2292151340761603, + "grad_norm": 1.8532209396362305, + "learning_rate": 1.8275796763733126e-05, + "loss": 0.4661, + "step": 19782 + }, + { + "epoch": 3.2293783927186643, + "grad_norm": 1.8343489170074463, + "learning_rate": 1.8275618625465443e-05, + "loss": 0.3931, + "step": 19783 + }, + { + "epoch": 3.2295416513611688, + "grad_norm": 1.9170185327529907, + "learning_rate": 1.8275440478864216e-05, + "loss": 0.4613, + "step": 19784 + }, + { + "epoch": 3.229704910003673, + "grad_norm": 1.723833680152893, + "learning_rate": 1.8275262323929625e-05, + "loss": 0.4254, + "step": 19785 + }, + { + "epoch": 3.2298681686461777, + "grad_norm": 2.1768293380737305, + "learning_rate": 1.8275084160661842e-05, + "loss": 0.487, + "step": 19786 + }, + { + "epoch": 3.230031427288682, + "grad_norm": 2.0834195613861084, + "learning_rate": 1.8274905989061057e-05, + "loss": 0.4766, + "step": 19787 + }, + { + "epoch": 3.2301946859311865, + "grad_norm": 2.173133373260498, + "learning_rate": 1.827472780912744e-05, + "loss": 0.5044, + "step": 19788 + }, + { + "epoch": 3.230357944573691, + "grad_norm": 1.9893742799758911, + "learning_rate": 1.8274549620861174e-05, + "loss": 0.4798, + "step": 19789 + }, + { + "epoch": 3.2305212032161954, + "grad_norm": 1.9256478548049927, + "learning_rate": 1.8274371424262442e-05, + "loss": 0.4324, + "step": 19790 + }, + { + "epoch": 3.2306844618587, + "grad_norm": 1.7570847272872925, + "learning_rate": 1.827419321933142e-05, + "loss": 0.4447, + "step": 19791 + }, + { + "epoch": 3.230847720501204, + "grad_norm": 1.7684173583984375, + "learning_rate": 1.8274015006068282e-05, + "loss": 0.4614, + "step": 19792 + }, + { + "epoch": 3.2310109791437083, + "grad_norm": 2.1403696537017822, + "learning_rate": 1.8273836784473218e-05, + "loss": 0.545, + "step": 19793 + }, + { + "epoch": 3.2311742377862127, + "grad_norm": 2.174215316772461, + "learning_rate": 1.8273658554546402e-05, + "loss": 0.4667, + "step": 19794 + }, + { + "epoch": 3.231337496428717, + "grad_norm": 2.199486494064331, + "learning_rate": 1.8273480316288014e-05, + "loss": 0.5154, + "step": 19795 + }, + { + "epoch": 3.2315007550712216, + "grad_norm": 1.962664008140564, + "learning_rate": 1.827330206969823e-05, + "loss": 0.4646, + "step": 19796 + }, + { + "epoch": 3.231664013713726, + "grad_norm": 2.048784017562866, + "learning_rate": 1.8273123814777237e-05, + "loss": 0.4839, + "step": 19797 + }, + { + "epoch": 3.2318272723562305, + "grad_norm": 1.8029698133468628, + "learning_rate": 1.827294555152521e-05, + "loss": 0.4318, + "step": 19798 + }, + { + "epoch": 3.231990530998735, + "grad_norm": 2.4245545864105225, + "learning_rate": 1.827276727994233e-05, + "loss": 0.4895, + "step": 19799 + }, + { + "epoch": 3.2321537896412393, + "grad_norm": 1.9175947904586792, + "learning_rate": 1.8272589000028774e-05, + "loss": 0.5155, + "step": 19800 + }, + { + "epoch": 3.2323170482837433, + "grad_norm": 1.9413102865219116, + "learning_rate": 1.8272410711784722e-05, + "loss": 0.4837, + "step": 19801 + }, + { + "epoch": 3.2324803069262478, + "grad_norm": 1.9432035684585571, + "learning_rate": 1.8272232415210355e-05, + "loss": 0.4564, + "step": 19802 + }, + { + "epoch": 3.232643565568752, + "grad_norm": 1.8126466274261475, + "learning_rate": 1.8272054110305853e-05, + "loss": 0.4578, + "step": 19803 + }, + { + "epoch": 3.2328068242112566, + "grad_norm": 2.241772174835205, + "learning_rate": 1.8271875797071395e-05, + "loss": 0.5211, + "step": 19804 + }, + { + "epoch": 3.232970082853761, + "grad_norm": 1.578998327255249, + "learning_rate": 1.827169747550716e-05, + "loss": 0.4433, + "step": 19805 + }, + { + "epoch": 3.2331333414962655, + "grad_norm": 1.9582678079605103, + "learning_rate": 1.8271519145613327e-05, + "loss": 0.4584, + "step": 19806 + }, + { + "epoch": 3.23329660013877, + "grad_norm": 1.9424922466278076, + "learning_rate": 1.827134080739008e-05, + "loss": 0.4347, + "step": 19807 + }, + { + "epoch": 3.2334598587812744, + "grad_norm": 1.647964596748352, + "learning_rate": 1.827116246083759e-05, + "loss": 0.4712, + "step": 19808 + }, + { + "epoch": 3.233623117423779, + "grad_norm": 2.2228574752807617, + "learning_rate": 1.8270984105956044e-05, + "loss": 0.4632, + "step": 19809 + }, + { + "epoch": 3.233786376066283, + "grad_norm": 1.8270292282104492, + "learning_rate": 1.827080574274562e-05, + "loss": 0.4273, + "step": 19810 + }, + { + "epoch": 3.2339496347087873, + "grad_norm": 1.8364341259002686, + "learning_rate": 1.8270627371206495e-05, + "loss": 0.4569, + "step": 19811 + }, + { + "epoch": 3.2341128933512917, + "grad_norm": 2.4708588123321533, + "learning_rate": 1.8270448991338852e-05, + "loss": 0.5157, + "step": 19812 + }, + { + "epoch": 3.234276151993796, + "grad_norm": 1.5743780136108398, + "learning_rate": 1.827027060314287e-05, + "loss": 0.4087, + "step": 19813 + }, + { + "epoch": 3.2344394106363006, + "grad_norm": 1.8878320455551147, + "learning_rate": 1.8270092206618723e-05, + "loss": 0.3923, + "step": 19814 + }, + { + "epoch": 3.234602669278805, + "grad_norm": 1.8849319219589233, + "learning_rate": 1.82699138017666e-05, + "loss": 0.4907, + "step": 19815 + }, + { + "epoch": 3.2347659279213095, + "grad_norm": 2.24770188331604, + "learning_rate": 1.8269735388586673e-05, + "loss": 0.49, + "step": 19816 + }, + { + "epoch": 3.234929186563814, + "grad_norm": 2.090771198272705, + "learning_rate": 1.8269556967079127e-05, + "loss": 0.5029, + "step": 19817 + }, + { + "epoch": 3.235092445206318, + "grad_norm": 1.8641576766967773, + "learning_rate": 1.8269378537244136e-05, + "loss": 0.478, + "step": 19818 + }, + { + "epoch": 3.2352557038488223, + "grad_norm": 1.945031762123108, + "learning_rate": 1.8269200099081887e-05, + "loss": 0.4407, + "step": 19819 + }, + { + "epoch": 3.2354189624913268, + "grad_norm": 2.044161081314087, + "learning_rate": 1.8269021652592555e-05, + "loss": 0.6832, + "step": 19820 + }, + { + "epoch": 3.235582221133831, + "grad_norm": 1.9448083639144897, + "learning_rate": 1.826884319777632e-05, + "loss": 0.4996, + "step": 19821 + }, + { + "epoch": 3.2357454797763356, + "grad_norm": 2.087594509124756, + "learning_rate": 1.826866473463336e-05, + "loss": 0.535, + "step": 19822 + }, + { + "epoch": 3.23590873841884, + "grad_norm": 2.220273494720459, + "learning_rate": 1.8268486263163858e-05, + "loss": 0.542, + "step": 19823 + }, + { + "epoch": 3.2360719970613445, + "grad_norm": 1.8438373804092407, + "learning_rate": 1.8268307783367994e-05, + "loss": 0.3982, + "step": 19824 + }, + { + "epoch": 3.236235255703849, + "grad_norm": 2.036590337753296, + "learning_rate": 1.8268129295245946e-05, + "loss": 0.4104, + "step": 19825 + }, + { + "epoch": 3.2363985143463534, + "grad_norm": 1.8373087644577026, + "learning_rate": 1.8267950798797892e-05, + "loss": 0.4012, + "step": 19826 + }, + { + "epoch": 3.236561772988858, + "grad_norm": 1.9465183019638062, + "learning_rate": 1.8267772294024016e-05, + "loss": 0.4923, + "step": 19827 + }, + { + "epoch": 3.236725031631362, + "grad_norm": 1.9365758895874023, + "learning_rate": 1.826759378092449e-05, + "loss": 0.4182, + "step": 19828 + }, + { + "epoch": 3.2368882902738663, + "grad_norm": 2.1688482761383057, + "learning_rate": 1.8267415259499506e-05, + "loss": 0.5098, + "step": 19829 + }, + { + "epoch": 3.2370515489163707, + "grad_norm": 2.095107078552246, + "learning_rate": 1.8267236729749234e-05, + "loss": 0.4722, + "step": 19830 + }, + { + "epoch": 3.237214807558875, + "grad_norm": 2.0042600631713867, + "learning_rate": 1.8267058191673858e-05, + "loss": 0.4456, + "step": 19831 + }, + { + "epoch": 3.2373780662013796, + "grad_norm": 2.2536723613739014, + "learning_rate": 1.8266879645273557e-05, + "loss": 0.5944, + "step": 19832 + }, + { + "epoch": 3.237541324843884, + "grad_norm": 1.790701985359192, + "learning_rate": 1.826670109054851e-05, + "loss": 0.4556, + "step": 19833 + }, + { + "epoch": 3.2377045834863885, + "grad_norm": 2.3091604709625244, + "learning_rate": 1.8266522527498894e-05, + "loss": 0.5134, + "step": 19834 + }, + { + "epoch": 3.237867842128893, + "grad_norm": 1.577185034751892, + "learning_rate": 1.8266343956124895e-05, + "loss": 0.4239, + "step": 19835 + }, + { + "epoch": 3.238031100771397, + "grad_norm": 2.1533334255218506, + "learning_rate": 1.826616537642669e-05, + "loss": 0.513, + "step": 19836 + }, + { + "epoch": 3.2381943594139013, + "grad_norm": 1.6994380950927734, + "learning_rate": 1.826598678840446e-05, + "loss": 0.4088, + "step": 19837 + }, + { + "epoch": 3.2383576180564058, + "grad_norm": 2.283864974975586, + "learning_rate": 1.8265808192058377e-05, + "loss": 0.5423, + "step": 19838 + }, + { + "epoch": 3.23852087669891, + "grad_norm": 2.1102755069732666, + "learning_rate": 1.8265629587388634e-05, + "loss": 0.4807, + "step": 19839 + }, + { + "epoch": 3.2386841353414146, + "grad_norm": 2.660179615020752, + "learning_rate": 1.8265450974395403e-05, + "loss": 1.0319, + "step": 19840 + }, + { + "epoch": 3.238847393983919, + "grad_norm": 1.9428311586380005, + "learning_rate": 1.8265272353078863e-05, + "loss": 0.4534, + "step": 19841 + }, + { + "epoch": 3.2390106526264235, + "grad_norm": 2.2522826194763184, + "learning_rate": 1.82650937234392e-05, + "loss": 0.4638, + "step": 19842 + }, + { + "epoch": 3.239173911268928, + "grad_norm": 2.07511305809021, + "learning_rate": 1.8264915085476585e-05, + "loss": 0.5067, + "step": 19843 + }, + { + "epoch": 3.2393371699114324, + "grad_norm": 2.4052040576934814, + "learning_rate": 1.8264736439191205e-05, + "loss": 0.5538, + "step": 19844 + }, + { + "epoch": 3.2395004285539364, + "grad_norm": 1.618126630783081, + "learning_rate": 1.8264557784583234e-05, + "loss": 0.4054, + "step": 19845 + }, + { + "epoch": 3.239663687196441, + "grad_norm": 2.291670799255371, + "learning_rate": 1.826437912165286e-05, + "loss": 0.5994, + "step": 19846 + }, + { + "epoch": 3.2398269458389453, + "grad_norm": 1.9798251390457153, + "learning_rate": 1.8264200450400256e-05, + "loss": 0.4906, + "step": 19847 + }, + { + "epoch": 3.2399902044814497, + "grad_norm": 2.067878007888794, + "learning_rate": 1.8264021770825607e-05, + "loss": 0.4825, + "step": 19848 + }, + { + "epoch": 3.240153463123954, + "grad_norm": 1.6447994709014893, + "learning_rate": 1.826384308292909e-05, + "loss": 0.4228, + "step": 19849 + }, + { + "epoch": 3.2403167217664586, + "grad_norm": 2.0298912525177, + "learning_rate": 1.826366438671088e-05, + "loss": 0.4957, + "step": 19850 + }, + { + "epoch": 3.240479980408963, + "grad_norm": 1.8682385683059692, + "learning_rate": 1.826348568217117e-05, + "loss": 0.4159, + "step": 19851 + }, + { + "epoch": 3.2406432390514675, + "grad_norm": 1.8607889413833618, + "learning_rate": 1.8263306969310127e-05, + "loss": 0.4373, + "step": 19852 + }, + { + "epoch": 3.2408064976939714, + "grad_norm": 2.428715944290161, + "learning_rate": 1.826312824812794e-05, + "loss": 0.5251, + "step": 19853 + }, + { + "epoch": 3.240969756336476, + "grad_norm": 1.9911134243011475, + "learning_rate": 1.826294951862478e-05, + "loss": 0.4945, + "step": 19854 + }, + { + "epoch": 3.2411330149789803, + "grad_norm": 1.93474280834198, + "learning_rate": 1.8262770780800834e-05, + "loss": 0.5251, + "step": 19855 + }, + { + "epoch": 3.2412962736214848, + "grad_norm": 1.7007322311401367, + "learning_rate": 1.826259203465628e-05, + "loss": 0.4015, + "step": 19856 + }, + { + "epoch": 3.241459532263989, + "grad_norm": 2.049635648727417, + "learning_rate": 1.8262413280191298e-05, + "loss": 0.4607, + "step": 19857 + }, + { + "epoch": 3.2416227909064936, + "grad_norm": 1.842278003692627, + "learning_rate": 1.826223451740607e-05, + "loss": 0.4339, + "step": 19858 + }, + { + "epoch": 3.241786049548998, + "grad_norm": 1.9717143774032593, + "learning_rate": 1.8262055746300773e-05, + "loss": 0.4726, + "step": 19859 + }, + { + "epoch": 3.2419493081915025, + "grad_norm": 2.410515785217285, + "learning_rate": 1.8261876966875583e-05, + "loss": 0.4734, + "step": 19860 + }, + { + "epoch": 3.242112566834007, + "grad_norm": 1.966588020324707, + "learning_rate": 1.826169817913069e-05, + "loss": 0.489, + "step": 19861 + }, + { + "epoch": 3.2422758254765114, + "grad_norm": 2.257474660873413, + "learning_rate": 1.826151938306627e-05, + "loss": 0.5982, + "step": 19862 + }, + { + "epoch": 3.2424390841190154, + "grad_norm": 1.7792490720748901, + "learning_rate": 1.82613405786825e-05, + "loss": 0.4417, + "step": 19863 + }, + { + "epoch": 3.24260234276152, + "grad_norm": 1.6467094421386719, + "learning_rate": 1.8261161765979566e-05, + "loss": 0.4375, + "step": 19864 + }, + { + "epoch": 3.2427656014040243, + "grad_norm": 1.9282317161560059, + "learning_rate": 1.8260982944957638e-05, + "loss": 0.4755, + "step": 19865 + }, + { + "epoch": 3.2429288600465287, + "grad_norm": 1.580509901046753, + "learning_rate": 1.8260804115616908e-05, + "loss": 0.3793, + "step": 19866 + }, + { + "epoch": 3.243092118689033, + "grad_norm": 1.7956122159957886, + "learning_rate": 1.826062527795755e-05, + "loss": 0.4862, + "step": 19867 + }, + { + "epoch": 3.2432553773315376, + "grad_norm": 2.032193183898926, + "learning_rate": 1.826044643197974e-05, + "loss": 0.5103, + "step": 19868 + }, + { + "epoch": 3.243418635974042, + "grad_norm": 2.2485523223876953, + "learning_rate": 1.8260267577683665e-05, + "loss": 0.5315, + "step": 19869 + }, + { + "epoch": 3.2435818946165464, + "grad_norm": 2.20963454246521, + "learning_rate": 1.8260088715069506e-05, + "loss": 0.5091, + "step": 19870 + }, + { + "epoch": 3.2437451532590504, + "grad_norm": 1.8808228969573975, + "learning_rate": 1.8259909844137435e-05, + "loss": 0.4095, + "step": 19871 + }, + { + "epoch": 3.243908411901555, + "grad_norm": 2.3236608505249023, + "learning_rate": 1.825973096488764e-05, + "loss": 0.5279, + "step": 19872 + }, + { + "epoch": 3.2440716705440593, + "grad_norm": 1.8761402368545532, + "learning_rate": 1.8259552077320297e-05, + "loss": 0.4186, + "step": 19873 + }, + { + "epoch": 3.2442349291865638, + "grad_norm": 1.853437066078186, + "learning_rate": 1.825937318143559e-05, + "loss": 0.395, + "step": 19874 + }, + { + "epoch": 3.244398187829068, + "grad_norm": 1.710277795791626, + "learning_rate": 1.825919427723369e-05, + "loss": 0.4367, + "step": 19875 + }, + { + "epoch": 3.2445614464715726, + "grad_norm": 2.3811535835266113, + "learning_rate": 1.8259015364714786e-05, + "loss": 0.456, + "step": 19876 + }, + { + "epoch": 3.244724705114077, + "grad_norm": 2.2033474445343018, + "learning_rate": 1.825883644387906e-05, + "loss": 0.468, + "step": 19877 + }, + { + "epoch": 3.2448879637565815, + "grad_norm": 1.9465454816818237, + "learning_rate": 1.8258657514726683e-05, + "loss": 0.4034, + "step": 19878 + }, + { + "epoch": 3.245051222399086, + "grad_norm": 2.0051772594451904, + "learning_rate": 1.8258478577257844e-05, + "loss": 0.4571, + "step": 19879 + }, + { + "epoch": 3.24521448104159, + "grad_norm": 2.2070631980895996, + "learning_rate": 1.8258299631472717e-05, + "loss": 0.5373, + "step": 19880 + }, + { + "epoch": 3.2453777396840944, + "grad_norm": 2.254781484603882, + "learning_rate": 1.8258120677371484e-05, + "loss": 0.5365, + "step": 19881 + }, + { + "epoch": 3.245540998326599, + "grad_norm": 2.1196467876434326, + "learning_rate": 1.8257941714954326e-05, + "loss": 0.4777, + "step": 19882 + }, + { + "epoch": 3.2457042569691033, + "grad_norm": 1.788446307182312, + "learning_rate": 1.8257762744221422e-05, + "loss": 0.5052, + "step": 19883 + }, + { + "epoch": 3.2458675156116077, + "grad_norm": 2.2347207069396973, + "learning_rate": 1.8257583765172955e-05, + "loss": 0.5096, + "step": 19884 + }, + { + "epoch": 3.246030774254112, + "grad_norm": 2.4230611324310303, + "learning_rate": 1.8257404777809102e-05, + "loss": 0.5986, + "step": 19885 + }, + { + "epoch": 3.2461940328966166, + "grad_norm": 2.1515071392059326, + "learning_rate": 1.8257225782130044e-05, + "loss": 0.595, + "step": 19886 + }, + { + "epoch": 3.246357291539121, + "grad_norm": 1.7397183179855347, + "learning_rate": 1.8257046778135966e-05, + "loss": 0.4102, + "step": 19887 + }, + { + "epoch": 3.2465205501816254, + "grad_norm": 1.9876456260681152, + "learning_rate": 1.825686776582704e-05, + "loss": 0.4887, + "step": 19888 + }, + { + "epoch": 3.2466838088241294, + "grad_norm": 1.9642671346664429, + "learning_rate": 1.825668874520345e-05, + "loss": 0.4882, + "step": 19889 + }, + { + "epoch": 3.246847067466634, + "grad_norm": 1.9460172653198242, + "learning_rate": 1.825650971626538e-05, + "loss": 0.5189, + "step": 19890 + }, + { + "epoch": 3.2470103261091383, + "grad_norm": 2.143934726715088, + "learning_rate": 1.8256330679013007e-05, + "loss": 0.5193, + "step": 19891 + }, + { + "epoch": 3.2471735847516427, + "grad_norm": 1.954857587814331, + "learning_rate": 1.8256151633446507e-05, + "loss": 0.5027, + "step": 19892 + }, + { + "epoch": 3.247336843394147, + "grad_norm": 2.215578317642212, + "learning_rate": 1.8255972579566064e-05, + "loss": 0.6088, + "step": 19893 + }, + { + "epoch": 3.2475001020366516, + "grad_norm": 1.8269784450531006, + "learning_rate": 1.8255793517371864e-05, + "loss": 0.4656, + "step": 19894 + }, + { + "epoch": 3.247663360679156, + "grad_norm": 1.7076570987701416, + "learning_rate": 1.825561444686408e-05, + "loss": 0.3969, + "step": 19895 + }, + { + "epoch": 3.2478266193216605, + "grad_norm": 1.8521252870559692, + "learning_rate": 1.8255435368042894e-05, + "loss": 0.4231, + "step": 19896 + }, + { + "epoch": 3.247989877964165, + "grad_norm": 2.2111520767211914, + "learning_rate": 1.8255256280908486e-05, + "loss": 0.5532, + "step": 19897 + }, + { + "epoch": 3.248153136606669, + "grad_norm": 2.2424046993255615, + "learning_rate": 1.825507718546104e-05, + "loss": 0.5281, + "step": 19898 + }, + { + "epoch": 3.2483163952491734, + "grad_norm": 2.001190662384033, + "learning_rate": 1.825489808170073e-05, + "loss": 0.5122, + "step": 19899 + }, + { + "epoch": 3.248479653891678, + "grad_norm": 2.4249374866485596, + "learning_rate": 1.825471896962774e-05, + "loss": 0.5946, + "step": 19900 + }, + { + "epoch": 3.2486429125341822, + "grad_norm": 2.4750750064849854, + "learning_rate": 1.8254539849242253e-05, + "loss": 0.4907, + "step": 19901 + }, + { + "epoch": 3.2488061711766867, + "grad_norm": 2.0469672679901123, + "learning_rate": 1.8254360720544446e-05, + "loss": 0.4478, + "step": 19902 + }, + { + "epoch": 3.248969429819191, + "grad_norm": 2.4646084308624268, + "learning_rate": 1.8254181583534496e-05, + "loss": 0.582, + "step": 19903 + }, + { + "epoch": 3.2491326884616956, + "grad_norm": 1.9825248718261719, + "learning_rate": 1.825400243821259e-05, + "loss": 0.4667, + "step": 19904 + }, + { + "epoch": 3.2492959471042, + "grad_norm": 1.8104220628738403, + "learning_rate": 1.8253823284578907e-05, + "loss": 0.4988, + "step": 19905 + }, + { + "epoch": 3.249459205746704, + "grad_norm": 1.9910534620285034, + "learning_rate": 1.8253644122633628e-05, + "loss": 0.5572, + "step": 19906 + }, + { + "epoch": 3.2496224643892084, + "grad_norm": 2.154738187789917, + "learning_rate": 1.8253464952376926e-05, + "loss": 0.4838, + "step": 19907 + }, + { + "epoch": 3.249785723031713, + "grad_norm": 1.7532366514205933, + "learning_rate": 1.825328577380899e-05, + "loss": 0.4567, + "step": 19908 + }, + { + "epoch": 3.2499489816742173, + "grad_norm": 1.7065171003341675, + "learning_rate": 1.825310658693e-05, + "loss": 0.4682, + "step": 19909 + }, + { + "epoch": 3.2501122403167217, + "grad_norm": 1.6093268394470215, + "learning_rate": 1.825292739174013e-05, + "loss": 0.3838, + "step": 19910 + }, + { + "epoch": 3.250275498959226, + "grad_norm": 1.8842140436172485, + "learning_rate": 1.8252748188239564e-05, + "loss": 0.4494, + "step": 19911 + }, + { + "epoch": 3.2504387576017306, + "grad_norm": 1.9161442518234253, + "learning_rate": 1.8252568976428483e-05, + "loss": 0.4628, + "step": 19912 + }, + { + "epoch": 3.250602016244235, + "grad_norm": 1.6061969995498657, + "learning_rate": 1.825238975630707e-05, + "loss": 0.4305, + "step": 19913 + }, + { + "epoch": 3.2507652748867395, + "grad_norm": 2.123081922531128, + "learning_rate": 1.8252210527875502e-05, + "loss": 0.425, + "step": 19914 + }, + { + "epoch": 3.250928533529244, + "grad_norm": 1.9789525270462036, + "learning_rate": 1.8252031291133957e-05, + "loss": 0.4288, + "step": 19915 + }, + { + "epoch": 3.251091792171748, + "grad_norm": 2.2017998695373535, + "learning_rate": 1.825185204608262e-05, + "loss": 0.5126, + "step": 19916 + }, + { + "epoch": 3.2512550508142524, + "grad_norm": 2.177875280380249, + "learning_rate": 1.8251672792721673e-05, + "loss": 0.5366, + "step": 19917 + }, + { + "epoch": 3.251418309456757, + "grad_norm": 1.6123872995376587, + "learning_rate": 1.825149353105129e-05, + "loss": 0.4745, + "step": 19918 + }, + { + "epoch": 3.2515815680992612, + "grad_norm": 1.906653881072998, + "learning_rate": 1.825131426107166e-05, + "loss": 0.4893, + "step": 19919 + }, + { + "epoch": 3.2517448267417657, + "grad_norm": 1.8109337091445923, + "learning_rate": 1.8251134982782952e-05, + "loss": 0.4292, + "step": 19920 + }, + { + "epoch": 3.25190808538427, + "grad_norm": 1.6222740411758423, + "learning_rate": 1.8250955696185357e-05, + "loss": 0.3997, + "step": 19921 + }, + { + "epoch": 3.2520713440267746, + "grad_norm": 1.8411736488342285, + "learning_rate": 1.8250776401279053e-05, + "loss": 0.464, + "step": 19922 + }, + { + "epoch": 3.252234602669279, + "grad_norm": 1.9030483961105347, + "learning_rate": 1.825059709806422e-05, + "loss": 0.4374, + "step": 19923 + }, + { + "epoch": 3.252397861311783, + "grad_norm": 1.9095194339752197, + "learning_rate": 1.8250417786541033e-05, + "loss": 0.4325, + "step": 19924 + }, + { + "epoch": 3.2525611199542874, + "grad_norm": 2.2829782962799072, + "learning_rate": 1.825023846670968e-05, + "loss": 0.5155, + "step": 19925 + }, + { + "epoch": 3.252724378596792, + "grad_norm": 1.6985522508621216, + "learning_rate": 1.8250059138570343e-05, + "loss": 0.4037, + "step": 19926 + }, + { + "epoch": 3.2528876372392963, + "grad_norm": 2.064514398574829, + "learning_rate": 1.824987980212319e-05, + "loss": 0.4679, + "step": 19927 + }, + { + "epoch": 3.2530508958818007, + "grad_norm": 2.045318126678467, + "learning_rate": 1.824970045736842e-05, + "loss": 0.4896, + "step": 19928 + }, + { + "epoch": 3.253214154524305, + "grad_norm": 1.8595269918441772, + "learning_rate": 1.8249521104306195e-05, + "loss": 0.4482, + "step": 19929 + }, + { + "epoch": 3.2533774131668096, + "grad_norm": 1.607696533203125, + "learning_rate": 1.824934174293671e-05, + "loss": 0.3971, + "step": 19930 + }, + { + "epoch": 3.253540671809314, + "grad_norm": 2.0045278072357178, + "learning_rate": 1.824916237326014e-05, + "loss": 0.5443, + "step": 19931 + }, + { + "epoch": 3.2537039304518185, + "grad_norm": 2.2996957302093506, + "learning_rate": 1.8248982995276664e-05, + "loss": 0.5439, + "step": 19932 + }, + { + "epoch": 3.2538671890943225, + "grad_norm": 2.0599286556243896, + "learning_rate": 1.8248803608986466e-05, + "loss": 0.4869, + "step": 19933 + }, + { + "epoch": 3.254030447736827, + "grad_norm": 2.1422319412231445, + "learning_rate": 1.8248624214389723e-05, + "loss": 0.5945, + "step": 19934 + }, + { + "epoch": 3.2541937063793314, + "grad_norm": 2.062629461288452, + "learning_rate": 1.8248444811486617e-05, + "loss": 0.4615, + "step": 19935 + }, + { + "epoch": 3.254356965021836, + "grad_norm": 2.198063611984253, + "learning_rate": 1.8248265400277332e-05, + "loss": 0.4936, + "step": 19936 + }, + { + "epoch": 3.2545202236643402, + "grad_norm": 1.900908350944519, + "learning_rate": 1.8248085980762043e-05, + "loss": 0.441, + "step": 19937 + }, + { + "epoch": 3.2546834823068447, + "grad_norm": 1.4588347673416138, + "learning_rate": 1.8247906552940934e-05, + "loss": 0.3876, + "step": 19938 + }, + { + "epoch": 3.254846740949349, + "grad_norm": 1.8382700681686401, + "learning_rate": 1.8247727116814187e-05, + "loss": 0.3977, + "step": 19939 + }, + { + "epoch": 3.2550099995918536, + "grad_norm": 1.8644049167633057, + "learning_rate": 1.824754767238198e-05, + "loss": 0.469, + "step": 19940 + }, + { + "epoch": 3.2551732582343575, + "grad_norm": 2.10211443901062, + "learning_rate": 1.8247368219644496e-05, + "loss": 0.5081, + "step": 19941 + }, + { + "epoch": 3.255336516876862, + "grad_norm": 1.7339688539505005, + "learning_rate": 1.8247188758601912e-05, + "loss": 0.4078, + "step": 19942 + }, + { + "epoch": 3.2554997755193664, + "grad_norm": 2.2604892253875732, + "learning_rate": 1.8247009289254414e-05, + "loss": 0.5566, + "step": 19943 + }, + { + "epoch": 3.255663034161871, + "grad_norm": 2.323786497116089, + "learning_rate": 1.8246829811602178e-05, + "loss": 0.4902, + "step": 19944 + }, + { + "epoch": 3.2558262928043753, + "grad_norm": 1.7504377365112305, + "learning_rate": 1.8246650325645387e-05, + "loss": 0.3724, + "step": 19945 + }, + { + "epoch": 3.2559895514468797, + "grad_norm": 1.8784488439559937, + "learning_rate": 1.824647083138422e-05, + "loss": 0.4923, + "step": 19946 + }, + { + "epoch": 3.256152810089384, + "grad_norm": 1.6818276643753052, + "learning_rate": 1.824629132881886e-05, + "loss": 0.3771, + "step": 19947 + }, + { + "epoch": 3.2563160687318886, + "grad_norm": 1.8028771877288818, + "learning_rate": 1.8246111817949486e-05, + "loss": 0.5109, + "step": 19948 + }, + { + "epoch": 3.256479327374393, + "grad_norm": 1.9828357696533203, + "learning_rate": 1.8245932298776285e-05, + "loss": 0.4523, + "step": 19949 + }, + { + "epoch": 3.2566425860168975, + "grad_norm": 1.7806031703948975, + "learning_rate": 1.8245752771299426e-05, + "loss": 0.4605, + "step": 19950 + }, + { + "epoch": 3.2568058446594015, + "grad_norm": 1.8306434154510498, + "learning_rate": 1.82455732355191e-05, + "loss": 0.485, + "step": 19951 + }, + { + "epoch": 3.256969103301906, + "grad_norm": 1.9412750005722046, + "learning_rate": 1.824539369143548e-05, + "loss": 0.5575, + "step": 19952 + }, + { + "epoch": 3.2571323619444104, + "grad_norm": 1.983553409576416, + "learning_rate": 1.8245214139048753e-05, + "loss": 0.4847, + "step": 19953 + }, + { + "epoch": 3.257295620586915, + "grad_norm": 1.787693738937378, + "learning_rate": 1.82450345783591e-05, + "loss": 0.4329, + "step": 19954 + }, + { + "epoch": 3.2574588792294192, + "grad_norm": 1.972496747970581, + "learning_rate": 1.8244855009366693e-05, + "loss": 0.4478, + "step": 19955 + }, + { + "epoch": 3.2576221378719237, + "grad_norm": 1.941447138786316, + "learning_rate": 1.8244675432071723e-05, + "loss": 0.4764, + "step": 19956 + }, + { + "epoch": 3.257785396514428, + "grad_norm": 1.8226481676101685, + "learning_rate": 1.8244495846474367e-05, + "loss": 0.4241, + "step": 19957 + }, + { + "epoch": 3.2579486551569325, + "grad_norm": 2.2057716846466064, + "learning_rate": 1.8244316252574808e-05, + "loss": 0.4621, + "step": 19958 + }, + { + "epoch": 3.2581119137994365, + "grad_norm": 1.572947382926941, + "learning_rate": 1.824413665037322e-05, + "loss": 0.4317, + "step": 19959 + }, + { + "epoch": 3.258275172441941, + "grad_norm": 1.9553881883621216, + "learning_rate": 1.824395703986979e-05, + "loss": 0.5114, + "step": 19960 + }, + { + "epoch": 3.2584384310844454, + "grad_norm": 1.7482995986938477, + "learning_rate": 1.82437774210647e-05, + "loss": 0.4041, + "step": 19961 + }, + { + "epoch": 3.25860168972695, + "grad_norm": 2.3349599838256836, + "learning_rate": 1.8243597793958128e-05, + "loss": 0.4869, + "step": 19962 + }, + { + "epoch": 3.2587649483694543, + "grad_norm": 2.0042033195495605, + "learning_rate": 1.8243418158550254e-05, + "loss": 0.4511, + "step": 19963 + }, + { + "epoch": 3.2589282070119587, + "grad_norm": 1.8524048328399658, + "learning_rate": 1.824323851484126e-05, + "loss": 0.3794, + "step": 19964 + }, + { + "epoch": 3.259091465654463, + "grad_norm": 2.007139205932617, + "learning_rate": 1.8243058862831328e-05, + "loss": 0.4401, + "step": 19965 + }, + { + "epoch": 3.2592547242969676, + "grad_norm": 1.6633027791976929, + "learning_rate": 1.8242879202520635e-05, + "loss": 0.3755, + "step": 19966 + }, + { + "epoch": 3.259417982939472, + "grad_norm": 1.71791672706604, + "learning_rate": 1.8242699533909368e-05, + "loss": 0.4594, + "step": 19967 + }, + { + "epoch": 3.2595812415819765, + "grad_norm": 2.16852068901062, + "learning_rate": 1.8242519856997703e-05, + "loss": 0.6853, + "step": 19968 + }, + { + "epoch": 3.2597445002244805, + "grad_norm": 2.2362418174743652, + "learning_rate": 1.8242340171785823e-05, + "loss": 0.5464, + "step": 19969 + }, + { + "epoch": 3.259907758866985, + "grad_norm": 2.077819585800171, + "learning_rate": 1.8242160478273908e-05, + "loss": 0.4807, + "step": 19970 + }, + { + "epoch": 3.2600710175094894, + "grad_norm": 2.061586856842041, + "learning_rate": 1.8241980776462144e-05, + "loss": 0.5037, + "step": 19971 + }, + { + "epoch": 3.260234276151994, + "grad_norm": 1.9462618827819824, + "learning_rate": 1.8241801066350705e-05, + "loss": 0.5238, + "step": 19972 + }, + { + "epoch": 3.2603975347944982, + "grad_norm": 2.240145444869995, + "learning_rate": 1.824162134793977e-05, + "loss": 0.5313, + "step": 19973 + }, + { + "epoch": 3.2605607934370027, + "grad_norm": 1.8971116542816162, + "learning_rate": 1.824144162122953e-05, + "loss": 0.4889, + "step": 19974 + }, + { + "epoch": 3.260724052079507, + "grad_norm": 3.248300790786743, + "learning_rate": 1.8241261886220155e-05, + "loss": 0.599, + "step": 19975 + }, + { + "epoch": 3.260887310722011, + "grad_norm": 2.0228404998779297, + "learning_rate": 1.824108214291184e-05, + "loss": 0.494, + "step": 19976 + }, + { + "epoch": 3.2610505693645155, + "grad_norm": 1.7644407749176025, + "learning_rate": 1.824090239130475e-05, + "loss": 0.4238, + "step": 19977 + }, + { + "epoch": 3.26121382800702, + "grad_norm": 2.031167984008789, + "learning_rate": 1.8240722631399077e-05, + "loss": 0.5367, + "step": 19978 + }, + { + "epoch": 3.2613770866495244, + "grad_norm": 1.9449899196624756, + "learning_rate": 1.8240542863194997e-05, + "loss": 0.4612, + "step": 19979 + }, + { + "epoch": 3.261540345292029, + "grad_norm": 1.9337657690048218, + "learning_rate": 1.8240363086692695e-05, + "loss": 0.4064, + "step": 19980 + }, + { + "epoch": 3.2617036039345333, + "grad_norm": 1.741765022277832, + "learning_rate": 1.8240183301892343e-05, + "loss": 0.4342, + "step": 19981 + }, + { + "epoch": 3.2618668625770377, + "grad_norm": 2.1080784797668457, + "learning_rate": 1.8240003508794134e-05, + "loss": 0.4796, + "step": 19982 + }, + { + "epoch": 3.262030121219542, + "grad_norm": 1.937576413154602, + "learning_rate": 1.8239823707398245e-05, + "loss": 0.5393, + "step": 19983 + }, + { + "epoch": 3.2621933798620466, + "grad_norm": 1.850681185722351, + "learning_rate": 1.8239643897704855e-05, + "loss": 0.5067, + "step": 19984 + }, + { + "epoch": 3.262356638504551, + "grad_norm": 1.7104310989379883, + "learning_rate": 1.8239464079714144e-05, + "loss": 0.4686, + "step": 19985 + }, + { + "epoch": 3.262519897147055, + "grad_norm": 1.8275175094604492, + "learning_rate": 1.8239284253426294e-05, + "loss": 0.4189, + "step": 19986 + }, + { + "epoch": 3.2626831557895595, + "grad_norm": 1.9923691749572754, + "learning_rate": 1.823910441884149e-05, + "loss": 0.5328, + "step": 19987 + }, + { + "epoch": 3.262846414432064, + "grad_norm": 1.9968422651290894, + "learning_rate": 1.8238924575959905e-05, + "loss": 0.4898, + "step": 19988 + }, + { + "epoch": 3.2630096730745684, + "grad_norm": 1.6941279172897339, + "learning_rate": 1.8238744724781728e-05, + "loss": 0.4519, + "step": 19989 + }, + { + "epoch": 3.263172931717073, + "grad_norm": 2.0652999877929688, + "learning_rate": 1.823856486530714e-05, + "loss": 0.5112, + "step": 19990 + }, + { + "epoch": 3.2633361903595772, + "grad_norm": 1.9452491998672485, + "learning_rate": 1.8238384997536317e-05, + "loss": 0.459, + "step": 19991 + }, + { + "epoch": 3.2634994490020817, + "grad_norm": 1.8669904470443726, + "learning_rate": 1.8238205121469442e-05, + "loss": 0.4068, + "step": 19992 + }, + { + "epoch": 3.263662707644586, + "grad_norm": 1.6877986192703247, + "learning_rate": 1.82380252371067e-05, + "loss": 0.4376, + "step": 19993 + }, + { + "epoch": 3.26382596628709, + "grad_norm": 1.9831780195236206, + "learning_rate": 1.8237845344448262e-05, + "loss": 0.53, + "step": 19994 + }, + { + "epoch": 3.2639892249295945, + "grad_norm": 1.8338723182678223, + "learning_rate": 1.8237665443494323e-05, + "loss": 0.4312, + "step": 19995 + }, + { + "epoch": 3.264152483572099, + "grad_norm": 1.9956586360931396, + "learning_rate": 1.823748553424505e-05, + "loss": 0.5169, + "step": 19996 + }, + { + "epoch": 3.2643157422146034, + "grad_norm": 2.5667524337768555, + "learning_rate": 1.823730561670064e-05, + "loss": 0.5545, + "step": 19997 + }, + { + "epoch": 3.264479000857108, + "grad_norm": 1.84737229347229, + "learning_rate": 1.8237125690861258e-05, + "loss": 0.5085, + "step": 19998 + }, + { + "epoch": 3.2646422594996123, + "grad_norm": 2.0889086723327637, + "learning_rate": 1.8236945756727093e-05, + "loss": 0.4705, + "step": 19999 + }, + { + "epoch": 3.2648055181421167, + "grad_norm": 2.074276924133301, + "learning_rate": 1.8236765814298328e-05, + "loss": 0.4871, + "step": 20000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 17, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.317349924124754e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}