diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,84021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5287597936174278, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.736969857569419, + "learning_rate": 2.1231422505307857e-08, + "loss": 0.6756, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.037470655132517, + "learning_rate": 4.246284501061571e-08, + "loss": 0.8562, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.9218059224199386, + "learning_rate": 6.369426751592358e-08, + "loss": 0.8068, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1.619998245287453, + "learning_rate": 8.492569002123143e-08, + "loss": 0.6641, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 1.7552286831643211, + "learning_rate": 1.0615711252653928e-07, + "loss": 0.6532, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1.725938522012392, + "learning_rate": 1.2738853503184715e-07, + "loss": 0.6431, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1.8840277525393838, + "learning_rate": 1.4861995753715502e-07, + "loss": 0.7829, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 1.787761961910525, + "learning_rate": 1.6985138004246285e-07, + "loss": 0.6583, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 1.856382702441212, + "learning_rate": 1.9108280254777072e-07, + "loss": 0.8299, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 1.9440185036397162, + "learning_rate": 2.1231422505307855e-07, + "loss": 0.7814, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.042243550306094, + "learning_rate": 2.3354564755838642e-07, + "loss": 0.8368, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 1.7996395106043221, + "learning_rate": 2.547770700636943e-07, + "loss": 0.638, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 1.869579841578725, + "learning_rate": 2.7600849256900214e-07, + "loss": 0.8517, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 1.9095070756184216, + "learning_rate": 2.9723991507431003e-07, + "loss": 0.8669, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 1.7011520507056614, + "learning_rate": 3.1847133757961787e-07, + "loss": 0.6481, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1.72656352612305, + "learning_rate": 3.397027600849257e-07, + "loss": 0.6676, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 1.9850011594794108, + "learning_rate": 3.6093418259023354e-07, + "loss": 0.8064, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 1.9961052081277577, + "learning_rate": 3.8216560509554143e-07, + "loss": 0.8623, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 1.5419972208334418, + "learning_rate": 4.0339702760084927e-07, + "loss": 0.6767, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.1838794830736905, + "learning_rate": 4.246284501061571e-07, + "loss": 0.8244, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1.9317232935018747, + "learning_rate": 4.45859872611465e-07, + "loss": 0.852, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 1.7085361334482871, + "learning_rate": 4.6709129511677283e-07, + "loss": 0.657, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 1.8870619473509618, + "learning_rate": 4.883227176220808e-07, + "loss": 0.8576, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 1.5299893741179431, + "learning_rate": 5.095541401273886e-07, + "loss": 0.6873, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 1.4250924984224744, + "learning_rate": 5.307855626326964e-07, + "loss": 0.7369, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 1.7077717936168544, + "learning_rate": 5.520169851380043e-07, + "loss": 0.815, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 1.5997246770566833, + "learning_rate": 5.732484076433121e-07, + "loss": 0.7909, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 1.4952396677004043, + "learning_rate": 5.944798301486201e-07, + "loss": 0.6704, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 1.3899327550048044, + "learning_rate": 6.157112526539279e-07, + "loss": 0.6509, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.061712463091723, + "learning_rate": 6.369426751592357e-07, + "loss": 0.85, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1.2744370728062344, + "learning_rate": 6.581740976645436e-07, + "loss": 0.6423, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.3923514258885368, + "learning_rate": 6.794055201698514e-07, + "loss": 0.6969, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.3692117861966746, + "learning_rate": 7.006369426751592e-07, + "loss": 0.6553, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.096917613406094, + "learning_rate": 7.218683651804671e-07, + "loss": 0.5882, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.23589373051139, + "learning_rate": 7.43099787685775e-07, + "loss": 0.6099, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.457426821981949, + "learning_rate": 7.643312101910829e-07, + "loss": 0.8656, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.3463087038344559, + "learning_rate": 7.855626326963907e-07, + "loss": 0.7443, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.2354215190204334, + "learning_rate": 8.067940552016985e-07, + "loss": 0.6589, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.3543046797702938, + "learning_rate": 8.280254777070064e-07, + "loss": 0.7884, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 0.9978970211744808, + "learning_rate": 8.492569002123142e-07, + "loss": 0.646, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.198186746771855, + "learning_rate": 8.70488322717622e-07, + "loss": 0.8023, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 0.9222935238354153, + "learning_rate": 8.9171974522293e-07, + "loss": 0.7114, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 0.7987781325353095, + "learning_rate": 9.129511677282378e-07, + "loss": 0.5467, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 0.9526224105233752, + "learning_rate": 9.341825902335457e-07, + "loss": 0.5956, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 1.4331820306922685, + "learning_rate": 9.554140127388537e-07, + "loss": 0.8467, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.8270249691131161, + "learning_rate": 9.766454352441615e-07, + "loss": 0.6266, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 0.8966224676323107, + "learning_rate": 9.978768577494694e-07, + "loss": 0.6487, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.1974556124489195, + "learning_rate": 1.0191082802547772e-06, + "loss": 0.8419, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.2300556285046356, + "learning_rate": 1.040339702760085e-06, + "loss": 0.7656, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.088997811624343, + "learning_rate": 1.0615711252653929e-06, + "loss": 0.6085, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.1661197607504696, + "learning_rate": 1.0828025477707007e-06, + "loss": 0.7957, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 0.8565785576485809, + "learning_rate": 1.1040339702760086e-06, + "loss": 0.5701, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 0.8073850980562628, + "learning_rate": 1.1252653927813164e-06, + "loss": 0.6135, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 1.1524349803707088, + "learning_rate": 1.1464968152866242e-06, + "loss": 0.7108, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.8949503229969369, + "learning_rate": 1.167728237791932e-06, + "loss": 0.644, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.1290110901645551, + "learning_rate": 1.1889596602972401e-06, + "loss": 0.7598, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.2260917734247248, + "learning_rate": 1.210191082802548e-06, + "loss": 0.7976, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.7484536915279122, + "learning_rate": 1.2314225053078558e-06, + "loss": 0.6533, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.7018104782389608, + "learning_rate": 1.2526539278131636e-06, + "loss": 0.6257, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.7496189310231463, + "learning_rate": 1.2738853503184715e-06, + "loss": 0.6163, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.0275015023681622, + "learning_rate": 1.2951167728237793e-06, + "loss": 0.7835, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.8049740996421182, + "learning_rate": 1.3163481953290871e-06, + "loss": 0.5951, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.769250999670833, + "learning_rate": 1.337579617834395e-06, + "loss": 0.6993, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.7367855450371175, + "learning_rate": 1.3588110403397028e-06, + "loss": 0.5676, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.0538913639692464, + "learning_rate": 1.3800424628450107e-06, + "loss": 0.7137, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.7836353836094159, + "learning_rate": 1.4012738853503185e-06, + "loss": 0.6458, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.2681341915414426, + "learning_rate": 1.4225053078556263e-06, + "loss": 0.7745, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.7464717867766247, + "learning_rate": 1.4437367303609342e-06, + "loss": 0.6341, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.7034007234369021, + "learning_rate": 1.4649681528662422e-06, + "loss": 0.5941, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.9151609867840141, + "learning_rate": 1.48619957537155e-06, + "loss": 0.7036, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.8111596902625513, + "learning_rate": 1.5074309978768579e-06, + "loss": 0.6062, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.7583995233620089, + "learning_rate": 1.5286624203821657e-06, + "loss": 0.5653, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.9962891903702115, + "learning_rate": 1.5498938428874736e-06, + "loss": 0.7762, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.1313304547472938, + "learning_rate": 1.5711252653927814e-06, + "loss": 0.7421, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.008144720140337, + "learning_rate": 1.5923566878980892e-06, + "loss": 0.7189, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.7780427717780393, + "learning_rate": 1.613588110403397e-06, + "loss": 0.5921, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 0.8399311727829962, + "learning_rate": 1.634819532908705e-06, + "loss": 0.5997, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 0.6927282906786248, + "learning_rate": 1.6560509554140127e-06, + "loss": 0.5671, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.2300098877330876, + "learning_rate": 1.6772823779193206e-06, + "loss": 0.7011, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.9185988906241528, + "learning_rate": 1.6985138004246284e-06, + "loss": 0.6905, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.9206959465561452, + "learning_rate": 1.7197452229299363e-06, + "loss": 0.7107, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.6818779957755264, + "learning_rate": 1.740976645435244e-06, + "loss": 0.5602, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.6498799279522858, + "learning_rate": 1.7622080679405521e-06, + "loss": 0.601, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.888398773343403, + "learning_rate": 1.78343949044586e-06, + "loss": 0.6146, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.962512792773225, + "learning_rate": 1.8046709129511678e-06, + "loss": 0.726, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.14206816976848, + "learning_rate": 1.8259023354564756e-06, + "loss": 0.7984, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.8072834925084155, + "learning_rate": 1.8471337579617835e-06, + "loss": 0.7032, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.947602536205962, + "learning_rate": 1.8683651804670913e-06, + "loss": 0.6548, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.9143970421375164, + "learning_rate": 1.8895966029723994e-06, + "loss": 0.7302, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.674175642032997, + "learning_rate": 1.9108280254777074e-06, + "loss": 0.5131, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.7886837529805113, + "learning_rate": 1.9320594479830153e-06, + "loss": 0.7306, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.8998842154200234, + "learning_rate": 1.953290870488323e-06, + "loss": 0.7619, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.8939754429002351, + "learning_rate": 1.974522292993631e-06, + "loss": 0.6977, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.7267793453096345, + "learning_rate": 1.9957537154989388e-06, + "loss": 0.6719, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.9423699329896398, + "learning_rate": 2.0169851380042466e-06, + "loss": 0.6696, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.9127968272713965, + "learning_rate": 2.0382165605095544e-06, + "loss": 0.6863, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.6453158741901237, + "learning_rate": 2.0594479830148623e-06, + "loss": 0.5726, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.8004608999355025, + "learning_rate": 2.08067940552017e-06, + "loss": 0.6646, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.741422175795783, + "learning_rate": 2.101910828025478e-06, + "loss": 0.6141, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.6185567152561202, + "learning_rate": 2.1231422505307858e-06, + "loss": 0.6171, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.912129457483921, + "learning_rate": 2.1443736730360936e-06, + "loss": 0.7509, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.8247609672782598, + "learning_rate": 2.1656050955414015e-06, + "loss": 0.6468, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.8029055653595574, + "learning_rate": 2.1868365180467093e-06, + "loss": 0.7296, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.9851633577727037, + "learning_rate": 2.208067940552017e-06, + "loss": 0.744, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.6679860577610721, + "learning_rate": 2.229299363057325e-06, + "loss": 0.5683, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.819612351171737, + "learning_rate": 2.250530785562633e-06, + "loss": 0.6859, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.8845853691655635, + "learning_rate": 2.2717622080679406e-06, + "loss": 0.7128, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.9896458089509722, + "learning_rate": 2.2929936305732485e-06, + "loss": 0.6823, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.8401401133737159, + "learning_rate": 2.3142250530785563e-06, + "loss": 0.6848, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.7044845791337637, + "learning_rate": 2.335456475583864e-06, + "loss": 0.6287, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.8936501983713924, + "learning_rate": 2.356687898089172e-06, + "loss": 0.6301, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.7977858105718107, + "learning_rate": 2.3779193205944802e-06, + "loss": 0.6501, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.9625024469697634, + "learning_rate": 2.399150743099788e-06, + "loss": 0.7136, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.8202496777053818, + "learning_rate": 2.420382165605096e-06, + "loss": 0.6466, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.8083756378016473, + "learning_rate": 2.4416135881104038e-06, + "loss": 0.5758, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.6794575896854108, + "learning_rate": 2.4628450106157116e-06, + "loss": 0.6271, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.920475184785038, + "learning_rate": 2.4840764331210194e-06, + "loss": 0.6896, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 1.0624376066131578, + "learning_rate": 2.5053078556263273e-06, + "loss": 0.6885, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 1.3494780155295418, + "learning_rate": 2.526539278131635e-06, + "loss": 0.6805, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 0.6680598812484428, + "learning_rate": 2.547770700636943e-06, + "loss": 0.6003, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.7145951603724503, + "learning_rate": 2.5690021231422508e-06, + "loss": 0.5845, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 0.932029401239929, + "learning_rate": 2.5902335456475586e-06, + "loss": 0.7106, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 0.6711551268440451, + "learning_rate": 2.6114649681528665e-06, + "loss": 0.5839, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 0.8123840964518879, + "learning_rate": 2.6326963906581743e-06, + "loss": 0.6535, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 0.9502589598494059, + "learning_rate": 2.653927813163482e-06, + "loss": 0.6723, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 0.686260030484047, + "learning_rate": 2.67515923566879e-06, + "loss": 0.5849, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 0.671346829133467, + "learning_rate": 2.696390658174098e-06, + "loss": 0.5539, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 0.7348439508382469, + "learning_rate": 2.7176220806794056e-06, + "loss": 0.6279, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 0.6657544130751492, + "learning_rate": 2.7388535031847135e-06, + "loss": 0.5721, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 0.7505039321902811, + "learning_rate": 2.7600849256900213e-06, + "loss": 0.6523, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.9081812514026412, + "learning_rate": 2.781316348195329e-06, + "loss": 0.7182, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 1.1754570547070735, + "learning_rate": 2.802547770700637e-06, + "loss": 0.7845, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 0.759204923968475, + "learning_rate": 2.823779193205945e-06, + "loss": 0.5666, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 0.9134266282272141, + "learning_rate": 2.8450106157112527e-06, + "loss": 0.617, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 0.6743132459299028, + "learning_rate": 2.8662420382165605e-06, + "loss": 0.5926, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 1.2852422502660135, + "learning_rate": 2.8874734607218683e-06, + "loss": 0.7055, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 1.0032638917986518, + "learning_rate": 2.908704883227176e-06, + "loss": 0.6845, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 0.6545452001482567, + "learning_rate": 2.9299363057324844e-06, + "loss": 0.5724, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.0033165293069863, + "learning_rate": 2.9511677282377923e-06, + "loss": 0.708, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 0.6603942673614058, + "learning_rate": 2.9723991507431e-06, + "loss": 0.5816, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.9929599681631902, + "learning_rate": 2.993630573248408e-06, + "loss": 0.652, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 0.9195381630729605, + "learning_rate": 3.0148619957537158e-06, + "loss": 0.6224, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 0.96507859825706, + "learning_rate": 3.0360934182590236e-06, + "loss": 0.7299, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 0.6331253476001903, + "learning_rate": 3.0573248407643314e-06, + "loss": 0.5364, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 0.7321637764611932, + "learning_rate": 3.0785562632696393e-06, + "loss": 0.6539, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.8029848201541017, + "learning_rate": 3.099787685774947e-06, + "loss": 0.6919, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 0.8112265854608945, + "learning_rate": 3.121019108280255e-06, + "loss": 0.5885, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 0.945927793705502, + "learning_rate": 3.142250530785563e-06, + "loss": 0.6859, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 0.6415884691227447, + "learning_rate": 3.1634819532908706e-06, + "loss": 0.5566, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 0.7085088857450645, + "learning_rate": 3.1847133757961785e-06, + "loss": 0.6582, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.7086688356979257, + "learning_rate": 3.2059447983014863e-06, + "loss": 0.5684, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 0.89752749128589, + "learning_rate": 3.227176220806794e-06, + "loss": 0.6836, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 0.7609285062708646, + "learning_rate": 3.248407643312102e-06, + "loss": 0.6002, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 0.8250009585344907, + "learning_rate": 3.26963906581741e-06, + "loss": 0.6999, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 0.6295758904840901, + "learning_rate": 3.2908704883227177e-06, + "loss": 0.6006, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.657018296450642, + "learning_rate": 3.3121019108280255e-06, + "loss": 0.5709, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 0.7453449119436517, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6195, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 0.6623503898673194, + "learning_rate": 3.354564755838641e-06, + "loss": 0.618, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 0.8975443110926178, + "learning_rate": 3.375796178343949e-06, + "loss": 0.7095, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 0.7196920125346429, + "learning_rate": 3.397027600849257e-06, + "loss": 0.5944, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.7274047246720723, + "learning_rate": 3.4182590233545647e-06, + "loss": 0.6097, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 0.8416353775608544, + "learning_rate": 3.4394904458598725e-06, + "loss": 0.6731, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 0.7565544021842142, + "learning_rate": 3.4607218683651803e-06, + "loss": 0.5338, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 0.8198347197807316, + "learning_rate": 3.481953290870488e-06, + "loss": 0.5975, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 0.5958765236284512, + "learning_rate": 3.5031847133757964e-06, + "loss": 0.5593, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.8596154311044288, + "learning_rate": 3.5244161358811043e-06, + "loss": 0.7188, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 1.2250817147186526, + "learning_rate": 3.545647558386412e-06, + "loss": 0.6885, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 0.6555194561718294, + "learning_rate": 3.56687898089172e-06, + "loss": 0.584, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 0.620333165580415, + "learning_rate": 3.5881104033970278e-06, + "loss": 0.5509, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 0.8073633638541113, + "learning_rate": 3.6093418259023356e-06, + "loss": 0.5459, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.7528890153499365, + "learning_rate": 3.6305732484076435e-06, + "loss": 0.6095, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 0.9357230716689366, + "learning_rate": 3.6518046709129513e-06, + "loss": 0.6951, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 0.8441946942232076, + "learning_rate": 3.673036093418259e-06, + "loss": 0.686, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 0.6953738393442778, + "learning_rate": 3.694267515923567e-06, + "loss": 0.5587, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 1.0614940309536194, + "learning_rate": 3.715498938428875e-06, + "loss": 0.6437, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.8910635949918921, + "learning_rate": 3.7367303609341826e-06, + "loss": 0.7033, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 0.8732162877196056, + "learning_rate": 3.757961783439491e-06, + "loss": 0.6226, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 0.6165029790713727, + "learning_rate": 3.7791932059447987e-06, + "loss": 0.5568, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 0.7087055049069805, + "learning_rate": 3.8004246284501066e-06, + "loss": 0.5936, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.6614937596481818, + "learning_rate": 3.821656050955415e-06, + "loss": 0.6021, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.7181004064853254, + "learning_rate": 3.842887473460722e-06, + "loss": 0.5976, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 0.8746688456208674, + "learning_rate": 3.8641188959660305e-06, + "loss": 0.7057, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.8168122830164994, + "learning_rate": 3.885350318471338e-06, + "loss": 0.6907, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 0.7355109794661671, + "learning_rate": 3.906581740976646e-06, + "loss": 0.5938, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 0.9557411818018728, + "learning_rate": 3.927813163481954e-06, + "loss": 0.7442, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.8272193627767886, + "learning_rate": 3.949044585987262e-06, + "loss": 0.7387, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.7601867318329757, + "learning_rate": 3.970276008492569e-06, + "loss": 0.5666, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 0.7848861700574545, + "learning_rate": 3.9915074309978775e-06, + "loss": 0.6065, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 0.6641571738403048, + "learning_rate": 4.012738853503185e-06, + "loss": 0.5761, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.6167229488978831, + "learning_rate": 4.033970276008493e-06, + "loss": 0.6061, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.9845644834250507, + "learning_rate": 4.055201698513801e-06, + "loss": 0.6665, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.4225166467098442, + "learning_rate": 4.076433121019109e-06, + "loss": 0.7222, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 0.825520396357522, + "learning_rate": 4.097664543524416e-06, + "loss": 0.6237, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 0.9855794534684909, + "learning_rate": 4.1188959660297246e-06, + "loss": 0.7422, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.8740524318615315, + "learning_rate": 4.140127388535032e-06, + "loss": 0.6686, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.7916317776172551, + "learning_rate": 4.16135881104034e-06, + "loss": 0.6258, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 0.7074471847536283, + "learning_rate": 4.1825902335456485e-06, + "loss": 0.6024, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 0.7027202324237053, + "learning_rate": 4.203821656050956e-06, + "loss": 0.5537, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 1.1408714976242633, + "learning_rate": 4.225053078556264e-06, + "loss": 0.634, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 0.8283598588701209, + "learning_rate": 4.2462845010615716e-06, + "loss": 0.6226, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 0.9313008468844602, + "learning_rate": 4.26751592356688e-06, + "loss": 0.7185, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 0.8810462807052284, + "learning_rate": 4.288747346072187e-06, + "loss": 0.6209, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 0.9015348702685758, + "learning_rate": 4.3099787685774955e-06, + "loss": 0.6665, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 0.5941279491487401, + "learning_rate": 4.331210191082803e-06, + "loss": 0.5455, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 0.6242764583731539, + "learning_rate": 4.352441613588111e-06, + "loss": 0.5779, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.7002150286010799, + "learning_rate": 4.373673036093419e-06, + "loss": 0.5447, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 0.6912798497967817, + "learning_rate": 4.394904458598727e-06, + "loss": 0.5988, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 0.8690559647733447, + "learning_rate": 4.416135881104034e-06, + "loss": 0.6324, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 1.0281067404352298, + "learning_rate": 4.4373673036093425e-06, + "loss": 0.6801, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 0.6895297615817462, + "learning_rate": 4.45859872611465e-06, + "loss": 0.5542, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 0.6504077645341889, + "learning_rate": 4.479830148619958e-06, + "loss": 0.5956, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 0.8892541805800651, + "learning_rate": 4.501061571125266e-06, + "loss": 0.651, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 0.7710720325367937, + "learning_rate": 4.522292993630574e-06, + "loss": 0.713, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 0.675110704188669, + "learning_rate": 4.543524416135881e-06, + "loss": 0.5562, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 0.7072695261962337, + "learning_rate": 4.5647558386411895e-06, + "loss": 0.6159, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 0.7444640008129021, + "learning_rate": 4.585987261146497e-06, + "loss": 0.6267, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 0.8671132992439867, + "learning_rate": 4.607218683651805e-06, + "loss": 0.6657, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 0.7074397549705441, + "learning_rate": 4.628450106157113e-06, + "loss": 0.558, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 0.7800357800835367, + "learning_rate": 4.649681528662421e-06, + "loss": 0.5888, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 0.7353721538536665, + "learning_rate": 4.670912951167728e-06, + "loss": 0.6019, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 0.9013761422335163, + "learning_rate": 4.6921443736730366e-06, + "loss": 0.636, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 0.7469190290966377, + "learning_rate": 4.713375796178344e-06, + "loss": 0.5989, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 0.7270094404967548, + "learning_rate": 4.734607218683652e-06, + "loss": 0.6204, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 0.6216399989244757, + "learning_rate": 4.7558386411889605e-06, + "loss": 0.5971, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 0.6066311313104265, + "learning_rate": 4.777070063694268e-06, + "loss": 0.6148, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 0.5599471218785727, + "learning_rate": 4.798301486199576e-06, + "loss": 0.4796, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 0.6157753345646437, + "learning_rate": 4.819532908704884e-06, + "loss": 0.5216, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 0.8228195083528653, + "learning_rate": 4.840764331210192e-06, + "loss": 0.6043, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 0.686772813135059, + "learning_rate": 4.861995753715499e-06, + "loss": 0.6483, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 0.9872220934878595, + "learning_rate": 4.8832271762208075e-06, + "loss": 0.7057, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.9814221220293042, + "learning_rate": 4.904458598726115e-06, + "loss": 0.6954, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 0.89164011450147, + "learning_rate": 4.925690021231423e-06, + "loss": 0.5815, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 0.7016484179686268, + "learning_rate": 4.946921443736731e-06, + "loss": 0.6257, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 0.774515551923184, + "learning_rate": 4.968152866242039e-06, + "loss": 0.6715, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 0.8137485818123509, + "learning_rate": 4.989384288747346e-06, + "loss": 0.6076, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 0.7561940959299209, + "learning_rate": 5.0106157112526545e-06, + "loss": 0.5821, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 0.898623103820379, + "learning_rate": 5.031847133757962e-06, + "loss": 0.706, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 0.7995953466662316, + "learning_rate": 5.05307855626327e-06, + "loss": 0.6196, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 0.6536283203986453, + "learning_rate": 5.074309978768578e-06, + "loss": 0.5753, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 1.051355284149499, + "learning_rate": 5.095541401273886e-06, + "loss": 0.6036, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 0.8991691183020971, + "learning_rate": 5.116772823779193e-06, + "loss": 0.6782, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 0.6256156762749676, + "learning_rate": 5.1380042462845016e-06, + "loss": 0.5823, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 0.7177094517309004, + "learning_rate": 5.159235668789809e-06, + "loss": 0.6223, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 0.8711944375615034, + "learning_rate": 5.180467091295117e-06, + "loss": 0.6944, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 0.8803187138200502, + "learning_rate": 5.201698513800425e-06, + "loss": 0.6699, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 0.7775556410870242, + "learning_rate": 5.222929936305733e-06, + "loss": 0.5807, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 0.8186463739188571, + "learning_rate": 5.24416135881104e-06, + "loss": 0.6677, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 0.8494691832014075, + "learning_rate": 5.265392781316349e-06, + "loss": 0.7448, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 0.7690654223032175, + "learning_rate": 5.286624203821657e-06, + "loss": 0.5999, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 0.6041357286634986, + "learning_rate": 5.307855626326964e-06, + "loss": 0.55, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 0.6227306599717993, + "learning_rate": 5.3290870488322725e-06, + "loss": 0.5158, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 0.7241136824678885, + "learning_rate": 5.35031847133758e-06, + "loss": 0.6279, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 0.6810033718421417, + "learning_rate": 5.371549893842888e-06, + "loss": 0.5917, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 0.8444944767654462, + "learning_rate": 5.392781316348196e-06, + "loss": 0.6579, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 0.7122852417532933, + "learning_rate": 5.414012738853504e-06, + "loss": 0.5785, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 0.6718345646223689, + "learning_rate": 5.435244161358811e-06, + "loss": 0.5623, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 0.8559619562534222, + "learning_rate": 5.4564755838641195e-06, + "loss": 0.6424, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 1.9897427075245315, + "learning_rate": 5.477707006369427e-06, + "loss": 0.6599, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 0.7495990121048346, + "learning_rate": 5.498938428874735e-06, + "loss": 0.7048, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 0.7617635842160821, + "learning_rate": 5.520169851380043e-06, + "loss": 0.6453, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 0.9923681855393078, + "learning_rate": 5.541401273885351e-06, + "loss": 0.7383, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 0.8222305195153744, + "learning_rate": 5.562632696390658e-06, + "loss": 0.6413, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 0.6626699551761589, + "learning_rate": 5.5838641188959666e-06, + "loss": 0.5702, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 0.8119419623711125, + "learning_rate": 5.605095541401274e-06, + "loss": 0.6073, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 0.7647961887972264, + "learning_rate": 5.626326963906582e-06, + "loss": 0.5806, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.6166367579382988, + "learning_rate": 5.64755838641189e-06, + "loss": 0.5656, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 0.799875375532442, + "learning_rate": 5.668789808917198e-06, + "loss": 0.649, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 0.7275526056631297, + "learning_rate": 5.690021231422505e-06, + "loss": 0.6317, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 0.6472681387536379, + "learning_rate": 5.7112526539278136e-06, + "loss": 0.5783, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 0.637531562482795, + "learning_rate": 5.732484076433121e-06, + "loss": 0.5846, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.7397922736008121, + "learning_rate": 5.753715498938429e-06, + "loss": 0.6684, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 1.0205806869608305, + "learning_rate": 5.774946921443737e-06, + "loss": 0.6707, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 0.8699718878359272, + "learning_rate": 5.796178343949045e-06, + "loss": 0.6475, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 0.6690727819694017, + "learning_rate": 5.817409766454352e-06, + "loss": 0.6253, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 0.8550115033966339, + "learning_rate": 5.838641188959661e-06, + "loss": 0.6021, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 0.8652339155907169, + "learning_rate": 5.859872611464969e-06, + "loss": 0.6385, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 0.8573757882631059, + "learning_rate": 5.881104033970276e-06, + "loss": 0.5985, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 0.7481145556491249, + "learning_rate": 5.9023354564755845e-06, + "loss": 0.5614, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 0.7603850815803599, + "learning_rate": 5.923566878980892e-06, + "loss": 0.5895, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 0.7106234283638302, + "learning_rate": 5.9447983014862e-06, + "loss": 0.5841, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 0.7571796686983795, + "learning_rate": 5.966029723991508e-06, + "loss": 0.5725, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 0.6563732440289131, + "learning_rate": 5.987261146496816e-06, + "loss": 0.6088, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 0.6977442128338657, + "learning_rate": 6.008492569002123e-06, + "loss": 0.5594, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 0.706010792363827, + "learning_rate": 6.0297239915074315e-06, + "loss": 0.5695, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 0.859176506136312, + "learning_rate": 6.050955414012739e-06, + "loss": 0.667, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 0.8105147566645884, + "learning_rate": 6.072186836518047e-06, + "loss": 0.632, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 0.8209217482786072, + "learning_rate": 6.093418259023355e-06, + "loss": 0.6366, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 0.7608301811315319, + "learning_rate": 6.114649681528663e-06, + "loss": 0.5933, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 0.7654007541474549, + "learning_rate": 6.13588110403397e-06, + "loss": 0.6515, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 0.629612742923365, + "learning_rate": 6.1571125265392786e-06, + "loss": 0.571, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 0.7279126481255758, + "learning_rate": 6.178343949044586e-06, + "loss": 0.6631, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 0.6004298020074539, + "learning_rate": 6.199575371549894e-06, + "loss": 0.5499, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 0.6887789599745987, + "learning_rate": 6.220806794055202e-06, + "loss": 0.6049, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 0.7972200861030105, + "learning_rate": 6.24203821656051e-06, + "loss": 0.6428, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 0.7465288393939438, + "learning_rate": 6.263269639065817e-06, + "loss": 0.587, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 1.0574613089275766, + "learning_rate": 6.284501061571126e-06, + "loss": 0.6344, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 0.9460877485374782, + "learning_rate": 6.305732484076433e-06, + "loss": 0.6436, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 0.8184763277187621, + "learning_rate": 6.326963906581741e-06, + "loss": 0.7271, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 0.6930575103378493, + "learning_rate": 6.348195329087049e-06, + "loss": 0.6041, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 0.7531353169689462, + "learning_rate": 6.369426751592357e-06, + "loss": 0.5961, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 0.7100085841084717, + "learning_rate": 6.390658174097664e-06, + "loss": 0.5595, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 0.9048480632880608, + "learning_rate": 6.411889596602973e-06, + "loss": 0.6511, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 0.6880597100842921, + "learning_rate": 6.433121019108281e-06, + "loss": 0.5716, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 0.8482267906196166, + "learning_rate": 6.454352441613588e-06, + "loss": 0.6337, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 0.7599383356399694, + "learning_rate": 6.4755838641188965e-06, + "loss": 0.5896, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 0.6976717143745537, + "learning_rate": 6.496815286624204e-06, + "loss": 0.5877, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 0.9891232937813345, + "learning_rate": 6.518046709129512e-06, + "loss": 0.741, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 0.7442534546756299, + "learning_rate": 6.53927813163482e-06, + "loss": 0.6178, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 0.7970249741724695, + "learning_rate": 6.560509554140128e-06, + "loss": 0.5939, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 0.8354095101493896, + "learning_rate": 6.581740976645435e-06, + "loss": 0.7177, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 0.6420658221124799, + "learning_rate": 6.6029723991507436e-06, + "loss": 0.5191, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 0.7228803925846446, + "learning_rate": 6.624203821656051e-06, + "loss": 0.5701, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 0.7886061379417076, + "learning_rate": 6.645435244161359e-06, + "loss": 0.6825, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 0.6494021019881504, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6025, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 1.064902467884702, + "learning_rate": 6.687898089171975e-06, + "loss": 0.7276, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 0.7076045071300145, + "learning_rate": 6.709129511677282e-06, + "loss": 0.5927, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 0.6516034356887127, + "learning_rate": 6.730360934182591e-06, + "loss": 0.5837, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 0.8532912970246789, + "learning_rate": 6.751592356687898e-06, + "loss": 0.6974, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 0.8136814955252396, + "learning_rate": 6.772823779193206e-06, + "loss": 0.6814, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 0.7694784076036806, + "learning_rate": 6.794055201698514e-06, + "loss": 0.6535, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 0.6080238914599104, + "learning_rate": 6.815286624203822e-06, + "loss": 0.57, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 0.8143841412568615, + "learning_rate": 6.836518046709129e-06, + "loss": 0.5867, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 0.9020756728287854, + "learning_rate": 6.857749469214438e-06, + "loss": 0.6342, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 0.9170186790597973, + "learning_rate": 6.878980891719745e-06, + "loss": 0.701, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 0.6878753443589771, + "learning_rate": 6.900212314225053e-06, + "loss": 0.5742, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 0.6721737113608273, + "learning_rate": 6.921443736730361e-06, + "loss": 0.5557, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 0.6550388119556643, + "learning_rate": 6.942675159235669e-06, + "loss": 0.6052, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 0.7399479184989896, + "learning_rate": 6.963906581740976e-06, + "loss": 0.6463, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 0.7156549908824404, + "learning_rate": 6.985138004246285e-06, + "loss": 0.5728, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 0.6718801324294158, + "learning_rate": 7.006369426751593e-06, + "loss": 0.5629, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 0.7009074851724983, + "learning_rate": 7.0276008492569e-06, + "loss": 0.5532, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 0.8029408411971348, + "learning_rate": 7.0488322717622086e-06, + "loss": 0.5271, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.0431012036211642, + "learning_rate": 7.070063694267516e-06, + "loss": 0.6367, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 0.8093878262347405, + "learning_rate": 7.091295116772824e-06, + "loss": 0.6403, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 1.2908023804812134, + "learning_rate": 7.112526539278132e-06, + "loss": 0.7034, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 1.0398537291927146, + "learning_rate": 7.13375796178344e-06, + "loss": 0.6809, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 1.0350876345922804, + "learning_rate": 7.154989384288747e-06, + "loss": 0.6931, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 0.7886373173499229, + "learning_rate": 7.1762208067940556e-06, + "loss": 0.6589, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 0.8927862733820032, + "learning_rate": 7.197452229299363e-06, + "loss": 0.6935, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 0.8259408573357077, + "learning_rate": 7.218683651804671e-06, + "loss": 0.6376, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 0.7824414331887, + "learning_rate": 7.239915074309979e-06, + "loss": 0.5965, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 0.6088715758221711, + "learning_rate": 7.261146496815287e-06, + "loss": 0.5885, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.0709181495605602, + "learning_rate": 7.282377919320594e-06, + "loss": 0.6809, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 0.8043326912255777, + "learning_rate": 7.303609341825903e-06, + "loss": 0.6297, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 0.8565291208373391, + "learning_rate": 7.32484076433121e-06, + "loss": 0.6546, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 0.624084247670802, + "learning_rate": 7.346072186836518e-06, + "loss": 0.5563, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 0.7824732109326937, + "learning_rate": 7.367303609341826e-06, + "loss": 0.6785, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 0.8399482169207614, + "learning_rate": 7.388535031847134e-06, + "loss": 0.5974, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 0.785521414538481, + "learning_rate": 7.409766454352441e-06, + "loss": 0.6213, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 0.8548704922581076, + "learning_rate": 7.43099787685775e-06, + "loss": 0.6807, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 0.7805367105203607, + "learning_rate": 7.452229299363057e-06, + "loss": 0.6876, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 0.7911626509097884, + "learning_rate": 7.473460721868365e-06, + "loss": 0.6201, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 1.9263847215601537, + "learning_rate": 7.494692144373673e-06, + "loss": 0.6726, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 0.773292037382418, + "learning_rate": 7.515923566878982e-06, + "loss": 0.6163, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 0.8488440940786954, + "learning_rate": 7.53715498938429e-06, + "loss": 0.5657, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 0.8985238040159557, + "learning_rate": 7.5583864118895975e-06, + "loss": 0.6968, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 0.7376906256571085, + "learning_rate": 7.579617834394906e-06, + "loss": 0.5766, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 0.6623660432305776, + "learning_rate": 7.600849256900213e-06, + "loss": 0.5424, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 0.7753384538437651, + "learning_rate": 7.622080679405521e-06, + "loss": 0.5261, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 0.7636721584433728, + "learning_rate": 7.64331210191083e-06, + "loss": 0.5604, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 0.807227120074653, + "learning_rate": 7.664543524416136e-06, + "loss": 0.6237, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 0.7202077397488372, + "learning_rate": 7.685774946921445e-06, + "loss": 0.5967, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 0.9238434706684684, + "learning_rate": 7.707006369426753e-06, + "loss": 0.6284, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 0.6904290880218553, + "learning_rate": 7.728237791932061e-06, + "loss": 0.5772, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 0.6759959514969673, + "learning_rate": 7.74946921443737e-06, + "loss": 0.5675, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 1.575640820985936, + "learning_rate": 7.770700636942676e-06, + "loss": 0.6411, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 0.976655319737452, + "learning_rate": 7.791932059447984e-06, + "loss": 0.7161, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 0.6794250834654887, + "learning_rate": 7.813163481953292e-06, + "loss": 0.5621, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 0.8958077513340039, + "learning_rate": 7.8343949044586e-06, + "loss": 0.6572, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 0.7730165050550669, + "learning_rate": 7.855626326963907e-06, + "loss": 0.6202, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 0.744079962426877, + "learning_rate": 7.876857749469215e-06, + "loss": 0.6058, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 0.8540704691717359, + "learning_rate": 7.898089171974524e-06, + "loss": 0.5719, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 0.8631284956302128, + "learning_rate": 7.919320594479832e-06, + "loss": 0.6008, + "step": 373 + }, + { + "epoch": 0.05, + "grad_norm": 0.8642598382843337, + "learning_rate": 7.940552016985139e-06, + "loss": 0.614, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 0.7439967365258471, + "learning_rate": 7.961783439490447e-06, + "loss": 0.5856, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 0.6795607401111311, + "learning_rate": 7.983014861995755e-06, + "loss": 0.5841, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 0.7969899454925713, + "learning_rate": 8.004246284501063e-06, + "loss": 0.6461, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 0.975299506666238, + "learning_rate": 8.02547770700637e-06, + "loss": 0.6425, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 0.8174766590854395, + "learning_rate": 8.046709129511678e-06, + "loss": 0.6229, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 0.6061374519640897, + "learning_rate": 8.067940552016986e-06, + "loss": 0.5521, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 1.2087893068940179, + "learning_rate": 8.089171974522295e-06, + "loss": 0.7737, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 0.8486748058122714, + "learning_rate": 8.110403397027601e-06, + "loss": 0.5821, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 0.6690153487406375, + "learning_rate": 8.13163481953291e-06, + "loss": 0.5573, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 0.8056415402086693, + "learning_rate": 8.152866242038218e-06, + "loss": 0.695, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 0.7857803059268238, + "learning_rate": 8.174097664543526e-06, + "loss": 0.6159, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 5.022417337666467, + "learning_rate": 8.195329087048833e-06, + "loss": 0.6367, + "step": 386 + }, + { + "epoch": 0.05, + "grad_norm": 0.886779803282784, + "learning_rate": 8.21656050955414e-06, + "loss": 0.7315, + "step": 387 + }, + { + "epoch": 0.05, + "grad_norm": 0.6185410245742909, + "learning_rate": 8.237791932059449e-06, + "loss": 0.5285, + "step": 388 + }, + { + "epoch": 0.05, + "grad_norm": 0.6555034141382942, + "learning_rate": 8.259023354564757e-06, + "loss": 0.5407, + "step": 389 + }, + { + "epoch": 0.05, + "grad_norm": 0.927509309026582, + "learning_rate": 8.280254777070064e-06, + "loss": 0.6634, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 0.8900249257788111, + "learning_rate": 8.301486199575372e-06, + "loss": 0.6216, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 0.8231308283868264, + "learning_rate": 8.32271762208068e-06, + "loss": 0.6526, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 0.7226958174541385, + "learning_rate": 8.343949044585989e-06, + "loss": 0.6514, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 0.6441010227146757, + "learning_rate": 8.365180467091297e-06, + "loss": 0.6123, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 0.7184500228931993, + "learning_rate": 8.386411889596604e-06, + "loss": 0.6132, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 0.7611063791793172, + "learning_rate": 8.407643312101912e-06, + "loss": 0.6107, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 0.8854254361174873, + "learning_rate": 8.42887473460722e-06, + "loss": 0.6482, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 0.7142906085084705, + "learning_rate": 8.450106157112528e-06, + "loss": 0.5991, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 0.7452467444618182, + "learning_rate": 8.471337579617835e-06, + "loss": 0.5906, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 0.88893485363304, + "learning_rate": 8.492569002123143e-06, + "loss": 0.6391, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 0.7797099055102723, + "learning_rate": 8.513800424628451e-06, + "loss": 0.6288, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 1.1476881331334823, + "learning_rate": 8.53503184713376e-06, + "loss": 0.6295, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 0.6454433387905704, + "learning_rate": 8.556263269639066e-06, + "loss": 0.5851, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 0.9202915443187007, + "learning_rate": 8.577494692144374e-06, + "loss": 0.6669, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 0.7896740571157151, + "learning_rate": 8.598726114649683e-06, + "loss": 0.6247, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 0.650371840959321, + "learning_rate": 8.619957537154991e-06, + "loss": 0.5711, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 1.2264582688372754, + "learning_rate": 8.641188959660298e-06, + "loss": 0.6571, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 0.7511960497743809, + "learning_rate": 8.662420382165606e-06, + "loss": 0.5488, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 0.8883132225129745, + "learning_rate": 8.683651804670914e-06, + "loss": 0.6498, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 0.8711475329433065, + "learning_rate": 8.704883227176222e-06, + "loss": 0.633, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 0.6482577969800928, + "learning_rate": 8.726114649681529e-06, + "loss": 0.5435, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 0.833716792141639, + "learning_rate": 8.747346072186837e-06, + "loss": 0.5639, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 0.8404773082943031, + "learning_rate": 8.768577494692145e-06, + "loss": 0.6407, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 1.003279144187182, + "learning_rate": 8.789808917197454e-06, + "loss": 0.6892, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 0.702753050046326, + "learning_rate": 8.81104033970276e-06, + "loss": 0.5622, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 0.7055678953855339, + "learning_rate": 8.832271762208069e-06, + "loss": 0.573, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 0.7195947029496707, + "learning_rate": 8.853503184713377e-06, + "loss": 0.61, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 0.7367726612748673, + "learning_rate": 8.874734607218685e-06, + "loss": 0.5678, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 0.7274607829878597, + "learning_rate": 8.895966029723993e-06, + "loss": 0.529, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 0.7830343262063172, + "learning_rate": 8.9171974522293e-06, + "loss": 0.6487, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 0.8795400014954516, + "learning_rate": 8.938428874734608e-06, + "loss": 0.6744, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 0.8163787645669263, + "learning_rate": 8.959660297239916e-06, + "loss": 0.6919, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 0.6131252077612136, + "learning_rate": 8.980891719745225e-06, + "loss": 0.539, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 0.6577080763662211, + "learning_rate": 9.002123142250531e-06, + "loss": 0.5362, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 0.6754781830302256, + "learning_rate": 9.02335456475584e-06, + "loss": 0.618, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 0.6180744659367413, + "learning_rate": 9.044585987261148e-06, + "loss": 0.5789, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 0.7656745343043879, + "learning_rate": 9.065817409766456e-06, + "loss": 0.58, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 0.8047908307660389, + "learning_rate": 9.087048832271763e-06, + "loss": 0.6189, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 1.1378676066470146, + "learning_rate": 9.10828025477707e-06, + "loss": 0.6134, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 0.7408459076539501, + "learning_rate": 9.129511677282379e-06, + "loss": 0.6018, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 0.8835243577619359, + "learning_rate": 9.150743099787687e-06, + "loss": 0.6269, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 0.6478236226856178, + "learning_rate": 9.171974522292994e-06, + "loss": 0.5543, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 0.8829938350860344, + "learning_rate": 9.193205944798302e-06, + "loss": 0.6712, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 0.8020701009095713, + "learning_rate": 9.21443736730361e-06, + "loss": 0.5648, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 0.8097751212747626, + "learning_rate": 9.235668789808919e-06, + "loss": 0.6388, + "step": 435 + }, + { + "epoch": 0.06, + "grad_norm": 0.7264245763854907, + "learning_rate": 9.256900212314225e-06, + "loss": 0.5783, + "step": 436 + }, + { + "epoch": 0.06, + "grad_norm": 0.6658669676284141, + "learning_rate": 9.278131634819534e-06, + "loss": 0.5424, + "step": 437 + }, + { + "epoch": 0.06, + "grad_norm": 0.7293560413896242, + "learning_rate": 9.299363057324842e-06, + "loss": 0.5849, + "step": 438 + }, + { + "epoch": 0.06, + "grad_norm": 0.9165722911894904, + "learning_rate": 9.32059447983015e-06, + "loss": 0.6842, + "step": 439 + }, + { + "epoch": 0.06, + "grad_norm": 0.6672157004325562, + "learning_rate": 9.341825902335457e-06, + "loss": 0.612, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 1.515072527151968, + "learning_rate": 9.363057324840765e-06, + "loss": 0.5953, + "step": 441 + }, + { + "epoch": 0.06, + "grad_norm": 0.8466680398556095, + "learning_rate": 9.384288747346073e-06, + "loss": 0.6494, + "step": 442 + }, + { + "epoch": 0.06, + "grad_norm": 0.76149805712936, + "learning_rate": 9.405520169851381e-06, + "loss": 0.6328, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 0.667792369022887, + "learning_rate": 9.426751592356688e-06, + "loss": 0.5881, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 1.1000601752879977, + "learning_rate": 9.447983014861996e-06, + "loss": 0.5828, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 0.7977541069806383, + "learning_rate": 9.469214437367304e-06, + "loss": 0.6257, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 0.7554239986005157, + "learning_rate": 9.490445859872613e-06, + "loss": 0.5868, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 0.6902154989570397, + "learning_rate": 9.511677282377921e-06, + "loss": 0.5741, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 0.831785734821144, + "learning_rate": 9.532908704883228e-06, + "loss": 0.6632, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 0.6589235504467976, + "learning_rate": 9.554140127388536e-06, + "loss": 0.5661, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 0.6117670949273154, + "learning_rate": 9.575371549893844e-06, + "loss": 0.5249, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 0.704911300521356, + "learning_rate": 9.596602972399152e-06, + "loss": 0.5271, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 0.8596611607511858, + "learning_rate": 9.617834394904459e-06, + "loss": 0.6802, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 0.8965616025174299, + "learning_rate": 9.639065817409767e-06, + "loss": 0.6273, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 0.6109748675543724, + "learning_rate": 9.660297239915075e-06, + "loss": 0.6023, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 0.7991002535309961, + "learning_rate": 9.681528662420384e-06, + "loss": 0.6369, + "step": 456 + }, + { + "epoch": 0.06, + "grad_norm": 0.7452014876614284, + "learning_rate": 9.70276008492569e-06, + "loss": 0.5368, + "step": 457 + }, + { + "epoch": 0.06, + "grad_norm": 0.7116054497263848, + "learning_rate": 9.723991507430999e-06, + "loss": 0.647, + "step": 458 + }, + { + "epoch": 0.06, + "grad_norm": 0.7733574624574389, + "learning_rate": 9.745222929936307e-06, + "loss": 0.5834, + "step": 459 + }, + { + "epoch": 0.06, + "grad_norm": 0.8673038237405521, + "learning_rate": 9.766454352441615e-06, + "loss": 0.6557, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 0.9087978224410032, + "learning_rate": 9.787685774946922e-06, + "loss": 0.6369, + "step": 461 + }, + { + "epoch": 0.06, + "grad_norm": 0.7248864196673108, + "learning_rate": 9.80891719745223e-06, + "loss": 0.5998, + "step": 462 + }, + { + "epoch": 0.06, + "grad_norm": 0.762858181858747, + "learning_rate": 9.830148619957538e-06, + "loss": 0.6096, + "step": 463 + }, + { + "epoch": 0.06, + "grad_norm": 0.8189284828454165, + "learning_rate": 9.851380042462846e-06, + "loss": 0.6268, + "step": 464 + }, + { + "epoch": 0.06, + "grad_norm": 1.2234999520903838, + "learning_rate": 9.872611464968153e-06, + "loss": 0.6039, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 0.6611390379618824, + "learning_rate": 9.893842887473461e-06, + "loss": 0.5561, + "step": 466 + }, + { + "epoch": 0.06, + "grad_norm": 0.7861456650459644, + "learning_rate": 9.91507430997877e-06, + "loss": 0.6825, + "step": 467 + }, + { + "epoch": 0.06, + "grad_norm": 0.7232391983502742, + "learning_rate": 9.936305732484078e-06, + "loss": 0.5668, + "step": 468 + }, + { + "epoch": 0.06, + "grad_norm": 0.8245054175555814, + "learning_rate": 9.957537154989384e-06, + "loss": 0.6465, + "step": 469 + }, + { + "epoch": 0.06, + "grad_norm": 0.8159840462089913, + "learning_rate": 9.978768577494693e-06, + "loss": 0.623, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 0.6739818639719175, + "learning_rate": 1e-05, + "loss": 0.5829, + "step": 471 + }, + { + "epoch": 0.06, + "grad_norm": 0.9169394853492169, + "learning_rate": 9.99999989358299e-06, + "loss": 0.6543, + "step": 472 + }, + { + "epoch": 0.06, + "grad_norm": 0.732398277025336, + "learning_rate": 9.99999957433196e-06, + "loss": 0.604, + "step": 473 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935352329835511, + "learning_rate": 9.999999042246928e-06, + "loss": 0.6036, + "step": 474 + }, + { + "epoch": 0.06, + "grad_norm": 0.7204735579547549, + "learning_rate": 9.999998297327913e-06, + "loss": 0.567, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 0.9466278469784851, + "learning_rate": 9.999997339574949e-06, + "loss": 0.6903, + "step": 476 + }, + { + "epoch": 0.06, + "grad_norm": 0.7510980657849371, + "learning_rate": 9.999996168988075e-06, + "loss": 0.5798, + "step": 477 + }, + { + "epoch": 0.06, + "grad_norm": 0.9251013947903287, + "learning_rate": 9.999994785567344e-06, + "loss": 0.6208, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 0.8823297094976791, + "learning_rate": 9.99999318931281e-06, + "loss": 0.6067, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 0.8889253143015425, + "learning_rate": 9.999991380224545e-06, + "loss": 0.6672, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 0.8224626528968296, + "learning_rate": 9.999989358302623e-06, + "loss": 0.6605, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 0.5962750987251436, + "learning_rate": 9.999987123547133e-06, + "loss": 0.5454, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 0.8329766576540251, + "learning_rate": 9.999984675958169e-06, + "loss": 0.6366, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 0.8074101110714306, + "learning_rate": 9.999982015535834e-06, + "loss": 0.6583, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 0.6613443421470934, + "learning_rate": 9.999979142280246e-06, + "loss": 0.6069, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 0.7004874091363438, + "learning_rate": 9.999976056191519e-06, + "loss": 0.5727, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 0.8654496022656083, + "learning_rate": 9.999972757269792e-06, + "loss": 0.6838, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 0.723092832411239, + "learning_rate": 9.9999692455152e-06, + "loss": 0.5972, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 0.718562574605943, + "learning_rate": 9.999965520927895e-06, + "loss": 0.5875, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 0.6778690028989336, + "learning_rate": 9.999961583508035e-06, + "loss": 0.5754, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 0.8718546884186662, + "learning_rate": 9.99995743325579e-06, + "loss": 0.6625, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 0.7923702690792669, + "learning_rate": 9.999953070171334e-06, + "loss": 0.6471, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 0.7507109521830716, + "learning_rate": 9.999948494254853e-06, + "loss": 0.63, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 0.7205764419308681, + "learning_rate": 9.999943705506544e-06, + "loss": 0.5992, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 0.6601109934385242, + "learning_rate": 9.999938703926607e-06, + "loss": 0.528, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 0.6786762625720932, + "learning_rate": 9.999933489515257e-06, + "loss": 0.5651, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 0.7622149354723696, + "learning_rate": 9.999928062272714e-06, + "loss": 0.6337, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 0.9885600548836091, + "learning_rate": 9.999922422199213e-06, + "loss": 0.6804, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 0.6422403188821002, + "learning_rate": 9.999916569294994e-06, + "loss": 0.5432, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 0.7370948573188223, + "learning_rate": 9.9999105035603e-06, + "loss": 0.5625, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935001229962054, + "learning_rate": 9.999904224995397e-06, + "loss": 0.594, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 0.8847636308919531, + "learning_rate": 9.999897733600545e-06, + "loss": 0.6236, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 0.6425009679367006, + "learning_rate": 9.999891029376025e-06, + "loss": 0.6326, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 0.8989107169408984, + "learning_rate": 9.999884112322122e-06, + "loss": 0.6011, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 0.8916440421695172, + "learning_rate": 9.999876982439131e-06, + "loss": 0.698, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 0.8724879286301799, + "learning_rate": 9.999869639727353e-06, + "loss": 0.6846, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 0.7649817263081151, + "learning_rate": 9.999862084187101e-06, + "loss": 0.5864, + "step": 507 + }, + { + "epoch": 0.06, + "grad_norm": 0.718616029914458, + "learning_rate": 9.999854315818697e-06, + "loss": 0.5718, + "step": 508 + }, + { + "epoch": 0.06, + "grad_norm": 0.8222526275223059, + "learning_rate": 9.999846334622474e-06, + "loss": 0.5788, + "step": 509 + }, + { + "epoch": 0.06, + "grad_norm": 2.130838876693734, + "learning_rate": 9.999838140598768e-06, + "loss": 0.6831, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 0.6932187212337148, + "learning_rate": 9.99982973374793e-06, + "loss": 0.5836, + "step": 511 + }, + { + "epoch": 0.07, + "grad_norm": 0.8214432283624149, + "learning_rate": 9.999821114070318e-06, + "loss": 0.6369, + "step": 512 + }, + { + "epoch": 0.07, + "grad_norm": 0.6525866559128067, + "learning_rate": 9.999812281566298e-06, + "loss": 0.5711, + "step": 513 + }, + { + "epoch": 0.07, + "grad_norm": 0.7239474453362629, + "learning_rate": 9.999803236236246e-06, + "loss": 0.5785, + "step": 514 + }, + { + "epoch": 0.07, + "grad_norm": 0.8555078103180713, + "learning_rate": 9.999793978080548e-06, + "loss": 0.6511, + "step": 515 + }, + { + "epoch": 0.07, + "grad_norm": 0.732197985942794, + "learning_rate": 9.999784507099598e-06, + "loss": 0.5622, + "step": 516 + }, + { + "epoch": 0.07, + "grad_norm": 0.8622453933353151, + "learning_rate": 9.999774823293794e-06, + "loss": 0.6547, + "step": 517 + }, + { + "epoch": 0.07, + "grad_norm": 0.9183637108699996, + "learning_rate": 9.999764926663558e-06, + "loss": 0.6939, + "step": 518 + }, + { + "epoch": 0.07, + "grad_norm": 0.8389181768232583, + "learning_rate": 9.999754817209305e-06, + "loss": 0.5699, + "step": 519 + }, + { + "epoch": 0.07, + "grad_norm": 0.6502745124123062, + "learning_rate": 9.999744494931465e-06, + "loss": 0.6191, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 0.6886355269499652, + "learning_rate": 9.99973395983048e-06, + "loss": 0.5393, + "step": 521 + }, + { + "epoch": 0.07, + "grad_norm": 0.6612716942626704, + "learning_rate": 9.999723211906796e-06, + "loss": 0.6018, + "step": 522 + }, + { + "epoch": 0.07, + "grad_norm": 0.6501186337772761, + "learning_rate": 9.999712251160871e-06, + "loss": 0.5573, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 0.7706352731103632, + "learning_rate": 9.999701077593174e-06, + "loss": 0.6581, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 0.7730012720034379, + "learning_rate": 9.999689691204179e-06, + "loss": 0.5962, + "step": 525 + }, + { + "epoch": 0.07, + "grad_norm": 0.9508155364434, + "learning_rate": 9.99967809199437e-06, + "loss": 0.6623, + "step": 526 + }, + { + "epoch": 0.07, + "grad_norm": 0.8060102874185446, + "learning_rate": 9.999666279964242e-06, + "loss": 0.6761, + "step": 527 + }, + { + "epoch": 0.07, + "grad_norm": 0.7066786391666761, + "learning_rate": 9.999654255114295e-06, + "loss": 0.5771, + "step": 528 + }, + { + "epoch": 0.07, + "grad_norm": 0.6828924290413217, + "learning_rate": 9.999642017445045e-06, + "loss": 0.62, + "step": 529 + }, + { + "epoch": 0.07, + "grad_norm": 0.7222614445163584, + "learning_rate": 9.99962956695701e-06, + "loss": 0.6126, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 0.9046138767923666, + "learning_rate": 9.999616903650722e-06, + "loss": 0.5865, + "step": 531 + }, + { + "epoch": 0.07, + "grad_norm": 0.7886204704955198, + "learning_rate": 9.999604027526717e-06, + "loss": 0.65, + "step": 532 + }, + { + "epoch": 0.07, + "grad_norm": 1.0075595298045992, + "learning_rate": 9.999590938585546e-06, + "loss": 0.69, + "step": 533 + }, + { + "epoch": 0.07, + "grad_norm": 0.8124456029009315, + "learning_rate": 9.999577636827766e-06, + "loss": 0.6025, + "step": 534 + }, + { + "epoch": 0.07, + "grad_norm": 0.6329892453569808, + "learning_rate": 9.99956412225394e-06, + "loss": 0.5329, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 0.7952247674357384, + "learning_rate": 9.999550394864647e-06, + "loss": 0.5825, + "step": 536 + }, + { + "epoch": 0.07, + "grad_norm": 0.8465928631531884, + "learning_rate": 9.99953645466047e-06, + "loss": 0.6563, + "step": 537 + }, + { + "epoch": 0.07, + "grad_norm": 0.8713340741755371, + "learning_rate": 9.999522301642004e-06, + "loss": 0.6791, + "step": 538 + }, + { + "epoch": 0.07, + "grad_norm": 0.9350628744751012, + "learning_rate": 9.999507935809848e-06, + "loss": 0.5694, + "step": 539 + }, + { + "epoch": 0.07, + "grad_norm": 0.7993387054659695, + "learning_rate": 9.999493357164616e-06, + "loss": 0.5733, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 0.6539378896746085, + "learning_rate": 9.999478565706927e-06, + "loss": 0.5551, + "step": 541 + }, + { + "epoch": 0.07, + "grad_norm": 0.8243934943005577, + "learning_rate": 9.999463561437412e-06, + "loss": 0.624, + "step": 542 + }, + { + "epoch": 0.07, + "grad_norm": 0.7133367340956287, + "learning_rate": 9.999448344356709e-06, + "loss": 0.5759, + "step": 543 + }, + { + "epoch": 0.07, + "grad_norm": 0.6729087774343463, + "learning_rate": 9.999432914465466e-06, + "loss": 0.5409, + "step": 544 + }, + { + "epoch": 0.07, + "grad_norm": 1.008624253443484, + "learning_rate": 9.99941727176434e-06, + "loss": 0.6462, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 0.7191947866708757, + "learning_rate": 9.999401416253997e-06, + "loss": 0.5832, + "step": 546 + }, + { + "epoch": 0.07, + "grad_norm": 0.6854202092498776, + "learning_rate": 9.99938534793511e-06, + "loss": 0.5299, + "step": 547 + }, + { + "epoch": 0.07, + "grad_norm": 0.6434247509405175, + "learning_rate": 9.999369066808366e-06, + "loss": 0.5476, + "step": 548 + }, + { + "epoch": 0.07, + "grad_norm": 0.6140147352500417, + "learning_rate": 9.999352572874457e-06, + "loss": 0.5594, + "step": 549 + }, + { + "epoch": 0.07, + "grad_norm": 0.7705661429097387, + "learning_rate": 9.999335866134084e-06, + "loss": 0.6459, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 0.8157044716675759, + "learning_rate": 9.999318946587957e-06, + "loss": 0.6638, + "step": 551 + }, + { + "epoch": 0.07, + "grad_norm": 1.5025190176934549, + "learning_rate": 9.9993018142368e-06, + "loss": 0.6143, + "step": 552 + }, + { + "epoch": 0.07, + "grad_norm": 0.6499419604536284, + "learning_rate": 9.999284469081338e-06, + "loss": 0.5732, + "step": 553 + }, + { + "epoch": 0.07, + "grad_norm": 0.7374540089225985, + "learning_rate": 9.999266911122314e-06, + "loss": 0.5951, + "step": 554 + }, + { + "epoch": 0.07, + "grad_norm": 0.6667930700380424, + "learning_rate": 9.999249140360473e-06, + "loss": 0.576, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 1.0162824245358222, + "learning_rate": 9.99923115679657e-06, + "loss": 0.7426, + "step": 556 + }, + { + "epoch": 0.07, + "grad_norm": 0.6305009829157838, + "learning_rate": 9.999212960431372e-06, + "loss": 0.5965, + "step": 557 + }, + { + "epoch": 0.07, + "grad_norm": 0.823049052290893, + "learning_rate": 9.999194551265653e-06, + "loss": 0.6681, + "step": 558 + }, + { + "epoch": 0.07, + "grad_norm": 0.7925088954710804, + "learning_rate": 9.9991759293002e-06, + "loss": 0.6394, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 0.6887087221153546, + "learning_rate": 9.9991570945358e-06, + "loss": 0.5601, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 0.631151150092096, + "learning_rate": 9.999138046973258e-06, + "loss": 0.5477, + "step": 561 + }, + { + "epoch": 0.07, + "grad_norm": 0.8605832852629615, + "learning_rate": 9.999118786613384e-06, + "loss": 0.6283, + "step": 562 + }, + { + "epoch": 0.07, + "grad_norm": 0.739203631837093, + "learning_rate": 9.999099313457e-06, + "loss": 0.6127, + "step": 563 + }, + { + "epoch": 0.07, + "grad_norm": 0.6596350627733387, + "learning_rate": 9.999079627504931e-06, + "loss": 0.5736, + "step": 564 + }, + { + "epoch": 0.07, + "grad_norm": 1.4592063027815079, + "learning_rate": 9.999059728758018e-06, + "loss": 0.6749, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 1.1222640617266986, + "learning_rate": 9.999039617217108e-06, + "loss": 0.7198, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 0.6920355737275525, + "learning_rate": 9.999019292883053e-06, + "loss": 0.612, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 0.76675542132742, + "learning_rate": 9.998998755756724e-06, + "loss": 0.6004, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 0.8098835588375225, + "learning_rate": 9.998978005838993e-06, + "loss": 0.5792, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 0.9330560815304642, + "learning_rate": 9.99895704313074e-06, + "loss": 0.6745, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 0.7107217808643186, + "learning_rate": 9.998935867632862e-06, + "loss": 0.5645, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 0.8628341720586082, + "learning_rate": 9.998914479346258e-06, + "loss": 0.5851, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 1.812296064370831, + "learning_rate": 9.998892878271839e-06, + "loss": 0.6905, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 0.8344977204345659, + "learning_rate": 9.998871064410524e-06, + "loss": 0.636, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 0.6655544154860575, + "learning_rate": 9.998849037763243e-06, + "loss": 0.6042, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 0.9039870258725992, + "learning_rate": 9.998826798330932e-06, + "loss": 0.7086, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 0.6918627084041002, + "learning_rate": 9.998804346114536e-06, + "loss": 0.532, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 0.6263729431215533, + "learning_rate": 9.998781681115017e-06, + "loss": 0.5717, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 0.9054569474491742, + "learning_rate": 9.998758803333333e-06, + "loss": 0.6493, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 0.7819365195426208, + "learning_rate": 9.99873571277046e-06, + "loss": 0.6549, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 0.7845315517848321, + "learning_rate": 9.998712409427382e-06, + "loss": 0.6991, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.0283213258141806, + "learning_rate": 9.99868889330509e-06, + "loss": 0.669, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 0.9632780379974247, + "learning_rate": 9.998665164404586e-06, + "loss": 0.7075, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 0.7756967072338767, + "learning_rate": 9.99864122272688e-06, + "loss": 0.5049, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 0.9607889685736162, + "learning_rate": 9.99861706827299e-06, + "loss": 0.6408, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 0.6789333840195599, + "learning_rate": 9.998592701043945e-06, + "loss": 0.5208, + "step": 586 + }, + { + "epoch": 0.07, + "grad_norm": 0.7073847955888349, + "learning_rate": 9.998568121040781e-06, + "loss": 0.6186, + "step": 587 + }, + { + "epoch": 0.07, + "grad_norm": 0.8511169500473535, + "learning_rate": 9.998543328264544e-06, + "loss": 0.6493, + "step": 588 + }, + { + "epoch": 0.08, + "grad_norm": 0.6728585712796856, + "learning_rate": 9.998518322716294e-06, + "loss": 0.5862, + "step": 589 + }, + { + "epoch": 0.08, + "grad_norm": 0.8205208526183553, + "learning_rate": 9.99849310439709e-06, + "loss": 0.6667, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 0.7534012734900305, + "learning_rate": 9.998467673308008e-06, + "loss": 0.6106, + "step": 591 + }, + { + "epoch": 0.08, + "grad_norm": 0.7119869181595472, + "learning_rate": 9.998442029450128e-06, + "loss": 0.5755, + "step": 592 + }, + { + "epoch": 0.08, + "grad_norm": 0.8701971631151063, + "learning_rate": 9.998416172824545e-06, + "loss": 0.7085, + "step": 593 + }, + { + "epoch": 0.08, + "grad_norm": 0.9227174594042213, + "learning_rate": 9.998390103432359e-06, + "loss": 0.6441, + "step": 594 + }, + { + "epoch": 0.08, + "grad_norm": 0.7216308708654693, + "learning_rate": 9.99836382127468e-06, + "loss": 0.5922, + "step": 595 + }, + { + "epoch": 0.08, + "grad_norm": 0.5848478019701097, + "learning_rate": 9.998337326352623e-06, + "loss": 0.5544, + "step": 596 + }, + { + "epoch": 0.08, + "grad_norm": 0.951394179802008, + "learning_rate": 9.99831061866732e-06, + "loss": 0.6307, + "step": 597 + }, + { + "epoch": 0.08, + "grad_norm": 0.6242384053692704, + "learning_rate": 9.998283698219905e-06, + "loss": 0.543, + "step": 598 + }, + { + "epoch": 0.08, + "grad_norm": 0.7794169851853867, + "learning_rate": 9.998256565011525e-06, + "loss": 0.6081, + "step": 599 + }, + { + "epoch": 0.08, + "grad_norm": 0.8420300848884748, + "learning_rate": 9.998229219043336e-06, + "loss": 0.5916, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 0.704643197328675, + "learning_rate": 9.9982016603165e-06, + "loss": 0.5532, + "step": 601 + }, + { + "epoch": 0.08, + "grad_norm": 0.6434413239654009, + "learning_rate": 9.998173888832193e-06, + "loss": 0.5382, + "step": 602 + }, + { + "epoch": 0.08, + "grad_norm": 1.028914317228709, + "learning_rate": 9.998145904591595e-06, + "loss": 0.6082, + "step": 603 + }, + { + "epoch": 0.08, + "grad_norm": 0.6454334681598273, + "learning_rate": 9.998117707595898e-06, + "loss": 0.5557, + "step": 604 + }, + { + "epoch": 0.08, + "grad_norm": 0.6891323611593384, + "learning_rate": 9.9980892978463e-06, + "loss": 0.5964, + "step": 605 + }, + { + "epoch": 0.08, + "grad_norm": 0.6483259854376953, + "learning_rate": 9.998060675344015e-06, + "loss": 0.5604, + "step": 606 + }, + { + "epoch": 0.08, + "grad_norm": 0.640850410937195, + "learning_rate": 9.998031840090257e-06, + "loss": 0.5605, + "step": 607 + }, + { + "epoch": 0.08, + "grad_norm": 0.7237150931450929, + "learning_rate": 9.998002792086254e-06, + "loss": 0.5944, + "step": 608 + }, + { + "epoch": 0.08, + "grad_norm": 0.6996628810617805, + "learning_rate": 9.997973531333245e-06, + "loss": 0.5957, + "step": 609 + }, + { + "epoch": 0.08, + "grad_norm": 0.6173241916508667, + "learning_rate": 9.997944057832474e-06, + "loss": 0.5626, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 0.9164611669637491, + "learning_rate": 9.997914371585195e-06, + "loss": 0.677, + "step": 611 + }, + { + "epoch": 0.08, + "grad_norm": 0.7285589497635658, + "learning_rate": 9.997884472592672e-06, + "loss": 0.5631, + "step": 612 + }, + { + "epoch": 0.08, + "grad_norm": 0.9485477673636976, + "learning_rate": 9.997854360856178e-06, + "loss": 0.669, + "step": 613 + }, + { + "epoch": 0.08, + "grad_norm": 0.7781838708302896, + "learning_rate": 9.997824036376995e-06, + "loss": 0.6623, + "step": 614 + }, + { + "epoch": 0.08, + "grad_norm": 0.7898792801970184, + "learning_rate": 9.997793499156414e-06, + "loss": 0.6471, + "step": 615 + }, + { + "epoch": 0.08, + "grad_norm": 0.8308443987749227, + "learning_rate": 9.997762749195735e-06, + "loss": 0.6877, + "step": 616 + }, + { + "epoch": 0.08, + "grad_norm": 0.6310836629954972, + "learning_rate": 9.997731786496265e-06, + "loss": 0.5485, + "step": 617 + }, + { + "epoch": 0.08, + "grad_norm": 1.2053576611133063, + "learning_rate": 9.997700611059323e-06, + "loss": 0.6664, + "step": 618 + }, + { + "epoch": 0.08, + "grad_norm": 0.613546971387751, + "learning_rate": 9.997669222886237e-06, + "loss": 0.5233, + "step": 619 + }, + { + "epoch": 0.08, + "grad_norm": 0.6745170096468901, + "learning_rate": 9.997637621978341e-06, + "loss": 0.5831, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 0.8062537763927432, + "learning_rate": 9.997605808336985e-06, + "loss": 0.6216, + "step": 621 + }, + { + "epoch": 0.08, + "grad_norm": 0.6924571937105224, + "learning_rate": 9.997573781963517e-06, + "loss": 0.5789, + "step": 622 + }, + { + "epoch": 0.08, + "grad_norm": 1.181097370897103, + "learning_rate": 9.997541542859305e-06, + "loss": 0.6104, + "step": 623 + }, + { + "epoch": 0.08, + "grad_norm": 0.5527977444336124, + "learning_rate": 9.997509091025718e-06, + "loss": 0.5088, + "step": 624 + }, + { + "epoch": 0.08, + "grad_norm": 0.8535412593908939, + "learning_rate": 9.99747642646414e-06, + "loss": 0.6658, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 0.8290599255212616, + "learning_rate": 9.997443549175957e-06, + "loss": 0.642, + "step": 626 + }, + { + "epoch": 0.08, + "grad_norm": 0.6585395907225443, + "learning_rate": 9.997410459162575e-06, + "loss": 0.5996, + "step": 627 + }, + { + "epoch": 0.08, + "grad_norm": 0.7643793451239477, + "learning_rate": 9.997377156425398e-06, + "loss": 0.6094, + "step": 628 + }, + { + "epoch": 0.08, + "grad_norm": 0.5850937661332478, + "learning_rate": 9.997343640965846e-06, + "loss": 0.5058, + "step": 629 + }, + { + "epoch": 0.08, + "grad_norm": 0.8798977741642525, + "learning_rate": 9.997309912785343e-06, + "loss": 0.6818, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 0.7845680405955845, + "learning_rate": 9.997275971885326e-06, + "loss": 0.6399, + "step": 631 + }, + { + "epoch": 0.08, + "grad_norm": 0.787892789994041, + "learning_rate": 9.997241818267241e-06, + "loss": 0.5836, + "step": 632 + }, + { + "epoch": 0.08, + "grad_norm": 0.871851233272999, + "learning_rate": 9.997207451932539e-06, + "loss": 0.7112, + "step": 633 + }, + { + "epoch": 0.08, + "grad_norm": 0.8369811460124821, + "learning_rate": 9.997172872882685e-06, + "loss": 0.6088, + "step": 634 + }, + { + "epoch": 0.08, + "grad_norm": 0.7052094714802036, + "learning_rate": 9.997138081119153e-06, + "loss": 0.5751, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 0.7405549824133622, + "learning_rate": 9.997103076643418e-06, + "loss": 0.5486, + "step": 636 + }, + { + "epoch": 0.08, + "grad_norm": 0.7189672087528193, + "learning_rate": 9.997067859456975e-06, + "loss": 0.5817, + "step": 637 + }, + { + "epoch": 0.08, + "grad_norm": 0.7107487540224503, + "learning_rate": 9.997032429561321e-06, + "loss": 0.5652, + "step": 638 + }, + { + "epoch": 0.08, + "grad_norm": 0.8769897123204996, + "learning_rate": 9.996996786957967e-06, + "loss": 0.6672, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 0.6401910642444841, + "learning_rate": 9.996960931648425e-06, + "loss": 0.5351, + "step": 640 + }, + { + "epoch": 0.08, + "grad_norm": 0.6778235077811764, + "learning_rate": 9.996924863634225e-06, + "loss": 0.5921, + "step": 641 + }, + { + "epoch": 0.08, + "grad_norm": 0.7011876136636904, + "learning_rate": 9.996888582916902e-06, + "loss": 0.5723, + "step": 642 + }, + { + "epoch": 0.08, + "grad_norm": 1.0460066107180521, + "learning_rate": 9.996852089497999e-06, + "loss": 0.6511, + "step": 643 + }, + { + "epoch": 0.08, + "grad_norm": 0.6826216750177364, + "learning_rate": 9.996815383379072e-06, + "loss": 0.5215, + "step": 644 + }, + { + "epoch": 0.08, + "grad_norm": 0.6729038352173468, + "learning_rate": 9.99677846456168e-06, + "loss": 0.5201, + "step": 645 + }, + { + "epoch": 0.08, + "grad_norm": 0.6388857321520266, + "learning_rate": 9.996741333047398e-06, + "loss": 0.5607, + "step": 646 + }, + { + "epoch": 0.08, + "grad_norm": 0.7085204374891683, + "learning_rate": 9.996703988837801e-06, + "loss": 0.5963, + "step": 647 + }, + { + "epoch": 0.08, + "grad_norm": 0.6648291904521154, + "learning_rate": 9.996666431934486e-06, + "loss": 0.5623, + "step": 648 + }, + { + "epoch": 0.08, + "grad_norm": 0.791919546274351, + "learning_rate": 9.996628662339046e-06, + "loss": 0.6429, + "step": 649 + }, + { + "epoch": 0.08, + "grad_norm": 0.8183198332608097, + "learning_rate": 9.996590680053091e-06, + "loss": 0.632, + "step": 650 + }, + { + "epoch": 0.08, + "grad_norm": 0.806752433284419, + "learning_rate": 9.99655248507824e-06, + "loss": 0.6869, + "step": 651 + }, + { + "epoch": 0.08, + "grad_norm": 0.6296693161494359, + "learning_rate": 9.996514077416114e-06, + "loss": 0.5906, + "step": 652 + }, + { + "epoch": 0.08, + "grad_norm": 0.7090714175483955, + "learning_rate": 9.996475457068351e-06, + "loss": 0.5875, + "step": 653 + }, + { + "epoch": 0.08, + "grad_norm": 1.0149882615272856, + "learning_rate": 9.996436624036594e-06, + "loss": 0.6216, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 0.6379949441574454, + "learning_rate": 9.996397578322497e-06, + "loss": 0.5716, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 0.8184009434954894, + "learning_rate": 9.996358319927719e-06, + "loss": 0.6285, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 0.7292977782606443, + "learning_rate": 9.996318848853936e-06, + "loss": 0.5799, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 0.6748165000254052, + "learning_rate": 9.996279165102824e-06, + "loss": 0.5708, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 0.8797755071234167, + "learning_rate": 9.996239268676075e-06, + "loss": 0.6819, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 0.7414200697991348, + "learning_rate": 9.996199159575385e-06, + "loss": 0.6368, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 0.6793534011032392, + "learning_rate": 9.996158837802463e-06, + "loss": 0.6238, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 0.6365931996356293, + "learning_rate": 9.996118303359024e-06, + "loss": 0.6239, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 1.7063336894857222, + "learning_rate": 9.996077556246795e-06, + "loss": 0.7095, + "step": 663 + }, + { + "epoch": 0.08, + "grad_norm": 0.8983126405033272, + "learning_rate": 9.996036596467509e-06, + "loss": 0.6375, + "step": 664 + }, + { + "epoch": 0.08, + "grad_norm": 0.6875782930744684, + "learning_rate": 9.995995424022911e-06, + "loss": 0.5261, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 0.6755263244361941, + "learning_rate": 9.995954038914752e-06, + "loss": 0.5926, + "step": 666 + }, + { + "epoch": 0.08, + "grad_norm": 1.1520147850642788, + "learning_rate": 9.995912441144794e-06, + "loss": 0.7104, + "step": 667 + }, + { + "epoch": 0.09, + "grad_norm": 0.8267163628665787, + "learning_rate": 9.995870630714808e-06, + "loss": 0.6427, + "step": 668 + }, + { + "epoch": 0.09, + "grad_norm": 0.839777535046645, + "learning_rate": 9.995828607626574e-06, + "loss": 0.6504, + "step": 669 + }, + { + "epoch": 0.09, + "grad_norm": 0.6164221903425307, + "learning_rate": 9.995786371881882e-06, + "loss": 0.5905, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 0.9366572634947168, + "learning_rate": 9.995743923482527e-06, + "loss": 0.6999, + "step": 671 + }, + { + "epoch": 0.09, + "grad_norm": 0.7084248721753169, + "learning_rate": 9.995701262430317e-06, + "loss": 0.5855, + "step": 672 + }, + { + "epoch": 0.09, + "grad_norm": 0.8142888157647624, + "learning_rate": 9.995658388727067e-06, + "loss": 0.6274, + "step": 673 + }, + { + "epoch": 0.09, + "grad_norm": 0.60095279503971, + "learning_rate": 9.995615302374607e-06, + "loss": 0.5436, + "step": 674 + }, + { + "epoch": 0.09, + "grad_norm": 0.6355827884716699, + "learning_rate": 9.995572003374765e-06, + "loss": 0.565, + "step": 675 + }, + { + "epoch": 0.09, + "grad_norm": 0.6002682697245667, + "learning_rate": 9.995528491729386e-06, + "loss": 0.5332, + "step": 676 + }, + { + "epoch": 0.09, + "grad_norm": 0.7921462760949219, + "learning_rate": 9.995484767440321e-06, + "loss": 0.5832, + "step": 677 + }, + { + "epoch": 0.09, + "grad_norm": 0.8446136275255393, + "learning_rate": 9.995440830509437e-06, + "loss": 0.6251, + "step": 678 + }, + { + "epoch": 0.09, + "grad_norm": 0.578197473122897, + "learning_rate": 9.995396680938599e-06, + "loss": 0.5717, + "step": 679 + }, + { + "epoch": 0.09, + "grad_norm": 0.5975808365786478, + "learning_rate": 9.995352318729685e-06, + "loss": 0.5526, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 0.8402832168714275, + "learning_rate": 9.995307743884587e-06, + "loss": 0.6598, + "step": 681 + }, + { + "epoch": 0.09, + "grad_norm": 0.5955051072932334, + "learning_rate": 9.9952629564052e-06, + "loss": 0.5324, + "step": 682 + }, + { + "epoch": 0.09, + "grad_norm": 0.7020067178988065, + "learning_rate": 9.995217956293433e-06, + "loss": 0.5926, + "step": 683 + }, + { + "epoch": 0.09, + "grad_norm": 0.6494717514572398, + "learning_rate": 9.995172743551199e-06, + "loss": 0.5969, + "step": 684 + }, + { + "epoch": 0.09, + "grad_norm": 0.9143489878257325, + "learning_rate": 9.995127318180424e-06, + "loss": 0.6555, + "step": 685 + }, + { + "epoch": 0.09, + "grad_norm": 0.721704853088111, + "learning_rate": 9.99508168018304e-06, + "loss": 0.5903, + "step": 686 + }, + { + "epoch": 0.09, + "grad_norm": 0.6613346090827121, + "learning_rate": 9.995035829560993e-06, + "loss": 0.5517, + "step": 687 + }, + { + "epoch": 0.09, + "grad_norm": 0.5711868712156565, + "learning_rate": 9.994989766316232e-06, + "loss": 0.5386, + "step": 688 + }, + { + "epoch": 0.09, + "grad_norm": 0.6276912272892899, + "learning_rate": 9.994943490450719e-06, + "loss": 0.5614, + "step": 689 + }, + { + "epoch": 0.09, + "grad_norm": 0.6781272192810728, + "learning_rate": 9.99489700196642e-06, + "loss": 0.6198, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 1.6205918684127034, + "learning_rate": 9.99485030086532e-06, + "loss": 0.621, + "step": 691 + }, + { + "epoch": 0.09, + "grad_norm": 1.0623697303149042, + "learning_rate": 9.994803387149403e-06, + "loss": 0.6651, + "step": 692 + }, + { + "epoch": 0.09, + "grad_norm": 0.6625208597496124, + "learning_rate": 9.994756260820668e-06, + "loss": 0.5595, + "step": 693 + }, + { + "epoch": 0.09, + "grad_norm": 0.908463009621757, + "learning_rate": 9.994708921881118e-06, + "loss": 0.6057, + "step": 694 + }, + { + "epoch": 0.09, + "grad_norm": 0.7078131488575676, + "learning_rate": 9.994661370332772e-06, + "loss": 0.5556, + "step": 695 + }, + { + "epoch": 0.09, + "grad_norm": 0.6595231667492926, + "learning_rate": 9.99461360617765e-06, + "loss": 0.538, + "step": 696 + }, + { + "epoch": 0.09, + "grad_norm": 0.6282873366296898, + "learning_rate": 9.99456562941779e-06, + "loss": 0.5978, + "step": 697 + }, + { + "epoch": 0.09, + "grad_norm": 0.8378046457361583, + "learning_rate": 9.99451744005523e-06, + "loss": 0.6169, + "step": 698 + }, + { + "epoch": 0.09, + "grad_norm": 0.7321909573353417, + "learning_rate": 9.994469038092021e-06, + "loss": 0.633, + "step": 699 + }, + { + "epoch": 0.09, + "grad_norm": 0.7138296810024128, + "learning_rate": 9.994420423530226e-06, + "loss": 0.5621, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 0.8574973273647994, + "learning_rate": 9.994371596371916e-06, + "loss": 0.6913, + "step": 701 + }, + { + "epoch": 0.09, + "grad_norm": 0.7042105564808486, + "learning_rate": 9.994322556619165e-06, + "loss": 0.5727, + "step": 702 + }, + { + "epoch": 0.09, + "grad_norm": 0.8617735874889249, + "learning_rate": 9.994273304274063e-06, + "loss": 0.7115, + "step": 703 + }, + { + "epoch": 0.09, + "grad_norm": 1.0008493088743105, + "learning_rate": 9.994223839338704e-06, + "loss": 0.6561, + "step": 704 + }, + { + "epoch": 0.09, + "grad_norm": 0.7520301080440177, + "learning_rate": 9.994174161815198e-06, + "loss": 0.6137, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 0.6516214975658886, + "learning_rate": 9.994124271705654e-06, + "loss": 0.5581, + "step": 706 + }, + { + "epoch": 0.09, + "grad_norm": 1.9097192762638664, + "learning_rate": 9.994074169012201e-06, + "loss": 0.5854, + "step": 707 + }, + { + "epoch": 0.09, + "grad_norm": 0.8444910501822214, + "learning_rate": 9.99402385373697e-06, + "loss": 0.5931, + "step": 708 + }, + { + "epoch": 0.09, + "grad_norm": 0.6834082983943086, + "learning_rate": 9.9939733258821e-06, + "loss": 0.5772, + "step": 709 + }, + { + "epoch": 0.09, + "grad_norm": 0.9681715674561516, + "learning_rate": 9.993922585449745e-06, + "loss": 0.6371, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 0.9051820415048046, + "learning_rate": 9.993871632442065e-06, + "loss": 0.6557, + "step": 711 + }, + { + "epoch": 0.09, + "grad_norm": 0.7707641503024311, + "learning_rate": 9.993820466861225e-06, + "loss": 0.6534, + "step": 712 + }, + { + "epoch": 0.09, + "grad_norm": 1.2328567189492285, + "learning_rate": 9.99376908870941e-06, + "loss": 0.6417, + "step": 713 + }, + { + "epoch": 0.09, + "grad_norm": 0.7516038633024393, + "learning_rate": 9.993717497988797e-06, + "loss": 0.6337, + "step": 714 + }, + { + "epoch": 0.09, + "grad_norm": 0.7045844606273769, + "learning_rate": 9.993665694701591e-06, + "loss": 0.5938, + "step": 715 + }, + { + "epoch": 0.09, + "grad_norm": 0.6691251105720674, + "learning_rate": 9.993613678849994e-06, + "loss": 0.5311, + "step": 716 + }, + { + "epoch": 0.09, + "grad_norm": 0.8419025097296773, + "learning_rate": 9.99356145043622e-06, + "loss": 0.6553, + "step": 717 + }, + { + "epoch": 0.09, + "grad_norm": 0.6814007274604756, + "learning_rate": 9.993509009462491e-06, + "loss": 0.5697, + "step": 718 + }, + { + "epoch": 0.09, + "grad_norm": 0.5878337898826469, + "learning_rate": 9.993456355931042e-06, + "loss": 0.5071, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 0.7779635187798183, + "learning_rate": 9.993403489844112e-06, + "loss": 0.6374, + "step": 720 + }, + { + "epoch": 0.09, + "grad_norm": 0.8100625867571185, + "learning_rate": 9.99335041120395e-06, + "loss": 0.5503, + "step": 721 + }, + { + "epoch": 0.09, + "grad_norm": 0.6768655524400484, + "learning_rate": 9.99329712001282e-06, + "loss": 0.5852, + "step": 722 + }, + { + "epoch": 0.09, + "grad_norm": 0.7387183957894116, + "learning_rate": 9.993243616272987e-06, + "loss": 0.6057, + "step": 723 + }, + { + "epoch": 0.09, + "grad_norm": 0.6497037461439694, + "learning_rate": 9.993189899986731e-06, + "loss": 0.5253, + "step": 724 + }, + { + "epoch": 0.09, + "grad_norm": 0.8513723100526019, + "learning_rate": 9.993135971156335e-06, + "loss": 0.6671, + "step": 725 + }, + { + "epoch": 0.09, + "grad_norm": 0.8503166815417971, + "learning_rate": 9.993081829784098e-06, + "loss": 0.7132, + "step": 726 + }, + { + "epoch": 0.09, + "grad_norm": 0.7190913432055303, + "learning_rate": 9.993027475872322e-06, + "loss": 0.5251, + "step": 727 + }, + { + "epoch": 0.09, + "grad_norm": 0.6644874297155562, + "learning_rate": 9.992972909423321e-06, + "loss": 0.5253, + "step": 728 + }, + { + "epoch": 0.09, + "grad_norm": 0.8234238920892054, + "learning_rate": 9.99291813043942e-06, + "loss": 0.5972, + "step": 729 + }, + { + "epoch": 0.09, + "grad_norm": 0.7352248725035845, + "learning_rate": 9.992863138922949e-06, + "loss": 0.6274, + "step": 730 + }, + { + "epoch": 0.09, + "grad_norm": 0.7644661183425183, + "learning_rate": 9.992807934876248e-06, + "loss": 0.685, + "step": 731 + }, + { + "epoch": 0.09, + "grad_norm": 0.7556809583325526, + "learning_rate": 9.99275251830167e-06, + "loss": 0.5298, + "step": 732 + }, + { + "epoch": 0.09, + "grad_norm": 0.824620178065385, + "learning_rate": 9.99269688920157e-06, + "loss": 0.6469, + "step": 733 + }, + { + "epoch": 0.09, + "grad_norm": 0.7232993765683309, + "learning_rate": 9.992641047578319e-06, + "loss": 0.5896, + "step": 734 + }, + { + "epoch": 0.09, + "grad_norm": 0.6835725439560649, + "learning_rate": 9.99258499343429e-06, + "loss": 0.5439, + "step": 735 + }, + { + "epoch": 0.09, + "grad_norm": 0.6363589148560473, + "learning_rate": 9.992528726771875e-06, + "loss": 0.5456, + "step": 736 + }, + { + "epoch": 0.09, + "grad_norm": 0.656622508428008, + "learning_rate": 9.992472247593466e-06, + "loss": 0.5427, + "step": 737 + }, + { + "epoch": 0.09, + "grad_norm": 1.3032360495897406, + "learning_rate": 9.992415555901466e-06, + "loss": 0.648, + "step": 738 + }, + { + "epoch": 0.09, + "grad_norm": 0.7557967211581249, + "learning_rate": 9.99235865169829e-06, + "loss": 0.627, + "step": 739 + }, + { + "epoch": 0.09, + "grad_norm": 0.6555105421279371, + "learning_rate": 9.992301534986359e-06, + "loss": 0.5434, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 0.7438722139028712, + "learning_rate": 9.992244205768104e-06, + "loss": 0.5501, + "step": 741 + }, + { + "epoch": 0.09, + "grad_norm": 0.7776058377148094, + "learning_rate": 9.992186664045966e-06, + "loss": 0.5536, + "step": 742 + }, + { + "epoch": 0.09, + "grad_norm": 0.6041178167844797, + "learning_rate": 9.992128909822395e-06, + "loss": 0.5535, + "step": 743 + }, + { + "epoch": 0.09, + "grad_norm": 0.6041064112548524, + "learning_rate": 9.99207094309985e-06, + "loss": 0.4954, + "step": 744 + }, + { + "epoch": 0.09, + "grad_norm": 0.6782353751153032, + "learning_rate": 9.992012763880797e-06, + "loss": 0.6132, + "step": 745 + }, + { + "epoch": 0.1, + "grad_norm": 0.5807807179807045, + "learning_rate": 9.991954372167711e-06, + "loss": 0.4822, + "step": 746 + }, + { + "epoch": 0.1, + "grad_norm": 0.8909082671096811, + "learning_rate": 9.991895767963082e-06, + "loss": 0.6357, + "step": 747 + }, + { + "epoch": 0.1, + "grad_norm": 0.6784308680180855, + "learning_rate": 9.991836951269401e-06, + "loss": 0.5967, + "step": 748 + }, + { + "epoch": 0.1, + "grad_norm": 0.8851869588465735, + "learning_rate": 9.99177792208917e-06, + "loss": 0.6157, + "step": 749 + }, + { + "epoch": 0.1, + "grad_norm": 0.80918196157227, + "learning_rate": 9.991718680424906e-06, + "loss": 0.6044, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 0.7570481980607737, + "learning_rate": 9.991659226279132e-06, + "loss": 0.6415, + "step": 751 + }, + { + "epoch": 0.1, + "grad_norm": 0.6209879202808876, + "learning_rate": 9.991599559654372e-06, + "loss": 0.5686, + "step": 752 + }, + { + "epoch": 0.1, + "grad_norm": 0.5636213452795127, + "learning_rate": 9.991539680553171e-06, + "loss": 0.5461, + "step": 753 + }, + { + "epoch": 0.1, + "grad_norm": 0.7768031019190919, + "learning_rate": 9.991479588978077e-06, + "loss": 0.6046, + "step": 754 + }, + { + "epoch": 0.1, + "grad_norm": 0.8738921674332195, + "learning_rate": 9.991419284931645e-06, + "loss": 0.6363, + "step": 755 + }, + { + "epoch": 0.1, + "grad_norm": 0.9867078491125648, + "learning_rate": 9.991358768416449e-06, + "loss": 0.5987, + "step": 756 + }, + { + "epoch": 0.1, + "grad_norm": 0.6157519853697115, + "learning_rate": 9.991298039435055e-06, + "loss": 0.5974, + "step": 757 + }, + { + "epoch": 0.1, + "grad_norm": 0.8817679833279981, + "learning_rate": 9.991237097990057e-06, + "loss": 0.5939, + "step": 758 + }, + { + "epoch": 0.1, + "grad_norm": 0.6540140299071304, + "learning_rate": 9.991175944084044e-06, + "loss": 0.54, + "step": 759 + }, + { + "epoch": 0.1, + "grad_norm": 0.7985386733836422, + "learning_rate": 9.991114577719621e-06, + "loss": 0.6122, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 0.664628522098663, + "learning_rate": 9.9910529988994e-06, + "loss": 0.5446, + "step": 761 + }, + { + "epoch": 0.1, + "grad_norm": 0.7417870718986781, + "learning_rate": 9.990991207626e-06, + "loss": 0.6229, + "step": 762 + }, + { + "epoch": 0.1, + "grad_norm": 0.6035253140147998, + "learning_rate": 9.990929203902056e-06, + "loss": 0.5411, + "step": 763 + }, + { + "epoch": 0.1, + "grad_norm": 0.7360947084341074, + "learning_rate": 9.990866987730204e-06, + "loss": 0.5483, + "step": 764 + }, + { + "epoch": 0.1, + "grad_norm": 0.6659011890744978, + "learning_rate": 9.990804559113093e-06, + "loss": 0.509, + "step": 765 + }, + { + "epoch": 0.1, + "grad_norm": 0.7637223777381329, + "learning_rate": 9.990741918053376e-06, + "loss": 0.601, + "step": 766 + }, + { + "epoch": 0.1, + "grad_norm": 0.817278269228528, + "learning_rate": 9.99067906455373e-06, + "loss": 0.6557, + "step": 767 + }, + { + "epoch": 0.1, + "grad_norm": 0.6655728021288501, + "learning_rate": 9.99061599861682e-06, + "loss": 0.5226, + "step": 768 + }, + { + "epoch": 0.1, + "grad_norm": 0.9914790343979668, + "learning_rate": 9.990552720245336e-06, + "loss": 0.6255, + "step": 769 + }, + { + "epoch": 0.1, + "grad_norm": 0.7603987378373237, + "learning_rate": 9.99048922944197e-06, + "loss": 0.5869, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 0.8781397184181137, + "learning_rate": 9.990425526209424e-06, + "loss": 0.6306, + "step": 771 + }, + { + "epoch": 0.1, + "grad_norm": 0.7860454394941983, + "learning_rate": 9.990361610550412e-06, + "loss": 0.6214, + "step": 772 + }, + { + "epoch": 0.1, + "grad_norm": 0.6146630025732014, + "learning_rate": 9.990297482467653e-06, + "loss": 0.582, + "step": 773 + }, + { + "epoch": 0.1, + "grad_norm": 0.6136480741260867, + "learning_rate": 9.990233141963877e-06, + "loss": 0.5521, + "step": 774 + }, + { + "epoch": 0.1, + "grad_norm": 0.8202346777365337, + "learning_rate": 9.990168589041821e-06, + "loss": 0.6426, + "step": 775 + }, + { + "epoch": 0.1, + "grad_norm": 0.8162614768870611, + "learning_rate": 9.990103823704236e-06, + "loss": 0.6168, + "step": 776 + }, + { + "epoch": 0.1, + "grad_norm": 0.822384912582235, + "learning_rate": 9.990038845953876e-06, + "loss": 0.6485, + "step": 777 + }, + { + "epoch": 0.1, + "grad_norm": 0.6787019985342974, + "learning_rate": 9.98997365579351e-06, + "loss": 0.5608, + "step": 778 + }, + { + "epoch": 0.1, + "grad_norm": 0.6934141057209189, + "learning_rate": 9.989908253225911e-06, + "loss": 0.5967, + "step": 779 + }, + { + "epoch": 0.1, + "grad_norm": 0.860039086305572, + "learning_rate": 9.989842638253861e-06, + "loss": 0.7043, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 0.6735360534206339, + "learning_rate": 9.989776810880156e-06, + "loss": 0.5952, + "step": 781 + }, + { + "epoch": 0.1, + "grad_norm": 0.6717695884994594, + "learning_rate": 9.9897107711076e-06, + "loss": 0.5797, + "step": 782 + }, + { + "epoch": 0.1, + "grad_norm": 0.6516625962471542, + "learning_rate": 9.989644518938998e-06, + "loss": 0.5605, + "step": 783 + }, + { + "epoch": 0.1, + "grad_norm": 0.9424134113324836, + "learning_rate": 9.989578054377174e-06, + "loss": 0.6219, + "step": 784 + }, + { + "epoch": 0.1, + "grad_norm": 0.7819554334442894, + "learning_rate": 9.989511377424957e-06, + "loss": 0.5281, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 0.8878934838991167, + "learning_rate": 9.989444488085185e-06, + "loss": 0.6686, + "step": 786 + }, + { + "epoch": 0.1, + "grad_norm": 0.5945890322248437, + "learning_rate": 9.989377386360706e-06, + "loss": 0.5234, + "step": 787 + }, + { + "epoch": 0.1, + "grad_norm": 0.7394100402484586, + "learning_rate": 9.989310072254375e-06, + "loss": 0.5295, + "step": 788 + }, + { + "epoch": 0.1, + "grad_norm": 0.6550729403258995, + "learning_rate": 9.989242545769056e-06, + "loss": 0.5895, + "step": 789 + }, + { + "epoch": 0.1, + "grad_norm": 0.8327940750285137, + "learning_rate": 9.989174806907627e-06, + "loss": 0.672, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 0.9044076708722738, + "learning_rate": 9.989106855672968e-06, + "loss": 0.6254, + "step": 791 + }, + { + "epoch": 0.1, + "grad_norm": 0.840277815762319, + "learning_rate": 9.989038692067974e-06, + "loss": 0.6586, + "step": 792 + }, + { + "epoch": 0.1, + "grad_norm": 0.6758925195276294, + "learning_rate": 9.988970316095547e-06, + "loss": 0.5875, + "step": 793 + }, + { + "epoch": 0.1, + "grad_norm": 0.8431847504791554, + "learning_rate": 9.988901727758594e-06, + "loss": 0.6252, + "step": 794 + }, + { + "epoch": 0.1, + "grad_norm": 3.227123737024226, + "learning_rate": 9.988832927060038e-06, + "loss": 0.741, + "step": 795 + }, + { + "epoch": 0.1, + "grad_norm": 0.8854218475472951, + "learning_rate": 9.988763914002806e-06, + "loss": 0.6091, + "step": 796 + }, + { + "epoch": 0.1, + "grad_norm": 1.5903794308945736, + "learning_rate": 9.988694688589836e-06, + "loss": 0.6588, + "step": 797 + }, + { + "epoch": 0.1, + "grad_norm": 0.7046035355578506, + "learning_rate": 9.988625250824074e-06, + "loss": 0.5571, + "step": 798 + }, + { + "epoch": 0.1, + "grad_norm": 0.639894445052264, + "learning_rate": 9.988555600708476e-06, + "loss": 0.5296, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 0.6306065126784005, + "learning_rate": 9.98848573824601e-06, + "loss": 0.5398, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 0.8263720474269912, + "learning_rate": 9.988415663439645e-06, + "loss": 0.6316, + "step": 801 + }, + { + "epoch": 0.1, + "grad_norm": 0.6595772593698904, + "learning_rate": 9.988345376292366e-06, + "loss": 0.4882, + "step": 802 + }, + { + "epoch": 0.1, + "grad_norm": 0.9817781299679428, + "learning_rate": 9.988274876807164e-06, + "loss": 0.6646, + "step": 803 + }, + { + "epoch": 0.1, + "grad_norm": 0.8816016131007491, + "learning_rate": 9.988204164987042e-06, + "loss": 0.6662, + "step": 804 + }, + { + "epoch": 0.1, + "grad_norm": 0.6877624983632619, + "learning_rate": 9.988133240835008e-06, + "loss": 0.5817, + "step": 805 + }, + { + "epoch": 0.1, + "grad_norm": 1.1979913318985735, + "learning_rate": 9.988062104354083e-06, + "loss": 0.6077, + "step": 806 + }, + { + "epoch": 0.1, + "grad_norm": 0.818338647993114, + "learning_rate": 9.987990755547293e-06, + "loss": 0.662, + "step": 807 + }, + { + "epoch": 0.1, + "grad_norm": 0.6286575467844161, + "learning_rate": 9.987919194417675e-06, + "loss": 0.5648, + "step": 808 + }, + { + "epoch": 0.1, + "grad_norm": 0.668826912313625, + "learning_rate": 9.987847420968278e-06, + "loss": 0.5823, + "step": 809 + }, + { + "epoch": 0.1, + "grad_norm": 0.8283074214210586, + "learning_rate": 9.987775435202153e-06, + "loss": 0.6711, + "step": 810 + }, + { + "epoch": 0.1, + "grad_norm": 0.5467020703048837, + "learning_rate": 9.987703237122366e-06, + "loss": 0.515, + "step": 811 + }, + { + "epoch": 0.1, + "grad_norm": 0.991593659635225, + "learning_rate": 9.987630826731993e-06, + "loss": 0.6609, + "step": 812 + }, + { + "epoch": 0.1, + "grad_norm": 0.7969645835899805, + "learning_rate": 9.987558204034114e-06, + "loss": 0.6772, + "step": 813 + }, + { + "epoch": 0.1, + "grad_norm": 0.8937788622731796, + "learning_rate": 9.987485369031817e-06, + "loss": 0.7098, + "step": 814 + }, + { + "epoch": 0.1, + "grad_norm": 0.608424689955248, + "learning_rate": 9.987412321728209e-06, + "loss": 0.5166, + "step": 815 + }, + { + "epoch": 0.1, + "grad_norm": 0.8267505483392998, + "learning_rate": 9.987339062126394e-06, + "loss": 0.6073, + "step": 816 + }, + { + "epoch": 0.1, + "grad_norm": 0.6303518988797593, + "learning_rate": 9.987265590229494e-06, + "loss": 0.5204, + "step": 817 + }, + { + "epoch": 0.1, + "grad_norm": 0.6519974039844972, + "learning_rate": 9.987191906040634e-06, + "loss": 0.5612, + "step": 818 + }, + { + "epoch": 0.1, + "grad_norm": 0.6690501483398921, + "learning_rate": 9.98711800956295e-06, + "loss": 0.544, + "step": 819 + }, + { + "epoch": 0.1, + "grad_norm": 0.6613313944575195, + "learning_rate": 9.98704390079959e-06, + "loss": 0.5547, + "step": 820 + }, + { + "epoch": 0.1, + "grad_norm": 1.2262546884070182, + "learning_rate": 9.986969579753706e-06, + "loss": 0.6491, + "step": 821 + }, + { + "epoch": 0.1, + "grad_norm": 0.6410866164962201, + "learning_rate": 9.986895046428467e-06, + "loss": 0.5322, + "step": 822 + }, + { + "epoch": 0.1, + "grad_norm": 0.7479005045701804, + "learning_rate": 9.986820300827038e-06, + "loss": 0.5812, + "step": 823 + }, + { + "epoch": 0.1, + "grad_norm": 0.8574836652317681, + "learning_rate": 9.986745342952605e-06, + "loss": 0.6477, + "step": 824 + }, + { + "epoch": 0.11, + "grad_norm": 0.808972861030263, + "learning_rate": 9.986670172808359e-06, + "loss": 0.6282, + "step": 825 + }, + { + "epoch": 0.11, + "grad_norm": 0.7115535529703887, + "learning_rate": 9.9865947903975e-06, + "loss": 0.499, + "step": 826 + }, + { + "epoch": 0.11, + "grad_norm": 0.7978214995594192, + "learning_rate": 9.986519195723233e-06, + "loss": 0.649, + "step": 827 + }, + { + "epoch": 0.11, + "grad_norm": 0.6958941154244207, + "learning_rate": 9.986443388788781e-06, + "loss": 0.5564, + "step": 828 + }, + { + "epoch": 0.11, + "grad_norm": 0.8236487676418658, + "learning_rate": 9.986367369597366e-06, + "loss": 0.5772, + "step": 829 + }, + { + "epoch": 0.11, + "grad_norm": 0.7713431504854724, + "learning_rate": 9.986291138152227e-06, + "loss": 0.572, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 0.6744967713967663, + "learning_rate": 9.986214694456609e-06, + "loss": 0.5558, + "step": 831 + }, + { + "epoch": 0.11, + "grad_norm": 0.7719304482937637, + "learning_rate": 9.986138038513765e-06, + "loss": 0.596, + "step": 832 + }, + { + "epoch": 0.11, + "grad_norm": 0.6605250408593111, + "learning_rate": 9.986061170326958e-06, + "loss": 0.6089, + "step": 833 + }, + { + "epoch": 0.11, + "grad_norm": 0.7431533406246252, + "learning_rate": 9.98598408989946e-06, + "loss": 0.544, + "step": 834 + }, + { + "epoch": 0.11, + "grad_norm": 0.6748294392612384, + "learning_rate": 9.98590679723455e-06, + "loss": 0.5397, + "step": 835 + }, + { + "epoch": 0.11, + "grad_norm": 0.7118939115746294, + "learning_rate": 9.985829292335523e-06, + "loss": 0.6153, + "step": 836 + }, + { + "epoch": 0.11, + "grad_norm": 0.6076472785108435, + "learning_rate": 9.985751575205676e-06, + "loss": 0.5244, + "step": 837 + }, + { + "epoch": 0.11, + "grad_norm": 0.8751842414285672, + "learning_rate": 9.985673645848315e-06, + "loss": 0.6215, + "step": 838 + }, + { + "epoch": 0.11, + "grad_norm": 0.8569561121943189, + "learning_rate": 9.985595504266758e-06, + "loss": 0.609, + "step": 839 + }, + { + "epoch": 0.11, + "grad_norm": 0.8646887324930379, + "learning_rate": 9.985517150464335e-06, + "loss": 0.7117, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 0.7605808467254423, + "learning_rate": 9.985438584444375e-06, + "loss": 0.5617, + "step": 841 + }, + { + "epoch": 0.11, + "grad_norm": 0.7730492847886752, + "learning_rate": 9.985359806210229e-06, + "loss": 0.6153, + "step": 842 + }, + { + "epoch": 0.11, + "grad_norm": 0.6678351603223434, + "learning_rate": 9.985280815765244e-06, + "loss": 0.5506, + "step": 843 + }, + { + "epoch": 0.11, + "grad_norm": 0.6927775493943853, + "learning_rate": 9.985201613112788e-06, + "loss": 0.5791, + "step": 844 + }, + { + "epoch": 0.11, + "grad_norm": 0.8481325219175609, + "learning_rate": 9.985122198256227e-06, + "loss": 0.5568, + "step": 845 + }, + { + "epoch": 0.11, + "grad_norm": 0.5960932300090971, + "learning_rate": 9.985042571198947e-06, + "loss": 0.5035, + "step": 846 + }, + { + "epoch": 0.11, + "grad_norm": 0.875815844031176, + "learning_rate": 9.984962731944332e-06, + "loss": 0.659, + "step": 847 + }, + { + "epoch": 0.11, + "grad_norm": 0.7295163582351202, + "learning_rate": 9.984882680495784e-06, + "loss": 0.6118, + "step": 848 + }, + { + "epoch": 0.11, + "grad_norm": 0.7520869448489738, + "learning_rate": 9.984802416856711e-06, + "loss": 0.5664, + "step": 849 + }, + { + "epoch": 0.11, + "grad_norm": 0.8502409940694009, + "learning_rate": 9.984721941030528e-06, + "loss": 0.6992, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 0.7819413543841294, + "learning_rate": 9.984641253020659e-06, + "loss": 0.5361, + "step": 851 + }, + { + "epoch": 0.11, + "grad_norm": 0.871924910992041, + "learning_rate": 9.984560352830542e-06, + "loss": 0.6563, + "step": 852 + }, + { + "epoch": 0.11, + "grad_norm": 0.7098584975906701, + "learning_rate": 9.98447924046362e-06, + "loss": 0.5937, + "step": 853 + }, + { + "epoch": 0.11, + "grad_norm": 0.6019504772808552, + "learning_rate": 9.984397915923344e-06, + "loss": 0.5419, + "step": 854 + }, + { + "epoch": 0.11, + "grad_norm": 0.8800667340065077, + "learning_rate": 9.984316379213175e-06, + "loss": 0.6453, + "step": 855 + }, + { + "epoch": 0.11, + "grad_norm": 0.9498703164426573, + "learning_rate": 9.984234630336586e-06, + "loss": 0.6814, + "step": 856 + }, + { + "epoch": 0.11, + "grad_norm": 0.6556240441790118, + "learning_rate": 9.984152669297058e-06, + "loss": 0.5306, + "step": 857 + }, + { + "epoch": 0.11, + "grad_norm": 0.9087981299754682, + "learning_rate": 9.984070496098076e-06, + "loss": 0.6071, + "step": 858 + }, + { + "epoch": 0.11, + "grad_norm": 0.5963703935302598, + "learning_rate": 9.983988110743141e-06, + "loss": 0.5068, + "step": 859 + }, + { + "epoch": 0.11, + "grad_norm": 0.7657075869969863, + "learning_rate": 9.983905513235758e-06, + "loss": 0.5896, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 0.7436250195216065, + "learning_rate": 9.983822703579445e-06, + "loss": 0.5575, + "step": 861 + }, + { + "epoch": 0.11, + "grad_norm": 0.6023354584527244, + "learning_rate": 9.983739681777723e-06, + "loss": 0.554, + "step": 862 + }, + { + "epoch": 0.11, + "grad_norm": 1.1627523945571974, + "learning_rate": 9.983656447834129e-06, + "loss": 0.6125, + "step": 863 + }, + { + "epoch": 0.11, + "grad_norm": 0.8267175821570756, + "learning_rate": 9.983573001752208e-06, + "loss": 0.6356, + "step": 864 + }, + { + "epoch": 0.11, + "grad_norm": 0.6092360522288158, + "learning_rate": 9.983489343535506e-06, + "loss": 0.5403, + "step": 865 + }, + { + "epoch": 0.11, + "grad_norm": 0.908618839583729, + "learning_rate": 9.98340547318759e-06, + "loss": 0.6543, + "step": 866 + }, + { + "epoch": 0.11, + "grad_norm": 1.125831052765039, + "learning_rate": 9.983321390712028e-06, + "loss": 0.6917, + "step": 867 + }, + { + "epoch": 0.11, + "grad_norm": 0.7682568756384482, + "learning_rate": 9.983237096112397e-06, + "loss": 0.6885, + "step": 868 + }, + { + "epoch": 0.11, + "grad_norm": 0.6470432155642117, + "learning_rate": 9.98315258939229e-06, + "loss": 0.5309, + "step": 869 + }, + { + "epoch": 0.11, + "grad_norm": 1.084456252229607, + "learning_rate": 9.983067870555297e-06, + "loss": 0.6374, + "step": 870 + }, + { + "epoch": 0.11, + "grad_norm": 0.8018245820192976, + "learning_rate": 9.98298293960503e-06, + "loss": 0.5757, + "step": 871 + }, + { + "epoch": 0.11, + "grad_norm": 0.9643134256057387, + "learning_rate": 9.982897796545104e-06, + "loss": 0.6722, + "step": 872 + }, + { + "epoch": 0.11, + "grad_norm": 0.8580838189426214, + "learning_rate": 9.982812441379141e-06, + "loss": 0.6549, + "step": 873 + }, + { + "epoch": 0.11, + "grad_norm": 0.7800072711229105, + "learning_rate": 9.982726874110776e-06, + "loss": 0.687, + "step": 874 + }, + { + "epoch": 0.11, + "grad_norm": 1.0363934860983977, + "learning_rate": 9.982641094743648e-06, + "loss": 0.6925, + "step": 875 + }, + { + "epoch": 0.11, + "grad_norm": 0.7987480249623445, + "learning_rate": 9.982555103281413e-06, + "loss": 0.6675, + "step": 876 + }, + { + "epoch": 0.11, + "grad_norm": 0.6689629806115877, + "learning_rate": 9.982468899727728e-06, + "loss": 0.5386, + "step": 877 + }, + { + "epoch": 0.11, + "grad_norm": 0.9533975614522542, + "learning_rate": 9.982382484086263e-06, + "loss": 0.6647, + "step": 878 + }, + { + "epoch": 0.11, + "grad_norm": 0.7022065148065463, + "learning_rate": 9.9822958563607e-06, + "loss": 0.5639, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 0.8822886302244307, + "learning_rate": 9.98220901655472e-06, + "loss": 0.6937, + "step": 880 + }, + { + "epoch": 0.11, + "grad_norm": 0.9593099526128606, + "learning_rate": 9.982121964672027e-06, + "loss": 0.6368, + "step": 881 + }, + { + "epoch": 0.11, + "grad_norm": 0.8376183826694517, + "learning_rate": 9.98203470071632e-06, + "loss": 0.5754, + "step": 882 + }, + { + "epoch": 0.11, + "grad_norm": 0.8580188257672448, + "learning_rate": 9.981947224691316e-06, + "loss": 0.6551, + "step": 883 + }, + { + "epoch": 0.11, + "grad_norm": 0.6865148549538115, + "learning_rate": 9.98185953660074e-06, + "loss": 0.6089, + "step": 884 + }, + { + "epoch": 0.11, + "grad_norm": 0.749497520981886, + "learning_rate": 9.981771636448323e-06, + "loss": 0.5604, + "step": 885 + }, + { + "epoch": 0.11, + "grad_norm": 1.0383945444894498, + "learning_rate": 9.981683524237805e-06, + "loss": 0.7119, + "step": 886 + }, + { + "epoch": 0.11, + "grad_norm": 0.736463033196381, + "learning_rate": 9.98159519997294e-06, + "loss": 0.5953, + "step": 887 + }, + { + "epoch": 0.11, + "grad_norm": 0.618337131288357, + "learning_rate": 9.981506663657486e-06, + "loss": 0.5805, + "step": 888 + }, + { + "epoch": 0.11, + "grad_norm": 0.6831445190302305, + "learning_rate": 9.981417915295213e-06, + "loss": 0.5492, + "step": 889 + }, + { + "epoch": 0.11, + "grad_norm": 0.8456877619742489, + "learning_rate": 9.981328954889896e-06, + "loss": 0.6944, + "step": 890 + }, + { + "epoch": 0.11, + "grad_norm": 0.8339034668691195, + "learning_rate": 9.981239782445325e-06, + "loss": 0.6603, + "step": 891 + }, + { + "epoch": 0.11, + "grad_norm": 0.8107358052589377, + "learning_rate": 9.981150397965293e-06, + "loss": 0.6619, + "step": 892 + }, + { + "epoch": 0.11, + "grad_norm": 0.5933002724612979, + "learning_rate": 9.981060801453605e-06, + "loss": 0.5694, + "step": 893 + }, + { + "epoch": 0.11, + "grad_norm": 0.703868613588881, + "learning_rate": 9.980970992914079e-06, + "loss": 0.5536, + "step": 894 + }, + { + "epoch": 0.11, + "grad_norm": 1.0813069844773486, + "learning_rate": 9.980880972350533e-06, + "loss": 0.6424, + "step": 895 + }, + { + "epoch": 0.11, + "grad_norm": 0.8435970727418484, + "learning_rate": 9.980790739766801e-06, + "loss": 0.6975, + "step": 896 + }, + { + "epoch": 0.11, + "grad_norm": 0.6400389588171608, + "learning_rate": 9.980700295166724e-06, + "loss": 0.5398, + "step": 897 + }, + { + "epoch": 0.11, + "grad_norm": 1.0833546007836785, + "learning_rate": 9.98060963855415e-06, + "loss": 0.7218, + "step": 898 + }, + { + "epoch": 0.11, + "grad_norm": 0.6181242714505477, + "learning_rate": 9.980518769932938e-06, + "loss": 0.5475, + "step": 899 + }, + { + "epoch": 0.11, + "grad_norm": 0.8244564941672826, + "learning_rate": 9.980427689306962e-06, + "loss": 0.6312, + "step": 900 + }, + { + "epoch": 0.11, + "grad_norm": 0.9318019554131991, + "learning_rate": 9.98033639668009e-06, + "loss": 0.6611, + "step": 901 + }, + { + "epoch": 0.11, + "grad_norm": 1.4396113769073684, + "learning_rate": 9.980244892056216e-06, + "loss": 0.711, + "step": 902 + }, + { + "epoch": 0.12, + "grad_norm": 0.9624421894783178, + "learning_rate": 9.980153175439229e-06, + "loss": 0.6278, + "step": 903 + }, + { + "epoch": 0.12, + "grad_norm": 0.6600662675780681, + "learning_rate": 9.980061246833037e-06, + "loss": 0.5735, + "step": 904 + }, + { + "epoch": 0.12, + "grad_norm": 0.8117856760783336, + "learning_rate": 9.979969106241551e-06, + "loss": 0.6374, + "step": 905 + }, + { + "epoch": 0.12, + "grad_norm": 0.6889655407137212, + "learning_rate": 9.979876753668695e-06, + "loss": 0.6147, + "step": 906 + }, + { + "epoch": 0.12, + "grad_norm": 0.7440401528217445, + "learning_rate": 9.979784189118398e-06, + "loss": 0.5489, + "step": 907 + }, + { + "epoch": 0.12, + "grad_norm": 0.7987482256491769, + "learning_rate": 9.979691412594601e-06, + "loss": 0.6265, + "step": 908 + }, + { + "epoch": 0.12, + "grad_norm": 0.8769697592520569, + "learning_rate": 9.979598424101253e-06, + "loss": 0.6605, + "step": 909 + }, + { + "epoch": 0.12, + "grad_norm": 0.5906352109625501, + "learning_rate": 9.979505223642314e-06, + "loss": 0.5676, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 0.811207726950151, + "learning_rate": 9.97941181122175e-06, + "loss": 0.6785, + "step": 911 + }, + { + "epoch": 0.12, + "grad_norm": 0.8088554766434953, + "learning_rate": 9.979318186843536e-06, + "loss": 0.6877, + "step": 912 + }, + { + "epoch": 0.12, + "grad_norm": 0.8193905623642039, + "learning_rate": 9.979224350511658e-06, + "loss": 0.6505, + "step": 913 + }, + { + "epoch": 0.12, + "grad_norm": 0.7438195458920674, + "learning_rate": 9.979130302230112e-06, + "loss": 0.5947, + "step": 914 + }, + { + "epoch": 0.12, + "grad_norm": 0.8952939406382615, + "learning_rate": 9.979036042002899e-06, + "loss": 0.7302, + "step": 915 + }, + { + "epoch": 0.12, + "grad_norm": 0.6503815007052984, + "learning_rate": 9.978941569834033e-06, + "loss": 0.574, + "step": 916 + }, + { + "epoch": 0.12, + "grad_norm": 0.6709939905326485, + "learning_rate": 9.978846885727536e-06, + "loss": 0.5437, + "step": 917 + }, + { + "epoch": 0.12, + "grad_norm": 0.8905563115820866, + "learning_rate": 9.978751989687437e-06, + "loss": 0.6812, + "step": 918 + }, + { + "epoch": 0.12, + "grad_norm": 0.844480162747943, + "learning_rate": 9.978656881717774e-06, + "loss": 0.6143, + "step": 919 + }, + { + "epoch": 0.12, + "grad_norm": 0.6506977904792782, + "learning_rate": 9.978561561822598e-06, + "loss": 0.5209, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 0.7591810938046205, + "learning_rate": 9.978466030005964e-06, + "loss": 0.5882, + "step": 921 + }, + { + "epoch": 0.12, + "grad_norm": 0.789580194552697, + "learning_rate": 9.97837028627194e-06, + "loss": 0.6269, + "step": 922 + }, + { + "epoch": 0.12, + "grad_norm": 0.61162725932201, + "learning_rate": 9.978274330624604e-06, + "loss": 0.5384, + "step": 923 + }, + { + "epoch": 0.12, + "grad_norm": 0.7958299395610733, + "learning_rate": 9.978178163068035e-06, + "loss": 0.6472, + "step": 924 + }, + { + "epoch": 0.12, + "grad_norm": 0.718883200546543, + "learning_rate": 9.978081783606332e-06, + "loss": 0.5932, + "step": 925 + }, + { + "epoch": 0.12, + "grad_norm": 1.0171229312281451, + "learning_rate": 9.977985192243596e-06, + "loss": 0.6725, + "step": 926 + }, + { + "epoch": 0.12, + "grad_norm": 0.9366746409763742, + "learning_rate": 9.977888388983935e-06, + "loss": 0.7197, + "step": 927 + }, + { + "epoch": 0.12, + "grad_norm": 0.6951243322287972, + "learning_rate": 9.977791373831474e-06, + "loss": 0.5666, + "step": 928 + }, + { + "epoch": 0.12, + "grad_norm": 0.9492044887336627, + "learning_rate": 9.97769414679034e-06, + "loss": 0.6385, + "step": 929 + }, + { + "epoch": 0.12, + "grad_norm": 0.6958637272327954, + "learning_rate": 9.977596707864673e-06, + "loss": 0.569, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 0.6150529362764223, + "learning_rate": 9.97749905705862e-06, + "loss": 0.5312, + "step": 931 + }, + { + "epoch": 0.12, + "grad_norm": 0.8477634045500256, + "learning_rate": 9.97740119437634e-06, + "loss": 0.6062, + "step": 932 + }, + { + "epoch": 0.12, + "grad_norm": 1.8303457600093067, + "learning_rate": 9.977303119821994e-06, + "loss": 0.6604, + "step": 933 + }, + { + "epoch": 0.12, + "grad_norm": 0.6807067924550453, + "learning_rate": 9.977204833399761e-06, + "loss": 0.5853, + "step": 934 + }, + { + "epoch": 0.12, + "grad_norm": 0.7044752600345968, + "learning_rate": 9.977106335113821e-06, + "loss": 0.5931, + "step": 935 + }, + { + "epoch": 0.12, + "grad_norm": 0.6435129007633353, + "learning_rate": 9.977007624968371e-06, + "loss": 0.5699, + "step": 936 + }, + { + "epoch": 0.12, + "grad_norm": 0.6334797029424789, + "learning_rate": 9.97690870296761e-06, + "loss": 0.5531, + "step": 937 + }, + { + "epoch": 0.12, + "grad_norm": 0.6622043096372564, + "learning_rate": 9.976809569115749e-06, + "loss": 0.6299, + "step": 938 + }, + { + "epoch": 0.12, + "grad_norm": 0.6378979654875966, + "learning_rate": 9.97671022341701e-06, + "loss": 0.5171, + "step": 939 + }, + { + "epoch": 0.12, + "grad_norm": 0.5915013537029304, + "learning_rate": 9.976610665875616e-06, + "loss": 0.5631, + "step": 940 + }, + { + "epoch": 0.12, + "grad_norm": 0.6343711762016183, + "learning_rate": 9.976510896495813e-06, + "loss": 0.5886, + "step": 941 + }, + { + "epoch": 0.12, + "grad_norm": 0.7789674759810145, + "learning_rate": 9.976410915281842e-06, + "loss": 0.648, + "step": 942 + }, + { + "epoch": 0.12, + "grad_norm": 0.6231699029833191, + "learning_rate": 9.97631072223796e-06, + "loss": 0.5565, + "step": 943 + }, + { + "epoch": 0.12, + "grad_norm": 0.8113065507507176, + "learning_rate": 9.976210317368436e-06, + "loss": 0.6295, + "step": 944 + }, + { + "epoch": 0.12, + "grad_norm": 0.6421119369093602, + "learning_rate": 9.976109700677537e-06, + "loss": 0.5357, + "step": 945 + }, + { + "epoch": 0.12, + "grad_norm": 0.6972200604773494, + "learning_rate": 9.976008872169552e-06, + "loss": 0.6487, + "step": 946 + }, + { + "epoch": 0.12, + "grad_norm": 0.6898354852896688, + "learning_rate": 9.975907831848768e-06, + "loss": 0.5815, + "step": 947 + }, + { + "epoch": 0.12, + "grad_norm": 1.375720273664557, + "learning_rate": 9.97580657971949e-06, + "loss": 0.6092, + "step": 948 + }, + { + "epoch": 0.12, + "grad_norm": 0.8500198409185694, + "learning_rate": 9.975705115786025e-06, + "loss": 0.6139, + "step": 949 + }, + { + "epoch": 0.12, + "grad_norm": 0.8683120723328192, + "learning_rate": 9.975603440052694e-06, + "loss": 0.6425, + "step": 950 + }, + { + "epoch": 0.12, + "grad_norm": 0.593346455614026, + "learning_rate": 9.975501552523822e-06, + "loss": 0.5493, + "step": 951 + }, + { + "epoch": 0.12, + "grad_norm": 0.8936501859503566, + "learning_rate": 9.975399453203752e-06, + "loss": 0.6537, + "step": 952 + }, + { + "epoch": 0.12, + "grad_norm": 0.9860019505445827, + "learning_rate": 9.975297142096825e-06, + "loss": 0.7264, + "step": 953 + }, + { + "epoch": 0.12, + "grad_norm": 0.8669851463587487, + "learning_rate": 9.975194619207398e-06, + "loss": 0.5254, + "step": 954 + }, + { + "epoch": 0.12, + "grad_norm": 0.6487483507891224, + "learning_rate": 9.975091884539833e-06, + "loss": 0.5469, + "step": 955 + }, + { + "epoch": 0.12, + "grad_norm": 0.7343751264180686, + "learning_rate": 9.974988938098505e-06, + "loss": 0.6526, + "step": 956 + }, + { + "epoch": 0.12, + "grad_norm": 0.798451745274211, + "learning_rate": 9.974885779887796e-06, + "loss": 0.6238, + "step": 957 + }, + { + "epoch": 0.12, + "grad_norm": 0.7944966330683679, + "learning_rate": 9.974782409912098e-06, + "loss": 0.6414, + "step": 958 + }, + { + "epoch": 0.12, + "grad_norm": 0.7326192905022801, + "learning_rate": 9.974678828175808e-06, + "loss": 0.5782, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 0.851409031464957, + "learning_rate": 9.974575034683339e-06, + "loss": 0.6633, + "step": 960 + }, + { + "epoch": 0.12, + "grad_norm": 0.8329275260160559, + "learning_rate": 9.974471029439106e-06, + "loss": 0.6178, + "step": 961 + }, + { + "epoch": 0.12, + "grad_norm": 0.8480645643861054, + "learning_rate": 9.974366812447538e-06, + "loss": 0.6868, + "step": 962 + }, + { + "epoch": 0.12, + "grad_norm": 0.7403624291621149, + "learning_rate": 9.974262383713069e-06, + "loss": 0.585, + "step": 963 + }, + { + "epoch": 0.12, + "grad_norm": 0.6057429652873727, + "learning_rate": 9.974157743240146e-06, + "loss": 0.5241, + "step": 964 + }, + { + "epoch": 0.12, + "grad_norm": 0.723803797580167, + "learning_rate": 9.974052891033226e-06, + "loss": 0.5509, + "step": 965 + }, + { + "epoch": 0.12, + "grad_norm": 0.7605102234841643, + "learning_rate": 9.973947827096766e-06, + "loss": 0.6096, + "step": 966 + }, + { + "epoch": 0.12, + "grad_norm": 0.9964860694641393, + "learning_rate": 9.973842551435243e-06, + "loss": 0.6447, + "step": 967 + }, + { + "epoch": 0.12, + "grad_norm": 0.7125726674264059, + "learning_rate": 9.973737064053137e-06, + "loss": 0.5714, + "step": 968 + }, + { + "epoch": 0.12, + "grad_norm": 0.6447963992240072, + "learning_rate": 9.973631364954937e-06, + "loss": 0.5731, + "step": 969 + }, + { + "epoch": 0.12, + "grad_norm": 0.8861228431851509, + "learning_rate": 9.973525454145143e-06, + "loss": 0.6725, + "step": 970 + }, + { + "epoch": 0.12, + "grad_norm": 0.8599942129085489, + "learning_rate": 9.973419331628265e-06, + "loss": 0.619, + "step": 971 + }, + { + "epoch": 0.12, + "grad_norm": 0.7870859596350437, + "learning_rate": 9.973312997408817e-06, + "loss": 0.6642, + "step": 972 + }, + { + "epoch": 0.12, + "grad_norm": 0.8758457278522148, + "learning_rate": 9.973206451491329e-06, + "loss": 0.6678, + "step": 973 + }, + { + "epoch": 0.12, + "grad_norm": 0.6407520295940252, + "learning_rate": 9.973099693880332e-06, + "loss": 0.5044, + "step": 974 + }, + { + "epoch": 0.12, + "grad_norm": 0.6075628641057531, + "learning_rate": 9.972992724580375e-06, + "loss": 0.5654, + "step": 975 + }, + { + "epoch": 0.12, + "grad_norm": 0.843137238932385, + "learning_rate": 9.97288554359601e-06, + "loss": 0.692, + "step": 976 + }, + { + "epoch": 0.12, + "grad_norm": 0.8644489293636047, + "learning_rate": 9.972778150931797e-06, + "loss": 0.6415, + "step": 977 + }, + { + "epoch": 0.12, + "grad_norm": 0.9426617416035463, + "learning_rate": 9.972670546592307e-06, + "loss": 0.6469, + "step": 978 + }, + { + "epoch": 0.12, + "grad_norm": 0.64565198646652, + "learning_rate": 9.972562730582125e-06, + "loss": 0.5428, + "step": 979 + }, + { + "epoch": 0.12, + "grad_norm": 0.6857282054523964, + "learning_rate": 9.972454702905837e-06, + "loss": 0.5881, + "step": 980 + }, + { + "epoch": 0.12, + "grad_norm": 0.8631417246527289, + "learning_rate": 9.97234646356804e-06, + "loss": 0.6209, + "step": 981 + }, + { + "epoch": 0.13, + "grad_norm": 0.8870029643336272, + "learning_rate": 9.972238012573345e-06, + "loss": 0.6494, + "step": 982 + }, + { + "epoch": 0.13, + "grad_norm": 0.6632097527897141, + "learning_rate": 9.972129349926368e-06, + "loss": 0.5686, + "step": 983 + }, + { + "epoch": 0.13, + "grad_norm": 0.6194036534760328, + "learning_rate": 9.972020475631731e-06, + "loss": 0.5366, + "step": 984 + }, + { + "epoch": 0.13, + "grad_norm": 0.6992208227502703, + "learning_rate": 9.971911389694072e-06, + "loss": 0.5421, + "step": 985 + }, + { + "epoch": 0.13, + "grad_norm": 0.820385052058671, + "learning_rate": 9.971802092118032e-06, + "loss": 0.6599, + "step": 986 + }, + { + "epoch": 0.13, + "grad_norm": 0.6824016838728466, + "learning_rate": 9.971692582908267e-06, + "loss": 0.5739, + "step": 987 + }, + { + "epoch": 0.13, + "grad_norm": 0.898698060195507, + "learning_rate": 9.971582862069434e-06, + "loss": 0.6914, + "step": 988 + }, + { + "epoch": 0.13, + "grad_norm": 0.7831417966919729, + "learning_rate": 9.971472929606206e-06, + "loss": 0.637, + "step": 989 + }, + { + "epoch": 0.13, + "grad_norm": 0.8710437767208958, + "learning_rate": 9.971362785523261e-06, + "loss": 0.6501, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 0.9016651239427222, + "learning_rate": 9.97125242982529e-06, + "loss": 0.6262, + "step": 991 + }, + { + "epoch": 0.13, + "grad_norm": 0.7706646260634279, + "learning_rate": 9.971141862516988e-06, + "loss": 0.641, + "step": 992 + }, + { + "epoch": 0.13, + "grad_norm": 0.9426023139160854, + "learning_rate": 9.971031083603061e-06, + "loss": 0.6943, + "step": 993 + }, + { + "epoch": 0.13, + "grad_norm": 0.6896168476557996, + "learning_rate": 9.970920093088227e-06, + "loss": 0.5623, + "step": 994 + }, + { + "epoch": 0.13, + "grad_norm": 0.9728087762100552, + "learning_rate": 9.970808890977211e-06, + "loss": 0.7213, + "step": 995 + }, + { + "epoch": 0.13, + "grad_norm": 0.9548926293067601, + "learning_rate": 9.970697477274744e-06, + "loss": 0.6837, + "step": 996 + }, + { + "epoch": 0.13, + "grad_norm": 0.8069373684565228, + "learning_rate": 9.970585851985569e-06, + "loss": 0.616, + "step": 997 + }, + { + "epoch": 0.13, + "grad_norm": 0.7443443251819689, + "learning_rate": 9.970474015114437e-06, + "loss": 0.5486, + "step": 998 + }, + { + "epoch": 0.13, + "grad_norm": 0.8556421758965715, + "learning_rate": 9.970361966666112e-06, + "loss": 0.6649, + "step": 999 + }, + { + "epoch": 0.13, + "grad_norm": 0.6362859396689061, + "learning_rate": 9.97024970664536e-06, + "loss": 0.5386, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 0.698702022519884, + "learning_rate": 9.97013723505696e-06, + "loss": 0.5146, + "step": 1001 + }, + { + "epoch": 0.13, + "grad_norm": 0.646339997881583, + "learning_rate": 9.970024551905701e-06, + "loss": 0.5592, + "step": 1002 + }, + { + "epoch": 0.13, + "grad_norm": 0.8423911491600581, + "learning_rate": 9.969911657196378e-06, + "loss": 0.6253, + "step": 1003 + }, + { + "epoch": 0.13, + "grad_norm": 1.0160592301030402, + "learning_rate": 9.969798550933799e-06, + "loss": 0.6283, + "step": 1004 + }, + { + "epoch": 0.13, + "grad_norm": 0.690352142959983, + "learning_rate": 9.969685233122774e-06, + "loss": 0.5628, + "step": 1005 + }, + { + "epoch": 0.13, + "grad_norm": 1.0311787740806562, + "learning_rate": 9.969571703768132e-06, + "loss": 0.6159, + "step": 1006 + }, + { + "epoch": 0.13, + "grad_norm": 0.7557814251185381, + "learning_rate": 9.969457962874702e-06, + "loss": 0.5594, + "step": 1007 + }, + { + "epoch": 0.13, + "grad_norm": 0.785242310763266, + "learning_rate": 9.969344010447326e-06, + "loss": 0.6654, + "step": 1008 + }, + { + "epoch": 0.13, + "grad_norm": 0.9067553933913962, + "learning_rate": 9.969229846490857e-06, + "loss": 0.6485, + "step": 1009 + }, + { + "epoch": 0.13, + "grad_norm": 0.6216889007415037, + "learning_rate": 9.969115471010152e-06, + "loss": 0.5283, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 0.691634208557814, + "learning_rate": 9.96900088401008e-06, + "loss": 0.5518, + "step": 1011 + }, + { + "epoch": 0.13, + "grad_norm": 0.7841277537304792, + "learning_rate": 9.96888608549552e-06, + "loss": 0.6263, + "step": 1012 + }, + { + "epoch": 0.13, + "grad_norm": 0.8179078057475995, + "learning_rate": 9.968771075471356e-06, + "loss": 0.6032, + "step": 1013 + }, + { + "epoch": 0.13, + "grad_norm": 0.9425520151640354, + "learning_rate": 9.968655853942487e-06, + "loss": 0.6576, + "step": 1014 + }, + { + "epoch": 0.13, + "grad_norm": 0.8844954600716567, + "learning_rate": 9.968540420913815e-06, + "loss": 0.628, + "step": 1015 + }, + { + "epoch": 0.13, + "grad_norm": 0.8832647227415215, + "learning_rate": 9.968424776390254e-06, + "loss": 0.6427, + "step": 1016 + }, + { + "epoch": 0.13, + "grad_norm": 0.7438323830426561, + "learning_rate": 9.968308920376726e-06, + "loss": 0.6119, + "step": 1017 + }, + { + "epoch": 0.13, + "grad_norm": 0.5971458444281293, + "learning_rate": 9.968192852878166e-06, + "loss": 0.5301, + "step": 1018 + }, + { + "epoch": 0.13, + "grad_norm": 0.8894538345661622, + "learning_rate": 9.968076573899513e-06, + "loss": 0.6625, + "step": 1019 + }, + { + "epoch": 0.13, + "grad_norm": 1.0120386073613519, + "learning_rate": 9.967960083445714e-06, + "loss": 0.6678, + "step": 1020 + }, + { + "epoch": 0.13, + "grad_norm": 0.9919547610476247, + "learning_rate": 9.96784338152173e-06, + "loss": 0.7238, + "step": 1021 + }, + { + "epoch": 0.13, + "grad_norm": 0.5978918140517522, + "learning_rate": 9.967726468132529e-06, + "loss": 0.5507, + "step": 1022 + }, + { + "epoch": 0.13, + "grad_norm": 0.8751885204164667, + "learning_rate": 9.967609343283084e-06, + "loss": 0.6528, + "step": 1023 + }, + { + "epoch": 0.13, + "grad_norm": 0.6618436429645297, + "learning_rate": 9.967492006978386e-06, + "loss": 0.5955, + "step": 1024 + }, + { + "epoch": 0.13, + "grad_norm": 0.6316969474791747, + "learning_rate": 9.967374459223426e-06, + "loss": 0.5325, + "step": 1025 + }, + { + "epoch": 0.13, + "grad_norm": 0.79104689602268, + "learning_rate": 9.967256700023212e-06, + "loss": 0.5868, + "step": 1026 + }, + { + "epoch": 0.13, + "grad_norm": 1.011617657288393, + "learning_rate": 9.96713872938275e-06, + "loss": 0.6631, + "step": 1027 + }, + { + "epoch": 0.13, + "grad_norm": 0.8170905485014265, + "learning_rate": 9.967020547307065e-06, + "loss": 0.6326, + "step": 1028 + }, + { + "epoch": 0.13, + "grad_norm": 0.76207282808973, + "learning_rate": 9.96690215380119e-06, + "loss": 0.6174, + "step": 1029 + }, + { + "epoch": 0.13, + "grad_norm": 0.72412968908377, + "learning_rate": 9.96678354887016e-06, + "loss": 0.5365, + "step": 1030 + }, + { + "epoch": 0.13, + "grad_norm": 0.7450700003219325, + "learning_rate": 9.966664732519026e-06, + "loss": 0.6086, + "step": 1031 + }, + { + "epoch": 0.13, + "grad_norm": 0.9396237903527469, + "learning_rate": 9.966545704752845e-06, + "loss": 0.6627, + "step": 1032 + }, + { + "epoch": 0.13, + "grad_norm": 0.6340270017350794, + "learning_rate": 9.966426465576687e-06, + "loss": 0.5418, + "step": 1033 + }, + { + "epoch": 0.13, + "grad_norm": 0.7629589665546378, + "learning_rate": 9.966307014995623e-06, + "loss": 0.6605, + "step": 1034 + }, + { + "epoch": 0.13, + "grad_norm": 0.6688292179498095, + "learning_rate": 9.966187353014739e-06, + "loss": 0.5774, + "step": 1035 + }, + { + "epoch": 0.13, + "grad_norm": 0.790214161776146, + "learning_rate": 9.96606747963913e-06, + "loss": 0.619, + "step": 1036 + }, + { + "epoch": 0.13, + "grad_norm": 0.6345663154255697, + "learning_rate": 9.965947394873896e-06, + "loss": 0.596, + "step": 1037 + }, + { + "epoch": 0.13, + "grad_norm": 0.7544768452896589, + "learning_rate": 9.965827098724152e-06, + "loss": 0.6468, + "step": 1038 + }, + { + "epoch": 0.13, + "grad_norm": 0.6907460514008421, + "learning_rate": 9.965706591195017e-06, + "loss": 0.5965, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 0.6590134071145285, + "learning_rate": 9.96558587229162e-06, + "loss": 0.5376, + "step": 1040 + }, + { + "epoch": 0.13, + "grad_norm": 0.6689845023180553, + "learning_rate": 9.9654649420191e-06, + "loss": 0.5715, + "step": 1041 + }, + { + "epoch": 0.13, + "grad_norm": 0.7262232483291121, + "learning_rate": 9.965343800382605e-06, + "loss": 0.6246, + "step": 1042 + }, + { + "epoch": 0.13, + "grad_norm": 0.772432713462555, + "learning_rate": 9.965222447387291e-06, + "loss": 0.5931, + "step": 1043 + }, + { + "epoch": 0.13, + "grad_norm": 0.9080945357905905, + "learning_rate": 9.965100883038323e-06, + "loss": 0.6755, + "step": 1044 + }, + { + "epoch": 0.13, + "grad_norm": 0.6537616023445731, + "learning_rate": 9.964979107340878e-06, + "loss": 0.5833, + "step": 1045 + }, + { + "epoch": 0.13, + "grad_norm": 0.8274528250485428, + "learning_rate": 9.964857120300138e-06, + "loss": 0.6862, + "step": 1046 + }, + { + "epoch": 0.13, + "grad_norm": 0.9268645173768608, + "learning_rate": 9.964734921921296e-06, + "loss": 0.6117, + "step": 1047 + }, + { + "epoch": 0.13, + "grad_norm": 0.637947616361313, + "learning_rate": 9.964612512209553e-06, + "loss": 0.56, + "step": 1048 + }, + { + "epoch": 0.13, + "grad_norm": 0.8345885037272234, + "learning_rate": 9.964489891170122e-06, + "loss": 0.6915, + "step": 1049 + }, + { + "epoch": 0.13, + "grad_norm": 0.947682923968399, + "learning_rate": 9.964367058808217e-06, + "loss": 0.557, + "step": 1050 + }, + { + "epoch": 0.13, + "grad_norm": 0.6394885257013123, + "learning_rate": 9.964244015129071e-06, + "loss": 0.5454, + "step": 1051 + }, + { + "epoch": 0.13, + "grad_norm": 0.6039722828318739, + "learning_rate": 9.964120760137922e-06, + "loss": 0.5202, + "step": 1052 + }, + { + "epoch": 0.13, + "grad_norm": 0.6552510421484344, + "learning_rate": 9.963997293840017e-06, + "loss": 0.5242, + "step": 1053 + }, + { + "epoch": 0.13, + "grad_norm": 0.9393663003689945, + "learning_rate": 9.963873616240607e-06, + "loss": 0.6933, + "step": 1054 + }, + { + "epoch": 0.13, + "grad_norm": 0.6995096683237148, + "learning_rate": 9.96374972734496e-06, + "loss": 0.5895, + "step": 1055 + }, + { + "epoch": 0.13, + "grad_norm": 0.5909357598843684, + "learning_rate": 9.96362562715835e-06, + "loss": 0.5418, + "step": 1056 + }, + { + "epoch": 0.13, + "grad_norm": 0.8842765522046706, + "learning_rate": 9.963501315686057e-06, + "loss": 0.6927, + "step": 1057 + }, + { + "epoch": 0.13, + "grad_norm": 0.769690140762895, + "learning_rate": 9.963376792933376e-06, + "loss": 0.6504, + "step": 1058 + }, + { + "epoch": 0.13, + "grad_norm": 0.8712214969714248, + "learning_rate": 9.963252058905604e-06, + "loss": 0.6754, + "step": 1059 + }, + { + "epoch": 0.14, + "grad_norm": 0.7626277381140851, + "learning_rate": 9.963127113608054e-06, + "loss": 0.58, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 0.7415564123378365, + "learning_rate": 9.963001957046041e-06, + "loss": 0.5493, + "step": 1061 + }, + { + "epoch": 0.14, + "grad_norm": 0.8252324047814299, + "learning_rate": 9.962876589224894e-06, + "loss": 0.6158, + "step": 1062 + }, + { + "epoch": 0.14, + "grad_norm": 0.7740686906153754, + "learning_rate": 9.96275101014995e-06, + "loss": 0.647, + "step": 1063 + }, + { + "epoch": 0.14, + "grad_norm": 0.6434890865213917, + "learning_rate": 9.962625219826554e-06, + "loss": 0.5146, + "step": 1064 + }, + { + "epoch": 0.14, + "grad_norm": 0.6356266208073319, + "learning_rate": 9.96249921826006e-06, + "loss": 0.5051, + "step": 1065 + }, + { + "epoch": 0.14, + "grad_norm": 0.6917109800463024, + "learning_rate": 9.962373005455835e-06, + "loss": 0.5306, + "step": 1066 + }, + { + "epoch": 0.14, + "grad_norm": 0.6839785692218564, + "learning_rate": 9.962246581419246e-06, + "loss": 0.599, + "step": 1067 + }, + { + "epoch": 0.14, + "grad_norm": 0.8413408456625441, + "learning_rate": 9.962119946155678e-06, + "loss": 0.6054, + "step": 1068 + }, + { + "epoch": 0.14, + "grad_norm": 0.8825072950719561, + "learning_rate": 9.96199309967052e-06, + "loss": 0.6556, + "step": 1069 + }, + { + "epoch": 0.14, + "grad_norm": 0.6568332605989884, + "learning_rate": 9.961866041969172e-06, + "loss": 0.5804, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 0.6701904019859587, + "learning_rate": 9.961738773057044e-06, + "loss": 0.5573, + "step": 1071 + }, + { + "epoch": 0.14, + "grad_norm": 0.8403411171391452, + "learning_rate": 9.96161129293955e-06, + "loss": 0.5507, + "step": 1072 + }, + { + "epoch": 0.14, + "grad_norm": 0.6320571860344364, + "learning_rate": 9.96148360162212e-06, + "loss": 0.4739, + "step": 1073 + }, + { + "epoch": 0.14, + "grad_norm": 0.8241160117697117, + "learning_rate": 9.961355699110188e-06, + "loss": 0.6369, + "step": 1074 + }, + { + "epoch": 0.14, + "grad_norm": 0.6775006836052057, + "learning_rate": 9.961227585409194e-06, + "loss": 0.5706, + "step": 1075 + }, + { + "epoch": 0.14, + "grad_norm": 0.782967568295276, + "learning_rate": 9.961099260524601e-06, + "loss": 0.6293, + "step": 1076 + }, + { + "epoch": 0.14, + "grad_norm": 0.9922788598511965, + "learning_rate": 9.960970724461862e-06, + "loss": 0.6214, + "step": 1077 + }, + { + "epoch": 0.14, + "grad_norm": 0.6835477547128451, + "learning_rate": 9.960841977226455e-06, + "loss": 0.5708, + "step": 1078 + }, + { + "epoch": 0.14, + "grad_norm": 0.7085800777580535, + "learning_rate": 9.960713018823855e-06, + "loss": 0.5775, + "step": 1079 + }, + { + "epoch": 0.14, + "grad_norm": 0.8834974525895641, + "learning_rate": 9.960583849259556e-06, + "loss": 0.6351, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 0.7213832710568305, + "learning_rate": 9.960454468539053e-06, + "loss": 0.5675, + "step": 1081 + }, + { + "epoch": 0.14, + "grad_norm": 0.8879443734615274, + "learning_rate": 9.960324876667854e-06, + "loss": 0.6596, + "step": 1082 + }, + { + "epoch": 0.14, + "grad_norm": 0.7344950907712255, + "learning_rate": 9.960195073651478e-06, + "loss": 0.565, + "step": 1083 + }, + { + "epoch": 0.14, + "grad_norm": 0.7007327711779928, + "learning_rate": 9.960065059495446e-06, + "loss": 0.5203, + "step": 1084 + }, + { + "epoch": 0.14, + "grad_norm": 0.9642709628559569, + "learning_rate": 9.959934834205296e-06, + "loss": 0.5881, + "step": 1085 + }, + { + "epoch": 0.14, + "grad_norm": 0.7909316044844384, + "learning_rate": 9.95980439778657e-06, + "loss": 0.5805, + "step": 1086 + }, + { + "epoch": 0.14, + "grad_norm": 0.6601031474911905, + "learning_rate": 9.95967375024482e-06, + "loss": 0.5908, + "step": 1087 + }, + { + "epoch": 0.14, + "grad_norm": 0.7576039583517902, + "learning_rate": 9.959542891585606e-06, + "loss": 0.5639, + "step": 1088 + }, + { + "epoch": 0.14, + "grad_norm": 0.9212523958223972, + "learning_rate": 9.9594118218145e-06, + "loss": 0.6982, + "step": 1089 + }, + { + "epoch": 0.14, + "grad_norm": 0.5924096392485172, + "learning_rate": 9.959280540937082e-06, + "loss": 0.5012, + "step": 1090 + }, + { + "epoch": 0.14, + "grad_norm": 0.6641663266900463, + "learning_rate": 9.959149048958938e-06, + "loss": 0.5456, + "step": 1091 + }, + { + "epoch": 0.14, + "grad_norm": 0.9091526004034339, + "learning_rate": 9.959017345885666e-06, + "loss": 0.7084, + "step": 1092 + }, + { + "epoch": 0.14, + "grad_norm": 1.1011575016214283, + "learning_rate": 9.958885431722874e-06, + "loss": 0.6814, + "step": 1093 + }, + { + "epoch": 0.14, + "grad_norm": 0.9750371123627987, + "learning_rate": 9.958753306476172e-06, + "loss": 0.6081, + "step": 1094 + }, + { + "epoch": 0.14, + "grad_norm": 0.6333414917853868, + "learning_rate": 9.95862097015119e-06, + "loss": 0.5111, + "step": 1095 + }, + { + "epoch": 0.14, + "grad_norm": 0.6731356071365201, + "learning_rate": 9.95848842275356e-06, + "loss": 0.5338, + "step": 1096 + }, + { + "epoch": 0.14, + "grad_norm": 0.8756458617676428, + "learning_rate": 9.95835566428892e-06, + "loss": 0.5627, + "step": 1097 + }, + { + "epoch": 0.14, + "grad_norm": 0.7144933715655397, + "learning_rate": 9.958222694762926e-06, + "loss": 0.6085, + "step": 1098 + }, + { + "epoch": 0.14, + "grad_norm": 0.8363796515389963, + "learning_rate": 9.958089514181236e-06, + "loss": 0.586, + "step": 1099 + }, + { + "epoch": 0.14, + "grad_norm": 1.027379405617287, + "learning_rate": 9.95795612254952e-06, + "loss": 0.6114, + "step": 1100 + }, + { + "epoch": 0.14, + "grad_norm": 1.0292639883298103, + "learning_rate": 9.957822519873453e-06, + "loss": 0.6284, + "step": 1101 + }, + { + "epoch": 0.14, + "grad_norm": 1.0574842183994442, + "learning_rate": 9.957688706158725e-06, + "loss": 0.5952, + "step": 1102 + }, + { + "epoch": 0.14, + "grad_norm": 0.8483335753387509, + "learning_rate": 9.957554681411032e-06, + "loss": 0.6091, + "step": 1103 + }, + { + "epoch": 0.14, + "grad_norm": 0.7428902607788493, + "learning_rate": 9.957420445636077e-06, + "loss": 0.6634, + "step": 1104 + }, + { + "epoch": 0.14, + "grad_norm": 0.7026104738413125, + "learning_rate": 9.957285998839577e-06, + "loss": 0.5774, + "step": 1105 + }, + { + "epoch": 0.14, + "grad_norm": 0.7245714299947138, + "learning_rate": 9.957151341027251e-06, + "loss": 0.6126, + "step": 1106 + }, + { + "epoch": 0.14, + "grad_norm": 0.5621554383607955, + "learning_rate": 9.957016472204834e-06, + "loss": 0.5149, + "step": 1107 + }, + { + "epoch": 0.14, + "grad_norm": 0.848785091007283, + "learning_rate": 9.956881392378068e-06, + "loss": 0.6135, + "step": 1108 + }, + { + "epoch": 0.14, + "grad_norm": 0.6024732942844954, + "learning_rate": 9.9567461015527e-06, + "loss": 0.5038, + "step": 1109 + }, + { + "epoch": 0.14, + "grad_norm": 0.7577805467470974, + "learning_rate": 9.956610599734488e-06, + "loss": 0.5623, + "step": 1110 + }, + { + "epoch": 0.14, + "grad_norm": 0.6529702214325929, + "learning_rate": 9.956474886929205e-06, + "loss": 0.5759, + "step": 1111 + }, + { + "epoch": 0.14, + "grad_norm": 0.8385138547411434, + "learning_rate": 9.956338963142622e-06, + "loss": 0.6259, + "step": 1112 + }, + { + "epoch": 0.14, + "grad_norm": 0.6348340774981751, + "learning_rate": 9.95620282838053e-06, + "loss": 0.5491, + "step": 1113 + }, + { + "epoch": 0.14, + "grad_norm": 0.8432930975231842, + "learning_rate": 9.95606648264872e-06, + "loss": 0.6528, + "step": 1114 + }, + { + "epoch": 0.14, + "grad_norm": 0.646499926993693, + "learning_rate": 9.955929925952996e-06, + "loss": 0.5908, + "step": 1115 + }, + { + "epoch": 0.14, + "grad_norm": 0.829058950554376, + "learning_rate": 9.955793158299173e-06, + "loss": 0.6599, + "step": 1116 + }, + { + "epoch": 0.14, + "grad_norm": 0.613515823068835, + "learning_rate": 9.955656179693073e-06, + "loss": 0.4962, + "step": 1117 + }, + { + "epoch": 0.14, + "grad_norm": 1.090777028471484, + "learning_rate": 9.955518990140525e-06, + "loss": 0.6454, + "step": 1118 + }, + { + "epoch": 0.14, + "grad_norm": 0.5946182803339479, + "learning_rate": 9.955381589647367e-06, + "loss": 0.5602, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 0.8286584023219346, + "learning_rate": 9.955243978219452e-06, + "loss": 0.6051, + "step": 1120 + }, + { + "epoch": 0.14, + "grad_norm": 0.6467996168545798, + "learning_rate": 9.955106155862635e-06, + "loss": 0.5473, + "step": 1121 + }, + { + "epoch": 0.14, + "grad_norm": 0.5940894978116646, + "learning_rate": 9.954968122582784e-06, + "loss": 0.5338, + "step": 1122 + }, + { + "epoch": 0.14, + "grad_norm": 0.8590824308861795, + "learning_rate": 9.954829878385773e-06, + "loss": 0.6326, + "step": 1123 + }, + { + "epoch": 0.14, + "grad_norm": 0.6244978334185487, + "learning_rate": 9.954691423277487e-06, + "loss": 0.5246, + "step": 1124 + }, + { + "epoch": 0.14, + "grad_norm": 0.5909229083578984, + "learning_rate": 9.95455275726382e-06, + "loss": 0.5258, + "step": 1125 + }, + { + "epoch": 0.14, + "grad_norm": 0.7924079847665105, + "learning_rate": 9.954413880350674e-06, + "loss": 0.6057, + "step": 1126 + }, + { + "epoch": 0.14, + "grad_norm": 1.008750003158618, + "learning_rate": 9.954274792543963e-06, + "loss": 0.6279, + "step": 1127 + }, + { + "epoch": 0.14, + "grad_norm": 0.9275739014823893, + "learning_rate": 9.954135493849605e-06, + "loss": 0.6504, + "step": 1128 + }, + { + "epoch": 0.14, + "grad_norm": 0.6588408665126066, + "learning_rate": 9.95399598427353e-06, + "loss": 0.5735, + "step": 1129 + }, + { + "epoch": 0.14, + "grad_norm": 0.7338741263032064, + "learning_rate": 9.953856263821677e-06, + "loss": 0.536, + "step": 1130 + }, + { + "epoch": 0.14, + "grad_norm": 0.6913121183957724, + "learning_rate": 9.953716332499991e-06, + "loss": 0.6244, + "step": 1131 + }, + { + "epoch": 0.14, + "grad_norm": 0.9019267806465207, + "learning_rate": 9.953576190314434e-06, + "loss": 0.6787, + "step": 1132 + }, + { + "epoch": 0.14, + "grad_norm": 0.6933534279255132, + "learning_rate": 9.953435837270966e-06, + "loss": 0.5362, + "step": 1133 + }, + { + "epoch": 0.14, + "grad_norm": 0.8972807751072782, + "learning_rate": 9.953295273375564e-06, + "loss": 0.6749, + "step": 1134 + }, + { + "epoch": 0.14, + "grad_norm": 0.6176012154418374, + "learning_rate": 9.95315449863421e-06, + "loss": 0.5268, + "step": 1135 + }, + { + "epoch": 0.14, + "grad_norm": 0.9166086004029897, + "learning_rate": 9.953013513052898e-06, + "loss": 0.5797, + "step": 1136 + }, + { + "epoch": 0.14, + "grad_norm": 0.7726218336683778, + "learning_rate": 9.952872316637627e-06, + "loss": 0.593, + "step": 1137 + }, + { + "epoch": 0.14, + "grad_norm": 0.6059201439858781, + "learning_rate": 9.952730909394409e-06, + "loss": 0.5262, + "step": 1138 + }, + { + "epoch": 0.15, + "grad_norm": 0.8709987021880209, + "learning_rate": 9.952589291329262e-06, + "loss": 0.6675, + "step": 1139 + }, + { + "epoch": 0.15, + "grad_norm": 0.9166989103261511, + "learning_rate": 9.952447462448219e-06, + "loss": 0.6499, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 0.7545998650211926, + "learning_rate": 9.952305422757308e-06, + "loss": 0.6138, + "step": 1141 + }, + { + "epoch": 0.15, + "grad_norm": 1.2410058081739495, + "learning_rate": 9.952163172262583e-06, + "loss": 0.6333, + "step": 1142 + }, + { + "epoch": 0.15, + "grad_norm": 0.7992091222627921, + "learning_rate": 9.952020710970098e-06, + "loss": 0.5316, + "step": 1143 + }, + { + "epoch": 0.15, + "grad_norm": 0.7289102533460955, + "learning_rate": 9.951878038885914e-06, + "loss": 0.537, + "step": 1144 + }, + { + "epoch": 0.15, + "grad_norm": 0.6614206102741648, + "learning_rate": 9.951735156016105e-06, + "loss": 0.5623, + "step": 1145 + }, + { + "epoch": 0.15, + "grad_norm": 0.8177142563654926, + "learning_rate": 9.951592062366754e-06, + "loss": 0.6382, + "step": 1146 + }, + { + "epoch": 0.15, + "grad_norm": 0.7097636836053219, + "learning_rate": 9.951448757943954e-06, + "loss": 0.5972, + "step": 1147 + }, + { + "epoch": 0.15, + "grad_norm": 0.9254677182285955, + "learning_rate": 9.951305242753801e-06, + "loss": 0.571, + "step": 1148 + }, + { + "epoch": 0.15, + "grad_norm": 0.6988956392409752, + "learning_rate": 9.951161516802408e-06, + "loss": 0.5549, + "step": 1149 + }, + { + "epoch": 0.15, + "grad_norm": 0.8134103090839735, + "learning_rate": 9.951017580095889e-06, + "loss": 0.5931, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 0.6059200480854988, + "learning_rate": 9.950873432640373e-06, + "loss": 0.5429, + "step": 1151 + }, + { + "epoch": 0.15, + "grad_norm": 0.9848688757575335, + "learning_rate": 9.950729074441998e-06, + "loss": 0.6748, + "step": 1152 + }, + { + "epoch": 0.15, + "grad_norm": 0.5763030200172503, + "learning_rate": 9.950584505506904e-06, + "loss": 0.495, + "step": 1153 + }, + { + "epoch": 0.15, + "grad_norm": 0.7517038459961862, + "learning_rate": 9.950439725841247e-06, + "loss": 0.5802, + "step": 1154 + }, + { + "epoch": 0.15, + "grad_norm": 1.0629583474210524, + "learning_rate": 9.950294735451192e-06, + "loss": 0.6841, + "step": 1155 + }, + { + "epoch": 0.15, + "grad_norm": 0.7663206359698448, + "learning_rate": 9.950149534342907e-06, + "loss": 0.5319, + "step": 1156 + }, + { + "epoch": 0.15, + "grad_norm": 0.7795779642312519, + "learning_rate": 9.950004122522578e-06, + "loss": 0.6035, + "step": 1157 + }, + { + "epoch": 0.15, + "grad_norm": 1.178206918510157, + "learning_rate": 9.949858499996389e-06, + "loss": 0.6628, + "step": 1158 + }, + { + "epoch": 0.15, + "grad_norm": 0.7391505252721015, + "learning_rate": 9.949712666770541e-06, + "loss": 0.5569, + "step": 1159 + }, + { + "epoch": 0.15, + "grad_norm": 0.7583907860065018, + "learning_rate": 9.949566622851243e-06, + "loss": 0.5678, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 0.646869541150209, + "learning_rate": 9.94942036824471e-06, + "loss": 0.5737, + "step": 1161 + }, + { + "epoch": 0.15, + "grad_norm": 0.794371583641433, + "learning_rate": 9.949273902957169e-06, + "loss": 0.6565, + "step": 1162 + }, + { + "epoch": 0.15, + "grad_norm": 0.9050968642999875, + "learning_rate": 9.94912722699485e-06, + "loss": 0.5778, + "step": 1163 + }, + { + "epoch": 0.15, + "grad_norm": 0.6189268621262112, + "learning_rate": 9.948980340364002e-06, + "loss": 0.5335, + "step": 1164 + }, + { + "epoch": 0.15, + "grad_norm": 0.6192781388637013, + "learning_rate": 9.948833243070877e-06, + "loss": 0.5795, + "step": 1165 + }, + { + "epoch": 0.15, + "grad_norm": 0.6997940420230017, + "learning_rate": 9.948685935121735e-06, + "loss": 0.5817, + "step": 1166 + }, + { + "epoch": 0.15, + "grad_norm": 0.666587408485315, + "learning_rate": 9.948538416522846e-06, + "loss": 0.5358, + "step": 1167 + }, + { + "epoch": 0.15, + "grad_norm": 0.922220574123095, + "learning_rate": 9.948390687280489e-06, + "loss": 0.6504, + "step": 1168 + }, + { + "epoch": 0.15, + "grad_norm": 0.9291835324778818, + "learning_rate": 9.948242747400953e-06, + "loss": 0.6552, + "step": 1169 + }, + { + "epoch": 0.15, + "grad_norm": 0.6609464921861429, + "learning_rate": 9.948094596890536e-06, + "loss": 0.5258, + "step": 1170 + }, + { + "epoch": 0.15, + "grad_norm": 0.7432722982339321, + "learning_rate": 9.947946235755545e-06, + "loss": 0.518, + "step": 1171 + }, + { + "epoch": 0.15, + "grad_norm": 0.9232759463515889, + "learning_rate": 9.947797664002294e-06, + "loss": 0.6911, + "step": 1172 + }, + { + "epoch": 0.15, + "grad_norm": 0.9677536032809348, + "learning_rate": 9.947648881637107e-06, + "loss": 0.6149, + "step": 1173 + }, + { + "epoch": 0.15, + "grad_norm": 1.0779556679380788, + "learning_rate": 9.947499888666317e-06, + "loss": 0.6383, + "step": 1174 + }, + { + "epoch": 0.15, + "grad_norm": 0.6818092950991045, + "learning_rate": 9.947350685096266e-06, + "loss": 0.5611, + "step": 1175 + }, + { + "epoch": 0.15, + "grad_norm": 0.6562011194061566, + "learning_rate": 9.947201270933307e-06, + "loss": 0.558, + "step": 1176 + }, + { + "epoch": 0.15, + "grad_norm": 0.7506221775353435, + "learning_rate": 9.947051646183798e-06, + "loss": 0.6525, + "step": 1177 + }, + { + "epoch": 0.15, + "grad_norm": 0.9732030857141276, + "learning_rate": 9.946901810854109e-06, + "loss": 0.6358, + "step": 1178 + }, + { + "epoch": 0.15, + "grad_norm": 0.6719722587021625, + "learning_rate": 9.94675176495062e-06, + "loss": 0.5724, + "step": 1179 + }, + { + "epoch": 0.15, + "grad_norm": 0.7277819505454043, + "learning_rate": 9.946601508479714e-06, + "loss": 0.5659, + "step": 1180 + }, + { + "epoch": 0.15, + "grad_norm": 0.6024131898213151, + "learning_rate": 9.946451041447788e-06, + "loss": 0.5407, + "step": 1181 + }, + { + "epoch": 0.15, + "grad_norm": 0.6194888085588076, + "learning_rate": 9.94630036386125e-06, + "loss": 0.531, + "step": 1182 + }, + { + "epoch": 0.15, + "grad_norm": 0.819612771311822, + "learning_rate": 9.946149475726509e-06, + "loss": 0.6299, + "step": 1183 + }, + { + "epoch": 0.15, + "grad_norm": 0.5988162409384846, + "learning_rate": 9.945998377049992e-06, + "loss": 0.5163, + "step": 1184 + }, + { + "epoch": 0.15, + "grad_norm": 0.621047731739097, + "learning_rate": 9.945847067838131e-06, + "loss": 0.6432, + "step": 1185 + }, + { + "epoch": 0.15, + "grad_norm": 0.6933969601897686, + "learning_rate": 9.945695548097363e-06, + "loss": 0.55, + "step": 1186 + }, + { + "epoch": 0.15, + "grad_norm": 0.6215097677045021, + "learning_rate": 9.94554381783414e-06, + "loss": 0.627, + "step": 1187 + }, + { + "epoch": 0.15, + "grad_norm": 0.9398154868265022, + "learning_rate": 9.945391877054919e-06, + "loss": 0.7062, + "step": 1188 + }, + { + "epoch": 0.15, + "grad_norm": 0.9323473338926082, + "learning_rate": 9.945239725766172e-06, + "loss": 0.6333, + "step": 1189 + }, + { + "epoch": 0.15, + "grad_norm": 0.9423907671305878, + "learning_rate": 9.94508736397437e-06, + "loss": 0.6752, + "step": 1190 + }, + { + "epoch": 0.15, + "grad_norm": 0.9615586928806058, + "learning_rate": 9.944934791686003e-06, + "loss": 0.6944, + "step": 1191 + }, + { + "epoch": 0.15, + "grad_norm": 0.6415203280411638, + "learning_rate": 9.944782008907564e-06, + "loss": 0.5337, + "step": 1192 + }, + { + "epoch": 0.15, + "grad_norm": 0.6503610631443516, + "learning_rate": 9.944629015645553e-06, + "loss": 0.517, + "step": 1193 + }, + { + "epoch": 0.15, + "grad_norm": 1.189574393483398, + "learning_rate": 9.944475811906487e-06, + "loss": 0.6663, + "step": 1194 + }, + { + "epoch": 0.15, + "grad_norm": 0.8127292741305696, + "learning_rate": 9.944322397696888e-06, + "loss": 0.6214, + "step": 1195 + }, + { + "epoch": 0.15, + "grad_norm": 0.9352628663178603, + "learning_rate": 9.944168773023282e-06, + "loss": 0.6445, + "step": 1196 + }, + { + "epoch": 0.15, + "grad_norm": 0.8159532544926157, + "learning_rate": 9.944014937892211e-06, + "loss": 0.6347, + "step": 1197 + }, + { + "epoch": 0.15, + "grad_norm": 0.7529396903292472, + "learning_rate": 9.943860892310225e-06, + "loss": 0.5708, + "step": 1198 + }, + { + "epoch": 0.15, + "grad_norm": 0.7332699396889881, + "learning_rate": 9.943706636283876e-06, + "loss": 0.617, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 0.7065292946089605, + "learning_rate": 9.943552169819734e-06, + "loss": 0.5599, + "step": 1200 + }, + { + "epoch": 0.15, + "grad_norm": 0.9523616116392661, + "learning_rate": 9.943397492924377e-06, + "loss": 0.6991, + "step": 1201 + }, + { + "epoch": 0.15, + "grad_norm": 0.6718897182036292, + "learning_rate": 9.943242605604381e-06, + "loss": 0.5334, + "step": 1202 + }, + { + "epoch": 0.15, + "grad_norm": 0.6255929956273176, + "learning_rate": 9.943087507866345e-06, + "loss": 0.5123, + "step": 1203 + }, + { + "epoch": 0.15, + "grad_norm": 0.5863924057030013, + "learning_rate": 9.94293219971687e-06, + "loss": 0.5259, + "step": 1204 + }, + { + "epoch": 0.15, + "grad_norm": 0.6129535666687689, + "learning_rate": 9.942776681162566e-06, + "loss": 0.5341, + "step": 1205 + }, + { + "epoch": 0.15, + "grad_norm": 0.6738433228662217, + "learning_rate": 9.942620952210057e-06, + "loss": 0.5877, + "step": 1206 + }, + { + "epoch": 0.15, + "grad_norm": 0.6691527608264151, + "learning_rate": 9.942465012865964e-06, + "loss": 0.6106, + "step": 1207 + }, + { + "epoch": 0.15, + "grad_norm": 0.8947966968433815, + "learning_rate": 9.94230886313693e-06, + "loss": 0.6766, + "step": 1208 + }, + { + "epoch": 0.15, + "grad_norm": 0.5937059305694171, + "learning_rate": 9.942152503029603e-06, + "loss": 0.5424, + "step": 1209 + }, + { + "epoch": 0.15, + "grad_norm": 0.6666550628105659, + "learning_rate": 9.941995932550636e-06, + "loss": 0.6206, + "step": 1210 + }, + { + "epoch": 0.15, + "grad_norm": 0.6855403367079185, + "learning_rate": 9.941839151706694e-06, + "loss": 0.5557, + "step": 1211 + }, + { + "epoch": 0.15, + "grad_norm": 0.6996973493623373, + "learning_rate": 9.941682160504452e-06, + "loss": 0.638, + "step": 1212 + }, + { + "epoch": 0.15, + "grad_norm": 0.8022251873817732, + "learning_rate": 9.941524958950591e-06, + "loss": 0.6535, + "step": 1213 + }, + { + "epoch": 0.15, + "grad_norm": 0.8575600904950452, + "learning_rate": 9.941367547051803e-06, + "loss": 0.5889, + "step": 1214 + }, + { + "epoch": 0.15, + "grad_norm": 0.7458489001018135, + "learning_rate": 9.94120992481479e-06, + "loss": 0.6282, + "step": 1215 + }, + { + "epoch": 0.15, + "grad_norm": 0.7011274758384984, + "learning_rate": 9.94105209224626e-06, + "loss": 0.6035, + "step": 1216 + }, + { + "epoch": 0.16, + "grad_norm": 0.778022051851357, + "learning_rate": 9.940894049352932e-06, + "loss": 0.6024, + "step": 1217 + }, + { + "epoch": 0.16, + "grad_norm": 0.8229848811812075, + "learning_rate": 9.940735796141533e-06, + "loss": 0.7138, + "step": 1218 + }, + { + "epoch": 0.16, + "grad_norm": 0.6693289356258347, + "learning_rate": 9.940577332618798e-06, + "loss": 0.6005, + "step": 1219 + }, + { + "epoch": 0.16, + "grad_norm": 0.8768519648063721, + "learning_rate": 9.940418658791475e-06, + "loss": 0.6364, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 0.6210843299179503, + "learning_rate": 9.940259774666316e-06, + "loss": 0.5151, + "step": 1221 + }, + { + "epoch": 0.16, + "grad_norm": 0.5928265232338573, + "learning_rate": 9.940100680250086e-06, + "loss": 0.5706, + "step": 1222 + }, + { + "epoch": 0.16, + "grad_norm": 0.7666455691407895, + "learning_rate": 9.939941375549559e-06, + "loss": 0.611, + "step": 1223 + }, + { + "epoch": 0.16, + "grad_norm": 0.8285991798299784, + "learning_rate": 9.93978186057151e-06, + "loss": 0.657, + "step": 1224 + }, + { + "epoch": 0.16, + "grad_norm": 0.5513770793701552, + "learning_rate": 9.939622135322733e-06, + "loss": 0.5328, + "step": 1225 + }, + { + "epoch": 0.16, + "grad_norm": 0.6724169105488185, + "learning_rate": 9.939462199810027e-06, + "loss": 0.5896, + "step": 1226 + }, + { + "epoch": 0.16, + "grad_norm": 0.6217065032154994, + "learning_rate": 9.9393020540402e-06, + "loss": 0.535, + "step": 1227 + }, + { + "epoch": 0.16, + "grad_norm": 0.714592607405014, + "learning_rate": 9.939141698020067e-06, + "loss": 0.5438, + "step": 1228 + }, + { + "epoch": 0.16, + "grad_norm": 0.6270973608896668, + "learning_rate": 9.938981131756455e-06, + "loss": 0.5692, + "step": 1229 + }, + { + "epoch": 0.16, + "grad_norm": 0.7516987047795822, + "learning_rate": 9.938820355256201e-06, + "loss": 0.6568, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 0.6853617324725388, + "learning_rate": 9.938659368526146e-06, + "loss": 0.5389, + "step": 1231 + }, + { + "epoch": 0.16, + "grad_norm": 0.739056326079907, + "learning_rate": 9.938498171573142e-06, + "loss": 0.5849, + "step": 1232 + }, + { + "epoch": 0.16, + "grad_norm": 0.8286906199401044, + "learning_rate": 9.938336764404053e-06, + "loss": 0.6549, + "step": 1233 + }, + { + "epoch": 0.16, + "grad_norm": 0.6871217548760605, + "learning_rate": 9.938175147025749e-06, + "loss": 0.5441, + "step": 1234 + }, + { + "epoch": 0.16, + "grad_norm": 0.635761385045518, + "learning_rate": 9.938013319445107e-06, + "loss": 0.5623, + "step": 1235 + }, + { + "epoch": 0.16, + "grad_norm": 0.7732509643778649, + "learning_rate": 9.93785128166902e-06, + "loss": 0.6476, + "step": 1236 + }, + { + "epoch": 0.16, + "grad_norm": 0.9388942019799706, + "learning_rate": 9.937689033704383e-06, + "loss": 0.6313, + "step": 1237 + }, + { + "epoch": 0.16, + "grad_norm": 0.6506073910645148, + "learning_rate": 9.937526575558102e-06, + "loss": 0.5507, + "step": 1238 + }, + { + "epoch": 0.16, + "grad_norm": 0.7109459417537408, + "learning_rate": 9.937363907237093e-06, + "loss": 0.5455, + "step": 1239 + }, + { + "epoch": 0.16, + "grad_norm": 0.7131945189513951, + "learning_rate": 9.937201028748278e-06, + "loss": 0.5972, + "step": 1240 + }, + { + "epoch": 0.16, + "grad_norm": 0.7933921050202154, + "learning_rate": 9.937037940098595e-06, + "loss": 0.6384, + "step": 1241 + }, + { + "epoch": 0.16, + "grad_norm": 0.7117828298931219, + "learning_rate": 9.936874641294982e-06, + "loss": 0.5741, + "step": 1242 + }, + { + "epoch": 0.16, + "grad_norm": 0.6858607363442373, + "learning_rate": 9.936711132344393e-06, + "loss": 0.5684, + "step": 1243 + }, + { + "epoch": 0.16, + "grad_norm": 1.079827994215085, + "learning_rate": 9.936547413253783e-06, + "loss": 0.6917, + "step": 1244 + }, + { + "epoch": 0.16, + "grad_norm": 0.8693329698611052, + "learning_rate": 9.936383484030127e-06, + "loss": 0.6281, + "step": 1245 + }, + { + "epoch": 0.16, + "grad_norm": 0.9237381724365175, + "learning_rate": 9.9362193446804e-06, + "loss": 0.6816, + "step": 1246 + }, + { + "epoch": 0.16, + "grad_norm": 0.5947942369877502, + "learning_rate": 9.93605499521159e-06, + "loss": 0.5281, + "step": 1247 + }, + { + "epoch": 0.16, + "grad_norm": 0.6876185476635489, + "learning_rate": 9.935890435630693e-06, + "loss": 0.5819, + "step": 1248 + }, + { + "epoch": 0.16, + "grad_norm": 0.7852566555702422, + "learning_rate": 9.935725665944712e-06, + "loss": 0.5835, + "step": 1249 + }, + { + "epoch": 0.16, + "grad_norm": 0.8860897748537813, + "learning_rate": 9.935560686160661e-06, + "loss": 0.6229, + "step": 1250 + }, + { + "epoch": 0.16, + "grad_norm": 0.8380467170046101, + "learning_rate": 9.935395496285565e-06, + "loss": 0.5754, + "step": 1251 + }, + { + "epoch": 0.16, + "grad_norm": 0.8174419125773005, + "learning_rate": 9.935230096326452e-06, + "loss": 0.5831, + "step": 1252 + }, + { + "epoch": 0.16, + "grad_norm": 0.6096272258583538, + "learning_rate": 9.935064486290366e-06, + "loss": 0.5627, + "step": 1253 + }, + { + "epoch": 0.16, + "grad_norm": 0.8144193664929303, + "learning_rate": 9.934898666184354e-06, + "loss": 0.6342, + "step": 1254 + }, + { + "epoch": 0.16, + "grad_norm": 0.6618055773716802, + "learning_rate": 9.934732636015475e-06, + "loss": 0.558, + "step": 1255 + }, + { + "epoch": 0.16, + "grad_norm": 0.5670649211847698, + "learning_rate": 9.934566395790798e-06, + "loss": 0.5249, + "step": 1256 + }, + { + "epoch": 0.16, + "grad_norm": 1.0116605711057045, + "learning_rate": 9.934399945517398e-06, + "loss": 0.7115, + "step": 1257 + }, + { + "epoch": 0.16, + "grad_norm": 0.9606958544754229, + "learning_rate": 9.934233285202362e-06, + "loss": 0.6245, + "step": 1258 + }, + { + "epoch": 0.16, + "grad_norm": 0.6005659704302857, + "learning_rate": 9.93406641485278e-06, + "loss": 0.5508, + "step": 1259 + }, + { + "epoch": 0.16, + "grad_norm": 0.603908805834337, + "learning_rate": 9.93389933447576e-06, + "loss": 0.5268, + "step": 1260 + }, + { + "epoch": 0.16, + "grad_norm": 0.6912683190406798, + "learning_rate": 9.93373204407841e-06, + "loss": 0.5603, + "step": 1261 + }, + { + "epoch": 0.16, + "grad_norm": 1.5015099086663268, + "learning_rate": 9.933564543667854e-06, + "loss": 0.6758, + "step": 1262 + }, + { + "epoch": 0.16, + "grad_norm": 0.61498877277781, + "learning_rate": 9.933396833251221e-06, + "loss": 0.5979, + "step": 1263 + }, + { + "epoch": 0.16, + "grad_norm": 0.7587062308638808, + "learning_rate": 9.933228912835649e-06, + "loss": 0.535, + "step": 1264 + }, + { + "epoch": 0.16, + "grad_norm": 0.6372135461611754, + "learning_rate": 9.933060782428286e-06, + "loss": 0.5916, + "step": 1265 + }, + { + "epoch": 0.16, + "grad_norm": 0.8828201367862585, + "learning_rate": 9.932892442036289e-06, + "loss": 0.5955, + "step": 1266 + }, + { + "epoch": 0.16, + "grad_norm": 0.8675700039740134, + "learning_rate": 9.932723891666825e-06, + "loss": 0.6343, + "step": 1267 + }, + { + "epoch": 0.16, + "grad_norm": 1.000392034730579, + "learning_rate": 9.932555131327069e-06, + "loss": 0.5913, + "step": 1268 + }, + { + "epoch": 0.16, + "grad_norm": 0.6441274395889746, + "learning_rate": 9.9323861610242e-06, + "loss": 0.5294, + "step": 1269 + }, + { + "epoch": 0.16, + "grad_norm": 0.5634681746022322, + "learning_rate": 9.932216980765416e-06, + "loss": 0.5094, + "step": 1270 + }, + { + "epoch": 0.16, + "grad_norm": 0.8087186209107541, + "learning_rate": 9.932047590557916e-06, + "loss": 0.6285, + "step": 1271 + }, + { + "epoch": 0.16, + "grad_norm": 0.5830632100716865, + "learning_rate": 9.93187799040891e-06, + "loss": 0.5117, + "step": 1272 + }, + { + "epoch": 0.16, + "grad_norm": 0.5993417802328452, + "learning_rate": 9.931708180325619e-06, + "loss": 0.5062, + "step": 1273 + }, + { + "epoch": 0.16, + "grad_norm": 0.669294113898828, + "learning_rate": 9.931538160315268e-06, + "loss": 0.5355, + "step": 1274 + }, + { + "epoch": 0.16, + "grad_norm": 0.60490174747613, + "learning_rate": 9.931367930385098e-06, + "loss": 0.5085, + "step": 1275 + }, + { + "epoch": 0.16, + "grad_norm": 0.6262615291290174, + "learning_rate": 9.931197490542354e-06, + "loss": 0.5536, + "step": 1276 + }, + { + "epoch": 0.16, + "grad_norm": 0.91396197589831, + "learning_rate": 9.931026840794292e-06, + "loss": 0.6301, + "step": 1277 + }, + { + "epoch": 0.16, + "grad_norm": 1.0022672535619206, + "learning_rate": 9.930855981148172e-06, + "loss": 0.6549, + "step": 1278 + }, + { + "epoch": 0.16, + "grad_norm": 0.9263003202148294, + "learning_rate": 9.93068491161127e-06, + "loss": 0.6036, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 0.7871313319089243, + "learning_rate": 9.930513632190868e-06, + "loss": 0.643, + "step": 1280 + }, + { + "epoch": 0.16, + "grad_norm": 0.7385298884516197, + "learning_rate": 9.930342142894259e-06, + "loss": 0.544, + "step": 1281 + }, + { + "epoch": 0.16, + "grad_norm": 0.9343110234348226, + "learning_rate": 9.930170443728736e-06, + "loss": 0.6865, + "step": 1282 + }, + { + "epoch": 0.16, + "grad_norm": 1.778588128833451, + "learning_rate": 9.929998534701612e-06, + "loss": 0.6069, + "step": 1283 + }, + { + "epoch": 0.16, + "grad_norm": 1.0524259183173499, + "learning_rate": 9.929826415820207e-06, + "loss": 0.6236, + "step": 1284 + }, + { + "epoch": 0.16, + "grad_norm": 0.8147387322477797, + "learning_rate": 9.929654087091845e-06, + "loss": 0.5876, + "step": 1285 + }, + { + "epoch": 0.16, + "grad_norm": 0.8626286053173774, + "learning_rate": 9.92948154852386e-06, + "loss": 0.6699, + "step": 1286 + }, + { + "epoch": 0.16, + "grad_norm": 0.6848567231390712, + "learning_rate": 9.929308800123597e-06, + "loss": 0.5203, + "step": 1287 + }, + { + "epoch": 0.16, + "grad_norm": 0.8106992091885179, + "learning_rate": 9.929135841898412e-06, + "loss": 0.6333, + "step": 1288 + }, + { + "epoch": 0.16, + "grad_norm": 0.6355507244372836, + "learning_rate": 9.928962673855664e-06, + "loss": 0.5674, + "step": 1289 + }, + { + "epoch": 0.16, + "grad_norm": 0.9575810776483066, + "learning_rate": 9.928789296002726e-06, + "loss": 0.5601, + "step": 1290 + }, + { + "epoch": 0.16, + "grad_norm": 0.6614320788002571, + "learning_rate": 9.928615708346978e-06, + "loss": 0.5323, + "step": 1291 + }, + { + "epoch": 0.16, + "grad_norm": 0.78603144821625, + "learning_rate": 9.92844191089581e-06, + "loss": 0.63, + "step": 1292 + }, + { + "epoch": 0.16, + "grad_norm": 0.854145474325908, + "learning_rate": 9.92826790365662e-06, + "loss": 0.5931, + "step": 1293 + }, + { + "epoch": 0.16, + "grad_norm": 0.9479017928633185, + "learning_rate": 9.928093686636811e-06, + "loss": 0.6366, + "step": 1294 + }, + { + "epoch": 0.16, + "grad_norm": 0.6556789536313667, + "learning_rate": 9.927919259843801e-06, + "loss": 0.6144, + "step": 1295 + }, + { + "epoch": 0.17, + "grad_norm": 0.7178000902835753, + "learning_rate": 9.927744623285017e-06, + "loss": 0.6428, + "step": 1296 + }, + { + "epoch": 0.17, + "grad_norm": 0.9778528421679374, + "learning_rate": 9.927569776967891e-06, + "loss": 0.6789, + "step": 1297 + }, + { + "epoch": 0.17, + "grad_norm": 0.699409897187686, + "learning_rate": 9.927394720899866e-06, + "loss": 0.5989, + "step": 1298 + }, + { + "epoch": 0.17, + "grad_norm": 0.7475897085800394, + "learning_rate": 9.927219455088394e-06, + "loss": 0.5351, + "step": 1299 + }, + { + "epoch": 0.17, + "grad_norm": 0.9380606806438925, + "learning_rate": 9.927043979540934e-06, + "loss": 0.6808, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 0.7623954190929098, + "learning_rate": 9.926868294264957e-06, + "loss": 0.6033, + "step": 1301 + }, + { + "epoch": 0.17, + "grad_norm": 0.8451633671559466, + "learning_rate": 9.92669239926794e-06, + "loss": 0.6047, + "step": 1302 + }, + { + "epoch": 0.17, + "grad_norm": 0.6344557359348485, + "learning_rate": 9.926516294557374e-06, + "loss": 0.4629, + "step": 1303 + }, + { + "epoch": 0.17, + "grad_norm": 0.983505389962346, + "learning_rate": 9.92633998014075e-06, + "loss": 0.6401, + "step": 1304 + }, + { + "epoch": 0.17, + "grad_norm": 0.8396107779558755, + "learning_rate": 9.926163456025573e-06, + "loss": 0.5932, + "step": 1305 + }, + { + "epoch": 0.17, + "grad_norm": 0.6464848349421851, + "learning_rate": 9.925986722219362e-06, + "loss": 0.5482, + "step": 1306 + }, + { + "epoch": 0.17, + "grad_norm": 0.6152424291581389, + "learning_rate": 9.925809778729639e-06, + "loss": 0.5177, + "step": 1307 + }, + { + "epoch": 0.17, + "grad_norm": 0.8963702088469717, + "learning_rate": 9.925632625563931e-06, + "loss": 0.6215, + "step": 1308 + }, + { + "epoch": 0.17, + "grad_norm": 0.8236919014857446, + "learning_rate": 9.925455262729785e-06, + "loss": 0.6344, + "step": 1309 + }, + { + "epoch": 0.17, + "grad_norm": 0.8067235544500863, + "learning_rate": 9.925277690234745e-06, + "loss": 0.6568, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 0.8053307649416414, + "learning_rate": 9.925099908086375e-06, + "loss": 0.6914, + "step": 1311 + }, + { + "epoch": 0.17, + "grad_norm": 0.8185646572828913, + "learning_rate": 9.924921916292239e-06, + "loss": 0.6134, + "step": 1312 + }, + { + "epoch": 0.17, + "grad_norm": 0.8488052935040817, + "learning_rate": 9.924743714859915e-06, + "loss": 0.6114, + "step": 1313 + }, + { + "epoch": 0.17, + "grad_norm": 0.786538209385159, + "learning_rate": 9.924565303796988e-06, + "loss": 0.6325, + "step": 1314 + }, + { + "epoch": 0.17, + "grad_norm": 0.8421790302765904, + "learning_rate": 9.924386683111055e-06, + "loss": 0.6299, + "step": 1315 + }, + { + "epoch": 0.17, + "grad_norm": 0.8593619423668151, + "learning_rate": 9.924207852809715e-06, + "loss": 0.5854, + "step": 1316 + }, + { + "epoch": 0.17, + "grad_norm": 0.7794165596221101, + "learning_rate": 9.924028812900582e-06, + "loss": 0.6472, + "step": 1317 + }, + { + "epoch": 0.17, + "grad_norm": 0.6200289391608015, + "learning_rate": 9.923849563391278e-06, + "loss": 0.5082, + "step": 1318 + }, + { + "epoch": 0.17, + "grad_norm": 0.6503282906773031, + "learning_rate": 9.923670104289433e-06, + "loss": 0.5554, + "step": 1319 + }, + { + "epoch": 0.17, + "grad_norm": 0.9334848680283766, + "learning_rate": 9.923490435602685e-06, + "loss": 0.6438, + "step": 1320 + }, + { + "epoch": 0.17, + "grad_norm": 0.853719193099155, + "learning_rate": 9.923310557338681e-06, + "loss": 0.5958, + "step": 1321 + }, + { + "epoch": 0.17, + "grad_norm": 0.6323711898369337, + "learning_rate": 9.923130469505081e-06, + "loss": 0.524, + "step": 1322 + }, + { + "epoch": 0.17, + "grad_norm": 0.6266910281118175, + "learning_rate": 9.922950172109549e-06, + "loss": 0.5212, + "step": 1323 + }, + { + "epoch": 0.17, + "grad_norm": 0.6949655797370283, + "learning_rate": 9.922769665159759e-06, + "loss": 0.5901, + "step": 1324 + }, + { + "epoch": 0.17, + "grad_norm": 0.8504757543505016, + "learning_rate": 9.922588948663395e-06, + "loss": 0.6324, + "step": 1325 + }, + { + "epoch": 0.17, + "grad_norm": 1.198837872795737, + "learning_rate": 9.92240802262815e-06, + "loss": 0.5773, + "step": 1326 + }, + { + "epoch": 0.17, + "grad_norm": 0.7389168502790365, + "learning_rate": 9.922226887061726e-06, + "loss": 0.6019, + "step": 1327 + }, + { + "epoch": 0.17, + "grad_norm": 0.9543392867703335, + "learning_rate": 9.922045541971831e-06, + "loss": 0.6718, + "step": 1328 + }, + { + "epoch": 0.17, + "grad_norm": 0.6769447419107208, + "learning_rate": 9.921863987366187e-06, + "loss": 0.5451, + "step": 1329 + }, + { + "epoch": 0.17, + "grad_norm": 0.7995115583222592, + "learning_rate": 9.921682223252522e-06, + "loss": 0.6461, + "step": 1330 + }, + { + "epoch": 0.17, + "grad_norm": 0.5807904876349749, + "learning_rate": 9.921500249638572e-06, + "loss": 0.5647, + "step": 1331 + }, + { + "epoch": 0.17, + "grad_norm": 0.7328444051173592, + "learning_rate": 9.921318066532082e-06, + "loss": 0.5567, + "step": 1332 + }, + { + "epoch": 0.17, + "grad_norm": 0.6423699956731261, + "learning_rate": 9.921135673940809e-06, + "loss": 0.532, + "step": 1333 + }, + { + "epoch": 0.17, + "grad_norm": 1.0732783771120218, + "learning_rate": 9.920953071872514e-06, + "loss": 0.6393, + "step": 1334 + }, + { + "epoch": 0.17, + "grad_norm": 0.6072194907461469, + "learning_rate": 9.920770260334974e-06, + "loss": 0.5242, + "step": 1335 + }, + { + "epoch": 0.17, + "grad_norm": 0.8678831232928724, + "learning_rate": 9.920587239335968e-06, + "loss": 0.6428, + "step": 1336 + }, + { + "epoch": 0.17, + "grad_norm": 0.7322646257079967, + "learning_rate": 9.920404008883287e-06, + "loss": 0.582, + "step": 1337 + }, + { + "epoch": 0.17, + "grad_norm": 0.6987065357589037, + "learning_rate": 9.920220568984731e-06, + "loss": 0.5263, + "step": 1338 + }, + { + "epoch": 0.17, + "grad_norm": 0.894246119987906, + "learning_rate": 9.920036919648108e-06, + "loss": 0.6545, + "step": 1339 + }, + { + "epoch": 0.17, + "grad_norm": 0.7271150938938604, + "learning_rate": 9.919853060881238e-06, + "loss": 0.6291, + "step": 1340 + }, + { + "epoch": 0.17, + "grad_norm": 1.0716868696461448, + "learning_rate": 9.919668992691941e-06, + "loss": 0.6191, + "step": 1341 + }, + { + "epoch": 0.17, + "grad_norm": 1.07794875912514, + "learning_rate": 9.919484715088057e-06, + "loss": 0.6533, + "step": 1342 + }, + { + "epoch": 0.17, + "grad_norm": 0.7029692021347415, + "learning_rate": 9.91930022807743e-06, + "loss": 0.5706, + "step": 1343 + }, + { + "epoch": 0.17, + "grad_norm": 0.5813230784025685, + "learning_rate": 9.919115531667911e-06, + "loss": 0.5593, + "step": 1344 + }, + { + "epoch": 0.17, + "grad_norm": 0.605113227667247, + "learning_rate": 9.918930625867364e-06, + "loss": 0.5373, + "step": 1345 + }, + { + "epoch": 0.17, + "grad_norm": 0.7225811940174828, + "learning_rate": 9.918745510683659e-06, + "loss": 0.6309, + "step": 1346 + }, + { + "epoch": 0.17, + "grad_norm": 0.6053881124764692, + "learning_rate": 9.918560186124674e-06, + "loss": 0.5351, + "step": 1347 + }, + { + "epoch": 0.17, + "grad_norm": 0.6818319116848462, + "learning_rate": 9.918374652198302e-06, + "loss": 0.5734, + "step": 1348 + }, + { + "epoch": 0.17, + "grad_norm": 0.8958174067084802, + "learning_rate": 9.918188908912436e-06, + "loss": 0.6048, + "step": 1349 + }, + { + "epoch": 0.17, + "grad_norm": 0.8187811101690751, + "learning_rate": 9.918002956274986e-06, + "loss": 0.6245, + "step": 1350 + }, + { + "epoch": 0.17, + "grad_norm": 0.7363704720909162, + "learning_rate": 9.917816794293864e-06, + "loss": 0.5962, + "step": 1351 + }, + { + "epoch": 0.17, + "grad_norm": 0.7539830444599103, + "learning_rate": 9.917630422976997e-06, + "loss": 0.632, + "step": 1352 + }, + { + "epoch": 0.17, + "grad_norm": 0.800461503275176, + "learning_rate": 9.917443842332318e-06, + "loss": 0.6383, + "step": 1353 + }, + { + "epoch": 0.17, + "grad_norm": 1.5859864768068026, + "learning_rate": 9.917257052367768e-06, + "loss": 0.5852, + "step": 1354 + }, + { + "epoch": 0.17, + "grad_norm": 0.7748718361879976, + "learning_rate": 9.917070053091298e-06, + "loss": 0.5468, + "step": 1355 + }, + { + "epoch": 0.17, + "grad_norm": 0.7480038777585786, + "learning_rate": 9.916882844510868e-06, + "loss": 0.6377, + "step": 1356 + }, + { + "epoch": 0.17, + "grad_norm": 0.5950072074474939, + "learning_rate": 9.91669542663445e-06, + "loss": 0.5946, + "step": 1357 + }, + { + "epoch": 0.17, + "grad_norm": 0.7547818256118963, + "learning_rate": 9.916507799470016e-06, + "loss": 0.5945, + "step": 1358 + }, + { + "epoch": 0.17, + "grad_norm": 0.6190436223612008, + "learning_rate": 9.916319963025558e-06, + "loss": 0.588, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 0.6289210240530831, + "learning_rate": 9.91613191730907e-06, + "loss": 0.5692, + "step": 1360 + }, + { + "epoch": 0.17, + "grad_norm": 0.6210779960478716, + "learning_rate": 9.915943662328555e-06, + "loss": 0.5621, + "step": 1361 + }, + { + "epoch": 0.17, + "grad_norm": 1.0351385027493207, + "learning_rate": 9.915755198092027e-06, + "loss": 0.6741, + "step": 1362 + }, + { + "epoch": 0.17, + "grad_norm": 1.0988988261684944, + "learning_rate": 9.915566524607509e-06, + "loss": 0.5733, + "step": 1363 + }, + { + "epoch": 0.17, + "grad_norm": 0.8624232098103166, + "learning_rate": 9.915377641883033e-06, + "loss": 0.6935, + "step": 1364 + }, + { + "epoch": 0.17, + "grad_norm": 0.9210306619681377, + "learning_rate": 9.915188549926637e-06, + "loss": 0.6424, + "step": 1365 + }, + { + "epoch": 0.17, + "grad_norm": 0.9948571038404521, + "learning_rate": 9.914999248746371e-06, + "loss": 0.6696, + "step": 1366 + }, + { + "epoch": 0.17, + "grad_norm": 3.2880528637544018, + "learning_rate": 9.914809738350294e-06, + "loss": 0.6204, + "step": 1367 + }, + { + "epoch": 0.17, + "grad_norm": 0.6282329190313225, + "learning_rate": 9.914620018746472e-06, + "loss": 0.5488, + "step": 1368 + }, + { + "epoch": 0.17, + "grad_norm": 0.8313993414409726, + "learning_rate": 9.91443008994298e-06, + "loss": 0.6667, + "step": 1369 + }, + { + "epoch": 0.17, + "grad_norm": 1.1268742468420139, + "learning_rate": 9.914239951947902e-06, + "loss": 0.7016, + "step": 1370 + }, + { + "epoch": 0.17, + "grad_norm": 0.7616004118200012, + "learning_rate": 9.914049604769335e-06, + "loss": 0.6171, + "step": 1371 + }, + { + "epoch": 0.17, + "grad_norm": 0.6443959873384107, + "learning_rate": 9.913859048415378e-06, + "loss": 0.5593, + "step": 1372 + }, + { + "epoch": 0.17, + "grad_norm": 0.8507864958821436, + "learning_rate": 9.913668282894144e-06, + "loss": 0.6136, + "step": 1373 + }, + { + "epoch": 0.18, + "grad_norm": 0.7798333503909397, + "learning_rate": 9.913477308213754e-06, + "loss": 0.6367, + "step": 1374 + }, + { + "epoch": 0.18, + "grad_norm": 0.8440171935341105, + "learning_rate": 9.913286124382335e-06, + "loss": 0.6558, + "step": 1375 + }, + { + "epoch": 0.18, + "grad_norm": 0.7562020603164891, + "learning_rate": 9.913094731408029e-06, + "loss": 0.6334, + "step": 1376 + }, + { + "epoch": 0.18, + "grad_norm": 0.7084724801967266, + "learning_rate": 9.912903129298977e-06, + "loss": 0.5323, + "step": 1377 + }, + { + "epoch": 0.18, + "grad_norm": 0.6243371740891822, + "learning_rate": 9.91271131806334e-06, + "loss": 0.5569, + "step": 1378 + }, + { + "epoch": 0.18, + "grad_norm": 0.5874709344697451, + "learning_rate": 9.91251929770928e-06, + "loss": 0.539, + "step": 1379 + }, + { + "epoch": 0.18, + "grad_norm": 1.221005406389, + "learning_rate": 9.912327068244972e-06, + "loss": 0.6443, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 0.6991482269482668, + "learning_rate": 9.912134629678598e-06, + "loss": 0.5285, + "step": 1381 + }, + { + "epoch": 0.18, + "grad_norm": 0.7167757674295981, + "learning_rate": 9.91194198201835e-06, + "loss": 0.5978, + "step": 1382 + }, + { + "epoch": 0.18, + "grad_norm": 0.5761426658547066, + "learning_rate": 9.911749125272428e-06, + "loss": 0.5181, + "step": 1383 + }, + { + "epoch": 0.18, + "grad_norm": 0.8663088232605922, + "learning_rate": 9.911556059449043e-06, + "loss": 0.6388, + "step": 1384 + }, + { + "epoch": 0.18, + "grad_norm": 1.49120814305152, + "learning_rate": 9.91136278455641e-06, + "loss": 0.6325, + "step": 1385 + }, + { + "epoch": 0.18, + "grad_norm": 0.623522604963163, + "learning_rate": 9.911169300602758e-06, + "loss": 0.5337, + "step": 1386 + }, + { + "epoch": 0.18, + "grad_norm": 0.7734456232414336, + "learning_rate": 9.910975607596322e-06, + "loss": 0.5485, + "step": 1387 + }, + { + "epoch": 0.18, + "grad_norm": 0.5857910899105334, + "learning_rate": 9.91078170554535e-06, + "loss": 0.4878, + "step": 1388 + }, + { + "epoch": 0.18, + "grad_norm": 1.089985469595313, + "learning_rate": 9.910587594458089e-06, + "loss": 0.6836, + "step": 1389 + }, + { + "epoch": 0.18, + "grad_norm": 0.8096160681242981, + "learning_rate": 9.910393274342811e-06, + "loss": 0.613, + "step": 1390 + }, + { + "epoch": 0.18, + "grad_norm": 0.9494275055958991, + "learning_rate": 9.91019874520778e-06, + "loss": 0.6892, + "step": 1391 + }, + { + "epoch": 0.18, + "grad_norm": 0.8545086755605236, + "learning_rate": 9.910004007061281e-06, + "loss": 0.5985, + "step": 1392 + }, + { + "epoch": 0.18, + "grad_norm": 0.8613985857742912, + "learning_rate": 9.9098090599116e-06, + "loss": 0.6026, + "step": 1393 + }, + { + "epoch": 0.18, + "grad_norm": 0.6690755088707274, + "learning_rate": 9.909613903767038e-06, + "loss": 0.5481, + "step": 1394 + }, + { + "epoch": 0.18, + "grad_norm": 0.649320401916616, + "learning_rate": 9.909418538635898e-06, + "loss": 0.5486, + "step": 1395 + }, + { + "epoch": 0.18, + "grad_norm": 0.8956892661343855, + "learning_rate": 9.909222964526502e-06, + "loss": 0.6372, + "step": 1396 + }, + { + "epoch": 0.18, + "grad_norm": 0.6206099989106391, + "learning_rate": 9.909027181447171e-06, + "loss": 0.5523, + "step": 1397 + }, + { + "epoch": 0.18, + "grad_norm": 0.6666549049604876, + "learning_rate": 9.90883118940624e-06, + "loss": 0.4955, + "step": 1398 + }, + { + "epoch": 0.18, + "grad_norm": 0.9338525361206736, + "learning_rate": 9.908634988412052e-06, + "loss": 0.7067, + "step": 1399 + }, + { + "epoch": 0.18, + "grad_norm": 0.5636396475962866, + "learning_rate": 9.908438578472958e-06, + "loss": 0.502, + "step": 1400 + }, + { + "epoch": 0.18, + "grad_norm": 0.9191880855606592, + "learning_rate": 9.908241959597317e-06, + "loss": 0.6488, + "step": 1401 + }, + { + "epoch": 0.18, + "grad_norm": 0.6686319641226389, + "learning_rate": 9.908045131793504e-06, + "loss": 0.533, + "step": 1402 + }, + { + "epoch": 0.18, + "grad_norm": 0.7011337591826701, + "learning_rate": 9.907848095069892e-06, + "loss": 0.5539, + "step": 1403 + }, + { + "epoch": 0.18, + "grad_norm": 0.6767613065949596, + "learning_rate": 9.907650849434868e-06, + "loss": 0.6041, + "step": 1404 + }, + { + "epoch": 0.18, + "grad_norm": 0.832178458992157, + "learning_rate": 9.907453394896834e-06, + "loss": 0.6848, + "step": 1405 + }, + { + "epoch": 0.18, + "grad_norm": 0.7569351652390657, + "learning_rate": 9.907255731464186e-06, + "loss": 0.5541, + "step": 1406 + }, + { + "epoch": 0.18, + "grad_norm": 0.6697095708357677, + "learning_rate": 9.907057859145347e-06, + "loss": 0.4926, + "step": 1407 + }, + { + "epoch": 0.18, + "grad_norm": 0.6832580071791883, + "learning_rate": 9.906859777948735e-06, + "loss": 0.5754, + "step": 1408 + }, + { + "epoch": 0.18, + "grad_norm": 0.8892841362498346, + "learning_rate": 9.906661487882781e-06, + "loss": 0.6543, + "step": 1409 + }, + { + "epoch": 0.18, + "grad_norm": 0.6172957653929051, + "learning_rate": 9.906462988955927e-06, + "loss": 0.5229, + "step": 1410 + }, + { + "epoch": 0.18, + "grad_norm": 0.6165829961599874, + "learning_rate": 9.90626428117662e-06, + "loss": 0.5249, + "step": 1411 + }, + { + "epoch": 0.18, + "grad_norm": 0.9504490083975338, + "learning_rate": 9.906065364553325e-06, + "loss": 0.6192, + "step": 1412 + }, + { + "epoch": 0.18, + "grad_norm": 1.1873244845545143, + "learning_rate": 9.905866239094504e-06, + "loss": 0.6932, + "step": 1413 + }, + { + "epoch": 0.18, + "grad_norm": 8.06923183666248, + "learning_rate": 9.905666904808634e-06, + "loss": 0.6468, + "step": 1414 + }, + { + "epoch": 0.18, + "grad_norm": 0.6611026336952993, + "learning_rate": 9.905467361704197e-06, + "loss": 0.5087, + "step": 1415 + }, + { + "epoch": 0.18, + "grad_norm": 0.8563940941522039, + "learning_rate": 9.905267609789694e-06, + "loss": 0.6809, + "step": 1416 + }, + { + "epoch": 0.18, + "grad_norm": 0.7925201345931134, + "learning_rate": 9.905067649073623e-06, + "loss": 0.577, + "step": 1417 + }, + { + "epoch": 0.18, + "grad_norm": 0.7296902809381323, + "learning_rate": 9.904867479564495e-06, + "loss": 0.5895, + "step": 1418 + }, + { + "epoch": 0.18, + "grad_norm": 0.7197783778450046, + "learning_rate": 9.904667101270832e-06, + "loss": 0.526, + "step": 1419 + }, + { + "epoch": 0.18, + "grad_norm": 0.9967702771570577, + "learning_rate": 9.904466514201166e-06, + "loss": 0.6622, + "step": 1420 + }, + { + "epoch": 0.18, + "grad_norm": 0.8584232773378019, + "learning_rate": 9.904265718364032e-06, + "loss": 0.5902, + "step": 1421 + }, + { + "epoch": 0.18, + "grad_norm": 1.0000577762329252, + "learning_rate": 9.904064713767978e-06, + "loss": 0.6025, + "step": 1422 + }, + { + "epoch": 0.18, + "grad_norm": 0.6847807358568969, + "learning_rate": 9.90386350042156e-06, + "loss": 0.5539, + "step": 1423 + }, + { + "epoch": 0.18, + "grad_norm": 1.0644584919556448, + "learning_rate": 9.903662078333342e-06, + "loss": 0.6682, + "step": 1424 + }, + { + "epoch": 0.18, + "grad_norm": 0.5823183939248158, + "learning_rate": 9.9034604475119e-06, + "loss": 0.5162, + "step": 1425 + }, + { + "epoch": 0.18, + "grad_norm": 0.9679891295378018, + "learning_rate": 9.903258607965818e-06, + "loss": 0.6741, + "step": 1426 + }, + { + "epoch": 0.18, + "grad_norm": 0.6281644095149793, + "learning_rate": 9.903056559703683e-06, + "loss": 0.5355, + "step": 1427 + }, + { + "epoch": 0.18, + "grad_norm": 0.9660733377233732, + "learning_rate": 9.902854302734099e-06, + "loss": 0.6062, + "step": 1428 + }, + { + "epoch": 0.18, + "grad_norm": 1.5278258123370565, + "learning_rate": 9.902651837065675e-06, + "loss": 0.6827, + "step": 1429 + }, + { + "epoch": 0.18, + "grad_norm": 0.8966393076686645, + "learning_rate": 9.902449162707029e-06, + "loss": 0.6557, + "step": 1430 + }, + { + "epoch": 0.18, + "grad_norm": 0.8878906994413838, + "learning_rate": 9.90224627966679e-06, + "loss": 0.6704, + "step": 1431 + }, + { + "epoch": 0.18, + "grad_norm": 1.1623132432739118, + "learning_rate": 9.902043187953589e-06, + "loss": 0.6952, + "step": 1432 + }, + { + "epoch": 0.18, + "grad_norm": 0.9283657657374793, + "learning_rate": 9.901839887576075e-06, + "loss": 0.6659, + "step": 1433 + }, + { + "epoch": 0.18, + "grad_norm": 0.9289844103113588, + "learning_rate": 9.901636378542902e-06, + "loss": 0.6226, + "step": 1434 + }, + { + "epoch": 0.18, + "grad_norm": 0.7697613872955419, + "learning_rate": 9.901432660862731e-06, + "loss": 0.5677, + "step": 1435 + }, + { + "epoch": 0.18, + "grad_norm": 0.8329828227111222, + "learning_rate": 9.901228734544235e-06, + "loss": 0.586, + "step": 1436 + }, + { + "epoch": 0.18, + "grad_norm": 0.6647831985745791, + "learning_rate": 9.901024599596092e-06, + "loss": 0.5618, + "step": 1437 + }, + { + "epoch": 0.18, + "grad_norm": 0.9703069379973013, + "learning_rate": 9.900820256026996e-06, + "loss": 0.6259, + "step": 1438 + }, + { + "epoch": 0.18, + "grad_norm": 0.7257311929494387, + "learning_rate": 9.90061570384564e-06, + "loss": 0.5675, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 0.6434160744348719, + "learning_rate": 9.900410943060734e-06, + "loss": 0.5133, + "step": 1440 + }, + { + "epoch": 0.18, + "grad_norm": 0.567375826023123, + "learning_rate": 9.900205973680996e-06, + "loss": 0.5638, + "step": 1441 + }, + { + "epoch": 0.18, + "grad_norm": 1.1463821681631006, + "learning_rate": 9.900000795715146e-06, + "loss": 0.6737, + "step": 1442 + }, + { + "epoch": 0.18, + "grad_norm": 0.7329192627325487, + "learning_rate": 9.899795409171923e-06, + "loss": 0.5904, + "step": 1443 + }, + { + "epoch": 0.18, + "grad_norm": 0.8684228605169513, + "learning_rate": 9.899589814060063e-06, + "loss": 0.627, + "step": 1444 + }, + { + "epoch": 0.18, + "grad_norm": 0.5880745445069139, + "learning_rate": 9.899384010388324e-06, + "loss": 0.5317, + "step": 1445 + }, + { + "epoch": 0.18, + "grad_norm": 0.6981588853474568, + "learning_rate": 9.899177998165464e-06, + "loss": 0.5199, + "step": 1446 + }, + { + "epoch": 0.18, + "grad_norm": 0.9437908832128978, + "learning_rate": 9.898971777400251e-06, + "loss": 0.6431, + "step": 1447 + }, + { + "epoch": 0.18, + "grad_norm": 1.0875796335565775, + "learning_rate": 9.898765348101464e-06, + "loss": 0.6638, + "step": 1448 + }, + { + "epoch": 0.18, + "grad_norm": 0.8013969374803733, + "learning_rate": 9.898558710277893e-06, + "loss": 0.6208, + "step": 1449 + }, + { + "epoch": 0.18, + "grad_norm": 0.827640750552822, + "learning_rate": 9.898351863938328e-06, + "loss": 0.6842, + "step": 1450 + }, + { + "epoch": 0.18, + "grad_norm": 0.8845593547754813, + "learning_rate": 9.898144809091578e-06, + "loss": 0.6881, + "step": 1451 + }, + { + "epoch": 0.18, + "grad_norm": 0.6339290565768775, + "learning_rate": 9.897937545746457e-06, + "loss": 0.5589, + "step": 1452 + }, + { + "epoch": 0.19, + "grad_norm": 0.63683296212066, + "learning_rate": 9.897730073911785e-06, + "loss": 0.5677, + "step": 1453 + }, + { + "epoch": 0.19, + "grad_norm": 0.8037689871613029, + "learning_rate": 9.897522393596395e-06, + "loss": 0.6354, + "step": 1454 + }, + { + "epoch": 0.19, + "grad_norm": 0.7750823129055913, + "learning_rate": 9.897314504809128e-06, + "loss": 0.5568, + "step": 1455 + }, + { + "epoch": 0.19, + "grad_norm": 0.6269252556078115, + "learning_rate": 9.89710640755883e-06, + "loss": 0.5016, + "step": 1456 + }, + { + "epoch": 0.19, + "grad_norm": 0.5762192182401308, + "learning_rate": 9.896898101854363e-06, + "loss": 0.5174, + "step": 1457 + }, + { + "epoch": 0.19, + "grad_norm": 1.5017342628673056, + "learning_rate": 9.896689587704591e-06, + "loss": 0.6123, + "step": 1458 + }, + { + "epoch": 0.19, + "grad_norm": 0.7616051248698723, + "learning_rate": 9.896480865118393e-06, + "loss": 0.5661, + "step": 1459 + }, + { + "epoch": 0.19, + "grad_norm": 0.6299426594954421, + "learning_rate": 9.896271934104649e-06, + "loss": 0.537, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 0.6590310849058559, + "learning_rate": 9.896062794672255e-06, + "loss": 0.5836, + "step": 1461 + }, + { + "epoch": 0.19, + "grad_norm": 0.6677798165719644, + "learning_rate": 9.895853446830115e-06, + "loss": 0.6128, + "step": 1462 + }, + { + "epoch": 0.19, + "grad_norm": 1.7311823577825591, + "learning_rate": 9.895643890587137e-06, + "loss": 0.6706, + "step": 1463 + }, + { + "epoch": 0.19, + "grad_norm": 0.5735339436796842, + "learning_rate": 9.895434125952244e-06, + "loss": 0.552, + "step": 1464 + }, + { + "epoch": 0.19, + "grad_norm": 0.6007134379056589, + "learning_rate": 9.895224152934362e-06, + "loss": 0.5615, + "step": 1465 + }, + { + "epoch": 0.19, + "grad_norm": 0.792092039855217, + "learning_rate": 9.895013971542433e-06, + "loss": 0.592, + "step": 1466 + }, + { + "epoch": 0.19, + "grad_norm": 0.9023231896386574, + "learning_rate": 9.8948035817854e-06, + "loss": 0.6443, + "step": 1467 + }, + { + "epoch": 0.19, + "grad_norm": 0.8971879997141177, + "learning_rate": 9.894592983672223e-06, + "loss": 0.6252, + "step": 1468 + }, + { + "epoch": 0.19, + "grad_norm": 0.6786774174016315, + "learning_rate": 9.89438217721186e-06, + "loss": 0.5836, + "step": 1469 + }, + { + "epoch": 0.19, + "grad_norm": 0.7149454501134153, + "learning_rate": 9.894171162413289e-06, + "loss": 0.5909, + "step": 1470 + }, + { + "epoch": 0.19, + "grad_norm": 0.6392940942523266, + "learning_rate": 9.893959939285491e-06, + "loss": 0.5693, + "step": 1471 + }, + { + "epoch": 0.19, + "grad_norm": 0.8879975013720354, + "learning_rate": 9.893748507837458e-06, + "loss": 0.6327, + "step": 1472 + }, + { + "epoch": 0.19, + "grad_norm": 0.8588077828201054, + "learning_rate": 9.893536868078188e-06, + "loss": 0.6957, + "step": 1473 + }, + { + "epoch": 0.19, + "grad_norm": 0.9244754258118437, + "learning_rate": 9.893325020016692e-06, + "loss": 0.6857, + "step": 1474 + }, + { + "epoch": 0.19, + "grad_norm": 1.0370381276392335, + "learning_rate": 9.893112963661986e-06, + "loss": 0.5826, + "step": 1475 + }, + { + "epoch": 0.19, + "grad_norm": 0.7727905979684659, + "learning_rate": 9.892900699023098e-06, + "loss": 0.6026, + "step": 1476 + }, + { + "epoch": 0.19, + "grad_norm": 0.6070696744331471, + "learning_rate": 9.892688226109064e-06, + "loss": 0.4797, + "step": 1477 + }, + { + "epoch": 0.19, + "grad_norm": 0.7962443280674872, + "learning_rate": 9.892475544928925e-06, + "loss": 0.5596, + "step": 1478 + }, + { + "epoch": 0.19, + "grad_norm": 0.8647045792132472, + "learning_rate": 9.892262655491736e-06, + "loss": 0.6613, + "step": 1479 + }, + { + "epoch": 0.19, + "grad_norm": 0.7776722259973866, + "learning_rate": 9.89204955780656e-06, + "loss": 0.6021, + "step": 1480 + }, + { + "epoch": 0.19, + "grad_norm": 0.6041916236982071, + "learning_rate": 9.891836251882468e-06, + "loss": 0.4868, + "step": 1481 + }, + { + "epoch": 0.19, + "grad_norm": 0.6806411366767511, + "learning_rate": 9.891622737728537e-06, + "loss": 0.5788, + "step": 1482 + }, + { + "epoch": 0.19, + "grad_norm": 0.7959127973674263, + "learning_rate": 9.891409015353859e-06, + "loss": 0.6046, + "step": 1483 + }, + { + "epoch": 0.19, + "grad_norm": 0.8228697951855963, + "learning_rate": 9.89119508476753e-06, + "loss": 0.5781, + "step": 1484 + }, + { + "epoch": 0.19, + "grad_norm": 0.7064391469084955, + "learning_rate": 9.890980945978655e-06, + "loss": 0.5517, + "step": 1485 + }, + { + "epoch": 0.19, + "grad_norm": 0.9005920425585042, + "learning_rate": 9.89076659899635e-06, + "loss": 0.6435, + "step": 1486 + }, + { + "epoch": 0.19, + "grad_norm": 0.7604358015849504, + "learning_rate": 9.89055204382974e-06, + "loss": 0.591, + "step": 1487 + }, + { + "epoch": 0.19, + "grad_norm": 0.684928912636846, + "learning_rate": 9.890337280487958e-06, + "loss": 0.5108, + "step": 1488 + }, + { + "epoch": 0.19, + "grad_norm": 0.8129477359373968, + "learning_rate": 9.890122308980145e-06, + "loss": 0.6252, + "step": 1489 + }, + { + "epoch": 0.19, + "grad_norm": 0.8150066125639978, + "learning_rate": 9.889907129315452e-06, + "loss": 0.6235, + "step": 1490 + }, + { + "epoch": 0.19, + "grad_norm": 0.902468709050217, + "learning_rate": 9.889691741503038e-06, + "loss": 0.6093, + "step": 1491 + }, + { + "epoch": 0.19, + "grad_norm": 0.922199679349637, + "learning_rate": 9.889476145552073e-06, + "loss": 0.687, + "step": 1492 + }, + { + "epoch": 0.19, + "grad_norm": 0.7101187977843562, + "learning_rate": 9.889260341471732e-06, + "loss": 0.5222, + "step": 1493 + }, + { + "epoch": 0.19, + "grad_norm": 0.6516247339669807, + "learning_rate": 9.889044329271202e-06, + "loss": 0.6044, + "step": 1494 + }, + { + "epoch": 0.19, + "grad_norm": 0.6617508541540921, + "learning_rate": 9.888828108959678e-06, + "loss": 0.5943, + "step": 1495 + }, + { + "epoch": 0.19, + "grad_norm": 0.6547760173649779, + "learning_rate": 9.888611680546366e-06, + "loss": 0.5926, + "step": 1496 + }, + { + "epoch": 0.19, + "grad_norm": 0.8146903408494094, + "learning_rate": 9.888395044040475e-06, + "loss": 0.6898, + "step": 1497 + }, + { + "epoch": 0.19, + "grad_norm": 0.9730046667110773, + "learning_rate": 9.888178199451227e-06, + "loss": 0.6485, + "step": 1498 + }, + { + "epoch": 0.19, + "grad_norm": 0.5776389634461832, + "learning_rate": 9.887961146787854e-06, + "loss": 0.5667, + "step": 1499 + }, + { + "epoch": 0.19, + "grad_norm": 0.9170097770006553, + "learning_rate": 9.887743886059595e-06, + "loss": 0.6274, + "step": 1500 + }, + { + "epoch": 0.19, + "grad_norm": 0.675811892876071, + "learning_rate": 9.887526417275698e-06, + "loss": 0.5421, + "step": 1501 + }, + { + "epoch": 0.19, + "grad_norm": 0.8182205446623694, + "learning_rate": 9.88730874044542e-06, + "loss": 0.6506, + "step": 1502 + }, + { + "epoch": 0.19, + "grad_norm": 0.688221644284625, + "learning_rate": 9.887090855578026e-06, + "loss": 0.5645, + "step": 1503 + }, + { + "epoch": 0.19, + "grad_norm": 0.7839834339247176, + "learning_rate": 9.886872762682791e-06, + "loss": 0.6135, + "step": 1504 + }, + { + "epoch": 0.19, + "grad_norm": 0.7414209322009344, + "learning_rate": 9.886654461768999e-06, + "loss": 0.5907, + "step": 1505 + }, + { + "epoch": 0.19, + "grad_norm": 0.7005784343852325, + "learning_rate": 9.886435952845941e-06, + "loss": 0.5969, + "step": 1506 + }, + { + "epoch": 0.19, + "grad_norm": 0.9034003306411155, + "learning_rate": 9.886217235922921e-06, + "loss": 0.6325, + "step": 1507 + }, + { + "epoch": 0.19, + "grad_norm": 0.6488541912557233, + "learning_rate": 9.885998311009246e-06, + "loss": 0.5933, + "step": 1508 + }, + { + "epoch": 0.19, + "grad_norm": 0.5939210241457038, + "learning_rate": 9.885779178114236e-06, + "loss": 0.5055, + "step": 1509 + }, + { + "epoch": 0.19, + "grad_norm": 0.7107377608775156, + "learning_rate": 9.88555983724722e-06, + "loss": 0.5928, + "step": 1510 + }, + { + "epoch": 0.19, + "grad_norm": 1.1112842519856718, + "learning_rate": 9.885340288417535e-06, + "loss": 0.7156, + "step": 1511 + }, + { + "epoch": 0.19, + "grad_norm": 0.6305165668223054, + "learning_rate": 9.885120531634525e-06, + "loss": 0.614, + "step": 1512 + }, + { + "epoch": 0.19, + "grad_norm": 0.7158541089272958, + "learning_rate": 9.884900566907543e-06, + "loss": 0.5829, + "step": 1513 + }, + { + "epoch": 0.19, + "grad_norm": 0.9801514047422134, + "learning_rate": 9.884680394245953e-06, + "loss": 0.6375, + "step": 1514 + }, + { + "epoch": 0.19, + "grad_norm": 0.8273363649517759, + "learning_rate": 9.88446001365913e-06, + "loss": 0.6249, + "step": 1515 + }, + { + "epoch": 0.19, + "grad_norm": 0.7454375934919218, + "learning_rate": 9.884239425156452e-06, + "loss": 0.6193, + "step": 1516 + }, + { + "epoch": 0.19, + "grad_norm": 0.7092783240013331, + "learning_rate": 9.88401862874731e-06, + "loss": 0.5809, + "step": 1517 + }, + { + "epoch": 0.19, + "grad_norm": 0.6810210577340193, + "learning_rate": 9.8837976244411e-06, + "loss": 0.5678, + "step": 1518 + }, + { + "epoch": 0.19, + "grad_norm": 0.6809892291975016, + "learning_rate": 9.883576412247233e-06, + "loss": 0.5781, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 0.8503934579940726, + "learning_rate": 9.883354992175124e-06, + "loss": 0.6795, + "step": 1520 + }, + { + "epoch": 0.19, + "grad_norm": 0.9274211197639796, + "learning_rate": 9.883133364234198e-06, + "loss": 0.7125, + "step": 1521 + }, + { + "epoch": 0.19, + "grad_norm": 0.9042072813027953, + "learning_rate": 9.88291152843389e-06, + "loss": 0.7166, + "step": 1522 + }, + { + "epoch": 0.19, + "grad_norm": 0.6546267571881859, + "learning_rate": 9.88268948478364e-06, + "loss": 0.5698, + "step": 1523 + }, + { + "epoch": 0.19, + "grad_norm": 0.6648143273555109, + "learning_rate": 9.882467233292901e-06, + "loss": 0.6095, + "step": 1524 + }, + { + "epoch": 0.19, + "grad_norm": 0.669896902749318, + "learning_rate": 9.882244773971137e-06, + "loss": 0.6117, + "step": 1525 + }, + { + "epoch": 0.19, + "grad_norm": 0.669379190520074, + "learning_rate": 9.88202210682781e-06, + "loss": 0.5603, + "step": 1526 + }, + { + "epoch": 0.19, + "grad_norm": 0.6705263480036763, + "learning_rate": 9.881799231872406e-06, + "loss": 0.5099, + "step": 1527 + }, + { + "epoch": 0.19, + "grad_norm": 0.5993458923502961, + "learning_rate": 9.881576149114407e-06, + "loss": 0.509, + "step": 1528 + }, + { + "epoch": 0.19, + "grad_norm": 2.0238276485462094, + "learning_rate": 9.88135285856331e-06, + "loss": 0.5775, + "step": 1529 + }, + { + "epoch": 0.19, + "grad_norm": 0.8936331822670118, + "learning_rate": 9.881129360228623e-06, + "loss": 0.6531, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 0.9032830182022972, + "learning_rate": 9.880905654119856e-06, + "loss": 0.5964, + "step": 1531 + }, + { + "epoch": 0.2, + "grad_norm": 0.8753903201065488, + "learning_rate": 9.880681740246531e-06, + "loss": 0.6343, + "step": 1532 + }, + { + "epoch": 0.2, + "grad_norm": 0.7976664177553304, + "learning_rate": 9.880457618618184e-06, + "loss": 0.558, + "step": 1533 + }, + { + "epoch": 0.2, + "grad_norm": 0.769384373440501, + "learning_rate": 9.88023328924435e-06, + "loss": 0.5724, + "step": 1534 + }, + { + "epoch": 0.2, + "grad_norm": 0.708971831517012, + "learning_rate": 9.880008752134578e-06, + "loss": 0.5605, + "step": 1535 + }, + { + "epoch": 0.2, + "grad_norm": 0.6838771857352796, + "learning_rate": 9.87978400729843e-06, + "loss": 0.5889, + "step": 1536 + }, + { + "epoch": 0.2, + "grad_norm": 0.7179981698375474, + "learning_rate": 9.87955905474547e-06, + "loss": 0.6386, + "step": 1537 + }, + { + "epoch": 0.2, + "grad_norm": 0.7286557985737631, + "learning_rate": 9.879333894485272e-06, + "loss": 0.5959, + "step": 1538 + }, + { + "epoch": 0.2, + "grad_norm": 0.6417993377828476, + "learning_rate": 9.879108526527425e-06, + "loss": 0.5761, + "step": 1539 + }, + { + "epoch": 0.2, + "grad_norm": 0.6455887264965151, + "learning_rate": 9.878882950881518e-06, + "loss": 0.5701, + "step": 1540 + }, + { + "epoch": 0.2, + "grad_norm": 0.9392531020535543, + "learning_rate": 9.878657167557156e-06, + "loss": 0.6737, + "step": 1541 + }, + { + "epoch": 0.2, + "grad_norm": 0.8052631006205957, + "learning_rate": 9.878431176563945e-06, + "loss": 0.6034, + "step": 1542 + }, + { + "epoch": 0.2, + "grad_norm": 0.6452744052517272, + "learning_rate": 9.87820497791151e-06, + "loss": 0.5617, + "step": 1543 + }, + { + "epoch": 0.2, + "grad_norm": 0.8888982388016089, + "learning_rate": 9.877978571609479e-06, + "loss": 0.6541, + "step": 1544 + }, + { + "epoch": 0.2, + "grad_norm": 0.8622303599846088, + "learning_rate": 9.877751957667486e-06, + "loss": 0.6467, + "step": 1545 + }, + { + "epoch": 0.2, + "grad_norm": 0.760571615055131, + "learning_rate": 9.87752513609518e-06, + "loss": 0.615, + "step": 1546 + }, + { + "epoch": 0.2, + "grad_norm": 0.6745613351888998, + "learning_rate": 9.877298106902216e-06, + "loss": 0.5902, + "step": 1547 + }, + { + "epoch": 0.2, + "grad_norm": 0.6051554949691383, + "learning_rate": 9.877070870098256e-06, + "loss": 0.5162, + "step": 1548 + }, + { + "epoch": 0.2, + "grad_norm": 0.6335179298694252, + "learning_rate": 9.876843425692975e-06, + "loss": 0.5526, + "step": 1549 + }, + { + "epoch": 0.2, + "grad_norm": 0.8044564719001647, + "learning_rate": 9.876615773696053e-06, + "loss": 0.6415, + "step": 1550 + }, + { + "epoch": 0.2, + "grad_norm": 1.2969363633187778, + "learning_rate": 9.876387914117181e-06, + "loss": 0.6653, + "step": 1551 + }, + { + "epoch": 0.2, + "grad_norm": 0.8054796427768358, + "learning_rate": 9.876159846966057e-06, + "loss": 0.5273, + "step": 1552 + }, + { + "epoch": 0.2, + "grad_norm": 0.9842086078093535, + "learning_rate": 9.87593157225239e-06, + "loss": 0.5805, + "step": 1553 + }, + { + "epoch": 0.2, + "grad_norm": 0.6891899317152046, + "learning_rate": 9.8757030899859e-06, + "loss": 0.5981, + "step": 1554 + }, + { + "epoch": 0.2, + "grad_norm": 1.3612018114252962, + "learning_rate": 9.875474400176307e-06, + "loss": 0.6092, + "step": 1555 + }, + { + "epoch": 0.2, + "grad_norm": 0.6543286888948006, + "learning_rate": 9.87524550283335e-06, + "loss": 0.5868, + "step": 1556 + }, + { + "epoch": 0.2, + "grad_norm": 0.769976874425928, + "learning_rate": 9.875016397966772e-06, + "loss": 0.6744, + "step": 1557 + }, + { + "epoch": 0.2, + "grad_norm": 0.585257672773087, + "learning_rate": 9.874787085586323e-06, + "loss": 0.6127, + "step": 1558 + }, + { + "epoch": 0.2, + "grad_norm": 0.5717036765553271, + "learning_rate": 9.874557565701766e-06, + "loss": 0.5382, + "step": 1559 + }, + { + "epoch": 0.2, + "grad_norm": 0.6541689594121229, + "learning_rate": 9.87432783832287e-06, + "loss": 0.5523, + "step": 1560 + }, + { + "epoch": 0.2, + "grad_norm": 0.8055566439494736, + "learning_rate": 9.874097903459413e-06, + "loss": 0.6251, + "step": 1561 + }, + { + "epoch": 0.2, + "grad_norm": 0.8346090143319688, + "learning_rate": 9.873867761121186e-06, + "loss": 0.6496, + "step": 1562 + }, + { + "epoch": 0.2, + "grad_norm": 0.670308442840768, + "learning_rate": 9.873637411317982e-06, + "loss": 0.5613, + "step": 1563 + }, + { + "epoch": 0.2, + "grad_norm": 0.6667886214579567, + "learning_rate": 9.873406854059606e-06, + "loss": 0.5542, + "step": 1564 + }, + { + "epoch": 0.2, + "grad_norm": 0.6787516379371499, + "learning_rate": 9.873176089355876e-06, + "loss": 0.5624, + "step": 1565 + }, + { + "epoch": 0.2, + "grad_norm": 0.7509925927710046, + "learning_rate": 9.87294511721661e-06, + "loss": 0.5894, + "step": 1566 + }, + { + "epoch": 0.2, + "grad_norm": 0.9022552804982648, + "learning_rate": 9.872713937651644e-06, + "loss": 0.6462, + "step": 1567 + }, + { + "epoch": 0.2, + "grad_norm": 0.6648974660353939, + "learning_rate": 9.872482550670815e-06, + "loss": 0.6236, + "step": 1568 + }, + { + "epoch": 0.2, + "grad_norm": 0.9422739387844301, + "learning_rate": 9.872250956283974e-06, + "loss": 0.595, + "step": 1569 + }, + { + "epoch": 0.2, + "grad_norm": 0.7314958690611608, + "learning_rate": 9.87201915450098e-06, + "loss": 0.6098, + "step": 1570 + }, + { + "epoch": 0.2, + "grad_norm": 0.7636374468141408, + "learning_rate": 9.8717871453317e-06, + "loss": 0.6245, + "step": 1571 + }, + { + "epoch": 0.2, + "grad_norm": 0.8121591259993941, + "learning_rate": 9.871554928786007e-06, + "loss": 0.5783, + "step": 1572 + }, + { + "epoch": 0.2, + "grad_norm": 0.9954410395816852, + "learning_rate": 9.87132250487379e-06, + "loss": 0.6484, + "step": 1573 + }, + { + "epoch": 0.2, + "grad_norm": 0.7954625985334292, + "learning_rate": 9.87108987360494e-06, + "loss": 0.5603, + "step": 1574 + }, + { + "epoch": 0.2, + "grad_norm": 0.9622292414595693, + "learning_rate": 9.870857034989359e-06, + "loss": 0.6343, + "step": 1575 + }, + { + "epoch": 0.2, + "grad_norm": 0.6513245173343255, + "learning_rate": 9.87062398903696e-06, + "loss": 0.554, + "step": 1576 + }, + { + "epoch": 0.2, + "grad_norm": 0.7017719471811728, + "learning_rate": 9.87039073575766e-06, + "loss": 0.5777, + "step": 1577 + }, + { + "epoch": 0.2, + "grad_norm": 0.8207537650088288, + "learning_rate": 9.870157275161394e-06, + "loss": 0.5658, + "step": 1578 + }, + { + "epoch": 0.2, + "grad_norm": 0.9306911535789684, + "learning_rate": 9.869923607258093e-06, + "loss": 0.5868, + "step": 1579 + }, + { + "epoch": 0.2, + "grad_norm": 0.8082491365487466, + "learning_rate": 9.869689732057706e-06, + "loss": 0.7051, + "step": 1580 + }, + { + "epoch": 0.2, + "grad_norm": 0.7968390618902453, + "learning_rate": 9.869455649570188e-06, + "loss": 0.6241, + "step": 1581 + }, + { + "epoch": 0.2, + "grad_norm": 0.7852533651109809, + "learning_rate": 9.869221359805506e-06, + "loss": 0.6005, + "step": 1582 + }, + { + "epoch": 0.2, + "grad_norm": 0.844980110728931, + "learning_rate": 9.868986862773628e-06, + "loss": 0.6541, + "step": 1583 + }, + { + "epoch": 0.2, + "grad_norm": 0.890366904680627, + "learning_rate": 9.86875215848454e-06, + "loss": 0.6543, + "step": 1584 + }, + { + "epoch": 0.2, + "grad_norm": 0.6280073463676458, + "learning_rate": 9.868517246948228e-06, + "loss": 0.5546, + "step": 1585 + }, + { + "epoch": 0.2, + "grad_norm": 0.682946858818819, + "learning_rate": 9.868282128174698e-06, + "loss": 0.5619, + "step": 1586 + }, + { + "epoch": 0.2, + "grad_norm": 0.6369580171058496, + "learning_rate": 9.868046802173952e-06, + "loss": 0.5563, + "step": 1587 + }, + { + "epoch": 0.2, + "grad_norm": 0.6459659497595663, + "learning_rate": 9.867811268956011e-06, + "loss": 0.5679, + "step": 1588 + }, + { + "epoch": 0.2, + "grad_norm": 0.9076663467022951, + "learning_rate": 9.867575528530899e-06, + "loss": 0.6413, + "step": 1589 + }, + { + "epoch": 0.2, + "grad_norm": 0.7644414635666698, + "learning_rate": 9.867339580908652e-06, + "loss": 0.5695, + "step": 1590 + }, + { + "epoch": 0.2, + "grad_norm": 0.9802070894770096, + "learning_rate": 9.867103426099313e-06, + "loss": 0.6452, + "step": 1591 + }, + { + "epoch": 0.2, + "grad_norm": 0.9292906996900488, + "learning_rate": 9.866867064112934e-06, + "loss": 0.6407, + "step": 1592 + }, + { + "epoch": 0.2, + "grad_norm": 0.6791333305841595, + "learning_rate": 9.866630494959574e-06, + "loss": 0.5767, + "step": 1593 + }, + { + "epoch": 0.2, + "grad_norm": 0.6453810795437466, + "learning_rate": 9.866393718649309e-06, + "loss": 0.5665, + "step": 1594 + }, + { + "epoch": 0.2, + "grad_norm": 0.8320577331876663, + "learning_rate": 9.866156735192211e-06, + "loss": 0.6371, + "step": 1595 + }, + { + "epoch": 0.2, + "grad_norm": 0.7034813881171303, + "learning_rate": 9.865919544598373e-06, + "loss": 0.5716, + "step": 1596 + }, + { + "epoch": 0.2, + "grad_norm": 0.7509585379589557, + "learning_rate": 9.865682146877888e-06, + "loss": 0.5962, + "step": 1597 + }, + { + "epoch": 0.2, + "grad_norm": 0.9686834691796042, + "learning_rate": 9.865444542040865e-06, + "loss": 0.6133, + "step": 1598 + }, + { + "epoch": 0.2, + "grad_norm": 0.8268502522136598, + "learning_rate": 9.865206730097413e-06, + "loss": 0.6333, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 1.1796734025912674, + "learning_rate": 9.86496871105766e-06, + "loss": 0.6021, + "step": 1600 + }, + { + "epoch": 0.2, + "grad_norm": 0.5929343695552657, + "learning_rate": 9.86473048493173e-06, + "loss": 0.4875, + "step": 1601 + }, + { + "epoch": 0.2, + "grad_norm": 0.8711210540344185, + "learning_rate": 9.864492051729772e-06, + "loss": 0.6851, + "step": 1602 + }, + { + "epoch": 0.2, + "grad_norm": 0.9756325623444182, + "learning_rate": 9.864253411461932e-06, + "loss": 0.6592, + "step": 1603 + }, + { + "epoch": 0.2, + "grad_norm": 0.8278114941762432, + "learning_rate": 9.864014564138369e-06, + "loss": 0.6452, + "step": 1604 + }, + { + "epoch": 0.2, + "grad_norm": 0.8427554962800573, + "learning_rate": 9.86377550976925e-06, + "loss": 0.6274, + "step": 1605 + }, + { + "epoch": 0.2, + "grad_norm": 0.780829481157279, + "learning_rate": 9.863536248364748e-06, + "loss": 0.6168, + "step": 1606 + }, + { + "epoch": 0.2, + "grad_norm": 0.8066292955098436, + "learning_rate": 9.86329677993505e-06, + "loss": 0.5908, + "step": 1607 + }, + { + "epoch": 0.2, + "grad_norm": 0.5680189296272247, + "learning_rate": 9.863057104490348e-06, + "loss": 0.5663, + "step": 1608 + }, + { + "epoch": 0.2, + "grad_norm": 0.8443458599638527, + "learning_rate": 9.862817222040845e-06, + "loss": 0.6007, + "step": 1609 + }, + { + "epoch": 0.21, + "grad_norm": 0.9003249014982616, + "learning_rate": 9.862577132596755e-06, + "loss": 0.6193, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 0.852654164152259, + "learning_rate": 9.862336836168293e-06, + "loss": 0.6172, + "step": 1611 + }, + { + "epoch": 0.21, + "grad_norm": 0.5795560644283292, + "learning_rate": 9.86209633276569e-06, + "loss": 0.547, + "step": 1612 + }, + { + "epoch": 0.21, + "grad_norm": 0.6376658582526519, + "learning_rate": 9.861855622399184e-06, + "loss": 0.5538, + "step": 1613 + }, + { + "epoch": 0.21, + "grad_norm": 0.6823015338001743, + "learning_rate": 9.86161470507902e-06, + "loss": 0.5244, + "step": 1614 + }, + { + "epoch": 0.21, + "grad_norm": 0.7020896797015818, + "learning_rate": 9.861373580815452e-06, + "loss": 0.6004, + "step": 1615 + }, + { + "epoch": 0.21, + "grad_norm": 0.7067135748192178, + "learning_rate": 9.861132249618745e-06, + "loss": 0.5495, + "step": 1616 + }, + { + "epoch": 0.21, + "grad_norm": 0.7840787137883893, + "learning_rate": 9.860890711499175e-06, + "loss": 0.6569, + "step": 1617 + }, + { + "epoch": 0.21, + "grad_norm": 0.6508566226300785, + "learning_rate": 9.860648966467018e-06, + "loss": 0.5679, + "step": 1618 + }, + { + "epoch": 0.21, + "grad_norm": 0.7997398853337984, + "learning_rate": 9.860407014532568e-06, + "loss": 0.63, + "step": 1619 + }, + { + "epoch": 0.21, + "grad_norm": 0.6305162767609661, + "learning_rate": 9.860164855706123e-06, + "loss": 0.5793, + "step": 1620 + }, + { + "epoch": 0.21, + "grad_norm": 0.6898611702327054, + "learning_rate": 9.859922489997991e-06, + "loss": 0.5842, + "step": 1621 + }, + { + "epoch": 0.21, + "grad_norm": 0.6204437929878805, + "learning_rate": 9.859679917418489e-06, + "loss": 0.5621, + "step": 1622 + }, + { + "epoch": 0.21, + "grad_norm": 0.8567548920555998, + "learning_rate": 9.859437137977942e-06, + "loss": 0.6206, + "step": 1623 + }, + { + "epoch": 0.21, + "grad_norm": 0.6573511487962571, + "learning_rate": 9.859194151686685e-06, + "loss": 0.5791, + "step": 1624 + }, + { + "epoch": 0.21, + "grad_norm": 0.7905473570590201, + "learning_rate": 9.85895095855506e-06, + "loss": 0.5781, + "step": 1625 + }, + { + "epoch": 0.21, + "grad_norm": 0.9481362480472231, + "learning_rate": 9.858707558593421e-06, + "loss": 0.6792, + "step": 1626 + }, + { + "epoch": 0.21, + "grad_norm": 0.838869534675686, + "learning_rate": 9.858463951812125e-06, + "loss": 0.6789, + "step": 1627 + }, + { + "epoch": 0.21, + "grad_norm": 0.6271821937311737, + "learning_rate": 9.858220138221546e-06, + "loss": 0.531, + "step": 1628 + }, + { + "epoch": 0.21, + "grad_norm": 0.6543623055698378, + "learning_rate": 9.85797611783206e-06, + "loss": 0.5008, + "step": 1629 + }, + { + "epoch": 0.21, + "grad_norm": 0.7101443776889375, + "learning_rate": 9.857731890654055e-06, + "loss": 0.5595, + "step": 1630 + }, + { + "epoch": 0.21, + "grad_norm": 0.933466387973217, + "learning_rate": 9.857487456697925e-06, + "loss": 0.576, + "step": 1631 + }, + { + "epoch": 0.21, + "grad_norm": 1.408488406231972, + "learning_rate": 9.857242815974078e-06, + "loss": 0.6954, + "step": 1632 + }, + { + "epoch": 0.21, + "grad_norm": 0.9322296346854334, + "learning_rate": 9.856997968492924e-06, + "loss": 0.6195, + "step": 1633 + }, + { + "epoch": 0.21, + "grad_norm": 0.8527005341614082, + "learning_rate": 9.856752914264889e-06, + "loss": 0.6158, + "step": 1634 + }, + { + "epoch": 0.21, + "grad_norm": 0.6029817931915498, + "learning_rate": 9.8565076533004e-06, + "loss": 0.5773, + "step": 1635 + }, + { + "epoch": 0.21, + "grad_norm": 0.6922544203961906, + "learning_rate": 9.856262185609901e-06, + "loss": 0.5389, + "step": 1636 + }, + { + "epoch": 0.21, + "grad_norm": 0.6657983220041698, + "learning_rate": 9.856016511203839e-06, + "loss": 0.5861, + "step": 1637 + }, + { + "epoch": 0.21, + "grad_norm": 1.0928942883190693, + "learning_rate": 9.855770630092672e-06, + "loss": 0.6335, + "step": 1638 + }, + { + "epoch": 0.21, + "grad_norm": 1.1189330925779744, + "learning_rate": 9.855524542286866e-06, + "loss": 0.606, + "step": 1639 + }, + { + "epoch": 0.21, + "grad_norm": 0.8953629009058321, + "learning_rate": 9.855278247796897e-06, + "loss": 0.5854, + "step": 1640 + }, + { + "epoch": 0.21, + "grad_norm": 0.6875087356327355, + "learning_rate": 9.855031746633247e-06, + "loss": 0.5434, + "step": 1641 + }, + { + "epoch": 0.21, + "grad_norm": 1.1680271686982153, + "learning_rate": 9.854785038806411e-06, + "loss": 0.65, + "step": 1642 + }, + { + "epoch": 0.21, + "grad_norm": 0.6489497901061975, + "learning_rate": 9.854538124326889e-06, + "loss": 0.6051, + "step": 1643 + }, + { + "epoch": 0.21, + "grad_norm": 0.6195205120566304, + "learning_rate": 9.854291003205192e-06, + "loss": 0.5157, + "step": 1644 + }, + { + "epoch": 0.21, + "grad_norm": 0.8719980440711714, + "learning_rate": 9.854043675451838e-06, + "loss": 0.6223, + "step": 1645 + }, + { + "epoch": 0.21, + "grad_norm": 0.8184096554586712, + "learning_rate": 9.853796141077357e-06, + "loss": 0.6516, + "step": 1646 + }, + { + "epoch": 0.21, + "grad_norm": 0.6511274776408341, + "learning_rate": 9.853548400092285e-06, + "loss": 0.5093, + "step": 1647 + }, + { + "epoch": 0.21, + "grad_norm": 0.9905158951835199, + "learning_rate": 9.853300452507167e-06, + "loss": 0.6101, + "step": 1648 + }, + { + "epoch": 0.21, + "grad_norm": 0.6572766198066207, + "learning_rate": 9.853052298332559e-06, + "loss": 0.5691, + "step": 1649 + }, + { + "epoch": 0.21, + "grad_norm": 0.6855701865032515, + "learning_rate": 9.85280393757902e-06, + "loss": 0.541, + "step": 1650 + }, + { + "epoch": 0.21, + "grad_norm": 0.7668807561188197, + "learning_rate": 9.852555370257127e-06, + "loss": 0.6382, + "step": 1651 + }, + { + "epoch": 0.21, + "grad_norm": 0.6325259906279111, + "learning_rate": 9.852306596377459e-06, + "loss": 0.5566, + "step": 1652 + }, + { + "epoch": 0.21, + "grad_norm": 0.9250753695471651, + "learning_rate": 9.852057615950603e-06, + "loss": 0.6393, + "step": 1653 + }, + { + "epoch": 0.21, + "grad_norm": 0.9279146497246463, + "learning_rate": 9.85180842898716e-06, + "loss": 0.6717, + "step": 1654 + }, + { + "epoch": 0.21, + "grad_norm": 0.7987768441302029, + "learning_rate": 9.851559035497735e-06, + "loss": 0.6624, + "step": 1655 + }, + { + "epoch": 0.21, + "grad_norm": 0.5686685449129294, + "learning_rate": 9.851309435492948e-06, + "loss": 0.5228, + "step": 1656 + }, + { + "epoch": 0.21, + "grad_norm": 0.7910938429868714, + "learning_rate": 9.85105962898342e-06, + "loss": 0.6313, + "step": 1657 + }, + { + "epoch": 0.21, + "grad_norm": 0.7554652092727698, + "learning_rate": 9.850809615979785e-06, + "loss": 0.6802, + "step": 1658 + }, + { + "epoch": 0.21, + "grad_norm": 0.9099323233798392, + "learning_rate": 9.850559396492685e-06, + "loss": 0.655, + "step": 1659 + }, + { + "epoch": 0.21, + "grad_norm": 0.8393170889094593, + "learning_rate": 9.850308970532772e-06, + "loss": 0.6576, + "step": 1660 + }, + { + "epoch": 0.21, + "grad_norm": 0.7841118211705639, + "learning_rate": 9.850058338110705e-06, + "loss": 0.5871, + "step": 1661 + }, + { + "epoch": 0.21, + "grad_norm": 0.5549050394204721, + "learning_rate": 9.849807499237154e-06, + "loss": 0.5044, + "step": 1662 + }, + { + "epoch": 0.21, + "grad_norm": 0.686561215272153, + "learning_rate": 9.849556453922796e-06, + "loss": 0.5421, + "step": 1663 + }, + { + "epoch": 0.21, + "grad_norm": 0.5762718732077279, + "learning_rate": 9.849305202178314e-06, + "loss": 0.5529, + "step": 1664 + }, + { + "epoch": 0.21, + "grad_norm": 0.758757336304438, + "learning_rate": 9.849053744014408e-06, + "loss": 0.6206, + "step": 1665 + }, + { + "epoch": 0.21, + "grad_norm": 0.55247077233078, + "learning_rate": 9.848802079441779e-06, + "loss": 0.5167, + "step": 1666 + }, + { + "epoch": 0.21, + "grad_norm": 0.8895213823591999, + "learning_rate": 9.848550208471142e-06, + "loss": 0.6165, + "step": 1667 + }, + { + "epoch": 0.21, + "grad_norm": 0.855374858594082, + "learning_rate": 9.848298131113214e-06, + "loss": 0.6212, + "step": 1668 + }, + { + "epoch": 0.21, + "grad_norm": 0.6102036648610574, + "learning_rate": 9.848045847378726e-06, + "loss": 0.5034, + "step": 1669 + }, + { + "epoch": 0.21, + "grad_norm": 0.825259433810369, + "learning_rate": 9.847793357278421e-06, + "loss": 0.6835, + "step": 1670 + }, + { + "epoch": 0.21, + "grad_norm": 0.6130078610734073, + "learning_rate": 9.847540660823045e-06, + "loss": 0.5656, + "step": 1671 + }, + { + "epoch": 0.21, + "grad_norm": 0.7704459089579551, + "learning_rate": 9.847287758023351e-06, + "loss": 0.6029, + "step": 1672 + }, + { + "epoch": 0.21, + "grad_norm": 0.6260214451166097, + "learning_rate": 9.847034648890108e-06, + "loss": 0.5277, + "step": 1673 + }, + { + "epoch": 0.21, + "grad_norm": 0.8269750454306841, + "learning_rate": 9.84678133343409e-06, + "loss": 0.5882, + "step": 1674 + }, + { + "epoch": 0.21, + "grad_norm": 0.7715515416702583, + "learning_rate": 9.846527811666078e-06, + "loss": 0.5867, + "step": 1675 + }, + { + "epoch": 0.21, + "grad_norm": 0.6165145422690684, + "learning_rate": 9.846274083596863e-06, + "loss": 0.5475, + "step": 1676 + }, + { + "epoch": 0.21, + "grad_norm": 0.6233397616203508, + "learning_rate": 9.846020149237248e-06, + "loss": 0.545, + "step": 1677 + }, + { + "epoch": 0.21, + "grad_norm": 0.6156061510869181, + "learning_rate": 9.845766008598042e-06, + "loss": 0.5755, + "step": 1678 + }, + { + "epoch": 0.21, + "grad_norm": 0.9484201167773875, + "learning_rate": 9.845511661690059e-06, + "loss": 0.6152, + "step": 1679 + }, + { + "epoch": 0.21, + "grad_norm": 0.6501776492838833, + "learning_rate": 9.84525710852413e-06, + "loss": 0.5681, + "step": 1680 + }, + { + "epoch": 0.21, + "grad_norm": 0.6109494161031894, + "learning_rate": 9.84500234911109e-06, + "loss": 0.5359, + "step": 1681 + }, + { + "epoch": 0.21, + "grad_norm": 0.6539827487484311, + "learning_rate": 9.84474738346178e-06, + "loss": 0.5798, + "step": 1682 + }, + { + "epoch": 0.21, + "grad_norm": 0.634957277775411, + "learning_rate": 9.844492211587058e-06, + "loss": 0.5734, + "step": 1683 + }, + { + "epoch": 0.21, + "grad_norm": 0.6636185620154464, + "learning_rate": 9.844236833497782e-06, + "loss": 0.5195, + "step": 1684 + }, + { + "epoch": 0.21, + "grad_norm": 1.2841736516006863, + "learning_rate": 9.843981249204825e-06, + "loss": 0.6146, + "step": 1685 + }, + { + "epoch": 0.21, + "grad_norm": 0.7380799101894738, + "learning_rate": 9.843725458719065e-06, + "loss": 0.5537, + "step": 1686 + }, + { + "epoch": 0.21, + "grad_norm": 0.6474222869333314, + "learning_rate": 9.84346946205139e-06, + "loss": 0.5762, + "step": 1687 + }, + { + "epoch": 0.22, + "grad_norm": 0.6328728625853145, + "learning_rate": 9.843213259212698e-06, + "loss": 0.5034, + "step": 1688 + }, + { + "epoch": 0.22, + "grad_norm": 0.9351332256119143, + "learning_rate": 9.842956850213893e-06, + "loss": 0.6256, + "step": 1689 + }, + { + "epoch": 0.22, + "grad_norm": 0.6733164129517442, + "learning_rate": 9.842700235065893e-06, + "loss": 0.6195, + "step": 1690 + }, + { + "epoch": 0.22, + "grad_norm": 0.5682114661543541, + "learning_rate": 9.842443413779618e-06, + "loss": 0.5294, + "step": 1691 + }, + { + "epoch": 0.22, + "grad_norm": 0.6413366655094044, + "learning_rate": 9.842186386366002e-06, + "loss": 0.5897, + "step": 1692 + }, + { + "epoch": 0.22, + "grad_norm": 0.63052655305629, + "learning_rate": 9.841929152835983e-06, + "loss": 0.5714, + "step": 1693 + }, + { + "epoch": 0.22, + "grad_norm": 0.67460003471768, + "learning_rate": 9.841671713200513e-06, + "loss": 0.548, + "step": 1694 + }, + { + "epoch": 0.22, + "grad_norm": 0.8176681406211385, + "learning_rate": 9.841414067470552e-06, + "loss": 0.6159, + "step": 1695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9175096301150686, + "learning_rate": 9.841156215657063e-06, + "loss": 0.6853, + "step": 1696 + }, + { + "epoch": 0.22, + "grad_norm": 0.7801575625897028, + "learning_rate": 9.840898157771024e-06, + "loss": 0.5602, + "step": 1697 + }, + { + "epoch": 0.22, + "grad_norm": 0.6269357761719352, + "learning_rate": 9.84063989382342e-06, + "loss": 0.5295, + "step": 1698 + }, + { + "epoch": 0.22, + "grad_norm": 0.7248816894100609, + "learning_rate": 9.840381423825245e-06, + "loss": 0.6128, + "step": 1699 + }, + { + "epoch": 0.22, + "grad_norm": 0.8856359618494074, + "learning_rate": 9.840122747787499e-06, + "loss": 0.6154, + "step": 1700 + }, + { + "epoch": 0.22, + "grad_norm": 0.8144981201588504, + "learning_rate": 9.839863865721197e-06, + "loss": 0.6517, + "step": 1701 + }, + { + "epoch": 0.22, + "grad_norm": 0.8074339626714965, + "learning_rate": 9.839604777637355e-06, + "loss": 0.6454, + "step": 1702 + }, + { + "epoch": 0.22, + "grad_norm": 0.755912866391722, + "learning_rate": 9.839345483547002e-06, + "loss": 0.5942, + "step": 1703 + }, + { + "epoch": 0.22, + "grad_norm": 0.5453412409044164, + "learning_rate": 9.839085983461176e-06, + "loss": 0.4867, + "step": 1704 + }, + { + "epoch": 0.22, + "grad_norm": 1.1312225476673403, + "learning_rate": 9.838826277390924e-06, + "loss": 0.623, + "step": 1705 + }, + { + "epoch": 0.22, + "grad_norm": 0.5580077144699175, + "learning_rate": 9.8385663653473e-06, + "loss": 0.5262, + "step": 1706 + }, + { + "epoch": 0.22, + "grad_norm": 1.0276120472248955, + "learning_rate": 9.838306247341368e-06, + "loss": 0.6897, + "step": 1707 + }, + { + "epoch": 0.22, + "grad_norm": 0.6345644554063219, + "learning_rate": 9.838045923384198e-06, + "loss": 0.5501, + "step": 1708 + }, + { + "epoch": 0.22, + "grad_norm": 0.6950163591199844, + "learning_rate": 9.837785393486875e-06, + "loss": 0.5713, + "step": 1709 + }, + { + "epoch": 0.22, + "grad_norm": 0.7247758833337077, + "learning_rate": 9.837524657660486e-06, + "loss": 0.6012, + "step": 1710 + }, + { + "epoch": 0.22, + "grad_norm": 0.8438527231007332, + "learning_rate": 9.837263715916132e-06, + "loss": 0.6684, + "step": 1711 + }, + { + "epoch": 0.22, + "grad_norm": 1.003086199282932, + "learning_rate": 9.837002568264919e-06, + "loss": 0.5959, + "step": 1712 + }, + { + "epoch": 0.22, + "grad_norm": 0.9463278570698767, + "learning_rate": 9.836741214717964e-06, + "loss": 0.6049, + "step": 1713 + }, + { + "epoch": 0.22, + "grad_norm": 0.6365549845329462, + "learning_rate": 9.836479655286391e-06, + "loss": 0.5541, + "step": 1714 + }, + { + "epoch": 0.22, + "grad_norm": 0.8799062000288134, + "learning_rate": 9.836217889981335e-06, + "loss": 0.6678, + "step": 1715 + }, + { + "epoch": 0.22, + "grad_norm": 0.6781532713920536, + "learning_rate": 9.835955918813937e-06, + "loss": 0.554, + "step": 1716 + }, + { + "epoch": 0.22, + "grad_norm": 0.6090068756136409, + "learning_rate": 9.835693741795348e-06, + "loss": 0.5387, + "step": 1717 + }, + { + "epoch": 0.22, + "grad_norm": 0.6727395605649175, + "learning_rate": 9.83543135893673e-06, + "loss": 0.5264, + "step": 1718 + }, + { + "epoch": 0.22, + "grad_norm": 0.7648070527881605, + "learning_rate": 9.83516877024925e-06, + "loss": 0.6062, + "step": 1719 + }, + { + "epoch": 0.22, + "grad_norm": 0.8421626479733515, + "learning_rate": 9.83490597574409e-06, + "loss": 0.5994, + "step": 1720 + }, + { + "epoch": 0.22, + "grad_norm": 0.742540896777215, + "learning_rate": 9.83464297543243e-06, + "loss": 0.5347, + "step": 1721 + }, + { + "epoch": 0.22, + "grad_norm": 0.8461047171388711, + "learning_rate": 9.834379769325466e-06, + "loss": 0.6343, + "step": 1722 + }, + { + "epoch": 0.22, + "grad_norm": 0.6598525582260091, + "learning_rate": 9.834116357434407e-06, + "loss": 0.59, + "step": 1723 + }, + { + "epoch": 0.22, + "grad_norm": 0.6924863493852103, + "learning_rate": 9.833852739770463e-06, + "loss": 0.5899, + "step": 1724 + }, + { + "epoch": 0.22, + "grad_norm": 0.6556261440230574, + "learning_rate": 9.833588916344851e-06, + "loss": 0.5352, + "step": 1725 + }, + { + "epoch": 0.22, + "grad_norm": 0.6164402957827337, + "learning_rate": 9.833324887168807e-06, + "loss": 0.5855, + "step": 1726 + }, + { + "epoch": 0.22, + "grad_norm": 0.83313658454367, + "learning_rate": 9.833060652253567e-06, + "loss": 0.6342, + "step": 1727 + }, + { + "epoch": 0.22, + "grad_norm": 0.7971140584665952, + "learning_rate": 9.83279621161038e-06, + "loss": 0.6277, + "step": 1728 + }, + { + "epoch": 0.22, + "grad_norm": 0.7533113921604073, + "learning_rate": 9.832531565250501e-06, + "loss": 0.5854, + "step": 1729 + }, + { + "epoch": 0.22, + "grad_norm": 0.5973559185082606, + "learning_rate": 9.832266713185195e-06, + "loss": 0.5474, + "step": 1730 + }, + { + "epoch": 0.22, + "grad_norm": 0.6941911837247505, + "learning_rate": 9.832001655425737e-06, + "loss": 0.5791, + "step": 1731 + }, + { + "epoch": 0.22, + "grad_norm": 0.7588200922371693, + "learning_rate": 9.83173639198341e-06, + "loss": 0.5659, + "step": 1732 + }, + { + "epoch": 0.22, + "grad_norm": 0.659717998884332, + "learning_rate": 9.831470922869506e-06, + "loss": 0.5656, + "step": 1733 + }, + { + "epoch": 0.22, + "grad_norm": 0.792657017702605, + "learning_rate": 9.831205248095322e-06, + "loss": 0.5704, + "step": 1734 + }, + { + "epoch": 0.22, + "grad_norm": 0.8794596488407359, + "learning_rate": 9.83093936767217e-06, + "loss": 0.5796, + "step": 1735 + }, + { + "epoch": 0.22, + "grad_norm": 0.6368550476640701, + "learning_rate": 9.830673281611367e-06, + "loss": 0.5795, + "step": 1736 + }, + { + "epoch": 0.22, + "grad_norm": 0.8113624566405523, + "learning_rate": 9.830406989924239e-06, + "loss": 0.664, + "step": 1737 + }, + { + "epoch": 0.22, + "grad_norm": 0.7148748320239768, + "learning_rate": 9.83014049262212e-06, + "loss": 0.5796, + "step": 1738 + }, + { + "epoch": 0.22, + "grad_norm": 0.7045937711178463, + "learning_rate": 9.829873789716355e-06, + "loss": 0.5847, + "step": 1739 + }, + { + "epoch": 0.22, + "grad_norm": 0.5964086549917872, + "learning_rate": 9.829606881218297e-06, + "loss": 0.5836, + "step": 1740 + }, + { + "epoch": 0.22, + "grad_norm": 0.8052988769986834, + "learning_rate": 9.829339767139308e-06, + "loss": 0.5969, + "step": 1741 + }, + { + "epoch": 0.22, + "grad_norm": 0.9221403141530807, + "learning_rate": 9.82907244749076e-06, + "loss": 0.7186, + "step": 1742 + }, + { + "epoch": 0.22, + "grad_norm": 0.6203895913139273, + "learning_rate": 9.828804922284026e-06, + "loss": 0.5362, + "step": 1743 + }, + { + "epoch": 0.22, + "grad_norm": 0.8738324041496631, + "learning_rate": 9.828537191530496e-06, + "loss": 0.614, + "step": 1744 + }, + { + "epoch": 0.22, + "grad_norm": 0.7098802141428318, + "learning_rate": 9.82826925524157e-06, + "loss": 0.6196, + "step": 1745 + }, + { + "epoch": 0.22, + "grad_norm": 0.6802253944524707, + "learning_rate": 9.82800111342865e-06, + "loss": 0.5573, + "step": 1746 + }, + { + "epoch": 0.22, + "grad_norm": 0.8081654337036757, + "learning_rate": 9.82773276610315e-06, + "loss": 0.6616, + "step": 1747 + }, + { + "epoch": 0.22, + "grad_norm": 0.7162665211569121, + "learning_rate": 9.827464213276494e-06, + "loss": 0.5565, + "step": 1748 + }, + { + "epoch": 0.22, + "grad_norm": 1.2342725550537075, + "learning_rate": 9.827195454960115e-06, + "loss": 0.6428, + "step": 1749 + }, + { + "epoch": 0.22, + "grad_norm": 0.8406360594694418, + "learning_rate": 9.826926491165448e-06, + "loss": 0.6361, + "step": 1750 + }, + { + "epoch": 0.22, + "grad_norm": 0.7423596412830483, + "learning_rate": 9.826657321903946e-06, + "loss": 0.6112, + "step": 1751 + }, + { + "epoch": 0.22, + "grad_norm": 0.6104102388515092, + "learning_rate": 9.826387947187066e-06, + "loss": 0.516, + "step": 1752 + }, + { + "epoch": 0.22, + "grad_norm": 0.6846252071575041, + "learning_rate": 9.826118367026274e-06, + "loss": 0.5526, + "step": 1753 + }, + { + "epoch": 0.22, + "grad_norm": 0.8553823260980193, + "learning_rate": 9.825848581433044e-06, + "loss": 0.704, + "step": 1754 + }, + { + "epoch": 0.22, + "grad_norm": 0.6719827243923429, + "learning_rate": 9.825578590418862e-06, + "loss": 0.5003, + "step": 1755 + }, + { + "epoch": 0.22, + "grad_norm": 0.7850911675406774, + "learning_rate": 9.82530839399522e-06, + "loss": 0.5521, + "step": 1756 + }, + { + "epoch": 0.22, + "grad_norm": 0.7978646385458676, + "learning_rate": 9.825037992173618e-06, + "loss": 0.6446, + "step": 1757 + }, + { + "epoch": 0.22, + "grad_norm": 0.8090192887152099, + "learning_rate": 9.824767384965567e-06, + "loss": 0.6247, + "step": 1758 + }, + { + "epoch": 0.22, + "grad_norm": 0.8957108224116083, + "learning_rate": 9.82449657238259e-06, + "loss": 0.6446, + "step": 1759 + }, + { + "epoch": 0.22, + "grad_norm": 0.8677758678006399, + "learning_rate": 9.824225554436207e-06, + "loss": 0.6161, + "step": 1760 + }, + { + "epoch": 0.22, + "grad_norm": 0.8454027020819754, + "learning_rate": 9.82395433113796e-06, + "loss": 0.6067, + "step": 1761 + }, + { + "epoch": 0.22, + "grad_norm": 0.7106692070370019, + "learning_rate": 9.82368290249939e-06, + "loss": 0.6342, + "step": 1762 + }, + { + "epoch": 0.22, + "grad_norm": 0.6807978308051146, + "learning_rate": 9.823411268532055e-06, + "loss": 0.5495, + "step": 1763 + }, + { + "epoch": 0.22, + "grad_norm": 0.6558366754497376, + "learning_rate": 9.823139429247515e-06, + "loss": 0.5589, + "step": 1764 + }, + { + "epoch": 0.22, + "grad_norm": 0.7010054377950301, + "learning_rate": 9.822867384657341e-06, + "loss": 0.5249, + "step": 1765 + }, + { + "epoch": 0.22, + "grad_norm": 0.6448064571642647, + "learning_rate": 9.822595134773116e-06, + "loss": 0.5513, + "step": 1766 + }, + { + "epoch": 0.23, + "grad_norm": 1.0398190136124312, + "learning_rate": 9.822322679606427e-06, + "loss": 0.6578, + "step": 1767 + }, + { + "epoch": 0.23, + "grad_norm": 0.6064319244857759, + "learning_rate": 9.822050019168871e-06, + "loss": 0.5589, + "step": 1768 + }, + { + "epoch": 0.23, + "grad_norm": 0.7163008061551831, + "learning_rate": 9.821777153472053e-06, + "loss": 0.5687, + "step": 1769 + }, + { + "epoch": 0.23, + "grad_norm": 0.6828781593595651, + "learning_rate": 9.821504082527592e-06, + "loss": 0.5684, + "step": 1770 + }, + { + "epoch": 0.23, + "grad_norm": 0.7278664732198938, + "learning_rate": 9.821230806347111e-06, + "loss": 0.5327, + "step": 1771 + }, + { + "epoch": 0.23, + "grad_norm": 0.7243893679333716, + "learning_rate": 9.820957324942237e-06, + "loss": 0.5791, + "step": 1772 + }, + { + "epoch": 0.23, + "grad_norm": 0.764962784739967, + "learning_rate": 9.820683638324618e-06, + "loss": 0.5995, + "step": 1773 + }, + { + "epoch": 0.23, + "grad_norm": 1.1084149493792808, + "learning_rate": 9.820409746505901e-06, + "loss": 0.6086, + "step": 1774 + }, + { + "epoch": 0.23, + "grad_norm": 0.6267456203921593, + "learning_rate": 9.820135649497744e-06, + "loss": 0.5813, + "step": 1775 + }, + { + "epoch": 0.23, + "grad_norm": 0.9700337915284244, + "learning_rate": 9.819861347311815e-06, + "loss": 0.6304, + "step": 1776 + }, + { + "epoch": 0.23, + "grad_norm": 0.9323486483003622, + "learning_rate": 9.819586839959792e-06, + "loss": 0.6092, + "step": 1777 + }, + { + "epoch": 0.23, + "grad_norm": 0.8293205415777264, + "learning_rate": 9.819312127453358e-06, + "loss": 0.6256, + "step": 1778 + }, + { + "epoch": 0.23, + "grad_norm": 0.7672079298631141, + "learning_rate": 9.819037209804205e-06, + "loss": 0.6065, + "step": 1779 + }, + { + "epoch": 0.23, + "grad_norm": 0.6515826109273735, + "learning_rate": 9.818762087024039e-06, + "loss": 0.5435, + "step": 1780 + }, + { + "epoch": 0.23, + "grad_norm": 0.6496504007458078, + "learning_rate": 9.81848675912457e-06, + "loss": 0.5473, + "step": 1781 + }, + { + "epoch": 0.23, + "grad_norm": 0.5649728998215756, + "learning_rate": 9.818211226117517e-06, + "loss": 0.5303, + "step": 1782 + }, + { + "epoch": 0.23, + "grad_norm": 0.7576583466894525, + "learning_rate": 9.81793548801461e-06, + "loss": 0.6118, + "step": 1783 + }, + { + "epoch": 0.23, + "grad_norm": 0.6930723487360514, + "learning_rate": 9.817659544827582e-06, + "loss": 0.547, + "step": 1784 + }, + { + "epoch": 0.23, + "grad_norm": 0.8219685663116344, + "learning_rate": 9.817383396568185e-06, + "loss": 0.6618, + "step": 1785 + }, + { + "epoch": 0.23, + "grad_norm": 0.7419186308064855, + "learning_rate": 9.817107043248171e-06, + "loss": 0.5586, + "step": 1786 + }, + { + "epoch": 0.23, + "grad_norm": 0.6580774179682489, + "learning_rate": 9.816830484879302e-06, + "loss": 0.604, + "step": 1787 + }, + { + "epoch": 0.23, + "grad_norm": 0.6058216236679842, + "learning_rate": 9.816553721473352e-06, + "loss": 0.566, + "step": 1788 + }, + { + "epoch": 0.23, + "grad_norm": 0.7888254498849562, + "learning_rate": 9.816276753042103e-06, + "loss": 0.645, + "step": 1789 + }, + { + "epoch": 0.23, + "grad_norm": 0.6141278230424722, + "learning_rate": 9.815999579597342e-06, + "loss": 0.5402, + "step": 1790 + }, + { + "epoch": 0.23, + "grad_norm": 0.6808504418226013, + "learning_rate": 9.815722201150869e-06, + "loss": 0.5652, + "step": 1791 + }, + { + "epoch": 0.23, + "grad_norm": 0.8140413787945604, + "learning_rate": 9.815444617714489e-06, + "loss": 0.6463, + "step": 1792 + }, + { + "epoch": 0.23, + "grad_norm": 0.5419997332132857, + "learning_rate": 9.815166829300022e-06, + "loss": 0.4861, + "step": 1793 + }, + { + "epoch": 0.23, + "grad_norm": 0.6864311376367265, + "learning_rate": 9.81488883591929e-06, + "loss": 0.5347, + "step": 1794 + }, + { + "epoch": 0.23, + "grad_norm": 0.7049230119572016, + "learning_rate": 9.814610637584125e-06, + "loss": 0.5836, + "step": 1795 + }, + { + "epoch": 0.23, + "grad_norm": 0.6704913119275552, + "learning_rate": 9.81433223430637e-06, + "loss": 0.5475, + "step": 1796 + }, + { + "epoch": 0.23, + "grad_norm": 0.6143483032874669, + "learning_rate": 9.814053626097879e-06, + "loss": 0.5557, + "step": 1797 + }, + { + "epoch": 0.23, + "grad_norm": 0.6345160268573378, + "learning_rate": 9.813774812970506e-06, + "loss": 0.5294, + "step": 1798 + }, + { + "epoch": 0.23, + "grad_norm": 0.8201777018395332, + "learning_rate": 9.813495794936123e-06, + "loss": 0.6206, + "step": 1799 + }, + { + "epoch": 0.23, + "grad_norm": 1.0710772418338836, + "learning_rate": 9.813216572006606e-06, + "loss": 0.664, + "step": 1800 + }, + { + "epoch": 0.23, + "grad_norm": 0.7240729327012577, + "learning_rate": 9.812937144193839e-06, + "loss": 0.6213, + "step": 1801 + }, + { + "epoch": 0.23, + "grad_norm": 0.5999673694543911, + "learning_rate": 9.81265751150972e-06, + "loss": 0.5311, + "step": 1802 + }, + { + "epoch": 0.23, + "grad_norm": 0.9403646063554219, + "learning_rate": 9.812377673966147e-06, + "loss": 0.666, + "step": 1803 + }, + { + "epoch": 0.23, + "grad_norm": 0.6040105186180103, + "learning_rate": 9.812097631575037e-06, + "loss": 0.5258, + "step": 1804 + }, + { + "epoch": 0.23, + "grad_norm": 1.0974913112250428, + "learning_rate": 9.811817384348306e-06, + "loss": 0.6727, + "step": 1805 + }, + { + "epoch": 0.23, + "grad_norm": 0.8672915396636229, + "learning_rate": 9.811536932297887e-06, + "loss": 0.5533, + "step": 1806 + }, + { + "epoch": 0.23, + "grad_norm": 0.6340230627965114, + "learning_rate": 9.811256275435716e-06, + "loss": 0.5834, + "step": 1807 + }, + { + "epoch": 0.23, + "grad_norm": 0.8718616880804044, + "learning_rate": 9.81097541377374e-06, + "loss": 0.6046, + "step": 1808 + }, + { + "epoch": 0.23, + "grad_norm": 0.7195864741878601, + "learning_rate": 9.810694347323913e-06, + "loss": 0.5896, + "step": 1809 + }, + { + "epoch": 0.23, + "grad_norm": 0.834375755837981, + "learning_rate": 9.810413076098201e-06, + "loss": 0.6209, + "step": 1810 + }, + { + "epoch": 0.23, + "grad_norm": 1.0459165288147858, + "learning_rate": 9.810131600108577e-06, + "loss": 0.7057, + "step": 1811 + }, + { + "epoch": 0.23, + "grad_norm": 0.9085264408735688, + "learning_rate": 9.809849919367021e-06, + "loss": 0.6524, + "step": 1812 + }, + { + "epoch": 0.23, + "grad_norm": 0.6245803370399223, + "learning_rate": 9.809568033885523e-06, + "loss": 0.5362, + "step": 1813 + }, + { + "epoch": 0.23, + "grad_norm": 0.62992976780188, + "learning_rate": 9.809285943676086e-06, + "loss": 0.5811, + "step": 1814 + }, + { + "epoch": 0.23, + "grad_norm": 0.7588014992492863, + "learning_rate": 9.809003648750712e-06, + "loss": 0.6019, + "step": 1815 + }, + { + "epoch": 0.23, + "grad_norm": 0.8454433772131614, + "learning_rate": 9.808721149121421e-06, + "loss": 0.6199, + "step": 1816 + }, + { + "epoch": 0.23, + "grad_norm": 0.6311787764848752, + "learning_rate": 9.808438444800238e-06, + "loss": 0.5422, + "step": 1817 + }, + { + "epoch": 0.23, + "grad_norm": 0.7647311379504511, + "learning_rate": 9.808155535799196e-06, + "loss": 0.6039, + "step": 1818 + }, + { + "epoch": 0.23, + "grad_norm": 0.6859108418964911, + "learning_rate": 9.807872422130336e-06, + "loss": 0.5781, + "step": 1819 + }, + { + "epoch": 0.23, + "grad_norm": 0.7272285110153806, + "learning_rate": 9.80758910380571e-06, + "loss": 0.5255, + "step": 1820 + }, + { + "epoch": 0.23, + "grad_norm": 0.9282585204580677, + "learning_rate": 9.80730558083738e-06, + "loss": 0.6676, + "step": 1821 + }, + { + "epoch": 0.23, + "grad_norm": 0.7771576518488775, + "learning_rate": 9.807021853237415e-06, + "loss": 0.6327, + "step": 1822 + }, + { + "epoch": 0.23, + "grad_norm": 0.8991058364926697, + "learning_rate": 9.806737921017889e-06, + "loss": 0.5972, + "step": 1823 + }, + { + "epoch": 0.23, + "grad_norm": 0.6311085357228988, + "learning_rate": 9.80645378419089e-06, + "loss": 0.5644, + "step": 1824 + }, + { + "epoch": 0.23, + "grad_norm": 0.7044051154406225, + "learning_rate": 9.806169442768512e-06, + "loss": 0.6508, + "step": 1825 + }, + { + "epoch": 0.23, + "grad_norm": 4.377727237075721, + "learning_rate": 9.805884896762861e-06, + "loss": 0.6146, + "step": 1826 + }, + { + "epoch": 0.23, + "grad_norm": 0.6541557095893447, + "learning_rate": 9.805600146186045e-06, + "loss": 0.535, + "step": 1827 + }, + { + "epoch": 0.23, + "grad_norm": 0.7937694622750345, + "learning_rate": 9.805315191050189e-06, + "loss": 0.6149, + "step": 1828 + }, + { + "epoch": 0.23, + "grad_norm": 0.6882330971401761, + "learning_rate": 9.805030031367421e-06, + "loss": 0.5604, + "step": 1829 + }, + { + "epoch": 0.23, + "grad_norm": 0.5556525189442758, + "learning_rate": 9.804744667149878e-06, + "loss": 0.4953, + "step": 1830 + }, + { + "epoch": 0.23, + "grad_norm": 0.6249957602193592, + "learning_rate": 9.80445909840971e-06, + "loss": 0.5598, + "step": 1831 + }, + { + "epoch": 0.23, + "grad_norm": 0.6452773557447997, + "learning_rate": 9.80417332515907e-06, + "loss": 0.5606, + "step": 1832 + }, + { + "epoch": 0.23, + "grad_norm": 0.881344799149902, + "learning_rate": 9.803887347410123e-06, + "loss": 0.617, + "step": 1833 + }, + { + "epoch": 0.23, + "grad_norm": 0.7228633029360589, + "learning_rate": 9.803601165175043e-06, + "loss": 0.5779, + "step": 1834 + }, + { + "epoch": 0.23, + "grad_norm": 0.6328450543531903, + "learning_rate": 9.803314778466011e-06, + "loss": 0.5072, + "step": 1835 + }, + { + "epoch": 0.23, + "grad_norm": 0.6878082965265847, + "learning_rate": 9.803028187295218e-06, + "loss": 0.5446, + "step": 1836 + }, + { + "epoch": 0.23, + "grad_norm": 0.7561265841790251, + "learning_rate": 9.802741391674864e-06, + "loss": 0.6078, + "step": 1837 + }, + { + "epoch": 0.23, + "grad_norm": 1.5269171983317034, + "learning_rate": 9.802454391617158e-06, + "loss": 0.6168, + "step": 1838 + }, + { + "epoch": 0.23, + "grad_norm": 0.662341341988264, + "learning_rate": 9.802167187134312e-06, + "loss": 0.5304, + "step": 1839 + }, + { + "epoch": 0.23, + "grad_norm": 0.6361927091599409, + "learning_rate": 9.801879778238557e-06, + "loss": 0.5468, + "step": 1840 + }, + { + "epoch": 0.23, + "grad_norm": 0.8665692683559979, + "learning_rate": 9.801592164942122e-06, + "loss": 0.654, + "step": 1841 + }, + { + "epoch": 0.23, + "grad_norm": 0.5794446386427597, + "learning_rate": 9.801304347257255e-06, + "loss": 0.514, + "step": 1842 + }, + { + "epoch": 0.23, + "grad_norm": 0.565083604405036, + "learning_rate": 9.801016325196201e-06, + "loss": 0.4536, + "step": 1843 + }, + { + "epoch": 0.23, + "grad_norm": 0.7119210720305069, + "learning_rate": 9.800728098771227e-06, + "loss": 0.5877, + "step": 1844 + }, + { + "epoch": 0.24, + "grad_norm": 0.6911282038193561, + "learning_rate": 9.800439667994597e-06, + "loss": 0.5466, + "step": 1845 + }, + { + "epoch": 0.24, + "grad_norm": 1.1006291115988809, + "learning_rate": 9.800151032878593e-06, + "loss": 0.6649, + "step": 1846 + }, + { + "epoch": 0.24, + "grad_norm": 0.7941378228483236, + "learning_rate": 9.799862193435496e-06, + "loss": 0.6036, + "step": 1847 + }, + { + "epoch": 0.24, + "grad_norm": 0.7007455534210305, + "learning_rate": 9.799573149677604e-06, + "loss": 0.5798, + "step": 1848 + }, + { + "epoch": 0.24, + "grad_norm": 0.7955570218586099, + "learning_rate": 9.79928390161722e-06, + "loss": 0.6842, + "step": 1849 + }, + { + "epoch": 0.24, + "grad_norm": 0.6485587476088552, + "learning_rate": 9.798994449266657e-06, + "loss": 0.5086, + "step": 1850 + }, + { + "epoch": 0.24, + "grad_norm": 0.7990949334892158, + "learning_rate": 9.798704792638235e-06, + "loss": 0.6534, + "step": 1851 + }, + { + "epoch": 0.24, + "grad_norm": 0.6409614614806528, + "learning_rate": 9.798414931744286e-06, + "loss": 0.5419, + "step": 1852 + }, + { + "epoch": 0.24, + "grad_norm": 0.8167959177032953, + "learning_rate": 9.798124866597148e-06, + "loss": 0.6005, + "step": 1853 + }, + { + "epoch": 0.24, + "grad_norm": 0.653313874850728, + "learning_rate": 9.797834597209164e-06, + "loss": 0.5686, + "step": 1854 + }, + { + "epoch": 0.24, + "grad_norm": 0.7840320310347608, + "learning_rate": 9.797544123592694e-06, + "loss": 0.6048, + "step": 1855 + }, + { + "epoch": 0.24, + "grad_norm": 0.6548722236822639, + "learning_rate": 9.797253445760103e-06, + "loss": 0.471, + "step": 1856 + }, + { + "epoch": 0.24, + "grad_norm": 0.8512904379792536, + "learning_rate": 9.796962563723761e-06, + "loss": 0.6613, + "step": 1857 + }, + { + "epoch": 0.24, + "grad_norm": 0.9699865123154551, + "learning_rate": 9.796671477496052e-06, + "loss": 0.7084, + "step": 1858 + }, + { + "epoch": 0.24, + "grad_norm": 1.2359218981242521, + "learning_rate": 9.796380187089365e-06, + "loss": 0.7023, + "step": 1859 + }, + { + "epoch": 0.24, + "grad_norm": 0.6176424523894019, + "learning_rate": 9.7960886925161e-06, + "loss": 0.5307, + "step": 1860 + }, + { + "epoch": 0.24, + "grad_norm": 0.6457671000241276, + "learning_rate": 9.795796993788669e-06, + "loss": 0.5582, + "step": 1861 + }, + { + "epoch": 0.24, + "grad_norm": 0.8170632193767384, + "learning_rate": 9.795505090919483e-06, + "loss": 0.6949, + "step": 1862 + }, + { + "epoch": 0.24, + "grad_norm": 0.6247440359342894, + "learning_rate": 9.79521298392097e-06, + "loss": 0.5519, + "step": 1863 + }, + { + "epoch": 0.24, + "grad_norm": 0.8991623154905343, + "learning_rate": 9.794920672805563e-06, + "loss": 0.6027, + "step": 1864 + }, + { + "epoch": 0.24, + "grad_norm": 0.6371668104566895, + "learning_rate": 9.794628157585705e-06, + "loss": 0.5482, + "step": 1865 + }, + { + "epoch": 0.24, + "grad_norm": 0.7367620987083294, + "learning_rate": 9.794335438273847e-06, + "loss": 0.6282, + "step": 1866 + }, + { + "epoch": 0.24, + "grad_norm": 0.8788005777652736, + "learning_rate": 9.794042514882453e-06, + "loss": 0.6181, + "step": 1867 + }, + { + "epoch": 0.24, + "grad_norm": 2.2160155982655554, + "learning_rate": 9.793749387423986e-06, + "loss": 0.6313, + "step": 1868 + }, + { + "epoch": 0.24, + "grad_norm": 0.6875203175738123, + "learning_rate": 9.793456055910929e-06, + "loss": 0.5657, + "step": 1869 + }, + { + "epoch": 0.24, + "grad_norm": 0.620390393773217, + "learning_rate": 9.79316252035576e-06, + "loss": 0.5636, + "step": 1870 + }, + { + "epoch": 0.24, + "grad_norm": 0.8060927637625144, + "learning_rate": 9.792868780770984e-06, + "loss": 0.6076, + "step": 1871 + }, + { + "epoch": 0.24, + "grad_norm": 0.7892093311624908, + "learning_rate": 9.7925748371691e-06, + "loss": 0.6325, + "step": 1872 + }, + { + "epoch": 0.24, + "grad_norm": 0.6340907113812765, + "learning_rate": 9.792280689562617e-06, + "loss": 0.5467, + "step": 1873 + }, + { + "epoch": 0.24, + "grad_norm": 0.7812304518114844, + "learning_rate": 9.791986337964061e-06, + "loss": 0.5487, + "step": 1874 + }, + { + "epoch": 0.24, + "grad_norm": 0.6868687819751733, + "learning_rate": 9.79169178238596e-06, + "loss": 0.5666, + "step": 1875 + }, + { + "epoch": 0.24, + "grad_norm": 0.7470990444460175, + "learning_rate": 9.79139702284085e-06, + "loss": 0.5775, + "step": 1876 + }, + { + "epoch": 0.24, + "grad_norm": 0.669118934532083, + "learning_rate": 9.791102059341282e-06, + "loss": 0.5375, + "step": 1877 + }, + { + "epoch": 0.24, + "grad_norm": 0.5789565813247094, + "learning_rate": 9.790806891899807e-06, + "loss": 0.5067, + "step": 1878 + }, + { + "epoch": 0.24, + "grad_norm": 0.8735880122952258, + "learning_rate": 9.790511520528994e-06, + "loss": 0.655, + "step": 1879 + }, + { + "epoch": 0.24, + "grad_norm": 0.7434939390969325, + "learning_rate": 9.790215945241414e-06, + "loss": 0.6341, + "step": 1880 + }, + { + "epoch": 0.24, + "grad_norm": 0.6660027918812905, + "learning_rate": 9.789920166049646e-06, + "loss": 0.5673, + "step": 1881 + }, + { + "epoch": 0.24, + "grad_norm": 0.8022913933066801, + "learning_rate": 9.789624182966287e-06, + "loss": 0.6094, + "step": 1882 + }, + { + "epoch": 0.24, + "grad_norm": 0.6770294527721105, + "learning_rate": 9.789327996003928e-06, + "loss": 0.5774, + "step": 1883 + }, + { + "epoch": 0.24, + "grad_norm": 0.94355827439604, + "learning_rate": 9.789031605175183e-06, + "loss": 0.6976, + "step": 1884 + }, + { + "epoch": 0.24, + "grad_norm": 0.7256597176856311, + "learning_rate": 9.788735010492665e-06, + "loss": 0.5596, + "step": 1885 + }, + { + "epoch": 0.24, + "grad_norm": 0.8615465433468593, + "learning_rate": 9.788438211969e-06, + "loss": 0.6355, + "step": 1886 + }, + { + "epoch": 0.24, + "grad_norm": 0.8250046633773639, + "learning_rate": 9.788141209616823e-06, + "loss": 0.6403, + "step": 1887 + }, + { + "epoch": 0.24, + "grad_norm": 0.7262155352396545, + "learning_rate": 9.787844003448776e-06, + "loss": 0.6173, + "step": 1888 + }, + { + "epoch": 0.24, + "grad_norm": 0.6120776219419543, + "learning_rate": 9.787546593477509e-06, + "loss": 0.555, + "step": 1889 + }, + { + "epoch": 0.24, + "grad_norm": 0.5843370001787598, + "learning_rate": 9.78724897971568e-06, + "loss": 0.5397, + "step": 1890 + }, + { + "epoch": 0.24, + "grad_norm": 0.7438143537722247, + "learning_rate": 9.786951162175961e-06, + "loss": 0.6256, + "step": 1891 + }, + { + "epoch": 0.24, + "grad_norm": 0.6842285821982648, + "learning_rate": 9.786653140871029e-06, + "loss": 0.5712, + "step": 1892 + }, + { + "epoch": 0.24, + "grad_norm": 1.080874953127134, + "learning_rate": 9.78635491581357e-06, + "loss": 0.6835, + "step": 1893 + }, + { + "epoch": 0.24, + "grad_norm": 0.7851095478476103, + "learning_rate": 9.786056487016276e-06, + "loss": 0.5984, + "step": 1894 + }, + { + "epoch": 0.24, + "grad_norm": 0.9732872864863693, + "learning_rate": 9.785757854491851e-06, + "loss": 0.6524, + "step": 1895 + }, + { + "epoch": 0.24, + "grad_norm": 0.6817606329799621, + "learning_rate": 9.785459018253007e-06, + "loss": 0.5323, + "step": 1896 + }, + { + "epoch": 0.24, + "grad_norm": 0.5777681519472451, + "learning_rate": 9.785159978312465e-06, + "loss": 0.5827, + "step": 1897 + }, + { + "epoch": 0.24, + "grad_norm": 0.9055026562937675, + "learning_rate": 9.784860734682954e-06, + "loss": 0.7013, + "step": 1898 + }, + { + "epoch": 0.24, + "grad_norm": 0.6536385790471594, + "learning_rate": 9.78456128737721e-06, + "loss": 0.5104, + "step": 1899 + }, + { + "epoch": 0.24, + "grad_norm": 0.6870498844950168, + "learning_rate": 9.784261636407983e-06, + "loss": 0.5769, + "step": 1900 + }, + { + "epoch": 0.24, + "grad_norm": 0.564708476266616, + "learning_rate": 9.783961781788027e-06, + "loss": 0.4899, + "step": 1901 + }, + { + "epoch": 0.24, + "grad_norm": 0.6478259646680682, + "learning_rate": 9.783661723530105e-06, + "loss": 0.5143, + "step": 1902 + }, + { + "epoch": 0.24, + "grad_norm": 0.5663585499776949, + "learning_rate": 9.78336146164699e-06, + "loss": 0.5137, + "step": 1903 + }, + { + "epoch": 0.24, + "grad_norm": 0.5878041448139183, + "learning_rate": 9.783060996151461e-06, + "loss": 0.5116, + "step": 1904 + }, + { + "epoch": 0.24, + "grad_norm": 0.7619117929481182, + "learning_rate": 9.78276032705631e-06, + "loss": 0.6297, + "step": 1905 + }, + { + "epoch": 0.24, + "grad_norm": 0.8115502761321896, + "learning_rate": 9.782459454374337e-06, + "loss": 0.5855, + "step": 1906 + }, + { + "epoch": 0.24, + "grad_norm": 0.7862715666596223, + "learning_rate": 9.78215837811835e-06, + "loss": 0.6286, + "step": 1907 + }, + { + "epoch": 0.24, + "grad_norm": 0.681053413459562, + "learning_rate": 9.781857098301158e-06, + "loss": 0.5786, + "step": 1908 + }, + { + "epoch": 0.24, + "grad_norm": 0.5921885004324525, + "learning_rate": 9.781555614935591e-06, + "loss": 0.5684, + "step": 1909 + }, + { + "epoch": 0.24, + "grad_norm": 1.001601564863681, + "learning_rate": 9.781253928034483e-06, + "loss": 0.6666, + "step": 1910 + }, + { + "epoch": 0.24, + "grad_norm": 0.5639342154583687, + "learning_rate": 9.780952037610674e-06, + "loss": 0.4926, + "step": 1911 + }, + { + "epoch": 0.24, + "grad_norm": 0.948207088170478, + "learning_rate": 9.780649943677013e-06, + "loss": 0.6486, + "step": 1912 + }, + { + "epoch": 0.24, + "grad_norm": 0.731045565932882, + "learning_rate": 9.780347646246359e-06, + "loss": 0.6269, + "step": 1913 + }, + { + "epoch": 0.24, + "grad_norm": 0.7373371843437875, + "learning_rate": 9.780045145331585e-06, + "loss": 0.5573, + "step": 1914 + }, + { + "epoch": 0.24, + "grad_norm": 0.8560864430134707, + "learning_rate": 9.779742440945563e-06, + "loss": 0.64, + "step": 1915 + }, + { + "epoch": 0.24, + "grad_norm": 1.3035260561724835, + "learning_rate": 9.779439533101178e-06, + "loss": 0.6405, + "step": 1916 + }, + { + "epoch": 0.24, + "grad_norm": 0.8829823219148513, + "learning_rate": 9.779136421811326e-06, + "loss": 0.6577, + "step": 1917 + }, + { + "epoch": 0.24, + "grad_norm": 0.9189240436124142, + "learning_rate": 9.778833107088907e-06, + "loss": 0.6538, + "step": 1918 + }, + { + "epoch": 0.24, + "grad_norm": 0.8070312100696989, + "learning_rate": 9.778529588946835e-06, + "loss": 0.6585, + "step": 1919 + }, + { + "epoch": 0.24, + "grad_norm": 0.6162431523006736, + "learning_rate": 9.778225867398027e-06, + "loss": 0.5574, + "step": 1920 + }, + { + "epoch": 0.24, + "grad_norm": 0.7038298018322586, + "learning_rate": 9.777921942455414e-06, + "loss": 0.5793, + "step": 1921 + }, + { + "epoch": 0.24, + "grad_norm": 0.8130947236153864, + "learning_rate": 9.777617814131934e-06, + "loss": 0.6469, + "step": 1922 + }, + { + "epoch": 0.24, + "grad_norm": 0.6830042517603784, + "learning_rate": 9.777313482440528e-06, + "loss": 0.5905, + "step": 1923 + }, + { + "epoch": 0.25, + "grad_norm": 0.7720600364389372, + "learning_rate": 9.777008947394152e-06, + "loss": 0.625, + "step": 1924 + }, + { + "epoch": 0.25, + "grad_norm": 0.7757763888398167, + "learning_rate": 9.776704209005774e-06, + "loss": 0.6185, + "step": 1925 + }, + { + "epoch": 0.25, + "grad_norm": 0.7150199268120511, + "learning_rate": 9.77639926728836e-06, + "loss": 0.5528, + "step": 1926 + }, + { + "epoch": 0.25, + "grad_norm": 0.6747235250631582, + "learning_rate": 9.776094122254892e-06, + "loss": 0.6118, + "step": 1927 + }, + { + "epoch": 0.25, + "grad_norm": 0.5501730011096506, + "learning_rate": 9.77578877391836e-06, + "loss": 0.5672, + "step": 1928 + }, + { + "epoch": 0.25, + "grad_norm": 0.6747575054042125, + "learning_rate": 9.775483222291762e-06, + "loss": 0.579, + "step": 1929 + }, + { + "epoch": 0.25, + "grad_norm": 0.6437417447268526, + "learning_rate": 9.775177467388101e-06, + "loss": 0.5662, + "step": 1930 + }, + { + "epoch": 0.25, + "grad_norm": 0.5818262316647661, + "learning_rate": 9.774871509220396e-06, + "loss": 0.5539, + "step": 1931 + }, + { + "epoch": 0.25, + "grad_norm": 0.6384674367901276, + "learning_rate": 9.77456534780167e-06, + "loss": 0.4779, + "step": 1932 + }, + { + "epoch": 0.25, + "grad_norm": 0.6626283438878735, + "learning_rate": 9.774258983144952e-06, + "loss": 0.5772, + "step": 1933 + }, + { + "epoch": 0.25, + "grad_norm": 1.0488726515835534, + "learning_rate": 9.773952415263288e-06, + "loss": 0.6297, + "step": 1934 + }, + { + "epoch": 0.25, + "grad_norm": 0.6240739938305208, + "learning_rate": 9.773645644169724e-06, + "loss": 0.5175, + "step": 1935 + }, + { + "epoch": 0.25, + "grad_norm": 0.8390389841799185, + "learning_rate": 9.77333866987732e-06, + "loss": 0.6576, + "step": 1936 + }, + { + "epoch": 0.25, + "grad_norm": 0.8315159132638323, + "learning_rate": 9.77303149239914e-06, + "loss": 0.5954, + "step": 1937 + }, + { + "epoch": 0.25, + "grad_norm": 0.6388266061400573, + "learning_rate": 9.772724111748265e-06, + "loss": 0.5862, + "step": 1938 + }, + { + "epoch": 0.25, + "grad_norm": 1.1806161302822178, + "learning_rate": 9.772416527937774e-06, + "loss": 0.6431, + "step": 1939 + }, + { + "epoch": 0.25, + "grad_norm": 0.7005717575610654, + "learning_rate": 9.772108740980764e-06, + "loss": 0.5804, + "step": 1940 + }, + { + "epoch": 0.25, + "grad_norm": 0.5913186514354882, + "learning_rate": 9.77180075089033e-06, + "loss": 0.5499, + "step": 1941 + }, + { + "epoch": 0.25, + "grad_norm": 0.6922462934160454, + "learning_rate": 9.77149255767959e-06, + "loss": 0.5241, + "step": 1942 + }, + { + "epoch": 0.25, + "grad_norm": 0.77347767174598, + "learning_rate": 9.77118416136166e-06, + "loss": 0.6407, + "step": 1943 + }, + { + "epoch": 0.25, + "grad_norm": 0.8038559729479153, + "learning_rate": 9.770875561949665e-06, + "loss": 0.6179, + "step": 1944 + }, + { + "epoch": 0.25, + "grad_norm": 0.6279371308770109, + "learning_rate": 9.770566759456743e-06, + "loss": 0.5585, + "step": 1945 + }, + { + "epoch": 0.25, + "grad_norm": 0.7289729212128583, + "learning_rate": 9.770257753896038e-06, + "loss": 0.5612, + "step": 1946 + }, + { + "epoch": 0.25, + "grad_norm": 0.615149968987624, + "learning_rate": 9.769948545280705e-06, + "loss": 0.5887, + "step": 1947 + }, + { + "epoch": 0.25, + "grad_norm": 0.6851751966271976, + "learning_rate": 9.769639133623905e-06, + "loss": 0.5275, + "step": 1948 + }, + { + "epoch": 0.25, + "grad_norm": 0.5536769005121086, + "learning_rate": 9.769329518938808e-06, + "loss": 0.5175, + "step": 1949 + }, + { + "epoch": 0.25, + "grad_norm": 0.6095844296252171, + "learning_rate": 9.769019701238595e-06, + "loss": 0.508, + "step": 1950 + }, + { + "epoch": 0.25, + "grad_norm": 0.6736914616040667, + "learning_rate": 9.768709680536453e-06, + "loss": 0.5366, + "step": 1951 + }, + { + "epoch": 0.25, + "grad_norm": 0.637199326869723, + "learning_rate": 9.768399456845577e-06, + "loss": 0.5953, + "step": 1952 + }, + { + "epoch": 0.25, + "grad_norm": 0.8049306836349139, + "learning_rate": 9.768089030179175e-06, + "loss": 0.583, + "step": 1953 + }, + { + "epoch": 0.25, + "grad_norm": 0.6303732958697623, + "learning_rate": 9.767778400550459e-06, + "loss": 0.5499, + "step": 1954 + }, + { + "epoch": 0.25, + "grad_norm": 0.6099271068664792, + "learning_rate": 9.767467567972653e-06, + "loss": 0.59, + "step": 1955 + }, + { + "epoch": 0.25, + "grad_norm": 0.7435821689088031, + "learning_rate": 9.767156532458985e-06, + "loss": 0.6661, + "step": 1956 + }, + { + "epoch": 0.25, + "grad_norm": 0.7438129889723072, + "learning_rate": 9.7668452940227e-06, + "loss": 0.6034, + "step": 1957 + }, + { + "epoch": 0.25, + "grad_norm": 1.112723556132667, + "learning_rate": 9.766533852677042e-06, + "loss": 0.5991, + "step": 1958 + }, + { + "epoch": 0.25, + "grad_norm": 0.6311477087148216, + "learning_rate": 9.76622220843527e-06, + "loss": 0.5811, + "step": 1959 + }, + { + "epoch": 0.25, + "grad_norm": 0.6599377713956334, + "learning_rate": 9.765910361310648e-06, + "loss": 0.5685, + "step": 1960 + }, + { + "epoch": 0.25, + "grad_norm": 0.6476688505922261, + "learning_rate": 9.765598311316453e-06, + "loss": 0.5605, + "step": 1961 + }, + { + "epoch": 0.25, + "grad_norm": 1.0305340753836008, + "learning_rate": 9.765286058465966e-06, + "loss": 0.6607, + "step": 1962 + }, + { + "epoch": 0.25, + "grad_norm": 1.044994744196121, + "learning_rate": 9.764973602772479e-06, + "loss": 0.6493, + "step": 1963 + }, + { + "epoch": 0.25, + "grad_norm": 0.6759009554840261, + "learning_rate": 9.764660944249294e-06, + "loss": 0.5395, + "step": 1964 + }, + { + "epoch": 0.25, + "grad_norm": 0.6588819199833671, + "learning_rate": 9.764348082909716e-06, + "loss": 0.5729, + "step": 1965 + }, + { + "epoch": 0.25, + "grad_norm": 1.5676185798033166, + "learning_rate": 9.764035018767066e-06, + "loss": 0.6366, + "step": 1966 + }, + { + "epoch": 0.25, + "grad_norm": 0.7488411094021024, + "learning_rate": 9.763721751834669e-06, + "loss": 0.6195, + "step": 1967 + }, + { + "epoch": 0.25, + "grad_norm": 0.7798042275961441, + "learning_rate": 9.763408282125859e-06, + "loss": 0.6319, + "step": 1968 + }, + { + "epoch": 0.25, + "grad_norm": 0.6113906584451468, + "learning_rate": 9.76309460965398e-06, + "loss": 0.5173, + "step": 1969 + }, + { + "epoch": 0.25, + "grad_norm": 0.9364621296730903, + "learning_rate": 9.762780734432385e-06, + "loss": 0.6356, + "step": 1970 + }, + { + "epoch": 0.25, + "grad_norm": 0.5893345078880406, + "learning_rate": 9.762466656474434e-06, + "loss": 0.5172, + "step": 1971 + }, + { + "epoch": 0.25, + "grad_norm": 0.9155751022027765, + "learning_rate": 9.762152375793494e-06, + "loss": 0.5782, + "step": 1972 + }, + { + "epoch": 0.25, + "grad_norm": 0.5915185565742986, + "learning_rate": 9.761837892402947e-06, + "loss": 0.4789, + "step": 1973 + }, + { + "epoch": 0.25, + "grad_norm": 0.7598180720449295, + "learning_rate": 9.761523206316178e-06, + "loss": 0.6276, + "step": 1974 + }, + { + "epoch": 0.25, + "grad_norm": 0.6946721467354218, + "learning_rate": 9.76120831754658e-06, + "loss": 0.5663, + "step": 1975 + }, + { + "epoch": 0.25, + "grad_norm": 0.7861411569330273, + "learning_rate": 9.76089322610756e-06, + "loss": 0.6307, + "step": 1976 + }, + { + "epoch": 0.25, + "grad_norm": 0.7665265029916419, + "learning_rate": 9.760577932012529e-06, + "loss": 0.6186, + "step": 1977 + }, + { + "epoch": 0.25, + "grad_norm": 0.6485763026996052, + "learning_rate": 9.760262435274907e-06, + "loss": 0.5554, + "step": 1978 + }, + { + "epoch": 0.25, + "grad_norm": 1.0154239397820972, + "learning_rate": 9.759946735908125e-06, + "loss": 0.6091, + "step": 1979 + }, + { + "epoch": 0.25, + "grad_norm": 0.6333491185463955, + "learning_rate": 9.759630833925622e-06, + "loss": 0.5214, + "step": 1980 + }, + { + "epoch": 0.25, + "grad_norm": 0.8024634694331682, + "learning_rate": 9.759314729340843e-06, + "loss": 0.5984, + "step": 1981 + }, + { + "epoch": 0.25, + "grad_norm": 0.5994701254065763, + "learning_rate": 9.758998422167245e-06, + "loss": 0.5424, + "step": 1982 + }, + { + "epoch": 0.25, + "grad_norm": 0.8043182819282072, + "learning_rate": 9.758681912418292e-06, + "loss": 0.6087, + "step": 1983 + }, + { + "epoch": 0.25, + "grad_norm": 0.6638797433462842, + "learning_rate": 9.758365200107455e-06, + "loss": 0.5653, + "step": 1984 + }, + { + "epoch": 0.25, + "grad_norm": 0.8512709591686907, + "learning_rate": 9.75804828524822e-06, + "loss": 0.6407, + "step": 1985 + }, + { + "epoch": 0.25, + "grad_norm": 0.7922522876482098, + "learning_rate": 9.757731167854072e-06, + "loss": 0.5331, + "step": 1986 + }, + { + "epoch": 0.25, + "grad_norm": 0.6068064293105282, + "learning_rate": 9.757413847938512e-06, + "loss": 0.5123, + "step": 1987 + }, + { + "epoch": 0.25, + "grad_norm": 0.8246795405287417, + "learning_rate": 9.757096325515047e-06, + "loss": 0.6201, + "step": 1988 + }, + { + "epoch": 0.25, + "grad_norm": 0.9882681574025354, + "learning_rate": 9.756778600597193e-06, + "loss": 0.6364, + "step": 1989 + }, + { + "epoch": 0.25, + "grad_norm": 1.0554044969170837, + "learning_rate": 9.756460673198474e-06, + "loss": 0.587, + "step": 1990 + }, + { + "epoch": 0.25, + "grad_norm": 0.8535287115138298, + "learning_rate": 9.756142543332424e-06, + "loss": 0.6933, + "step": 1991 + }, + { + "epoch": 0.25, + "grad_norm": 0.7035383973149673, + "learning_rate": 9.755824211012585e-06, + "loss": 0.5658, + "step": 1992 + }, + { + "epoch": 0.25, + "grad_norm": 0.6204193016440039, + "learning_rate": 9.755505676252506e-06, + "loss": 0.497, + "step": 1993 + }, + { + "epoch": 0.25, + "grad_norm": 0.757533191159159, + "learning_rate": 9.755186939065746e-06, + "loss": 0.6107, + "step": 1994 + }, + { + "epoch": 0.25, + "grad_norm": 0.6809709597015329, + "learning_rate": 9.754867999465876e-06, + "loss": 0.5836, + "step": 1995 + }, + { + "epoch": 0.25, + "grad_norm": 0.6514669512667115, + "learning_rate": 9.754548857466468e-06, + "loss": 0.549, + "step": 1996 + }, + { + "epoch": 0.25, + "grad_norm": 0.5753757412548579, + "learning_rate": 9.754229513081109e-06, + "loss": 0.5024, + "step": 1997 + }, + { + "epoch": 0.25, + "grad_norm": 0.898139757103797, + "learning_rate": 9.753909966323389e-06, + "loss": 0.6627, + "step": 1998 + }, + { + "epoch": 0.25, + "grad_norm": 0.7743361101681899, + "learning_rate": 9.753590217206917e-06, + "loss": 0.5513, + "step": 1999 + }, + { + "epoch": 0.25, + "grad_norm": 0.8926775684335282, + "learning_rate": 9.753270265745298e-06, + "loss": 0.6801, + "step": 2000 + }, + { + "epoch": 0.25, + "grad_norm": 0.905779177290222, + "learning_rate": 9.752950111952153e-06, + "loss": 0.6537, + "step": 2001 + }, + { + "epoch": 0.26, + "grad_norm": 0.587992147442533, + "learning_rate": 9.75262975584111e-06, + "loss": 0.4905, + "step": 2002 + }, + { + "epoch": 0.26, + "grad_norm": 0.6776931912155311, + "learning_rate": 9.752309197425807e-06, + "loss": 0.5584, + "step": 2003 + }, + { + "epoch": 0.26, + "grad_norm": 0.9758447500493901, + "learning_rate": 9.751988436719886e-06, + "loss": 0.6163, + "step": 2004 + }, + { + "epoch": 0.26, + "grad_norm": 0.786220018553936, + "learning_rate": 9.751667473737003e-06, + "loss": 0.6614, + "step": 2005 + }, + { + "epoch": 0.26, + "grad_norm": 0.8054724299259647, + "learning_rate": 9.751346308490819e-06, + "loss": 0.6208, + "step": 2006 + }, + { + "epoch": 0.26, + "grad_norm": 0.6953486716148269, + "learning_rate": 9.751024940995008e-06, + "loss": 0.5441, + "step": 2007 + }, + { + "epoch": 0.26, + "grad_norm": 0.7116303093210296, + "learning_rate": 9.750703371263246e-06, + "loss": 0.5668, + "step": 2008 + }, + { + "epoch": 0.26, + "grad_norm": 0.6128615339599108, + "learning_rate": 9.750381599309223e-06, + "loss": 0.5103, + "step": 2009 + }, + { + "epoch": 0.26, + "grad_norm": 0.5895524545957406, + "learning_rate": 9.750059625146634e-06, + "loss": 0.5451, + "step": 2010 + }, + { + "epoch": 0.26, + "grad_norm": 0.7106416463269892, + "learning_rate": 9.749737448789188e-06, + "loss": 0.5748, + "step": 2011 + }, + { + "epoch": 0.26, + "grad_norm": 0.9539169606527462, + "learning_rate": 9.749415070250595e-06, + "loss": 0.616, + "step": 2012 + }, + { + "epoch": 0.26, + "grad_norm": 0.8121783756099556, + "learning_rate": 9.749092489544578e-06, + "loss": 0.661, + "step": 2013 + }, + { + "epoch": 0.26, + "grad_norm": 0.5523981654354464, + "learning_rate": 9.748769706684872e-06, + "loss": 0.4868, + "step": 2014 + }, + { + "epoch": 0.26, + "grad_norm": 0.9279060732475999, + "learning_rate": 9.748446721685214e-06, + "loss": 0.67, + "step": 2015 + }, + { + "epoch": 0.26, + "grad_norm": 0.6502826087472836, + "learning_rate": 9.748123534559353e-06, + "loss": 0.5581, + "step": 2016 + }, + { + "epoch": 0.26, + "grad_norm": 0.9561047719262271, + "learning_rate": 9.747800145321047e-06, + "loss": 0.6016, + "step": 2017 + }, + { + "epoch": 0.26, + "grad_norm": 0.6914834687803977, + "learning_rate": 9.74747655398406e-06, + "loss": 0.5959, + "step": 2018 + }, + { + "epoch": 0.26, + "grad_norm": 0.9461309182534641, + "learning_rate": 9.747152760562165e-06, + "loss": 0.6504, + "step": 2019 + }, + { + "epoch": 0.26, + "grad_norm": 0.6677868296472617, + "learning_rate": 9.746828765069148e-06, + "loss": 0.5674, + "step": 2020 + }, + { + "epoch": 0.26, + "grad_norm": 0.7205211983597987, + "learning_rate": 9.7465045675188e-06, + "loss": 0.5707, + "step": 2021 + }, + { + "epoch": 0.26, + "grad_norm": 0.8565289265491989, + "learning_rate": 9.746180167924919e-06, + "loss": 0.5994, + "step": 2022 + }, + { + "epoch": 0.26, + "grad_norm": 0.7827491188198623, + "learning_rate": 9.745855566301315e-06, + "loss": 0.6366, + "step": 2023 + }, + { + "epoch": 0.26, + "grad_norm": 0.8513641718980255, + "learning_rate": 9.745530762661805e-06, + "loss": 0.6954, + "step": 2024 + }, + { + "epoch": 0.26, + "grad_norm": 0.9325634003810489, + "learning_rate": 9.745205757020216e-06, + "loss": 0.6333, + "step": 2025 + }, + { + "epoch": 0.26, + "grad_norm": 0.6288848026923852, + "learning_rate": 9.744880549390382e-06, + "loss": 0.5053, + "step": 2026 + }, + { + "epoch": 0.26, + "grad_norm": 0.7933685981261801, + "learning_rate": 9.744555139786142e-06, + "loss": 0.5391, + "step": 2027 + }, + { + "epoch": 0.26, + "grad_norm": 1.8354595558937188, + "learning_rate": 9.744229528221354e-06, + "loss": 0.572, + "step": 2028 + }, + { + "epoch": 0.26, + "grad_norm": 0.6365256178368215, + "learning_rate": 9.743903714709875e-06, + "loss": 0.5329, + "step": 2029 + }, + { + "epoch": 0.26, + "grad_norm": 1.0296411852335627, + "learning_rate": 9.743577699265574e-06, + "loss": 0.694, + "step": 2030 + }, + { + "epoch": 0.26, + "grad_norm": 0.5709720494840266, + "learning_rate": 9.743251481902329e-06, + "loss": 0.5204, + "step": 2031 + }, + { + "epoch": 0.26, + "grad_norm": 0.7712303990324043, + "learning_rate": 9.742925062634025e-06, + "loss": 0.5535, + "step": 2032 + }, + { + "epoch": 0.26, + "grad_norm": 1.036540970751146, + "learning_rate": 9.742598441474558e-06, + "loss": 0.6278, + "step": 2033 + }, + { + "epoch": 0.26, + "grad_norm": 0.7719481581583385, + "learning_rate": 9.74227161843783e-06, + "loss": 0.6635, + "step": 2034 + }, + { + "epoch": 0.26, + "grad_norm": 0.7193126175204697, + "learning_rate": 9.741944593537754e-06, + "loss": 0.5871, + "step": 2035 + }, + { + "epoch": 0.26, + "grad_norm": 0.5914133858909755, + "learning_rate": 9.74161736678825e-06, + "loss": 0.5482, + "step": 2036 + }, + { + "epoch": 0.26, + "grad_norm": 0.8380293249144736, + "learning_rate": 9.741289938203246e-06, + "loss": 0.5469, + "step": 2037 + }, + { + "epoch": 0.26, + "grad_norm": 0.717292077984541, + "learning_rate": 9.74096230779668e-06, + "loss": 0.5984, + "step": 2038 + }, + { + "epoch": 0.26, + "grad_norm": 0.8120503871855917, + "learning_rate": 9.740634475582499e-06, + "loss": 0.6502, + "step": 2039 + }, + { + "epoch": 0.26, + "grad_norm": 0.6498869588587249, + "learning_rate": 9.740306441574656e-06, + "loss": 0.5655, + "step": 2040 + }, + { + "epoch": 0.26, + "grad_norm": 0.650287790228828, + "learning_rate": 9.739978205787117e-06, + "loss": 0.5129, + "step": 2041 + }, + { + "epoch": 0.26, + "grad_norm": 1.4853959479523764, + "learning_rate": 9.739649768233853e-06, + "loss": 0.5819, + "step": 2042 + }, + { + "epoch": 0.26, + "grad_norm": 0.6446973900005055, + "learning_rate": 9.739321128928844e-06, + "loss": 0.5536, + "step": 2043 + }, + { + "epoch": 0.26, + "grad_norm": 0.6946298845745377, + "learning_rate": 9.738992287886078e-06, + "loss": 0.5143, + "step": 2044 + }, + { + "epoch": 0.26, + "grad_norm": 0.5753733402622996, + "learning_rate": 9.738663245119555e-06, + "loss": 0.5252, + "step": 2045 + }, + { + "epoch": 0.26, + "grad_norm": 0.7782048941045351, + "learning_rate": 9.738334000643278e-06, + "loss": 0.5927, + "step": 2046 + }, + { + "epoch": 0.26, + "grad_norm": 0.9996804655740105, + "learning_rate": 9.738004554471267e-06, + "loss": 0.6152, + "step": 2047 + }, + { + "epoch": 0.26, + "grad_norm": 0.8691303856697805, + "learning_rate": 9.737674906617543e-06, + "loss": 0.6083, + "step": 2048 + }, + { + "epoch": 0.26, + "grad_norm": 0.6258160618603322, + "learning_rate": 9.737345057096135e-06, + "loss": 0.5168, + "step": 2049 + }, + { + "epoch": 0.26, + "grad_norm": 0.775293773299195, + "learning_rate": 9.737015005921088e-06, + "loss": 0.6381, + "step": 2050 + }, + { + "epoch": 0.26, + "grad_norm": 0.6385407024821724, + "learning_rate": 9.73668475310645e-06, + "loss": 0.5543, + "step": 2051 + }, + { + "epoch": 0.26, + "grad_norm": 0.5861410800895339, + "learning_rate": 9.736354298666277e-06, + "loss": 0.4962, + "step": 2052 + }, + { + "epoch": 0.26, + "grad_norm": 0.9199703456080394, + "learning_rate": 9.736023642614638e-06, + "loss": 0.6674, + "step": 2053 + }, + { + "epoch": 0.26, + "grad_norm": 0.9154313901885083, + "learning_rate": 9.735692784965606e-06, + "loss": 0.6767, + "step": 2054 + }, + { + "epoch": 0.26, + "grad_norm": 1.1384153897500089, + "learning_rate": 9.735361725733265e-06, + "loss": 0.6438, + "step": 2055 + }, + { + "epoch": 0.26, + "grad_norm": 0.6542157276954875, + "learning_rate": 9.735030464931707e-06, + "loss": 0.5181, + "step": 2056 + }, + { + "epoch": 0.26, + "grad_norm": 0.7714882273538483, + "learning_rate": 9.734699002575035e-06, + "loss": 0.6218, + "step": 2057 + }, + { + "epoch": 0.26, + "grad_norm": 0.9906874791748633, + "learning_rate": 9.734367338677355e-06, + "loss": 0.6813, + "step": 2058 + }, + { + "epoch": 0.26, + "grad_norm": 0.6833249408232183, + "learning_rate": 9.734035473252786e-06, + "loss": 0.5471, + "step": 2059 + }, + { + "epoch": 0.26, + "grad_norm": 0.6003145554045849, + "learning_rate": 9.733703406315455e-06, + "loss": 0.5351, + "step": 2060 + }, + { + "epoch": 0.26, + "grad_norm": 0.7307122835592621, + "learning_rate": 9.733371137879498e-06, + "loss": 0.5821, + "step": 2061 + }, + { + "epoch": 0.26, + "grad_norm": 0.6831646478598287, + "learning_rate": 9.733038667959054e-06, + "loss": 0.5798, + "step": 2062 + }, + { + "epoch": 0.26, + "grad_norm": 0.5258412523513408, + "learning_rate": 9.73270599656828e-06, + "loss": 0.4822, + "step": 2063 + }, + { + "epoch": 0.26, + "grad_norm": 1.288210033872429, + "learning_rate": 9.732373123721337e-06, + "loss": 0.635, + "step": 2064 + }, + { + "epoch": 0.26, + "grad_norm": 0.6161080538188768, + "learning_rate": 9.732040049432393e-06, + "loss": 0.5423, + "step": 2065 + }, + { + "epoch": 0.26, + "grad_norm": 0.9489274096999086, + "learning_rate": 9.731706773715624e-06, + "loss": 0.6287, + "step": 2066 + }, + { + "epoch": 0.26, + "grad_norm": 0.6266522673051259, + "learning_rate": 9.731373296585218e-06, + "loss": 0.5672, + "step": 2067 + }, + { + "epoch": 0.26, + "grad_norm": 0.6733008292071735, + "learning_rate": 9.73103961805537e-06, + "loss": 0.6035, + "step": 2068 + }, + { + "epoch": 0.26, + "grad_norm": 0.722691724097056, + "learning_rate": 9.730705738140284e-06, + "loss": 0.6052, + "step": 2069 + }, + { + "epoch": 0.26, + "grad_norm": 0.5733938543934274, + "learning_rate": 9.730371656854172e-06, + "loss": 0.4776, + "step": 2070 + }, + { + "epoch": 0.26, + "grad_norm": 0.620428591961551, + "learning_rate": 9.730037374211255e-06, + "loss": 0.5432, + "step": 2071 + }, + { + "epoch": 0.26, + "grad_norm": 0.7010401930330775, + "learning_rate": 9.729702890225761e-06, + "loss": 0.5135, + "step": 2072 + }, + { + "epoch": 0.26, + "grad_norm": 0.6290161986027752, + "learning_rate": 9.729368204911928e-06, + "loss": 0.5256, + "step": 2073 + }, + { + "epoch": 0.26, + "grad_norm": 0.6798955498123993, + "learning_rate": 9.729033318284005e-06, + "loss": 0.5764, + "step": 2074 + }, + { + "epoch": 0.26, + "grad_norm": 0.7569271787854982, + "learning_rate": 9.728698230356246e-06, + "loss": 0.5505, + "step": 2075 + }, + { + "epoch": 0.26, + "grad_norm": 0.6169296838439209, + "learning_rate": 9.728362941142913e-06, + "loss": 0.5342, + "step": 2076 + }, + { + "epoch": 0.26, + "grad_norm": 0.665497242148675, + "learning_rate": 9.72802745065828e-06, + "loss": 0.559, + "step": 2077 + }, + { + "epoch": 0.26, + "grad_norm": 0.8047324862964884, + "learning_rate": 9.727691758916627e-06, + "loss": 0.6709, + "step": 2078 + }, + { + "epoch": 0.26, + "grad_norm": 0.6871136990546536, + "learning_rate": 9.727355865932242e-06, + "loss": 0.5583, + "step": 2079 + }, + { + "epoch": 0.26, + "grad_norm": 0.6454370635444046, + "learning_rate": 9.727019771719427e-06, + "loss": 0.522, + "step": 2080 + }, + { + "epoch": 0.27, + "grad_norm": 0.6001209377809871, + "learning_rate": 9.726683476292484e-06, + "loss": 0.542, + "step": 2081 + }, + { + "epoch": 0.27, + "grad_norm": 0.6933759361085206, + "learning_rate": 9.72634697966573e-06, + "loss": 0.5507, + "step": 2082 + }, + { + "epoch": 0.27, + "grad_norm": 0.9055509903938554, + "learning_rate": 9.726010281853488e-06, + "loss": 0.6022, + "step": 2083 + }, + { + "epoch": 0.27, + "grad_norm": 0.6116977883299418, + "learning_rate": 9.725673382870092e-06, + "loss": 0.5139, + "step": 2084 + }, + { + "epoch": 0.27, + "grad_norm": 0.6370131943565843, + "learning_rate": 9.725336282729877e-06, + "loss": 0.5515, + "step": 2085 + }, + { + "epoch": 0.27, + "grad_norm": 0.9004152023727671, + "learning_rate": 9.7249989814472e-06, + "loss": 0.6576, + "step": 2086 + }, + { + "epoch": 0.27, + "grad_norm": 1.0158407119320836, + "learning_rate": 9.724661479036414e-06, + "loss": 0.5774, + "step": 2087 + }, + { + "epoch": 0.27, + "grad_norm": 0.7675904043554551, + "learning_rate": 9.724323775511888e-06, + "loss": 0.6786, + "step": 2088 + }, + { + "epoch": 0.27, + "grad_norm": 0.7489623594684662, + "learning_rate": 9.723985870887995e-06, + "loss": 0.5578, + "step": 2089 + }, + { + "epoch": 0.27, + "grad_norm": 0.6084731557722258, + "learning_rate": 9.723647765179119e-06, + "loss": 0.5833, + "step": 2090 + }, + { + "epoch": 0.27, + "grad_norm": 0.8331052745998051, + "learning_rate": 9.723309458399652e-06, + "loss": 0.6462, + "step": 2091 + }, + { + "epoch": 0.27, + "grad_norm": 0.7359928436991852, + "learning_rate": 9.722970950563995e-06, + "loss": 0.5608, + "step": 2092 + }, + { + "epoch": 0.27, + "grad_norm": 0.6837342295014889, + "learning_rate": 9.722632241686559e-06, + "loss": 0.5407, + "step": 2093 + }, + { + "epoch": 0.27, + "grad_norm": 0.7729308964984413, + "learning_rate": 9.722293331781758e-06, + "loss": 0.5863, + "step": 2094 + }, + { + "epoch": 0.27, + "grad_norm": 0.8614004223480878, + "learning_rate": 9.721954220864019e-06, + "loss": 0.6489, + "step": 2095 + }, + { + "epoch": 0.27, + "grad_norm": 0.6232329294269511, + "learning_rate": 9.721614908947781e-06, + "loss": 0.5391, + "step": 2096 + }, + { + "epoch": 0.27, + "grad_norm": 0.6077719243223584, + "learning_rate": 9.721275396047483e-06, + "loss": 0.5393, + "step": 2097 + }, + { + "epoch": 0.27, + "grad_norm": 0.558351867007384, + "learning_rate": 9.720935682177577e-06, + "loss": 0.4888, + "step": 2098 + }, + { + "epoch": 0.27, + "grad_norm": 0.8062406355126962, + "learning_rate": 9.720595767352527e-06, + "loss": 0.673, + "step": 2099 + }, + { + "epoch": 0.27, + "grad_norm": 0.9037491752296137, + "learning_rate": 9.720255651586799e-06, + "loss": 0.6714, + "step": 2100 + }, + { + "epoch": 0.27, + "grad_norm": 0.7360739154254519, + "learning_rate": 9.719915334894871e-06, + "loss": 0.6018, + "step": 2101 + }, + { + "epoch": 0.27, + "grad_norm": 0.7831402654138032, + "learning_rate": 9.71957481729123e-06, + "loss": 0.5486, + "step": 2102 + }, + { + "epoch": 0.27, + "grad_norm": 1.0765839399703987, + "learning_rate": 9.719234098790374e-06, + "loss": 0.5915, + "step": 2103 + }, + { + "epoch": 0.27, + "grad_norm": 0.7966734872967898, + "learning_rate": 9.718893179406798e-06, + "loss": 0.61, + "step": 2104 + }, + { + "epoch": 0.27, + "grad_norm": 0.971240792968848, + "learning_rate": 9.718552059155022e-06, + "loss": 0.6608, + "step": 2105 + }, + { + "epoch": 0.27, + "grad_norm": 0.5970469956752246, + "learning_rate": 9.718210738049563e-06, + "loss": 0.4595, + "step": 2106 + }, + { + "epoch": 0.27, + "grad_norm": 0.5920103825747084, + "learning_rate": 9.71786921610495e-06, + "loss": 0.5117, + "step": 2107 + }, + { + "epoch": 0.27, + "grad_norm": 0.8117674785784921, + "learning_rate": 9.71752749333572e-06, + "loss": 0.6788, + "step": 2108 + }, + { + "epoch": 0.27, + "grad_norm": 0.5256254889132973, + "learning_rate": 9.717185569756419e-06, + "loss": 0.46, + "step": 2109 + }, + { + "epoch": 0.27, + "grad_norm": 0.5834368545417268, + "learning_rate": 9.716843445381603e-06, + "loss": 0.5359, + "step": 2110 + }, + { + "epoch": 0.27, + "grad_norm": 1.0017053379236673, + "learning_rate": 9.716501120225834e-06, + "loss": 0.6183, + "step": 2111 + }, + { + "epoch": 0.27, + "grad_norm": 0.6548945424440169, + "learning_rate": 9.716158594303685e-06, + "loss": 0.5459, + "step": 2112 + }, + { + "epoch": 0.27, + "grad_norm": 2.23223843284866, + "learning_rate": 9.715815867629735e-06, + "loss": 0.6646, + "step": 2113 + }, + { + "epoch": 0.27, + "grad_norm": 0.659091643792312, + "learning_rate": 9.715472940218573e-06, + "loss": 0.5551, + "step": 2114 + }, + { + "epoch": 0.27, + "grad_norm": 0.6976834670146201, + "learning_rate": 9.715129812084795e-06, + "loss": 0.5258, + "step": 2115 + }, + { + "epoch": 0.27, + "grad_norm": 0.8699304617573581, + "learning_rate": 9.71478648324301e-06, + "loss": 0.6653, + "step": 2116 + }, + { + "epoch": 0.27, + "grad_norm": 0.8140126412568707, + "learning_rate": 9.71444295370783e-06, + "loss": 0.6282, + "step": 2117 + }, + { + "epoch": 0.27, + "grad_norm": 0.5488027208872968, + "learning_rate": 9.71409922349388e-06, + "loss": 0.5297, + "step": 2118 + }, + { + "epoch": 0.27, + "grad_norm": 0.7111858818212421, + "learning_rate": 9.713755292615789e-06, + "loss": 0.5571, + "step": 2119 + }, + { + "epoch": 0.27, + "grad_norm": 0.8020520271094485, + "learning_rate": 9.713411161088198e-06, + "loss": 0.6039, + "step": 2120 + }, + { + "epoch": 0.27, + "grad_norm": 0.7866973296861831, + "learning_rate": 9.713066828925757e-06, + "loss": 0.6567, + "step": 2121 + }, + { + "epoch": 0.27, + "grad_norm": 0.642506004797298, + "learning_rate": 9.712722296143121e-06, + "loss": 0.5614, + "step": 2122 + }, + { + "epoch": 0.27, + "grad_norm": 0.7503891770795013, + "learning_rate": 9.712377562754957e-06, + "loss": 0.6174, + "step": 2123 + }, + { + "epoch": 0.27, + "grad_norm": 0.7500400246988823, + "learning_rate": 9.712032628775939e-06, + "loss": 0.5557, + "step": 2124 + }, + { + "epoch": 0.27, + "grad_norm": 0.6542104740649382, + "learning_rate": 9.711687494220748e-06, + "loss": 0.5911, + "step": 2125 + }, + { + "epoch": 0.27, + "grad_norm": 1.0160392181510352, + "learning_rate": 9.711342159104078e-06, + "loss": 0.6453, + "step": 2126 + }, + { + "epoch": 0.27, + "grad_norm": 0.7892779950190293, + "learning_rate": 9.710996623440627e-06, + "loss": 0.5836, + "step": 2127 + }, + { + "epoch": 0.27, + "grad_norm": 0.7363641448666912, + "learning_rate": 9.710650887245103e-06, + "loss": 0.5829, + "step": 2128 + }, + { + "epoch": 0.27, + "grad_norm": 0.8431743530457255, + "learning_rate": 9.710304950532225e-06, + "loss": 0.5804, + "step": 2129 + }, + { + "epoch": 0.27, + "grad_norm": 0.8824858658654475, + "learning_rate": 9.709958813316718e-06, + "loss": 0.6999, + "step": 2130 + }, + { + "epoch": 0.27, + "grad_norm": 0.8998131890206996, + "learning_rate": 9.709612475613315e-06, + "loss": 0.6894, + "step": 2131 + }, + { + "epoch": 0.27, + "grad_norm": 0.5910197690092174, + "learning_rate": 9.709265937436758e-06, + "loss": 0.5738, + "step": 2132 + }, + { + "epoch": 0.27, + "grad_norm": 1.019065391040781, + "learning_rate": 9.708919198801799e-06, + "loss": 0.6415, + "step": 2133 + }, + { + "epoch": 0.27, + "grad_norm": 0.6614502965096164, + "learning_rate": 9.708572259723198e-06, + "loss": 0.5642, + "step": 2134 + }, + { + "epoch": 0.27, + "grad_norm": 0.8262117858413601, + "learning_rate": 9.70822512021572e-06, + "loss": 0.6035, + "step": 2135 + }, + { + "epoch": 0.27, + "grad_norm": 0.9290103171899875, + "learning_rate": 9.707877780294147e-06, + "loss": 0.6671, + "step": 2136 + }, + { + "epoch": 0.27, + "grad_norm": 0.72120369339109, + "learning_rate": 9.707530239973257e-06, + "loss": 0.5314, + "step": 2137 + }, + { + "epoch": 0.27, + "grad_norm": 0.6497313724067862, + "learning_rate": 9.707182499267851e-06, + "loss": 0.5609, + "step": 2138 + }, + { + "epoch": 0.27, + "grad_norm": 0.8843671026780996, + "learning_rate": 9.706834558192728e-06, + "loss": 0.6537, + "step": 2139 + }, + { + "epoch": 0.27, + "grad_norm": 0.6927955833608418, + "learning_rate": 9.706486416762696e-06, + "loss": 0.5397, + "step": 2140 + }, + { + "epoch": 0.27, + "grad_norm": 0.7035186658685634, + "learning_rate": 9.706138074992581e-06, + "loss": 0.625, + "step": 2141 + }, + { + "epoch": 0.27, + "grad_norm": 0.6516548353517828, + "learning_rate": 9.705789532897205e-06, + "loss": 0.5345, + "step": 2142 + }, + { + "epoch": 0.27, + "grad_norm": 0.6039563025733024, + "learning_rate": 9.705440790491406e-06, + "loss": 0.5231, + "step": 2143 + }, + { + "epoch": 0.27, + "grad_norm": 0.7199901116167109, + "learning_rate": 9.705091847790029e-06, + "loss": 0.6729, + "step": 2144 + }, + { + "epoch": 0.27, + "grad_norm": 0.8138226573688557, + "learning_rate": 9.704742704807928e-06, + "loss": 0.6693, + "step": 2145 + }, + { + "epoch": 0.27, + "grad_norm": 0.8791064321004185, + "learning_rate": 9.704393361559963e-06, + "loss": 0.5696, + "step": 2146 + }, + { + "epoch": 0.27, + "grad_norm": 0.7829314545257753, + "learning_rate": 9.704043818061007e-06, + "loss": 0.5571, + "step": 2147 + }, + { + "epoch": 0.27, + "grad_norm": 0.9699664155305645, + "learning_rate": 9.703694074325935e-06, + "loss": 0.6281, + "step": 2148 + }, + { + "epoch": 0.27, + "grad_norm": 0.8313151676250713, + "learning_rate": 9.70334413036964e-06, + "loss": 0.6658, + "step": 2149 + }, + { + "epoch": 0.27, + "grad_norm": 0.6751066000361966, + "learning_rate": 9.702993986207014e-06, + "loss": 0.5604, + "step": 2150 + }, + { + "epoch": 0.27, + "grad_norm": 0.7255453828137651, + "learning_rate": 9.702643641852963e-06, + "loss": 0.5996, + "step": 2151 + }, + { + "epoch": 0.27, + "grad_norm": 0.6983987210473735, + "learning_rate": 9.7022930973224e-06, + "loss": 0.5415, + "step": 2152 + }, + { + "epoch": 0.27, + "grad_norm": 0.936193879001611, + "learning_rate": 9.701942352630246e-06, + "loss": 0.6409, + "step": 2153 + }, + { + "epoch": 0.27, + "grad_norm": 0.912956042654097, + "learning_rate": 9.701591407791431e-06, + "loss": 0.6389, + "step": 2154 + }, + { + "epoch": 0.27, + "grad_norm": 0.7230062589302516, + "learning_rate": 9.701240262820894e-06, + "loss": 0.5882, + "step": 2155 + }, + { + "epoch": 0.27, + "grad_norm": 0.6272954988725057, + "learning_rate": 9.700888917733582e-06, + "loss": 0.5692, + "step": 2156 + }, + { + "epoch": 0.27, + "grad_norm": 1.322684260098865, + "learning_rate": 9.70053737254445e-06, + "loss": 0.6119, + "step": 2157 + }, + { + "epoch": 0.27, + "grad_norm": 0.6771476018489737, + "learning_rate": 9.700185627268463e-06, + "loss": 0.5113, + "step": 2158 + }, + { + "epoch": 0.28, + "grad_norm": 0.6986586269348238, + "learning_rate": 9.699833681920595e-06, + "loss": 0.5369, + "step": 2159 + }, + { + "epoch": 0.28, + "grad_norm": 0.8707171763834329, + "learning_rate": 9.699481536515824e-06, + "loss": 0.6593, + "step": 2160 + }, + { + "epoch": 0.28, + "grad_norm": 1.0172857832492763, + "learning_rate": 9.699129191069141e-06, + "loss": 0.6357, + "step": 2161 + }, + { + "epoch": 0.28, + "grad_norm": 0.7348842636230057, + "learning_rate": 9.698776645595546e-06, + "loss": 0.6248, + "step": 2162 + }, + { + "epoch": 0.28, + "grad_norm": 0.798755867497702, + "learning_rate": 9.698423900110046e-06, + "loss": 0.6579, + "step": 2163 + }, + { + "epoch": 0.28, + "grad_norm": 1.1928872004184816, + "learning_rate": 9.698070954627652e-06, + "loss": 0.664, + "step": 2164 + }, + { + "epoch": 0.28, + "grad_norm": 0.7014593433875881, + "learning_rate": 9.69771780916339e-06, + "loss": 0.5624, + "step": 2165 + }, + { + "epoch": 0.28, + "grad_norm": 0.767277100934089, + "learning_rate": 9.697364463732293e-06, + "loss": 0.6384, + "step": 2166 + }, + { + "epoch": 0.28, + "grad_norm": 0.9248346711069855, + "learning_rate": 9.697010918349402e-06, + "loss": 0.6596, + "step": 2167 + }, + { + "epoch": 0.28, + "grad_norm": 0.6387516173536527, + "learning_rate": 9.696657173029767e-06, + "loss": 0.5607, + "step": 2168 + }, + { + "epoch": 0.28, + "grad_norm": 0.8047509435201944, + "learning_rate": 9.696303227788441e-06, + "loss": 0.6464, + "step": 2169 + }, + { + "epoch": 0.28, + "grad_norm": 0.6238519105509818, + "learning_rate": 9.695949082640497e-06, + "loss": 0.5011, + "step": 2170 + }, + { + "epoch": 0.28, + "grad_norm": 0.7983154023591998, + "learning_rate": 9.695594737601006e-06, + "loss": 0.5157, + "step": 2171 + }, + { + "epoch": 0.28, + "grad_norm": 0.948630607940082, + "learning_rate": 9.695240192685051e-06, + "loss": 0.6251, + "step": 2172 + }, + { + "epoch": 0.28, + "grad_norm": 0.5868660188042967, + "learning_rate": 9.694885447907726e-06, + "loss": 0.5212, + "step": 2173 + }, + { + "epoch": 0.28, + "grad_norm": 0.7581979463662551, + "learning_rate": 9.69453050328413e-06, + "loss": 0.6261, + "step": 2174 + }, + { + "epoch": 0.28, + "grad_norm": 0.7541088329169049, + "learning_rate": 9.694175358829372e-06, + "loss": 0.6103, + "step": 2175 + }, + { + "epoch": 0.28, + "grad_norm": 0.5998178387441944, + "learning_rate": 9.693820014558568e-06, + "loss": 0.5542, + "step": 2176 + }, + { + "epoch": 0.28, + "grad_norm": 0.7863609600049947, + "learning_rate": 9.693464470486847e-06, + "loss": 0.6182, + "step": 2177 + }, + { + "epoch": 0.28, + "grad_norm": 0.6913994240424528, + "learning_rate": 9.69310872662934e-06, + "loss": 0.6005, + "step": 2178 + }, + { + "epoch": 0.28, + "grad_norm": 0.9020903523450176, + "learning_rate": 9.692752783001194e-06, + "loss": 0.6919, + "step": 2179 + }, + { + "epoch": 0.28, + "grad_norm": 0.8009512130787827, + "learning_rate": 9.692396639617556e-06, + "loss": 0.6028, + "step": 2180 + }, + { + "epoch": 0.28, + "grad_norm": 0.5603008308784059, + "learning_rate": 9.692040296493587e-06, + "loss": 0.5243, + "step": 2181 + }, + { + "epoch": 0.28, + "grad_norm": 0.804830335848704, + "learning_rate": 9.691683753644459e-06, + "loss": 0.5833, + "step": 2182 + }, + { + "epoch": 0.28, + "grad_norm": 0.7136042371926369, + "learning_rate": 9.691327011085344e-06, + "loss": 0.5521, + "step": 2183 + }, + { + "epoch": 0.28, + "grad_norm": 0.7516852237198544, + "learning_rate": 9.690970068831431e-06, + "loss": 0.6193, + "step": 2184 + }, + { + "epoch": 0.28, + "grad_norm": 0.6088908958862602, + "learning_rate": 9.690612926897911e-06, + "loss": 0.5562, + "step": 2185 + }, + { + "epoch": 0.28, + "grad_norm": 0.7406939704811897, + "learning_rate": 9.690255585299988e-06, + "loss": 0.6627, + "step": 2186 + }, + { + "epoch": 0.28, + "grad_norm": 0.9702544059608751, + "learning_rate": 9.689898044052872e-06, + "loss": 0.6357, + "step": 2187 + }, + { + "epoch": 0.28, + "grad_norm": 0.6700948045728238, + "learning_rate": 9.689540303171785e-06, + "loss": 0.5431, + "step": 2188 + }, + { + "epoch": 0.28, + "grad_norm": 0.7436438109257585, + "learning_rate": 9.689182362671952e-06, + "loss": 0.6278, + "step": 2189 + }, + { + "epoch": 0.28, + "grad_norm": 0.7576916423630417, + "learning_rate": 9.688824222568608e-06, + "loss": 0.6341, + "step": 2190 + }, + { + "epoch": 0.28, + "grad_norm": 0.6391553504785336, + "learning_rate": 9.688465882877005e-06, + "loss": 0.4978, + "step": 2191 + }, + { + "epoch": 0.28, + "grad_norm": 1.0998597734639064, + "learning_rate": 9.688107343612387e-06, + "loss": 0.6933, + "step": 2192 + }, + { + "epoch": 0.28, + "grad_norm": 0.6994534113107235, + "learning_rate": 9.687748604790024e-06, + "loss": 0.5918, + "step": 2193 + }, + { + "epoch": 0.28, + "grad_norm": 0.7935273500053455, + "learning_rate": 9.68738966642518e-06, + "loss": 0.5797, + "step": 2194 + }, + { + "epoch": 0.28, + "grad_norm": 0.6313576338974369, + "learning_rate": 9.687030528533137e-06, + "loss": 0.5135, + "step": 2195 + }, + { + "epoch": 0.28, + "grad_norm": 0.58855144835993, + "learning_rate": 9.686671191129184e-06, + "loss": 0.564, + "step": 2196 + }, + { + "epoch": 0.28, + "grad_norm": 0.6397217492402749, + "learning_rate": 9.686311654228613e-06, + "loss": 0.5565, + "step": 2197 + }, + { + "epoch": 0.28, + "grad_norm": 0.6257870798373377, + "learning_rate": 9.68595191784673e-06, + "loss": 0.5277, + "step": 2198 + }, + { + "epoch": 0.28, + "grad_norm": 0.7337667986495767, + "learning_rate": 9.68559198199885e-06, + "loss": 0.6309, + "step": 2199 + }, + { + "epoch": 0.28, + "grad_norm": 0.5983308697317115, + "learning_rate": 9.685231846700292e-06, + "loss": 0.5012, + "step": 2200 + }, + { + "epoch": 0.28, + "grad_norm": 0.6799392679002891, + "learning_rate": 9.684871511966383e-06, + "loss": 0.5181, + "step": 2201 + }, + { + "epoch": 0.28, + "grad_norm": 0.6914395568735426, + "learning_rate": 9.684510977812467e-06, + "loss": 0.5455, + "step": 2202 + }, + { + "epoch": 0.28, + "grad_norm": 0.6238811969387991, + "learning_rate": 9.68415024425389e-06, + "loss": 0.5432, + "step": 2203 + }, + { + "epoch": 0.28, + "grad_norm": 0.6340758968912651, + "learning_rate": 9.683789311306003e-06, + "loss": 0.5409, + "step": 2204 + }, + { + "epoch": 0.28, + "grad_norm": 0.6886613532073099, + "learning_rate": 9.683428178984172e-06, + "loss": 0.5815, + "step": 2205 + }, + { + "epoch": 0.28, + "grad_norm": 0.6442929205541311, + "learning_rate": 9.68306684730377e-06, + "loss": 0.5504, + "step": 2206 + }, + { + "epoch": 0.28, + "grad_norm": 0.8588064591456216, + "learning_rate": 9.682705316280178e-06, + "loss": 0.6147, + "step": 2207 + }, + { + "epoch": 0.28, + "grad_norm": 0.8804960211736516, + "learning_rate": 9.682343585928785e-06, + "loss": 0.6474, + "step": 2208 + }, + { + "epoch": 0.28, + "grad_norm": 0.9515813480598867, + "learning_rate": 9.681981656264984e-06, + "loss": 0.6407, + "step": 2209 + }, + { + "epoch": 0.28, + "grad_norm": 0.5990587431565297, + "learning_rate": 9.68161952730419e-06, + "loss": 0.5491, + "step": 2210 + }, + { + "epoch": 0.28, + "grad_norm": 0.6515404033001146, + "learning_rate": 9.681257199061812e-06, + "loss": 0.5397, + "step": 2211 + }, + { + "epoch": 0.28, + "grad_norm": 0.5903850281832909, + "learning_rate": 9.680894671553273e-06, + "loss": 0.5919, + "step": 2212 + }, + { + "epoch": 0.28, + "grad_norm": 0.7799265947403901, + "learning_rate": 9.680531944794007e-06, + "loss": 0.6372, + "step": 2213 + }, + { + "epoch": 0.28, + "grad_norm": 0.9926507262328429, + "learning_rate": 9.680169018799452e-06, + "loss": 0.5947, + "step": 2214 + }, + { + "epoch": 0.28, + "grad_norm": 0.6339056599829482, + "learning_rate": 9.679805893585059e-06, + "loss": 0.4961, + "step": 2215 + }, + { + "epoch": 0.28, + "grad_norm": 0.6551315532687673, + "learning_rate": 9.679442569166282e-06, + "loss": 0.5736, + "step": 2216 + }, + { + "epoch": 0.28, + "grad_norm": 0.6300361657396402, + "learning_rate": 9.67907904555859e-06, + "loss": 0.5434, + "step": 2217 + }, + { + "epoch": 0.28, + "grad_norm": 0.9418706930656059, + "learning_rate": 9.678715322777453e-06, + "loss": 0.6208, + "step": 2218 + }, + { + "epoch": 0.28, + "grad_norm": 0.6543659916555796, + "learning_rate": 9.678351400838357e-06, + "loss": 0.5593, + "step": 2219 + }, + { + "epoch": 0.28, + "grad_norm": 0.8837889611829904, + "learning_rate": 9.67798727975679e-06, + "loss": 0.7165, + "step": 2220 + }, + { + "epoch": 0.28, + "grad_norm": 0.8815426881808548, + "learning_rate": 9.677622959548256e-06, + "loss": 0.6614, + "step": 2221 + }, + { + "epoch": 0.28, + "grad_norm": 0.6174900120372893, + "learning_rate": 9.677258440228259e-06, + "loss": 0.5333, + "step": 2222 + }, + { + "epoch": 0.28, + "grad_norm": 0.8184534973285272, + "learning_rate": 9.676893721812317e-06, + "loss": 0.6351, + "step": 2223 + }, + { + "epoch": 0.28, + "grad_norm": 0.6330019368009169, + "learning_rate": 9.676528804315954e-06, + "loss": 0.5142, + "step": 2224 + }, + { + "epoch": 0.28, + "grad_norm": 0.8743920706670454, + "learning_rate": 9.676163687754705e-06, + "loss": 0.6303, + "step": 2225 + }, + { + "epoch": 0.28, + "grad_norm": 0.6326861851605023, + "learning_rate": 9.675798372144108e-06, + "loss": 0.5446, + "step": 2226 + }, + { + "epoch": 0.28, + "grad_norm": 1.0371082516296481, + "learning_rate": 9.675432857499718e-06, + "loss": 0.5686, + "step": 2227 + }, + { + "epoch": 0.28, + "grad_norm": 0.836242150354895, + "learning_rate": 9.675067143837092e-06, + "loss": 0.642, + "step": 2228 + }, + { + "epoch": 0.28, + "grad_norm": 0.8301236096375209, + "learning_rate": 9.674701231171795e-06, + "loss": 0.5979, + "step": 2229 + }, + { + "epoch": 0.28, + "grad_norm": 2.1578010235586267, + "learning_rate": 9.674335119519407e-06, + "loss": 0.6818, + "step": 2230 + }, + { + "epoch": 0.28, + "grad_norm": 0.9290913461612924, + "learning_rate": 9.673968808895509e-06, + "loss": 0.6093, + "step": 2231 + }, + { + "epoch": 0.28, + "grad_norm": 0.7342519929147039, + "learning_rate": 9.673602299315694e-06, + "loss": 0.596, + "step": 2232 + }, + { + "epoch": 0.28, + "grad_norm": 0.6525783827942293, + "learning_rate": 9.673235590795565e-06, + "loss": 0.5375, + "step": 2233 + }, + { + "epoch": 0.28, + "grad_norm": 0.5989832941113682, + "learning_rate": 9.672868683350731e-06, + "loss": 0.5109, + "step": 2234 + }, + { + "epoch": 0.28, + "grad_norm": 0.6569948231952824, + "learning_rate": 9.67250157699681e-06, + "loss": 0.5465, + "step": 2235 + }, + { + "epoch": 0.28, + "grad_norm": 0.6724503094389964, + "learning_rate": 9.672134271749425e-06, + "loss": 0.5342, + "step": 2236 + }, + { + "epoch": 0.28, + "grad_norm": 1.0433338392795097, + "learning_rate": 9.671766767624215e-06, + "loss": 0.659, + "step": 2237 + }, + { + "epoch": 0.29, + "grad_norm": 0.6852954531542389, + "learning_rate": 9.671399064636824e-06, + "loss": 0.5177, + "step": 2238 + }, + { + "epoch": 0.29, + "grad_norm": 0.7620657504708801, + "learning_rate": 9.671031162802901e-06, + "loss": 0.6211, + "step": 2239 + }, + { + "epoch": 0.29, + "grad_norm": 0.7714690157358612, + "learning_rate": 9.670663062138111e-06, + "loss": 0.5459, + "step": 2240 + }, + { + "epoch": 0.29, + "grad_norm": 0.6553942130009848, + "learning_rate": 9.670294762658116e-06, + "loss": 0.546, + "step": 2241 + }, + { + "epoch": 0.29, + "grad_norm": 0.62564941691109, + "learning_rate": 9.669926264378598e-06, + "loss": 0.556, + "step": 2242 + }, + { + "epoch": 0.29, + "grad_norm": 0.5762097403614382, + "learning_rate": 9.669557567315242e-06, + "loss": 0.5195, + "step": 2243 + }, + { + "epoch": 0.29, + "grad_norm": 0.8532764287259377, + "learning_rate": 9.669188671483742e-06, + "loss": 0.6213, + "step": 2244 + }, + { + "epoch": 0.29, + "grad_norm": 0.7489377850365356, + "learning_rate": 9.668819576899802e-06, + "loss": 0.5236, + "step": 2245 + }, + { + "epoch": 0.29, + "grad_norm": 0.7617188167186214, + "learning_rate": 9.668450283579132e-06, + "loss": 0.6345, + "step": 2246 + }, + { + "epoch": 0.29, + "grad_norm": 0.851474713301415, + "learning_rate": 9.668080791537451e-06, + "loss": 0.6059, + "step": 2247 + }, + { + "epoch": 0.29, + "grad_norm": 0.7362806713736375, + "learning_rate": 9.667711100790487e-06, + "loss": 0.5474, + "step": 2248 + }, + { + "epoch": 0.29, + "grad_norm": 0.6993209279475131, + "learning_rate": 9.667341211353979e-06, + "loss": 0.5467, + "step": 2249 + }, + { + "epoch": 0.29, + "grad_norm": 0.6047618263812696, + "learning_rate": 9.66697112324367e-06, + "loss": 0.5799, + "step": 2250 + }, + { + "epoch": 0.29, + "grad_norm": 0.6760460559516863, + "learning_rate": 9.666600836475313e-06, + "loss": 0.5667, + "step": 2251 + }, + { + "epoch": 0.29, + "grad_norm": 0.8904894784357311, + "learning_rate": 9.66623035106467e-06, + "loss": 0.6271, + "step": 2252 + }, + { + "epoch": 0.29, + "grad_norm": 0.5959552961352129, + "learning_rate": 9.665859667027514e-06, + "loss": 0.5595, + "step": 2253 + }, + { + "epoch": 0.29, + "grad_norm": 0.5915119580046773, + "learning_rate": 9.665488784379619e-06, + "loss": 0.5449, + "step": 2254 + }, + { + "epoch": 0.29, + "grad_norm": 0.7050973527303205, + "learning_rate": 9.665117703136778e-06, + "loss": 0.5475, + "step": 2255 + }, + { + "epoch": 0.29, + "grad_norm": 0.5741843191594764, + "learning_rate": 9.664746423314783e-06, + "loss": 0.4807, + "step": 2256 + }, + { + "epoch": 0.29, + "grad_norm": 0.605516106318389, + "learning_rate": 9.66437494492944e-06, + "loss": 0.4949, + "step": 2257 + }, + { + "epoch": 0.29, + "grad_norm": 0.7298421166590036, + "learning_rate": 9.66400326799656e-06, + "loss": 0.6303, + "step": 2258 + }, + { + "epoch": 0.29, + "grad_norm": 0.8276973612240265, + "learning_rate": 9.663631392531964e-06, + "loss": 0.5599, + "step": 2259 + }, + { + "epoch": 0.29, + "grad_norm": 0.7930071135800971, + "learning_rate": 9.663259318551484e-06, + "loss": 0.5948, + "step": 2260 + }, + { + "epoch": 0.29, + "grad_norm": 0.8995766276937424, + "learning_rate": 9.662887046070955e-06, + "loss": 0.5952, + "step": 2261 + }, + { + "epoch": 0.29, + "grad_norm": 0.6427601142691367, + "learning_rate": 9.662514575106226e-06, + "loss": 0.5744, + "step": 2262 + }, + { + "epoch": 0.29, + "grad_norm": 0.828253738518374, + "learning_rate": 9.66214190567315e-06, + "loss": 0.6222, + "step": 2263 + }, + { + "epoch": 0.29, + "grad_norm": 0.7720476783409141, + "learning_rate": 9.661769037787593e-06, + "loss": 0.5892, + "step": 2264 + }, + { + "epoch": 0.29, + "grad_norm": 0.5936003716856899, + "learning_rate": 9.661395971465425e-06, + "loss": 0.4931, + "step": 2265 + }, + { + "epoch": 0.29, + "grad_norm": 0.6928100126960826, + "learning_rate": 9.661022706722522e-06, + "loss": 0.5143, + "step": 2266 + }, + { + "epoch": 0.29, + "grad_norm": 0.6655182767575827, + "learning_rate": 9.66064924357478e-06, + "loss": 0.5259, + "step": 2267 + }, + { + "epoch": 0.29, + "grad_norm": 0.7407844825951722, + "learning_rate": 9.660275582038095e-06, + "loss": 0.5317, + "step": 2268 + }, + { + "epoch": 0.29, + "grad_norm": 0.7158721229727145, + "learning_rate": 9.659901722128366e-06, + "loss": 0.5526, + "step": 2269 + }, + { + "epoch": 0.29, + "grad_norm": 0.7424287306296785, + "learning_rate": 9.659527663861513e-06, + "loss": 0.6317, + "step": 2270 + }, + { + "epoch": 0.29, + "grad_norm": 1.107374494985659, + "learning_rate": 9.65915340725346e-06, + "loss": 0.6258, + "step": 2271 + }, + { + "epoch": 0.29, + "grad_norm": 0.6830529175884509, + "learning_rate": 9.658778952320133e-06, + "loss": 0.6194, + "step": 2272 + }, + { + "epoch": 0.29, + "grad_norm": 0.6713118487544459, + "learning_rate": 9.658404299077472e-06, + "loss": 0.5113, + "step": 2273 + }, + { + "epoch": 0.29, + "grad_norm": 0.8508721732925933, + "learning_rate": 9.658029447541429e-06, + "loss": 0.5819, + "step": 2274 + }, + { + "epoch": 0.29, + "grad_norm": 0.6838730161706225, + "learning_rate": 9.657654397727956e-06, + "loss": 0.5601, + "step": 2275 + }, + { + "epoch": 0.29, + "grad_norm": 0.7911712009038191, + "learning_rate": 9.657279149653018e-06, + "loss": 0.5822, + "step": 2276 + }, + { + "epoch": 0.29, + "grad_norm": 0.776451702863992, + "learning_rate": 9.65690370333259e-06, + "loss": 0.6277, + "step": 2277 + }, + { + "epoch": 0.29, + "grad_norm": 0.8645971786151312, + "learning_rate": 9.656528058782653e-06, + "loss": 0.6251, + "step": 2278 + }, + { + "epoch": 0.29, + "grad_norm": 0.8527749737790213, + "learning_rate": 9.656152216019197e-06, + "loss": 0.6788, + "step": 2279 + }, + { + "epoch": 0.29, + "grad_norm": 0.8828743894008848, + "learning_rate": 9.655776175058218e-06, + "loss": 0.6028, + "step": 2280 + }, + { + "epoch": 0.29, + "grad_norm": 0.823956024995578, + "learning_rate": 9.655399935915728e-06, + "loss": 0.6078, + "step": 2281 + }, + { + "epoch": 0.29, + "grad_norm": 0.925549441991507, + "learning_rate": 9.655023498607736e-06, + "loss": 0.6605, + "step": 2282 + }, + { + "epoch": 0.29, + "grad_norm": 0.6801753009251831, + "learning_rate": 9.654646863150271e-06, + "loss": 0.568, + "step": 2283 + }, + { + "epoch": 0.29, + "grad_norm": 0.7717913263725459, + "learning_rate": 9.654270029559362e-06, + "loss": 0.6152, + "step": 2284 + }, + { + "epoch": 0.29, + "grad_norm": 0.8557536389595195, + "learning_rate": 9.653892997851052e-06, + "loss": 0.6703, + "step": 2285 + }, + { + "epoch": 0.29, + "grad_norm": 0.8793890657596471, + "learning_rate": 9.653515768041388e-06, + "loss": 0.6016, + "step": 2286 + }, + { + "epoch": 0.29, + "grad_norm": 0.6443234170989017, + "learning_rate": 9.653138340146429e-06, + "loss": 0.5274, + "step": 2287 + }, + { + "epoch": 0.29, + "grad_norm": 0.8172794242502196, + "learning_rate": 9.65276071418224e-06, + "loss": 0.5886, + "step": 2288 + }, + { + "epoch": 0.29, + "grad_norm": 0.686034579986197, + "learning_rate": 9.652382890164895e-06, + "loss": 0.5846, + "step": 2289 + }, + { + "epoch": 0.29, + "grad_norm": 0.8096073641913435, + "learning_rate": 9.652004868110477e-06, + "loss": 0.5808, + "step": 2290 + }, + { + "epoch": 0.29, + "grad_norm": 0.624890273928074, + "learning_rate": 9.65162664803508e-06, + "loss": 0.5363, + "step": 2291 + }, + { + "epoch": 0.29, + "grad_norm": 0.6749908950627436, + "learning_rate": 9.651248229954798e-06, + "loss": 0.6828, + "step": 2292 + }, + { + "epoch": 0.29, + "grad_norm": 0.8880591936931553, + "learning_rate": 9.650869613885742e-06, + "loss": 0.6491, + "step": 2293 + }, + { + "epoch": 0.29, + "grad_norm": 0.8159750356945049, + "learning_rate": 9.65049079984403e-06, + "loss": 0.6944, + "step": 2294 + }, + { + "epoch": 0.29, + "grad_norm": 0.7418131176842386, + "learning_rate": 9.650111787845784e-06, + "loss": 0.6, + "step": 2295 + }, + { + "epoch": 0.29, + "grad_norm": 1.0461700823998974, + "learning_rate": 9.64973257790714e-06, + "loss": 0.5587, + "step": 2296 + }, + { + "epoch": 0.29, + "grad_norm": 0.6101097244691224, + "learning_rate": 9.64935317004424e-06, + "loss": 0.5083, + "step": 2297 + }, + { + "epoch": 0.29, + "grad_norm": 1.275132176881301, + "learning_rate": 9.648973564273232e-06, + "loss": 0.5649, + "step": 2298 + }, + { + "epoch": 0.29, + "grad_norm": 0.7032765826619104, + "learning_rate": 9.648593760610274e-06, + "loss": 0.6405, + "step": 2299 + }, + { + "epoch": 0.29, + "grad_norm": 0.6759865672860612, + "learning_rate": 9.648213759071536e-06, + "loss": 0.4923, + "step": 2300 + }, + { + "epoch": 0.29, + "grad_norm": 0.8516325001087683, + "learning_rate": 9.64783355967319e-06, + "loss": 0.5733, + "step": 2301 + }, + { + "epoch": 0.29, + "grad_norm": 1.0391152789607287, + "learning_rate": 9.647453162431422e-06, + "loss": 0.6847, + "step": 2302 + }, + { + "epoch": 0.29, + "grad_norm": 0.7283393465124091, + "learning_rate": 9.647072567362426e-06, + "loss": 0.6249, + "step": 2303 + }, + { + "epoch": 0.29, + "grad_norm": 0.5795741497487102, + "learning_rate": 9.6466917744824e-06, + "loss": 0.548, + "step": 2304 + }, + { + "epoch": 0.29, + "grad_norm": 0.6297529108765231, + "learning_rate": 9.646310783807552e-06, + "loss": 0.5269, + "step": 2305 + }, + { + "epoch": 0.29, + "grad_norm": 0.7688890632750403, + "learning_rate": 9.645929595354101e-06, + "loss": 0.583, + "step": 2306 + }, + { + "epoch": 0.29, + "grad_norm": 0.5647579057091476, + "learning_rate": 9.645548209138277e-06, + "loss": 0.4726, + "step": 2307 + }, + { + "epoch": 0.29, + "grad_norm": 0.6696597283297824, + "learning_rate": 9.645166625176308e-06, + "loss": 0.5587, + "step": 2308 + }, + { + "epoch": 0.29, + "grad_norm": 0.5912881255480464, + "learning_rate": 9.64478484348444e-06, + "loss": 0.5827, + "step": 2309 + }, + { + "epoch": 0.29, + "grad_norm": 0.9013934620363223, + "learning_rate": 9.644402864078925e-06, + "loss": 0.6017, + "step": 2310 + }, + { + "epoch": 0.29, + "grad_norm": 0.8091548399589423, + "learning_rate": 9.644020686976022e-06, + "loss": 0.5655, + "step": 2311 + }, + { + "epoch": 0.29, + "grad_norm": 0.6815593571986776, + "learning_rate": 9.643638312191996e-06, + "loss": 0.5179, + "step": 2312 + }, + { + "epoch": 0.29, + "grad_norm": 0.6353061674175475, + "learning_rate": 9.643255739743128e-06, + "loss": 0.5628, + "step": 2313 + }, + { + "epoch": 0.29, + "grad_norm": 0.5980690191909007, + "learning_rate": 9.6428729696457e-06, + "loss": 0.5004, + "step": 2314 + }, + { + "epoch": 0.29, + "grad_norm": 0.5907665810638241, + "learning_rate": 9.642490001916004e-06, + "loss": 0.584, + "step": 2315 + }, + { + "epoch": 0.3, + "grad_norm": 0.6795787447301739, + "learning_rate": 9.642106836570349e-06, + "loss": 0.5571, + "step": 2316 + }, + { + "epoch": 0.3, + "grad_norm": 0.6581148947684329, + "learning_rate": 9.641723473625036e-06, + "loss": 0.5441, + "step": 2317 + }, + { + "epoch": 0.3, + "grad_norm": 0.6025978028439085, + "learning_rate": 9.64133991309639e-06, + "loss": 0.5222, + "step": 2318 + }, + { + "epoch": 0.3, + "grad_norm": 0.6102422844649658, + "learning_rate": 9.640956155000734e-06, + "loss": 0.4969, + "step": 2319 + }, + { + "epoch": 0.3, + "grad_norm": 0.5951995875521914, + "learning_rate": 9.640572199354404e-06, + "loss": 0.4884, + "step": 2320 + }, + { + "epoch": 0.3, + "grad_norm": 0.8117996206937678, + "learning_rate": 9.640188046173746e-06, + "loss": 0.6194, + "step": 2321 + }, + { + "epoch": 0.3, + "grad_norm": 0.569777508360132, + "learning_rate": 9.639803695475111e-06, + "loss": 0.501, + "step": 2322 + }, + { + "epoch": 0.3, + "grad_norm": 0.7353086269939838, + "learning_rate": 9.639419147274858e-06, + "loss": 0.5792, + "step": 2323 + }, + { + "epoch": 0.3, + "grad_norm": 0.8305030844985611, + "learning_rate": 9.639034401589359e-06, + "loss": 0.6425, + "step": 2324 + }, + { + "epoch": 0.3, + "grad_norm": 0.6875012573272089, + "learning_rate": 9.638649458434989e-06, + "loss": 0.4913, + "step": 2325 + }, + { + "epoch": 0.3, + "grad_norm": 0.6104260420512554, + "learning_rate": 9.638264317828135e-06, + "loss": 0.5186, + "step": 2326 + }, + { + "epoch": 0.3, + "grad_norm": 0.8513012735792077, + "learning_rate": 9.63787897978519e-06, + "loss": 0.6214, + "step": 2327 + }, + { + "epoch": 0.3, + "grad_norm": 0.8140359431336607, + "learning_rate": 9.637493444322557e-06, + "loss": 0.6138, + "step": 2328 + }, + { + "epoch": 0.3, + "grad_norm": 0.9359810761278444, + "learning_rate": 9.63710771145665e-06, + "loss": 0.5982, + "step": 2329 + }, + { + "epoch": 0.3, + "grad_norm": 0.7033344549949977, + "learning_rate": 9.636721781203882e-06, + "loss": 0.5665, + "step": 2330 + }, + { + "epoch": 0.3, + "grad_norm": 0.712401301102582, + "learning_rate": 9.636335653580687e-06, + "loss": 0.5778, + "step": 2331 + }, + { + "epoch": 0.3, + "grad_norm": 0.6295356997241944, + "learning_rate": 9.635949328603499e-06, + "loss": 0.5391, + "step": 2332 + }, + { + "epoch": 0.3, + "grad_norm": 0.7082610788110011, + "learning_rate": 9.635562806288763e-06, + "loss": 0.6003, + "step": 2333 + }, + { + "epoch": 0.3, + "grad_norm": 0.5721241947970244, + "learning_rate": 9.635176086652929e-06, + "loss": 0.5039, + "step": 2334 + }, + { + "epoch": 0.3, + "grad_norm": 2.3745115336996983, + "learning_rate": 9.634789169712461e-06, + "loss": 0.6207, + "step": 2335 + }, + { + "epoch": 0.3, + "grad_norm": 0.9141369306134102, + "learning_rate": 9.634402055483832e-06, + "loss": 0.608, + "step": 2336 + }, + { + "epoch": 0.3, + "grad_norm": 0.6391256201758749, + "learning_rate": 9.634014743983513e-06, + "loss": 0.548, + "step": 2337 + }, + { + "epoch": 0.3, + "grad_norm": 0.6602876220896754, + "learning_rate": 9.633627235227998e-06, + "loss": 0.5919, + "step": 2338 + }, + { + "epoch": 0.3, + "grad_norm": 0.7155167772219896, + "learning_rate": 9.633239529233776e-06, + "loss": 0.4985, + "step": 2339 + }, + { + "epoch": 0.3, + "grad_norm": 0.7546479779069812, + "learning_rate": 9.632851626017355e-06, + "loss": 0.6248, + "step": 2340 + }, + { + "epoch": 0.3, + "grad_norm": 0.9036446272415479, + "learning_rate": 9.632463525595243e-06, + "loss": 0.6682, + "step": 2341 + }, + { + "epoch": 0.3, + "grad_norm": 0.692489419858691, + "learning_rate": 9.632075227983963e-06, + "loss": 0.5552, + "step": 2342 + }, + { + "epoch": 0.3, + "grad_norm": 0.6542026424178569, + "learning_rate": 9.63168673320004e-06, + "loss": 0.5592, + "step": 2343 + }, + { + "epoch": 0.3, + "grad_norm": 0.9882606063086841, + "learning_rate": 9.631298041260018e-06, + "loss": 0.6924, + "step": 2344 + }, + { + "epoch": 0.3, + "grad_norm": 1.102230759389261, + "learning_rate": 9.630909152180434e-06, + "loss": 0.6162, + "step": 2345 + }, + { + "epoch": 0.3, + "grad_norm": 0.7644577021931153, + "learning_rate": 9.63052006597785e-06, + "loss": 0.5817, + "step": 2346 + }, + { + "epoch": 0.3, + "grad_norm": 0.716617409334602, + "learning_rate": 9.630130782668818e-06, + "loss": 0.5892, + "step": 2347 + }, + { + "epoch": 0.3, + "grad_norm": 0.9729341324652461, + "learning_rate": 9.629741302269918e-06, + "loss": 0.6403, + "step": 2348 + }, + { + "epoch": 0.3, + "grad_norm": 0.6853664457080397, + "learning_rate": 9.629351624797725e-06, + "loss": 0.6067, + "step": 2349 + }, + { + "epoch": 0.3, + "grad_norm": 0.8342759285622677, + "learning_rate": 9.628961750268825e-06, + "loss": 0.632, + "step": 2350 + }, + { + "epoch": 0.3, + "grad_norm": 0.7689119065965148, + "learning_rate": 9.628571678699818e-06, + "loss": 0.573, + "step": 2351 + }, + { + "epoch": 0.3, + "grad_norm": 0.858385414084931, + "learning_rate": 9.628181410107305e-06, + "loss": 0.7072, + "step": 2352 + }, + { + "epoch": 0.3, + "grad_norm": 0.7249149251190334, + "learning_rate": 9.627790944507898e-06, + "loss": 0.5645, + "step": 2353 + }, + { + "epoch": 0.3, + "grad_norm": 0.6217419898343416, + "learning_rate": 9.627400281918218e-06, + "loss": 0.5539, + "step": 2354 + }, + { + "epoch": 0.3, + "grad_norm": 0.7610052903814447, + "learning_rate": 9.627009422354896e-06, + "loss": 0.6443, + "step": 2355 + }, + { + "epoch": 0.3, + "grad_norm": 0.9250303565161557, + "learning_rate": 9.626618365834568e-06, + "loss": 0.5487, + "step": 2356 + }, + { + "epoch": 0.3, + "grad_norm": 0.8242894584329374, + "learning_rate": 9.62622711237388e-06, + "loss": 0.6333, + "step": 2357 + }, + { + "epoch": 0.3, + "grad_norm": 0.8640955393357904, + "learning_rate": 9.62583566198949e-06, + "loss": 0.6112, + "step": 2358 + }, + { + "epoch": 0.3, + "grad_norm": 0.5736128865275786, + "learning_rate": 9.625444014698056e-06, + "loss": 0.5134, + "step": 2359 + }, + { + "epoch": 0.3, + "grad_norm": 0.8840592830174363, + "learning_rate": 9.62505217051625e-06, + "loss": 0.6405, + "step": 2360 + }, + { + "epoch": 0.3, + "grad_norm": 0.782897049526451, + "learning_rate": 9.624660129460756e-06, + "loss": 0.6289, + "step": 2361 + }, + { + "epoch": 0.3, + "grad_norm": 0.7876266615541588, + "learning_rate": 9.624267891548257e-06, + "loss": 0.6221, + "step": 2362 + }, + { + "epoch": 0.3, + "grad_norm": 0.9185263520658862, + "learning_rate": 9.62387545679545e-06, + "loss": 0.691, + "step": 2363 + }, + { + "epoch": 0.3, + "grad_norm": 0.6542667272945265, + "learning_rate": 9.623482825219041e-06, + "loss": 0.5247, + "step": 2364 + }, + { + "epoch": 0.3, + "grad_norm": 0.6180807685369759, + "learning_rate": 9.623089996835744e-06, + "loss": 0.5468, + "step": 2365 + }, + { + "epoch": 0.3, + "grad_norm": 0.713283927268299, + "learning_rate": 9.622696971662278e-06, + "loss": 0.6307, + "step": 2366 + }, + { + "epoch": 0.3, + "grad_norm": 0.87003445307037, + "learning_rate": 9.622303749715375e-06, + "loss": 0.6911, + "step": 2367 + }, + { + "epoch": 0.3, + "grad_norm": 0.94623684327798, + "learning_rate": 9.621910331011769e-06, + "loss": 0.6401, + "step": 2368 + }, + { + "epoch": 0.3, + "grad_norm": 0.7600855773772919, + "learning_rate": 9.621516715568212e-06, + "loss": 0.5957, + "step": 2369 + }, + { + "epoch": 0.3, + "grad_norm": 0.874117530399109, + "learning_rate": 9.621122903401457e-06, + "loss": 0.6094, + "step": 2370 + }, + { + "epoch": 0.3, + "grad_norm": 0.8481734501936808, + "learning_rate": 9.620728894528266e-06, + "loss": 0.6145, + "step": 2371 + }, + { + "epoch": 0.3, + "grad_norm": 0.7617368364319198, + "learning_rate": 9.620334688965411e-06, + "loss": 0.6298, + "step": 2372 + }, + { + "epoch": 0.3, + "grad_norm": 0.7728250496801723, + "learning_rate": 9.619940286729674e-06, + "loss": 0.5981, + "step": 2373 + }, + { + "epoch": 0.3, + "grad_norm": 0.6006432571031343, + "learning_rate": 9.619545687837843e-06, + "loss": 0.5815, + "step": 2374 + }, + { + "epoch": 0.3, + "grad_norm": 0.7252849051821438, + "learning_rate": 9.619150892306713e-06, + "loss": 0.6159, + "step": 2375 + }, + { + "epoch": 0.3, + "grad_norm": 0.6607678635799454, + "learning_rate": 9.618755900153091e-06, + "loss": 0.5698, + "step": 2376 + }, + { + "epoch": 0.3, + "grad_norm": 0.594289771715513, + "learning_rate": 9.618360711393789e-06, + "loss": 0.5386, + "step": 2377 + }, + { + "epoch": 0.3, + "grad_norm": 0.6907339889550254, + "learning_rate": 9.61796532604563e-06, + "loss": 0.57, + "step": 2378 + }, + { + "epoch": 0.3, + "grad_norm": 0.6174250140212003, + "learning_rate": 9.617569744125443e-06, + "loss": 0.5633, + "step": 2379 + }, + { + "epoch": 0.3, + "grad_norm": 0.8122324273378344, + "learning_rate": 9.617173965650068e-06, + "loss": 0.6107, + "step": 2380 + }, + { + "epoch": 0.3, + "grad_norm": 1.092650696850718, + "learning_rate": 9.616777990636353e-06, + "loss": 0.6218, + "step": 2381 + }, + { + "epoch": 0.3, + "grad_norm": 0.6128014853222221, + "learning_rate": 9.616381819101151e-06, + "loss": 0.5706, + "step": 2382 + }, + { + "epoch": 0.3, + "grad_norm": 0.5657710730165172, + "learning_rate": 9.615985451061327e-06, + "loss": 0.4872, + "step": 2383 + }, + { + "epoch": 0.3, + "grad_norm": 0.9024634261132601, + "learning_rate": 9.615588886533753e-06, + "loss": 0.6439, + "step": 2384 + }, + { + "epoch": 0.3, + "grad_norm": 0.6598051106064764, + "learning_rate": 9.615192125535308e-06, + "loss": 0.5426, + "step": 2385 + }, + { + "epoch": 0.3, + "grad_norm": 0.941218753522489, + "learning_rate": 9.614795168082885e-06, + "loss": 0.6302, + "step": 2386 + }, + { + "epoch": 0.3, + "grad_norm": 0.6969140643920829, + "learning_rate": 9.614398014193376e-06, + "loss": 0.5691, + "step": 2387 + }, + { + "epoch": 0.3, + "grad_norm": 0.6307159750972525, + "learning_rate": 9.614000663883692e-06, + "loss": 0.5168, + "step": 2388 + }, + { + "epoch": 0.3, + "grad_norm": 0.7211489461599196, + "learning_rate": 9.61360311717074e-06, + "loss": 0.6418, + "step": 2389 + }, + { + "epoch": 0.3, + "grad_norm": 0.6542520301640792, + "learning_rate": 9.613205374071449e-06, + "loss": 0.5681, + "step": 2390 + }, + { + "epoch": 0.3, + "grad_norm": 0.7676464715721771, + "learning_rate": 9.612807434602747e-06, + "loss": 0.567, + "step": 2391 + }, + { + "epoch": 0.3, + "grad_norm": 0.6810222458656829, + "learning_rate": 9.612409298781571e-06, + "loss": 0.5391, + "step": 2392 + }, + { + "epoch": 0.3, + "grad_norm": 0.6421619680166654, + "learning_rate": 9.612010966624871e-06, + "loss": 0.5632, + "step": 2393 + }, + { + "epoch": 0.3, + "grad_norm": 0.6334382080486921, + "learning_rate": 9.611612438149604e-06, + "loss": 0.5109, + "step": 2394 + }, + { + "epoch": 0.31, + "grad_norm": 0.6000212034183585, + "learning_rate": 9.61121371337273e-06, + "loss": 0.5949, + "step": 2395 + }, + { + "epoch": 0.31, + "grad_norm": 0.6461431312045264, + "learning_rate": 9.610814792311223e-06, + "loss": 0.5368, + "step": 2396 + }, + { + "epoch": 0.31, + "grad_norm": 0.7603082067529316, + "learning_rate": 9.610415674982066e-06, + "loss": 0.6449, + "step": 2397 + }, + { + "epoch": 0.31, + "grad_norm": 0.8991449463517307, + "learning_rate": 9.610016361402246e-06, + "loss": 0.6748, + "step": 2398 + }, + { + "epoch": 0.31, + "grad_norm": 0.6349684387182754, + "learning_rate": 9.60961685158876e-06, + "loss": 0.5669, + "step": 2399 + }, + { + "epoch": 0.31, + "grad_norm": 0.6248834326375421, + "learning_rate": 9.609217145558617e-06, + "loss": 0.5188, + "step": 2400 + }, + { + "epoch": 0.31, + "grad_norm": 1.2104698116864234, + "learning_rate": 9.608817243328827e-06, + "loss": 0.7036, + "step": 2401 + }, + { + "epoch": 0.31, + "grad_norm": 0.8889571647654392, + "learning_rate": 9.608417144916417e-06, + "loss": 0.6893, + "step": 2402 + }, + { + "epoch": 0.31, + "grad_norm": 0.6054003487784059, + "learning_rate": 9.608016850338413e-06, + "loss": 0.5112, + "step": 2403 + }, + { + "epoch": 0.31, + "grad_norm": 0.704592851394262, + "learning_rate": 9.60761635961186e-06, + "loss": 0.5493, + "step": 2404 + }, + { + "epoch": 0.31, + "grad_norm": 0.551236892913067, + "learning_rate": 9.607215672753799e-06, + "loss": 0.4779, + "step": 2405 + }, + { + "epoch": 0.31, + "grad_norm": 0.552987497017455, + "learning_rate": 9.60681478978129e-06, + "loss": 0.5193, + "step": 2406 + }, + { + "epoch": 0.31, + "grad_norm": 0.5834679274972107, + "learning_rate": 9.606413710711398e-06, + "loss": 0.5557, + "step": 2407 + }, + { + "epoch": 0.31, + "grad_norm": 0.8429905180044114, + "learning_rate": 9.606012435561194e-06, + "loss": 0.6261, + "step": 2408 + }, + { + "epoch": 0.31, + "grad_norm": 0.5815754742957093, + "learning_rate": 9.605610964347758e-06, + "loss": 0.5468, + "step": 2409 + }, + { + "epoch": 0.31, + "grad_norm": 0.5785602730013256, + "learning_rate": 9.605209297088182e-06, + "loss": 0.546, + "step": 2410 + }, + { + "epoch": 0.31, + "grad_norm": 0.7059173564552337, + "learning_rate": 9.604807433799563e-06, + "loss": 0.583, + "step": 2411 + }, + { + "epoch": 0.31, + "grad_norm": 0.5967318636854467, + "learning_rate": 9.604405374499003e-06, + "loss": 0.552, + "step": 2412 + }, + { + "epoch": 0.31, + "grad_norm": 0.7672505433207766, + "learning_rate": 9.604003119203624e-06, + "loss": 0.5824, + "step": 2413 + }, + { + "epoch": 0.31, + "grad_norm": 0.6569396385308869, + "learning_rate": 9.603600667930542e-06, + "loss": 0.5255, + "step": 2414 + }, + { + "epoch": 0.31, + "grad_norm": 0.7784896520611584, + "learning_rate": 9.603198020696892e-06, + "loss": 0.5965, + "step": 2415 + }, + { + "epoch": 0.31, + "grad_norm": 0.5673147912890423, + "learning_rate": 9.60279517751981e-06, + "loss": 0.497, + "step": 2416 + }, + { + "epoch": 0.31, + "grad_norm": 0.6463402705649699, + "learning_rate": 9.602392138416447e-06, + "loss": 0.5272, + "step": 2417 + }, + { + "epoch": 0.31, + "grad_norm": 0.6907065927904141, + "learning_rate": 9.601988903403958e-06, + "loss": 0.5938, + "step": 2418 + }, + { + "epoch": 0.31, + "grad_norm": 0.8793334566508969, + "learning_rate": 9.601585472499508e-06, + "loss": 0.6642, + "step": 2419 + }, + { + "epoch": 0.31, + "grad_norm": 0.7844761009137756, + "learning_rate": 9.601181845720268e-06, + "loss": 0.5851, + "step": 2420 + }, + { + "epoch": 0.31, + "grad_norm": 0.67963826962421, + "learning_rate": 9.60077802308342e-06, + "loss": 0.5639, + "step": 2421 + }, + { + "epoch": 0.31, + "grad_norm": 0.8693147114331978, + "learning_rate": 9.600374004606153e-06, + "loss": 0.5973, + "step": 2422 + }, + { + "epoch": 0.31, + "grad_norm": 0.6078543400729967, + "learning_rate": 9.599969790305667e-06, + "loss": 0.5417, + "step": 2423 + }, + { + "epoch": 0.31, + "grad_norm": 0.8805026740278471, + "learning_rate": 9.599565380199164e-06, + "loss": 0.6367, + "step": 2424 + }, + { + "epoch": 0.31, + "grad_norm": 0.5795579343765204, + "learning_rate": 9.599160774303863e-06, + "loss": 0.505, + "step": 2425 + }, + { + "epoch": 0.31, + "grad_norm": 0.8687021496402021, + "learning_rate": 9.598755972636983e-06, + "loss": 0.6368, + "step": 2426 + }, + { + "epoch": 0.31, + "grad_norm": 0.7197724942923636, + "learning_rate": 9.598350975215757e-06, + "loss": 0.5701, + "step": 2427 + }, + { + "epoch": 0.31, + "grad_norm": 0.723969890496662, + "learning_rate": 9.597945782057427e-06, + "loss": 0.6485, + "step": 2428 + }, + { + "epoch": 0.31, + "grad_norm": 0.7725077900043601, + "learning_rate": 9.597540393179235e-06, + "loss": 0.6412, + "step": 2429 + }, + { + "epoch": 0.31, + "grad_norm": 0.8068132619636262, + "learning_rate": 9.59713480859844e-06, + "loss": 0.7052, + "step": 2430 + }, + { + "epoch": 0.31, + "grad_norm": 0.624133680104771, + "learning_rate": 9.596729028332309e-06, + "loss": 0.5699, + "step": 2431 + }, + { + "epoch": 0.31, + "grad_norm": 0.7688880035065165, + "learning_rate": 9.596323052398112e-06, + "loss": 0.6272, + "step": 2432 + }, + { + "epoch": 0.31, + "grad_norm": 0.8384046807259603, + "learning_rate": 9.595916880813127e-06, + "loss": 0.6034, + "step": 2433 + }, + { + "epoch": 0.31, + "grad_norm": 0.769858270127807, + "learning_rate": 9.59551051359465e-06, + "loss": 0.5621, + "step": 2434 + }, + { + "epoch": 0.31, + "grad_norm": 0.6400763534881783, + "learning_rate": 9.595103950759974e-06, + "loss": 0.5397, + "step": 2435 + }, + { + "epoch": 0.31, + "grad_norm": 0.8557673589892321, + "learning_rate": 9.594697192326408e-06, + "loss": 0.5849, + "step": 2436 + }, + { + "epoch": 0.31, + "grad_norm": 0.6461724804678096, + "learning_rate": 9.594290238311264e-06, + "loss": 0.5297, + "step": 2437 + }, + { + "epoch": 0.31, + "grad_norm": 0.9378438918064688, + "learning_rate": 9.593883088731866e-06, + "loss": 0.6056, + "step": 2438 + }, + { + "epoch": 0.31, + "grad_norm": 0.8683897724391, + "learning_rate": 9.593475743605546e-06, + "loss": 0.6301, + "step": 2439 + }, + { + "epoch": 0.31, + "grad_norm": 0.8675075549901747, + "learning_rate": 9.593068202949642e-06, + "loss": 0.6534, + "step": 2440 + }, + { + "epoch": 0.31, + "grad_norm": 0.6497677519019572, + "learning_rate": 9.5926604667815e-06, + "loss": 0.5513, + "step": 2441 + }, + { + "epoch": 0.31, + "grad_norm": 0.6224714071357573, + "learning_rate": 9.59225253511848e-06, + "loss": 0.5239, + "step": 2442 + }, + { + "epoch": 0.31, + "grad_norm": 0.8110296456715863, + "learning_rate": 9.591844407977944e-06, + "loss": 0.6376, + "step": 2443 + }, + { + "epoch": 0.31, + "grad_norm": 0.5785413804034262, + "learning_rate": 9.591436085377263e-06, + "loss": 0.5523, + "step": 2444 + }, + { + "epoch": 0.31, + "grad_norm": 0.9300910909493508, + "learning_rate": 9.591027567333822e-06, + "loss": 0.6671, + "step": 2445 + }, + { + "epoch": 0.31, + "grad_norm": 0.7967115344511013, + "learning_rate": 9.590618853865008e-06, + "loss": 0.6202, + "step": 2446 + }, + { + "epoch": 0.31, + "grad_norm": 0.7293896493512114, + "learning_rate": 9.590209944988218e-06, + "loss": 0.5719, + "step": 2447 + }, + { + "epoch": 0.31, + "grad_norm": 0.6017138154585803, + "learning_rate": 9.58980084072086e-06, + "loss": 0.5371, + "step": 2448 + }, + { + "epoch": 0.31, + "grad_norm": 0.6763495265701278, + "learning_rate": 9.589391541080346e-06, + "loss": 0.5398, + "step": 2449 + }, + { + "epoch": 0.31, + "grad_norm": 0.633634695181765, + "learning_rate": 9.588982046084101e-06, + "loss": 0.5232, + "step": 2450 + }, + { + "epoch": 0.31, + "grad_norm": 0.5540877292722046, + "learning_rate": 9.588572355749555e-06, + "loss": 0.5284, + "step": 2451 + }, + { + "epoch": 0.31, + "grad_norm": 0.9761110237084909, + "learning_rate": 9.588162470094145e-06, + "loss": 0.665, + "step": 2452 + }, + { + "epoch": 0.31, + "grad_norm": 1.223077998431946, + "learning_rate": 9.58775238913532e-06, + "loss": 0.6697, + "step": 2453 + }, + { + "epoch": 0.31, + "grad_norm": 0.5979268632542212, + "learning_rate": 9.587342112890539e-06, + "loss": 0.5319, + "step": 2454 + }, + { + "epoch": 0.31, + "grad_norm": 0.7510362212559786, + "learning_rate": 9.586931641377262e-06, + "loss": 0.6268, + "step": 2455 + }, + { + "epoch": 0.31, + "grad_norm": 0.7176335642595973, + "learning_rate": 9.58652097461296e-06, + "loss": 0.5559, + "step": 2456 + }, + { + "epoch": 0.31, + "grad_norm": 0.6989847542259999, + "learning_rate": 9.58611011261512e-06, + "loss": 0.5358, + "step": 2457 + }, + { + "epoch": 0.31, + "grad_norm": 0.8864963162315552, + "learning_rate": 9.585699055401226e-06, + "loss": 0.6491, + "step": 2458 + }, + { + "epoch": 0.31, + "grad_norm": 0.6172125182878831, + "learning_rate": 9.585287802988778e-06, + "loss": 0.5386, + "step": 2459 + }, + { + "epoch": 0.31, + "grad_norm": 0.7234782866031692, + "learning_rate": 9.58487635539528e-06, + "loss": 0.5292, + "step": 2460 + }, + { + "epoch": 0.31, + "grad_norm": 0.6549233588633315, + "learning_rate": 9.584464712638245e-06, + "loss": 0.5821, + "step": 2461 + }, + { + "epoch": 0.31, + "grad_norm": 0.7295733671228533, + "learning_rate": 9.584052874735201e-06, + "loss": 0.6253, + "step": 2462 + }, + { + "epoch": 0.31, + "grad_norm": 0.5980296759480169, + "learning_rate": 9.583640841703672e-06, + "loss": 0.4917, + "step": 2463 + }, + { + "epoch": 0.31, + "grad_norm": 0.6224250812587412, + "learning_rate": 9.5832286135612e-06, + "loss": 0.5461, + "step": 2464 + }, + { + "epoch": 0.31, + "grad_norm": 0.6684864407034281, + "learning_rate": 9.582816190325333e-06, + "loss": 0.5555, + "step": 2465 + }, + { + "epoch": 0.31, + "grad_norm": 0.6233339430704379, + "learning_rate": 9.582403572013623e-06, + "loss": 0.5446, + "step": 2466 + }, + { + "epoch": 0.31, + "grad_norm": 0.7842371818556565, + "learning_rate": 9.58199075864364e-06, + "loss": 0.6205, + "step": 2467 + }, + { + "epoch": 0.31, + "grad_norm": 0.7908366264740696, + "learning_rate": 9.581577750232948e-06, + "loss": 0.5679, + "step": 2468 + }, + { + "epoch": 0.31, + "grad_norm": 0.7666704393633019, + "learning_rate": 9.581164546799135e-06, + "loss": 0.6375, + "step": 2469 + }, + { + "epoch": 0.31, + "grad_norm": 0.5828591565212466, + "learning_rate": 9.580751148359785e-06, + "loss": 0.5457, + "step": 2470 + }, + { + "epoch": 0.31, + "grad_norm": 0.6463208179192446, + "learning_rate": 9.580337554932497e-06, + "loss": 0.6051, + "step": 2471 + }, + { + "epoch": 0.31, + "grad_norm": 0.6733956994877681, + "learning_rate": 9.579923766534875e-06, + "loss": 0.5557, + "step": 2472 + }, + { + "epoch": 0.32, + "grad_norm": 0.7015158626325191, + "learning_rate": 9.579509783184535e-06, + "loss": 0.6169, + "step": 2473 + }, + { + "epoch": 0.32, + "grad_norm": 0.5563294339181992, + "learning_rate": 9.579095604899097e-06, + "loss": 0.5075, + "step": 2474 + }, + { + "epoch": 0.32, + "grad_norm": 0.9959362943126006, + "learning_rate": 9.578681231696191e-06, + "loss": 0.6288, + "step": 2475 + }, + { + "epoch": 0.32, + "grad_norm": 0.7535498086676388, + "learning_rate": 9.578266663593458e-06, + "loss": 0.7042, + "step": 2476 + }, + { + "epoch": 0.32, + "grad_norm": 0.825513303560254, + "learning_rate": 9.577851900608541e-06, + "loss": 0.6261, + "step": 2477 + }, + { + "epoch": 0.32, + "grad_norm": 0.7635947749024156, + "learning_rate": 9.5774369427591e-06, + "loss": 0.5794, + "step": 2478 + }, + { + "epoch": 0.32, + "grad_norm": 0.5658271771333826, + "learning_rate": 9.577021790062794e-06, + "loss": 0.4903, + "step": 2479 + }, + { + "epoch": 0.32, + "grad_norm": 0.7353477955814989, + "learning_rate": 9.576606442537297e-06, + "loss": 0.5989, + "step": 2480 + }, + { + "epoch": 0.32, + "grad_norm": 0.7766002116136786, + "learning_rate": 9.576190900200288e-06, + "loss": 0.6137, + "step": 2481 + }, + { + "epoch": 0.32, + "grad_norm": 0.889908014101899, + "learning_rate": 9.575775163069456e-06, + "loss": 0.6999, + "step": 2482 + }, + { + "epoch": 0.32, + "grad_norm": 0.9757343338918586, + "learning_rate": 9.575359231162497e-06, + "loss": 0.5649, + "step": 2483 + }, + { + "epoch": 0.32, + "grad_norm": 0.7849444011005536, + "learning_rate": 9.574943104497118e-06, + "loss": 0.6195, + "step": 2484 + }, + { + "epoch": 0.32, + "grad_norm": 0.6463259656534145, + "learning_rate": 9.574526783091029e-06, + "loss": 0.5361, + "step": 2485 + }, + { + "epoch": 0.32, + "grad_norm": 0.6617501805839066, + "learning_rate": 9.574110266961953e-06, + "loss": 0.5729, + "step": 2486 + }, + { + "epoch": 0.32, + "grad_norm": 0.7279993137004347, + "learning_rate": 9.573693556127618e-06, + "loss": 0.5579, + "step": 2487 + }, + { + "epoch": 0.32, + "grad_norm": 0.7996822044827662, + "learning_rate": 9.573276650605768e-06, + "loss": 0.6384, + "step": 2488 + }, + { + "epoch": 0.32, + "grad_norm": 0.5908133831685772, + "learning_rate": 9.572859550414143e-06, + "loss": 0.5495, + "step": 2489 + }, + { + "epoch": 0.32, + "grad_norm": 0.7043086949520345, + "learning_rate": 9.572442255570498e-06, + "loss": 0.5855, + "step": 2490 + }, + { + "epoch": 0.32, + "grad_norm": 0.8993169692043346, + "learning_rate": 9.5720247660926e-06, + "loss": 0.536, + "step": 2491 + }, + { + "epoch": 0.32, + "grad_norm": 0.7928167705587933, + "learning_rate": 9.571607081998216e-06, + "loss": 0.5852, + "step": 2492 + }, + { + "epoch": 0.32, + "grad_norm": 0.7593312595814335, + "learning_rate": 9.571189203305128e-06, + "loss": 0.6027, + "step": 2493 + }, + { + "epoch": 0.32, + "grad_norm": 0.6564232624986837, + "learning_rate": 9.570771130031123e-06, + "loss": 0.6041, + "step": 2494 + }, + { + "epoch": 0.32, + "grad_norm": 0.6221530793613381, + "learning_rate": 9.570352862193998e-06, + "loss": 0.5808, + "step": 2495 + }, + { + "epoch": 0.32, + "grad_norm": 0.7136315212099565, + "learning_rate": 9.569934399811556e-06, + "loss": 0.5321, + "step": 2496 + }, + { + "epoch": 0.32, + "grad_norm": 0.7963988455937888, + "learning_rate": 9.56951574290161e-06, + "loss": 0.6246, + "step": 2497 + }, + { + "epoch": 0.32, + "grad_norm": 1.014207828651746, + "learning_rate": 9.569096891481979e-06, + "loss": 0.624, + "step": 2498 + }, + { + "epoch": 0.32, + "grad_norm": 0.5695149473103667, + "learning_rate": 9.568677845570497e-06, + "loss": 0.5503, + "step": 2499 + }, + { + "epoch": 0.32, + "grad_norm": 0.5961049589529291, + "learning_rate": 9.568258605184996e-06, + "loss": 0.4863, + "step": 2500 + }, + { + "epoch": 0.32, + "grad_norm": 0.6857720938128696, + "learning_rate": 9.567839170343327e-06, + "loss": 0.5623, + "step": 2501 + }, + { + "epoch": 0.32, + "grad_norm": 0.8282032857138397, + "learning_rate": 9.567419541063338e-06, + "loss": 0.5572, + "step": 2502 + }, + { + "epoch": 0.32, + "grad_norm": 0.6893026749200307, + "learning_rate": 9.566999717362897e-06, + "loss": 0.5443, + "step": 2503 + }, + { + "epoch": 0.32, + "grad_norm": 0.7768239746749537, + "learning_rate": 9.56657969925987e-06, + "loss": 0.5686, + "step": 2504 + }, + { + "epoch": 0.32, + "grad_norm": 0.5597594429832988, + "learning_rate": 9.56615948677214e-06, + "loss": 0.5284, + "step": 2505 + }, + { + "epoch": 0.32, + "grad_norm": 0.7564168137120164, + "learning_rate": 9.565739079917591e-06, + "loss": 0.5774, + "step": 2506 + }, + { + "epoch": 0.32, + "grad_norm": 0.7641672679370525, + "learning_rate": 9.56531847871412e-06, + "loss": 0.6491, + "step": 2507 + }, + { + "epoch": 0.32, + "grad_norm": 0.9111016128120439, + "learning_rate": 9.56489768317963e-06, + "loss": 0.6339, + "step": 2508 + }, + { + "epoch": 0.32, + "grad_norm": 0.8567261134162234, + "learning_rate": 9.564476693332032e-06, + "loss": 0.6073, + "step": 2509 + }, + { + "epoch": 0.32, + "grad_norm": 0.8562793246556688, + "learning_rate": 9.564055509189247e-06, + "loss": 0.6335, + "step": 2510 + }, + { + "epoch": 0.32, + "grad_norm": 0.6186671603918202, + "learning_rate": 9.563634130769206e-06, + "loss": 0.5171, + "step": 2511 + }, + { + "epoch": 0.32, + "grad_norm": 0.639540957253105, + "learning_rate": 9.56321255808984e-06, + "loss": 0.6051, + "step": 2512 + }, + { + "epoch": 0.32, + "grad_norm": 0.6801078881261133, + "learning_rate": 9.5627907911691e-06, + "loss": 0.5157, + "step": 2513 + }, + { + "epoch": 0.32, + "grad_norm": 0.6373131560688663, + "learning_rate": 9.562368830024935e-06, + "loss": 0.4953, + "step": 2514 + }, + { + "epoch": 0.32, + "grad_norm": 0.7733377559227137, + "learning_rate": 9.561946674675308e-06, + "loss": 0.6336, + "step": 2515 + }, + { + "epoch": 0.32, + "grad_norm": 0.8001748975946251, + "learning_rate": 9.561524325138192e-06, + "loss": 0.657, + "step": 2516 + }, + { + "epoch": 0.32, + "grad_norm": 0.8574515862304499, + "learning_rate": 9.561101781431558e-06, + "loss": 0.6496, + "step": 2517 + }, + { + "epoch": 0.32, + "grad_norm": 0.8432913593456502, + "learning_rate": 9.5606790435734e-06, + "loss": 0.6368, + "step": 2518 + }, + { + "epoch": 0.32, + "grad_norm": 0.7313829825713054, + "learning_rate": 9.560256111581705e-06, + "loss": 0.604, + "step": 2519 + }, + { + "epoch": 0.32, + "grad_norm": 0.6259317550178952, + "learning_rate": 9.559832985474482e-06, + "loss": 0.594, + "step": 2520 + }, + { + "epoch": 0.32, + "grad_norm": 0.6732396527372642, + "learning_rate": 9.559409665269741e-06, + "loss": 0.5467, + "step": 2521 + }, + { + "epoch": 0.32, + "grad_norm": 0.9823587856496049, + "learning_rate": 9.5589861509855e-06, + "loss": 0.636, + "step": 2522 + }, + { + "epoch": 0.32, + "grad_norm": 0.6743790179134761, + "learning_rate": 9.558562442639785e-06, + "loss": 0.5974, + "step": 2523 + }, + { + "epoch": 0.32, + "grad_norm": 0.8780746135749471, + "learning_rate": 9.558138540250636e-06, + "loss": 0.6705, + "step": 2524 + }, + { + "epoch": 0.32, + "grad_norm": 0.8130945511228779, + "learning_rate": 9.557714443836093e-06, + "loss": 0.6641, + "step": 2525 + }, + { + "epoch": 0.32, + "grad_norm": 0.7906361142613465, + "learning_rate": 9.557290153414211e-06, + "loss": 0.636, + "step": 2526 + }, + { + "epoch": 0.32, + "grad_norm": 0.7101826950871989, + "learning_rate": 9.556865669003053e-06, + "loss": 0.5826, + "step": 2527 + }, + { + "epoch": 0.32, + "grad_norm": 0.6076279506595039, + "learning_rate": 9.556440990620682e-06, + "loss": 0.5123, + "step": 2528 + }, + { + "epoch": 0.32, + "grad_norm": 0.6120283623210143, + "learning_rate": 9.556016118285178e-06, + "loss": 0.5538, + "step": 2529 + }, + { + "epoch": 0.32, + "grad_norm": 0.7650111755183286, + "learning_rate": 9.555591052014629e-06, + "loss": 0.5438, + "step": 2530 + }, + { + "epoch": 0.32, + "grad_norm": 0.5349207680266397, + "learning_rate": 9.555165791827125e-06, + "loss": 0.5127, + "step": 2531 + }, + { + "epoch": 0.32, + "grad_norm": 0.7083003864694378, + "learning_rate": 9.55474033774077e-06, + "loss": 0.5487, + "step": 2532 + }, + { + "epoch": 0.32, + "grad_norm": 1.079547641280141, + "learning_rate": 9.554314689773674e-06, + "loss": 0.6343, + "step": 2533 + }, + { + "epoch": 0.32, + "grad_norm": 0.6465127203315632, + "learning_rate": 9.553888847943956e-06, + "loss": 0.5409, + "step": 2534 + }, + { + "epoch": 0.32, + "grad_norm": 0.6650303012598022, + "learning_rate": 9.55346281226974e-06, + "loss": 0.5893, + "step": 2535 + }, + { + "epoch": 0.32, + "grad_norm": 0.6110300011948008, + "learning_rate": 9.553036582769164e-06, + "loss": 0.4981, + "step": 2536 + }, + { + "epoch": 0.32, + "grad_norm": 0.8100895984954982, + "learning_rate": 9.552610159460369e-06, + "loss": 0.6186, + "step": 2537 + }, + { + "epoch": 0.32, + "grad_norm": 0.8179220677587817, + "learning_rate": 9.552183542361508e-06, + "loss": 0.5529, + "step": 2538 + }, + { + "epoch": 0.32, + "grad_norm": 0.5952520994167613, + "learning_rate": 9.551756731490742e-06, + "loss": 0.522, + "step": 2539 + }, + { + "epoch": 0.32, + "grad_norm": 0.8226784584113425, + "learning_rate": 9.551329726866235e-06, + "loss": 0.5815, + "step": 2540 + }, + { + "epoch": 0.32, + "grad_norm": 0.5704498044176846, + "learning_rate": 9.550902528506168e-06, + "loss": 0.4416, + "step": 2541 + }, + { + "epoch": 0.32, + "grad_norm": 0.604239828616303, + "learning_rate": 9.550475136428721e-06, + "loss": 0.5282, + "step": 2542 + }, + { + "epoch": 0.32, + "grad_norm": 0.649238403244341, + "learning_rate": 9.550047550652089e-06, + "loss": 0.5562, + "step": 2543 + }, + { + "epoch": 0.32, + "grad_norm": 0.6570463366363107, + "learning_rate": 9.549619771194472e-06, + "loss": 0.5762, + "step": 2544 + }, + { + "epoch": 0.32, + "grad_norm": 0.6351487994969179, + "learning_rate": 9.54919179807408e-06, + "loss": 0.5349, + "step": 2545 + }, + { + "epoch": 0.32, + "grad_norm": 0.648900949394083, + "learning_rate": 9.548763631309132e-06, + "loss": 0.5566, + "step": 2546 + }, + { + "epoch": 0.32, + "grad_norm": 0.5933008133983052, + "learning_rate": 9.54833527091785e-06, + "loss": 0.5216, + "step": 2547 + }, + { + "epoch": 0.32, + "grad_norm": 0.5900970115260598, + "learning_rate": 9.547906716918472e-06, + "loss": 0.4871, + "step": 2548 + }, + { + "epoch": 0.32, + "grad_norm": 0.6477875891595827, + "learning_rate": 9.547477969329235e-06, + "loss": 0.5577, + "step": 2549 + }, + { + "epoch": 0.32, + "grad_norm": 0.5294932860860582, + "learning_rate": 9.547049028168395e-06, + "loss": 0.4855, + "step": 2550 + }, + { + "epoch": 0.32, + "grad_norm": 0.7605284892461301, + "learning_rate": 9.546619893454208e-06, + "loss": 0.5526, + "step": 2551 + }, + { + "epoch": 0.33, + "grad_norm": 0.8821681184089893, + "learning_rate": 9.546190565204941e-06, + "loss": 0.6431, + "step": 2552 + }, + { + "epoch": 0.33, + "grad_norm": 0.5972036879598572, + "learning_rate": 9.545761043438868e-06, + "loss": 0.5477, + "step": 2553 + }, + { + "epoch": 0.33, + "grad_norm": 0.7774109891516569, + "learning_rate": 9.545331328174274e-06, + "loss": 0.6452, + "step": 2554 + }, + { + "epoch": 0.33, + "grad_norm": 0.559993123168119, + "learning_rate": 9.544901419429452e-06, + "loss": 0.4755, + "step": 2555 + }, + { + "epoch": 0.33, + "grad_norm": 0.9812318418974344, + "learning_rate": 9.544471317222696e-06, + "loss": 0.631, + "step": 2556 + }, + { + "epoch": 0.33, + "grad_norm": 0.5700560989878682, + "learning_rate": 9.54404102157232e-06, + "loss": 0.5152, + "step": 2557 + }, + { + "epoch": 0.33, + "grad_norm": 0.6060204128545266, + "learning_rate": 9.54361053249664e-06, + "loss": 0.48, + "step": 2558 + }, + { + "epoch": 0.33, + "grad_norm": 0.682067997060381, + "learning_rate": 9.543179850013978e-06, + "loss": 0.5346, + "step": 2559 + }, + { + "epoch": 0.33, + "grad_norm": 1.1084238206703774, + "learning_rate": 9.542748974142668e-06, + "loss": 0.6243, + "step": 2560 + }, + { + "epoch": 0.33, + "grad_norm": 0.8378037363051217, + "learning_rate": 9.542317904901049e-06, + "loss": 0.5972, + "step": 2561 + }, + { + "epoch": 0.33, + "grad_norm": 0.7135604533169227, + "learning_rate": 9.541886642307473e-06, + "loss": 0.6139, + "step": 2562 + }, + { + "epoch": 0.33, + "grad_norm": 0.6945523477457736, + "learning_rate": 9.541455186380297e-06, + "loss": 0.556, + "step": 2563 + }, + { + "epoch": 0.33, + "grad_norm": 0.8105865099203851, + "learning_rate": 9.541023537137885e-06, + "loss": 0.662, + "step": 2564 + }, + { + "epoch": 0.33, + "grad_norm": 0.9723262911446988, + "learning_rate": 9.540591694598615e-06, + "loss": 0.6237, + "step": 2565 + }, + { + "epoch": 0.33, + "grad_norm": 0.6284942090793638, + "learning_rate": 9.540159658780862e-06, + "loss": 0.5601, + "step": 2566 + }, + { + "epoch": 0.33, + "grad_norm": 0.7943621277394831, + "learning_rate": 9.539727429703024e-06, + "loss": 0.6296, + "step": 2567 + }, + { + "epoch": 0.33, + "grad_norm": 0.646599463677551, + "learning_rate": 9.539295007383496e-06, + "loss": 0.5127, + "step": 2568 + }, + { + "epoch": 0.33, + "grad_norm": 0.714891775660591, + "learning_rate": 9.538862391840683e-06, + "loss": 0.5886, + "step": 2569 + }, + { + "epoch": 0.33, + "grad_norm": 0.7683683299777343, + "learning_rate": 9.538429583093003e-06, + "loss": 0.6505, + "step": 2570 + }, + { + "epoch": 0.33, + "grad_norm": 0.8068531708286238, + "learning_rate": 9.537996581158878e-06, + "loss": 0.5425, + "step": 2571 + }, + { + "epoch": 0.33, + "grad_norm": 0.7633809149010669, + "learning_rate": 9.53756338605674e-06, + "loss": 0.5695, + "step": 2572 + }, + { + "epoch": 0.33, + "grad_norm": 0.5927862910594637, + "learning_rate": 9.53712999780503e-06, + "loss": 0.5093, + "step": 2573 + }, + { + "epoch": 0.33, + "grad_norm": 0.6329786480367147, + "learning_rate": 9.536696416422194e-06, + "loss": 0.533, + "step": 2574 + }, + { + "epoch": 0.33, + "grad_norm": 0.7403403535131283, + "learning_rate": 9.536262641926689e-06, + "loss": 0.6452, + "step": 2575 + }, + { + "epoch": 0.33, + "grad_norm": 0.5918314936786386, + "learning_rate": 9.535828674336976e-06, + "loss": 0.4818, + "step": 2576 + }, + { + "epoch": 0.33, + "grad_norm": 0.5683143465029358, + "learning_rate": 9.535394513671534e-06, + "loss": 0.4978, + "step": 2577 + }, + { + "epoch": 0.33, + "grad_norm": 1.233205772739789, + "learning_rate": 9.534960159948841e-06, + "loss": 0.6469, + "step": 2578 + }, + { + "epoch": 0.33, + "grad_norm": 0.9695389770572457, + "learning_rate": 9.534525613187383e-06, + "loss": 0.6816, + "step": 2579 + }, + { + "epoch": 0.33, + "grad_norm": 0.5381105391785254, + "learning_rate": 9.534090873405662e-06, + "loss": 0.511, + "step": 2580 + }, + { + "epoch": 0.33, + "grad_norm": 1.190502374810711, + "learning_rate": 9.53365594062218e-06, + "loss": 0.5892, + "step": 2581 + }, + { + "epoch": 0.33, + "grad_norm": 0.8059850644306538, + "learning_rate": 9.533220814855453e-06, + "loss": 0.6301, + "step": 2582 + }, + { + "epoch": 0.33, + "grad_norm": 0.6498408537331678, + "learning_rate": 9.532785496124003e-06, + "loss": 0.5565, + "step": 2583 + }, + { + "epoch": 0.33, + "grad_norm": 0.5783800531387244, + "learning_rate": 9.532349984446358e-06, + "loss": 0.5364, + "step": 2584 + }, + { + "epoch": 0.33, + "grad_norm": 0.9546698067137003, + "learning_rate": 9.531914279841057e-06, + "loss": 0.5862, + "step": 2585 + }, + { + "epoch": 0.33, + "grad_norm": 0.5701886528114443, + "learning_rate": 9.531478382326647e-06, + "loss": 0.5115, + "step": 2586 + }, + { + "epoch": 0.33, + "grad_norm": 0.6303662784078251, + "learning_rate": 9.531042291921685e-06, + "loss": 0.5237, + "step": 2587 + }, + { + "epoch": 0.33, + "grad_norm": 0.6637468226996036, + "learning_rate": 9.53060600864473e-06, + "loss": 0.4938, + "step": 2588 + }, + { + "epoch": 0.33, + "grad_norm": 0.7976804502105883, + "learning_rate": 9.530169532514355e-06, + "loss": 0.4904, + "step": 2589 + }, + { + "epoch": 0.33, + "grad_norm": 1.8480503149393777, + "learning_rate": 9.52973286354914e-06, + "loss": 0.6147, + "step": 2590 + }, + { + "epoch": 0.33, + "grad_norm": 0.5410025719543855, + "learning_rate": 9.529296001767669e-06, + "loss": 0.4789, + "step": 2591 + }, + { + "epoch": 0.33, + "grad_norm": 0.6944545405994262, + "learning_rate": 9.528858947188543e-06, + "loss": 0.5154, + "step": 2592 + }, + { + "epoch": 0.33, + "grad_norm": 0.5943387449210243, + "learning_rate": 9.528421699830365e-06, + "loss": 0.5479, + "step": 2593 + }, + { + "epoch": 0.33, + "grad_norm": 0.7460414739115181, + "learning_rate": 9.527984259711744e-06, + "loss": 0.5879, + "step": 2594 + }, + { + "epoch": 0.33, + "grad_norm": 0.6829949307460953, + "learning_rate": 9.527546626851306e-06, + "loss": 0.612, + "step": 2595 + }, + { + "epoch": 0.33, + "grad_norm": 0.7688717836898237, + "learning_rate": 9.527108801267673e-06, + "loss": 0.5747, + "step": 2596 + }, + { + "epoch": 0.33, + "grad_norm": 0.6935414862682998, + "learning_rate": 9.526670782979483e-06, + "loss": 0.5645, + "step": 2597 + }, + { + "epoch": 0.33, + "grad_norm": 0.91662531566737, + "learning_rate": 9.526232572005387e-06, + "loss": 0.6257, + "step": 2598 + }, + { + "epoch": 0.33, + "grad_norm": 0.6404437087562576, + "learning_rate": 9.525794168364032e-06, + "loss": 0.5792, + "step": 2599 + }, + { + "epoch": 0.33, + "grad_norm": 0.9336752361852743, + "learning_rate": 9.525355572074083e-06, + "loss": 0.6525, + "step": 2600 + }, + { + "epoch": 0.33, + "grad_norm": 0.6188451823758374, + "learning_rate": 9.524916783154206e-06, + "loss": 0.5327, + "step": 2601 + }, + { + "epoch": 0.33, + "grad_norm": 0.595947713758538, + "learning_rate": 9.524477801623083e-06, + "loss": 0.5018, + "step": 2602 + }, + { + "epoch": 0.33, + "grad_norm": 0.7602569644562919, + "learning_rate": 9.524038627499396e-06, + "loss": 0.5526, + "step": 2603 + }, + { + "epoch": 0.33, + "grad_norm": 1.1390855016538206, + "learning_rate": 9.523599260801845e-06, + "loss": 0.6233, + "step": 2604 + }, + { + "epoch": 0.33, + "grad_norm": 0.5826783634067593, + "learning_rate": 9.523159701549126e-06, + "loss": 0.4934, + "step": 2605 + }, + { + "epoch": 0.33, + "grad_norm": 0.6096115493108369, + "learning_rate": 9.522719949759953e-06, + "loss": 0.5076, + "step": 2606 + }, + { + "epoch": 0.33, + "grad_norm": 0.7390090601415761, + "learning_rate": 9.522280005453045e-06, + "loss": 0.6452, + "step": 2607 + }, + { + "epoch": 0.33, + "grad_norm": 0.6902664668253552, + "learning_rate": 9.521839868647128e-06, + "loss": 0.585, + "step": 2608 + }, + { + "epoch": 0.33, + "grad_norm": 0.8791964525039503, + "learning_rate": 9.521399539360937e-06, + "loss": 0.6836, + "step": 2609 + }, + { + "epoch": 0.33, + "grad_norm": 0.6490246657183921, + "learning_rate": 9.520959017613215e-06, + "loss": 0.5153, + "step": 2610 + }, + { + "epoch": 0.33, + "grad_norm": 0.5809035468751219, + "learning_rate": 9.520518303422718e-06, + "loss": 0.5684, + "step": 2611 + }, + { + "epoch": 0.33, + "grad_norm": 0.6707458810382166, + "learning_rate": 9.5200773968082e-06, + "loss": 0.5169, + "step": 2612 + }, + { + "epoch": 0.33, + "grad_norm": 0.6345208821443088, + "learning_rate": 9.51963629778843e-06, + "loss": 0.5915, + "step": 2613 + }, + { + "epoch": 0.33, + "grad_norm": 0.530845563877577, + "learning_rate": 9.519195006382188e-06, + "loss": 0.517, + "step": 2614 + }, + { + "epoch": 0.33, + "grad_norm": 0.835407233106553, + "learning_rate": 9.518753522608255e-06, + "loss": 0.6703, + "step": 2615 + }, + { + "epoch": 0.33, + "grad_norm": 0.6793471378124661, + "learning_rate": 9.518311846485424e-06, + "loss": 0.5751, + "step": 2616 + }, + { + "epoch": 0.33, + "grad_norm": 0.7349933974032022, + "learning_rate": 9.517869978032497e-06, + "loss": 0.5582, + "step": 2617 + }, + { + "epoch": 0.33, + "grad_norm": 1.074984731694823, + "learning_rate": 9.51742791726828e-06, + "loss": 0.6236, + "step": 2618 + }, + { + "epoch": 0.33, + "grad_norm": 0.9201587831352932, + "learning_rate": 9.516985664211595e-06, + "loss": 0.5787, + "step": 2619 + }, + { + "epoch": 0.33, + "grad_norm": 0.6485537188339017, + "learning_rate": 9.516543218881264e-06, + "loss": 0.5675, + "step": 2620 + }, + { + "epoch": 0.33, + "grad_norm": 0.7191479145391688, + "learning_rate": 9.51610058129612e-06, + "loss": 0.5694, + "step": 2621 + }, + { + "epoch": 0.33, + "grad_norm": 0.8073994848174533, + "learning_rate": 9.515657751475005e-06, + "loss": 0.653, + "step": 2622 + }, + { + "epoch": 0.33, + "grad_norm": 0.620470519151046, + "learning_rate": 9.51521472943677e-06, + "loss": 0.5594, + "step": 2623 + }, + { + "epoch": 0.33, + "grad_norm": 0.8577981010508564, + "learning_rate": 9.514771515200273e-06, + "loss": 0.5185, + "step": 2624 + }, + { + "epoch": 0.33, + "grad_norm": 0.5613299806463133, + "learning_rate": 9.51432810878438e-06, + "loss": 0.5347, + "step": 2625 + }, + { + "epoch": 0.33, + "grad_norm": 0.6905584167856317, + "learning_rate": 9.513884510207965e-06, + "loss": 0.5404, + "step": 2626 + }, + { + "epoch": 0.33, + "grad_norm": 0.5787359917483325, + "learning_rate": 9.51344071948991e-06, + "loss": 0.5663, + "step": 2627 + }, + { + "epoch": 0.33, + "grad_norm": 0.6451525074311069, + "learning_rate": 9.512996736649107e-06, + "loss": 0.5533, + "step": 2628 + }, + { + "epoch": 0.33, + "grad_norm": 0.6246696688029569, + "learning_rate": 9.512552561704454e-06, + "loss": 0.5496, + "step": 2629 + }, + { + "epoch": 0.34, + "grad_norm": 0.6815441549936629, + "learning_rate": 9.51210819467486e-06, + "loss": 0.5343, + "step": 2630 + }, + { + "epoch": 0.34, + "grad_norm": 0.8360184672663747, + "learning_rate": 9.511663635579237e-06, + "loss": 0.657, + "step": 2631 + }, + { + "epoch": 0.34, + "grad_norm": 0.7001008146479152, + "learning_rate": 9.51121888443651e-06, + "loss": 0.6144, + "step": 2632 + }, + { + "epoch": 0.34, + "grad_norm": 0.832107455918356, + "learning_rate": 9.510773941265612e-06, + "loss": 0.5801, + "step": 2633 + }, + { + "epoch": 0.34, + "grad_norm": 0.6515346390702125, + "learning_rate": 9.510328806085483e-06, + "loss": 0.536, + "step": 2634 + }, + { + "epoch": 0.34, + "grad_norm": 0.5205168551152277, + "learning_rate": 9.509883478915068e-06, + "loss": 0.4978, + "step": 2635 + }, + { + "epoch": 0.34, + "grad_norm": 0.8758102289762094, + "learning_rate": 9.509437959773324e-06, + "loss": 0.6378, + "step": 2636 + }, + { + "epoch": 0.34, + "grad_norm": 0.7007231004619369, + "learning_rate": 9.508992248679217e-06, + "loss": 0.5801, + "step": 2637 + }, + { + "epoch": 0.34, + "grad_norm": 2.6204772699830503, + "learning_rate": 9.50854634565172e-06, + "loss": 0.6512, + "step": 2638 + }, + { + "epoch": 0.34, + "grad_norm": 0.969055388010146, + "learning_rate": 9.508100250709809e-06, + "loss": 0.6135, + "step": 2639 + }, + { + "epoch": 0.34, + "grad_norm": 1.1653372937097857, + "learning_rate": 9.507653963872479e-06, + "loss": 0.5752, + "step": 2640 + }, + { + "epoch": 0.34, + "grad_norm": 0.6896592730666222, + "learning_rate": 9.507207485158724e-06, + "loss": 0.5448, + "step": 2641 + }, + { + "epoch": 0.34, + "grad_norm": 1.0668974797658888, + "learning_rate": 9.506760814587547e-06, + "loss": 0.6443, + "step": 2642 + }, + { + "epoch": 0.34, + "grad_norm": 0.8925707789010904, + "learning_rate": 9.506313952177966e-06, + "loss": 0.6913, + "step": 2643 + }, + { + "epoch": 0.34, + "grad_norm": 0.7922544781589685, + "learning_rate": 9.505866897949e-06, + "loss": 0.5695, + "step": 2644 + }, + { + "epoch": 0.34, + "grad_norm": 0.5977798601691642, + "learning_rate": 9.505419651919678e-06, + "loss": 0.5472, + "step": 2645 + }, + { + "epoch": 0.34, + "grad_norm": 0.5857657542808271, + "learning_rate": 9.50497221410904e-06, + "loss": 0.5256, + "step": 2646 + }, + { + "epoch": 0.34, + "grad_norm": 0.6269816345974752, + "learning_rate": 9.50452458453613e-06, + "loss": 0.5469, + "step": 2647 + }, + { + "epoch": 0.34, + "grad_norm": 0.6637418482259861, + "learning_rate": 9.504076763220003e-06, + "loss": 0.5697, + "step": 2648 + }, + { + "epoch": 0.34, + "grad_norm": 0.7637371397015333, + "learning_rate": 9.503628750179719e-06, + "loss": 0.568, + "step": 2649 + }, + { + "epoch": 0.34, + "grad_norm": 0.939164956586822, + "learning_rate": 9.503180545434353e-06, + "loss": 0.6734, + "step": 2650 + }, + { + "epoch": 0.34, + "grad_norm": 0.8160952788765521, + "learning_rate": 9.50273214900298e-06, + "loss": 0.589, + "step": 2651 + }, + { + "epoch": 0.34, + "grad_norm": 0.664853124608123, + "learning_rate": 9.50228356090469e-06, + "loss": 0.5112, + "step": 2652 + }, + { + "epoch": 0.34, + "grad_norm": 0.6596469254493981, + "learning_rate": 9.501834781158574e-06, + "loss": 0.5901, + "step": 2653 + }, + { + "epoch": 0.34, + "grad_norm": 0.6962722835328786, + "learning_rate": 9.501385809783739e-06, + "loss": 0.5398, + "step": 2654 + }, + { + "epoch": 0.34, + "grad_norm": 0.6632304354029682, + "learning_rate": 9.500936646799293e-06, + "loss": 0.5534, + "step": 2655 + }, + { + "epoch": 0.34, + "grad_norm": 0.847071596337105, + "learning_rate": 9.500487292224357e-06, + "loss": 0.6602, + "step": 2656 + }, + { + "epoch": 0.34, + "grad_norm": 0.6594231855021783, + "learning_rate": 9.500037746078058e-06, + "loss": 0.5528, + "step": 2657 + }, + { + "epoch": 0.34, + "grad_norm": 0.6536904250392873, + "learning_rate": 9.499588008379534e-06, + "loss": 0.5198, + "step": 2658 + }, + { + "epoch": 0.34, + "grad_norm": 0.6360593846325148, + "learning_rate": 9.499138079147927e-06, + "loss": 0.592, + "step": 2659 + }, + { + "epoch": 0.34, + "grad_norm": 0.6326183225471013, + "learning_rate": 9.498687958402388e-06, + "loss": 0.5683, + "step": 2660 + }, + { + "epoch": 0.34, + "grad_norm": 0.6737489838915545, + "learning_rate": 9.498237646162078e-06, + "loss": 0.5475, + "step": 2661 + }, + { + "epoch": 0.34, + "grad_norm": 0.6694602163920368, + "learning_rate": 9.497787142446166e-06, + "loss": 0.5977, + "step": 2662 + }, + { + "epoch": 0.34, + "grad_norm": 0.6919396971952262, + "learning_rate": 9.497336447273828e-06, + "loss": 0.6119, + "step": 2663 + }, + { + "epoch": 0.34, + "grad_norm": 0.6545875727828754, + "learning_rate": 9.496885560664251e-06, + "loss": 0.5487, + "step": 2664 + }, + { + "epoch": 0.34, + "grad_norm": 0.8161631128077677, + "learning_rate": 9.496434482636623e-06, + "loss": 0.6141, + "step": 2665 + }, + { + "epoch": 0.34, + "grad_norm": 1.066318647899695, + "learning_rate": 9.49598321321015e-06, + "loss": 0.5804, + "step": 2666 + }, + { + "epoch": 0.34, + "grad_norm": 0.6145505682433737, + "learning_rate": 9.495531752404036e-06, + "loss": 0.5522, + "step": 2667 + }, + { + "epoch": 0.34, + "grad_norm": 0.7404581570988357, + "learning_rate": 9.495080100237503e-06, + "loss": 0.6437, + "step": 2668 + }, + { + "epoch": 0.34, + "grad_norm": 0.6453979666377999, + "learning_rate": 9.494628256729774e-06, + "loss": 0.5338, + "step": 2669 + }, + { + "epoch": 0.34, + "grad_norm": 0.7390857664877688, + "learning_rate": 9.494176221900081e-06, + "loss": 0.5518, + "step": 2670 + }, + { + "epoch": 0.34, + "grad_norm": 0.6313898819588964, + "learning_rate": 9.49372399576767e-06, + "loss": 0.5381, + "step": 2671 + }, + { + "epoch": 0.34, + "grad_norm": 0.7259062340874922, + "learning_rate": 9.493271578351787e-06, + "loss": 0.5903, + "step": 2672 + }, + { + "epoch": 0.34, + "grad_norm": 0.9063405020904501, + "learning_rate": 9.492818969671693e-06, + "loss": 0.5912, + "step": 2673 + }, + { + "epoch": 0.34, + "grad_norm": 0.6202276898209159, + "learning_rate": 9.492366169746652e-06, + "loss": 0.5696, + "step": 2674 + }, + { + "epoch": 0.34, + "grad_norm": 0.8134286613739936, + "learning_rate": 9.491913178595937e-06, + "loss": 0.596, + "step": 2675 + }, + { + "epoch": 0.34, + "grad_norm": 0.8993303631152776, + "learning_rate": 9.491459996238834e-06, + "loss": 0.641, + "step": 2676 + }, + { + "epoch": 0.34, + "grad_norm": 0.816233493485614, + "learning_rate": 9.49100662269463e-06, + "loss": 0.6091, + "step": 2677 + }, + { + "epoch": 0.34, + "grad_norm": 0.6127999309058696, + "learning_rate": 9.490553057982627e-06, + "loss": 0.4907, + "step": 2678 + }, + { + "epoch": 0.34, + "grad_norm": 0.6327981151154719, + "learning_rate": 9.490099302122129e-06, + "loss": 0.5736, + "step": 2679 + }, + { + "epoch": 0.34, + "grad_norm": 0.8263889646504642, + "learning_rate": 9.489645355132452e-06, + "loss": 0.6461, + "step": 2680 + }, + { + "epoch": 0.34, + "grad_norm": 0.6240105661629423, + "learning_rate": 9.489191217032919e-06, + "loss": 0.5742, + "step": 2681 + }, + { + "epoch": 0.34, + "grad_norm": 0.6009437124199746, + "learning_rate": 9.488736887842862e-06, + "loss": 0.5071, + "step": 2682 + }, + { + "epoch": 0.34, + "grad_norm": 0.8700586532006659, + "learning_rate": 9.488282367581619e-06, + "loss": 0.712, + "step": 2683 + }, + { + "epoch": 0.34, + "grad_norm": 0.7503216585904273, + "learning_rate": 9.487827656268539e-06, + "loss": 0.5549, + "step": 2684 + }, + { + "epoch": 0.34, + "grad_norm": 0.860642292830429, + "learning_rate": 9.487372753922976e-06, + "loss": 0.64, + "step": 2685 + }, + { + "epoch": 0.34, + "grad_norm": 0.5906724271653871, + "learning_rate": 9.486917660564294e-06, + "loss": 0.5373, + "step": 2686 + }, + { + "epoch": 0.34, + "grad_norm": 0.748581513417254, + "learning_rate": 9.486462376211866e-06, + "loss": 0.5543, + "step": 2687 + }, + { + "epoch": 0.34, + "grad_norm": 0.6625302637533109, + "learning_rate": 9.486006900885073e-06, + "loss": 0.5095, + "step": 2688 + }, + { + "epoch": 0.34, + "grad_norm": 0.6821843085540856, + "learning_rate": 9.485551234603299e-06, + "loss": 0.5771, + "step": 2689 + }, + { + "epoch": 0.34, + "grad_norm": 0.8304659259874845, + "learning_rate": 9.485095377385943e-06, + "loss": 0.5634, + "step": 2690 + }, + { + "epoch": 0.34, + "grad_norm": 0.7512685960757559, + "learning_rate": 9.484639329252409e-06, + "loss": 0.6491, + "step": 2691 + }, + { + "epoch": 0.34, + "grad_norm": 0.7534347072646292, + "learning_rate": 9.48418309022211e-06, + "loss": 0.6229, + "step": 2692 + }, + { + "epoch": 0.34, + "grad_norm": 0.6126461797913709, + "learning_rate": 9.483726660314464e-06, + "loss": 0.5082, + "step": 2693 + }, + { + "epoch": 0.34, + "grad_norm": 0.7279374182617983, + "learning_rate": 9.483270039548906e-06, + "loss": 0.5317, + "step": 2694 + }, + { + "epoch": 0.34, + "grad_norm": 0.9354531491803656, + "learning_rate": 9.482813227944868e-06, + "loss": 0.6301, + "step": 2695 + }, + { + "epoch": 0.34, + "grad_norm": 1.2255087844545716, + "learning_rate": 9.482356225521794e-06, + "loss": 0.6343, + "step": 2696 + }, + { + "epoch": 0.34, + "grad_norm": 0.6515432382342676, + "learning_rate": 9.48189903229914e-06, + "loss": 0.608, + "step": 2697 + }, + { + "epoch": 0.34, + "grad_norm": 0.7259746189084652, + "learning_rate": 9.481441648296367e-06, + "loss": 0.6028, + "step": 2698 + }, + { + "epoch": 0.34, + "grad_norm": 0.8509618283107478, + "learning_rate": 9.480984073532943e-06, + "loss": 0.6006, + "step": 2699 + }, + { + "epoch": 0.34, + "grad_norm": 0.661686115769553, + "learning_rate": 9.480526308028345e-06, + "loss": 0.5693, + "step": 2700 + }, + { + "epoch": 0.34, + "grad_norm": 0.8340500821731865, + "learning_rate": 9.480068351802063e-06, + "loss": 0.6269, + "step": 2701 + }, + { + "epoch": 0.34, + "grad_norm": 0.7294376986355238, + "learning_rate": 9.479610204873586e-06, + "loss": 0.5847, + "step": 2702 + }, + { + "epoch": 0.34, + "grad_norm": 0.64222982626784, + "learning_rate": 9.479151867262417e-06, + "loss": 0.6098, + "step": 2703 + }, + { + "epoch": 0.34, + "grad_norm": 0.7411094065018148, + "learning_rate": 9.478693338988065e-06, + "loss": 0.5866, + "step": 2704 + }, + { + "epoch": 0.34, + "grad_norm": 0.7757412492305917, + "learning_rate": 9.47823462007005e-06, + "loss": 0.6302, + "step": 2705 + }, + { + "epoch": 0.34, + "grad_norm": 0.7008346031950602, + "learning_rate": 9.477775710527898e-06, + "loss": 0.6065, + "step": 2706 + }, + { + "epoch": 0.34, + "grad_norm": 0.6450380535106405, + "learning_rate": 9.477316610381144e-06, + "loss": 0.5669, + "step": 2707 + }, + { + "epoch": 0.34, + "grad_norm": 0.8020350688024981, + "learning_rate": 9.47685731964933e-06, + "loss": 0.5942, + "step": 2708 + }, + { + "epoch": 0.35, + "grad_norm": 0.6515230738536907, + "learning_rate": 9.476397838352003e-06, + "loss": 0.537, + "step": 2709 + }, + { + "epoch": 0.35, + "grad_norm": 0.7106797304009778, + "learning_rate": 9.475938166508726e-06, + "loss": 0.5968, + "step": 2710 + }, + { + "epoch": 0.35, + "grad_norm": 0.6767112085036076, + "learning_rate": 9.475478304139064e-06, + "loss": 0.5551, + "step": 2711 + }, + { + "epoch": 0.35, + "grad_norm": 0.8966197085509974, + "learning_rate": 9.475018251262594e-06, + "loss": 0.6464, + "step": 2712 + }, + { + "epoch": 0.35, + "grad_norm": 0.8575931820709087, + "learning_rate": 9.474558007898895e-06, + "loss": 0.6516, + "step": 2713 + }, + { + "epoch": 0.35, + "grad_norm": 0.9080998604636245, + "learning_rate": 9.47409757406756e-06, + "loss": 0.604, + "step": 2714 + }, + { + "epoch": 0.35, + "grad_norm": 0.8304634081485033, + "learning_rate": 9.47363694978819e-06, + "loss": 0.5593, + "step": 2715 + }, + { + "epoch": 0.35, + "grad_norm": 0.8461238243280341, + "learning_rate": 9.473176135080392e-06, + "loss": 0.6296, + "step": 2716 + }, + { + "epoch": 0.35, + "grad_norm": 0.6635571220376301, + "learning_rate": 9.472715129963776e-06, + "loss": 0.5634, + "step": 2717 + }, + { + "epoch": 0.35, + "grad_norm": 0.6624474616243881, + "learning_rate": 9.472253934457973e-06, + "loss": 0.5372, + "step": 2718 + }, + { + "epoch": 0.35, + "grad_norm": 0.6674988841873706, + "learning_rate": 9.47179254858261e-06, + "loss": 0.5388, + "step": 2719 + }, + { + "epoch": 0.35, + "grad_norm": 1.19573730107884, + "learning_rate": 9.471330972357327e-06, + "loss": 0.6058, + "step": 2720 + }, + { + "epoch": 0.35, + "grad_norm": 0.5937215067510677, + "learning_rate": 9.470869205801774e-06, + "loss": 0.566, + "step": 2721 + }, + { + "epoch": 0.35, + "grad_norm": 0.5487173465229642, + "learning_rate": 9.470407248935606e-06, + "loss": 0.5182, + "step": 2722 + }, + { + "epoch": 0.35, + "grad_norm": 0.8008315113990622, + "learning_rate": 9.469945101778487e-06, + "loss": 0.5512, + "step": 2723 + }, + { + "epoch": 0.35, + "grad_norm": 0.6749228954537383, + "learning_rate": 9.46948276435009e-06, + "loss": 0.5427, + "step": 2724 + }, + { + "epoch": 0.35, + "grad_norm": 0.7251223243488515, + "learning_rate": 9.46902023667009e-06, + "loss": 0.5931, + "step": 2725 + }, + { + "epoch": 0.35, + "grad_norm": 0.5893424590220128, + "learning_rate": 9.468557518758183e-06, + "loss": 0.5271, + "step": 2726 + }, + { + "epoch": 0.35, + "grad_norm": 0.6206131778648086, + "learning_rate": 9.468094610634062e-06, + "loss": 0.5336, + "step": 2727 + }, + { + "epoch": 0.35, + "grad_norm": 0.7232629802065703, + "learning_rate": 9.46763151231743e-06, + "loss": 0.5801, + "step": 2728 + }, + { + "epoch": 0.35, + "grad_norm": 0.5559640325030679, + "learning_rate": 9.467168223828001e-06, + "loss": 0.5295, + "step": 2729 + }, + { + "epoch": 0.35, + "grad_norm": 0.6453014003908335, + "learning_rate": 9.466704745185497e-06, + "loss": 0.5208, + "step": 2730 + }, + { + "epoch": 0.35, + "grad_norm": 0.7949818119663074, + "learning_rate": 9.466241076409644e-06, + "loss": 0.6, + "step": 2731 + }, + { + "epoch": 0.35, + "grad_norm": 0.6768070524933107, + "learning_rate": 9.46577721752018e-06, + "loss": 0.5493, + "step": 2732 + }, + { + "epoch": 0.35, + "grad_norm": 0.6235733846150524, + "learning_rate": 9.465313168536852e-06, + "loss": 0.4976, + "step": 2733 + }, + { + "epoch": 0.35, + "grad_norm": 0.5526313657564148, + "learning_rate": 9.464848929479412e-06, + "loss": 0.5021, + "step": 2734 + }, + { + "epoch": 0.35, + "grad_norm": 0.7255657737830097, + "learning_rate": 9.46438450036762e-06, + "loss": 0.5794, + "step": 2735 + }, + { + "epoch": 0.35, + "grad_norm": 0.729547906615159, + "learning_rate": 9.463919881221246e-06, + "loss": 0.631, + "step": 2736 + }, + { + "epoch": 0.35, + "grad_norm": 0.7358070039119499, + "learning_rate": 9.463455072060066e-06, + "loss": 0.7151, + "step": 2737 + }, + { + "epoch": 0.35, + "grad_norm": 0.565924718889971, + "learning_rate": 9.462990072903868e-06, + "loss": 0.4992, + "step": 2738 + }, + { + "epoch": 0.35, + "grad_norm": 0.708636311457162, + "learning_rate": 9.462524883772444e-06, + "loss": 0.5494, + "step": 2739 + }, + { + "epoch": 0.35, + "grad_norm": 0.7565394219576851, + "learning_rate": 9.462059504685596e-06, + "loss": 0.6017, + "step": 2740 + }, + { + "epoch": 0.35, + "grad_norm": 0.6288142162213577, + "learning_rate": 9.461593935663134e-06, + "loss": 0.5263, + "step": 2741 + }, + { + "epoch": 0.35, + "grad_norm": 0.628364933812235, + "learning_rate": 9.461128176724876e-06, + "loss": 0.522, + "step": 2742 + }, + { + "epoch": 0.35, + "grad_norm": 0.6109788086616127, + "learning_rate": 9.460662227890645e-06, + "loss": 0.55, + "step": 2743 + }, + { + "epoch": 0.35, + "grad_norm": 0.710527864029022, + "learning_rate": 9.460196089180278e-06, + "loss": 0.6023, + "step": 2744 + }, + { + "epoch": 0.35, + "grad_norm": 0.5793125193996272, + "learning_rate": 9.459729760613618e-06, + "loss": 0.521, + "step": 2745 + }, + { + "epoch": 0.35, + "grad_norm": 0.633340780233596, + "learning_rate": 9.459263242210511e-06, + "loss": 0.5724, + "step": 2746 + }, + { + "epoch": 0.35, + "grad_norm": 0.9481665252210578, + "learning_rate": 9.45879653399082e-06, + "loss": 0.6289, + "step": 2747 + }, + { + "epoch": 0.35, + "grad_norm": 0.7203959862754759, + "learning_rate": 9.458329635974405e-06, + "loss": 0.6071, + "step": 2748 + }, + { + "epoch": 0.35, + "grad_norm": 0.7140244824243365, + "learning_rate": 9.457862548181147e-06, + "loss": 0.5492, + "step": 2749 + }, + { + "epoch": 0.35, + "grad_norm": 0.6796579025760305, + "learning_rate": 9.457395270630926e-06, + "loss": 0.5862, + "step": 2750 + }, + { + "epoch": 0.35, + "grad_norm": 0.6233384318469098, + "learning_rate": 9.456927803343631e-06, + "loss": 0.6139, + "step": 2751 + }, + { + "epoch": 0.35, + "grad_norm": 0.5590310560221577, + "learning_rate": 9.456460146339162e-06, + "loss": 0.5194, + "step": 2752 + }, + { + "epoch": 0.35, + "grad_norm": 0.6587705455020467, + "learning_rate": 9.455992299637426e-06, + "loss": 0.5528, + "step": 2753 + }, + { + "epoch": 0.35, + "grad_norm": 0.6777434539217504, + "learning_rate": 9.455524263258335e-06, + "loss": 0.5015, + "step": 2754 + }, + { + "epoch": 0.35, + "grad_norm": 1.218106107899709, + "learning_rate": 9.455056037221816e-06, + "loss": 0.5961, + "step": 2755 + }, + { + "epoch": 0.35, + "grad_norm": 0.8525734012370407, + "learning_rate": 9.454587621547796e-06, + "loss": 0.6683, + "step": 2756 + }, + { + "epoch": 0.35, + "grad_norm": 0.9978909741872638, + "learning_rate": 9.454119016256218e-06, + "loss": 0.6548, + "step": 2757 + }, + { + "epoch": 0.35, + "grad_norm": 0.6537367828334337, + "learning_rate": 9.453650221367025e-06, + "loss": 0.5757, + "step": 2758 + }, + { + "epoch": 0.35, + "grad_norm": 0.8057521803778169, + "learning_rate": 9.453181236900175e-06, + "loss": 0.5573, + "step": 2759 + }, + { + "epoch": 0.35, + "grad_norm": 0.8751902451285365, + "learning_rate": 9.452712062875631e-06, + "loss": 0.603, + "step": 2760 + }, + { + "epoch": 0.35, + "grad_norm": 0.7558656559155569, + "learning_rate": 9.452242699313361e-06, + "loss": 0.6003, + "step": 2761 + }, + { + "epoch": 0.35, + "grad_norm": 0.643190086717265, + "learning_rate": 9.451773146233348e-06, + "loss": 0.5123, + "step": 2762 + }, + { + "epoch": 0.35, + "grad_norm": 0.7753792676439469, + "learning_rate": 9.451303403655579e-06, + "loss": 0.6313, + "step": 2763 + }, + { + "epoch": 0.35, + "grad_norm": 0.7982668840577974, + "learning_rate": 9.450833471600047e-06, + "loss": 0.6137, + "step": 2764 + }, + { + "epoch": 0.35, + "grad_norm": 0.6469800456142494, + "learning_rate": 9.450363350086756e-06, + "loss": 0.5721, + "step": 2765 + }, + { + "epoch": 0.35, + "grad_norm": 0.5954827834637731, + "learning_rate": 9.44989303913572e-06, + "loss": 0.5192, + "step": 2766 + }, + { + "epoch": 0.35, + "grad_norm": 0.6267961389988276, + "learning_rate": 9.449422538766958e-06, + "loss": 0.5352, + "step": 2767 + }, + { + "epoch": 0.35, + "grad_norm": 0.6181168997578211, + "learning_rate": 9.448951849000494e-06, + "loss": 0.5943, + "step": 2768 + }, + { + "epoch": 0.35, + "grad_norm": 0.9379775552391613, + "learning_rate": 9.448480969856368e-06, + "loss": 0.6026, + "step": 2769 + }, + { + "epoch": 0.35, + "grad_norm": 0.8224891932385208, + "learning_rate": 9.448009901354623e-06, + "loss": 0.6197, + "step": 2770 + }, + { + "epoch": 0.35, + "grad_norm": 0.6413797970707639, + "learning_rate": 9.44753864351531e-06, + "loss": 0.5587, + "step": 2771 + }, + { + "epoch": 0.35, + "grad_norm": 0.9317899250101933, + "learning_rate": 9.44706719635849e-06, + "loss": 0.6364, + "step": 2772 + }, + { + "epoch": 0.35, + "grad_norm": 0.6969392482600794, + "learning_rate": 9.446595559904228e-06, + "loss": 0.5118, + "step": 2773 + }, + { + "epoch": 0.35, + "grad_norm": 0.7753036998085789, + "learning_rate": 9.446123734172606e-06, + "loss": 0.5955, + "step": 2774 + }, + { + "epoch": 0.35, + "grad_norm": 1.1130667261540452, + "learning_rate": 9.445651719183701e-06, + "loss": 0.7081, + "step": 2775 + }, + { + "epoch": 0.35, + "grad_norm": 0.7918291910362325, + "learning_rate": 9.44517951495761e-06, + "loss": 0.624, + "step": 2776 + }, + { + "epoch": 0.35, + "grad_norm": 0.6756623501720954, + "learning_rate": 9.444707121514432e-06, + "loss": 0.5837, + "step": 2777 + }, + { + "epoch": 0.35, + "grad_norm": 0.6287568219254337, + "learning_rate": 9.444234538874273e-06, + "loss": 0.5435, + "step": 2778 + }, + { + "epoch": 0.35, + "grad_norm": 0.5790455136469947, + "learning_rate": 9.443761767057253e-06, + "loss": 0.517, + "step": 2779 + }, + { + "epoch": 0.35, + "grad_norm": 0.8612660254985647, + "learning_rate": 9.443288806083496e-06, + "loss": 0.6321, + "step": 2780 + }, + { + "epoch": 0.35, + "grad_norm": 0.7449823418496511, + "learning_rate": 9.442815655973133e-06, + "loss": 0.6455, + "step": 2781 + }, + { + "epoch": 0.35, + "grad_norm": 0.5845316912298215, + "learning_rate": 9.442342316746303e-06, + "loss": 0.555, + "step": 2782 + }, + { + "epoch": 0.35, + "grad_norm": 0.9020077596190232, + "learning_rate": 9.441868788423156e-06, + "loss": 0.6682, + "step": 2783 + }, + { + "epoch": 0.35, + "grad_norm": 0.6004706591343363, + "learning_rate": 9.44139507102385e-06, + "loss": 0.5382, + "step": 2784 + }, + { + "epoch": 0.35, + "grad_norm": 0.7774042995780771, + "learning_rate": 9.440921164568548e-06, + "loss": 0.6594, + "step": 2785 + }, + { + "epoch": 0.35, + "grad_norm": 0.7090563768533579, + "learning_rate": 9.440447069077425e-06, + "loss": 0.5637, + "step": 2786 + }, + { + "epoch": 0.36, + "grad_norm": 0.6212625374063201, + "learning_rate": 9.439972784570659e-06, + "loss": 0.5373, + "step": 2787 + }, + { + "epoch": 0.36, + "grad_norm": 0.9759312113547524, + "learning_rate": 9.439498311068438e-06, + "loss": 0.6406, + "step": 2788 + }, + { + "epoch": 0.36, + "grad_norm": 0.7079321917169328, + "learning_rate": 9.439023648590961e-06, + "loss": 0.5166, + "step": 2789 + }, + { + "epoch": 0.36, + "grad_norm": 0.6498521179541045, + "learning_rate": 9.438548797158435e-06, + "loss": 0.532, + "step": 2790 + }, + { + "epoch": 0.36, + "grad_norm": 0.5737016888241293, + "learning_rate": 9.438073756791068e-06, + "loss": 0.5157, + "step": 2791 + }, + { + "epoch": 0.36, + "grad_norm": 0.6755265566833282, + "learning_rate": 9.437598527509082e-06, + "loss": 0.5655, + "step": 2792 + }, + { + "epoch": 0.36, + "grad_norm": 0.7078292393750686, + "learning_rate": 9.43712310933271e-06, + "loss": 0.6258, + "step": 2793 + }, + { + "epoch": 0.36, + "grad_norm": 0.7853443316626899, + "learning_rate": 9.436647502282185e-06, + "loss": 0.5843, + "step": 2794 + }, + { + "epoch": 0.36, + "grad_norm": 0.7130818120694572, + "learning_rate": 9.436171706377753e-06, + "loss": 0.5606, + "step": 2795 + }, + { + "epoch": 0.36, + "grad_norm": 0.77522789020357, + "learning_rate": 9.435695721639668e-06, + "loss": 0.6855, + "step": 2796 + }, + { + "epoch": 0.36, + "grad_norm": 0.6393191307337324, + "learning_rate": 9.43521954808819e-06, + "loss": 0.5047, + "step": 2797 + }, + { + "epoch": 0.36, + "grad_norm": 0.8133557704088161, + "learning_rate": 9.43474318574359e-06, + "loss": 0.6843, + "step": 2798 + }, + { + "epoch": 0.36, + "grad_norm": 0.91311500568812, + "learning_rate": 9.434266634626143e-06, + "loss": 0.6161, + "step": 2799 + }, + { + "epoch": 0.36, + "grad_norm": 0.6518119783303293, + "learning_rate": 9.433789894756136e-06, + "loss": 0.5574, + "step": 2800 + }, + { + "epoch": 0.36, + "grad_norm": 1.2686676296534765, + "learning_rate": 9.43331296615386e-06, + "loss": 0.644, + "step": 2801 + }, + { + "epoch": 0.36, + "grad_norm": 0.748624786857783, + "learning_rate": 9.432835848839619e-06, + "loss": 0.5264, + "step": 2802 + }, + { + "epoch": 0.36, + "grad_norm": 0.638531267290233, + "learning_rate": 9.432358542833722e-06, + "loss": 0.5296, + "step": 2803 + }, + { + "epoch": 0.36, + "grad_norm": 0.6667117779795751, + "learning_rate": 9.431881048156484e-06, + "loss": 0.5383, + "step": 2804 + }, + { + "epoch": 0.36, + "grad_norm": 1.1193428143495239, + "learning_rate": 9.431403364828233e-06, + "loss": 0.5939, + "step": 2805 + }, + { + "epoch": 0.36, + "grad_norm": 0.919471429477277, + "learning_rate": 9.4309254928693e-06, + "loss": 0.622, + "step": 2806 + }, + { + "epoch": 0.36, + "grad_norm": 0.817986507762727, + "learning_rate": 9.43044743230003e-06, + "loss": 0.6453, + "step": 2807 + }, + { + "epoch": 0.36, + "grad_norm": 0.7075351440730898, + "learning_rate": 9.429969183140771e-06, + "loss": 0.5307, + "step": 2808 + }, + { + "epoch": 0.36, + "grad_norm": 0.8892681978788348, + "learning_rate": 9.429490745411878e-06, + "loss": 0.5703, + "step": 2809 + }, + { + "epoch": 0.36, + "grad_norm": 0.6934844745237143, + "learning_rate": 9.42901211913372e-06, + "loss": 0.5847, + "step": 2810 + }, + { + "epoch": 0.36, + "grad_norm": 0.7907377083152881, + "learning_rate": 9.428533304326668e-06, + "loss": 0.5943, + "step": 2811 + }, + { + "epoch": 0.36, + "grad_norm": 0.7895661295094039, + "learning_rate": 9.428054301011104e-06, + "loss": 0.6229, + "step": 2812 + }, + { + "epoch": 0.36, + "grad_norm": 0.592793376722089, + "learning_rate": 9.42757510920742e-06, + "loss": 0.5125, + "step": 2813 + }, + { + "epoch": 0.36, + "grad_norm": 0.9090321305209546, + "learning_rate": 9.427095728936013e-06, + "loss": 0.6113, + "step": 2814 + }, + { + "epoch": 0.36, + "grad_norm": 0.6829019996273853, + "learning_rate": 9.426616160217288e-06, + "loss": 0.5814, + "step": 2815 + }, + { + "epoch": 0.36, + "grad_norm": 0.6471304331268625, + "learning_rate": 9.426136403071656e-06, + "loss": 0.5511, + "step": 2816 + }, + { + "epoch": 0.36, + "grad_norm": 0.6523135127801731, + "learning_rate": 9.425656457519544e-06, + "loss": 0.5825, + "step": 2817 + }, + { + "epoch": 0.36, + "grad_norm": 1.159718982308664, + "learning_rate": 9.425176323581375e-06, + "loss": 0.6203, + "step": 2818 + }, + { + "epoch": 0.36, + "grad_norm": 0.8029568291454016, + "learning_rate": 9.424696001277594e-06, + "loss": 0.5665, + "step": 2819 + }, + { + "epoch": 0.36, + "grad_norm": 1.0670005911598865, + "learning_rate": 9.424215490628644e-06, + "loss": 0.6193, + "step": 2820 + }, + { + "epoch": 0.36, + "grad_norm": 0.6666859487893871, + "learning_rate": 9.423734791654976e-06, + "loss": 0.5869, + "step": 2821 + }, + { + "epoch": 0.36, + "grad_norm": 0.7380117096550212, + "learning_rate": 9.423253904377054e-06, + "loss": 0.5548, + "step": 2822 + }, + { + "epoch": 0.36, + "grad_norm": 0.86011089010381, + "learning_rate": 9.42277282881535e-06, + "loss": 0.5972, + "step": 2823 + }, + { + "epoch": 0.36, + "grad_norm": 0.8826088904629107, + "learning_rate": 9.422291564990339e-06, + "loss": 0.6252, + "step": 2824 + }, + { + "epoch": 0.36, + "grad_norm": 0.8408935153832707, + "learning_rate": 9.421810112922507e-06, + "loss": 0.6092, + "step": 2825 + }, + { + "epoch": 0.36, + "grad_norm": 0.8537020795261052, + "learning_rate": 9.421328472632349e-06, + "loss": 0.68, + "step": 2826 + }, + { + "epoch": 0.36, + "grad_norm": 0.6949599215545131, + "learning_rate": 9.420846644140368e-06, + "loss": 0.5896, + "step": 2827 + }, + { + "epoch": 0.36, + "grad_norm": 0.7244250735563492, + "learning_rate": 9.420364627467071e-06, + "loss": 0.6225, + "step": 2828 + }, + { + "epoch": 0.36, + "grad_norm": 0.7446010533844636, + "learning_rate": 9.419882422632978e-06, + "loss": 0.588, + "step": 2829 + }, + { + "epoch": 0.36, + "grad_norm": 0.7872628454553385, + "learning_rate": 9.419400029658613e-06, + "loss": 0.5979, + "step": 2830 + }, + { + "epoch": 0.36, + "grad_norm": 0.6342756540961432, + "learning_rate": 9.418917448564512e-06, + "loss": 0.5236, + "step": 2831 + }, + { + "epoch": 0.36, + "grad_norm": 0.9368501245531178, + "learning_rate": 9.418434679371216e-06, + "loss": 0.6125, + "step": 2832 + }, + { + "epoch": 0.36, + "grad_norm": 1.033354267299191, + "learning_rate": 9.417951722099275e-06, + "loss": 0.6481, + "step": 2833 + }, + { + "epoch": 0.36, + "grad_norm": 0.630607770788086, + "learning_rate": 9.417468576769247e-06, + "loss": 0.5658, + "step": 2834 + }, + { + "epoch": 0.36, + "grad_norm": 1.4325380942899781, + "learning_rate": 9.416985243401696e-06, + "loss": 0.6096, + "step": 2835 + }, + { + "epoch": 0.36, + "grad_norm": 0.5761440335248742, + "learning_rate": 9.4165017220172e-06, + "loss": 0.5246, + "step": 2836 + }, + { + "epoch": 0.36, + "grad_norm": 0.7249543068030742, + "learning_rate": 9.41601801263634e-06, + "loss": 0.6412, + "step": 2837 + }, + { + "epoch": 0.36, + "grad_norm": 0.9412920957801415, + "learning_rate": 9.415534115279701e-06, + "loss": 0.5744, + "step": 2838 + }, + { + "epoch": 0.36, + "grad_norm": 0.7897584709082406, + "learning_rate": 9.415050029967887e-06, + "loss": 0.5695, + "step": 2839 + }, + { + "epoch": 0.36, + "grad_norm": 1.0703647721946201, + "learning_rate": 9.4145657567215e-06, + "loss": 0.595, + "step": 2840 + }, + { + "epoch": 0.36, + "grad_norm": 0.8092186795924167, + "learning_rate": 9.414081295561157e-06, + "loss": 0.6585, + "step": 2841 + }, + { + "epoch": 0.36, + "grad_norm": 0.9069107351169117, + "learning_rate": 9.41359664650748e-06, + "loss": 0.639, + "step": 2842 + }, + { + "epoch": 0.36, + "grad_norm": 0.7338329136891169, + "learning_rate": 9.413111809581097e-06, + "loss": 0.5871, + "step": 2843 + }, + { + "epoch": 0.36, + "grad_norm": 0.645351721082941, + "learning_rate": 9.412626784802646e-06, + "loss": 0.5864, + "step": 2844 + }, + { + "epoch": 0.36, + "grad_norm": 0.853078731366598, + "learning_rate": 9.412141572192772e-06, + "loss": 0.6837, + "step": 2845 + }, + { + "epoch": 0.36, + "grad_norm": 0.5737539198299346, + "learning_rate": 9.411656171772132e-06, + "loss": 0.5264, + "step": 2846 + }, + { + "epoch": 0.36, + "grad_norm": 0.8672541381913115, + "learning_rate": 9.411170583561386e-06, + "loss": 0.6583, + "step": 2847 + }, + { + "epoch": 0.36, + "grad_norm": 0.8309777279979316, + "learning_rate": 9.410684807581204e-06, + "loss": 0.6165, + "step": 2848 + }, + { + "epoch": 0.36, + "grad_norm": 0.5985005098021324, + "learning_rate": 9.410198843852267e-06, + "loss": 0.5366, + "step": 2849 + }, + { + "epoch": 0.36, + "grad_norm": 0.9759883360897562, + "learning_rate": 9.409712692395257e-06, + "loss": 0.6922, + "step": 2850 + }, + { + "epoch": 0.36, + "grad_norm": 1.0933014987638408, + "learning_rate": 9.409226353230866e-06, + "loss": 0.6349, + "step": 2851 + }, + { + "epoch": 0.36, + "grad_norm": 0.6851806530435292, + "learning_rate": 9.408739826379802e-06, + "loss": 0.5324, + "step": 2852 + }, + { + "epoch": 0.36, + "grad_norm": 0.6426987004506657, + "learning_rate": 9.40825311186277e-06, + "loss": 0.5423, + "step": 2853 + }, + { + "epoch": 0.36, + "grad_norm": 0.7543805886760679, + "learning_rate": 9.407766209700493e-06, + "loss": 0.6085, + "step": 2854 + }, + { + "epoch": 0.36, + "grad_norm": 0.6945186168163094, + "learning_rate": 9.40727911991369e-06, + "loss": 0.6244, + "step": 2855 + }, + { + "epoch": 0.36, + "grad_norm": 0.9530437827842488, + "learning_rate": 9.406791842523101e-06, + "loss": 0.6679, + "step": 2856 + }, + { + "epoch": 0.36, + "grad_norm": 0.6025847100072658, + "learning_rate": 9.406304377549464e-06, + "loss": 0.52, + "step": 2857 + }, + { + "epoch": 0.36, + "grad_norm": 0.8686059097275766, + "learning_rate": 9.40581672501353e-06, + "loss": 0.6455, + "step": 2858 + }, + { + "epoch": 0.36, + "grad_norm": 0.6122480572803359, + "learning_rate": 9.405328884936058e-06, + "loss": 0.5348, + "step": 2859 + }, + { + "epoch": 0.36, + "grad_norm": 0.585461852705897, + "learning_rate": 9.404840857337814e-06, + "loss": 0.5403, + "step": 2860 + }, + { + "epoch": 0.36, + "grad_norm": 0.5677296145455577, + "learning_rate": 9.404352642239569e-06, + "loss": 0.5262, + "step": 2861 + }, + { + "epoch": 0.36, + "grad_norm": 1.081751345125273, + "learning_rate": 9.403864239662106e-06, + "loss": 0.6406, + "step": 2862 + }, + { + "epoch": 0.36, + "grad_norm": 0.8885644008018558, + "learning_rate": 9.403375649626214e-06, + "loss": 0.6475, + "step": 2863 + }, + { + "epoch": 0.36, + "grad_norm": 0.6353144235774352, + "learning_rate": 9.402886872152695e-06, + "loss": 0.5235, + "step": 2864 + }, + { + "epoch": 0.36, + "grad_norm": 0.7409406000262078, + "learning_rate": 9.402397907262348e-06, + "loss": 0.6313, + "step": 2865 + }, + { + "epoch": 0.37, + "grad_norm": 0.5830889351646802, + "learning_rate": 9.401908754975993e-06, + "loss": 0.5451, + "step": 2866 + }, + { + "epoch": 0.37, + "grad_norm": 0.601798767248453, + "learning_rate": 9.401419415314447e-06, + "loss": 0.501, + "step": 2867 + }, + { + "epoch": 0.37, + "grad_norm": 0.6403338198548815, + "learning_rate": 9.400929888298542e-06, + "loss": 0.583, + "step": 2868 + }, + { + "epoch": 0.37, + "grad_norm": 0.6519778667822522, + "learning_rate": 9.400440173949115e-06, + "loss": 0.5455, + "step": 2869 + }, + { + "epoch": 0.37, + "grad_norm": 0.616829522867513, + "learning_rate": 9.399950272287011e-06, + "loss": 0.6113, + "step": 2870 + }, + { + "epoch": 0.37, + "grad_norm": 0.5984801817276588, + "learning_rate": 9.399460183333084e-06, + "loss": 0.4937, + "step": 2871 + }, + { + "epoch": 0.37, + "grad_norm": 0.8405439718925305, + "learning_rate": 9.398969907108198e-06, + "loss": 0.6504, + "step": 2872 + }, + { + "epoch": 0.37, + "grad_norm": 0.8285620852945064, + "learning_rate": 9.398479443633217e-06, + "loss": 0.5455, + "step": 2873 + }, + { + "epoch": 0.37, + "grad_norm": 0.658413194806729, + "learning_rate": 9.397988792929024e-06, + "loss": 0.5504, + "step": 2874 + }, + { + "epoch": 0.37, + "grad_norm": 0.779401416885058, + "learning_rate": 9.397497955016502e-06, + "loss": 0.5915, + "step": 2875 + }, + { + "epoch": 0.37, + "grad_norm": 0.5954922739090105, + "learning_rate": 9.397006929916542e-06, + "loss": 0.5742, + "step": 2876 + }, + { + "epoch": 0.37, + "grad_norm": 0.7908914462166622, + "learning_rate": 9.39651571765005e-06, + "loss": 0.6958, + "step": 2877 + }, + { + "epoch": 0.37, + "grad_norm": 0.5842953799188918, + "learning_rate": 9.396024318237932e-06, + "loss": 0.5165, + "step": 2878 + }, + { + "epoch": 0.37, + "grad_norm": 0.8778519407334405, + "learning_rate": 9.395532731701106e-06, + "loss": 0.6618, + "step": 2879 + }, + { + "epoch": 0.37, + "grad_norm": 0.732685740487691, + "learning_rate": 9.3950409580605e-06, + "loss": 0.5762, + "step": 2880 + }, + { + "epoch": 0.37, + "grad_norm": 0.6341810910406681, + "learning_rate": 9.394548997337044e-06, + "loss": 0.5098, + "step": 2881 + }, + { + "epoch": 0.37, + "grad_norm": 0.6446350693674776, + "learning_rate": 9.394056849551681e-06, + "loss": 0.5658, + "step": 2882 + }, + { + "epoch": 0.37, + "grad_norm": 0.6860208191364687, + "learning_rate": 9.393564514725357e-06, + "loss": 0.5522, + "step": 2883 + }, + { + "epoch": 0.37, + "grad_norm": 0.573984716627783, + "learning_rate": 9.393071992879033e-06, + "loss": 0.5381, + "step": 2884 + }, + { + "epoch": 0.37, + "grad_norm": 0.5983887150093357, + "learning_rate": 9.392579284033672e-06, + "loss": 0.5736, + "step": 2885 + }, + { + "epoch": 0.37, + "grad_norm": 0.7597083521338649, + "learning_rate": 9.392086388210248e-06, + "loss": 0.6242, + "step": 2886 + }, + { + "epoch": 0.37, + "grad_norm": 0.7071420000098009, + "learning_rate": 9.391593305429741e-06, + "loss": 0.5656, + "step": 2887 + }, + { + "epoch": 0.37, + "grad_norm": 0.7808994541893899, + "learning_rate": 9.39110003571314e-06, + "loss": 0.6728, + "step": 2888 + }, + { + "epoch": 0.37, + "grad_norm": 0.5811025956057759, + "learning_rate": 9.390606579081445e-06, + "loss": 0.5341, + "step": 2889 + }, + { + "epoch": 0.37, + "grad_norm": 0.6992273447442522, + "learning_rate": 9.390112935555655e-06, + "loss": 0.577, + "step": 2890 + }, + { + "epoch": 0.37, + "grad_norm": 0.7456521083506628, + "learning_rate": 9.389619105156788e-06, + "loss": 0.6125, + "step": 2891 + }, + { + "epoch": 0.37, + "grad_norm": 0.7156156740151897, + "learning_rate": 9.389125087905862e-06, + "loss": 0.6062, + "step": 2892 + }, + { + "epoch": 0.37, + "grad_norm": 0.884844055039022, + "learning_rate": 9.388630883823909e-06, + "loss": 0.5962, + "step": 2893 + }, + { + "epoch": 0.37, + "grad_norm": 1.1687731001705837, + "learning_rate": 9.38813649293196e-06, + "loss": 0.6283, + "step": 2894 + }, + { + "epoch": 0.37, + "grad_norm": 0.5924002278826536, + "learning_rate": 9.387641915251065e-06, + "loss": 0.5487, + "step": 2895 + }, + { + "epoch": 0.37, + "grad_norm": 0.5738750677465778, + "learning_rate": 9.387147150802274e-06, + "loss": 0.589, + "step": 2896 + }, + { + "epoch": 0.37, + "grad_norm": 0.7334098818311762, + "learning_rate": 9.386652199606648e-06, + "loss": 0.5495, + "step": 2897 + }, + { + "epoch": 0.37, + "grad_norm": 0.6344325183000025, + "learning_rate": 9.386157061685255e-06, + "loss": 0.5708, + "step": 2898 + }, + { + "epoch": 0.37, + "grad_norm": 0.7188129179715578, + "learning_rate": 9.385661737059172e-06, + "loss": 0.5859, + "step": 2899 + }, + { + "epoch": 0.37, + "grad_norm": 0.5763663100946099, + "learning_rate": 9.385166225749485e-06, + "loss": 0.5492, + "step": 2900 + }, + { + "epoch": 0.37, + "grad_norm": 0.7245777365379958, + "learning_rate": 9.384670527777284e-06, + "loss": 0.517, + "step": 2901 + }, + { + "epoch": 0.37, + "grad_norm": 0.8367286295807743, + "learning_rate": 9.38417464316367e-06, + "loss": 0.6508, + "step": 2902 + }, + { + "epoch": 0.37, + "grad_norm": 0.6331625799793746, + "learning_rate": 9.38367857192975e-06, + "loss": 0.5626, + "step": 2903 + }, + { + "epoch": 0.37, + "grad_norm": 0.8974882912021842, + "learning_rate": 9.383182314096643e-06, + "loss": 0.6413, + "step": 2904 + }, + { + "epoch": 0.37, + "grad_norm": 0.715344356395881, + "learning_rate": 9.382685869685473e-06, + "loss": 0.5415, + "step": 2905 + }, + { + "epoch": 0.37, + "grad_norm": 0.7184999660613931, + "learning_rate": 9.382189238717367e-06, + "loss": 0.5578, + "step": 2906 + }, + { + "epoch": 0.37, + "grad_norm": 0.6660457606478688, + "learning_rate": 9.38169242121347e-06, + "loss": 0.5613, + "step": 2907 + }, + { + "epoch": 0.37, + "grad_norm": 0.7625046137579277, + "learning_rate": 9.38119541719493e-06, + "loss": 0.6153, + "step": 2908 + }, + { + "epoch": 0.37, + "grad_norm": 0.6219015803012954, + "learning_rate": 9.3806982266829e-06, + "loss": 0.5723, + "step": 2909 + }, + { + "epoch": 0.37, + "grad_norm": 0.6358396301577278, + "learning_rate": 9.380200849698547e-06, + "loss": 0.5039, + "step": 2910 + }, + { + "epoch": 0.37, + "grad_norm": 0.8309198004924665, + "learning_rate": 9.379703286263037e-06, + "loss": 0.6091, + "step": 2911 + }, + { + "epoch": 0.37, + "grad_norm": 0.7881318920771448, + "learning_rate": 9.379205536397558e-06, + "loss": 0.6043, + "step": 2912 + }, + { + "epoch": 0.37, + "grad_norm": 0.658074606345979, + "learning_rate": 9.378707600123292e-06, + "loss": 0.5866, + "step": 2913 + }, + { + "epoch": 0.37, + "grad_norm": 0.7781760241293016, + "learning_rate": 9.378209477461435e-06, + "loss": 0.6365, + "step": 2914 + }, + { + "epoch": 0.37, + "grad_norm": 0.8019528651249476, + "learning_rate": 9.37771116843319e-06, + "loss": 0.6024, + "step": 2915 + }, + { + "epoch": 0.37, + "grad_norm": 0.6137475171688824, + "learning_rate": 9.377212673059771e-06, + "loss": 0.5373, + "step": 2916 + }, + { + "epoch": 0.37, + "grad_norm": 0.6083863516564999, + "learning_rate": 9.376713991362396e-06, + "loss": 0.5458, + "step": 2917 + }, + { + "epoch": 0.37, + "grad_norm": 0.8613242340804598, + "learning_rate": 9.376215123362293e-06, + "loss": 0.6441, + "step": 2918 + }, + { + "epoch": 0.37, + "grad_norm": 0.6604837134858038, + "learning_rate": 9.375716069080696e-06, + "loss": 0.5517, + "step": 2919 + }, + { + "epoch": 0.37, + "grad_norm": 0.7052519738588503, + "learning_rate": 9.375216828538849e-06, + "loss": 0.5298, + "step": 2920 + }, + { + "epoch": 0.37, + "grad_norm": 0.9483927267239565, + "learning_rate": 9.374717401758001e-06, + "loss": 0.6746, + "step": 2921 + }, + { + "epoch": 0.37, + "grad_norm": 0.648908303677272, + "learning_rate": 9.374217788759417e-06, + "loss": 0.5798, + "step": 2922 + }, + { + "epoch": 0.37, + "grad_norm": 0.6520297891395666, + "learning_rate": 9.373717989564357e-06, + "loss": 0.5101, + "step": 2923 + }, + { + "epoch": 0.37, + "grad_norm": 0.6209113678896905, + "learning_rate": 9.373218004194098e-06, + "loss": 0.5482, + "step": 2924 + }, + { + "epoch": 0.37, + "grad_norm": 0.6624328566805205, + "learning_rate": 9.372717832669924e-06, + "loss": 0.6084, + "step": 2925 + }, + { + "epoch": 0.37, + "grad_norm": 0.6039012435279127, + "learning_rate": 9.372217475013125e-06, + "loss": 0.5126, + "step": 2926 + }, + { + "epoch": 0.37, + "grad_norm": 0.6613123256570572, + "learning_rate": 9.371716931245001e-06, + "loss": 0.576, + "step": 2927 + }, + { + "epoch": 0.37, + "grad_norm": 0.6287042674642637, + "learning_rate": 9.371216201386855e-06, + "loss": 0.5232, + "step": 2928 + }, + { + "epoch": 0.37, + "grad_norm": 0.5753287882600551, + "learning_rate": 9.370715285460006e-06, + "loss": 0.5522, + "step": 2929 + }, + { + "epoch": 0.37, + "grad_norm": 0.846231266735744, + "learning_rate": 9.370214183485773e-06, + "loss": 0.6236, + "step": 2930 + }, + { + "epoch": 0.37, + "grad_norm": 0.7573771445645239, + "learning_rate": 9.369712895485487e-06, + "loss": 0.654, + "step": 2931 + }, + { + "epoch": 0.37, + "grad_norm": 1.170271331259036, + "learning_rate": 9.369211421480488e-06, + "loss": 0.6071, + "step": 2932 + }, + { + "epoch": 0.37, + "grad_norm": 0.6709961556183903, + "learning_rate": 9.368709761492118e-06, + "loss": 0.5738, + "step": 2933 + }, + { + "epoch": 0.37, + "grad_norm": 0.6046082867343111, + "learning_rate": 9.368207915541736e-06, + "loss": 0.5495, + "step": 2934 + }, + { + "epoch": 0.37, + "grad_norm": 1.1325040102684705, + "learning_rate": 9.367705883650702e-06, + "loss": 0.6025, + "step": 2935 + }, + { + "epoch": 0.37, + "grad_norm": 0.7292815796694089, + "learning_rate": 9.367203665840385e-06, + "loss": 0.5687, + "step": 2936 + }, + { + "epoch": 0.37, + "grad_norm": 0.9575243393849735, + "learning_rate": 9.366701262132164e-06, + "loss": 0.6827, + "step": 2937 + }, + { + "epoch": 0.37, + "grad_norm": 0.6305451456666317, + "learning_rate": 9.366198672547424e-06, + "loss": 0.5839, + "step": 2938 + }, + { + "epoch": 0.37, + "grad_norm": 0.8588962449405341, + "learning_rate": 9.36569589710756e-06, + "loss": 0.6266, + "step": 2939 + }, + { + "epoch": 0.37, + "grad_norm": 0.589765890407199, + "learning_rate": 9.365192935833972e-06, + "loss": 0.5277, + "step": 2940 + }, + { + "epoch": 0.37, + "grad_norm": 0.7664631674614872, + "learning_rate": 9.364689788748068e-06, + "loss": 0.6369, + "step": 2941 + }, + { + "epoch": 0.37, + "grad_norm": 1.1335936449339294, + "learning_rate": 9.36418645587127e-06, + "loss": 0.7047, + "step": 2942 + }, + { + "epoch": 0.37, + "grad_norm": 0.8685996226447723, + "learning_rate": 9.363682937224997e-06, + "loss": 0.6507, + "step": 2943 + }, + { + "epoch": 0.38, + "grad_norm": 0.7300136339693422, + "learning_rate": 9.363179232830688e-06, + "loss": 0.5687, + "step": 2944 + }, + { + "epoch": 0.38, + "grad_norm": 0.6508117148397068, + "learning_rate": 9.362675342709782e-06, + "loss": 0.5304, + "step": 2945 + }, + { + "epoch": 0.38, + "grad_norm": 0.5731058920579397, + "learning_rate": 9.362171266883728e-06, + "loss": 0.5281, + "step": 2946 + }, + { + "epoch": 0.38, + "grad_norm": 0.6330248233737237, + "learning_rate": 9.36166700537398e-06, + "loss": 0.6018, + "step": 2947 + }, + { + "epoch": 0.38, + "grad_norm": 0.6931518987762789, + "learning_rate": 9.361162558202009e-06, + "loss": 0.5297, + "step": 2948 + }, + { + "epoch": 0.38, + "grad_norm": 0.8090933159926346, + "learning_rate": 9.360657925389283e-06, + "loss": 0.6245, + "step": 2949 + }, + { + "epoch": 0.38, + "grad_norm": 0.9081573626819215, + "learning_rate": 9.360153106957283e-06, + "loss": 0.6553, + "step": 2950 + }, + { + "epoch": 0.38, + "grad_norm": 0.6089575980777817, + "learning_rate": 9.359648102927499e-06, + "loss": 0.5723, + "step": 2951 + }, + { + "epoch": 0.38, + "grad_norm": 0.8969338297373163, + "learning_rate": 9.359142913321427e-06, + "loss": 0.6536, + "step": 2952 + }, + { + "epoch": 0.38, + "grad_norm": 0.6081021581080863, + "learning_rate": 9.358637538160569e-06, + "loss": 0.5302, + "step": 2953 + }, + { + "epoch": 0.38, + "grad_norm": 0.5845986707871152, + "learning_rate": 9.35813197746644e-06, + "loss": 0.5377, + "step": 2954 + }, + { + "epoch": 0.38, + "grad_norm": 0.7618249368956932, + "learning_rate": 9.357626231260562e-06, + "loss": 0.5494, + "step": 2955 + }, + { + "epoch": 0.38, + "grad_norm": 0.7053833506407878, + "learning_rate": 9.357120299564457e-06, + "loss": 0.5632, + "step": 2956 + }, + { + "epoch": 0.38, + "grad_norm": 0.7454397296241266, + "learning_rate": 9.356614182399666e-06, + "loss": 0.5882, + "step": 2957 + }, + { + "epoch": 0.38, + "grad_norm": 0.6079229664082009, + "learning_rate": 9.356107879787731e-06, + "loss": 0.5482, + "step": 2958 + }, + { + "epoch": 0.38, + "grad_norm": 0.8058099202927756, + "learning_rate": 9.355601391750202e-06, + "loss": 0.5491, + "step": 2959 + }, + { + "epoch": 0.38, + "grad_norm": 0.8408124520543254, + "learning_rate": 9.355094718308642e-06, + "loss": 0.7278, + "step": 2960 + }, + { + "epoch": 0.38, + "grad_norm": 0.8638475470427798, + "learning_rate": 9.354587859484616e-06, + "loss": 0.6491, + "step": 2961 + }, + { + "epoch": 0.38, + "grad_norm": 0.9616312733333742, + "learning_rate": 9.3540808152997e-06, + "loss": 0.6151, + "step": 2962 + }, + { + "epoch": 0.38, + "grad_norm": 0.9157548569921015, + "learning_rate": 9.353573585775478e-06, + "loss": 0.6686, + "step": 2963 + }, + { + "epoch": 0.38, + "grad_norm": 0.7716073816140209, + "learning_rate": 9.35306617093354e-06, + "loss": 0.616, + "step": 2964 + }, + { + "epoch": 0.38, + "grad_norm": 0.7078847840826726, + "learning_rate": 9.352558570795485e-06, + "loss": 0.5714, + "step": 2965 + }, + { + "epoch": 0.38, + "grad_norm": 0.6285416812475433, + "learning_rate": 9.35205078538292e-06, + "loss": 0.5332, + "step": 2966 + }, + { + "epoch": 0.38, + "grad_norm": 0.738607843354742, + "learning_rate": 9.351542814717463e-06, + "loss": 0.6244, + "step": 2967 + }, + { + "epoch": 0.38, + "grad_norm": 0.7025995652133761, + "learning_rate": 9.351034658820731e-06, + "loss": 0.5678, + "step": 2968 + }, + { + "epoch": 0.38, + "grad_norm": 0.6454152210839516, + "learning_rate": 9.35052631771436e-06, + "loss": 0.5515, + "step": 2969 + }, + { + "epoch": 0.38, + "grad_norm": 2.2417818654827886, + "learning_rate": 9.350017791419983e-06, + "loss": 0.6563, + "step": 2970 + }, + { + "epoch": 0.38, + "grad_norm": 0.977732183414922, + "learning_rate": 9.349509079959252e-06, + "loss": 0.5975, + "step": 2971 + }, + { + "epoch": 0.38, + "grad_norm": 0.7946625188117596, + "learning_rate": 9.349000183353817e-06, + "loss": 0.6598, + "step": 2972 + }, + { + "epoch": 0.38, + "grad_norm": 1.3933140392024956, + "learning_rate": 9.348491101625342e-06, + "loss": 0.6087, + "step": 2973 + }, + { + "epoch": 0.38, + "grad_norm": 0.63265142918763, + "learning_rate": 9.347981834795497e-06, + "loss": 0.5641, + "step": 2974 + }, + { + "epoch": 0.38, + "grad_norm": 0.8515643184144605, + "learning_rate": 9.347472382885958e-06, + "loss": 0.5925, + "step": 2975 + }, + { + "epoch": 0.38, + "grad_norm": 0.8477336400377684, + "learning_rate": 9.346962745918413e-06, + "loss": 0.6146, + "step": 2976 + }, + { + "epoch": 0.38, + "grad_norm": 0.603407537780216, + "learning_rate": 9.346452923914555e-06, + "loss": 0.4649, + "step": 2977 + }, + { + "epoch": 0.38, + "grad_norm": 0.6604501094374994, + "learning_rate": 9.345942916896087e-06, + "loss": 0.5437, + "step": 2978 + }, + { + "epoch": 0.38, + "grad_norm": 0.694988307245293, + "learning_rate": 9.345432724884714e-06, + "loss": 0.585, + "step": 2979 + }, + { + "epoch": 0.38, + "grad_norm": 0.7034293684780129, + "learning_rate": 9.344922347902157e-06, + "loss": 0.5716, + "step": 2980 + }, + { + "epoch": 0.38, + "grad_norm": 0.7991747189698661, + "learning_rate": 9.34441178597014e-06, + "loss": 0.5782, + "step": 2981 + }, + { + "epoch": 0.38, + "grad_norm": 2.4727008222887323, + "learning_rate": 9.343901039110396e-06, + "loss": 0.6948, + "step": 2982 + }, + { + "epoch": 0.38, + "grad_norm": 0.9349853240298963, + "learning_rate": 9.343390107344665e-06, + "loss": 0.6253, + "step": 2983 + }, + { + "epoch": 0.38, + "grad_norm": 0.6167534352333365, + "learning_rate": 9.342878990694698e-06, + "loss": 0.587, + "step": 2984 + }, + { + "epoch": 0.38, + "grad_norm": 0.7958548637169885, + "learning_rate": 9.34236768918225e-06, + "loss": 0.637, + "step": 2985 + }, + { + "epoch": 0.38, + "grad_norm": 0.7553547332723676, + "learning_rate": 9.341856202829086e-06, + "loss": 0.6235, + "step": 2986 + }, + { + "epoch": 0.38, + "grad_norm": 0.6116580339787628, + "learning_rate": 9.341344531656977e-06, + "loss": 0.5701, + "step": 2987 + }, + { + "epoch": 0.38, + "grad_norm": 0.8274743915536293, + "learning_rate": 9.340832675687706e-06, + "loss": 0.6359, + "step": 2988 + }, + { + "epoch": 0.38, + "grad_norm": 0.5895476890248806, + "learning_rate": 9.340320634943057e-06, + "loss": 0.4841, + "step": 2989 + }, + { + "epoch": 0.38, + "grad_norm": 0.7026001758067324, + "learning_rate": 9.339808409444829e-06, + "loss": 0.5757, + "step": 2990 + }, + { + "epoch": 0.38, + "grad_norm": 0.6487806828137881, + "learning_rate": 9.339295999214825e-06, + "loss": 0.521, + "step": 2991 + }, + { + "epoch": 0.38, + "grad_norm": 0.5982423403557943, + "learning_rate": 9.338783404274858e-06, + "loss": 0.5509, + "step": 2992 + }, + { + "epoch": 0.38, + "grad_norm": 0.8143840407930086, + "learning_rate": 9.338270624646745e-06, + "loss": 0.6016, + "step": 2993 + }, + { + "epoch": 0.38, + "grad_norm": 0.8278309078279198, + "learning_rate": 9.337757660352315e-06, + "loss": 0.6428, + "step": 2994 + }, + { + "epoch": 0.38, + "grad_norm": 0.6076523701640529, + "learning_rate": 9.337244511413402e-06, + "loss": 0.5534, + "step": 2995 + }, + { + "epoch": 0.38, + "grad_norm": 0.9330251158033744, + "learning_rate": 9.336731177851852e-06, + "loss": 0.6081, + "step": 2996 + }, + { + "epoch": 0.38, + "grad_norm": 0.8346508394264712, + "learning_rate": 9.336217659689512e-06, + "loss": 0.6167, + "step": 2997 + }, + { + "epoch": 0.38, + "grad_norm": 0.742525535161732, + "learning_rate": 9.335703956948243e-06, + "loss": 0.5543, + "step": 2998 + }, + { + "epoch": 0.38, + "grad_norm": 0.7866807279632302, + "learning_rate": 9.335190069649913e-06, + "loss": 0.5509, + "step": 2999 + }, + { + "epoch": 0.38, + "grad_norm": 0.6825555935104467, + "learning_rate": 9.334675997816393e-06, + "loss": 0.5665, + "step": 3000 + }, + { + "epoch": 0.38, + "grad_norm": 0.7496473110772021, + "learning_rate": 9.33416174146957e-06, + "loss": 0.6646, + "step": 3001 + }, + { + "epoch": 0.38, + "grad_norm": 0.6414747458468472, + "learning_rate": 9.33364730063133e-06, + "loss": 0.5835, + "step": 3002 + }, + { + "epoch": 0.38, + "grad_norm": 0.7642314919294045, + "learning_rate": 9.333132675323573e-06, + "loss": 0.6587, + "step": 3003 + }, + { + "epoch": 0.38, + "grad_norm": 0.6151487272763504, + "learning_rate": 9.332617865568204e-06, + "loss": 0.5578, + "step": 3004 + }, + { + "epoch": 0.38, + "grad_norm": 0.8710684662555659, + "learning_rate": 9.332102871387139e-06, + "loss": 0.5876, + "step": 3005 + }, + { + "epoch": 0.38, + "grad_norm": 0.6427377041504402, + "learning_rate": 9.331587692802298e-06, + "loss": 0.5182, + "step": 3006 + }, + { + "epoch": 0.38, + "grad_norm": 0.7158472416463113, + "learning_rate": 9.331072329835608e-06, + "loss": 0.5122, + "step": 3007 + }, + { + "epoch": 0.38, + "grad_norm": 0.7418179663545695, + "learning_rate": 9.330556782509013e-06, + "loss": 0.6371, + "step": 3008 + }, + { + "epoch": 0.38, + "grad_norm": 0.5572898008872351, + "learning_rate": 9.330041050844451e-06, + "loss": 0.5122, + "step": 3009 + }, + { + "epoch": 0.38, + "grad_norm": 0.7713518982989871, + "learning_rate": 9.329525134863879e-06, + "loss": 0.6413, + "step": 3010 + }, + { + "epoch": 0.38, + "grad_norm": 0.6819405797409833, + "learning_rate": 9.329009034589259e-06, + "loss": 0.553, + "step": 3011 + }, + { + "epoch": 0.38, + "grad_norm": 0.9541471572663822, + "learning_rate": 9.328492750042557e-06, + "loss": 0.6898, + "step": 3012 + }, + { + "epoch": 0.38, + "grad_norm": 0.7001677216180435, + "learning_rate": 9.327976281245749e-06, + "loss": 0.5193, + "step": 3013 + }, + { + "epoch": 0.38, + "grad_norm": 0.696208988090597, + "learning_rate": 9.327459628220823e-06, + "loss": 0.547, + "step": 3014 + }, + { + "epoch": 0.38, + "grad_norm": 0.8183328967029198, + "learning_rate": 9.326942790989768e-06, + "loss": 0.6224, + "step": 3015 + }, + { + "epoch": 0.38, + "grad_norm": 0.8003448604103915, + "learning_rate": 9.326425769574586e-06, + "loss": 0.5968, + "step": 3016 + }, + { + "epoch": 0.38, + "grad_norm": 0.7645706263357354, + "learning_rate": 9.325908563997284e-06, + "loss": 0.5375, + "step": 3017 + }, + { + "epoch": 0.38, + "grad_norm": 0.6388165494741231, + "learning_rate": 9.32539117427988e-06, + "loss": 0.5432, + "step": 3018 + }, + { + "epoch": 0.38, + "grad_norm": 0.6579376954107429, + "learning_rate": 9.324873600444393e-06, + "loss": 0.5294, + "step": 3019 + }, + { + "epoch": 0.38, + "grad_norm": 0.5645629811200503, + "learning_rate": 9.324355842512858e-06, + "loss": 0.5076, + "step": 3020 + }, + { + "epoch": 0.38, + "grad_norm": 0.7229755582677653, + "learning_rate": 9.323837900507313e-06, + "loss": 0.5746, + "step": 3021 + }, + { + "epoch": 0.38, + "grad_norm": 0.790481285180887, + "learning_rate": 9.323319774449806e-06, + "loss": 0.6435, + "step": 3022 + }, + { + "epoch": 0.39, + "grad_norm": 0.691859607482006, + "learning_rate": 9.32280146436239e-06, + "loss": 0.6124, + "step": 3023 + }, + { + "epoch": 0.39, + "grad_norm": 0.626196159201232, + "learning_rate": 9.322282970267132e-06, + "loss": 0.5344, + "step": 3024 + }, + { + "epoch": 0.39, + "grad_norm": 0.8321946396502785, + "learning_rate": 9.321764292186098e-06, + "loss": 0.6125, + "step": 3025 + }, + { + "epoch": 0.39, + "grad_norm": 0.6642325621002598, + "learning_rate": 9.32124543014137e-06, + "loss": 0.5373, + "step": 3026 + }, + { + "epoch": 0.39, + "grad_norm": 1.2194842676981374, + "learning_rate": 9.320726384155032e-06, + "loss": 0.6215, + "step": 3027 + }, + { + "epoch": 0.39, + "grad_norm": 0.6319056284270012, + "learning_rate": 9.320207154249179e-06, + "loss": 0.5524, + "step": 3028 + }, + { + "epoch": 0.39, + "grad_norm": 0.6057130280445902, + "learning_rate": 9.319687740445914e-06, + "loss": 0.5426, + "step": 3029 + }, + { + "epoch": 0.39, + "grad_norm": 0.8053863670189406, + "learning_rate": 9.319168142767344e-06, + "loss": 0.6014, + "step": 3030 + }, + { + "epoch": 0.39, + "grad_norm": 0.8536366098854649, + "learning_rate": 9.318648361235588e-06, + "loss": 0.5412, + "step": 3031 + }, + { + "epoch": 0.39, + "grad_norm": 1.1209552390347748, + "learning_rate": 9.318128395872772e-06, + "loss": 0.6431, + "step": 3032 + }, + { + "epoch": 0.39, + "grad_norm": 0.5805310978027239, + "learning_rate": 9.31760824670103e-06, + "loss": 0.5322, + "step": 3033 + }, + { + "epoch": 0.39, + "grad_norm": 0.7922086275610772, + "learning_rate": 9.317087913742501e-06, + "loss": 0.5947, + "step": 3034 + }, + { + "epoch": 0.39, + "grad_norm": 0.7086138941111222, + "learning_rate": 9.316567397019336e-06, + "loss": 0.4977, + "step": 3035 + }, + { + "epoch": 0.39, + "grad_norm": 0.8280261185336169, + "learning_rate": 9.31604669655369e-06, + "loss": 0.639, + "step": 3036 + }, + { + "epoch": 0.39, + "grad_norm": 0.901513560973976, + "learning_rate": 9.315525812367728e-06, + "loss": 0.6625, + "step": 3037 + }, + { + "epoch": 0.39, + "grad_norm": 0.6683418740981407, + "learning_rate": 9.315004744483623e-06, + "loss": 0.5734, + "step": 3038 + }, + { + "epoch": 0.39, + "grad_norm": 0.5955479809766612, + "learning_rate": 9.314483492923555e-06, + "loss": 0.569, + "step": 3039 + }, + { + "epoch": 0.39, + "grad_norm": 0.7098366300768894, + "learning_rate": 9.313962057709712e-06, + "loss": 0.5087, + "step": 3040 + }, + { + "epoch": 0.39, + "grad_norm": 0.6644309826988876, + "learning_rate": 9.313440438864288e-06, + "loss": 0.5233, + "step": 3041 + }, + { + "epoch": 0.39, + "grad_norm": 0.9052441480315545, + "learning_rate": 9.312918636409488e-06, + "loss": 0.5835, + "step": 3042 + }, + { + "epoch": 0.39, + "grad_norm": 0.7402182307744731, + "learning_rate": 9.312396650367528e-06, + "loss": 0.6059, + "step": 3043 + }, + { + "epoch": 0.39, + "grad_norm": 0.5881936680543857, + "learning_rate": 9.311874480760619e-06, + "loss": 0.4582, + "step": 3044 + }, + { + "epoch": 0.39, + "grad_norm": 0.7831489203537119, + "learning_rate": 9.311352127610995e-06, + "loss": 0.648, + "step": 3045 + }, + { + "epoch": 0.39, + "grad_norm": 0.5943426347788903, + "learning_rate": 9.310829590940886e-06, + "loss": 0.5038, + "step": 3046 + }, + { + "epoch": 0.39, + "grad_norm": 0.6626827551503282, + "learning_rate": 9.310306870772536e-06, + "loss": 0.6143, + "step": 3047 + }, + { + "epoch": 0.39, + "grad_norm": 0.6808381714363823, + "learning_rate": 9.3097839671282e-06, + "loss": 0.5389, + "step": 3048 + }, + { + "epoch": 0.39, + "grad_norm": 0.7441243592991102, + "learning_rate": 9.309260880030128e-06, + "loss": 0.5767, + "step": 3049 + }, + { + "epoch": 0.39, + "grad_norm": 0.6852853431179967, + "learning_rate": 9.308737609500593e-06, + "loss": 0.5732, + "step": 3050 + }, + { + "epoch": 0.39, + "grad_norm": 0.8530760941007216, + "learning_rate": 9.308214155561866e-06, + "loss": 0.6467, + "step": 3051 + }, + { + "epoch": 0.39, + "grad_norm": 0.6322104196931244, + "learning_rate": 9.30769051823623e-06, + "loss": 0.4893, + "step": 3052 + }, + { + "epoch": 0.39, + "grad_norm": 1.1032182049391803, + "learning_rate": 9.307166697545976e-06, + "loss": 0.6418, + "step": 3053 + }, + { + "epoch": 0.39, + "grad_norm": 0.546218845958658, + "learning_rate": 9.306642693513397e-06, + "loss": 0.5108, + "step": 3054 + }, + { + "epoch": 0.39, + "grad_norm": 0.5662008064598786, + "learning_rate": 9.3061185061608e-06, + "loss": 0.5168, + "step": 3055 + }, + { + "epoch": 0.39, + "grad_norm": 0.7800336009647092, + "learning_rate": 9.3055941355105e-06, + "loss": 0.6175, + "step": 3056 + }, + { + "epoch": 0.39, + "grad_norm": 0.7333866231083132, + "learning_rate": 9.305069581584816e-06, + "loss": 0.5589, + "step": 3057 + }, + { + "epoch": 0.39, + "grad_norm": 0.6769398353688174, + "learning_rate": 9.304544844406077e-06, + "loss": 0.5824, + "step": 3058 + }, + { + "epoch": 0.39, + "grad_norm": 0.6581380436974328, + "learning_rate": 9.304019923996619e-06, + "loss": 0.5922, + "step": 3059 + }, + { + "epoch": 0.39, + "grad_norm": 0.6768552942287963, + "learning_rate": 9.303494820378787e-06, + "loss": 0.6227, + "step": 3060 + }, + { + "epoch": 0.39, + "grad_norm": 0.6824732779233269, + "learning_rate": 9.302969533574933e-06, + "loss": 0.5432, + "step": 3061 + }, + { + "epoch": 0.39, + "grad_norm": 0.9017951473668746, + "learning_rate": 9.302444063607415e-06, + "loss": 0.6175, + "step": 3062 + }, + { + "epoch": 0.39, + "grad_norm": 0.9157586394484665, + "learning_rate": 9.301918410498603e-06, + "loss": 0.6712, + "step": 3063 + }, + { + "epoch": 0.39, + "grad_norm": 0.7766878050601725, + "learning_rate": 9.301392574270871e-06, + "loss": 0.6945, + "step": 3064 + }, + { + "epoch": 0.39, + "grad_norm": 0.6717405829253689, + "learning_rate": 9.300866554946601e-06, + "loss": 0.5873, + "step": 3065 + }, + { + "epoch": 0.39, + "grad_norm": 0.7471310858999942, + "learning_rate": 9.300340352548187e-06, + "loss": 0.5998, + "step": 3066 + }, + { + "epoch": 0.39, + "grad_norm": 0.7138892208972915, + "learning_rate": 9.299813967098025e-06, + "loss": 0.5422, + "step": 3067 + }, + { + "epoch": 0.39, + "grad_norm": 0.7148621043066545, + "learning_rate": 9.299287398618523e-06, + "loss": 0.5343, + "step": 3068 + }, + { + "epoch": 0.39, + "grad_norm": 0.5276683100930166, + "learning_rate": 9.298760647132096e-06, + "loss": 0.4901, + "step": 3069 + }, + { + "epoch": 0.39, + "grad_norm": 0.6157281448589456, + "learning_rate": 9.298233712661166e-06, + "loss": 0.5509, + "step": 3070 + }, + { + "epoch": 0.39, + "grad_norm": 0.5890621180780182, + "learning_rate": 9.29770659522816e-06, + "loss": 0.536, + "step": 3071 + }, + { + "epoch": 0.39, + "grad_norm": 0.5949375071132643, + "learning_rate": 9.297179294855519e-06, + "loss": 0.5854, + "step": 3072 + }, + { + "epoch": 0.39, + "grad_norm": 0.7321215936111286, + "learning_rate": 9.296651811565685e-06, + "loss": 0.6554, + "step": 3073 + }, + { + "epoch": 0.39, + "grad_norm": 0.6632960728330101, + "learning_rate": 9.296124145381116e-06, + "loss": 0.5441, + "step": 3074 + }, + { + "epoch": 0.39, + "grad_norm": 0.8927654358690954, + "learning_rate": 9.295596296324268e-06, + "loss": 0.5433, + "step": 3075 + }, + { + "epoch": 0.39, + "grad_norm": 0.5917597025145906, + "learning_rate": 9.295068264417615e-06, + "loss": 0.5124, + "step": 3076 + }, + { + "epoch": 0.39, + "grad_norm": 0.8730540277709486, + "learning_rate": 9.294540049683629e-06, + "loss": 0.6292, + "step": 3077 + }, + { + "epoch": 0.39, + "grad_norm": 0.76195088902274, + "learning_rate": 9.2940116521448e-06, + "loss": 0.6291, + "step": 3078 + }, + { + "epoch": 0.39, + "grad_norm": 0.7144567401257371, + "learning_rate": 9.293483071823612e-06, + "loss": 0.5584, + "step": 3079 + }, + { + "epoch": 0.39, + "grad_norm": 0.8238836202252123, + "learning_rate": 9.292954308742572e-06, + "loss": 0.6001, + "step": 3080 + }, + { + "epoch": 0.39, + "grad_norm": 0.5557593549520737, + "learning_rate": 9.292425362924185e-06, + "loss": 0.5021, + "step": 3081 + }, + { + "epoch": 0.39, + "grad_norm": 0.6445106737516632, + "learning_rate": 9.291896234390966e-06, + "loss": 0.5657, + "step": 3082 + }, + { + "epoch": 0.39, + "grad_norm": 0.7875561067619063, + "learning_rate": 9.291366923165442e-06, + "loss": 0.562, + "step": 3083 + }, + { + "epoch": 0.39, + "grad_norm": 0.5296996203046099, + "learning_rate": 9.290837429270138e-06, + "loss": 0.4997, + "step": 3084 + }, + { + "epoch": 0.39, + "grad_norm": 0.7394728089933901, + "learning_rate": 9.290307752727598e-06, + "loss": 0.5575, + "step": 3085 + }, + { + "epoch": 0.39, + "grad_norm": 0.712067102744521, + "learning_rate": 9.289777893560368e-06, + "loss": 0.5479, + "step": 3086 + }, + { + "epoch": 0.39, + "grad_norm": 0.7463792735340825, + "learning_rate": 9.289247851791e-06, + "loss": 0.5827, + "step": 3087 + }, + { + "epoch": 0.39, + "grad_norm": 0.6464882057670299, + "learning_rate": 9.288717627442056e-06, + "loss": 0.571, + "step": 3088 + }, + { + "epoch": 0.39, + "grad_norm": 0.8399134938837616, + "learning_rate": 9.288187220536108e-06, + "loss": 0.6561, + "step": 3089 + }, + { + "epoch": 0.39, + "grad_norm": 0.6463657899447613, + "learning_rate": 9.287656631095737e-06, + "loss": 0.554, + "step": 3090 + }, + { + "epoch": 0.39, + "grad_norm": 0.6640107311547536, + "learning_rate": 9.287125859143519e-06, + "loss": 0.5583, + "step": 3091 + }, + { + "epoch": 0.39, + "grad_norm": 0.6834728724945228, + "learning_rate": 9.286594904702056e-06, + "loss": 0.5647, + "step": 3092 + }, + { + "epoch": 0.39, + "grad_norm": 0.7438160033033764, + "learning_rate": 9.286063767793946e-06, + "loss": 0.5557, + "step": 3093 + }, + { + "epoch": 0.39, + "grad_norm": 0.7879538753250512, + "learning_rate": 9.285532448441796e-06, + "loss": 0.6375, + "step": 3094 + }, + { + "epoch": 0.39, + "grad_norm": 0.5826151051188484, + "learning_rate": 9.285000946668225e-06, + "loss": 0.5366, + "step": 3095 + }, + { + "epoch": 0.39, + "grad_norm": 0.8468259040440832, + "learning_rate": 9.284469262495858e-06, + "loss": 0.6204, + "step": 3096 + }, + { + "epoch": 0.39, + "grad_norm": 0.8301424864486188, + "learning_rate": 9.283937395947324e-06, + "loss": 0.545, + "step": 3097 + }, + { + "epoch": 0.39, + "grad_norm": 0.5769045292234983, + "learning_rate": 9.283405347045265e-06, + "loss": 0.5145, + "step": 3098 + }, + { + "epoch": 0.39, + "grad_norm": 0.7910485700784748, + "learning_rate": 9.282873115812328e-06, + "loss": 0.6243, + "step": 3099 + }, + { + "epoch": 0.39, + "grad_norm": 0.7994343145544429, + "learning_rate": 9.28234070227117e-06, + "loss": 0.6042, + "step": 3100 + }, + { + "epoch": 0.4, + "grad_norm": 0.7675790930144175, + "learning_rate": 9.281808106444452e-06, + "loss": 0.5719, + "step": 3101 + }, + { + "epoch": 0.4, + "grad_norm": 0.7582156493616083, + "learning_rate": 9.281275328354845e-06, + "loss": 0.6003, + "step": 3102 + }, + { + "epoch": 0.4, + "grad_norm": 0.5871920321578749, + "learning_rate": 9.280742368025027e-06, + "loss": 0.575, + "step": 3103 + }, + { + "epoch": 0.4, + "grad_norm": 0.5610931006296737, + "learning_rate": 9.280209225477688e-06, + "loss": 0.5215, + "step": 3104 + }, + { + "epoch": 0.4, + "grad_norm": 0.7206658624226153, + "learning_rate": 9.279675900735519e-06, + "loss": 0.6207, + "step": 3105 + }, + { + "epoch": 0.4, + "grad_norm": 0.6770833910428605, + "learning_rate": 9.279142393821224e-06, + "loss": 0.5367, + "step": 3106 + }, + { + "epoch": 0.4, + "grad_norm": 0.8784298211966007, + "learning_rate": 9.278608704757509e-06, + "loss": 0.6006, + "step": 3107 + }, + { + "epoch": 0.4, + "grad_norm": 1.0511083295817143, + "learning_rate": 9.278074833567094e-06, + "loss": 0.635, + "step": 3108 + }, + { + "epoch": 0.4, + "grad_norm": 0.7725134460781607, + "learning_rate": 9.277540780272707e-06, + "loss": 0.66, + "step": 3109 + }, + { + "epoch": 0.4, + "grad_norm": 0.9074875949708152, + "learning_rate": 9.277006544897075e-06, + "loss": 0.6412, + "step": 3110 + }, + { + "epoch": 0.4, + "grad_norm": 0.5747213986561281, + "learning_rate": 9.276472127462943e-06, + "loss": 0.5407, + "step": 3111 + }, + { + "epoch": 0.4, + "grad_norm": 0.6084753742855469, + "learning_rate": 9.275937527993058e-06, + "loss": 0.5673, + "step": 3112 + }, + { + "epoch": 0.4, + "grad_norm": 0.7364418087309035, + "learning_rate": 9.275402746510179e-06, + "loss": 0.5565, + "step": 3113 + }, + { + "epoch": 0.4, + "grad_norm": 0.6074742736840726, + "learning_rate": 9.274867783037063e-06, + "loss": 0.4966, + "step": 3114 + }, + { + "epoch": 0.4, + "grad_norm": 0.7712402099091962, + "learning_rate": 9.274332637596488e-06, + "loss": 0.6442, + "step": 3115 + }, + { + "epoch": 0.4, + "grad_norm": 0.5292188506702172, + "learning_rate": 9.273797310211232e-06, + "loss": 0.4742, + "step": 3116 + }, + { + "epoch": 0.4, + "grad_norm": 0.7506883236759608, + "learning_rate": 9.273261800904082e-06, + "loss": 0.5994, + "step": 3117 + }, + { + "epoch": 0.4, + "grad_norm": 0.8367115217354989, + "learning_rate": 9.272726109697829e-06, + "loss": 0.6219, + "step": 3118 + }, + { + "epoch": 0.4, + "grad_norm": 0.6130562197922128, + "learning_rate": 9.272190236615283e-06, + "loss": 0.4886, + "step": 3119 + }, + { + "epoch": 0.4, + "grad_norm": 0.6482109192490533, + "learning_rate": 9.271654181679249e-06, + "loss": 0.5192, + "step": 3120 + }, + { + "epoch": 0.4, + "grad_norm": 0.6525488995115156, + "learning_rate": 9.271117944912546e-06, + "loss": 0.5351, + "step": 3121 + }, + { + "epoch": 0.4, + "grad_norm": 0.8013882323655882, + "learning_rate": 9.270581526338003e-06, + "loss": 0.5881, + "step": 3122 + }, + { + "epoch": 0.4, + "grad_norm": 0.8160372182766578, + "learning_rate": 9.27004492597845e-06, + "loss": 0.6463, + "step": 3123 + }, + { + "epoch": 0.4, + "grad_norm": 0.7607549688946318, + "learning_rate": 9.269508143856728e-06, + "loss": 0.6309, + "step": 3124 + }, + { + "epoch": 0.4, + "grad_norm": 0.7238568313130813, + "learning_rate": 9.26897117999569e-06, + "loss": 0.6006, + "step": 3125 + }, + { + "epoch": 0.4, + "grad_norm": 0.6115375378503345, + "learning_rate": 9.26843403441819e-06, + "loss": 0.5531, + "step": 3126 + }, + { + "epoch": 0.4, + "grad_norm": 0.9119037048165477, + "learning_rate": 9.267896707147093e-06, + "loss": 0.6225, + "step": 3127 + }, + { + "epoch": 0.4, + "grad_norm": 0.6926218128611051, + "learning_rate": 9.26735919820527e-06, + "loss": 0.5986, + "step": 3128 + }, + { + "epoch": 0.4, + "grad_norm": 0.8324995338310999, + "learning_rate": 9.266821507615605e-06, + "loss": 0.6328, + "step": 3129 + }, + { + "epoch": 0.4, + "grad_norm": 0.8445941146845337, + "learning_rate": 9.266283635400981e-06, + "loss": 0.6779, + "step": 3130 + }, + { + "epoch": 0.4, + "grad_norm": 0.6201031681120114, + "learning_rate": 9.265745581584297e-06, + "loss": 0.5428, + "step": 3131 + }, + { + "epoch": 0.4, + "grad_norm": 0.8656280762912113, + "learning_rate": 9.265207346188456e-06, + "loss": 0.6874, + "step": 3132 + }, + { + "epoch": 0.4, + "grad_norm": 1.8283757591201077, + "learning_rate": 9.264668929236365e-06, + "loss": 0.5856, + "step": 3133 + }, + { + "epoch": 0.4, + "grad_norm": 0.5957631667069805, + "learning_rate": 9.264130330750949e-06, + "loss": 0.5381, + "step": 3134 + }, + { + "epoch": 0.4, + "grad_norm": 0.7690889525596007, + "learning_rate": 9.26359155075513e-06, + "loss": 0.6318, + "step": 3135 + }, + { + "epoch": 0.4, + "grad_norm": 0.7993677063373822, + "learning_rate": 9.263052589271841e-06, + "loss": 0.6209, + "step": 3136 + }, + { + "epoch": 0.4, + "grad_norm": 0.6530563933064154, + "learning_rate": 9.262513446324027e-06, + "loss": 0.5409, + "step": 3137 + }, + { + "epoch": 0.4, + "grad_norm": 0.9396208535520592, + "learning_rate": 9.261974121934637e-06, + "loss": 0.6181, + "step": 3138 + }, + { + "epoch": 0.4, + "grad_norm": 0.5817774325248966, + "learning_rate": 9.261434616126628e-06, + "loss": 0.5174, + "step": 3139 + }, + { + "epoch": 0.4, + "grad_norm": 0.8179224407841669, + "learning_rate": 9.260894928922966e-06, + "loss": 0.5769, + "step": 3140 + }, + { + "epoch": 0.4, + "grad_norm": 0.6675953013812651, + "learning_rate": 9.260355060346622e-06, + "loss": 0.5401, + "step": 3141 + }, + { + "epoch": 0.4, + "grad_norm": 0.6047063955764763, + "learning_rate": 9.259815010420577e-06, + "loss": 0.5212, + "step": 3142 + }, + { + "epoch": 0.4, + "grad_norm": 0.8137771625454581, + "learning_rate": 9.259274779167819e-06, + "loss": 0.5973, + "step": 3143 + }, + { + "epoch": 0.4, + "grad_norm": 0.5891073724621664, + "learning_rate": 9.258734366611345e-06, + "loss": 0.4764, + "step": 3144 + }, + { + "epoch": 0.4, + "grad_norm": 0.6936274196117064, + "learning_rate": 9.258193772774158e-06, + "loss": 0.5844, + "step": 3145 + }, + { + "epoch": 0.4, + "grad_norm": 0.6005108952985638, + "learning_rate": 9.257652997679268e-06, + "loss": 0.5612, + "step": 3146 + }, + { + "epoch": 0.4, + "grad_norm": 0.677342080236454, + "learning_rate": 9.257112041349697e-06, + "loss": 0.547, + "step": 3147 + }, + { + "epoch": 0.4, + "grad_norm": 0.6082332538703317, + "learning_rate": 9.25657090380847e-06, + "loss": 0.5582, + "step": 3148 + }, + { + "epoch": 0.4, + "grad_norm": 0.8679094395986775, + "learning_rate": 9.256029585078622e-06, + "loss": 0.6566, + "step": 3149 + }, + { + "epoch": 0.4, + "grad_norm": 0.6503681281399057, + "learning_rate": 9.255488085183193e-06, + "loss": 0.5036, + "step": 3150 + }, + { + "epoch": 0.4, + "grad_norm": 0.7907090344306829, + "learning_rate": 9.254946404145236e-06, + "loss": 0.6045, + "step": 3151 + }, + { + "epoch": 0.4, + "grad_norm": 0.8024687216639622, + "learning_rate": 9.254404541987807e-06, + "loss": 0.645, + "step": 3152 + }, + { + "epoch": 0.4, + "grad_norm": 0.5642301377950192, + "learning_rate": 9.253862498733972e-06, + "loss": 0.535, + "step": 3153 + }, + { + "epoch": 0.4, + "grad_norm": 0.7766938738624718, + "learning_rate": 9.253320274406806e-06, + "loss": 0.5819, + "step": 3154 + }, + { + "epoch": 0.4, + "grad_norm": 0.6637008314505372, + "learning_rate": 9.252777869029386e-06, + "loss": 0.5625, + "step": 3155 + }, + { + "epoch": 0.4, + "grad_norm": 0.7747869499782861, + "learning_rate": 9.252235282624801e-06, + "loss": 0.594, + "step": 3156 + }, + { + "epoch": 0.4, + "grad_norm": 0.6668475851783979, + "learning_rate": 9.251692515216148e-06, + "loss": 0.5273, + "step": 3157 + }, + { + "epoch": 0.4, + "grad_norm": 0.8544566294686278, + "learning_rate": 9.251149566826533e-06, + "loss": 0.5744, + "step": 3158 + }, + { + "epoch": 0.4, + "grad_norm": 0.781423719362171, + "learning_rate": 9.250606437479064e-06, + "loss": 0.5431, + "step": 3159 + }, + { + "epoch": 0.4, + "grad_norm": 0.6466778267197053, + "learning_rate": 9.250063127196863e-06, + "loss": 0.5671, + "step": 3160 + }, + { + "epoch": 0.4, + "grad_norm": 0.7482778024640498, + "learning_rate": 9.249519636003056e-06, + "loss": 0.5943, + "step": 3161 + }, + { + "epoch": 0.4, + "grad_norm": 0.6519006468416555, + "learning_rate": 9.248975963920777e-06, + "loss": 0.549, + "step": 3162 + }, + { + "epoch": 0.4, + "grad_norm": 0.6622251030234204, + "learning_rate": 9.24843211097317e-06, + "loss": 0.5245, + "step": 3163 + }, + { + "epoch": 0.4, + "grad_norm": 0.7375639692494137, + "learning_rate": 9.247888077183382e-06, + "loss": 0.5534, + "step": 3164 + }, + { + "epoch": 0.4, + "grad_norm": 0.7985924562798349, + "learning_rate": 9.247343862574575e-06, + "loss": 0.6107, + "step": 3165 + }, + { + "epoch": 0.4, + "grad_norm": 0.5996574487719852, + "learning_rate": 9.24679946716991e-06, + "loss": 0.4911, + "step": 3166 + }, + { + "epoch": 0.4, + "grad_norm": 0.5761423746533978, + "learning_rate": 9.246254890992565e-06, + "loss": 0.5138, + "step": 3167 + }, + { + "epoch": 0.4, + "grad_norm": 0.7984236349512767, + "learning_rate": 9.245710134065718e-06, + "loss": 0.6416, + "step": 3168 + }, + { + "epoch": 0.4, + "grad_norm": 0.7003687244052177, + "learning_rate": 9.245165196412558e-06, + "loss": 0.5389, + "step": 3169 + }, + { + "epoch": 0.4, + "grad_norm": 1.158307663065942, + "learning_rate": 9.244620078056281e-06, + "loss": 0.5646, + "step": 3170 + }, + { + "epoch": 0.4, + "grad_norm": 0.7284331813544216, + "learning_rate": 9.244074779020088e-06, + "loss": 0.6357, + "step": 3171 + }, + { + "epoch": 0.4, + "grad_norm": 0.6903426199710162, + "learning_rate": 9.243529299327198e-06, + "loss": 0.5638, + "step": 3172 + }, + { + "epoch": 0.4, + "grad_norm": 0.6913996326403633, + "learning_rate": 9.242983639000825e-06, + "loss": 0.6357, + "step": 3173 + }, + { + "epoch": 0.4, + "grad_norm": 0.8558684031398051, + "learning_rate": 9.242437798064197e-06, + "loss": 0.5699, + "step": 3174 + }, + { + "epoch": 0.4, + "grad_norm": 0.6233453702160608, + "learning_rate": 9.241891776540549e-06, + "loss": 0.562, + "step": 3175 + }, + { + "epoch": 0.4, + "grad_norm": 0.7623320687446391, + "learning_rate": 9.241345574453122e-06, + "loss": 0.6822, + "step": 3176 + }, + { + "epoch": 0.4, + "grad_norm": 0.8212404104712616, + "learning_rate": 9.240799191825167e-06, + "loss": 0.6634, + "step": 3177 + }, + { + "epoch": 0.4, + "grad_norm": 0.6892695539055483, + "learning_rate": 9.240252628679945e-06, + "loss": 0.5825, + "step": 3178 + }, + { + "epoch": 0.4, + "grad_norm": 0.7474256591434516, + "learning_rate": 9.239705885040716e-06, + "loss": 0.5885, + "step": 3179 + }, + { + "epoch": 0.41, + "grad_norm": 0.701528282681911, + "learning_rate": 9.239158960930757e-06, + "loss": 0.5961, + "step": 3180 + }, + { + "epoch": 0.41, + "grad_norm": 0.614972457800695, + "learning_rate": 9.238611856373346e-06, + "loss": 0.5648, + "step": 3181 + }, + { + "epoch": 0.41, + "grad_norm": 0.7016096696878144, + "learning_rate": 9.238064571391772e-06, + "loss": 0.5871, + "step": 3182 + }, + { + "epoch": 0.41, + "grad_norm": 0.6065021393621388, + "learning_rate": 9.237517106009335e-06, + "loss": 0.5456, + "step": 3183 + }, + { + "epoch": 0.41, + "grad_norm": 0.6165816396422367, + "learning_rate": 9.236969460249335e-06, + "loss": 0.4853, + "step": 3184 + }, + { + "epoch": 0.41, + "grad_norm": 0.9406136898950629, + "learning_rate": 9.236421634135085e-06, + "loss": 0.6237, + "step": 3185 + }, + { + "epoch": 0.41, + "grad_norm": 0.7294982034530778, + "learning_rate": 9.235873627689902e-06, + "loss": 0.5752, + "step": 3186 + }, + { + "epoch": 0.41, + "grad_norm": 0.6475248770643098, + "learning_rate": 9.235325440937115e-06, + "loss": 0.5316, + "step": 3187 + }, + { + "epoch": 0.41, + "grad_norm": 0.8161288507107136, + "learning_rate": 9.234777073900059e-06, + "loss": 0.6273, + "step": 3188 + }, + { + "epoch": 0.41, + "grad_norm": 0.9524174405826741, + "learning_rate": 9.234228526602074e-06, + "loss": 0.6585, + "step": 3189 + }, + { + "epoch": 0.41, + "grad_norm": 0.7622085834034301, + "learning_rate": 9.233679799066514e-06, + "loss": 0.5887, + "step": 3190 + }, + { + "epoch": 0.41, + "grad_norm": 0.8001198885334686, + "learning_rate": 9.233130891316731e-06, + "loss": 0.6415, + "step": 3191 + }, + { + "epoch": 0.41, + "grad_norm": 0.7299578815119844, + "learning_rate": 9.232581803376095e-06, + "loss": 0.6307, + "step": 3192 + }, + { + "epoch": 0.41, + "grad_norm": 0.7906144659315226, + "learning_rate": 9.232032535267976e-06, + "loss": 0.6024, + "step": 3193 + }, + { + "epoch": 0.41, + "grad_norm": 0.7027847323877751, + "learning_rate": 9.231483087015755e-06, + "loss": 0.6417, + "step": 3194 + }, + { + "epoch": 0.41, + "grad_norm": 0.8302817529568468, + "learning_rate": 9.230933458642823e-06, + "loss": 0.562, + "step": 3195 + }, + { + "epoch": 0.41, + "grad_norm": 0.8739537638604074, + "learning_rate": 9.230383650172572e-06, + "loss": 0.6126, + "step": 3196 + }, + { + "epoch": 0.41, + "grad_norm": 0.5171971506646962, + "learning_rate": 9.229833661628409e-06, + "loss": 0.4514, + "step": 3197 + }, + { + "epoch": 0.41, + "grad_norm": 0.7834864512856085, + "learning_rate": 9.229283493033742e-06, + "loss": 0.621, + "step": 3198 + }, + { + "epoch": 0.41, + "grad_norm": 0.6275152321814564, + "learning_rate": 9.228733144411993e-06, + "loss": 0.5007, + "step": 3199 + }, + { + "epoch": 0.41, + "grad_norm": 1.1290054452156002, + "learning_rate": 9.228182615786585e-06, + "loss": 0.6702, + "step": 3200 + }, + { + "epoch": 0.41, + "grad_norm": 0.8395437716995005, + "learning_rate": 9.227631907180956e-06, + "loss": 0.5648, + "step": 3201 + }, + { + "epoch": 0.41, + "grad_norm": 0.7740440337826754, + "learning_rate": 9.227081018618546e-06, + "loss": 0.6478, + "step": 3202 + }, + { + "epoch": 0.41, + "grad_norm": 0.7305535433094422, + "learning_rate": 9.226529950122805e-06, + "loss": 0.5504, + "step": 3203 + }, + { + "epoch": 0.41, + "grad_norm": 0.5646711821983378, + "learning_rate": 9.225978701717189e-06, + "loss": 0.5423, + "step": 3204 + }, + { + "epoch": 0.41, + "grad_norm": 0.6183282577703919, + "learning_rate": 9.225427273425165e-06, + "loss": 0.5615, + "step": 3205 + }, + { + "epoch": 0.41, + "grad_norm": 0.6229665852162618, + "learning_rate": 9.224875665270206e-06, + "loss": 0.5171, + "step": 3206 + }, + { + "epoch": 0.41, + "grad_norm": 0.5431499851299624, + "learning_rate": 9.224323877275786e-06, + "loss": 0.5199, + "step": 3207 + }, + { + "epoch": 0.41, + "grad_norm": 0.6601271472318798, + "learning_rate": 9.223771909465402e-06, + "loss": 0.5443, + "step": 3208 + }, + { + "epoch": 0.41, + "grad_norm": 0.6404625223145297, + "learning_rate": 9.223219761862545e-06, + "loss": 0.5509, + "step": 3209 + }, + { + "epoch": 0.41, + "grad_norm": 1.7037978654532602, + "learning_rate": 9.222667434490716e-06, + "loss": 0.6473, + "step": 3210 + }, + { + "epoch": 0.41, + "grad_norm": 0.721589651611027, + "learning_rate": 9.222114927373429e-06, + "loss": 0.5679, + "step": 3211 + }, + { + "epoch": 0.41, + "grad_norm": 0.6568083250973575, + "learning_rate": 9.2215622405342e-06, + "loss": 0.5928, + "step": 3212 + }, + { + "epoch": 0.41, + "grad_norm": 0.7164926491186786, + "learning_rate": 9.22100937399656e-06, + "loss": 0.6043, + "step": 3213 + }, + { + "epoch": 0.41, + "grad_norm": 0.8310340783994664, + "learning_rate": 9.220456327784038e-06, + "loss": 0.6041, + "step": 3214 + }, + { + "epoch": 0.41, + "grad_norm": 0.7426404072208325, + "learning_rate": 9.219903101920176e-06, + "loss": 0.6161, + "step": 3215 + }, + { + "epoch": 0.41, + "grad_norm": 0.8143939296303716, + "learning_rate": 9.219349696428523e-06, + "loss": 0.6158, + "step": 3216 + }, + { + "epoch": 0.41, + "grad_norm": 0.7328724560042611, + "learning_rate": 9.218796111332638e-06, + "loss": 0.5665, + "step": 3217 + }, + { + "epoch": 0.41, + "grad_norm": 1.014845447845572, + "learning_rate": 9.218242346656083e-06, + "loss": 0.625, + "step": 3218 + }, + { + "epoch": 0.41, + "grad_norm": 0.9705993139030301, + "learning_rate": 9.217688402422431e-06, + "loss": 0.6891, + "step": 3219 + }, + { + "epoch": 0.41, + "grad_norm": 0.7499917739142293, + "learning_rate": 9.217134278655262e-06, + "loss": 0.5755, + "step": 3220 + }, + { + "epoch": 0.41, + "grad_norm": 0.7125004999350245, + "learning_rate": 9.216579975378163e-06, + "loss": 0.5191, + "step": 3221 + }, + { + "epoch": 0.41, + "grad_norm": 0.854810055587463, + "learning_rate": 9.21602549261473e-06, + "loss": 0.6089, + "step": 3222 + }, + { + "epoch": 0.41, + "grad_norm": 0.5691407539342304, + "learning_rate": 9.215470830388562e-06, + "loss": 0.5109, + "step": 3223 + }, + { + "epoch": 0.41, + "grad_norm": 0.6172279238982875, + "learning_rate": 9.21491598872327e-06, + "loss": 0.5294, + "step": 3224 + }, + { + "epoch": 0.41, + "grad_norm": 0.7981602876094932, + "learning_rate": 9.214360967642477e-06, + "loss": 0.6234, + "step": 3225 + }, + { + "epoch": 0.41, + "grad_norm": 0.6538744727864495, + "learning_rate": 9.213805767169802e-06, + "loss": 0.5342, + "step": 3226 + }, + { + "epoch": 0.41, + "grad_norm": 0.7244812345654913, + "learning_rate": 9.213250387328882e-06, + "loss": 0.5867, + "step": 3227 + }, + { + "epoch": 0.41, + "grad_norm": 0.7182566480602726, + "learning_rate": 9.212694828143356e-06, + "loss": 0.5795, + "step": 3228 + }, + { + "epoch": 0.41, + "grad_norm": 0.8816372560725132, + "learning_rate": 9.212139089636875e-06, + "loss": 0.627, + "step": 3229 + }, + { + "epoch": 0.41, + "grad_norm": 0.5951494522354634, + "learning_rate": 9.211583171833092e-06, + "loss": 0.5791, + "step": 3230 + }, + { + "epoch": 0.41, + "grad_norm": 0.6931626072454565, + "learning_rate": 9.211027074755671e-06, + "loss": 0.5897, + "step": 3231 + }, + { + "epoch": 0.41, + "grad_norm": 0.6833677451982374, + "learning_rate": 9.210470798428283e-06, + "loss": 0.5215, + "step": 3232 + }, + { + "epoch": 0.41, + "grad_norm": 1.282989705730721, + "learning_rate": 9.20991434287461e-06, + "loss": 0.6345, + "step": 3233 + }, + { + "epoch": 0.41, + "grad_norm": 0.5855130011417814, + "learning_rate": 9.209357708118335e-06, + "loss": 0.5138, + "step": 3234 + }, + { + "epoch": 0.41, + "grad_norm": 0.596196516944751, + "learning_rate": 9.208800894183154e-06, + "loss": 0.5724, + "step": 3235 + }, + { + "epoch": 0.41, + "grad_norm": 0.7480587414485678, + "learning_rate": 9.208243901092769e-06, + "loss": 0.6149, + "step": 3236 + }, + { + "epoch": 0.41, + "grad_norm": 0.8518895744305718, + "learning_rate": 9.207686728870889e-06, + "loss": 0.683, + "step": 3237 + }, + { + "epoch": 0.41, + "grad_norm": 0.892959194286907, + "learning_rate": 9.207129377541229e-06, + "loss": 0.6175, + "step": 3238 + }, + { + "epoch": 0.41, + "grad_norm": 0.5810982847496905, + "learning_rate": 9.206571847127517e-06, + "loss": 0.5218, + "step": 3239 + }, + { + "epoch": 0.41, + "grad_norm": 0.8535470654985331, + "learning_rate": 9.206014137653482e-06, + "loss": 0.6381, + "step": 3240 + }, + { + "epoch": 0.41, + "grad_norm": 0.633664919505574, + "learning_rate": 9.205456249142868e-06, + "loss": 0.555, + "step": 3241 + }, + { + "epoch": 0.41, + "grad_norm": 0.8160371523072273, + "learning_rate": 9.204898181619419e-06, + "loss": 0.5539, + "step": 3242 + }, + { + "epoch": 0.41, + "grad_norm": 0.7079567478379215, + "learning_rate": 9.204339935106892e-06, + "loss": 0.599, + "step": 3243 + }, + { + "epoch": 0.41, + "grad_norm": 0.7615625775965033, + "learning_rate": 9.203781509629048e-06, + "loss": 0.5952, + "step": 3244 + }, + { + "epoch": 0.41, + "grad_norm": 0.7544846974124569, + "learning_rate": 9.203222905209659e-06, + "loss": 0.6333, + "step": 3245 + }, + { + "epoch": 0.41, + "grad_norm": 0.6298341210044557, + "learning_rate": 9.202664121872502e-06, + "loss": 0.5374, + "step": 3246 + }, + { + "epoch": 0.41, + "grad_norm": 0.5971652832986465, + "learning_rate": 9.202105159641363e-06, + "loss": 0.5817, + "step": 3247 + }, + { + "epoch": 0.41, + "grad_norm": 0.7216286265188634, + "learning_rate": 9.201546018540037e-06, + "loss": 0.509, + "step": 3248 + }, + { + "epoch": 0.41, + "grad_norm": 0.8146484420358, + "learning_rate": 9.200986698592323e-06, + "loss": 0.6646, + "step": 3249 + }, + { + "epoch": 0.41, + "grad_norm": 0.6470741855321337, + "learning_rate": 9.200427199822028e-06, + "loss": 0.5547, + "step": 3250 + }, + { + "epoch": 0.41, + "grad_norm": 0.5307516565538946, + "learning_rate": 9.19986752225297e-06, + "loss": 0.4952, + "step": 3251 + }, + { + "epoch": 0.41, + "grad_norm": 0.7499313470267501, + "learning_rate": 9.199307665908974e-06, + "loss": 0.6223, + "step": 3252 + }, + { + "epoch": 0.41, + "grad_norm": 0.7873693261187309, + "learning_rate": 9.19874763081387e-06, + "loss": 0.5919, + "step": 3253 + }, + { + "epoch": 0.41, + "grad_norm": 0.8312459521266437, + "learning_rate": 9.198187416991493e-06, + "loss": 0.5581, + "step": 3254 + }, + { + "epoch": 0.41, + "grad_norm": 0.6218252502503935, + "learning_rate": 9.197627024465697e-06, + "loss": 0.5271, + "step": 3255 + }, + { + "epoch": 0.41, + "grad_norm": 0.6793659870096235, + "learning_rate": 9.197066453260331e-06, + "loss": 0.5878, + "step": 3256 + }, + { + "epoch": 0.41, + "grad_norm": 0.8097458712161544, + "learning_rate": 9.196505703399257e-06, + "loss": 0.6745, + "step": 3257 + }, + { + "epoch": 0.42, + "grad_norm": 0.8928733149820136, + "learning_rate": 9.195944774906347e-06, + "loss": 0.6213, + "step": 3258 + }, + { + "epoch": 0.42, + "grad_norm": 0.6472409547449489, + "learning_rate": 9.195383667805475e-06, + "loss": 0.5767, + "step": 3259 + }, + { + "epoch": 0.42, + "grad_norm": 0.6662930157629626, + "learning_rate": 9.194822382120527e-06, + "loss": 0.5626, + "step": 3260 + }, + { + "epoch": 0.42, + "grad_norm": 0.9905387848773961, + "learning_rate": 9.194260917875395e-06, + "loss": 0.635, + "step": 3261 + }, + { + "epoch": 0.42, + "grad_norm": 0.6018491781279309, + "learning_rate": 9.193699275093979e-06, + "loss": 0.5283, + "step": 3262 + }, + { + "epoch": 0.42, + "grad_norm": 0.7426946739996282, + "learning_rate": 9.193137453800185e-06, + "loss": 0.5722, + "step": 3263 + }, + { + "epoch": 0.42, + "grad_norm": 0.5828078313417627, + "learning_rate": 9.19257545401793e-06, + "loss": 0.5268, + "step": 3264 + }, + { + "epoch": 0.42, + "grad_norm": 0.9124170137727117, + "learning_rate": 9.192013275771136e-06, + "loss": 0.6762, + "step": 3265 + }, + { + "epoch": 0.42, + "grad_norm": 0.7039819945593421, + "learning_rate": 9.191450919083731e-06, + "loss": 0.5918, + "step": 3266 + }, + { + "epoch": 0.42, + "grad_norm": 0.7032769511281745, + "learning_rate": 9.190888383979655e-06, + "loss": 0.5774, + "step": 3267 + }, + { + "epoch": 0.42, + "grad_norm": 0.7520639233199657, + "learning_rate": 9.190325670482854e-06, + "loss": 0.6654, + "step": 3268 + }, + { + "epoch": 0.42, + "grad_norm": 0.6459007792468826, + "learning_rate": 9.189762778617277e-06, + "loss": 0.552, + "step": 3269 + }, + { + "epoch": 0.42, + "grad_norm": 0.7001160285666483, + "learning_rate": 9.189199708406889e-06, + "loss": 0.5639, + "step": 3270 + }, + { + "epoch": 0.42, + "grad_norm": 0.6140801763175019, + "learning_rate": 9.188636459875655e-06, + "loss": 0.5589, + "step": 3271 + }, + { + "epoch": 0.42, + "grad_norm": 1.153917460766353, + "learning_rate": 9.188073033047553e-06, + "loss": 0.6643, + "step": 3272 + }, + { + "epoch": 0.42, + "grad_norm": 0.7246733619528902, + "learning_rate": 9.187509427946565e-06, + "loss": 0.5491, + "step": 3273 + }, + { + "epoch": 0.42, + "grad_norm": 0.5658520188236634, + "learning_rate": 9.18694564459668e-06, + "loss": 0.5345, + "step": 3274 + }, + { + "epoch": 0.42, + "grad_norm": 0.8130835488751036, + "learning_rate": 9.1863816830219e-06, + "loss": 0.5984, + "step": 3275 + }, + { + "epoch": 0.42, + "grad_norm": 0.5737981169444385, + "learning_rate": 9.185817543246232e-06, + "loss": 0.5141, + "step": 3276 + }, + { + "epoch": 0.42, + "grad_norm": 0.6484394591916782, + "learning_rate": 9.185253225293684e-06, + "loss": 0.574, + "step": 3277 + }, + { + "epoch": 0.42, + "grad_norm": 0.8387932616442788, + "learning_rate": 9.184688729188281e-06, + "loss": 0.5799, + "step": 3278 + }, + { + "epoch": 0.42, + "grad_norm": 0.6059481311429133, + "learning_rate": 9.18412405495405e-06, + "loss": 0.4868, + "step": 3279 + }, + { + "epoch": 0.42, + "grad_norm": 0.5912994012192516, + "learning_rate": 9.18355920261503e-06, + "loss": 0.5601, + "step": 3280 + }, + { + "epoch": 0.42, + "grad_norm": 0.6053788989091289, + "learning_rate": 9.182994172195266e-06, + "loss": 0.5473, + "step": 3281 + }, + { + "epoch": 0.42, + "grad_norm": 0.6659146771005405, + "learning_rate": 9.182428963718804e-06, + "loss": 0.5679, + "step": 3282 + }, + { + "epoch": 0.42, + "grad_norm": 0.8035257479160005, + "learning_rate": 9.181863577209705e-06, + "loss": 0.6232, + "step": 3283 + }, + { + "epoch": 0.42, + "grad_norm": 0.8270559144867194, + "learning_rate": 9.181298012692039e-06, + "loss": 0.6617, + "step": 3284 + }, + { + "epoch": 0.42, + "grad_norm": 0.731995430929404, + "learning_rate": 9.180732270189877e-06, + "loss": 0.5651, + "step": 3285 + }, + { + "epoch": 0.42, + "grad_norm": 0.6317926449243199, + "learning_rate": 9.180166349727303e-06, + "loss": 0.542, + "step": 3286 + }, + { + "epoch": 0.42, + "grad_norm": 0.74161766044179, + "learning_rate": 9.179600251328405e-06, + "loss": 0.6183, + "step": 3287 + }, + { + "epoch": 0.42, + "grad_norm": 0.6565143061279405, + "learning_rate": 9.17903397501728e-06, + "loss": 0.5836, + "step": 3288 + }, + { + "epoch": 0.42, + "grad_norm": 0.9126999944088788, + "learning_rate": 9.178467520818033e-06, + "loss": 0.6003, + "step": 3289 + }, + { + "epoch": 0.42, + "grad_norm": 0.8470934289346624, + "learning_rate": 9.177900888754775e-06, + "loss": 0.6555, + "step": 3290 + }, + { + "epoch": 0.42, + "grad_norm": 0.6351474911842814, + "learning_rate": 9.17733407885163e-06, + "loss": 0.5221, + "step": 3291 + }, + { + "epoch": 0.42, + "grad_norm": 0.6176272144988341, + "learning_rate": 9.176767091132718e-06, + "loss": 0.5505, + "step": 3292 + }, + { + "epoch": 0.42, + "grad_norm": 0.5704415893873597, + "learning_rate": 9.176199925622183e-06, + "loss": 0.5121, + "step": 3293 + }, + { + "epoch": 0.42, + "grad_norm": 0.6747566530757642, + "learning_rate": 9.175632582344158e-06, + "loss": 0.4967, + "step": 3294 + }, + { + "epoch": 0.42, + "grad_norm": 0.7974450169225238, + "learning_rate": 9.175065061322799e-06, + "loss": 0.6486, + "step": 3295 + }, + { + "epoch": 0.42, + "grad_norm": 0.5971550840646895, + "learning_rate": 9.174497362582262e-06, + "loss": 0.5224, + "step": 3296 + }, + { + "epoch": 0.42, + "grad_norm": 0.8160735889062235, + "learning_rate": 9.173929486146713e-06, + "loss": 0.637, + "step": 3297 + }, + { + "epoch": 0.42, + "grad_norm": 0.7745251199837998, + "learning_rate": 9.173361432040321e-06, + "loss": 0.6084, + "step": 3298 + }, + { + "epoch": 0.42, + "grad_norm": 0.6976358887644472, + "learning_rate": 9.172793200287272e-06, + "loss": 0.5466, + "step": 3299 + }, + { + "epoch": 0.42, + "grad_norm": 0.64552829772451, + "learning_rate": 9.172224790911748e-06, + "loss": 0.5751, + "step": 3300 + }, + { + "epoch": 0.42, + "grad_norm": 0.6585268850253904, + "learning_rate": 9.17165620393795e-06, + "loss": 0.6338, + "step": 3301 + }, + { + "epoch": 0.42, + "grad_norm": 0.6721880760623039, + "learning_rate": 9.171087439390076e-06, + "loss": 0.5549, + "step": 3302 + }, + { + "epoch": 0.42, + "grad_norm": 0.5779134645775404, + "learning_rate": 9.17051849729234e-06, + "loss": 0.4685, + "step": 3303 + }, + { + "epoch": 0.42, + "grad_norm": 0.9383967938567331, + "learning_rate": 9.169949377668958e-06, + "loss": 0.7189, + "step": 3304 + }, + { + "epoch": 0.42, + "grad_norm": 0.618739152947225, + "learning_rate": 9.169380080544157e-06, + "loss": 0.5339, + "step": 3305 + }, + { + "epoch": 0.42, + "grad_norm": 0.6074685136042486, + "learning_rate": 9.168810605942168e-06, + "loss": 0.5385, + "step": 3306 + }, + { + "epoch": 0.42, + "grad_norm": 2.3964853215673774, + "learning_rate": 9.168240953887234e-06, + "loss": 0.6239, + "step": 3307 + }, + { + "epoch": 0.42, + "grad_norm": 0.8854781551920204, + "learning_rate": 9.167671124403601e-06, + "loss": 0.6395, + "step": 3308 + }, + { + "epoch": 0.42, + "grad_norm": 0.6341463647232668, + "learning_rate": 9.167101117515528e-06, + "loss": 0.5084, + "step": 3309 + }, + { + "epoch": 0.42, + "grad_norm": 0.6204243305855078, + "learning_rate": 9.166530933247276e-06, + "loss": 0.4837, + "step": 3310 + }, + { + "epoch": 0.42, + "grad_norm": 0.8782656556572855, + "learning_rate": 9.165960571623116e-06, + "loss": 0.6147, + "step": 3311 + }, + { + "epoch": 0.42, + "grad_norm": 0.5921902972365098, + "learning_rate": 9.165390032667328e-06, + "loss": 0.5314, + "step": 3312 + }, + { + "epoch": 0.42, + "grad_norm": 0.6168798103946963, + "learning_rate": 9.164819316404197e-06, + "loss": 0.549, + "step": 3313 + }, + { + "epoch": 0.42, + "grad_norm": 0.6546515366806996, + "learning_rate": 9.164248422858015e-06, + "loss": 0.5533, + "step": 3314 + }, + { + "epoch": 0.42, + "grad_norm": 0.527765102681221, + "learning_rate": 9.163677352053087e-06, + "loss": 0.4784, + "step": 3315 + }, + { + "epoch": 0.42, + "grad_norm": 0.5243501757772494, + "learning_rate": 9.163106104013717e-06, + "loss": 0.5187, + "step": 3316 + }, + { + "epoch": 0.42, + "grad_norm": 0.8512706500980287, + "learning_rate": 9.162534678764225e-06, + "loss": 0.6358, + "step": 3317 + }, + { + "epoch": 0.42, + "grad_norm": 0.801562613911495, + "learning_rate": 9.161963076328933e-06, + "loss": 0.6723, + "step": 3318 + }, + { + "epoch": 0.42, + "grad_norm": 0.9063120931367663, + "learning_rate": 9.161391296732171e-06, + "loss": 0.6395, + "step": 3319 + }, + { + "epoch": 0.42, + "grad_norm": 0.6613677803151039, + "learning_rate": 9.160819339998282e-06, + "loss": 0.5656, + "step": 3320 + }, + { + "epoch": 0.42, + "grad_norm": 0.8788521624452991, + "learning_rate": 9.160247206151608e-06, + "loss": 0.6328, + "step": 3321 + }, + { + "epoch": 0.42, + "grad_norm": 0.9102758162586393, + "learning_rate": 9.159674895216507e-06, + "loss": 0.6288, + "step": 3322 + }, + { + "epoch": 0.42, + "grad_norm": 0.6292348531350221, + "learning_rate": 9.159102407217334e-06, + "loss": 0.5138, + "step": 3323 + }, + { + "epoch": 0.42, + "grad_norm": 0.6926993439377039, + "learning_rate": 9.158529742178466e-06, + "loss": 0.5795, + "step": 3324 + }, + { + "epoch": 0.42, + "grad_norm": 0.8196574769217888, + "learning_rate": 9.157956900124273e-06, + "loss": 0.6138, + "step": 3325 + }, + { + "epoch": 0.42, + "grad_norm": 0.6584602786355435, + "learning_rate": 9.157383881079143e-06, + "loss": 0.5792, + "step": 3326 + }, + { + "epoch": 0.42, + "grad_norm": 1.0104669688959764, + "learning_rate": 9.156810685067467e-06, + "loss": 0.6747, + "step": 3327 + }, + { + "epoch": 0.42, + "grad_norm": 0.6532782793076529, + "learning_rate": 9.156237312113642e-06, + "loss": 0.5303, + "step": 3328 + }, + { + "epoch": 0.42, + "grad_norm": 0.8287066372914469, + "learning_rate": 9.155663762242076e-06, + "loss": 0.6336, + "step": 3329 + }, + { + "epoch": 0.42, + "grad_norm": 0.6742819142673648, + "learning_rate": 9.155090035477185e-06, + "loss": 0.579, + "step": 3330 + }, + { + "epoch": 0.42, + "grad_norm": 0.953562904369008, + "learning_rate": 9.154516131843388e-06, + "loss": 0.6716, + "step": 3331 + }, + { + "epoch": 0.42, + "grad_norm": 0.6873147643866758, + "learning_rate": 9.153942051365115e-06, + "loss": 0.5199, + "step": 3332 + }, + { + "epoch": 0.42, + "grad_norm": 0.9029028758271641, + "learning_rate": 9.153367794066802e-06, + "loss": 0.5479, + "step": 3333 + }, + { + "epoch": 0.42, + "grad_norm": 0.6772567790207706, + "learning_rate": 9.152793359972894e-06, + "loss": 0.5512, + "step": 3334 + }, + { + "epoch": 0.42, + "grad_norm": 0.846653051375728, + "learning_rate": 9.152218749107845e-06, + "loss": 0.6476, + "step": 3335 + }, + { + "epoch": 0.42, + "grad_norm": 0.624086643969705, + "learning_rate": 9.151643961496111e-06, + "loss": 0.575, + "step": 3336 + }, + { + "epoch": 0.43, + "grad_norm": 0.9148599476333431, + "learning_rate": 9.151068997162163e-06, + "loss": 0.6742, + "step": 3337 + }, + { + "epoch": 0.43, + "grad_norm": 0.5664197078715828, + "learning_rate": 9.15049385613047e-06, + "loss": 0.5124, + "step": 3338 + }, + { + "epoch": 0.43, + "grad_norm": 0.6044488824927788, + "learning_rate": 9.149918538425517e-06, + "loss": 0.5296, + "step": 3339 + }, + { + "epoch": 0.43, + "grad_norm": 0.5886124520905012, + "learning_rate": 9.149343044071793e-06, + "loss": 0.5609, + "step": 3340 + }, + { + "epoch": 0.43, + "grad_norm": 0.5699769180161598, + "learning_rate": 9.148767373093796e-06, + "loss": 0.5536, + "step": 3341 + }, + { + "epoch": 0.43, + "grad_norm": 0.5924989604629504, + "learning_rate": 9.148191525516027e-06, + "loss": 0.5398, + "step": 3342 + }, + { + "epoch": 0.43, + "grad_norm": 0.7687488774906683, + "learning_rate": 9.147615501363004e-06, + "loss": 0.5943, + "step": 3343 + }, + { + "epoch": 0.43, + "grad_norm": 0.5523382658679526, + "learning_rate": 9.147039300659241e-06, + "loss": 0.5572, + "step": 3344 + }, + { + "epoch": 0.43, + "grad_norm": 0.7726880357411495, + "learning_rate": 9.146462923429269e-06, + "loss": 0.6564, + "step": 3345 + }, + { + "epoch": 0.43, + "grad_norm": 0.5975929958069081, + "learning_rate": 9.145886369697618e-06, + "loss": 0.5452, + "step": 3346 + }, + { + "epoch": 0.43, + "grad_norm": 0.7360591699213976, + "learning_rate": 9.145309639488835e-06, + "loss": 0.6373, + "step": 3347 + }, + { + "epoch": 0.43, + "grad_norm": 0.5683055490433099, + "learning_rate": 9.144732732827464e-06, + "loss": 0.5132, + "step": 3348 + }, + { + "epoch": 0.43, + "grad_norm": 0.8434869696891951, + "learning_rate": 9.144155649738067e-06, + "loss": 0.6544, + "step": 3349 + }, + { + "epoch": 0.43, + "grad_norm": 0.6122219318901587, + "learning_rate": 9.143578390245207e-06, + "loss": 0.5217, + "step": 3350 + }, + { + "epoch": 0.43, + "grad_norm": 0.9442240314101615, + "learning_rate": 9.143000954373455e-06, + "loss": 0.619, + "step": 3351 + }, + { + "epoch": 0.43, + "grad_norm": 0.8609016654664363, + "learning_rate": 9.142423342147392e-06, + "loss": 0.6901, + "step": 3352 + }, + { + "epoch": 0.43, + "grad_norm": 0.8100428882519414, + "learning_rate": 9.141845553591606e-06, + "loss": 0.6494, + "step": 3353 + }, + { + "epoch": 0.43, + "grad_norm": 0.761576052846754, + "learning_rate": 9.141267588730689e-06, + "loss": 0.6327, + "step": 3354 + }, + { + "epoch": 0.43, + "grad_norm": 0.6165467173058305, + "learning_rate": 9.140689447589245e-06, + "loss": 0.542, + "step": 3355 + }, + { + "epoch": 0.43, + "grad_norm": 0.5876969642271552, + "learning_rate": 9.140111130191882e-06, + "loss": 0.5399, + "step": 3356 + }, + { + "epoch": 0.43, + "grad_norm": 0.6287577389873411, + "learning_rate": 9.139532636563219e-06, + "loss": 0.5657, + "step": 3357 + }, + { + "epoch": 0.43, + "grad_norm": 0.8379049445511668, + "learning_rate": 9.13895396672788e-06, + "loss": 0.5946, + "step": 3358 + }, + { + "epoch": 0.43, + "grad_norm": 0.5379736932317782, + "learning_rate": 9.138375120710495e-06, + "loss": 0.5038, + "step": 3359 + }, + { + "epoch": 0.43, + "grad_norm": 0.8809318084173557, + "learning_rate": 9.137796098535707e-06, + "loss": 0.6579, + "step": 3360 + }, + { + "epoch": 0.43, + "grad_norm": 0.6072689367456056, + "learning_rate": 9.137216900228161e-06, + "loss": 0.5149, + "step": 3361 + }, + { + "epoch": 0.43, + "grad_norm": 0.6422148248983705, + "learning_rate": 9.13663752581251e-06, + "loss": 0.5391, + "step": 3362 + }, + { + "epoch": 0.43, + "grad_norm": 0.6986428452209693, + "learning_rate": 9.136057975313422e-06, + "loss": 0.5689, + "step": 3363 + }, + { + "epoch": 0.43, + "grad_norm": 0.5706498515380872, + "learning_rate": 9.135478248755561e-06, + "loss": 0.5233, + "step": 3364 + }, + { + "epoch": 0.43, + "grad_norm": 0.6477492814285084, + "learning_rate": 9.134898346163608e-06, + "loss": 0.5261, + "step": 3365 + }, + { + "epoch": 0.43, + "grad_norm": 0.5451511723075446, + "learning_rate": 9.134318267562244e-06, + "loss": 0.4821, + "step": 3366 + }, + { + "epoch": 0.43, + "grad_norm": 1.0732824412215014, + "learning_rate": 9.133738012976163e-06, + "loss": 0.659, + "step": 3367 + }, + { + "epoch": 0.43, + "grad_norm": 0.6449010183592399, + "learning_rate": 9.133157582430063e-06, + "loss": 0.5152, + "step": 3368 + }, + { + "epoch": 0.43, + "grad_norm": 0.8101058622650507, + "learning_rate": 9.132576975948655e-06, + "loss": 0.5877, + "step": 3369 + }, + { + "epoch": 0.43, + "grad_norm": 0.7507506869939086, + "learning_rate": 9.13199619355665e-06, + "loss": 0.5631, + "step": 3370 + }, + { + "epoch": 0.43, + "grad_norm": 0.845321121448081, + "learning_rate": 9.13141523527877e-06, + "loss": 0.6395, + "step": 3371 + }, + { + "epoch": 0.43, + "grad_norm": 0.7082437538968397, + "learning_rate": 9.130834101139746e-06, + "loss": 0.5613, + "step": 3372 + }, + { + "epoch": 0.43, + "grad_norm": 0.9805342219127094, + "learning_rate": 9.130252791164316e-06, + "loss": 0.6344, + "step": 3373 + }, + { + "epoch": 0.43, + "grad_norm": 0.7900535866295054, + "learning_rate": 9.129671305377221e-06, + "loss": 0.6313, + "step": 3374 + }, + { + "epoch": 0.43, + "grad_norm": 0.6552233602094087, + "learning_rate": 9.129089643803214e-06, + "loss": 0.629, + "step": 3375 + }, + { + "epoch": 0.43, + "grad_norm": 0.6125297194096332, + "learning_rate": 9.12850780646706e-06, + "loss": 0.5558, + "step": 3376 + }, + { + "epoch": 0.43, + "grad_norm": 0.7039150157695548, + "learning_rate": 9.127925793393518e-06, + "loss": 0.5445, + "step": 3377 + }, + { + "epoch": 0.43, + "grad_norm": 0.5808338395472502, + "learning_rate": 9.127343604607368e-06, + "loss": 0.5222, + "step": 3378 + }, + { + "epoch": 0.43, + "grad_norm": 0.6737747923260493, + "learning_rate": 9.126761240133389e-06, + "loss": 0.5396, + "step": 3379 + }, + { + "epoch": 0.43, + "grad_norm": 0.7678252240305559, + "learning_rate": 9.126178699996372e-06, + "loss": 0.6594, + "step": 3380 + }, + { + "epoch": 0.43, + "grad_norm": 0.6479400350903531, + "learning_rate": 9.125595984221111e-06, + "loss": 0.5199, + "step": 3381 + }, + { + "epoch": 0.43, + "grad_norm": 0.9702338280115503, + "learning_rate": 9.125013092832413e-06, + "loss": 0.6508, + "step": 3382 + }, + { + "epoch": 0.43, + "grad_norm": 0.7154203402310482, + "learning_rate": 9.124430025855092e-06, + "loss": 0.6366, + "step": 3383 + }, + { + "epoch": 0.43, + "grad_norm": 1.0165729780864998, + "learning_rate": 9.123846783313962e-06, + "loss": 0.6224, + "step": 3384 + }, + { + "epoch": 0.43, + "grad_norm": 0.5766769383579708, + "learning_rate": 9.123263365233853e-06, + "loss": 0.4986, + "step": 3385 + }, + { + "epoch": 0.43, + "grad_norm": 0.6297472786360359, + "learning_rate": 9.122679771639597e-06, + "loss": 0.5538, + "step": 3386 + }, + { + "epoch": 0.43, + "grad_norm": 0.5713774395211625, + "learning_rate": 9.12209600255604e-06, + "loss": 0.4997, + "step": 3387 + }, + { + "epoch": 0.43, + "grad_norm": 0.7399484539715754, + "learning_rate": 9.121512058008027e-06, + "loss": 0.5827, + "step": 3388 + }, + { + "epoch": 0.43, + "grad_norm": 0.8690171513296493, + "learning_rate": 9.120927938020417e-06, + "loss": 0.5909, + "step": 3389 + }, + { + "epoch": 0.43, + "grad_norm": 0.6314486785053257, + "learning_rate": 9.120343642618072e-06, + "loss": 0.556, + "step": 3390 + }, + { + "epoch": 0.43, + "grad_norm": 0.8256157442624091, + "learning_rate": 9.119759171825864e-06, + "loss": 0.5406, + "step": 3391 + }, + { + "epoch": 0.43, + "grad_norm": 1.3008687447765541, + "learning_rate": 9.119174525668675e-06, + "loss": 0.6182, + "step": 3392 + }, + { + "epoch": 0.43, + "grad_norm": 0.6699956479986262, + "learning_rate": 9.11858970417139e-06, + "loss": 0.5853, + "step": 3393 + }, + { + "epoch": 0.43, + "grad_norm": 0.745843427690736, + "learning_rate": 9.1180047073589e-06, + "loss": 0.5609, + "step": 3394 + }, + { + "epoch": 0.43, + "grad_norm": 0.8526497815180284, + "learning_rate": 9.117419535256112e-06, + "loss": 0.6744, + "step": 3395 + }, + { + "epoch": 0.43, + "grad_norm": 0.8985836024702388, + "learning_rate": 9.116834187887929e-06, + "loss": 0.5944, + "step": 3396 + }, + { + "epoch": 0.43, + "grad_norm": 0.7569995747681864, + "learning_rate": 9.116248665279271e-06, + "loss": 0.5935, + "step": 3397 + }, + { + "epoch": 0.43, + "grad_norm": 0.8722201886654226, + "learning_rate": 9.115662967455062e-06, + "loss": 0.6522, + "step": 3398 + }, + { + "epoch": 0.43, + "grad_norm": 0.54464972485816, + "learning_rate": 9.11507709444023e-06, + "loss": 0.4958, + "step": 3399 + }, + { + "epoch": 0.43, + "grad_norm": 0.9578844103435927, + "learning_rate": 9.11449104625972e-06, + "loss": 0.6634, + "step": 3400 + }, + { + "epoch": 0.43, + "grad_norm": 0.6305311560784173, + "learning_rate": 9.11390482293847e-06, + "loss": 0.5234, + "step": 3401 + }, + { + "epoch": 0.43, + "grad_norm": 0.8473449763833505, + "learning_rate": 9.113318424501441e-06, + "loss": 0.6717, + "step": 3402 + }, + { + "epoch": 0.43, + "grad_norm": 0.6465775105355057, + "learning_rate": 9.112731850973588e-06, + "loss": 0.5333, + "step": 3403 + }, + { + "epoch": 0.43, + "grad_norm": 0.7232518515987104, + "learning_rate": 9.112145102379883e-06, + "loss": 0.6084, + "step": 3404 + }, + { + "epoch": 0.43, + "grad_norm": 0.859278182864312, + "learning_rate": 9.111558178745303e-06, + "loss": 0.589, + "step": 3405 + }, + { + "epoch": 0.43, + "grad_norm": 1.0453940997207696, + "learning_rate": 9.110971080094831e-06, + "loss": 0.6317, + "step": 3406 + }, + { + "epoch": 0.43, + "grad_norm": 0.6580486255596497, + "learning_rate": 9.110383806453454e-06, + "loss": 0.531, + "step": 3407 + }, + { + "epoch": 0.43, + "grad_norm": 0.7649491544022339, + "learning_rate": 9.109796357846175e-06, + "loss": 0.648, + "step": 3408 + }, + { + "epoch": 0.43, + "grad_norm": 0.759930269148066, + "learning_rate": 9.109208734297998e-06, + "loss": 0.6209, + "step": 3409 + }, + { + "epoch": 0.43, + "grad_norm": 0.7872410564615248, + "learning_rate": 9.108620935833936e-06, + "loss": 0.5486, + "step": 3410 + }, + { + "epoch": 0.43, + "grad_norm": 0.9132832582965659, + "learning_rate": 9.10803296247901e-06, + "loss": 0.6446, + "step": 3411 + }, + { + "epoch": 0.43, + "grad_norm": 0.8071141709769595, + "learning_rate": 9.107444814258248e-06, + "loss": 0.6386, + "step": 3412 + }, + { + "epoch": 0.43, + "grad_norm": 0.6190393288077055, + "learning_rate": 9.106856491196687e-06, + "loss": 0.5587, + "step": 3413 + }, + { + "epoch": 0.43, + "grad_norm": 0.687739849457782, + "learning_rate": 9.106267993319367e-06, + "loss": 0.5009, + "step": 3414 + }, + { + "epoch": 0.44, + "grad_norm": 0.7321852620574655, + "learning_rate": 9.105679320651342e-06, + "loss": 0.5614, + "step": 3415 + }, + { + "epoch": 0.44, + "grad_norm": 0.795381282193837, + "learning_rate": 9.105090473217666e-06, + "loss": 0.5831, + "step": 3416 + }, + { + "epoch": 0.44, + "grad_norm": 0.5307703594494323, + "learning_rate": 9.104501451043408e-06, + "loss": 0.5021, + "step": 3417 + }, + { + "epoch": 0.44, + "grad_norm": 0.9021048770161111, + "learning_rate": 9.103912254153639e-06, + "loss": 0.5814, + "step": 3418 + }, + { + "epoch": 0.44, + "grad_norm": 1.091798701074009, + "learning_rate": 9.10332288257344e-06, + "loss": 0.629, + "step": 3419 + }, + { + "epoch": 0.44, + "grad_norm": 0.7033812224130684, + "learning_rate": 9.102733336327898e-06, + "loss": 0.5506, + "step": 3420 + }, + { + "epoch": 0.44, + "grad_norm": 0.7485385640187735, + "learning_rate": 9.102143615442108e-06, + "loss": 0.5998, + "step": 3421 + }, + { + "epoch": 0.44, + "grad_norm": 0.8108791291501346, + "learning_rate": 9.101553719941174e-06, + "loss": 0.5812, + "step": 3422 + }, + { + "epoch": 0.44, + "grad_norm": 0.7281258194228255, + "learning_rate": 9.100963649850205e-06, + "loss": 0.6093, + "step": 3423 + }, + { + "epoch": 0.44, + "grad_norm": 0.62751963414827, + "learning_rate": 9.100373405194315e-06, + "loss": 0.529, + "step": 3424 + }, + { + "epoch": 0.44, + "grad_norm": 0.830029602085821, + "learning_rate": 9.099782985998636e-06, + "loss": 0.5853, + "step": 3425 + }, + { + "epoch": 0.44, + "grad_norm": 0.5672589507696492, + "learning_rate": 9.099192392288294e-06, + "loss": 0.5196, + "step": 3426 + }, + { + "epoch": 0.44, + "grad_norm": 0.8414099568616669, + "learning_rate": 9.098601624088431e-06, + "loss": 0.5821, + "step": 3427 + }, + { + "epoch": 0.44, + "grad_norm": 0.5549940027603152, + "learning_rate": 9.098010681424195e-06, + "loss": 0.5335, + "step": 3428 + }, + { + "epoch": 0.44, + "grad_norm": 0.5860203072757566, + "learning_rate": 9.09741956432074e-06, + "loss": 0.5245, + "step": 3429 + }, + { + "epoch": 0.44, + "grad_norm": 0.6877467215061256, + "learning_rate": 9.096828272803226e-06, + "loss": 0.5793, + "step": 3430 + }, + { + "epoch": 0.44, + "grad_norm": 0.7067325107093925, + "learning_rate": 9.096236806896826e-06, + "loss": 0.5951, + "step": 3431 + }, + { + "epoch": 0.44, + "grad_norm": 0.6761810453383481, + "learning_rate": 9.095645166626715e-06, + "loss": 0.5423, + "step": 3432 + }, + { + "epoch": 0.44, + "grad_norm": 0.5536052962497934, + "learning_rate": 9.095053352018075e-06, + "loss": 0.4714, + "step": 3433 + }, + { + "epoch": 0.44, + "grad_norm": 0.781149231770683, + "learning_rate": 9.094461363096101e-06, + "loss": 0.6311, + "step": 3434 + }, + { + "epoch": 0.44, + "grad_norm": 0.8280277777453815, + "learning_rate": 9.093869199885989e-06, + "loss": 0.6185, + "step": 3435 + }, + { + "epoch": 0.44, + "grad_norm": 0.5771242618740802, + "learning_rate": 9.09327686241295e-06, + "loss": 0.4675, + "step": 3436 + }, + { + "epoch": 0.44, + "grad_norm": 0.617704913127233, + "learning_rate": 9.092684350702194e-06, + "loss": 0.6161, + "step": 3437 + }, + { + "epoch": 0.44, + "grad_norm": 0.8166272347866761, + "learning_rate": 9.092091664778944e-06, + "loss": 0.6166, + "step": 3438 + }, + { + "epoch": 0.44, + "grad_norm": 0.6073872660581949, + "learning_rate": 9.091498804668427e-06, + "loss": 0.5854, + "step": 3439 + }, + { + "epoch": 0.44, + "grad_norm": 0.7709524855851002, + "learning_rate": 9.090905770395881e-06, + "loss": 0.6592, + "step": 3440 + }, + { + "epoch": 0.44, + "grad_norm": 0.7446287029972817, + "learning_rate": 9.09031256198655e-06, + "loss": 0.5939, + "step": 3441 + }, + { + "epoch": 0.44, + "grad_norm": 0.5762656640439561, + "learning_rate": 9.089719179465684e-06, + "loss": 0.5321, + "step": 3442 + }, + { + "epoch": 0.44, + "grad_norm": 0.7922364264667471, + "learning_rate": 9.089125622858538e-06, + "loss": 0.6616, + "step": 3443 + }, + { + "epoch": 0.44, + "grad_norm": 0.5957511470102311, + "learning_rate": 9.088531892190387e-06, + "loss": 0.5604, + "step": 3444 + }, + { + "epoch": 0.44, + "grad_norm": 0.5609628154928852, + "learning_rate": 9.087937987486495e-06, + "loss": 0.4976, + "step": 3445 + }, + { + "epoch": 0.44, + "grad_norm": 0.6642831355527429, + "learning_rate": 9.087343908772147e-06, + "loss": 0.5502, + "step": 3446 + }, + { + "epoch": 0.44, + "grad_norm": 0.7742182344477593, + "learning_rate": 9.086749656072628e-06, + "loss": 0.6316, + "step": 3447 + }, + { + "epoch": 0.44, + "grad_norm": 0.7032856889729958, + "learning_rate": 9.086155229413237e-06, + "loss": 0.5739, + "step": 3448 + }, + { + "epoch": 0.44, + "grad_norm": 0.6828160605012141, + "learning_rate": 9.085560628819275e-06, + "loss": 0.6179, + "step": 3449 + }, + { + "epoch": 0.44, + "grad_norm": 0.6665142563842354, + "learning_rate": 9.084965854316053e-06, + "loss": 0.5718, + "step": 3450 + }, + { + "epoch": 0.44, + "grad_norm": 0.6591496051120823, + "learning_rate": 9.084370905928888e-06, + "loss": 0.5358, + "step": 3451 + }, + { + "epoch": 0.44, + "grad_norm": 0.9074742145259302, + "learning_rate": 9.083775783683107e-06, + "loss": 0.6626, + "step": 3452 + }, + { + "epoch": 0.44, + "grad_norm": 0.7297895037645014, + "learning_rate": 9.083180487604037e-06, + "loss": 0.5436, + "step": 3453 + }, + { + "epoch": 0.44, + "grad_norm": 0.6099829711461648, + "learning_rate": 9.082585017717025e-06, + "loss": 0.5411, + "step": 3454 + }, + { + "epoch": 0.44, + "grad_norm": 0.596012446359992, + "learning_rate": 9.081989374047414e-06, + "loss": 0.5547, + "step": 3455 + }, + { + "epoch": 0.44, + "grad_norm": 0.6013586954300786, + "learning_rate": 9.081393556620559e-06, + "loss": 0.546, + "step": 3456 + }, + { + "epoch": 0.44, + "grad_norm": 0.6037784928475926, + "learning_rate": 9.080797565461823e-06, + "loss": 0.5251, + "step": 3457 + }, + { + "epoch": 0.44, + "grad_norm": 0.6740132365035242, + "learning_rate": 9.080201400596576e-06, + "loss": 0.5677, + "step": 3458 + }, + { + "epoch": 0.44, + "grad_norm": 0.5920250294100176, + "learning_rate": 9.079605062050192e-06, + "loss": 0.4839, + "step": 3459 + }, + { + "epoch": 0.44, + "grad_norm": 0.8994227572042438, + "learning_rate": 9.079008549848059e-06, + "loss": 0.6469, + "step": 3460 + }, + { + "epoch": 0.44, + "grad_norm": 0.7040440667533507, + "learning_rate": 9.078411864015564e-06, + "loss": 0.554, + "step": 3461 + }, + { + "epoch": 0.44, + "grad_norm": 0.7005283401069226, + "learning_rate": 9.07781500457811e-06, + "loss": 0.5902, + "step": 3462 + }, + { + "epoch": 0.44, + "grad_norm": 0.6336861352541178, + "learning_rate": 9.077217971561104e-06, + "loss": 0.5874, + "step": 3463 + }, + { + "epoch": 0.44, + "grad_norm": 0.6950969304149969, + "learning_rate": 9.076620764989954e-06, + "loss": 0.5774, + "step": 3464 + }, + { + "epoch": 0.44, + "grad_norm": 0.663118890296144, + "learning_rate": 9.076023384890088e-06, + "loss": 0.5519, + "step": 3465 + }, + { + "epoch": 0.44, + "grad_norm": 0.740456861741743, + "learning_rate": 9.07542583128693e-06, + "loss": 0.6554, + "step": 3466 + }, + { + "epoch": 0.44, + "grad_norm": 0.9448633073039158, + "learning_rate": 9.07482810420592e-06, + "loss": 0.6324, + "step": 3467 + }, + { + "epoch": 0.44, + "grad_norm": 0.6265449999592396, + "learning_rate": 9.074230203672497e-06, + "loss": 0.5207, + "step": 3468 + }, + { + "epoch": 0.44, + "grad_norm": 0.773030939432317, + "learning_rate": 9.073632129712113e-06, + "loss": 0.6169, + "step": 3469 + }, + { + "epoch": 0.44, + "grad_norm": 0.9168246701072386, + "learning_rate": 9.073033882350228e-06, + "loss": 0.6942, + "step": 3470 + }, + { + "epoch": 0.44, + "grad_norm": 1.531811412093806, + "learning_rate": 9.072435461612306e-06, + "loss": 0.633, + "step": 3471 + }, + { + "epoch": 0.44, + "grad_norm": 0.6088111774191548, + "learning_rate": 9.07183686752382e-06, + "loss": 0.5624, + "step": 3472 + }, + { + "epoch": 0.44, + "grad_norm": 0.5718113399475073, + "learning_rate": 9.07123810011025e-06, + "loss": 0.5179, + "step": 3473 + }, + { + "epoch": 0.44, + "grad_norm": 0.6586860404694903, + "learning_rate": 9.070639159397083e-06, + "loss": 0.5209, + "step": 3474 + }, + { + "epoch": 0.44, + "grad_norm": 0.695078600559415, + "learning_rate": 9.070040045409816e-06, + "loss": 0.543, + "step": 3475 + }, + { + "epoch": 0.44, + "grad_norm": 0.7903743500897177, + "learning_rate": 9.06944075817395e-06, + "loss": 0.5196, + "step": 3476 + }, + { + "epoch": 0.44, + "grad_norm": 2.3566609457891983, + "learning_rate": 9.068841297714995e-06, + "loss": 0.6252, + "step": 3477 + }, + { + "epoch": 0.44, + "grad_norm": 0.8800272974540738, + "learning_rate": 9.068241664058468e-06, + "loss": 0.6801, + "step": 3478 + }, + { + "epoch": 0.44, + "grad_norm": 0.5566431404014974, + "learning_rate": 9.067641857229894e-06, + "loss": 0.5208, + "step": 3479 + }, + { + "epoch": 0.44, + "grad_norm": 0.6205159678325204, + "learning_rate": 9.067041877254804e-06, + "loss": 0.5347, + "step": 3480 + }, + { + "epoch": 0.44, + "grad_norm": 0.7549855744068191, + "learning_rate": 9.066441724158739e-06, + "loss": 0.5272, + "step": 3481 + }, + { + "epoch": 0.44, + "grad_norm": 0.5513535047703215, + "learning_rate": 9.065841397967244e-06, + "loss": 0.5126, + "step": 3482 + }, + { + "epoch": 0.44, + "grad_norm": 0.6202979237028544, + "learning_rate": 9.065240898705872e-06, + "loss": 0.5474, + "step": 3483 + }, + { + "epoch": 0.44, + "grad_norm": 0.716092780306067, + "learning_rate": 9.064640226400185e-06, + "loss": 0.5148, + "step": 3484 + }, + { + "epoch": 0.44, + "grad_norm": 0.6400446680115223, + "learning_rate": 9.064039381075754e-06, + "loss": 0.5406, + "step": 3485 + }, + { + "epoch": 0.44, + "grad_norm": 0.622689900777675, + "learning_rate": 9.063438362758152e-06, + "loss": 0.5448, + "step": 3486 + }, + { + "epoch": 0.44, + "grad_norm": 0.7307962940199746, + "learning_rate": 9.062837171472964e-06, + "loss": 0.5628, + "step": 3487 + }, + { + "epoch": 0.44, + "grad_norm": 0.6741559708179777, + "learning_rate": 9.062235807245782e-06, + "loss": 0.6137, + "step": 3488 + }, + { + "epoch": 0.44, + "grad_norm": 0.7451203092554143, + "learning_rate": 9.061634270102203e-06, + "loss": 0.6273, + "step": 3489 + }, + { + "epoch": 0.44, + "grad_norm": 0.7588593424679776, + "learning_rate": 9.06103256006783e-06, + "loss": 0.6269, + "step": 3490 + }, + { + "epoch": 0.44, + "grad_norm": 0.5981496309045811, + "learning_rate": 9.06043067716828e-06, + "loss": 0.5627, + "step": 3491 + }, + { + "epoch": 0.44, + "grad_norm": 0.6738724834755675, + "learning_rate": 9.05982862142917e-06, + "loss": 0.6255, + "step": 3492 + }, + { + "epoch": 0.44, + "grad_norm": 0.6156238216372788, + "learning_rate": 9.05922639287613e-06, + "loss": 0.5565, + "step": 3493 + }, + { + "epoch": 0.45, + "grad_norm": 0.6418613128208774, + "learning_rate": 9.058623991534792e-06, + "loss": 0.5136, + "step": 3494 + }, + { + "epoch": 0.45, + "grad_norm": 0.9065074820157594, + "learning_rate": 9.058021417430802e-06, + "loss": 0.6651, + "step": 3495 + }, + { + "epoch": 0.45, + "grad_norm": 0.877923466513029, + "learning_rate": 9.057418670589808e-06, + "loss": 0.6275, + "step": 3496 + }, + { + "epoch": 0.45, + "grad_norm": 0.6657402420828011, + "learning_rate": 9.056815751037467e-06, + "loss": 0.5542, + "step": 3497 + }, + { + "epoch": 0.45, + "grad_norm": 0.8721493810906396, + "learning_rate": 9.056212658799442e-06, + "loss": 0.6611, + "step": 3498 + }, + { + "epoch": 0.45, + "grad_norm": 0.6708404727325938, + "learning_rate": 9.055609393901406e-06, + "loss": 0.5685, + "step": 3499 + }, + { + "epoch": 0.45, + "grad_norm": 0.8400889025363306, + "learning_rate": 9.055005956369038e-06, + "loss": 0.6319, + "step": 3500 + }, + { + "epoch": 0.45, + "grad_norm": 0.9255944785305048, + "learning_rate": 9.054402346228025e-06, + "loss": 0.6464, + "step": 3501 + }, + { + "epoch": 0.45, + "grad_norm": 0.6925266681120441, + "learning_rate": 9.05379856350406e-06, + "loss": 0.5551, + "step": 3502 + }, + { + "epoch": 0.45, + "grad_norm": 0.7483439418040335, + "learning_rate": 9.053194608222843e-06, + "loss": 0.5183, + "step": 3503 + }, + { + "epoch": 0.45, + "grad_norm": 0.6846736393015276, + "learning_rate": 9.052590480410086e-06, + "loss": 0.5299, + "step": 3504 + }, + { + "epoch": 0.45, + "grad_norm": 0.6688169423039391, + "learning_rate": 9.051986180091501e-06, + "loss": 0.6147, + "step": 3505 + }, + { + "epoch": 0.45, + "grad_norm": 0.8562968352471371, + "learning_rate": 9.051381707292813e-06, + "loss": 0.6629, + "step": 3506 + }, + { + "epoch": 0.45, + "grad_norm": 0.5459875483422001, + "learning_rate": 9.050777062039753e-06, + "loss": 0.5054, + "step": 3507 + }, + { + "epoch": 0.45, + "grad_norm": 0.7191218558112576, + "learning_rate": 9.050172244358055e-06, + "loss": 0.6282, + "step": 3508 + }, + { + "epoch": 0.45, + "grad_norm": 0.9132885327842507, + "learning_rate": 9.04956725427347e-06, + "loss": 0.6207, + "step": 3509 + }, + { + "epoch": 0.45, + "grad_norm": 0.6305026284423193, + "learning_rate": 9.048962091811746e-06, + "loss": 0.5662, + "step": 3510 + }, + { + "epoch": 0.45, + "grad_norm": 0.792983350898886, + "learning_rate": 9.048356756998645e-06, + "loss": 0.6244, + "step": 3511 + }, + { + "epoch": 0.45, + "grad_norm": 1.1881448784013917, + "learning_rate": 9.047751249859935e-06, + "loss": 0.6438, + "step": 3512 + }, + { + "epoch": 0.45, + "grad_norm": 0.7654548079755309, + "learning_rate": 9.047145570421387e-06, + "loss": 0.5731, + "step": 3513 + }, + { + "epoch": 0.45, + "grad_norm": 0.882883690809048, + "learning_rate": 9.046539718708784e-06, + "loss": 0.6784, + "step": 3514 + }, + { + "epoch": 0.45, + "grad_norm": 0.6098511240638075, + "learning_rate": 9.045933694747919e-06, + "loss": 0.496, + "step": 3515 + }, + { + "epoch": 0.45, + "grad_norm": 0.8982423895289703, + "learning_rate": 9.045327498564584e-06, + "loss": 0.6329, + "step": 3516 + }, + { + "epoch": 0.45, + "grad_norm": 0.9235146585001086, + "learning_rate": 9.044721130184584e-06, + "loss": 0.5889, + "step": 3517 + }, + { + "epoch": 0.45, + "grad_norm": 0.6330000143716848, + "learning_rate": 9.04411458963373e-06, + "loss": 0.5864, + "step": 3518 + }, + { + "epoch": 0.45, + "grad_norm": 0.764318534633108, + "learning_rate": 9.043507876937843e-06, + "loss": 0.5963, + "step": 3519 + }, + { + "epoch": 0.45, + "grad_norm": 0.8447809422758628, + "learning_rate": 9.042900992122745e-06, + "loss": 0.6525, + "step": 3520 + }, + { + "epoch": 0.45, + "grad_norm": 0.6936724517910646, + "learning_rate": 9.042293935214272e-06, + "loss": 0.4891, + "step": 3521 + }, + { + "epoch": 0.45, + "grad_norm": 0.6371401996807756, + "learning_rate": 9.041686706238264e-06, + "loss": 0.5634, + "step": 3522 + }, + { + "epoch": 0.45, + "grad_norm": 0.5988503216968053, + "learning_rate": 9.041079305220569e-06, + "loss": 0.5076, + "step": 3523 + }, + { + "epoch": 0.45, + "grad_norm": 0.9111324375489593, + "learning_rate": 9.04047173218704e-06, + "loss": 0.6961, + "step": 3524 + }, + { + "epoch": 0.45, + "grad_norm": 1.0821068528264417, + "learning_rate": 9.03986398716354e-06, + "loss": 0.6123, + "step": 3525 + }, + { + "epoch": 0.45, + "grad_norm": 0.6841654508660304, + "learning_rate": 9.03925607017594e-06, + "loss": 0.5856, + "step": 3526 + }, + { + "epoch": 0.45, + "grad_norm": 0.8591711706905032, + "learning_rate": 9.038647981250117e-06, + "loss": 0.5892, + "step": 3527 + }, + { + "epoch": 0.45, + "grad_norm": 0.8684315769543314, + "learning_rate": 9.038039720411958e-06, + "loss": 0.5494, + "step": 3528 + }, + { + "epoch": 0.45, + "grad_norm": 1.1932004626529524, + "learning_rate": 9.037431287687348e-06, + "loss": 0.6754, + "step": 3529 + }, + { + "epoch": 0.45, + "grad_norm": 0.5471167506236436, + "learning_rate": 9.036822683102192e-06, + "loss": 0.5416, + "step": 3530 + }, + { + "epoch": 0.45, + "grad_norm": 1.3445573027619735, + "learning_rate": 9.036213906682392e-06, + "loss": 0.6121, + "step": 3531 + }, + { + "epoch": 0.45, + "grad_norm": 0.8696630568214259, + "learning_rate": 9.035604958453865e-06, + "loss": 0.7065, + "step": 3532 + }, + { + "epoch": 0.45, + "grad_norm": 0.738741077272047, + "learning_rate": 9.034995838442533e-06, + "loss": 0.6173, + "step": 3533 + }, + { + "epoch": 0.45, + "grad_norm": 0.683246772680504, + "learning_rate": 9.03438654667432e-06, + "loss": 0.6086, + "step": 3534 + }, + { + "epoch": 0.45, + "grad_norm": 0.6920271954019079, + "learning_rate": 9.033777083175165e-06, + "loss": 0.6018, + "step": 3535 + }, + { + "epoch": 0.45, + "grad_norm": 0.7884672789407592, + "learning_rate": 9.033167447971008e-06, + "loss": 0.6032, + "step": 3536 + }, + { + "epoch": 0.45, + "grad_norm": 0.5911953857692304, + "learning_rate": 9.0325576410878e-06, + "loss": 0.5322, + "step": 3537 + }, + { + "epoch": 0.45, + "grad_norm": 0.6238313942336, + "learning_rate": 9.031947662551502e-06, + "loss": 0.5774, + "step": 3538 + }, + { + "epoch": 0.45, + "grad_norm": 0.8525044260773812, + "learning_rate": 9.031337512388076e-06, + "loss": 0.6106, + "step": 3539 + }, + { + "epoch": 0.45, + "grad_norm": 0.7503718152639232, + "learning_rate": 9.030727190623495e-06, + "loss": 0.5538, + "step": 3540 + }, + { + "epoch": 0.45, + "grad_norm": 0.6340760337955164, + "learning_rate": 9.030116697283735e-06, + "loss": 0.5398, + "step": 3541 + }, + { + "epoch": 0.45, + "grad_norm": 0.7525091210116114, + "learning_rate": 9.029506032394787e-06, + "loss": 0.5367, + "step": 3542 + }, + { + "epoch": 0.45, + "grad_norm": 0.6724150312502559, + "learning_rate": 9.028895195982643e-06, + "loss": 0.5417, + "step": 3543 + }, + { + "epoch": 0.45, + "grad_norm": 0.656939585729722, + "learning_rate": 9.028284188073306e-06, + "loss": 0.5082, + "step": 3544 + }, + { + "epoch": 0.45, + "grad_norm": 0.6931507465832119, + "learning_rate": 9.027673008692785e-06, + "loss": 0.605, + "step": 3545 + }, + { + "epoch": 0.45, + "grad_norm": 0.6786019314804285, + "learning_rate": 9.027061657867094e-06, + "loss": 0.5854, + "step": 3546 + }, + { + "epoch": 0.45, + "grad_norm": 0.7182869959058398, + "learning_rate": 9.026450135622254e-06, + "loss": 0.6483, + "step": 3547 + }, + { + "epoch": 0.45, + "grad_norm": 0.7704219718710181, + "learning_rate": 9.0258384419843e-06, + "loss": 0.601, + "step": 3548 + }, + { + "epoch": 0.45, + "grad_norm": 0.5832354117182584, + "learning_rate": 9.02522657697927e-06, + "loss": 0.539, + "step": 3549 + }, + { + "epoch": 0.45, + "grad_norm": 0.7130117431608659, + "learning_rate": 9.024614540633205e-06, + "loss": 0.5908, + "step": 3550 + }, + { + "epoch": 0.45, + "grad_norm": 0.6324806673749507, + "learning_rate": 9.02400233297216e-06, + "loss": 0.5251, + "step": 3551 + }, + { + "epoch": 0.45, + "grad_norm": 0.7298318214376163, + "learning_rate": 9.023389954022196e-06, + "loss": 0.6148, + "step": 3552 + }, + { + "epoch": 0.45, + "grad_norm": 0.5603837477940324, + "learning_rate": 9.02277740380938e-06, + "loss": 0.5157, + "step": 3553 + }, + { + "epoch": 0.45, + "grad_norm": 0.7181252072288951, + "learning_rate": 9.022164682359783e-06, + "loss": 0.5862, + "step": 3554 + }, + { + "epoch": 0.45, + "grad_norm": 1.2766420117783404, + "learning_rate": 9.021551789699487e-06, + "loss": 0.5757, + "step": 3555 + }, + { + "epoch": 0.45, + "grad_norm": 0.8044614780027008, + "learning_rate": 9.020938725854585e-06, + "loss": 0.6335, + "step": 3556 + }, + { + "epoch": 0.45, + "grad_norm": 0.6934871265053367, + "learning_rate": 9.020325490851169e-06, + "loss": 0.5562, + "step": 3557 + }, + { + "epoch": 0.45, + "grad_norm": 0.565211036478897, + "learning_rate": 9.019712084715343e-06, + "loss": 0.5114, + "step": 3558 + }, + { + "epoch": 0.45, + "grad_norm": 0.7343693646958838, + "learning_rate": 9.019098507473222e-06, + "loss": 0.5775, + "step": 3559 + }, + { + "epoch": 0.45, + "grad_norm": 0.8307521926904984, + "learning_rate": 9.018484759150919e-06, + "loss": 0.6105, + "step": 3560 + }, + { + "epoch": 0.45, + "grad_norm": 0.6969391684108982, + "learning_rate": 9.017870839774561e-06, + "loss": 0.5618, + "step": 3561 + }, + { + "epoch": 0.45, + "grad_norm": 0.7913594337955373, + "learning_rate": 9.01725674937028e-06, + "loss": 0.6236, + "step": 3562 + }, + { + "epoch": 0.45, + "grad_norm": 0.7831484268905573, + "learning_rate": 9.016642487964216e-06, + "loss": 0.6271, + "step": 3563 + }, + { + "epoch": 0.45, + "grad_norm": 0.7734479276434101, + "learning_rate": 9.016028055582519e-06, + "loss": 0.5629, + "step": 3564 + }, + { + "epoch": 0.45, + "grad_norm": 0.9184675808772949, + "learning_rate": 9.015413452251339e-06, + "loss": 0.5913, + "step": 3565 + }, + { + "epoch": 0.45, + "grad_norm": 0.9242821490871466, + "learning_rate": 9.014798677996842e-06, + "loss": 0.6394, + "step": 3566 + }, + { + "epoch": 0.45, + "grad_norm": 0.5374049006998934, + "learning_rate": 9.014183732845192e-06, + "loss": 0.5085, + "step": 3567 + }, + { + "epoch": 0.45, + "grad_norm": 0.5652246223121693, + "learning_rate": 9.013568616822569e-06, + "loss": 0.5485, + "step": 3568 + }, + { + "epoch": 0.45, + "grad_norm": 0.736919600344496, + "learning_rate": 9.012953329955155e-06, + "loss": 0.6425, + "step": 3569 + }, + { + "epoch": 0.45, + "grad_norm": 0.6557264578405426, + "learning_rate": 9.012337872269142e-06, + "loss": 0.4939, + "step": 3570 + }, + { + "epoch": 0.45, + "grad_norm": 0.6350277987555162, + "learning_rate": 9.011722243790728e-06, + "loss": 0.598, + "step": 3571 + }, + { + "epoch": 0.46, + "grad_norm": 0.8998325682651059, + "learning_rate": 9.011106444546115e-06, + "loss": 0.5951, + "step": 3572 + }, + { + "epoch": 0.46, + "grad_norm": 0.854526313990166, + "learning_rate": 9.010490474561522e-06, + "loss": 0.6122, + "step": 3573 + }, + { + "epoch": 0.46, + "grad_norm": 0.7132548594570698, + "learning_rate": 9.009874333863163e-06, + "loss": 0.5614, + "step": 3574 + }, + { + "epoch": 0.46, + "grad_norm": 0.6080094417516144, + "learning_rate": 9.009258022477268e-06, + "loss": 0.4917, + "step": 3575 + }, + { + "epoch": 0.46, + "grad_norm": 0.8485189554263428, + "learning_rate": 9.00864154043007e-06, + "loss": 0.6404, + "step": 3576 + }, + { + "epoch": 0.46, + "grad_norm": 0.8277463297538589, + "learning_rate": 9.008024887747813e-06, + "loss": 0.5755, + "step": 3577 + }, + { + "epoch": 0.46, + "grad_norm": 0.967668426314493, + "learning_rate": 9.007408064456744e-06, + "loss": 0.6513, + "step": 3578 + }, + { + "epoch": 0.46, + "grad_norm": 0.6532989589050314, + "learning_rate": 9.006791070583119e-06, + "loss": 0.5663, + "step": 3579 + }, + { + "epoch": 0.46, + "grad_norm": 0.6288425525090104, + "learning_rate": 9.006173906153202e-06, + "loss": 0.4971, + "step": 3580 + }, + { + "epoch": 0.46, + "grad_norm": 0.7285857204167445, + "learning_rate": 9.005556571193263e-06, + "loss": 0.5593, + "step": 3581 + }, + { + "epoch": 0.46, + "grad_norm": 0.6763956316419001, + "learning_rate": 9.004939065729581e-06, + "loss": 0.5647, + "step": 3582 + }, + { + "epoch": 0.46, + "grad_norm": 0.8032770269219566, + "learning_rate": 9.004321389788442e-06, + "loss": 0.6445, + "step": 3583 + }, + { + "epoch": 0.46, + "grad_norm": 0.969687943895717, + "learning_rate": 9.003703543396137e-06, + "loss": 0.6297, + "step": 3584 + }, + { + "epoch": 0.46, + "grad_norm": 0.5827599786857293, + "learning_rate": 9.003085526578964e-06, + "loss": 0.5076, + "step": 3585 + }, + { + "epoch": 0.46, + "grad_norm": 0.6571114061701214, + "learning_rate": 9.002467339363234e-06, + "loss": 0.5265, + "step": 3586 + }, + { + "epoch": 0.46, + "grad_norm": 0.8832216698744784, + "learning_rate": 9.001848981775261e-06, + "loss": 0.5551, + "step": 3587 + }, + { + "epoch": 0.46, + "grad_norm": 0.7237677051629329, + "learning_rate": 9.001230453841363e-06, + "loss": 0.5705, + "step": 3588 + }, + { + "epoch": 0.46, + "grad_norm": 0.8385872730792924, + "learning_rate": 9.000611755587872e-06, + "loss": 0.6383, + "step": 3589 + }, + { + "epoch": 0.46, + "grad_norm": 0.8945242828773745, + "learning_rate": 8.99999288704112e-06, + "loss": 0.6061, + "step": 3590 + }, + { + "epoch": 0.46, + "grad_norm": 0.7118769071711949, + "learning_rate": 8.999373848227455e-06, + "loss": 0.5276, + "step": 3591 + }, + { + "epoch": 0.46, + "grad_norm": 0.5566838614263709, + "learning_rate": 8.998754639173225e-06, + "loss": 0.5062, + "step": 3592 + }, + { + "epoch": 0.46, + "grad_norm": 0.6515824007018469, + "learning_rate": 8.998135259904787e-06, + "loss": 0.5533, + "step": 3593 + }, + { + "epoch": 0.46, + "grad_norm": 0.7030632564794325, + "learning_rate": 8.99751571044851e-06, + "loss": 0.5867, + "step": 3594 + }, + { + "epoch": 0.46, + "grad_norm": 0.7117488516645356, + "learning_rate": 8.996895990830763e-06, + "loss": 0.5591, + "step": 3595 + }, + { + "epoch": 0.46, + "grad_norm": 0.6918029758605921, + "learning_rate": 8.996276101077924e-06, + "loss": 0.5465, + "step": 3596 + }, + { + "epoch": 0.46, + "grad_norm": 0.6188471235386569, + "learning_rate": 8.995656041216383e-06, + "loss": 0.5254, + "step": 3597 + }, + { + "epoch": 0.46, + "grad_norm": 0.6665565780111213, + "learning_rate": 8.995035811272533e-06, + "loss": 0.5108, + "step": 3598 + }, + { + "epoch": 0.46, + "grad_norm": 0.6798739320660234, + "learning_rate": 8.994415411272772e-06, + "loss": 0.5485, + "step": 3599 + }, + { + "epoch": 0.46, + "grad_norm": 0.5883844255105386, + "learning_rate": 8.993794841243513e-06, + "loss": 0.4893, + "step": 3600 + }, + { + "epoch": 0.46, + "grad_norm": 0.8426013484882545, + "learning_rate": 8.99317410121117e-06, + "loss": 0.6526, + "step": 3601 + }, + { + "epoch": 0.46, + "grad_norm": 0.9301523420229989, + "learning_rate": 8.992553191202168e-06, + "loss": 0.6579, + "step": 3602 + }, + { + "epoch": 0.46, + "grad_norm": 0.6248214648633588, + "learning_rate": 8.991932111242933e-06, + "loss": 0.5348, + "step": 3603 + }, + { + "epoch": 0.46, + "grad_norm": 0.8076042973562217, + "learning_rate": 8.991310861359904e-06, + "loss": 0.539, + "step": 3604 + }, + { + "epoch": 0.46, + "grad_norm": 0.7605421082766961, + "learning_rate": 8.990689441579526e-06, + "loss": 0.6825, + "step": 3605 + }, + { + "epoch": 0.46, + "grad_norm": 0.7127814112166351, + "learning_rate": 8.990067851928252e-06, + "loss": 0.5872, + "step": 3606 + }, + { + "epoch": 0.46, + "grad_norm": 0.5811292756002087, + "learning_rate": 8.98944609243254e-06, + "loss": 0.5591, + "step": 3607 + }, + { + "epoch": 0.46, + "grad_norm": 0.5792800572504391, + "learning_rate": 8.988824163118854e-06, + "loss": 0.5058, + "step": 3608 + }, + { + "epoch": 0.46, + "grad_norm": 0.7089904525701487, + "learning_rate": 8.988202064013673e-06, + "loss": 0.5812, + "step": 3609 + }, + { + "epoch": 0.46, + "grad_norm": 0.7294877951057975, + "learning_rate": 8.987579795143473e-06, + "loss": 0.5835, + "step": 3610 + }, + { + "epoch": 0.46, + "grad_norm": 0.6887990728814826, + "learning_rate": 8.986957356534742e-06, + "loss": 0.5641, + "step": 3611 + }, + { + "epoch": 0.46, + "grad_norm": 0.7191118812383622, + "learning_rate": 8.98633474821398e-06, + "loss": 0.5935, + "step": 3612 + }, + { + "epoch": 0.46, + "grad_norm": 0.6515297198485607, + "learning_rate": 8.985711970207685e-06, + "loss": 0.5361, + "step": 3613 + }, + { + "epoch": 0.46, + "grad_norm": 0.9157814976397483, + "learning_rate": 8.985089022542367e-06, + "loss": 0.6844, + "step": 3614 + }, + { + "epoch": 0.46, + "grad_norm": 0.7650958676349688, + "learning_rate": 8.984465905244546e-06, + "loss": 0.6407, + "step": 3615 + }, + { + "epoch": 0.46, + "grad_norm": 2.024815730251743, + "learning_rate": 8.983842618340742e-06, + "loss": 0.6333, + "step": 3616 + }, + { + "epoch": 0.46, + "grad_norm": 0.5460686408606807, + "learning_rate": 8.983219161857489e-06, + "loss": 0.5028, + "step": 3617 + }, + { + "epoch": 0.46, + "grad_norm": 0.6485308670297574, + "learning_rate": 8.982595535821324e-06, + "loss": 0.55, + "step": 3618 + }, + { + "epoch": 0.46, + "grad_norm": 0.6363369782361266, + "learning_rate": 8.981971740258795e-06, + "loss": 0.5457, + "step": 3619 + }, + { + "epoch": 0.46, + "grad_norm": 0.6064298546329495, + "learning_rate": 8.981347775196452e-06, + "loss": 0.572, + "step": 3620 + }, + { + "epoch": 0.46, + "grad_norm": 0.750851122173302, + "learning_rate": 8.980723640660857e-06, + "loss": 0.5652, + "step": 3621 + }, + { + "epoch": 0.46, + "grad_norm": 0.6175940383720777, + "learning_rate": 8.980099336678577e-06, + "loss": 0.5109, + "step": 3622 + }, + { + "epoch": 0.46, + "grad_norm": 0.6855898550667417, + "learning_rate": 8.979474863276188e-06, + "loss": 0.5377, + "step": 3623 + }, + { + "epoch": 0.46, + "grad_norm": 0.5832630637423732, + "learning_rate": 8.97885022048027e-06, + "loss": 0.5287, + "step": 3624 + }, + { + "epoch": 0.46, + "grad_norm": 0.6370558039230712, + "learning_rate": 8.978225408317414e-06, + "loss": 0.5674, + "step": 3625 + }, + { + "epoch": 0.46, + "grad_norm": 0.7609008271334486, + "learning_rate": 8.977600426814215e-06, + "loss": 0.6438, + "step": 3626 + }, + { + "epoch": 0.46, + "grad_norm": 0.6273958182773944, + "learning_rate": 8.976975275997275e-06, + "loss": 0.5573, + "step": 3627 + }, + { + "epoch": 0.46, + "grad_norm": 0.9226487919657396, + "learning_rate": 8.976349955893206e-06, + "loss": 0.6231, + "step": 3628 + }, + { + "epoch": 0.46, + "grad_norm": 0.6128143406081149, + "learning_rate": 8.975724466528626e-06, + "loss": 0.5613, + "step": 3629 + }, + { + "epoch": 0.46, + "grad_norm": 0.6398142618401653, + "learning_rate": 8.97509880793016e-06, + "loss": 0.5611, + "step": 3630 + }, + { + "epoch": 0.46, + "grad_norm": 0.7264243852564435, + "learning_rate": 8.974472980124443e-06, + "loss": 0.5442, + "step": 3631 + }, + { + "epoch": 0.46, + "grad_norm": 0.5936566974062466, + "learning_rate": 8.973846983138107e-06, + "loss": 0.5127, + "step": 3632 + }, + { + "epoch": 0.46, + "grad_norm": 0.6381544609829135, + "learning_rate": 8.973220816997808e-06, + "loss": 0.5251, + "step": 3633 + }, + { + "epoch": 0.46, + "grad_norm": 0.6156416391308108, + "learning_rate": 8.972594481730192e-06, + "loss": 0.5123, + "step": 3634 + }, + { + "epoch": 0.46, + "grad_norm": 0.5261340639641087, + "learning_rate": 8.971967977361925e-06, + "loss": 0.5031, + "step": 3635 + }, + { + "epoch": 0.46, + "grad_norm": 0.788753557283552, + "learning_rate": 8.971341303919676e-06, + "loss": 0.6076, + "step": 3636 + }, + { + "epoch": 0.46, + "grad_norm": 0.7121461437090074, + "learning_rate": 8.970714461430116e-06, + "loss": 0.5194, + "step": 3637 + }, + { + "epoch": 0.46, + "grad_norm": 0.6242108132600825, + "learning_rate": 8.970087449919929e-06, + "loss": 0.5454, + "step": 3638 + }, + { + "epoch": 0.46, + "grad_norm": 0.8317702218257098, + "learning_rate": 8.969460269415807e-06, + "loss": 0.6484, + "step": 3639 + }, + { + "epoch": 0.46, + "grad_norm": 0.6835746484984608, + "learning_rate": 8.968832919944446e-06, + "loss": 0.5384, + "step": 3640 + }, + { + "epoch": 0.46, + "grad_norm": 0.6736306798476158, + "learning_rate": 8.96820540153255e-06, + "loss": 0.5284, + "step": 3641 + }, + { + "epoch": 0.46, + "grad_norm": 1.0457220740733495, + "learning_rate": 8.967577714206832e-06, + "loss": 0.6925, + "step": 3642 + }, + { + "epoch": 0.46, + "grad_norm": 0.7113487787701108, + "learning_rate": 8.966949857994007e-06, + "loss": 0.6277, + "step": 3643 + }, + { + "epoch": 0.46, + "grad_norm": 0.7970221435192456, + "learning_rate": 8.966321832920804e-06, + "loss": 0.6078, + "step": 3644 + }, + { + "epoch": 0.46, + "grad_norm": 0.7886578393286339, + "learning_rate": 8.965693639013955e-06, + "loss": 0.5752, + "step": 3645 + }, + { + "epoch": 0.46, + "grad_norm": 0.7332633846795106, + "learning_rate": 8.965065276300202e-06, + "loss": 0.6483, + "step": 3646 + }, + { + "epoch": 0.46, + "grad_norm": 0.7502611305161161, + "learning_rate": 8.96443674480629e-06, + "loss": 0.5996, + "step": 3647 + }, + { + "epoch": 0.46, + "grad_norm": 0.6608765133593425, + "learning_rate": 8.963808044558972e-06, + "loss": 0.5511, + "step": 3648 + }, + { + "epoch": 0.46, + "grad_norm": 0.7846595181403333, + "learning_rate": 8.963179175585012e-06, + "loss": 0.5486, + "step": 3649 + }, + { + "epoch": 0.46, + "grad_norm": 0.9753311951927206, + "learning_rate": 8.962550137911182e-06, + "loss": 0.5919, + "step": 3650 + }, + { + "epoch": 0.47, + "grad_norm": 0.6435971415344093, + "learning_rate": 8.961920931564255e-06, + "loss": 0.5739, + "step": 3651 + }, + { + "epoch": 0.47, + "grad_norm": 0.7819153363536666, + "learning_rate": 8.961291556571012e-06, + "loss": 0.644, + "step": 3652 + }, + { + "epoch": 0.47, + "grad_norm": 0.8680582180989188, + "learning_rate": 8.960662012958247e-06, + "loss": 0.6545, + "step": 3653 + }, + { + "epoch": 0.47, + "grad_norm": 0.698502228725985, + "learning_rate": 8.960032300752756e-06, + "loss": 0.5272, + "step": 3654 + }, + { + "epoch": 0.47, + "grad_norm": 0.6257151794084105, + "learning_rate": 8.959402419981346e-06, + "loss": 0.5477, + "step": 3655 + }, + { + "epoch": 0.47, + "grad_norm": 1.308083973214287, + "learning_rate": 8.958772370670826e-06, + "loss": 0.6393, + "step": 3656 + }, + { + "epoch": 0.47, + "grad_norm": 0.7685872453673079, + "learning_rate": 8.958142152848017e-06, + "loss": 0.6066, + "step": 3657 + }, + { + "epoch": 0.47, + "grad_norm": 0.7596884286710909, + "learning_rate": 8.957511766539745e-06, + "loss": 0.5981, + "step": 3658 + }, + { + "epoch": 0.47, + "grad_norm": 0.7544052857735296, + "learning_rate": 8.956881211772845e-06, + "loss": 0.6317, + "step": 3659 + }, + { + "epoch": 0.47, + "grad_norm": 0.8116571271042058, + "learning_rate": 8.956250488574155e-06, + "loss": 0.6562, + "step": 3660 + }, + { + "epoch": 0.47, + "grad_norm": 0.8595715639648566, + "learning_rate": 8.955619596970523e-06, + "loss": 0.6507, + "step": 3661 + }, + { + "epoch": 0.47, + "grad_norm": 0.6162109935608634, + "learning_rate": 8.954988536988806e-06, + "loss": 0.5304, + "step": 3662 + }, + { + "epoch": 0.47, + "grad_norm": 0.7534583776145457, + "learning_rate": 8.954357308655866e-06, + "loss": 0.6389, + "step": 3663 + }, + { + "epoch": 0.47, + "grad_norm": 0.7576891010065256, + "learning_rate": 8.95372591199857e-06, + "loss": 0.6503, + "step": 3664 + }, + { + "epoch": 0.47, + "grad_norm": 0.5698501475385936, + "learning_rate": 8.953094347043797e-06, + "loss": 0.5285, + "step": 3665 + }, + { + "epoch": 0.47, + "grad_norm": 0.695740224806512, + "learning_rate": 8.95246261381843e-06, + "loss": 0.5967, + "step": 3666 + }, + { + "epoch": 0.47, + "grad_norm": 0.6306413817937664, + "learning_rate": 8.95183071234936e-06, + "loss": 0.6064, + "step": 3667 + }, + { + "epoch": 0.47, + "grad_norm": 0.5847056946508924, + "learning_rate": 8.951198642663485e-06, + "loss": 0.5417, + "step": 3668 + }, + { + "epoch": 0.47, + "grad_norm": 0.7661917869993571, + "learning_rate": 8.95056640478771e-06, + "loss": 0.6035, + "step": 3669 + }, + { + "epoch": 0.47, + "grad_norm": 0.7672207861555538, + "learning_rate": 8.949933998748946e-06, + "loss": 0.5935, + "step": 3670 + }, + { + "epoch": 0.47, + "grad_norm": 0.7525470670952771, + "learning_rate": 8.949301424574115e-06, + "loss": 0.6315, + "step": 3671 + }, + { + "epoch": 0.47, + "grad_norm": 0.7143112124804997, + "learning_rate": 8.948668682290142e-06, + "loss": 0.5592, + "step": 3672 + }, + { + "epoch": 0.47, + "grad_norm": 0.7831403294970477, + "learning_rate": 8.948035771923961e-06, + "loss": 0.6157, + "step": 3673 + }, + { + "epoch": 0.47, + "grad_norm": 0.6399385467758807, + "learning_rate": 8.947402693502515e-06, + "loss": 0.4654, + "step": 3674 + }, + { + "epoch": 0.47, + "grad_norm": 0.8733134726545604, + "learning_rate": 8.94676944705275e-06, + "loss": 0.6258, + "step": 3675 + }, + { + "epoch": 0.47, + "grad_norm": 0.7460259667296016, + "learning_rate": 8.946136032601623e-06, + "loss": 0.5559, + "step": 3676 + }, + { + "epoch": 0.47, + "grad_norm": 0.8508096676925495, + "learning_rate": 8.945502450176092e-06, + "loss": 0.5815, + "step": 3677 + }, + { + "epoch": 0.47, + "grad_norm": 0.6754333686919946, + "learning_rate": 8.944868699803131e-06, + "loss": 0.5236, + "step": 3678 + }, + { + "epoch": 0.47, + "grad_norm": 0.5879271451746475, + "learning_rate": 8.944234781509715e-06, + "loss": 0.5497, + "step": 3679 + }, + { + "epoch": 0.47, + "grad_norm": 0.8066598560244146, + "learning_rate": 8.94360069532283e-06, + "loss": 0.5931, + "step": 3680 + }, + { + "epoch": 0.47, + "grad_norm": 0.589504278045563, + "learning_rate": 8.942966441269464e-06, + "loss": 0.491, + "step": 3681 + }, + { + "epoch": 0.47, + "grad_norm": 0.5845327895512482, + "learning_rate": 8.942332019376616e-06, + "loss": 0.5332, + "step": 3682 + }, + { + "epoch": 0.47, + "grad_norm": 0.7246144716836019, + "learning_rate": 8.941697429671292e-06, + "loss": 0.6354, + "step": 3683 + }, + { + "epoch": 0.47, + "grad_norm": 0.5774564684260933, + "learning_rate": 8.941062672180504e-06, + "loss": 0.526, + "step": 3684 + }, + { + "epoch": 0.47, + "grad_norm": 0.823466105597688, + "learning_rate": 8.940427746931273e-06, + "loss": 0.5982, + "step": 3685 + }, + { + "epoch": 0.47, + "grad_norm": 1.261700497687932, + "learning_rate": 8.939792653950625e-06, + "loss": 0.6182, + "step": 3686 + }, + { + "epoch": 0.47, + "grad_norm": 0.6459124300892567, + "learning_rate": 8.939157393265595e-06, + "loss": 0.5445, + "step": 3687 + }, + { + "epoch": 0.47, + "grad_norm": 0.6322548075569477, + "learning_rate": 8.93852196490322e-06, + "loss": 0.5482, + "step": 3688 + }, + { + "epoch": 0.47, + "grad_norm": 0.797144074997912, + "learning_rate": 8.93788636889055e-06, + "loss": 0.5824, + "step": 3689 + }, + { + "epoch": 0.47, + "grad_norm": 0.746353141671459, + "learning_rate": 8.937250605254641e-06, + "loss": 0.5989, + "step": 3690 + }, + { + "epoch": 0.47, + "grad_norm": 0.8253001390495809, + "learning_rate": 8.936614674022556e-06, + "loss": 0.6158, + "step": 3691 + }, + { + "epoch": 0.47, + "grad_norm": 0.8076365924064134, + "learning_rate": 8.935978575221366e-06, + "loss": 0.6325, + "step": 3692 + }, + { + "epoch": 0.47, + "grad_norm": 0.5585774221205613, + "learning_rate": 8.935342308878144e-06, + "loss": 0.4789, + "step": 3693 + }, + { + "epoch": 0.47, + "grad_norm": 0.6422880908803965, + "learning_rate": 8.934705875019976e-06, + "loss": 0.563, + "step": 3694 + }, + { + "epoch": 0.47, + "grad_norm": 0.9356429250556946, + "learning_rate": 8.934069273673951e-06, + "loss": 0.5969, + "step": 3695 + }, + { + "epoch": 0.47, + "grad_norm": 0.6924930173721241, + "learning_rate": 8.93343250486717e-06, + "loss": 0.5042, + "step": 3696 + }, + { + "epoch": 0.47, + "grad_norm": 0.6793333757864841, + "learning_rate": 8.932795568626736e-06, + "loss": 0.4747, + "step": 3697 + }, + { + "epoch": 0.47, + "grad_norm": 0.6395061693133614, + "learning_rate": 8.93215846497976e-06, + "loss": 0.5652, + "step": 3698 + }, + { + "epoch": 0.47, + "grad_norm": 0.789979768524907, + "learning_rate": 8.931521193953367e-06, + "loss": 0.573, + "step": 3699 + }, + { + "epoch": 0.47, + "grad_norm": 0.9383309760443929, + "learning_rate": 8.930883755574678e-06, + "loss": 0.6093, + "step": 3700 + }, + { + "epoch": 0.47, + "grad_norm": 0.7544976213925937, + "learning_rate": 8.93024614987083e-06, + "loss": 0.676, + "step": 3701 + }, + { + "epoch": 0.47, + "grad_norm": 0.7721047466971459, + "learning_rate": 8.929608376868963e-06, + "loss": 0.63, + "step": 3702 + }, + { + "epoch": 0.47, + "grad_norm": 0.5767679134176217, + "learning_rate": 8.928970436596223e-06, + "loss": 0.4625, + "step": 3703 + }, + { + "epoch": 0.47, + "grad_norm": 0.7562580186526595, + "learning_rate": 8.928332329079767e-06, + "loss": 0.6102, + "step": 3704 + }, + { + "epoch": 0.47, + "grad_norm": 0.588729134635355, + "learning_rate": 8.927694054346758e-06, + "loss": 0.5789, + "step": 3705 + }, + { + "epoch": 0.47, + "grad_norm": 0.7680495111791352, + "learning_rate": 8.927055612424365e-06, + "loss": 0.6219, + "step": 3706 + }, + { + "epoch": 0.47, + "grad_norm": 0.6400531904276301, + "learning_rate": 8.926417003339762e-06, + "loss": 0.5259, + "step": 3707 + }, + { + "epoch": 0.47, + "grad_norm": 0.6218218356598285, + "learning_rate": 8.925778227120135e-06, + "loss": 0.5129, + "step": 3708 + }, + { + "epoch": 0.47, + "grad_norm": 0.7326078904729264, + "learning_rate": 8.925139283792672e-06, + "loss": 0.6119, + "step": 3709 + }, + { + "epoch": 0.47, + "grad_norm": 0.6507012789068991, + "learning_rate": 8.924500173384575e-06, + "loss": 0.5466, + "step": 3710 + }, + { + "epoch": 0.47, + "grad_norm": 0.8024968920410219, + "learning_rate": 8.923860895923045e-06, + "loss": 0.6062, + "step": 3711 + }, + { + "epoch": 0.47, + "grad_norm": 0.6992840744481165, + "learning_rate": 8.923221451435297e-06, + "loss": 0.565, + "step": 3712 + }, + { + "epoch": 0.47, + "grad_norm": 0.6309224522411508, + "learning_rate": 8.922581839948548e-06, + "loss": 0.5373, + "step": 3713 + }, + { + "epoch": 0.47, + "grad_norm": 0.7646701275992417, + "learning_rate": 8.921942061490023e-06, + "loss": 0.6289, + "step": 3714 + }, + { + "epoch": 0.47, + "grad_norm": 0.7521383474272808, + "learning_rate": 8.92130211608696e-06, + "loss": 0.6059, + "step": 3715 + }, + { + "epoch": 0.47, + "grad_norm": 0.8684102885098306, + "learning_rate": 8.920662003766595e-06, + "loss": 0.6571, + "step": 3716 + }, + { + "epoch": 0.47, + "grad_norm": 0.7495562426728156, + "learning_rate": 8.920021724556178e-06, + "loss": 0.6, + "step": 3717 + }, + { + "epoch": 0.47, + "grad_norm": 0.6088179970275477, + "learning_rate": 8.919381278482962e-06, + "loss": 0.5447, + "step": 3718 + }, + { + "epoch": 0.47, + "grad_norm": 0.8082923425081123, + "learning_rate": 8.91874066557421e-06, + "loss": 0.6528, + "step": 3719 + }, + { + "epoch": 0.47, + "grad_norm": 0.5844725058247551, + "learning_rate": 8.91809988585719e-06, + "loss": 0.5033, + "step": 3720 + }, + { + "epoch": 0.47, + "grad_norm": 0.9171774230903313, + "learning_rate": 8.917458939359178e-06, + "loss": 0.648, + "step": 3721 + }, + { + "epoch": 0.47, + "grad_norm": 0.6941438210718455, + "learning_rate": 8.91681782610746e-06, + "loss": 0.5536, + "step": 3722 + }, + { + "epoch": 0.47, + "grad_norm": 0.6281803328648323, + "learning_rate": 8.91617654612932e-06, + "loss": 0.4978, + "step": 3723 + }, + { + "epoch": 0.47, + "grad_norm": 0.6482814929124665, + "learning_rate": 8.915535099452061e-06, + "loss": 0.538, + "step": 3724 + }, + { + "epoch": 0.47, + "grad_norm": 0.7974226398465755, + "learning_rate": 8.914893486102983e-06, + "loss": 0.5559, + "step": 3725 + }, + { + "epoch": 0.47, + "grad_norm": 0.6514447406437415, + "learning_rate": 8.914251706109402e-06, + "loss": 0.5573, + "step": 3726 + }, + { + "epoch": 0.47, + "grad_norm": 0.8363222865577461, + "learning_rate": 8.913609759498631e-06, + "loss": 0.6463, + "step": 3727 + }, + { + "epoch": 0.47, + "grad_norm": 0.9874203579039861, + "learning_rate": 8.912967646298001e-06, + "loss": 0.6479, + "step": 3728 + }, + { + "epoch": 0.48, + "grad_norm": 0.8192879669837126, + "learning_rate": 8.912325366534842e-06, + "loss": 0.6297, + "step": 3729 + }, + { + "epoch": 0.48, + "grad_norm": 0.5820980775361936, + "learning_rate": 8.911682920236495e-06, + "loss": 0.5041, + "step": 3730 + }, + { + "epoch": 0.48, + "grad_norm": 0.5925154836169583, + "learning_rate": 8.911040307430303e-06, + "loss": 0.5716, + "step": 3731 + }, + { + "epoch": 0.48, + "grad_norm": 0.9268843463803044, + "learning_rate": 8.910397528143627e-06, + "loss": 0.6361, + "step": 3732 + }, + { + "epoch": 0.48, + "grad_norm": 0.8574553628574446, + "learning_rate": 8.909754582403822e-06, + "loss": 0.6125, + "step": 3733 + }, + { + "epoch": 0.48, + "grad_norm": 1.0145060814788536, + "learning_rate": 8.909111470238256e-06, + "loss": 0.6787, + "step": 3734 + }, + { + "epoch": 0.48, + "grad_norm": 0.5704184117156543, + "learning_rate": 8.90846819167431e-06, + "loss": 0.5245, + "step": 3735 + }, + { + "epoch": 0.48, + "grad_norm": 1.3110162758325523, + "learning_rate": 8.907824746739363e-06, + "loss": 0.6501, + "step": 3736 + }, + { + "epoch": 0.48, + "grad_norm": 0.9043482548097569, + "learning_rate": 8.9071811354608e-06, + "loss": 0.6681, + "step": 3737 + }, + { + "epoch": 0.48, + "grad_norm": 0.709447598432559, + "learning_rate": 8.906537357866026e-06, + "loss": 0.5274, + "step": 3738 + }, + { + "epoch": 0.48, + "grad_norm": 1.0643688626522934, + "learning_rate": 8.905893413982438e-06, + "loss": 0.6129, + "step": 3739 + }, + { + "epoch": 0.48, + "grad_norm": 0.6010955615352654, + "learning_rate": 8.905249303837448e-06, + "loss": 0.588, + "step": 3740 + }, + { + "epoch": 0.48, + "grad_norm": 0.6190471340716149, + "learning_rate": 8.904605027458477e-06, + "loss": 0.5585, + "step": 3741 + }, + { + "epoch": 0.48, + "grad_norm": 0.8350867052162392, + "learning_rate": 8.903960584872945e-06, + "loss": 0.653, + "step": 3742 + }, + { + "epoch": 0.48, + "grad_norm": 0.6357271986644341, + "learning_rate": 8.90331597610829e-06, + "loss": 0.538, + "step": 3743 + }, + { + "epoch": 0.48, + "grad_norm": 0.9068142955416113, + "learning_rate": 8.90267120119194e-06, + "loss": 0.6197, + "step": 3744 + }, + { + "epoch": 0.48, + "grad_norm": 0.6219985719635462, + "learning_rate": 8.902026260151355e-06, + "loss": 0.5764, + "step": 3745 + }, + { + "epoch": 0.48, + "grad_norm": 0.7068264692512304, + "learning_rate": 8.90138115301398e-06, + "loss": 0.5794, + "step": 3746 + }, + { + "epoch": 0.48, + "grad_norm": 0.7212069131587737, + "learning_rate": 8.900735879807274e-06, + "loss": 0.5856, + "step": 3747 + }, + { + "epoch": 0.48, + "grad_norm": 0.8011842512755848, + "learning_rate": 8.900090440558706e-06, + "loss": 0.6483, + "step": 3748 + }, + { + "epoch": 0.48, + "grad_norm": 0.7802673096194498, + "learning_rate": 8.899444835295754e-06, + "loss": 0.6121, + "step": 3749 + }, + { + "epoch": 0.48, + "grad_norm": 0.8161764962167143, + "learning_rate": 8.898799064045895e-06, + "loss": 0.6432, + "step": 3750 + }, + { + "epoch": 0.48, + "grad_norm": 0.7955542280150687, + "learning_rate": 8.898153126836618e-06, + "loss": 0.6138, + "step": 3751 + }, + { + "epoch": 0.48, + "grad_norm": 0.6364499303774556, + "learning_rate": 8.897507023695418e-06, + "loss": 0.5281, + "step": 3752 + }, + { + "epoch": 0.48, + "grad_norm": 0.819956479796958, + "learning_rate": 8.896860754649801e-06, + "loss": 0.65, + "step": 3753 + }, + { + "epoch": 0.48, + "grad_norm": 0.6882934163630536, + "learning_rate": 8.896214319727273e-06, + "loss": 0.611, + "step": 3754 + }, + { + "epoch": 0.48, + "grad_norm": 0.71638994144973, + "learning_rate": 8.895567718955353e-06, + "loss": 0.5825, + "step": 3755 + }, + { + "epoch": 0.48, + "grad_norm": 0.722479875569419, + "learning_rate": 8.894920952361562e-06, + "loss": 0.6253, + "step": 3756 + }, + { + "epoch": 0.48, + "grad_norm": 1.1841776943865747, + "learning_rate": 8.894274019973433e-06, + "loss": 0.6373, + "step": 3757 + }, + { + "epoch": 0.48, + "grad_norm": 0.7555028605921302, + "learning_rate": 8.893626921818504e-06, + "loss": 0.5653, + "step": 3758 + }, + { + "epoch": 0.48, + "grad_norm": 0.7476453365424197, + "learning_rate": 8.89297965792432e-06, + "loss": 0.6065, + "step": 3759 + }, + { + "epoch": 0.48, + "grad_norm": 0.5876891237970107, + "learning_rate": 8.89233222831843e-06, + "loss": 0.5155, + "step": 3760 + }, + { + "epoch": 0.48, + "grad_norm": 0.5492884386908883, + "learning_rate": 8.891684633028397e-06, + "loss": 0.4743, + "step": 3761 + }, + { + "epoch": 0.48, + "grad_norm": 0.6953441699789937, + "learning_rate": 8.891036872081784e-06, + "loss": 0.5283, + "step": 3762 + }, + { + "epoch": 0.48, + "grad_norm": 0.5973827674200589, + "learning_rate": 8.890388945506166e-06, + "loss": 0.5016, + "step": 3763 + }, + { + "epoch": 0.48, + "grad_norm": 0.7751216169597506, + "learning_rate": 8.889740853329121e-06, + "loss": 0.5599, + "step": 3764 + }, + { + "epoch": 0.48, + "grad_norm": 0.8330327571955368, + "learning_rate": 8.88909259557824e-06, + "loss": 0.5825, + "step": 3765 + }, + { + "epoch": 0.48, + "grad_norm": 0.7576639230977771, + "learning_rate": 8.888444172281112e-06, + "loss": 0.6222, + "step": 3766 + }, + { + "epoch": 0.48, + "grad_norm": 0.639892378999344, + "learning_rate": 8.887795583465343e-06, + "loss": 0.5417, + "step": 3767 + }, + { + "epoch": 0.48, + "grad_norm": 0.7382001861606764, + "learning_rate": 8.88714682915854e-06, + "loss": 0.6214, + "step": 3768 + }, + { + "epoch": 0.48, + "grad_norm": 0.8397368465145428, + "learning_rate": 8.886497909388318e-06, + "loss": 0.697, + "step": 3769 + }, + { + "epoch": 0.48, + "grad_norm": 0.6448698628371455, + "learning_rate": 8.885848824182299e-06, + "loss": 0.5623, + "step": 3770 + }, + { + "epoch": 0.48, + "grad_norm": 0.5877626713984844, + "learning_rate": 8.885199573568113e-06, + "loss": 0.5625, + "step": 3771 + }, + { + "epoch": 0.48, + "grad_norm": 0.7280144982047488, + "learning_rate": 8.884550157573398e-06, + "loss": 0.5979, + "step": 3772 + }, + { + "epoch": 0.48, + "grad_norm": 4.846620867906846, + "learning_rate": 8.883900576225795e-06, + "loss": 0.6208, + "step": 3773 + }, + { + "epoch": 0.48, + "grad_norm": 0.9117724489057908, + "learning_rate": 8.883250829552954e-06, + "loss": 0.5988, + "step": 3774 + }, + { + "epoch": 0.48, + "grad_norm": 0.6069774680218666, + "learning_rate": 8.882600917582535e-06, + "loss": 0.4871, + "step": 3775 + }, + { + "epoch": 0.48, + "grad_norm": 0.6308346196193754, + "learning_rate": 8.881950840342204e-06, + "loss": 0.5341, + "step": 3776 + }, + { + "epoch": 0.48, + "grad_norm": 0.692961978011267, + "learning_rate": 8.881300597859628e-06, + "loss": 0.5472, + "step": 3777 + }, + { + "epoch": 0.48, + "grad_norm": 0.6725794310638088, + "learning_rate": 8.88065019016249e-06, + "loss": 0.5911, + "step": 3778 + }, + { + "epoch": 0.48, + "grad_norm": 0.7646460223700365, + "learning_rate": 8.879999617278474e-06, + "loss": 0.5878, + "step": 3779 + }, + { + "epoch": 0.48, + "grad_norm": 1.3992359439150255, + "learning_rate": 8.879348879235273e-06, + "loss": 0.6499, + "step": 3780 + }, + { + "epoch": 0.48, + "grad_norm": 1.5100420033806663, + "learning_rate": 8.878697976060587e-06, + "loss": 0.6436, + "step": 3781 + }, + { + "epoch": 0.48, + "grad_norm": 0.6949835926994373, + "learning_rate": 8.878046907782123e-06, + "loss": 0.5458, + "step": 3782 + }, + { + "epoch": 0.48, + "grad_norm": 0.7426340637441344, + "learning_rate": 8.877395674427594e-06, + "loss": 0.5895, + "step": 3783 + }, + { + "epoch": 0.48, + "grad_norm": 0.9767401982713976, + "learning_rate": 8.87674427602472e-06, + "loss": 0.5982, + "step": 3784 + }, + { + "epoch": 0.48, + "grad_norm": 0.739690809890258, + "learning_rate": 8.876092712601232e-06, + "loss": 0.6252, + "step": 3785 + }, + { + "epoch": 0.48, + "grad_norm": 0.535623539253013, + "learning_rate": 8.875440984184865e-06, + "loss": 0.5018, + "step": 3786 + }, + { + "epoch": 0.48, + "grad_norm": 0.6071631986355673, + "learning_rate": 8.874789090803357e-06, + "loss": 0.5932, + "step": 3787 + }, + { + "epoch": 0.48, + "grad_norm": 0.743789965590836, + "learning_rate": 8.87413703248446e-06, + "loss": 0.6023, + "step": 3788 + }, + { + "epoch": 0.48, + "grad_norm": 0.6378897141467588, + "learning_rate": 8.87348480925593e-06, + "loss": 0.5606, + "step": 3789 + }, + { + "epoch": 0.48, + "grad_norm": 0.7544153680391198, + "learning_rate": 8.87283242114553e-06, + "loss": 0.6267, + "step": 3790 + }, + { + "epoch": 0.48, + "grad_norm": 0.694397722754607, + "learning_rate": 8.872179868181028e-06, + "loss": 0.6275, + "step": 3791 + }, + { + "epoch": 0.48, + "grad_norm": 0.8056110231682484, + "learning_rate": 8.871527150390205e-06, + "loss": 0.5115, + "step": 3792 + }, + { + "epoch": 0.48, + "grad_norm": 0.7209354821466226, + "learning_rate": 8.870874267800839e-06, + "loss": 0.5651, + "step": 3793 + }, + { + "epoch": 0.48, + "grad_norm": 0.6068038272840759, + "learning_rate": 8.870221220440729e-06, + "loss": 0.5354, + "step": 3794 + }, + { + "epoch": 0.48, + "grad_norm": 0.6011028532546561, + "learning_rate": 8.869568008337669e-06, + "loss": 0.5603, + "step": 3795 + }, + { + "epoch": 0.48, + "grad_norm": 1.0632137333295957, + "learning_rate": 8.86891463151946e-06, + "loss": 0.6775, + "step": 3796 + }, + { + "epoch": 0.48, + "grad_norm": 0.6163072991381189, + "learning_rate": 8.868261090013923e-06, + "loss": 0.5296, + "step": 3797 + }, + { + "epoch": 0.48, + "grad_norm": 0.6114030165969825, + "learning_rate": 8.86760738384887e-06, + "loss": 0.5308, + "step": 3798 + }, + { + "epoch": 0.48, + "grad_norm": 0.5891162191896753, + "learning_rate": 8.86695351305213e-06, + "loss": 0.5534, + "step": 3799 + }, + { + "epoch": 0.48, + "grad_norm": 0.7547945331350864, + "learning_rate": 8.866299477651538e-06, + "loss": 0.5845, + "step": 3800 + }, + { + "epoch": 0.48, + "grad_norm": 0.5839010840818549, + "learning_rate": 8.86564527767493e-06, + "loss": 0.4936, + "step": 3801 + }, + { + "epoch": 0.48, + "grad_norm": 0.851915511901835, + "learning_rate": 8.864990913150157e-06, + "loss": 0.6214, + "step": 3802 + }, + { + "epoch": 0.48, + "grad_norm": 0.6491400211590191, + "learning_rate": 8.86433638410507e-06, + "loss": 0.5591, + "step": 3803 + }, + { + "epoch": 0.48, + "grad_norm": 0.6449044857682525, + "learning_rate": 8.863681690567533e-06, + "loss": 0.5224, + "step": 3804 + }, + { + "epoch": 0.48, + "grad_norm": 0.6280119347772116, + "learning_rate": 8.863026832565412e-06, + "loss": 0.5718, + "step": 3805 + }, + { + "epoch": 0.48, + "grad_norm": 0.594692193545, + "learning_rate": 8.862371810126584e-06, + "loss": 0.5003, + "step": 3806 + }, + { + "epoch": 0.48, + "grad_norm": 0.6855556235983449, + "learning_rate": 8.86171662327893e-06, + "loss": 0.5872, + "step": 3807 + }, + { + "epoch": 0.49, + "grad_norm": 0.7636411056548783, + "learning_rate": 8.861061272050339e-06, + "loss": 0.6026, + "step": 3808 + }, + { + "epoch": 0.49, + "grad_norm": 0.6804317876899881, + "learning_rate": 8.860405756468709e-06, + "loss": 0.5311, + "step": 3809 + }, + { + "epoch": 0.49, + "grad_norm": 0.756783970454099, + "learning_rate": 8.859750076561942e-06, + "loss": 0.6205, + "step": 3810 + }, + { + "epoch": 0.49, + "grad_norm": 0.7552420388713613, + "learning_rate": 8.859094232357948e-06, + "loss": 0.6087, + "step": 3811 + }, + { + "epoch": 0.49, + "grad_norm": 0.8209982992660904, + "learning_rate": 8.858438223884644e-06, + "loss": 0.6055, + "step": 3812 + }, + { + "epoch": 0.49, + "grad_norm": 0.6056568586874923, + "learning_rate": 8.857782051169956e-06, + "loss": 0.5688, + "step": 3813 + }, + { + "epoch": 0.49, + "grad_norm": 0.8394050062298642, + "learning_rate": 8.857125714241812e-06, + "loss": 0.6111, + "step": 3814 + }, + { + "epoch": 0.49, + "grad_norm": 0.6203192329381239, + "learning_rate": 8.856469213128153e-06, + "loss": 0.5345, + "step": 3815 + }, + { + "epoch": 0.49, + "grad_norm": 0.694377448647183, + "learning_rate": 8.855812547856923e-06, + "loss": 0.5889, + "step": 3816 + }, + { + "epoch": 0.49, + "grad_norm": 0.8583079935692607, + "learning_rate": 8.855155718456075e-06, + "loss": 0.6116, + "step": 3817 + }, + { + "epoch": 0.49, + "grad_norm": 0.7022409878542383, + "learning_rate": 8.854498724953564e-06, + "loss": 0.6024, + "step": 3818 + }, + { + "epoch": 0.49, + "grad_norm": 0.7305438770594196, + "learning_rate": 8.853841567377363e-06, + "loss": 0.5816, + "step": 3819 + }, + { + "epoch": 0.49, + "grad_norm": 0.6651848706841397, + "learning_rate": 8.853184245755439e-06, + "loss": 0.5932, + "step": 3820 + }, + { + "epoch": 0.49, + "grad_norm": 0.8874623586818131, + "learning_rate": 8.852526760115778e-06, + "loss": 0.6175, + "step": 3821 + }, + { + "epoch": 0.49, + "grad_norm": 0.6640870272319327, + "learning_rate": 8.851869110486362e-06, + "loss": 0.534, + "step": 3822 + }, + { + "epoch": 0.49, + "grad_norm": 0.5720754296337881, + "learning_rate": 8.851211296895186e-06, + "loss": 0.4845, + "step": 3823 + }, + { + "epoch": 0.49, + "grad_norm": 0.8111147186094624, + "learning_rate": 8.850553319370252e-06, + "loss": 0.6294, + "step": 3824 + }, + { + "epoch": 0.49, + "grad_norm": 0.7863860785012166, + "learning_rate": 8.84989517793957e-06, + "loss": 0.6354, + "step": 3825 + }, + { + "epoch": 0.49, + "grad_norm": 0.8385381607676841, + "learning_rate": 8.84923687263115e-06, + "loss": 0.6259, + "step": 3826 + }, + { + "epoch": 0.49, + "grad_norm": 0.7224318901767393, + "learning_rate": 8.848578403473018e-06, + "loss": 0.5923, + "step": 3827 + }, + { + "epoch": 0.49, + "grad_norm": 0.8064197154174032, + "learning_rate": 8.847919770493201e-06, + "loss": 0.6581, + "step": 3828 + }, + { + "epoch": 0.49, + "grad_norm": 0.570652831295744, + "learning_rate": 8.847260973719736e-06, + "loss": 0.562, + "step": 3829 + }, + { + "epoch": 0.49, + "grad_norm": 0.7335012936634536, + "learning_rate": 8.846602013180666e-06, + "loss": 0.5797, + "step": 3830 + }, + { + "epoch": 0.49, + "grad_norm": 0.6228570584062498, + "learning_rate": 8.845942888904038e-06, + "loss": 0.5717, + "step": 3831 + }, + { + "epoch": 0.49, + "grad_norm": 0.7002958417312181, + "learning_rate": 8.845283600917914e-06, + "loss": 0.5549, + "step": 3832 + }, + { + "epoch": 0.49, + "grad_norm": 0.636369277306478, + "learning_rate": 8.844624149250354e-06, + "loss": 0.6098, + "step": 3833 + }, + { + "epoch": 0.49, + "grad_norm": 0.7456172052339775, + "learning_rate": 8.84396453392943e-06, + "loss": 0.5924, + "step": 3834 + }, + { + "epoch": 0.49, + "grad_norm": 0.9141506628012072, + "learning_rate": 8.843304754983217e-06, + "loss": 0.6215, + "step": 3835 + }, + { + "epoch": 0.49, + "grad_norm": 0.9676529684471039, + "learning_rate": 8.842644812439804e-06, + "loss": 0.6414, + "step": 3836 + }, + { + "epoch": 0.49, + "grad_norm": 0.7605335745927064, + "learning_rate": 8.84198470632728e-06, + "loss": 0.6494, + "step": 3837 + }, + { + "epoch": 0.49, + "grad_norm": 0.7200340824015131, + "learning_rate": 8.841324436673745e-06, + "loss": 0.6169, + "step": 3838 + }, + { + "epoch": 0.49, + "grad_norm": 0.6287154856209565, + "learning_rate": 8.840664003507304e-06, + "loss": 0.5316, + "step": 3839 + }, + { + "epoch": 0.49, + "grad_norm": 0.8146697148983737, + "learning_rate": 8.840003406856067e-06, + "loss": 0.5974, + "step": 3840 + }, + { + "epoch": 0.49, + "grad_norm": 0.7181890454293451, + "learning_rate": 8.83934264674816e-06, + "loss": 0.5717, + "step": 3841 + }, + { + "epoch": 0.49, + "grad_norm": 0.6613350860317138, + "learning_rate": 8.838681723211701e-06, + "loss": 0.5262, + "step": 3842 + }, + { + "epoch": 0.49, + "grad_norm": 0.791346358642499, + "learning_rate": 8.838020636274832e-06, + "loss": 0.6149, + "step": 3843 + }, + { + "epoch": 0.49, + "grad_norm": 0.7061119137167481, + "learning_rate": 8.837359385965688e-06, + "loss": 0.5473, + "step": 3844 + }, + { + "epoch": 0.49, + "grad_norm": 0.8103306329770703, + "learning_rate": 8.836697972312413e-06, + "loss": 0.6177, + "step": 3845 + }, + { + "epoch": 0.49, + "grad_norm": 0.765865468795884, + "learning_rate": 8.83603639534317e-06, + "loss": 0.6611, + "step": 3846 + }, + { + "epoch": 0.49, + "grad_norm": 1.1176111088317007, + "learning_rate": 8.835374655086116e-06, + "loss": 0.5709, + "step": 3847 + }, + { + "epoch": 0.49, + "grad_norm": 0.5733228264454907, + "learning_rate": 8.834712751569417e-06, + "loss": 0.4513, + "step": 3848 + }, + { + "epoch": 0.49, + "grad_norm": 0.8420986077281761, + "learning_rate": 8.834050684821251e-06, + "loss": 0.5911, + "step": 3849 + }, + { + "epoch": 0.49, + "grad_norm": 0.615357020898463, + "learning_rate": 8.8333884548698e-06, + "loss": 0.5186, + "step": 3850 + }, + { + "epoch": 0.49, + "grad_norm": 0.6185145452469081, + "learning_rate": 8.83272606174325e-06, + "loss": 0.5413, + "step": 3851 + }, + { + "epoch": 0.49, + "grad_norm": 0.7200094600005802, + "learning_rate": 8.832063505469801e-06, + "loss": 0.5959, + "step": 3852 + }, + { + "epoch": 0.49, + "grad_norm": 0.5578276948046608, + "learning_rate": 8.831400786077653e-06, + "loss": 0.5372, + "step": 3853 + }, + { + "epoch": 0.49, + "grad_norm": 0.5840033030101617, + "learning_rate": 8.830737903595019e-06, + "loss": 0.5608, + "step": 3854 + }, + { + "epoch": 0.49, + "grad_norm": 0.8529316521621446, + "learning_rate": 8.830074858050111e-06, + "loss": 0.6426, + "step": 3855 + }, + { + "epoch": 0.49, + "grad_norm": 0.5460277659603923, + "learning_rate": 8.829411649471158e-06, + "loss": 0.5459, + "step": 3856 + }, + { + "epoch": 0.49, + "grad_norm": 0.6497799968015209, + "learning_rate": 8.828748277886386e-06, + "loss": 0.5793, + "step": 3857 + }, + { + "epoch": 0.49, + "grad_norm": 0.6160836682642195, + "learning_rate": 8.828084743324036e-06, + "loss": 0.5312, + "step": 3858 + }, + { + "epoch": 0.49, + "grad_norm": 0.5674052953999604, + "learning_rate": 8.82742104581235e-06, + "loss": 0.506, + "step": 3859 + }, + { + "epoch": 0.49, + "grad_norm": 0.6026918906289411, + "learning_rate": 8.826757185379582e-06, + "loss": 0.5994, + "step": 3860 + }, + { + "epoch": 0.49, + "grad_norm": 0.6268201867888585, + "learning_rate": 8.826093162053989e-06, + "loss": 0.4832, + "step": 3861 + }, + { + "epoch": 0.49, + "grad_norm": 0.7433241407554696, + "learning_rate": 8.825428975863836e-06, + "loss": 0.5958, + "step": 3862 + }, + { + "epoch": 0.49, + "grad_norm": 0.78294957505671, + "learning_rate": 8.824764626837397e-06, + "loss": 0.6129, + "step": 3863 + }, + { + "epoch": 0.49, + "grad_norm": 0.7157255879477162, + "learning_rate": 8.82410011500295e-06, + "loss": 0.5746, + "step": 3864 + }, + { + "epoch": 0.49, + "grad_norm": 0.6111326643348116, + "learning_rate": 8.823435440388779e-06, + "loss": 0.475, + "step": 3865 + }, + { + "epoch": 0.49, + "grad_norm": 0.6872514168090972, + "learning_rate": 8.822770603023181e-06, + "loss": 0.5695, + "step": 3866 + }, + { + "epoch": 0.49, + "grad_norm": 0.8509162722683828, + "learning_rate": 8.822105602934454e-06, + "loss": 0.6404, + "step": 3867 + }, + { + "epoch": 0.49, + "grad_norm": 0.6578410806098519, + "learning_rate": 8.821440440150906e-06, + "loss": 0.5242, + "step": 3868 + }, + { + "epoch": 0.49, + "grad_norm": 0.6987518175746289, + "learning_rate": 8.82077511470085e-06, + "loss": 0.5508, + "step": 3869 + }, + { + "epoch": 0.49, + "grad_norm": 1.0894242529125457, + "learning_rate": 8.820109626612604e-06, + "loss": 0.6254, + "step": 3870 + }, + { + "epoch": 0.49, + "grad_norm": 0.853879069248099, + "learning_rate": 8.8194439759145e-06, + "loss": 0.6686, + "step": 3871 + }, + { + "epoch": 0.49, + "grad_norm": 0.8424280450962335, + "learning_rate": 8.818778162634874e-06, + "loss": 0.6449, + "step": 3872 + }, + { + "epoch": 0.49, + "grad_norm": 0.8413090703550339, + "learning_rate": 8.818112186802062e-06, + "loss": 0.6777, + "step": 3873 + }, + { + "epoch": 0.49, + "grad_norm": 0.6179861901530256, + "learning_rate": 8.817446048444415e-06, + "loss": 0.5722, + "step": 3874 + }, + { + "epoch": 0.49, + "grad_norm": 0.6634789003231849, + "learning_rate": 8.816779747590289e-06, + "loss": 0.5636, + "step": 3875 + }, + { + "epoch": 0.49, + "grad_norm": 0.7289076672701938, + "learning_rate": 8.816113284268046e-06, + "loss": 0.5762, + "step": 3876 + }, + { + "epoch": 0.49, + "grad_norm": 0.6968440077589, + "learning_rate": 8.815446658506056e-06, + "loss": 0.5641, + "step": 3877 + }, + { + "epoch": 0.49, + "grad_norm": 0.5641354164248852, + "learning_rate": 8.814779870332692e-06, + "loss": 0.5449, + "step": 3878 + }, + { + "epoch": 0.49, + "grad_norm": 0.6193433571350114, + "learning_rate": 8.81411291977634e-06, + "loss": 0.5039, + "step": 3879 + }, + { + "epoch": 0.49, + "grad_norm": 0.8229145202184409, + "learning_rate": 8.81344580686539e-06, + "loss": 0.637, + "step": 3880 + }, + { + "epoch": 0.49, + "grad_norm": 0.8346751370246499, + "learning_rate": 8.812778531628238e-06, + "loss": 0.6622, + "step": 3881 + }, + { + "epoch": 0.49, + "grad_norm": 1.1992556023933005, + "learning_rate": 8.812111094093287e-06, + "loss": 0.6383, + "step": 3882 + }, + { + "epoch": 0.49, + "grad_norm": 0.5229181416011706, + "learning_rate": 8.811443494288949e-06, + "loss": 0.5322, + "step": 3883 + }, + { + "epoch": 0.49, + "grad_norm": 0.5361558155327227, + "learning_rate": 8.81077573224364e-06, + "loss": 0.5464, + "step": 3884 + }, + { + "epoch": 0.49, + "grad_norm": 0.680848544029737, + "learning_rate": 8.810107807985787e-06, + "loss": 0.6104, + "step": 3885 + }, + { + "epoch": 0.5, + "grad_norm": 0.7053434025936305, + "learning_rate": 8.80943972154382e-06, + "loss": 0.5591, + "step": 3886 + }, + { + "epoch": 0.5, + "grad_norm": 0.763889628092451, + "learning_rate": 8.808771472946176e-06, + "loss": 0.5878, + "step": 3887 + }, + { + "epoch": 0.5, + "grad_norm": 0.7282180768583473, + "learning_rate": 8.808103062221303e-06, + "loss": 0.5763, + "step": 3888 + }, + { + "epoch": 0.5, + "grad_norm": 0.8156709587893172, + "learning_rate": 8.80743448939765e-06, + "loss": 0.6678, + "step": 3889 + }, + { + "epoch": 0.5, + "grad_norm": 0.6750448945648101, + "learning_rate": 8.806765754503679e-06, + "loss": 0.568, + "step": 3890 + }, + { + "epoch": 0.5, + "grad_norm": 0.6642955876761903, + "learning_rate": 8.806096857567854e-06, + "loss": 0.5051, + "step": 3891 + }, + { + "epoch": 0.5, + "grad_norm": 0.7798986842305553, + "learning_rate": 8.805427798618647e-06, + "loss": 0.6624, + "step": 3892 + }, + { + "epoch": 0.5, + "grad_norm": 0.5693023559753447, + "learning_rate": 8.804758577684541e-06, + "loss": 0.5417, + "step": 3893 + }, + { + "epoch": 0.5, + "grad_norm": 0.7554695739250997, + "learning_rate": 8.80408919479402e-06, + "loss": 0.6065, + "step": 3894 + }, + { + "epoch": 0.5, + "grad_norm": 0.7597579230078431, + "learning_rate": 8.803419649975577e-06, + "loss": 0.5305, + "step": 3895 + }, + { + "epoch": 0.5, + "grad_norm": 0.6083369015618603, + "learning_rate": 8.802749943257715e-06, + "loss": 0.5535, + "step": 3896 + }, + { + "epoch": 0.5, + "grad_norm": 0.6535379816174423, + "learning_rate": 8.80208007466894e-06, + "loss": 0.6076, + "step": 3897 + }, + { + "epoch": 0.5, + "grad_norm": 0.6099357229501594, + "learning_rate": 8.801410044237765e-06, + "loss": 0.5075, + "step": 3898 + }, + { + "epoch": 0.5, + "grad_norm": 0.8653682857511228, + "learning_rate": 8.800739851992712e-06, + "loss": 0.6629, + "step": 3899 + }, + { + "epoch": 0.5, + "grad_norm": 0.7785614415931186, + "learning_rate": 8.800069497962309e-06, + "loss": 0.5511, + "step": 3900 + }, + { + "epoch": 0.5, + "grad_norm": 0.8210474196867106, + "learning_rate": 8.799398982175091e-06, + "loss": 0.6493, + "step": 3901 + }, + { + "epoch": 0.5, + "grad_norm": 0.8364813898871233, + "learning_rate": 8.7987283046596e-06, + "loss": 0.6044, + "step": 3902 + }, + { + "epoch": 0.5, + "grad_norm": 0.6456465765628633, + "learning_rate": 8.798057465444383e-06, + "loss": 0.5518, + "step": 3903 + }, + { + "epoch": 0.5, + "grad_norm": 0.6409540453409056, + "learning_rate": 8.797386464557998e-06, + "loss": 0.5093, + "step": 3904 + }, + { + "epoch": 0.5, + "grad_norm": 0.7971425482188506, + "learning_rate": 8.796715302029004e-06, + "loss": 0.6072, + "step": 3905 + }, + { + "epoch": 0.5, + "grad_norm": 0.8016859291483994, + "learning_rate": 8.796043977885973e-06, + "loss": 0.5869, + "step": 3906 + }, + { + "epoch": 0.5, + "grad_norm": 0.8663166529864562, + "learning_rate": 8.79537249215748e-06, + "loss": 0.6501, + "step": 3907 + }, + { + "epoch": 0.5, + "grad_norm": 0.6453729917214116, + "learning_rate": 8.794700844872108e-06, + "loss": 0.5609, + "step": 3908 + }, + { + "epoch": 0.5, + "grad_norm": 0.6642357020406892, + "learning_rate": 8.794029036058449e-06, + "loss": 0.5729, + "step": 3909 + }, + { + "epoch": 0.5, + "grad_norm": 0.6688130520578079, + "learning_rate": 8.793357065745097e-06, + "loss": 0.6118, + "step": 3910 + }, + { + "epoch": 0.5, + "grad_norm": 0.8056793290954218, + "learning_rate": 8.792684933960656e-06, + "loss": 0.6105, + "step": 3911 + }, + { + "epoch": 0.5, + "grad_norm": 0.793202100360878, + "learning_rate": 8.792012640733735e-06, + "loss": 0.6141, + "step": 3912 + }, + { + "epoch": 0.5, + "grad_norm": 0.6599718294437324, + "learning_rate": 8.791340186092958e-06, + "loss": 0.5557, + "step": 3913 + }, + { + "epoch": 0.5, + "grad_norm": 0.841302491480118, + "learning_rate": 8.790667570066941e-06, + "loss": 0.6534, + "step": 3914 + }, + { + "epoch": 0.5, + "grad_norm": 0.6025201812345489, + "learning_rate": 8.78999479268432e-06, + "loss": 0.4833, + "step": 3915 + }, + { + "epoch": 0.5, + "grad_norm": 0.814955347674252, + "learning_rate": 8.789321853973733e-06, + "loss": 0.677, + "step": 3916 + }, + { + "epoch": 0.5, + "grad_norm": 0.7262445888644592, + "learning_rate": 8.788648753963822e-06, + "loss": 0.6205, + "step": 3917 + }, + { + "epoch": 0.5, + "grad_norm": 0.7006669001515851, + "learning_rate": 8.78797549268324e-06, + "loss": 0.5315, + "step": 3918 + }, + { + "epoch": 0.5, + "grad_norm": 0.8567688768015841, + "learning_rate": 8.787302070160649e-06, + "loss": 0.6288, + "step": 3919 + }, + { + "epoch": 0.5, + "grad_norm": 0.5519427008134669, + "learning_rate": 8.786628486424707e-06, + "loss": 0.5202, + "step": 3920 + }, + { + "epoch": 0.5, + "grad_norm": 0.7393250649816075, + "learning_rate": 8.785954741504093e-06, + "loss": 0.6294, + "step": 3921 + }, + { + "epoch": 0.5, + "grad_norm": 0.6297069959304996, + "learning_rate": 8.785280835427484e-06, + "loss": 0.4981, + "step": 3922 + }, + { + "epoch": 0.5, + "grad_norm": 0.6528378035469024, + "learning_rate": 8.784606768223566e-06, + "loss": 0.5373, + "step": 3923 + }, + { + "epoch": 0.5, + "grad_norm": 0.6227212886003048, + "learning_rate": 8.783932539921032e-06, + "loss": 0.5402, + "step": 3924 + }, + { + "epoch": 0.5, + "grad_norm": 0.9937598494539905, + "learning_rate": 8.78325815054858e-06, + "loss": 0.599, + "step": 3925 + }, + { + "epoch": 0.5, + "grad_norm": 0.6215995681617628, + "learning_rate": 8.782583600134918e-06, + "loss": 0.5335, + "step": 3926 + }, + { + "epoch": 0.5, + "grad_norm": 0.5380554764527031, + "learning_rate": 8.78190888870876e-06, + "loss": 0.4775, + "step": 3927 + }, + { + "epoch": 0.5, + "grad_norm": 0.7026124895736224, + "learning_rate": 8.781234016298828e-06, + "loss": 0.5746, + "step": 3928 + }, + { + "epoch": 0.5, + "grad_norm": 0.8598823617613709, + "learning_rate": 8.780558982933844e-06, + "loss": 0.6042, + "step": 3929 + }, + { + "epoch": 0.5, + "grad_norm": 2.8975364148059026, + "learning_rate": 8.779883788642547e-06, + "loss": 0.6304, + "step": 3930 + }, + { + "epoch": 0.5, + "grad_norm": 0.541520899074446, + "learning_rate": 8.779208433453674e-06, + "loss": 0.5239, + "step": 3931 + }, + { + "epoch": 0.5, + "grad_norm": 0.710982995919678, + "learning_rate": 8.778532917395976e-06, + "loss": 0.6096, + "step": 3932 + }, + { + "epoch": 0.5, + "grad_norm": 0.5236664120263389, + "learning_rate": 8.777857240498207e-06, + "loss": 0.4722, + "step": 3933 + }, + { + "epoch": 0.5, + "grad_norm": 0.7267115813270354, + "learning_rate": 8.777181402789126e-06, + "loss": 0.6277, + "step": 3934 + }, + { + "epoch": 0.5, + "grad_norm": 0.7040182474563481, + "learning_rate": 8.776505404297505e-06, + "loss": 0.6227, + "step": 3935 + }, + { + "epoch": 0.5, + "grad_norm": 0.5660328735296014, + "learning_rate": 8.775829245052115e-06, + "loss": 0.516, + "step": 3936 + }, + { + "epoch": 0.5, + "grad_norm": 0.9056767207848451, + "learning_rate": 8.775152925081738e-06, + "loss": 0.6336, + "step": 3937 + }, + { + "epoch": 0.5, + "grad_norm": 0.565275260062294, + "learning_rate": 8.774476444415168e-06, + "loss": 0.4978, + "step": 3938 + }, + { + "epoch": 0.5, + "grad_norm": 0.8544456932806831, + "learning_rate": 8.773799803081195e-06, + "loss": 0.6114, + "step": 3939 + }, + { + "epoch": 0.5, + "grad_norm": 0.7471981205121471, + "learning_rate": 8.773123001108626e-06, + "loss": 0.5687, + "step": 3940 + }, + { + "epoch": 0.5, + "grad_norm": 0.7788288350058924, + "learning_rate": 8.772446038526267e-06, + "loss": 0.6382, + "step": 3941 + }, + { + "epoch": 0.5, + "grad_norm": 0.7295741329091895, + "learning_rate": 8.771768915362935e-06, + "loss": 0.5073, + "step": 3942 + }, + { + "epoch": 0.5, + "grad_norm": 0.7935413544226886, + "learning_rate": 8.771091631647453e-06, + "loss": 0.5818, + "step": 3943 + }, + { + "epoch": 0.5, + "grad_norm": 0.9755369019121798, + "learning_rate": 8.770414187408652e-06, + "loss": 0.6647, + "step": 3944 + }, + { + "epoch": 0.5, + "grad_norm": 0.8530256658483412, + "learning_rate": 8.769736582675366e-06, + "loss": 0.6504, + "step": 3945 + }, + { + "epoch": 0.5, + "grad_norm": 0.7088817180572102, + "learning_rate": 8.76905881747644e-06, + "loss": 0.6092, + "step": 3946 + }, + { + "epoch": 0.5, + "grad_norm": 0.6121614392287799, + "learning_rate": 8.768380891840725e-06, + "loss": 0.5421, + "step": 3947 + }, + { + "epoch": 0.5, + "grad_norm": 0.8196487573531597, + "learning_rate": 8.767702805797077e-06, + "loss": 0.6215, + "step": 3948 + }, + { + "epoch": 0.5, + "grad_norm": 0.6000699850241337, + "learning_rate": 8.767024559374361e-06, + "loss": 0.5263, + "step": 3949 + }, + { + "epoch": 0.5, + "grad_norm": 0.6605492076345316, + "learning_rate": 8.766346152601448e-06, + "loss": 0.53, + "step": 3950 + }, + { + "epoch": 0.5, + "grad_norm": 0.6352723081872792, + "learning_rate": 8.765667585507213e-06, + "loss": 0.5504, + "step": 3951 + }, + { + "epoch": 0.5, + "grad_norm": 0.7852571521634043, + "learning_rate": 8.764988858120543e-06, + "loss": 0.641, + "step": 3952 + }, + { + "epoch": 0.5, + "grad_norm": 0.7361326200022199, + "learning_rate": 8.764309970470328e-06, + "loss": 0.6594, + "step": 3953 + }, + { + "epoch": 0.5, + "grad_norm": 0.6980304144144914, + "learning_rate": 8.763630922585466e-06, + "loss": 0.6162, + "step": 3954 + }, + { + "epoch": 0.5, + "grad_norm": 0.6626658153353427, + "learning_rate": 8.762951714494864e-06, + "loss": 0.5584, + "step": 3955 + }, + { + "epoch": 0.5, + "grad_norm": 0.6939803321819855, + "learning_rate": 8.762272346227434e-06, + "loss": 0.5594, + "step": 3956 + }, + { + "epoch": 0.5, + "grad_norm": 0.7735382705157501, + "learning_rate": 8.761592817812088e-06, + "loss": 0.6308, + "step": 3957 + }, + { + "epoch": 0.5, + "grad_norm": 0.8090253150963491, + "learning_rate": 8.76091312927776e-06, + "loss": 0.6188, + "step": 3958 + }, + { + "epoch": 0.5, + "grad_norm": 0.6500013847522571, + "learning_rate": 8.760233280653376e-06, + "loss": 0.5905, + "step": 3959 + }, + { + "epoch": 0.5, + "grad_norm": 0.6895783995054142, + "learning_rate": 8.75955327196788e-06, + "loss": 0.5169, + "step": 3960 + }, + { + "epoch": 0.5, + "grad_norm": 0.5468811719940511, + "learning_rate": 8.758873103250212e-06, + "loss": 0.4966, + "step": 3961 + }, + { + "epoch": 0.5, + "grad_norm": 0.7887595464839658, + "learning_rate": 8.758192774529328e-06, + "loss": 0.6637, + "step": 3962 + }, + { + "epoch": 0.5, + "grad_norm": 1.7138163027372428, + "learning_rate": 8.757512285834189e-06, + "loss": 0.6375, + "step": 3963 + }, + { + "epoch": 0.51, + "grad_norm": 0.5599752231453183, + "learning_rate": 8.756831637193757e-06, + "loss": 0.5507, + "step": 3964 + }, + { + "epoch": 0.51, + "grad_norm": 0.6959909932970919, + "learning_rate": 8.756150828637009e-06, + "loss": 0.5288, + "step": 3965 + }, + { + "epoch": 0.51, + "grad_norm": 0.7716403889418847, + "learning_rate": 8.755469860192922e-06, + "loss": 0.5618, + "step": 3966 + }, + { + "epoch": 0.51, + "grad_norm": 0.7329369573913461, + "learning_rate": 8.754788731890485e-06, + "loss": 0.5584, + "step": 3967 + }, + { + "epoch": 0.51, + "grad_norm": 0.7316888058876729, + "learning_rate": 8.75410744375869e-06, + "loss": 0.6067, + "step": 3968 + }, + { + "epoch": 0.51, + "grad_norm": 0.5722472749697322, + "learning_rate": 8.753425995826536e-06, + "loss": 0.5167, + "step": 3969 + }, + { + "epoch": 0.51, + "grad_norm": 0.5915106949795744, + "learning_rate": 8.752744388123033e-06, + "loss": 0.5552, + "step": 3970 + }, + { + "epoch": 0.51, + "grad_norm": 0.8081715364208715, + "learning_rate": 8.752062620677193e-06, + "loss": 0.6373, + "step": 3971 + }, + { + "epoch": 0.51, + "grad_norm": 0.5530192994616905, + "learning_rate": 8.751380693518038e-06, + "loss": 0.4732, + "step": 3972 + }, + { + "epoch": 0.51, + "grad_norm": 0.5950488609759187, + "learning_rate": 8.750698606674594e-06, + "loss": 0.5268, + "step": 3973 + }, + { + "epoch": 0.51, + "grad_norm": 0.5749555327536146, + "learning_rate": 8.750016360175895e-06, + "loss": 0.5284, + "step": 3974 + }, + { + "epoch": 0.51, + "grad_norm": 0.6571820300636001, + "learning_rate": 8.749333954050984e-06, + "loss": 0.6087, + "step": 3975 + }, + { + "epoch": 0.51, + "grad_norm": 0.6477038814961695, + "learning_rate": 8.748651388328906e-06, + "loss": 0.535, + "step": 3976 + }, + { + "epoch": 0.51, + "grad_norm": 0.6708654495194948, + "learning_rate": 8.74796866303872e-06, + "loss": 0.568, + "step": 3977 + }, + { + "epoch": 0.51, + "grad_norm": 0.7169526277350821, + "learning_rate": 8.747285778209483e-06, + "loss": 0.5198, + "step": 3978 + }, + { + "epoch": 0.51, + "grad_norm": 0.7714452218504663, + "learning_rate": 8.746602733870266e-06, + "loss": 0.6246, + "step": 3979 + }, + { + "epoch": 0.51, + "grad_norm": 0.7184550479829447, + "learning_rate": 8.745919530050143e-06, + "loss": 0.5991, + "step": 3980 + }, + { + "epoch": 0.51, + "grad_norm": 0.8511114247254772, + "learning_rate": 8.745236166778194e-06, + "loss": 0.6768, + "step": 3981 + }, + { + "epoch": 0.51, + "grad_norm": 0.5887325762540285, + "learning_rate": 8.744552644083512e-06, + "loss": 0.512, + "step": 3982 + }, + { + "epoch": 0.51, + "grad_norm": 0.7314092283086151, + "learning_rate": 8.743868961995188e-06, + "loss": 0.4886, + "step": 3983 + }, + { + "epoch": 0.51, + "grad_norm": 0.6158300606820595, + "learning_rate": 8.743185120542326e-06, + "loss": 0.5424, + "step": 3984 + }, + { + "epoch": 0.51, + "grad_norm": 0.6551168389002009, + "learning_rate": 8.742501119754037e-06, + "loss": 0.5373, + "step": 3985 + }, + { + "epoch": 0.51, + "grad_norm": 0.573471529393463, + "learning_rate": 8.741816959659432e-06, + "loss": 0.5122, + "step": 3986 + }, + { + "epoch": 0.51, + "grad_norm": 0.6910089267339331, + "learning_rate": 8.741132640287639e-06, + "loss": 0.5785, + "step": 3987 + }, + { + "epoch": 0.51, + "grad_norm": 0.6899036001471719, + "learning_rate": 8.740448161667783e-06, + "loss": 0.5408, + "step": 3988 + }, + { + "epoch": 0.51, + "grad_norm": 0.7321408321882751, + "learning_rate": 8.739763523829001e-06, + "loss": 0.5501, + "step": 3989 + }, + { + "epoch": 0.51, + "grad_norm": 0.7112818807731353, + "learning_rate": 8.739078726800436e-06, + "loss": 0.5512, + "step": 3990 + }, + { + "epoch": 0.51, + "grad_norm": 1.1155330792726104, + "learning_rate": 8.73839377061124e-06, + "loss": 0.6421, + "step": 3991 + }, + { + "epoch": 0.51, + "grad_norm": 0.5606726973898964, + "learning_rate": 8.737708655290564e-06, + "loss": 0.5187, + "step": 3992 + }, + { + "epoch": 0.51, + "grad_norm": 0.7522452464776879, + "learning_rate": 8.737023380867578e-06, + "loss": 0.5994, + "step": 3993 + }, + { + "epoch": 0.51, + "grad_norm": 0.9150160363986606, + "learning_rate": 8.736337947371448e-06, + "loss": 0.6838, + "step": 3994 + }, + { + "epoch": 0.51, + "grad_norm": 0.7507945652532474, + "learning_rate": 8.73565235483135e-06, + "loss": 0.5765, + "step": 3995 + }, + { + "epoch": 0.51, + "grad_norm": 0.5785208228204933, + "learning_rate": 8.73496660327647e-06, + "loss": 0.5373, + "step": 3996 + }, + { + "epoch": 0.51, + "grad_norm": 0.5357985068475164, + "learning_rate": 8.734280692735995e-06, + "loss": 0.5086, + "step": 3997 + }, + { + "epoch": 0.51, + "grad_norm": 0.572289222696746, + "learning_rate": 8.733594623239125e-06, + "loss": 0.5474, + "step": 3998 + }, + { + "epoch": 0.51, + "grad_norm": 0.7119114566450648, + "learning_rate": 8.732908394815063e-06, + "loss": 0.6178, + "step": 3999 + }, + { + "epoch": 0.51, + "grad_norm": 0.7543084118528486, + "learning_rate": 8.732222007493019e-06, + "loss": 0.5496, + "step": 4000 + }, + { + "epoch": 0.51, + "grad_norm": 0.6542244018315772, + "learning_rate": 8.73153546130221e-06, + "loss": 0.5878, + "step": 4001 + }, + { + "epoch": 0.51, + "grad_norm": 0.7133576120049314, + "learning_rate": 8.730848756271862e-06, + "loss": 0.639, + "step": 4002 + }, + { + "epoch": 0.51, + "grad_norm": 0.7365670830170392, + "learning_rate": 8.730161892431204e-06, + "loss": 0.5871, + "step": 4003 + }, + { + "epoch": 0.51, + "grad_norm": 0.8981157433936465, + "learning_rate": 8.729474869809474e-06, + "loss": 0.6407, + "step": 4004 + }, + { + "epoch": 0.51, + "grad_norm": 0.5689923401973566, + "learning_rate": 8.728787688435916e-06, + "loss": 0.5288, + "step": 4005 + }, + { + "epoch": 0.51, + "grad_norm": 0.6219527165380645, + "learning_rate": 8.728100348339783e-06, + "loss": 0.5202, + "step": 4006 + }, + { + "epoch": 0.51, + "grad_norm": 0.630844260409958, + "learning_rate": 8.72741284955033e-06, + "loss": 0.5255, + "step": 4007 + }, + { + "epoch": 0.51, + "grad_norm": 0.7227527979511357, + "learning_rate": 8.726725192096824e-06, + "loss": 0.5761, + "step": 4008 + }, + { + "epoch": 0.51, + "grad_norm": 0.6202240367694997, + "learning_rate": 8.726037376008536e-06, + "loss": 0.4882, + "step": 4009 + }, + { + "epoch": 0.51, + "grad_norm": 0.6559508357445297, + "learning_rate": 8.72534940131474e-06, + "loss": 0.5337, + "step": 4010 + }, + { + "epoch": 0.51, + "grad_norm": 0.6405642393842376, + "learning_rate": 8.724661268044728e-06, + "loss": 0.48, + "step": 4011 + }, + { + "epoch": 0.51, + "grad_norm": 0.6600537617560803, + "learning_rate": 8.723972976227788e-06, + "loss": 0.5125, + "step": 4012 + }, + { + "epoch": 0.51, + "grad_norm": 0.6226580315348801, + "learning_rate": 8.723284525893219e-06, + "loss": 0.5397, + "step": 4013 + }, + { + "epoch": 0.51, + "grad_norm": 0.6788184469513424, + "learning_rate": 8.722595917070324e-06, + "loss": 0.5355, + "step": 4014 + }, + { + "epoch": 0.51, + "grad_norm": 0.5946836383408967, + "learning_rate": 8.721907149788416e-06, + "loss": 0.5817, + "step": 4015 + }, + { + "epoch": 0.51, + "grad_norm": 0.6621622427559274, + "learning_rate": 8.721218224076815e-06, + "loss": 0.5309, + "step": 4016 + }, + { + "epoch": 0.51, + "grad_norm": 0.6623768717483649, + "learning_rate": 8.720529139964846e-06, + "loss": 0.5924, + "step": 4017 + }, + { + "epoch": 0.51, + "grad_norm": 0.8514805777793002, + "learning_rate": 8.71983989748184e-06, + "loss": 0.62, + "step": 4018 + }, + { + "epoch": 0.51, + "grad_norm": 0.744568234209917, + "learning_rate": 8.719150496657138e-06, + "loss": 0.6015, + "step": 4019 + }, + { + "epoch": 0.51, + "grad_norm": 0.8172516781128346, + "learning_rate": 8.718460937520083e-06, + "loss": 0.6092, + "step": 4020 + }, + { + "epoch": 0.51, + "grad_norm": 0.7186431297226278, + "learning_rate": 8.717771220100028e-06, + "loss": 0.6158, + "step": 4021 + }, + { + "epoch": 0.51, + "grad_norm": 0.5596547073665259, + "learning_rate": 8.717081344426332e-06, + "loss": 0.5079, + "step": 4022 + }, + { + "epoch": 0.51, + "grad_norm": 0.7277117309897767, + "learning_rate": 8.716391310528362e-06, + "loss": 0.6234, + "step": 4023 + }, + { + "epoch": 0.51, + "grad_norm": 0.5581829383311955, + "learning_rate": 8.71570111843549e-06, + "loss": 0.532, + "step": 4024 + }, + { + "epoch": 0.51, + "grad_norm": 0.8546352303153767, + "learning_rate": 8.715010768177096e-06, + "loss": 0.6008, + "step": 4025 + }, + { + "epoch": 0.51, + "grad_norm": 0.6431617435741185, + "learning_rate": 8.714320259782564e-06, + "loss": 0.5458, + "step": 4026 + }, + { + "epoch": 0.51, + "grad_norm": 0.871148495129938, + "learning_rate": 8.713629593281287e-06, + "loss": 0.6525, + "step": 4027 + }, + { + "epoch": 0.51, + "grad_norm": 0.5651827233493597, + "learning_rate": 8.712938768702667e-06, + "loss": 0.5311, + "step": 4028 + }, + { + "epoch": 0.51, + "grad_norm": 0.6910219990548543, + "learning_rate": 8.712247786076107e-06, + "loss": 0.5667, + "step": 4029 + }, + { + "epoch": 0.51, + "grad_norm": 0.6234576595087924, + "learning_rate": 8.711556645431023e-06, + "loss": 0.5451, + "step": 4030 + }, + { + "epoch": 0.51, + "grad_norm": 0.7584569646657977, + "learning_rate": 8.710865346796831e-06, + "loss": 0.6466, + "step": 4031 + }, + { + "epoch": 0.51, + "grad_norm": 0.7033104790397724, + "learning_rate": 8.71017389020296e-06, + "loss": 0.6485, + "step": 4032 + }, + { + "epoch": 0.51, + "grad_norm": 0.6413070074062487, + "learning_rate": 8.709482275678844e-06, + "loss": 0.5302, + "step": 4033 + }, + { + "epoch": 0.51, + "grad_norm": 0.9181804849062695, + "learning_rate": 8.70879050325392e-06, + "loss": 0.6107, + "step": 4034 + }, + { + "epoch": 0.51, + "grad_norm": 0.614806654088529, + "learning_rate": 8.708098572957634e-06, + "loss": 0.5341, + "step": 4035 + }, + { + "epoch": 0.51, + "grad_norm": 0.6211892196371582, + "learning_rate": 8.707406484819445e-06, + "loss": 0.5617, + "step": 4036 + }, + { + "epoch": 0.51, + "grad_norm": 0.7178186116677533, + "learning_rate": 8.706714238868808e-06, + "loss": 0.5269, + "step": 4037 + }, + { + "epoch": 0.51, + "grad_norm": 0.667514713153214, + "learning_rate": 8.70602183513519e-06, + "loss": 0.54, + "step": 4038 + }, + { + "epoch": 0.51, + "grad_norm": 0.7723771910555213, + "learning_rate": 8.705329273648065e-06, + "loss": 0.6125, + "step": 4039 + }, + { + "epoch": 0.51, + "grad_norm": 0.7126742556076741, + "learning_rate": 8.704636554436913e-06, + "loss": 0.5893, + "step": 4040 + }, + { + "epoch": 0.51, + "grad_norm": 0.8399445894924042, + "learning_rate": 8.703943677531223e-06, + "loss": 0.5503, + "step": 4041 + }, + { + "epoch": 0.51, + "grad_norm": 0.8695449332268784, + "learning_rate": 8.703250642960486e-06, + "loss": 0.6168, + "step": 4042 + }, + { + "epoch": 0.52, + "grad_norm": 0.7359725643449906, + "learning_rate": 8.702557450754201e-06, + "loss": 0.607, + "step": 4043 + }, + { + "epoch": 0.52, + "grad_norm": 0.7472907414402942, + "learning_rate": 8.701864100941879e-06, + "loss": 0.5726, + "step": 4044 + }, + { + "epoch": 0.52, + "grad_norm": 0.6833697075096975, + "learning_rate": 8.701170593553032e-06, + "loss": 0.5109, + "step": 4045 + }, + { + "epoch": 0.52, + "grad_norm": 0.8930552361383712, + "learning_rate": 8.70047692861718e-06, + "loss": 0.6676, + "step": 4046 + }, + { + "epoch": 0.52, + "grad_norm": 0.6422085922646437, + "learning_rate": 8.69978310616385e-06, + "loss": 0.5863, + "step": 4047 + }, + { + "epoch": 0.52, + "grad_norm": 0.656960316599447, + "learning_rate": 8.699089126222576e-06, + "loss": 0.5375, + "step": 4048 + }, + { + "epoch": 0.52, + "grad_norm": 0.676279498152905, + "learning_rate": 8.698394988822898e-06, + "loss": 0.5397, + "step": 4049 + }, + { + "epoch": 0.52, + "grad_norm": 0.6276087711034353, + "learning_rate": 8.697700693994363e-06, + "loss": 0.518, + "step": 4050 + }, + { + "epoch": 0.52, + "grad_norm": 0.6371970251125455, + "learning_rate": 8.697006241766527e-06, + "loss": 0.5037, + "step": 4051 + }, + { + "epoch": 0.52, + "grad_norm": 0.6893087387319538, + "learning_rate": 8.696311632168949e-06, + "loss": 0.5757, + "step": 4052 + }, + { + "epoch": 0.52, + "grad_norm": 0.6162191817085644, + "learning_rate": 8.695616865231195e-06, + "loss": 0.5383, + "step": 4053 + }, + { + "epoch": 0.52, + "grad_norm": 0.7829475087524488, + "learning_rate": 8.694921940982843e-06, + "loss": 0.5308, + "step": 4054 + }, + { + "epoch": 0.52, + "grad_norm": 0.8307525867404142, + "learning_rate": 8.694226859453469e-06, + "loss": 0.616, + "step": 4055 + }, + { + "epoch": 0.52, + "grad_norm": 0.9620411113643442, + "learning_rate": 8.693531620672664e-06, + "loss": 0.6843, + "step": 4056 + }, + { + "epoch": 0.52, + "grad_norm": 0.6578430396849646, + "learning_rate": 8.69283622467002e-06, + "loss": 0.5298, + "step": 4057 + }, + { + "epoch": 0.52, + "grad_norm": 0.5854872539494609, + "learning_rate": 8.69214067147514e-06, + "loss": 0.5428, + "step": 4058 + }, + { + "epoch": 0.52, + "grad_norm": 0.6192794312437374, + "learning_rate": 8.69144496111763e-06, + "loss": 0.5471, + "step": 4059 + }, + { + "epoch": 0.52, + "grad_norm": 0.61017652142063, + "learning_rate": 8.690749093627102e-06, + "loss": 0.5551, + "step": 4060 + }, + { + "epoch": 0.52, + "grad_norm": 0.7824772911012285, + "learning_rate": 8.690053069033182e-06, + "loss": 0.5582, + "step": 4061 + }, + { + "epoch": 0.52, + "grad_norm": 0.6472949686667087, + "learning_rate": 8.689356887365492e-06, + "loss": 0.5841, + "step": 4062 + }, + { + "epoch": 0.52, + "grad_norm": 0.7541421204474587, + "learning_rate": 8.68866054865367e-06, + "loss": 0.6478, + "step": 4063 + }, + { + "epoch": 0.52, + "grad_norm": 0.5682369069903911, + "learning_rate": 8.687964052927354e-06, + "loss": 0.5286, + "step": 4064 + }, + { + "epoch": 0.52, + "grad_norm": 0.843779305332011, + "learning_rate": 8.687267400216195e-06, + "loss": 0.6123, + "step": 4065 + }, + { + "epoch": 0.52, + "grad_norm": 0.8020967312698006, + "learning_rate": 8.686570590549845e-06, + "loss": 0.6425, + "step": 4066 + }, + { + "epoch": 0.52, + "grad_norm": 0.7308260065605556, + "learning_rate": 8.685873623957966e-06, + "loss": 0.6648, + "step": 4067 + }, + { + "epoch": 0.52, + "grad_norm": 0.5821694481684088, + "learning_rate": 8.685176500470225e-06, + "loss": 0.5071, + "step": 4068 + }, + { + "epoch": 0.52, + "grad_norm": 0.6087344874233439, + "learning_rate": 8.684479220116296e-06, + "loss": 0.5287, + "step": 4069 + }, + { + "epoch": 0.52, + "grad_norm": 0.6595120773884432, + "learning_rate": 8.683781782925861e-06, + "loss": 0.5019, + "step": 4070 + }, + { + "epoch": 0.52, + "grad_norm": 1.3103340262646546, + "learning_rate": 8.683084188928608e-06, + "loss": 0.6074, + "step": 4071 + }, + { + "epoch": 0.52, + "grad_norm": 0.7099659840423111, + "learning_rate": 8.68238643815423e-06, + "loss": 0.6247, + "step": 4072 + }, + { + "epoch": 0.52, + "grad_norm": 0.7628189058571455, + "learning_rate": 8.681688530632429e-06, + "loss": 0.6058, + "step": 4073 + }, + { + "epoch": 0.52, + "grad_norm": 0.6555232246497514, + "learning_rate": 8.680990466392912e-06, + "loss": 0.5221, + "step": 4074 + }, + { + "epoch": 0.52, + "grad_norm": 1.3898887696892253, + "learning_rate": 8.680292245465392e-06, + "loss": 0.5638, + "step": 4075 + }, + { + "epoch": 0.52, + "grad_norm": 0.6140742381851138, + "learning_rate": 8.679593867879592e-06, + "loss": 0.5619, + "step": 4076 + }, + { + "epoch": 0.52, + "grad_norm": 0.7637791651443466, + "learning_rate": 8.67889533366524e-06, + "loss": 0.6555, + "step": 4077 + }, + { + "epoch": 0.52, + "grad_norm": 0.7524925106588084, + "learning_rate": 8.67819664285207e-06, + "loss": 0.6099, + "step": 4078 + }, + { + "epoch": 0.52, + "grad_norm": 0.6666990954598578, + "learning_rate": 8.677497795469823e-06, + "loss": 0.603, + "step": 4079 + }, + { + "epoch": 0.52, + "grad_norm": 0.7646750122609234, + "learning_rate": 8.676798791548246e-06, + "loss": 0.612, + "step": 4080 + }, + { + "epoch": 0.52, + "grad_norm": 0.5656202450360042, + "learning_rate": 8.676099631117094e-06, + "loss": 0.5268, + "step": 4081 + }, + { + "epoch": 0.52, + "grad_norm": 0.6034868943379306, + "learning_rate": 8.675400314206126e-06, + "loss": 0.5224, + "step": 4082 + }, + { + "epoch": 0.52, + "grad_norm": 0.8118847928983691, + "learning_rate": 8.674700840845114e-06, + "loss": 0.6165, + "step": 4083 + }, + { + "epoch": 0.52, + "grad_norm": 0.641151788610495, + "learning_rate": 8.674001211063829e-06, + "loss": 0.5163, + "step": 4084 + }, + { + "epoch": 0.52, + "grad_norm": 0.6908073121201854, + "learning_rate": 8.673301424892053e-06, + "loss": 0.5475, + "step": 4085 + }, + { + "epoch": 0.52, + "grad_norm": 0.6066042040860486, + "learning_rate": 8.672601482359575e-06, + "loss": 0.5278, + "step": 4086 + }, + { + "epoch": 0.52, + "grad_norm": 0.6191794521350743, + "learning_rate": 8.671901383496186e-06, + "loss": 0.5292, + "step": 4087 + }, + { + "epoch": 0.52, + "grad_norm": 0.7959269387054396, + "learning_rate": 8.671201128331687e-06, + "loss": 0.6259, + "step": 4088 + }, + { + "epoch": 0.52, + "grad_norm": 0.8350898812561149, + "learning_rate": 8.67050071689589e-06, + "loss": 0.6722, + "step": 4089 + }, + { + "epoch": 0.52, + "grad_norm": 0.7523124281825234, + "learning_rate": 8.669800149218607e-06, + "loss": 0.584, + "step": 4090 + }, + { + "epoch": 0.52, + "grad_norm": 0.6330707856985351, + "learning_rate": 8.66909942532966e-06, + "loss": 0.5714, + "step": 4091 + }, + { + "epoch": 0.52, + "grad_norm": 0.5500080185137677, + "learning_rate": 8.668398545258872e-06, + "loss": 0.4706, + "step": 4092 + }, + { + "epoch": 0.52, + "grad_norm": 0.7440400781920494, + "learning_rate": 8.667697509036081e-06, + "loss": 0.5917, + "step": 4093 + }, + { + "epoch": 0.52, + "grad_norm": 0.7571174341942223, + "learning_rate": 8.666996316691129e-06, + "loss": 0.594, + "step": 4094 + }, + { + "epoch": 0.52, + "grad_norm": 0.5886212622941261, + "learning_rate": 8.66629496825386e-06, + "loss": 0.5436, + "step": 4095 + }, + { + "epoch": 0.52, + "grad_norm": 0.8535177447626139, + "learning_rate": 8.665593463754131e-06, + "loss": 0.6544, + "step": 4096 + }, + { + "epoch": 0.52, + "grad_norm": 0.6216790588125976, + "learning_rate": 8.664891803221802e-06, + "loss": 0.5401, + "step": 4097 + }, + { + "epoch": 0.52, + "grad_norm": 0.6166756233687279, + "learning_rate": 8.66418998668674e-06, + "loss": 0.5657, + "step": 4098 + }, + { + "epoch": 0.52, + "grad_norm": 0.6877739594834584, + "learning_rate": 8.66348801417882e-06, + "loss": 0.5408, + "step": 4099 + }, + { + "epoch": 0.52, + "grad_norm": 0.6640935474929383, + "learning_rate": 8.66278588572792e-06, + "loss": 0.5717, + "step": 4100 + }, + { + "epoch": 0.52, + "grad_norm": 0.7394274919281051, + "learning_rate": 8.66208360136393e-06, + "loss": 0.5924, + "step": 4101 + }, + { + "epoch": 0.52, + "grad_norm": 0.7655150255633112, + "learning_rate": 8.661381161116745e-06, + "loss": 0.5503, + "step": 4102 + }, + { + "epoch": 0.52, + "grad_norm": 0.6972852218878891, + "learning_rate": 8.660678565016263e-06, + "loss": 0.5819, + "step": 4103 + }, + { + "epoch": 0.52, + "grad_norm": 0.7388092332104087, + "learning_rate": 8.659975813092393e-06, + "loss": 0.5306, + "step": 4104 + }, + { + "epoch": 0.52, + "grad_norm": 0.7989478190168652, + "learning_rate": 8.659272905375047e-06, + "loss": 0.588, + "step": 4105 + }, + { + "epoch": 0.52, + "grad_norm": 0.6038444889572419, + "learning_rate": 8.658569841894148e-06, + "loss": 0.5637, + "step": 4106 + }, + { + "epoch": 0.52, + "grad_norm": 0.7578801116926419, + "learning_rate": 8.657866622679623e-06, + "loss": 0.5777, + "step": 4107 + }, + { + "epoch": 0.52, + "grad_norm": 0.6616704042341236, + "learning_rate": 8.657163247761402e-06, + "loss": 0.5615, + "step": 4108 + }, + { + "epoch": 0.52, + "grad_norm": 0.5650059545608774, + "learning_rate": 8.656459717169429e-06, + "loss": 0.4692, + "step": 4109 + }, + { + "epoch": 0.52, + "grad_norm": 0.6599071762573367, + "learning_rate": 8.65575603093365e-06, + "loss": 0.4731, + "step": 4110 + }, + { + "epoch": 0.52, + "grad_norm": 0.9157734941245574, + "learning_rate": 8.655052189084022e-06, + "loss": 0.6301, + "step": 4111 + }, + { + "epoch": 0.52, + "grad_norm": 0.806251991510044, + "learning_rate": 8.654348191650499e-06, + "loss": 0.5543, + "step": 4112 + }, + { + "epoch": 0.52, + "grad_norm": 1.4774434268320176, + "learning_rate": 8.653644038663051e-06, + "loss": 0.6115, + "step": 4113 + }, + { + "epoch": 0.52, + "grad_norm": 0.5390721471166144, + "learning_rate": 8.652939730151653e-06, + "loss": 0.5035, + "step": 4114 + }, + { + "epoch": 0.52, + "grad_norm": 0.8317143946613599, + "learning_rate": 8.652235266146284e-06, + "loss": 0.6595, + "step": 4115 + }, + { + "epoch": 0.52, + "grad_norm": 0.7913826290662731, + "learning_rate": 8.651530646676928e-06, + "loss": 0.6217, + "step": 4116 + }, + { + "epoch": 0.52, + "grad_norm": 0.654443711056819, + "learning_rate": 8.650825871773586e-06, + "loss": 0.5208, + "step": 4117 + }, + { + "epoch": 0.52, + "grad_norm": 0.7653087047593622, + "learning_rate": 8.650120941466248e-06, + "loss": 0.6432, + "step": 4118 + }, + { + "epoch": 0.52, + "grad_norm": 0.57040990562097, + "learning_rate": 8.649415855784929e-06, + "loss": 0.5459, + "step": 4119 + }, + { + "epoch": 0.52, + "grad_norm": 0.6527354173215035, + "learning_rate": 8.648710614759637e-06, + "loss": 0.5758, + "step": 4120 + }, + { + "epoch": 0.53, + "grad_norm": 0.8138528256104013, + "learning_rate": 8.648005218420393e-06, + "loss": 0.588, + "step": 4121 + }, + { + "epoch": 0.53, + "grad_norm": 0.613926470552269, + "learning_rate": 8.647299666797226e-06, + "loss": 0.5711, + "step": 4122 + }, + { + "epoch": 0.53, + "grad_norm": 0.7165810152569417, + "learning_rate": 8.646593959920166e-06, + "loss": 0.6382, + "step": 4123 + }, + { + "epoch": 0.53, + "grad_norm": 0.8305220534314462, + "learning_rate": 8.645888097819255e-06, + "loss": 0.6302, + "step": 4124 + }, + { + "epoch": 0.53, + "grad_norm": 0.610496215479003, + "learning_rate": 8.64518208052454e-06, + "loss": 0.5477, + "step": 4125 + }, + { + "epoch": 0.53, + "grad_norm": 0.5918479460318838, + "learning_rate": 8.644475908066067e-06, + "loss": 0.5531, + "step": 4126 + }, + { + "epoch": 0.53, + "grad_norm": 0.8880709961866834, + "learning_rate": 8.643769580473905e-06, + "loss": 0.6025, + "step": 4127 + }, + { + "epoch": 0.53, + "grad_norm": 0.7413602925646209, + "learning_rate": 8.643063097778115e-06, + "loss": 0.6323, + "step": 4128 + }, + { + "epoch": 0.53, + "grad_norm": 0.9621443936566458, + "learning_rate": 8.642356460008769e-06, + "loss": 0.6034, + "step": 4129 + }, + { + "epoch": 0.53, + "grad_norm": 0.5911236851883683, + "learning_rate": 8.641649667195951e-06, + "loss": 0.5578, + "step": 4130 + }, + { + "epoch": 0.53, + "grad_norm": 0.5623687100068212, + "learning_rate": 8.640942719369741e-06, + "loss": 0.5149, + "step": 4131 + }, + { + "epoch": 0.53, + "grad_norm": 0.695412717064727, + "learning_rate": 8.640235616560234e-06, + "loss": 0.6133, + "step": 4132 + }, + { + "epoch": 0.53, + "grad_norm": 0.9097797461029631, + "learning_rate": 8.639528358797531e-06, + "loss": 0.5955, + "step": 4133 + }, + { + "epoch": 0.53, + "grad_norm": 0.7032956044712427, + "learning_rate": 8.638820946111735e-06, + "loss": 0.5135, + "step": 4134 + }, + { + "epoch": 0.53, + "grad_norm": 0.6102219813605976, + "learning_rate": 8.638113378532958e-06, + "loss": 0.5143, + "step": 4135 + }, + { + "epoch": 0.53, + "grad_norm": 0.8384911835924354, + "learning_rate": 8.637405656091323e-06, + "loss": 0.5976, + "step": 4136 + }, + { + "epoch": 0.53, + "grad_norm": 0.9153487422463845, + "learning_rate": 8.63669777881695e-06, + "loss": 0.6568, + "step": 4137 + }, + { + "epoch": 0.53, + "grad_norm": 0.5797435423856463, + "learning_rate": 8.635989746739977e-06, + "loss": 0.509, + "step": 4138 + }, + { + "epoch": 0.53, + "grad_norm": 0.6914392041462173, + "learning_rate": 8.635281559890536e-06, + "loss": 0.5441, + "step": 4139 + }, + { + "epoch": 0.53, + "grad_norm": 0.7410179945760179, + "learning_rate": 8.634573218298778e-06, + "loss": 0.5199, + "step": 4140 + }, + { + "epoch": 0.53, + "grad_norm": 0.6525900131655286, + "learning_rate": 8.633864721994852e-06, + "loss": 0.5494, + "step": 4141 + }, + { + "epoch": 0.53, + "grad_norm": 0.5900897350075014, + "learning_rate": 8.633156071008917e-06, + "loss": 0.5399, + "step": 4142 + }, + { + "epoch": 0.53, + "grad_norm": 0.8346246161729853, + "learning_rate": 8.632447265371137e-06, + "loss": 0.595, + "step": 4143 + }, + { + "epoch": 0.53, + "grad_norm": 0.9441855549004021, + "learning_rate": 8.631738305111686e-06, + "loss": 0.6699, + "step": 4144 + }, + { + "epoch": 0.53, + "grad_norm": 0.6496769937815052, + "learning_rate": 8.63102919026074e-06, + "loss": 0.5157, + "step": 4145 + }, + { + "epoch": 0.53, + "grad_norm": 0.6931401396567558, + "learning_rate": 8.630319920848485e-06, + "loss": 0.5379, + "step": 4146 + }, + { + "epoch": 0.53, + "grad_norm": 0.650106143481835, + "learning_rate": 8.62961049690511e-06, + "loss": 0.587, + "step": 4147 + }, + { + "epoch": 0.53, + "grad_norm": 0.5441884620634613, + "learning_rate": 8.628900918460816e-06, + "loss": 0.5076, + "step": 4148 + }, + { + "epoch": 0.53, + "grad_norm": 0.791754450685671, + "learning_rate": 8.628191185545806e-06, + "loss": 0.5994, + "step": 4149 + }, + { + "epoch": 0.53, + "grad_norm": 0.7597955361054618, + "learning_rate": 8.627481298190292e-06, + "loss": 0.6826, + "step": 4150 + }, + { + "epoch": 0.53, + "grad_norm": 0.8251425342384988, + "learning_rate": 8.62677125642449e-06, + "loss": 0.6157, + "step": 4151 + }, + { + "epoch": 0.53, + "grad_norm": 0.650528724218808, + "learning_rate": 8.626061060278627e-06, + "loss": 0.555, + "step": 4152 + }, + { + "epoch": 0.53, + "grad_norm": 0.7189530729765402, + "learning_rate": 8.62535070978293e-06, + "loss": 0.5655, + "step": 4153 + }, + { + "epoch": 0.53, + "grad_norm": 0.7606499760756236, + "learning_rate": 8.62464020496764e-06, + "loss": 0.6268, + "step": 4154 + }, + { + "epoch": 0.53, + "grad_norm": 0.5732538416130583, + "learning_rate": 8.623929545862997e-06, + "loss": 0.4855, + "step": 4155 + }, + { + "epoch": 0.53, + "grad_norm": 0.7301025580970828, + "learning_rate": 8.623218732499254e-06, + "loss": 0.6045, + "step": 4156 + }, + { + "epoch": 0.53, + "grad_norm": 0.7152390100076712, + "learning_rate": 8.622507764906669e-06, + "loss": 0.5478, + "step": 4157 + }, + { + "epoch": 0.53, + "grad_norm": 0.7711394971385065, + "learning_rate": 8.621796643115503e-06, + "loss": 0.6485, + "step": 4158 + }, + { + "epoch": 0.53, + "grad_norm": 0.693863683938519, + "learning_rate": 8.621085367156028e-06, + "loss": 0.4978, + "step": 4159 + }, + { + "epoch": 0.53, + "grad_norm": 0.7262533239298832, + "learning_rate": 8.62037393705852e-06, + "loss": 0.6006, + "step": 4160 + }, + { + "epoch": 0.53, + "grad_norm": 0.7916697470361381, + "learning_rate": 8.619662352853264e-06, + "loss": 0.6171, + "step": 4161 + }, + { + "epoch": 0.53, + "grad_norm": 0.6121833954315268, + "learning_rate": 8.618950614570547e-06, + "loss": 0.5233, + "step": 4162 + }, + { + "epoch": 0.53, + "grad_norm": 0.5597290368600105, + "learning_rate": 8.618238722240668e-06, + "loss": 0.4886, + "step": 4163 + }, + { + "epoch": 0.53, + "grad_norm": 0.7417499369103104, + "learning_rate": 8.617526675893928e-06, + "loss": 0.6362, + "step": 4164 + }, + { + "epoch": 0.53, + "grad_norm": 0.9988250437935274, + "learning_rate": 8.616814475560636e-06, + "loss": 0.6653, + "step": 4165 + }, + { + "epoch": 0.53, + "grad_norm": 0.5158037139850548, + "learning_rate": 8.61610212127111e-06, + "loss": 0.4958, + "step": 4166 + }, + { + "epoch": 0.53, + "grad_norm": 0.7505200182807563, + "learning_rate": 8.615389613055673e-06, + "loss": 0.6192, + "step": 4167 + }, + { + "epoch": 0.53, + "grad_norm": 0.6833847468301292, + "learning_rate": 8.614676950944654e-06, + "loss": 0.6084, + "step": 4168 + }, + { + "epoch": 0.53, + "grad_norm": 0.7950884612137535, + "learning_rate": 8.613964134968387e-06, + "loss": 0.6446, + "step": 4169 + }, + { + "epoch": 0.53, + "grad_norm": 0.760316459249358, + "learning_rate": 8.613251165157217e-06, + "loss": 0.6126, + "step": 4170 + }, + { + "epoch": 0.53, + "grad_norm": 0.7438221263805592, + "learning_rate": 8.612538041541489e-06, + "loss": 0.6419, + "step": 4171 + }, + { + "epoch": 0.53, + "grad_norm": 0.6463356886247231, + "learning_rate": 8.611824764151561e-06, + "loss": 0.5119, + "step": 4172 + }, + { + "epoch": 0.53, + "grad_norm": 0.6826826851340493, + "learning_rate": 8.611111333017796e-06, + "loss": 0.6654, + "step": 4173 + }, + { + "epoch": 0.53, + "grad_norm": 0.7182546328168065, + "learning_rate": 8.61039774817056e-06, + "loss": 0.5875, + "step": 4174 + }, + { + "epoch": 0.53, + "grad_norm": 0.6464219189713521, + "learning_rate": 8.609684009640229e-06, + "loss": 0.5131, + "step": 4175 + }, + { + "epoch": 0.53, + "grad_norm": 0.8153351876063932, + "learning_rate": 8.608970117457184e-06, + "loss": 0.6086, + "step": 4176 + }, + { + "epoch": 0.53, + "grad_norm": 0.7910516754668436, + "learning_rate": 8.608256071651815e-06, + "loss": 0.6165, + "step": 4177 + }, + { + "epoch": 0.53, + "grad_norm": 0.656802740069538, + "learning_rate": 8.607541872254514e-06, + "loss": 0.5108, + "step": 4178 + }, + { + "epoch": 0.53, + "grad_norm": 0.6371077279264837, + "learning_rate": 8.606827519295684e-06, + "loss": 0.5285, + "step": 4179 + }, + { + "epoch": 0.53, + "grad_norm": 0.5539188929778869, + "learning_rate": 8.606113012805733e-06, + "loss": 0.5101, + "step": 4180 + }, + { + "epoch": 0.53, + "grad_norm": 0.6357198748650659, + "learning_rate": 8.605398352815074e-06, + "loss": 0.5168, + "step": 4181 + }, + { + "epoch": 0.53, + "grad_norm": 0.7508241451132187, + "learning_rate": 8.604683539354129e-06, + "loss": 0.5726, + "step": 4182 + }, + { + "epoch": 0.53, + "grad_norm": 0.6215320432201875, + "learning_rate": 8.603968572453324e-06, + "loss": 0.5758, + "step": 4183 + }, + { + "epoch": 0.53, + "grad_norm": 0.8068590590914627, + "learning_rate": 8.603253452143093e-06, + "loss": 0.6111, + "step": 4184 + }, + { + "epoch": 0.53, + "grad_norm": 0.7141856617069584, + "learning_rate": 8.602538178453877e-06, + "loss": 0.5656, + "step": 4185 + }, + { + "epoch": 0.53, + "grad_norm": 0.5908150107095695, + "learning_rate": 8.601822751416124e-06, + "loss": 0.5656, + "step": 4186 + }, + { + "epoch": 0.53, + "grad_norm": 0.7204489416727751, + "learning_rate": 8.601107171060286e-06, + "loss": 0.5973, + "step": 4187 + }, + { + "epoch": 0.53, + "grad_norm": 0.5675320397907174, + "learning_rate": 8.600391437416822e-06, + "loss": 0.5187, + "step": 4188 + }, + { + "epoch": 0.53, + "grad_norm": 0.6409800876569068, + "learning_rate": 8.599675550516201e-06, + "loss": 0.5155, + "step": 4189 + }, + { + "epoch": 0.53, + "grad_norm": 0.5860737309056203, + "learning_rate": 8.598959510388894e-06, + "loss": 0.5359, + "step": 4190 + }, + { + "epoch": 0.53, + "grad_norm": 0.9123702174969488, + "learning_rate": 8.598243317065382e-06, + "loss": 0.6837, + "step": 4191 + }, + { + "epoch": 0.53, + "grad_norm": 0.8262264583018809, + "learning_rate": 8.597526970576148e-06, + "loss": 0.6649, + "step": 4192 + }, + { + "epoch": 0.53, + "grad_norm": 0.6748319481331477, + "learning_rate": 8.59681047095169e-06, + "loss": 0.595, + "step": 4193 + }, + { + "epoch": 0.53, + "grad_norm": 0.676612574805558, + "learning_rate": 8.596093818222504e-06, + "loss": 0.5552, + "step": 4194 + }, + { + "epoch": 0.53, + "grad_norm": 0.823835763345746, + "learning_rate": 8.595377012419093e-06, + "loss": 0.6093, + "step": 4195 + }, + { + "epoch": 0.53, + "grad_norm": 0.8484589908110719, + "learning_rate": 8.594660053571972e-06, + "loss": 0.6066, + "step": 4196 + }, + { + "epoch": 0.53, + "grad_norm": 0.7217782534743693, + "learning_rate": 8.593942941711662e-06, + "loss": 0.6151, + "step": 4197 + }, + { + "epoch": 0.53, + "grad_norm": 0.5632304908788441, + "learning_rate": 8.593225676868685e-06, + "loss": 0.5254, + "step": 4198 + }, + { + "epoch": 0.53, + "grad_norm": 0.6662700969602309, + "learning_rate": 8.592508259073571e-06, + "loss": 0.5316, + "step": 4199 + }, + { + "epoch": 0.54, + "grad_norm": 0.737610740156497, + "learning_rate": 8.591790688356863e-06, + "loss": 0.5141, + "step": 4200 + }, + { + "epoch": 0.54, + "grad_norm": 0.7547099648272309, + "learning_rate": 8.5910729647491e-06, + "loss": 0.5791, + "step": 4201 + }, + { + "epoch": 0.54, + "grad_norm": 0.6084062881886371, + "learning_rate": 8.59035508828084e-06, + "loss": 0.5719, + "step": 4202 + }, + { + "epoch": 0.54, + "grad_norm": 0.5751780275211879, + "learning_rate": 8.589637058982634e-06, + "loss": 0.5317, + "step": 4203 + }, + { + "epoch": 0.54, + "grad_norm": 0.5713846862244606, + "learning_rate": 8.58891887688505e-06, + "loss": 0.5295, + "step": 4204 + }, + { + "epoch": 0.54, + "grad_norm": 0.8001175862986285, + "learning_rate": 8.588200542018658e-06, + "loss": 0.5774, + "step": 4205 + }, + { + "epoch": 0.54, + "grad_norm": 0.6770304915731522, + "learning_rate": 8.587482054414036e-06, + "loss": 0.5484, + "step": 4206 + }, + { + "epoch": 0.54, + "grad_norm": 0.7087752401044223, + "learning_rate": 8.586763414101766e-06, + "loss": 0.5347, + "step": 4207 + }, + { + "epoch": 0.54, + "grad_norm": 0.7071601500404803, + "learning_rate": 8.586044621112442e-06, + "loss": 0.5589, + "step": 4208 + }, + { + "epoch": 0.54, + "grad_norm": 0.8252986939014532, + "learning_rate": 8.585325675476655e-06, + "loss": 0.5873, + "step": 4209 + }, + { + "epoch": 0.54, + "grad_norm": 0.6291140667432583, + "learning_rate": 8.58460657722501e-06, + "loss": 0.5605, + "step": 4210 + }, + { + "epoch": 0.54, + "grad_norm": 0.5163097025465593, + "learning_rate": 8.58388732638812e-06, + "loss": 0.5118, + "step": 4211 + }, + { + "epoch": 0.54, + "grad_norm": 0.8195100101489301, + "learning_rate": 8.583167922996598e-06, + "loss": 0.5969, + "step": 4212 + }, + { + "epoch": 0.54, + "grad_norm": 0.6322556113774183, + "learning_rate": 8.58244836708107e-06, + "loss": 0.617, + "step": 4213 + }, + { + "epoch": 0.54, + "grad_norm": 0.587528861532704, + "learning_rate": 8.581728658672159e-06, + "loss": 0.4921, + "step": 4214 + }, + { + "epoch": 0.54, + "grad_norm": 0.5784900038882096, + "learning_rate": 8.581008797800506e-06, + "loss": 0.524, + "step": 4215 + }, + { + "epoch": 0.54, + "grad_norm": 0.6700150476268066, + "learning_rate": 8.580288784496752e-06, + "loss": 0.5758, + "step": 4216 + }, + { + "epoch": 0.54, + "grad_norm": 0.5539846050468967, + "learning_rate": 8.579568618791547e-06, + "loss": 0.4944, + "step": 4217 + }, + { + "epoch": 0.54, + "grad_norm": 0.7533418118582415, + "learning_rate": 8.578848300715542e-06, + "loss": 0.5857, + "step": 4218 + }, + { + "epoch": 0.54, + "grad_norm": 0.5890395111804739, + "learning_rate": 8.578127830299404e-06, + "loss": 0.5226, + "step": 4219 + }, + { + "epoch": 0.54, + "grad_norm": 0.6189759451509745, + "learning_rate": 8.577407207573795e-06, + "loss": 0.5422, + "step": 4220 + }, + { + "epoch": 0.54, + "grad_norm": 0.7313971748589089, + "learning_rate": 8.576686432569394e-06, + "loss": 0.5291, + "step": 4221 + }, + { + "epoch": 0.54, + "grad_norm": 0.6457195678984613, + "learning_rate": 8.575965505316883e-06, + "loss": 0.6376, + "step": 4222 + }, + { + "epoch": 0.54, + "grad_norm": 0.6017810013002097, + "learning_rate": 8.575244425846945e-06, + "loss": 0.522, + "step": 4223 + }, + { + "epoch": 0.54, + "grad_norm": 0.665085071496339, + "learning_rate": 8.574523194190279e-06, + "loss": 0.5689, + "step": 4224 + }, + { + "epoch": 0.54, + "grad_norm": 0.5874504662682642, + "learning_rate": 8.573801810377581e-06, + "loss": 0.5589, + "step": 4225 + }, + { + "epoch": 0.54, + "grad_norm": 0.7687996514174105, + "learning_rate": 8.573080274439562e-06, + "loss": 0.5539, + "step": 4226 + }, + { + "epoch": 0.54, + "grad_norm": 1.11924188433432, + "learning_rate": 8.572358586406933e-06, + "loss": 0.6066, + "step": 4227 + }, + { + "epoch": 0.54, + "grad_norm": 0.6729692267052756, + "learning_rate": 8.571636746310414e-06, + "loss": 0.4876, + "step": 4228 + }, + { + "epoch": 0.54, + "grad_norm": 0.666474001354608, + "learning_rate": 8.570914754180733e-06, + "loss": 0.5208, + "step": 4229 + }, + { + "epoch": 0.54, + "grad_norm": 0.7473142683705336, + "learning_rate": 8.57019261004862e-06, + "loss": 0.5898, + "step": 4230 + }, + { + "epoch": 0.54, + "grad_norm": 0.6328917397470593, + "learning_rate": 8.56947031394482e-06, + "loss": 0.5285, + "step": 4231 + }, + { + "epoch": 0.54, + "grad_norm": 0.7986635809428788, + "learning_rate": 8.568747865900071e-06, + "loss": 0.5585, + "step": 4232 + }, + { + "epoch": 0.54, + "grad_norm": 0.7868504692724377, + "learning_rate": 8.56802526594513e-06, + "loss": 0.5947, + "step": 4233 + }, + { + "epoch": 0.54, + "grad_norm": 0.6468800138600855, + "learning_rate": 8.567302514110758e-06, + "loss": 0.5143, + "step": 4234 + }, + { + "epoch": 0.54, + "grad_norm": 0.7471634788203108, + "learning_rate": 8.566579610427715e-06, + "loss": 0.5681, + "step": 4235 + }, + { + "epoch": 0.54, + "grad_norm": 0.6310562666237889, + "learning_rate": 8.565856554926776e-06, + "loss": 0.5338, + "step": 4236 + }, + { + "epoch": 0.54, + "grad_norm": 0.6684379571259103, + "learning_rate": 8.565133347638719e-06, + "loss": 0.5026, + "step": 4237 + }, + { + "epoch": 0.54, + "grad_norm": 0.7049264857095631, + "learning_rate": 8.564409988594327e-06, + "loss": 0.5413, + "step": 4238 + }, + { + "epoch": 0.54, + "grad_norm": 0.6182126875094807, + "learning_rate": 8.563686477824392e-06, + "loss": 0.5407, + "step": 4239 + }, + { + "epoch": 0.54, + "grad_norm": 0.7842127618823762, + "learning_rate": 8.562962815359712e-06, + "loss": 0.6675, + "step": 4240 + }, + { + "epoch": 0.54, + "grad_norm": 0.7562142014068194, + "learning_rate": 8.56223900123109e-06, + "loss": 0.6016, + "step": 4241 + }, + { + "epoch": 0.54, + "grad_norm": 0.880586079244348, + "learning_rate": 8.561515035469337e-06, + "loss": 0.6347, + "step": 4242 + }, + { + "epoch": 0.54, + "grad_norm": 0.7206524348861861, + "learning_rate": 8.56079091810527e-06, + "loss": 0.5485, + "step": 4243 + }, + { + "epoch": 0.54, + "grad_norm": 0.6043269227553943, + "learning_rate": 8.560066649169713e-06, + "loss": 0.546, + "step": 4244 + }, + { + "epoch": 0.54, + "grad_norm": 0.7613203813613829, + "learning_rate": 8.559342228693494e-06, + "loss": 0.6181, + "step": 4245 + }, + { + "epoch": 0.54, + "grad_norm": 0.7318250387236054, + "learning_rate": 8.558617656707452e-06, + "loss": 0.5911, + "step": 4246 + }, + { + "epoch": 0.54, + "grad_norm": 0.7224439786834228, + "learning_rate": 8.557892933242427e-06, + "loss": 0.6229, + "step": 4247 + }, + { + "epoch": 0.54, + "grad_norm": 0.7243203917415874, + "learning_rate": 8.55716805832927e-06, + "loss": 0.5974, + "step": 4248 + }, + { + "epoch": 0.54, + "grad_norm": 0.626188156046898, + "learning_rate": 8.556443031998834e-06, + "loss": 0.528, + "step": 4249 + }, + { + "epoch": 0.54, + "grad_norm": 0.7931637331244187, + "learning_rate": 8.555717854281984e-06, + "loss": 0.6025, + "step": 4250 + }, + { + "epoch": 0.54, + "grad_norm": 0.6933561182954645, + "learning_rate": 8.554992525209589e-06, + "loss": 0.5412, + "step": 4251 + }, + { + "epoch": 0.54, + "grad_norm": 0.7755683091433357, + "learning_rate": 8.55426704481252e-06, + "loss": 0.6288, + "step": 4252 + }, + { + "epoch": 0.54, + "grad_norm": 0.585608131688245, + "learning_rate": 8.55354141312166e-06, + "loss": 0.5274, + "step": 4253 + }, + { + "epoch": 0.54, + "grad_norm": 0.5471904085494139, + "learning_rate": 8.552815630167901e-06, + "loss": 0.5133, + "step": 4254 + }, + { + "epoch": 0.54, + "grad_norm": 0.6381377826857912, + "learning_rate": 8.552089695982132e-06, + "loss": 0.5194, + "step": 4255 + }, + { + "epoch": 0.54, + "grad_norm": 0.5628675477002059, + "learning_rate": 8.551363610595253e-06, + "loss": 0.4989, + "step": 4256 + }, + { + "epoch": 0.54, + "grad_norm": 0.6824626597554021, + "learning_rate": 8.550637374038178e-06, + "loss": 0.572, + "step": 4257 + }, + { + "epoch": 0.54, + "grad_norm": 0.757329896163903, + "learning_rate": 8.549910986341813e-06, + "loss": 0.5717, + "step": 4258 + }, + { + "epoch": 0.54, + "grad_norm": 0.6578028395022709, + "learning_rate": 8.549184447537082e-06, + "loss": 0.576, + "step": 4259 + }, + { + "epoch": 0.54, + "grad_norm": 0.5724289711452756, + "learning_rate": 8.548457757654912e-06, + "loss": 0.515, + "step": 4260 + }, + { + "epoch": 0.54, + "grad_norm": 0.5910746454046204, + "learning_rate": 8.547730916726234e-06, + "loss": 0.5374, + "step": 4261 + }, + { + "epoch": 0.54, + "grad_norm": 0.944792849147322, + "learning_rate": 8.547003924781988e-06, + "loss": 0.6537, + "step": 4262 + }, + { + "epoch": 0.54, + "grad_norm": 0.5978913596841029, + "learning_rate": 8.546276781853117e-06, + "loss": 0.5498, + "step": 4263 + }, + { + "epoch": 0.54, + "grad_norm": 0.5862549759040205, + "learning_rate": 8.545549487970577e-06, + "loss": 0.5268, + "step": 4264 + }, + { + "epoch": 0.54, + "grad_norm": 0.7393124458187841, + "learning_rate": 8.544822043165326e-06, + "loss": 0.5998, + "step": 4265 + }, + { + "epoch": 0.54, + "grad_norm": 0.660501968995592, + "learning_rate": 8.544094447468328e-06, + "loss": 0.594, + "step": 4266 + }, + { + "epoch": 0.54, + "grad_norm": 0.694251272036348, + "learning_rate": 8.543366700910555e-06, + "loss": 0.5976, + "step": 4267 + }, + { + "epoch": 0.54, + "grad_norm": 0.7277816593245259, + "learning_rate": 8.542638803522985e-06, + "loss": 0.5914, + "step": 4268 + }, + { + "epoch": 0.54, + "grad_norm": 0.7565444795806341, + "learning_rate": 8.5419107553366e-06, + "loss": 0.6361, + "step": 4269 + }, + { + "epoch": 0.54, + "grad_norm": 0.8124572372090746, + "learning_rate": 8.541182556382393e-06, + "loss": 0.6094, + "step": 4270 + }, + { + "epoch": 0.54, + "grad_norm": 0.5984985772708591, + "learning_rate": 8.540454206691362e-06, + "loss": 0.5356, + "step": 4271 + }, + { + "epoch": 0.54, + "grad_norm": 0.6298035543755568, + "learning_rate": 8.539725706294509e-06, + "loss": 0.5158, + "step": 4272 + }, + { + "epoch": 0.54, + "grad_norm": 0.8228448548212153, + "learning_rate": 8.538997055222842e-06, + "loss": 0.5704, + "step": 4273 + }, + { + "epoch": 0.54, + "grad_norm": 0.8257302554424572, + "learning_rate": 8.53826825350738e-06, + "loss": 0.6344, + "step": 4274 + }, + { + "epoch": 0.54, + "grad_norm": 0.7226723426272715, + "learning_rate": 8.537539301179145e-06, + "loss": 0.6658, + "step": 4275 + }, + { + "epoch": 0.54, + "grad_norm": 0.8743980343808205, + "learning_rate": 8.536810198269168e-06, + "loss": 0.5995, + "step": 4276 + }, + { + "epoch": 0.54, + "grad_norm": 0.7517533146836193, + "learning_rate": 8.536080944808483e-06, + "loss": 0.6374, + "step": 4277 + }, + { + "epoch": 0.55, + "grad_norm": 0.6061560152620385, + "learning_rate": 8.53535154082813e-06, + "loss": 0.5455, + "step": 4278 + }, + { + "epoch": 0.55, + "grad_norm": 0.6457503016339637, + "learning_rate": 8.53462198635916e-06, + "loss": 0.5165, + "step": 4279 + }, + { + "epoch": 0.55, + "grad_norm": 0.6406034037273896, + "learning_rate": 8.533892281432627e-06, + "loss": 0.5416, + "step": 4280 + }, + { + "epoch": 0.55, + "grad_norm": 0.6012821386061982, + "learning_rate": 8.533162426079591e-06, + "loss": 0.5397, + "step": 4281 + }, + { + "epoch": 0.55, + "grad_norm": 0.6000210809009845, + "learning_rate": 8.532432420331122e-06, + "loss": 0.5231, + "step": 4282 + }, + { + "epoch": 0.55, + "grad_norm": 0.8292004198863734, + "learning_rate": 8.531702264218295e-06, + "loss": 0.6071, + "step": 4283 + }, + { + "epoch": 0.55, + "grad_norm": 0.8269181989750112, + "learning_rate": 8.530971957772186e-06, + "loss": 0.6241, + "step": 4284 + }, + { + "epoch": 0.55, + "grad_norm": 0.7960326046520969, + "learning_rate": 8.530241501023885e-06, + "loss": 0.673, + "step": 4285 + }, + { + "epoch": 0.55, + "grad_norm": 0.6209979527108801, + "learning_rate": 8.529510894004484e-06, + "loss": 0.5193, + "step": 4286 + }, + { + "epoch": 0.55, + "grad_norm": 0.8199632399794136, + "learning_rate": 8.528780136745083e-06, + "loss": 0.5951, + "step": 4287 + }, + { + "epoch": 0.55, + "grad_norm": 0.7524330032267278, + "learning_rate": 8.528049229276787e-06, + "loss": 0.6162, + "step": 4288 + }, + { + "epoch": 0.55, + "grad_norm": 0.7915587553085761, + "learning_rate": 8.52731817163071e-06, + "loss": 0.6037, + "step": 4289 + }, + { + "epoch": 0.55, + "grad_norm": 0.8825922227382288, + "learning_rate": 8.52658696383797e-06, + "loss": 0.6703, + "step": 4290 + }, + { + "epoch": 0.55, + "grad_norm": 0.8712034370811429, + "learning_rate": 8.525855605929694e-06, + "loss": 0.5956, + "step": 4291 + }, + { + "epoch": 0.55, + "grad_norm": 0.7057093369300327, + "learning_rate": 8.52512409793701e-06, + "loss": 0.4895, + "step": 4292 + }, + { + "epoch": 0.55, + "grad_norm": 0.7541501445141401, + "learning_rate": 8.524392439891058e-06, + "loss": 0.6444, + "step": 4293 + }, + { + "epoch": 0.55, + "grad_norm": 0.7671029700881294, + "learning_rate": 8.523660631822983e-06, + "loss": 0.5962, + "step": 4294 + }, + { + "epoch": 0.55, + "grad_norm": 1.0784969699630877, + "learning_rate": 8.522928673763934e-06, + "loss": 0.6503, + "step": 4295 + }, + { + "epoch": 0.55, + "grad_norm": 0.5629563927965483, + "learning_rate": 8.52219656574507e-06, + "loss": 0.4987, + "step": 4296 + }, + { + "epoch": 0.55, + "grad_norm": 0.9040638419303705, + "learning_rate": 8.521464307797553e-06, + "loss": 0.6195, + "step": 4297 + }, + { + "epoch": 0.55, + "grad_norm": 0.5859049259778425, + "learning_rate": 8.520731899952553e-06, + "loss": 0.4963, + "step": 4298 + }, + { + "epoch": 0.55, + "grad_norm": 0.8656947498642173, + "learning_rate": 8.519999342241247e-06, + "loss": 0.5696, + "step": 4299 + }, + { + "epoch": 0.55, + "grad_norm": 0.7548629521197194, + "learning_rate": 8.519266634694817e-06, + "loss": 0.6493, + "step": 4300 + }, + { + "epoch": 0.55, + "grad_norm": 0.5617877931928973, + "learning_rate": 8.518533777344453e-06, + "loss": 0.4903, + "step": 4301 + }, + { + "epoch": 0.55, + "grad_norm": 0.8194925530236241, + "learning_rate": 8.51780077022135e-06, + "loss": 0.6523, + "step": 4302 + }, + { + "epoch": 0.55, + "grad_norm": 0.7949077097631095, + "learning_rate": 8.517067613356708e-06, + "loss": 0.6174, + "step": 4303 + }, + { + "epoch": 0.55, + "grad_norm": 0.6836000824920909, + "learning_rate": 8.516334306781737e-06, + "loss": 0.4949, + "step": 4304 + }, + { + "epoch": 0.55, + "grad_norm": 0.6349075682784242, + "learning_rate": 8.515600850527653e-06, + "loss": 0.4851, + "step": 4305 + }, + { + "epoch": 0.55, + "grad_norm": 0.6277272316069593, + "learning_rate": 8.514867244625673e-06, + "loss": 0.5361, + "step": 4306 + }, + { + "epoch": 0.55, + "grad_norm": 0.7849166812396969, + "learning_rate": 8.514133489107028e-06, + "loss": 0.6509, + "step": 4307 + }, + { + "epoch": 0.55, + "grad_norm": 0.7560839783702589, + "learning_rate": 8.513399584002948e-06, + "loss": 0.6344, + "step": 4308 + }, + { + "epoch": 0.55, + "grad_norm": 0.5989992762340549, + "learning_rate": 8.512665529344675e-06, + "loss": 0.5297, + "step": 4309 + }, + { + "epoch": 0.55, + "grad_norm": 0.7960445312315848, + "learning_rate": 8.511931325163457e-06, + "loss": 0.6894, + "step": 4310 + }, + { + "epoch": 0.55, + "grad_norm": 0.6863474292387314, + "learning_rate": 8.511196971490546e-06, + "loss": 0.5417, + "step": 4311 + }, + { + "epoch": 0.55, + "grad_norm": 0.7034927102973783, + "learning_rate": 8.510462468357197e-06, + "loss": 0.5478, + "step": 4312 + }, + { + "epoch": 0.55, + "grad_norm": 0.7326193985658955, + "learning_rate": 8.50972781579468e-06, + "loss": 0.643, + "step": 4313 + }, + { + "epoch": 0.55, + "grad_norm": 0.630221023403941, + "learning_rate": 8.508993013834266e-06, + "loss": 0.5863, + "step": 4314 + }, + { + "epoch": 0.55, + "grad_norm": 0.854629603371839, + "learning_rate": 8.508258062507231e-06, + "loss": 0.7465, + "step": 4315 + }, + { + "epoch": 0.55, + "grad_norm": 0.744227787448643, + "learning_rate": 8.507522961844861e-06, + "loss": 0.6361, + "step": 4316 + }, + { + "epoch": 0.55, + "grad_norm": 0.8040693037841271, + "learning_rate": 8.50678771187845e-06, + "loss": 0.6694, + "step": 4317 + }, + { + "epoch": 0.55, + "grad_norm": 0.8317219420549856, + "learning_rate": 8.506052312639288e-06, + "loss": 0.6361, + "step": 4318 + }, + { + "epoch": 0.55, + "grad_norm": 0.6828791795082905, + "learning_rate": 8.505316764158685e-06, + "loss": 0.5857, + "step": 4319 + }, + { + "epoch": 0.55, + "grad_norm": 0.6833364312519926, + "learning_rate": 8.504581066467949e-06, + "loss": 0.6378, + "step": 4320 + }, + { + "epoch": 0.55, + "grad_norm": 0.604475259089991, + "learning_rate": 8.503845219598395e-06, + "loss": 0.5891, + "step": 4321 + }, + { + "epoch": 0.55, + "grad_norm": 0.8001010618198438, + "learning_rate": 8.503109223581348e-06, + "loss": 0.6729, + "step": 4322 + }, + { + "epoch": 0.55, + "grad_norm": 0.7029522662113565, + "learning_rate": 8.502373078448136e-06, + "loss": 0.5837, + "step": 4323 + }, + { + "epoch": 0.55, + "grad_norm": 0.6558529097763018, + "learning_rate": 8.501636784230091e-06, + "loss": 0.6184, + "step": 4324 + }, + { + "epoch": 0.55, + "grad_norm": 0.7121771072838233, + "learning_rate": 8.50090034095856e-06, + "loss": 0.5648, + "step": 4325 + }, + { + "epoch": 0.55, + "grad_norm": 0.7918856170257168, + "learning_rate": 8.50016374866489e-06, + "loss": 0.6181, + "step": 4326 + }, + { + "epoch": 0.55, + "grad_norm": 0.744418829081397, + "learning_rate": 8.499427007380433e-06, + "loss": 0.6777, + "step": 4327 + }, + { + "epoch": 0.55, + "grad_norm": 0.8242404096462485, + "learning_rate": 8.49869011713655e-06, + "loss": 0.5594, + "step": 4328 + }, + { + "epoch": 0.55, + "grad_norm": 0.8580536800368124, + "learning_rate": 8.49795307796461e-06, + "loss": 0.6931, + "step": 4329 + }, + { + "epoch": 0.55, + "grad_norm": 0.757476645322607, + "learning_rate": 8.497215889895984e-06, + "loss": 0.6423, + "step": 4330 + }, + { + "epoch": 0.55, + "grad_norm": 0.5462465331305486, + "learning_rate": 8.496478552962053e-06, + "loss": 0.557, + "step": 4331 + }, + { + "epoch": 0.55, + "grad_norm": 0.7154005967583038, + "learning_rate": 8.495741067194203e-06, + "loss": 0.6269, + "step": 4332 + }, + { + "epoch": 0.55, + "grad_norm": 0.7064131698022073, + "learning_rate": 8.495003432623828e-06, + "loss": 0.6145, + "step": 4333 + }, + { + "epoch": 0.55, + "grad_norm": 0.578074893201352, + "learning_rate": 8.494265649282325e-06, + "loss": 0.5458, + "step": 4334 + }, + { + "epoch": 0.55, + "grad_norm": 0.5167112198028618, + "learning_rate": 8.4935277172011e-06, + "loss": 0.5034, + "step": 4335 + }, + { + "epoch": 0.55, + "grad_norm": 0.7197536131515607, + "learning_rate": 8.492789636411562e-06, + "loss": 0.6008, + "step": 4336 + }, + { + "epoch": 0.55, + "grad_norm": 0.8414455711739971, + "learning_rate": 8.492051406945132e-06, + "loss": 0.6627, + "step": 4337 + }, + { + "epoch": 0.55, + "grad_norm": 0.8546785748275249, + "learning_rate": 8.491313028833232e-06, + "loss": 0.692, + "step": 4338 + }, + { + "epoch": 0.55, + "grad_norm": 0.5943690480985239, + "learning_rate": 8.490574502107294e-06, + "loss": 0.5173, + "step": 4339 + }, + { + "epoch": 0.55, + "grad_norm": 0.7680057826779969, + "learning_rate": 8.489835826798752e-06, + "loss": 0.6163, + "step": 4340 + }, + { + "epoch": 0.55, + "grad_norm": 0.5888222986238567, + "learning_rate": 8.489097002939052e-06, + "loss": 0.504, + "step": 4341 + }, + { + "epoch": 0.55, + "grad_norm": 0.6894175108984276, + "learning_rate": 8.488358030559643e-06, + "loss": 0.6367, + "step": 4342 + }, + { + "epoch": 0.55, + "grad_norm": 0.740876167443951, + "learning_rate": 8.487618909691977e-06, + "loss": 0.6399, + "step": 4343 + }, + { + "epoch": 0.55, + "grad_norm": 0.7171090480709094, + "learning_rate": 8.48687964036752e-06, + "loss": 0.5975, + "step": 4344 + }, + { + "epoch": 0.55, + "grad_norm": 0.7724993531563258, + "learning_rate": 8.48614022261774e-06, + "loss": 0.6287, + "step": 4345 + }, + { + "epoch": 0.55, + "grad_norm": 0.7967773280677185, + "learning_rate": 8.48540065647411e-06, + "loss": 0.5835, + "step": 4346 + }, + { + "epoch": 0.55, + "grad_norm": 0.6120062632400328, + "learning_rate": 8.484660941968113e-06, + "loss": 0.5514, + "step": 4347 + }, + { + "epoch": 0.55, + "grad_norm": 0.6610184084121873, + "learning_rate": 8.483921079131233e-06, + "loss": 0.5538, + "step": 4348 + }, + { + "epoch": 0.55, + "grad_norm": 0.6147261331619809, + "learning_rate": 8.48318106799497e-06, + "loss": 0.4788, + "step": 4349 + }, + { + "epoch": 0.55, + "grad_norm": 0.7151182122048538, + "learning_rate": 8.482440908590816e-06, + "loss": 0.571, + "step": 4350 + }, + { + "epoch": 0.55, + "grad_norm": 0.7994778922305898, + "learning_rate": 8.48170060095028e-06, + "loss": 0.5999, + "step": 4351 + }, + { + "epoch": 0.55, + "grad_norm": 0.9544782281327197, + "learning_rate": 8.480960145104877e-06, + "loss": 0.6318, + "step": 4352 + }, + { + "epoch": 0.55, + "grad_norm": 0.7274607856506495, + "learning_rate": 8.480219541086125e-06, + "loss": 0.619, + "step": 4353 + }, + { + "epoch": 0.55, + "grad_norm": 0.8166137352707963, + "learning_rate": 8.479478788925547e-06, + "loss": 0.6274, + "step": 4354 + }, + { + "epoch": 0.55, + "grad_norm": 0.7180740681173754, + "learning_rate": 8.478737888654676e-06, + "loss": 0.5898, + "step": 4355 + }, + { + "epoch": 0.55, + "grad_norm": 0.5609736243588401, + "learning_rate": 8.477996840305049e-06, + "loss": 0.4804, + "step": 4356 + }, + { + "epoch": 0.56, + "grad_norm": 0.5991913989693161, + "learning_rate": 8.47725564390821e-06, + "loss": 0.5141, + "step": 4357 + }, + { + "epoch": 0.56, + "grad_norm": 0.894912278276884, + "learning_rate": 8.476514299495713e-06, + "loss": 0.6199, + "step": 4358 + }, + { + "epoch": 0.56, + "grad_norm": 0.647204864992118, + "learning_rate": 8.47577280709911e-06, + "loss": 0.5633, + "step": 4359 + }, + { + "epoch": 0.56, + "grad_norm": 0.7095036023821971, + "learning_rate": 8.475031166749965e-06, + "loss": 0.5271, + "step": 4360 + }, + { + "epoch": 0.56, + "grad_norm": 0.6218753907478503, + "learning_rate": 8.47428937847985e-06, + "loss": 0.5431, + "step": 4361 + }, + { + "epoch": 0.56, + "grad_norm": 0.6700308475198772, + "learning_rate": 8.473547442320335e-06, + "loss": 0.5286, + "step": 4362 + }, + { + "epoch": 0.56, + "grad_norm": 0.7376301008276802, + "learning_rate": 8.472805358303007e-06, + "loss": 0.6417, + "step": 4363 + }, + { + "epoch": 0.56, + "grad_norm": 0.7932546849779789, + "learning_rate": 8.472063126459453e-06, + "loss": 0.5774, + "step": 4364 + }, + { + "epoch": 0.56, + "grad_norm": 0.7437735077644912, + "learning_rate": 8.471320746821265e-06, + "loss": 0.5766, + "step": 4365 + }, + { + "epoch": 0.56, + "grad_norm": 0.8088037529803853, + "learning_rate": 8.470578219420048e-06, + "loss": 0.6105, + "step": 4366 + }, + { + "epoch": 0.56, + "grad_norm": 1.0850284063236828, + "learning_rate": 8.469835544287404e-06, + "loss": 0.6306, + "step": 4367 + }, + { + "epoch": 0.56, + "grad_norm": 0.6899848915368455, + "learning_rate": 8.46909272145495e-06, + "loss": 0.6038, + "step": 4368 + }, + { + "epoch": 0.56, + "grad_norm": 0.6172932585039929, + "learning_rate": 8.468349750954306e-06, + "loss": 0.5382, + "step": 4369 + }, + { + "epoch": 0.56, + "grad_norm": 0.8093365987408733, + "learning_rate": 8.467606632817093e-06, + "loss": 0.661, + "step": 4370 + }, + { + "epoch": 0.56, + "grad_norm": 0.8171873486748183, + "learning_rate": 8.466863367074949e-06, + "loss": 0.6363, + "step": 4371 + }, + { + "epoch": 0.56, + "grad_norm": 0.7271344873956909, + "learning_rate": 8.46611995375951e-06, + "loss": 0.4952, + "step": 4372 + }, + { + "epoch": 0.56, + "grad_norm": 0.615570178408858, + "learning_rate": 8.46537639290242e-06, + "loss": 0.5651, + "step": 4373 + }, + { + "epoch": 0.56, + "grad_norm": 0.6726183851577787, + "learning_rate": 8.46463268453533e-06, + "loss": 0.5349, + "step": 4374 + }, + { + "epoch": 0.56, + "grad_norm": 0.7369120502687199, + "learning_rate": 8.4638888286899e-06, + "loss": 0.6024, + "step": 4375 + }, + { + "epoch": 0.56, + "grad_norm": 1.0593756232299247, + "learning_rate": 8.463144825397792e-06, + "loss": 0.611, + "step": 4376 + }, + { + "epoch": 0.56, + "grad_norm": 0.6251734736496313, + "learning_rate": 8.462400674690673e-06, + "loss": 0.5461, + "step": 4377 + }, + { + "epoch": 0.56, + "grad_norm": 0.7709423700590702, + "learning_rate": 8.461656376600222e-06, + "loss": 0.665, + "step": 4378 + }, + { + "epoch": 0.56, + "grad_norm": 0.7453921369022694, + "learning_rate": 8.460911931158122e-06, + "loss": 0.5916, + "step": 4379 + }, + { + "epoch": 0.56, + "grad_norm": 0.785242195667744, + "learning_rate": 8.46016733839606e-06, + "loss": 0.5633, + "step": 4380 + }, + { + "epoch": 0.56, + "grad_norm": 0.8122768105789598, + "learning_rate": 8.45942259834573e-06, + "loss": 0.6906, + "step": 4381 + }, + { + "epoch": 0.56, + "grad_norm": 0.9553098162617438, + "learning_rate": 8.458677711038836e-06, + "loss": 0.6909, + "step": 4382 + }, + { + "epoch": 0.56, + "grad_norm": 0.7384939551402149, + "learning_rate": 8.457932676507085e-06, + "loss": 0.5796, + "step": 4383 + }, + { + "epoch": 0.56, + "grad_norm": 0.8338925407618774, + "learning_rate": 8.457187494782189e-06, + "loss": 0.6332, + "step": 4384 + }, + { + "epoch": 0.56, + "grad_norm": 0.6910187485770999, + "learning_rate": 8.456442165895869e-06, + "loss": 0.6281, + "step": 4385 + }, + { + "epoch": 0.56, + "grad_norm": 0.5985086222563284, + "learning_rate": 8.455696689879852e-06, + "loss": 0.5164, + "step": 4386 + }, + { + "epoch": 0.56, + "grad_norm": 0.7031678249461256, + "learning_rate": 8.454951066765868e-06, + "loss": 0.5555, + "step": 4387 + }, + { + "epoch": 0.56, + "grad_norm": 0.8621667979416765, + "learning_rate": 8.45420529658566e-06, + "loss": 0.6076, + "step": 4388 + }, + { + "epoch": 0.56, + "grad_norm": 0.9141387606178659, + "learning_rate": 8.453459379370969e-06, + "loss": 0.6112, + "step": 4389 + }, + { + "epoch": 0.56, + "grad_norm": 0.6266554619922422, + "learning_rate": 8.45271331515355e-06, + "loss": 0.527, + "step": 4390 + }, + { + "epoch": 0.56, + "grad_norm": 0.8093966545388899, + "learning_rate": 8.451967103965155e-06, + "loss": 0.65, + "step": 4391 + }, + { + "epoch": 0.56, + "grad_norm": 0.9465313148657902, + "learning_rate": 8.451220745837553e-06, + "loss": 0.6203, + "step": 4392 + }, + { + "epoch": 0.56, + "grad_norm": 0.8207052142712354, + "learning_rate": 8.45047424080251e-06, + "loss": 0.6107, + "step": 4393 + }, + { + "epoch": 0.56, + "grad_norm": 0.6106219759665608, + "learning_rate": 8.44972758889181e-06, + "loss": 0.5803, + "step": 4394 + }, + { + "epoch": 0.56, + "grad_norm": 0.7100279341225781, + "learning_rate": 8.448980790137226e-06, + "loss": 0.5977, + "step": 4395 + }, + { + "epoch": 0.56, + "grad_norm": 0.625937153542458, + "learning_rate": 8.448233844570553e-06, + "loss": 0.5184, + "step": 4396 + }, + { + "epoch": 0.56, + "grad_norm": 0.7864800476436626, + "learning_rate": 8.447486752223584e-06, + "loss": 0.5955, + "step": 4397 + }, + { + "epoch": 0.56, + "grad_norm": 0.7887189384852409, + "learning_rate": 8.44673951312812e-06, + "loss": 0.6166, + "step": 4398 + }, + { + "epoch": 0.56, + "grad_norm": 0.7566261208448904, + "learning_rate": 8.445992127315969e-06, + "loss": 0.5728, + "step": 4399 + }, + { + "epoch": 0.56, + "grad_norm": 0.5423403336054106, + "learning_rate": 8.445244594818945e-06, + "loss": 0.4905, + "step": 4400 + }, + { + "epoch": 0.56, + "grad_norm": 0.621849865909122, + "learning_rate": 8.44449691566887e-06, + "loss": 0.5985, + "step": 4401 + }, + { + "epoch": 0.56, + "grad_norm": 0.7771931513536706, + "learning_rate": 8.443749089897564e-06, + "loss": 0.598, + "step": 4402 + }, + { + "epoch": 0.56, + "grad_norm": 0.8557942555061883, + "learning_rate": 8.443001117536868e-06, + "loss": 0.6443, + "step": 4403 + }, + { + "epoch": 0.56, + "grad_norm": 0.6672955356737872, + "learning_rate": 8.442252998618615e-06, + "loss": 0.5755, + "step": 4404 + }, + { + "epoch": 0.56, + "grad_norm": 0.580889998246896, + "learning_rate": 8.441504733174653e-06, + "loss": 0.5515, + "step": 4405 + }, + { + "epoch": 0.56, + "grad_norm": 0.7432901191602979, + "learning_rate": 8.440756321236829e-06, + "loss": 0.591, + "step": 4406 + }, + { + "epoch": 0.56, + "grad_norm": 0.9234935615780759, + "learning_rate": 8.440007762837006e-06, + "loss": 0.6407, + "step": 4407 + }, + { + "epoch": 0.56, + "grad_norm": 0.649441462722885, + "learning_rate": 8.439259058007046e-06, + "loss": 0.5522, + "step": 4408 + }, + { + "epoch": 0.56, + "grad_norm": 0.5782242670876485, + "learning_rate": 8.438510206778816e-06, + "loss": 0.5429, + "step": 4409 + }, + { + "epoch": 0.56, + "grad_norm": 0.623231150960479, + "learning_rate": 8.437761209184196e-06, + "loss": 0.5516, + "step": 4410 + }, + { + "epoch": 0.56, + "grad_norm": 0.8951910480939944, + "learning_rate": 8.437012065255066e-06, + "loss": 0.6411, + "step": 4411 + }, + { + "epoch": 0.56, + "grad_norm": 0.5606392635611586, + "learning_rate": 8.436262775023314e-06, + "loss": 0.513, + "step": 4412 + }, + { + "epoch": 0.56, + "grad_norm": 0.533246558256732, + "learning_rate": 8.435513338520839e-06, + "loss": 0.54, + "step": 4413 + }, + { + "epoch": 0.56, + "grad_norm": 0.6118557654265043, + "learning_rate": 8.434763755779538e-06, + "loss": 0.5256, + "step": 4414 + }, + { + "epoch": 0.56, + "grad_norm": 1.0038463817566687, + "learning_rate": 8.434014026831321e-06, + "loss": 0.5858, + "step": 4415 + }, + { + "epoch": 0.56, + "grad_norm": 0.5914753578370091, + "learning_rate": 8.4332641517081e-06, + "loss": 0.4763, + "step": 4416 + }, + { + "epoch": 0.56, + "grad_norm": 0.7647039799522053, + "learning_rate": 8.432514130441794e-06, + "loss": 0.6039, + "step": 4417 + }, + { + "epoch": 0.56, + "grad_norm": 0.7876606790890524, + "learning_rate": 8.431763963064331e-06, + "loss": 0.667, + "step": 4418 + }, + { + "epoch": 0.56, + "grad_norm": 0.6997141913370188, + "learning_rate": 8.431013649607643e-06, + "loss": 0.6125, + "step": 4419 + }, + { + "epoch": 0.56, + "grad_norm": 0.7125075326371043, + "learning_rate": 8.430263190103667e-06, + "loss": 0.5906, + "step": 4420 + }, + { + "epoch": 0.56, + "grad_norm": 0.9033565393958323, + "learning_rate": 8.42951258458435e-06, + "loss": 0.5742, + "step": 4421 + }, + { + "epoch": 0.56, + "grad_norm": 0.618547274382211, + "learning_rate": 8.42876183308164e-06, + "loss": 0.5284, + "step": 4422 + }, + { + "epoch": 0.56, + "grad_norm": 0.641059564173985, + "learning_rate": 8.428010935627496e-06, + "loss": 0.551, + "step": 4423 + }, + { + "epoch": 0.56, + "grad_norm": 0.6386043167072742, + "learning_rate": 8.42725989225388e-06, + "loss": 0.5118, + "step": 4424 + }, + { + "epoch": 0.56, + "grad_norm": 0.6912236302258694, + "learning_rate": 8.426508702992763e-06, + "loss": 0.5888, + "step": 4425 + }, + { + "epoch": 0.56, + "grad_norm": 0.8647033230811324, + "learning_rate": 8.425757367876122e-06, + "loss": 0.6452, + "step": 4426 + }, + { + "epoch": 0.56, + "grad_norm": 0.608122830630269, + "learning_rate": 8.425005886935936e-06, + "loss": 0.5214, + "step": 4427 + }, + { + "epoch": 0.56, + "grad_norm": 0.5668157024818089, + "learning_rate": 8.424254260204192e-06, + "loss": 0.5299, + "step": 4428 + }, + { + "epoch": 0.56, + "grad_norm": 0.7158474583360988, + "learning_rate": 8.423502487712888e-06, + "loss": 0.5725, + "step": 4429 + }, + { + "epoch": 0.56, + "grad_norm": 0.7657430339608319, + "learning_rate": 8.422750569494023e-06, + "loss": 0.5384, + "step": 4430 + }, + { + "epoch": 0.56, + "grad_norm": 0.7803553538455631, + "learning_rate": 8.421998505579603e-06, + "loss": 0.6445, + "step": 4431 + }, + { + "epoch": 0.56, + "grad_norm": 0.6182084913311109, + "learning_rate": 8.421246296001644e-06, + "loss": 0.5148, + "step": 4432 + }, + { + "epoch": 0.56, + "grad_norm": 0.8661938537133296, + "learning_rate": 8.42049394079216e-06, + "loss": 0.5838, + "step": 4433 + }, + { + "epoch": 0.56, + "grad_norm": 0.5759663788211165, + "learning_rate": 8.419741439983181e-06, + "loss": 0.47, + "step": 4434 + }, + { + "epoch": 0.57, + "grad_norm": 0.7058287456561901, + "learning_rate": 8.418988793606738e-06, + "loss": 0.6008, + "step": 4435 + }, + { + "epoch": 0.57, + "grad_norm": 0.7666808515140944, + "learning_rate": 8.418236001694864e-06, + "loss": 0.61, + "step": 4436 + }, + { + "epoch": 0.57, + "grad_norm": 0.5853176142649072, + "learning_rate": 8.41748306427961e-06, + "loss": 0.5359, + "step": 4437 + }, + { + "epoch": 0.57, + "grad_norm": 0.5862092657965132, + "learning_rate": 8.416729981393021e-06, + "loss": 0.456, + "step": 4438 + }, + { + "epoch": 0.57, + "grad_norm": 0.5724659081903708, + "learning_rate": 8.415976753067154e-06, + "loss": 0.5349, + "step": 4439 + }, + { + "epoch": 0.57, + "grad_norm": 0.6833715600826622, + "learning_rate": 8.415223379334075e-06, + "loss": 0.5165, + "step": 4440 + }, + { + "epoch": 0.57, + "grad_norm": 0.7931207098380295, + "learning_rate": 8.414469860225849e-06, + "loss": 0.6231, + "step": 4441 + }, + { + "epoch": 0.57, + "grad_norm": 0.6813979269344471, + "learning_rate": 8.413716195774553e-06, + "loss": 0.533, + "step": 4442 + }, + { + "epoch": 0.57, + "grad_norm": 0.8431388638660972, + "learning_rate": 8.412962386012264e-06, + "loss": 0.652, + "step": 4443 + }, + { + "epoch": 0.57, + "grad_norm": 0.7565321624882378, + "learning_rate": 8.412208430971076e-06, + "loss": 0.5863, + "step": 4444 + }, + { + "epoch": 0.57, + "grad_norm": 0.7096067574235724, + "learning_rate": 8.411454330683077e-06, + "loss": 0.5656, + "step": 4445 + }, + { + "epoch": 0.57, + "grad_norm": 0.6383764971101797, + "learning_rate": 8.41070008518037e-06, + "loss": 0.5137, + "step": 4446 + }, + { + "epoch": 0.57, + "grad_norm": 0.7497706062021322, + "learning_rate": 8.40994569449506e-06, + "loss": 0.6202, + "step": 4447 + }, + { + "epoch": 0.57, + "grad_norm": 0.6312542897477779, + "learning_rate": 8.409191158659259e-06, + "loss": 0.5386, + "step": 4448 + }, + { + "epoch": 0.57, + "grad_norm": 0.653863516156257, + "learning_rate": 8.40843647770508e-06, + "loss": 0.5385, + "step": 4449 + }, + { + "epoch": 0.57, + "grad_norm": 0.6156252252004295, + "learning_rate": 8.407681651664654e-06, + "loss": 0.5179, + "step": 4450 + }, + { + "epoch": 0.57, + "grad_norm": 0.7142752209597169, + "learning_rate": 8.406926680570111e-06, + "loss": 0.5873, + "step": 4451 + }, + { + "epoch": 0.57, + "grad_norm": 1.0733664514333419, + "learning_rate": 8.406171564453588e-06, + "loss": 0.6702, + "step": 4452 + }, + { + "epoch": 0.57, + "grad_norm": 0.5710862247044152, + "learning_rate": 8.405416303347223e-06, + "loss": 0.5509, + "step": 4453 + }, + { + "epoch": 0.57, + "grad_norm": 0.7607990110698681, + "learning_rate": 8.404660897283168e-06, + "loss": 0.5989, + "step": 4454 + }, + { + "epoch": 0.57, + "grad_norm": 0.8445745815776041, + "learning_rate": 8.403905346293579e-06, + "loss": 0.6713, + "step": 4455 + }, + { + "epoch": 0.57, + "grad_norm": 0.7937089993604419, + "learning_rate": 8.403149650410617e-06, + "loss": 0.6041, + "step": 4456 + }, + { + "epoch": 0.57, + "grad_norm": 0.7770958593130254, + "learning_rate": 8.402393809666448e-06, + "loss": 0.6032, + "step": 4457 + }, + { + "epoch": 0.57, + "grad_norm": 0.7400065671466064, + "learning_rate": 8.40163782409325e-06, + "loss": 0.6354, + "step": 4458 + }, + { + "epoch": 0.57, + "grad_norm": 0.7432366006126222, + "learning_rate": 8.400881693723197e-06, + "loss": 0.6646, + "step": 4459 + }, + { + "epoch": 0.57, + "grad_norm": 0.7735783971083421, + "learning_rate": 8.400125418588478e-06, + "loss": 0.5998, + "step": 4460 + }, + { + "epoch": 0.57, + "grad_norm": 0.7964233894410391, + "learning_rate": 8.399368998721287e-06, + "loss": 0.6373, + "step": 4461 + }, + { + "epoch": 0.57, + "grad_norm": 0.6999861157095514, + "learning_rate": 8.398612434153819e-06, + "loss": 0.6027, + "step": 4462 + }, + { + "epoch": 0.57, + "grad_norm": 0.6412778795231933, + "learning_rate": 8.39785572491828e-06, + "loss": 0.5091, + "step": 4463 + }, + { + "epoch": 0.57, + "grad_norm": 0.7456919161309303, + "learning_rate": 8.397098871046883e-06, + "loss": 0.6147, + "step": 4464 + }, + { + "epoch": 0.57, + "grad_norm": 0.5393626049617862, + "learning_rate": 8.396341872571841e-06, + "loss": 0.5005, + "step": 4465 + }, + { + "epoch": 0.57, + "grad_norm": 0.8533845519661352, + "learning_rate": 8.395584729525379e-06, + "loss": 0.5932, + "step": 4466 + }, + { + "epoch": 0.57, + "grad_norm": 0.6321572046611634, + "learning_rate": 8.394827441939725e-06, + "loss": 0.5685, + "step": 4467 + }, + { + "epoch": 0.57, + "grad_norm": 0.6953148304892535, + "learning_rate": 8.394070009847115e-06, + "loss": 0.6013, + "step": 4468 + }, + { + "epoch": 0.57, + "grad_norm": 0.9975105974700247, + "learning_rate": 8.393312433279791e-06, + "loss": 0.6189, + "step": 4469 + }, + { + "epoch": 0.57, + "grad_norm": 0.6276522101524512, + "learning_rate": 8.392554712270001e-06, + "loss": 0.5636, + "step": 4470 + }, + { + "epoch": 0.57, + "grad_norm": 0.7611873920784217, + "learning_rate": 8.391796846849997e-06, + "loss": 0.6058, + "step": 4471 + }, + { + "epoch": 0.57, + "grad_norm": 0.6066135351030368, + "learning_rate": 8.391038837052042e-06, + "loss": 0.5753, + "step": 4472 + }, + { + "epoch": 0.57, + "grad_norm": 0.5548566589434175, + "learning_rate": 8.390280682908398e-06, + "loss": 0.5037, + "step": 4473 + }, + { + "epoch": 0.57, + "grad_norm": 0.6370250625053674, + "learning_rate": 8.389522384451338e-06, + "loss": 0.4581, + "step": 4474 + }, + { + "epoch": 0.57, + "grad_norm": 1.0508030236374957, + "learning_rate": 8.388763941713142e-06, + "loss": 0.6039, + "step": 4475 + }, + { + "epoch": 0.57, + "grad_norm": 0.5595806131880281, + "learning_rate": 8.388005354726096e-06, + "loss": 0.4803, + "step": 4476 + }, + { + "epoch": 0.57, + "grad_norm": 0.833762981885313, + "learning_rate": 8.387246623522487e-06, + "loss": 0.6109, + "step": 4477 + }, + { + "epoch": 0.57, + "grad_norm": 0.7650249292107758, + "learning_rate": 8.386487748134615e-06, + "loss": 0.5198, + "step": 4478 + }, + { + "epoch": 0.57, + "grad_norm": 1.2155262611753492, + "learning_rate": 8.38572872859478e-06, + "loss": 0.6279, + "step": 4479 + }, + { + "epoch": 0.57, + "grad_norm": 0.6689505057411278, + "learning_rate": 8.38496956493529e-06, + "loss": 0.5565, + "step": 4480 + }, + { + "epoch": 0.57, + "grad_norm": 0.8908804902154505, + "learning_rate": 8.384210257188465e-06, + "loss": 0.6179, + "step": 4481 + }, + { + "epoch": 0.57, + "grad_norm": 0.8230118279260698, + "learning_rate": 8.383450805386624e-06, + "loss": 0.6429, + "step": 4482 + }, + { + "epoch": 0.57, + "grad_norm": 0.576327363748139, + "learning_rate": 8.382691209562093e-06, + "loss": 0.4824, + "step": 4483 + }, + { + "epoch": 0.57, + "grad_norm": 0.608708951314828, + "learning_rate": 8.381931469747208e-06, + "loss": 0.539, + "step": 4484 + }, + { + "epoch": 0.57, + "grad_norm": 0.8149030142574907, + "learning_rate": 8.381171585974306e-06, + "loss": 0.569, + "step": 4485 + }, + { + "epoch": 0.57, + "grad_norm": 0.8041259526101094, + "learning_rate": 8.380411558275735e-06, + "loss": 0.6296, + "step": 4486 + }, + { + "epoch": 0.57, + "grad_norm": 0.7740474608758181, + "learning_rate": 8.379651386683845e-06, + "loss": 0.5857, + "step": 4487 + }, + { + "epoch": 0.57, + "grad_norm": 0.6021501709260586, + "learning_rate": 8.378891071230998e-06, + "loss": 0.4878, + "step": 4488 + }, + { + "epoch": 0.57, + "grad_norm": 0.5518869268865115, + "learning_rate": 8.378130611949553e-06, + "loss": 0.5544, + "step": 4489 + }, + { + "epoch": 0.57, + "grad_norm": 0.8623381337388066, + "learning_rate": 8.377370008871884e-06, + "loss": 0.6404, + "step": 4490 + }, + { + "epoch": 0.57, + "grad_norm": 0.6187203560978535, + "learning_rate": 8.376609262030366e-06, + "loss": 0.5227, + "step": 4491 + }, + { + "epoch": 0.57, + "grad_norm": 0.6501240335984548, + "learning_rate": 8.375848371457383e-06, + "loss": 0.564, + "step": 4492 + }, + { + "epoch": 0.57, + "grad_norm": 0.8229738642659958, + "learning_rate": 8.37508733718532e-06, + "loss": 0.6042, + "step": 4493 + }, + { + "epoch": 0.57, + "grad_norm": 0.7727942980664513, + "learning_rate": 8.374326159246575e-06, + "loss": 0.6231, + "step": 4494 + }, + { + "epoch": 0.57, + "grad_norm": 0.7508246557879681, + "learning_rate": 8.373564837673549e-06, + "loss": 0.5802, + "step": 4495 + }, + { + "epoch": 0.57, + "grad_norm": 0.5813973478855676, + "learning_rate": 8.372803372498648e-06, + "loss": 0.4964, + "step": 4496 + }, + { + "epoch": 0.57, + "grad_norm": 0.7093755340221902, + "learning_rate": 8.372041763754287e-06, + "loss": 0.5232, + "step": 4497 + }, + { + "epoch": 0.57, + "grad_norm": 0.7251925775924358, + "learning_rate": 8.371280011472881e-06, + "loss": 0.5971, + "step": 4498 + }, + { + "epoch": 0.57, + "grad_norm": 0.5501750394360069, + "learning_rate": 8.370518115686858e-06, + "loss": 0.5746, + "step": 4499 + }, + { + "epoch": 0.57, + "grad_norm": 0.6571418849244468, + "learning_rate": 8.369756076428652e-06, + "loss": 0.5331, + "step": 4500 + }, + { + "epoch": 0.57, + "grad_norm": 0.5686787720364862, + "learning_rate": 8.368993893730697e-06, + "loss": 0.5394, + "step": 4501 + }, + { + "epoch": 0.57, + "grad_norm": 0.5638748476950008, + "learning_rate": 8.368231567625437e-06, + "loss": 0.5315, + "step": 4502 + }, + { + "epoch": 0.57, + "grad_norm": 0.7717308841066829, + "learning_rate": 8.367469098145322e-06, + "loss": 0.6847, + "step": 4503 + }, + { + "epoch": 0.57, + "grad_norm": 0.8644609082076836, + "learning_rate": 8.366706485322808e-06, + "loss": 0.5843, + "step": 4504 + }, + { + "epoch": 0.57, + "grad_norm": 0.6412821334544353, + "learning_rate": 8.36594372919036e-06, + "loss": 0.5142, + "step": 4505 + }, + { + "epoch": 0.57, + "grad_norm": 0.5799824683265785, + "learning_rate": 8.365180829780442e-06, + "loss": 0.5174, + "step": 4506 + }, + { + "epoch": 0.57, + "grad_norm": 0.568866374908323, + "learning_rate": 8.36441778712553e-06, + "loss": 0.5253, + "step": 4507 + }, + { + "epoch": 0.57, + "grad_norm": 0.7030245093748975, + "learning_rate": 8.363654601258103e-06, + "loss": 0.5826, + "step": 4508 + }, + { + "epoch": 0.57, + "grad_norm": 0.854340025941681, + "learning_rate": 8.362891272210647e-06, + "loss": 0.6624, + "step": 4509 + }, + { + "epoch": 0.57, + "grad_norm": 0.7255927073232108, + "learning_rate": 8.362127800015659e-06, + "loss": 0.5874, + "step": 4510 + }, + { + "epoch": 0.57, + "grad_norm": 0.5250171534714101, + "learning_rate": 8.361364184705633e-06, + "loss": 0.483, + "step": 4511 + }, + { + "epoch": 0.57, + "grad_norm": 0.7513306481367137, + "learning_rate": 8.360600426313074e-06, + "loss": 0.5445, + "step": 4512 + }, + { + "epoch": 0.57, + "grad_norm": 0.6615244418505919, + "learning_rate": 8.359836524870496e-06, + "loss": 0.5322, + "step": 4513 + }, + { + "epoch": 0.58, + "grad_norm": 0.7702991030071065, + "learning_rate": 8.359072480410412e-06, + "loss": 0.6721, + "step": 4514 + }, + { + "epoch": 0.58, + "grad_norm": 0.5494868261672948, + "learning_rate": 8.358308292965347e-06, + "loss": 0.5264, + "step": 4515 + }, + { + "epoch": 0.58, + "grad_norm": 0.6066363350537363, + "learning_rate": 8.35754396256783e-06, + "loss": 0.5767, + "step": 4516 + }, + { + "epoch": 0.58, + "grad_norm": 0.7460154252299692, + "learning_rate": 8.356779489250395e-06, + "loss": 0.566, + "step": 4517 + }, + { + "epoch": 0.58, + "grad_norm": 0.647224113721489, + "learning_rate": 8.356014873045584e-06, + "loss": 0.5135, + "step": 4518 + }, + { + "epoch": 0.58, + "grad_norm": 0.7487485443601356, + "learning_rate": 8.355250113985945e-06, + "loss": 0.6398, + "step": 4519 + }, + { + "epoch": 0.58, + "grad_norm": 0.7762880385238963, + "learning_rate": 8.35448521210403e-06, + "loss": 0.6106, + "step": 4520 + }, + { + "epoch": 0.58, + "grad_norm": 0.7100629932358956, + "learning_rate": 8.353720167432397e-06, + "loss": 0.6517, + "step": 4521 + }, + { + "epoch": 0.58, + "grad_norm": 0.6947478186112446, + "learning_rate": 8.352954980003615e-06, + "loss": 0.528, + "step": 4522 + }, + { + "epoch": 0.58, + "grad_norm": 0.7519450452872166, + "learning_rate": 8.352189649850255e-06, + "loss": 0.5414, + "step": 4523 + }, + { + "epoch": 0.58, + "grad_norm": 0.7199672055423483, + "learning_rate": 8.351424177004893e-06, + "loss": 0.5983, + "step": 4524 + }, + { + "epoch": 0.58, + "grad_norm": 0.6369900256999003, + "learning_rate": 8.350658561500114e-06, + "loss": 0.4919, + "step": 4525 + }, + { + "epoch": 0.58, + "grad_norm": 0.6968041918701267, + "learning_rate": 8.349892803368506e-06, + "loss": 0.6362, + "step": 4526 + }, + { + "epoch": 0.58, + "grad_norm": 0.6900397909829166, + "learning_rate": 8.349126902642667e-06, + "loss": 0.5738, + "step": 4527 + }, + { + "epoch": 0.58, + "grad_norm": 0.7267647190814117, + "learning_rate": 8.348360859355198e-06, + "loss": 0.6043, + "step": 4528 + }, + { + "epoch": 0.58, + "grad_norm": 0.779604519713317, + "learning_rate": 8.347594673538705e-06, + "loss": 0.6071, + "step": 4529 + }, + { + "epoch": 0.58, + "grad_norm": 0.743336975063652, + "learning_rate": 8.346828345225806e-06, + "loss": 0.6383, + "step": 4530 + }, + { + "epoch": 0.58, + "grad_norm": 0.544228793325572, + "learning_rate": 8.34606187444912e-06, + "loss": 0.4821, + "step": 4531 + }, + { + "epoch": 0.58, + "grad_norm": 0.7831383892292381, + "learning_rate": 8.34529526124127e-06, + "loss": 0.6049, + "step": 4532 + }, + { + "epoch": 0.58, + "grad_norm": 0.7421778872396227, + "learning_rate": 8.344528505634894e-06, + "loss": 0.6024, + "step": 4533 + }, + { + "epoch": 0.58, + "grad_norm": 0.8358407921835102, + "learning_rate": 8.343761607662625e-06, + "loss": 0.6336, + "step": 4534 + }, + { + "epoch": 0.58, + "grad_norm": 0.7708064486421488, + "learning_rate": 8.34299456735711e-06, + "loss": 0.622, + "step": 4535 + }, + { + "epoch": 0.58, + "grad_norm": 0.6063444648625697, + "learning_rate": 8.342227384750998e-06, + "loss": 0.5267, + "step": 4536 + }, + { + "epoch": 0.58, + "grad_norm": 0.5994602788328899, + "learning_rate": 8.341460059876947e-06, + "loss": 0.5056, + "step": 4537 + }, + { + "epoch": 0.58, + "grad_norm": 0.563371386041275, + "learning_rate": 8.34069259276762e-06, + "loss": 0.4971, + "step": 4538 + }, + { + "epoch": 0.58, + "grad_norm": 0.6961521146768033, + "learning_rate": 8.339924983455684e-06, + "loss": 0.5955, + "step": 4539 + }, + { + "epoch": 0.58, + "grad_norm": 0.7742789902056458, + "learning_rate": 8.339157231973815e-06, + "loss": 0.5895, + "step": 4540 + }, + { + "epoch": 0.58, + "grad_norm": 0.6445128213410857, + "learning_rate": 8.338389338354693e-06, + "loss": 0.5341, + "step": 4541 + }, + { + "epoch": 0.58, + "grad_norm": 0.6121826947430126, + "learning_rate": 8.337621302631003e-06, + "loss": 0.5428, + "step": 4542 + }, + { + "epoch": 0.58, + "grad_norm": 0.7345697368035174, + "learning_rate": 8.33685312483544e-06, + "loss": 0.5354, + "step": 4543 + }, + { + "epoch": 0.58, + "grad_norm": 0.8811001564261831, + "learning_rate": 8.336084805000703e-06, + "loss": 0.6578, + "step": 4544 + }, + { + "epoch": 0.58, + "grad_norm": 0.7278108537264646, + "learning_rate": 8.335316343159498e-06, + "loss": 0.57, + "step": 4545 + }, + { + "epoch": 0.58, + "grad_norm": 0.6591807933465704, + "learning_rate": 8.334547739344534e-06, + "loss": 0.5237, + "step": 4546 + }, + { + "epoch": 0.58, + "grad_norm": 0.6241909819167553, + "learning_rate": 8.333778993588529e-06, + "loss": 0.5383, + "step": 4547 + }, + { + "epoch": 0.58, + "grad_norm": 0.5406844452349486, + "learning_rate": 8.333010105924201e-06, + "loss": 0.4765, + "step": 4548 + }, + { + "epoch": 0.58, + "grad_norm": 0.5974736064406457, + "learning_rate": 8.33224107638429e-06, + "loss": 0.528, + "step": 4549 + }, + { + "epoch": 0.58, + "grad_norm": 0.6932640265580899, + "learning_rate": 8.331471905001521e-06, + "loss": 0.5598, + "step": 4550 + }, + { + "epoch": 0.58, + "grad_norm": 0.6459708961663617, + "learning_rate": 8.330702591808639e-06, + "loss": 0.4623, + "step": 4551 + }, + { + "epoch": 0.58, + "grad_norm": 0.6357239836619667, + "learning_rate": 8.329933136838391e-06, + "loss": 0.5339, + "step": 4552 + }, + { + "epoch": 0.58, + "grad_norm": 0.8551503830518864, + "learning_rate": 8.32916354012353e-06, + "loss": 0.6803, + "step": 4553 + }, + { + "epoch": 0.58, + "grad_norm": 0.5869725208040968, + "learning_rate": 8.328393801696817e-06, + "loss": 0.4797, + "step": 4554 + }, + { + "epoch": 0.58, + "grad_norm": 0.6419183139505903, + "learning_rate": 8.327623921591014e-06, + "loss": 0.5271, + "step": 4555 + }, + { + "epoch": 0.58, + "grad_norm": 0.8884931704165502, + "learning_rate": 8.326853899838895e-06, + "loss": 0.6423, + "step": 4556 + }, + { + "epoch": 0.58, + "grad_norm": 0.7618765452436912, + "learning_rate": 8.326083736473238e-06, + "loss": 0.6033, + "step": 4557 + }, + { + "epoch": 0.58, + "grad_norm": 0.821669465018436, + "learning_rate": 8.325313431526824e-06, + "loss": 0.6335, + "step": 4558 + }, + { + "epoch": 0.58, + "grad_norm": 0.5591507822069133, + "learning_rate": 8.324542985032444e-06, + "loss": 0.5383, + "step": 4559 + }, + { + "epoch": 0.58, + "grad_norm": 0.5881586167867418, + "learning_rate": 8.32377239702289e-06, + "loss": 0.544, + "step": 4560 + }, + { + "epoch": 0.58, + "grad_norm": 0.9188463807647256, + "learning_rate": 8.32300166753097e-06, + "loss": 0.5166, + "step": 4561 + }, + { + "epoch": 0.58, + "grad_norm": 0.8610061131662988, + "learning_rate": 8.322230796589486e-06, + "loss": 0.595, + "step": 4562 + }, + { + "epoch": 0.58, + "grad_norm": 0.9617603272577288, + "learning_rate": 8.321459784231254e-06, + "loss": 0.6348, + "step": 4563 + }, + { + "epoch": 0.58, + "grad_norm": 0.7006695571472142, + "learning_rate": 8.320688630489093e-06, + "loss": 0.5687, + "step": 4564 + }, + { + "epoch": 0.58, + "grad_norm": 0.7845886099944425, + "learning_rate": 8.319917335395827e-06, + "loss": 0.5882, + "step": 4565 + }, + { + "epoch": 0.58, + "grad_norm": 0.7221957013093857, + "learning_rate": 8.319145898984291e-06, + "loss": 0.5141, + "step": 4566 + }, + { + "epoch": 0.58, + "grad_norm": 1.001814019951614, + "learning_rate": 8.318374321287319e-06, + "loss": 0.6619, + "step": 4567 + }, + { + "epoch": 0.58, + "grad_norm": 0.5920898880186863, + "learning_rate": 8.317602602337755e-06, + "loss": 0.5261, + "step": 4568 + }, + { + "epoch": 0.58, + "grad_norm": 0.8901935217083875, + "learning_rate": 8.316830742168452e-06, + "loss": 0.69, + "step": 4569 + }, + { + "epoch": 0.58, + "grad_norm": 0.737862041467974, + "learning_rate": 8.316058740812263e-06, + "loss": 0.5711, + "step": 4570 + }, + { + "epoch": 0.58, + "grad_norm": 0.6741381417129442, + "learning_rate": 8.31528659830205e-06, + "loss": 0.5838, + "step": 4571 + }, + { + "epoch": 0.58, + "grad_norm": 0.6252054561230648, + "learning_rate": 8.314514314670681e-06, + "loss": 0.5187, + "step": 4572 + }, + { + "epoch": 0.58, + "grad_norm": 0.6362798972986528, + "learning_rate": 8.313741889951028e-06, + "loss": 0.5418, + "step": 4573 + }, + { + "epoch": 0.58, + "grad_norm": 0.7232528490922306, + "learning_rate": 8.312969324175972e-06, + "loss": 0.5919, + "step": 4574 + }, + { + "epoch": 0.58, + "grad_norm": 0.8233850690240231, + "learning_rate": 8.312196617378399e-06, + "loss": 0.6409, + "step": 4575 + }, + { + "epoch": 0.58, + "grad_norm": 0.6412296015838452, + "learning_rate": 8.311423769591201e-06, + "loss": 0.5617, + "step": 4576 + }, + { + "epoch": 0.58, + "grad_norm": 0.8807926764936764, + "learning_rate": 8.310650780847275e-06, + "loss": 0.6779, + "step": 4577 + }, + { + "epoch": 0.58, + "grad_norm": 0.6088078919307018, + "learning_rate": 8.309877651179523e-06, + "loss": 0.5354, + "step": 4578 + }, + { + "epoch": 0.58, + "grad_norm": 0.7457311656456699, + "learning_rate": 8.309104380620857e-06, + "loss": 0.5341, + "step": 4579 + }, + { + "epoch": 0.58, + "grad_norm": 1.5331643572616342, + "learning_rate": 8.308330969204192e-06, + "loss": 0.6517, + "step": 4580 + }, + { + "epoch": 0.58, + "grad_norm": 0.6157849670100451, + "learning_rate": 8.30755741696245e-06, + "loss": 0.5307, + "step": 4581 + }, + { + "epoch": 0.58, + "grad_norm": 0.8538888377847706, + "learning_rate": 8.306783723928558e-06, + "loss": 0.6179, + "step": 4582 + }, + { + "epoch": 0.58, + "grad_norm": 0.5696347142685619, + "learning_rate": 8.30600989013545e-06, + "loss": 0.5387, + "step": 4583 + }, + { + "epoch": 0.58, + "grad_norm": 0.7679757721676566, + "learning_rate": 8.305235915616065e-06, + "loss": 0.6794, + "step": 4584 + }, + { + "epoch": 0.58, + "grad_norm": 0.6822017456418559, + "learning_rate": 8.30446180040335e-06, + "loss": 0.5187, + "step": 4585 + }, + { + "epoch": 0.58, + "grad_norm": 0.6171342767274044, + "learning_rate": 8.303687544530254e-06, + "loss": 0.5268, + "step": 4586 + }, + { + "epoch": 0.58, + "grad_norm": 0.5780266415667538, + "learning_rate": 8.302913148029738e-06, + "loss": 0.528, + "step": 4587 + }, + { + "epoch": 0.58, + "grad_norm": 0.6674256120796203, + "learning_rate": 8.302138610934762e-06, + "loss": 0.5698, + "step": 4588 + }, + { + "epoch": 0.58, + "grad_norm": 0.8206296189598848, + "learning_rate": 8.3013639332783e-06, + "loss": 0.6536, + "step": 4589 + }, + { + "epoch": 0.58, + "grad_norm": 0.7877189165111409, + "learning_rate": 8.300589115093324e-06, + "loss": 0.635, + "step": 4590 + }, + { + "epoch": 0.58, + "grad_norm": 0.5616090193575192, + "learning_rate": 8.299814156412816e-06, + "loss": 0.5161, + "step": 4591 + }, + { + "epoch": 0.59, + "grad_norm": 0.5988902365227176, + "learning_rate": 8.299039057269764e-06, + "loss": 0.5687, + "step": 4592 + }, + { + "epoch": 0.59, + "grad_norm": 0.6960419334930371, + "learning_rate": 8.298263817697162e-06, + "loss": 0.5795, + "step": 4593 + }, + { + "epoch": 0.59, + "grad_norm": 0.5707883144842965, + "learning_rate": 8.297488437728008e-06, + "loss": 0.5011, + "step": 4594 + }, + { + "epoch": 0.59, + "grad_norm": 0.6575111612549288, + "learning_rate": 8.296712917395311e-06, + "loss": 0.5649, + "step": 4595 + }, + { + "epoch": 0.59, + "grad_norm": 0.745300592831123, + "learning_rate": 8.295937256732077e-06, + "loss": 0.6651, + "step": 4596 + }, + { + "epoch": 0.59, + "grad_norm": 0.6442942869675553, + "learning_rate": 8.295161455771327e-06, + "loss": 0.5579, + "step": 4597 + }, + { + "epoch": 0.59, + "grad_norm": 0.6301425841386397, + "learning_rate": 8.294385514546083e-06, + "loss": 0.5346, + "step": 4598 + }, + { + "epoch": 0.59, + "grad_norm": 0.5985101956372935, + "learning_rate": 8.293609433089378e-06, + "loss": 0.5695, + "step": 4599 + }, + { + "epoch": 0.59, + "grad_norm": 0.5838525089912457, + "learning_rate": 8.292833211434243e-06, + "loss": 0.5491, + "step": 4600 + }, + { + "epoch": 0.59, + "grad_norm": 0.608297559285062, + "learning_rate": 8.29205684961372e-06, + "loss": 0.5415, + "step": 4601 + }, + { + "epoch": 0.59, + "grad_norm": 0.5850325101804145, + "learning_rate": 8.291280347660856e-06, + "loss": 0.53, + "step": 4602 + }, + { + "epoch": 0.59, + "grad_norm": 0.6541675043155815, + "learning_rate": 8.290503705608707e-06, + "loss": 0.5075, + "step": 4603 + }, + { + "epoch": 0.59, + "grad_norm": 0.732558482290716, + "learning_rate": 8.28972692349033e-06, + "loss": 0.6011, + "step": 4604 + }, + { + "epoch": 0.59, + "grad_norm": 0.6349321427691408, + "learning_rate": 8.288950001338788e-06, + "loss": 0.5052, + "step": 4605 + }, + { + "epoch": 0.59, + "grad_norm": 0.7066040299986205, + "learning_rate": 8.288172939187155e-06, + "loss": 0.5979, + "step": 4606 + }, + { + "epoch": 0.59, + "grad_norm": 0.6263607674648934, + "learning_rate": 8.287395737068509e-06, + "loss": 0.5699, + "step": 4607 + }, + { + "epoch": 0.59, + "grad_norm": 0.7576194191057484, + "learning_rate": 8.286618395015931e-06, + "loss": 0.5935, + "step": 4608 + }, + { + "epoch": 0.59, + "grad_norm": 0.6930537273041669, + "learning_rate": 8.28584091306251e-06, + "loss": 0.5415, + "step": 4609 + }, + { + "epoch": 0.59, + "grad_norm": 0.8121738648037318, + "learning_rate": 8.285063291241342e-06, + "loss": 0.6349, + "step": 4610 + }, + { + "epoch": 0.59, + "grad_norm": 0.8005553728820987, + "learning_rate": 8.284285529585526e-06, + "loss": 0.6458, + "step": 4611 + }, + { + "epoch": 0.59, + "grad_norm": 0.6713183728884552, + "learning_rate": 8.283507628128172e-06, + "loss": 0.5637, + "step": 4612 + }, + { + "epoch": 0.59, + "grad_norm": 0.5326842064130279, + "learning_rate": 8.282729586902389e-06, + "loss": 0.5141, + "step": 4613 + }, + { + "epoch": 0.59, + "grad_norm": 0.7089873735358927, + "learning_rate": 8.281951405941298e-06, + "loss": 0.5704, + "step": 4614 + }, + { + "epoch": 0.59, + "grad_norm": 0.748539003985585, + "learning_rate": 8.281173085278024e-06, + "loss": 0.5747, + "step": 4615 + }, + { + "epoch": 0.59, + "grad_norm": 0.768531314502041, + "learning_rate": 8.280394624945697e-06, + "loss": 0.5113, + "step": 4616 + }, + { + "epoch": 0.59, + "grad_norm": 0.596118209725609, + "learning_rate": 8.279616024977453e-06, + "loss": 0.5245, + "step": 4617 + }, + { + "epoch": 0.59, + "grad_norm": 0.8051177654560966, + "learning_rate": 8.278837285406436e-06, + "loss": 0.5446, + "step": 4618 + }, + { + "epoch": 0.59, + "grad_norm": 0.5759237485926437, + "learning_rate": 8.278058406265792e-06, + "loss": 0.51, + "step": 4619 + }, + { + "epoch": 0.59, + "grad_norm": 0.6625567295722475, + "learning_rate": 8.277279387588677e-06, + "loss": 0.6042, + "step": 4620 + }, + { + "epoch": 0.59, + "grad_norm": 0.6024807647703426, + "learning_rate": 8.276500229408251e-06, + "loss": 0.5292, + "step": 4621 + }, + { + "epoch": 0.59, + "grad_norm": 0.7874690897198057, + "learning_rate": 8.27572093175768e-06, + "loss": 0.6525, + "step": 4622 + }, + { + "epoch": 0.59, + "grad_norm": 0.6261995981712538, + "learning_rate": 8.27494149467014e-06, + "loss": 0.5347, + "step": 4623 + }, + { + "epoch": 0.59, + "grad_norm": 0.7515888419419297, + "learning_rate": 8.274161918178803e-06, + "loss": 0.6693, + "step": 4624 + }, + { + "epoch": 0.59, + "grad_norm": 0.6524185402631865, + "learning_rate": 8.273382202316855e-06, + "loss": 0.5265, + "step": 4625 + }, + { + "epoch": 0.59, + "grad_norm": 0.6747795621916263, + "learning_rate": 8.272602347117488e-06, + "loss": 0.5475, + "step": 4626 + }, + { + "epoch": 0.59, + "grad_norm": 0.7020670020957112, + "learning_rate": 8.271822352613896e-06, + "loss": 0.6218, + "step": 4627 + }, + { + "epoch": 0.59, + "grad_norm": 0.5411099911561422, + "learning_rate": 8.271042218839284e-06, + "loss": 0.5111, + "step": 4628 + }, + { + "epoch": 0.59, + "grad_norm": 0.5493993465983869, + "learning_rate": 8.270261945826855e-06, + "loss": 0.5359, + "step": 4629 + }, + { + "epoch": 0.59, + "grad_norm": 0.792438775915959, + "learning_rate": 8.269481533609827e-06, + "loss": 0.6317, + "step": 4630 + }, + { + "epoch": 0.59, + "grad_norm": 0.8024391520749959, + "learning_rate": 8.268700982221416e-06, + "loss": 0.6572, + "step": 4631 + }, + { + "epoch": 0.59, + "grad_norm": 0.9578000117038373, + "learning_rate": 8.267920291694851e-06, + "loss": 0.5122, + "step": 4632 + }, + { + "epoch": 0.59, + "grad_norm": 0.7981641600501669, + "learning_rate": 8.267139462063363e-06, + "loss": 0.6319, + "step": 4633 + }, + { + "epoch": 0.59, + "grad_norm": 0.8111421228066134, + "learning_rate": 8.266358493360186e-06, + "loss": 0.6145, + "step": 4634 + }, + { + "epoch": 0.59, + "grad_norm": 0.5781219843472877, + "learning_rate": 8.265577385618566e-06, + "loss": 0.5159, + "step": 4635 + }, + { + "epoch": 0.59, + "grad_norm": 0.6945359278874309, + "learning_rate": 8.264796138871753e-06, + "loss": 0.5012, + "step": 4636 + }, + { + "epoch": 0.59, + "grad_norm": 0.6453162545353781, + "learning_rate": 8.264014753153e-06, + "loss": 0.5841, + "step": 4637 + }, + { + "epoch": 0.59, + "grad_norm": 0.666539692458954, + "learning_rate": 8.26323322849557e-06, + "loss": 0.5674, + "step": 4638 + }, + { + "epoch": 0.59, + "grad_norm": 0.5924871182551501, + "learning_rate": 8.262451564932729e-06, + "loss": 0.5025, + "step": 4639 + }, + { + "epoch": 0.59, + "grad_norm": 0.6339368540742805, + "learning_rate": 8.261669762497752e-06, + "loss": 0.5289, + "step": 4640 + }, + { + "epoch": 0.59, + "grad_norm": 0.7127930839254408, + "learning_rate": 8.260887821223915e-06, + "loss": 0.5792, + "step": 4641 + }, + { + "epoch": 0.59, + "grad_norm": 0.7262930078915326, + "learning_rate": 8.260105741144502e-06, + "loss": 0.5962, + "step": 4642 + }, + { + "epoch": 0.59, + "grad_norm": 0.5935238702267821, + "learning_rate": 8.259323522292808e-06, + "loss": 0.4399, + "step": 4643 + }, + { + "epoch": 0.59, + "grad_norm": 0.6121339348837413, + "learning_rate": 8.258541164702126e-06, + "loss": 0.5426, + "step": 4644 + }, + { + "epoch": 0.59, + "grad_norm": 0.824277049920599, + "learning_rate": 8.257758668405758e-06, + "loss": 0.6448, + "step": 4645 + }, + { + "epoch": 0.59, + "grad_norm": 0.5560645771345775, + "learning_rate": 8.256976033437015e-06, + "loss": 0.5562, + "step": 4646 + }, + { + "epoch": 0.59, + "grad_norm": 0.6901256974527458, + "learning_rate": 8.25619325982921e-06, + "loss": 0.5405, + "step": 4647 + }, + { + "epoch": 0.59, + "grad_norm": 0.779128310073951, + "learning_rate": 8.255410347615663e-06, + "loss": 0.6679, + "step": 4648 + }, + { + "epoch": 0.59, + "grad_norm": 0.6752717566763254, + "learning_rate": 8.2546272968297e-06, + "loss": 0.5654, + "step": 4649 + }, + { + "epoch": 0.59, + "grad_norm": 0.8439658033365739, + "learning_rate": 8.253844107504653e-06, + "loss": 0.6299, + "step": 4650 + }, + { + "epoch": 0.59, + "grad_norm": 0.8188284309297219, + "learning_rate": 8.253060779673863e-06, + "loss": 0.6453, + "step": 4651 + }, + { + "epoch": 0.59, + "grad_norm": 0.7889139822860217, + "learning_rate": 8.252277313370668e-06, + "loss": 0.6143, + "step": 4652 + }, + { + "epoch": 0.59, + "grad_norm": 0.6781967242411078, + "learning_rate": 8.251493708628422e-06, + "loss": 0.5723, + "step": 4653 + }, + { + "epoch": 0.59, + "grad_norm": 1.1367495885264813, + "learning_rate": 8.250709965480478e-06, + "loss": 0.6713, + "step": 4654 + }, + { + "epoch": 0.59, + "grad_norm": 0.5367243516615912, + "learning_rate": 8.2499260839602e-06, + "loss": 0.537, + "step": 4655 + }, + { + "epoch": 0.59, + "grad_norm": 0.7737990284411801, + "learning_rate": 8.249142064100952e-06, + "loss": 0.5988, + "step": 4656 + }, + { + "epoch": 0.59, + "grad_norm": 0.6965216708231662, + "learning_rate": 8.248357905936111e-06, + "loss": 0.566, + "step": 4657 + }, + { + "epoch": 0.59, + "grad_norm": 0.7603000484519947, + "learning_rate": 8.247573609499054e-06, + "loss": 0.5566, + "step": 4658 + }, + { + "epoch": 0.59, + "grad_norm": 0.8556547495436599, + "learning_rate": 8.246789174823164e-06, + "loss": 0.6139, + "step": 4659 + }, + { + "epoch": 0.59, + "grad_norm": 0.8039671031347132, + "learning_rate": 8.246004601941834e-06, + "loss": 0.6099, + "step": 4660 + }, + { + "epoch": 0.59, + "grad_norm": 0.7857399076760495, + "learning_rate": 8.245219890888463e-06, + "loss": 0.6252, + "step": 4661 + }, + { + "epoch": 0.59, + "grad_norm": 0.6986999667506865, + "learning_rate": 8.244435041696453e-06, + "loss": 0.5901, + "step": 4662 + }, + { + "epoch": 0.59, + "grad_norm": 0.6663399209358067, + "learning_rate": 8.243650054399209e-06, + "loss": 0.5309, + "step": 4663 + }, + { + "epoch": 0.59, + "grad_norm": 0.6288798647661501, + "learning_rate": 8.242864929030145e-06, + "loss": 0.5573, + "step": 4664 + }, + { + "epoch": 0.59, + "grad_norm": 0.7463076296848864, + "learning_rate": 8.242079665622686e-06, + "loss": 0.5377, + "step": 4665 + }, + { + "epoch": 0.59, + "grad_norm": 0.8739340742472499, + "learning_rate": 8.241294264210256e-06, + "loss": 0.6134, + "step": 4666 + }, + { + "epoch": 0.59, + "grad_norm": 0.777090426151362, + "learning_rate": 8.240508724826287e-06, + "loss": 0.5906, + "step": 4667 + }, + { + "epoch": 0.59, + "grad_norm": 0.699591831918812, + "learning_rate": 8.239723047504216e-06, + "loss": 0.5446, + "step": 4668 + }, + { + "epoch": 0.59, + "grad_norm": 0.5941513675925244, + "learning_rate": 8.23893723227749e-06, + "loss": 0.5582, + "step": 4669 + }, + { + "epoch": 0.59, + "grad_norm": 0.8144842680043904, + "learning_rate": 8.238151279179553e-06, + "loss": 0.6324, + "step": 4670 + }, + { + "epoch": 0.6, + "grad_norm": 0.785511863474483, + "learning_rate": 8.237365188243865e-06, + "loss": 0.6472, + "step": 4671 + }, + { + "epoch": 0.6, + "grad_norm": 0.7463472417078749, + "learning_rate": 8.236578959503884e-06, + "loss": 0.6375, + "step": 4672 + }, + { + "epoch": 0.6, + "grad_norm": 0.6446009794678811, + "learning_rate": 8.235792592993082e-06, + "loss": 0.5261, + "step": 4673 + }, + { + "epoch": 0.6, + "grad_norm": 0.6573585926396419, + "learning_rate": 8.235006088744929e-06, + "loss": 0.5246, + "step": 4674 + }, + { + "epoch": 0.6, + "grad_norm": 0.6582561884637631, + "learning_rate": 8.234219446792904e-06, + "loss": 0.5402, + "step": 4675 + }, + { + "epoch": 0.6, + "grad_norm": 0.7633191541450485, + "learning_rate": 8.233432667170494e-06, + "loss": 0.5676, + "step": 4676 + }, + { + "epoch": 0.6, + "grad_norm": 0.7939556371781813, + "learning_rate": 8.232645749911185e-06, + "loss": 0.6211, + "step": 4677 + }, + { + "epoch": 0.6, + "grad_norm": 1.008517296327773, + "learning_rate": 8.231858695048479e-06, + "loss": 0.6066, + "step": 4678 + }, + { + "epoch": 0.6, + "grad_norm": 0.5938877523332594, + "learning_rate": 8.231071502615873e-06, + "loss": 0.5422, + "step": 4679 + }, + { + "epoch": 0.6, + "grad_norm": 0.5268837938765629, + "learning_rate": 8.230284172646881e-06, + "loss": 0.5053, + "step": 4680 + }, + { + "epoch": 0.6, + "grad_norm": 0.5833814210144035, + "learning_rate": 8.229496705175012e-06, + "loss": 0.5064, + "step": 4681 + }, + { + "epoch": 0.6, + "grad_norm": 0.5674897173502002, + "learning_rate": 8.228709100233789e-06, + "loss": 0.5552, + "step": 4682 + }, + { + "epoch": 0.6, + "grad_norm": 0.5026327757855346, + "learning_rate": 8.227921357856738e-06, + "loss": 0.463, + "step": 4683 + }, + { + "epoch": 0.6, + "grad_norm": 0.6045577676720874, + "learning_rate": 8.22713347807739e-06, + "loss": 0.4916, + "step": 4684 + }, + { + "epoch": 0.6, + "grad_norm": 0.6782690097154433, + "learning_rate": 8.22634546092928e-06, + "loss": 0.5303, + "step": 4685 + }, + { + "epoch": 0.6, + "grad_norm": 0.6143614157222634, + "learning_rate": 8.225557306445956e-06, + "loss": 0.5555, + "step": 4686 + }, + { + "epoch": 0.6, + "grad_norm": 0.768168823360345, + "learning_rate": 8.224769014660964e-06, + "loss": 0.5589, + "step": 4687 + }, + { + "epoch": 0.6, + "grad_norm": 0.6340062847522439, + "learning_rate": 8.223980585607861e-06, + "loss": 0.5835, + "step": 4688 + }, + { + "epoch": 0.6, + "grad_norm": 0.6927697622048208, + "learning_rate": 8.223192019320206e-06, + "loss": 0.5845, + "step": 4689 + }, + { + "epoch": 0.6, + "grad_norm": 0.5803929956917045, + "learning_rate": 8.222403315831565e-06, + "loss": 0.5398, + "step": 4690 + }, + { + "epoch": 0.6, + "grad_norm": 0.6620576144270319, + "learning_rate": 8.221614475175511e-06, + "loss": 0.5646, + "step": 4691 + }, + { + "epoch": 0.6, + "grad_norm": 0.6485821328922843, + "learning_rate": 8.220825497385628e-06, + "loss": 0.5433, + "step": 4692 + }, + { + "epoch": 0.6, + "grad_norm": 0.758682620589759, + "learning_rate": 8.220036382495494e-06, + "loss": 0.5635, + "step": 4693 + }, + { + "epoch": 0.6, + "grad_norm": 0.5472629883164765, + "learning_rate": 8.219247130538702e-06, + "loss": 0.51, + "step": 4694 + }, + { + "epoch": 0.6, + "grad_norm": 0.7145393761055885, + "learning_rate": 8.218457741548846e-06, + "loss": 0.536, + "step": 4695 + }, + { + "epoch": 0.6, + "grad_norm": 0.5911447726164596, + "learning_rate": 8.217668215559528e-06, + "loss": 0.5062, + "step": 4696 + }, + { + "epoch": 0.6, + "grad_norm": 0.8202246234094444, + "learning_rate": 8.216878552604356e-06, + "loss": 0.6482, + "step": 4697 + }, + { + "epoch": 0.6, + "grad_norm": 0.7048611594369557, + "learning_rate": 8.216088752716945e-06, + "loss": 0.5861, + "step": 4698 + }, + { + "epoch": 0.6, + "grad_norm": 0.5855694900257165, + "learning_rate": 8.215298815930912e-06, + "loss": 0.5338, + "step": 4699 + }, + { + "epoch": 0.6, + "grad_norm": 0.7717944753042869, + "learning_rate": 8.214508742279884e-06, + "loss": 0.661, + "step": 4700 + }, + { + "epoch": 0.6, + "grad_norm": 0.5407674945319926, + "learning_rate": 8.21371853179749e-06, + "loss": 0.5083, + "step": 4701 + }, + { + "epoch": 0.6, + "grad_norm": 0.7767660691205729, + "learning_rate": 8.212928184517368e-06, + "loss": 0.5481, + "step": 4702 + }, + { + "epoch": 0.6, + "grad_norm": 0.5305472202996602, + "learning_rate": 8.212137700473159e-06, + "loss": 0.4873, + "step": 4703 + }, + { + "epoch": 0.6, + "grad_norm": 0.8007867522899832, + "learning_rate": 8.211347079698515e-06, + "loss": 0.6342, + "step": 4704 + }, + { + "epoch": 0.6, + "grad_norm": 0.5979449171900671, + "learning_rate": 8.210556322227087e-06, + "loss": 0.5908, + "step": 4705 + }, + { + "epoch": 0.6, + "grad_norm": 0.7231031042642851, + "learning_rate": 8.209765428092535e-06, + "loss": 0.5683, + "step": 4706 + }, + { + "epoch": 0.6, + "grad_norm": 0.7289049404047805, + "learning_rate": 8.208974397328527e-06, + "loss": 0.6381, + "step": 4707 + }, + { + "epoch": 0.6, + "grad_norm": 0.7236172218250517, + "learning_rate": 8.208183229968731e-06, + "loss": 0.5567, + "step": 4708 + }, + { + "epoch": 0.6, + "grad_norm": 0.7738967297665412, + "learning_rate": 8.20739192604683e-06, + "loss": 0.5994, + "step": 4709 + }, + { + "epoch": 0.6, + "grad_norm": 0.7728467453810064, + "learning_rate": 8.206600485596501e-06, + "loss": 0.6049, + "step": 4710 + }, + { + "epoch": 0.6, + "grad_norm": 0.8767782367508747, + "learning_rate": 8.205808908651439e-06, + "loss": 0.6213, + "step": 4711 + }, + { + "epoch": 0.6, + "grad_norm": 0.7429710637834168, + "learning_rate": 8.205017195245333e-06, + "loss": 0.6152, + "step": 4712 + }, + { + "epoch": 0.6, + "grad_norm": 0.5897784338241513, + "learning_rate": 8.20422534541189e-06, + "loss": 0.5291, + "step": 4713 + }, + { + "epoch": 0.6, + "grad_norm": 0.5771387514393338, + "learning_rate": 8.203433359184811e-06, + "loss": 0.5108, + "step": 4714 + }, + { + "epoch": 0.6, + "grad_norm": 0.8162661306598429, + "learning_rate": 8.20264123659781e-06, + "loss": 0.5798, + "step": 4715 + }, + { + "epoch": 0.6, + "grad_norm": 0.6520075500920509, + "learning_rate": 8.201848977684608e-06, + "loss": 0.5716, + "step": 4716 + }, + { + "epoch": 0.6, + "grad_norm": 0.9540425134830905, + "learning_rate": 8.201056582478926e-06, + "loss": 0.6701, + "step": 4717 + }, + { + "epoch": 0.6, + "grad_norm": 0.5588207886438106, + "learning_rate": 8.200264051014494e-06, + "loss": 0.5205, + "step": 4718 + }, + { + "epoch": 0.6, + "grad_norm": 1.0424848989956124, + "learning_rate": 8.199471383325049e-06, + "loss": 0.6383, + "step": 4719 + }, + { + "epoch": 0.6, + "grad_norm": 0.6467697823264894, + "learning_rate": 8.19867857944433e-06, + "loss": 0.5593, + "step": 4720 + }, + { + "epoch": 0.6, + "grad_norm": 0.8636816125209708, + "learning_rate": 8.197885639406085e-06, + "loss": 0.664, + "step": 4721 + }, + { + "epoch": 0.6, + "grad_norm": 0.7121787750414755, + "learning_rate": 8.197092563244067e-06, + "loss": 0.6414, + "step": 4722 + }, + { + "epoch": 0.6, + "grad_norm": 0.5867722333401326, + "learning_rate": 8.196299350992036e-06, + "loss": 0.5231, + "step": 4723 + }, + { + "epoch": 0.6, + "grad_norm": 0.8165973481902713, + "learning_rate": 8.195506002683756e-06, + "loss": 0.6644, + "step": 4724 + }, + { + "epoch": 0.6, + "grad_norm": 0.9932537819337868, + "learning_rate": 8.194712518352997e-06, + "loss": 0.5714, + "step": 4725 + }, + { + "epoch": 0.6, + "grad_norm": 0.7202564353050699, + "learning_rate": 8.193918898033533e-06, + "loss": 0.6067, + "step": 4726 + }, + { + "epoch": 0.6, + "grad_norm": 0.6208630082852726, + "learning_rate": 8.19312514175915e-06, + "loss": 0.5114, + "step": 4727 + }, + { + "epoch": 0.6, + "grad_norm": 0.7230895260903912, + "learning_rate": 8.192331249563632e-06, + "loss": 0.6077, + "step": 4728 + }, + { + "epoch": 0.6, + "grad_norm": 0.5531853516276057, + "learning_rate": 8.191537221480775e-06, + "loss": 0.4787, + "step": 4729 + }, + { + "epoch": 0.6, + "grad_norm": 0.5858968568748244, + "learning_rate": 8.190743057544376e-06, + "loss": 0.5306, + "step": 4730 + }, + { + "epoch": 0.6, + "grad_norm": 0.76533443112363, + "learning_rate": 8.189948757788242e-06, + "loss": 0.6236, + "step": 4731 + }, + { + "epoch": 0.6, + "grad_norm": 0.6381047834579116, + "learning_rate": 8.189154322246184e-06, + "loss": 0.521, + "step": 4732 + }, + { + "epoch": 0.6, + "grad_norm": 0.8094782169375087, + "learning_rate": 8.188359750952015e-06, + "loss": 0.622, + "step": 4733 + }, + { + "epoch": 0.6, + "grad_norm": 0.6167793767933879, + "learning_rate": 8.187565043939562e-06, + "loss": 0.5403, + "step": 4734 + }, + { + "epoch": 0.6, + "grad_norm": 0.8147174576845679, + "learning_rate": 8.18677020124265e-06, + "loss": 0.6379, + "step": 4735 + }, + { + "epoch": 0.6, + "grad_norm": 1.14250842422233, + "learning_rate": 8.185975222895115e-06, + "loss": 0.6127, + "step": 4736 + }, + { + "epoch": 0.6, + "grad_norm": 0.6121115758075073, + "learning_rate": 8.185180108930795e-06, + "loss": 0.5385, + "step": 4737 + }, + { + "epoch": 0.6, + "grad_norm": 0.7378590011021329, + "learning_rate": 8.184384859383537e-06, + "loss": 0.5876, + "step": 4738 + }, + { + "epoch": 0.6, + "grad_norm": 0.8116117048304031, + "learning_rate": 8.18358947428719e-06, + "loss": 0.6028, + "step": 4739 + }, + { + "epoch": 0.6, + "grad_norm": 0.7827381595688337, + "learning_rate": 8.182793953675613e-06, + "loss": 0.5824, + "step": 4740 + }, + { + "epoch": 0.6, + "grad_norm": 0.7529424867561774, + "learning_rate": 8.181998297582668e-06, + "loss": 0.6171, + "step": 4741 + }, + { + "epoch": 0.6, + "grad_norm": 0.8637249480518574, + "learning_rate": 8.181202506042224e-06, + "loss": 0.6406, + "step": 4742 + }, + { + "epoch": 0.6, + "grad_norm": 0.84913266113476, + "learning_rate": 8.180406579088154e-06, + "loss": 0.5968, + "step": 4743 + }, + { + "epoch": 0.6, + "grad_norm": 0.8594606251697315, + "learning_rate": 8.179610516754341e-06, + "loss": 0.6496, + "step": 4744 + }, + { + "epoch": 0.6, + "grad_norm": 0.5428580741176288, + "learning_rate": 8.178814319074669e-06, + "loss": 0.5436, + "step": 4745 + }, + { + "epoch": 0.6, + "grad_norm": 0.6550157569932077, + "learning_rate": 8.178017986083026e-06, + "loss": 0.5443, + "step": 4746 + }, + { + "epoch": 0.6, + "grad_norm": 0.7766491928055453, + "learning_rate": 8.177221517813317e-06, + "loss": 0.5871, + "step": 4747 + }, + { + "epoch": 0.6, + "grad_norm": 0.7229487511566859, + "learning_rate": 8.17642491429944e-06, + "loss": 0.6646, + "step": 4748 + }, + { + "epoch": 0.61, + "grad_norm": 0.8465769643127085, + "learning_rate": 8.175628175575303e-06, + "loss": 0.6456, + "step": 4749 + }, + { + "epoch": 0.61, + "grad_norm": 0.9591417564551952, + "learning_rate": 8.174831301674823e-06, + "loss": 0.6841, + "step": 4750 + }, + { + "epoch": 0.61, + "grad_norm": 0.6584652317916735, + "learning_rate": 8.17403429263192e-06, + "loss": 0.5447, + "step": 4751 + }, + { + "epoch": 0.61, + "grad_norm": 0.6351246382884953, + "learning_rate": 8.17323714848052e-06, + "loss": 0.4827, + "step": 4752 + }, + { + "epoch": 0.61, + "grad_norm": 0.6794344882886584, + "learning_rate": 8.172439869254553e-06, + "loss": 0.5947, + "step": 4753 + }, + { + "epoch": 0.61, + "grad_norm": 0.6075077998455788, + "learning_rate": 8.171642454987962e-06, + "loss": 0.59, + "step": 4754 + }, + { + "epoch": 0.61, + "grad_norm": 0.6062427020311038, + "learning_rate": 8.170844905714684e-06, + "loss": 0.5528, + "step": 4755 + }, + { + "epoch": 0.61, + "grad_norm": 0.7944220427103551, + "learning_rate": 8.170047221468673e-06, + "loss": 0.6177, + "step": 4756 + }, + { + "epoch": 0.61, + "grad_norm": 0.5500138044856991, + "learning_rate": 8.16924940228388e-06, + "loss": 0.4603, + "step": 4757 + }, + { + "epoch": 0.61, + "grad_norm": 0.7967601294356719, + "learning_rate": 8.16845144819427e-06, + "loss": 0.6302, + "step": 4758 + }, + { + "epoch": 0.61, + "grad_norm": 0.5609469815530869, + "learning_rate": 8.167653359233803e-06, + "loss": 0.5194, + "step": 4759 + }, + { + "epoch": 0.61, + "grad_norm": 0.6899560275405495, + "learning_rate": 8.166855135436458e-06, + "loss": 0.5395, + "step": 4760 + }, + { + "epoch": 0.61, + "grad_norm": 0.640058892806277, + "learning_rate": 8.166056776836207e-06, + "loss": 0.5185, + "step": 4761 + }, + { + "epoch": 0.61, + "grad_norm": 0.7415652042202203, + "learning_rate": 8.16525828346704e-06, + "loss": 0.6, + "step": 4762 + }, + { + "epoch": 0.61, + "grad_norm": 0.7140960492609035, + "learning_rate": 8.16445965536294e-06, + "loss": 0.5209, + "step": 4763 + }, + { + "epoch": 0.61, + "grad_norm": 0.6429796093746752, + "learning_rate": 8.163660892557904e-06, + "loss": 0.5481, + "step": 4764 + }, + { + "epoch": 0.61, + "grad_norm": 0.6797984855087086, + "learning_rate": 8.162861995085934e-06, + "loss": 0.571, + "step": 4765 + }, + { + "epoch": 0.61, + "grad_norm": 0.6390299248708973, + "learning_rate": 8.162062962981036e-06, + "loss": 0.5633, + "step": 4766 + }, + { + "epoch": 0.61, + "grad_norm": 0.5368528206457558, + "learning_rate": 8.161263796277223e-06, + "loss": 0.4686, + "step": 4767 + }, + { + "epoch": 0.61, + "grad_norm": 0.9471618086438857, + "learning_rate": 8.160464495008512e-06, + "loss": 0.7009, + "step": 4768 + }, + { + "epoch": 0.61, + "grad_norm": 0.6900754661852971, + "learning_rate": 8.159665059208926e-06, + "loss": 0.563, + "step": 4769 + }, + { + "epoch": 0.61, + "grad_norm": 0.6665293492580482, + "learning_rate": 8.158865488912498e-06, + "loss": 0.5591, + "step": 4770 + }, + { + "epoch": 0.61, + "grad_norm": 0.6719251737210051, + "learning_rate": 8.158065784153258e-06, + "loss": 0.5602, + "step": 4771 + }, + { + "epoch": 0.61, + "grad_norm": 0.6652603466476381, + "learning_rate": 8.15726594496525e-06, + "loss": 0.5329, + "step": 4772 + }, + { + "epoch": 0.61, + "grad_norm": 0.6329341996750119, + "learning_rate": 8.156465971382518e-06, + "loss": 0.5747, + "step": 4773 + }, + { + "epoch": 0.61, + "grad_norm": 0.7178080928686216, + "learning_rate": 8.155665863439118e-06, + "loss": 0.5932, + "step": 4774 + }, + { + "epoch": 0.61, + "grad_norm": 0.5875864628155599, + "learning_rate": 8.154865621169106e-06, + "loss": 0.5702, + "step": 4775 + }, + { + "epoch": 0.61, + "grad_norm": 0.703005532628864, + "learning_rate": 8.154065244606547e-06, + "loss": 0.5411, + "step": 4776 + }, + { + "epoch": 0.61, + "grad_norm": 0.7231004309489556, + "learning_rate": 8.153264733785506e-06, + "loss": 0.6325, + "step": 4777 + }, + { + "epoch": 0.61, + "grad_norm": 0.5795419795889204, + "learning_rate": 8.152464088740066e-06, + "loss": 0.5075, + "step": 4778 + }, + { + "epoch": 0.61, + "grad_norm": 0.975411624673703, + "learning_rate": 8.151663309504301e-06, + "loss": 0.6291, + "step": 4779 + }, + { + "epoch": 0.61, + "grad_norm": 0.6978743100613516, + "learning_rate": 8.1508623961123e-06, + "loss": 0.5392, + "step": 4780 + }, + { + "epoch": 0.61, + "grad_norm": 0.7707502929035336, + "learning_rate": 8.150061348598158e-06, + "loss": 0.6428, + "step": 4781 + }, + { + "epoch": 0.61, + "grad_norm": 0.7465758153972254, + "learning_rate": 8.14926016699597e-06, + "loss": 0.55, + "step": 4782 + }, + { + "epoch": 0.61, + "grad_norm": 0.6830603107167149, + "learning_rate": 8.148458851339837e-06, + "loss": 0.5929, + "step": 4783 + }, + { + "epoch": 0.61, + "grad_norm": 0.6912109698268275, + "learning_rate": 8.147657401663875e-06, + "loss": 0.5323, + "step": 4784 + }, + { + "epoch": 0.61, + "grad_norm": 0.6295671870593303, + "learning_rate": 8.146855818002194e-06, + "loss": 0.547, + "step": 4785 + }, + { + "epoch": 0.61, + "grad_norm": 0.5388794576131567, + "learning_rate": 8.146054100388917e-06, + "loss": 0.497, + "step": 4786 + }, + { + "epoch": 0.61, + "grad_norm": 0.561849712112106, + "learning_rate": 8.145252248858172e-06, + "loss": 0.4933, + "step": 4787 + }, + { + "epoch": 0.61, + "grad_norm": 0.6238080545111447, + "learning_rate": 8.144450263444089e-06, + "loss": 0.4867, + "step": 4788 + }, + { + "epoch": 0.61, + "grad_norm": 0.6100209454034814, + "learning_rate": 8.143648144180804e-06, + "loss": 0.5575, + "step": 4789 + }, + { + "epoch": 0.61, + "grad_norm": 0.6869051626211036, + "learning_rate": 8.142845891102466e-06, + "loss": 0.5246, + "step": 4790 + }, + { + "epoch": 0.61, + "grad_norm": 0.7185336470393379, + "learning_rate": 8.14204350424322e-06, + "loss": 0.5567, + "step": 4791 + }, + { + "epoch": 0.61, + "grad_norm": 0.7722515859289103, + "learning_rate": 8.141240983637224e-06, + "loss": 0.5818, + "step": 4792 + }, + { + "epoch": 0.61, + "grad_norm": 0.7579152851004997, + "learning_rate": 8.140438329318636e-06, + "loss": 0.6249, + "step": 4793 + }, + { + "epoch": 0.61, + "grad_norm": 0.718007431908234, + "learning_rate": 8.139635541321624e-06, + "loss": 0.5953, + "step": 4794 + }, + { + "epoch": 0.61, + "grad_norm": 0.6915713114399602, + "learning_rate": 8.138832619680361e-06, + "loss": 0.5483, + "step": 4795 + }, + { + "epoch": 0.61, + "grad_norm": 0.826928163300088, + "learning_rate": 8.138029564429022e-06, + "loss": 0.6276, + "step": 4796 + }, + { + "epoch": 0.61, + "grad_norm": 0.6821590812335693, + "learning_rate": 8.137226375601793e-06, + "loss": 0.5521, + "step": 4797 + }, + { + "epoch": 0.61, + "grad_norm": 0.7539433385643475, + "learning_rate": 8.136423053232863e-06, + "loss": 0.6502, + "step": 4798 + }, + { + "epoch": 0.61, + "grad_norm": 0.8280041133982782, + "learning_rate": 8.135619597356426e-06, + "loss": 0.5772, + "step": 4799 + }, + { + "epoch": 0.61, + "grad_norm": 0.7759999067614988, + "learning_rate": 8.134816008006681e-06, + "loss": 0.6089, + "step": 4800 + }, + { + "epoch": 0.61, + "grad_norm": 0.5967147981107758, + "learning_rate": 8.134012285217838e-06, + "loss": 0.5352, + "step": 4801 + }, + { + "epoch": 0.61, + "grad_norm": 0.605268819827656, + "learning_rate": 8.133208429024106e-06, + "loss": 0.5731, + "step": 4802 + }, + { + "epoch": 0.61, + "grad_norm": 0.7784713763142074, + "learning_rate": 8.132404439459702e-06, + "loss": 0.5946, + "step": 4803 + }, + { + "epoch": 0.61, + "grad_norm": 0.6422953028824958, + "learning_rate": 8.131600316558852e-06, + "loss": 0.5425, + "step": 4804 + }, + { + "epoch": 0.61, + "grad_norm": 0.9488262430014444, + "learning_rate": 8.130796060355784e-06, + "loss": 0.5849, + "step": 4805 + }, + { + "epoch": 0.61, + "grad_norm": 0.760895490255571, + "learning_rate": 8.129991670884733e-06, + "loss": 0.6297, + "step": 4806 + }, + { + "epoch": 0.61, + "grad_norm": 0.797257959647494, + "learning_rate": 8.129187148179938e-06, + "loss": 0.5763, + "step": 4807 + }, + { + "epoch": 0.61, + "grad_norm": 0.6082905580279685, + "learning_rate": 8.128382492275644e-06, + "loss": 0.5502, + "step": 4808 + }, + { + "epoch": 0.61, + "grad_norm": 0.6848614964915353, + "learning_rate": 8.127577703206106e-06, + "loss": 0.6208, + "step": 4809 + }, + { + "epoch": 0.61, + "grad_norm": 0.9584029166809982, + "learning_rate": 8.12677278100558e-06, + "loss": 0.6336, + "step": 4810 + }, + { + "epoch": 0.61, + "grad_norm": 0.7183629341738105, + "learning_rate": 8.125967725708328e-06, + "loss": 0.5912, + "step": 4811 + }, + { + "epoch": 0.61, + "grad_norm": 0.6785159993765555, + "learning_rate": 8.125162537348619e-06, + "loss": 0.5499, + "step": 4812 + }, + { + "epoch": 0.61, + "grad_norm": 0.8716339175559804, + "learning_rate": 8.124357215960727e-06, + "loss": 0.6399, + "step": 4813 + }, + { + "epoch": 0.61, + "grad_norm": 0.6433837587231448, + "learning_rate": 8.123551761578931e-06, + "loss": 0.5176, + "step": 4814 + }, + { + "epoch": 0.61, + "grad_norm": 0.6443099290245619, + "learning_rate": 8.122746174237518e-06, + "loss": 0.5517, + "step": 4815 + }, + { + "epoch": 0.61, + "grad_norm": 0.8781899269543718, + "learning_rate": 8.121940453970783e-06, + "loss": 0.6455, + "step": 4816 + }, + { + "epoch": 0.61, + "grad_norm": 0.6489200588777149, + "learning_rate": 8.121134600813015e-06, + "loss": 0.5344, + "step": 4817 + }, + { + "epoch": 0.61, + "grad_norm": 0.6396351274396583, + "learning_rate": 8.120328614798523e-06, + "loss": 0.5311, + "step": 4818 + }, + { + "epoch": 0.61, + "grad_norm": 0.7540203510791869, + "learning_rate": 8.119522495961612e-06, + "loss": 0.6351, + "step": 4819 + }, + { + "epoch": 0.61, + "grad_norm": 0.6818248485252133, + "learning_rate": 8.118716244336597e-06, + "loss": 0.5111, + "step": 4820 + }, + { + "epoch": 0.61, + "grad_norm": 0.6778605624073916, + "learning_rate": 8.117909859957798e-06, + "loss": 0.5092, + "step": 4821 + }, + { + "epoch": 0.61, + "grad_norm": 0.6924205642203811, + "learning_rate": 8.11710334285954e-06, + "loss": 0.5422, + "step": 4822 + }, + { + "epoch": 0.61, + "grad_norm": 0.5901394765861796, + "learning_rate": 8.116296693076154e-06, + "loss": 0.5939, + "step": 4823 + }, + { + "epoch": 0.61, + "grad_norm": 0.8388903654904268, + "learning_rate": 8.115489910641974e-06, + "loss": 0.6289, + "step": 4824 + }, + { + "epoch": 0.61, + "grad_norm": 0.6627781865118006, + "learning_rate": 8.114682995591345e-06, + "loss": 0.5172, + "step": 4825 + }, + { + "epoch": 0.61, + "grad_norm": 0.5900976791764801, + "learning_rate": 8.113875947958616e-06, + "loss": 0.5203, + "step": 4826 + }, + { + "epoch": 0.61, + "grad_norm": 0.6738291442510475, + "learning_rate": 8.113068767778134e-06, + "loss": 0.5559, + "step": 4827 + }, + { + "epoch": 0.62, + "grad_norm": 0.566831010030039, + "learning_rate": 8.112261455084265e-06, + "loss": 0.4868, + "step": 4828 + }, + { + "epoch": 0.62, + "grad_norm": 0.6321945141159364, + "learning_rate": 8.111454009911372e-06, + "loss": 0.563, + "step": 4829 + }, + { + "epoch": 0.62, + "grad_norm": 0.635862688232433, + "learning_rate": 8.110646432293824e-06, + "loss": 0.5428, + "step": 4830 + }, + { + "epoch": 0.62, + "grad_norm": 0.7577421420038001, + "learning_rate": 8.109838722265997e-06, + "loss": 0.6351, + "step": 4831 + }, + { + "epoch": 0.62, + "grad_norm": 0.7286074056901038, + "learning_rate": 8.109030879862274e-06, + "loss": 0.5856, + "step": 4832 + }, + { + "epoch": 0.62, + "grad_norm": 0.633054677055702, + "learning_rate": 8.108222905117042e-06, + "loss": 0.5816, + "step": 4833 + }, + { + "epoch": 0.62, + "grad_norm": 0.6770640871206239, + "learning_rate": 8.107414798064691e-06, + "loss": 0.5155, + "step": 4834 + }, + { + "epoch": 0.62, + "grad_norm": 0.7535503897137129, + "learning_rate": 8.106606558739625e-06, + "loss": 0.6265, + "step": 4835 + }, + { + "epoch": 0.62, + "grad_norm": 0.7024832634606328, + "learning_rate": 8.105798187176244e-06, + "loss": 0.553, + "step": 4836 + }, + { + "epoch": 0.62, + "grad_norm": 0.8754086666788395, + "learning_rate": 8.104989683408958e-06, + "loss": 0.633, + "step": 4837 + }, + { + "epoch": 0.62, + "grad_norm": 0.7669117862269665, + "learning_rate": 8.104181047472184e-06, + "loss": 0.6301, + "step": 4838 + }, + { + "epoch": 0.62, + "grad_norm": 0.9213269387667131, + "learning_rate": 8.103372279400341e-06, + "loss": 0.6266, + "step": 4839 + }, + { + "epoch": 0.62, + "grad_norm": 0.6657568430405255, + "learning_rate": 8.10256337922786e-06, + "loss": 0.5311, + "step": 4840 + }, + { + "epoch": 0.62, + "grad_norm": 0.7967500853464504, + "learning_rate": 8.101754346989168e-06, + "loss": 0.6059, + "step": 4841 + }, + { + "epoch": 0.62, + "grad_norm": 0.597754659123826, + "learning_rate": 8.100945182718705e-06, + "loss": 0.5488, + "step": 4842 + }, + { + "epoch": 0.62, + "grad_norm": 1.6263617389601803, + "learning_rate": 8.100135886450916e-06, + "loss": 0.651, + "step": 4843 + }, + { + "epoch": 0.62, + "grad_norm": 0.7260252159977802, + "learning_rate": 8.099326458220249e-06, + "loss": 0.5937, + "step": 4844 + }, + { + "epoch": 0.62, + "grad_norm": 1.2379358999520154, + "learning_rate": 8.098516898061159e-06, + "loss": 0.6083, + "step": 4845 + }, + { + "epoch": 0.62, + "grad_norm": 0.5925252654182341, + "learning_rate": 8.097707206008102e-06, + "loss": 0.5397, + "step": 4846 + }, + { + "epoch": 0.62, + "grad_norm": 0.6085251850615595, + "learning_rate": 8.096897382095553e-06, + "loss": 0.5117, + "step": 4847 + }, + { + "epoch": 0.62, + "grad_norm": 0.7328260826979824, + "learning_rate": 8.096087426357978e-06, + "loss": 0.6174, + "step": 4848 + }, + { + "epoch": 0.62, + "grad_norm": 0.6541158758973585, + "learning_rate": 8.095277338829853e-06, + "loss": 0.5069, + "step": 4849 + }, + { + "epoch": 0.62, + "grad_norm": 0.5777393348145821, + "learning_rate": 8.094467119545663e-06, + "loss": 0.5154, + "step": 4850 + }, + { + "epoch": 0.62, + "grad_norm": 0.7090133563575965, + "learning_rate": 8.0936567685399e-06, + "loss": 0.5341, + "step": 4851 + }, + { + "epoch": 0.62, + "grad_norm": 0.7579490527579456, + "learning_rate": 8.092846285847049e-06, + "loss": 0.6279, + "step": 4852 + }, + { + "epoch": 0.62, + "grad_norm": 0.6256002701802358, + "learning_rate": 8.092035671501617e-06, + "loss": 0.5118, + "step": 4853 + }, + { + "epoch": 0.62, + "grad_norm": 0.6212915649896393, + "learning_rate": 8.091224925538108e-06, + "loss": 0.5432, + "step": 4854 + }, + { + "epoch": 0.62, + "grad_norm": 0.5667830327736616, + "learning_rate": 8.090414047991031e-06, + "loss": 0.4996, + "step": 4855 + }, + { + "epoch": 0.62, + "grad_norm": 0.7373447686309222, + "learning_rate": 8.089603038894904e-06, + "loss": 0.5686, + "step": 4856 + }, + { + "epoch": 0.62, + "grad_norm": 0.9525439248404798, + "learning_rate": 8.08879189828425e-06, + "loss": 0.5795, + "step": 4857 + }, + { + "epoch": 0.62, + "grad_norm": 0.7785048846145941, + "learning_rate": 8.087980626193592e-06, + "loss": 0.6069, + "step": 4858 + }, + { + "epoch": 0.62, + "grad_norm": 0.6276731033295745, + "learning_rate": 8.08716922265747e-06, + "loss": 0.6157, + "step": 4859 + }, + { + "epoch": 0.62, + "grad_norm": 0.6201341741204739, + "learning_rate": 8.086357687710417e-06, + "loss": 0.5355, + "step": 4860 + }, + { + "epoch": 0.62, + "grad_norm": 0.8459649364744836, + "learning_rate": 8.08554602138698e-06, + "loss": 0.6384, + "step": 4861 + }, + { + "epoch": 0.62, + "grad_norm": 0.7000963566777608, + "learning_rate": 8.08473422372171e-06, + "loss": 0.6873, + "step": 4862 + }, + { + "epoch": 0.62, + "grad_norm": 0.6653633755268963, + "learning_rate": 8.08392229474916e-06, + "loss": 0.5585, + "step": 4863 + }, + { + "epoch": 0.62, + "grad_norm": 0.6239248567406012, + "learning_rate": 8.083110234503895e-06, + "loss": 0.5455, + "step": 4864 + }, + { + "epoch": 0.62, + "grad_norm": 0.7981524243795168, + "learning_rate": 8.082298043020478e-06, + "loss": 0.6161, + "step": 4865 + }, + { + "epoch": 0.62, + "grad_norm": 0.7162597540883014, + "learning_rate": 8.081485720333482e-06, + "loss": 0.5786, + "step": 4866 + }, + { + "epoch": 0.62, + "grad_norm": 0.6799110198446136, + "learning_rate": 8.080673266477488e-06, + "loss": 0.5397, + "step": 4867 + }, + { + "epoch": 0.62, + "grad_norm": 0.7345853282228795, + "learning_rate": 8.079860681487076e-06, + "loss": 0.59, + "step": 4868 + }, + { + "epoch": 0.62, + "grad_norm": 0.6141263424194727, + "learning_rate": 8.079047965396839e-06, + "loss": 0.5624, + "step": 4869 + }, + { + "epoch": 0.62, + "grad_norm": 0.5948313931876724, + "learning_rate": 8.078235118241365e-06, + "loss": 0.5591, + "step": 4870 + }, + { + "epoch": 0.62, + "grad_norm": 0.8005217658260888, + "learning_rate": 8.077422140055261e-06, + "loss": 0.6141, + "step": 4871 + }, + { + "epoch": 0.62, + "grad_norm": 0.5711097481074564, + "learning_rate": 8.07660903087313e-06, + "loss": 0.5394, + "step": 4872 + }, + { + "epoch": 0.62, + "grad_norm": 0.7340286153341028, + "learning_rate": 8.075795790729586e-06, + "loss": 0.6156, + "step": 4873 + }, + { + "epoch": 0.62, + "grad_norm": 0.7820995091205901, + "learning_rate": 8.074982419659244e-06, + "loss": 0.5836, + "step": 4874 + }, + { + "epoch": 0.62, + "grad_norm": 0.6910957990008223, + "learning_rate": 8.074168917696724e-06, + "loss": 0.5851, + "step": 4875 + }, + { + "epoch": 0.62, + "grad_norm": 0.8480003491441266, + "learning_rate": 8.073355284876658e-06, + "loss": 0.5989, + "step": 4876 + }, + { + "epoch": 0.62, + "grad_norm": 0.8737900651445702, + "learning_rate": 8.072541521233679e-06, + "loss": 0.6439, + "step": 4877 + }, + { + "epoch": 0.62, + "grad_norm": 0.8007813808033702, + "learning_rate": 8.071727626802424e-06, + "loss": 0.6284, + "step": 4878 + }, + { + "epoch": 0.62, + "grad_norm": 0.6936560572133352, + "learning_rate": 8.070913601617543e-06, + "loss": 0.5069, + "step": 4879 + }, + { + "epoch": 0.62, + "grad_norm": 0.7981425101843691, + "learning_rate": 8.07009944571368e-06, + "loss": 0.6347, + "step": 4880 + }, + { + "epoch": 0.62, + "grad_norm": 0.5779411834405241, + "learning_rate": 8.069285159125496e-06, + "loss": 0.5422, + "step": 4881 + }, + { + "epoch": 0.62, + "grad_norm": 0.8190092659255641, + "learning_rate": 8.068470741887651e-06, + "loss": 0.6454, + "step": 4882 + }, + { + "epoch": 0.62, + "grad_norm": 0.804109598707946, + "learning_rate": 8.067656194034811e-06, + "loss": 0.5844, + "step": 4883 + }, + { + "epoch": 0.62, + "grad_norm": 0.5820104262480262, + "learning_rate": 8.06684151560165e-06, + "loss": 0.5529, + "step": 4884 + }, + { + "epoch": 0.62, + "grad_norm": 0.6473168890837421, + "learning_rate": 8.066026706622847e-06, + "loss": 0.5917, + "step": 4885 + }, + { + "epoch": 0.62, + "grad_norm": 0.7896002476119041, + "learning_rate": 8.065211767133084e-06, + "loss": 0.6267, + "step": 4886 + }, + { + "epoch": 0.62, + "grad_norm": 0.8729663423341352, + "learning_rate": 8.06439669716705e-06, + "loss": 0.683, + "step": 4887 + }, + { + "epoch": 0.62, + "grad_norm": 0.8224710341287688, + "learning_rate": 8.063581496759443e-06, + "loss": 0.6236, + "step": 4888 + }, + { + "epoch": 0.62, + "grad_norm": 0.6662888533544024, + "learning_rate": 8.06276616594496e-06, + "loss": 0.5263, + "step": 4889 + }, + { + "epoch": 0.62, + "grad_norm": 0.9789375430210324, + "learning_rate": 8.061950704758307e-06, + "loss": 0.6493, + "step": 4890 + }, + { + "epoch": 0.62, + "grad_norm": 0.8049248495167562, + "learning_rate": 8.061135113234199e-06, + "loss": 0.5939, + "step": 4891 + }, + { + "epoch": 0.62, + "grad_norm": 0.683245514292312, + "learning_rate": 8.060319391407352e-06, + "loss": 0.5611, + "step": 4892 + }, + { + "epoch": 0.62, + "grad_norm": 0.5641700951459367, + "learning_rate": 8.059503539312487e-06, + "loss": 0.5655, + "step": 4893 + }, + { + "epoch": 0.62, + "grad_norm": 0.5811884318442994, + "learning_rate": 8.058687556984333e-06, + "loss": 0.5113, + "step": 4894 + }, + { + "epoch": 0.62, + "grad_norm": 0.5629376793973534, + "learning_rate": 8.057871444457624e-06, + "loss": 0.4969, + "step": 4895 + }, + { + "epoch": 0.62, + "grad_norm": 0.6015200960107091, + "learning_rate": 8.057055201767098e-06, + "loss": 0.5512, + "step": 4896 + }, + { + "epoch": 0.62, + "grad_norm": 0.6332036574827241, + "learning_rate": 8.056238828947503e-06, + "loss": 0.5584, + "step": 4897 + }, + { + "epoch": 0.62, + "grad_norm": 0.5763057480957622, + "learning_rate": 8.055422326033586e-06, + "loss": 0.5248, + "step": 4898 + }, + { + "epoch": 0.62, + "grad_norm": 0.8874959396134705, + "learning_rate": 8.054605693060106e-06, + "loss": 0.6002, + "step": 4899 + }, + { + "epoch": 0.62, + "grad_norm": 0.6794838650061711, + "learning_rate": 8.053788930061822e-06, + "loss": 0.5925, + "step": 4900 + }, + { + "epoch": 0.62, + "grad_norm": 0.8484945740126925, + "learning_rate": 8.052972037073501e-06, + "loss": 0.6414, + "step": 4901 + }, + { + "epoch": 0.62, + "grad_norm": 0.612939219459935, + "learning_rate": 8.052155014129916e-06, + "loss": 0.5461, + "step": 4902 + }, + { + "epoch": 0.62, + "grad_norm": 0.7431409948859721, + "learning_rate": 8.051337861265847e-06, + "loss": 0.5727, + "step": 4903 + }, + { + "epoch": 0.62, + "grad_norm": 0.5868836082094497, + "learning_rate": 8.050520578516074e-06, + "loss": 0.5204, + "step": 4904 + }, + { + "epoch": 0.62, + "grad_norm": 0.6702798902963087, + "learning_rate": 8.04970316591539e-06, + "loss": 0.5689, + "step": 4905 + }, + { + "epoch": 0.63, + "grad_norm": 0.7830835168618177, + "learning_rate": 8.048885623498587e-06, + "loss": 0.6054, + "step": 4906 + }, + { + "epoch": 0.63, + "grad_norm": 0.7448351170870566, + "learning_rate": 8.048067951300465e-06, + "loss": 0.6233, + "step": 4907 + }, + { + "epoch": 0.63, + "grad_norm": 0.6486517744835938, + "learning_rate": 8.04725014935583e-06, + "loss": 0.554, + "step": 4908 + }, + { + "epoch": 0.63, + "grad_norm": 0.6722504772304511, + "learning_rate": 8.046432217699496e-06, + "loss": 0.5193, + "step": 4909 + }, + { + "epoch": 0.63, + "grad_norm": 0.8182603359914264, + "learning_rate": 8.045614156366276e-06, + "loss": 0.6271, + "step": 4910 + }, + { + "epoch": 0.63, + "grad_norm": 0.6446605809609107, + "learning_rate": 8.044795965390995e-06, + "loss": 0.5534, + "step": 4911 + }, + { + "epoch": 0.63, + "grad_norm": 0.7035520761978654, + "learning_rate": 8.043977644808478e-06, + "loss": 0.5389, + "step": 4912 + }, + { + "epoch": 0.63, + "grad_norm": 0.827594038147539, + "learning_rate": 8.04315919465356e-06, + "loss": 0.6375, + "step": 4913 + }, + { + "epoch": 0.63, + "grad_norm": 0.7374695926988003, + "learning_rate": 8.04234061496108e-06, + "loss": 0.6404, + "step": 4914 + }, + { + "epoch": 0.63, + "grad_norm": 0.5693042303910528, + "learning_rate": 8.041521905765883e-06, + "loss": 0.5619, + "step": 4915 + }, + { + "epoch": 0.63, + "grad_norm": 0.5875885745632737, + "learning_rate": 8.040703067102816e-06, + "loss": 0.5437, + "step": 4916 + }, + { + "epoch": 0.63, + "grad_norm": 0.7185821643081933, + "learning_rate": 8.039884099006739e-06, + "loss": 0.5417, + "step": 4917 + }, + { + "epoch": 0.63, + "grad_norm": 0.6892261854689584, + "learning_rate": 8.039065001512508e-06, + "loss": 0.5838, + "step": 4918 + }, + { + "epoch": 0.63, + "grad_norm": 0.6964076157822582, + "learning_rate": 8.03824577465499e-06, + "loss": 0.5413, + "step": 4919 + }, + { + "epoch": 0.63, + "grad_norm": 0.7527568758841825, + "learning_rate": 8.037426418469058e-06, + "loss": 0.6089, + "step": 4920 + }, + { + "epoch": 0.63, + "grad_norm": 0.7509769115590305, + "learning_rate": 8.036606932989592e-06, + "loss": 0.5368, + "step": 4921 + }, + { + "epoch": 0.63, + "grad_norm": 0.5815321045061785, + "learning_rate": 8.03578731825147e-06, + "loss": 0.4913, + "step": 4922 + }, + { + "epoch": 0.63, + "grad_norm": 0.7236216992126074, + "learning_rate": 8.034967574289587e-06, + "loss": 0.6146, + "step": 4923 + }, + { + "epoch": 0.63, + "grad_norm": 0.7644428638996943, + "learning_rate": 8.03414770113883e-06, + "loss": 0.528, + "step": 4924 + }, + { + "epoch": 0.63, + "grad_norm": 0.6993079172656466, + "learning_rate": 8.033327698834101e-06, + "loss": 0.5754, + "step": 4925 + }, + { + "epoch": 0.63, + "grad_norm": 0.7313450764515096, + "learning_rate": 8.032507567410304e-06, + "loss": 0.6255, + "step": 4926 + }, + { + "epoch": 0.63, + "grad_norm": 0.761825300077517, + "learning_rate": 8.031687306902352e-06, + "loss": 0.6288, + "step": 4927 + }, + { + "epoch": 0.63, + "grad_norm": 0.6141230843182297, + "learning_rate": 8.030866917345159e-06, + "loss": 0.5166, + "step": 4928 + }, + { + "epoch": 0.63, + "grad_norm": 0.6644433915350286, + "learning_rate": 8.030046398773647e-06, + "loss": 0.5611, + "step": 4929 + }, + { + "epoch": 0.63, + "grad_norm": 0.573637497473543, + "learning_rate": 8.029225751222743e-06, + "loss": 0.5396, + "step": 4930 + }, + { + "epoch": 0.63, + "grad_norm": 0.6878188589679671, + "learning_rate": 8.028404974727377e-06, + "loss": 0.5248, + "step": 4931 + }, + { + "epoch": 0.63, + "grad_norm": 0.8069764538185773, + "learning_rate": 8.02758406932249e-06, + "loss": 0.6407, + "step": 4932 + }, + { + "epoch": 0.63, + "grad_norm": 0.6955927656844051, + "learning_rate": 8.026763035043021e-06, + "loss": 0.5512, + "step": 4933 + }, + { + "epoch": 0.63, + "grad_norm": 0.7575647114305473, + "learning_rate": 8.025941871923926e-06, + "loss": 0.5891, + "step": 4934 + }, + { + "epoch": 0.63, + "grad_norm": 0.6995656652211012, + "learning_rate": 8.025120580000153e-06, + "loss": 0.5557, + "step": 4935 + }, + { + "epoch": 0.63, + "grad_norm": 0.7235148605292684, + "learning_rate": 8.024299159306663e-06, + "loss": 0.6974, + "step": 4936 + }, + { + "epoch": 0.63, + "grad_norm": 0.6035609518052024, + "learning_rate": 8.023477609878422e-06, + "loss": 0.5044, + "step": 4937 + }, + { + "epoch": 0.63, + "grad_norm": 0.5910437522974566, + "learning_rate": 8.022655931750403e-06, + "loss": 0.5683, + "step": 4938 + }, + { + "epoch": 0.63, + "grad_norm": 0.6474405146828806, + "learning_rate": 8.021834124957578e-06, + "loss": 0.5213, + "step": 4939 + }, + { + "epoch": 0.63, + "grad_norm": 0.8086742935312414, + "learning_rate": 8.021012189534931e-06, + "loss": 0.6436, + "step": 4940 + }, + { + "epoch": 0.63, + "grad_norm": 0.616790651415157, + "learning_rate": 8.020190125517451e-06, + "loss": 0.5632, + "step": 4941 + }, + { + "epoch": 0.63, + "grad_norm": 0.9826153405074254, + "learning_rate": 8.019367932940126e-06, + "loss": 0.6423, + "step": 4942 + }, + { + "epoch": 0.63, + "grad_norm": 0.7436441861685466, + "learning_rate": 8.018545611837959e-06, + "loss": 0.6507, + "step": 4943 + }, + { + "epoch": 0.63, + "grad_norm": 1.4778030251635295, + "learning_rate": 8.017723162245948e-06, + "loss": 0.6036, + "step": 4944 + }, + { + "epoch": 0.63, + "grad_norm": 0.6638589947665885, + "learning_rate": 8.016900584199106e-06, + "loss": 0.5245, + "step": 4945 + }, + { + "epoch": 0.63, + "grad_norm": 0.580197713064733, + "learning_rate": 8.016077877732448e-06, + "loss": 0.5446, + "step": 4946 + }, + { + "epoch": 0.63, + "grad_norm": 0.6682971371288308, + "learning_rate": 8.015255042880993e-06, + "loss": 0.5816, + "step": 4947 + }, + { + "epoch": 0.63, + "grad_norm": 0.7183818744936854, + "learning_rate": 8.014432079679766e-06, + "loss": 0.5814, + "step": 4948 + }, + { + "epoch": 0.63, + "grad_norm": 0.685527682626321, + "learning_rate": 8.013608988163797e-06, + "loss": 0.6078, + "step": 4949 + }, + { + "epoch": 0.63, + "grad_norm": 0.59734172242765, + "learning_rate": 8.012785768368125e-06, + "loss": 0.5283, + "step": 4950 + }, + { + "epoch": 0.63, + "grad_norm": 0.6447979255046242, + "learning_rate": 8.011962420327788e-06, + "loss": 0.604, + "step": 4951 + }, + { + "epoch": 0.63, + "grad_norm": 0.809380976290288, + "learning_rate": 8.011138944077838e-06, + "loss": 0.6925, + "step": 4952 + }, + { + "epoch": 0.63, + "grad_norm": 0.7869086565648559, + "learning_rate": 8.010315339653325e-06, + "loss": 0.6037, + "step": 4953 + }, + { + "epoch": 0.63, + "grad_norm": 0.7987426487144512, + "learning_rate": 8.009491607089305e-06, + "loss": 0.6005, + "step": 4954 + }, + { + "epoch": 0.63, + "grad_norm": 0.7516274719269177, + "learning_rate": 8.008667746420849e-06, + "loss": 0.6318, + "step": 4955 + }, + { + "epoch": 0.63, + "grad_norm": 0.7740498530156996, + "learning_rate": 8.007843757683019e-06, + "loss": 0.598, + "step": 4956 + }, + { + "epoch": 0.63, + "grad_norm": 0.6904989824350313, + "learning_rate": 8.00701964091089e-06, + "loss": 0.5592, + "step": 4957 + }, + { + "epoch": 0.63, + "grad_norm": 0.6199203109786972, + "learning_rate": 8.006195396139545e-06, + "loss": 0.5515, + "step": 4958 + }, + { + "epoch": 0.63, + "grad_norm": 0.8311154963488638, + "learning_rate": 8.00537102340407e-06, + "loss": 0.6265, + "step": 4959 + }, + { + "epoch": 0.63, + "grad_norm": 0.673649226032902, + "learning_rate": 8.004546522739553e-06, + "loss": 0.5239, + "step": 4960 + }, + { + "epoch": 0.63, + "grad_norm": 0.9741430450093904, + "learning_rate": 8.003721894181092e-06, + "loss": 0.596, + "step": 4961 + }, + { + "epoch": 0.63, + "grad_norm": 0.7731955285755833, + "learning_rate": 8.00289713776379e-06, + "loss": 0.6207, + "step": 4962 + }, + { + "epoch": 0.63, + "grad_norm": 0.8706968197408063, + "learning_rate": 8.00207225352275e-06, + "loss": 0.6353, + "step": 4963 + }, + { + "epoch": 0.63, + "grad_norm": 0.8210021030467289, + "learning_rate": 8.001247241493089e-06, + "loss": 0.6399, + "step": 4964 + }, + { + "epoch": 0.63, + "grad_norm": 0.6537251604318643, + "learning_rate": 8.000422101709923e-06, + "loss": 0.5753, + "step": 4965 + }, + { + "epoch": 0.63, + "grad_norm": 0.9390043673291344, + "learning_rate": 7.999596834208377e-06, + "loss": 0.6659, + "step": 4966 + }, + { + "epoch": 0.63, + "grad_norm": 0.963241817320959, + "learning_rate": 7.998771439023578e-06, + "loss": 0.6748, + "step": 4967 + }, + { + "epoch": 0.63, + "grad_norm": 0.789801755202053, + "learning_rate": 7.997945916190661e-06, + "loss": 0.6539, + "step": 4968 + }, + { + "epoch": 0.63, + "grad_norm": 0.7281406478570932, + "learning_rate": 7.99712026574477e-06, + "loss": 0.5985, + "step": 4969 + }, + { + "epoch": 0.63, + "grad_norm": 0.7085923490566987, + "learning_rate": 7.996294487721041e-06, + "loss": 0.5913, + "step": 4970 + }, + { + "epoch": 0.63, + "grad_norm": 0.6460091542718994, + "learning_rate": 7.995468582154634e-06, + "loss": 0.4897, + "step": 4971 + }, + { + "epoch": 0.63, + "grad_norm": 0.624189856220877, + "learning_rate": 7.994642549080702e-06, + "loss": 0.5663, + "step": 4972 + }, + { + "epoch": 0.63, + "grad_norm": 0.7484498850374535, + "learning_rate": 7.993816388534404e-06, + "loss": 0.5894, + "step": 4973 + }, + { + "epoch": 0.63, + "grad_norm": 0.81555407216843, + "learning_rate": 7.99299010055091e-06, + "loss": 0.5471, + "step": 4974 + }, + { + "epoch": 0.63, + "grad_norm": 0.574974637473329, + "learning_rate": 7.992163685165393e-06, + "loss": 0.5283, + "step": 4975 + }, + { + "epoch": 0.63, + "grad_norm": 0.6329118697095062, + "learning_rate": 7.991337142413029e-06, + "loss": 0.5339, + "step": 4976 + }, + { + "epoch": 0.63, + "grad_norm": 0.6164356332347005, + "learning_rate": 7.990510472329e-06, + "loss": 0.5469, + "step": 4977 + }, + { + "epoch": 0.63, + "grad_norm": 0.6144438291787048, + "learning_rate": 7.989683674948498e-06, + "loss": 0.4932, + "step": 4978 + }, + { + "epoch": 0.63, + "grad_norm": 0.8879003383011629, + "learning_rate": 7.988856750306716e-06, + "loss": 0.6502, + "step": 4979 + }, + { + "epoch": 0.63, + "grad_norm": 0.7075341439946503, + "learning_rate": 7.988029698438853e-06, + "loss": 0.5467, + "step": 4980 + }, + { + "epoch": 0.63, + "grad_norm": 0.651597186699634, + "learning_rate": 7.987202519380114e-06, + "loss": 0.5461, + "step": 4981 + }, + { + "epoch": 0.63, + "grad_norm": 0.8258889716800095, + "learning_rate": 7.98637521316571e-06, + "loss": 0.6369, + "step": 4982 + }, + { + "epoch": 0.63, + "grad_norm": 0.6822670076700751, + "learning_rate": 7.985547779830856e-06, + "loss": 0.5853, + "step": 4983 + }, + { + "epoch": 0.63, + "grad_norm": 0.6908581928240837, + "learning_rate": 7.984720219410773e-06, + "loss": 0.5748, + "step": 4984 + }, + { + "epoch": 0.64, + "grad_norm": 0.9160495837039937, + "learning_rate": 7.983892531940687e-06, + "loss": 0.6137, + "step": 4985 + }, + { + "epoch": 0.64, + "grad_norm": 0.5555883207325232, + "learning_rate": 7.983064717455832e-06, + "loss": 0.4661, + "step": 4986 + }, + { + "epoch": 0.64, + "grad_norm": 0.6066013306324208, + "learning_rate": 7.982236775991445e-06, + "loss": 0.5452, + "step": 4987 + }, + { + "epoch": 0.64, + "grad_norm": 0.6284802021881586, + "learning_rate": 7.981408707582769e-06, + "loss": 0.5563, + "step": 4988 + }, + { + "epoch": 0.64, + "grad_norm": 0.6886870793431915, + "learning_rate": 7.98058051226505e-06, + "loss": 0.6052, + "step": 4989 + }, + { + "epoch": 0.64, + "grad_norm": 0.6990423918403783, + "learning_rate": 7.979752190073543e-06, + "loss": 0.551, + "step": 4990 + }, + { + "epoch": 0.64, + "grad_norm": 0.7674093938950507, + "learning_rate": 7.978923741043508e-06, + "loss": 0.5451, + "step": 4991 + }, + { + "epoch": 0.64, + "grad_norm": 2.199156064081174, + "learning_rate": 7.978095165210209e-06, + "loss": 0.6322, + "step": 4992 + }, + { + "epoch": 0.64, + "grad_norm": 0.6908099319990278, + "learning_rate": 7.977266462608915e-06, + "loss": 0.5999, + "step": 4993 + }, + { + "epoch": 0.64, + "grad_norm": 0.6431466981335197, + "learning_rate": 7.976437633274901e-06, + "loss": 0.5218, + "step": 4994 + }, + { + "epoch": 0.64, + "grad_norm": 0.6817665191447893, + "learning_rate": 7.975608677243449e-06, + "loss": 0.5663, + "step": 4995 + }, + { + "epoch": 0.64, + "grad_norm": 0.6648658984445288, + "learning_rate": 7.974779594549844e-06, + "loss": 0.5393, + "step": 4996 + }, + { + "epoch": 0.64, + "grad_norm": 0.6637238420430308, + "learning_rate": 7.973950385229378e-06, + "loss": 0.5556, + "step": 4997 + }, + { + "epoch": 0.64, + "grad_norm": 0.8176644553156135, + "learning_rate": 7.973121049317349e-06, + "loss": 0.6779, + "step": 4998 + }, + { + "epoch": 0.64, + "grad_norm": 0.6876679393660092, + "learning_rate": 7.972291586849054e-06, + "loss": 0.5757, + "step": 4999 + }, + { + "epoch": 0.64, + "grad_norm": 0.6717052628132845, + "learning_rate": 7.971461997859808e-06, + "loss": 0.5094, + "step": 5000 + }, + { + "epoch": 0.64, + "grad_norm": 0.7194117786502368, + "learning_rate": 7.970632282384918e-06, + "loss": 0.5393, + "step": 5001 + }, + { + "epoch": 0.64, + "grad_norm": 1.0149557103812121, + "learning_rate": 7.969802440459704e-06, + "loss": 0.599, + "step": 5002 + }, + { + "epoch": 0.64, + "grad_norm": 0.6942619217568177, + "learning_rate": 7.968972472119491e-06, + "loss": 0.5802, + "step": 5003 + }, + { + "epoch": 0.64, + "grad_norm": 0.5765078417775431, + "learning_rate": 7.968142377399608e-06, + "loss": 0.4968, + "step": 5004 + }, + { + "epoch": 0.64, + "grad_norm": 0.811633019565186, + "learning_rate": 7.967312156335389e-06, + "loss": 0.6265, + "step": 5005 + }, + { + "epoch": 0.64, + "grad_norm": 0.8410833284701286, + "learning_rate": 7.966481808962174e-06, + "loss": 0.6364, + "step": 5006 + }, + { + "epoch": 0.64, + "grad_norm": 0.7550709211650153, + "learning_rate": 7.965651335315305e-06, + "loss": 0.5816, + "step": 5007 + }, + { + "epoch": 0.64, + "grad_norm": 0.6682562657334422, + "learning_rate": 7.96482073543014e-06, + "loss": 0.5109, + "step": 5008 + }, + { + "epoch": 0.64, + "grad_norm": 0.7032653731154704, + "learning_rate": 7.963990009342026e-06, + "loss": 0.5968, + "step": 5009 + }, + { + "epoch": 0.64, + "grad_norm": 0.6804902791575567, + "learning_rate": 7.96315915708633e-06, + "loss": 0.5686, + "step": 5010 + }, + { + "epoch": 0.64, + "grad_norm": 0.7233130333012316, + "learning_rate": 7.96232817869842e-06, + "loss": 0.6483, + "step": 5011 + }, + { + "epoch": 0.64, + "grad_norm": 0.5943661914938109, + "learning_rate": 7.961497074213664e-06, + "loss": 0.5234, + "step": 5012 + }, + { + "epoch": 0.64, + "grad_norm": 0.7687568441710754, + "learning_rate": 7.96066584366744e-06, + "loss": 0.6158, + "step": 5013 + }, + { + "epoch": 0.64, + "grad_norm": 0.7907268912226469, + "learning_rate": 7.959834487095135e-06, + "loss": 0.5825, + "step": 5014 + }, + { + "epoch": 0.64, + "grad_norm": 0.8253108962829002, + "learning_rate": 7.959003004532132e-06, + "loss": 0.6264, + "step": 5015 + }, + { + "epoch": 0.64, + "grad_norm": 0.8314557847227163, + "learning_rate": 7.958171396013827e-06, + "loss": 0.6159, + "step": 5016 + }, + { + "epoch": 0.64, + "grad_norm": 0.6070493284253823, + "learning_rate": 7.957339661575618e-06, + "loss": 0.5816, + "step": 5017 + }, + { + "epoch": 0.64, + "grad_norm": 0.8312694095835157, + "learning_rate": 7.956507801252912e-06, + "loss": 0.6823, + "step": 5018 + }, + { + "epoch": 0.64, + "grad_norm": 0.7910304487107016, + "learning_rate": 7.955675815081114e-06, + "loss": 0.5786, + "step": 5019 + }, + { + "epoch": 0.64, + "grad_norm": 0.6418403797533321, + "learning_rate": 7.954843703095644e-06, + "loss": 0.571, + "step": 5020 + }, + { + "epoch": 0.64, + "grad_norm": 0.9557562828315895, + "learning_rate": 7.954011465331917e-06, + "loss": 0.5964, + "step": 5021 + }, + { + "epoch": 0.64, + "grad_norm": 0.7629141335309464, + "learning_rate": 7.953179101825364e-06, + "loss": 0.5923, + "step": 5022 + }, + { + "epoch": 0.64, + "grad_norm": 0.5695493872500682, + "learning_rate": 7.952346612611413e-06, + "loss": 0.5009, + "step": 5023 + }, + { + "epoch": 0.64, + "grad_norm": 0.5967780145596528, + "learning_rate": 7.951513997725501e-06, + "loss": 0.5137, + "step": 5024 + }, + { + "epoch": 0.64, + "grad_norm": 0.6046722760236224, + "learning_rate": 7.95068125720307e-06, + "loss": 0.5109, + "step": 5025 + }, + { + "epoch": 0.64, + "grad_norm": 0.9257658633257344, + "learning_rate": 7.949848391079566e-06, + "loss": 0.6057, + "step": 5026 + }, + { + "epoch": 0.64, + "grad_norm": 0.7817539946449922, + "learning_rate": 7.949015399390443e-06, + "loss": 0.637, + "step": 5027 + }, + { + "epoch": 0.64, + "grad_norm": 0.5977205782331978, + "learning_rate": 7.94818228217116e-06, + "loss": 0.511, + "step": 5028 + }, + { + "epoch": 0.64, + "grad_norm": 0.845289093357311, + "learning_rate": 7.947349039457175e-06, + "loss": 0.6272, + "step": 5029 + }, + { + "epoch": 0.64, + "grad_norm": 0.6242821350284501, + "learning_rate": 7.946515671283962e-06, + "loss": 0.5621, + "step": 5030 + }, + { + "epoch": 0.64, + "grad_norm": 0.740643890292027, + "learning_rate": 7.945682177686992e-06, + "loss": 0.5391, + "step": 5031 + }, + { + "epoch": 0.64, + "grad_norm": 0.6881059538004348, + "learning_rate": 7.944848558701743e-06, + "loss": 0.6137, + "step": 5032 + }, + { + "epoch": 0.64, + "grad_norm": 0.5642095014106421, + "learning_rate": 7.944014814363703e-06, + "loss": 0.514, + "step": 5033 + }, + { + "epoch": 0.64, + "grad_norm": 0.6670245561237913, + "learning_rate": 7.943180944708361e-06, + "loss": 0.6139, + "step": 5034 + }, + { + "epoch": 0.64, + "grad_norm": 0.7789270633283152, + "learning_rate": 7.942346949771211e-06, + "loss": 0.5987, + "step": 5035 + }, + { + "epoch": 0.64, + "grad_norm": 0.572422754598143, + "learning_rate": 7.941512829587753e-06, + "loss": 0.5582, + "step": 5036 + }, + { + "epoch": 0.64, + "grad_norm": 0.5562236654922782, + "learning_rate": 7.940678584193492e-06, + "loss": 0.5599, + "step": 5037 + }, + { + "epoch": 0.64, + "grad_norm": 0.6948678762479501, + "learning_rate": 7.939844213623942e-06, + "loss": 0.5732, + "step": 5038 + }, + { + "epoch": 0.64, + "grad_norm": 0.7105738402823709, + "learning_rate": 7.939009717914619e-06, + "loss": 0.6254, + "step": 5039 + }, + { + "epoch": 0.64, + "grad_norm": 0.6884141398825876, + "learning_rate": 7.938175097101043e-06, + "loss": 0.5349, + "step": 5040 + }, + { + "epoch": 0.64, + "grad_norm": 0.7075112009203091, + "learning_rate": 7.937340351218743e-06, + "loss": 0.5585, + "step": 5041 + }, + { + "epoch": 0.64, + "grad_norm": 0.5841872756180618, + "learning_rate": 7.936505480303251e-06, + "loss": 0.5211, + "step": 5042 + }, + { + "epoch": 0.64, + "grad_norm": 0.7582660928114365, + "learning_rate": 7.935670484390103e-06, + "loss": 0.5876, + "step": 5043 + }, + { + "epoch": 0.64, + "grad_norm": 0.6394882090662029, + "learning_rate": 7.934835363514843e-06, + "loss": 0.564, + "step": 5044 + }, + { + "epoch": 0.64, + "grad_norm": 0.5738471337815415, + "learning_rate": 7.934000117713021e-06, + "loss": 0.5109, + "step": 5045 + }, + { + "epoch": 0.64, + "grad_norm": 0.6522987781055405, + "learning_rate": 7.933164747020191e-06, + "loss": 0.5848, + "step": 5046 + }, + { + "epoch": 0.64, + "grad_norm": 0.7265418727142777, + "learning_rate": 7.932329251471911e-06, + "loss": 0.509, + "step": 5047 + }, + { + "epoch": 0.64, + "grad_norm": 0.7495131456050885, + "learning_rate": 7.931493631103743e-06, + "loss": 0.6102, + "step": 5048 + }, + { + "epoch": 0.64, + "grad_norm": 0.7328420990163743, + "learning_rate": 7.930657885951261e-06, + "loss": 0.5479, + "step": 5049 + }, + { + "epoch": 0.64, + "grad_norm": 0.620422845891505, + "learning_rate": 7.929822016050034e-06, + "loss": 0.5732, + "step": 5050 + }, + { + "epoch": 0.64, + "grad_norm": 0.7642114420941445, + "learning_rate": 7.92898602143565e-06, + "loss": 0.6453, + "step": 5051 + }, + { + "epoch": 0.64, + "grad_norm": 0.7367152211775877, + "learning_rate": 7.92814990214369e-06, + "loss": 0.5673, + "step": 5052 + }, + { + "epoch": 0.64, + "grad_norm": 0.53367041818719, + "learning_rate": 7.927313658209744e-06, + "loss": 0.5007, + "step": 5053 + }, + { + "epoch": 0.64, + "grad_norm": 0.7977123909340416, + "learning_rate": 7.926477289669411e-06, + "loss": 0.6518, + "step": 5054 + }, + { + "epoch": 0.64, + "grad_norm": 0.8386176978568222, + "learning_rate": 7.925640796558291e-06, + "loss": 0.6049, + "step": 5055 + }, + { + "epoch": 0.64, + "grad_norm": 1.7815541597991627, + "learning_rate": 7.924804178911993e-06, + "loss": 0.579, + "step": 5056 + }, + { + "epoch": 0.64, + "grad_norm": 0.7325344772733176, + "learning_rate": 7.923967436766126e-06, + "loss": 0.55, + "step": 5057 + }, + { + "epoch": 0.64, + "grad_norm": 0.7146358864359678, + "learning_rate": 7.92313057015631e-06, + "loss": 0.6017, + "step": 5058 + }, + { + "epoch": 0.64, + "grad_norm": 0.6840612327982274, + "learning_rate": 7.922293579118165e-06, + "loss": 0.5505, + "step": 5059 + }, + { + "epoch": 0.64, + "grad_norm": 0.5578037891999774, + "learning_rate": 7.921456463687323e-06, + "loss": 0.5551, + "step": 5060 + }, + { + "epoch": 0.64, + "grad_norm": 0.7035750119166656, + "learning_rate": 7.920619223899413e-06, + "loss": 0.5665, + "step": 5061 + }, + { + "epoch": 0.64, + "grad_norm": 0.7074648346030253, + "learning_rate": 7.919781859790076e-06, + "loss": 0.5221, + "step": 5062 + }, + { + "epoch": 0.65, + "grad_norm": 0.7985287647041702, + "learning_rate": 7.918944371394953e-06, + "loss": 0.5882, + "step": 5063 + }, + { + "epoch": 0.65, + "grad_norm": 0.6836268680802838, + "learning_rate": 7.9181067587497e-06, + "loss": 0.5696, + "step": 5064 + }, + { + "epoch": 0.65, + "grad_norm": 0.7989420764743823, + "learning_rate": 7.917269021889965e-06, + "loss": 0.5797, + "step": 5065 + }, + { + "epoch": 0.65, + "grad_norm": 0.7147370856602557, + "learning_rate": 7.91643116085141e-06, + "loss": 0.565, + "step": 5066 + }, + { + "epoch": 0.65, + "grad_norm": 0.7860893362529873, + "learning_rate": 7.915593175669702e-06, + "loss": 0.601, + "step": 5067 + }, + { + "epoch": 0.65, + "grad_norm": 0.8811939473588392, + "learning_rate": 7.914755066380508e-06, + "loss": 0.6297, + "step": 5068 + }, + { + "epoch": 0.65, + "grad_norm": 0.6325432752600508, + "learning_rate": 7.913916833019503e-06, + "loss": 0.5523, + "step": 5069 + }, + { + "epoch": 0.65, + "grad_norm": 0.5665737980940468, + "learning_rate": 7.913078475622373e-06, + "loss": 0.4338, + "step": 5070 + }, + { + "epoch": 0.65, + "grad_norm": 0.7085118896327416, + "learning_rate": 7.9122399942248e-06, + "loss": 0.608, + "step": 5071 + }, + { + "epoch": 0.65, + "grad_norm": 0.6413836376839173, + "learning_rate": 7.911401388862477e-06, + "loss": 0.5476, + "step": 5072 + }, + { + "epoch": 0.65, + "grad_norm": 0.7335134286632833, + "learning_rate": 7.9105626595711e-06, + "loss": 0.5112, + "step": 5073 + }, + { + "epoch": 0.65, + "grad_norm": 0.6778552221130795, + "learning_rate": 7.909723806386372e-06, + "loss": 0.5642, + "step": 5074 + }, + { + "epoch": 0.65, + "grad_norm": 0.6949316492523618, + "learning_rate": 7.908884829343998e-06, + "loss": 0.6151, + "step": 5075 + }, + { + "epoch": 0.65, + "grad_norm": 0.6620510132905717, + "learning_rate": 7.908045728479694e-06, + "loss": 0.5053, + "step": 5076 + }, + { + "epoch": 0.65, + "grad_norm": 0.6588846465849989, + "learning_rate": 7.907206503829176e-06, + "loss": 0.5192, + "step": 5077 + }, + { + "epoch": 0.65, + "grad_norm": 0.8842365911384155, + "learning_rate": 7.906367155428168e-06, + "loss": 0.6151, + "step": 5078 + }, + { + "epoch": 0.65, + "grad_norm": 0.628025438165786, + "learning_rate": 7.905527683312395e-06, + "loss": 0.524, + "step": 5079 + }, + { + "epoch": 0.65, + "grad_norm": 0.6083744230761006, + "learning_rate": 7.904688087517595e-06, + "loss": 0.5431, + "step": 5080 + }, + { + "epoch": 0.65, + "grad_norm": 0.5776827949960666, + "learning_rate": 7.903848368079506e-06, + "loss": 0.505, + "step": 5081 + }, + { + "epoch": 0.65, + "grad_norm": 0.5946324858724029, + "learning_rate": 7.903008525033868e-06, + "loss": 0.4874, + "step": 5082 + }, + { + "epoch": 0.65, + "grad_norm": 0.7214488254666845, + "learning_rate": 7.902168558416436e-06, + "loss": 0.5664, + "step": 5083 + }, + { + "epoch": 0.65, + "grad_norm": 0.7315511424239522, + "learning_rate": 7.901328468262963e-06, + "loss": 0.5936, + "step": 5084 + }, + { + "epoch": 0.65, + "grad_norm": 0.6636904846832666, + "learning_rate": 7.900488254609209e-06, + "loss": 0.5117, + "step": 5085 + }, + { + "epoch": 0.65, + "grad_norm": 0.6531652372518333, + "learning_rate": 7.899647917490938e-06, + "loss": 0.5689, + "step": 5086 + }, + { + "epoch": 0.65, + "grad_norm": 0.9240937186441299, + "learning_rate": 7.89880745694392e-06, + "loss": 0.65, + "step": 5087 + }, + { + "epoch": 0.65, + "grad_norm": 0.7957616863220984, + "learning_rate": 7.897966873003933e-06, + "loss": 0.6273, + "step": 5088 + }, + { + "epoch": 0.65, + "grad_norm": 0.7467367534511393, + "learning_rate": 7.897126165706757e-06, + "loss": 0.5711, + "step": 5089 + }, + { + "epoch": 0.65, + "grad_norm": 1.3916064669992472, + "learning_rate": 7.896285335088177e-06, + "loss": 0.6094, + "step": 5090 + }, + { + "epoch": 0.65, + "grad_norm": 0.6232743246329217, + "learning_rate": 7.895444381183985e-06, + "loss": 0.5025, + "step": 5091 + }, + { + "epoch": 0.65, + "grad_norm": 0.7756969175785325, + "learning_rate": 7.89460330402998e-06, + "loss": 0.6654, + "step": 5092 + }, + { + "epoch": 0.65, + "grad_norm": 0.7707337467045592, + "learning_rate": 7.89376210366196e-06, + "loss": 0.6015, + "step": 5093 + }, + { + "epoch": 0.65, + "grad_norm": 0.5844229986305767, + "learning_rate": 7.892920780115737e-06, + "loss": 0.5971, + "step": 5094 + }, + { + "epoch": 0.65, + "grad_norm": 0.7163947909411027, + "learning_rate": 7.892079333427118e-06, + "loss": 0.6132, + "step": 5095 + }, + { + "epoch": 0.65, + "grad_norm": 0.5902806092067072, + "learning_rate": 7.891237763631925e-06, + "loss": 0.4839, + "step": 5096 + }, + { + "epoch": 0.65, + "grad_norm": 0.68979041909195, + "learning_rate": 7.89039607076598e-06, + "loss": 0.6174, + "step": 5097 + }, + { + "epoch": 0.65, + "grad_norm": 0.8002207584875587, + "learning_rate": 7.889554254865111e-06, + "loss": 0.5945, + "step": 5098 + }, + { + "epoch": 0.65, + "grad_norm": 0.7854459286449994, + "learning_rate": 7.888712315965149e-06, + "loss": 0.6174, + "step": 5099 + }, + { + "epoch": 0.65, + "grad_norm": 0.7525764975305921, + "learning_rate": 7.887870254101935e-06, + "loss": 0.6494, + "step": 5100 + }, + { + "epoch": 0.65, + "grad_norm": 0.6643851045720129, + "learning_rate": 7.887028069311313e-06, + "loss": 0.5931, + "step": 5101 + }, + { + "epoch": 0.65, + "grad_norm": 0.8465028869449369, + "learning_rate": 7.886185761629132e-06, + "loss": 0.6563, + "step": 5102 + }, + { + "epoch": 0.65, + "grad_norm": 0.8076228025908405, + "learning_rate": 7.885343331091247e-06, + "loss": 0.6046, + "step": 5103 + }, + { + "epoch": 0.65, + "grad_norm": 0.5874870140906882, + "learning_rate": 7.884500777733516e-06, + "loss": 0.5657, + "step": 5104 + }, + { + "epoch": 0.65, + "grad_norm": 2.226635961562645, + "learning_rate": 7.883658101591804e-06, + "loss": 0.558, + "step": 5105 + }, + { + "epoch": 0.65, + "grad_norm": 0.7058727457265803, + "learning_rate": 7.882815302701982e-06, + "loss": 0.5941, + "step": 5106 + }, + { + "epoch": 0.65, + "grad_norm": 0.5803580049785869, + "learning_rate": 7.881972381099925e-06, + "loss": 0.5172, + "step": 5107 + }, + { + "epoch": 0.65, + "grad_norm": 0.6040349374902021, + "learning_rate": 7.881129336821512e-06, + "loss": 0.5331, + "step": 5108 + }, + { + "epoch": 0.65, + "grad_norm": 0.6026742740070428, + "learning_rate": 7.88028616990263e-06, + "loss": 0.5429, + "step": 5109 + }, + { + "epoch": 0.65, + "grad_norm": 0.6927095988027014, + "learning_rate": 7.879442880379172e-06, + "loss": 0.6484, + "step": 5110 + }, + { + "epoch": 0.65, + "grad_norm": 0.5754632919120132, + "learning_rate": 7.87859946828703e-06, + "loss": 0.5088, + "step": 5111 + }, + { + "epoch": 0.65, + "grad_norm": 0.5707306306612392, + "learning_rate": 7.877755933662108e-06, + "loss": 0.4958, + "step": 5112 + }, + { + "epoch": 0.65, + "grad_norm": 0.7392420938090287, + "learning_rate": 7.876912276540311e-06, + "loss": 0.6147, + "step": 5113 + }, + { + "epoch": 0.65, + "grad_norm": 0.5479266888669612, + "learning_rate": 7.876068496957552e-06, + "loss": 0.516, + "step": 5114 + }, + { + "epoch": 0.65, + "grad_norm": 0.758498682855062, + "learning_rate": 7.875224594949748e-06, + "loss": 0.664, + "step": 5115 + }, + { + "epoch": 0.65, + "grad_norm": 0.6984565023936016, + "learning_rate": 7.874380570552822e-06, + "loss": 0.5542, + "step": 5116 + }, + { + "epoch": 0.65, + "grad_norm": 0.5626514143291393, + "learning_rate": 7.8735364238027e-06, + "loss": 0.4988, + "step": 5117 + }, + { + "epoch": 0.65, + "grad_norm": 0.8381434701455139, + "learning_rate": 7.872692154735313e-06, + "loss": 0.6667, + "step": 5118 + }, + { + "epoch": 0.65, + "grad_norm": 0.8450080472542723, + "learning_rate": 7.871847763386602e-06, + "loss": 0.6469, + "step": 5119 + }, + { + "epoch": 0.65, + "grad_norm": 0.6069234026750302, + "learning_rate": 7.871003249792508e-06, + "loss": 0.5464, + "step": 5120 + }, + { + "epoch": 0.65, + "grad_norm": 0.7521649567792072, + "learning_rate": 7.87015861398898e-06, + "loss": 0.5728, + "step": 5121 + }, + { + "epoch": 0.65, + "grad_norm": 0.8635092037279701, + "learning_rate": 7.869313856011974e-06, + "loss": 0.6384, + "step": 5122 + }, + { + "epoch": 0.65, + "grad_norm": 0.5564774881543907, + "learning_rate": 7.868468975897445e-06, + "loss": 0.5411, + "step": 5123 + }, + { + "epoch": 0.65, + "grad_norm": 0.5449304868682998, + "learning_rate": 7.867623973681358e-06, + "loss": 0.4851, + "step": 5124 + }, + { + "epoch": 0.65, + "grad_norm": 0.6184281164995215, + "learning_rate": 7.86677884939968e-06, + "loss": 0.531, + "step": 5125 + }, + { + "epoch": 0.65, + "grad_norm": 0.8489258598200148, + "learning_rate": 7.865933603088389e-06, + "loss": 0.6572, + "step": 5126 + }, + { + "epoch": 0.65, + "grad_norm": 0.638792438542576, + "learning_rate": 7.865088234783463e-06, + "loss": 0.5859, + "step": 5127 + }, + { + "epoch": 0.65, + "grad_norm": 0.7543736467193048, + "learning_rate": 7.864242744520886e-06, + "loss": 0.5805, + "step": 5128 + }, + { + "epoch": 0.65, + "grad_norm": 0.7625111433611972, + "learning_rate": 7.863397132336648e-06, + "loss": 0.6078, + "step": 5129 + }, + { + "epoch": 0.65, + "grad_norm": 0.7196372673758752, + "learning_rate": 7.862551398266744e-06, + "loss": 0.5841, + "step": 5130 + }, + { + "epoch": 0.65, + "grad_norm": 0.8401073786095281, + "learning_rate": 7.861705542347175e-06, + "loss": 0.5828, + "step": 5131 + }, + { + "epoch": 0.65, + "grad_norm": 0.6065509265600705, + "learning_rate": 7.860859564613945e-06, + "loss": 0.5115, + "step": 5132 + }, + { + "epoch": 0.65, + "grad_norm": 1.0771813157619665, + "learning_rate": 7.860013465103065e-06, + "loss": 0.6201, + "step": 5133 + }, + { + "epoch": 0.65, + "grad_norm": 0.5939155001641759, + "learning_rate": 7.859167243850552e-06, + "loss": 0.4979, + "step": 5134 + }, + { + "epoch": 0.65, + "grad_norm": 0.7375112666833743, + "learning_rate": 7.858320900892427e-06, + "loss": 0.6139, + "step": 5135 + }, + { + "epoch": 0.65, + "grad_norm": 0.6441646194030141, + "learning_rate": 7.857474436264714e-06, + "loss": 0.5202, + "step": 5136 + }, + { + "epoch": 0.65, + "grad_norm": 0.6054882863749236, + "learning_rate": 7.856627850003444e-06, + "loss": 0.5316, + "step": 5137 + }, + { + "epoch": 0.65, + "grad_norm": 0.8045266530316749, + "learning_rate": 7.855781142144658e-06, + "loss": 0.6269, + "step": 5138 + }, + { + "epoch": 0.65, + "grad_norm": 0.9468585915100367, + "learning_rate": 7.854934312724391e-06, + "loss": 0.6268, + "step": 5139 + }, + { + "epoch": 0.65, + "grad_norm": 0.6460564456698489, + "learning_rate": 7.854087361778698e-06, + "loss": 0.5551, + "step": 5140 + }, + { + "epoch": 0.65, + "grad_norm": 0.7607520314008942, + "learning_rate": 7.853240289343621e-06, + "loss": 0.6268, + "step": 5141 + }, + { + "epoch": 0.66, + "grad_norm": 0.6359834973801624, + "learning_rate": 7.852393095455226e-06, + "loss": 0.5635, + "step": 5142 + }, + { + "epoch": 0.66, + "grad_norm": 0.9100654112663452, + "learning_rate": 7.851545780149571e-06, + "loss": 0.6246, + "step": 5143 + }, + { + "epoch": 0.66, + "grad_norm": 0.6760599372913728, + "learning_rate": 7.850698343462724e-06, + "loss": 0.5719, + "step": 5144 + }, + { + "epoch": 0.66, + "grad_norm": 0.7202926353213159, + "learning_rate": 7.849850785430759e-06, + "loss": 0.5632, + "step": 5145 + }, + { + "epoch": 0.66, + "grad_norm": 0.7183561679175698, + "learning_rate": 7.849003106089754e-06, + "loss": 0.5408, + "step": 5146 + }, + { + "epoch": 0.66, + "grad_norm": 0.6055642801620112, + "learning_rate": 7.848155305475789e-06, + "loss": 0.5052, + "step": 5147 + }, + { + "epoch": 0.66, + "grad_norm": 0.7046992852981054, + "learning_rate": 7.847307383624953e-06, + "loss": 0.5302, + "step": 5148 + }, + { + "epoch": 0.66, + "grad_norm": 0.7820383122734619, + "learning_rate": 7.846459340573342e-06, + "loss": 0.6188, + "step": 5149 + }, + { + "epoch": 0.66, + "grad_norm": 0.7600596662095117, + "learning_rate": 7.845611176357054e-06, + "loss": 0.6185, + "step": 5150 + }, + { + "epoch": 0.66, + "grad_norm": 0.6274596287492995, + "learning_rate": 7.844762891012188e-06, + "loss": 0.5299, + "step": 5151 + }, + { + "epoch": 0.66, + "grad_norm": 0.7672853534932366, + "learning_rate": 7.84391448457486e-06, + "loss": 0.629, + "step": 5152 + }, + { + "epoch": 0.66, + "grad_norm": 0.5741163666937225, + "learning_rate": 7.843065957081178e-06, + "loss": 0.5292, + "step": 5153 + }, + { + "epoch": 0.66, + "grad_norm": 0.8314494622019364, + "learning_rate": 7.842217308567264e-06, + "loss": 0.6659, + "step": 5154 + }, + { + "epoch": 0.66, + "grad_norm": 0.6011434329075702, + "learning_rate": 7.841368539069242e-06, + "loss": 0.5511, + "step": 5155 + }, + { + "epoch": 0.66, + "grad_norm": 0.8361578191754074, + "learning_rate": 7.840519648623239e-06, + "loss": 0.5895, + "step": 5156 + }, + { + "epoch": 0.66, + "grad_norm": 0.5632250279890848, + "learning_rate": 7.839670637265394e-06, + "loss": 0.4852, + "step": 5157 + }, + { + "epoch": 0.66, + "grad_norm": 0.7246292297571776, + "learning_rate": 7.838821505031844e-06, + "loss": 0.6514, + "step": 5158 + }, + { + "epoch": 0.66, + "grad_norm": 0.7391546158511483, + "learning_rate": 7.837972251958732e-06, + "loss": 0.6161, + "step": 5159 + }, + { + "epoch": 0.66, + "grad_norm": 0.8163291036002391, + "learning_rate": 7.837122878082212e-06, + "loss": 0.6397, + "step": 5160 + }, + { + "epoch": 0.66, + "grad_norm": 0.8075185919964452, + "learning_rate": 7.836273383438438e-06, + "loss": 0.6156, + "step": 5161 + }, + { + "epoch": 0.66, + "grad_norm": 0.7559379472919219, + "learning_rate": 7.835423768063567e-06, + "loss": 0.6047, + "step": 5162 + }, + { + "epoch": 0.66, + "grad_norm": 0.8136407199916558, + "learning_rate": 7.834574031993768e-06, + "loss": 0.6001, + "step": 5163 + }, + { + "epoch": 0.66, + "grad_norm": 0.7023863094800844, + "learning_rate": 7.833724175265211e-06, + "loss": 0.578, + "step": 5164 + }, + { + "epoch": 0.66, + "grad_norm": 0.6659585123110534, + "learning_rate": 7.83287419791407e-06, + "loss": 0.5229, + "step": 5165 + }, + { + "epoch": 0.66, + "grad_norm": 0.6094398168476903, + "learning_rate": 7.832024099976528e-06, + "loss": 0.5229, + "step": 5166 + }, + { + "epoch": 0.66, + "grad_norm": 0.6019145873911041, + "learning_rate": 7.83117388148877e-06, + "loss": 0.4999, + "step": 5167 + }, + { + "epoch": 0.66, + "grad_norm": 0.8972452164358072, + "learning_rate": 7.830323542486986e-06, + "loss": 0.6029, + "step": 5168 + }, + { + "epoch": 0.66, + "grad_norm": 0.626960742856091, + "learning_rate": 7.829473083007375e-06, + "loss": 0.5823, + "step": 5169 + }, + { + "epoch": 0.66, + "grad_norm": 0.7269685635796725, + "learning_rate": 7.828622503086136e-06, + "loss": 0.6062, + "step": 5170 + }, + { + "epoch": 0.66, + "grad_norm": 0.7469406272220259, + "learning_rate": 7.827771802759474e-06, + "loss": 0.608, + "step": 5171 + }, + { + "epoch": 0.66, + "grad_norm": 0.7433024029517992, + "learning_rate": 7.826920982063605e-06, + "loss": 0.5773, + "step": 5172 + }, + { + "epoch": 0.66, + "grad_norm": 0.7168109881004537, + "learning_rate": 7.826070041034742e-06, + "loss": 0.5221, + "step": 5173 + }, + { + "epoch": 0.66, + "grad_norm": 0.7240518256261765, + "learning_rate": 7.825218979709108e-06, + "loss": 0.5505, + "step": 5174 + }, + { + "epoch": 0.66, + "grad_norm": 0.5734171472591354, + "learning_rate": 7.824367798122931e-06, + "loss": 0.5272, + "step": 5175 + }, + { + "epoch": 0.66, + "grad_norm": 0.6215298058927083, + "learning_rate": 7.823516496312443e-06, + "loss": 0.5087, + "step": 5176 + }, + { + "epoch": 0.66, + "grad_norm": 0.7002335222577787, + "learning_rate": 7.822665074313878e-06, + "loss": 0.5872, + "step": 5177 + }, + { + "epoch": 0.66, + "grad_norm": 0.8213398502255651, + "learning_rate": 7.821813532163483e-06, + "loss": 0.6456, + "step": 5178 + }, + { + "epoch": 0.66, + "grad_norm": 0.8025881365393628, + "learning_rate": 7.820961869897502e-06, + "loss": 0.6218, + "step": 5179 + }, + { + "epoch": 0.66, + "grad_norm": 0.6121564975239244, + "learning_rate": 7.820110087552189e-06, + "loss": 0.5248, + "step": 5180 + }, + { + "epoch": 0.66, + "grad_norm": 0.7328072190869244, + "learning_rate": 7.819258185163801e-06, + "loss": 0.5726, + "step": 5181 + }, + { + "epoch": 0.66, + "grad_norm": 0.585275707297132, + "learning_rate": 7.818406162768602e-06, + "loss": 0.5403, + "step": 5182 + }, + { + "epoch": 0.66, + "grad_norm": 0.6375131315404875, + "learning_rate": 7.817554020402859e-06, + "loss": 0.5232, + "step": 5183 + }, + { + "epoch": 0.66, + "grad_norm": 0.6944233175821881, + "learning_rate": 7.816701758102845e-06, + "loss": 0.5848, + "step": 5184 + }, + { + "epoch": 0.66, + "grad_norm": 0.8608587649765883, + "learning_rate": 7.815849375904838e-06, + "loss": 0.6514, + "step": 5185 + }, + { + "epoch": 0.66, + "grad_norm": 0.6201659343923066, + "learning_rate": 7.81499687384512e-06, + "loss": 0.5047, + "step": 5186 + }, + { + "epoch": 0.66, + "grad_norm": 0.7537271775227715, + "learning_rate": 7.814144251959981e-06, + "loss": 0.5925, + "step": 5187 + }, + { + "epoch": 0.66, + "grad_norm": 0.7287519655576999, + "learning_rate": 7.813291510285713e-06, + "loss": 0.5735, + "step": 5188 + }, + { + "epoch": 0.66, + "grad_norm": 0.6470465468464696, + "learning_rate": 7.812438648858617e-06, + "loss": 0.4743, + "step": 5189 + }, + { + "epoch": 0.66, + "grad_norm": 0.6886438333736695, + "learning_rate": 7.811585667714995e-06, + "loss": 0.5626, + "step": 5190 + }, + { + "epoch": 0.66, + "grad_norm": 0.6946642431259499, + "learning_rate": 7.810732566891154e-06, + "loss": 0.6052, + "step": 5191 + }, + { + "epoch": 0.66, + "grad_norm": 0.6308731610498515, + "learning_rate": 7.809879346423413e-06, + "loss": 0.5125, + "step": 5192 + }, + { + "epoch": 0.66, + "grad_norm": 0.6106891296862297, + "learning_rate": 7.809026006348084e-06, + "loss": 0.5201, + "step": 5193 + }, + { + "epoch": 0.66, + "grad_norm": 0.5615808931381902, + "learning_rate": 7.808172546701494e-06, + "loss": 0.5032, + "step": 5194 + }, + { + "epoch": 0.66, + "grad_norm": 0.7863016380805363, + "learning_rate": 7.807318967519974e-06, + "loss": 0.6359, + "step": 5195 + }, + { + "epoch": 0.66, + "grad_norm": 0.745459832326678, + "learning_rate": 7.806465268839856e-06, + "loss": 0.5823, + "step": 5196 + }, + { + "epoch": 0.66, + "grad_norm": 0.8146838361068188, + "learning_rate": 7.80561145069748e-06, + "loss": 0.6344, + "step": 5197 + }, + { + "epoch": 0.66, + "grad_norm": 0.727117847301932, + "learning_rate": 7.80475751312919e-06, + "loss": 0.5705, + "step": 5198 + }, + { + "epoch": 0.66, + "grad_norm": 0.5648858992023471, + "learning_rate": 7.803903456171335e-06, + "loss": 0.5066, + "step": 5199 + }, + { + "epoch": 0.66, + "grad_norm": 0.6611353478365266, + "learning_rate": 7.803049279860269e-06, + "loss": 0.5384, + "step": 5200 + }, + { + "epoch": 0.66, + "grad_norm": 0.8299260810403722, + "learning_rate": 7.802194984232354e-06, + "loss": 0.6377, + "step": 5201 + }, + { + "epoch": 0.66, + "grad_norm": 0.5871918397856358, + "learning_rate": 7.801340569323951e-06, + "loss": 0.4882, + "step": 5202 + }, + { + "epoch": 0.66, + "grad_norm": 0.6599277193781682, + "learning_rate": 7.800486035171435e-06, + "loss": 0.5231, + "step": 5203 + }, + { + "epoch": 0.66, + "grad_norm": 0.8046535975450259, + "learning_rate": 7.799631381811176e-06, + "loss": 0.5779, + "step": 5204 + }, + { + "epoch": 0.66, + "grad_norm": 0.701747700038936, + "learning_rate": 7.798776609279556e-06, + "loss": 0.5708, + "step": 5205 + }, + { + "epoch": 0.66, + "grad_norm": 0.6359479503432651, + "learning_rate": 7.797921717612958e-06, + "loss": 0.5053, + "step": 5206 + }, + { + "epoch": 0.66, + "grad_norm": 0.6905023218957824, + "learning_rate": 7.797066706847774e-06, + "loss": 0.6112, + "step": 5207 + }, + { + "epoch": 0.66, + "grad_norm": 0.6848635570336745, + "learning_rate": 7.796211577020397e-06, + "loss": 0.5701, + "step": 5208 + }, + { + "epoch": 0.66, + "grad_norm": 0.7129518356470682, + "learning_rate": 7.79535632816723e-06, + "loss": 0.6381, + "step": 5209 + }, + { + "epoch": 0.66, + "grad_norm": 0.6438700220277365, + "learning_rate": 7.794500960324678e-06, + "loss": 0.5188, + "step": 5210 + }, + { + "epoch": 0.66, + "grad_norm": 0.7216950014145777, + "learning_rate": 7.793645473529149e-06, + "loss": 0.5319, + "step": 5211 + }, + { + "epoch": 0.66, + "grad_norm": 0.8354405557326996, + "learning_rate": 7.792789867817058e-06, + "loss": 0.624, + "step": 5212 + }, + { + "epoch": 0.66, + "grad_norm": 0.7324552468722789, + "learning_rate": 7.791934143224829e-06, + "loss": 0.6057, + "step": 5213 + }, + { + "epoch": 0.66, + "grad_norm": 0.7308574461977455, + "learning_rate": 7.791078299788884e-06, + "loss": 0.5678, + "step": 5214 + }, + { + "epoch": 0.66, + "grad_norm": 0.5744071979513701, + "learning_rate": 7.790222337545654e-06, + "loss": 0.5048, + "step": 5215 + }, + { + "epoch": 0.66, + "grad_norm": 0.6416332515529836, + "learning_rate": 7.789366256531576e-06, + "loss": 0.5444, + "step": 5216 + }, + { + "epoch": 0.66, + "grad_norm": 1.6933249241088075, + "learning_rate": 7.788510056783092e-06, + "loss": 0.6455, + "step": 5217 + }, + { + "epoch": 0.66, + "grad_norm": 0.8108136984915921, + "learning_rate": 7.787653738336643e-06, + "loss": 0.6444, + "step": 5218 + }, + { + "epoch": 0.66, + "grad_norm": 0.6048875555064881, + "learning_rate": 7.786797301228684e-06, + "loss": 0.5507, + "step": 5219 + }, + { + "epoch": 0.67, + "grad_norm": 0.930254418294972, + "learning_rate": 7.785940745495668e-06, + "loss": 0.6532, + "step": 5220 + }, + { + "epoch": 0.67, + "grad_norm": 0.7822429191860759, + "learning_rate": 7.785084071174058e-06, + "loss": 0.5916, + "step": 5221 + }, + { + "epoch": 0.67, + "grad_norm": 0.9170950010327478, + "learning_rate": 7.784227278300318e-06, + "loss": 0.6079, + "step": 5222 + }, + { + "epoch": 0.67, + "grad_norm": 0.673604555362709, + "learning_rate": 7.783370366910922e-06, + "loss": 0.5987, + "step": 5223 + }, + { + "epoch": 0.67, + "grad_norm": 0.6986338181314327, + "learning_rate": 7.78251333704234e-06, + "loss": 0.6355, + "step": 5224 + }, + { + "epoch": 0.67, + "grad_norm": 0.6307382224520708, + "learning_rate": 7.781656188731062e-06, + "loss": 0.5691, + "step": 5225 + }, + { + "epoch": 0.67, + "grad_norm": 0.6829051843925056, + "learning_rate": 7.780798922013565e-06, + "loss": 0.5189, + "step": 5226 + }, + { + "epoch": 0.67, + "grad_norm": 0.6749744785196069, + "learning_rate": 7.779941536926345e-06, + "loss": 0.5651, + "step": 5227 + }, + { + "epoch": 0.67, + "grad_norm": 0.6058614720841491, + "learning_rate": 7.779084033505899e-06, + "loss": 0.522, + "step": 5228 + }, + { + "epoch": 0.67, + "grad_norm": 0.8298931683925259, + "learning_rate": 7.778226411788725e-06, + "loss": 0.62, + "step": 5229 + }, + { + "epoch": 0.67, + "grad_norm": 0.7912802282173766, + "learning_rate": 7.77736867181133e-06, + "loss": 0.6445, + "step": 5230 + }, + { + "epoch": 0.67, + "grad_norm": 0.8074130351241093, + "learning_rate": 7.776510813610225e-06, + "loss": 0.6185, + "step": 5231 + }, + { + "epoch": 0.67, + "grad_norm": 0.7578963500252686, + "learning_rate": 7.77565283722193e-06, + "loss": 0.5821, + "step": 5232 + }, + { + "epoch": 0.67, + "grad_norm": 0.6218647626778033, + "learning_rate": 7.774794742682961e-06, + "loss": 0.5173, + "step": 5233 + }, + { + "epoch": 0.67, + "grad_norm": 0.6471351305066092, + "learning_rate": 7.773936530029849e-06, + "loss": 0.5417, + "step": 5234 + }, + { + "epoch": 0.67, + "grad_norm": 0.7390504882646253, + "learning_rate": 7.77307819929912e-06, + "loss": 0.6398, + "step": 5235 + }, + { + "epoch": 0.67, + "grad_norm": 0.7753625720932061, + "learning_rate": 7.772219750527316e-06, + "loss": 0.652, + "step": 5236 + }, + { + "epoch": 0.67, + "grad_norm": 0.6519807511949876, + "learning_rate": 7.771361183750975e-06, + "loss": 0.5764, + "step": 5237 + }, + { + "epoch": 0.67, + "grad_norm": 0.7810016907333998, + "learning_rate": 7.770502499006645e-06, + "loss": 0.6333, + "step": 5238 + }, + { + "epoch": 0.67, + "grad_norm": 0.5838930966048808, + "learning_rate": 7.769643696330877e-06, + "loss": 0.5447, + "step": 5239 + }, + { + "epoch": 0.67, + "grad_norm": 1.0001303086954352, + "learning_rate": 7.768784775760228e-06, + "loss": 0.6272, + "step": 5240 + }, + { + "epoch": 0.67, + "grad_norm": 0.6931588109953511, + "learning_rate": 7.767925737331256e-06, + "loss": 0.5593, + "step": 5241 + }, + { + "epoch": 0.67, + "grad_norm": 0.5662650751063849, + "learning_rate": 7.767066581080534e-06, + "loss": 0.4843, + "step": 5242 + }, + { + "epoch": 0.67, + "grad_norm": 0.6550126948099457, + "learning_rate": 7.766207307044627e-06, + "loss": 0.5182, + "step": 5243 + }, + { + "epoch": 0.67, + "grad_norm": 0.750923788877791, + "learning_rate": 7.765347915260115e-06, + "loss": 0.6606, + "step": 5244 + }, + { + "epoch": 0.67, + "grad_norm": 0.7441033530674466, + "learning_rate": 7.76448840576358e-06, + "loss": 0.635, + "step": 5245 + }, + { + "epoch": 0.67, + "grad_norm": 0.7096215012666898, + "learning_rate": 7.763628778591607e-06, + "loss": 0.7049, + "step": 5246 + }, + { + "epoch": 0.67, + "grad_norm": 0.7676337863445605, + "learning_rate": 7.76276903378079e-06, + "loss": 0.6122, + "step": 5247 + }, + { + "epoch": 0.67, + "grad_norm": 0.8137413588366649, + "learning_rate": 7.761909171367722e-06, + "loss": 0.6652, + "step": 5248 + }, + { + "epoch": 0.67, + "grad_norm": 0.6437698150916662, + "learning_rate": 7.761049191389007e-06, + "loss": 0.5222, + "step": 5249 + }, + { + "epoch": 0.67, + "grad_norm": 0.5676579201559623, + "learning_rate": 7.76018909388125e-06, + "loss": 0.5295, + "step": 5250 + }, + { + "epoch": 0.67, + "grad_norm": 0.5614919750815595, + "learning_rate": 7.759328878881065e-06, + "loss": 0.5247, + "step": 5251 + }, + { + "epoch": 0.67, + "grad_norm": 0.5382690537083782, + "learning_rate": 7.758468546425065e-06, + "loss": 0.5207, + "step": 5252 + }, + { + "epoch": 0.67, + "grad_norm": 0.6729442376850997, + "learning_rate": 7.757608096549876e-06, + "loss": 0.595, + "step": 5253 + }, + { + "epoch": 0.67, + "grad_norm": 0.6365130074032272, + "learning_rate": 7.756747529292122e-06, + "loss": 0.5417, + "step": 5254 + }, + { + "epoch": 0.67, + "grad_norm": 0.8748126834650067, + "learning_rate": 7.755886844688434e-06, + "loss": 0.6467, + "step": 5255 + }, + { + "epoch": 0.67, + "grad_norm": 0.6791106755834856, + "learning_rate": 7.755026042775449e-06, + "loss": 0.4985, + "step": 5256 + }, + { + "epoch": 0.67, + "grad_norm": 1.1160212712389737, + "learning_rate": 7.75416512358981e-06, + "loss": 0.6542, + "step": 5257 + }, + { + "epoch": 0.67, + "grad_norm": 0.5903330924371895, + "learning_rate": 7.753304087168164e-06, + "loss": 0.5641, + "step": 5258 + }, + { + "epoch": 0.67, + "grad_norm": 0.7014534647262111, + "learning_rate": 7.75244293354716e-06, + "loss": 0.5763, + "step": 5259 + }, + { + "epoch": 0.67, + "grad_norm": 0.578510819540876, + "learning_rate": 7.751581662763457e-06, + "loss": 0.5263, + "step": 5260 + }, + { + "epoch": 0.67, + "grad_norm": 0.7657974706803282, + "learning_rate": 7.750720274853714e-06, + "loss": 0.5801, + "step": 5261 + }, + { + "epoch": 0.67, + "grad_norm": 0.705884157972561, + "learning_rate": 7.749858769854599e-06, + "loss": 0.5401, + "step": 5262 + }, + { + "epoch": 0.67, + "grad_norm": 0.6155866238805996, + "learning_rate": 7.748997147802784e-06, + "loss": 0.5116, + "step": 5263 + }, + { + "epoch": 0.67, + "grad_norm": 0.7409621896719885, + "learning_rate": 7.748135408734944e-06, + "loss": 0.5894, + "step": 5264 + }, + { + "epoch": 0.67, + "grad_norm": 0.6367364129316485, + "learning_rate": 7.747273552687764e-06, + "loss": 0.5494, + "step": 5265 + }, + { + "epoch": 0.67, + "grad_norm": 0.846871606813435, + "learning_rate": 7.746411579697925e-06, + "loss": 0.6348, + "step": 5266 + }, + { + "epoch": 0.67, + "grad_norm": 0.788538756400709, + "learning_rate": 7.745549489802122e-06, + "loss": 0.5791, + "step": 5267 + }, + { + "epoch": 0.67, + "grad_norm": 0.6446296980648142, + "learning_rate": 7.744687283037052e-06, + "loss": 0.5506, + "step": 5268 + }, + { + "epoch": 0.67, + "grad_norm": 0.79133393596402, + "learning_rate": 7.743824959439414e-06, + "loss": 0.6569, + "step": 5269 + }, + { + "epoch": 0.67, + "grad_norm": 0.9501967627153795, + "learning_rate": 7.742962519045918e-06, + "loss": 0.6764, + "step": 5270 + }, + { + "epoch": 0.67, + "grad_norm": 0.7381205333942157, + "learning_rate": 7.74209996189327e-06, + "loss": 0.5695, + "step": 5271 + }, + { + "epoch": 0.67, + "grad_norm": 0.6540025867844753, + "learning_rate": 7.741237288018191e-06, + "loss": 0.5615, + "step": 5272 + }, + { + "epoch": 0.67, + "grad_norm": 0.7181243194896826, + "learning_rate": 7.740374497457398e-06, + "loss": 0.6131, + "step": 5273 + }, + { + "epoch": 0.67, + "grad_norm": 0.6566134632869463, + "learning_rate": 7.739511590247621e-06, + "loss": 0.6021, + "step": 5274 + }, + { + "epoch": 0.67, + "grad_norm": 0.8969440704119592, + "learning_rate": 7.738648566425591e-06, + "loss": 0.5757, + "step": 5275 + }, + { + "epoch": 0.67, + "grad_norm": 0.7802441121795158, + "learning_rate": 7.737785426028041e-06, + "loss": 0.6476, + "step": 5276 + }, + { + "epoch": 0.67, + "grad_norm": 0.91875618050464, + "learning_rate": 7.736922169091716e-06, + "loss": 0.5961, + "step": 5277 + }, + { + "epoch": 0.67, + "grad_norm": 0.8098334712758746, + "learning_rate": 7.736058795653359e-06, + "loss": 0.6284, + "step": 5278 + }, + { + "epoch": 0.67, + "grad_norm": 0.7593980031617001, + "learning_rate": 7.73519530574972e-06, + "loss": 0.6243, + "step": 5279 + }, + { + "epoch": 0.67, + "grad_norm": 0.7599905609209325, + "learning_rate": 7.73433169941756e-06, + "loss": 0.5997, + "step": 5280 + }, + { + "epoch": 0.67, + "grad_norm": 0.6565273382115473, + "learning_rate": 7.733467976693637e-06, + "loss": 0.5329, + "step": 5281 + }, + { + "epoch": 0.67, + "grad_norm": 0.5459231059576544, + "learning_rate": 7.732604137614718e-06, + "loss": 0.5042, + "step": 5282 + }, + { + "epoch": 0.67, + "grad_norm": 0.731271369521869, + "learning_rate": 7.73174018221757e-06, + "loss": 0.6515, + "step": 5283 + }, + { + "epoch": 0.67, + "grad_norm": 0.6690327574379991, + "learning_rate": 7.730876110538975e-06, + "loss": 0.584, + "step": 5284 + }, + { + "epoch": 0.67, + "grad_norm": 1.1085067758957585, + "learning_rate": 7.73001192261571e-06, + "loss": 0.7003, + "step": 5285 + }, + { + "epoch": 0.67, + "grad_norm": 0.5691674781479082, + "learning_rate": 7.72914761848456e-06, + "loss": 0.5211, + "step": 5286 + }, + { + "epoch": 0.67, + "grad_norm": 0.7834924281762433, + "learning_rate": 7.728283198182318e-06, + "loss": 0.6152, + "step": 5287 + }, + { + "epoch": 0.67, + "grad_norm": 0.8045769745747544, + "learning_rate": 7.727418661745778e-06, + "loss": 0.6364, + "step": 5288 + }, + { + "epoch": 0.67, + "grad_norm": 0.6718009128217338, + "learning_rate": 7.726554009211741e-06, + "loss": 0.591, + "step": 5289 + }, + { + "epoch": 0.67, + "grad_norm": 0.7679383615974319, + "learning_rate": 7.725689240617013e-06, + "loss": 0.6062, + "step": 5290 + }, + { + "epoch": 0.67, + "grad_norm": 0.6527731247062422, + "learning_rate": 7.724824355998407e-06, + "loss": 0.5649, + "step": 5291 + }, + { + "epoch": 0.67, + "grad_norm": 0.6553288435708892, + "learning_rate": 7.723959355392731e-06, + "loss": 0.6194, + "step": 5292 + }, + { + "epoch": 0.67, + "grad_norm": 0.6506231333113452, + "learning_rate": 7.723094238836812e-06, + "loss": 0.5562, + "step": 5293 + }, + { + "epoch": 0.67, + "grad_norm": 1.151922057213763, + "learning_rate": 7.722229006367472e-06, + "loss": 0.6436, + "step": 5294 + }, + { + "epoch": 0.67, + "grad_norm": 0.628292377199441, + "learning_rate": 7.721363658021544e-06, + "loss": 0.5476, + "step": 5295 + }, + { + "epoch": 0.67, + "grad_norm": 0.6171340045203727, + "learning_rate": 7.72049819383586e-06, + "loss": 0.5805, + "step": 5296 + }, + { + "epoch": 0.67, + "grad_norm": 0.7147657361764136, + "learning_rate": 7.719632613847261e-06, + "loss": 0.5796, + "step": 5297 + }, + { + "epoch": 0.67, + "grad_norm": 0.8015877735837139, + "learning_rate": 7.718766918092591e-06, + "loss": 0.5992, + "step": 5298 + }, + { + "epoch": 0.68, + "grad_norm": 0.7026833989226147, + "learning_rate": 7.717901106608702e-06, + "loss": 0.6074, + "step": 5299 + }, + { + "epoch": 0.68, + "grad_norm": 0.8114330970205671, + "learning_rate": 7.71703517943245e-06, + "loss": 0.4924, + "step": 5300 + }, + { + "epoch": 0.68, + "grad_norm": 0.7083457757063296, + "learning_rate": 7.716169136600693e-06, + "loss": 0.6061, + "step": 5301 + }, + { + "epoch": 0.68, + "grad_norm": 0.5583808615003, + "learning_rate": 7.715302978150294e-06, + "loss": 0.5014, + "step": 5302 + }, + { + "epoch": 0.68, + "grad_norm": 0.5871350625935311, + "learning_rate": 7.714436704118125e-06, + "loss": 0.5239, + "step": 5303 + }, + { + "epoch": 0.68, + "grad_norm": 0.6299127194613465, + "learning_rate": 7.713570314541058e-06, + "loss": 0.553, + "step": 5304 + }, + { + "epoch": 0.68, + "grad_norm": 0.8586505503492021, + "learning_rate": 7.712703809455975e-06, + "loss": 0.5921, + "step": 5305 + }, + { + "epoch": 0.68, + "grad_norm": 0.6091045261816805, + "learning_rate": 7.71183718889976e-06, + "loss": 0.5248, + "step": 5306 + }, + { + "epoch": 0.68, + "grad_norm": 0.6034704887543436, + "learning_rate": 7.7109704529093e-06, + "loss": 0.5276, + "step": 5307 + }, + { + "epoch": 0.68, + "grad_norm": 0.7722605894684905, + "learning_rate": 7.710103601521492e-06, + "loss": 0.6225, + "step": 5308 + }, + { + "epoch": 0.68, + "grad_norm": 0.6710929798497455, + "learning_rate": 7.709236634773236e-06, + "loss": 0.4731, + "step": 5309 + }, + { + "epoch": 0.68, + "grad_norm": 0.7655579462306464, + "learning_rate": 7.708369552701432e-06, + "loss": 0.5772, + "step": 5310 + }, + { + "epoch": 0.68, + "grad_norm": 0.7879824673690861, + "learning_rate": 7.707502355342989e-06, + "loss": 0.627, + "step": 5311 + }, + { + "epoch": 0.68, + "grad_norm": 0.6507972696036635, + "learning_rate": 7.706635042734824e-06, + "loss": 0.5595, + "step": 5312 + }, + { + "epoch": 0.68, + "grad_norm": 0.6743730344430495, + "learning_rate": 7.705767614913854e-06, + "loss": 0.534, + "step": 5313 + }, + { + "epoch": 0.68, + "grad_norm": 0.6790101412040879, + "learning_rate": 7.704900071917005e-06, + "loss": 0.5905, + "step": 5314 + }, + { + "epoch": 0.68, + "grad_norm": 0.8545274723722545, + "learning_rate": 7.704032413781202e-06, + "loss": 0.6125, + "step": 5315 + }, + { + "epoch": 0.68, + "grad_norm": 0.6313573737284246, + "learning_rate": 7.703164640543382e-06, + "loss": 0.5129, + "step": 5316 + }, + { + "epoch": 0.68, + "grad_norm": 0.6075125460045357, + "learning_rate": 7.702296752240477e-06, + "loss": 0.5553, + "step": 5317 + }, + { + "epoch": 0.68, + "grad_norm": 0.6638290762596978, + "learning_rate": 7.701428748909437e-06, + "loss": 0.5404, + "step": 5318 + }, + { + "epoch": 0.68, + "grad_norm": 0.6688444155868246, + "learning_rate": 7.700560630587207e-06, + "loss": 0.5494, + "step": 5319 + }, + { + "epoch": 0.68, + "grad_norm": 0.6266998309402686, + "learning_rate": 7.699692397310743e-06, + "loss": 0.5443, + "step": 5320 + }, + { + "epoch": 0.68, + "grad_norm": 0.6244766913497802, + "learning_rate": 7.698824049117e-06, + "loss": 0.5316, + "step": 5321 + }, + { + "epoch": 0.68, + "grad_norm": 0.8014393174243583, + "learning_rate": 7.69795558604294e-06, + "loss": 0.6258, + "step": 5322 + }, + { + "epoch": 0.68, + "grad_norm": 0.7211660414928572, + "learning_rate": 7.697087008125532e-06, + "loss": 0.6665, + "step": 5323 + }, + { + "epoch": 0.68, + "grad_norm": 0.7058654858828981, + "learning_rate": 7.696218315401751e-06, + "loss": 0.5718, + "step": 5324 + }, + { + "epoch": 0.68, + "grad_norm": 0.6178785197188702, + "learning_rate": 7.69534950790857e-06, + "loss": 0.5029, + "step": 5325 + }, + { + "epoch": 0.68, + "grad_norm": 0.6951047346040349, + "learning_rate": 7.694480585682975e-06, + "loss": 0.5487, + "step": 5326 + }, + { + "epoch": 0.68, + "grad_norm": 0.8982887178203129, + "learning_rate": 7.69361154876195e-06, + "loss": 0.6249, + "step": 5327 + }, + { + "epoch": 0.68, + "grad_norm": 0.6800590187037101, + "learning_rate": 7.69274239718249e-06, + "loss": 0.5264, + "step": 5328 + }, + { + "epoch": 0.68, + "grad_norm": 0.6003970713816201, + "learning_rate": 7.691873130981591e-06, + "loss": 0.5264, + "step": 5329 + }, + { + "epoch": 0.68, + "grad_norm": 0.6240141298785197, + "learning_rate": 7.691003750196256e-06, + "loss": 0.5584, + "step": 5330 + }, + { + "epoch": 0.68, + "grad_norm": 0.9179250672137081, + "learning_rate": 7.690134254863488e-06, + "loss": 0.6387, + "step": 5331 + }, + { + "epoch": 0.68, + "grad_norm": 0.58800512162918, + "learning_rate": 7.689264645020303e-06, + "loss": 0.5629, + "step": 5332 + }, + { + "epoch": 0.68, + "grad_norm": 0.5639766958796449, + "learning_rate": 7.688394920703715e-06, + "loss": 0.5339, + "step": 5333 + }, + { + "epoch": 0.68, + "grad_norm": 0.8565263335822307, + "learning_rate": 7.687525081950747e-06, + "loss": 0.599, + "step": 5334 + }, + { + "epoch": 0.68, + "grad_norm": 0.8002941713789701, + "learning_rate": 7.686655128798423e-06, + "loss": 0.5762, + "step": 5335 + }, + { + "epoch": 0.68, + "grad_norm": 0.7013084372962467, + "learning_rate": 7.685785061283776e-06, + "loss": 0.5826, + "step": 5336 + }, + { + "epoch": 0.68, + "grad_norm": 0.71366480717337, + "learning_rate": 7.684914879443841e-06, + "loss": 0.5865, + "step": 5337 + }, + { + "epoch": 0.68, + "grad_norm": 0.5942434556786707, + "learning_rate": 7.684044583315661e-06, + "loss": 0.5539, + "step": 5338 + }, + { + "epoch": 0.68, + "grad_norm": 0.8754726154923553, + "learning_rate": 7.68317417293628e-06, + "loss": 0.6342, + "step": 5339 + }, + { + "epoch": 0.68, + "grad_norm": 0.7556984430068773, + "learning_rate": 7.682303648342746e-06, + "loss": 0.5984, + "step": 5340 + }, + { + "epoch": 0.68, + "grad_norm": 0.6625352947072592, + "learning_rate": 7.681433009572119e-06, + "loss": 0.5482, + "step": 5341 + }, + { + "epoch": 0.68, + "grad_norm": 0.66197751487079, + "learning_rate": 7.680562256661457e-06, + "loss": 0.5302, + "step": 5342 + }, + { + "epoch": 0.68, + "grad_norm": 0.7109019654491981, + "learning_rate": 7.679691389647825e-06, + "loss": 0.5847, + "step": 5343 + }, + { + "epoch": 0.68, + "grad_norm": 0.7787371126541046, + "learning_rate": 7.678820408568295e-06, + "loss": 0.6185, + "step": 5344 + }, + { + "epoch": 0.68, + "grad_norm": 0.613323840463627, + "learning_rate": 7.67794931345994e-06, + "loss": 0.5122, + "step": 5345 + }, + { + "epoch": 0.68, + "grad_norm": 0.6187148388381738, + "learning_rate": 7.67707810435984e-06, + "loss": 0.5817, + "step": 5346 + }, + { + "epoch": 0.68, + "grad_norm": 0.6664703757482348, + "learning_rate": 7.67620678130508e-06, + "loss": 0.5617, + "step": 5347 + }, + { + "epoch": 0.68, + "grad_norm": 0.8225262527102246, + "learning_rate": 7.67533534433275e-06, + "loss": 0.6193, + "step": 5348 + }, + { + "epoch": 0.68, + "grad_norm": 0.6386208623915298, + "learning_rate": 7.674463793479943e-06, + "loss": 0.555, + "step": 5349 + }, + { + "epoch": 0.68, + "grad_norm": 0.5982119011369568, + "learning_rate": 7.673592128783759e-06, + "loss": 0.5243, + "step": 5350 + }, + { + "epoch": 0.68, + "grad_norm": 0.6490819455678268, + "learning_rate": 7.6727203502813e-06, + "loss": 0.4916, + "step": 5351 + }, + { + "epoch": 0.68, + "grad_norm": 0.6128393412017639, + "learning_rate": 7.67184845800968e-06, + "loss": 0.5057, + "step": 5352 + }, + { + "epoch": 0.68, + "grad_norm": 0.6780731281205047, + "learning_rate": 7.670976452006004e-06, + "loss": 0.5384, + "step": 5353 + }, + { + "epoch": 0.68, + "grad_norm": 0.5690235257815663, + "learning_rate": 7.6701043323074e-06, + "loss": 0.4932, + "step": 5354 + }, + { + "epoch": 0.68, + "grad_norm": 0.7420962431359314, + "learning_rate": 7.669232098950985e-06, + "loss": 0.6022, + "step": 5355 + }, + { + "epoch": 0.68, + "grad_norm": 0.635443809239543, + "learning_rate": 7.66835975197389e-06, + "loss": 0.5794, + "step": 5356 + }, + { + "epoch": 0.68, + "grad_norm": 0.7707091924932612, + "learning_rate": 7.667487291413247e-06, + "loss": 0.616, + "step": 5357 + }, + { + "epoch": 0.68, + "grad_norm": 0.5705184601449362, + "learning_rate": 7.666614717306193e-06, + "loss": 0.5002, + "step": 5358 + }, + { + "epoch": 0.68, + "grad_norm": 1.1716942186392239, + "learning_rate": 7.665742029689874e-06, + "loss": 0.6027, + "step": 5359 + }, + { + "epoch": 0.68, + "grad_norm": 0.7184567838817191, + "learning_rate": 7.664869228601433e-06, + "loss": 0.5835, + "step": 5360 + }, + { + "epoch": 0.68, + "grad_norm": 0.7024049582923059, + "learning_rate": 7.663996314078025e-06, + "loss": 0.5724, + "step": 5361 + }, + { + "epoch": 0.68, + "grad_norm": 0.539286470647893, + "learning_rate": 7.663123286156809e-06, + "loss": 0.4751, + "step": 5362 + }, + { + "epoch": 0.68, + "grad_norm": 0.6091919738277533, + "learning_rate": 7.662250144874943e-06, + "loss": 0.5182, + "step": 5363 + }, + { + "epoch": 0.68, + "grad_norm": 0.582008114127871, + "learning_rate": 7.661376890269595e-06, + "loss": 0.5199, + "step": 5364 + }, + { + "epoch": 0.68, + "grad_norm": 0.5617807601456263, + "learning_rate": 7.660503522377938e-06, + "loss": 0.5081, + "step": 5365 + }, + { + "epoch": 0.68, + "grad_norm": 0.6430367415995056, + "learning_rate": 7.659630041237148e-06, + "loss": 0.5466, + "step": 5366 + }, + { + "epoch": 0.68, + "grad_norm": 0.6385091164378073, + "learning_rate": 7.658756446884405e-06, + "loss": 0.5704, + "step": 5367 + }, + { + "epoch": 0.68, + "grad_norm": 0.682410731978004, + "learning_rate": 7.657882739356898e-06, + "loss": 0.5996, + "step": 5368 + }, + { + "epoch": 0.68, + "grad_norm": 0.7599238778128445, + "learning_rate": 7.657008918691816e-06, + "loss": 0.6487, + "step": 5369 + }, + { + "epoch": 0.68, + "grad_norm": 0.8529496451834723, + "learning_rate": 7.656134984926354e-06, + "loss": 0.6324, + "step": 5370 + }, + { + "epoch": 0.68, + "grad_norm": 0.5668754506049037, + "learning_rate": 7.655260938097715e-06, + "loss": 0.5344, + "step": 5371 + }, + { + "epoch": 0.68, + "grad_norm": 0.6120986039999534, + "learning_rate": 7.654386778243102e-06, + "loss": 0.5122, + "step": 5372 + }, + { + "epoch": 0.68, + "grad_norm": 0.5789339225907278, + "learning_rate": 7.653512505399725e-06, + "loss": 0.5343, + "step": 5373 + }, + { + "epoch": 0.68, + "grad_norm": 0.6454722610257745, + "learning_rate": 7.6526381196048e-06, + "loss": 0.5546, + "step": 5374 + }, + { + "epoch": 0.68, + "grad_norm": 0.7044088838158508, + "learning_rate": 7.65176362089555e-06, + "loss": 0.6006, + "step": 5375 + }, + { + "epoch": 0.68, + "grad_norm": 0.581249987029883, + "learning_rate": 7.650889009309192e-06, + "loss": 0.512, + "step": 5376 + }, + { + "epoch": 0.69, + "grad_norm": 0.8576964304073412, + "learning_rate": 7.650014284882963e-06, + "loss": 0.6489, + "step": 5377 + }, + { + "epoch": 0.69, + "grad_norm": 0.545865383601175, + "learning_rate": 7.649139447654093e-06, + "loss": 0.4896, + "step": 5378 + }, + { + "epoch": 0.69, + "grad_norm": 0.6026253881184362, + "learning_rate": 7.648264497659823e-06, + "loss": 0.5201, + "step": 5379 + }, + { + "epoch": 0.69, + "grad_norm": 0.810654222136975, + "learning_rate": 7.647389434937396e-06, + "loss": 0.5675, + "step": 5380 + }, + { + "epoch": 0.69, + "grad_norm": 0.6133869022233376, + "learning_rate": 7.64651425952406e-06, + "loss": 0.5084, + "step": 5381 + }, + { + "epoch": 0.69, + "grad_norm": 0.8385442460914531, + "learning_rate": 7.645638971457068e-06, + "loss": 0.5815, + "step": 5382 + }, + { + "epoch": 0.69, + "grad_norm": 0.5621602526736209, + "learning_rate": 7.64476357077368e-06, + "loss": 0.4776, + "step": 5383 + }, + { + "epoch": 0.69, + "grad_norm": 0.6911636496925291, + "learning_rate": 7.643888057511159e-06, + "loss": 0.5934, + "step": 5384 + }, + { + "epoch": 0.69, + "grad_norm": 0.6843306488745862, + "learning_rate": 7.643012431706771e-06, + "loss": 0.5266, + "step": 5385 + }, + { + "epoch": 0.69, + "grad_norm": 0.7198955846421942, + "learning_rate": 7.64213669339779e-06, + "loss": 0.5934, + "step": 5386 + }, + { + "epoch": 0.69, + "grad_norm": 0.6491076186574289, + "learning_rate": 7.641260842621494e-06, + "loss": 0.5063, + "step": 5387 + }, + { + "epoch": 0.69, + "grad_norm": 0.6641918581037313, + "learning_rate": 7.640384879415164e-06, + "loss": 0.5459, + "step": 5388 + }, + { + "epoch": 0.69, + "grad_norm": 0.7564332359919856, + "learning_rate": 7.639508803816087e-06, + "loss": 0.6161, + "step": 5389 + }, + { + "epoch": 0.69, + "grad_norm": 0.5403020534820326, + "learning_rate": 7.638632615861555e-06, + "loss": 0.4545, + "step": 5390 + }, + { + "epoch": 0.69, + "grad_norm": 0.7666645707024677, + "learning_rate": 7.637756315588862e-06, + "loss": 0.6201, + "step": 5391 + }, + { + "epoch": 0.69, + "grad_norm": 0.835280670452697, + "learning_rate": 7.636879903035314e-06, + "loss": 0.617, + "step": 5392 + }, + { + "epoch": 0.69, + "grad_norm": 0.6937853179061539, + "learning_rate": 7.636003378238215e-06, + "loss": 0.5016, + "step": 5393 + }, + { + "epoch": 0.69, + "grad_norm": 0.6569690233668273, + "learning_rate": 7.635126741234873e-06, + "loss": 0.5547, + "step": 5394 + }, + { + "epoch": 0.69, + "grad_norm": 0.7645363855007361, + "learning_rate": 7.63424999206261e-06, + "loss": 0.6731, + "step": 5395 + }, + { + "epoch": 0.69, + "grad_norm": 0.5415710308418598, + "learning_rate": 7.633373130758741e-06, + "loss": 0.5246, + "step": 5396 + }, + { + "epoch": 0.69, + "grad_norm": 0.7086320400014732, + "learning_rate": 7.632496157360591e-06, + "loss": 0.5988, + "step": 5397 + }, + { + "epoch": 0.69, + "grad_norm": 0.5935657497173328, + "learning_rate": 7.631619071905494e-06, + "loss": 0.5018, + "step": 5398 + }, + { + "epoch": 0.69, + "grad_norm": 0.5834521141203658, + "learning_rate": 7.630741874430782e-06, + "loss": 0.5276, + "step": 5399 + }, + { + "epoch": 0.69, + "grad_norm": 0.5481030974166777, + "learning_rate": 7.629864564973798e-06, + "loss": 0.4916, + "step": 5400 + }, + { + "epoch": 0.69, + "grad_norm": 0.8143837782426001, + "learning_rate": 7.62898714357188e-06, + "loss": 0.6332, + "step": 5401 + }, + { + "epoch": 0.69, + "grad_norm": 1.2293795422243798, + "learning_rate": 7.62810961026238e-06, + "loss": 0.6667, + "step": 5402 + }, + { + "epoch": 0.69, + "grad_norm": 0.7048513179537687, + "learning_rate": 7.627231965082653e-06, + "loss": 0.535, + "step": 5403 + }, + { + "epoch": 0.69, + "grad_norm": 0.7083127999938686, + "learning_rate": 7.626354208070057e-06, + "loss": 0.5071, + "step": 5404 + }, + { + "epoch": 0.69, + "grad_norm": 0.5967341527148564, + "learning_rate": 7.625476339261955e-06, + "loss": 0.5027, + "step": 5405 + }, + { + "epoch": 0.69, + "grad_norm": 0.6622242437396073, + "learning_rate": 7.624598358695714e-06, + "loss": 0.5336, + "step": 5406 + }, + { + "epoch": 0.69, + "grad_norm": 0.6955140117451355, + "learning_rate": 7.6237202664087095e-06, + "loss": 0.5751, + "step": 5407 + }, + { + "epoch": 0.69, + "grad_norm": 0.7103838535428126, + "learning_rate": 7.622842062438317e-06, + "loss": 0.6078, + "step": 5408 + }, + { + "epoch": 0.69, + "grad_norm": 0.7475216235717348, + "learning_rate": 7.621963746821919e-06, + "loss": 0.6063, + "step": 5409 + }, + { + "epoch": 0.69, + "grad_norm": 0.6925293170683312, + "learning_rate": 7.621085319596904e-06, + "loss": 0.6021, + "step": 5410 + }, + { + "epoch": 0.69, + "grad_norm": 0.607102851080319, + "learning_rate": 7.62020678080066e-06, + "loss": 0.5005, + "step": 5411 + }, + { + "epoch": 0.69, + "grad_norm": 0.6573127097147509, + "learning_rate": 7.619328130470589e-06, + "loss": 0.5143, + "step": 5412 + }, + { + "epoch": 0.69, + "grad_norm": 0.724357688680548, + "learning_rate": 7.618449368644087e-06, + "loss": 0.6107, + "step": 5413 + }, + { + "epoch": 0.69, + "grad_norm": 0.6472703477079979, + "learning_rate": 7.617570495358565e-06, + "loss": 0.5169, + "step": 5414 + }, + { + "epoch": 0.69, + "grad_norm": 0.7683889088151806, + "learning_rate": 7.61669151065143e-06, + "loss": 0.5863, + "step": 5415 + }, + { + "epoch": 0.69, + "grad_norm": 0.5961739433201265, + "learning_rate": 7.615812414560102e-06, + "loss": 0.5223, + "step": 5416 + }, + { + "epoch": 0.69, + "grad_norm": 0.7755467138983485, + "learning_rate": 7.614933207121996e-06, + "loss": 0.609, + "step": 5417 + }, + { + "epoch": 0.69, + "grad_norm": 0.6213787758286397, + "learning_rate": 7.6140538883745394e-06, + "loss": 0.5373, + "step": 5418 + }, + { + "epoch": 0.69, + "grad_norm": 0.7239157114677028, + "learning_rate": 7.613174458355164e-06, + "loss": 0.6376, + "step": 5419 + }, + { + "epoch": 0.69, + "grad_norm": 0.8068751355867426, + "learning_rate": 7.612294917101302e-06, + "loss": 0.6776, + "step": 5420 + }, + { + "epoch": 0.69, + "grad_norm": 0.7230239352917054, + "learning_rate": 7.611415264650392e-06, + "loss": 0.643, + "step": 5421 + }, + { + "epoch": 0.69, + "grad_norm": 0.6644945599757978, + "learning_rate": 7.610535501039878e-06, + "loss": 0.6106, + "step": 5422 + }, + { + "epoch": 0.69, + "grad_norm": 0.5683173879743686, + "learning_rate": 7.609655626307211e-06, + "loss": 0.5046, + "step": 5423 + }, + { + "epoch": 0.69, + "grad_norm": 0.7109541936676621, + "learning_rate": 7.608775640489844e-06, + "loss": 0.6107, + "step": 5424 + }, + { + "epoch": 0.69, + "grad_norm": 0.6107410779022118, + "learning_rate": 7.607895543625233e-06, + "loss": 0.5724, + "step": 5425 + }, + { + "epoch": 0.69, + "grad_norm": 0.7945096443773711, + "learning_rate": 7.6070153357508435e-06, + "loss": 0.6273, + "step": 5426 + }, + { + "epoch": 0.69, + "grad_norm": 0.7070430298179371, + "learning_rate": 7.6061350169041415e-06, + "loss": 0.5953, + "step": 5427 + }, + { + "epoch": 0.69, + "grad_norm": 0.7733949426303575, + "learning_rate": 7.605254587122599e-06, + "loss": 0.5823, + "step": 5428 + }, + { + "epoch": 0.69, + "grad_norm": 0.7809194339482484, + "learning_rate": 7.604374046443697e-06, + "loss": 0.5006, + "step": 5429 + }, + { + "epoch": 0.69, + "grad_norm": 0.6251040077474513, + "learning_rate": 7.60349339490491e-06, + "loss": 0.5755, + "step": 5430 + }, + { + "epoch": 0.69, + "grad_norm": 0.795837385220522, + "learning_rate": 7.602612632543732e-06, + "loss": 0.5978, + "step": 5431 + }, + { + "epoch": 0.69, + "grad_norm": 0.576667226610342, + "learning_rate": 7.60173175939765e-06, + "loss": 0.515, + "step": 5432 + }, + { + "epoch": 0.69, + "grad_norm": 0.6468163008548756, + "learning_rate": 7.60085077550416e-06, + "loss": 0.5676, + "step": 5433 + }, + { + "epoch": 0.69, + "grad_norm": 0.6780546892541153, + "learning_rate": 7.599969680900765e-06, + "loss": 0.5994, + "step": 5434 + }, + { + "epoch": 0.69, + "grad_norm": 0.7657552987026378, + "learning_rate": 7.5990884756249685e-06, + "loss": 0.613, + "step": 5435 + }, + { + "epoch": 0.69, + "grad_norm": 1.0754882894038966, + "learning_rate": 7.5982071597142815e-06, + "loss": 0.6183, + "step": 5436 + }, + { + "epoch": 0.69, + "grad_norm": 0.5796398281401157, + "learning_rate": 7.597325733206218e-06, + "loss": 0.543, + "step": 5437 + }, + { + "epoch": 0.69, + "grad_norm": 0.7832954875863424, + "learning_rate": 7.596444196138297e-06, + "loss": 0.6057, + "step": 5438 + }, + { + "epoch": 0.69, + "grad_norm": 0.7701490375794444, + "learning_rate": 7.595562548548045e-06, + "loss": 0.6196, + "step": 5439 + }, + { + "epoch": 0.69, + "grad_norm": 0.7453484478446744, + "learning_rate": 7.5946807904729894e-06, + "loss": 0.5259, + "step": 5440 + }, + { + "epoch": 0.69, + "grad_norm": 0.5822669148490245, + "learning_rate": 7.593798921950662e-06, + "loss": 0.5312, + "step": 5441 + }, + { + "epoch": 0.69, + "grad_norm": 0.7201675708314541, + "learning_rate": 7.592916943018604e-06, + "loss": 0.6383, + "step": 5442 + }, + { + "epoch": 0.69, + "grad_norm": 0.7129321961162887, + "learning_rate": 7.592034853714359e-06, + "loss": 0.6, + "step": 5443 + }, + { + "epoch": 0.69, + "grad_norm": 0.6956984087821456, + "learning_rate": 7.591152654075473e-06, + "loss": 0.6116, + "step": 5444 + }, + { + "epoch": 0.69, + "grad_norm": 0.6734346629490349, + "learning_rate": 7.590270344139499e-06, + "loss": 0.5618, + "step": 5445 + }, + { + "epoch": 0.69, + "grad_norm": 0.6328741817031723, + "learning_rate": 7.589387923943992e-06, + "loss": 0.5009, + "step": 5446 + }, + { + "epoch": 0.69, + "grad_norm": 1.301681239723456, + "learning_rate": 7.5885053935265155e-06, + "loss": 0.6442, + "step": 5447 + }, + { + "epoch": 0.69, + "grad_norm": 0.8176790790323062, + "learning_rate": 7.587622752924637e-06, + "loss": 0.6091, + "step": 5448 + }, + { + "epoch": 0.69, + "grad_norm": 0.7297976101130947, + "learning_rate": 7.586740002175928e-06, + "loss": 0.5673, + "step": 5449 + }, + { + "epoch": 0.69, + "grad_norm": 0.6506549382261865, + "learning_rate": 7.585857141317962e-06, + "loss": 0.5933, + "step": 5450 + }, + { + "epoch": 0.69, + "grad_norm": 0.6608629796794794, + "learning_rate": 7.584974170388319e-06, + "loss": 0.5372, + "step": 5451 + }, + { + "epoch": 0.69, + "grad_norm": 0.6446043855865727, + "learning_rate": 7.584091089424589e-06, + "loss": 0.5698, + "step": 5452 + }, + { + "epoch": 0.69, + "grad_norm": 0.6616059324009207, + "learning_rate": 7.583207898464356e-06, + "loss": 0.5118, + "step": 5453 + }, + { + "epoch": 0.69, + "grad_norm": 0.6768717887359736, + "learning_rate": 7.582324597545219e-06, + "loss": 0.5281, + "step": 5454 + }, + { + "epoch": 0.69, + "grad_norm": 0.7068338364357585, + "learning_rate": 7.581441186704776e-06, + "loss": 0.5516, + "step": 5455 + }, + { + "epoch": 0.7, + "grad_norm": 0.5690980398220856, + "learning_rate": 7.580557665980631e-06, + "loss": 0.5194, + "step": 5456 + }, + { + "epoch": 0.7, + "grad_norm": 0.625019399391991, + "learning_rate": 7.579674035410391e-06, + "loss": 0.4861, + "step": 5457 + }, + { + "epoch": 0.7, + "grad_norm": 0.6799176783698669, + "learning_rate": 7.5787902950316725e-06, + "loss": 0.5666, + "step": 5458 + }, + { + "epoch": 0.7, + "grad_norm": 0.708875205693146, + "learning_rate": 7.577906444882091e-06, + "loss": 0.6049, + "step": 5459 + }, + { + "epoch": 0.7, + "grad_norm": 0.5581560808860451, + "learning_rate": 7.5770224849992705e-06, + "loss": 0.5692, + "step": 5460 + }, + { + "epoch": 0.7, + "grad_norm": 1.349474955178804, + "learning_rate": 7.576138415420838e-06, + "loss": 0.6313, + "step": 5461 + }, + { + "epoch": 0.7, + "grad_norm": 0.6496242085299923, + "learning_rate": 7.575254236184424e-06, + "loss": 0.5417, + "step": 5462 + }, + { + "epoch": 0.7, + "grad_norm": 0.7364391519847628, + "learning_rate": 7.5743699473276664e-06, + "loss": 0.6295, + "step": 5463 + }, + { + "epoch": 0.7, + "grad_norm": 0.9249468560003917, + "learning_rate": 7.573485548888209e-06, + "loss": 0.6309, + "step": 5464 + }, + { + "epoch": 0.7, + "grad_norm": 0.7308284489970758, + "learning_rate": 7.572601040903693e-06, + "loss": 0.5864, + "step": 5465 + }, + { + "epoch": 0.7, + "grad_norm": 0.6646316896812953, + "learning_rate": 7.571716423411772e-06, + "loss": 0.5318, + "step": 5466 + }, + { + "epoch": 0.7, + "grad_norm": 0.7871039854942252, + "learning_rate": 7.570831696450101e-06, + "loss": 0.6606, + "step": 5467 + }, + { + "epoch": 0.7, + "grad_norm": 0.5777473358378997, + "learning_rate": 7.569946860056341e-06, + "loss": 0.5046, + "step": 5468 + }, + { + "epoch": 0.7, + "grad_norm": 0.6865903037439381, + "learning_rate": 7.569061914268154e-06, + "loss": 0.5421, + "step": 5469 + }, + { + "epoch": 0.7, + "grad_norm": 0.6264948735094058, + "learning_rate": 7.568176859123212e-06, + "loss": 0.5668, + "step": 5470 + }, + { + "epoch": 0.7, + "grad_norm": 0.8757852540791926, + "learning_rate": 7.5672916946591866e-06, + "loss": 0.6485, + "step": 5471 + }, + { + "epoch": 0.7, + "grad_norm": 0.7127381619652646, + "learning_rate": 7.566406420913759e-06, + "loss": 0.6035, + "step": 5472 + }, + { + "epoch": 0.7, + "grad_norm": 0.7086887400407788, + "learning_rate": 7.565521037924612e-06, + "loss": 0.5203, + "step": 5473 + }, + { + "epoch": 0.7, + "grad_norm": 0.7432649571650681, + "learning_rate": 7.564635545729431e-06, + "loss": 0.59, + "step": 5474 + }, + { + "epoch": 0.7, + "grad_norm": 0.632263530343266, + "learning_rate": 7.563749944365909e-06, + "loss": 0.5628, + "step": 5475 + }, + { + "epoch": 0.7, + "grad_norm": 0.7524947352704826, + "learning_rate": 7.562864233871746e-06, + "loss": 0.534, + "step": 5476 + }, + { + "epoch": 0.7, + "grad_norm": 0.5442806979374378, + "learning_rate": 7.561978414284643e-06, + "loss": 0.5033, + "step": 5477 + }, + { + "epoch": 0.7, + "grad_norm": 0.731510330305729, + "learning_rate": 7.561092485642305e-06, + "loss": 0.6024, + "step": 5478 + }, + { + "epoch": 0.7, + "grad_norm": 0.6656624046368786, + "learning_rate": 7.560206447982443e-06, + "loss": 0.5929, + "step": 5479 + }, + { + "epoch": 0.7, + "grad_norm": 0.7745102302614559, + "learning_rate": 7.559320301342775e-06, + "loss": 0.5565, + "step": 5480 + }, + { + "epoch": 0.7, + "grad_norm": 0.6797113051754411, + "learning_rate": 7.55843404576102e-06, + "loss": 0.5854, + "step": 5481 + }, + { + "epoch": 0.7, + "grad_norm": 0.7466460173279136, + "learning_rate": 7.557547681274903e-06, + "loss": 0.6156, + "step": 5482 + }, + { + "epoch": 0.7, + "grad_norm": 0.7288466337400398, + "learning_rate": 7.556661207922156e-06, + "loss": 0.5199, + "step": 5483 + }, + { + "epoch": 0.7, + "grad_norm": 1.0119325422624517, + "learning_rate": 7.555774625740509e-06, + "loss": 0.649, + "step": 5484 + }, + { + "epoch": 0.7, + "grad_norm": 0.7638263102320347, + "learning_rate": 7.554887934767703e-06, + "loss": 0.6565, + "step": 5485 + }, + { + "epoch": 0.7, + "grad_norm": 0.7065218600244592, + "learning_rate": 7.554001135041482e-06, + "loss": 0.5312, + "step": 5486 + }, + { + "epoch": 0.7, + "grad_norm": 0.6434874856143814, + "learning_rate": 7.553114226599595e-06, + "loss": 0.5571, + "step": 5487 + }, + { + "epoch": 0.7, + "grad_norm": 0.6323723664848917, + "learning_rate": 7.552227209479794e-06, + "loss": 0.5305, + "step": 5488 + }, + { + "epoch": 0.7, + "grad_norm": 0.7267461648954927, + "learning_rate": 7.5513400837198355e-06, + "loss": 0.5722, + "step": 5489 + }, + { + "epoch": 0.7, + "grad_norm": 0.6671882548083844, + "learning_rate": 7.550452849357484e-06, + "loss": 0.6023, + "step": 5490 + }, + { + "epoch": 0.7, + "grad_norm": 0.6674776173780151, + "learning_rate": 7.5495655064305025e-06, + "loss": 0.5253, + "step": 5491 + }, + { + "epoch": 0.7, + "grad_norm": 0.746430621867441, + "learning_rate": 7.548678054976666e-06, + "loss": 0.6356, + "step": 5492 + }, + { + "epoch": 0.7, + "grad_norm": 0.6746456754429112, + "learning_rate": 7.54779049503375e-06, + "loss": 0.5319, + "step": 5493 + }, + { + "epoch": 0.7, + "grad_norm": 0.5909693788870325, + "learning_rate": 7.546902826639533e-06, + "loss": 0.5263, + "step": 5494 + }, + { + "epoch": 0.7, + "grad_norm": 0.6822441732105623, + "learning_rate": 7.546015049831802e-06, + "loss": 0.5531, + "step": 5495 + }, + { + "epoch": 0.7, + "grad_norm": 0.8611769642948816, + "learning_rate": 7.545127164648347e-06, + "loss": 0.6259, + "step": 5496 + }, + { + "epoch": 0.7, + "grad_norm": 0.6240254873315566, + "learning_rate": 7.544239171126959e-06, + "loss": 0.5498, + "step": 5497 + }, + { + "epoch": 0.7, + "grad_norm": 0.6247896324463584, + "learning_rate": 7.5433510693054425e-06, + "loss": 0.56, + "step": 5498 + }, + { + "epoch": 0.7, + "grad_norm": 0.8005416297048783, + "learning_rate": 7.542462859221597e-06, + "loss": 0.6389, + "step": 5499 + }, + { + "epoch": 0.7, + "grad_norm": 0.5355148967485481, + "learning_rate": 7.541574540913232e-06, + "loss": 0.52, + "step": 5500 + }, + { + "epoch": 0.7, + "grad_norm": 0.7439451661989224, + "learning_rate": 7.5406861144181606e-06, + "loss": 0.5985, + "step": 5501 + }, + { + "epoch": 0.7, + "grad_norm": 0.9114322341961698, + "learning_rate": 7.539797579774201e-06, + "loss": 0.6614, + "step": 5502 + }, + { + "epoch": 0.7, + "grad_norm": 0.7011849852364572, + "learning_rate": 7.538908937019174e-06, + "loss": 0.6, + "step": 5503 + }, + { + "epoch": 0.7, + "grad_norm": 0.5877618936349194, + "learning_rate": 7.538020186190907e-06, + "loss": 0.5305, + "step": 5504 + }, + { + "epoch": 0.7, + "grad_norm": 0.7128670619590266, + "learning_rate": 7.537131327327229e-06, + "loss": 0.5942, + "step": 5505 + }, + { + "epoch": 0.7, + "grad_norm": 0.6583241096129865, + "learning_rate": 7.536242360465979e-06, + "loss": 0.5493, + "step": 5506 + }, + { + "epoch": 0.7, + "grad_norm": 0.7975469748019888, + "learning_rate": 7.535353285644998e-06, + "loss": 0.627, + "step": 5507 + }, + { + "epoch": 0.7, + "grad_norm": 0.7146920127920903, + "learning_rate": 7.534464102902129e-06, + "loss": 0.6266, + "step": 5508 + }, + { + "epoch": 0.7, + "grad_norm": 0.7334534430237059, + "learning_rate": 7.53357481227522e-06, + "loss": 0.5772, + "step": 5509 + }, + { + "epoch": 0.7, + "grad_norm": 0.9073657062311751, + "learning_rate": 7.53268541380213e-06, + "loss": 0.6741, + "step": 5510 + }, + { + "epoch": 0.7, + "grad_norm": 0.6777630299392265, + "learning_rate": 7.531795907520714e-06, + "loss": 0.5558, + "step": 5511 + }, + { + "epoch": 0.7, + "grad_norm": 0.6457095570061134, + "learning_rate": 7.530906293468837e-06, + "loss": 0.5385, + "step": 5512 + }, + { + "epoch": 0.7, + "grad_norm": 0.6152226963370656, + "learning_rate": 7.530016571684366e-06, + "loss": 0.5043, + "step": 5513 + }, + { + "epoch": 0.7, + "grad_norm": 0.642558496011311, + "learning_rate": 7.529126742205176e-06, + "loss": 0.5459, + "step": 5514 + }, + { + "epoch": 0.7, + "grad_norm": 0.5746935789627408, + "learning_rate": 7.5282368050691405e-06, + "loss": 0.555, + "step": 5515 + }, + { + "epoch": 0.7, + "grad_norm": 0.6245333402941172, + "learning_rate": 7.527346760314145e-06, + "loss": 0.5692, + "step": 5516 + }, + { + "epoch": 0.7, + "grad_norm": 0.6533041992053598, + "learning_rate": 7.526456607978072e-06, + "loss": 0.5658, + "step": 5517 + }, + { + "epoch": 0.7, + "grad_norm": 0.7423793547101148, + "learning_rate": 7.5255663480988165e-06, + "loss": 0.5703, + "step": 5518 + }, + { + "epoch": 0.7, + "grad_norm": 0.5864841659833234, + "learning_rate": 7.524675980714272e-06, + "loss": 0.5449, + "step": 5519 + }, + { + "epoch": 0.7, + "grad_norm": 0.6568467936774076, + "learning_rate": 7.523785505862339e-06, + "loss": 0.4968, + "step": 5520 + }, + { + "epoch": 0.7, + "grad_norm": 0.5663009235343691, + "learning_rate": 7.5228949235809205e-06, + "loss": 0.4981, + "step": 5521 + }, + { + "epoch": 0.7, + "grad_norm": 0.5477631262774487, + "learning_rate": 7.522004233907927e-06, + "loss": 0.5194, + "step": 5522 + }, + { + "epoch": 0.7, + "grad_norm": 0.767181909781016, + "learning_rate": 7.521113436881273e-06, + "loss": 0.5831, + "step": 5523 + }, + { + "epoch": 0.7, + "grad_norm": 0.93607736525853, + "learning_rate": 7.520222532538877e-06, + "loss": 0.5758, + "step": 5524 + }, + { + "epoch": 0.7, + "grad_norm": 0.7008607764510454, + "learning_rate": 7.519331520918662e-06, + "loss": 0.5337, + "step": 5525 + }, + { + "epoch": 0.7, + "grad_norm": 0.7891733069469153, + "learning_rate": 7.518440402058553e-06, + "loss": 0.641, + "step": 5526 + }, + { + "epoch": 0.7, + "grad_norm": 0.8685535532634578, + "learning_rate": 7.517549175996485e-06, + "loss": 0.6258, + "step": 5527 + }, + { + "epoch": 0.7, + "grad_norm": 0.755134567623352, + "learning_rate": 7.5166578427703914e-06, + "loss": 0.637, + "step": 5528 + }, + { + "epoch": 0.7, + "grad_norm": 0.7420557935642936, + "learning_rate": 7.5157664024182174e-06, + "loss": 0.5485, + "step": 5529 + }, + { + "epoch": 0.7, + "grad_norm": 0.59662029397398, + "learning_rate": 7.5148748549779075e-06, + "loss": 0.5874, + "step": 5530 + }, + { + "epoch": 0.7, + "grad_norm": 0.693248989669816, + "learning_rate": 7.51398320048741e-06, + "loss": 0.556, + "step": 5531 + }, + { + "epoch": 0.7, + "grad_norm": 0.6317669816606988, + "learning_rate": 7.513091438984682e-06, + "loss": 0.5348, + "step": 5532 + }, + { + "epoch": 0.7, + "grad_norm": 0.7504080633376384, + "learning_rate": 7.5121995705076825e-06, + "loss": 0.5211, + "step": 5533 + }, + { + "epoch": 0.71, + "grad_norm": 0.5733965546187132, + "learning_rate": 7.511307595094375e-06, + "loss": 0.4929, + "step": 5534 + }, + { + "epoch": 0.71, + "grad_norm": 0.7869138745795831, + "learning_rate": 7.5104155127827275e-06, + "loss": 0.5317, + "step": 5535 + }, + { + "epoch": 0.71, + "grad_norm": 0.7438322500729675, + "learning_rate": 7.509523323610713e-06, + "loss": 0.6496, + "step": 5536 + }, + { + "epoch": 0.71, + "grad_norm": 0.577448630036473, + "learning_rate": 7.508631027616311e-06, + "loss": 0.5282, + "step": 5537 + }, + { + "epoch": 0.71, + "grad_norm": 0.6927512837277563, + "learning_rate": 7.507738624837502e-06, + "loss": 0.6542, + "step": 5538 + }, + { + "epoch": 0.71, + "grad_norm": 0.7953838617113616, + "learning_rate": 7.506846115312274e-06, + "loss": 0.634, + "step": 5539 + }, + { + "epoch": 0.71, + "grad_norm": 0.6284276738644262, + "learning_rate": 7.505953499078618e-06, + "loss": 0.5649, + "step": 5540 + }, + { + "epoch": 0.71, + "grad_norm": 0.7195176283967624, + "learning_rate": 7.505060776174529e-06, + "loss": 0.6298, + "step": 5541 + }, + { + "epoch": 0.71, + "grad_norm": 0.7861746368351229, + "learning_rate": 7.504167946638007e-06, + "loss": 0.5821, + "step": 5542 + }, + { + "epoch": 0.71, + "grad_norm": 0.7098625392169347, + "learning_rate": 7.503275010507058e-06, + "loss": 0.5195, + "step": 5543 + }, + { + "epoch": 0.71, + "grad_norm": 0.7370578608999878, + "learning_rate": 7.502381967819692e-06, + "loss": 0.6558, + "step": 5544 + }, + { + "epoch": 0.71, + "grad_norm": 0.6854933639347381, + "learning_rate": 7.501488818613921e-06, + "loss": 0.5144, + "step": 5545 + }, + { + "epoch": 0.71, + "grad_norm": 0.6374023047684221, + "learning_rate": 7.500595562927764e-06, + "loss": 0.5638, + "step": 5546 + }, + { + "epoch": 0.71, + "grad_norm": 0.7846165899240831, + "learning_rate": 7.499702200799246e-06, + "loss": 0.602, + "step": 5547 + }, + { + "epoch": 0.71, + "grad_norm": 0.7775624447052985, + "learning_rate": 7.4988087322663936e-06, + "loss": 0.5535, + "step": 5548 + }, + { + "epoch": 0.71, + "grad_norm": 0.86649198116926, + "learning_rate": 7.497915157367237e-06, + "loss": 0.6449, + "step": 5549 + }, + { + "epoch": 0.71, + "grad_norm": 0.6199681158813892, + "learning_rate": 7.497021476139816e-06, + "loss": 0.5017, + "step": 5550 + }, + { + "epoch": 0.71, + "grad_norm": 0.5509487381920122, + "learning_rate": 7.496127688622169e-06, + "loss": 0.5233, + "step": 5551 + }, + { + "epoch": 0.71, + "grad_norm": 0.5544781377612417, + "learning_rate": 7.495233794852342e-06, + "loss": 0.5086, + "step": 5552 + }, + { + "epoch": 0.71, + "grad_norm": 0.5923297570546956, + "learning_rate": 7.494339794868388e-06, + "loss": 0.5406, + "step": 5553 + }, + { + "epoch": 0.71, + "grad_norm": 0.7484659934821983, + "learning_rate": 7.493445688708358e-06, + "loss": 0.5456, + "step": 5554 + }, + { + "epoch": 0.71, + "grad_norm": 0.6575814254828445, + "learning_rate": 7.492551476410314e-06, + "loss": 0.5076, + "step": 5555 + }, + { + "epoch": 0.71, + "grad_norm": 0.6520039793260324, + "learning_rate": 7.491657158012319e-06, + "loss": 0.4847, + "step": 5556 + }, + { + "epoch": 0.71, + "grad_norm": 0.8301138512520254, + "learning_rate": 7.4907627335524405e-06, + "loss": 0.6799, + "step": 5557 + }, + { + "epoch": 0.71, + "grad_norm": 0.7741470180064739, + "learning_rate": 7.489868203068752e-06, + "loss": 0.5918, + "step": 5558 + }, + { + "epoch": 0.71, + "grad_norm": 0.7109115392808868, + "learning_rate": 7.488973566599329e-06, + "loss": 0.6069, + "step": 5559 + }, + { + "epoch": 0.71, + "grad_norm": 0.767178787813936, + "learning_rate": 7.488078824182257e-06, + "loss": 0.576, + "step": 5560 + }, + { + "epoch": 0.71, + "grad_norm": 0.581246501876493, + "learning_rate": 7.4871839758556185e-06, + "loss": 0.5452, + "step": 5561 + }, + { + "epoch": 0.71, + "grad_norm": 0.8460346470153032, + "learning_rate": 7.4862890216575065e-06, + "loss": 0.6142, + "step": 5562 + }, + { + "epoch": 0.71, + "grad_norm": 0.8801084456986683, + "learning_rate": 7.4853939616260174e-06, + "loss": 0.5992, + "step": 5563 + }, + { + "epoch": 0.71, + "grad_norm": 0.7579932606957567, + "learning_rate": 7.4844987957992485e-06, + "loss": 0.6464, + "step": 5564 + }, + { + "epoch": 0.71, + "grad_norm": 0.6864144690084117, + "learning_rate": 7.483603524215303e-06, + "loss": 0.5825, + "step": 5565 + }, + { + "epoch": 0.71, + "grad_norm": 0.8509339686622452, + "learning_rate": 7.482708146912295e-06, + "loss": 0.6108, + "step": 5566 + }, + { + "epoch": 0.71, + "grad_norm": 0.555737422673903, + "learning_rate": 7.481812663928334e-06, + "loss": 0.5675, + "step": 5567 + }, + { + "epoch": 0.71, + "grad_norm": 0.7807117691246142, + "learning_rate": 7.4809170753015395e-06, + "loss": 0.6354, + "step": 5568 + }, + { + "epoch": 0.71, + "grad_norm": 0.6876108956454414, + "learning_rate": 7.480021381070032e-06, + "loss": 0.5911, + "step": 5569 + }, + { + "epoch": 0.71, + "grad_norm": 0.5522113252313763, + "learning_rate": 7.479125581271939e-06, + "loss": 0.4529, + "step": 5570 + }, + { + "epoch": 0.71, + "grad_norm": 0.5929854641676827, + "learning_rate": 7.478229675945392e-06, + "loss": 0.4925, + "step": 5571 + }, + { + "epoch": 0.71, + "grad_norm": 0.7239031681381652, + "learning_rate": 7.477333665128526e-06, + "loss": 0.6061, + "step": 5572 + }, + { + "epoch": 0.71, + "grad_norm": 0.6463245893762213, + "learning_rate": 7.4764375488594855e-06, + "loss": 0.5464, + "step": 5573 + }, + { + "epoch": 0.71, + "grad_norm": 0.7083116479145974, + "learning_rate": 7.47554132717641e-06, + "loss": 0.5667, + "step": 5574 + }, + { + "epoch": 0.71, + "grad_norm": 0.8719949481149875, + "learning_rate": 7.474645000117451e-06, + "loss": 0.6517, + "step": 5575 + }, + { + "epoch": 0.71, + "grad_norm": 0.5770683532298139, + "learning_rate": 7.473748567720762e-06, + "loss": 0.5132, + "step": 5576 + }, + { + "epoch": 0.71, + "grad_norm": 0.7199319889513051, + "learning_rate": 7.472852030024501e-06, + "loss": 0.5643, + "step": 5577 + }, + { + "epoch": 0.71, + "grad_norm": 0.6908856397956292, + "learning_rate": 7.4719553870668324e-06, + "loss": 0.5836, + "step": 5578 + }, + { + "epoch": 0.71, + "grad_norm": 0.6841647098460729, + "learning_rate": 7.4710586388859215e-06, + "loss": 0.6108, + "step": 5579 + }, + { + "epoch": 0.71, + "grad_norm": 0.5849911087359764, + "learning_rate": 7.470161785519942e-06, + "loss": 0.5901, + "step": 5580 + }, + { + "epoch": 0.71, + "grad_norm": 0.6602073844857029, + "learning_rate": 7.469264827007068e-06, + "loss": 0.5685, + "step": 5581 + }, + { + "epoch": 0.71, + "grad_norm": 0.6523554592045353, + "learning_rate": 7.46836776338548e-06, + "loss": 0.5861, + "step": 5582 + }, + { + "epoch": 0.71, + "grad_norm": 0.6890021902397042, + "learning_rate": 7.467470594693364e-06, + "loss": 0.5016, + "step": 5583 + }, + { + "epoch": 0.71, + "grad_norm": 0.6401967613016705, + "learning_rate": 7.466573320968912e-06, + "loss": 0.508, + "step": 5584 + }, + { + "epoch": 0.71, + "grad_norm": 0.6570991772661504, + "learning_rate": 7.465675942250314e-06, + "loss": 0.5759, + "step": 5585 + }, + { + "epoch": 0.71, + "grad_norm": 1.0630567084386817, + "learning_rate": 7.464778458575771e-06, + "loss": 0.6719, + "step": 5586 + }, + { + "epoch": 0.71, + "grad_norm": 0.6374844016034613, + "learning_rate": 7.4638808699834855e-06, + "loss": 0.5183, + "step": 5587 + }, + { + "epoch": 0.71, + "grad_norm": 0.5731688138145928, + "learning_rate": 7.462983176511663e-06, + "loss": 0.4934, + "step": 5588 + }, + { + "epoch": 0.71, + "grad_norm": 0.9017871195801972, + "learning_rate": 7.462085378198519e-06, + "loss": 0.5813, + "step": 5589 + }, + { + "epoch": 0.71, + "grad_norm": 0.6838869058084027, + "learning_rate": 7.4611874750822675e-06, + "loss": 0.5739, + "step": 5590 + }, + { + "epoch": 0.71, + "grad_norm": 0.7481473094848149, + "learning_rate": 7.460289467201129e-06, + "loss": 0.6227, + "step": 5591 + }, + { + "epoch": 0.71, + "grad_norm": 0.7552485024500268, + "learning_rate": 7.459391354593333e-06, + "loss": 0.5599, + "step": 5592 + }, + { + "epoch": 0.71, + "grad_norm": 0.6282533925896012, + "learning_rate": 7.458493137297103e-06, + "loss": 0.5235, + "step": 5593 + }, + { + "epoch": 0.71, + "grad_norm": 0.6166800626199566, + "learning_rate": 7.457594815350678e-06, + "loss": 0.5384, + "step": 5594 + }, + { + "epoch": 0.71, + "grad_norm": 0.7920820713629735, + "learning_rate": 7.456696388792295e-06, + "loss": 0.5468, + "step": 5595 + }, + { + "epoch": 0.71, + "grad_norm": 0.7346765132397483, + "learning_rate": 7.455797857660196e-06, + "loss": 0.536, + "step": 5596 + }, + { + "epoch": 0.71, + "grad_norm": 0.6430437289013298, + "learning_rate": 7.454899221992632e-06, + "loss": 0.4796, + "step": 5597 + }, + { + "epoch": 0.71, + "grad_norm": 0.6721675578508051, + "learning_rate": 7.454000481827851e-06, + "loss": 0.5873, + "step": 5598 + }, + { + "epoch": 0.71, + "grad_norm": 0.6096008981029728, + "learning_rate": 7.453101637204111e-06, + "loss": 0.5323, + "step": 5599 + }, + { + "epoch": 0.71, + "grad_norm": 0.5925943527999598, + "learning_rate": 7.452202688159674e-06, + "loss": 0.5041, + "step": 5600 + }, + { + "epoch": 0.71, + "grad_norm": 0.5728460009147941, + "learning_rate": 7.451303634732805e-06, + "loss": 0.5218, + "step": 5601 + }, + { + "epoch": 0.71, + "grad_norm": 0.6973336160605523, + "learning_rate": 7.450404476961773e-06, + "loss": 0.6362, + "step": 5602 + }, + { + "epoch": 0.71, + "grad_norm": 0.7958766218236931, + "learning_rate": 7.449505214884853e-06, + "loss": 0.6399, + "step": 5603 + }, + { + "epoch": 0.71, + "grad_norm": 0.6441371673883424, + "learning_rate": 7.448605848540324e-06, + "loss": 0.5445, + "step": 5604 + }, + { + "epoch": 0.71, + "grad_norm": 0.5541922008080312, + "learning_rate": 7.447706377966469e-06, + "loss": 0.4974, + "step": 5605 + }, + { + "epoch": 0.71, + "grad_norm": 0.7344791006261536, + "learning_rate": 7.446806803201574e-06, + "loss": 0.6002, + "step": 5606 + }, + { + "epoch": 0.71, + "grad_norm": 0.7050838857584212, + "learning_rate": 7.445907124283933e-06, + "loss": 0.5986, + "step": 5607 + }, + { + "epoch": 0.71, + "grad_norm": 0.7662848114017572, + "learning_rate": 7.445007341251841e-06, + "loss": 0.6198, + "step": 5608 + }, + { + "epoch": 0.71, + "grad_norm": 0.9430902788951556, + "learning_rate": 7.444107454143601e-06, + "loss": 0.6372, + "step": 5609 + }, + { + "epoch": 0.71, + "grad_norm": 0.7552672058105092, + "learning_rate": 7.443207462997515e-06, + "loss": 0.6099, + "step": 5610 + }, + { + "epoch": 0.71, + "grad_norm": 0.6819261497740676, + "learning_rate": 7.442307367851897e-06, + "loss": 0.55, + "step": 5611 + }, + { + "epoch": 0.71, + "grad_norm": 0.7249374279700982, + "learning_rate": 7.441407168745056e-06, + "loss": 0.5955, + "step": 5612 + }, + { + "epoch": 0.72, + "grad_norm": 0.6992416001775748, + "learning_rate": 7.440506865715316e-06, + "loss": 0.5949, + "step": 5613 + }, + { + "epoch": 0.72, + "grad_norm": 0.6546698374596573, + "learning_rate": 7.439606458800995e-06, + "loss": 0.4977, + "step": 5614 + }, + { + "epoch": 0.72, + "grad_norm": 0.7400243103854021, + "learning_rate": 7.438705948040426e-06, + "loss": 0.641, + "step": 5615 + }, + { + "epoch": 0.72, + "grad_norm": 0.5775259899719023, + "learning_rate": 7.4378053334719345e-06, + "loss": 0.5262, + "step": 5616 + }, + { + "epoch": 0.72, + "grad_norm": 0.5750151254313061, + "learning_rate": 7.436904615133862e-06, + "loss": 0.4882, + "step": 5617 + }, + { + "epoch": 0.72, + "grad_norm": 0.7503319692774794, + "learning_rate": 7.436003793064548e-06, + "loss": 0.5577, + "step": 5618 + }, + { + "epoch": 0.72, + "grad_norm": 0.577851017085559, + "learning_rate": 7.435102867302335e-06, + "loss": 0.5084, + "step": 5619 + }, + { + "epoch": 0.72, + "grad_norm": 0.7988919347178581, + "learning_rate": 7.434201837885576e-06, + "loss": 0.6207, + "step": 5620 + }, + { + "epoch": 0.72, + "grad_norm": 0.6414598371822323, + "learning_rate": 7.433300704852622e-06, + "loss": 0.5151, + "step": 5621 + }, + { + "epoch": 0.72, + "grad_norm": 0.6155064305103313, + "learning_rate": 7.432399468241833e-06, + "loss": 0.5176, + "step": 5622 + }, + { + "epoch": 0.72, + "grad_norm": 0.6295500286375426, + "learning_rate": 7.431498128091572e-06, + "loss": 0.5537, + "step": 5623 + }, + { + "epoch": 0.72, + "grad_norm": 0.8794832337445059, + "learning_rate": 7.4305966844402055e-06, + "loss": 0.6861, + "step": 5624 + }, + { + "epoch": 0.72, + "grad_norm": 0.7617276757271866, + "learning_rate": 7.429695137326105e-06, + "loss": 0.642, + "step": 5625 + }, + { + "epoch": 0.72, + "grad_norm": 0.597706565973292, + "learning_rate": 7.428793486787648e-06, + "loss": 0.5264, + "step": 5626 + }, + { + "epoch": 0.72, + "grad_norm": 0.6784040452151213, + "learning_rate": 7.427891732863212e-06, + "loss": 0.5605, + "step": 5627 + }, + { + "epoch": 0.72, + "grad_norm": 0.6174673738110383, + "learning_rate": 7.426989875591183e-06, + "loss": 0.5353, + "step": 5628 + }, + { + "epoch": 0.72, + "grad_norm": 0.6309831888077089, + "learning_rate": 7.426087915009952e-06, + "loss": 0.5295, + "step": 5629 + }, + { + "epoch": 0.72, + "grad_norm": 0.5773689344234675, + "learning_rate": 7.42518585115791e-06, + "loss": 0.5117, + "step": 5630 + }, + { + "epoch": 0.72, + "grad_norm": 0.6580223975721441, + "learning_rate": 7.424283684073456e-06, + "loss": 0.5813, + "step": 5631 + }, + { + "epoch": 0.72, + "grad_norm": 0.7694103118508562, + "learning_rate": 7.423381413794995e-06, + "loss": 0.6048, + "step": 5632 + }, + { + "epoch": 0.72, + "grad_norm": 0.7798427890138921, + "learning_rate": 7.422479040360928e-06, + "loss": 0.593, + "step": 5633 + }, + { + "epoch": 0.72, + "grad_norm": 0.5912828207632826, + "learning_rate": 7.42157656380967e-06, + "loss": 0.476, + "step": 5634 + }, + { + "epoch": 0.72, + "grad_norm": 0.6336847705950144, + "learning_rate": 7.4206739841796375e-06, + "loss": 0.4985, + "step": 5635 + }, + { + "epoch": 0.72, + "grad_norm": 0.8028719077312637, + "learning_rate": 7.419771301509248e-06, + "loss": 0.632, + "step": 5636 + }, + { + "epoch": 0.72, + "grad_norm": 0.667322966161425, + "learning_rate": 7.418868515836927e-06, + "loss": 0.5008, + "step": 5637 + }, + { + "epoch": 0.72, + "grad_norm": 0.5994255309463825, + "learning_rate": 7.4179656272011025e-06, + "loss": 0.5302, + "step": 5638 + }, + { + "epoch": 0.72, + "grad_norm": 0.7060310675464113, + "learning_rate": 7.417062635640207e-06, + "loss": 0.5395, + "step": 5639 + }, + { + "epoch": 0.72, + "grad_norm": 0.6671501879542686, + "learning_rate": 7.4161595411926804e-06, + "loss": 0.5724, + "step": 5640 + }, + { + "epoch": 0.72, + "grad_norm": 0.6086150095679586, + "learning_rate": 7.415256343896963e-06, + "loss": 0.5083, + "step": 5641 + }, + { + "epoch": 0.72, + "grad_norm": 0.67659508707042, + "learning_rate": 7.4143530437915e-06, + "loss": 0.53, + "step": 5642 + }, + { + "epoch": 0.72, + "grad_norm": 0.5930453102751836, + "learning_rate": 7.413449640914744e-06, + "loss": 0.5051, + "step": 5643 + }, + { + "epoch": 0.72, + "grad_norm": 0.6773802712835152, + "learning_rate": 7.4125461353051495e-06, + "loss": 0.6126, + "step": 5644 + }, + { + "epoch": 0.72, + "grad_norm": 0.638229018001181, + "learning_rate": 7.411642527001174e-06, + "loss": 0.53, + "step": 5645 + }, + { + "epoch": 0.72, + "grad_norm": 0.5765617116730168, + "learning_rate": 7.410738816041283e-06, + "loss": 0.5203, + "step": 5646 + }, + { + "epoch": 0.72, + "grad_norm": 0.7021036191079978, + "learning_rate": 7.409835002463946e-06, + "loss": 0.56, + "step": 5647 + }, + { + "epoch": 0.72, + "grad_norm": 0.6326871301716849, + "learning_rate": 7.408931086307631e-06, + "loss": 0.5378, + "step": 5648 + }, + { + "epoch": 0.72, + "grad_norm": 0.6308750172678016, + "learning_rate": 7.408027067610819e-06, + "loss": 0.5477, + "step": 5649 + }, + { + "epoch": 0.72, + "grad_norm": 0.6086451695692009, + "learning_rate": 7.40712294641199e-06, + "loss": 0.5381, + "step": 5650 + }, + { + "epoch": 0.72, + "grad_norm": 0.6221308692310638, + "learning_rate": 7.406218722749629e-06, + "loss": 0.548, + "step": 5651 + }, + { + "epoch": 0.72, + "grad_norm": 0.7823893248720007, + "learning_rate": 7.405314396662224e-06, + "loss": 0.5919, + "step": 5652 + }, + { + "epoch": 0.72, + "grad_norm": 0.5910429812763102, + "learning_rate": 7.404409968188274e-06, + "loss": 0.4926, + "step": 5653 + }, + { + "epoch": 0.72, + "grad_norm": 0.845404297963058, + "learning_rate": 7.403505437366274e-06, + "loss": 0.6041, + "step": 5654 + }, + { + "epoch": 0.72, + "grad_norm": 0.6082515530765161, + "learning_rate": 7.40260080423473e-06, + "loss": 0.568, + "step": 5655 + }, + { + "epoch": 0.72, + "grad_norm": 0.5486652756234054, + "learning_rate": 7.401696068832145e-06, + "loss": 0.4863, + "step": 5656 + }, + { + "epoch": 0.72, + "grad_norm": 0.6023368850152669, + "learning_rate": 7.400791231197034e-06, + "loss": 0.5219, + "step": 5657 + }, + { + "epoch": 0.72, + "grad_norm": 0.6377601372870552, + "learning_rate": 7.399886291367913e-06, + "loss": 0.5574, + "step": 5658 + }, + { + "epoch": 0.72, + "grad_norm": 0.7918680740193286, + "learning_rate": 7.398981249383299e-06, + "loss": 0.5839, + "step": 5659 + }, + { + "epoch": 0.72, + "grad_norm": 0.5790747823396108, + "learning_rate": 7.398076105281722e-06, + "loss": 0.5479, + "step": 5660 + }, + { + "epoch": 0.72, + "grad_norm": 0.7643797573132838, + "learning_rate": 7.397170859101705e-06, + "loss": 0.6538, + "step": 5661 + }, + { + "epoch": 0.72, + "grad_norm": 0.6298632808110909, + "learning_rate": 7.396265510881788e-06, + "loss": 0.5053, + "step": 5662 + }, + { + "epoch": 0.72, + "grad_norm": 0.6966163507054176, + "learning_rate": 7.3953600606605035e-06, + "loss": 0.5453, + "step": 5663 + }, + { + "epoch": 0.72, + "grad_norm": 0.6567215107829254, + "learning_rate": 7.394454508476397e-06, + "loss": 0.5479, + "step": 5664 + }, + { + "epoch": 0.72, + "grad_norm": 0.5775030708547746, + "learning_rate": 7.393548854368014e-06, + "loss": 0.4355, + "step": 5665 + }, + { + "epoch": 0.72, + "grad_norm": 0.7233305176706283, + "learning_rate": 7.392643098373904e-06, + "loss": 0.5721, + "step": 5666 + }, + { + "epoch": 0.72, + "grad_norm": 1.1049064980849475, + "learning_rate": 7.391737240532624e-06, + "loss": 0.6325, + "step": 5667 + }, + { + "epoch": 0.72, + "grad_norm": 1.2527277130532108, + "learning_rate": 7.390831280882732e-06, + "loss": 0.6771, + "step": 5668 + }, + { + "epoch": 0.72, + "grad_norm": 0.8047751973395623, + "learning_rate": 7.389925219462792e-06, + "loss": 0.5585, + "step": 5669 + }, + { + "epoch": 0.72, + "grad_norm": 0.8064024223119045, + "learning_rate": 7.389019056311371e-06, + "loss": 0.6413, + "step": 5670 + }, + { + "epoch": 0.72, + "grad_norm": 0.589140462294574, + "learning_rate": 7.3881127914670455e-06, + "loss": 0.5318, + "step": 5671 + }, + { + "epoch": 0.72, + "grad_norm": 0.6241907795546442, + "learning_rate": 7.38720642496839e-06, + "loss": 0.5404, + "step": 5672 + }, + { + "epoch": 0.72, + "grad_norm": 0.6073188246300422, + "learning_rate": 7.386299956853984e-06, + "loss": 0.5196, + "step": 5673 + }, + { + "epoch": 0.72, + "grad_norm": 0.5545529651076304, + "learning_rate": 7.385393387162415e-06, + "loss": 0.5349, + "step": 5674 + }, + { + "epoch": 0.72, + "grad_norm": 0.7124743450162648, + "learning_rate": 7.384486715932272e-06, + "loss": 0.5394, + "step": 5675 + }, + { + "epoch": 0.72, + "grad_norm": 0.5928389882833025, + "learning_rate": 7.383579943202148e-06, + "loss": 0.5678, + "step": 5676 + }, + { + "epoch": 0.72, + "grad_norm": 0.7692055633192265, + "learning_rate": 7.382673069010644e-06, + "loss": 0.5883, + "step": 5677 + }, + { + "epoch": 0.72, + "grad_norm": 0.7938091410107768, + "learning_rate": 7.381766093396362e-06, + "loss": 0.6544, + "step": 5678 + }, + { + "epoch": 0.72, + "grad_norm": 0.6892771854821916, + "learning_rate": 7.380859016397908e-06, + "loss": 0.521, + "step": 5679 + }, + { + "epoch": 0.72, + "grad_norm": 0.5692625926593775, + "learning_rate": 7.379951838053894e-06, + "loss": 0.5834, + "step": 5680 + }, + { + "epoch": 0.72, + "grad_norm": 0.6058655119416744, + "learning_rate": 7.3790445584029345e-06, + "loss": 0.5688, + "step": 5681 + }, + { + "epoch": 0.72, + "grad_norm": 0.5780249326111432, + "learning_rate": 7.378137177483651e-06, + "loss": 0.5268, + "step": 5682 + }, + { + "epoch": 0.72, + "grad_norm": 0.6595996301171776, + "learning_rate": 7.377229695334667e-06, + "loss": 0.5472, + "step": 5683 + }, + { + "epoch": 0.72, + "grad_norm": 0.8328354287000123, + "learning_rate": 7.376322111994612e-06, + "loss": 0.6481, + "step": 5684 + }, + { + "epoch": 0.72, + "grad_norm": 0.606581976116039, + "learning_rate": 7.375414427502117e-06, + "loss": 0.5033, + "step": 5685 + }, + { + "epoch": 0.72, + "grad_norm": 1.12974718942241, + "learning_rate": 7.374506641895822e-06, + "loss": 0.5692, + "step": 5686 + }, + { + "epoch": 0.72, + "grad_norm": 0.8017090667085868, + "learning_rate": 7.373598755214367e-06, + "loss": 0.5877, + "step": 5687 + }, + { + "epoch": 0.72, + "grad_norm": 0.8565991649038176, + "learning_rate": 7.3726907674963975e-06, + "loss": 0.6385, + "step": 5688 + }, + { + "epoch": 0.72, + "grad_norm": 0.9003083774224998, + "learning_rate": 7.371782678780563e-06, + "loss": 0.6021, + "step": 5689 + }, + { + "epoch": 0.72, + "grad_norm": 0.8611439299483281, + "learning_rate": 7.370874489105521e-06, + "loss": 0.6344, + "step": 5690 + }, + { + "epoch": 0.73, + "grad_norm": 0.7747739841680015, + "learning_rate": 7.369966198509927e-06, + "loss": 0.6773, + "step": 5691 + }, + { + "epoch": 0.73, + "grad_norm": 0.5755643231676336, + "learning_rate": 7.369057807032446e-06, + "loss": 0.5079, + "step": 5692 + }, + { + "epoch": 0.73, + "grad_norm": 0.5969373227799505, + "learning_rate": 7.368149314711745e-06, + "loss": 0.5407, + "step": 5693 + }, + { + "epoch": 0.73, + "grad_norm": 0.6443221645439203, + "learning_rate": 7.367240721586493e-06, + "loss": 0.609, + "step": 5694 + }, + { + "epoch": 0.73, + "grad_norm": 0.8416427703897001, + "learning_rate": 7.3663320276953695e-06, + "loss": 0.6508, + "step": 5695 + }, + { + "epoch": 0.73, + "grad_norm": 1.641847549599186, + "learning_rate": 7.3654232330770535e-06, + "loss": 0.6177, + "step": 5696 + }, + { + "epoch": 0.73, + "grad_norm": 0.582604822902364, + "learning_rate": 7.3645143377702284e-06, + "loss": 0.5219, + "step": 5697 + }, + { + "epoch": 0.73, + "grad_norm": 0.8213505261178368, + "learning_rate": 7.363605341813585e-06, + "loss": 0.5864, + "step": 5698 + }, + { + "epoch": 0.73, + "grad_norm": 0.6305030534520216, + "learning_rate": 7.362696245245815e-06, + "loss": 0.5217, + "step": 5699 + }, + { + "epoch": 0.73, + "grad_norm": 0.6773320674301949, + "learning_rate": 7.3617870481056165e-06, + "loss": 0.5949, + "step": 5700 + }, + { + "epoch": 0.73, + "grad_norm": 0.6919005551557165, + "learning_rate": 7.360877750431689e-06, + "loss": 0.4858, + "step": 5701 + }, + { + "epoch": 0.73, + "grad_norm": 0.5804239232357055, + "learning_rate": 7.3599683522627405e-06, + "loss": 0.5496, + "step": 5702 + }, + { + "epoch": 0.73, + "grad_norm": 0.6610032405258903, + "learning_rate": 7.3590588536374805e-06, + "loss": 0.4988, + "step": 5703 + }, + { + "epoch": 0.73, + "grad_norm": 0.8855582237760503, + "learning_rate": 7.358149254594624e-06, + "loss": 0.596, + "step": 5704 + }, + { + "epoch": 0.73, + "grad_norm": 0.5636212708018711, + "learning_rate": 7.35723955517289e-06, + "loss": 0.4766, + "step": 5705 + }, + { + "epoch": 0.73, + "grad_norm": 0.6337044899930441, + "learning_rate": 7.3563297554110005e-06, + "loss": 0.5057, + "step": 5706 + }, + { + "epoch": 0.73, + "grad_norm": 0.6345555559804692, + "learning_rate": 7.35541985534768e-06, + "loss": 0.5733, + "step": 5707 + }, + { + "epoch": 0.73, + "grad_norm": 0.7950576407901063, + "learning_rate": 7.354509855021667e-06, + "loss": 0.555, + "step": 5708 + }, + { + "epoch": 0.73, + "grad_norm": 0.6605766481544945, + "learning_rate": 7.353599754471692e-06, + "loss": 0.5253, + "step": 5709 + }, + { + "epoch": 0.73, + "grad_norm": 0.7430219107069737, + "learning_rate": 7.3526895537364965e-06, + "loss": 0.6322, + "step": 5710 + }, + { + "epoch": 0.73, + "grad_norm": 0.8479109464219405, + "learning_rate": 7.351779252854825e-06, + "loss": 0.6321, + "step": 5711 + }, + { + "epoch": 0.73, + "grad_norm": 0.677630186932484, + "learning_rate": 7.350868851865426e-06, + "loss": 0.5457, + "step": 5712 + }, + { + "epoch": 0.73, + "grad_norm": 0.9251062126657649, + "learning_rate": 7.349958350807052e-06, + "loss": 0.5952, + "step": 5713 + }, + { + "epoch": 0.73, + "grad_norm": 0.5326632761752162, + "learning_rate": 7.34904774971846e-06, + "loss": 0.5065, + "step": 5714 + }, + { + "epoch": 0.73, + "grad_norm": 0.6292827983477869, + "learning_rate": 7.348137048638412e-06, + "loss": 0.492, + "step": 5715 + }, + { + "epoch": 0.73, + "grad_norm": 0.5743666733056835, + "learning_rate": 7.347226247605673e-06, + "loss": 0.5412, + "step": 5716 + }, + { + "epoch": 0.73, + "grad_norm": 0.6139657094352567, + "learning_rate": 7.346315346659014e-06, + "loss": 0.514, + "step": 5717 + }, + { + "epoch": 0.73, + "grad_norm": 0.9149782753433823, + "learning_rate": 7.345404345837209e-06, + "loss": 0.5899, + "step": 5718 + }, + { + "epoch": 0.73, + "grad_norm": 0.7233615211723191, + "learning_rate": 7.344493245179035e-06, + "loss": 0.6705, + "step": 5719 + }, + { + "epoch": 0.73, + "grad_norm": 0.9639616574622281, + "learning_rate": 7.343582044723276e-06, + "loss": 0.5781, + "step": 5720 + }, + { + "epoch": 0.73, + "grad_norm": 0.5957331894293145, + "learning_rate": 7.3426707445087184e-06, + "loss": 0.4817, + "step": 5721 + }, + { + "epoch": 0.73, + "grad_norm": 0.7178498297909918, + "learning_rate": 7.341759344574152e-06, + "loss": 0.5127, + "step": 5722 + }, + { + "epoch": 0.73, + "grad_norm": 0.8664327293580654, + "learning_rate": 7.340847844958374e-06, + "loss": 0.6246, + "step": 5723 + }, + { + "epoch": 0.73, + "grad_norm": 0.8089286751783417, + "learning_rate": 7.339936245700185e-06, + "loss": 0.6295, + "step": 5724 + }, + { + "epoch": 0.73, + "grad_norm": 0.7828796500291207, + "learning_rate": 7.339024546838387e-06, + "loss": 0.6233, + "step": 5725 + }, + { + "epoch": 0.73, + "grad_norm": 0.7008235148702397, + "learning_rate": 7.338112748411788e-06, + "loss": 0.5881, + "step": 5726 + }, + { + "epoch": 0.73, + "grad_norm": 0.8967780490066428, + "learning_rate": 7.3372008504592004e-06, + "loss": 0.6183, + "step": 5727 + }, + { + "epoch": 0.73, + "grad_norm": 0.8810109259499682, + "learning_rate": 7.3362888530194424e-06, + "loss": 0.6314, + "step": 5728 + }, + { + "epoch": 0.73, + "grad_norm": 0.6175716689203922, + "learning_rate": 7.335376756131332e-06, + "loss": 0.4422, + "step": 5729 + }, + { + "epoch": 0.73, + "grad_norm": 0.7423435893389143, + "learning_rate": 7.334464559833696e-06, + "loss": 0.6334, + "step": 5730 + }, + { + "epoch": 0.73, + "grad_norm": 0.6636192297928335, + "learning_rate": 7.3335522641653646e-06, + "loss": 0.5319, + "step": 5731 + }, + { + "epoch": 0.73, + "grad_norm": 0.66616175987784, + "learning_rate": 7.33263986916517e-06, + "loss": 0.5524, + "step": 5732 + }, + { + "epoch": 0.73, + "grad_norm": 0.6475477908293895, + "learning_rate": 7.33172737487195e-06, + "loss": 0.5317, + "step": 5733 + }, + { + "epoch": 0.73, + "grad_norm": 0.7146391529957522, + "learning_rate": 7.330814781324547e-06, + "loss": 0.5766, + "step": 5734 + }, + { + "epoch": 0.73, + "grad_norm": 0.7788520304420267, + "learning_rate": 7.329902088561806e-06, + "loss": 0.6296, + "step": 5735 + }, + { + "epoch": 0.73, + "grad_norm": 0.6937233753639961, + "learning_rate": 7.328989296622581e-06, + "loss": 0.6616, + "step": 5736 + }, + { + "epoch": 0.73, + "grad_norm": 0.8307538059154279, + "learning_rate": 7.328076405545722e-06, + "loss": 0.5748, + "step": 5737 + }, + { + "epoch": 0.73, + "grad_norm": 0.9242323931259457, + "learning_rate": 7.327163415370089e-06, + "loss": 0.6153, + "step": 5738 + }, + { + "epoch": 0.73, + "grad_norm": 0.9070685116030577, + "learning_rate": 7.326250326134548e-06, + "loss": 0.6259, + "step": 5739 + }, + { + "epoch": 0.73, + "grad_norm": 0.7604768170118185, + "learning_rate": 7.325337137877963e-06, + "loss": 0.6053, + "step": 5740 + }, + { + "epoch": 0.73, + "grad_norm": 0.7182055685307104, + "learning_rate": 7.324423850639207e-06, + "loss": 0.536, + "step": 5741 + }, + { + "epoch": 0.73, + "grad_norm": 0.6482156691808076, + "learning_rate": 7.323510464457156e-06, + "loss": 0.4842, + "step": 5742 + }, + { + "epoch": 0.73, + "grad_norm": 0.7466382047438345, + "learning_rate": 7.322596979370689e-06, + "loss": 0.628, + "step": 5743 + }, + { + "epoch": 0.73, + "grad_norm": 0.6334318570349889, + "learning_rate": 7.321683395418691e-06, + "loss": 0.541, + "step": 5744 + }, + { + "epoch": 0.73, + "grad_norm": 0.5639089463078808, + "learning_rate": 7.320769712640048e-06, + "loss": 0.5475, + "step": 5745 + }, + { + "epoch": 0.73, + "grad_norm": 1.0410641630002657, + "learning_rate": 7.319855931073656e-06, + "loss": 0.6268, + "step": 5746 + }, + { + "epoch": 0.73, + "grad_norm": 0.9442298973443018, + "learning_rate": 7.318942050758411e-06, + "loss": 0.6891, + "step": 5747 + }, + { + "epoch": 0.73, + "grad_norm": 0.6236351149290799, + "learning_rate": 7.318028071733212e-06, + "loss": 0.5682, + "step": 5748 + }, + { + "epoch": 0.73, + "grad_norm": 0.6115715915318616, + "learning_rate": 7.317113994036967e-06, + "loss": 0.5258, + "step": 5749 + }, + { + "epoch": 0.73, + "grad_norm": 0.5883876225640409, + "learning_rate": 7.316199817708581e-06, + "loss": 0.5212, + "step": 5750 + }, + { + "epoch": 0.73, + "grad_norm": 0.5565977068422248, + "learning_rate": 7.315285542786972e-06, + "loss": 0.4633, + "step": 5751 + }, + { + "epoch": 0.73, + "grad_norm": 0.5742294527078042, + "learning_rate": 7.314371169311056e-06, + "loss": 0.4993, + "step": 5752 + }, + { + "epoch": 0.73, + "grad_norm": 0.7491745457961114, + "learning_rate": 7.313456697319754e-06, + "loss": 0.6878, + "step": 5753 + }, + { + "epoch": 0.73, + "grad_norm": 0.6489124135714357, + "learning_rate": 7.312542126851994e-06, + "loss": 0.5235, + "step": 5754 + }, + { + "epoch": 0.73, + "grad_norm": 0.662751538576484, + "learning_rate": 7.311627457946705e-06, + "loss": 0.5554, + "step": 5755 + }, + { + "epoch": 0.73, + "grad_norm": 0.7297963370112601, + "learning_rate": 7.310712690642823e-06, + "loss": 0.5713, + "step": 5756 + }, + { + "epoch": 0.73, + "grad_norm": 0.6276117480430634, + "learning_rate": 7.309797824979283e-06, + "loss": 0.6388, + "step": 5757 + }, + { + "epoch": 0.73, + "grad_norm": 0.8929000216587889, + "learning_rate": 7.308882860995034e-06, + "loss": 0.6366, + "step": 5758 + }, + { + "epoch": 0.73, + "grad_norm": 0.5414804000899441, + "learning_rate": 7.307967798729019e-06, + "loss": 0.4763, + "step": 5759 + }, + { + "epoch": 0.73, + "grad_norm": 0.6689040892470147, + "learning_rate": 7.307052638220189e-06, + "loss": 0.5, + "step": 5760 + }, + { + "epoch": 0.73, + "grad_norm": 0.6150035201513734, + "learning_rate": 7.306137379507501e-06, + "loss": 0.5282, + "step": 5761 + }, + { + "epoch": 0.73, + "grad_norm": 0.6229377456891717, + "learning_rate": 7.305222022629914e-06, + "loss": 0.6035, + "step": 5762 + }, + { + "epoch": 0.73, + "grad_norm": 0.5776227052583409, + "learning_rate": 7.304306567626391e-06, + "loss": 0.5294, + "step": 5763 + }, + { + "epoch": 0.73, + "grad_norm": 0.58151431386689, + "learning_rate": 7.3033910145359025e-06, + "loss": 0.5283, + "step": 5764 + }, + { + "epoch": 0.73, + "grad_norm": 0.6479035031175571, + "learning_rate": 7.302475363397419e-06, + "loss": 0.518, + "step": 5765 + }, + { + "epoch": 0.73, + "grad_norm": 0.7282825080334686, + "learning_rate": 7.301559614249915e-06, + "loss": 0.5657, + "step": 5766 + }, + { + "epoch": 0.73, + "grad_norm": 0.574330821080527, + "learning_rate": 7.300643767132376e-06, + "loss": 0.5208, + "step": 5767 + }, + { + "epoch": 0.73, + "grad_norm": 0.736684147857944, + "learning_rate": 7.299727822083782e-06, + "loss": 0.6046, + "step": 5768 + }, + { + "epoch": 0.73, + "grad_norm": 0.891767415071248, + "learning_rate": 7.298811779143122e-06, + "loss": 0.618, + "step": 5769 + }, + { + "epoch": 0.74, + "grad_norm": 0.6490241774211478, + "learning_rate": 7.297895638349392e-06, + "loss": 0.5211, + "step": 5770 + }, + { + "epoch": 0.74, + "grad_norm": 0.6065242278724827, + "learning_rate": 7.2969793997415885e-06, + "loss": 0.5445, + "step": 5771 + }, + { + "epoch": 0.74, + "grad_norm": 0.5399168870417422, + "learning_rate": 7.2960630633587115e-06, + "loss": 0.503, + "step": 5772 + }, + { + "epoch": 0.74, + "grad_norm": 0.7694682650140962, + "learning_rate": 7.295146629239767e-06, + "loss": 0.5799, + "step": 5773 + }, + { + "epoch": 0.74, + "grad_norm": 0.748506408540628, + "learning_rate": 7.2942300974237644e-06, + "loss": 0.609, + "step": 5774 + }, + { + "epoch": 0.74, + "grad_norm": 0.7446645910047759, + "learning_rate": 7.293313467949719e-06, + "loss": 0.5374, + "step": 5775 + }, + { + "epoch": 0.74, + "grad_norm": 0.707044546472477, + "learning_rate": 7.292396740856645e-06, + "loss": 0.5104, + "step": 5776 + }, + { + "epoch": 0.74, + "grad_norm": 0.5935709965582329, + "learning_rate": 7.291479916183571e-06, + "loss": 0.5092, + "step": 5777 + }, + { + "epoch": 0.74, + "grad_norm": 0.6715587002930182, + "learning_rate": 7.2905629939695165e-06, + "loss": 0.5302, + "step": 5778 + }, + { + "epoch": 0.74, + "grad_norm": 0.7167607621852933, + "learning_rate": 7.289645974253517e-06, + "loss": 0.5911, + "step": 5779 + }, + { + "epoch": 0.74, + "grad_norm": 0.5748019431123031, + "learning_rate": 7.288728857074603e-06, + "loss": 0.552, + "step": 5780 + }, + { + "epoch": 0.74, + "grad_norm": 0.5809459574804884, + "learning_rate": 7.2878116424718156e-06, + "loss": 0.5267, + "step": 5781 + }, + { + "epoch": 0.74, + "grad_norm": 0.7847818709982552, + "learning_rate": 7.286894330484199e-06, + "loss": 0.6018, + "step": 5782 + }, + { + "epoch": 0.74, + "grad_norm": 0.7157408343451143, + "learning_rate": 7.285976921150797e-06, + "loss": 0.5727, + "step": 5783 + }, + { + "epoch": 0.74, + "grad_norm": 0.597567504021634, + "learning_rate": 7.285059414510662e-06, + "loss": 0.5119, + "step": 5784 + }, + { + "epoch": 0.74, + "grad_norm": 0.5660599563764835, + "learning_rate": 7.284141810602851e-06, + "loss": 0.4974, + "step": 5785 + }, + { + "epoch": 0.74, + "grad_norm": 0.7149585941121873, + "learning_rate": 7.283224109466422e-06, + "loss": 0.5979, + "step": 5786 + }, + { + "epoch": 0.74, + "grad_norm": 0.7330929300573952, + "learning_rate": 7.282306311140439e-06, + "loss": 0.6008, + "step": 5787 + }, + { + "epoch": 0.74, + "grad_norm": 0.5428974476513099, + "learning_rate": 7.281388415663969e-06, + "loss": 0.5337, + "step": 5788 + }, + { + "epoch": 0.74, + "grad_norm": 0.5691331466082877, + "learning_rate": 7.280470423076085e-06, + "loss": 0.5148, + "step": 5789 + }, + { + "epoch": 0.74, + "grad_norm": 0.6695316733450618, + "learning_rate": 7.279552333415862e-06, + "loss": 0.5428, + "step": 5790 + }, + { + "epoch": 0.74, + "grad_norm": 0.7811793793546558, + "learning_rate": 7.278634146722381e-06, + "loss": 0.5439, + "step": 5791 + }, + { + "epoch": 0.74, + "grad_norm": 0.7569573819473664, + "learning_rate": 7.277715863034725e-06, + "loss": 0.6132, + "step": 5792 + }, + { + "epoch": 0.74, + "grad_norm": 0.6489098647452222, + "learning_rate": 7.276797482391985e-06, + "loss": 0.5182, + "step": 5793 + }, + { + "epoch": 0.74, + "grad_norm": 0.8661039160399061, + "learning_rate": 7.27587900483325e-06, + "loss": 0.6641, + "step": 5794 + }, + { + "epoch": 0.74, + "grad_norm": 0.5820186757358307, + "learning_rate": 7.274960430397618e-06, + "loss": 0.5221, + "step": 5795 + }, + { + "epoch": 0.74, + "grad_norm": 0.6325369896018755, + "learning_rate": 7.274041759124192e-06, + "loss": 0.5318, + "step": 5796 + }, + { + "epoch": 0.74, + "grad_norm": 0.7142674492910979, + "learning_rate": 7.273122991052074e-06, + "loss": 0.6028, + "step": 5797 + }, + { + "epoch": 0.74, + "grad_norm": 0.5693571249743329, + "learning_rate": 7.272204126220375e-06, + "loss": 0.5323, + "step": 5798 + }, + { + "epoch": 0.74, + "grad_norm": 0.5726341623341924, + "learning_rate": 7.271285164668207e-06, + "loss": 0.4993, + "step": 5799 + }, + { + "epoch": 0.74, + "grad_norm": 0.5720652830387938, + "learning_rate": 7.270366106434687e-06, + "loss": 0.5106, + "step": 5800 + }, + { + "epoch": 0.74, + "grad_norm": 0.5975392123756584, + "learning_rate": 7.269446951558936e-06, + "loss": 0.5266, + "step": 5801 + }, + { + "epoch": 0.74, + "grad_norm": 0.6988535752173115, + "learning_rate": 7.268527700080081e-06, + "loss": 0.5369, + "step": 5802 + }, + { + "epoch": 0.74, + "grad_norm": 0.6046355646140684, + "learning_rate": 7.267608352037252e-06, + "loss": 0.4992, + "step": 5803 + }, + { + "epoch": 0.74, + "grad_norm": 0.6186768633041764, + "learning_rate": 7.266688907469582e-06, + "loss": 0.5125, + "step": 5804 + }, + { + "epoch": 0.74, + "grad_norm": 0.7500248355786981, + "learning_rate": 7.2657693664162074e-06, + "loss": 0.5671, + "step": 5805 + }, + { + "epoch": 0.74, + "grad_norm": 0.7928151669816452, + "learning_rate": 7.264849728916272e-06, + "loss": 0.6154, + "step": 5806 + }, + { + "epoch": 0.74, + "grad_norm": 1.0398998040134704, + "learning_rate": 7.263929995008921e-06, + "loss": 0.6265, + "step": 5807 + }, + { + "epoch": 0.74, + "grad_norm": 0.6269689146676133, + "learning_rate": 7.263010164733303e-06, + "loss": 0.4923, + "step": 5808 + }, + { + "epoch": 0.74, + "grad_norm": 0.7283140190877309, + "learning_rate": 7.262090238128575e-06, + "loss": 0.5979, + "step": 5809 + }, + { + "epoch": 0.74, + "grad_norm": 0.836802862033315, + "learning_rate": 7.261170215233895e-06, + "loss": 0.6063, + "step": 5810 + }, + { + "epoch": 0.74, + "grad_norm": 0.8295010847206123, + "learning_rate": 7.2602500960884235e-06, + "loss": 0.6066, + "step": 5811 + }, + { + "epoch": 0.74, + "grad_norm": 0.7460122452114994, + "learning_rate": 7.259329880731328e-06, + "loss": 0.6002, + "step": 5812 + }, + { + "epoch": 0.74, + "grad_norm": 0.5560955965994837, + "learning_rate": 7.2584095692017795e-06, + "loss": 0.5122, + "step": 5813 + }, + { + "epoch": 0.74, + "grad_norm": 0.7079701401655037, + "learning_rate": 7.257489161538953e-06, + "loss": 0.5133, + "step": 5814 + }, + { + "epoch": 0.74, + "grad_norm": 1.142037142805699, + "learning_rate": 7.256568657782026e-06, + "loss": 0.5646, + "step": 5815 + }, + { + "epoch": 0.74, + "grad_norm": 0.6196514172605062, + "learning_rate": 7.255648057970184e-06, + "loss": 0.5278, + "step": 5816 + }, + { + "epoch": 0.74, + "grad_norm": 0.660998859609431, + "learning_rate": 7.254727362142611e-06, + "loss": 0.5407, + "step": 5817 + }, + { + "epoch": 0.74, + "grad_norm": 0.7939894477975382, + "learning_rate": 7.253806570338499e-06, + "loss": 0.6261, + "step": 5818 + }, + { + "epoch": 0.74, + "grad_norm": 0.7881642132198244, + "learning_rate": 7.252885682597041e-06, + "loss": 0.5966, + "step": 5819 + }, + { + "epoch": 0.74, + "grad_norm": 0.5466048211434927, + "learning_rate": 7.2519646989574435e-06, + "loss": 0.5176, + "step": 5820 + }, + { + "epoch": 0.74, + "grad_norm": 0.8007518272564078, + "learning_rate": 7.251043619458902e-06, + "loss": 0.5782, + "step": 5821 + }, + { + "epoch": 0.74, + "grad_norm": 0.7113264111127049, + "learning_rate": 7.250122444140628e-06, + "loss": 0.6143, + "step": 5822 + }, + { + "epoch": 0.74, + "grad_norm": 0.5343792154273836, + "learning_rate": 7.249201173041832e-06, + "loss": 0.4949, + "step": 5823 + }, + { + "epoch": 0.74, + "grad_norm": 0.6462591485954078, + "learning_rate": 7.248279806201729e-06, + "loss": 0.5439, + "step": 5824 + }, + { + "epoch": 0.74, + "grad_norm": 0.70629652786406, + "learning_rate": 7.2473583436595384e-06, + "loss": 0.5427, + "step": 5825 + }, + { + "epoch": 0.74, + "grad_norm": 0.633347607682042, + "learning_rate": 7.246436785454486e-06, + "loss": 0.5787, + "step": 5826 + }, + { + "epoch": 0.74, + "grad_norm": 0.7389954359806247, + "learning_rate": 7.2455151316257975e-06, + "loss": 0.5875, + "step": 5827 + }, + { + "epoch": 0.74, + "grad_norm": 0.8314776720107688, + "learning_rate": 7.244593382212706e-06, + "loss": 0.5708, + "step": 5828 + }, + { + "epoch": 0.74, + "grad_norm": 0.5866210514358545, + "learning_rate": 7.243671537254446e-06, + "loss": 0.5067, + "step": 5829 + }, + { + "epoch": 0.74, + "grad_norm": 0.6535199784271271, + "learning_rate": 7.24274959679026e-06, + "loss": 0.5729, + "step": 5830 + }, + { + "epoch": 0.74, + "grad_norm": 0.7643511709475582, + "learning_rate": 7.241827560859388e-06, + "loss": 0.5796, + "step": 5831 + }, + { + "epoch": 0.74, + "grad_norm": 0.8033218630139487, + "learning_rate": 7.240905429501083e-06, + "loss": 0.6167, + "step": 5832 + }, + { + "epoch": 0.74, + "grad_norm": 2.264851543870776, + "learning_rate": 7.239983202754594e-06, + "loss": 0.5799, + "step": 5833 + }, + { + "epoch": 0.74, + "grad_norm": 0.6654171955223167, + "learning_rate": 7.239060880659177e-06, + "loss": 0.5984, + "step": 5834 + }, + { + "epoch": 0.74, + "grad_norm": 0.635724976641325, + "learning_rate": 7.238138463254095e-06, + "loss": 0.4765, + "step": 5835 + }, + { + "epoch": 0.74, + "grad_norm": 0.6263943895827097, + "learning_rate": 7.23721595057861e-06, + "loss": 0.6167, + "step": 5836 + }, + { + "epoch": 0.74, + "grad_norm": 0.7289171218932294, + "learning_rate": 7.2362933426719905e-06, + "loss": 0.5762, + "step": 5837 + }, + { + "epoch": 0.74, + "grad_norm": 0.6279899523271233, + "learning_rate": 7.235370639573509e-06, + "loss": 0.5283, + "step": 5838 + }, + { + "epoch": 0.74, + "grad_norm": 0.710489049279531, + "learning_rate": 7.234447841322445e-06, + "loss": 0.517, + "step": 5839 + }, + { + "epoch": 0.74, + "grad_norm": 0.6623958418177638, + "learning_rate": 7.233524947958075e-06, + "loss": 0.5515, + "step": 5840 + }, + { + "epoch": 0.74, + "grad_norm": 0.6971840183623577, + "learning_rate": 7.232601959519685e-06, + "loss": 0.5163, + "step": 5841 + }, + { + "epoch": 0.74, + "grad_norm": 0.631580799135298, + "learning_rate": 7.231678876046565e-06, + "loss": 0.5337, + "step": 5842 + }, + { + "epoch": 0.74, + "grad_norm": 0.8002069483257911, + "learning_rate": 7.230755697578007e-06, + "loss": 0.6165, + "step": 5843 + }, + { + "epoch": 0.74, + "grad_norm": 0.8090719609189326, + "learning_rate": 7.229832424153307e-06, + "loss": 0.6482, + "step": 5844 + }, + { + "epoch": 0.74, + "grad_norm": 0.6438418091126863, + "learning_rate": 7.228909055811766e-06, + "loss": 0.547, + "step": 5845 + }, + { + "epoch": 0.74, + "grad_norm": 0.6686358657763787, + "learning_rate": 7.227985592592688e-06, + "loss": 0.606, + "step": 5846 + }, + { + "epoch": 0.74, + "grad_norm": 0.5883747889183144, + "learning_rate": 7.227062034535384e-06, + "loss": 0.5372, + "step": 5847 + }, + { + "epoch": 0.75, + "grad_norm": 0.7887012703417479, + "learning_rate": 7.226138381679165e-06, + "loss": 0.6289, + "step": 5848 + }, + { + "epoch": 0.75, + "grad_norm": 0.8359052276538436, + "learning_rate": 7.22521463406335e-06, + "loss": 0.6622, + "step": 5849 + }, + { + "epoch": 0.75, + "grad_norm": 0.8624524633674899, + "learning_rate": 7.224290791727259e-06, + "loss": 0.6222, + "step": 5850 + }, + { + "epoch": 0.75, + "grad_norm": 0.6247450915222917, + "learning_rate": 7.223366854710216e-06, + "loss": 0.5271, + "step": 5851 + }, + { + "epoch": 0.75, + "grad_norm": 0.7475055977408548, + "learning_rate": 7.222442823051552e-06, + "loss": 0.6031, + "step": 5852 + }, + { + "epoch": 0.75, + "grad_norm": 0.7956688269029737, + "learning_rate": 7.221518696790597e-06, + "loss": 0.621, + "step": 5853 + }, + { + "epoch": 0.75, + "grad_norm": 0.5937287743492685, + "learning_rate": 7.220594475966691e-06, + "loss": 0.5155, + "step": 5854 + }, + { + "epoch": 0.75, + "grad_norm": 0.5664963442322977, + "learning_rate": 7.219670160619174e-06, + "loss": 0.5153, + "step": 5855 + }, + { + "epoch": 0.75, + "grad_norm": 0.9856919355818669, + "learning_rate": 7.218745750787392e-06, + "loss": 0.6404, + "step": 5856 + }, + { + "epoch": 0.75, + "grad_norm": 0.6901444752687154, + "learning_rate": 7.217821246510692e-06, + "loss": 0.565, + "step": 5857 + }, + { + "epoch": 0.75, + "grad_norm": 0.7678632427900924, + "learning_rate": 7.216896647828431e-06, + "loss": 0.6458, + "step": 5858 + }, + { + "epoch": 0.75, + "grad_norm": 0.6206510986093188, + "learning_rate": 7.215971954779962e-06, + "loss": 0.5602, + "step": 5859 + }, + { + "epoch": 0.75, + "grad_norm": 0.7804973788476395, + "learning_rate": 7.21504716740465e-06, + "loss": 0.6281, + "step": 5860 + }, + { + "epoch": 0.75, + "grad_norm": 0.7043479877195835, + "learning_rate": 7.2141222857418555e-06, + "loss": 0.5742, + "step": 5861 + }, + { + "epoch": 0.75, + "grad_norm": 0.7256562511537287, + "learning_rate": 7.213197309830953e-06, + "loss": 0.5702, + "step": 5862 + }, + { + "epoch": 0.75, + "grad_norm": 0.6173465471508519, + "learning_rate": 7.212272239711312e-06, + "loss": 0.5248, + "step": 5863 + }, + { + "epoch": 0.75, + "grad_norm": 0.6334666783535389, + "learning_rate": 7.211347075422312e-06, + "loss": 0.5136, + "step": 5864 + }, + { + "epoch": 0.75, + "grad_norm": 0.7329613183016386, + "learning_rate": 7.2104218170033325e-06, + "loss": 0.5893, + "step": 5865 + }, + { + "epoch": 0.75, + "grad_norm": 0.8142819818199812, + "learning_rate": 7.209496464493762e-06, + "loss": 0.6028, + "step": 5866 + }, + { + "epoch": 0.75, + "grad_norm": 0.6844891015001855, + "learning_rate": 7.208571017932985e-06, + "loss": 0.5128, + "step": 5867 + }, + { + "epoch": 0.75, + "grad_norm": 0.8260304063851073, + "learning_rate": 7.207645477360399e-06, + "loss": 0.5968, + "step": 5868 + }, + { + "epoch": 0.75, + "grad_norm": 0.6660889071416392, + "learning_rate": 7.206719842815398e-06, + "loss": 0.5666, + "step": 5869 + }, + { + "epoch": 0.75, + "grad_norm": 0.9208399051921774, + "learning_rate": 7.205794114337385e-06, + "loss": 0.6336, + "step": 5870 + }, + { + "epoch": 0.75, + "grad_norm": 0.6119512037323821, + "learning_rate": 7.204868291965767e-06, + "loss": 0.544, + "step": 5871 + }, + { + "epoch": 0.75, + "grad_norm": 0.6525157575923063, + "learning_rate": 7.203942375739951e-06, + "loss": 0.575, + "step": 5872 + }, + { + "epoch": 0.75, + "grad_norm": 0.6777254365784581, + "learning_rate": 7.2030163656993505e-06, + "loss": 0.5898, + "step": 5873 + }, + { + "epoch": 0.75, + "grad_norm": 0.5781114453839473, + "learning_rate": 7.2020902618833836e-06, + "loss": 0.535, + "step": 5874 + }, + { + "epoch": 0.75, + "grad_norm": 0.7283837867809089, + "learning_rate": 7.201164064331469e-06, + "loss": 0.6501, + "step": 5875 + }, + { + "epoch": 0.75, + "grad_norm": 0.6009244489423754, + "learning_rate": 7.200237773083036e-06, + "loss": 0.5303, + "step": 5876 + }, + { + "epoch": 0.75, + "grad_norm": 0.7275894205842025, + "learning_rate": 7.199311388177512e-06, + "loss": 0.4724, + "step": 5877 + }, + { + "epoch": 0.75, + "grad_norm": 0.643228658523126, + "learning_rate": 7.198384909654331e-06, + "loss": 0.5359, + "step": 5878 + }, + { + "epoch": 0.75, + "grad_norm": 0.7513516441067521, + "learning_rate": 7.197458337552928e-06, + "loss": 0.605, + "step": 5879 + }, + { + "epoch": 0.75, + "grad_norm": 0.6258683148799139, + "learning_rate": 7.1965316719127465e-06, + "loss": 0.5499, + "step": 5880 + }, + { + "epoch": 0.75, + "grad_norm": 0.7772835189884585, + "learning_rate": 7.1956049127732295e-06, + "loss": 0.5744, + "step": 5881 + }, + { + "epoch": 0.75, + "grad_norm": 0.6216236494210087, + "learning_rate": 7.1946780601738285e-06, + "loss": 0.5491, + "step": 5882 + }, + { + "epoch": 0.75, + "grad_norm": 0.5663351592564475, + "learning_rate": 7.1937511141539965e-06, + "loss": 0.5244, + "step": 5883 + }, + { + "epoch": 0.75, + "grad_norm": 0.7554566424136938, + "learning_rate": 7.192824074753188e-06, + "loss": 0.5798, + "step": 5884 + }, + { + "epoch": 0.75, + "grad_norm": 0.7613029883601454, + "learning_rate": 7.191896942010867e-06, + "loss": 0.599, + "step": 5885 + }, + { + "epoch": 0.75, + "grad_norm": 0.8065786710297403, + "learning_rate": 7.190969715966498e-06, + "loss": 0.6365, + "step": 5886 + }, + { + "epoch": 0.75, + "grad_norm": 0.5736921731008394, + "learning_rate": 7.190042396659548e-06, + "loss": 0.5074, + "step": 5887 + }, + { + "epoch": 0.75, + "grad_norm": 0.6962951030751786, + "learning_rate": 7.189114984129492e-06, + "loss": 0.6247, + "step": 5888 + }, + { + "epoch": 0.75, + "grad_norm": 0.6855375517979899, + "learning_rate": 7.1881874784158065e-06, + "loss": 0.5474, + "step": 5889 + }, + { + "epoch": 0.75, + "grad_norm": 0.7174166678113488, + "learning_rate": 7.187259879557974e-06, + "loss": 0.5967, + "step": 5890 + }, + { + "epoch": 0.75, + "grad_norm": 0.746697031944723, + "learning_rate": 7.186332187595477e-06, + "loss": 0.5463, + "step": 5891 + }, + { + "epoch": 0.75, + "grad_norm": 0.6994200247041723, + "learning_rate": 7.185404402567805e-06, + "loss": 0.6516, + "step": 5892 + }, + { + "epoch": 0.75, + "grad_norm": 0.8037768005668379, + "learning_rate": 7.18447652451445e-06, + "loss": 0.5931, + "step": 5893 + }, + { + "epoch": 0.75, + "grad_norm": 0.8136246375220463, + "learning_rate": 7.183548553474912e-06, + "loss": 0.5129, + "step": 5894 + }, + { + "epoch": 0.75, + "grad_norm": 0.5920517296472912, + "learning_rate": 7.182620489488689e-06, + "loss": 0.5559, + "step": 5895 + }, + { + "epoch": 0.75, + "grad_norm": 0.7856583055030771, + "learning_rate": 7.181692332595286e-06, + "loss": 0.6187, + "step": 5896 + }, + { + "epoch": 0.75, + "grad_norm": 0.8117548383096944, + "learning_rate": 7.180764082834213e-06, + "loss": 0.6325, + "step": 5897 + }, + { + "epoch": 0.75, + "grad_norm": 0.8346757934061227, + "learning_rate": 7.1798357402449805e-06, + "loss": 0.6566, + "step": 5898 + }, + { + "epoch": 0.75, + "grad_norm": 0.7666718394886585, + "learning_rate": 7.178907304867108e-06, + "loss": 0.6011, + "step": 5899 + }, + { + "epoch": 0.75, + "grad_norm": 0.6797956495444377, + "learning_rate": 7.177978776740112e-06, + "loss": 0.5266, + "step": 5900 + }, + { + "epoch": 0.75, + "grad_norm": 0.5985809901293472, + "learning_rate": 7.17705015590352e-06, + "loss": 0.5502, + "step": 5901 + }, + { + "epoch": 0.75, + "grad_norm": 0.6179930592407553, + "learning_rate": 7.176121442396861e-06, + "loss": 0.5399, + "step": 5902 + }, + { + "epoch": 0.75, + "grad_norm": 0.887166665193454, + "learning_rate": 7.175192636259666e-06, + "loss": 0.6518, + "step": 5903 + }, + { + "epoch": 0.75, + "grad_norm": 0.6767972616304102, + "learning_rate": 7.174263737531471e-06, + "loss": 0.5321, + "step": 5904 + }, + { + "epoch": 0.75, + "grad_norm": 0.7545027952937109, + "learning_rate": 7.173334746251815e-06, + "loss": 0.5955, + "step": 5905 + }, + { + "epoch": 0.75, + "grad_norm": 0.694087376964566, + "learning_rate": 7.172405662460247e-06, + "loss": 0.6055, + "step": 5906 + }, + { + "epoch": 0.75, + "grad_norm": 0.5405844381160699, + "learning_rate": 7.1714764861963095e-06, + "loss": 0.5076, + "step": 5907 + }, + { + "epoch": 0.75, + "grad_norm": 0.6662841454229068, + "learning_rate": 7.170547217499557e-06, + "loss": 0.5525, + "step": 5908 + }, + { + "epoch": 0.75, + "grad_norm": 0.6366622953428074, + "learning_rate": 7.169617856409547e-06, + "loss": 0.5104, + "step": 5909 + }, + { + "epoch": 0.75, + "grad_norm": 0.7557653178685522, + "learning_rate": 7.168688402965837e-06, + "loss": 0.6106, + "step": 5910 + }, + { + "epoch": 0.75, + "grad_norm": 0.5698851246736489, + "learning_rate": 7.167758857207992e-06, + "loss": 0.5251, + "step": 5911 + }, + { + "epoch": 0.75, + "grad_norm": 0.6579949979231071, + "learning_rate": 7.16682921917558e-06, + "loss": 0.4939, + "step": 5912 + }, + { + "epoch": 0.75, + "grad_norm": 1.9267038564495211, + "learning_rate": 7.165899488908171e-06, + "loss": 0.6138, + "step": 5913 + }, + { + "epoch": 0.75, + "grad_norm": 0.6137144751518273, + "learning_rate": 7.1649696664453435e-06, + "loss": 0.4945, + "step": 5914 + }, + { + "epoch": 0.75, + "grad_norm": 0.6336469828178837, + "learning_rate": 7.164039751826675e-06, + "loss": 0.5368, + "step": 5915 + }, + { + "epoch": 0.75, + "grad_norm": 0.5332662473473622, + "learning_rate": 7.16310974509175e-06, + "loss": 0.5095, + "step": 5916 + }, + { + "epoch": 0.75, + "grad_norm": 0.6380657288145953, + "learning_rate": 7.162179646280155e-06, + "loss": 0.5039, + "step": 5917 + }, + { + "epoch": 0.75, + "grad_norm": 0.5217335558926929, + "learning_rate": 7.161249455431481e-06, + "loss": 0.5023, + "step": 5918 + }, + { + "epoch": 0.75, + "grad_norm": 0.7476408907405009, + "learning_rate": 7.160319172585325e-06, + "loss": 0.6326, + "step": 5919 + }, + { + "epoch": 0.75, + "grad_norm": 0.8369853122374212, + "learning_rate": 7.159388797781285e-06, + "loss": 0.6343, + "step": 5920 + }, + { + "epoch": 0.75, + "grad_norm": 0.8397283935017545, + "learning_rate": 7.1584583310589635e-06, + "loss": 0.6006, + "step": 5921 + }, + { + "epoch": 0.75, + "grad_norm": 0.686576869167361, + "learning_rate": 7.15752777245797e-06, + "loss": 0.5358, + "step": 5922 + }, + { + "epoch": 0.75, + "grad_norm": 0.8897349901289278, + "learning_rate": 7.156597122017913e-06, + "loss": 0.6119, + "step": 5923 + }, + { + "epoch": 0.75, + "grad_norm": 0.6827827637178683, + "learning_rate": 7.155666379778407e-06, + "loss": 0.5499, + "step": 5924 + }, + { + "epoch": 0.75, + "grad_norm": 0.6486559097349742, + "learning_rate": 7.154735545779072e-06, + "loss": 0.4898, + "step": 5925 + }, + { + "epoch": 0.75, + "grad_norm": 1.4057449600176155, + "learning_rate": 7.153804620059532e-06, + "loss": 0.6217, + "step": 5926 + }, + { + "epoch": 0.76, + "grad_norm": 0.5506670451254673, + "learning_rate": 7.152873602659411e-06, + "loss": 0.5245, + "step": 5927 + }, + { + "epoch": 0.76, + "grad_norm": 0.6546870274474008, + "learning_rate": 7.15194249361834e-06, + "loss": 0.5798, + "step": 5928 + }, + { + "epoch": 0.76, + "grad_norm": 0.7520327982372209, + "learning_rate": 7.1510112929759544e-06, + "loss": 0.6366, + "step": 5929 + }, + { + "epoch": 0.76, + "grad_norm": 0.6011870153450867, + "learning_rate": 7.150080000771892e-06, + "loss": 0.5259, + "step": 5930 + }, + { + "epoch": 0.76, + "grad_norm": 0.5643861021086324, + "learning_rate": 7.149148617045793e-06, + "loss": 0.5191, + "step": 5931 + }, + { + "epoch": 0.76, + "grad_norm": 0.6655891581971597, + "learning_rate": 7.1482171418373055e-06, + "loss": 0.5382, + "step": 5932 + }, + { + "epoch": 0.76, + "grad_norm": 0.7097024475756677, + "learning_rate": 7.14728557518608e-06, + "loss": 0.5988, + "step": 5933 + }, + { + "epoch": 0.76, + "grad_norm": 0.5814307284642501, + "learning_rate": 7.146353917131767e-06, + "loss": 0.5127, + "step": 5934 + }, + { + "epoch": 0.76, + "grad_norm": 0.8005391443407877, + "learning_rate": 7.145422167714029e-06, + "loss": 0.6022, + "step": 5935 + }, + { + "epoch": 0.76, + "grad_norm": 0.7009976228365044, + "learning_rate": 7.144490326972525e-06, + "loss": 0.55, + "step": 5936 + }, + { + "epoch": 0.76, + "grad_norm": 0.6720608201194977, + "learning_rate": 7.143558394946919e-06, + "loss": 0.601, + "step": 5937 + }, + { + "epoch": 0.76, + "grad_norm": 0.9116088358992631, + "learning_rate": 7.142626371676883e-06, + "loss": 0.6073, + "step": 5938 + }, + { + "epoch": 0.76, + "grad_norm": 0.6275254875012132, + "learning_rate": 7.14169425720209e-06, + "loss": 0.527, + "step": 5939 + }, + { + "epoch": 0.76, + "grad_norm": 3.5094376765807014, + "learning_rate": 7.140762051562215e-06, + "loss": 0.6293, + "step": 5940 + }, + { + "epoch": 0.76, + "grad_norm": 0.6549302507765816, + "learning_rate": 7.139829754796941e-06, + "loss": 0.5013, + "step": 5941 + }, + { + "epoch": 0.76, + "grad_norm": 0.612729566792426, + "learning_rate": 7.1388973669459515e-06, + "loss": 0.5466, + "step": 5942 + }, + { + "epoch": 0.76, + "grad_norm": 0.7536579417526534, + "learning_rate": 7.137964888048936e-06, + "loss": 0.6376, + "step": 5943 + }, + { + "epoch": 0.76, + "grad_norm": 0.9117217692848721, + "learning_rate": 7.137032318145588e-06, + "loss": 0.6506, + "step": 5944 + }, + { + "epoch": 0.76, + "grad_norm": 0.7630075700909473, + "learning_rate": 7.1360996572756034e-06, + "loss": 0.6294, + "step": 5945 + }, + { + "epoch": 0.76, + "grad_norm": 0.6414206866660678, + "learning_rate": 7.1351669054786825e-06, + "loss": 0.552, + "step": 5946 + }, + { + "epoch": 0.76, + "grad_norm": 0.7169280605097303, + "learning_rate": 7.134234062794528e-06, + "loss": 0.6154, + "step": 5947 + }, + { + "epoch": 0.76, + "grad_norm": 0.5957026845046108, + "learning_rate": 7.1333011292628506e-06, + "loss": 0.5875, + "step": 5948 + }, + { + "epoch": 0.76, + "grad_norm": 0.6799298689048163, + "learning_rate": 7.132368104923359e-06, + "loss": 0.5323, + "step": 5949 + }, + { + "epoch": 0.76, + "grad_norm": 0.6339070219489517, + "learning_rate": 7.131434989815774e-06, + "loss": 0.5409, + "step": 5950 + }, + { + "epoch": 0.76, + "grad_norm": 0.8221592820088667, + "learning_rate": 7.130501783979813e-06, + "loss": 0.6135, + "step": 5951 + }, + { + "epoch": 0.76, + "grad_norm": 0.7197916486344914, + "learning_rate": 7.129568487455197e-06, + "loss": 0.5424, + "step": 5952 + }, + { + "epoch": 0.76, + "grad_norm": 0.7347070262150646, + "learning_rate": 7.128635100281657e-06, + "loss": 0.5833, + "step": 5953 + }, + { + "epoch": 0.76, + "grad_norm": 0.7357879577306148, + "learning_rate": 7.1277016224989224e-06, + "loss": 0.5877, + "step": 5954 + }, + { + "epoch": 0.76, + "grad_norm": 0.8899153229202661, + "learning_rate": 7.126768054146729e-06, + "loss": 0.621, + "step": 5955 + }, + { + "epoch": 0.76, + "grad_norm": 0.8080942458176608, + "learning_rate": 7.125834395264815e-06, + "loss": 0.6003, + "step": 5956 + }, + { + "epoch": 0.76, + "grad_norm": 0.7409659378027762, + "learning_rate": 7.124900645892925e-06, + "loss": 0.5787, + "step": 5957 + }, + { + "epoch": 0.76, + "grad_norm": 0.5636619965707601, + "learning_rate": 7.123966806070804e-06, + "loss": 0.5148, + "step": 5958 + }, + { + "epoch": 0.76, + "grad_norm": 0.6939137855049864, + "learning_rate": 7.123032875838204e-06, + "loss": 0.5317, + "step": 5959 + }, + { + "epoch": 0.76, + "grad_norm": 0.6169178512654427, + "learning_rate": 7.122098855234879e-06, + "loss": 0.5068, + "step": 5960 + }, + { + "epoch": 0.76, + "grad_norm": 0.7670689037036561, + "learning_rate": 7.121164744300586e-06, + "loss": 0.6026, + "step": 5961 + }, + { + "epoch": 0.76, + "grad_norm": 0.5868224830446753, + "learning_rate": 7.120230543075088e-06, + "loss": 0.5358, + "step": 5962 + }, + { + "epoch": 0.76, + "grad_norm": 0.5930265037171247, + "learning_rate": 7.119296251598152e-06, + "loss": 0.566, + "step": 5963 + }, + { + "epoch": 0.76, + "grad_norm": 0.5908456018357738, + "learning_rate": 7.118361869909547e-06, + "loss": 0.5208, + "step": 5964 + }, + { + "epoch": 0.76, + "grad_norm": 0.5593592665307044, + "learning_rate": 7.117427398049045e-06, + "loss": 0.5258, + "step": 5965 + }, + { + "epoch": 0.76, + "grad_norm": 0.65059181906227, + "learning_rate": 7.116492836056427e-06, + "loss": 0.5709, + "step": 5966 + }, + { + "epoch": 0.76, + "grad_norm": 0.7629845456020955, + "learning_rate": 7.11555818397147e-06, + "loss": 0.5841, + "step": 5967 + }, + { + "epoch": 0.76, + "grad_norm": 0.7274199972082073, + "learning_rate": 7.1146234418339635e-06, + "loss": 0.6031, + "step": 5968 + }, + { + "epoch": 0.76, + "grad_norm": 0.5852919273140857, + "learning_rate": 7.1136886096836935e-06, + "loss": 0.5653, + "step": 5969 + }, + { + "epoch": 0.76, + "grad_norm": 0.7087866747568418, + "learning_rate": 7.112753687560454e-06, + "loss": 0.5917, + "step": 5970 + }, + { + "epoch": 0.76, + "grad_norm": 0.56202775250031, + "learning_rate": 7.111818675504041e-06, + "loss": 0.5322, + "step": 5971 + }, + { + "epoch": 0.76, + "grad_norm": 0.5920179015436181, + "learning_rate": 7.1108835735542556e-06, + "loss": 0.5292, + "step": 5972 + }, + { + "epoch": 0.76, + "grad_norm": 0.7633463885951367, + "learning_rate": 7.109948381750902e-06, + "loss": 0.5593, + "step": 5973 + }, + { + "epoch": 0.76, + "grad_norm": 0.6123231443472408, + "learning_rate": 7.109013100133789e-06, + "loss": 0.5001, + "step": 5974 + }, + { + "epoch": 0.76, + "grad_norm": 0.8101548896370763, + "learning_rate": 7.108077728742727e-06, + "loss": 0.6194, + "step": 5975 + }, + { + "epoch": 0.76, + "grad_norm": 0.6193684625645861, + "learning_rate": 7.107142267617533e-06, + "loss": 0.5115, + "step": 5976 + }, + { + "epoch": 0.76, + "grad_norm": 0.5573063001403132, + "learning_rate": 7.106206716798025e-06, + "loss": 0.5082, + "step": 5977 + }, + { + "epoch": 0.76, + "grad_norm": 0.7036027776981749, + "learning_rate": 7.105271076324028e-06, + "loss": 0.5874, + "step": 5978 + }, + { + "epoch": 0.76, + "grad_norm": 0.652049101670856, + "learning_rate": 7.10433534623537e-06, + "loss": 0.5015, + "step": 5979 + }, + { + "epoch": 0.76, + "grad_norm": 0.5763090882467994, + "learning_rate": 7.103399526571879e-06, + "loss": 0.5307, + "step": 5980 + }, + { + "epoch": 0.76, + "grad_norm": 0.674680021102911, + "learning_rate": 7.102463617373392e-06, + "loss": 0.5083, + "step": 5981 + }, + { + "epoch": 0.76, + "grad_norm": 0.6330173496203505, + "learning_rate": 7.101527618679749e-06, + "loss": 0.5371, + "step": 5982 + }, + { + "epoch": 0.76, + "grad_norm": 0.7500296472281367, + "learning_rate": 7.10059153053079e-06, + "loss": 0.4961, + "step": 5983 + }, + { + "epoch": 0.76, + "grad_norm": 0.7230364666967767, + "learning_rate": 7.099655352966361e-06, + "loss": 0.565, + "step": 5984 + }, + { + "epoch": 0.76, + "grad_norm": 0.7281334536321968, + "learning_rate": 7.098719086026313e-06, + "loss": 0.5901, + "step": 5985 + }, + { + "epoch": 0.76, + "grad_norm": 0.5669714110538367, + "learning_rate": 7.097782729750501e-06, + "loss": 0.552, + "step": 5986 + }, + { + "epoch": 0.76, + "grad_norm": 0.5776175088594866, + "learning_rate": 7.0968462841787825e-06, + "loss": 0.5161, + "step": 5987 + }, + { + "epoch": 0.76, + "grad_norm": 0.8035701307743665, + "learning_rate": 7.095909749351016e-06, + "loss": 0.5945, + "step": 5988 + }, + { + "epoch": 0.76, + "grad_norm": 0.5749391162769852, + "learning_rate": 7.094973125307071e-06, + "loss": 0.5199, + "step": 5989 + }, + { + "epoch": 0.76, + "grad_norm": 0.6034501105002962, + "learning_rate": 7.094036412086815e-06, + "loss": 0.5526, + "step": 5990 + }, + { + "epoch": 0.76, + "grad_norm": 0.6351080516770963, + "learning_rate": 7.093099609730118e-06, + "loss": 0.584, + "step": 5991 + }, + { + "epoch": 0.76, + "grad_norm": 0.616218268716067, + "learning_rate": 7.092162718276861e-06, + "loss": 0.5596, + "step": 5992 + }, + { + "epoch": 0.76, + "grad_norm": 0.6133377103476508, + "learning_rate": 7.091225737766923e-06, + "loss": 0.549, + "step": 5993 + }, + { + "epoch": 0.76, + "grad_norm": 0.6974070658555137, + "learning_rate": 7.090288668240187e-06, + "loss": 0.5781, + "step": 5994 + }, + { + "epoch": 0.76, + "grad_norm": 0.6675311153928515, + "learning_rate": 7.089351509736543e-06, + "loss": 0.5763, + "step": 5995 + }, + { + "epoch": 0.76, + "grad_norm": 0.756118850204332, + "learning_rate": 7.0884142622958805e-06, + "loss": 0.6154, + "step": 5996 + }, + { + "epoch": 0.76, + "grad_norm": 0.6698436678644841, + "learning_rate": 7.087476925958098e-06, + "loss": 0.5872, + "step": 5997 + }, + { + "epoch": 0.76, + "grad_norm": 0.5878670451744555, + "learning_rate": 7.086539500763092e-06, + "loss": 0.5009, + "step": 5998 + }, + { + "epoch": 0.76, + "grad_norm": 0.6414485430906762, + "learning_rate": 7.085601986750767e-06, + "loss": 0.5466, + "step": 5999 + }, + { + "epoch": 0.76, + "grad_norm": 0.7238108126282299, + "learning_rate": 7.084664383961031e-06, + "loss": 0.5699, + "step": 6000 + }, + { + "epoch": 0.76, + "grad_norm": 0.6678033522689986, + "learning_rate": 7.083726692433793e-06, + "loss": 0.6002, + "step": 6001 + }, + { + "epoch": 0.76, + "grad_norm": 0.7837549105544315, + "learning_rate": 7.08278891220897e-06, + "loss": 0.6126, + "step": 6002 + }, + { + "epoch": 0.76, + "grad_norm": 0.6074108868955742, + "learning_rate": 7.081851043326477e-06, + "loss": 0.5207, + "step": 6003 + }, + { + "epoch": 0.76, + "grad_norm": 0.7707852242079446, + "learning_rate": 7.0809130858262385e-06, + "loss": 0.5875, + "step": 6004 + }, + { + "epoch": 0.77, + "grad_norm": 0.8314941488122427, + "learning_rate": 7.079975039748179e-06, + "loss": 0.681, + "step": 6005 + }, + { + "epoch": 0.77, + "grad_norm": 0.7864963641232625, + "learning_rate": 7.0790369051322285e-06, + "loss": 0.6382, + "step": 6006 + }, + { + "epoch": 0.77, + "grad_norm": 0.545907604352695, + "learning_rate": 7.0780986820183215e-06, + "loss": 0.5221, + "step": 6007 + }, + { + "epoch": 0.77, + "grad_norm": 0.8486654749552658, + "learning_rate": 7.077160370446395e-06, + "loss": 0.6261, + "step": 6008 + }, + { + "epoch": 0.77, + "grad_norm": 0.8072855106678415, + "learning_rate": 7.076221970456387e-06, + "loss": 0.6484, + "step": 6009 + }, + { + "epoch": 0.77, + "grad_norm": 0.5287191367170836, + "learning_rate": 7.075283482088246e-06, + "loss": 0.4511, + "step": 6010 + }, + { + "epoch": 0.77, + "grad_norm": 0.5741171173985696, + "learning_rate": 7.074344905381917e-06, + "loss": 0.5015, + "step": 6011 + }, + { + "epoch": 0.77, + "grad_norm": 1.1406092082545656, + "learning_rate": 7.073406240377356e-06, + "loss": 0.5978, + "step": 6012 + }, + { + "epoch": 0.77, + "grad_norm": 0.7210962449068865, + "learning_rate": 7.072467487114516e-06, + "loss": 0.5889, + "step": 6013 + }, + { + "epoch": 0.77, + "grad_norm": 0.9730920088064027, + "learning_rate": 7.071528645633358e-06, + "loss": 0.6462, + "step": 6014 + }, + { + "epoch": 0.77, + "grad_norm": 0.5686619356341395, + "learning_rate": 7.070589715973845e-06, + "loss": 0.5222, + "step": 6015 + }, + { + "epoch": 0.77, + "grad_norm": 0.582273852585019, + "learning_rate": 7.069650698175945e-06, + "loss": 0.5512, + "step": 6016 + }, + { + "epoch": 0.77, + "grad_norm": 0.5794857780835623, + "learning_rate": 7.068711592279628e-06, + "loss": 0.5046, + "step": 6017 + }, + { + "epoch": 0.77, + "grad_norm": 0.8308853893111957, + "learning_rate": 7.067772398324869e-06, + "loss": 0.6769, + "step": 6018 + }, + { + "epoch": 0.77, + "grad_norm": 0.8871208267240814, + "learning_rate": 7.0668331163516475e-06, + "loss": 0.6243, + "step": 6019 + }, + { + "epoch": 0.77, + "grad_norm": 0.7353649761204798, + "learning_rate": 7.065893746399945e-06, + "loss": 0.6082, + "step": 6020 + }, + { + "epoch": 0.77, + "grad_norm": 0.8206904928576645, + "learning_rate": 7.0649542885097464e-06, + "loss": 0.6601, + "step": 6021 + }, + { + "epoch": 0.77, + "grad_norm": 0.7994782475497233, + "learning_rate": 7.064014742721043e-06, + "loss": 0.5921, + "step": 6022 + }, + { + "epoch": 0.77, + "grad_norm": 0.6300920174866359, + "learning_rate": 7.063075109073827e-06, + "loss": 0.5324, + "step": 6023 + }, + { + "epoch": 0.77, + "grad_norm": 0.7172389415559045, + "learning_rate": 7.062135387608095e-06, + "loss": 0.5691, + "step": 6024 + }, + { + "epoch": 0.77, + "grad_norm": 0.6011265374447058, + "learning_rate": 7.0611955783638516e-06, + "loss": 0.5494, + "step": 6025 + }, + { + "epoch": 0.77, + "grad_norm": 0.7913267524061012, + "learning_rate": 7.060255681381098e-06, + "loss": 0.5861, + "step": 6026 + }, + { + "epoch": 0.77, + "grad_norm": 0.5408541454873202, + "learning_rate": 7.0593156966998445e-06, + "loss": 0.4974, + "step": 6027 + }, + { + "epoch": 0.77, + "grad_norm": 0.7635874309264382, + "learning_rate": 7.058375624360102e-06, + "loss": 0.6004, + "step": 6028 + }, + { + "epoch": 0.77, + "grad_norm": 0.5356125016899917, + "learning_rate": 7.057435464401887e-06, + "loss": 0.5235, + "step": 6029 + }, + { + "epoch": 0.77, + "grad_norm": 0.626775555298914, + "learning_rate": 7.056495216865219e-06, + "loss": 0.5085, + "step": 6030 + }, + { + "epoch": 0.77, + "grad_norm": 0.8547834497440239, + "learning_rate": 7.055554881790121e-06, + "loss": 0.6407, + "step": 6031 + }, + { + "epoch": 0.77, + "grad_norm": 0.7310096864764011, + "learning_rate": 7.054614459216619e-06, + "loss": 0.6107, + "step": 6032 + }, + { + "epoch": 0.77, + "grad_norm": 0.5757760455675806, + "learning_rate": 7.0536739491847465e-06, + "loss": 0.5305, + "step": 6033 + }, + { + "epoch": 0.77, + "grad_norm": 0.7645665550905649, + "learning_rate": 7.052733351734537e-06, + "loss": 0.6176, + "step": 6034 + }, + { + "epoch": 0.77, + "grad_norm": 0.8265033598092296, + "learning_rate": 7.051792666906027e-06, + "loss": 0.6061, + "step": 6035 + }, + { + "epoch": 0.77, + "grad_norm": 0.6837039457234325, + "learning_rate": 7.0508518947392614e-06, + "loss": 0.5634, + "step": 6036 + }, + { + "epoch": 0.77, + "grad_norm": 0.586998764721055, + "learning_rate": 7.049911035274283e-06, + "loss": 0.5124, + "step": 6037 + }, + { + "epoch": 0.77, + "grad_norm": 0.5892951271709775, + "learning_rate": 7.048970088551143e-06, + "loss": 0.517, + "step": 6038 + }, + { + "epoch": 0.77, + "grad_norm": 0.6512704912070886, + "learning_rate": 7.048029054609893e-06, + "loss": 0.5707, + "step": 6039 + }, + { + "epoch": 0.77, + "grad_norm": 0.7442628176487647, + "learning_rate": 7.047087933490591e-06, + "loss": 0.597, + "step": 6040 + }, + { + "epoch": 0.77, + "grad_norm": 0.6634565714644403, + "learning_rate": 7.0461467252332984e-06, + "loss": 0.4795, + "step": 6041 + }, + { + "epoch": 0.77, + "grad_norm": 0.7873848421662484, + "learning_rate": 7.045205429878076e-06, + "loss": 0.6211, + "step": 6042 + }, + { + "epoch": 0.77, + "grad_norm": 0.6345169378203459, + "learning_rate": 7.044264047464995e-06, + "loss": 0.4902, + "step": 6043 + }, + { + "epoch": 0.77, + "grad_norm": 0.6048493037612452, + "learning_rate": 7.0433225780341265e-06, + "loss": 0.5271, + "step": 6044 + }, + { + "epoch": 0.77, + "grad_norm": 0.5987685909027206, + "learning_rate": 7.042381021625546e-06, + "loss": 0.5846, + "step": 6045 + }, + { + "epoch": 0.77, + "grad_norm": 0.761014568982348, + "learning_rate": 7.0414393782793315e-06, + "loss": 0.5534, + "step": 6046 + }, + { + "epoch": 0.77, + "grad_norm": 0.5899132072894913, + "learning_rate": 7.040497648035565e-06, + "loss": 0.5201, + "step": 6047 + }, + { + "epoch": 0.77, + "grad_norm": 0.7087852389345419, + "learning_rate": 7.0395558309343345e-06, + "loss": 0.6047, + "step": 6048 + }, + { + "epoch": 0.77, + "grad_norm": 0.877431540022819, + "learning_rate": 7.03861392701573e-06, + "loss": 0.6155, + "step": 6049 + }, + { + "epoch": 0.77, + "grad_norm": 0.5516692086699265, + "learning_rate": 7.037671936319845e-06, + "loss": 0.4954, + "step": 6050 + }, + { + "epoch": 0.77, + "grad_norm": 0.5984557294666936, + "learning_rate": 7.036729858886779e-06, + "loss": 0.5467, + "step": 6051 + }, + { + "epoch": 0.77, + "grad_norm": 0.5747723406091592, + "learning_rate": 7.035787694756629e-06, + "loss": 0.547, + "step": 6052 + }, + { + "epoch": 0.77, + "grad_norm": 0.6071889743209812, + "learning_rate": 7.0348454439695035e-06, + "loss": 0.5746, + "step": 6053 + }, + { + "epoch": 0.77, + "grad_norm": 0.7304023891510193, + "learning_rate": 7.033903106565509e-06, + "loss": 0.6284, + "step": 6054 + }, + { + "epoch": 0.77, + "grad_norm": 1.6570541196986908, + "learning_rate": 7.032960682584758e-06, + "loss": 0.6438, + "step": 6055 + }, + { + "epoch": 0.77, + "grad_norm": 0.5953741344451474, + "learning_rate": 7.03201817206737e-06, + "loss": 0.5526, + "step": 6056 + }, + { + "epoch": 0.77, + "grad_norm": 0.7390360241547438, + "learning_rate": 7.031075575053459e-06, + "loss": 0.6218, + "step": 6057 + }, + { + "epoch": 0.77, + "grad_norm": 0.6182872596546606, + "learning_rate": 7.030132891583152e-06, + "loss": 0.55, + "step": 6058 + }, + { + "epoch": 0.77, + "grad_norm": 0.5474958207745392, + "learning_rate": 7.029190121696577e-06, + "loss": 0.496, + "step": 6059 + }, + { + "epoch": 0.77, + "grad_norm": 0.8319296740253522, + "learning_rate": 7.028247265433861e-06, + "loss": 0.6289, + "step": 6060 + }, + { + "epoch": 0.77, + "grad_norm": 0.6632324180366329, + "learning_rate": 7.0273043228351404e-06, + "loss": 0.6, + "step": 6061 + }, + { + "epoch": 0.77, + "grad_norm": 0.7093942999794963, + "learning_rate": 7.0263612939405524e-06, + "loss": 0.568, + "step": 6062 + }, + { + "epoch": 0.77, + "grad_norm": 0.672908521751254, + "learning_rate": 7.025418178790241e-06, + "loss": 0.5213, + "step": 6063 + }, + { + "epoch": 0.77, + "grad_norm": 0.880776464295765, + "learning_rate": 7.0244749774243495e-06, + "loss": 0.5648, + "step": 6064 + }, + { + "epoch": 0.77, + "grad_norm": 0.6353285734379588, + "learning_rate": 7.0235316898830274e-06, + "loss": 0.5592, + "step": 6065 + }, + { + "epoch": 0.77, + "grad_norm": 0.80600462935185, + "learning_rate": 7.022588316206427e-06, + "loss": 0.664, + "step": 6066 + }, + { + "epoch": 0.77, + "grad_norm": 0.6758317174509741, + "learning_rate": 7.021644856434705e-06, + "loss": 0.5806, + "step": 6067 + }, + { + "epoch": 0.77, + "grad_norm": 0.6523254610023184, + "learning_rate": 7.0207013106080225e-06, + "loss": 0.5572, + "step": 6068 + }, + { + "epoch": 0.77, + "grad_norm": 0.5382601567959718, + "learning_rate": 7.019757678766544e-06, + "loss": 0.541, + "step": 6069 + }, + { + "epoch": 0.77, + "grad_norm": 0.6813693905307426, + "learning_rate": 7.018813960950432e-06, + "loss": 0.5508, + "step": 6070 + }, + { + "epoch": 0.77, + "grad_norm": 0.745718707449539, + "learning_rate": 7.0178701571998644e-06, + "loss": 0.5573, + "step": 6071 + }, + { + "epoch": 0.77, + "grad_norm": 0.6610284907289825, + "learning_rate": 7.01692626755501e-06, + "loss": 0.5677, + "step": 6072 + }, + { + "epoch": 0.77, + "grad_norm": 0.7525283827944745, + "learning_rate": 7.01598229205605e-06, + "loss": 0.5415, + "step": 6073 + }, + { + "epoch": 0.77, + "grad_norm": 0.6812398425879042, + "learning_rate": 7.015038230743165e-06, + "loss": 0.5766, + "step": 6074 + }, + { + "epoch": 0.77, + "grad_norm": 0.6376153279099177, + "learning_rate": 7.014094083656544e-06, + "loss": 0.5339, + "step": 6075 + }, + { + "epoch": 0.77, + "grad_norm": 0.6617176435761051, + "learning_rate": 7.013149850836373e-06, + "loss": 0.5468, + "step": 6076 + }, + { + "epoch": 0.77, + "grad_norm": 0.612352892101539, + "learning_rate": 7.0122055323228455e-06, + "loss": 0.5113, + "step": 6077 + }, + { + "epoch": 0.77, + "grad_norm": 0.5283091130467976, + "learning_rate": 7.0112611281561595e-06, + "loss": 0.494, + "step": 6078 + }, + { + "epoch": 0.77, + "grad_norm": 0.6262043520584162, + "learning_rate": 7.010316638376513e-06, + "loss": 0.5996, + "step": 6079 + }, + { + "epoch": 0.77, + "grad_norm": 0.6629502911278257, + "learning_rate": 7.009372063024111e-06, + "loss": 0.5422, + "step": 6080 + }, + { + "epoch": 0.77, + "grad_norm": 0.7604354893945472, + "learning_rate": 7.008427402139163e-06, + "loss": 0.6251, + "step": 6081 + }, + { + "epoch": 0.77, + "grad_norm": 0.7047594686504335, + "learning_rate": 7.007482655761878e-06, + "loss": 0.5861, + "step": 6082 + }, + { + "epoch": 0.77, + "grad_norm": 0.8143829922209819, + "learning_rate": 7.006537823932471e-06, + "loss": 0.601, + "step": 6083 + }, + { + "epoch": 0.78, + "grad_norm": 0.5984028371930139, + "learning_rate": 7.005592906691161e-06, + "loss": 0.5225, + "step": 6084 + }, + { + "epoch": 0.78, + "grad_norm": 0.8038181984577378, + "learning_rate": 7.00464790407817e-06, + "loss": 0.612, + "step": 6085 + }, + { + "epoch": 0.78, + "grad_norm": 0.7853545575807502, + "learning_rate": 7.003702816133724e-06, + "loss": 0.5401, + "step": 6086 + }, + { + "epoch": 0.78, + "grad_norm": 0.5447574378687547, + "learning_rate": 7.002757642898052e-06, + "loss": 0.5152, + "step": 6087 + }, + { + "epoch": 0.78, + "grad_norm": 0.7508248008587622, + "learning_rate": 7.001812384411386e-06, + "loss": 0.5991, + "step": 6088 + }, + { + "epoch": 0.78, + "grad_norm": 0.6071495688858949, + "learning_rate": 7.000867040713964e-06, + "loss": 0.5257, + "step": 6089 + }, + { + "epoch": 0.78, + "grad_norm": 0.6580350397288754, + "learning_rate": 6.999921611846027e-06, + "loss": 0.5114, + "step": 6090 + }, + { + "epoch": 0.78, + "grad_norm": 0.8296385344887232, + "learning_rate": 6.9989760978478176e-06, + "loss": 0.6806, + "step": 6091 + }, + { + "epoch": 0.78, + "grad_norm": 0.7362209605189728, + "learning_rate": 6.998030498759583e-06, + "loss": 0.5777, + "step": 6092 + }, + { + "epoch": 0.78, + "grad_norm": 0.6859307273497655, + "learning_rate": 6.9970848146215744e-06, + "loss": 0.5421, + "step": 6093 + }, + { + "epoch": 0.78, + "grad_norm": 0.5844085917491771, + "learning_rate": 6.996139045474046e-06, + "loss": 0.532, + "step": 6094 + }, + { + "epoch": 0.78, + "grad_norm": 0.6592499049348985, + "learning_rate": 6.995193191357259e-06, + "loss": 0.563, + "step": 6095 + }, + { + "epoch": 0.78, + "grad_norm": 0.5713688234567691, + "learning_rate": 6.994247252311473e-06, + "loss": 0.5138, + "step": 6096 + }, + { + "epoch": 0.78, + "grad_norm": 0.6933565529333269, + "learning_rate": 6.993301228376955e-06, + "loss": 0.5335, + "step": 6097 + }, + { + "epoch": 0.78, + "grad_norm": 0.9250161177146866, + "learning_rate": 6.992355119593973e-06, + "loss": 0.622, + "step": 6098 + }, + { + "epoch": 0.78, + "grad_norm": 0.7224784839760994, + "learning_rate": 6.9914089260028e-06, + "loss": 0.5998, + "step": 6099 + }, + { + "epoch": 0.78, + "grad_norm": 0.774091694077413, + "learning_rate": 6.990462647643713e-06, + "loss": 0.6345, + "step": 6100 + }, + { + "epoch": 0.78, + "grad_norm": 0.5607801855546023, + "learning_rate": 6.989516284556992e-06, + "loss": 0.4911, + "step": 6101 + }, + { + "epoch": 0.78, + "grad_norm": 0.7074015476510126, + "learning_rate": 6.988569836782919e-06, + "loss": 0.4826, + "step": 6102 + }, + { + "epoch": 0.78, + "grad_norm": 0.8538333881835464, + "learning_rate": 6.9876233043617835e-06, + "loss": 0.6135, + "step": 6103 + }, + { + "epoch": 0.78, + "grad_norm": 0.6198661983849384, + "learning_rate": 6.986676687333874e-06, + "loss": 0.5423, + "step": 6104 + }, + { + "epoch": 0.78, + "grad_norm": 0.6908445499513478, + "learning_rate": 6.985729985739489e-06, + "loss": 0.526, + "step": 6105 + }, + { + "epoch": 0.78, + "grad_norm": 0.6352547873271959, + "learning_rate": 6.984783199618922e-06, + "loss": 0.6285, + "step": 6106 + }, + { + "epoch": 0.78, + "grad_norm": 0.5914740537912223, + "learning_rate": 6.9838363290124776e-06, + "loss": 0.5254, + "step": 6107 + }, + { + "epoch": 0.78, + "grad_norm": 1.2801019088613952, + "learning_rate": 6.98288937396046e-06, + "loss": 0.6288, + "step": 6108 + }, + { + "epoch": 0.78, + "grad_norm": 0.9335250808678409, + "learning_rate": 6.981942334503178e-06, + "loss": 0.6318, + "step": 6109 + }, + { + "epoch": 0.78, + "grad_norm": 0.5849128006547969, + "learning_rate": 6.980995210680943e-06, + "loss": 0.5099, + "step": 6110 + }, + { + "epoch": 0.78, + "grad_norm": 0.5681560596857154, + "learning_rate": 6.980048002534074e-06, + "loss": 0.5909, + "step": 6111 + }, + { + "epoch": 0.78, + "grad_norm": 0.5871495885697342, + "learning_rate": 6.979100710102888e-06, + "loss": 0.485, + "step": 6112 + }, + { + "epoch": 0.78, + "grad_norm": 0.8050561993364773, + "learning_rate": 6.97815333342771e-06, + "loss": 0.6033, + "step": 6113 + }, + { + "epoch": 0.78, + "grad_norm": 0.7343342902711865, + "learning_rate": 6.9772058725488655e-06, + "loss": 0.6465, + "step": 6114 + }, + { + "epoch": 0.78, + "grad_norm": 0.6408911840395797, + "learning_rate": 6.9762583275066855e-06, + "loss": 0.5082, + "step": 6115 + }, + { + "epoch": 0.78, + "grad_norm": 0.7044054058239293, + "learning_rate": 6.975310698341503e-06, + "loss": 0.5339, + "step": 6116 + }, + { + "epoch": 0.78, + "grad_norm": 0.5868739593970147, + "learning_rate": 6.974362985093656e-06, + "loss": 0.5566, + "step": 6117 + }, + { + "epoch": 0.78, + "grad_norm": 0.6440622083548919, + "learning_rate": 6.973415187803486e-06, + "loss": 0.5513, + "step": 6118 + }, + { + "epoch": 0.78, + "grad_norm": 0.7066928954633134, + "learning_rate": 6.972467306511338e-06, + "loss": 0.5257, + "step": 6119 + }, + { + "epoch": 0.78, + "grad_norm": 0.7700288467201174, + "learning_rate": 6.97151934125756e-06, + "loss": 0.5991, + "step": 6120 + }, + { + "epoch": 0.78, + "grad_norm": 0.5283508544117611, + "learning_rate": 6.970571292082504e-06, + "loss": 0.5175, + "step": 6121 + }, + { + "epoch": 0.78, + "grad_norm": 0.8470524703983034, + "learning_rate": 6.969623159026524e-06, + "loss": 0.6532, + "step": 6122 + }, + { + "epoch": 0.78, + "grad_norm": 0.6218391939761694, + "learning_rate": 6.96867494212998e-06, + "loss": 0.5331, + "step": 6123 + }, + { + "epoch": 0.78, + "grad_norm": 0.7473198316412153, + "learning_rate": 6.967726641433236e-06, + "loss": 0.6415, + "step": 6124 + }, + { + "epoch": 0.78, + "grad_norm": 0.5365926497601979, + "learning_rate": 6.966778256976656e-06, + "loss": 0.4869, + "step": 6125 + }, + { + "epoch": 0.78, + "grad_norm": 0.914592815355564, + "learning_rate": 6.965829788800611e-06, + "loss": 0.6547, + "step": 6126 + }, + { + "epoch": 0.78, + "grad_norm": 0.6601173258279925, + "learning_rate": 6.964881236945474e-06, + "loss": 0.5607, + "step": 6127 + }, + { + "epoch": 0.78, + "grad_norm": 0.6843616194760515, + "learning_rate": 6.963932601451621e-06, + "loss": 0.5407, + "step": 6128 + }, + { + "epoch": 0.78, + "grad_norm": 0.7183738774638622, + "learning_rate": 6.962983882359433e-06, + "loss": 0.6158, + "step": 6129 + }, + { + "epoch": 0.78, + "grad_norm": 0.7680164649880659, + "learning_rate": 6.962035079709294e-06, + "loss": 0.6409, + "step": 6130 + }, + { + "epoch": 0.78, + "grad_norm": 0.6684581297969855, + "learning_rate": 6.9610861935415915e-06, + "loss": 0.5631, + "step": 6131 + }, + { + "epoch": 0.78, + "grad_norm": 0.8363427677623987, + "learning_rate": 6.960137223896717e-06, + "loss": 0.6002, + "step": 6132 + }, + { + "epoch": 0.78, + "grad_norm": 0.7994319823281306, + "learning_rate": 6.9591881708150645e-06, + "loss": 0.5981, + "step": 6133 + }, + { + "epoch": 0.78, + "grad_norm": 0.6396602571762071, + "learning_rate": 6.958239034337032e-06, + "loss": 0.48, + "step": 6134 + }, + { + "epoch": 0.78, + "grad_norm": 0.9312708990308008, + "learning_rate": 6.95728981450302e-06, + "loss": 0.5926, + "step": 6135 + }, + { + "epoch": 0.78, + "grad_norm": 0.7540675079988468, + "learning_rate": 6.956340511353436e-06, + "loss": 0.6073, + "step": 6136 + }, + { + "epoch": 0.78, + "grad_norm": 0.7620861994525625, + "learning_rate": 6.95539112492869e-06, + "loss": 0.5923, + "step": 6137 + }, + { + "epoch": 0.78, + "grad_norm": 0.7773320261409922, + "learning_rate": 6.95444165526919e-06, + "loss": 0.6306, + "step": 6138 + }, + { + "epoch": 0.78, + "grad_norm": 0.6345931819255022, + "learning_rate": 6.953492102415355e-06, + "loss": 0.497, + "step": 6139 + }, + { + "epoch": 0.78, + "grad_norm": 0.7509745325030854, + "learning_rate": 6.9525424664076046e-06, + "loss": 0.6594, + "step": 6140 + }, + { + "epoch": 0.78, + "grad_norm": 0.8573200746023567, + "learning_rate": 6.9515927472863595e-06, + "loss": 0.6061, + "step": 6141 + }, + { + "epoch": 0.78, + "grad_norm": 0.6054241820101158, + "learning_rate": 6.950642945092047e-06, + "loss": 0.5233, + "step": 6142 + }, + { + "epoch": 0.78, + "grad_norm": 0.8119505648001418, + "learning_rate": 6.9496930598651e-06, + "loss": 0.609, + "step": 6143 + }, + { + "epoch": 0.78, + "grad_norm": 0.6494219303020818, + "learning_rate": 6.948743091645949e-06, + "loss": 0.5748, + "step": 6144 + }, + { + "epoch": 0.78, + "grad_norm": 0.8789723812372376, + "learning_rate": 6.9477930404750304e-06, + "loss": 0.6145, + "step": 6145 + }, + { + "epoch": 0.78, + "grad_norm": 0.6700853983739093, + "learning_rate": 6.9468429063927875e-06, + "loss": 0.5913, + "step": 6146 + }, + { + "epoch": 0.78, + "grad_norm": 0.7035478550174823, + "learning_rate": 6.945892689439664e-06, + "loss": 0.5895, + "step": 6147 + }, + { + "epoch": 0.78, + "grad_norm": 0.6381283890988907, + "learning_rate": 6.9449423896561055e-06, + "loss": 0.5454, + "step": 6148 + }, + { + "epoch": 0.78, + "grad_norm": 0.6237204562565154, + "learning_rate": 6.943992007082565e-06, + "loss": 0.5893, + "step": 6149 + }, + { + "epoch": 0.78, + "grad_norm": 0.8342605489853235, + "learning_rate": 6.943041541759496e-06, + "loss": 0.6199, + "step": 6150 + }, + { + "epoch": 0.78, + "grad_norm": 0.6102616469181731, + "learning_rate": 6.94209099372736e-06, + "loss": 0.5517, + "step": 6151 + }, + { + "epoch": 0.78, + "grad_norm": 0.9253336997123479, + "learning_rate": 6.941140363026615e-06, + "loss": 0.5974, + "step": 6152 + }, + { + "epoch": 0.78, + "grad_norm": 0.525117978333921, + "learning_rate": 6.940189649697728e-06, + "loss": 0.5245, + "step": 6153 + }, + { + "epoch": 0.78, + "grad_norm": 0.6832617034871739, + "learning_rate": 6.9392388537811675e-06, + "loss": 0.5946, + "step": 6154 + }, + { + "epoch": 0.78, + "grad_norm": 0.5714910527073446, + "learning_rate": 6.938287975317406e-06, + "loss": 0.5195, + "step": 6155 + }, + { + "epoch": 0.78, + "grad_norm": 0.6361896105627376, + "learning_rate": 6.937337014346918e-06, + "loss": 0.6188, + "step": 6156 + }, + { + "epoch": 0.78, + "grad_norm": 0.7734582689465479, + "learning_rate": 6.936385970910185e-06, + "loss": 0.5521, + "step": 6157 + }, + { + "epoch": 0.78, + "grad_norm": 0.7891876975678117, + "learning_rate": 6.9354348450476894e-06, + "loss": 0.6444, + "step": 6158 + }, + { + "epoch": 0.78, + "grad_norm": 0.7203664824765719, + "learning_rate": 6.934483636799918e-06, + "loss": 0.6059, + "step": 6159 + }, + { + "epoch": 0.78, + "grad_norm": 0.7088666589050271, + "learning_rate": 6.933532346207359e-06, + "loss": 0.565, + "step": 6160 + }, + { + "epoch": 0.78, + "grad_norm": 1.0739967395205492, + "learning_rate": 6.932580973310507e-06, + "loss": 0.6433, + "step": 6161 + }, + { + "epoch": 0.79, + "grad_norm": 0.6721416735590974, + "learning_rate": 6.931629518149858e-06, + "loss": 0.5417, + "step": 6162 + }, + { + "epoch": 0.79, + "grad_norm": 0.8208870301459986, + "learning_rate": 6.930677980765913e-06, + "loss": 0.6081, + "step": 6163 + }, + { + "epoch": 0.79, + "grad_norm": 0.7918140216290176, + "learning_rate": 6.929726361199176e-06, + "loss": 0.6054, + "step": 6164 + }, + { + "epoch": 0.79, + "grad_norm": 0.6279839957932126, + "learning_rate": 6.928774659490155e-06, + "loss": 0.4815, + "step": 6165 + }, + { + "epoch": 0.79, + "grad_norm": 0.5384804074980266, + "learning_rate": 6.9278228756793594e-06, + "loss": 0.4604, + "step": 6166 + }, + { + "epoch": 0.79, + "grad_norm": 0.5964097962248348, + "learning_rate": 6.9268710098073044e-06, + "loss": 0.5459, + "step": 6167 + }, + { + "epoch": 0.79, + "grad_norm": 0.5860460691839549, + "learning_rate": 6.925919061914509e-06, + "loss": 0.4896, + "step": 6168 + }, + { + "epoch": 0.79, + "grad_norm": 0.6369287628579393, + "learning_rate": 6.924967032041493e-06, + "loss": 0.5444, + "step": 6169 + }, + { + "epoch": 0.79, + "grad_norm": 0.5809937042979003, + "learning_rate": 6.924014920228781e-06, + "loss": 0.5216, + "step": 6170 + }, + { + "epoch": 0.79, + "grad_norm": 0.799396344707945, + "learning_rate": 6.923062726516902e-06, + "loss": 0.6009, + "step": 6171 + }, + { + "epoch": 0.79, + "grad_norm": 0.6579431336556247, + "learning_rate": 6.922110450946389e-06, + "loss": 0.5732, + "step": 6172 + }, + { + "epoch": 0.79, + "grad_norm": 0.9682610077255045, + "learning_rate": 6.921158093557776e-06, + "loss": 0.6369, + "step": 6173 + }, + { + "epoch": 0.79, + "grad_norm": 0.7393093470537417, + "learning_rate": 6.920205654391602e-06, + "loss": 0.6148, + "step": 6174 + }, + { + "epoch": 0.79, + "grad_norm": 0.7832586780668236, + "learning_rate": 6.9192531334884095e-06, + "loss": 0.6525, + "step": 6175 + }, + { + "epoch": 0.79, + "grad_norm": 0.7039747573022053, + "learning_rate": 6.918300530888744e-06, + "loss": 0.5907, + "step": 6176 + }, + { + "epoch": 0.79, + "grad_norm": 0.5574547025823923, + "learning_rate": 6.917347846633155e-06, + "loss": 0.5281, + "step": 6177 + }, + { + "epoch": 0.79, + "grad_norm": 0.684420410474668, + "learning_rate": 6.916395080762196e-06, + "loss": 0.6124, + "step": 6178 + }, + { + "epoch": 0.79, + "grad_norm": 0.6630924917107729, + "learning_rate": 6.91544223331642e-06, + "loss": 0.5838, + "step": 6179 + }, + { + "epoch": 0.79, + "grad_norm": 1.0094688674188406, + "learning_rate": 6.914489304336391e-06, + "loss": 0.661, + "step": 6180 + }, + { + "epoch": 0.79, + "grad_norm": 0.693860965646948, + "learning_rate": 6.91353629386267e-06, + "loss": 0.5885, + "step": 6181 + }, + { + "epoch": 0.79, + "grad_norm": 0.6045233868854597, + "learning_rate": 6.912583201935823e-06, + "loss": 0.4921, + "step": 6182 + }, + { + "epoch": 0.79, + "grad_norm": 0.5976208342295841, + "learning_rate": 6.911630028596421e-06, + "loss": 0.5161, + "step": 6183 + }, + { + "epoch": 0.79, + "grad_norm": 0.6133173503245882, + "learning_rate": 6.9106767738850386e-06, + "loss": 0.4978, + "step": 6184 + }, + { + "epoch": 0.79, + "grad_norm": 0.710504803794944, + "learning_rate": 6.909723437842249e-06, + "loss": 0.628, + "step": 6185 + }, + { + "epoch": 0.79, + "grad_norm": 0.7743011024412121, + "learning_rate": 6.908770020508637e-06, + "loss": 0.6303, + "step": 6186 + }, + { + "epoch": 0.79, + "grad_norm": 0.6856054433042441, + "learning_rate": 6.907816521924785e-06, + "loss": 0.557, + "step": 6187 + }, + { + "epoch": 0.79, + "grad_norm": 0.781354247419235, + "learning_rate": 6.90686294213128e-06, + "loss": 0.5982, + "step": 6188 + }, + { + "epoch": 0.79, + "grad_norm": 0.754877037475734, + "learning_rate": 6.905909281168713e-06, + "loss": 0.6367, + "step": 6189 + }, + { + "epoch": 0.79, + "grad_norm": 0.7514741435184724, + "learning_rate": 6.904955539077678e-06, + "loss": 0.5725, + "step": 6190 + }, + { + "epoch": 0.79, + "grad_norm": 0.8216087262393915, + "learning_rate": 6.904001715898773e-06, + "loss": 0.6266, + "step": 6191 + }, + { + "epoch": 0.79, + "grad_norm": 0.7563601759227156, + "learning_rate": 6.903047811672599e-06, + "loss": 0.5646, + "step": 6192 + }, + { + "epoch": 0.79, + "grad_norm": 0.6395881979251259, + "learning_rate": 6.9020938264397615e-06, + "loss": 0.4992, + "step": 6193 + }, + { + "epoch": 0.79, + "grad_norm": 0.5733684592744743, + "learning_rate": 6.901139760240868e-06, + "loss": 0.4656, + "step": 6194 + }, + { + "epoch": 0.79, + "grad_norm": 0.5579104745176718, + "learning_rate": 6.90018561311653e-06, + "loss": 0.529, + "step": 6195 + }, + { + "epoch": 0.79, + "grad_norm": 0.8436171976687403, + "learning_rate": 6.899231385107364e-06, + "loss": 0.619, + "step": 6196 + }, + { + "epoch": 0.79, + "grad_norm": 0.5820596508606891, + "learning_rate": 6.8982770762539845e-06, + "loss": 0.5333, + "step": 6197 + }, + { + "epoch": 0.79, + "grad_norm": 0.8198069024579641, + "learning_rate": 6.897322686597016e-06, + "loss": 0.5821, + "step": 6198 + }, + { + "epoch": 0.79, + "grad_norm": 0.5074964758574475, + "learning_rate": 6.8963682161770854e-06, + "loss": 0.4855, + "step": 6199 + }, + { + "epoch": 0.79, + "grad_norm": 0.5053268102071425, + "learning_rate": 6.895413665034819e-06, + "loss": 0.461, + "step": 6200 + }, + { + "epoch": 0.79, + "grad_norm": 0.7541032712826765, + "learning_rate": 6.89445903321085e-06, + "loss": 0.577, + "step": 6201 + }, + { + "epoch": 0.79, + "grad_norm": 0.7463920325018282, + "learning_rate": 6.893504320745814e-06, + "loss": 0.6272, + "step": 6202 + }, + { + "epoch": 0.79, + "grad_norm": 0.6227189089476501, + "learning_rate": 6.892549527680348e-06, + "loss": 0.5534, + "step": 6203 + }, + { + "epoch": 0.79, + "grad_norm": 0.6424807333780551, + "learning_rate": 6.891594654055098e-06, + "loss": 0.5329, + "step": 6204 + }, + { + "epoch": 0.79, + "grad_norm": 0.7734804845782416, + "learning_rate": 6.8906396999107085e-06, + "loss": 0.5427, + "step": 6205 + }, + { + "epoch": 0.79, + "grad_norm": 0.742196668913134, + "learning_rate": 6.889684665287828e-06, + "loss": 0.6229, + "step": 6206 + }, + { + "epoch": 0.79, + "grad_norm": 0.8613033288837254, + "learning_rate": 6.88872955022711e-06, + "loss": 0.679, + "step": 6207 + }, + { + "epoch": 0.79, + "grad_norm": 0.6666564855842763, + "learning_rate": 6.88777435476921e-06, + "loss": 0.5233, + "step": 6208 + }, + { + "epoch": 0.79, + "grad_norm": 0.8033239955118487, + "learning_rate": 6.88681907895479e-06, + "loss": 0.6039, + "step": 6209 + }, + { + "epoch": 0.79, + "grad_norm": 0.7021445627143932, + "learning_rate": 6.8858637228245086e-06, + "loss": 0.6238, + "step": 6210 + }, + { + "epoch": 0.79, + "grad_norm": 0.6665255342789965, + "learning_rate": 6.884908286419036e-06, + "loss": 0.5371, + "step": 6211 + }, + { + "epoch": 0.79, + "grad_norm": 0.5943111470724661, + "learning_rate": 6.8839527697790405e-06, + "loss": 0.5069, + "step": 6212 + }, + { + "epoch": 0.79, + "grad_norm": 0.5626700224449825, + "learning_rate": 6.882997172945198e-06, + "loss": 0.509, + "step": 6213 + }, + { + "epoch": 0.79, + "grad_norm": 0.7003198306754069, + "learning_rate": 6.882041495958181e-06, + "loss": 0.4998, + "step": 6214 + }, + { + "epoch": 0.79, + "grad_norm": 0.5864565904943859, + "learning_rate": 6.881085738858673e-06, + "loss": 0.513, + "step": 6215 + }, + { + "epoch": 0.79, + "grad_norm": 0.5634487255844925, + "learning_rate": 6.880129901687356e-06, + "loss": 0.4916, + "step": 6216 + }, + { + "epoch": 0.79, + "grad_norm": 0.5806470534231932, + "learning_rate": 6.879173984484915e-06, + "loss": 0.5266, + "step": 6217 + }, + { + "epoch": 0.79, + "grad_norm": 0.7388452903608196, + "learning_rate": 6.878217987292044e-06, + "loss": 0.626, + "step": 6218 + }, + { + "epoch": 0.79, + "grad_norm": 0.7120962507420958, + "learning_rate": 6.8772619101494356e-06, + "loss": 0.545, + "step": 6219 + }, + { + "epoch": 0.79, + "grad_norm": 0.5667017214524706, + "learning_rate": 6.876305753097786e-06, + "loss": 0.4413, + "step": 6220 + }, + { + "epoch": 0.79, + "grad_norm": 0.9126339519742783, + "learning_rate": 6.875349516177796e-06, + "loss": 0.607, + "step": 6221 + }, + { + "epoch": 0.79, + "grad_norm": 0.6508330591251228, + "learning_rate": 6.87439319943017e-06, + "loss": 0.5155, + "step": 6222 + }, + { + "epoch": 0.79, + "grad_norm": 0.559887847798657, + "learning_rate": 6.873436802895617e-06, + "loss": 0.5099, + "step": 6223 + }, + { + "epoch": 0.79, + "grad_norm": 0.7936099331457016, + "learning_rate": 6.872480326614844e-06, + "loss": 0.5903, + "step": 6224 + }, + { + "epoch": 0.79, + "grad_norm": 0.610167109177135, + "learning_rate": 6.871523770628568e-06, + "loss": 0.5426, + "step": 6225 + }, + { + "epoch": 0.79, + "grad_norm": 0.8166665705034846, + "learning_rate": 6.870567134977505e-06, + "loss": 0.6252, + "step": 6226 + }, + { + "epoch": 0.79, + "grad_norm": 0.7287601280290925, + "learning_rate": 6.8696104197023776e-06, + "loss": 0.6008, + "step": 6227 + }, + { + "epoch": 0.79, + "grad_norm": 0.601670641793849, + "learning_rate": 6.868653624843906e-06, + "loss": 0.5068, + "step": 6228 + }, + { + "epoch": 0.79, + "grad_norm": 0.7617357785708877, + "learning_rate": 6.8676967504428235e-06, + "loss": 0.61, + "step": 6229 + }, + { + "epoch": 0.79, + "grad_norm": 0.7307420799092397, + "learning_rate": 6.866739796539859e-06, + "loss": 0.6041, + "step": 6230 + }, + { + "epoch": 0.79, + "grad_norm": 0.5876454093073531, + "learning_rate": 6.865782763175746e-06, + "loss": 0.4629, + "step": 6231 + }, + { + "epoch": 0.79, + "grad_norm": 0.7987521085472857, + "learning_rate": 6.864825650391223e-06, + "loss": 0.6566, + "step": 6232 + }, + { + "epoch": 0.79, + "grad_norm": 0.7055230017174057, + "learning_rate": 6.863868458227031e-06, + "loss": 0.5847, + "step": 6233 + }, + { + "epoch": 0.79, + "grad_norm": 0.7088863941530662, + "learning_rate": 6.862911186723914e-06, + "loss": 0.5952, + "step": 6234 + }, + { + "epoch": 0.79, + "grad_norm": 0.7315691821038331, + "learning_rate": 6.861953835922621e-06, + "loss": 0.6058, + "step": 6235 + }, + { + "epoch": 0.79, + "grad_norm": 0.7202830970738314, + "learning_rate": 6.860996405863905e-06, + "loss": 0.5745, + "step": 6236 + }, + { + "epoch": 0.79, + "grad_norm": 0.7226304380598846, + "learning_rate": 6.860038896588517e-06, + "loss": 0.5845, + "step": 6237 + }, + { + "epoch": 0.79, + "grad_norm": 0.8105305245141821, + "learning_rate": 6.859081308137217e-06, + "loss": 0.6127, + "step": 6238 + }, + { + "epoch": 0.79, + "grad_norm": 0.8045750980169796, + "learning_rate": 6.858123640550768e-06, + "loss": 0.5539, + "step": 6239 + }, + { + "epoch": 0.79, + "grad_norm": 0.6142025774557623, + "learning_rate": 6.8571658938699325e-06, + "loss": 0.5098, + "step": 6240 + }, + { + "epoch": 0.8, + "grad_norm": 0.7187259568110902, + "learning_rate": 6.856208068135479e-06, + "loss": 0.6084, + "step": 6241 + }, + { + "epoch": 0.8, + "grad_norm": 0.7185962916088359, + "learning_rate": 6.855250163388181e-06, + "loss": 0.5968, + "step": 6242 + }, + { + "epoch": 0.8, + "grad_norm": 0.8012193986950965, + "learning_rate": 6.854292179668811e-06, + "loss": 0.5973, + "step": 6243 + }, + { + "epoch": 0.8, + "grad_norm": 0.6125550453861028, + "learning_rate": 6.853334117018149e-06, + "loss": 0.4825, + "step": 6244 + }, + { + "epoch": 0.8, + "grad_norm": 0.749544309407826, + "learning_rate": 6.852375975476975e-06, + "loss": 0.5419, + "step": 6245 + }, + { + "epoch": 0.8, + "grad_norm": 0.5020314314941644, + "learning_rate": 6.851417755086076e-06, + "loss": 0.4914, + "step": 6246 + }, + { + "epoch": 0.8, + "grad_norm": 0.7697754095881776, + "learning_rate": 6.850459455886238e-06, + "loss": 0.6191, + "step": 6247 + }, + { + "epoch": 0.8, + "grad_norm": 0.5241607131772357, + "learning_rate": 6.8495010779182555e-06, + "loss": 0.5187, + "step": 6248 + }, + { + "epoch": 0.8, + "grad_norm": 0.5733990169862537, + "learning_rate": 6.848542621222922e-06, + "loss": 0.5339, + "step": 6249 + }, + { + "epoch": 0.8, + "grad_norm": 0.9785630042083076, + "learning_rate": 6.847584085841037e-06, + "loss": 0.6041, + "step": 6250 + }, + { + "epoch": 0.8, + "grad_norm": 0.6649692871869353, + "learning_rate": 6.846625471813402e-06, + "loss": 0.5882, + "step": 6251 + }, + { + "epoch": 0.8, + "grad_norm": 0.8112827043327446, + "learning_rate": 6.84566677918082e-06, + "loss": 0.6561, + "step": 6252 + }, + { + "epoch": 0.8, + "grad_norm": 0.7365231857471037, + "learning_rate": 6.8447080079841e-06, + "loss": 0.5633, + "step": 6253 + }, + { + "epoch": 0.8, + "grad_norm": 0.5696004897404208, + "learning_rate": 6.843749158264057e-06, + "loss": 0.5103, + "step": 6254 + }, + { + "epoch": 0.8, + "grad_norm": 0.6108554199444037, + "learning_rate": 6.842790230061504e-06, + "loss": 0.5826, + "step": 6255 + }, + { + "epoch": 0.8, + "grad_norm": 0.6850630542323604, + "learning_rate": 6.841831223417259e-06, + "loss": 0.5647, + "step": 6256 + }, + { + "epoch": 0.8, + "grad_norm": 0.7699240680149971, + "learning_rate": 6.840872138372145e-06, + "loss": 0.6505, + "step": 6257 + }, + { + "epoch": 0.8, + "grad_norm": 0.5776650376463175, + "learning_rate": 6.8399129749669866e-06, + "loss": 0.533, + "step": 6258 + }, + { + "epoch": 0.8, + "grad_norm": 0.5806014451399877, + "learning_rate": 6.838953733242611e-06, + "loss": 0.5068, + "step": 6259 + }, + { + "epoch": 0.8, + "grad_norm": 0.6530780661175465, + "learning_rate": 6.837994413239854e-06, + "loss": 0.5724, + "step": 6260 + }, + { + "epoch": 0.8, + "grad_norm": 0.8350028438034934, + "learning_rate": 6.837035014999546e-06, + "loss": 0.5839, + "step": 6261 + }, + { + "epoch": 0.8, + "grad_norm": 0.7615034528067307, + "learning_rate": 6.836075538562529e-06, + "loss": 0.602, + "step": 6262 + }, + { + "epoch": 0.8, + "grad_norm": 0.5822667151176976, + "learning_rate": 6.8351159839696436e-06, + "loss": 0.5173, + "step": 6263 + }, + { + "epoch": 0.8, + "grad_norm": 0.755752935684026, + "learning_rate": 6.8341563512617335e-06, + "loss": 0.5996, + "step": 6264 + }, + { + "epoch": 0.8, + "grad_norm": 0.7101963630379892, + "learning_rate": 6.833196640479649e-06, + "loss": 0.5841, + "step": 6265 + }, + { + "epoch": 0.8, + "grad_norm": 0.6852151021143676, + "learning_rate": 6.832236851664242e-06, + "loss": 0.5554, + "step": 6266 + }, + { + "epoch": 0.8, + "grad_norm": 0.6015082261462827, + "learning_rate": 6.831276984856369e-06, + "loss": 0.497, + "step": 6267 + }, + { + "epoch": 0.8, + "grad_norm": 0.6176942236606779, + "learning_rate": 6.830317040096886e-06, + "loss": 0.4986, + "step": 6268 + }, + { + "epoch": 0.8, + "grad_norm": 0.7603440133190384, + "learning_rate": 6.8293570174266535e-06, + "loss": 0.6027, + "step": 6269 + }, + { + "epoch": 0.8, + "grad_norm": 0.637943257089538, + "learning_rate": 6.828396916886539e-06, + "loss": 0.5489, + "step": 6270 + }, + { + "epoch": 0.8, + "grad_norm": 0.7500199822387734, + "learning_rate": 6.827436738517412e-06, + "loss": 0.5838, + "step": 6271 + }, + { + "epoch": 0.8, + "grad_norm": 0.6582165205043784, + "learning_rate": 6.8264764823601404e-06, + "loss": 0.5221, + "step": 6272 + }, + { + "epoch": 0.8, + "grad_norm": 0.7077572872091533, + "learning_rate": 6.8255161484556045e-06, + "loss": 0.586, + "step": 6273 + }, + { + "epoch": 0.8, + "grad_norm": 0.7533088625713023, + "learning_rate": 6.8245557368446776e-06, + "loss": 0.6029, + "step": 6274 + }, + { + "epoch": 0.8, + "grad_norm": 0.7037599100165665, + "learning_rate": 6.823595247568244e-06, + "loss": 0.5991, + "step": 6275 + }, + { + "epoch": 0.8, + "grad_norm": 0.6571375999275945, + "learning_rate": 6.8226346806671885e-06, + "loss": 0.529, + "step": 6276 + }, + { + "epoch": 0.8, + "grad_norm": 0.6469866110133251, + "learning_rate": 6.821674036182399e-06, + "loss": 0.5267, + "step": 6277 + }, + { + "epoch": 0.8, + "grad_norm": 0.5644371973477075, + "learning_rate": 6.820713314154765e-06, + "loss": 0.5203, + "step": 6278 + }, + { + "epoch": 0.8, + "grad_norm": 0.5959305779758147, + "learning_rate": 6.819752514625184e-06, + "loss": 0.5317, + "step": 6279 + }, + { + "epoch": 0.8, + "grad_norm": 0.7102077211387456, + "learning_rate": 6.8187916376345555e-06, + "loss": 0.6044, + "step": 6280 + }, + { + "epoch": 0.8, + "grad_norm": 0.5544538366449274, + "learning_rate": 6.817830683223778e-06, + "loss": 0.5082, + "step": 6281 + }, + { + "epoch": 0.8, + "grad_norm": 0.8184035636411395, + "learning_rate": 6.816869651433757e-06, + "loss": 0.6817, + "step": 6282 + }, + { + "epoch": 0.8, + "grad_norm": 0.7213404586909871, + "learning_rate": 6.815908542305402e-06, + "loss": 0.5216, + "step": 6283 + }, + { + "epoch": 0.8, + "grad_norm": 0.7223103989692812, + "learning_rate": 6.814947355879622e-06, + "loss": 0.5678, + "step": 6284 + }, + { + "epoch": 0.8, + "grad_norm": 0.7481573018533019, + "learning_rate": 6.8139860921973335e-06, + "loss": 0.5798, + "step": 6285 + }, + { + "epoch": 0.8, + "grad_norm": 0.6012225613001245, + "learning_rate": 6.813024751299453e-06, + "loss": 0.5014, + "step": 6286 + }, + { + "epoch": 0.8, + "grad_norm": 0.6577919408740096, + "learning_rate": 6.812063333226903e-06, + "loss": 0.5676, + "step": 6287 + }, + { + "epoch": 0.8, + "grad_norm": 0.5780130804933448, + "learning_rate": 6.8111018380206075e-06, + "loss": 0.5257, + "step": 6288 + }, + { + "epoch": 0.8, + "grad_norm": 0.6037215296113814, + "learning_rate": 6.8101402657214934e-06, + "loss": 0.5057, + "step": 6289 + }, + { + "epoch": 0.8, + "grad_norm": 0.6517998028216473, + "learning_rate": 6.809178616370492e-06, + "loss": 0.5293, + "step": 6290 + }, + { + "epoch": 0.8, + "grad_norm": 0.6471955927017133, + "learning_rate": 6.8082168900085385e-06, + "loss": 0.525, + "step": 6291 + }, + { + "epoch": 0.8, + "grad_norm": 0.8668090658652021, + "learning_rate": 6.807255086676572e-06, + "loss": 0.6323, + "step": 6292 + }, + { + "epoch": 0.8, + "grad_norm": 0.7283029204199846, + "learning_rate": 6.80629320641553e-06, + "loss": 0.6375, + "step": 6293 + }, + { + "epoch": 0.8, + "grad_norm": 0.777387050154529, + "learning_rate": 6.805331249266359e-06, + "loss": 0.6458, + "step": 6294 + }, + { + "epoch": 0.8, + "grad_norm": 0.6352214781589633, + "learning_rate": 6.8043692152700056e-06, + "loss": 0.5185, + "step": 6295 + }, + { + "epoch": 0.8, + "grad_norm": 0.684343183397395, + "learning_rate": 6.803407104467421e-06, + "loss": 0.6893, + "step": 6296 + }, + { + "epoch": 0.8, + "grad_norm": 0.7276600518156257, + "learning_rate": 6.802444916899558e-06, + "loss": 0.5878, + "step": 6297 + }, + { + "epoch": 0.8, + "grad_norm": 0.6796942134435697, + "learning_rate": 6.801482652607375e-06, + "loss": 0.5382, + "step": 6298 + }, + { + "epoch": 0.8, + "grad_norm": 0.6201585270623763, + "learning_rate": 6.800520311631833e-06, + "loss": 0.5295, + "step": 6299 + }, + { + "epoch": 0.8, + "grad_norm": 0.7462001506166249, + "learning_rate": 6.799557894013894e-06, + "loss": 0.6015, + "step": 6300 + }, + { + "epoch": 0.8, + "grad_norm": 0.6913324373138605, + "learning_rate": 6.798595399794527e-06, + "loss": 0.6534, + "step": 6301 + }, + { + "epoch": 0.8, + "grad_norm": 0.8357334549675459, + "learning_rate": 6.7976328290147e-06, + "loss": 0.6185, + "step": 6302 + }, + { + "epoch": 0.8, + "grad_norm": 0.546093348874749, + "learning_rate": 6.796670181715388e-06, + "loss": 0.474, + "step": 6303 + }, + { + "epoch": 0.8, + "grad_norm": 0.7175699288559444, + "learning_rate": 6.795707457937568e-06, + "loss": 0.585, + "step": 6304 + }, + { + "epoch": 0.8, + "grad_norm": 0.6450632005756834, + "learning_rate": 6.794744657722221e-06, + "loss": 0.5129, + "step": 6305 + }, + { + "epoch": 0.8, + "grad_norm": 0.7124027743503827, + "learning_rate": 6.793781781110328e-06, + "loss": 0.6108, + "step": 6306 + }, + { + "epoch": 0.8, + "grad_norm": 0.8872452299574621, + "learning_rate": 6.792818828142876e-06, + "loss": 0.6345, + "step": 6307 + }, + { + "epoch": 0.8, + "grad_norm": 0.7730338132466026, + "learning_rate": 6.791855798860857e-06, + "loss": 0.5703, + "step": 6308 + }, + { + "epoch": 0.8, + "grad_norm": 0.6589697691999539, + "learning_rate": 6.790892693305261e-06, + "loss": 0.5561, + "step": 6309 + }, + { + "epoch": 0.8, + "grad_norm": 0.6191895272824437, + "learning_rate": 6.789929511517087e-06, + "loss": 0.5128, + "step": 6310 + }, + { + "epoch": 0.8, + "grad_norm": 0.7340550573357525, + "learning_rate": 6.788966253537333e-06, + "loss": 0.6193, + "step": 6311 + }, + { + "epoch": 0.8, + "grad_norm": 0.6602665420114553, + "learning_rate": 6.788002919407003e-06, + "loss": 0.4922, + "step": 6312 + }, + { + "epoch": 0.8, + "grad_norm": 0.6551173853319442, + "learning_rate": 6.7870395091671014e-06, + "loss": 0.5302, + "step": 6313 + }, + { + "epoch": 0.8, + "grad_norm": 0.8288366529652099, + "learning_rate": 6.78607602285864e-06, + "loss": 0.6447, + "step": 6314 + }, + { + "epoch": 0.8, + "grad_norm": 0.8531539098497349, + "learning_rate": 6.785112460522627e-06, + "loss": 0.6653, + "step": 6315 + }, + { + "epoch": 0.8, + "grad_norm": 0.7759389167907225, + "learning_rate": 6.784148822200084e-06, + "loss": 0.6719, + "step": 6316 + }, + { + "epoch": 0.8, + "grad_norm": 0.6430947513586196, + "learning_rate": 6.783185107932025e-06, + "loss": 0.4969, + "step": 6317 + }, + { + "epoch": 0.8, + "grad_norm": 0.6864824543720452, + "learning_rate": 6.782221317759475e-06, + "loss": 0.5667, + "step": 6318 + }, + { + "epoch": 0.81, + "grad_norm": 0.5712421829392823, + "learning_rate": 6.78125745172346e-06, + "loss": 0.5097, + "step": 6319 + }, + { + "epoch": 0.81, + "grad_norm": 0.9133388746218416, + "learning_rate": 6.7802935098650055e-06, + "loss": 0.6281, + "step": 6320 + }, + { + "epoch": 0.81, + "grad_norm": 0.8147174738533315, + "learning_rate": 6.779329492225144e-06, + "loss": 0.6049, + "step": 6321 + }, + { + "epoch": 0.81, + "grad_norm": 0.6203870899888008, + "learning_rate": 6.778365398844915e-06, + "loss": 0.5212, + "step": 6322 + }, + { + "epoch": 0.81, + "grad_norm": 0.6220443670644604, + "learning_rate": 6.7774012297653525e-06, + "loss": 0.5344, + "step": 6323 + }, + { + "epoch": 0.81, + "grad_norm": 0.7281004215424857, + "learning_rate": 6.776436985027499e-06, + "loss": 0.5908, + "step": 6324 + }, + { + "epoch": 0.81, + "grad_norm": 0.766516496697392, + "learning_rate": 6.7754726646724e-06, + "loss": 0.5989, + "step": 6325 + }, + { + "epoch": 0.81, + "grad_norm": 0.8734819449120286, + "learning_rate": 6.774508268741105e-06, + "loss": 0.5812, + "step": 6326 + }, + { + "epoch": 0.81, + "grad_norm": 0.6165868605416652, + "learning_rate": 6.77354379727466e-06, + "loss": 0.556, + "step": 6327 + }, + { + "epoch": 0.81, + "grad_norm": 0.7544123996488468, + "learning_rate": 6.772579250314126e-06, + "loss": 0.5535, + "step": 6328 + }, + { + "epoch": 0.81, + "grad_norm": 0.6843396353768542, + "learning_rate": 6.771614627900556e-06, + "loss": 0.5076, + "step": 6329 + }, + { + "epoch": 0.81, + "grad_norm": 0.7335055256580995, + "learning_rate": 6.770649930075013e-06, + "loss": 0.626, + "step": 6330 + }, + { + "epoch": 0.81, + "grad_norm": 0.8637164624456584, + "learning_rate": 6.76968515687856e-06, + "loss": 0.6601, + "step": 6331 + }, + { + "epoch": 0.81, + "grad_norm": 0.6383292611934841, + "learning_rate": 6.768720308352267e-06, + "loss": 0.5465, + "step": 6332 + }, + { + "epoch": 0.81, + "grad_norm": 1.0248782307901643, + "learning_rate": 6.767755384537202e-06, + "loss": 0.621, + "step": 6333 + }, + { + "epoch": 0.81, + "grad_norm": 0.779170201696406, + "learning_rate": 6.766790385474436e-06, + "loss": 0.6268, + "step": 6334 + }, + { + "epoch": 0.81, + "grad_norm": 0.6401784606901068, + "learning_rate": 6.765825311205052e-06, + "loss": 0.5438, + "step": 6335 + }, + { + "epoch": 0.81, + "grad_norm": 0.7786211418180801, + "learning_rate": 6.764860161770128e-06, + "loss": 0.6498, + "step": 6336 + }, + { + "epoch": 0.81, + "grad_norm": 0.6726624664743807, + "learning_rate": 6.7638949372107455e-06, + "loss": 0.5716, + "step": 6337 + }, + { + "epoch": 0.81, + "grad_norm": 0.6774421434867293, + "learning_rate": 6.762929637567992e-06, + "loss": 0.5998, + "step": 6338 + }, + { + "epoch": 0.81, + "grad_norm": 0.5738097360089144, + "learning_rate": 6.761964262882957e-06, + "loss": 0.5201, + "step": 6339 + }, + { + "epoch": 0.81, + "grad_norm": 0.578057836464607, + "learning_rate": 6.760998813196735e-06, + "loss": 0.5358, + "step": 6340 + }, + { + "epoch": 0.81, + "grad_norm": 0.9411607403286338, + "learning_rate": 6.760033288550419e-06, + "loss": 0.6528, + "step": 6341 + }, + { + "epoch": 0.81, + "grad_norm": 0.7901292814985689, + "learning_rate": 6.759067688985111e-06, + "loss": 0.6205, + "step": 6342 + }, + { + "epoch": 0.81, + "grad_norm": 0.7859494662537901, + "learning_rate": 6.758102014541914e-06, + "loss": 0.6246, + "step": 6343 + }, + { + "epoch": 0.81, + "grad_norm": 0.5360097350128014, + "learning_rate": 6.757136265261931e-06, + "loss": 0.4813, + "step": 6344 + }, + { + "epoch": 0.81, + "grad_norm": 0.6384464118020083, + "learning_rate": 6.756170441186273e-06, + "loss": 0.575, + "step": 6345 + }, + { + "epoch": 0.81, + "grad_norm": 0.7887056311658751, + "learning_rate": 6.755204542356051e-06, + "loss": 0.6061, + "step": 6346 + }, + { + "epoch": 0.81, + "grad_norm": 0.7973621020106734, + "learning_rate": 6.754238568812379e-06, + "loss": 0.6175, + "step": 6347 + }, + { + "epoch": 0.81, + "grad_norm": 0.5730633541510033, + "learning_rate": 6.753272520596379e-06, + "loss": 0.5138, + "step": 6348 + }, + { + "epoch": 0.81, + "grad_norm": 0.647216245800709, + "learning_rate": 6.752306397749168e-06, + "loss": 0.5691, + "step": 6349 + }, + { + "epoch": 0.81, + "grad_norm": 0.5954325889514445, + "learning_rate": 6.751340200311875e-06, + "loss": 0.5111, + "step": 6350 + }, + { + "epoch": 0.81, + "grad_norm": 0.6657911872799888, + "learning_rate": 6.7503739283256275e-06, + "loss": 0.5294, + "step": 6351 + }, + { + "epoch": 0.81, + "grad_norm": 0.5328784579864676, + "learning_rate": 6.749407581831553e-06, + "loss": 0.5258, + "step": 6352 + }, + { + "epoch": 0.81, + "grad_norm": 0.8368523370346123, + "learning_rate": 6.748441160870788e-06, + "loss": 0.6113, + "step": 6353 + }, + { + "epoch": 0.81, + "grad_norm": 0.5718359238750406, + "learning_rate": 6.74747466548447e-06, + "loss": 0.5504, + "step": 6354 + }, + { + "epoch": 0.81, + "grad_norm": 0.7893892783212353, + "learning_rate": 6.746508095713742e-06, + "loss": 0.6742, + "step": 6355 + }, + { + "epoch": 0.81, + "grad_norm": 0.6334354873352155, + "learning_rate": 6.745541451599743e-06, + "loss": 0.576, + "step": 6356 + }, + { + "epoch": 0.81, + "grad_norm": 0.6671874039448502, + "learning_rate": 6.7445747331836235e-06, + "loss": 0.5022, + "step": 6357 + }, + { + "epoch": 0.81, + "grad_norm": 0.5682849379380871, + "learning_rate": 6.743607940506531e-06, + "loss": 0.5301, + "step": 6358 + }, + { + "epoch": 0.81, + "grad_norm": 0.6529245815969871, + "learning_rate": 6.742641073609621e-06, + "loss": 0.5456, + "step": 6359 + }, + { + "epoch": 0.81, + "grad_norm": 0.5529149987318653, + "learning_rate": 6.74167413253405e-06, + "loss": 0.5154, + "step": 6360 + }, + { + "epoch": 0.81, + "grad_norm": 0.6271718459165078, + "learning_rate": 6.740707117320976e-06, + "loss": 0.511, + "step": 6361 + }, + { + "epoch": 0.81, + "grad_norm": 0.8792267567828937, + "learning_rate": 6.739740028011564e-06, + "loss": 0.6005, + "step": 6362 + }, + { + "epoch": 0.81, + "grad_norm": 0.6326283319976258, + "learning_rate": 6.738772864646976e-06, + "loss": 0.5839, + "step": 6363 + }, + { + "epoch": 0.81, + "grad_norm": 0.7789312932868144, + "learning_rate": 6.737805627268385e-06, + "loss": 0.5517, + "step": 6364 + }, + { + "epoch": 0.81, + "grad_norm": 0.8044445950198794, + "learning_rate": 6.73683831591696e-06, + "loss": 0.6369, + "step": 6365 + }, + { + "epoch": 0.81, + "grad_norm": 0.5872765022955363, + "learning_rate": 6.73587093063388e-06, + "loss": 0.5741, + "step": 6366 + }, + { + "epoch": 0.81, + "grad_norm": 0.6077595396691541, + "learning_rate": 6.734903471460321e-06, + "loss": 0.5523, + "step": 6367 + }, + { + "epoch": 0.81, + "grad_norm": 0.6037433871943244, + "learning_rate": 6.733935938437466e-06, + "loss": 0.5268, + "step": 6368 + }, + { + "epoch": 0.81, + "grad_norm": 0.8342328555210821, + "learning_rate": 6.732968331606498e-06, + "loss": 0.6046, + "step": 6369 + }, + { + "epoch": 0.81, + "grad_norm": 0.623905236072365, + "learning_rate": 6.732000651008606e-06, + "loss": 0.5542, + "step": 6370 + }, + { + "epoch": 0.81, + "grad_norm": 0.5152553373721863, + "learning_rate": 6.731032896684979e-06, + "loss": 0.4848, + "step": 6371 + }, + { + "epoch": 0.81, + "grad_norm": 0.6491968961014971, + "learning_rate": 6.730065068676816e-06, + "loss": 0.5371, + "step": 6372 + }, + { + "epoch": 0.81, + "grad_norm": 0.6969049736847973, + "learning_rate": 6.72909716702531e-06, + "loss": 0.5595, + "step": 6373 + }, + { + "epoch": 0.81, + "grad_norm": 0.5510741188143825, + "learning_rate": 6.728129191771664e-06, + "loss": 0.4857, + "step": 6374 + }, + { + "epoch": 0.81, + "grad_norm": 0.7451068437304299, + "learning_rate": 6.727161142957081e-06, + "loss": 0.6152, + "step": 6375 + }, + { + "epoch": 0.81, + "grad_norm": 0.6298200907591202, + "learning_rate": 6.726193020622766e-06, + "loss": 0.49, + "step": 6376 + }, + { + "epoch": 0.81, + "grad_norm": 0.6542965102588401, + "learning_rate": 6.72522482480993e-06, + "loss": 0.5584, + "step": 6377 + }, + { + "epoch": 0.81, + "grad_norm": 0.7509972788597397, + "learning_rate": 6.724256555559787e-06, + "loss": 0.6335, + "step": 6378 + }, + { + "epoch": 0.81, + "grad_norm": 0.6254214657372537, + "learning_rate": 6.723288212913553e-06, + "loss": 0.5257, + "step": 6379 + }, + { + "epoch": 0.81, + "grad_norm": 0.7173655135127083, + "learning_rate": 6.722319796912446e-06, + "loss": 0.5995, + "step": 6380 + }, + { + "epoch": 0.81, + "grad_norm": 0.8969418779895143, + "learning_rate": 6.721351307597689e-06, + "loss": 0.6029, + "step": 6381 + }, + { + "epoch": 0.81, + "grad_norm": 0.7638516192884462, + "learning_rate": 6.720382745010507e-06, + "loss": 0.5027, + "step": 6382 + }, + { + "epoch": 0.81, + "grad_norm": 0.6582976704035635, + "learning_rate": 6.71941410919213e-06, + "loss": 0.4851, + "step": 6383 + }, + { + "epoch": 0.81, + "grad_norm": 1.1897043957818842, + "learning_rate": 6.718445400183789e-06, + "loss": 0.6378, + "step": 6384 + }, + { + "epoch": 0.81, + "grad_norm": 0.7208578914676788, + "learning_rate": 6.717476618026717e-06, + "loss": 0.5765, + "step": 6385 + }, + { + "epoch": 0.81, + "grad_norm": 0.5721751519254793, + "learning_rate": 6.716507762762155e-06, + "loss": 0.4664, + "step": 6386 + }, + { + "epoch": 0.81, + "grad_norm": 0.7802442509445575, + "learning_rate": 6.7155388344313434e-06, + "loss": 0.6487, + "step": 6387 + }, + { + "epoch": 0.81, + "grad_norm": 0.882001307796168, + "learning_rate": 6.714569833075524e-06, + "loss": 0.6357, + "step": 6388 + }, + { + "epoch": 0.81, + "grad_norm": 0.879967952624162, + "learning_rate": 6.713600758735946e-06, + "loss": 0.5818, + "step": 6389 + }, + { + "epoch": 0.81, + "grad_norm": 0.5626014171327017, + "learning_rate": 6.71263161145386e-06, + "loss": 0.5003, + "step": 6390 + }, + { + "epoch": 0.81, + "grad_norm": 0.7976670591117455, + "learning_rate": 6.7116623912705196e-06, + "loss": 0.6132, + "step": 6391 + }, + { + "epoch": 0.81, + "grad_norm": 0.5724625636422039, + "learning_rate": 6.710693098227181e-06, + "loss": 0.4979, + "step": 6392 + }, + { + "epoch": 0.81, + "grad_norm": 0.7429685766744689, + "learning_rate": 6.709723732365103e-06, + "loss": 0.5725, + "step": 6393 + }, + { + "epoch": 0.81, + "grad_norm": 0.5828556645578636, + "learning_rate": 6.70875429372555e-06, + "loss": 0.5157, + "step": 6394 + }, + { + "epoch": 0.81, + "grad_norm": 0.6320381571843299, + "learning_rate": 6.707784782349787e-06, + "loss": 0.5201, + "step": 6395 + }, + { + "epoch": 0.81, + "grad_norm": 0.6159010265565266, + "learning_rate": 6.706815198279082e-06, + "loss": 0.5008, + "step": 6396 + }, + { + "epoch": 0.81, + "grad_norm": 0.5737740421180026, + "learning_rate": 6.7058455415547085e-06, + "loss": 0.4834, + "step": 6397 + }, + { + "epoch": 0.82, + "grad_norm": 0.5789861003246338, + "learning_rate": 6.704875812217942e-06, + "loss": 0.472, + "step": 6398 + }, + { + "epoch": 0.82, + "grad_norm": 0.6760633839978232, + "learning_rate": 6.7039060103100605e-06, + "loss": 0.6028, + "step": 6399 + }, + { + "epoch": 0.82, + "grad_norm": 0.7588558279356667, + "learning_rate": 6.702936135872344e-06, + "loss": 0.6653, + "step": 6400 + }, + { + "epoch": 0.82, + "grad_norm": 0.5963435157206481, + "learning_rate": 6.70196618894608e-06, + "loss": 0.5475, + "step": 6401 + }, + { + "epoch": 0.82, + "grad_norm": 0.736899258733208, + "learning_rate": 6.700996169572553e-06, + "loss": 0.5398, + "step": 6402 + }, + { + "epoch": 0.82, + "grad_norm": 0.8047954247923363, + "learning_rate": 6.700026077793052e-06, + "loss": 0.6701, + "step": 6403 + }, + { + "epoch": 0.82, + "grad_norm": 0.7811606989006268, + "learning_rate": 6.699055913648877e-06, + "loss": 0.5097, + "step": 6404 + }, + { + "epoch": 0.82, + "grad_norm": 0.8208408967904837, + "learning_rate": 6.69808567718132e-06, + "loss": 0.6005, + "step": 6405 + }, + { + "epoch": 0.82, + "grad_norm": 0.6951735888945008, + "learning_rate": 6.6971153684316815e-06, + "loss": 0.5977, + "step": 6406 + }, + { + "epoch": 0.82, + "grad_norm": 0.7181001260605885, + "learning_rate": 6.696144987441265e-06, + "loss": 0.5459, + "step": 6407 + }, + { + "epoch": 0.82, + "grad_norm": 1.2468873166070857, + "learning_rate": 6.695174534251377e-06, + "loss": 0.6572, + "step": 6408 + }, + { + "epoch": 0.82, + "grad_norm": 0.7854443515098432, + "learning_rate": 6.694204008903326e-06, + "loss": 0.6118, + "step": 6409 + }, + { + "epoch": 0.82, + "grad_norm": 0.5717858423249883, + "learning_rate": 6.693233411438424e-06, + "loss": 0.4987, + "step": 6410 + }, + { + "epoch": 0.82, + "grad_norm": 0.7099312229948749, + "learning_rate": 6.692262741897988e-06, + "loss": 0.594, + "step": 6411 + }, + { + "epoch": 0.82, + "grad_norm": 0.8691331635780105, + "learning_rate": 6.691292000323333e-06, + "loss": 0.6265, + "step": 6412 + }, + { + "epoch": 0.82, + "grad_norm": 0.6657924821602142, + "learning_rate": 6.690321186755783e-06, + "loss": 0.4977, + "step": 6413 + }, + { + "epoch": 0.82, + "grad_norm": 0.8928952064216477, + "learning_rate": 6.68935030123666e-06, + "loss": 0.5673, + "step": 6414 + }, + { + "epoch": 0.82, + "grad_norm": 0.6431463191041895, + "learning_rate": 6.688379343807294e-06, + "loss": 0.5669, + "step": 6415 + }, + { + "epoch": 0.82, + "grad_norm": 0.891696782672777, + "learning_rate": 6.687408314509015e-06, + "loss": 0.6501, + "step": 6416 + }, + { + "epoch": 0.82, + "grad_norm": 0.7976500124486192, + "learning_rate": 6.6864372133831565e-06, + "loss": 0.5879, + "step": 6417 + }, + { + "epoch": 0.82, + "grad_norm": 0.8207467405287292, + "learning_rate": 6.685466040471055e-06, + "loss": 0.6457, + "step": 6418 + }, + { + "epoch": 0.82, + "grad_norm": 0.8132338002114149, + "learning_rate": 6.6844947958140495e-06, + "loss": 0.5782, + "step": 6419 + }, + { + "epoch": 0.82, + "grad_norm": 0.5865131703071755, + "learning_rate": 6.683523479453482e-06, + "loss": 0.5329, + "step": 6420 + }, + { + "epoch": 0.82, + "grad_norm": 0.7642471230476104, + "learning_rate": 6.682552091430702e-06, + "loss": 0.6505, + "step": 6421 + }, + { + "epoch": 0.82, + "grad_norm": 0.6269724046919594, + "learning_rate": 6.681580631787055e-06, + "loss": 0.5537, + "step": 6422 + }, + { + "epoch": 0.82, + "grad_norm": 0.787866405621647, + "learning_rate": 6.680609100563895e-06, + "loss": 0.6021, + "step": 6423 + }, + { + "epoch": 0.82, + "grad_norm": 0.63042795399231, + "learning_rate": 6.679637497802576e-06, + "loss": 0.573, + "step": 6424 + }, + { + "epoch": 0.82, + "grad_norm": 0.5839167676917506, + "learning_rate": 6.6786658235444545e-06, + "loss": 0.4792, + "step": 6425 + }, + { + "epoch": 0.82, + "grad_norm": 0.8759054067242198, + "learning_rate": 6.677694077830895e-06, + "loss": 0.6558, + "step": 6426 + }, + { + "epoch": 0.82, + "grad_norm": 0.6307238457096034, + "learning_rate": 6.676722260703257e-06, + "loss": 0.5357, + "step": 6427 + }, + { + "epoch": 0.82, + "grad_norm": 0.9370918199493273, + "learning_rate": 6.675750372202914e-06, + "loss": 0.6828, + "step": 6428 + }, + { + "epoch": 0.82, + "grad_norm": 0.8054200309083763, + "learning_rate": 6.674778412371231e-06, + "loss": 0.5958, + "step": 6429 + }, + { + "epoch": 0.82, + "grad_norm": 0.7846811580356037, + "learning_rate": 6.673806381249582e-06, + "loss": 0.6207, + "step": 6430 + }, + { + "epoch": 0.82, + "grad_norm": 2.5581675605256007, + "learning_rate": 6.6728342788793455e-06, + "loss": 0.5634, + "step": 6431 + }, + { + "epoch": 0.82, + "grad_norm": 0.7101407156229939, + "learning_rate": 6.671862105301898e-06, + "loss": 0.5452, + "step": 6432 + }, + { + "epoch": 0.82, + "grad_norm": 0.7614938372698999, + "learning_rate": 6.670889860558623e-06, + "loss": 0.4759, + "step": 6433 + }, + { + "epoch": 0.82, + "grad_norm": 0.8168065779181795, + "learning_rate": 6.669917544690908e-06, + "loss": 0.5904, + "step": 6434 + }, + { + "epoch": 0.82, + "grad_norm": 0.6750045326616718, + "learning_rate": 6.668945157740139e-06, + "loss": 0.4993, + "step": 6435 + }, + { + "epoch": 0.82, + "grad_norm": 0.7930097513448877, + "learning_rate": 6.667972699747707e-06, + "loss": 0.6017, + "step": 6436 + }, + { + "epoch": 0.82, + "grad_norm": 0.5277232096528562, + "learning_rate": 6.667000170755007e-06, + "loss": 0.4836, + "step": 6437 + }, + { + "epoch": 0.82, + "grad_norm": 0.5934540690664346, + "learning_rate": 6.666027570803437e-06, + "loss": 0.544, + "step": 6438 + }, + { + "epoch": 0.82, + "grad_norm": 0.5690663884221651, + "learning_rate": 6.665054899934397e-06, + "loss": 0.5077, + "step": 6439 + }, + { + "epoch": 0.82, + "grad_norm": 0.6735525718196225, + "learning_rate": 6.664082158189291e-06, + "loss": 0.5782, + "step": 6440 + }, + { + "epoch": 0.82, + "grad_norm": 0.7205738057595237, + "learning_rate": 6.663109345609525e-06, + "loss": 0.5601, + "step": 6441 + }, + { + "epoch": 0.82, + "grad_norm": 0.7535096231438502, + "learning_rate": 6.662136462236509e-06, + "loss": 0.6243, + "step": 6442 + }, + { + "epoch": 0.82, + "grad_norm": 0.6220989097344559, + "learning_rate": 6.661163508111655e-06, + "loss": 0.5077, + "step": 6443 + }, + { + "epoch": 0.82, + "grad_norm": 0.7335877244020317, + "learning_rate": 6.66019048327638e-06, + "loss": 0.594, + "step": 6444 + }, + { + "epoch": 0.82, + "grad_norm": 0.5847035984154572, + "learning_rate": 6.659217387772099e-06, + "loss": 0.5017, + "step": 6445 + }, + { + "epoch": 0.82, + "grad_norm": 0.6953747917685108, + "learning_rate": 6.658244221640238e-06, + "loss": 0.5715, + "step": 6446 + }, + { + "epoch": 0.82, + "grad_norm": 0.7203864369924682, + "learning_rate": 6.657270984922217e-06, + "loss": 0.5861, + "step": 6447 + }, + { + "epoch": 0.82, + "grad_norm": 0.5187896783444225, + "learning_rate": 6.6562976776594686e-06, + "loss": 0.5062, + "step": 6448 + }, + { + "epoch": 0.82, + "grad_norm": 0.7099647596179366, + "learning_rate": 6.65532429989342e-06, + "loss": 0.5598, + "step": 6449 + }, + { + "epoch": 0.82, + "grad_norm": 0.7002824694552254, + "learning_rate": 6.654350851665505e-06, + "loss": 0.5226, + "step": 6450 + }, + { + "epoch": 0.82, + "grad_norm": 0.721513877311857, + "learning_rate": 6.65337733301716e-06, + "loss": 0.546, + "step": 6451 + }, + { + "epoch": 0.82, + "grad_norm": 0.5634849603066018, + "learning_rate": 6.652403743989827e-06, + "loss": 0.5039, + "step": 6452 + }, + { + "epoch": 0.82, + "grad_norm": 0.569399846285577, + "learning_rate": 6.651430084624947e-06, + "loss": 0.4602, + "step": 6453 + }, + { + "epoch": 0.82, + "grad_norm": 0.6769632673894282, + "learning_rate": 6.650456354963966e-06, + "loss": 0.5568, + "step": 6454 + }, + { + "epoch": 0.82, + "grad_norm": 0.6914939967947034, + "learning_rate": 6.64948255504833e-06, + "loss": 0.5609, + "step": 6455 + }, + { + "epoch": 0.82, + "grad_norm": 0.8084550059355045, + "learning_rate": 6.648508684919495e-06, + "loss": 0.6309, + "step": 6456 + }, + { + "epoch": 0.82, + "grad_norm": 0.5736143024332176, + "learning_rate": 6.647534744618911e-06, + "loss": 0.4859, + "step": 6457 + }, + { + "epoch": 0.82, + "grad_norm": 0.5459911748431158, + "learning_rate": 6.646560734188039e-06, + "loss": 0.5395, + "step": 6458 + }, + { + "epoch": 0.82, + "grad_norm": 0.647497937146447, + "learning_rate": 6.645586653668337e-06, + "loss": 0.5455, + "step": 6459 + }, + { + "epoch": 0.82, + "grad_norm": 0.7416950392981845, + "learning_rate": 6.644612503101271e-06, + "loss": 0.5421, + "step": 6460 + }, + { + "epoch": 0.82, + "grad_norm": 0.769850560611796, + "learning_rate": 6.643638282528306e-06, + "loss": 0.6333, + "step": 6461 + }, + { + "epoch": 0.82, + "grad_norm": 0.787546334563346, + "learning_rate": 6.642663991990911e-06, + "loss": 0.6094, + "step": 6462 + }, + { + "epoch": 0.82, + "grad_norm": 0.6316002600992692, + "learning_rate": 6.641689631530559e-06, + "loss": 0.5062, + "step": 6463 + }, + { + "epoch": 0.82, + "grad_norm": 0.7174694295250796, + "learning_rate": 6.640715201188727e-06, + "loss": 0.6388, + "step": 6464 + }, + { + "epoch": 0.82, + "grad_norm": 0.7925445165429377, + "learning_rate": 6.639740701006889e-06, + "loss": 0.5913, + "step": 6465 + }, + { + "epoch": 0.82, + "grad_norm": 0.7000180266583456, + "learning_rate": 6.6387661310265315e-06, + "loss": 0.622, + "step": 6466 + }, + { + "epoch": 0.82, + "grad_norm": 0.6286676745558507, + "learning_rate": 6.637791491289136e-06, + "loss": 0.5497, + "step": 6467 + }, + { + "epoch": 0.82, + "grad_norm": 0.764354709144767, + "learning_rate": 6.63681678183619e-06, + "loss": 0.5325, + "step": 6468 + }, + { + "epoch": 0.82, + "grad_norm": 0.8300264652027384, + "learning_rate": 6.635842002709185e-06, + "loss": 0.5746, + "step": 6469 + }, + { + "epoch": 0.82, + "grad_norm": 0.5822342238397752, + "learning_rate": 6.634867153949613e-06, + "loss": 0.5355, + "step": 6470 + }, + { + "epoch": 0.82, + "grad_norm": 0.6681859347938008, + "learning_rate": 6.633892235598971e-06, + "loss": 0.4999, + "step": 6471 + }, + { + "epoch": 0.82, + "grad_norm": 0.6195650881913226, + "learning_rate": 6.6329172476987565e-06, + "loss": 0.521, + "step": 6472 + }, + { + "epoch": 0.82, + "grad_norm": 0.5777079027188079, + "learning_rate": 6.631942190290474e-06, + "loss": 0.5668, + "step": 6473 + }, + { + "epoch": 0.82, + "grad_norm": 0.7482830965645356, + "learning_rate": 6.6309670634156265e-06, + "loss": 0.6085, + "step": 6474 + }, + { + "epoch": 0.82, + "grad_norm": 0.6393661927024398, + "learning_rate": 6.629991867115724e-06, + "loss": 0.5, + "step": 6475 + }, + { + "epoch": 0.83, + "grad_norm": 0.6625920334685923, + "learning_rate": 6.629016601432275e-06, + "loss": 0.4856, + "step": 6476 + }, + { + "epoch": 0.83, + "grad_norm": 0.5970890277792154, + "learning_rate": 6.628041266406795e-06, + "loss": 0.5153, + "step": 6477 + }, + { + "epoch": 0.83, + "grad_norm": 0.7767704226989249, + "learning_rate": 6.6270658620808015e-06, + "loss": 0.6396, + "step": 6478 + }, + { + "epoch": 0.83, + "grad_norm": 0.8129509103335189, + "learning_rate": 6.6260903884958135e-06, + "loss": 0.605, + "step": 6479 + }, + { + "epoch": 0.83, + "grad_norm": 0.7574390987238969, + "learning_rate": 6.625114845693353e-06, + "loss": 0.6333, + "step": 6480 + }, + { + "epoch": 0.83, + "grad_norm": 0.83783835637728, + "learning_rate": 6.624139233714948e-06, + "loss": 0.5886, + "step": 6481 + }, + { + "epoch": 0.83, + "grad_norm": 0.5744249648369125, + "learning_rate": 6.623163552602125e-06, + "loss": 0.4702, + "step": 6482 + }, + { + "epoch": 0.83, + "grad_norm": 0.6065719104766597, + "learning_rate": 6.622187802396416e-06, + "loss": 0.5621, + "step": 6483 + }, + { + "epoch": 0.83, + "grad_norm": 0.6252720927272606, + "learning_rate": 6.6212119831393574e-06, + "loss": 0.526, + "step": 6484 + }, + { + "epoch": 0.83, + "grad_norm": 0.7300166847462839, + "learning_rate": 6.620236094872485e-06, + "loss": 0.5268, + "step": 6485 + }, + { + "epoch": 0.83, + "grad_norm": 0.5193651488662671, + "learning_rate": 6.619260137637339e-06, + "loss": 0.4615, + "step": 6486 + }, + { + "epoch": 0.83, + "grad_norm": 0.6086862260498218, + "learning_rate": 6.618284111475464e-06, + "loss": 0.5297, + "step": 6487 + }, + { + "epoch": 0.83, + "grad_norm": 0.7110119872791875, + "learning_rate": 6.617308016428405e-06, + "loss": 0.614, + "step": 6488 + }, + { + "epoch": 0.83, + "grad_norm": 0.7106997797929842, + "learning_rate": 6.616331852537712e-06, + "loss": 0.5398, + "step": 6489 + }, + { + "epoch": 0.83, + "grad_norm": 0.7885781093659249, + "learning_rate": 6.615355619844937e-06, + "loss": 0.6141, + "step": 6490 + }, + { + "epoch": 0.83, + "grad_norm": 0.5541225600642373, + "learning_rate": 6.614379318391635e-06, + "loss": 0.4978, + "step": 6491 + }, + { + "epoch": 0.83, + "grad_norm": 0.643156282920605, + "learning_rate": 6.613402948219365e-06, + "loss": 0.5638, + "step": 6492 + }, + { + "epoch": 0.83, + "grad_norm": 0.6530901853978821, + "learning_rate": 6.6124265093696874e-06, + "loss": 0.5167, + "step": 6493 + }, + { + "epoch": 0.83, + "grad_norm": 0.7271421660121189, + "learning_rate": 6.6114500018841655e-06, + "loss": 0.6107, + "step": 6494 + }, + { + "epoch": 0.83, + "grad_norm": 1.0035529231353035, + "learning_rate": 6.6104734258043655e-06, + "loss": 0.5976, + "step": 6495 + }, + { + "epoch": 0.83, + "grad_norm": 0.5941332092616888, + "learning_rate": 6.609496781171859e-06, + "loss": 0.505, + "step": 6496 + }, + { + "epoch": 0.83, + "grad_norm": 0.6774864696177553, + "learning_rate": 6.608520068028218e-06, + "loss": 0.5789, + "step": 6497 + }, + { + "epoch": 0.83, + "grad_norm": 0.6889388082929483, + "learning_rate": 6.607543286415017e-06, + "loss": 0.604, + "step": 6498 + }, + { + "epoch": 0.83, + "grad_norm": 1.0277921515906445, + "learning_rate": 6.606566436373836e-06, + "loss": 0.6065, + "step": 6499 + }, + { + "epoch": 0.83, + "grad_norm": 0.6169010073526765, + "learning_rate": 6.605589517946256e-06, + "loss": 0.5183, + "step": 6500 + }, + { + "epoch": 0.83, + "grad_norm": 0.7065624553356541, + "learning_rate": 6.6046125311738595e-06, + "loss": 0.5776, + "step": 6501 + }, + { + "epoch": 0.83, + "grad_norm": 0.8840570022135378, + "learning_rate": 6.603635476098236e-06, + "loss": 0.6505, + "step": 6502 + }, + { + "epoch": 0.83, + "grad_norm": 0.7868794113995556, + "learning_rate": 6.602658352760975e-06, + "loss": 0.6901, + "step": 6503 + }, + { + "epoch": 0.83, + "grad_norm": 0.7150974512226105, + "learning_rate": 6.6016811612036695e-06, + "loss": 0.5406, + "step": 6504 + }, + { + "epoch": 0.83, + "grad_norm": 0.5630857591089643, + "learning_rate": 6.600703901467914e-06, + "loss": 0.5951, + "step": 6505 + }, + { + "epoch": 0.83, + "grad_norm": 0.5373656574791883, + "learning_rate": 6.599726573595309e-06, + "loss": 0.4648, + "step": 6506 + }, + { + "epoch": 0.83, + "grad_norm": 0.8166571002152593, + "learning_rate": 6.598749177627456e-06, + "loss": 0.6198, + "step": 6507 + }, + { + "epoch": 0.83, + "grad_norm": 0.6807433614817046, + "learning_rate": 6.59777171360596e-06, + "loss": 0.6431, + "step": 6508 + }, + { + "epoch": 0.83, + "grad_norm": 0.7913926797743724, + "learning_rate": 6.596794181572428e-06, + "loss": 0.5567, + "step": 6509 + }, + { + "epoch": 0.83, + "grad_norm": 0.6324205471993662, + "learning_rate": 6.5958165815684696e-06, + "loss": 0.5484, + "step": 6510 + }, + { + "epoch": 0.83, + "grad_norm": 0.6253357026129646, + "learning_rate": 6.594838913635698e-06, + "loss": 0.5301, + "step": 6511 + }, + { + "epoch": 0.83, + "grad_norm": 0.5970319795024827, + "learning_rate": 6.5938611778157315e-06, + "loss": 0.533, + "step": 6512 + }, + { + "epoch": 0.83, + "grad_norm": 0.5828355647198787, + "learning_rate": 6.5928833741501875e-06, + "loss": 0.4857, + "step": 6513 + }, + { + "epoch": 0.83, + "grad_norm": 0.6678940477959378, + "learning_rate": 6.591905502680688e-06, + "loss": 0.5708, + "step": 6514 + }, + { + "epoch": 0.83, + "grad_norm": 0.63703960912389, + "learning_rate": 6.590927563448859e-06, + "loss": 0.519, + "step": 6515 + }, + { + "epoch": 0.83, + "grad_norm": 0.7370632149374999, + "learning_rate": 6.5899495564963266e-06, + "loss": 0.6372, + "step": 6516 + }, + { + "epoch": 0.83, + "grad_norm": 0.5803613925385694, + "learning_rate": 6.588971481864723e-06, + "loss": 0.5234, + "step": 6517 + }, + { + "epoch": 0.83, + "grad_norm": 0.6550923523037336, + "learning_rate": 6.587993339595682e-06, + "loss": 0.5217, + "step": 6518 + }, + { + "epoch": 0.83, + "grad_norm": 0.7389184340573136, + "learning_rate": 6.587015129730839e-06, + "loss": 0.5622, + "step": 6519 + }, + { + "epoch": 0.83, + "grad_norm": 0.8541712774022205, + "learning_rate": 6.5860368523118305e-06, + "loss": 0.6901, + "step": 6520 + }, + { + "epoch": 0.83, + "grad_norm": 0.6462953914893367, + "learning_rate": 6.585058507380303e-06, + "loss": 0.5339, + "step": 6521 + }, + { + "epoch": 0.83, + "grad_norm": 0.7844240914151441, + "learning_rate": 6.584080094977901e-06, + "loss": 0.6381, + "step": 6522 + }, + { + "epoch": 0.83, + "grad_norm": 0.8928868013268039, + "learning_rate": 6.58310161514627e-06, + "loss": 0.6651, + "step": 6523 + }, + { + "epoch": 0.83, + "grad_norm": 0.8344502485279831, + "learning_rate": 6.582123067927062e-06, + "loss": 0.6399, + "step": 6524 + }, + { + "epoch": 0.83, + "grad_norm": 0.6902543562148888, + "learning_rate": 6.581144453361932e-06, + "loss": 0.5723, + "step": 6525 + }, + { + "epoch": 0.83, + "grad_norm": 0.7606838378903613, + "learning_rate": 6.580165771492535e-06, + "loss": 0.6172, + "step": 6526 + }, + { + "epoch": 0.83, + "grad_norm": 0.7680349264800025, + "learning_rate": 6.57918702236053e-06, + "loss": 0.5181, + "step": 6527 + }, + { + "epoch": 0.83, + "grad_norm": 0.681643533752028, + "learning_rate": 6.5782082060075796e-06, + "loss": 0.5406, + "step": 6528 + }, + { + "epoch": 0.83, + "grad_norm": 0.6649957214325597, + "learning_rate": 6.5772293224753496e-06, + "loss": 0.5044, + "step": 6529 + }, + { + "epoch": 0.83, + "grad_norm": 0.66792564259355, + "learning_rate": 6.576250371805507e-06, + "loss": 0.5156, + "step": 6530 + }, + { + "epoch": 0.83, + "grad_norm": 0.6299622261759319, + "learning_rate": 6.575271354039724e-06, + "loss": 0.5279, + "step": 6531 + }, + { + "epoch": 0.83, + "grad_norm": 0.881512906590141, + "learning_rate": 6.5742922692196734e-06, + "loss": 0.6224, + "step": 6532 + }, + { + "epoch": 0.83, + "grad_norm": 0.6564445611536593, + "learning_rate": 6.57331311738703e-06, + "loss": 0.5309, + "step": 6533 + }, + { + "epoch": 0.83, + "grad_norm": 0.8131860687793555, + "learning_rate": 6.572333898583476e-06, + "loss": 0.6158, + "step": 6534 + }, + { + "epoch": 0.83, + "grad_norm": 0.6407124194578098, + "learning_rate": 6.571354612850693e-06, + "loss": 0.5341, + "step": 6535 + }, + { + "epoch": 0.83, + "grad_norm": 0.7328902208857833, + "learning_rate": 6.570375260230364e-06, + "loss": 0.5881, + "step": 6536 + }, + { + "epoch": 0.83, + "grad_norm": 0.8464478431660932, + "learning_rate": 6.56939584076418e-06, + "loss": 0.6299, + "step": 6537 + }, + { + "epoch": 0.83, + "grad_norm": 0.7191489096232528, + "learning_rate": 6.568416354493827e-06, + "loss": 0.5755, + "step": 6538 + }, + { + "epoch": 0.83, + "grad_norm": 0.7365817050537748, + "learning_rate": 6.5674368014610044e-06, + "loss": 0.5944, + "step": 6539 + }, + { + "epoch": 0.83, + "grad_norm": 1.0455182794406508, + "learning_rate": 6.5664571817074065e-06, + "loss": 0.6279, + "step": 6540 + }, + { + "epoch": 0.83, + "grad_norm": 0.6900667703590811, + "learning_rate": 6.56547749527473e-06, + "loss": 0.5508, + "step": 6541 + }, + { + "epoch": 0.83, + "grad_norm": 0.7366802134564446, + "learning_rate": 6.5644977422046805e-06, + "loss": 0.6097, + "step": 6542 + }, + { + "epoch": 0.83, + "grad_norm": 0.5854236989657675, + "learning_rate": 6.5635179225389615e-06, + "loss": 0.52, + "step": 6543 + }, + { + "epoch": 0.83, + "grad_norm": 0.7334019559981301, + "learning_rate": 6.5625380363192795e-06, + "loss": 0.5177, + "step": 6544 + }, + { + "epoch": 0.83, + "grad_norm": 0.8178026265013155, + "learning_rate": 6.5615580835873476e-06, + "loss": 0.6622, + "step": 6545 + }, + { + "epoch": 0.83, + "grad_norm": 1.082534676157764, + "learning_rate": 6.560578064384879e-06, + "loss": 0.5514, + "step": 6546 + }, + { + "epoch": 0.83, + "grad_norm": 0.6947063486131437, + "learning_rate": 6.5595979787535865e-06, + "loss": 0.6005, + "step": 6547 + }, + { + "epoch": 0.83, + "grad_norm": 0.716733835205423, + "learning_rate": 6.558617826735194e-06, + "loss": 0.612, + "step": 6548 + }, + { + "epoch": 0.83, + "grad_norm": 0.7176508683791621, + "learning_rate": 6.557637608371421e-06, + "loss": 0.5973, + "step": 6549 + }, + { + "epoch": 0.83, + "grad_norm": 0.6078637995408926, + "learning_rate": 6.5566573237039925e-06, + "loss": 0.5557, + "step": 6550 + }, + { + "epoch": 0.83, + "grad_norm": 0.7333544028476878, + "learning_rate": 6.555676972774634e-06, + "loss": 0.6464, + "step": 6551 + }, + { + "epoch": 0.83, + "grad_norm": 0.562588182862289, + "learning_rate": 6.554696555625079e-06, + "loss": 0.4827, + "step": 6552 + }, + { + "epoch": 0.83, + "grad_norm": 0.8022970561682957, + "learning_rate": 6.553716072297061e-06, + "loss": 0.6269, + "step": 6553 + }, + { + "epoch": 0.83, + "grad_norm": 0.7294224063774781, + "learning_rate": 6.552735522832314e-06, + "loss": 0.6535, + "step": 6554 + }, + { + "epoch": 0.84, + "grad_norm": 0.607388750049785, + "learning_rate": 6.551754907272578e-06, + "loss": 0.5564, + "step": 6555 + }, + { + "epoch": 0.84, + "grad_norm": 0.6561026084748848, + "learning_rate": 6.550774225659594e-06, + "loss": 0.517, + "step": 6556 + }, + { + "epoch": 0.84, + "grad_norm": 0.5607195291470843, + "learning_rate": 6.549793478035107e-06, + "loss": 0.4973, + "step": 6557 + }, + { + "epoch": 0.84, + "grad_norm": 0.7643485170954416, + "learning_rate": 6.548812664440864e-06, + "loss": 0.6615, + "step": 6558 + }, + { + "epoch": 0.84, + "grad_norm": 0.5919573705467266, + "learning_rate": 6.547831784918614e-06, + "loss": 0.4944, + "step": 6559 + }, + { + "epoch": 0.84, + "grad_norm": 0.5682581819493193, + "learning_rate": 6.546850839510113e-06, + "loss": 0.5503, + "step": 6560 + }, + { + "epoch": 0.84, + "grad_norm": 0.6050789187504977, + "learning_rate": 6.545869828257114e-06, + "loss": 0.5004, + "step": 6561 + }, + { + "epoch": 0.84, + "grad_norm": 0.7396563529524641, + "learning_rate": 6.5448887512013755e-06, + "loss": 0.6026, + "step": 6562 + }, + { + "epoch": 0.84, + "grad_norm": 0.7393027655715211, + "learning_rate": 6.5439076083846595e-06, + "loss": 0.6695, + "step": 6563 + }, + { + "epoch": 0.84, + "grad_norm": 0.6957350790728904, + "learning_rate": 6.542926399848732e-06, + "loss": 0.5995, + "step": 6564 + }, + { + "epoch": 0.84, + "grad_norm": 0.5852414204167266, + "learning_rate": 6.541945125635357e-06, + "loss": 0.5134, + "step": 6565 + }, + { + "epoch": 0.84, + "grad_norm": 0.8122329245922367, + "learning_rate": 6.5409637857863066e-06, + "loss": 0.6121, + "step": 6566 + }, + { + "epoch": 0.84, + "grad_norm": 0.6456744614721865, + "learning_rate": 6.539982380343351e-06, + "loss": 0.5703, + "step": 6567 + }, + { + "epoch": 0.84, + "grad_norm": 0.6256095112322341, + "learning_rate": 6.539000909348266e-06, + "loss": 0.4991, + "step": 6568 + }, + { + "epoch": 0.84, + "grad_norm": 0.661420890309588, + "learning_rate": 6.5380193728428294e-06, + "loss": 0.4821, + "step": 6569 + }, + { + "epoch": 0.84, + "grad_norm": 0.6184074991915045, + "learning_rate": 6.537037770868825e-06, + "loss": 0.6062, + "step": 6570 + }, + { + "epoch": 0.84, + "grad_norm": 0.6914185402351574, + "learning_rate": 6.536056103468034e-06, + "loss": 0.5737, + "step": 6571 + }, + { + "epoch": 0.84, + "grad_norm": 0.7803554062738187, + "learning_rate": 6.535074370682243e-06, + "loss": 0.6205, + "step": 6572 + }, + { + "epoch": 0.84, + "grad_norm": 0.7003952693039684, + "learning_rate": 6.534092572553241e-06, + "loss": 0.5838, + "step": 6573 + }, + { + "epoch": 0.84, + "grad_norm": 0.6554043794474748, + "learning_rate": 6.533110709122821e-06, + "loss": 0.5549, + "step": 6574 + }, + { + "epoch": 0.84, + "grad_norm": 0.757956701525781, + "learning_rate": 6.532128780432776e-06, + "loss": 0.5991, + "step": 6575 + }, + { + "epoch": 0.84, + "grad_norm": 0.621540951032334, + "learning_rate": 6.5311467865249064e-06, + "loss": 0.5298, + "step": 6576 + }, + { + "epoch": 0.84, + "grad_norm": 0.6822587431034667, + "learning_rate": 6.5301647274410105e-06, + "loss": 0.5544, + "step": 6577 + }, + { + "epoch": 0.84, + "grad_norm": 1.0023514807622687, + "learning_rate": 6.529182603222892e-06, + "loss": 0.5842, + "step": 6578 + }, + { + "epoch": 0.84, + "grad_norm": 0.5573906384691116, + "learning_rate": 6.528200413912357e-06, + "loss": 0.4875, + "step": 6579 + }, + { + "epoch": 0.84, + "grad_norm": 0.7442181351488076, + "learning_rate": 6.527218159551213e-06, + "loss": 0.6373, + "step": 6580 + }, + { + "epoch": 0.84, + "grad_norm": 0.711695797430948, + "learning_rate": 6.526235840181272e-06, + "loss": 0.5052, + "step": 6581 + }, + { + "epoch": 0.84, + "grad_norm": 0.6432289557796902, + "learning_rate": 6.525253455844348e-06, + "loss": 0.5628, + "step": 6582 + }, + { + "epoch": 0.84, + "grad_norm": 0.6137086632156515, + "learning_rate": 6.524271006582259e-06, + "loss": 0.5217, + "step": 6583 + }, + { + "epoch": 0.84, + "grad_norm": 0.5704356839424467, + "learning_rate": 6.523288492436826e-06, + "loss": 0.5548, + "step": 6584 + }, + { + "epoch": 0.84, + "grad_norm": 0.6205227296711358, + "learning_rate": 6.522305913449867e-06, + "loss": 0.5726, + "step": 6585 + }, + { + "epoch": 0.84, + "grad_norm": 0.7307209198799085, + "learning_rate": 6.521323269663211e-06, + "loss": 0.5787, + "step": 6586 + }, + { + "epoch": 0.84, + "grad_norm": 0.5419852748657038, + "learning_rate": 6.5203405611186855e-06, + "loss": 0.4894, + "step": 6587 + }, + { + "epoch": 0.84, + "grad_norm": 0.7233683517376814, + "learning_rate": 6.519357787858119e-06, + "loss": 0.6103, + "step": 6588 + }, + { + "epoch": 0.84, + "grad_norm": 0.6430593217668235, + "learning_rate": 6.518374949923347e-06, + "loss": 0.5225, + "step": 6589 + }, + { + "epoch": 0.84, + "grad_norm": 0.7916466069694474, + "learning_rate": 6.517392047356205e-06, + "loss": 0.5987, + "step": 6590 + }, + { + "epoch": 0.84, + "grad_norm": 1.2594116491866636, + "learning_rate": 6.516409080198535e-06, + "loss": 0.6445, + "step": 6591 + }, + { + "epoch": 0.84, + "grad_norm": 0.5840893942174273, + "learning_rate": 6.5154260484921735e-06, + "loss": 0.5139, + "step": 6592 + }, + { + "epoch": 0.84, + "grad_norm": 0.7183332341164766, + "learning_rate": 6.51444295227897e-06, + "loss": 0.5961, + "step": 6593 + }, + { + "epoch": 0.84, + "grad_norm": 0.8634584307887804, + "learning_rate": 6.513459791600768e-06, + "loss": 0.6501, + "step": 6594 + }, + { + "epoch": 0.84, + "grad_norm": 0.7755051665441314, + "learning_rate": 6.51247656649942e-06, + "loss": 0.6365, + "step": 6595 + }, + { + "epoch": 0.84, + "grad_norm": 1.19429483664414, + "learning_rate": 6.511493277016776e-06, + "loss": 0.574, + "step": 6596 + }, + { + "epoch": 0.84, + "grad_norm": 0.6991586381722998, + "learning_rate": 6.510509923194694e-06, + "loss": 0.56, + "step": 6597 + }, + { + "epoch": 0.84, + "grad_norm": 0.5917032495669147, + "learning_rate": 6.509526505075032e-06, + "loss": 0.5403, + "step": 6598 + }, + { + "epoch": 0.84, + "grad_norm": 0.6509379503007197, + "learning_rate": 6.508543022699652e-06, + "loss": 0.6081, + "step": 6599 + }, + { + "epoch": 0.84, + "grad_norm": 0.6189794787068471, + "learning_rate": 6.5075594761104126e-06, + "loss": 0.5519, + "step": 6600 + }, + { + "epoch": 0.84, + "grad_norm": 0.6791622848975046, + "learning_rate": 6.506575865349185e-06, + "loss": 0.5481, + "step": 6601 + }, + { + "epoch": 0.84, + "grad_norm": 0.7531039358858942, + "learning_rate": 6.505592190457838e-06, + "loss": 0.6141, + "step": 6602 + }, + { + "epoch": 0.84, + "grad_norm": 0.8505550260254032, + "learning_rate": 6.504608451478242e-06, + "loss": 0.5806, + "step": 6603 + }, + { + "epoch": 0.84, + "grad_norm": 1.0209036518929284, + "learning_rate": 6.503624648452273e-06, + "loss": 0.6671, + "step": 6604 + }, + { + "epoch": 0.84, + "grad_norm": 0.6238119148373528, + "learning_rate": 6.502640781421807e-06, + "loss": 0.5509, + "step": 6605 + }, + { + "epoch": 0.84, + "grad_norm": 1.0555480266865047, + "learning_rate": 6.501656850428724e-06, + "loss": 0.6019, + "step": 6606 + }, + { + "epoch": 0.84, + "grad_norm": 0.5895111872159797, + "learning_rate": 6.500672855514909e-06, + "loss": 0.5206, + "step": 6607 + }, + { + "epoch": 0.84, + "grad_norm": 0.7822501077831937, + "learning_rate": 6.499688796722244e-06, + "loss": 0.6117, + "step": 6608 + }, + { + "epoch": 0.84, + "grad_norm": 0.6030800136079699, + "learning_rate": 6.498704674092622e-06, + "loss": 0.4907, + "step": 6609 + }, + { + "epoch": 0.84, + "grad_norm": 0.7311892330290485, + "learning_rate": 6.497720487667929e-06, + "loss": 0.6285, + "step": 6610 + }, + { + "epoch": 0.84, + "grad_norm": 0.6095372510185418, + "learning_rate": 6.496736237490062e-06, + "loss": 0.5183, + "step": 6611 + }, + { + "epoch": 0.84, + "grad_norm": 0.6074076574843538, + "learning_rate": 6.4957519236009156e-06, + "loss": 0.5039, + "step": 6612 + }, + { + "epoch": 0.84, + "grad_norm": 0.7935921019013585, + "learning_rate": 6.494767546042389e-06, + "loss": 0.5032, + "step": 6613 + }, + { + "epoch": 0.84, + "grad_norm": 0.7714659687510147, + "learning_rate": 6.493783104856386e-06, + "loss": 0.6196, + "step": 6614 + }, + { + "epoch": 0.84, + "grad_norm": 0.5924889971866503, + "learning_rate": 6.492798600084809e-06, + "loss": 0.5081, + "step": 6615 + }, + { + "epoch": 0.84, + "grad_norm": 0.5687662927402892, + "learning_rate": 6.491814031769566e-06, + "loss": 0.4873, + "step": 6616 + }, + { + "epoch": 0.84, + "grad_norm": 0.7601024218974984, + "learning_rate": 6.4908293999525675e-06, + "loss": 0.6131, + "step": 6617 + }, + { + "epoch": 0.84, + "grad_norm": 1.1927784118332903, + "learning_rate": 6.489844704675724e-06, + "loss": 0.6214, + "step": 6618 + }, + { + "epoch": 0.84, + "grad_norm": 0.7791536561609305, + "learning_rate": 6.488859945980952e-06, + "loss": 0.6187, + "step": 6619 + }, + { + "epoch": 0.84, + "grad_norm": 0.6684148761859198, + "learning_rate": 6.4878751239101715e-06, + "loss": 0.5644, + "step": 6620 + }, + { + "epoch": 0.84, + "grad_norm": 0.7172590504768079, + "learning_rate": 6.486890238505301e-06, + "loss": 0.5982, + "step": 6621 + }, + { + "epoch": 0.84, + "grad_norm": 0.6198513943762066, + "learning_rate": 6.485905289808264e-06, + "loss": 0.5062, + "step": 6622 + }, + { + "epoch": 0.84, + "grad_norm": 0.6007609126335464, + "learning_rate": 6.484920277860988e-06, + "loss": 0.5225, + "step": 6623 + }, + { + "epoch": 0.84, + "grad_norm": 0.6454183493838301, + "learning_rate": 6.4839352027054e-06, + "loss": 0.519, + "step": 6624 + }, + { + "epoch": 0.84, + "grad_norm": 0.7218154042427212, + "learning_rate": 6.482950064383432e-06, + "loss": 0.5352, + "step": 6625 + }, + { + "epoch": 0.84, + "grad_norm": 0.7128891104835277, + "learning_rate": 6.481964862937019e-06, + "loss": 0.5917, + "step": 6626 + }, + { + "epoch": 0.84, + "grad_norm": 0.6481267742263872, + "learning_rate": 6.480979598408098e-06, + "loss": 0.4986, + "step": 6627 + }, + { + "epoch": 0.84, + "grad_norm": 0.576561963314315, + "learning_rate": 6.479994270838608e-06, + "loss": 0.5241, + "step": 6628 + }, + { + "epoch": 0.84, + "grad_norm": 0.7723381342276889, + "learning_rate": 6.479008880270491e-06, + "loss": 0.6232, + "step": 6629 + }, + { + "epoch": 0.84, + "grad_norm": 0.5712802892892588, + "learning_rate": 6.478023426745692e-06, + "loss": 0.5356, + "step": 6630 + }, + { + "epoch": 0.84, + "grad_norm": 0.6218614701393628, + "learning_rate": 6.477037910306159e-06, + "loss": 0.4964, + "step": 6631 + }, + { + "epoch": 0.84, + "grad_norm": 0.7774841211352943, + "learning_rate": 6.476052330993842e-06, + "loss": 0.6049, + "step": 6632 + }, + { + "epoch": 0.85, + "grad_norm": 0.5631581229202345, + "learning_rate": 6.475066688850694e-06, + "loss": 0.4696, + "step": 6633 + }, + { + "epoch": 0.85, + "grad_norm": 0.5174789809951127, + "learning_rate": 6.474080983918671e-06, + "loss": 0.4532, + "step": 6634 + }, + { + "epoch": 0.85, + "grad_norm": 0.6606388601648999, + "learning_rate": 6.473095216239731e-06, + "loss": 0.5051, + "step": 6635 + }, + { + "epoch": 0.85, + "grad_norm": 0.8533707933848041, + "learning_rate": 6.472109385855836e-06, + "loss": 0.6202, + "step": 6636 + }, + { + "epoch": 0.85, + "grad_norm": 0.6558808481903375, + "learning_rate": 6.471123492808946e-06, + "loss": 0.4936, + "step": 6637 + }, + { + "epoch": 0.85, + "grad_norm": 0.7955505713043879, + "learning_rate": 6.470137537141031e-06, + "loss": 0.6064, + "step": 6638 + }, + { + "epoch": 0.85, + "grad_norm": 0.7829917928962898, + "learning_rate": 6.46915151889406e-06, + "loss": 0.6635, + "step": 6639 + }, + { + "epoch": 0.85, + "grad_norm": 0.770157181848176, + "learning_rate": 6.468165438110004e-06, + "loss": 0.5681, + "step": 6640 + }, + { + "epoch": 0.85, + "grad_norm": 0.6250488830721793, + "learning_rate": 6.467179294830835e-06, + "loss": 0.5073, + "step": 6641 + }, + { + "epoch": 0.85, + "grad_norm": 0.6868334415337548, + "learning_rate": 6.4661930890985335e-06, + "loss": 0.5617, + "step": 6642 + }, + { + "epoch": 0.85, + "grad_norm": 0.7920069716883784, + "learning_rate": 6.465206820955076e-06, + "loss": 0.6303, + "step": 6643 + }, + { + "epoch": 0.85, + "grad_norm": 0.715335768436534, + "learning_rate": 6.464220490442446e-06, + "loss": 0.6082, + "step": 6644 + }, + { + "epoch": 0.85, + "grad_norm": 0.7336124810956771, + "learning_rate": 6.463234097602631e-06, + "loss": 0.4735, + "step": 6645 + }, + { + "epoch": 0.85, + "grad_norm": 0.7202239309172377, + "learning_rate": 6.462247642477615e-06, + "loss": 0.545, + "step": 6646 + }, + { + "epoch": 0.85, + "grad_norm": 0.5941892673870489, + "learning_rate": 6.461261125109389e-06, + "loss": 0.5382, + "step": 6647 + }, + { + "epoch": 0.85, + "grad_norm": 0.5826192815089393, + "learning_rate": 6.460274545539947e-06, + "loss": 0.5037, + "step": 6648 + }, + { + "epoch": 0.85, + "grad_norm": 0.8123260487299397, + "learning_rate": 6.459287903811285e-06, + "loss": 0.5952, + "step": 6649 + }, + { + "epoch": 0.85, + "grad_norm": 0.9380444776394964, + "learning_rate": 6.458301199965398e-06, + "loss": 0.6765, + "step": 6650 + }, + { + "epoch": 0.85, + "grad_norm": 0.5444336617343107, + "learning_rate": 6.457314434044289e-06, + "loss": 0.5142, + "step": 6651 + }, + { + "epoch": 0.85, + "grad_norm": 1.0043128034882705, + "learning_rate": 6.456327606089962e-06, + "loss": 0.6496, + "step": 6652 + }, + { + "epoch": 0.85, + "grad_norm": 0.8263048961454296, + "learning_rate": 6.455340716144425e-06, + "loss": 0.6103, + "step": 6653 + }, + { + "epoch": 0.85, + "grad_norm": 0.5912204655334908, + "learning_rate": 6.4543537642496816e-06, + "loss": 0.5506, + "step": 6654 + }, + { + "epoch": 0.85, + "grad_norm": 0.5919250552538146, + "learning_rate": 6.453366750447747e-06, + "loss": 0.545, + "step": 6655 + }, + { + "epoch": 0.85, + "grad_norm": 0.5435661626258214, + "learning_rate": 6.452379674780633e-06, + "loss": 0.5013, + "step": 6656 + }, + { + "epoch": 0.85, + "grad_norm": 0.7510925673391442, + "learning_rate": 6.45139253729036e-06, + "loss": 0.5604, + "step": 6657 + }, + { + "epoch": 0.85, + "grad_norm": 0.8422280437779414, + "learning_rate": 6.450405338018943e-06, + "loss": 0.689, + "step": 6658 + }, + { + "epoch": 0.85, + "grad_norm": 0.7140162987163611, + "learning_rate": 6.449418077008405e-06, + "loss": 0.6538, + "step": 6659 + }, + { + "epoch": 0.85, + "grad_norm": 0.5467812951779568, + "learning_rate": 6.448430754300772e-06, + "loss": 0.4883, + "step": 6660 + }, + { + "epoch": 0.85, + "grad_norm": 0.714976472578544, + "learning_rate": 6.44744336993807e-06, + "loss": 0.5659, + "step": 6661 + }, + { + "epoch": 0.85, + "grad_norm": 0.8578209927853342, + "learning_rate": 6.446455923962328e-06, + "loss": 0.5807, + "step": 6662 + }, + { + "epoch": 0.85, + "grad_norm": 0.7644429897205445, + "learning_rate": 6.44546841641558e-06, + "loss": 0.5787, + "step": 6663 + }, + { + "epoch": 0.85, + "grad_norm": 0.6509814584879079, + "learning_rate": 6.444480847339862e-06, + "loss": 0.5612, + "step": 6664 + }, + { + "epoch": 0.85, + "grad_norm": 0.599574322432731, + "learning_rate": 6.443493216777209e-06, + "loss": 0.539, + "step": 6665 + }, + { + "epoch": 0.85, + "grad_norm": 0.6536085707771628, + "learning_rate": 6.442505524769663e-06, + "loss": 0.548, + "step": 6666 + }, + { + "epoch": 0.85, + "grad_norm": 0.8134178203412349, + "learning_rate": 6.441517771359267e-06, + "loss": 0.6095, + "step": 6667 + }, + { + "epoch": 0.85, + "grad_norm": 0.6252589088357704, + "learning_rate": 6.440529956588064e-06, + "loss": 0.4954, + "step": 6668 + }, + { + "epoch": 0.85, + "grad_norm": 0.5902260662712403, + "learning_rate": 6.439542080498105e-06, + "loss": 0.5406, + "step": 6669 + }, + { + "epoch": 0.85, + "grad_norm": 0.660871271968508, + "learning_rate": 6.43855414313144e-06, + "loss": 0.5505, + "step": 6670 + }, + { + "epoch": 0.85, + "grad_norm": 0.6412905588827149, + "learning_rate": 6.437566144530123e-06, + "loss": 0.5753, + "step": 6671 + }, + { + "epoch": 0.85, + "grad_norm": 0.504343457256404, + "learning_rate": 6.436578084736207e-06, + "loss": 0.4828, + "step": 6672 + }, + { + "epoch": 0.85, + "grad_norm": 0.6024850680333286, + "learning_rate": 6.435589963791754e-06, + "loss": 0.5062, + "step": 6673 + }, + { + "epoch": 0.85, + "grad_norm": 0.7674430865032238, + "learning_rate": 6.434601781738824e-06, + "loss": 0.5023, + "step": 6674 + }, + { + "epoch": 0.85, + "grad_norm": 0.8091307097736599, + "learning_rate": 6.433613538619479e-06, + "loss": 0.5952, + "step": 6675 + }, + { + "epoch": 0.85, + "grad_norm": 0.7935332083526491, + "learning_rate": 6.432625234475788e-06, + "loss": 0.6256, + "step": 6676 + }, + { + "epoch": 0.85, + "grad_norm": 0.8335983934485832, + "learning_rate": 6.431636869349818e-06, + "loss": 0.6417, + "step": 6677 + }, + { + "epoch": 0.85, + "grad_norm": 0.7133436863802591, + "learning_rate": 6.430648443283642e-06, + "loss": 0.6089, + "step": 6678 + }, + { + "epoch": 0.85, + "grad_norm": 0.7133959633483786, + "learning_rate": 6.4296599563193344e-06, + "loss": 0.6138, + "step": 6679 + }, + { + "epoch": 0.85, + "grad_norm": 0.825994999486831, + "learning_rate": 6.428671408498969e-06, + "loss": 0.5563, + "step": 6680 + }, + { + "epoch": 0.85, + "grad_norm": 0.6029731726032074, + "learning_rate": 6.427682799864628e-06, + "loss": 0.5331, + "step": 6681 + }, + { + "epoch": 0.85, + "grad_norm": 0.6042192317360487, + "learning_rate": 6.426694130458392e-06, + "loss": 0.5252, + "step": 6682 + }, + { + "epoch": 0.85, + "grad_norm": 0.6103765821083451, + "learning_rate": 6.425705400322346e-06, + "loss": 0.5342, + "step": 6683 + }, + { + "epoch": 0.85, + "grad_norm": 0.712333074987951, + "learning_rate": 6.424716609498578e-06, + "loss": 0.6003, + "step": 6684 + }, + { + "epoch": 0.85, + "grad_norm": 0.7858545673161307, + "learning_rate": 6.423727758029175e-06, + "loss": 0.5751, + "step": 6685 + }, + { + "epoch": 0.85, + "grad_norm": 0.6271358504217927, + "learning_rate": 6.422738845956232e-06, + "loss": 0.5103, + "step": 6686 + }, + { + "epoch": 0.85, + "grad_norm": 0.6542777104160439, + "learning_rate": 6.421749873321843e-06, + "loss": 0.5235, + "step": 6687 + }, + { + "epoch": 0.85, + "grad_norm": 0.8814752299022276, + "learning_rate": 6.4207608401681045e-06, + "loss": 0.5681, + "step": 6688 + }, + { + "epoch": 0.85, + "grad_norm": 0.6455099468820157, + "learning_rate": 6.419771746537118e-06, + "loss": 0.5141, + "step": 6689 + }, + { + "epoch": 0.85, + "grad_norm": 0.7389888470521515, + "learning_rate": 6.418782592470985e-06, + "loss": 0.584, + "step": 6690 + }, + { + "epoch": 0.85, + "grad_norm": 0.6119270179492299, + "learning_rate": 6.4177933780118094e-06, + "loss": 0.5489, + "step": 6691 + }, + { + "epoch": 0.85, + "grad_norm": 0.8741890718063151, + "learning_rate": 6.4168041032017026e-06, + "loss": 0.6322, + "step": 6692 + }, + { + "epoch": 0.85, + "grad_norm": 0.6098622082197017, + "learning_rate": 6.41581476808277e-06, + "loss": 0.5259, + "step": 6693 + }, + { + "epoch": 0.85, + "grad_norm": 0.780727278145902, + "learning_rate": 6.4148253726971286e-06, + "loss": 0.618, + "step": 6694 + }, + { + "epoch": 0.85, + "grad_norm": 0.6646163861176623, + "learning_rate": 6.413835917086892e-06, + "loss": 0.5742, + "step": 6695 + }, + { + "epoch": 0.85, + "grad_norm": 0.7168848602542783, + "learning_rate": 6.4128464012941795e-06, + "loss": 0.5396, + "step": 6696 + }, + { + "epoch": 0.85, + "grad_norm": 0.5705818002988751, + "learning_rate": 6.411856825361109e-06, + "loss": 0.5351, + "step": 6697 + }, + { + "epoch": 0.85, + "grad_norm": 0.7337958905809268, + "learning_rate": 6.410867189329806e-06, + "loss": 0.5732, + "step": 6698 + }, + { + "epoch": 0.85, + "grad_norm": 1.1460587253276808, + "learning_rate": 6.4098774932423935e-06, + "loss": 0.5923, + "step": 6699 + }, + { + "epoch": 0.85, + "grad_norm": 0.7969143690271436, + "learning_rate": 6.408887737141003e-06, + "loss": 0.598, + "step": 6700 + }, + { + "epoch": 0.85, + "grad_norm": 0.6012642235502653, + "learning_rate": 6.407897921067763e-06, + "loss": 0.5133, + "step": 6701 + }, + { + "epoch": 0.85, + "grad_norm": 0.6241253434314572, + "learning_rate": 6.406908045064808e-06, + "loss": 0.5352, + "step": 6702 + }, + { + "epoch": 0.85, + "grad_norm": 0.603500412991251, + "learning_rate": 6.405918109174274e-06, + "loss": 0.5635, + "step": 6703 + }, + { + "epoch": 0.85, + "grad_norm": 0.7130743433358312, + "learning_rate": 6.404928113438298e-06, + "loss": 0.6062, + "step": 6704 + }, + { + "epoch": 0.85, + "grad_norm": 0.8390679425955562, + "learning_rate": 6.403938057899021e-06, + "loss": 0.5896, + "step": 6705 + }, + { + "epoch": 0.85, + "grad_norm": 0.6003620047611676, + "learning_rate": 6.402947942598588e-06, + "loss": 0.4483, + "step": 6706 + }, + { + "epoch": 0.85, + "grad_norm": 0.7809477775062855, + "learning_rate": 6.401957767579143e-06, + "loss": 0.6145, + "step": 6707 + }, + { + "epoch": 0.85, + "grad_norm": 0.7214247044322019, + "learning_rate": 6.400967532882838e-06, + "loss": 0.5197, + "step": 6708 + }, + { + "epoch": 0.85, + "grad_norm": 1.0330520167490873, + "learning_rate": 6.399977238551821e-06, + "loss": 0.6049, + "step": 6709 + }, + { + "epoch": 0.85, + "grad_norm": 0.538280732392651, + "learning_rate": 6.398986884628246e-06, + "loss": 0.5185, + "step": 6710 + }, + { + "epoch": 0.85, + "grad_norm": 0.7801000993264211, + "learning_rate": 6.397996471154272e-06, + "loss": 0.6316, + "step": 6711 + }, + { + "epoch": 0.86, + "grad_norm": 0.6546423021000509, + "learning_rate": 6.3970059981720526e-06, + "loss": 0.5182, + "step": 6712 + }, + { + "epoch": 0.86, + "grad_norm": 0.8467742519251735, + "learning_rate": 6.396015465723754e-06, + "loss": 0.6319, + "step": 6713 + }, + { + "epoch": 0.86, + "grad_norm": 0.6799915195617798, + "learning_rate": 6.395024873851537e-06, + "loss": 0.5505, + "step": 6714 + }, + { + "epoch": 0.86, + "grad_norm": 0.7187111517934552, + "learning_rate": 6.394034222597571e-06, + "loss": 0.5635, + "step": 6715 + }, + { + "epoch": 0.86, + "grad_norm": 0.6112867117847852, + "learning_rate": 6.393043512004022e-06, + "loss": 0.5009, + "step": 6716 + }, + { + "epoch": 0.86, + "grad_norm": 0.7582173584268032, + "learning_rate": 6.39205274211306e-06, + "loss": 0.6385, + "step": 6717 + }, + { + "epoch": 0.86, + "grad_norm": 0.6696693004498872, + "learning_rate": 6.391061912966864e-06, + "loss": 0.5543, + "step": 6718 + }, + { + "epoch": 0.86, + "grad_norm": 0.6591884404022977, + "learning_rate": 6.3900710246076055e-06, + "loss": 0.5493, + "step": 6719 + }, + { + "epoch": 0.86, + "grad_norm": 0.7622445422933825, + "learning_rate": 6.389080077077467e-06, + "loss": 0.6071, + "step": 6720 + }, + { + "epoch": 0.86, + "grad_norm": 0.7749665447037803, + "learning_rate": 6.3880890704186285e-06, + "loss": 0.6113, + "step": 6721 + }, + { + "epoch": 0.86, + "grad_norm": 0.5555647932447542, + "learning_rate": 6.387098004673274e-06, + "loss": 0.4722, + "step": 6722 + }, + { + "epoch": 0.86, + "grad_norm": 0.591486054805839, + "learning_rate": 6.386106879883589e-06, + "loss": 0.5137, + "step": 6723 + }, + { + "epoch": 0.86, + "grad_norm": 0.6986612083317825, + "learning_rate": 6.385115696091763e-06, + "loss": 0.5689, + "step": 6724 + }, + { + "epoch": 0.86, + "grad_norm": 0.6413008031795211, + "learning_rate": 6.384124453339988e-06, + "loss": 0.5151, + "step": 6725 + }, + { + "epoch": 0.86, + "grad_norm": 0.6807008739889637, + "learning_rate": 6.38313315167046e-06, + "loss": 0.5474, + "step": 6726 + }, + { + "epoch": 0.86, + "grad_norm": 0.7190364538883304, + "learning_rate": 6.382141791125371e-06, + "loss": 0.6143, + "step": 6727 + }, + { + "epoch": 0.86, + "grad_norm": 0.7138245656461645, + "learning_rate": 6.381150371746925e-06, + "loss": 0.5144, + "step": 6728 + }, + { + "epoch": 0.86, + "grad_norm": 0.6731068278629442, + "learning_rate": 6.380158893577321e-06, + "loss": 0.539, + "step": 6729 + }, + { + "epoch": 0.86, + "grad_norm": 0.6334266464840334, + "learning_rate": 6.379167356658761e-06, + "loss": 0.4887, + "step": 6730 + }, + { + "epoch": 0.86, + "grad_norm": 0.6136014123301498, + "learning_rate": 6.378175761033456e-06, + "loss": 0.5422, + "step": 6731 + }, + { + "epoch": 0.86, + "grad_norm": 0.5998200679599752, + "learning_rate": 6.377184106743612e-06, + "loss": 0.5262, + "step": 6732 + }, + { + "epoch": 0.86, + "grad_norm": 0.5659747295383475, + "learning_rate": 6.3761923938314415e-06, + "loss": 0.5157, + "step": 6733 + }, + { + "epoch": 0.86, + "grad_norm": 0.5649108934638767, + "learning_rate": 6.375200622339159e-06, + "loss": 0.4787, + "step": 6734 + }, + { + "epoch": 0.86, + "grad_norm": 0.7574165968262838, + "learning_rate": 6.37420879230898e-06, + "loss": 0.5744, + "step": 6735 + }, + { + "epoch": 0.86, + "grad_norm": 1.4388942882369706, + "learning_rate": 6.3732169037831246e-06, + "loss": 0.6236, + "step": 6736 + }, + { + "epoch": 0.86, + "grad_norm": 0.9795215249888689, + "learning_rate": 6.372224956803812e-06, + "loss": 0.622, + "step": 6737 + }, + { + "epoch": 0.86, + "grad_norm": 0.5670350333469152, + "learning_rate": 6.37123295141327e-06, + "loss": 0.5263, + "step": 6738 + }, + { + "epoch": 0.86, + "grad_norm": 0.6845583822661014, + "learning_rate": 6.3702408876537224e-06, + "loss": 0.5792, + "step": 6739 + }, + { + "epoch": 0.86, + "grad_norm": 0.6381552008092994, + "learning_rate": 6.3692487655674e-06, + "loss": 0.5252, + "step": 6740 + }, + { + "epoch": 0.86, + "grad_norm": 0.5986972026279694, + "learning_rate": 6.368256585196532e-06, + "loss": 0.552, + "step": 6741 + }, + { + "epoch": 0.86, + "grad_norm": 0.7205224659588964, + "learning_rate": 6.3672643465833525e-06, + "loss": 0.6018, + "step": 6742 + }, + { + "epoch": 0.86, + "grad_norm": 0.8062345365126261, + "learning_rate": 6.3662720497700996e-06, + "loss": 0.5429, + "step": 6743 + }, + { + "epoch": 0.86, + "grad_norm": 0.6414281985888026, + "learning_rate": 6.365279694799012e-06, + "loss": 0.5147, + "step": 6744 + }, + { + "epoch": 0.86, + "grad_norm": 0.7556485437413981, + "learning_rate": 6.364287281712331e-06, + "loss": 0.5875, + "step": 6745 + }, + { + "epoch": 0.86, + "grad_norm": 0.5826631493802104, + "learning_rate": 6.3632948105522995e-06, + "loss": 0.4515, + "step": 6746 + }, + { + "epoch": 0.86, + "grad_norm": 0.7531103371860258, + "learning_rate": 6.362302281361165e-06, + "loss": 0.6538, + "step": 6747 + }, + { + "epoch": 0.86, + "grad_norm": 0.7466964161900267, + "learning_rate": 6.361309694181175e-06, + "loss": 0.6341, + "step": 6748 + }, + { + "epoch": 0.86, + "grad_norm": 0.5391393020601636, + "learning_rate": 6.3603170490545805e-06, + "loss": 0.5266, + "step": 6749 + }, + { + "epoch": 0.86, + "grad_norm": 0.7314106997727251, + "learning_rate": 6.3593243460236376e-06, + "loss": 0.5822, + "step": 6750 + }, + { + "epoch": 0.86, + "grad_norm": 0.7067591723768756, + "learning_rate": 6.3583315851306e-06, + "loss": 0.571, + "step": 6751 + }, + { + "epoch": 0.86, + "grad_norm": 0.5537794363673365, + "learning_rate": 6.357338766417729e-06, + "loss": 0.5056, + "step": 6752 + }, + { + "epoch": 0.86, + "grad_norm": 0.6287934355885714, + "learning_rate": 6.356345889927284e-06, + "loss": 0.5547, + "step": 6753 + }, + { + "epoch": 0.86, + "grad_norm": 0.7113761522520146, + "learning_rate": 6.355352955701528e-06, + "loss": 0.6214, + "step": 6754 + }, + { + "epoch": 0.86, + "grad_norm": 0.9750561780835462, + "learning_rate": 6.354359963782726e-06, + "loss": 0.619, + "step": 6755 + }, + { + "epoch": 0.86, + "grad_norm": 0.8919735503043947, + "learning_rate": 6.353366914213151e-06, + "loss": 0.6053, + "step": 6756 + }, + { + "epoch": 0.86, + "grad_norm": 0.709234512999577, + "learning_rate": 6.3523738070350705e-06, + "loss": 0.6305, + "step": 6757 + }, + { + "epoch": 0.86, + "grad_norm": 0.8205006112398691, + "learning_rate": 6.351380642290757e-06, + "loss": 0.6353, + "step": 6758 + }, + { + "epoch": 0.86, + "grad_norm": 0.5574928899055271, + "learning_rate": 6.35038742002249e-06, + "loss": 0.4891, + "step": 6759 + }, + { + "epoch": 0.86, + "grad_norm": 0.5304699646174911, + "learning_rate": 6.349394140272546e-06, + "loss": 0.5112, + "step": 6760 + }, + { + "epoch": 0.86, + "grad_norm": 0.7501973588898401, + "learning_rate": 6.348400803083204e-06, + "loss": 0.5445, + "step": 6761 + }, + { + "epoch": 0.86, + "grad_norm": 0.9388017352297809, + "learning_rate": 6.347407408496749e-06, + "loss": 0.5875, + "step": 6762 + }, + { + "epoch": 0.86, + "grad_norm": 0.7173405857868247, + "learning_rate": 6.346413956555466e-06, + "loss": 0.5842, + "step": 6763 + }, + { + "epoch": 0.86, + "grad_norm": 0.8187707723663016, + "learning_rate": 6.345420447301644e-06, + "loss": 0.6533, + "step": 6764 + }, + { + "epoch": 0.86, + "grad_norm": 0.7417909680826302, + "learning_rate": 6.3444268807775736e-06, + "loss": 0.5896, + "step": 6765 + }, + { + "epoch": 0.86, + "grad_norm": 0.7642172112967541, + "learning_rate": 6.3434332570255465e-06, + "loss": 0.6128, + "step": 6766 + }, + { + "epoch": 0.86, + "grad_norm": 0.6114815433787782, + "learning_rate": 6.342439576087858e-06, + "loss": 0.5052, + "step": 6767 + }, + { + "epoch": 0.86, + "grad_norm": 1.0236714350210854, + "learning_rate": 6.341445838006806e-06, + "loss": 0.5985, + "step": 6768 + }, + { + "epoch": 0.86, + "grad_norm": 0.6744256826388978, + "learning_rate": 6.340452042824693e-06, + "loss": 0.4624, + "step": 6769 + }, + { + "epoch": 0.86, + "grad_norm": 0.7222292963664174, + "learning_rate": 6.339458190583819e-06, + "loss": 0.5547, + "step": 6770 + }, + { + "epoch": 0.86, + "grad_norm": 0.7537838566144314, + "learning_rate": 6.3384642813264904e-06, + "loss": 0.6251, + "step": 6771 + }, + { + "epoch": 0.86, + "grad_norm": 0.5984447339557778, + "learning_rate": 6.337470315095016e-06, + "loss": 0.5484, + "step": 6772 + }, + { + "epoch": 0.86, + "grad_norm": 0.5093405123693988, + "learning_rate": 6.336476291931702e-06, + "loss": 0.4885, + "step": 6773 + }, + { + "epoch": 0.86, + "grad_norm": 0.6065780791658569, + "learning_rate": 6.3354822118788624e-06, + "loss": 0.5449, + "step": 6774 + }, + { + "epoch": 0.86, + "grad_norm": 0.7501777419214004, + "learning_rate": 6.334488074978815e-06, + "loss": 0.5172, + "step": 6775 + }, + { + "epoch": 0.86, + "grad_norm": 0.7495723344250118, + "learning_rate": 6.3334938812738734e-06, + "loss": 0.5974, + "step": 6776 + }, + { + "epoch": 0.86, + "grad_norm": 0.7150863817269848, + "learning_rate": 6.332499630806359e-06, + "loss": 0.5837, + "step": 6777 + }, + { + "epoch": 0.86, + "grad_norm": 0.7851310025120265, + "learning_rate": 6.3315053236185935e-06, + "loss": 0.6754, + "step": 6778 + }, + { + "epoch": 0.86, + "grad_norm": 0.7222751470063057, + "learning_rate": 6.330510959752902e-06, + "loss": 0.6121, + "step": 6779 + }, + { + "epoch": 0.86, + "grad_norm": 0.9211548415234667, + "learning_rate": 6.329516539251609e-06, + "loss": 0.5665, + "step": 6780 + }, + { + "epoch": 0.86, + "grad_norm": 0.5627713639444607, + "learning_rate": 6.328522062157045e-06, + "loss": 0.5454, + "step": 6781 + }, + { + "epoch": 0.86, + "grad_norm": 0.8162151870739535, + "learning_rate": 6.327527528511544e-06, + "loss": 0.586, + "step": 6782 + }, + { + "epoch": 0.86, + "grad_norm": 0.6706600262678507, + "learning_rate": 6.326532938357438e-06, + "loss": 0.554, + "step": 6783 + }, + { + "epoch": 0.86, + "grad_norm": 0.7132656147409749, + "learning_rate": 6.3255382917370634e-06, + "loss": 0.6419, + "step": 6784 + }, + { + "epoch": 0.86, + "grad_norm": 0.5587804142270424, + "learning_rate": 6.32454358869276e-06, + "loss": 0.5254, + "step": 6785 + }, + { + "epoch": 0.86, + "grad_norm": 0.5622653100116567, + "learning_rate": 6.323548829266867e-06, + "loss": 0.5008, + "step": 6786 + }, + { + "epoch": 0.86, + "grad_norm": 0.638376379275469, + "learning_rate": 6.322554013501731e-06, + "loss": 0.5835, + "step": 6787 + }, + { + "epoch": 0.86, + "grad_norm": 0.6609193734697818, + "learning_rate": 6.321559141439697e-06, + "loss": 0.5525, + "step": 6788 + }, + { + "epoch": 0.86, + "grad_norm": 0.6013839173079497, + "learning_rate": 6.320564213123113e-06, + "loss": 0.563, + "step": 6789 + }, + { + "epoch": 0.87, + "grad_norm": 0.6008060519934829, + "learning_rate": 6.319569228594331e-06, + "loss": 0.4826, + "step": 6790 + }, + { + "epoch": 0.87, + "grad_norm": 0.6721232126266182, + "learning_rate": 6.318574187895703e-06, + "loss": 0.5577, + "step": 6791 + }, + { + "epoch": 0.87, + "grad_norm": 0.532686461543334, + "learning_rate": 6.3175790910695845e-06, + "loss": 0.5214, + "step": 6792 + }, + { + "epoch": 0.87, + "grad_norm": 0.5937242019250427, + "learning_rate": 6.316583938158336e-06, + "loss": 0.4935, + "step": 6793 + }, + { + "epoch": 0.87, + "grad_norm": 0.65146126396118, + "learning_rate": 6.3155887292043164e-06, + "loss": 0.495, + "step": 6794 + }, + { + "epoch": 0.87, + "grad_norm": 0.7826559495731986, + "learning_rate": 6.314593464249889e-06, + "loss": 0.5865, + "step": 6795 + }, + { + "epoch": 0.87, + "grad_norm": 0.6054753434688831, + "learning_rate": 6.313598143337417e-06, + "loss": 0.5177, + "step": 6796 + }, + { + "epoch": 0.87, + "grad_norm": 0.7828603925191665, + "learning_rate": 6.312602766509271e-06, + "loss": 0.5594, + "step": 6797 + }, + { + "epoch": 0.87, + "grad_norm": 0.5976344886995849, + "learning_rate": 6.31160733380782e-06, + "loss": 0.5399, + "step": 6798 + }, + { + "epoch": 0.87, + "grad_norm": 0.7608514391926767, + "learning_rate": 6.310611845275434e-06, + "loss": 0.5393, + "step": 6799 + }, + { + "epoch": 0.87, + "grad_norm": 0.7107898596213578, + "learning_rate": 6.309616300954492e-06, + "loss": 0.6108, + "step": 6800 + }, + { + "epoch": 0.87, + "grad_norm": 0.608412541856247, + "learning_rate": 6.308620700887368e-06, + "loss": 0.5478, + "step": 6801 + }, + { + "epoch": 0.87, + "grad_norm": 0.6038700595244411, + "learning_rate": 6.307625045116443e-06, + "loss": 0.5509, + "step": 6802 + }, + { + "epoch": 0.87, + "grad_norm": 1.543431507757396, + "learning_rate": 6.306629333684099e-06, + "loss": 0.6488, + "step": 6803 + }, + { + "epoch": 0.87, + "grad_norm": 0.6243108845268378, + "learning_rate": 6.305633566632719e-06, + "loss": 0.5401, + "step": 6804 + }, + { + "epoch": 0.87, + "grad_norm": 0.8794502147054255, + "learning_rate": 6.3046377440046895e-06, + "loss": 0.6461, + "step": 6805 + }, + { + "epoch": 0.87, + "grad_norm": 0.5867167302657583, + "learning_rate": 6.3036418658424e-06, + "loss": 0.5354, + "step": 6806 + }, + { + "epoch": 0.87, + "grad_norm": 0.6031382569390703, + "learning_rate": 6.3026459321882435e-06, + "loss": 0.5559, + "step": 6807 + }, + { + "epoch": 0.87, + "grad_norm": 0.5626801660294306, + "learning_rate": 6.301649943084612e-06, + "loss": 0.5391, + "step": 6808 + }, + { + "epoch": 0.87, + "grad_norm": 0.7008523751937906, + "learning_rate": 6.300653898573903e-06, + "loss": 0.5719, + "step": 6809 + }, + { + "epoch": 0.87, + "grad_norm": 0.6202315961834426, + "learning_rate": 6.299657798698512e-06, + "loss": 0.5435, + "step": 6810 + }, + { + "epoch": 0.87, + "grad_norm": 0.7578252059528312, + "learning_rate": 6.2986616435008415e-06, + "loss": 0.6158, + "step": 6811 + }, + { + "epoch": 0.87, + "grad_norm": 0.6815939664744214, + "learning_rate": 6.297665433023295e-06, + "loss": 0.554, + "step": 6812 + }, + { + "epoch": 0.87, + "grad_norm": 0.7057340399865087, + "learning_rate": 6.296669167308279e-06, + "loss": 0.6013, + "step": 6813 + }, + { + "epoch": 0.87, + "grad_norm": 0.6153740877570159, + "learning_rate": 6.2956728463982e-06, + "loss": 0.4992, + "step": 6814 + }, + { + "epoch": 0.87, + "grad_norm": 0.5921791749086263, + "learning_rate": 6.294676470335468e-06, + "loss": 0.4803, + "step": 6815 + }, + { + "epoch": 0.87, + "grad_norm": 0.7411634893068696, + "learning_rate": 6.293680039162495e-06, + "loss": 0.6038, + "step": 6816 + }, + { + "epoch": 0.87, + "grad_norm": 0.6545407829797351, + "learning_rate": 6.292683552921697e-06, + "loss": 0.5949, + "step": 6817 + }, + { + "epoch": 0.87, + "grad_norm": 0.7276433511793893, + "learning_rate": 6.291687011655491e-06, + "loss": 0.5784, + "step": 6818 + }, + { + "epoch": 0.87, + "grad_norm": 0.7790355606637095, + "learning_rate": 6.290690415406297e-06, + "loss": 0.6273, + "step": 6819 + }, + { + "epoch": 0.87, + "grad_norm": 0.8857327667942153, + "learning_rate": 6.2896937642165354e-06, + "loss": 0.5735, + "step": 6820 + }, + { + "epoch": 0.87, + "grad_norm": 0.6715998134174386, + "learning_rate": 6.2886970581286335e-06, + "loss": 0.5292, + "step": 6821 + }, + { + "epoch": 0.87, + "grad_norm": 0.7733025018650902, + "learning_rate": 6.287700297185015e-06, + "loss": 0.6041, + "step": 6822 + }, + { + "epoch": 0.87, + "grad_norm": 0.7173777103567918, + "learning_rate": 6.286703481428109e-06, + "loss": 0.6402, + "step": 6823 + }, + { + "epoch": 0.87, + "grad_norm": 0.642865696458957, + "learning_rate": 6.2857066109003484e-06, + "loss": 0.5225, + "step": 6824 + }, + { + "epoch": 0.87, + "grad_norm": 0.7270170677998824, + "learning_rate": 6.284709685644166e-06, + "loss": 0.5884, + "step": 6825 + }, + { + "epoch": 0.87, + "grad_norm": 0.5501905168697819, + "learning_rate": 6.283712705701997e-06, + "loss": 0.5299, + "step": 6826 + }, + { + "epoch": 0.87, + "grad_norm": 0.5984007750484809, + "learning_rate": 6.2827156711162805e-06, + "loss": 0.5407, + "step": 6827 + }, + { + "epoch": 0.87, + "grad_norm": 0.6576386282479347, + "learning_rate": 6.281718581929457e-06, + "loss": 0.5038, + "step": 6828 + }, + { + "epoch": 0.87, + "grad_norm": 0.6924926115000757, + "learning_rate": 6.280721438183969e-06, + "loss": 0.5721, + "step": 6829 + }, + { + "epoch": 0.87, + "grad_norm": 0.7452618188694167, + "learning_rate": 6.279724239922261e-06, + "loss": 0.5892, + "step": 6830 + }, + { + "epoch": 0.87, + "grad_norm": 0.671576464309829, + "learning_rate": 6.278726987186783e-06, + "loss": 0.5177, + "step": 6831 + }, + { + "epoch": 0.87, + "grad_norm": 0.7949311868949315, + "learning_rate": 6.277729680019984e-06, + "loss": 0.5861, + "step": 6832 + }, + { + "epoch": 0.87, + "grad_norm": 0.6807662791089962, + "learning_rate": 6.276732318464314e-06, + "loss": 0.6374, + "step": 6833 + }, + { + "epoch": 0.87, + "grad_norm": 0.6565947205416856, + "learning_rate": 6.27573490256223e-06, + "loss": 0.559, + "step": 6834 + }, + { + "epoch": 0.87, + "grad_norm": 0.6133906144028313, + "learning_rate": 6.274737432356187e-06, + "loss": 0.5161, + "step": 6835 + }, + { + "epoch": 0.87, + "grad_norm": 0.5959387865678492, + "learning_rate": 6.273739907888645e-06, + "loss": 0.5299, + "step": 6836 + }, + { + "epoch": 0.87, + "grad_norm": 0.6739268664204052, + "learning_rate": 6.272742329202066e-06, + "loss": 0.561, + "step": 6837 + }, + { + "epoch": 0.87, + "grad_norm": 0.9652026442339495, + "learning_rate": 6.271744696338913e-06, + "loss": 0.6284, + "step": 6838 + }, + { + "epoch": 0.87, + "grad_norm": 0.6663780218655141, + "learning_rate": 6.270747009341652e-06, + "loss": 0.5483, + "step": 6839 + }, + { + "epoch": 0.87, + "grad_norm": 0.6507891057327294, + "learning_rate": 6.269749268252753e-06, + "loss": 0.5399, + "step": 6840 + }, + { + "epoch": 0.87, + "grad_norm": 0.6395465662584445, + "learning_rate": 6.268751473114684e-06, + "loss": 0.4978, + "step": 6841 + }, + { + "epoch": 0.87, + "grad_norm": 0.7601071188929123, + "learning_rate": 6.267753623969919e-06, + "loss": 0.5551, + "step": 6842 + }, + { + "epoch": 0.87, + "grad_norm": 0.8001905641435049, + "learning_rate": 6.266755720860933e-06, + "loss": 0.5876, + "step": 6843 + }, + { + "epoch": 0.87, + "grad_norm": 0.7579055164094853, + "learning_rate": 6.2657577638302045e-06, + "loss": 0.5557, + "step": 6844 + }, + { + "epoch": 0.87, + "grad_norm": 0.7746683846586531, + "learning_rate": 6.2647597529202135e-06, + "loss": 0.6439, + "step": 6845 + }, + { + "epoch": 0.87, + "grad_norm": 0.9527593327937378, + "learning_rate": 6.263761688173441e-06, + "loss": 0.6052, + "step": 6846 + }, + { + "epoch": 0.87, + "grad_norm": 0.5798480463925557, + "learning_rate": 6.262763569632371e-06, + "loss": 0.5342, + "step": 6847 + }, + { + "epoch": 0.87, + "grad_norm": 0.7416176034640507, + "learning_rate": 6.261765397339491e-06, + "loss": 0.5626, + "step": 6848 + }, + { + "epoch": 0.87, + "grad_norm": 0.6264930743913123, + "learning_rate": 6.26076717133729e-06, + "loss": 0.5739, + "step": 6849 + }, + { + "epoch": 0.87, + "grad_norm": 0.8851478382258816, + "learning_rate": 6.259768891668261e-06, + "loss": 0.6162, + "step": 6850 + }, + { + "epoch": 0.87, + "grad_norm": 0.8280457744640813, + "learning_rate": 6.2587705583748945e-06, + "loss": 0.5486, + "step": 6851 + }, + { + "epoch": 0.87, + "grad_norm": 0.6002522558786202, + "learning_rate": 6.257772171499687e-06, + "loss": 0.5568, + "step": 6852 + }, + { + "epoch": 0.87, + "grad_norm": 0.9150153674525723, + "learning_rate": 6.256773731085139e-06, + "loss": 0.6725, + "step": 6853 + }, + { + "epoch": 0.87, + "grad_norm": 0.7827022016153788, + "learning_rate": 6.255775237173749e-06, + "loss": 0.6007, + "step": 6854 + }, + { + "epoch": 0.87, + "grad_norm": 0.5676984860950057, + "learning_rate": 6.25477668980802e-06, + "loss": 0.5371, + "step": 6855 + }, + { + "epoch": 0.87, + "grad_norm": 0.6547096461774186, + "learning_rate": 6.2537780890304565e-06, + "loss": 0.5605, + "step": 6856 + }, + { + "epoch": 0.87, + "grad_norm": 0.623009421679491, + "learning_rate": 6.2527794348835666e-06, + "loss": 0.4892, + "step": 6857 + }, + { + "epoch": 0.87, + "grad_norm": 0.543909343147493, + "learning_rate": 6.251780727409861e-06, + "loss": 0.512, + "step": 6858 + }, + { + "epoch": 0.87, + "grad_norm": 0.6814961859316504, + "learning_rate": 6.250781966651847e-06, + "loss": 0.525, + "step": 6859 + }, + { + "epoch": 0.87, + "grad_norm": 0.7807760097938817, + "learning_rate": 6.249783152652045e-06, + "loss": 0.5737, + "step": 6860 + }, + { + "epoch": 0.87, + "grad_norm": 0.7379681688085484, + "learning_rate": 6.248784285452964e-06, + "loss": 0.5696, + "step": 6861 + }, + { + "epoch": 0.87, + "grad_norm": 0.691373783756208, + "learning_rate": 6.247785365097129e-06, + "loss": 0.5181, + "step": 6862 + }, + { + "epoch": 0.87, + "grad_norm": 0.6137904787440552, + "learning_rate": 6.246786391627058e-06, + "loss": 0.5563, + "step": 6863 + }, + { + "epoch": 0.87, + "grad_norm": 0.5588059699715074, + "learning_rate": 6.245787365085274e-06, + "loss": 0.5149, + "step": 6864 + }, + { + "epoch": 0.87, + "grad_norm": 0.7195083567222281, + "learning_rate": 6.244788285514304e-06, + "loss": 0.5248, + "step": 6865 + }, + { + "epoch": 0.87, + "grad_norm": 0.6688699464853312, + "learning_rate": 6.2437891529566745e-06, + "loss": 0.6034, + "step": 6866 + }, + { + "epoch": 0.87, + "grad_norm": 0.6131091843852191, + "learning_rate": 6.242789967454913e-06, + "loss": 0.5273, + "step": 6867 + }, + { + "epoch": 0.87, + "grad_norm": 0.5423820291017064, + "learning_rate": 6.241790729051555e-06, + "loss": 0.4823, + "step": 6868 + }, + { + "epoch": 0.88, + "grad_norm": 0.542137685061483, + "learning_rate": 6.2407914377891355e-06, + "loss": 0.5467, + "step": 6869 + }, + { + "epoch": 0.88, + "grad_norm": 0.7002896528417133, + "learning_rate": 6.239792093710189e-06, + "loss": 0.6, + "step": 6870 + }, + { + "epoch": 0.88, + "grad_norm": 0.8505814826815032, + "learning_rate": 6.238792696857253e-06, + "loss": 0.6131, + "step": 6871 + }, + { + "epoch": 0.88, + "grad_norm": 0.5473859067486834, + "learning_rate": 6.237793247272872e-06, + "loss": 0.4759, + "step": 6872 + }, + { + "epoch": 0.88, + "grad_norm": 0.899652362697501, + "learning_rate": 6.236793744999587e-06, + "loss": 0.6738, + "step": 6873 + }, + { + "epoch": 0.88, + "grad_norm": 0.8467807264115054, + "learning_rate": 6.2357941900799455e-06, + "loss": 0.6142, + "step": 6874 + }, + { + "epoch": 0.88, + "grad_norm": 0.6985128921024746, + "learning_rate": 6.234794582556494e-06, + "loss": 0.5553, + "step": 6875 + }, + { + "epoch": 0.88, + "grad_norm": 0.7619096079649466, + "learning_rate": 6.233794922471783e-06, + "loss": 0.612, + "step": 6876 + }, + { + "epoch": 0.88, + "grad_norm": 0.6663012159165116, + "learning_rate": 6.232795209868365e-06, + "loss": 0.522, + "step": 6877 + }, + { + "epoch": 0.88, + "grad_norm": 0.7446578268848328, + "learning_rate": 6.231795444788794e-06, + "loss": 0.6283, + "step": 6878 + }, + { + "epoch": 0.88, + "grad_norm": 2.5450036146527193, + "learning_rate": 6.230795627275626e-06, + "loss": 0.5677, + "step": 6879 + }, + { + "epoch": 0.88, + "grad_norm": 0.758724870171053, + "learning_rate": 6.229795757371423e-06, + "loss": 0.5633, + "step": 6880 + }, + { + "epoch": 0.88, + "grad_norm": 0.8175139850721489, + "learning_rate": 6.228795835118744e-06, + "loss": 0.6056, + "step": 6881 + }, + { + "epoch": 0.88, + "grad_norm": 0.6643489122575102, + "learning_rate": 6.227795860560153e-06, + "loss": 0.5483, + "step": 6882 + }, + { + "epoch": 0.88, + "grad_norm": 0.7445691382149575, + "learning_rate": 6.226795833738216e-06, + "loss": 0.6273, + "step": 6883 + }, + { + "epoch": 0.88, + "grad_norm": 0.5875772040990337, + "learning_rate": 6.225795754695501e-06, + "loss": 0.5116, + "step": 6884 + }, + { + "epoch": 0.88, + "grad_norm": 0.9325925337166909, + "learning_rate": 6.224795623474576e-06, + "loss": 0.6087, + "step": 6885 + }, + { + "epoch": 0.88, + "grad_norm": 0.5917189443119621, + "learning_rate": 6.223795440118015e-06, + "loss": 0.5172, + "step": 6886 + }, + { + "epoch": 0.88, + "grad_norm": 0.6571296009952919, + "learning_rate": 6.2227952046683945e-06, + "loss": 0.5163, + "step": 6887 + }, + { + "epoch": 0.88, + "grad_norm": 0.8671543582304182, + "learning_rate": 6.22179491716829e-06, + "loss": 0.637, + "step": 6888 + }, + { + "epoch": 0.88, + "grad_norm": 0.599192086806491, + "learning_rate": 6.22079457766028e-06, + "loss": 0.5182, + "step": 6889 + }, + { + "epoch": 0.88, + "grad_norm": 0.7168495563588103, + "learning_rate": 6.219794186186945e-06, + "loss": 0.6026, + "step": 6890 + }, + { + "epoch": 0.88, + "grad_norm": 0.6995747763004064, + "learning_rate": 6.21879374279087e-06, + "loss": 0.5996, + "step": 6891 + }, + { + "epoch": 0.88, + "grad_norm": 0.6169746328907965, + "learning_rate": 6.217793247514638e-06, + "loss": 0.513, + "step": 6892 + }, + { + "epoch": 0.88, + "grad_norm": 1.2195289350244276, + "learning_rate": 6.216792700400841e-06, + "loss": 0.6221, + "step": 6893 + }, + { + "epoch": 0.88, + "grad_norm": 0.749395846257989, + "learning_rate": 6.215792101492068e-06, + "loss": 0.6589, + "step": 6894 + }, + { + "epoch": 0.88, + "grad_norm": 0.6770192456971336, + "learning_rate": 6.214791450830908e-06, + "loss": 0.5235, + "step": 6895 + }, + { + "epoch": 0.88, + "grad_norm": 0.5865526980333283, + "learning_rate": 6.2137907484599605e-06, + "loss": 0.5403, + "step": 6896 + }, + { + "epoch": 0.88, + "grad_norm": 0.8193177300012351, + "learning_rate": 6.212789994421817e-06, + "loss": 0.6289, + "step": 6897 + }, + { + "epoch": 0.88, + "grad_norm": 0.9135725602561945, + "learning_rate": 6.21178918875908e-06, + "loss": 0.5734, + "step": 6898 + }, + { + "epoch": 0.88, + "grad_norm": 1.0049178283070277, + "learning_rate": 6.210788331514349e-06, + "loss": 0.6387, + "step": 6899 + }, + { + "epoch": 0.88, + "grad_norm": 0.8394995515526554, + "learning_rate": 6.209787422730229e-06, + "loss": 0.6422, + "step": 6900 + }, + { + "epoch": 0.88, + "grad_norm": 0.7368126956076453, + "learning_rate": 6.208786462449323e-06, + "loss": 0.6024, + "step": 6901 + }, + { + "epoch": 0.88, + "grad_norm": 0.8384003347264457, + "learning_rate": 6.207785450714242e-06, + "loss": 0.6018, + "step": 6902 + }, + { + "epoch": 0.88, + "grad_norm": 0.7074314150809423, + "learning_rate": 6.206784387567592e-06, + "loss": 0.5318, + "step": 6903 + }, + { + "epoch": 0.88, + "grad_norm": 0.5627536859957167, + "learning_rate": 6.205783273051988e-06, + "loss": 0.5226, + "step": 6904 + }, + { + "epoch": 0.88, + "grad_norm": 0.6223252024920127, + "learning_rate": 6.204782107210044e-06, + "loss": 0.5519, + "step": 6905 + }, + { + "epoch": 0.88, + "grad_norm": 0.6013096013428314, + "learning_rate": 6.203780890084374e-06, + "loss": 0.4471, + "step": 6906 + }, + { + "epoch": 0.88, + "grad_norm": 0.6376797279349884, + "learning_rate": 6.2027796217175985e-06, + "loss": 0.5521, + "step": 6907 + }, + { + "epoch": 0.88, + "grad_norm": 0.9504777812926263, + "learning_rate": 6.20177830215234e-06, + "loss": 0.6301, + "step": 6908 + }, + { + "epoch": 0.88, + "grad_norm": 0.957302861659713, + "learning_rate": 6.200776931431216e-06, + "loss": 0.6086, + "step": 6909 + }, + { + "epoch": 0.88, + "grad_norm": 0.9441578880772518, + "learning_rate": 6.199775509596857e-06, + "loss": 0.655, + "step": 6910 + }, + { + "epoch": 0.88, + "grad_norm": 0.7527162940865, + "learning_rate": 6.198774036691888e-06, + "loss": 0.5941, + "step": 6911 + }, + { + "epoch": 0.88, + "grad_norm": 0.5929544365399939, + "learning_rate": 6.197772512758939e-06, + "loss": 0.512, + "step": 6912 + }, + { + "epoch": 0.88, + "grad_norm": 0.5774386893457697, + "learning_rate": 6.1967709378406425e-06, + "loss": 0.5229, + "step": 6913 + }, + { + "epoch": 0.88, + "grad_norm": 0.7774892969808319, + "learning_rate": 6.195769311979631e-06, + "loss": 0.6513, + "step": 6914 + }, + { + "epoch": 0.88, + "grad_norm": 0.8135022353073249, + "learning_rate": 6.194767635218541e-06, + "loss": 0.6416, + "step": 6915 + }, + { + "epoch": 0.88, + "grad_norm": 0.8896958118115137, + "learning_rate": 6.193765907600011e-06, + "loss": 0.5814, + "step": 6916 + }, + { + "epoch": 0.88, + "grad_norm": 0.628869665013241, + "learning_rate": 6.192764129166681e-06, + "loss": 0.5466, + "step": 6917 + }, + { + "epoch": 0.88, + "grad_norm": 0.5756931612956352, + "learning_rate": 6.191762299961194e-06, + "loss": 0.5114, + "step": 6918 + }, + { + "epoch": 0.88, + "grad_norm": 0.5808018003374665, + "learning_rate": 6.190760420026193e-06, + "loss": 0.524, + "step": 6919 + }, + { + "epoch": 0.88, + "grad_norm": 0.6915554516000589, + "learning_rate": 6.189758489404327e-06, + "loss": 0.5687, + "step": 6920 + }, + { + "epoch": 0.88, + "grad_norm": 0.636161627714552, + "learning_rate": 6.1887565081382435e-06, + "loss": 0.6299, + "step": 6921 + }, + { + "epoch": 0.88, + "grad_norm": 0.6774435912803495, + "learning_rate": 6.187754476270595e-06, + "loss": 0.5514, + "step": 6922 + }, + { + "epoch": 0.88, + "grad_norm": 0.8193920191532753, + "learning_rate": 6.186752393844032e-06, + "loss": 0.5995, + "step": 6923 + }, + { + "epoch": 0.88, + "grad_norm": 1.0017311733553744, + "learning_rate": 6.185750260901213e-06, + "loss": 0.5734, + "step": 6924 + }, + { + "epoch": 0.88, + "grad_norm": 0.9726053865363808, + "learning_rate": 6.184748077484796e-06, + "loss": 0.6573, + "step": 6925 + }, + { + "epoch": 0.88, + "grad_norm": 0.7189541834067743, + "learning_rate": 6.183745843637437e-06, + "loss": 0.594, + "step": 6926 + }, + { + "epoch": 0.88, + "grad_norm": 0.7879463764308462, + "learning_rate": 6.182743559401801e-06, + "loss": 0.664, + "step": 6927 + }, + { + "epoch": 0.88, + "grad_norm": 0.6473155588635416, + "learning_rate": 6.181741224820552e-06, + "loss": 0.4793, + "step": 6928 + }, + { + "epoch": 0.88, + "grad_norm": 0.7817179856107083, + "learning_rate": 6.180738839936354e-06, + "loss": 0.5041, + "step": 6929 + }, + { + "epoch": 0.88, + "grad_norm": 0.5812415751280942, + "learning_rate": 6.179736404791877e-06, + "loss": 0.5268, + "step": 6930 + }, + { + "epoch": 0.88, + "grad_norm": 0.6793739797508815, + "learning_rate": 6.178733919429793e-06, + "loss": 0.5722, + "step": 6931 + }, + { + "epoch": 0.88, + "grad_norm": 0.6494114722260511, + "learning_rate": 6.177731383892771e-06, + "loss": 0.5401, + "step": 6932 + }, + { + "epoch": 0.88, + "grad_norm": 0.5785418031395532, + "learning_rate": 6.176728798223488e-06, + "loss": 0.5074, + "step": 6933 + }, + { + "epoch": 0.88, + "grad_norm": 0.5861779976293513, + "learning_rate": 6.1757261624646194e-06, + "loss": 0.5502, + "step": 6934 + }, + { + "epoch": 0.88, + "grad_norm": 0.6706083698895646, + "learning_rate": 6.174723476658845e-06, + "loss": 0.5165, + "step": 6935 + }, + { + "epoch": 0.88, + "grad_norm": 0.6137462044331675, + "learning_rate": 6.1737207408488474e-06, + "loss": 0.539, + "step": 6936 + }, + { + "epoch": 0.88, + "grad_norm": 0.6274706029119311, + "learning_rate": 6.172717955077309e-06, + "loss": 0.5049, + "step": 6937 + }, + { + "epoch": 0.88, + "grad_norm": 0.6239881438203184, + "learning_rate": 6.171715119386913e-06, + "loss": 0.5485, + "step": 6938 + }, + { + "epoch": 0.88, + "grad_norm": 0.8374711885935975, + "learning_rate": 6.1707122338203496e-06, + "loss": 0.576, + "step": 6939 + }, + { + "epoch": 0.88, + "grad_norm": 0.778003576136719, + "learning_rate": 6.169709298420308e-06, + "loss": 0.6208, + "step": 6940 + }, + { + "epoch": 0.88, + "grad_norm": 0.6405126665539604, + "learning_rate": 6.1687063132294776e-06, + "loss": 0.5289, + "step": 6941 + }, + { + "epoch": 0.88, + "grad_norm": 0.7634993817468099, + "learning_rate": 6.167703278290556e-06, + "loss": 0.6004, + "step": 6942 + }, + { + "epoch": 0.88, + "grad_norm": 0.9057512115608788, + "learning_rate": 6.166700193646235e-06, + "loss": 0.653, + "step": 6943 + }, + { + "epoch": 0.88, + "grad_norm": 0.5547006559169239, + "learning_rate": 6.165697059339218e-06, + "loss": 0.5182, + "step": 6944 + }, + { + "epoch": 0.88, + "grad_norm": 0.5080745747688111, + "learning_rate": 6.1646938754122e-06, + "loss": 0.455, + "step": 6945 + }, + { + "epoch": 0.88, + "grad_norm": 0.5615632718437535, + "learning_rate": 6.1636906419078865e-06, + "loss": 0.4883, + "step": 6946 + }, + { + "epoch": 0.89, + "grad_norm": 0.6593833915263615, + "learning_rate": 6.162687358868979e-06, + "loss": 0.5638, + "step": 6947 + }, + { + "epoch": 0.89, + "grad_norm": 0.8758431849895096, + "learning_rate": 6.161684026338188e-06, + "loss": 0.5644, + "step": 6948 + }, + { + "epoch": 0.89, + "grad_norm": 0.7714982101403007, + "learning_rate": 6.160680644358221e-06, + "loss": 0.6517, + "step": 6949 + }, + { + "epoch": 0.89, + "grad_norm": 0.7229073069281465, + "learning_rate": 6.159677212971788e-06, + "loss": 0.5827, + "step": 6950 + }, + { + "epoch": 0.89, + "grad_norm": 0.7170060430273959, + "learning_rate": 6.158673732221601e-06, + "loss": 0.6252, + "step": 6951 + }, + { + "epoch": 0.89, + "grad_norm": 0.6044232803295256, + "learning_rate": 6.157670202150374e-06, + "loss": 0.5583, + "step": 6952 + }, + { + "epoch": 0.89, + "grad_norm": 0.6175731398380181, + "learning_rate": 6.156666622800829e-06, + "loss": 0.4854, + "step": 6953 + }, + { + "epoch": 0.89, + "grad_norm": 0.726168022342418, + "learning_rate": 6.155662994215679e-06, + "loss": 0.6317, + "step": 6954 + }, + { + "epoch": 0.89, + "grad_norm": 1.1010979581015248, + "learning_rate": 6.15465931643765e-06, + "loss": 0.7149, + "step": 6955 + }, + { + "epoch": 0.89, + "grad_norm": 0.5951092840843372, + "learning_rate": 6.153655589509464e-06, + "loss": 0.4564, + "step": 6956 + }, + { + "epoch": 0.89, + "grad_norm": 0.7322830194642305, + "learning_rate": 6.1526518134738455e-06, + "loss": 0.6375, + "step": 6957 + }, + { + "epoch": 0.89, + "grad_norm": 0.7118970209441388, + "learning_rate": 6.1516479883735234e-06, + "loss": 0.5813, + "step": 6958 + }, + { + "epoch": 0.89, + "grad_norm": 0.6104637232344424, + "learning_rate": 6.150644114251226e-06, + "loss": 0.5411, + "step": 6959 + }, + { + "epoch": 0.89, + "grad_norm": 0.613150555109829, + "learning_rate": 6.149640191149684e-06, + "loss": 0.5812, + "step": 6960 + }, + { + "epoch": 0.89, + "grad_norm": 0.5887407967297529, + "learning_rate": 6.148636219111635e-06, + "loss": 0.5466, + "step": 6961 + }, + { + "epoch": 0.89, + "grad_norm": 0.6929918455206895, + "learning_rate": 6.147632198179813e-06, + "loss": 0.5655, + "step": 6962 + }, + { + "epoch": 0.89, + "grad_norm": 0.6886215737073035, + "learning_rate": 6.1466281283969545e-06, + "loss": 0.5481, + "step": 6963 + }, + { + "epoch": 0.89, + "grad_norm": 0.6728231630783966, + "learning_rate": 6.1456240098058005e-06, + "loss": 0.5506, + "step": 6964 + }, + { + "epoch": 0.89, + "grad_norm": 0.72738416349824, + "learning_rate": 6.144619842449094e-06, + "loss": 0.5739, + "step": 6965 + }, + { + "epoch": 0.89, + "grad_norm": 0.6921865149606771, + "learning_rate": 6.143615626369578e-06, + "loss": 0.5789, + "step": 6966 + }, + { + "epoch": 0.89, + "grad_norm": 0.7517713041007731, + "learning_rate": 6.1426113616099995e-06, + "loss": 0.603, + "step": 6967 + }, + { + "epoch": 0.89, + "grad_norm": 0.7732631404374873, + "learning_rate": 6.141607048213107e-06, + "loss": 0.6654, + "step": 6968 + }, + { + "epoch": 0.89, + "grad_norm": 0.8065041814625559, + "learning_rate": 6.14060268622165e-06, + "loss": 0.4992, + "step": 6969 + }, + { + "epoch": 0.89, + "grad_norm": 0.7775832453012291, + "learning_rate": 6.139598275678381e-06, + "loss": 0.6143, + "step": 6970 + }, + { + "epoch": 0.89, + "grad_norm": 0.8101687830242854, + "learning_rate": 6.138593816626055e-06, + "loss": 0.5874, + "step": 6971 + }, + { + "epoch": 0.89, + "grad_norm": 0.7719784037915997, + "learning_rate": 6.137589309107429e-06, + "loss": 0.5557, + "step": 6972 + }, + { + "epoch": 0.89, + "grad_norm": 0.6989662084373551, + "learning_rate": 6.136584753165262e-06, + "loss": 0.5359, + "step": 6973 + }, + { + "epoch": 0.89, + "grad_norm": 0.663647152962222, + "learning_rate": 6.135580148842314e-06, + "loss": 0.5134, + "step": 6974 + }, + { + "epoch": 0.89, + "grad_norm": 0.7800516951878073, + "learning_rate": 6.134575496181348e-06, + "loss": 0.6692, + "step": 6975 + }, + { + "epoch": 0.89, + "grad_norm": 0.6679711901363279, + "learning_rate": 6.133570795225128e-06, + "loss": 0.4585, + "step": 6976 + }, + { + "epoch": 0.89, + "grad_norm": 0.6182815652001041, + "learning_rate": 6.132566046016422e-06, + "loss": 0.5263, + "step": 6977 + }, + { + "epoch": 0.89, + "grad_norm": 0.5417296473212366, + "learning_rate": 6.131561248597997e-06, + "loss": 0.5297, + "step": 6978 + }, + { + "epoch": 0.89, + "grad_norm": 0.6054942574934512, + "learning_rate": 6.1305564030126264e-06, + "loss": 0.5422, + "step": 6979 + }, + { + "epoch": 0.89, + "grad_norm": 0.6145197552505882, + "learning_rate": 6.1295515093030835e-06, + "loss": 0.5241, + "step": 6980 + }, + { + "epoch": 0.89, + "grad_norm": 0.6276337869361472, + "learning_rate": 6.1285465675121414e-06, + "loss": 0.541, + "step": 6981 + }, + { + "epoch": 0.89, + "grad_norm": 0.6780973649138582, + "learning_rate": 6.1275415776825795e-06, + "loss": 0.5792, + "step": 6982 + }, + { + "epoch": 0.89, + "grad_norm": 0.6101682433886345, + "learning_rate": 6.1265365398571755e-06, + "loss": 0.5689, + "step": 6983 + }, + { + "epoch": 0.89, + "grad_norm": 0.7210033167807183, + "learning_rate": 6.1255314540787105e-06, + "loss": 0.583, + "step": 6984 + }, + { + "epoch": 0.89, + "grad_norm": 0.7737271743546085, + "learning_rate": 6.1245263203899665e-06, + "loss": 0.6431, + "step": 6985 + }, + { + "epoch": 0.89, + "grad_norm": 0.7812190187635045, + "learning_rate": 6.123521138833732e-06, + "loss": 0.5542, + "step": 6986 + }, + { + "epoch": 0.89, + "grad_norm": 0.5628385126075388, + "learning_rate": 6.122515909452793e-06, + "loss": 0.536, + "step": 6987 + }, + { + "epoch": 0.89, + "grad_norm": 0.6866876473192836, + "learning_rate": 6.121510632289939e-06, + "loss": 0.5375, + "step": 6988 + }, + { + "epoch": 0.89, + "grad_norm": 0.8312761759667844, + "learning_rate": 6.1205053073879605e-06, + "loss": 0.6062, + "step": 6989 + }, + { + "epoch": 0.89, + "grad_norm": 0.5482930227360178, + "learning_rate": 6.119499934789654e-06, + "loss": 0.4728, + "step": 6990 + }, + { + "epoch": 0.89, + "grad_norm": 0.6022037195005886, + "learning_rate": 6.118494514537809e-06, + "loss": 0.5124, + "step": 6991 + }, + { + "epoch": 0.89, + "grad_norm": 0.6006512478769933, + "learning_rate": 6.117489046675229e-06, + "loss": 0.5193, + "step": 6992 + }, + { + "epoch": 0.89, + "grad_norm": 0.6906369730060452, + "learning_rate": 6.11648353124471e-06, + "loss": 0.5501, + "step": 6993 + }, + { + "epoch": 0.89, + "grad_norm": 0.7418386740062759, + "learning_rate": 6.115477968289057e-06, + "loss": 0.553, + "step": 6994 + }, + { + "epoch": 0.89, + "grad_norm": 0.6265433495134833, + "learning_rate": 6.114472357851069e-06, + "loss": 0.53, + "step": 6995 + }, + { + "epoch": 0.89, + "grad_norm": 0.6038562518554125, + "learning_rate": 6.1134666999735555e-06, + "loss": 0.5465, + "step": 6996 + }, + { + "epoch": 0.89, + "grad_norm": 0.7887925635533009, + "learning_rate": 6.1124609946993215e-06, + "loss": 0.6539, + "step": 6997 + }, + { + "epoch": 0.89, + "grad_norm": 0.7531312921636764, + "learning_rate": 6.1114552420711795e-06, + "loss": 0.5588, + "step": 6998 + }, + { + "epoch": 0.89, + "grad_norm": 0.786681441606181, + "learning_rate": 6.1104494421319385e-06, + "loss": 0.6205, + "step": 6999 + }, + { + "epoch": 0.89, + "grad_norm": 0.6395401569832504, + "learning_rate": 6.1094435949244135e-06, + "loss": 0.5425, + "step": 7000 + }, + { + "epoch": 0.89, + "grad_norm": 0.6192397084103954, + "learning_rate": 6.1084377004914195e-06, + "loss": 0.5775, + "step": 7001 + }, + { + "epoch": 0.89, + "grad_norm": 0.6416149970841349, + "learning_rate": 6.107431758875776e-06, + "loss": 0.5446, + "step": 7002 + }, + { + "epoch": 0.89, + "grad_norm": 0.6091182591298585, + "learning_rate": 6.106425770120299e-06, + "loss": 0.5454, + "step": 7003 + }, + { + "epoch": 0.89, + "grad_norm": 0.6439199776065763, + "learning_rate": 6.105419734267815e-06, + "loss": 0.514, + "step": 7004 + }, + { + "epoch": 0.89, + "grad_norm": 1.0701801205887538, + "learning_rate": 6.104413651361144e-06, + "loss": 0.5988, + "step": 7005 + }, + { + "epoch": 0.89, + "grad_norm": 0.832089162578337, + "learning_rate": 6.103407521443115e-06, + "loss": 0.5974, + "step": 7006 + }, + { + "epoch": 0.89, + "grad_norm": 0.8201899620076895, + "learning_rate": 6.1024013445565515e-06, + "loss": 0.6743, + "step": 7007 + }, + { + "epoch": 0.89, + "grad_norm": 0.7319994524681279, + "learning_rate": 6.101395120744286e-06, + "loss": 0.5104, + "step": 7008 + }, + { + "epoch": 0.89, + "grad_norm": 0.5550222837796425, + "learning_rate": 6.100388850049149e-06, + "loss": 0.5644, + "step": 7009 + }, + { + "epoch": 0.89, + "grad_norm": 0.5438093201350016, + "learning_rate": 6.099382532513976e-06, + "loss": 0.5014, + "step": 7010 + }, + { + "epoch": 0.89, + "grad_norm": 0.715979102804903, + "learning_rate": 6.098376168181602e-06, + "loss": 0.6365, + "step": 7011 + }, + { + "epoch": 0.89, + "grad_norm": 0.6219142752938432, + "learning_rate": 6.097369757094864e-06, + "loss": 0.5332, + "step": 7012 + }, + { + "epoch": 0.89, + "grad_norm": 0.5800098704411566, + "learning_rate": 6.0963632992966036e-06, + "loss": 0.5211, + "step": 7013 + }, + { + "epoch": 0.89, + "grad_norm": 0.6475692842033535, + "learning_rate": 6.09535679482966e-06, + "loss": 0.5382, + "step": 7014 + }, + { + "epoch": 0.89, + "grad_norm": 0.6349157139075965, + "learning_rate": 6.094350243736878e-06, + "loss": 0.4815, + "step": 7015 + }, + { + "epoch": 0.89, + "grad_norm": 0.9500280625936495, + "learning_rate": 6.093343646061101e-06, + "loss": 0.6404, + "step": 7016 + }, + { + "epoch": 0.89, + "grad_norm": 0.5721372743526819, + "learning_rate": 6.092337001845181e-06, + "loss": 0.5161, + "step": 7017 + }, + { + "epoch": 0.89, + "grad_norm": 0.9799732367864387, + "learning_rate": 6.091330311131965e-06, + "loss": 0.5629, + "step": 7018 + }, + { + "epoch": 0.89, + "grad_norm": 0.5914921112517635, + "learning_rate": 6.090323573964305e-06, + "loss": 0.5108, + "step": 7019 + }, + { + "epoch": 0.89, + "grad_norm": 0.5518055965519464, + "learning_rate": 6.089316790385056e-06, + "loss": 0.4329, + "step": 7020 + }, + { + "epoch": 0.89, + "grad_norm": 0.8094592049857515, + "learning_rate": 6.088309960437071e-06, + "loss": 0.6236, + "step": 7021 + }, + { + "epoch": 0.89, + "grad_norm": 0.7519590278852772, + "learning_rate": 6.087303084163207e-06, + "loss": 0.5772, + "step": 7022 + }, + { + "epoch": 0.89, + "grad_norm": 0.6435099144700268, + "learning_rate": 6.086296161606328e-06, + "loss": 0.504, + "step": 7023 + }, + { + "epoch": 0.89, + "grad_norm": 0.7090464668951559, + "learning_rate": 6.085289192809291e-06, + "loss": 0.6317, + "step": 7024 + }, + { + "epoch": 0.89, + "grad_norm": 0.5962875285809118, + "learning_rate": 6.084282177814962e-06, + "loss": 0.5077, + "step": 7025 + }, + { + "epoch": 0.9, + "grad_norm": 0.722109036630549, + "learning_rate": 6.083275116666206e-06, + "loss": 0.5609, + "step": 7026 + }, + { + "epoch": 0.9, + "grad_norm": 0.7958360727980037, + "learning_rate": 6.0822680094058885e-06, + "loss": 0.5995, + "step": 7027 + }, + { + "epoch": 0.9, + "grad_norm": 0.5849184702604707, + "learning_rate": 6.081260856076882e-06, + "loss": 0.4885, + "step": 7028 + }, + { + "epoch": 0.9, + "grad_norm": 0.5601944460241765, + "learning_rate": 6.0802536567220546e-06, + "loss": 0.4801, + "step": 7029 + }, + { + "epoch": 0.9, + "grad_norm": 0.8724346766325471, + "learning_rate": 6.079246411384282e-06, + "loss": 0.624, + "step": 7030 + }, + { + "epoch": 0.9, + "grad_norm": 0.6521322783295203, + "learning_rate": 6.0782391201064374e-06, + "loss": 0.4872, + "step": 7031 + }, + { + "epoch": 0.9, + "grad_norm": 0.7911287449441682, + "learning_rate": 6.0772317829313994e-06, + "loss": 0.6163, + "step": 7032 + }, + { + "epoch": 0.9, + "grad_norm": 0.9396882391719757, + "learning_rate": 6.0762243999020465e-06, + "loss": 0.5203, + "step": 7033 + }, + { + "epoch": 0.9, + "grad_norm": 0.5714283129024395, + "learning_rate": 6.0752169710612605e-06, + "loss": 0.5076, + "step": 7034 + }, + { + "epoch": 0.9, + "grad_norm": 0.7593724104306929, + "learning_rate": 6.074209496451924e-06, + "loss": 0.6125, + "step": 7035 + }, + { + "epoch": 0.9, + "grad_norm": 0.6438712476384332, + "learning_rate": 6.073201976116923e-06, + "loss": 0.5102, + "step": 7036 + }, + { + "epoch": 0.9, + "grad_norm": 0.5416491414306279, + "learning_rate": 6.072194410099142e-06, + "loss": 0.4788, + "step": 7037 + }, + { + "epoch": 0.9, + "grad_norm": 0.7547145890173839, + "learning_rate": 6.07118679844147e-06, + "loss": 0.5941, + "step": 7038 + }, + { + "epoch": 0.9, + "grad_norm": 0.6843022383752474, + "learning_rate": 6.070179141186802e-06, + "loss": 0.5519, + "step": 7039 + }, + { + "epoch": 0.9, + "grad_norm": 0.6489889671240688, + "learning_rate": 6.069171438378025e-06, + "loss": 0.5307, + "step": 7040 + }, + { + "epoch": 0.9, + "grad_norm": 0.6018158642552033, + "learning_rate": 6.068163690058038e-06, + "loss": 0.4985, + "step": 7041 + }, + { + "epoch": 0.9, + "grad_norm": 0.6810515386416672, + "learning_rate": 6.067155896269735e-06, + "loss": 0.5313, + "step": 7042 + }, + { + "epoch": 0.9, + "grad_norm": 0.6795412536103733, + "learning_rate": 6.066148057056017e-06, + "loss": 0.5429, + "step": 7043 + }, + { + "epoch": 0.9, + "grad_norm": 0.5638607829429544, + "learning_rate": 6.065140172459782e-06, + "loss": 0.4877, + "step": 7044 + }, + { + "epoch": 0.9, + "grad_norm": 0.6394401456542274, + "learning_rate": 6.064132242523935e-06, + "loss": 0.5457, + "step": 7045 + }, + { + "epoch": 0.9, + "grad_norm": 0.5914186049789284, + "learning_rate": 6.063124267291378e-06, + "loss": 0.525, + "step": 7046 + }, + { + "epoch": 0.9, + "grad_norm": 0.794023482227983, + "learning_rate": 6.0621162468050165e-06, + "loss": 0.5801, + "step": 7047 + }, + { + "epoch": 0.9, + "grad_norm": 0.632927484052232, + "learning_rate": 6.061108181107762e-06, + "loss": 0.5272, + "step": 7048 + }, + { + "epoch": 0.9, + "grad_norm": 0.8562898361911961, + "learning_rate": 6.060100070242524e-06, + "loss": 0.6606, + "step": 7049 + }, + { + "epoch": 0.9, + "grad_norm": 0.6690535977493315, + "learning_rate": 6.059091914252213e-06, + "loss": 0.5403, + "step": 7050 + }, + { + "epoch": 0.9, + "grad_norm": 0.519046681801265, + "learning_rate": 6.058083713179743e-06, + "loss": 0.4995, + "step": 7051 + }, + { + "epoch": 0.9, + "grad_norm": 0.6134678210055258, + "learning_rate": 6.057075467068031e-06, + "loss": 0.5696, + "step": 7052 + }, + { + "epoch": 0.9, + "grad_norm": 0.7754917113503742, + "learning_rate": 6.056067175959993e-06, + "loss": 0.5921, + "step": 7053 + }, + { + "epoch": 0.9, + "grad_norm": 0.527923556579244, + "learning_rate": 6.055058839898551e-06, + "loss": 0.4658, + "step": 7054 + }, + { + "epoch": 0.9, + "grad_norm": 0.7248729441933248, + "learning_rate": 6.054050458926626e-06, + "loss": 0.5888, + "step": 7055 + }, + { + "epoch": 0.9, + "grad_norm": 0.8296417343177384, + "learning_rate": 6.053042033087141e-06, + "loss": 0.6083, + "step": 7056 + }, + { + "epoch": 0.9, + "grad_norm": 0.6675126820356825, + "learning_rate": 6.052033562423022e-06, + "loss": 0.56, + "step": 7057 + }, + { + "epoch": 0.9, + "grad_norm": 0.668743435739018, + "learning_rate": 6.051025046977196e-06, + "loss": 0.5379, + "step": 7058 + }, + { + "epoch": 0.9, + "grad_norm": 0.7949294816777245, + "learning_rate": 6.050016486792591e-06, + "loss": 0.6224, + "step": 7059 + }, + { + "epoch": 0.9, + "grad_norm": 0.6205793233071754, + "learning_rate": 6.049007881912141e-06, + "loss": 0.5338, + "step": 7060 + }, + { + "epoch": 0.9, + "grad_norm": 0.6408703407888654, + "learning_rate": 6.047999232378777e-06, + "loss": 0.5325, + "step": 7061 + }, + { + "epoch": 0.9, + "grad_norm": 0.729530647059725, + "learning_rate": 6.046990538235435e-06, + "loss": 0.5263, + "step": 7062 + }, + { + "epoch": 0.9, + "grad_norm": 0.8706779640023593, + "learning_rate": 6.045981799525051e-06, + "loss": 0.6181, + "step": 7063 + }, + { + "epoch": 0.9, + "grad_norm": 0.5438266929594107, + "learning_rate": 6.0449730162905654e-06, + "loss": 0.4777, + "step": 7064 + }, + { + "epoch": 0.9, + "grad_norm": 0.7359860876038248, + "learning_rate": 6.043964188574915e-06, + "loss": 0.5942, + "step": 7065 + }, + { + "epoch": 0.9, + "grad_norm": 0.7241323835662788, + "learning_rate": 6.042955316421049e-06, + "loss": 0.6209, + "step": 7066 + }, + { + "epoch": 0.9, + "grad_norm": 0.8035127333265772, + "learning_rate": 6.041946399871905e-06, + "loss": 0.5655, + "step": 7067 + }, + { + "epoch": 0.9, + "grad_norm": 0.6956269386666362, + "learning_rate": 6.0409374389704335e-06, + "loss": 0.5964, + "step": 7068 + }, + { + "epoch": 0.9, + "grad_norm": 0.6660078947711036, + "learning_rate": 6.039928433759582e-06, + "loss": 0.5378, + "step": 7069 + }, + { + "epoch": 0.9, + "grad_norm": 0.589503771019717, + "learning_rate": 6.0389193842823e-06, + "loss": 0.5409, + "step": 7070 + }, + { + "epoch": 0.9, + "grad_norm": 0.5866593047121362, + "learning_rate": 6.037910290581538e-06, + "loss": 0.4917, + "step": 7071 + }, + { + "epoch": 0.9, + "grad_norm": 0.6604860005466706, + "learning_rate": 6.036901152700253e-06, + "loss": 0.5242, + "step": 7072 + }, + { + "epoch": 0.9, + "grad_norm": 0.5987571440606564, + "learning_rate": 6.0358919706814e-06, + "loss": 0.5085, + "step": 7073 + }, + { + "epoch": 0.9, + "grad_norm": 0.6245365173020011, + "learning_rate": 6.034882744567936e-06, + "loss": 0.4873, + "step": 7074 + }, + { + "epoch": 0.9, + "grad_norm": 0.6262500080672581, + "learning_rate": 6.033873474402819e-06, + "loss": 0.4865, + "step": 7075 + }, + { + "epoch": 0.9, + "grad_norm": 0.6391414222372909, + "learning_rate": 6.032864160229014e-06, + "loss": 0.5861, + "step": 7076 + }, + { + "epoch": 0.9, + "grad_norm": 0.6128183206857968, + "learning_rate": 6.0318548020894805e-06, + "loss": 0.5292, + "step": 7077 + }, + { + "epoch": 0.9, + "grad_norm": 0.5810153720405562, + "learning_rate": 6.030845400027186e-06, + "loss": 0.553, + "step": 7078 + }, + { + "epoch": 0.9, + "grad_norm": 0.7575050037271156, + "learning_rate": 6.029835954085097e-06, + "loss": 0.6126, + "step": 7079 + }, + { + "epoch": 0.9, + "grad_norm": 0.7596357862222697, + "learning_rate": 6.028826464306183e-06, + "loss": 0.6089, + "step": 7080 + }, + { + "epoch": 0.9, + "grad_norm": 0.5339339948938192, + "learning_rate": 6.027816930733413e-06, + "loss": 0.4795, + "step": 7081 + }, + { + "epoch": 0.9, + "grad_norm": 0.8948840595780623, + "learning_rate": 6.026807353409762e-06, + "loss": 0.6023, + "step": 7082 + }, + { + "epoch": 0.9, + "grad_norm": 0.732670892906231, + "learning_rate": 6.0257977323782025e-06, + "loss": 0.6055, + "step": 7083 + }, + { + "epoch": 0.9, + "grad_norm": 0.6642119407639585, + "learning_rate": 6.02478806768171e-06, + "loss": 0.5293, + "step": 7084 + }, + { + "epoch": 0.9, + "grad_norm": 0.5666997521276964, + "learning_rate": 6.023778359363266e-06, + "loss": 0.4599, + "step": 7085 + }, + { + "epoch": 0.9, + "grad_norm": 0.8067941820503703, + "learning_rate": 6.022768607465849e-06, + "loss": 0.6445, + "step": 7086 + }, + { + "epoch": 0.9, + "grad_norm": 0.8362104587225783, + "learning_rate": 6.02175881203244e-06, + "loss": 0.6197, + "step": 7087 + }, + { + "epoch": 0.9, + "grad_norm": 0.9487357113369056, + "learning_rate": 6.0207489731060234e-06, + "loss": 0.6509, + "step": 7088 + }, + { + "epoch": 0.9, + "grad_norm": 0.6394645143540107, + "learning_rate": 6.019739090729585e-06, + "loss": 0.5156, + "step": 7089 + }, + { + "epoch": 0.9, + "grad_norm": 0.7150408067804788, + "learning_rate": 6.018729164946112e-06, + "loss": 0.5964, + "step": 7090 + }, + { + "epoch": 0.9, + "grad_norm": 0.8315369821989891, + "learning_rate": 6.017719195798595e-06, + "loss": 0.6406, + "step": 7091 + }, + { + "epoch": 0.9, + "grad_norm": 0.699505475163175, + "learning_rate": 6.016709183330023e-06, + "loss": 0.5979, + "step": 7092 + }, + { + "epoch": 0.9, + "grad_norm": 0.6470538483120143, + "learning_rate": 6.0156991275833895e-06, + "loss": 0.5236, + "step": 7093 + }, + { + "epoch": 0.9, + "grad_norm": 0.816930821266561, + "learning_rate": 6.0146890286016905e-06, + "loss": 0.5391, + "step": 7094 + }, + { + "epoch": 0.9, + "grad_norm": 0.7199362070824941, + "learning_rate": 6.013678886427921e-06, + "loss": 0.5074, + "step": 7095 + }, + { + "epoch": 0.9, + "grad_norm": 0.7627784379619833, + "learning_rate": 6.012668701105081e-06, + "loss": 0.531, + "step": 7096 + }, + { + "epoch": 0.9, + "grad_norm": 1.0517068494106159, + "learning_rate": 6.011658472676172e-06, + "loss": 0.6331, + "step": 7097 + }, + { + "epoch": 0.9, + "grad_norm": 0.7076510314830814, + "learning_rate": 6.010648201184193e-06, + "loss": 0.6351, + "step": 7098 + }, + { + "epoch": 0.9, + "grad_norm": 0.7192164568048685, + "learning_rate": 6.009637886672151e-06, + "loss": 0.6225, + "step": 7099 + }, + { + "epoch": 0.9, + "grad_norm": 0.6591453296503997, + "learning_rate": 6.008627529183049e-06, + "loss": 0.5122, + "step": 7100 + }, + { + "epoch": 0.9, + "grad_norm": 0.908526565693428, + "learning_rate": 6.007617128759897e-06, + "loss": 0.636, + "step": 7101 + }, + { + "epoch": 0.9, + "grad_norm": 0.6526237649533455, + "learning_rate": 6.006606685445703e-06, + "loss": 0.5644, + "step": 7102 + }, + { + "epoch": 0.9, + "grad_norm": 0.5511515448112384, + "learning_rate": 6.005596199283479e-06, + "loss": 0.4747, + "step": 7103 + }, + { + "epoch": 0.91, + "grad_norm": 0.6527267519955694, + "learning_rate": 6.004585670316239e-06, + "loss": 0.5206, + "step": 7104 + }, + { + "epoch": 0.91, + "grad_norm": 0.7818144627686768, + "learning_rate": 6.003575098586997e-06, + "loss": 0.6209, + "step": 7105 + }, + { + "epoch": 0.91, + "grad_norm": 0.6316059920579868, + "learning_rate": 6.00256448413877e-06, + "loss": 0.5712, + "step": 7106 + }, + { + "epoch": 0.91, + "grad_norm": 0.5420774402223564, + "learning_rate": 6.001553827014577e-06, + "loss": 0.4941, + "step": 7107 + }, + { + "epoch": 0.91, + "grad_norm": 0.6586518366308904, + "learning_rate": 6.000543127257438e-06, + "loss": 0.5324, + "step": 7108 + }, + { + "epoch": 0.91, + "grad_norm": 0.7115921284145044, + "learning_rate": 5.999532384910374e-06, + "loss": 0.5458, + "step": 7109 + }, + { + "epoch": 0.91, + "grad_norm": 0.7526653039661527, + "learning_rate": 5.998521600016411e-06, + "loss": 0.5702, + "step": 7110 + }, + { + "epoch": 0.91, + "grad_norm": 0.8628833770275489, + "learning_rate": 5.997510772618576e-06, + "loss": 0.6212, + "step": 7111 + }, + { + "epoch": 0.91, + "grad_norm": 0.7805227832120558, + "learning_rate": 5.9964999027598935e-06, + "loss": 0.5973, + "step": 7112 + }, + { + "epoch": 0.91, + "grad_norm": 0.5602003628481105, + "learning_rate": 5.995488990483395e-06, + "loss": 0.5268, + "step": 7113 + }, + { + "epoch": 0.91, + "grad_norm": 0.6177402675696717, + "learning_rate": 5.994478035832111e-06, + "loss": 0.5986, + "step": 7114 + }, + { + "epoch": 0.91, + "grad_norm": 0.5909271036335947, + "learning_rate": 5.993467038849075e-06, + "loss": 0.5722, + "step": 7115 + }, + { + "epoch": 0.91, + "grad_norm": 0.8137966181961045, + "learning_rate": 5.9924559995773215e-06, + "loss": 0.6335, + "step": 7116 + }, + { + "epoch": 0.91, + "grad_norm": 0.8055655690688112, + "learning_rate": 5.991444918059887e-06, + "loss": 0.6479, + "step": 7117 + }, + { + "epoch": 0.91, + "grad_norm": 0.7368075426446393, + "learning_rate": 5.990433794339812e-06, + "loss": 0.5828, + "step": 7118 + }, + { + "epoch": 0.91, + "grad_norm": 0.6376893108332394, + "learning_rate": 5.9894226284601355e-06, + "loss": 0.5768, + "step": 7119 + }, + { + "epoch": 0.91, + "grad_norm": 0.7633560778863631, + "learning_rate": 5.988411420463898e-06, + "loss": 0.6285, + "step": 7120 + }, + { + "epoch": 0.91, + "grad_norm": 0.5656994053803676, + "learning_rate": 5.987400170394146e-06, + "loss": 0.5029, + "step": 7121 + }, + { + "epoch": 0.91, + "grad_norm": 0.6560430090900162, + "learning_rate": 5.986388878293923e-06, + "loss": 0.5222, + "step": 7122 + }, + { + "epoch": 0.91, + "grad_norm": 0.7305105623059527, + "learning_rate": 5.985377544206278e-06, + "loss": 0.5946, + "step": 7123 + }, + { + "epoch": 0.91, + "grad_norm": 0.7100536238867052, + "learning_rate": 5.98436616817426e-06, + "loss": 0.5535, + "step": 7124 + }, + { + "epoch": 0.91, + "grad_norm": 1.3688451111270026, + "learning_rate": 5.98335475024092e-06, + "loss": 0.5401, + "step": 7125 + }, + { + "epoch": 0.91, + "grad_norm": 0.606096566139521, + "learning_rate": 5.982343290449311e-06, + "loss": 0.5654, + "step": 7126 + }, + { + "epoch": 0.91, + "grad_norm": 0.9467145391972902, + "learning_rate": 5.981331788842485e-06, + "loss": 0.5792, + "step": 7127 + }, + { + "epoch": 0.91, + "grad_norm": 0.8465251048556427, + "learning_rate": 5.980320245463502e-06, + "loss": 0.6222, + "step": 7128 + }, + { + "epoch": 0.91, + "grad_norm": 0.5689033131252232, + "learning_rate": 5.979308660355419e-06, + "loss": 0.4894, + "step": 7129 + }, + { + "epoch": 0.91, + "grad_norm": 0.5929282360825587, + "learning_rate": 5.978297033561295e-06, + "loss": 0.5315, + "step": 7130 + }, + { + "epoch": 0.91, + "grad_norm": 1.2609677645202362, + "learning_rate": 5.977285365124195e-06, + "loss": 0.6, + "step": 7131 + }, + { + "epoch": 0.91, + "grad_norm": 0.6110118892147859, + "learning_rate": 5.976273655087178e-06, + "loss": 0.4886, + "step": 7132 + }, + { + "epoch": 0.91, + "grad_norm": 0.7791127968859274, + "learning_rate": 5.97526190349331e-06, + "loss": 0.619, + "step": 7133 + }, + { + "epoch": 0.91, + "grad_norm": 0.5583853862987631, + "learning_rate": 5.974250110385661e-06, + "loss": 0.5145, + "step": 7134 + }, + { + "epoch": 0.91, + "grad_norm": 0.6382480622421793, + "learning_rate": 5.9732382758072985e-06, + "loss": 0.5536, + "step": 7135 + }, + { + "epoch": 0.91, + "grad_norm": 0.824847088257256, + "learning_rate": 5.9722263998012905e-06, + "loss": 0.6172, + "step": 7136 + }, + { + "epoch": 0.91, + "grad_norm": 0.7809217260638395, + "learning_rate": 5.971214482410713e-06, + "loss": 0.619, + "step": 7137 + }, + { + "epoch": 0.91, + "grad_norm": 0.9115552827998088, + "learning_rate": 5.9702025236786385e-06, + "loss": 0.6926, + "step": 7138 + }, + { + "epoch": 0.91, + "grad_norm": 0.8096508120858464, + "learning_rate": 5.969190523648143e-06, + "loss": 0.5813, + "step": 7139 + }, + { + "epoch": 0.91, + "grad_norm": 0.5385794462804069, + "learning_rate": 5.9681784823623035e-06, + "loss": 0.4845, + "step": 7140 + }, + { + "epoch": 0.91, + "grad_norm": 0.5701950137838097, + "learning_rate": 5.967166399864199e-06, + "loss": 0.5091, + "step": 7141 + }, + { + "epoch": 0.91, + "grad_norm": 0.6510968743689732, + "learning_rate": 5.9661542761969134e-06, + "loss": 0.5597, + "step": 7142 + }, + { + "epoch": 0.91, + "grad_norm": 0.6148001912811577, + "learning_rate": 5.965142111403527e-06, + "loss": 0.5745, + "step": 7143 + }, + { + "epoch": 0.91, + "grad_norm": 0.6375709545722904, + "learning_rate": 5.964129905527125e-06, + "loss": 0.5938, + "step": 7144 + }, + { + "epoch": 0.91, + "grad_norm": 0.8125509007726149, + "learning_rate": 5.963117658610794e-06, + "loss": 0.5854, + "step": 7145 + }, + { + "epoch": 0.91, + "grad_norm": 0.616028726265624, + "learning_rate": 5.96210537069762e-06, + "loss": 0.4918, + "step": 7146 + }, + { + "epoch": 0.91, + "grad_norm": 0.6498585460254964, + "learning_rate": 5.961093041830698e-06, + "loss": 0.5581, + "step": 7147 + }, + { + "epoch": 0.91, + "grad_norm": 0.739770037097316, + "learning_rate": 5.960080672053115e-06, + "loss": 0.5302, + "step": 7148 + }, + { + "epoch": 0.91, + "grad_norm": 0.8914153248838461, + "learning_rate": 5.959068261407965e-06, + "loss": 0.6369, + "step": 7149 + }, + { + "epoch": 0.91, + "grad_norm": 0.7522871068945128, + "learning_rate": 5.958055809938345e-06, + "loss": 0.5893, + "step": 7150 + }, + { + "epoch": 0.91, + "grad_norm": 0.8661673027769349, + "learning_rate": 5.95704331768735e-06, + "loss": 0.6221, + "step": 7151 + }, + { + "epoch": 0.91, + "grad_norm": 0.8137186745809454, + "learning_rate": 5.956030784698081e-06, + "loss": 0.6102, + "step": 7152 + }, + { + "epoch": 0.91, + "grad_norm": 0.6389780260807015, + "learning_rate": 5.9550182110136345e-06, + "loss": 0.4756, + "step": 7153 + }, + { + "epoch": 0.91, + "grad_norm": 0.9007838809121117, + "learning_rate": 5.954005596677115e-06, + "loss": 0.6118, + "step": 7154 + }, + { + "epoch": 0.91, + "grad_norm": 0.8722979637220367, + "learning_rate": 5.952992941731626e-06, + "loss": 0.6217, + "step": 7155 + }, + { + "epoch": 0.91, + "grad_norm": 0.6290544796256371, + "learning_rate": 5.951980246220272e-06, + "loss": 0.5266, + "step": 7156 + }, + { + "epoch": 0.91, + "grad_norm": 0.7740339630176876, + "learning_rate": 5.9509675101861604e-06, + "loss": 0.6114, + "step": 7157 + }, + { + "epoch": 0.91, + "grad_norm": 0.7094801167297725, + "learning_rate": 5.9499547336724025e-06, + "loss": 0.5376, + "step": 7158 + }, + { + "epoch": 0.91, + "grad_norm": 0.670911401387463, + "learning_rate": 5.948941916722107e-06, + "loss": 0.5188, + "step": 7159 + }, + { + "epoch": 0.91, + "grad_norm": 0.6805473534722848, + "learning_rate": 5.9479290593783865e-06, + "loss": 0.5408, + "step": 7160 + }, + { + "epoch": 0.91, + "grad_norm": 0.5415560859130355, + "learning_rate": 5.9469161616843554e-06, + "loss": 0.5114, + "step": 7161 + }, + { + "epoch": 0.91, + "grad_norm": 0.6999988889602705, + "learning_rate": 5.945903223683128e-06, + "loss": 0.5516, + "step": 7162 + }, + { + "epoch": 0.91, + "grad_norm": 1.5969446972468377, + "learning_rate": 5.944890245417825e-06, + "loss": 0.5858, + "step": 7163 + }, + { + "epoch": 0.91, + "grad_norm": 0.780556428227834, + "learning_rate": 5.94387722693156e-06, + "loss": 0.6036, + "step": 7164 + }, + { + "epoch": 0.91, + "grad_norm": 0.5729227086510631, + "learning_rate": 5.942864168267461e-06, + "loss": 0.5475, + "step": 7165 + }, + { + "epoch": 0.91, + "grad_norm": 0.5526932724807357, + "learning_rate": 5.941851069468646e-06, + "loss": 0.53, + "step": 7166 + }, + { + "epoch": 0.91, + "grad_norm": 0.6565265821662218, + "learning_rate": 5.9408379305782415e-06, + "loss": 0.5202, + "step": 7167 + }, + { + "epoch": 0.91, + "grad_norm": 0.6865929034117233, + "learning_rate": 5.939824751639373e-06, + "loss": 0.5723, + "step": 7168 + }, + { + "epoch": 0.91, + "grad_norm": 0.7925018445090999, + "learning_rate": 5.938811532695166e-06, + "loss": 0.5967, + "step": 7169 + }, + { + "epoch": 0.91, + "grad_norm": 0.7102343774352992, + "learning_rate": 5.937798273788754e-06, + "loss": 0.5892, + "step": 7170 + }, + { + "epoch": 0.91, + "grad_norm": 0.8779940712735889, + "learning_rate": 5.936784974963266e-06, + "loss": 0.5544, + "step": 7171 + }, + { + "epoch": 0.91, + "grad_norm": 0.6772792441690239, + "learning_rate": 5.935771636261835e-06, + "loss": 0.5276, + "step": 7172 + }, + { + "epoch": 0.91, + "grad_norm": 1.043407671374952, + "learning_rate": 5.934758257727595e-06, + "loss": 0.5983, + "step": 7173 + }, + { + "epoch": 0.91, + "grad_norm": 0.5830818131182096, + "learning_rate": 5.933744839403683e-06, + "loss": 0.4846, + "step": 7174 + }, + { + "epoch": 0.91, + "grad_norm": 0.8823314896482773, + "learning_rate": 5.932731381333239e-06, + "loss": 0.646, + "step": 7175 + }, + { + "epoch": 0.91, + "grad_norm": 0.6289780052938609, + "learning_rate": 5.931717883559399e-06, + "loss": 0.52, + "step": 7176 + }, + { + "epoch": 0.91, + "grad_norm": 0.5476421579791547, + "learning_rate": 5.930704346125306e-06, + "loss": 0.5383, + "step": 7177 + }, + { + "epoch": 0.91, + "grad_norm": 0.635112023277828, + "learning_rate": 5.929690769074103e-06, + "loss": 0.5503, + "step": 7178 + }, + { + "epoch": 0.91, + "grad_norm": 0.8178462896208889, + "learning_rate": 5.928677152448935e-06, + "loss": 0.6054, + "step": 7179 + }, + { + "epoch": 0.91, + "grad_norm": 0.7534776143426254, + "learning_rate": 5.9276634962929495e-06, + "loss": 0.6388, + "step": 7180 + }, + { + "epoch": 0.91, + "grad_norm": 0.7323572319169261, + "learning_rate": 5.926649800649293e-06, + "loss": 0.5689, + "step": 7181 + }, + { + "epoch": 0.91, + "grad_norm": 2.9494818224270936, + "learning_rate": 5.925636065561113e-06, + "loss": 0.5651, + "step": 7182 + }, + { + "epoch": 0.92, + "grad_norm": 0.7334370926046292, + "learning_rate": 5.9246222910715655e-06, + "loss": 0.5804, + "step": 7183 + }, + { + "epoch": 0.92, + "grad_norm": 1.1804973622481583, + "learning_rate": 5.923608477223803e-06, + "loss": 0.593, + "step": 7184 + }, + { + "epoch": 0.92, + "grad_norm": 0.5964010756064292, + "learning_rate": 5.922594624060978e-06, + "loss": 0.5087, + "step": 7185 + }, + { + "epoch": 0.92, + "grad_norm": 0.753845475829064, + "learning_rate": 5.921580731626248e-06, + "loss": 0.5902, + "step": 7186 + }, + { + "epoch": 0.92, + "grad_norm": 0.6525589041353362, + "learning_rate": 5.92056679996277e-06, + "loss": 0.5023, + "step": 7187 + }, + { + "epoch": 0.92, + "grad_norm": 0.6700678120394333, + "learning_rate": 5.919552829113707e-06, + "loss": 0.5163, + "step": 7188 + }, + { + "epoch": 0.92, + "grad_norm": 0.7170149945197423, + "learning_rate": 5.918538819122217e-06, + "loss": 0.5363, + "step": 7189 + }, + { + "epoch": 0.92, + "grad_norm": 0.8488018843512644, + "learning_rate": 5.917524770031465e-06, + "loss": 0.592, + "step": 7190 + }, + { + "epoch": 0.92, + "grad_norm": 0.6533620222904847, + "learning_rate": 5.916510681884616e-06, + "loss": 0.5723, + "step": 7191 + }, + { + "epoch": 0.92, + "grad_norm": 0.6408270049772504, + "learning_rate": 5.915496554724837e-06, + "loss": 0.4436, + "step": 7192 + }, + { + "epoch": 0.92, + "grad_norm": 0.6786165644820878, + "learning_rate": 5.914482388595294e-06, + "loss": 0.5396, + "step": 7193 + }, + { + "epoch": 0.92, + "grad_norm": 0.6308099791159926, + "learning_rate": 5.913468183539158e-06, + "loss": 0.5553, + "step": 7194 + }, + { + "epoch": 0.92, + "grad_norm": 0.5258517317425928, + "learning_rate": 5.9124539395996e-06, + "loss": 0.5148, + "step": 7195 + }, + { + "epoch": 0.92, + "grad_norm": 0.6638753437974643, + "learning_rate": 5.911439656819794e-06, + "loss": 0.5294, + "step": 7196 + }, + { + "epoch": 0.92, + "grad_norm": 0.5745802024832648, + "learning_rate": 5.910425335242914e-06, + "loss": 0.5142, + "step": 7197 + }, + { + "epoch": 0.92, + "grad_norm": 0.7218634303445264, + "learning_rate": 5.9094109749121375e-06, + "loss": 0.592, + "step": 7198 + }, + { + "epoch": 0.92, + "grad_norm": 0.8275995525581679, + "learning_rate": 5.9083965758706415e-06, + "loss": 0.6407, + "step": 7199 + }, + { + "epoch": 0.92, + "grad_norm": 0.7125587368919056, + "learning_rate": 5.907382138161607e-06, + "loss": 0.6005, + "step": 7200 + }, + { + "epoch": 0.92, + "grad_norm": 0.683047625340476, + "learning_rate": 5.906367661828214e-06, + "loss": 0.5889, + "step": 7201 + }, + { + "epoch": 0.92, + "grad_norm": 0.8188590980388989, + "learning_rate": 5.905353146913645e-06, + "loss": 0.6049, + "step": 7202 + }, + { + "epoch": 0.92, + "grad_norm": 0.5829997877623249, + "learning_rate": 5.904338593461087e-06, + "loss": 0.544, + "step": 7203 + }, + { + "epoch": 0.92, + "grad_norm": 1.0746563096274913, + "learning_rate": 5.903324001513724e-06, + "loss": 0.5755, + "step": 7204 + }, + { + "epoch": 0.92, + "grad_norm": 0.6011755085581385, + "learning_rate": 5.902309371114745e-06, + "loss": 0.5377, + "step": 7205 + }, + { + "epoch": 0.92, + "grad_norm": 0.6765894960483733, + "learning_rate": 5.901294702307339e-06, + "loss": 0.5617, + "step": 7206 + }, + { + "epoch": 0.92, + "grad_norm": 0.6419749281686844, + "learning_rate": 5.900279995134699e-06, + "loss": 0.5236, + "step": 7207 + }, + { + "epoch": 0.92, + "grad_norm": 0.7260466237244242, + "learning_rate": 5.899265249640014e-06, + "loss": 0.5627, + "step": 7208 + }, + { + "epoch": 0.92, + "grad_norm": 0.6804589382568738, + "learning_rate": 5.898250465866483e-06, + "loss": 0.6041, + "step": 7209 + }, + { + "epoch": 0.92, + "grad_norm": 0.7260330565911145, + "learning_rate": 5.8972356438573e-06, + "loss": 0.6468, + "step": 7210 + }, + { + "epoch": 0.92, + "grad_norm": 0.8120694239781333, + "learning_rate": 5.896220783655663e-06, + "loss": 0.6234, + "step": 7211 + }, + { + "epoch": 0.92, + "grad_norm": 0.7537635229403995, + "learning_rate": 5.89520588530477e-06, + "loss": 0.6105, + "step": 7212 + }, + { + "epoch": 0.92, + "grad_norm": 0.631604668002357, + "learning_rate": 5.894190948847824e-06, + "loss": 0.5171, + "step": 7213 + }, + { + "epoch": 0.92, + "grad_norm": 0.613916991971185, + "learning_rate": 5.893175974328027e-06, + "loss": 0.5376, + "step": 7214 + }, + { + "epoch": 0.92, + "grad_norm": 0.6614329216719724, + "learning_rate": 5.892160961788582e-06, + "loss": 0.5349, + "step": 7215 + }, + { + "epoch": 0.92, + "grad_norm": 0.6898093405177034, + "learning_rate": 5.891145911272697e-06, + "loss": 0.5927, + "step": 7216 + }, + { + "epoch": 0.92, + "grad_norm": 1.2641626101636871, + "learning_rate": 5.890130822823578e-06, + "loss": 0.6472, + "step": 7217 + }, + { + "epoch": 0.92, + "grad_norm": 0.5193522151683049, + "learning_rate": 5.889115696484433e-06, + "loss": 0.4806, + "step": 7218 + }, + { + "epoch": 0.92, + "grad_norm": 0.693461264309939, + "learning_rate": 5.888100532298474e-06, + "loss": 0.5587, + "step": 7219 + }, + { + "epoch": 0.92, + "grad_norm": 0.775924265788248, + "learning_rate": 5.8870853303089145e-06, + "loss": 0.6299, + "step": 7220 + }, + { + "epoch": 0.92, + "grad_norm": 0.7564253579679279, + "learning_rate": 5.886070090558967e-06, + "loss": 0.6224, + "step": 7221 + }, + { + "epoch": 0.92, + "grad_norm": 0.7511805664848867, + "learning_rate": 5.885054813091847e-06, + "loss": 0.545, + "step": 7222 + }, + { + "epoch": 0.92, + "grad_norm": 0.7532209394603636, + "learning_rate": 5.884039497950773e-06, + "loss": 0.6317, + "step": 7223 + }, + { + "epoch": 0.92, + "grad_norm": 0.6535111507141641, + "learning_rate": 5.883024145178961e-06, + "loss": 0.5001, + "step": 7224 + }, + { + "epoch": 0.92, + "grad_norm": 0.6102915383448848, + "learning_rate": 5.882008754819634e-06, + "loss": 0.5686, + "step": 7225 + }, + { + "epoch": 0.92, + "grad_norm": 0.8643434331530937, + "learning_rate": 5.880993326916012e-06, + "loss": 0.6236, + "step": 7226 + }, + { + "epoch": 0.92, + "grad_norm": 0.7603682620802733, + "learning_rate": 5.879977861511319e-06, + "loss": 0.5728, + "step": 7227 + }, + { + "epoch": 0.92, + "grad_norm": 0.7440511129750577, + "learning_rate": 5.878962358648781e-06, + "loss": 0.5748, + "step": 7228 + }, + { + "epoch": 0.92, + "grad_norm": 0.7453367244684165, + "learning_rate": 5.877946818371624e-06, + "loss": 0.593, + "step": 7229 + }, + { + "epoch": 0.92, + "grad_norm": 0.7142201825666366, + "learning_rate": 5.876931240723076e-06, + "loss": 0.5748, + "step": 7230 + }, + { + "epoch": 0.92, + "grad_norm": 0.6348020574814905, + "learning_rate": 5.875915625746369e-06, + "loss": 0.4652, + "step": 7231 + }, + { + "epoch": 0.92, + "grad_norm": 0.5732572572783668, + "learning_rate": 5.874899973484731e-06, + "loss": 0.5243, + "step": 7232 + }, + { + "epoch": 0.92, + "grad_norm": 0.8021992513079979, + "learning_rate": 5.8738842839813966e-06, + "loss": 0.5826, + "step": 7233 + }, + { + "epoch": 0.92, + "grad_norm": 0.5565199083502078, + "learning_rate": 5.8728685572796025e-06, + "loss": 0.4745, + "step": 7234 + }, + { + "epoch": 0.92, + "grad_norm": 1.0035041655435828, + "learning_rate": 5.871852793422582e-06, + "loss": 0.6218, + "step": 7235 + }, + { + "epoch": 0.92, + "grad_norm": 0.564083561414303, + "learning_rate": 5.870836992453576e-06, + "loss": 0.5022, + "step": 7236 + }, + { + "epoch": 0.92, + "grad_norm": 0.8312163668841487, + "learning_rate": 5.86982115441582e-06, + "loss": 0.5982, + "step": 7237 + }, + { + "epoch": 0.92, + "grad_norm": 0.556786251097832, + "learning_rate": 5.868805279352559e-06, + "loss": 0.5339, + "step": 7238 + }, + { + "epoch": 0.92, + "grad_norm": 0.7339868029490866, + "learning_rate": 5.867789367307031e-06, + "loss": 0.6447, + "step": 7239 + }, + { + "epoch": 0.92, + "grad_norm": 0.7161191679438755, + "learning_rate": 5.8667734183224835e-06, + "loss": 0.5506, + "step": 7240 + }, + { + "epoch": 0.92, + "grad_norm": 0.7817509442462736, + "learning_rate": 5.865757432442162e-06, + "loss": 0.6232, + "step": 7241 + }, + { + "epoch": 0.92, + "grad_norm": 0.8237965183738162, + "learning_rate": 5.864741409709313e-06, + "loss": 0.6474, + "step": 7242 + }, + { + "epoch": 0.92, + "grad_norm": 0.7999948748300804, + "learning_rate": 5.863725350167185e-06, + "loss": 0.6375, + "step": 7243 + }, + { + "epoch": 0.92, + "grad_norm": 0.6992666595332188, + "learning_rate": 5.8627092538590305e-06, + "loss": 0.6001, + "step": 7244 + }, + { + "epoch": 0.92, + "grad_norm": 0.6423170497339399, + "learning_rate": 5.861693120828097e-06, + "loss": 0.5058, + "step": 7245 + }, + { + "epoch": 0.92, + "grad_norm": 0.8022773476160895, + "learning_rate": 5.860676951117643e-06, + "loss": 0.587, + "step": 7246 + }, + { + "epoch": 0.92, + "grad_norm": 0.7104105558828214, + "learning_rate": 5.859660744770922e-06, + "loss": 0.527, + "step": 7247 + }, + { + "epoch": 0.92, + "grad_norm": 0.7307800551745314, + "learning_rate": 5.858644501831189e-06, + "loss": 0.5882, + "step": 7248 + }, + { + "epoch": 0.92, + "grad_norm": 0.8297109238648962, + "learning_rate": 5.857628222341705e-06, + "loss": 0.5899, + "step": 7249 + }, + { + "epoch": 0.92, + "grad_norm": 0.6407306982382849, + "learning_rate": 5.856611906345726e-06, + "loss": 0.5241, + "step": 7250 + }, + { + "epoch": 0.92, + "grad_norm": 0.5248318932797091, + "learning_rate": 5.855595553886516e-06, + "loss": 0.4604, + "step": 7251 + }, + { + "epoch": 0.92, + "grad_norm": 0.7399990124653967, + "learning_rate": 5.854579165007338e-06, + "loss": 0.4952, + "step": 7252 + }, + { + "epoch": 0.92, + "grad_norm": 0.7943497501869831, + "learning_rate": 5.853562739751455e-06, + "loss": 0.6566, + "step": 7253 + }, + { + "epoch": 0.92, + "grad_norm": 0.6319138225181162, + "learning_rate": 5.852546278162135e-06, + "loss": 0.5243, + "step": 7254 + }, + { + "epoch": 0.92, + "grad_norm": 0.8630142886579614, + "learning_rate": 5.851529780282643e-06, + "loss": 0.5679, + "step": 7255 + }, + { + "epoch": 0.92, + "grad_norm": 0.648310338628721, + "learning_rate": 5.850513246156251e-06, + "loss": 0.5886, + "step": 7256 + }, + { + "epoch": 0.92, + "grad_norm": 0.5592694301524513, + "learning_rate": 5.849496675826226e-06, + "loss": 0.5162, + "step": 7257 + }, + { + "epoch": 0.92, + "grad_norm": 0.6810846159214108, + "learning_rate": 5.848480069335843e-06, + "loss": 0.5328, + "step": 7258 + }, + { + "epoch": 0.92, + "grad_norm": 0.892395292191634, + "learning_rate": 5.847463426728375e-06, + "loss": 0.6174, + "step": 7259 + }, + { + "epoch": 0.92, + "grad_norm": 0.825818034903631, + "learning_rate": 5.846446748047098e-06, + "loss": 0.6505, + "step": 7260 + }, + { + "epoch": 0.93, + "grad_norm": 0.7165267176836811, + "learning_rate": 5.845430033335286e-06, + "loss": 0.5006, + "step": 7261 + }, + { + "epoch": 0.93, + "grad_norm": 0.6705033128727618, + "learning_rate": 5.84441328263622e-06, + "loss": 0.5677, + "step": 7262 + }, + { + "epoch": 0.93, + "grad_norm": 0.6894400756286736, + "learning_rate": 5.843396495993179e-06, + "loss": 0.4957, + "step": 7263 + }, + { + "epoch": 0.93, + "grad_norm": 0.7164755467312727, + "learning_rate": 5.842379673449443e-06, + "loss": 0.6148, + "step": 7264 + }, + { + "epoch": 0.93, + "grad_norm": 0.6902510171666226, + "learning_rate": 5.841362815048297e-06, + "loss": 0.5347, + "step": 7265 + }, + { + "epoch": 0.93, + "grad_norm": 0.6157792280727161, + "learning_rate": 5.840345920833025e-06, + "loss": 0.5318, + "step": 7266 + }, + { + "epoch": 0.93, + "grad_norm": 0.6523571112720159, + "learning_rate": 5.839328990846913e-06, + "loss": 0.5343, + "step": 7267 + }, + { + "epoch": 0.93, + "grad_norm": 0.6491774948464895, + "learning_rate": 5.838312025133247e-06, + "loss": 0.5205, + "step": 7268 + }, + { + "epoch": 0.93, + "grad_norm": 0.6413892153133484, + "learning_rate": 5.837295023735318e-06, + "loss": 0.5371, + "step": 7269 + }, + { + "epoch": 0.93, + "grad_norm": 0.5681557721160956, + "learning_rate": 5.836277986696413e-06, + "loss": 0.5094, + "step": 7270 + }, + { + "epoch": 0.93, + "grad_norm": 0.7352275222706279, + "learning_rate": 5.835260914059828e-06, + "loss": 0.5759, + "step": 7271 + }, + { + "epoch": 0.93, + "grad_norm": 0.6516485592214992, + "learning_rate": 5.8342438058688554e-06, + "loss": 0.5103, + "step": 7272 + }, + { + "epoch": 0.93, + "grad_norm": 0.8861948647087583, + "learning_rate": 5.83322666216679e-06, + "loss": 0.5509, + "step": 7273 + }, + { + "epoch": 0.93, + "grad_norm": 1.1264271629169584, + "learning_rate": 5.832209482996927e-06, + "loss": 0.6502, + "step": 7274 + }, + { + "epoch": 0.93, + "grad_norm": 0.6007806751201401, + "learning_rate": 5.8311922684025665e-06, + "loss": 0.5171, + "step": 7275 + }, + { + "epoch": 0.93, + "grad_norm": 0.5815624149811355, + "learning_rate": 5.830175018427007e-06, + "loss": 0.5103, + "step": 7276 + }, + { + "epoch": 0.93, + "grad_norm": 0.746432587254956, + "learning_rate": 5.829157733113551e-06, + "loss": 0.6755, + "step": 7277 + }, + { + "epoch": 0.93, + "grad_norm": 0.5971462350187131, + "learning_rate": 5.828140412505499e-06, + "loss": 0.4912, + "step": 7278 + }, + { + "epoch": 0.93, + "grad_norm": 0.6938170524870781, + "learning_rate": 5.827123056646156e-06, + "loss": 0.5943, + "step": 7279 + }, + { + "epoch": 0.93, + "grad_norm": 0.6160802896861488, + "learning_rate": 5.826105665578827e-06, + "loss": 0.4987, + "step": 7280 + }, + { + "epoch": 0.93, + "grad_norm": 0.6498898330890953, + "learning_rate": 5.82508823934682e-06, + "loss": 0.5708, + "step": 7281 + }, + { + "epoch": 0.93, + "grad_norm": 0.7738949323289743, + "learning_rate": 5.8240707779934435e-06, + "loss": 0.5498, + "step": 7282 + }, + { + "epoch": 0.93, + "grad_norm": 0.7765301423890085, + "learning_rate": 5.823053281562008e-06, + "loss": 0.623, + "step": 7283 + }, + { + "epoch": 0.93, + "grad_norm": 0.7132515388069783, + "learning_rate": 5.822035750095824e-06, + "loss": 0.6325, + "step": 7284 + }, + { + "epoch": 0.93, + "grad_norm": 0.7143228425316213, + "learning_rate": 5.821018183638204e-06, + "loss": 0.5357, + "step": 7285 + }, + { + "epoch": 0.93, + "grad_norm": 0.5636076813919914, + "learning_rate": 5.820000582232465e-06, + "loss": 0.4801, + "step": 7286 + }, + { + "epoch": 0.93, + "grad_norm": 0.794985452295735, + "learning_rate": 5.818982945921921e-06, + "loss": 0.5896, + "step": 7287 + }, + { + "epoch": 0.93, + "grad_norm": 0.6115182909504777, + "learning_rate": 5.8179652747498885e-06, + "loss": 0.4978, + "step": 7288 + }, + { + "epoch": 0.93, + "grad_norm": 0.7767621973329313, + "learning_rate": 5.8169475687596885e-06, + "loss": 0.6628, + "step": 7289 + }, + { + "epoch": 0.93, + "grad_norm": 0.6632843083244028, + "learning_rate": 5.8159298279946415e-06, + "loss": 0.5125, + "step": 7290 + }, + { + "epoch": 0.93, + "grad_norm": 0.5658385117973802, + "learning_rate": 5.814912052498071e-06, + "loss": 0.4695, + "step": 7291 + }, + { + "epoch": 0.93, + "grad_norm": 0.5294647688395007, + "learning_rate": 5.813894242313297e-06, + "loss": 0.5064, + "step": 7292 + }, + { + "epoch": 0.93, + "grad_norm": 0.5538328673703952, + "learning_rate": 5.812876397483645e-06, + "loss": 0.4584, + "step": 7293 + }, + { + "epoch": 0.93, + "grad_norm": 0.6773857563583625, + "learning_rate": 5.811858518052445e-06, + "loss": 0.5655, + "step": 7294 + }, + { + "epoch": 0.93, + "grad_norm": 0.8299489916019177, + "learning_rate": 5.810840604063019e-06, + "loss": 0.5688, + "step": 7295 + }, + { + "epoch": 0.93, + "grad_norm": 0.8512504343072516, + "learning_rate": 5.809822655558701e-06, + "loss": 0.6078, + "step": 7296 + }, + { + "epoch": 0.93, + "grad_norm": 0.6345573429900008, + "learning_rate": 5.808804672582821e-06, + "loss": 0.5575, + "step": 7297 + }, + { + "epoch": 0.93, + "grad_norm": 0.5266777328024713, + "learning_rate": 5.80778665517871e-06, + "loss": 0.4691, + "step": 7298 + }, + { + "epoch": 0.93, + "grad_norm": 0.7904841558684887, + "learning_rate": 5.806768603389703e-06, + "loss": 0.6036, + "step": 7299 + }, + { + "epoch": 0.93, + "grad_norm": 0.610487168331742, + "learning_rate": 5.805750517259134e-06, + "loss": 0.5656, + "step": 7300 + }, + { + "epoch": 0.93, + "grad_norm": 0.777017542457991, + "learning_rate": 5.8047323968303395e-06, + "loss": 0.571, + "step": 7301 + }, + { + "epoch": 0.93, + "grad_norm": 0.5665595055161324, + "learning_rate": 5.803714242146659e-06, + "loss": 0.464, + "step": 7302 + }, + { + "epoch": 0.93, + "grad_norm": 0.6278771042895364, + "learning_rate": 5.802696053251432e-06, + "loss": 0.555, + "step": 7303 + }, + { + "epoch": 0.93, + "grad_norm": 0.6927763676822408, + "learning_rate": 5.801677830187999e-06, + "loss": 0.6089, + "step": 7304 + }, + { + "epoch": 0.93, + "grad_norm": 0.6001409500907885, + "learning_rate": 5.800659572999703e-06, + "loss": 0.5146, + "step": 7305 + }, + { + "epoch": 0.93, + "grad_norm": 0.6843889087530002, + "learning_rate": 5.799641281729887e-06, + "loss": 0.5852, + "step": 7306 + }, + { + "epoch": 0.93, + "grad_norm": 0.6261159346037729, + "learning_rate": 5.798622956421897e-06, + "loss": 0.5592, + "step": 7307 + }, + { + "epoch": 0.93, + "grad_norm": 0.710718794672907, + "learning_rate": 5.79760459711908e-06, + "loss": 0.6004, + "step": 7308 + }, + { + "epoch": 0.93, + "grad_norm": 0.5953656751254692, + "learning_rate": 5.796586203864784e-06, + "loss": 0.5419, + "step": 7309 + }, + { + "epoch": 0.93, + "grad_norm": 0.5661716398805003, + "learning_rate": 5.795567776702358e-06, + "loss": 0.5404, + "step": 7310 + }, + { + "epoch": 0.93, + "grad_norm": 0.7715856442709891, + "learning_rate": 5.794549315675155e-06, + "loss": 0.6305, + "step": 7311 + }, + { + "epoch": 0.93, + "grad_norm": 0.6470501375160036, + "learning_rate": 5.793530820826526e-06, + "loss": 0.5366, + "step": 7312 + }, + { + "epoch": 0.93, + "grad_norm": 0.6474857512999157, + "learning_rate": 5.792512292199825e-06, + "loss": 0.5427, + "step": 7313 + }, + { + "epoch": 0.93, + "grad_norm": 0.558137666083832, + "learning_rate": 5.79149372983841e-06, + "loss": 0.4815, + "step": 7314 + }, + { + "epoch": 0.93, + "grad_norm": 0.6000185161865772, + "learning_rate": 5.790475133785636e-06, + "loss": 0.5203, + "step": 7315 + }, + { + "epoch": 0.93, + "grad_norm": 0.8911390778156995, + "learning_rate": 5.789456504084861e-06, + "loss": 0.543, + "step": 7316 + }, + { + "epoch": 0.93, + "grad_norm": 0.7606267790585807, + "learning_rate": 5.788437840779445e-06, + "loss": 0.5925, + "step": 7317 + }, + { + "epoch": 0.93, + "grad_norm": 0.7153722157555635, + "learning_rate": 5.78741914391275e-06, + "loss": 0.6094, + "step": 7318 + }, + { + "epoch": 0.93, + "grad_norm": 0.8110151344299823, + "learning_rate": 5.786400413528137e-06, + "loss": 0.5681, + "step": 7319 + }, + { + "epoch": 0.93, + "grad_norm": 0.9068505354730736, + "learning_rate": 5.785381649668973e-06, + "loss": 0.6366, + "step": 7320 + }, + { + "epoch": 0.93, + "grad_norm": 0.7482058628075415, + "learning_rate": 5.7843628523786224e-06, + "loss": 0.6049, + "step": 7321 + }, + { + "epoch": 0.93, + "grad_norm": 0.7868115218222387, + "learning_rate": 5.78334402170045e-06, + "loss": 0.7053, + "step": 7322 + }, + { + "epoch": 0.93, + "grad_norm": 0.5478806940317313, + "learning_rate": 5.782325157677827e-06, + "loss": 0.5363, + "step": 7323 + }, + { + "epoch": 0.93, + "grad_norm": 0.6553736676047508, + "learning_rate": 5.781306260354121e-06, + "loss": 0.4501, + "step": 7324 + }, + { + "epoch": 0.93, + "grad_norm": 0.6139132501769308, + "learning_rate": 5.780287329772705e-06, + "loss": 0.5402, + "step": 7325 + }, + { + "epoch": 0.93, + "grad_norm": 0.6355656960870817, + "learning_rate": 5.77926836597695e-06, + "loss": 0.4826, + "step": 7326 + }, + { + "epoch": 0.93, + "grad_norm": 0.7906631153158185, + "learning_rate": 5.778249369010231e-06, + "loss": 0.6307, + "step": 7327 + }, + { + "epoch": 0.93, + "grad_norm": 0.6654108367236921, + "learning_rate": 5.777230338915925e-06, + "loss": 0.5398, + "step": 7328 + }, + { + "epoch": 0.93, + "grad_norm": 0.7802883108848935, + "learning_rate": 5.776211275737404e-06, + "loss": 0.6146, + "step": 7329 + }, + { + "epoch": 0.93, + "grad_norm": 0.8730210217566167, + "learning_rate": 5.775192179518052e-06, + "loss": 0.6861, + "step": 7330 + }, + { + "epoch": 0.93, + "grad_norm": 0.6642755256253636, + "learning_rate": 5.774173050301246e-06, + "loss": 0.5598, + "step": 7331 + }, + { + "epoch": 0.93, + "grad_norm": 0.6828931354082131, + "learning_rate": 5.773153888130365e-06, + "loss": 0.5477, + "step": 7332 + }, + { + "epoch": 0.93, + "grad_norm": 0.8504980540566135, + "learning_rate": 5.772134693048796e-06, + "loss": 0.6737, + "step": 7333 + }, + { + "epoch": 0.93, + "grad_norm": 0.8626703537939912, + "learning_rate": 5.771115465099919e-06, + "loss": 0.6658, + "step": 7334 + }, + { + "epoch": 0.93, + "grad_norm": 0.7064141432567075, + "learning_rate": 5.770096204327121e-06, + "loss": 0.5683, + "step": 7335 + }, + { + "epoch": 0.93, + "grad_norm": 0.7370490035885552, + "learning_rate": 5.76907691077379e-06, + "loss": 0.5936, + "step": 7336 + }, + { + "epoch": 0.93, + "grad_norm": 0.7928807843531035, + "learning_rate": 5.768057584483311e-06, + "loss": 0.6099, + "step": 7337 + }, + { + "epoch": 0.93, + "grad_norm": 0.5556098517357636, + "learning_rate": 5.767038225499075e-06, + "loss": 0.4458, + "step": 7338 + }, + { + "epoch": 0.93, + "grad_norm": 0.5269786739704576, + "learning_rate": 5.766018833864474e-06, + "loss": 0.4881, + "step": 7339 + }, + { + "epoch": 0.94, + "grad_norm": 0.6518402603283743, + "learning_rate": 5.764999409622899e-06, + "loss": 0.593, + "step": 7340 + }, + { + "epoch": 0.94, + "grad_norm": 0.8607622627960236, + "learning_rate": 5.763979952817742e-06, + "loss": 0.6132, + "step": 7341 + }, + { + "epoch": 0.94, + "grad_norm": 0.5550745442235996, + "learning_rate": 5.762960463492402e-06, + "loss": 0.4642, + "step": 7342 + }, + { + "epoch": 0.94, + "grad_norm": 0.6982318183972722, + "learning_rate": 5.761940941690271e-06, + "loss": 0.5902, + "step": 7343 + }, + { + "epoch": 0.94, + "grad_norm": 0.6883622646587318, + "learning_rate": 5.76092138745475e-06, + "loss": 0.517, + "step": 7344 + }, + { + "epoch": 0.94, + "grad_norm": 0.6213778884618992, + "learning_rate": 5.759901800829236e-06, + "loss": 0.5233, + "step": 7345 + }, + { + "epoch": 0.94, + "grad_norm": 0.6308561300454176, + "learning_rate": 5.758882181857132e-06, + "loss": 0.5012, + "step": 7346 + }, + { + "epoch": 0.94, + "grad_norm": 0.6492288607943988, + "learning_rate": 5.757862530581838e-06, + "loss": 0.5539, + "step": 7347 + }, + { + "epoch": 0.94, + "grad_norm": 0.5186728290297209, + "learning_rate": 5.7568428470467585e-06, + "loss": 0.5327, + "step": 7348 + }, + { + "epoch": 0.94, + "grad_norm": 0.6570017285976666, + "learning_rate": 5.755823131295297e-06, + "loss": 0.5791, + "step": 7349 + }, + { + "epoch": 0.94, + "grad_norm": 0.8145197848347757, + "learning_rate": 5.7548033833708594e-06, + "loss": 0.6382, + "step": 7350 + }, + { + "epoch": 0.94, + "grad_norm": 0.6820084692761272, + "learning_rate": 5.753783603316854e-06, + "loss": 0.5583, + "step": 7351 + }, + { + "epoch": 0.94, + "grad_norm": 0.6332458317116423, + "learning_rate": 5.75276379117669e-06, + "loss": 0.5421, + "step": 7352 + }, + { + "epoch": 0.94, + "grad_norm": 0.6461586618587962, + "learning_rate": 5.7517439469937775e-06, + "loss": 0.5205, + "step": 7353 + }, + { + "epoch": 0.94, + "grad_norm": 0.6096986912375691, + "learning_rate": 5.750724070811526e-06, + "loss": 0.5177, + "step": 7354 + }, + { + "epoch": 0.94, + "grad_norm": 0.7751609533970542, + "learning_rate": 5.74970416267335e-06, + "loss": 0.6546, + "step": 7355 + }, + { + "epoch": 0.94, + "grad_norm": 0.5793601989944073, + "learning_rate": 5.748684222622664e-06, + "loss": 0.476, + "step": 7356 + }, + { + "epoch": 0.94, + "grad_norm": 0.636761912942386, + "learning_rate": 5.747664250702882e-06, + "loss": 0.5679, + "step": 7357 + }, + { + "epoch": 0.94, + "grad_norm": 0.5912910977846981, + "learning_rate": 5.746644246957423e-06, + "loss": 0.4883, + "step": 7358 + }, + { + "epoch": 0.94, + "grad_norm": 0.8115293024724972, + "learning_rate": 5.745624211429705e-06, + "loss": 0.5448, + "step": 7359 + }, + { + "epoch": 0.94, + "grad_norm": 0.6331336748521982, + "learning_rate": 5.744604144163146e-06, + "loss": 0.5274, + "step": 7360 + }, + { + "epoch": 0.94, + "grad_norm": 0.6858756021816557, + "learning_rate": 5.7435840452011695e-06, + "loss": 0.548, + "step": 7361 + }, + { + "epoch": 0.94, + "grad_norm": 0.5814533625575896, + "learning_rate": 5.742563914587195e-06, + "loss": 0.4806, + "step": 7362 + }, + { + "epoch": 0.94, + "grad_norm": 0.8448686031097746, + "learning_rate": 5.741543752364646e-06, + "loss": 0.604, + "step": 7363 + }, + { + "epoch": 0.94, + "grad_norm": 0.7131375390963478, + "learning_rate": 5.740523558576951e-06, + "loss": 0.6207, + "step": 7364 + }, + { + "epoch": 0.94, + "grad_norm": 0.6041193615922887, + "learning_rate": 5.739503333267535e-06, + "loss": 0.5577, + "step": 7365 + }, + { + "epoch": 0.94, + "grad_norm": 0.6476494627256795, + "learning_rate": 5.738483076479825e-06, + "loss": 0.5687, + "step": 7366 + }, + { + "epoch": 0.94, + "grad_norm": 0.5732051931699436, + "learning_rate": 5.73746278825725e-06, + "loss": 0.5384, + "step": 7367 + }, + { + "epoch": 0.94, + "grad_norm": 0.7014535737222668, + "learning_rate": 5.73644246864324e-06, + "loss": 0.5415, + "step": 7368 + }, + { + "epoch": 0.94, + "grad_norm": 0.890259532896133, + "learning_rate": 5.735422117681228e-06, + "loss": 0.6049, + "step": 7369 + }, + { + "epoch": 0.94, + "grad_norm": 0.962262608558948, + "learning_rate": 5.734401735414646e-06, + "loss": 0.6252, + "step": 7370 + }, + { + "epoch": 0.94, + "grad_norm": 0.6922243929075977, + "learning_rate": 5.733381321886929e-06, + "loss": 0.6154, + "step": 7371 + }, + { + "epoch": 0.94, + "grad_norm": 0.8310460094501119, + "learning_rate": 5.732360877141514e-06, + "loss": 0.5878, + "step": 7372 + }, + { + "epoch": 0.94, + "grad_norm": 0.6793760578554305, + "learning_rate": 5.731340401221835e-06, + "loss": 0.595, + "step": 7373 + }, + { + "epoch": 0.94, + "grad_norm": 0.6752946060588411, + "learning_rate": 5.730319894171335e-06, + "loss": 0.5282, + "step": 7374 + }, + { + "epoch": 0.94, + "grad_norm": 0.6121113321309787, + "learning_rate": 5.729299356033446e-06, + "loss": 0.5057, + "step": 7375 + }, + { + "epoch": 0.94, + "grad_norm": 0.6289692836393576, + "learning_rate": 5.728278786851618e-06, + "loss": 0.5292, + "step": 7376 + }, + { + "epoch": 0.94, + "grad_norm": 0.5610746626640373, + "learning_rate": 5.727258186669288e-06, + "loss": 0.4688, + "step": 7377 + }, + { + "epoch": 0.94, + "grad_norm": 0.627680869265941, + "learning_rate": 5.726237555529901e-06, + "loss": 0.5053, + "step": 7378 + }, + { + "epoch": 0.94, + "grad_norm": 0.6137069671782773, + "learning_rate": 5.7252168934769024e-06, + "loss": 0.501, + "step": 7379 + }, + { + "epoch": 0.94, + "grad_norm": 0.7840440910327866, + "learning_rate": 5.724196200553738e-06, + "loss": 0.5845, + "step": 7380 + }, + { + "epoch": 0.94, + "grad_norm": 0.7814997338324008, + "learning_rate": 5.723175476803854e-06, + "loss": 0.6358, + "step": 7381 + }, + { + "epoch": 0.94, + "grad_norm": 0.8715585755067938, + "learning_rate": 5.722154722270703e-06, + "loss": 0.6125, + "step": 7382 + }, + { + "epoch": 0.94, + "grad_norm": 0.6487162391170347, + "learning_rate": 5.721133936997732e-06, + "loss": 0.5712, + "step": 7383 + }, + { + "epoch": 0.94, + "grad_norm": 0.541181427380261, + "learning_rate": 5.720113121028394e-06, + "loss": 0.4945, + "step": 7384 + }, + { + "epoch": 0.94, + "grad_norm": 0.645279531852808, + "learning_rate": 5.719092274406142e-06, + "loss": 0.5564, + "step": 7385 + }, + { + "epoch": 0.94, + "grad_norm": 0.8028404859534641, + "learning_rate": 5.718071397174429e-06, + "loss": 0.606, + "step": 7386 + }, + { + "epoch": 0.94, + "grad_norm": 0.6000211867106248, + "learning_rate": 5.717050489376712e-06, + "loss": 0.4833, + "step": 7387 + }, + { + "epoch": 0.94, + "grad_norm": 0.6481284086082985, + "learning_rate": 5.7160295510564456e-06, + "loss": 0.4924, + "step": 7388 + }, + { + "epoch": 0.94, + "grad_norm": 0.6870893518460691, + "learning_rate": 5.715008582257091e-06, + "loss": 0.5259, + "step": 7389 + }, + { + "epoch": 0.94, + "grad_norm": 0.5624763214807417, + "learning_rate": 5.713987583022106e-06, + "loss": 0.4652, + "step": 7390 + }, + { + "epoch": 0.94, + "grad_norm": 0.7734190247354849, + "learning_rate": 5.71296655339495e-06, + "loss": 0.5363, + "step": 7391 + }, + { + "epoch": 0.94, + "grad_norm": 0.6086705751030543, + "learning_rate": 5.7119454934190866e-06, + "loss": 0.564, + "step": 7392 + }, + { + "epoch": 0.94, + "grad_norm": 0.6276722535599929, + "learning_rate": 5.710924403137979e-06, + "loss": 0.5594, + "step": 7393 + }, + { + "epoch": 0.94, + "grad_norm": 0.7243127724305656, + "learning_rate": 5.70990328259509e-06, + "loss": 0.6064, + "step": 7394 + }, + { + "epoch": 0.94, + "grad_norm": 0.6605308434412986, + "learning_rate": 5.708882131833888e-06, + "loss": 0.5117, + "step": 7395 + }, + { + "epoch": 0.94, + "grad_norm": 1.0070714308613156, + "learning_rate": 5.707860950897839e-06, + "loss": 0.531, + "step": 7396 + }, + { + "epoch": 0.94, + "grad_norm": 0.7500848913152038, + "learning_rate": 5.706839739830411e-06, + "loss": 0.5478, + "step": 7397 + }, + { + "epoch": 0.94, + "grad_norm": 0.5618937124625564, + "learning_rate": 5.705818498675074e-06, + "loss": 0.4884, + "step": 7398 + }, + { + "epoch": 0.94, + "grad_norm": 0.6870789565856263, + "learning_rate": 5.704797227475299e-06, + "loss": 0.4901, + "step": 7399 + }, + { + "epoch": 0.94, + "grad_norm": 0.5456379918592325, + "learning_rate": 5.703775926274559e-06, + "loss": 0.5161, + "step": 7400 + }, + { + "epoch": 0.94, + "grad_norm": 0.7181685879281244, + "learning_rate": 5.702754595116325e-06, + "loss": 0.6051, + "step": 7401 + }, + { + "epoch": 0.94, + "grad_norm": 0.6961210746654559, + "learning_rate": 5.701733234044075e-06, + "loss": 0.5463, + "step": 7402 + }, + { + "epoch": 0.94, + "grad_norm": 0.6056251106359543, + "learning_rate": 5.700711843101283e-06, + "loss": 0.5379, + "step": 7403 + }, + { + "epoch": 0.94, + "grad_norm": 0.5872243201901712, + "learning_rate": 5.699690422331426e-06, + "loss": 0.5413, + "step": 7404 + }, + { + "epoch": 0.94, + "grad_norm": 0.5540002123578797, + "learning_rate": 5.698668971777985e-06, + "loss": 0.503, + "step": 7405 + }, + { + "epoch": 0.94, + "grad_norm": 0.8072134370319362, + "learning_rate": 5.697647491484439e-06, + "loss": 0.6224, + "step": 7406 + }, + { + "epoch": 0.94, + "grad_norm": 0.5572745526912098, + "learning_rate": 5.696625981494268e-06, + "loss": 0.4906, + "step": 7407 + }, + { + "epoch": 0.94, + "grad_norm": 0.6504237339968888, + "learning_rate": 5.695604441850955e-06, + "loss": 0.5197, + "step": 7408 + }, + { + "epoch": 0.94, + "grad_norm": 0.5274329768422829, + "learning_rate": 5.694582872597984e-06, + "loss": 0.4954, + "step": 7409 + }, + { + "epoch": 0.94, + "grad_norm": 0.928078603726346, + "learning_rate": 5.69356127377884e-06, + "loss": 0.6473, + "step": 7410 + }, + { + "epoch": 0.94, + "grad_norm": 0.9812196709788339, + "learning_rate": 5.692539645437009e-06, + "loss": 0.6126, + "step": 7411 + }, + { + "epoch": 0.94, + "grad_norm": 0.7582274940934607, + "learning_rate": 5.691517987615976e-06, + "loss": 0.6435, + "step": 7412 + }, + { + "epoch": 0.94, + "grad_norm": 0.5944706589682655, + "learning_rate": 5.690496300359234e-06, + "loss": 0.5122, + "step": 7413 + }, + { + "epoch": 0.94, + "grad_norm": 0.6105810228456057, + "learning_rate": 5.68947458371027e-06, + "loss": 0.5468, + "step": 7414 + }, + { + "epoch": 0.94, + "grad_norm": 0.65134934033479, + "learning_rate": 5.688452837712577e-06, + "loss": 0.4929, + "step": 7415 + }, + { + "epoch": 0.94, + "grad_norm": 0.9131277089851544, + "learning_rate": 5.687431062409647e-06, + "loss": 0.5851, + "step": 7416 + }, + { + "epoch": 0.94, + "grad_norm": 0.8079190453130916, + "learning_rate": 5.686409257844973e-06, + "loss": 0.5972, + "step": 7417 + }, + { + "epoch": 0.95, + "grad_norm": 0.8850021987073383, + "learning_rate": 5.685387424062051e-06, + "loss": 0.601, + "step": 7418 + }, + { + "epoch": 0.95, + "grad_norm": 0.7905063975038278, + "learning_rate": 5.684365561104375e-06, + "loss": 0.6438, + "step": 7419 + }, + { + "epoch": 0.95, + "grad_norm": 1.8213264804998532, + "learning_rate": 5.683343669015444e-06, + "loss": 0.5828, + "step": 7420 + }, + { + "epoch": 0.95, + "grad_norm": 0.6870240698263178, + "learning_rate": 5.682321747838758e-06, + "loss": 0.5946, + "step": 7421 + }, + { + "epoch": 0.95, + "grad_norm": 0.6722639333025503, + "learning_rate": 5.681299797617815e-06, + "loss": 0.5828, + "step": 7422 + }, + { + "epoch": 0.95, + "grad_norm": 0.6818675552627741, + "learning_rate": 5.680277818396117e-06, + "loss": 0.5748, + "step": 7423 + }, + { + "epoch": 0.95, + "grad_norm": 0.6275135559303036, + "learning_rate": 5.679255810217167e-06, + "loss": 0.4726, + "step": 7424 + }, + { + "epoch": 0.95, + "grad_norm": 0.8163185683563445, + "learning_rate": 5.678233773124465e-06, + "loss": 0.6493, + "step": 7425 + }, + { + "epoch": 0.95, + "grad_norm": 0.7144144806646856, + "learning_rate": 5.6772117071615206e-06, + "loss": 0.5964, + "step": 7426 + }, + { + "epoch": 0.95, + "grad_norm": 0.7516934859178865, + "learning_rate": 5.676189612371837e-06, + "loss": 0.6067, + "step": 7427 + }, + { + "epoch": 0.95, + "grad_norm": 0.5715928842235714, + "learning_rate": 5.675167488798924e-06, + "loss": 0.5164, + "step": 7428 + }, + { + "epoch": 0.95, + "grad_norm": 0.8984798482547672, + "learning_rate": 5.674145336486287e-06, + "loss": 0.6277, + "step": 7429 + }, + { + "epoch": 0.95, + "grad_norm": 0.9353574284200804, + "learning_rate": 5.673123155477438e-06, + "loss": 0.6345, + "step": 7430 + }, + { + "epoch": 0.95, + "grad_norm": 0.6148041296681221, + "learning_rate": 5.672100945815887e-06, + "loss": 0.5372, + "step": 7431 + }, + { + "epoch": 0.95, + "grad_norm": 0.9274117995181222, + "learning_rate": 5.671078707545147e-06, + "loss": 0.6141, + "step": 7432 + }, + { + "epoch": 0.95, + "grad_norm": 1.2818584754791995, + "learning_rate": 5.67005644070873e-06, + "loss": 0.5977, + "step": 7433 + }, + { + "epoch": 0.95, + "grad_norm": 0.5917544842531124, + "learning_rate": 5.6690341453501515e-06, + "loss": 0.5296, + "step": 7434 + }, + { + "epoch": 0.95, + "grad_norm": 0.6038382946577495, + "learning_rate": 5.668011821512929e-06, + "loss": 0.4771, + "step": 7435 + }, + { + "epoch": 0.95, + "grad_norm": 0.5992428069467104, + "learning_rate": 5.666989469240576e-06, + "loss": 0.5612, + "step": 7436 + }, + { + "epoch": 0.95, + "grad_norm": 0.5447866721389806, + "learning_rate": 5.665967088576613e-06, + "loss": 0.4725, + "step": 7437 + }, + { + "epoch": 0.95, + "grad_norm": 0.7557642963332849, + "learning_rate": 5.664944679564559e-06, + "loss": 0.5562, + "step": 7438 + }, + { + "epoch": 0.95, + "grad_norm": 0.9271305379046939, + "learning_rate": 5.663922242247936e-06, + "loss": 0.6213, + "step": 7439 + }, + { + "epoch": 0.95, + "grad_norm": 0.5811107774774571, + "learning_rate": 5.6628997766702644e-06, + "loss": 0.512, + "step": 7440 + }, + { + "epoch": 0.95, + "grad_norm": 0.6768434141643868, + "learning_rate": 5.6618772828750675e-06, + "loss": 0.5404, + "step": 7441 + }, + { + "epoch": 0.95, + "grad_norm": 0.5977941622466924, + "learning_rate": 5.660854760905869e-06, + "loss": 0.5048, + "step": 7442 + }, + { + "epoch": 0.95, + "grad_norm": 0.763565937469353, + "learning_rate": 5.659832210806195e-06, + "loss": 0.6167, + "step": 7443 + }, + { + "epoch": 0.95, + "grad_norm": 0.8373060044331139, + "learning_rate": 5.6588096326195726e-06, + "loss": 0.6458, + "step": 7444 + }, + { + "epoch": 0.95, + "grad_norm": 0.6689128337283314, + "learning_rate": 5.6577870263895306e-06, + "loss": 0.4912, + "step": 7445 + }, + { + "epoch": 0.95, + "grad_norm": 0.7595481738334218, + "learning_rate": 5.6567643921595965e-06, + "loss": 0.6323, + "step": 7446 + }, + { + "epoch": 0.95, + "grad_norm": 0.5712844219189369, + "learning_rate": 5.655741729973301e-06, + "loss": 0.542, + "step": 7447 + }, + { + "epoch": 0.95, + "grad_norm": 0.5786996986577434, + "learning_rate": 5.654719039874175e-06, + "loss": 0.4855, + "step": 7448 + }, + { + "epoch": 0.95, + "grad_norm": 0.7366752618305401, + "learning_rate": 5.653696321905752e-06, + "loss": 0.6589, + "step": 7449 + }, + { + "epoch": 0.95, + "grad_norm": 0.5312668299918883, + "learning_rate": 5.6526735761115635e-06, + "loss": 0.5062, + "step": 7450 + }, + { + "epoch": 0.95, + "grad_norm": 0.8934322478513086, + "learning_rate": 5.651650802535149e-06, + "loss": 0.6418, + "step": 7451 + }, + { + "epoch": 0.95, + "grad_norm": 0.6447074771875847, + "learning_rate": 5.650628001220041e-06, + "loss": 0.5331, + "step": 7452 + }, + { + "epoch": 0.95, + "grad_norm": 0.7473002167035475, + "learning_rate": 5.6496051722097785e-06, + "loss": 0.5632, + "step": 7453 + }, + { + "epoch": 0.95, + "grad_norm": 0.723231306865618, + "learning_rate": 5.648582315547901e-06, + "loss": 0.5889, + "step": 7454 + }, + { + "epoch": 0.95, + "grad_norm": 0.7775924834948316, + "learning_rate": 5.647559431277944e-06, + "loss": 0.5167, + "step": 7455 + }, + { + "epoch": 0.95, + "grad_norm": 0.7407817777605376, + "learning_rate": 5.646536519443453e-06, + "loss": 0.5996, + "step": 7456 + }, + { + "epoch": 0.95, + "grad_norm": 0.6986294728385855, + "learning_rate": 5.645513580087968e-06, + "loss": 0.6367, + "step": 7457 + }, + { + "epoch": 0.95, + "grad_norm": 0.597406972525196, + "learning_rate": 5.644490613255034e-06, + "loss": 0.5207, + "step": 7458 + }, + { + "epoch": 0.95, + "grad_norm": 0.711716046917632, + "learning_rate": 5.643467618988192e-06, + "loss": 0.627, + "step": 7459 + }, + { + "epoch": 0.95, + "grad_norm": 0.8041144330129631, + "learning_rate": 5.642444597330992e-06, + "loss": 0.5939, + "step": 7460 + }, + { + "epoch": 0.95, + "grad_norm": 0.7501084350677372, + "learning_rate": 5.6414215483269764e-06, + "loss": 0.5278, + "step": 7461 + }, + { + "epoch": 0.95, + "grad_norm": 0.6171344466764241, + "learning_rate": 5.640398472019697e-06, + "loss": 0.5095, + "step": 7462 + }, + { + "epoch": 0.95, + "grad_norm": 0.6353046548901394, + "learning_rate": 5.6393753684526995e-06, + "loss": 0.5005, + "step": 7463 + }, + { + "epoch": 0.95, + "grad_norm": 0.7233307148465343, + "learning_rate": 5.638352237669537e-06, + "loss": 0.5639, + "step": 7464 + }, + { + "epoch": 0.95, + "grad_norm": 0.6222856119738878, + "learning_rate": 5.637329079713758e-06, + "loss": 0.4991, + "step": 7465 + }, + { + "epoch": 0.95, + "grad_norm": 0.5494478442453682, + "learning_rate": 5.636305894628917e-06, + "loss": 0.4983, + "step": 7466 + }, + { + "epoch": 0.95, + "grad_norm": 0.9094628837538682, + "learning_rate": 5.635282682458568e-06, + "loss": 0.6231, + "step": 7467 + }, + { + "epoch": 0.95, + "grad_norm": 0.571839861934092, + "learning_rate": 5.634259443246266e-06, + "loss": 0.5163, + "step": 7468 + }, + { + "epoch": 0.95, + "grad_norm": 0.7829450278475736, + "learning_rate": 5.633236177035566e-06, + "loss": 0.6332, + "step": 7469 + }, + { + "epoch": 0.95, + "grad_norm": 0.9182791455086887, + "learning_rate": 5.632212883870024e-06, + "loss": 0.6257, + "step": 7470 + }, + { + "epoch": 0.95, + "grad_norm": 0.6367248681491957, + "learning_rate": 5.631189563793201e-06, + "loss": 0.5602, + "step": 7471 + }, + { + "epoch": 0.95, + "grad_norm": 0.592248131703522, + "learning_rate": 5.630166216848656e-06, + "loss": 0.5561, + "step": 7472 + }, + { + "epoch": 0.95, + "grad_norm": 0.6855158527421498, + "learning_rate": 5.629142843079948e-06, + "loss": 0.6157, + "step": 7473 + }, + { + "epoch": 0.95, + "grad_norm": 0.6711081524201122, + "learning_rate": 5.6281194425306386e-06, + "loss": 0.5747, + "step": 7474 + }, + { + "epoch": 0.95, + "grad_norm": 0.9488468902943121, + "learning_rate": 5.627096015244292e-06, + "loss": 0.5873, + "step": 7475 + }, + { + "epoch": 0.95, + "grad_norm": 0.5924103778816938, + "learning_rate": 5.626072561264473e-06, + "loss": 0.5409, + "step": 7476 + }, + { + "epoch": 0.95, + "grad_norm": 0.6976615426673333, + "learning_rate": 5.625049080634746e-06, + "loss": 0.5064, + "step": 7477 + }, + { + "epoch": 0.95, + "grad_norm": 0.7410077206632754, + "learning_rate": 5.624025573398676e-06, + "loss": 0.564, + "step": 7478 + }, + { + "epoch": 0.95, + "grad_norm": 0.7758628136137053, + "learning_rate": 5.623002039599832e-06, + "loss": 0.5449, + "step": 7479 + }, + { + "epoch": 0.95, + "grad_norm": 0.56500355210866, + "learning_rate": 5.6219784792817825e-06, + "loss": 0.536, + "step": 7480 + }, + { + "epoch": 0.95, + "grad_norm": 0.8655068984818446, + "learning_rate": 5.6209548924880954e-06, + "loss": 0.6632, + "step": 7481 + }, + { + "epoch": 0.95, + "grad_norm": 0.6461287796399382, + "learning_rate": 5.619931279262343e-06, + "loss": 0.5459, + "step": 7482 + }, + { + "epoch": 0.95, + "grad_norm": 0.7727957998907783, + "learning_rate": 5.618907639648098e-06, + "loss": 0.6538, + "step": 7483 + }, + { + "epoch": 0.95, + "grad_norm": 0.8576024726657329, + "learning_rate": 5.617883973688931e-06, + "loss": 0.6779, + "step": 7484 + }, + { + "epoch": 0.95, + "grad_norm": 0.5699404090759821, + "learning_rate": 5.61686028142842e-06, + "loss": 0.5295, + "step": 7485 + }, + { + "epoch": 0.95, + "grad_norm": 0.6045262952351791, + "learning_rate": 5.615836562910136e-06, + "loss": 0.4957, + "step": 7486 + }, + { + "epoch": 0.95, + "grad_norm": 0.6796026296394505, + "learning_rate": 5.614812818177657e-06, + "loss": 0.558, + "step": 7487 + }, + { + "epoch": 0.95, + "grad_norm": 0.7804945500216487, + "learning_rate": 5.6137890472745625e-06, + "loss": 0.6621, + "step": 7488 + }, + { + "epoch": 0.95, + "grad_norm": 0.9389280388329666, + "learning_rate": 5.61276525024443e-06, + "loss": 0.6651, + "step": 7489 + }, + { + "epoch": 0.95, + "grad_norm": 0.723619080503985, + "learning_rate": 5.611741427130838e-06, + "loss": 0.5922, + "step": 7490 + }, + { + "epoch": 0.95, + "grad_norm": 1.4457476941116572, + "learning_rate": 5.610717577977369e-06, + "loss": 0.6387, + "step": 7491 + }, + { + "epoch": 0.95, + "grad_norm": 0.7561310635183401, + "learning_rate": 5.609693702827605e-06, + "loss": 0.5484, + "step": 7492 + }, + { + "epoch": 0.95, + "grad_norm": 0.6382758271178987, + "learning_rate": 5.608669801725125e-06, + "loss": 0.4749, + "step": 7493 + }, + { + "epoch": 0.95, + "grad_norm": 0.659774268820042, + "learning_rate": 5.607645874713519e-06, + "loss": 0.5292, + "step": 7494 + }, + { + "epoch": 0.95, + "grad_norm": 0.6203594822082383, + "learning_rate": 5.60662192183637e-06, + "loss": 0.5286, + "step": 7495 + }, + { + "epoch": 0.95, + "grad_norm": 0.6710073324384463, + "learning_rate": 5.605597943137264e-06, + "loss": 0.5618, + "step": 7496 + }, + { + "epoch": 0.96, + "grad_norm": 0.5949885700201625, + "learning_rate": 5.6045739386597885e-06, + "loss": 0.5116, + "step": 7497 + }, + { + "epoch": 0.96, + "grad_norm": 0.6406614047485684, + "learning_rate": 5.603549908447533e-06, + "loss": 0.5583, + "step": 7498 + }, + { + "epoch": 0.96, + "grad_norm": 0.5361097158972228, + "learning_rate": 5.602525852544085e-06, + "loss": 0.4929, + "step": 7499 + }, + { + "epoch": 0.96, + "grad_norm": 0.7498696685528496, + "learning_rate": 5.6015017709930385e-06, + "loss": 0.6039, + "step": 7500 + }, + { + "epoch": 0.96, + "grad_norm": 0.8061247815788423, + "learning_rate": 5.600477663837983e-06, + "loss": 0.5824, + "step": 7501 + }, + { + "epoch": 0.96, + "grad_norm": 0.5361879481965269, + "learning_rate": 5.599453531122513e-06, + "loss": 0.5044, + "step": 7502 + }, + { + "epoch": 0.96, + "grad_norm": 0.6590883410830884, + "learning_rate": 5.5984293728902205e-06, + "loss": 0.5061, + "step": 7503 + }, + { + "epoch": 0.96, + "grad_norm": 0.6569733490302738, + "learning_rate": 5.597405189184702e-06, + "loss": 0.5126, + "step": 7504 + }, + { + "epoch": 0.96, + "grad_norm": 0.5854814873595723, + "learning_rate": 5.5963809800495535e-06, + "loss": 0.55, + "step": 7505 + }, + { + "epoch": 0.96, + "grad_norm": 0.6750277894763069, + "learning_rate": 5.595356745528374e-06, + "loss": 0.5024, + "step": 7506 + }, + { + "epoch": 0.96, + "grad_norm": 0.5940732070440541, + "learning_rate": 5.59433248566476e-06, + "loss": 0.5455, + "step": 7507 + }, + { + "epoch": 0.96, + "grad_norm": 0.6332071317074104, + "learning_rate": 5.593308200502311e-06, + "loss": 0.5383, + "step": 7508 + }, + { + "epoch": 0.96, + "grad_norm": 0.79302800554713, + "learning_rate": 5.5922838900846275e-06, + "loss": 0.5952, + "step": 7509 + }, + { + "epoch": 0.96, + "grad_norm": 0.757567081335012, + "learning_rate": 5.591259554455311e-06, + "loss": 0.6282, + "step": 7510 + }, + { + "epoch": 0.96, + "grad_norm": 0.8590191699846675, + "learning_rate": 5.590235193657965e-06, + "loss": 0.6224, + "step": 7511 + }, + { + "epoch": 0.96, + "grad_norm": 0.7424039259481556, + "learning_rate": 5.5892108077361925e-06, + "loss": 0.5549, + "step": 7512 + }, + { + "epoch": 0.96, + "grad_norm": 0.6279115641151283, + "learning_rate": 5.5881863967336e-06, + "loss": 0.547, + "step": 7513 + }, + { + "epoch": 0.96, + "grad_norm": 0.7931095274600247, + "learning_rate": 5.5871619606937925e-06, + "loss": 0.625, + "step": 7514 + }, + { + "epoch": 0.96, + "grad_norm": 0.6995865900119276, + "learning_rate": 5.586137499660376e-06, + "loss": 0.6061, + "step": 7515 + }, + { + "epoch": 0.96, + "grad_norm": 0.6616400129795555, + "learning_rate": 5.585113013676958e-06, + "loss": 0.586, + "step": 7516 + }, + { + "epoch": 0.96, + "grad_norm": 0.8003756150682517, + "learning_rate": 5.584088502787151e-06, + "loss": 0.5592, + "step": 7517 + }, + { + "epoch": 0.96, + "grad_norm": 0.7594131286486338, + "learning_rate": 5.58306396703456e-06, + "loss": 0.5948, + "step": 7518 + }, + { + "epoch": 0.96, + "grad_norm": 0.701980554897668, + "learning_rate": 5.5820394064628e-06, + "loss": 0.609, + "step": 7519 + }, + { + "epoch": 0.96, + "grad_norm": 0.6053234960436575, + "learning_rate": 5.5810148211154835e-06, + "loss": 0.4991, + "step": 7520 + }, + { + "epoch": 0.96, + "grad_norm": 0.8351394011794444, + "learning_rate": 5.579990211036223e-06, + "loss": 0.6658, + "step": 7521 + }, + { + "epoch": 0.96, + "grad_norm": 0.6514037457401015, + "learning_rate": 5.578965576268632e-06, + "loss": 0.5554, + "step": 7522 + }, + { + "epoch": 0.96, + "grad_norm": 0.6118776643235391, + "learning_rate": 5.577940916856326e-06, + "loss": 0.504, + "step": 7523 + }, + { + "epoch": 0.96, + "grad_norm": 0.6012276938343325, + "learning_rate": 5.576916232842923e-06, + "loss": 0.4909, + "step": 7524 + }, + { + "epoch": 0.96, + "grad_norm": 0.6231388550961715, + "learning_rate": 5.5758915242720395e-06, + "loss": 0.5479, + "step": 7525 + }, + { + "epoch": 0.96, + "grad_norm": 0.7024457676788421, + "learning_rate": 5.5748667911872935e-06, + "loss": 0.5909, + "step": 7526 + }, + { + "epoch": 0.96, + "grad_norm": 0.6972453079687018, + "learning_rate": 5.573842033632305e-06, + "loss": 0.6119, + "step": 7527 + }, + { + "epoch": 0.96, + "grad_norm": 0.7642496430911038, + "learning_rate": 5.572817251650695e-06, + "loss": 0.6192, + "step": 7528 + }, + { + "epoch": 0.96, + "grad_norm": 0.8392541463793172, + "learning_rate": 5.571792445286085e-06, + "loss": 0.6536, + "step": 7529 + }, + { + "epoch": 0.96, + "grad_norm": 0.5518529616944446, + "learning_rate": 5.570767614582098e-06, + "loss": 0.5169, + "step": 7530 + }, + { + "epoch": 0.96, + "grad_norm": 0.8068960512228883, + "learning_rate": 5.569742759582357e-06, + "loss": 0.6492, + "step": 7531 + }, + { + "epoch": 0.96, + "grad_norm": 0.8396541307872485, + "learning_rate": 5.568717880330489e-06, + "loss": 0.6041, + "step": 7532 + }, + { + "epoch": 0.96, + "grad_norm": 0.6797223660958093, + "learning_rate": 5.567692976870117e-06, + "loss": 0.584, + "step": 7533 + }, + { + "epoch": 0.96, + "grad_norm": 0.698770788248921, + "learning_rate": 5.56666804924487e-06, + "loss": 0.5687, + "step": 7534 + }, + { + "epoch": 0.96, + "grad_norm": 0.7028067449758266, + "learning_rate": 5.5656430974983735e-06, + "loss": 0.6244, + "step": 7535 + }, + { + "epoch": 0.96, + "grad_norm": 0.8473567542161662, + "learning_rate": 5.564618121674258e-06, + "loss": 0.5934, + "step": 7536 + }, + { + "epoch": 0.96, + "grad_norm": 0.5850820347804717, + "learning_rate": 5.563593121816155e-06, + "loss": 0.5086, + "step": 7537 + }, + { + "epoch": 0.96, + "grad_norm": 0.6427982414322996, + "learning_rate": 5.562568097967691e-06, + "loss": 0.6103, + "step": 7538 + }, + { + "epoch": 0.96, + "grad_norm": 0.6724865274200578, + "learning_rate": 5.5615430501725035e-06, + "loss": 0.5032, + "step": 7539 + }, + { + "epoch": 0.96, + "grad_norm": 0.652842206270668, + "learning_rate": 5.560517978474222e-06, + "loss": 0.4933, + "step": 7540 + }, + { + "epoch": 0.96, + "grad_norm": 0.6119030785598977, + "learning_rate": 5.5594928829164804e-06, + "loss": 0.501, + "step": 7541 + }, + { + "epoch": 0.96, + "grad_norm": 0.8612807450827021, + "learning_rate": 5.558467763542916e-06, + "loss": 0.5677, + "step": 7542 + }, + { + "epoch": 0.96, + "grad_norm": 0.5175732781417085, + "learning_rate": 5.557442620397162e-06, + "loss": 0.4908, + "step": 7543 + }, + { + "epoch": 0.96, + "grad_norm": 0.7201772136975116, + "learning_rate": 5.556417453522858e-06, + "loss": 0.5323, + "step": 7544 + }, + { + "epoch": 0.96, + "grad_norm": 0.6234394421718971, + "learning_rate": 5.55539226296364e-06, + "loss": 0.5189, + "step": 7545 + }, + { + "epoch": 0.96, + "grad_norm": 0.624635148078209, + "learning_rate": 5.554367048763149e-06, + "loss": 0.5214, + "step": 7546 + }, + { + "epoch": 0.96, + "grad_norm": 0.5900965532139819, + "learning_rate": 5.553341810965025e-06, + "loss": 0.5515, + "step": 7547 + }, + { + "epoch": 0.96, + "grad_norm": 0.7721391585343826, + "learning_rate": 5.552316549612907e-06, + "loss": 0.6673, + "step": 7548 + }, + { + "epoch": 0.96, + "grad_norm": 0.7244636016195506, + "learning_rate": 5.551291264750439e-06, + "loss": 0.5335, + "step": 7549 + }, + { + "epoch": 0.96, + "grad_norm": 0.764917421470679, + "learning_rate": 5.550265956421263e-06, + "loss": 0.6519, + "step": 7550 + }, + { + "epoch": 0.96, + "grad_norm": 0.6327877244351915, + "learning_rate": 5.549240624669025e-06, + "loss": 0.5064, + "step": 7551 + }, + { + "epoch": 0.96, + "grad_norm": 0.6637650168820303, + "learning_rate": 5.548215269537368e-06, + "loss": 0.5701, + "step": 7552 + }, + { + "epoch": 0.96, + "grad_norm": 0.6466215346331949, + "learning_rate": 5.5471898910699385e-06, + "loss": 0.5233, + "step": 7553 + }, + { + "epoch": 0.96, + "grad_norm": 0.7797419064637255, + "learning_rate": 5.546164489310384e-06, + "loss": 0.6099, + "step": 7554 + }, + { + "epoch": 0.96, + "grad_norm": 0.6021766400263207, + "learning_rate": 5.545139064302352e-06, + "loss": 0.5427, + "step": 7555 + }, + { + "epoch": 0.96, + "grad_norm": 0.640741001885347, + "learning_rate": 5.544113616089493e-06, + "loss": 0.5065, + "step": 7556 + }, + { + "epoch": 0.96, + "grad_norm": 1.2158678722939376, + "learning_rate": 5.543088144715455e-06, + "loss": 0.6242, + "step": 7557 + }, + { + "epoch": 0.96, + "grad_norm": 0.9070171850791727, + "learning_rate": 5.542062650223892e-06, + "loss": 0.6381, + "step": 7558 + }, + { + "epoch": 0.96, + "grad_norm": 0.6833671135940843, + "learning_rate": 5.5410371326584525e-06, + "loss": 0.5495, + "step": 7559 + }, + { + "epoch": 0.96, + "grad_norm": 0.7459351868089814, + "learning_rate": 5.540011592062791e-06, + "loss": 0.5814, + "step": 7560 + }, + { + "epoch": 0.96, + "grad_norm": 0.8477969766915834, + "learning_rate": 5.538986028480563e-06, + "loss": 0.6157, + "step": 7561 + }, + { + "epoch": 0.96, + "grad_norm": 0.664010709265732, + "learning_rate": 5.537960441955421e-06, + "loss": 0.5162, + "step": 7562 + }, + { + "epoch": 0.96, + "grad_norm": 0.603346770470461, + "learning_rate": 5.536934832531022e-06, + "loss": 0.5331, + "step": 7563 + }, + { + "epoch": 0.96, + "grad_norm": 0.7180540870225373, + "learning_rate": 5.535909200251024e-06, + "loss": 0.6083, + "step": 7564 + }, + { + "epoch": 0.96, + "grad_norm": 0.5268191736598359, + "learning_rate": 5.534883545159085e-06, + "loss": 0.4378, + "step": 7565 + }, + { + "epoch": 0.96, + "grad_norm": 0.648713835068757, + "learning_rate": 5.5338578672988606e-06, + "loss": 0.5791, + "step": 7566 + }, + { + "epoch": 0.96, + "grad_norm": 0.5553914496705349, + "learning_rate": 5.532832166714011e-06, + "loss": 0.519, + "step": 7567 + }, + { + "epoch": 0.96, + "grad_norm": 0.6280371686105825, + "learning_rate": 5.531806443448202e-06, + "loss": 0.5551, + "step": 7568 + }, + { + "epoch": 0.96, + "grad_norm": 0.6024276775023125, + "learning_rate": 5.530780697545091e-06, + "loss": 0.5464, + "step": 7569 + }, + { + "epoch": 0.96, + "grad_norm": 0.6414987537819092, + "learning_rate": 5.529754929048342e-06, + "loss": 0.5025, + "step": 7570 + }, + { + "epoch": 0.96, + "grad_norm": 0.6149237158979458, + "learning_rate": 5.528729138001619e-06, + "loss": 0.4912, + "step": 7571 + }, + { + "epoch": 0.96, + "grad_norm": 0.5677021856388555, + "learning_rate": 5.5277033244485855e-06, + "loss": 0.4997, + "step": 7572 + }, + { + "epoch": 0.96, + "grad_norm": 0.7354021725784786, + "learning_rate": 5.526677488432909e-06, + "loss": 0.611, + "step": 7573 + }, + { + "epoch": 0.96, + "grad_norm": 0.6210531429893793, + "learning_rate": 5.525651629998252e-06, + "loss": 0.522, + "step": 7574 + }, + { + "epoch": 0.97, + "grad_norm": 0.585476354979317, + "learning_rate": 5.5246257491882875e-06, + "loss": 0.5378, + "step": 7575 + }, + { + "epoch": 0.97, + "grad_norm": 0.6296899660493586, + "learning_rate": 5.5235998460466796e-06, + "loss": 0.5058, + "step": 7576 + }, + { + "epoch": 0.97, + "grad_norm": 0.591274840977478, + "learning_rate": 5.522573920617101e-06, + "loss": 0.5376, + "step": 7577 + }, + { + "epoch": 0.97, + "grad_norm": 0.5831344897862408, + "learning_rate": 5.52154797294322e-06, + "loss": 0.5258, + "step": 7578 + }, + { + "epoch": 0.97, + "grad_norm": 1.0086384210930563, + "learning_rate": 5.520522003068709e-06, + "loss": 0.5897, + "step": 7579 + }, + { + "epoch": 0.97, + "grad_norm": 0.8012128775170618, + "learning_rate": 5.5194960110372376e-06, + "loss": 0.6236, + "step": 7580 + }, + { + "epoch": 0.97, + "grad_norm": 0.6149023152075509, + "learning_rate": 5.518469996892483e-06, + "loss": 0.576, + "step": 7581 + }, + { + "epoch": 0.97, + "grad_norm": 0.6130289171762597, + "learning_rate": 5.517443960678116e-06, + "loss": 0.5589, + "step": 7582 + }, + { + "epoch": 0.97, + "grad_norm": 0.5702572281951223, + "learning_rate": 5.516417902437814e-06, + "loss": 0.5385, + "step": 7583 + }, + { + "epoch": 0.97, + "grad_norm": 0.6507560903497862, + "learning_rate": 5.515391822215252e-06, + "loss": 0.4782, + "step": 7584 + }, + { + "epoch": 0.97, + "grad_norm": 0.7516982480369816, + "learning_rate": 5.514365720054108e-06, + "loss": 0.5578, + "step": 7585 + }, + { + "epoch": 0.97, + "grad_norm": 0.5204808405538552, + "learning_rate": 5.513339595998058e-06, + "loss": 0.4889, + "step": 7586 + }, + { + "epoch": 0.97, + "grad_norm": 0.6956055398196305, + "learning_rate": 5.5123134500907825e-06, + "loss": 0.5202, + "step": 7587 + }, + { + "epoch": 0.97, + "grad_norm": 0.8328162300187069, + "learning_rate": 5.51128728237596e-06, + "loss": 0.5557, + "step": 7588 + }, + { + "epoch": 0.97, + "grad_norm": 0.9534275043609638, + "learning_rate": 5.5102610928972725e-06, + "loss": 0.555, + "step": 7589 + }, + { + "epoch": 0.97, + "grad_norm": 0.579336801661149, + "learning_rate": 5.5092348816983985e-06, + "loss": 0.5375, + "step": 7590 + }, + { + "epoch": 0.97, + "grad_norm": 0.9391063810120132, + "learning_rate": 5.508208648823025e-06, + "loss": 0.5938, + "step": 7591 + }, + { + "epoch": 0.97, + "grad_norm": 0.9695965226750517, + "learning_rate": 5.507182394314832e-06, + "loss": 0.6178, + "step": 7592 + }, + { + "epoch": 0.97, + "grad_norm": 0.9005124611626333, + "learning_rate": 5.506156118217506e-06, + "loss": 0.6008, + "step": 7593 + }, + { + "epoch": 0.97, + "grad_norm": 0.48799709277974174, + "learning_rate": 5.5051298205747315e-06, + "loss": 0.4413, + "step": 7594 + }, + { + "epoch": 0.97, + "grad_norm": 0.7910818204108412, + "learning_rate": 5.504103501430195e-06, + "loss": 0.5167, + "step": 7595 + }, + { + "epoch": 0.97, + "grad_norm": 0.5549774493898005, + "learning_rate": 5.503077160827583e-06, + "loss": 0.5302, + "step": 7596 + }, + { + "epoch": 0.97, + "grad_norm": 0.8722196089232908, + "learning_rate": 5.502050798810584e-06, + "loss": 0.6307, + "step": 7597 + }, + { + "epoch": 0.97, + "grad_norm": 0.7201682416213975, + "learning_rate": 5.501024415422885e-06, + "loss": 0.6432, + "step": 7598 + }, + { + "epoch": 0.97, + "grad_norm": 0.8569120142915779, + "learning_rate": 5.499998010708181e-06, + "loss": 0.5749, + "step": 7599 + }, + { + "epoch": 0.97, + "grad_norm": 0.7118726448812661, + "learning_rate": 5.498971584710158e-06, + "loss": 0.5539, + "step": 7600 + }, + { + "epoch": 0.97, + "grad_norm": 0.596970360923443, + "learning_rate": 5.497945137472508e-06, + "loss": 0.5225, + "step": 7601 + }, + { + "epoch": 0.97, + "grad_norm": 0.8129042638483244, + "learning_rate": 5.496918669038927e-06, + "loss": 0.6055, + "step": 7602 + }, + { + "epoch": 0.97, + "grad_norm": 0.8967171840855497, + "learning_rate": 5.495892179453104e-06, + "loss": 0.5558, + "step": 7603 + }, + { + "epoch": 0.97, + "grad_norm": 0.8305055714431683, + "learning_rate": 5.494865668758737e-06, + "loss": 0.5358, + "step": 7604 + }, + { + "epoch": 0.97, + "grad_norm": 0.6550216912099499, + "learning_rate": 5.493839136999517e-06, + "loss": 0.5422, + "step": 7605 + }, + { + "epoch": 0.97, + "grad_norm": 0.5836326198558194, + "learning_rate": 5.492812584219146e-06, + "loss": 0.4814, + "step": 7606 + }, + { + "epoch": 0.97, + "grad_norm": 0.6999942862132048, + "learning_rate": 5.491786010461317e-06, + "loss": 0.5101, + "step": 7607 + }, + { + "epoch": 0.97, + "grad_norm": 0.5963761150255326, + "learning_rate": 5.49075941576973e-06, + "loss": 0.5577, + "step": 7608 + }, + { + "epoch": 0.97, + "grad_norm": 0.5399962748439829, + "learning_rate": 5.4897328001880825e-06, + "loss": 0.494, + "step": 7609 + }, + { + "epoch": 0.97, + "grad_norm": 0.8377025018325649, + "learning_rate": 5.488706163760075e-06, + "loss": 0.6595, + "step": 7610 + }, + { + "epoch": 0.97, + "grad_norm": 0.6355270949141507, + "learning_rate": 5.487679506529405e-06, + "loss": 0.5234, + "step": 7611 + }, + { + "epoch": 0.97, + "grad_norm": 0.8021011466859977, + "learning_rate": 5.48665282853978e-06, + "loss": 0.6364, + "step": 7612 + }, + { + "epoch": 0.97, + "grad_norm": 0.6488854554999794, + "learning_rate": 5.485626129834898e-06, + "loss": 0.5075, + "step": 7613 + }, + { + "epoch": 0.97, + "grad_norm": 0.5743535463440743, + "learning_rate": 5.484599410458464e-06, + "loss": 0.5091, + "step": 7614 + }, + { + "epoch": 0.97, + "grad_norm": 0.5576089792025481, + "learning_rate": 5.483572670454181e-06, + "loss": 0.4852, + "step": 7615 + }, + { + "epoch": 0.97, + "grad_norm": 0.5250051752808631, + "learning_rate": 5.482545909865755e-06, + "loss": 0.533, + "step": 7616 + }, + { + "epoch": 0.97, + "grad_norm": 0.6262217878642798, + "learning_rate": 5.4815191287368914e-06, + "loss": 0.5604, + "step": 7617 + }, + { + "epoch": 0.97, + "grad_norm": 0.5474155704323684, + "learning_rate": 5.480492327111298e-06, + "loss": 0.5137, + "step": 7618 + }, + { + "epoch": 0.97, + "grad_norm": 0.603400583681847, + "learning_rate": 5.4794655050326806e-06, + "loss": 0.5085, + "step": 7619 + }, + { + "epoch": 0.97, + "grad_norm": 0.7122199404116781, + "learning_rate": 5.4784386625447495e-06, + "loss": 0.5079, + "step": 7620 + }, + { + "epoch": 0.97, + "grad_norm": 0.5957349064292433, + "learning_rate": 5.477411799691213e-06, + "loss": 0.5013, + "step": 7621 + }, + { + "epoch": 0.97, + "grad_norm": 0.5806603744617725, + "learning_rate": 5.476384916515783e-06, + "loss": 0.5218, + "step": 7622 + }, + { + "epoch": 0.97, + "grad_norm": 0.7985190895949581, + "learning_rate": 5.475358013062167e-06, + "loss": 0.6171, + "step": 7623 + }, + { + "epoch": 0.97, + "grad_norm": 0.8152396987787083, + "learning_rate": 5.474331089374081e-06, + "loss": 0.606, + "step": 7624 + }, + { + "epoch": 0.97, + "grad_norm": 0.7558770416267557, + "learning_rate": 5.473304145495236e-06, + "loss": 0.5792, + "step": 7625 + }, + { + "epoch": 0.97, + "grad_norm": 0.6950817632840202, + "learning_rate": 5.472277181469346e-06, + "loss": 0.5081, + "step": 7626 + }, + { + "epoch": 0.97, + "grad_norm": 0.7726956635313686, + "learning_rate": 5.471250197340127e-06, + "loss": 0.6511, + "step": 7627 + }, + { + "epoch": 0.97, + "grad_norm": 0.5827437905910543, + "learning_rate": 5.4702231931512914e-06, + "loss": 0.4755, + "step": 7628 + }, + { + "epoch": 0.97, + "grad_norm": 0.6035578999242772, + "learning_rate": 5.4691961689465565e-06, + "loss": 0.4976, + "step": 7629 + }, + { + "epoch": 0.97, + "grad_norm": 0.8993407079877661, + "learning_rate": 5.468169124769641e-06, + "loss": 0.6566, + "step": 7630 + }, + { + "epoch": 0.97, + "grad_norm": 0.568348462891462, + "learning_rate": 5.467142060664262e-06, + "loss": 0.5087, + "step": 7631 + }, + { + "epoch": 0.97, + "grad_norm": 0.9491565850373038, + "learning_rate": 5.4661149766741385e-06, + "loss": 0.6116, + "step": 7632 + }, + { + "epoch": 0.97, + "grad_norm": 0.6942050574064412, + "learning_rate": 5.465087872842989e-06, + "loss": 0.5743, + "step": 7633 + }, + { + "epoch": 0.97, + "grad_norm": 0.5662075550149903, + "learning_rate": 5.464060749214536e-06, + "loss": 0.5048, + "step": 7634 + }, + { + "epoch": 0.97, + "grad_norm": 0.878023432634221, + "learning_rate": 5.463033605832499e-06, + "loss": 0.5795, + "step": 7635 + }, + { + "epoch": 0.97, + "grad_norm": 0.6256919852216055, + "learning_rate": 5.4620064427406005e-06, + "loss": 0.5291, + "step": 7636 + }, + { + "epoch": 0.97, + "grad_norm": 0.683820273611286, + "learning_rate": 5.460979259982565e-06, + "loss": 0.544, + "step": 7637 + }, + { + "epoch": 0.97, + "grad_norm": 0.6462524444088971, + "learning_rate": 5.459952057602116e-06, + "loss": 0.5134, + "step": 7638 + }, + { + "epoch": 0.97, + "grad_norm": 0.6712443709615155, + "learning_rate": 5.458924835642977e-06, + "loss": 0.5097, + "step": 7639 + }, + { + "epoch": 0.97, + "grad_norm": 0.740364420120419, + "learning_rate": 5.457897594148876e-06, + "loss": 0.5932, + "step": 7640 + }, + { + "epoch": 0.97, + "grad_norm": 0.5446827282718025, + "learning_rate": 5.456870333163535e-06, + "loss": 0.4669, + "step": 7641 + }, + { + "epoch": 0.97, + "grad_norm": 0.6786201454537524, + "learning_rate": 5.455843052730684e-06, + "loss": 0.5739, + "step": 7642 + }, + { + "epoch": 0.97, + "grad_norm": 0.9191674631580304, + "learning_rate": 5.454815752894051e-06, + "loss": 0.6399, + "step": 7643 + }, + { + "epoch": 0.97, + "grad_norm": 0.7453559798957378, + "learning_rate": 5.4537884336973665e-06, + "loss": 0.5744, + "step": 7644 + }, + { + "epoch": 0.97, + "grad_norm": 0.6533767995417032, + "learning_rate": 5.452761095184358e-06, + "loss": 0.5364, + "step": 7645 + }, + { + "epoch": 0.97, + "grad_norm": 0.7773539641585071, + "learning_rate": 5.451733737398756e-06, + "loss": 0.6201, + "step": 7646 + }, + { + "epoch": 0.97, + "grad_norm": 0.6534543970475598, + "learning_rate": 5.4507063603842924e-06, + "loss": 0.536, + "step": 7647 + }, + { + "epoch": 0.97, + "grad_norm": 0.6208033437849065, + "learning_rate": 5.449678964184699e-06, + "loss": 0.5493, + "step": 7648 + }, + { + "epoch": 0.97, + "grad_norm": 0.9606060366491064, + "learning_rate": 5.44865154884371e-06, + "loss": 0.6055, + "step": 7649 + }, + { + "epoch": 0.97, + "grad_norm": 0.7554072674016378, + "learning_rate": 5.447624114405058e-06, + "loss": 0.5892, + "step": 7650 + }, + { + "epoch": 0.97, + "grad_norm": 0.7577052782835538, + "learning_rate": 5.446596660912478e-06, + "loss": 0.5999, + "step": 7651 + }, + { + "epoch": 0.97, + "grad_norm": 0.7184803457220917, + "learning_rate": 5.445569188409703e-06, + "loss": 0.6636, + "step": 7652 + }, + { + "epoch": 0.97, + "grad_norm": 0.7554933362400414, + "learning_rate": 5.444541696940473e-06, + "loss": 0.6109, + "step": 7653 + }, + { + "epoch": 0.98, + "grad_norm": 0.6525645721219876, + "learning_rate": 5.443514186548524e-06, + "loss": 0.5233, + "step": 7654 + }, + { + "epoch": 0.98, + "grad_norm": 0.8286323701669247, + "learning_rate": 5.442486657277593e-06, + "loss": 0.6284, + "step": 7655 + }, + { + "epoch": 0.98, + "grad_norm": 0.6592548567264946, + "learning_rate": 5.44145910917142e-06, + "loss": 0.6048, + "step": 7656 + }, + { + "epoch": 0.98, + "grad_norm": 0.5783620509608063, + "learning_rate": 5.440431542273742e-06, + "loss": 0.4733, + "step": 7657 + }, + { + "epoch": 0.98, + "grad_norm": 0.636704869243039, + "learning_rate": 5.439403956628302e-06, + "loss": 0.5589, + "step": 7658 + }, + { + "epoch": 0.98, + "grad_norm": 0.6829496157119364, + "learning_rate": 5.438376352278839e-06, + "loss": 0.4903, + "step": 7659 + }, + { + "epoch": 0.98, + "grad_norm": 0.5822075259785565, + "learning_rate": 5.437348729269094e-06, + "loss": 0.5102, + "step": 7660 + }, + { + "epoch": 0.98, + "grad_norm": 0.7770867537312597, + "learning_rate": 5.436321087642813e-06, + "loss": 0.5721, + "step": 7661 + }, + { + "epoch": 0.98, + "grad_norm": 0.7628500114411533, + "learning_rate": 5.435293427443737e-06, + "loss": 0.6139, + "step": 7662 + }, + { + "epoch": 0.98, + "grad_norm": 1.0242763930911243, + "learning_rate": 5.434265748715611e-06, + "loss": 0.6165, + "step": 7663 + }, + { + "epoch": 0.98, + "grad_norm": 0.8831931861429525, + "learning_rate": 5.4332380515021796e-06, + "loss": 0.5952, + "step": 7664 + }, + { + "epoch": 0.98, + "grad_norm": 0.9927954757870369, + "learning_rate": 5.432210335847189e-06, + "loss": 0.6282, + "step": 7665 + }, + { + "epoch": 0.98, + "grad_norm": 0.6387157661274618, + "learning_rate": 5.431182601794386e-06, + "loss": 0.5411, + "step": 7666 + }, + { + "epoch": 0.98, + "grad_norm": 0.8188858012671504, + "learning_rate": 5.430154849387515e-06, + "loss": 0.5674, + "step": 7667 + }, + { + "epoch": 0.98, + "grad_norm": 0.6197312203439811, + "learning_rate": 5.42912707867033e-06, + "loss": 0.4704, + "step": 7668 + }, + { + "epoch": 0.98, + "grad_norm": 0.6677575345490049, + "learning_rate": 5.4280992896865744e-06, + "loss": 0.5566, + "step": 7669 + }, + { + "epoch": 0.98, + "grad_norm": 0.5428399145339392, + "learning_rate": 5.427071482480001e-06, + "loss": 0.4867, + "step": 7670 + }, + { + "epoch": 0.98, + "grad_norm": 0.763064482947748, + "learning_rate": 5.42604365709436e-06, + "loss": 0.5985, + "step": 7671 + }, + { + "epoch": 0.98, + "grad_norm": 0.6334282403203839, + "learning_rate": 5.425015813573401e-06, + "loss": 0.5388, + "step": 7672 + }, + { + "epoch": 0.98, + "grad_norm": 0.865313502163088, + "learning_rate": 5.423987951960876e-06, + "loss": 0.6409, + "step": 7673 + }, + { + "epoch": 0.98, + "grad_norm": 0.6051889533901729, + "learning_rate": 5.42296007230054e-06, + "loss": 0.5532, + "step": 7674 + }, + { + "epoch": 0.98, + "grad_norm": 1.20833480018582, + "learning_rate": 5.421932174636145e-06, + "loss": 0.6208, + "step": 7675 + }, + { + "epoch": 0.98, + "grad_norm": 0.5883651794021639, + "learning_rate": 5.4209042590114455e-06, + "loss": 0.5447, + "step": 7676 + }, + { + "epoch": 0.98, + "grad_norm": 0.7698686865402535, + "learning_rate": 5.419876325470197e-06, + "loss": 0.5948, + "step": 7677 + }, + { + "epoch": 0.98, + "grad_norm": 0.7493995078027337, + "learning_rate": 5.418848374056156e-06, + "loss": 0.625, + "step": 7678 + }, + { + "epoch": 0.98, + "grad_norm": 0.696608713288634, + "learning_rate": 5.417820404813075e-06, + "loss": 0.593, + "step": 7679 + }, + { + "epoch": 0.98, + "grad_norm": 0.5739480096523812, + "learning_rate": 5.416792417784718e-06, + "loss": 0.4881, + "step": 7680 + }, + { + "epoch": 0.98, + "grad_norm": 0.6759667918026931, + "learning_rate": 5.415764413014838e-06, + "loss": 0.5221, + "step": 7681 + }, + { + "epoch": 0.98, + "grad_norm": 0.7777323902892049, + "learning_rate": 5.414736390547196e-06, + "loss": 0.6289, + "step": 7682 + }, + { + "epoch": 0.98, + "grad_norm": 0.7779993246484698, + "learning_rate": 5.4137083504255506e-06, + "loss": 0.656, + "step": 7683 + }, + { + "epoch": 0.98, + "grad_norm": 0.7173564828586574, + "learning_rate": 5.412680292693664e-06, + "loss": 0.6188, + "step": 7684 + }, + { + "epoch": 0.98, + "grad_norm": 0.7379808975637764, + "learning_rate": 5.411652217395294e-06, + "loss": 0.525, + "step": 7685 + }, + { + "epoch": 0.98, + "grad_norm": 0.5837568082783187, + "learning_rate": 5.410624124574206e-06, + "loss": 0.5123, + "step": 7686 + }, + { + "epoch": 0.98, + "grad_norm": 0.7511021320766276, + "learning_rate": 5.409596014274161e-06, + "loss": 0.5789, + "step": 7687 + }, + { + "epoch": 0.98, + "grad_norm": 0.8279033722935091, + "learning_rate": 5.408567886538922e-06, + "loss": 0.6342, + "step": 7688 + }, + { + "epoch": 0.98, + "grad_norm": 0.8876551816268368, + "learning_rate": 5.4075397414122555e-06, + "loss": 0.5917, + "step": 7689 + }, + { + "epoch": 0.98, + "grad_norm": 0.7098891922718354, + "learning_rate": 5.406511578937923e-06, + "loss": 0.6053, + "step": 7690 + }, + { + "epoch": 0.98, + "grad_norm": 0.8329051936661772, + "learning_rate": 5.405483399159691e-06, + "loss": 0.6319, + "step": 7691 + }, + { + "epoch": 0.98, + "grad_norm": 0.8163348430392673, + "learning_rate": 5.404455202121327e-06, + "loss": 0.6229, + "step": 7692 + }, + { + "epoch": 0.98, + "grad_norm": 0.9204914237534364, + "learning_rate": 5.403426987866598e-06, + "loss": 0.65, + "step": 7693 + }, + { + "epoch": 0.98, + "grad_norm": 0.5903937899204389, + "learning_rate": 5.402398756439271e-06, + "loss": 0.5153, + "step": 7694 + }, + { + "epoch": 0.98, + "grad_norm": 0.9063080203721176, + "learning_rate": 5.401370507883116e-06, + "loss": 0.6502, + "step": 7695 + }, + { + "epoch": 0.98, + "grad_norm": 0.7562293817119972, + "learning_rate": 5.400342242241899e-06, + "loss": 0.6432, + "step": 7696 + }, + { + "epoch": 0.98, + "grad_norm": 0.6165267075100698, + "learning_rate": 5.399313959559394e-06, + "loss": 0.519, + "step": 7697 + }, + { + "epoch": 0.98, + "grad_norm": 0.5547820862455214, + "learning_rate": 5.398285659879368e-06, + "loss": 0.5045, + "step": 7698 + }, + { + "epoch": 0.98, + "grad_norm": 0.791158681260808, + "learning_rate": 5.397257343245595e-06, + "loss": 0.6144, + "step": 7699 + }, + { + "epoch": 0.98, + "grad_norm": 0.5458784560907478, + "learning_rate": 5.396229009701846e-06, + "loss": 0.5146, + "step": 7700 + }, + { + "epoch": 0.98, + "grad_norm": 0.7107460281972412, + "learning_rate": 5.395200659291895e-06, + "loss": 0.607, + "step": 7701 + }, + { + "epoch": 0.98, + "grad_norm": 0.9041061139278497, + "learning_rate": 5.394172292059514e-06, + "loss": 0.6222, + "step": 7702 + }, + { + "epoch": 0.98, + "grad_norm": 0.5433163520556715, + "learning_rate": 5.39314390804848e-06, + "loss": 0.4533, + "step": 7703 + }, + { + "epoch": 0.98, + "grad_norm": 0.6791522029686936, + "learning_rate": 5.392115507302562e-06, + "loss": 0.5712, + "step": 7704 + }, + { + "epoch": 0.98, + "grad_norm": 0.8935288663499319, + "learning_rate": 5.391087089865543e-06, + "loss": 0.5653, + "step": 7705 + }, + { + "epoch": 0.98, + "grad_norm": 0.6239536678147045, + "learning_rate": 5.390058655781196e-06, + "loss": 0.5159, + "step": 7706 + }, + { + "epoch": 0.98, + "grad_norm": 0.7265561882945888, + "learning_rate": 5.389030205093299e-06, + "loss": 0.5767, + "step": 7707 + }, + { + "epoch": 0.98, + "grad_norm": 0.7770451388491456, + "learning_rate": 5.388001737845628e-06, + "loss": 0.575, + "step": 7708 + }, + { + "epoch": 0.98, + "grad_norm": 0.760367495109844, + "learning_rate": 5.386973254081964e-06, + "loss": 0.6234, + "step": 7709 + }, + { + "epoch": 0.98, + "grad_norm": 0.773868965923522, + "learning_rate": 5.3859447538460855e-06, + "loss": 0.55, + "step": 7710 + }, + { + "epoch": 0.98, + "grad_norm": 0.5700509815885804, + "learning_rate": 5.384916237181771e-06, + "loss": 0.4986, + "step": 7711 + }, + { + "epoch": 0.98, + "grad_norm": 0.5711033557274912, + "learning_rate": 5.3838877041328034e-06, + "loss": 0.5527, + "step": 7712 + }, + { + "epoch": 0.98, + "grad_norm": 0.9728050219099901, + "learning_rate": 5.382859154742962e-06, + "loss": 0.6433, + "step": 7713 + }, + { + "epoch": 0.98, + "grad_norm": 0.682096310741079, + "learning_rate": 5.3818305890560285e-06, + "loss": 0.5737, + "step": 7714 + }, + { + "epoch": 0.98, + "grad_norm": 0.7709356162527241, + "learning_rate": 5.380802007115788e-06, + "loss": 0.6538, + "step": 7715 + }, + { + "epoch": 0.98, + "grad_norm": 0.701122406915414, + "learning_rate": 5.379773408966025e-06, + "loss": 0.5553, + "step": 7716 + }, + { + "epoch": 0.98, + "grad_norm": 0.9142161363093375, + "learning_rate": 5.37874479465052e-06, + "loss": 0.5796, + "step": 7717 + }, + { + "epoch": 0.98, + "grad_norm": 0.7206385256700651, + "learning_rate": 5.377716164213059e-06, + "loss": 0.5651, + "step": 7718 + }, + { + "epoch": 0.98, + "grad_norm": 0.7496814463963307, + "learning_rate": 5.376687517697428e-06, + "loss": 0.5984, + "step": 7719 + }, + { + "epoch": 0.98, + "grad_norm": 0.7629454310965097, + "learning_rate": 5.375658855147415e-06, + "loss": 0.664, + "step": 7720 + }, + { + "epoch": 0.98, + "grad_norm": 0.7155635146549625, + "learning_rate": 5.374630176606802e-06, + "loss": 0.5433, + "step": 7721 + }, + { + "epoch": 0.98, + "grad_norm": 0.7681578901605628, + "learning_rate": 5.373601482119381e-06, + "loss": 0.5762, + "step": 7722 + }, + { + "epoch": 0.98, + "grad_norm": 0.5935637803021613, + "learning_rate": 5.3725727717289375e-06, + "loss": 0.4968, + "step": 7723 + }, + { + "epoch": 0.98, + "grad_norm": 0.7929195390306502, + "learning_rate": 5.371544045479264e-06, + "loss": 0.6021, + "step": 7724 + }, + { + "epoch": 0.98, + "grad_norm": 0.7285118946070908, + "learning_rate": 5.370515303414146e-06, + "loss": 0.6269, + "step": 7725 + }, + { + "epoch": 0.98, + "grad_norm": 0.8765799796380599, + "learning_rate": 5.369486545577377e-06, + "loss": 0.6, + "step": 7726 + }, + { + "epoch": 0.98, + "grad_norm": 1.0845970718274653, + "learning_rate": 5.368457772012745e-06, + "loss": 0.5602, + "step": 7727 + }, + { + "epoch": 0.98, + "grad_norm": 0.7734004673846682, + "learning_rate": 5.367428982764045e-06, + "loss": 0.6608, + "step": 7728 + }, + { + "epoch": 0.98, + "grad_norm": 0.6317609717150813, + "learning_rate": 5.366400177875064e-06, + "loss": 0.5663, + "step": 7729 + }, + { + "epoch": 0.98, + "grad_norm": 0.5557156098948434, + "learning_rate": 5.3653713573896e-06, + "loss": 0.5377, + "step": 7730 + }, + { + "epoch": 0.98, + "grad_norm": 0.6271265655965282, + "learning_rate": 5.364342521351446e-06, + "loss": 0.5328, + "step": 7731 + }, + { + "epoch": 0.99, + "grad_norm": 0.6090695624168665, + "learning_rate": 5.3633136698043945e-06, + "loss": 0.5201, + "step": 7732 + }, + { + "epoch": 0.99, + "grad_norm": 0.6289026450117043, + "learning_rate": 5.36228480279224e-06, + "loss": 0.4897, + "step": 7733 + }, + { + "epoch": 0.99, + "grad_norm": 0.756977882233415, + "learning_rate": 5.361255920358781e-06, + "loss": 0.6615, + "step": 7734 + }, + { + "epoch": 0.99, + "grad_norm": 0.6408538710284514, + "learning_rate": 5.360227022547809e-06, + "loss": 0.5532, + "step": 7735 + }, + { + "epoch": 0.99, + "grad_norm": 0.6004730240822013, + "learning_rate": 5.359198109403127e-06, + "loss": 0.5081, + "step": 7736 + }, + { + "epoch": 0.99, + "grad_norm": 0.5559908687298513, + "learning_rate": 5.358169180968527e-06, + "loss": 0.4976, + "step": 7737 + }, + { + "epoch": 0.99, + "grad_norm": 0.6404008594146845, + "learning_rate": 5.357140237287811e-06, + "loss": 0.5139, + "step": 7738 + }, + { + "epoch": 0.99, + "grad_norm": 0.7470665460472893, + "learning_rate": 5.356111278404777e-06, + "loss": 0.626, + "step": 7739 + }, + { + "epoch": 0.99, + "grad_norm": 0.5834475636775669, + "learning_rate": 5.355082304363221e-06, + "loss": 0.4851, + "step": 7740 + }, + { + "epoch": 0.99, + "grad_norm": 0.5896010140170032, + "learning_rate": 5.354053315206947e-06, + "loss": 0.4941, + "step": 7741 + }, + { + "epoch": 0.99, + "grad_norm": 1.0499949451309503, + "learning_rate": 5.353024310979757e-06, + "loss": 0.6522, + "step": 7742 + }, + { + "epoch": 0.99, + "grad_norm": 0.5054924310076513, + "learning_rate": 5.351995291725448e-06, + "loss": 0.4554, + "step": 7743 + }, + { + "epoch": 0.99, + "grad_norm": 0.6677973829308487, + "learning_rate": 5.350966257487825e-06, + "loss": 0.5491, + "step": 7744 + }, + { + "epoch": 0.99, + "grad_norm": 0.7075294731007261, + "learning_rate": 5.34993720831069e-06, + "loss": 0.5112, + "step": 7745 + }, + { + "epoch": 0.99, + "grad_norm": 0.9745288610078423, + "learning_rate": 5.348908144237846e-06, + "loss": 0.6473, + "step": 7746 + }, + { + "epoch": 0.99, + "grad_norm": 0.5622815945261704, + "learning_rate": 5.347879065313096e-06, + "loss": 0.4875, + "step": 7747 + }, + { + "epoch": 0.99, + "grad_norm": 0.6692257317838569, + "learning_rate": 5.346849971580248e-06, + "loss": 0.5901, + "step": 7748 + }, + { + "epoch": 0.99, + "grad_norm": 0.6786583804214478, + "learning_rate": 5.345820863083105e-06, + "loss": 0.5365, + "step": 7749 + }, + { + "epoch": 0.99, + "grad_norm": 0.6105684261085144, + "learning_rate": 5.344791739865471e-06, + "loss": 0.5546, + "step": 7750 + }, + { + "epoch": 0.99, + "grad_norm": 0.8076388076080914, + "learning_rate": 5.343762601971156e-06, + "loss": 0.6665, + "step": 7751 + }, + { + "epoch": 0.99, + "grad_norm": 0.8570977591880408, + "learning_rate": 5.342733449443965e-06, + "loss": 0.6219, + "step": 7752 + }, + { + "epoch": 0.99, + "grad_norm": 0.6370562571564155, + "learning_rate": 5.341704282327705e-06, + "loss": 0.5322, + "step": 7753 + }, + { + "epoch": 0.99, + "grad_norm": 0.6260290500439879, + "learning_rate": 5.340675100666188e-06, + "loss": 0.5135, + "step": 7754 + }, + { + "epoch": 0.99, + "grad_norm": 0.5904676112175424, + "learning_rate": 5.33964590450322e-06, + "loss": 0.4997, + "step": 7755 + }, + { + "epoch": 0.99, + "grad_norm": 0.7451974252699439, + "learning_rate": 5.338616693882611e-06, + "loss": 0.5632, + "step": 7756 + }, + { + "epoch": 0.99, + "grad_norm": 0.7192540204047387, + "learning_rate": 5.337587468848171e-06, + "loss": 0.5441, + "step": 7757 + }, + { + "epoch": 0.99, + "grad_norm": 0.622893404555686, + "learning_rate": 5.336558229443712e-06, + "loss": 0.547, + "step": 7758 + }, + { + "epoch": 0.99, + "grad_norm": 0.6279810475129116, + "learning_rate": 5.335528975713045e-06, + "loss": 0.4898, + "step": 7759 + }, + { + "epoch": 0.99, + "grad_norm": 1.7152529339530527, + "learning_rate": 5.33449970769998e-06, + "loss": 0.5411, + "step": 7760 + }, + { + "epoch": 0.99, + "grad_norm": 0.7052606351232354, + "learning_rate": 5.333470425448332e-06, + "loss": 0.603, + "step": 7761 + }, + { + "epoch": 0.99, + "grad_norm": 0.5823979446585354, + "learning_rate": 5.332441129001914e-06, + "loss": 0.5626, + "step": 7762 + }, + { + "epoch": 0.99, + "grad_norm": 0.7608170523306291, + "learning_rate": 5.331411818404539e-06, + "loss": 0.5586, + "step": 7763 + }, + { + "epoch": 0.99, + "grad_norm": 0.6128208302282596, + "learning_rate": 5.3303824937000225e-06, + "loss": 0.5629, + "step": 7764 + }, + { + "epoch": 0.99, + "grad_norm": 0.7164929974776274, + "learning_rate": 5.329353154932179e-06, + "loss": 0.5765, + "step": 7765 + }, + { + "epoch": 0.99, + "grad_norm": 0.7713169793183182, + "learning_rate": 5.328323802144822e-06, + "loss": 0.5695, + "step": 7766 + }, + { + "epoch": 0.99, + "grad_norm": 0.9265304642052163, + "learning_rate": 5.327294435381772e-06, + "loss": 0.6818, + "step": 7767 + }, + { + "epoch": 0.99, + "grad_norm": 0.825508822067408, + "learning_rate": 5.326265054686844e-06, + "loss": 0.5846, + "step": 7768 + }, + { + "epoch": 0.99, + "grad_norm": 0.628665724668007, + "learning_rate": 5.325235660103856e-06, + "loss": 0.5755, + "step": 7769 + }, + { + "epoch": 0.99, + "grad_norm": 0.66366246560793, + "learning_rate": 5.324206251676623e-06, + "loss": 0.5227, + "step": 7770 + }, + { + "epoch": 0.99, + "grad_norm": 0.6761650849348263, + "learning_rate": 5.323176829448967e-06, + "loss": 0.5173, + "step": 7771 + }, + { + "epoch": 0.99, + "grad_norm": 0.7199382572637044, + "learning_rate": 5.322147393464706e-06, + "loss": 0.5702, + "step": 7772 + }, + { + "epoch": 0.99, + "grad_norm": 0.5887790917193235, + "learning_rate": 5.321117943767661e-06, + "loss": 0.5038, + "step": 7773 + }, + { + "epoch": 0.99, + "grad_norm": 4.031051151846374, + "learning_rate": 5.320088480401649e-06, + "loss": 0.6142, + "step": 7774 + }, + { + "epoch": 0.99, + "grad_norm": 0.7283028926059821, + "learning_rate": 5.319059003410496e-06, + "loss": 0.5782, + "step": 7775 + }, + { + "epoch": 0.99, + "grad_norm": 0.7614281547964779, + "learning_rate": 5.318029512838018e-06, + "loss": 0.6233, + "step": 7776 + }, + { + "epoch": 0.99, + "grad_norm": 0.7501909888112255, + "learning_rate": 5.317000008728042e-06, + "loss": 0.5937, + "step": 7777 + }, + { + "epoch": 0.99, + "grad_norm": 0.737652573312114, + "learning_rate": 5.315970491124387e-06, + "loss": 0.5731, + "step": 7778 + }, + { + "epoch": 0.99, + "grad_norm": 0.7259415196763576, + "learning_rate": 5.314940960070879e-06, + "loss": 0.5387, + "step": 7779 + }, + { + "epoch": 0.99, + "grad_norm": 0.5689656767728895, + "learning_rate": 5.313911415611341e-06, + "loss": 0.5163, + "step": 7780 + }, + { + "epoch": 0.99, + "grad_norm": 1.2708990744823239, + "learning_rate": 5.312881857789596e-06, + "loss": 0.6402, + "step": 7781 + }, + { + "epoch": 0.99, + "grad_norm": 0.6190638294954918, + "learning_rate": 5.31185228664947e-06, + "loss": 0.5644, + "step": 7782 + }, + { + "epoch": 0.99, + "grad_norm": 0.6110976970031593, + "learning_rate": 5.31082270223479e-06, + "loss": 0.5053, + "step": 7783 + }, + { + "epoch": 0.99, + "grad_norm": 0.5809708065534906, + "learning_rate": 5.309793104589379e-06, + "loss": 0.4796, + "step": 7784 + }, + { + "epoch": 0.99, + "grad_norm": 0.5963783041967836, + "learning_rate": 5.308763493757067e-06, + "loss": 0.517, + "step": 7785 + }, + { + "epoch": 0.99, + "grad_norm": 0.7553283451643742, + "learning_rate": 5.30773386978168e-06, + "loss": 0.5929, + "step": 7786 + }, + { + "epoch": 0.99, + "grad_norm": 0.812690534696544, + "learning_rate": 5.306704232707045e-06, + "loss": 0.6016, + "step": 7787 + }, + { + "epoch": 0.99, + "grad_norm": 0.6015041572317619, + "learning_rate": 5.305674582576991e-06, + "loss": 0.4999, + "step": 7788 + }, + { + "epoch": 0.99, + "grad_norm": 1.089798022059888, + "learning_rate": 5.304644919435347e-06, + "loss": 0.6408, + "step": 7789 + }, + { + "epoch": 0.99, + "grad_norm": 0.8056609174808825, + "learning_rate": 5.303615243325942e-06, + "loss": 0.5716, + "step": 7790 + }, + { + "epoch": 0.99, + "grad_norm": 0.8010464937966264, + "learning_rate": 5.302585554292606e-06, + "loss": 0.5956, + "step": 7791 + }, + { + "epoch": 0.99, + "grad_norm": 0.6617620654217385, + "learning_rate": 5.30155585237917e-06, + "loss": 0.5772, + "step": 7792 + }, + { + "epoch": 0.99, + "grad_norm": 0.5711171743383584, + "learning_rate": 5.300526137629465e-06, + "loss": 0.5067, + "step": 7793 + }, + { + "epoch": 0.99, + "grad_norm": 0.5805484877470243, + "learning_rate": 5.2994964100873225e-06, + "loss": 0.5578, + "step": 7794 + }, + { + "epoch": 0.99, + "grad_norm": 0.5776063981112649, + "learning_rate": 5.2984666697965755e-06, + "loss": 0.4718, + "step": 7795 + }, + { + "epoch": 0.99, + "grad_norm": 0.7996069024995108, + "learning_rate": 5.297436916801057e-06, + "loss": 0.6427, + "step": 7796 + }, + { + "epoch": 0.99, + "grad_norm": 0.7790259147680327, + "learning_rate": 5.296407151144597e-06, + "loss": 0.631, + "step": 7797 + }, + { + "epoch": 0.99, + "grad_norm": 0.6982858058000818, + "learning_rate": 5.295377372871033e-06, + "loss": 0.5038, + "step": 7798 + }, + { + "epoch": 0.99, + "grad_norm": 0.6696859042299708, + "learning_rate": 5.2943475820241975e-06, + "loss": 0.5594, + "step": 7799 + }, + { + "epoch": 0.99, + "grad_norm": 0.9531840280237015, + "learning_rate": 5.293317778647927e-06, + "loss": 0.6101, + "step": 7800 + }, + { + "epoch": 0.99, + "grad_norm": 0.6875684693717123, + "learning_rate": 5.292287962786055e-06, + "loss": 0.5306, + "step": 7801 + }, + { + "epoch": 0.99, + "grad_norm": 0.713596805921403, + "learning_rate": 5.29125813448242e-06, + "loss": 0.5671, + "step": 7802 + }, + { + "epoch": 0.99, + "grad_norm": 0.5320728376249004, + "learning_rate": 5.290228293780855e-06, + "loss": 0.4567, + "step": 7803 + }, + { + "epoch": 0.99, + "grad_norm": 0.774006308745748, + "learning_rate": 5.2891984407252e-06, + "loss": 0.6245, + "step": 7804 + }, + { + "epoch": 0.99, + "grad_norm": 0.5712094596877593, + "learning_rate": 5.2881685753592915e-06, + "loss": 0.4808, + "step": 7805 + }, + { + "epoch": 0.99, + "grad_norm": 0.6623986780775507, + "learning_rate": 5.2871386977269675e-06, + "loss": 0.5595, + "step": 7806 + }, + { + "epoch": 0.99, + "grad_norm": 0.7005396887928569, + "learning_rate": 5.286108807872068e-06, + "loss": 0.5233, + "step": 7807 + }, + { + "epoch": 0.99, + "grad_norm": 0.7504345644670651, + "learning_rate": 5.28507890583843e-06, + "loss": 0.5768, + "step": 7808 + }, + { + "epoch": 0.99, + "grad_norm": 0.7364170525226253, + "learning_rate": 5.284048991669892e-06, + "loss": 0.5746, + "step": 7809 + }, + { + "epoch": 0.99, + "grad_norm": 0.9293607265773179, + "learning_rate": 5.283019065410298e-06, + "loss": 0.579, + "step": 7810 + }, + { + "epoch": 1.0, + "grad_norm": 0.8804876851137308, + "learning_rate": 5.281989127103486e-06, + "loss": 0.6288, + "step": 7811 + }, + { + "epoch": 1.0, + "grad_norm": 0.5648661112293749, + "learning_rate": 5.280959176793299e-06, + "loss": 0.5166, + "step": 7812 + }, + { + "epoch": 1.0, + "grad_norm": 0.641143481452317, + "learning_rate": 5.279929214523577e-06, + "loss": 0.5385, + "step": 7813 + }, + { + "epoch": 1.0, + "grad_norm": 0.7492285420082699, + "learning_rate": 5.278899240338164e-06, + "loss": 0.5846, + "step": 7814 + }, + { + "epoch": 1.0, + "grad_norm": 1.0233634774609417, + "learning_rate": 5.277869254280899e-06, + "loss": 0.5459, + "step": 7815 + }, + { + "epoch": 1.0, + "grad_norm": 0.6057256830890536, + "learning_rate": 5.27683925639563e-06, + "loss": 0.543, + "step": 7816 + }, + { + "epoch": 1.0, + "grad_norm": 0.5877385092786994, + "learning_rate": 5.275809246726198e-06, + "loss": 0.5288, + "step": 7817 + }, + { + "epoch": 1.0, + "grad_norm": 0.9096373109851044, + "learning_rate": 5.2747792253164475e-06, + "loss": 0.6386, + "step": 7818 + }, + { + "epoch": 1.0, + "grad_norm": 0.628624302150182, + "learning_rate": 5.273749192210223e-06, + "loss": 0.5669, + "step": 7819 + }, + { + "epoch": 1.0, + "grad_norm": 0.5481796449252005, + "learning_rate": 5.272719147451372e-06, + "loss": 0.519, + "step": 7820 + }, + { + "epoch": 1.0, + "grad_norm": 1.0152923342247557, + "learning_rate": 5.271689091083737e-06, + "loss": 0.6053, + "step": 7821 + }, + { + "epoch": 1.0, + "grad_norm": 2.0712457774012485, + "learning_rate": 5.270659023151164e-06, + "loss": 0.614, + "step": 7822 + }, + { + "epoch": 1.0, + "grad_norm": 0.688658659315264, + "learning_rate": 5.269628943697504e-06, + "loss": 0.5533, + "step": 7823 + }, + { + "epoch": 1.0, + "grad_norm": 0.6756675346791414, + "learning_rate": 5.2685988527666e-06, + "loss": 0.5231, + "step": 7824 + }, + { + "epoch": 1.0, + "grad_norm": 0.5546344409186675, + "learning_rate": 5.267568750402302e-06, + "loss": 0.4822, + "step": 7825 + }, + { + "epoch": 1.0, + "grad_norm": 0.7301359025716571, + "learning_rate": 5.266538636648457e-06, + "loss": 0.5981, + "step": 7826 + }, + { + "epoch": 1.0, + "grad_norm": 0.6749830205492546, + "learning_rate": 5.265508511548914e-06, + "loss": 0.5418, + "step": 7827 + }, + { + "epoch": 1.0, + "grad_norm": 0.7416448440544625, + "learning_rate": 5.26447837514752e-06, + "loss": 0.5355, + "step": 7828 + }, + { + "epoch": 1.0, + "grad_norm": 0.5509875832044191, + "learning_rate": 5.263448227488129e-06, + "loss": 0.5682, + "step": 7829 + }, + { + "epoch": 1.0, + "grad_norm": 1.07279499002777, + "learning_rate": 5.26241806861459e-06, + "loss": 0.5951, + "step": 7830 + }, + { + "epoch": 1.0, + "grad_norm": 0.5859297692953886, + "learning_rate": 5.26138789857075e-06, + "loss": 0.5068, + "step": 7831 + }, + { + "epoch": 1.0, + "grad_norm": 0.7927452168983924, + "learning_rate": 5.260357717400464e-06, + "loss": 0.6395, + "step": 7832 + }, + { + "epoch": 1.0, + "grad_norm": 0.6918959233784057, + "learning_rate": 5.2593275251475815e-06, + "loss": 0.5167, + "step": 7833 + }, + { + "epoch": 1.0, + "grad_norm": 0.5879944535026977, + "learning_rate": 5.258297321855955e-06, + "loss": 0.555, + "step": 7834 + }, + { + "epoch": 1.0, + "grad_norm": 0.7328910998645103, + "learning_rate": 5.257267107569437e-06, + "loss": 0.5177, + "step": 7835 + }, + { + "epoch": 1.0, + "grad_norm": 0.6526112273473237, + "learning_rate": 5.25623688233188e-06, + "loss": 0.4904, + "step": 7836 + }, + { + "epoch": 1.0, + "grad_norm": 0.780447083648694, + "learning_rate": 5.255206646187137e-06, + "loss": 0.6327, + "step": 7837 + }, + { + "epoch": 1.0, + "grad_norm": 0.603639518509909, + "learning_rate": 5.254176399179063e-06, + "loss": 0.5729, + "step": 7838 + }, + { + "epoch": 1.0, + "grad_norm": 0.8902777452328379, + "learning_rate": 5.253146141351513e-06, + "loss": 0.5984, + "step": 7839 + }, + { + "epoch": 1.0, + "grad_norm": 0.6154693779911932, + "learning_rate": 5.252115872748339e-06, + "loss": 0.5397, + "step": 7840 + }, + { + "epoch": 1.0, + "grad_norm": 0.6534417590394038, + "learning_rate": 5.2510855934134e-06, + "loss": 0.5333, + "step": 7841 + }, + { + "epoch": 1.0, + "grad_norm": 0.6595574881738436, + "learning_rate": 5.2500553033905475e-06, + "loss": 0.5312, + "step": 7842 + }, + { + "epoch": 1.0, + "grad_norm": 0.6746130595493086, + "learning_rate": 5.249025002723641e-06, + "loss": 0.5419, + "step": 7843 + }, + { + "epoch": 1.0, + "grad_norm": 0.6270313439856194, + "learning_rate": 5.247994691456536e-06, + "loss": 0.5279, + "step": 7844 + }, + { + "epoch": 1.0, + "grad_norm": 0.775804032349742, + "learning_rate": 5.24696436963309e-06, + "loss": 0.5997, + "step": 7845 + }, + { + "epoch": 1.0, + "grad_norm": 0.832411540395696, + "learning_rate": 5.2459340372971575e-06, + "loss": 0.5826, + "step": 7846 + }, + { + "epoch": 1.0, + "grad_norm": 0.6519801525397707, + "learning_rate": 5.244903694492601e-06, + "loss": 0.5549, + "step": 7847 + }, + { + "epoch": 1.0, + "grad_norm": 0.6134851027548354, + "learning_rate": 5.243873341263277e-06, + "loss": 0.4926, + "step": 7848 + }, + { + "epoch": 1.0, + "grad_norm": 0.7119932640697326, + "learning_rate": 5.242842977653043e-06, + "loss": 0.5691, + "step": 7849 + }, + { + "epoch": 1.0, + "grad_norm": 1.2222442909906404, + "learning_rate": 5.24181260370576e-06, + "loss": 0.5123, + "step": 7850 + }, + { + "epoch": 1.0, + "grad_norm": 0.5758065634254151, + "learning_rate": 5.240782219465288e-06, + "loss": 0.4688, + "step": 7851 + }, + { + "epoch": 1.0, + "grad_norm": 0.6321561441497726, + "learning_rate": 5.239751824975486e-06, + "loss": 0.462, + "step": 7852 + }, + { + "epoch": 1.0, + "grad_norm": 0.558115720826898, + "learning_rate": 5.238721420280214e-06, + "loss": 0.4966, + "step": 7853 + }, + { + "epoch": 1.0, + "grad_norm": 0.5588446667874375, + "learning_rate": 5.2376910054233345e-06, + "loss": 0.4864, + "step": 7854 + }, + { + "epoch": 1.0, + "grad_norm": 0.5867568047909654, + "learning_rate": 5.236660580448708e-06, + "loss": 0.4867, + "step": 7855 + }, + { + "epoch": 1.0, + "grad_norm": 0.5976199452892107, + "learning_rate": 5.2356301454001975e-06, + "loss": 0.4574, + "step": 7856 + }, + { + "epoch": 1.0, + "grad_norm": 0.57499500957606, + "learning_rate": 5.234599700321665e-06, + "loss": 0.4515, + "step": 7857 + }, + { + "epoch": 1.0, + "grad_norm": 0.6237006656153087, + "learning_rate": 5.233569245256972e-06, + "loss": 0.4501, + "step": 7858 + }, + { + "epoch": 1.0, + "grad_norm": 0.6916216831897695, + "learning_rate": 5.232538780249983e-06, + "loss": 0.4787, + "step": 7859 + }, + { + "epoch": 1.0, + "grad_norm": 0.6085300666010587, + "learning_rate": 5.23150830534456e-06, + "loss": 0.4869, + "step": 7860 + }, + { + "epoch": 1.0, + "grad_norm": 0.8276365431073447, + "learning_rate": 5.230477820584571e-06, + "loss": 0.5197, + "step": 7861 + }, + { + "epoch": 1.0, + "grad_norm": 0.5487516361202578, + "learning_rate": 5.2294473260138755e-06, + "loss": 0.4347, + "step": 7862 + }, + { + "epoch": 1.0, + "grad_norm": 0.5517042728327083, + "learning_rate": 5.228416821676342e-06, + "loss": 0.4777, + "step": 7863 + }, + { + "epoch": 1.0, + "grad_norm": 0.6183710365216042, + "learning_rate": 5.227386307615834e-06, + "loss": 0.5175, + "step": 7864 + }, + { + "epoch": 1.0, + "grad_norm": 0.6792680583163542, + "learning_rate": 5.226355783876216e-06, + "loss": 0.5101, + "step": 7865 + }, + { + "epoch": 1.0, + "grad_norm": 0.528404102546661, + "learning_rate": 5.225325250501356e-06, + "loss": 0.4194, + "step": 7866 + }, + { + "epoch": 1.0, + "grad_norm": 0.5584978234264941, + "learning_rate": 5.224294707535121e-06, + "loss": 0.4969, + "step": 7867 + }, + { + "epoch": 1.0, + "grad_norm": 0.6369438303944066, + "learning_rate": 5.223264155021377e-06, + "loss": 0.4655, + "step": 7868 + }, + { + "epoch": 1.0, + "grad_norm": 0.5845717616783592, + "learning_rate": 5.222233593003991e-06, + "loss": 0.4757, + "step": 7869 + }, + { + "epoch": 1.0, + "grad_norm": 0.5806164828941629, + "learning_rate": 5.2212030215268316e-06, + "loss": 0.4334, + "step": 7870 + }, + { + "epoch": 1.0, + "grad_norm": 0.6175428783073746, + "learning_rate": 5.220172440633765e-06, + "loss": 0.4572, + "step": 7871 + }, + { + "epoch": 1.0, + "grad_norm": 0.6714088486505605, + "learning_rate": 5.219141850368663e-06, + "loss": 0.5005, + "step": 7872 + }, + { + "epoch": 1.0, + "grad_norm": 0.6064130224923959, + "learning_rate": 5.218111250775392e-06, + "loss": 0.4589, + "step": 7873 + }, + { + "epoch": 1.0, + "grad_norm": 0.5881835032905863, + "learning_rate": 5.217080641897822e-06, + "loss": 0.4547, + "step": 7874 + }, + { + "epoch": 1.0, + "grad_norm": 0.5449062120445409, + "learning_rate": 5.216050023779823e-06, + "loss": 0.4079, + "step": 7875 + }, + { + "epoch": 1.0, + "grad_norm": 0.5739006332696504, + "learning_rate": 5.215019396465265e-06, + "loss": 0.466, + "step": 7876 + }, + { + "epoch": 1.0, + "grad_norm": 0.7404029981314925, + "learning_rate": 5.2139887599980165e-06, + "loss": 0.5023, + "step": 7877 + }, + { + "epoch": 1.0, + "grad_norm": 0.5806108049422828, + "learning_rate": 5.212958114421952e-06, + "loss": 0.4803, + "step": 7878 + }, + { + "epoch": 1.0, + "grad_norm": 0.6495553722544288, + "learning_rate": 5.211927459780941e-06, + "loss": 0.4933, + "step": 7879 + }, + { + "epoch": 1.0, + "grad_norm": 0.7496193297608316, + "learning_rate": 5.210896796118856e-06, + "loss": 0.5363, + "step": 7880 + }, + { + "epoch": 1.0, + "grad_norm": 0.7422031504574214, + "learning_rate": 5.209866123479568e-06, + "loss": 0.5495, + "step": 7881 + }, + { + "epoch": 1.0, + "grad_norm": 1.2325166974738717, + "learning_rate": 5.208835441906949e-06, + "loss": 0.5266, + "step": 7882 + }, + { + "epoch": 1.0, + "grad_norm": 0.5955422006311042, + "learning_rate": 5.207804751444873e-06, + "loss": 0.4479, + "step": 7883 + }, + { + "epoch": 1.0, + "grad_norm": 0.6866498561064781, + "learning_rate": 5.206774052137211e-06, + "loss": 0.5006, + "step": 7884 + }, + { + "epoch": 1.0, + "grad_norm": 0.7055277263865362, + "learning_rate": 5.205743344027841e-06, + "loss": 0.5079, + "step": 7885 + }, + { + "epoch": 1.0, + "grad_norm": 0.7765973950804936, + "learning_rate": 5.204712627160633e-06, + "loss": 0.5159, + "step": 7886 + }, + { + "epoch": 1.0, + "grad_norm": 0.8552237561245664, + "learning_rate": 5.203681901579463e-06, + "loss": 0.5298, + "step": 7887 + }, + { + "epoch": 1.0, + "grad_norm": 0.7650165748175736, + "learning_rate": 5.2026511673282055e-06, + "loss": 0.5489, + "step": 7888 + }, + { + "epoch": 1.01, + "grad_norm": 0.7784502397602112, + "learning_rate": 5.201620424450735e-06, + "loss": 0.5351, + "step": 7889 + }, + { + "epoch": 1.01, + "grad_norm": 0.717783693811607, + "learning_rate": 5.2005896729909265e-06, + "loss": 0.5285, + "step": 7890 + }, + { + "epoch": 1.01, + "grad_norm": 0.6732178759992189, + "learning_rate": 5.1995589129926584e-06, + "loss": 0.5296, + "step": 7891 + }, + { + "epoch": 1.01, + "grad_norm": 0.7143077838780056, + "learning_rate": 5.1985281444998035e-06, + "loss": 0.5036, + "step": 7892 + }, + { + "epoch": 1.01, + "grad_norm": 0.5784084365810707, + "learning_rate": 5.1974973675562415e-06, + "loss": 0.4775, + "step": 7893 + }, + { + "epoch": 1.01, + "grad_norm": 0.7220102673155449, + "learning_rate": 5.196466582205847e-06, + "loss": 0.5045, + "step": 7894 + }, + { + "epoch": 1.01, + "grad_norm": 0.6974815958785543, + "learning_rate": 5.195435788492498e-06, + "loss": 0.5339, + "step": 7895 + }, + { + "epoch": 1.01, + "grad_norm": 0.661259867635626, + "learning_rate": 5.194404986460072e-06, + "loss": 0.5089, + "step": 7896 + }, + { + "epoch": 1.01, + "grad_norm": 0.6067626326698959, + "learning_rate": 5.193374176152447e-06, + "loss": 0.4607, + "step": 7897 + }, + { + "epoch": 1.01, + "grad_norm": 0.5788263325410276, + "learning_rate": 5.192343357613501e-06, + "loss": 0.4585, + "step": 7898 + }, + { + "epoch": 1.01, + "grad_norm": 0.5907338388077803, + "learning_rate": 5.191312530887111e-06, + "loss": 0.4082, + "step": 7899 + }, + { + "epoch": 1.01, + "grad_norm": 0.6836441289086129, + "learning_rate": 5.190281696017161e-06, + "loss": 0.4782, + "step": 7900 + }, + { + "epoch": 1.01, + "grad_norm": 0.7343299955143782, + "learning_rate": 5.189250853047526e-06, + "loss": 0.4759, + "step": 7901 + }, + { + "epoch": 1.01, + "grad_norm": 0.7058577594577498, + "learning_rate": 5.1882200020220865e-06, + "loss": 0.5097, + "step": 7902 + }, + { + "epoch": 1.01, + "grad_norm": 0.5836534081409047, + "learning_rate": 5.187189142984724e-06, + "loss": 0.4643, + "step": 7903 + }, + { + "epoch": 1.01, + "grad_norm": 0.5992547612054283, + "learning_rate": 5.186158275979317e-06, + "loss": 0.5217, + "step": 7904 + }, + { + "epoch": 1.01, + "grad_norm": 0.7516662410073572, + "learning_rate": 5.185127401049747e-06, + "loss": 0.5022, + "step": 7905 + }, + { + "epoch": 1.01, + "grad_norm": 0.6707635254400446, + "learning_rate": 5.184096518239896e-06, + "loss": 0.5076, + "step": 7906 + }, + { + "epoch": 1.01, + "grad_norm": 0.9783763335461778, + "learning_rate": 5.183065627593644e-06, + "loss": 0.5246, + "step": 7907 + }, + { + "epoch": 1.01, + "grad_norm": 0.6263104276982511, + "learning_rate": 5.182034729154873e-06, + "loss": 0.4564, + "step": 7908 + }, + { + "epoch": 1.01, + "grad_norm": 0.8231500856341674, + "learning_rate": 5.1810038229674655e-06, + "loss": 0.499, + "step": 7909 + }, + { + "epoch": 1.01, + "grad_norm": 0.6786524373082691, + "learning_rate": 5.179972909075304e-06, + "loss": 0.5495, + "step": 7910 + }, + { + "epoch": 1.01, + "grad_norm": 0.8591074376454022, + "learning_rate": 5.17894198752227e-06, + "loss": 0.6068, + "step": 7911 + }, + { + "epoch": 1.01, + "grad_norm": 0.5978885680276532, + "learning_rate": 5.177911058352249e-06, + "loss": 0.4423, + "step": 7912 + }, + { + "epoch": 1.01, + "grad_norm": 0.5953491452164323, + "learning_rate": 5.1768801216091214e-06, + "loss": 0.4332, + "step": 7913 + }, + { + "epoch": 1.01, + "grad_norm": 0.6073291604471678, + "learning_rate": 5.175849177336772e-06, + "loss": 0.445, + "step": 7914 + }, + { + "epoch": 1.01, + "grad_norm": 0.5661976490389823, + "learning_rate": 5.1748182255790854e-06, + "loss": 0.4511, + "step": 7915 + }, + { + "epoch": 1.01, + "grad_norm": 0.7594191405537379, + "learning_rate": 5.173787266379946e-06, + "loss": 0.4677, + "step": 7916 + }, + { + "epoch": 1.01, + "grad_norm": 0.5494028434496976, + "learning_rate": 5.1727562997832385e-06, + "loss": 0.4489, + "step": 7917 + }, + { + "epoch": 1.01, + "grad_norm": 0.5562907041862178, + "learning_rate": 5.171725325832846e-06, + "loss": 0.4004, + "step": 7918 + }, + { + "epoch": 1.01, + "grad_norm": 0.6547373523823373, + "learning_rate": 5.170694344572656e-06, + "loss": 0.5031, + "step": 7919 + }, + { + "epoch": 1.01, + "grad_norm": 0.716585560276361, + "learning_rate": 5.169663356046554e-06, + "loss": 0.5387, + "step": 7920 + }, + { + "epoch": 1.01, + "grad_norm": 0.6218172132755906, + "learning_rate": 5.168632360298422e-06, + "loss": 0.4946, + "step": 7921 + }, + { + "epoch": 1.01, + "grad_norm": 0.6218170480458325, + "learning_rate": 5.167601357372152e-06, + "loss": 0.4442, + "step": 7922 + }, + { + "epoch": 1.01, + "grad_norm": 0.6189001511113672, + "learning_rate": 5.1665703473116276e-06, + "loss": 0.4465, + "step": 7923 + }, + { + "epoch": 1.01, + "grad_norm": 0.5814782125395176, + "learning_rate": 5.165539330160736e-06, + "loss": 0.4239, + "step": 7924 + }, + { + "epoch": 1.01, + "grad_norm": 0.6314575397965841, + "learning_rate": 5.164508305963363e-06, + "loss": 0.4803, + "step": 7925 + }, + { + "epoch": 1.01, + "grad_norm": 0.5776419503010333, + "learning_rate": 5.163477274763398e-06, + "loss": 0.4772, + "step": 7926 + }, + { + "epoch": 1.01, + "grad_norm": 0.6093649946999745, + "learning_rate": 5.162446236604727e-06, + "loss": 0.4915, + "step": 7927 + }, + { + "epoch": 1.01, + "grad_norm": 0.6529769421281877, + "learning_rate": 5.161415191531238e-06, + "loss": 0.48, + "step": 7928 + }, + { + "epoch": 1.01, + "grad_norm": 0.6524811141356149, + "learning_rate": 5.160384139586823e-06, + "loss": 0.4926, + "step": 7929 + }, + { + "epoch": 1.01, + "grad_norm": 0.7418814915815949, + "learning_rate": 5.159353080815366e-06, + "loss": 0.548, + "step": 7930 + }, + { + "epoch": 1.01, + "grad_norm": 0.880659114316506, + "learning_rate": 5.1583220152607576e-06, + "loss": 0.4367, + "step": 7931 + }, + { + "epoch": 1.01, + "grad_norm": 0.7589214795491664, + "learning_rate": 5.157290942966887e-06, + "loss": 0.4815, + "step": 7932 + }, + { + "epoch": 1.01, + "grad_norm": 0.7780801338528642, + "learning_rate": 5.156259863977642e-06, + "loss": 0.5576, + "step": 7933 + }, + { + "epoch": 1.01, + "grad_norm": 0.6991793599579742, + "learning_rate": 5.155228778336916e-06, + "loss": 0.475, + "step": 7934 + }, + { + "epoch": 1.01, + "grad_norm": 0.6366171494469306, + "learning_rate": 5.154197686088597e-06, + "loss": 0.483, + "step": 7935 + }, + { + "epoch": 1.01, + "grad_norm": 0.8350875705969815, + "learning_rate": 5.153166587276575e-06, + "loss": 0.5303, + "step": 7936 + }, + { + "epoch": 1.01, + "grad_norm": 0.6537874640664983, + "learning_rate": 5.15213548194474e-06, + "loss": 0.4723, + "step": 7937 + }, + { + "epoch": 1.01, + "grad_norm": 0.7102780936488094, + "learning_rate": 5.151104370136985e-06, + "loss": 0.5038, + "step": 7938 + }, + { + "epoch": 1.01, + "grad_norm": 0.6163822227108454, + "learning_rate": 5.150073251897197e-06, + "loss": 0.4586, + "step": 7939 + }, + { + "epoch": 1.01, + "grad_norm": 0.8634455972841117, + "learning_rate": 5.149042127269273e-06, + "loss": 0.4737, + "step": 7940 + }, + { + "epoch": 1.01, + "grad_norm": 0.6060153083147701, + "learning_rate": 5.148010996297101e-06, + "loss": 0.4322, + "step": 7941 + }, + { + "epoch": 1.01, + "grad_norm": 0.6140990846526703, + "learning_rate": 5.146979859024575e-06, + "loss": 0.4033, + "step": 7942 + }, + { + "epoch": 1.01, + "grad_norm": 0.6012467827600491, + "learning_rate": 5.1459487154955845e-06, + "loss": 0.4946, + "step": 7943 + }, + { + "epoch": 1.01, + "grad_norm": 0.636700584098543, + "learning_rate": 5.144917565754024e-06, + "loss": 0.4964, + "step": 7944 + }, + { + "epoch": 1.01, + "grad_norm": 0.7500634117767534, + "learning_rate": 5.143886409843787e-06, + "loss": 0.493, + "step": 7945 + }, + { + "epoch": 1.01, + "grad_norm": 0.5440606391210333, + "learning_rate": 5.142855247808763e-06, + "loss": 0.4259, + "step": 7946 + }, + { + "epoch": 1.01, + "grad_norm": 0.6452696746661204, + "learning_rate": 5.141824079692849e-06, + "loss": 0.4542, + "step": 7947 + }, + { + "epoch": 1.01, + "grad_norm": 0.6072678091343034, + "learning_rate": 5.140792905539936e-06, + "loss": 0.4354, + "step": 7948 + }, + { + "epoch": 1.01, + "grad_norm": 0.632203985306397, + "learning_rate": 5.1397617253939214e-06, + "loss": 0.4692, + "step": 7949 + }, + { + "epoch": 1.01, + "grad_norm": 0.6658873492179695, + "learning_rate": 5.1387305392986955e-06, + "loss": 0.5216, + "step": 7950 + }, + { + "epoch": 1.01, + "grad_norm": 0.7683551914337232, + "learning_rate": 5.137699347298153e-06, + "loss": 0.5434, + "step": 7951 + }, + { + "epoch": 1.01, + "grad_norm": 0.5920245185048284, + "learning_rate": 5.136668149436189e-06, + "loss": 0.4606, + "step": 7952 + }, + { + "epoch": 1.01, + "grad_norm": 0.7175111090016661, + "learning_rate": 5.1356369457567e-06, + "loss": 0.521, + "step": 7953 + }, + { + "epoch": 1.01, + "grad_norm": 0.5766744624633092, + "learning_rate": 5.1346057363035796e-06, + "loss": 0.4558, + "step": 7954 + }, + { + "epoch": 1.01, + "grad_norm": 0.6628743339456539, + "learning_rate": 5.133574521120723e-06, + "loss": 0.432, + "step": 7955 + }, + { + "epoch": 1.01, + "grad_norm": 0.6042426929200225, + "learning_rate": 5.132543300252026e-06, + "loss": 0.4063, + "step": 7956 + }, + { + "epoch": 1.01, + "grad_norm": 0.6178481322734461, + "learning_rate": 5.131512073741383e-06, + "loss": 0.4723, + "step": 7957 + }, + { + "epoch": 1.01, + "grad_norm": 0.7220445758293353, + "learning_rate": 5.1304808416326935e-06, + "loss": 0.4703, + "step": 7958 + }, + { + "epoch": 1.01, + "grad_norm": 0.5529690856768076, + "learning_rate": 5.12944960396985e-06, + "loss": 0.4275, + "step": 7959 + }, + { + "epoch": 1.01, + "grad_norm": 0.5661459335550141, + "learning_rate": 5.128418360796751e-06, + "loss": 0.4271, + "step": 7960 + }, + { + "epoch": 1.01, + "grad_norm": 0.7664397795751051, + "learning_rate": 5.1273871121572916e-06, + "loss": 0.5452, + "step": 7961 + }, + { + "epoch": 1.01, + "grad_norm": 0.7308487637051548, + "learning_rate": 5.126355858095371e-06, + "loss": 0.5179, + "step": 7962 + }, + { + "epoch": 1.01, + "grad_norm": 0.8226834474919791, + "learning_rate": 5.125324598654885e-06, + "loss": 0.5422, + "step": 7963 + }, + { + "epoch": 1.01, + "grad_norm": 0.5945051764222167, + "learning_rate": 5.1242933338797315e-06, + "loss": 0.4644, + "step": 7964 + }, + { + "epoch": 1.01, + "grad_norm": 0.6473159898242234, + "learning_rate": 5.123262063813809e-06, + "loss": 0.469, + "step": 7965 + }, + { + "epoch": 1.01, + "grad_norm": 0.7744216021454134, + "learning_rate": 5.1222307885010125e-06, + "loss": 0.5292, + "step": 7966 + }, + { + "epoch": 1.01, + "grad_norm": 0.644001575144423, + "learning_rate": 5.121199507985243e-06, + "loss": 0.5379, + "step": 7967 + }, + { + "epoch": 1.02, + "grad_norm": 0.8232099811801318, + "learning_rate": 5.120168222310398e-06, + "loss": 0.4931, + "step": 7968 + }, + { + "epoch": 1.02, + "grad_norm": 0.6478201516002157, + "learning_rate": 5.119136931520374e-06, + "loss": 0.4849, + "step": 7969 + }, + { + "epoch": 1.02, + "grad_norm": 0.6098348884653333, + "learning_rate": 5.118105635659072e-06, + "loss": 0.4889, + "step": 7970 + }, + { + "epoch": 1.02, + "grad_norm": 0.7094914566743494, + "learning_rate": 5.1170743347703925e-06, + "loss": 0.5458, + "step": 7971 + }, + { + "epoch": 1.02, + "grad_norm": 0.5057615961880692, + "learning_rate": 5.116043028898231e-06, + "loss": 0.4074, + "step": 7972 + }, + { + "epoch": 1.02, + "grad_norm": 0.6252288079856869, + "learning_rate": 5.1150117180864885e-06, + "loss": 0.4581, + "step": 7973 + }, + { + "epoch": 1.02, + "grad_norm": 0.7702721521291179, + "learning_rate": 5.113980402379066e-06, + "loss": 0.4721, + "step": 7974 + }, + { + "epoch": 1.02, + "grad_norm": 0.6409602156185231, + "learning_rate": 5.112949081819861e-06, + "loss": 0.4741, + "step": 7975 + }, + { + "epoch": 1.02, + "grad_norm": 0.7036416261389531, + "learning_rate": 5.1119177564527744e-06, + "loss": 0.5164, + "step": 7976 + }, + { + "epoch": 1.02, + "grad_norm": 0.6132219639646236, + "learning_rate": 5.110886426321706e-06, + "loss": 0.48, + "step": 7977 + }, + { + "epoch": 1.02, + "grad_norm": 0.6490586193228038, + "learning_rate": 5.109855091470558e-06, + "loss": 0.4333, + "step": 7978 + }, + { + "epoch": 1.02, + "grad_norm": 0.6223689032668894, + "learning_rate": 5.108823751943229e-06, + "loss": 0.4675, + "step": 7979 + }, + { + "epoch": 1.02, + "grad_norm": 0.7359589583358829, + "learning_rate": 5.107792407783621e-06, + "loss": 0.5063, + "step": 7980 + }, + { + "epoch": 1.02, + "grad_norm": 0.7285229381058957, + "learning_rate": 5.106761059035635e-06, + "loss": 0.4765, + "step": 7981 + }, + { + "epoch": 1.02, + "grad_norm": 0.8333369552148613, + "learning_rate": 5.105729705743172e-06, + "loss": 0.5086, + "step": 7982 + }, + { + "epoch": 1.02, + "grad_norm": 0.6623259111464884, + "learning_rate": 5.104698347950133e-06, + "loss": 0.471, + "step": 7983 + }, + { + "epoch": 1.02, + "grad_norm": 0.6391896574243159, + "learning_rate": 5.103666985700419e-06, + "loss": 0.3967, + "step": 7984 + }, + { + "epoch": 1.02, + "grad_norm": 0.5695048795154143, + "learning_rate": 5.102635619037933e-06, + "loss": 0.4193, + "step": 7985 + }, + { + "epoch": 1.02, + "grad_norm": 0.6258258704388905, + "learning_rate": 5.101604248006578e-06, + "loss": 0.4494, + "step": 7986 + }, + { + "epoch": 1.02, + "grad_norm": 0.5364360820448856, + "learning_rate": 5.100572872650253e-06, + "loss": 0.4197, + "step": 7987 + }, + { + "epoch": 1.02, + "grad_norm": 0.5681300376540526, + "learning_rate": 5.099541493012864e-06, + "loss": 0.4309, + "step": 7988 + }, + { + "epoch": 1.02, + "grad_norm": 0.6438331062224097, + "learning_rate": 5.098510109138311e-06, + "loss": 0.4402, + "step": 7989 + }, + { + "epoch": 1.02, + "grad_norm": 0.7032718223863288, + "learning_rate": 5.097478721070497e-06, + "loss": 0.442, + "step": 7990 + }, + { + "epoch": 1.02, + "grad_norm": 2.2396032828535386, + "learning_rate": 5.096447328853325e-06, + "loss": 0.5068, + "step": 7991 + }, + { + "epoch": 1.02, + "grad_norm": 0.6757881311741756, + "learning_rate": 5.095415932530699e-06, + "loss": 0.4868, + "step": 7992 + }, + { + "epoch": 1.02, + "grad_norm": 0.6961110881329763, + "learning_rate": 5.094384532146522e-06, + "loss": 0.5216, + "step": 7993 + }, + { + "epoch": 1.02, + "grad_norm": 0.7916969372080543, + "learning_rate": 5.093353127744698e-06, + "loss": 0.4935, + "step": 7994 + }, + { + "epoch": 1.02, + "grad_norm": 0.7472611838981202, + "learning_rate": 5.092321719369127e-06, + "loss": 0.4576, + "step": 7995 + }, + { + "epoch": 1.02, + "grad_norm": 0.7628731599237817, + "learning_rate": 5.091290307063718e-06, + "loss": 0.5065, + "step": 7996 + }, + { + "epoch": 1.02, + "grad_norm": 0.5843069535493849, + "learning_rate": 5.09025889087237e-06, + "loss": 0.4872, + "step": 7997 + }, + { + "epoch": 1.02, + "grad_norm": 0.8597133104545779, + "learning_rate": 5.0892274708389915e-06, + "loss": 0.5905, + "step": 7998 + }, + { + "epoch": 1.02, + "grad_norm": 0.8233981790207843, + "learning_rate": 5.088196047007484e-06, + "loss": 0.5543, + "step": 7999 + }, + { + "epoch": 1.02, + "grad_norm": 0.8092275421862285, + "learning_rate": 5.087164619421753e-06, + "loss": 0.5465, + "step": 8000 + }, + { + "epoch": 1.02, + "grad_norm": 1.1104018329533105, + "learning_rate": 5.0861331881257005e-06, + "loss": 0.5639, + "step": 8001 + }, + { + "epoch": 1.02, + "grad_norm": 1.7982239717209036, + "learning_rate": 5.085101753163235e-06, + "loss": 0.5469, + "step": 8002 + }, + { + "epoch": 1.02, + "grad_norm": 0.7441514512711345, + "learning_rate": 5.084070314578261e-06, + "loss": 0.5199, + "step": 8003 + }, + { + "epoch": 1.02, + "grad_norm": 0.63587168828236, + "learning_rate": 5.083038872414681e-06, + "loss": 0.4655, + "step": 8004 + }, + { + "epoch": 1.02, + "grad_norm": 0.505826891907447, + "learning_rate": 5.082007426716402e-06, + "loss": 0.4379, + "step": 8005 + }, + { + "epoch": 1.02, + "grad_norm": 0.5715545494586659, + "learning_rate": 5.080975977527329e-06, + "loss": 0.4634, + "step": 8006 + }, + { + "epoch": 1.02, + "grad_norm": 0.6008932968992711, + "learning_rate": 5.079944524891367e-06, + "loss": 0.4287, + "step": 8007 + }, + { + "epoch": 1.02, + "grad_norm": 0.5582738378860898, + "learning_rate": 5.078913068852421e-06, + "loss": 0.4634, + "step": 8008 + }, + { + "epoch": 1.02, + "grad_norm": 0.6942054465762226, + "learning_rate": 5.077881609454399e-06, + "loss": 0.4845, + "step": 8009 + }, + { + "epoch": 1.02, + "grad_norm": 6.845694199068934, + "learning_rate": 5.076850146741207e-06, + "loss": 0.5073, + "step": 8010 + }, + { + "epoch": 1.02, + "grad_norm": 0.6477070260379001, + "learning_rate": 5.075818680756749e-06, + "loss": 0.4626, + "step": 8011 + }, + { + "epoch": 1.02, + "grad_norm": 0.6164803333648873, + "learning_rate": 5.074787211544931e-06, + "loss": 0.4639, + "step": 8012 + }, + { + "epoch": 1.02, + "grad_norm": 0.6111837475395059, + "learning_rate": 5.0737557391496615e-06, + "loss": 0.4568, + "step": 8013 + }, + { + "epoch": 1.02, + "grad_norm": 0.6199208845182491, + "learning_rate": 5.0727242636148445e-06, + "loss": 0.5111, + "step": 8014 + }, + { + "epoch": 1.02, + "grad_norm": 0.6862575354801447, + "learning_rate": 5.071692784984389e-06, + "loss": 0.4912, + "step": 8015 + }, + { + "epoch": 1.02, + "grad_norm": 0.9000295164859495, + "learning_rate": 5.070661303302201e-06, + "loss": 0.4288, + "step": 8016 + }, + { + "epoch": 1.02, + "grad_norm": 0.8527616750432485, + "learning_rate": 5.069629818612186e-06, + "loss": 0.5453, + "step": 8017 + }, + { + "epoch": 1.02, + "grad_norm": 0.5989249596671377, + "learning_rate": 5.068598330958253e-06, + "loss": 0.4164, + "step": 8018 + }, + { + "epoch": 1.02, + "grad_norm": 0.6319806794869235, + "learning_rate": 5.067566840384309e-06, + "loss": 0.4262, + "step": 8019 + }, + { + "epoch": 1.02, + "grad_norm": 0.5789886376819435, + "learning_rate": 5.066535346934259e-06, + "loss": 0.4989, + "step": 8020 + }, + { + "epoch": 1.02, + "grad_norm": 0.5957754257489064, + "learning_rate": 5.065503850652014e-06, + "loss": 0.4031, + "step": 8021 + }, + { + "epoch": 1.02, + "grad_norm": 0.6239028935529184, + "learning_rate": 5.064472351581478e-06, + "loss": 0.5024, + "step": 8022 + }, + { + "epoch": 1.02, + "grad_norm": 0.8934761990814896, + "learning_rate": 5.063440849766559e-06, + "loss": 0.5414, + "step": 8023 + }, + { + "epoch": 1.02, + "grad_norm": 0.5852631394463695, + "learning_rate": 5.062409345251167e-06, + "loss": 0.4494, + "step": 8024 + }, + { + "epoch": 1.02, + "grad_norm": 0.6537072578740031, + "learning_rate": 5.0613778380792075e-06, + "loss": 0.433, + "step": 8025 + }, + { + "epoch": 1.02, + "grad_norm": 0.5423354152096477, + "learning_rate": 5.06034632829459e-06, + "loss": 0.4414, + "step": 8026 + }, + { + "epoch": 1.02, + "grad_norm": 0.6675241775017613, + "learning_rate": 5.059314815941224e-06, + "loss": 0.4474, + "step": 8027 + }, + { + "epoch": 1.02, + "grad_norm": 1.9803082593937613, + "learning_rate": 5.058283301063014e-06, + "loss": 0.5153, + "step": 8028 + }, + { + "epoch": 1.02, + "grad_norm": 0.7927231404575348, + "learning_rate": 5.057251783703871e-06, + "loss": 0.4663, + "step": 8029 + }, + { + "epoch": 1.02, + "grad_norm": 0.5923349367851576, + "learning_rate": 5.056220263907702e-06, + "loss": 0.4421, + "step": 8030 + }, + { + "epoch": 1.02, + "grad_norm": 0.5537083884691775, + "learning_rate": 5.055188741718416e-06, + "loss": 0.4337, + "step": 8031 + }, + { + "epoch": 1.02, + "grad_norm": 0.594545552648897, + "learning_rate": 5.054157217179922e-06, + "loss": 0.4941, + "step": 8032 + }, + { + "epoch": 1.02, + "grad_norm": 0.8084866070847276, + "learning_rate": 5.053125690336127e-06, + "loss": 0.5654, + "step": 8033 + }, + { + "epoch": 1.02, + "grad_norm": 0.758841269158062, + "learning_rate": 5.0520941612309425e-06, + "loss": 0.5102, + "step": 8034 + }, + { + "epoch": 1.02, + "grad_norm": 1.0174381773721153, + "learning_rate": 5.051062629908276e-06, + "loss": 0.4772, + "step": 8035 + }, + { + "epoch": 1.02, + "grad_norm": 0.7727389710012816, + "learning_rate": 5.050031096412036e-06, + "loss": 0.4727, + "step": 8036 + }, + { + "epoch": 1.02, + "grad_norm": 0.6869542875623863, + "learning_rate": 5.048999560786132e-06, + "loss": 0.4683, + "step": 8037 + }, + { + "epoch": 1.02, + "grad_norm": 0.7761629624732608, + "learning_rate": 5.047968023074474e-06, + "loss": 0.4871, + "step": 8038 + }, + { + "epoch": 1.02, + "grad_norm": 0.6529765213071616, + "learning_rate": 5.046936483320969e-06, + "loss": 0.4833, + "step": 8039 + }, + { + "epoch": 1.02, + "grad_norm": 0.8153465847016584, + "learning_rate": 5.045904941569529e-06, + "loss": 0.5018, + "step": 8040 + }, + { + "epoch": 1.02, + "grad_norm": 0.5779988652875337, + "learning_rate": 5.044873397864063e-06, + "loss": 0.4747, + "step": 8041 + }, + { + "epoch": 1.02, + "grad_norm": 0.6758674169813761, + "learning_rate": 5.0438418522484785e-06, + "loss": 0.4542, + "step": 8042 + }, + { + "epoch": 1.02, + "grad_norm": 0.9415339132924784, + "learning_rate": 5.042810304766688e-06, + "loss": 0.4939, + "step": 8043 + }, + { + "epoch": 1.02, + "grad_norm": 0.7408326508652936, + "learning_rate": 5.0417787554625984e-06, + "loss": 0.5083, + "step": 8044 + }, + { + "epoch": 1.02, + "grad_norm": 0.5427918811238637, + "learning_rate": 5.040747204380121e-06, + "loss": 0.4432, + "step": 8045 + }, + { + "epoch": 1.03, + "grad_norm": 0.6481608688800259, + "learning_rate": 5.0397156515631654e-06, + "loss": 0.4299, + "step": 8046 + }, + { + "epoch": 1.03, + "grad_norm": 0.5796860423287302, + "learning_rate": 5.038684097055641e-06, + "loss": 0.4965, + "step": 8047 + }, + { + "epoch": 1.03, + "grad_norm": 0.7174915973773783, + "learning_rate": 5.0376525409014585e-06, + "loss": 0.4193, + "step": 8048 + }, + { + "epoch": 1.03, + "grad_norm": 0.6954211386399821, + "learning_rate": 5.036620983144528e-06, + "loss": 0.4754, + "step": 8049 + }, + { + "epoch": 1.03, + "grad_norm": 0.7084335458577081, + "learning_rate": 5.03558942382876e-06, + "loss": 0.4233, + "step": 8050 + }, + { + "epoch": 1.03, + "grad_norm": 0.6916893243234785, + "learning_rate": 5.0345578629980605e-06, + "loss": 0.4413, + "step": 8051 + }, + { + "epoch": 1.03, + "grad_norm": 1.0630844949586986, + "learning_rate": 5.033526300696346e-06, + "loss": 0.526, + "step": 8052 + }, + { + "epoch": 1.03, + "grad_norm": 1.573586967354869, + "learning_rate": 5.032494736967525e-06, + "loss": 0.5498, + "step": 8053 + }, + { + "epoch": 1.03, + "grad_norm": 0.7079517205761564, + "learning_rate": 5.031463171855505e-06, + "loss": 0.4786, + "step": 8054 + }, + { + "epoch": 1.03, + "grad_norm": 0.6865691109249946, + "learning_rate": 5.030431605404199e-06, + "loss": 0.4786, + "step": 8055 + }, + { + "epoch": 1.03, + "grad_norm": 0.6232575626183916, + "learning_rate": 5.029400037657517e-06, + "loss": 0.5006, + "step": 8056 + }, + { + "epoch": 1.03, + "grad_norm": 0.8902358097940759, + "learning_rate": 5.028368468659368e-06, + "loss": 0.5555, + "step": 8057 + }, + { + "epoch": 1.03, + "grad_norm": 0.6793337767837564, + "learning_rate": 5.027336898453665e-06, + "loss": 0.5342, + "step": 8058 + }, + { + "epoch": 1.03, + "grad_norm": 0.5939482657068546, + "learning_rate": 5.026305327084318e-06, + "loss": 0.5256, + "step": 8059 + }, + { + "epoch": 1.03, + "grad_norm": 0.5916607362227726, + "learning_rate": 5.025273754595237e-06, + "loss": 0.4782, + "step": 8060 + }, + { + "epoch": 1.03, + "grad_norm": 0.69822646169608, + "learning_rate": 5.024242181030332e-06, + "loss": 0.5008, + "step": 8061 + }, + { + "epoch": 1.03, + "grad_norm": 0.752168127653501, + "learning_rate": 5.023210606433516e-06, + "loss": 0.5192, + "step": 8062 + }, + { + "epoch": 1.03, + "grad_norm": 0.9104293343091803, + "learning_rate": 5.022179030848698e-06, + "loss": 0.4992, + "step": 8063 + }, + { + "epoch": 1.03, + "grad_norm": 0.6680361321533085, + "learning_rate": 5.02114745431979e-06, + "loss": 0.4933, + "step": 8064 + }, + { + "epoch": 1.03, + "grad_norm": 0.5704580273500565, + "learning_rate": 5.020115876890702e-06, + "loss": 0.4279, + "step": 8065 + }, + { + "epoch": 1.03, + "grad_norm": 0.5712671736428463, + "learning_rate": 5.019084298605346e-06, + "loss": 0.448, + "step": 8066 + }, + { + "epoch": 1.03, + "grad_norm": 0.818470240512177, + "learning_rate": 5.018052719507632e-06, + "loss": 0.5086, + "step": 8067 + }, + { + "epoch": 1.03, + "grad_norm": 0.6867642127681672, + "learning_rate": 5.0170211396414726e-06, + "loss": 0.5067, + "step": 8068 + }, + { + "epoch": 1.03, + "grad_norm": 0.8538337240545483, + "learning_rate": 5.015989559050777e-06, + "loss": 0.5239, + "step": 8069 + }, + { + "epoch": 1.03, + "grad_norm": 0.7331282310856594, + "learning_rate": 5.014957977779455e-06, + "loss": 0.5354, + "step": 8070 + }, + { + "epoch": 1.03, + "grad_norm": 0.8696451510505061, + "learning_rate": 5.013926395871421e-06, + "loss": 0.5801, + "step": 8071 + }, + { + "epoch": 1.03, + "grad_norm": 0.7634593629744055, + "learning_rate": 5.012894813370586e-06, + "loss": 0.4981, + "step": 8072 + }, + { + "epoch": 1.03, + "grad_norm": 0.5974869664160931, + "learning_rate": 5.0118632303208595e-06, + "loss": 0.4292, + "step": 8073 + }, + { + "epoch": 1.03, + "grad_norm": 0.7414298773320958, + "learning_rate": 5.0108316467661525e-06, + "loss": 0.504, + "step": 8074 + }, + { + "epoch": 1.03, + "grad_norm": 0.7498987872156508, + "learning_rate": 5.0098000627503775e-06, + "loss": 0.4294, + "step": 8075 + }, + { + "epoch": 1.03, + "grad_norm": 0.9569737905633081, + "learning_rate": 5.008768478317443e-06, + "loss": 0.4443, + "step": 8076 + }, + { + "epoch": 1.03, + "grad_norm": 0.6765939635966391, + "learning_rate": 5.007736893511265e-06, + "loss": 0.5097, + "step": 8077 + }, + { + "epoch": 1.03, + "grad_norm": 0.6107574568996386, + "learning_rate": 5.0067053083757515e-06, + "loss": 0.4749, + "step": 8078 + }, + { + "epoch": 1.03, + "grad_norm": 0.54428483955781, + "learning_rate": 5.005673722954815e-06, + "loss": 0.4284, + "step": 8079 + }, + { + "epoch": 1.03, + "grad_norm": 0.577817880241673, + "learning_rate": 5.004642137292365e-06, + "loss": 0.5118, + "step": 8080 + }, + { + "epoch": 1.03, + "grad_norm": 0.7605463303602441, + "learning_rate": 5.003610551432315e-06, + "loss": 0.4798, + "step": 8081 + }, + { + "epoch": 1.03, + "grad_norm": 0.6804233083766142, + "learning_rate": 5.002578965418575e-06, + "loss": 0.536, + "step": 8082 + }, + { + "epoch": 1.03, + "grad_norm": 0.7684880567027497, + "learning_rate": 5.001547379295057e-06, + "loss": 0.554, + "step": 8083 + }, + { + "epoch": 1.03, + "grad_norm": 0.6047929180025671, + "learning_rate": 5.000515793105671e-06, + "loss": 0.4575, + "step": 8084 + }, + { + "epoch": 1.03, + "grad_norm": 0.619301551951031, + "learning_rate": 4.99948420689433e-06, + "loss": 0.4825, + "step": 8085 + }, + { + "epoch": 1.03, + "grad_norm": 0.7209483154140803, + "learning_rate": 4.998452620704944e-06, + "loss": 0.5081, + "step": 8086 + }, + { + "epoch": 1.03, + "grad_norm": 0.7318865073274593, + "learning_rate": 4.997421034581427e-06, + "loss": 0.4698, + "step": 8087 + }, + { + "epoch": 1.03, + "grad_norm": 0.7233770727932185, + "learning_rate": 4.9963894485676865e-06, + "loss": 0.5, + "step": 8088 + }, + { + "epoch": 1.03, + "grad_norm": 0.7438788769480187, + "learning_rate": 4.995357862707636e-06, + "loss": 0.4994, + "step": 8089 + }, + { + "epoch": 1.03, + "grad_norm": 0.6377103809597473, + "learning_rate": 4.994326277045188e-06, + "loss": 0.4386, + "step": 8090 + }, + { + "epoch": 1.03, + "grad_norm": 0.6318927812348238, + "learning_rate": 4.993294691624249e-06, + "loss": 0.5101, + "step": 8091 + }, + { + "epoch": 1.03, + "grad_norm": 0.8017299554073789, + "learning_rate": 4.992263106488736e-06, + "loss": 0.5517, + "step": 8092 + }, + { + "epoch": 1.03, + "grad_norm": 0.7065999620554689, + "learning_rate": 4.991231521682557e-06, + "loss": 0.5332, + "step": 8093 + }, + { + "epoch": 1.03, + "grad_norm": 0.6921646859919652, + "learning_rate": 4.990199937249624e-06, + "loss": 0.5236, + "step": 8094 + }, + { + "epoch": 1.03, + "grad_norm": 0.6314241567649539, + "learning_rate": 4.989168353233849e-06, + "loss": 0.5216, + "step": 8095 + }, + { + "epoch": 1.03, + "grad_norm": 0.7870602537592161, + "learning_rate": 4.988136769679143e-06, + "loss": 0.5748, + "step": 8096 + }, + { + "epoch": 1.03, + "grad_norm": 0.8868492606936104, + "learning_rate": 4.987105186629416e-06, + "loss": 0.5435, + "step": 8097 + }, + { + "epoch": 1.03, + "grad_norm": 0.5882130175175727, + "learning_rate": 4.98607360412858e-06, + "loss": 0.4647, + "step": 8098 + }, + { + "epoch": 1.03, + "grad_norm": 0.6667964928118641, + "learning_rate": 4.985042022220546e-06, + "loss": 0.5304, + "step": 8099 + }, + { + "epoch": 1.03, + "grad_norm": 0.7894263840401056, + "learning_rate": 4.9840104409492264e-06, + "loss": 0.5129, + "step": 8100 + }, + { + "epoch": 1.03, + "grad_norm": 0.6598057151425882, + "learning_rate": 4.982978860358531e-06, + "loss": 0.5547, + "step": 8101 + }, + { + "epoch": 1.03, + "grad_norm": 0.7475065550265066, + "learning_rate": 4.98194728049237e-06, + "loss": 0.4956, + "step": 8102 + }, + { + "epoch": 1.03, + "grad_norm": 0.6108297537146267, + "learning_rate": 4.9809157013946565e-06, + "loss": 0.5047, + "step": 8103 + }, + { + "epoch": 1.03, + "grad_norm": 0.8178384861045024, + "learning_rate": 4.979884123109298e-06, + "loss": 0.5725, + "step": 8104 + }, + { + "epoch": 1.03, + "grad_norm": 0.5955650242377624, + "learning_rate": 4.978852545680211e-06, + "loss": 0.4829, + "step": 8105 + }, + { + "epoch": 1.03, + "grad_norm": 0.5324535820881771, + "learning_rate": 4.977820969151302e-06, + "loss": 0.4561, + "step": 8106 + }, + { + "epoch": 1.03, + "grad_norm": 0.5801369673208406, + "learning_rate": 4.976789393566485e-06, + "loss": 0.4415, + "step": 8107 + }, + { + "epoch": 1.03, + "grad_norm": 0.5926204253813472, + "learning_rate": 4.975757818969669e-06, + "loss": 0.4245, + "step": 8108 + }, + { + "epoch": 1.03, + "grad_norm": 0.7496313150571022, + "learning_rate": 4.974726245404764e-06, + "loss": 0.4558, + "step": 8109 + }, + { + "epoch": 1.03, + "grad_norm": 0.6052156373739525, + "learning_rate": 4.973694672915684e-06, + "loss": 0.4904, + "step": 8110 + }, + { + "epoch": 1.03, + "grad_norm": 0.668297626818539, + "learning_rate": 4.972663101546337e-06, + "loss": 0.5069, + "step": 8111 + }, + { + "epoch": 1.03, + "grad_norm": 0.7076046746714018, + "learning_rate": 4.9716315313406336e-06, + "loss": 0.5435, + "step": 8112 + }, + { + "epoch": 1.03, + "grad_norm": 0.783893601648489, + "learning_rate": 4.970599962342486e-06, + "loss": 0.537, + "step": 8113 + }, + { + "epoch": 1.03, + "grad_norm": 0.6254790843764756, + "learning_rate": 4.969568394595803e-06, + "loss": 0.4416, + "step": 8114 + }, + { + "epoch": 1.03, + "grad_norm": 1.0921393311376757, + "learning_rate": 4.968536828144497e-06, + "loss": 0.5504, + "step": 8115 + }, + { + "epoch": 1.03, + "grad_norm": 0.7745381813738811, + "learning_rate": 4.967505263032476e-06, + "loss": 0.5733, + "step": 8116 + }, + { + "epoch": 1.03, + "grad_norm": 0.7768241016381884, + "learning_rate": 4.966473699303654e-06, + "loss": 0.4973, + "step": 8117 + }, + { + "epoch": 1.03, + "grad_norm": 0.5180697857474678, + "learning_rate": 4.965442137001939e-06, + "loss": 0.4182, + "step": 8118 + }, + { + "epoch": 1.03, + "grad_norm": 0.6311950676772196, + "learning_rate": 4.964410576171243e-06, + "loss": 0.4924, + "step": 8119 + }, + { + "epoch": 1.03, + "grad_norm": 0.7820784613143212, + "learning_rate": 4.9633790168554735e-06, + "loss": 0.4804, + "step": 8120 + }, + { + "epoch": 1.03, + "grad_norm": 0.7414341015896759, + "learning_rate": 4.962347459098542e-06, + "loss": 0.5369, + "step": 8121 + }, + { + "epoch": 1.03, + "grad_norm": 0.8201712140698587, + "learning_rate": 4.96131590294436e-06, + "loss": 0.4781, + "step": 8122 + }, + { + "epoch": 1.03, + "grad_norm": 0.6621278595704478, + "learning_rate": 4.960284348436837e-06, + "loss": 0.478, + "step": 8123 + }, + { + "epoch": 1.03, + "grad_norm": 0.7318612303570251, + "learning_rate": 4.959252795619881e-06, + "loss": 0.4795, + "step": 8124 + }, + { + "epoch": 1.04, + "grad_norm": 0.6843648599542215, + "learning_rate": 4.958221244537404e-06, + "loss": 0.4949, + "step": 8125 + }, + { + "epoch": 1.04, + "grad_norm": 0.8638883665995928, + "learning_rate": 4.9571896952333145e-06, + "loss": 0.5158, + "step": 8126 + }, + { + "epoch": 1.04, + "grad_norm": 0.8735920736913573, + "learning_rate": 4.956158147751523e-06, + "loss": 0.4905, + "step": 8127 + }, + { + "epoch": 1.04, + "grad_norm": 0.5738007049937406, + "learning_rate": 4.955126602135938e-06, + "loss": 0.4868, + "step": 8128 + }, + { + "epoch": 1.04, + "grad_norm": 0.6219592933355601, + "learning_rate": 4.954095058430471e-06, + "loss": 0.4739, + "step": 8129 + }, + { + "epoch": 1.04, + "grad_norm": 0.7957876962280005, + "learning_rate": 4.953063516679031e-06, + "loss": 0.5083, + "step": 8130 + }, + { + "epoch": 1.04, + "grad_norm": 0.6263593027929184, + "learning_rate": 4.952031976925528e-06, + "loss": 0.469, + "step": 8131 + }, + { + "epoch": 1.04, + "grad_norm": 0.6326247401095945, + "learning_rate": 4.951000439213869e-06, + "loss": 0.4568, + "step": 8132 + }, + { + "epoch": 1.04, + "grad_norm": 0.5686671787418717, + "learning_rate": 4.949968903587966e-06, + "loss": 0.4107, + "step": 8133 + }, + { + "epoch": 1.04, + "grad_norm": 0.6770448684844186, + "learning_rate": 4.948937370091726e-06, + "loss": 0.4586, + "step": 8134 + }, + { + "epoch": 1.04, + "grad_norm": 0.6198756006943074, + "learning_rate": 4.947905838769059e-06, + "loss": 0.4489, + "step": 8135 + }, + { + "epoch": 1.04, + "grad_norm": 0.588322239979068, + "learning_rate": 4.946874309663875e-06, + "loss": 0.4786, + "step": 8136 + }, + { + "epoch": 1.04, + "grad_norm": 0.7281434699718707, + "learning_rate": 4.945842782820081e-06, + "loss": 0.4367, + "step": 8137 + }, + { + "epoch": 1.04, + "grad_norm": 0.6656070348463957, + "learning_rate": 4.944811258281586e-06, + "loss": 0.4938, + "step": 8138 + }, + { + "epoch": 1.04, + "grad_norm": 0.769071296361864, + "learning_rate": 4.9437797360923005e-06, + "loss": 0.5517, + "step": 8139 + }, + { + "epoch": 1.04, + "grad_norm": 0.679566649812272, + "learning_rate": 4.942748216296132e-06, + "loss": 0.483, + "step": 8140 + }, + { + "epoch": 1.04, + "grad_norm": 0.7801902138019788, + "learning_rate": 4.941716698936987e-06, + "loss": 0.5542, + "step": 8141 + }, + { + "epoch": 1.04, + "grad_norm": 0.588084511685038, + "learning_rate": 4.940685184058778e-06, + "loss": 0.4881, + "step": 8142 + }, + { + "epoch": 1.04, + "grad_norm": 0.7338863472434032, + "learning_rate": 4.93965367170541e-06, + "loss": 0.4935, + "step": 8143 + }, + { + "epoch": 1.04, + "grad_norm": 0.5640269128996757, + "learning_rate": 4.938622161920793e-06, + "loss": 0.4243, + "step": 8144 + }, + { + "epoch": 1.04, + "grad_norm": 0.6597540894818068, + "learning_rate": 4.937590654748835e-06, + "loss": 0.4653, + "step": 8145 + }, + { + "epoch": 1.04, + "grad_norm": 0.6558563852646573, + "learning_rate": 4.936559150233443e-06, + "loss": 0.4711, + "step": 8146 + }, + { + "epoch": 1.04, + "grad_norm": 0.6021679819291212, + "learning_rate": 4.935527648418524e-06, + "loss": 0.4561, + "step": 8147 + }, + { + "epoch": 1.04, + "grad_norm": 0.774985474628311, + "learning_rate": 4.9344961493479885e-06, + "loss": 0.4616, + "step": 8148 + }, + { + "epoch": 1.04, + "grad_norm": 0.790357340425759, + "learning_rate": 4.9334646530657415e-06, + "loss": 0.5451, + "step": 8149 + }, + { + "epoch": 1.04, + "grad_norm": 0.6770652021077532, + "learning_rate": 4.932433159615693e-06, + "loss": 0.4564, + "step": 8150 + }, + { + "epoch": 1.04, + "grad_norm": 0.6783212701251177, + "learning_rate": 4.931401669041748e-06, + "loss": 0.4789, + "step": 8151 + }, + { + "epoch": 1.04, + "grad_norm": 0.796682217235123, + "learning_rate": 4.9303701813878144e-06, + "loss": 0.5437, + "step": 8152 + }, + { + "epoch": 1.04, + "grad_norm": 1.0071332259830925, + "learning_rate": 4.9293386966977994e-06, + "loss": 0.5755, + "step": 8153 + }, + { + "epoch": 1.04, + "grad_norm": 2.5310500434845555, + "learning_rate": 4.928307215015611e-06, + "loss": 0.5354, + "step": 8154 + }, + { + "epoch": 1.04, + "grad_norm": 0.6400968540349614, + "learning_rate": 4.9272757363851555e-06, + "loss": 0.4971, + "step": 8155 + }, + { + "epoch": 1.04, + "grad_norm": 0.7638101579938189, + "learning_rate": 4.92624426085034e-06, + "loss": 0.5345, + "step": 8156 + }, + { + "epoch": 1.04, + "grad_norm": 0.6978492382260936, + "learning_rate": 4.92521278845507e-06, + "loss": 0.4906, + "step": 8157 + }, + { + "epoch": 1.04, + "grad_norm": 0.7935373841448372, + "learning_rate": 4.924181319243253e-06, + "loss": 0.5321, + "step": 8158 + }, + { + "epoch": 1.04, + "grad_norm": 0.8144939574626947, + "learning_rate": 4.923149853258795e-06, + "loss": 0.5151, + "step": 8159 + }, + { + "epoch": 1.04, + "grad_norm": 0.5847324438157572, + "learning_rate": 4.922118390545602e-06, + "loss": 0.4286, + "step": 8160 + }, + { + "epoch": 1.04, + "grad_norm": 0.5837197646959561, + "learning_rate": 4.92108693114758e-06, + "loss": 0.4735, + "step": 8161 + }, + { + "epoch": 1.04, + "grad_norm": 0.6576688912176004, + "learning_rate": 4.9200554751086354e-06, + "loss": 0.472, + "step": 8162 + }, + { + "epoch": 1.04, + "grad_norm": 0.7806261387489312, + "learning_rate": 4.919024022472674e-06, + "loss": 0.5474, + "step": 8163 + }, + { + "epoch": 1.04, + "grad_norm": 0.7440678771464696, + "learning_rate": 4.9179925732836e-06, + "loss": 0.4915, + "step": 8164 + }, + { + "epoch": 1.04, + "grad_norm": 0.6270807035856898, + "learning_rate": 4.916961127585322e-06, + "loss": 0.4168, + "step": 8165 + }, + { + "epoch": 1.04, + "grad_norm": 0.6062933713288158, + "learning_rate": 4.91592968542174e-06, + "loss": 0.4253, + "step": 8166 + }, + { + "epoch": 1.04, + "grad_norm": 0.8093407222557867, + "learning_rate": 4.914898246836764e-06, + "loss": 0.5136, + "step": 8167 + }, + { + "epoch": 1.04, + "grad_norm": 0.6658720365941423, + "learning_rate": 4.9138668118742994e-06, + "loss": 0.5368, + "step": 8168 + }, + { + "epoch": 1.04, + "grad_norm": 0.8033865420421944, + "learning_rate": 4.912835380578249e-06, + "loss": 0.5311, + "step": 8169 + }, + { + "epoch": 1.04, + "grad_norm": 0.7002685475646482, + "learning_rate": 4.911803952992518e-06, + "loss": 0.496, + "step": 8170 + }, + { + "epoch": 1.04, + "grad_norm": 0.6824569246723416, + "learning_rate": 4.910772529161009e-06, + "loss": 0.4896, + "step": 8171 + }, + { + "epoch": 1.04, + "grad_norm": 0.6926253996415783, + "learning_rate": 4.909741109127631e-06, + "loss": 0.534, + "step": 8172 + }, + { + "epoch": 1.04, + "grad_norm": 1.101069266180054, + "learning_rate": 4.908709692936284e-06, + "loss": 0.4662, + "step": 8173 + }, + { + "epoch": 1.04, + "grad_norm": 0.7384209669025243, + "learning_rate": 4.907678280630874e-06, + "loss": 0.491, + "step": 8174 + }, + { + "epoch": 1.04, + "grad_norm": 0.6977103742536973, + "learning_rate": 4.906646872255305e-06, + "loss": 0.5196, + "step": 8175 + }, + { + "epoch": 1.04, + "grad_norm": 0.628473299941772, + "learning_rate": 4.90561546785348e-06, + "loss": 0.5175, + "step": 8176 + }, + { + "epoch": 1.04, + "grad_norm": 0.8071292100474738, + "learning_rate": 4.904584067469303e-06, + "loss": 0.5382, + "step": 8177 + }, + { + "epoch": 1.04, + "grad_norm": 0.7617304133126185, + "learning_rate": 4.903552671146675e-06, + "loss": 0.581, + "step": 8178 + }, + { + "epoch": 1.04, + "grad_norm": 0.7305998341939557, + "learning_rate": 4.902521278929504e-06, + "loss": 0.4861, + "step": 8179 + }, + { + "epoch": 1.04, + "grad_norm": 0.5931602093389495, + "learning_rate": 4.90148989086169e-06, + "loss": 0.4389, + "step": 8180 + }, + { + "epoch": 1.04, + "grad_norm": 0.8133656696254687, + "learning_rate": 4.900458506987137e-06, + "loss": 0.5124, + "step": 8181 + }, + { + "epoch": 1.04, + "grad_norm": 0.7196123470267825, + "learning_rate": 4.899427127349747e-06, + "loss": 0.4968, + "step": 8182 + }, + { + "epoch": 1.04, + "grad_norm": 0.6756696081050716, + "learning_rate": 4.898395751993423e-06, + "loss": 0.4096, + "step": 8183 + }, + { + "epoch": 1.04, + "grad_norm": 0.6306960210407471, + "learning_rate": 4.897364380962068e-06, + "loss": 0.4724, + "step": 8184 + }, + { + "epoch": 1.04, + "grad_norm": 0.6040945195355162, + "learning_rate": 4.8963330142995826e-06, + "loss": 0.4662, + "step": 8185 + }, + { + "epoch": 1.04, + "grad_norm": 0.7188606257204347, + "learning_rate": 4.895301652049869e-06, + "loss": 0.5029, + "step": 8186 + }, + { + "epoch": 1.04, + "grad_norm": 0.6713362921139758, + "learning_rate": 4.8942702942568305e-06, + "loss": 0.456, + "step": 8187 + }, + { + "epoch": 1.04, + "grad_norm": 0.7191827252342243, + "learning_rate": 4.893238940964367e-06, + "loss": 0.5407, + "step": 8188 + }, + { + "epoch": 1.04, + "grad_norm": 0.7805356564484168, + "learning_rate": 4.8922075922163804e-06, + "loss": 0.5193, + "step": 8189 + }, + { + "epoch": 1.04, + "grad_norm": 0.822079346452177, + "learning_rate": 4.891176248056771e-06, + "loss": 0.5152, + "step": 8190 + }, + { + "epoch": 1.04, + "grad_norm": 0.7058846562952281, + "learning_rate": 4.890144908529442e-06, + "loss": 0.5226, + "step": 8191 + }, + { + "epoch": 1.04, + "grad_norm": 0.64522121209394, + "learning_rate": 4.889113573678294e-06, + "loss": 0.5133, + "step": 8192 + }, + { + "epoch": 1.04, + "grad_norm": 0.7002708813522056, + "learning_rate": 4.888082243547226e-06, + "loss": 0.5312, + "step": 8193 + }, + { + "epoch": 1.04, + "grad_norm": 0.764951850368504, + "learning_rate": 4.88705091818014e-06, + "loss": 0.5119, + "step": 8194 + }, + { + "epoch": 1.04, + "grad_norm": 0.76976036414379, + "learning_rate": 4.8860195976209354e-06, + "loss": 0.5069, + "step": 8195 + }, + { + "epoch": 1.04, + "grad_norm": 0.8286293076818172, + "learning_rate": 4.884988281913512e-06, + "loss": 0.5364, + "step": 8196 + }, + { + "epoch": 1.04, + "grad_norm": 0.7572316449150073, + "learning_rate": 4.88395697110177e-06, + "loss": 0.4912, + "step": 8197 + }, + { + "epoch": 1.04, + "grad_norm": 0.7723155499331286, + "learning_rate": 4.88292566522961e-06, + "loss": 0.4772, + "step": 8198 + }, + { + "epoch": 1.04, + "grad_norm": 0.6078057153870111, + "learning_rate": 4.881894364340929e-06, + "loss": 0.517, + "step": 8199 + }, + { + "epoch": 1.04, + "grad_norm": 0.7053595877798743, + "learning_rate": 4.880863068479628e-06, + "loss": 0.5052, + "step": 8200 + }, + { + "epoch": 1.04, + "grad_norm": 0.6593743158834144, + "learning_rate": 4.879831777689606e-06, + "loss": 0.4666, + "step": 8201 + }, + { + "epoch": 1.04, + "grad_norm": 0.6361065123969063, + "learning_rate": 4.87880049201476e-06, + "loss": 0.4491, + "step": 8202 + }, + { + "epoch": 1.05, + "grad_norm": 0.5716349027885537, + "learning_rate": 4.877769211498989e-06, + "loss": 0.4283, + "step": 8203 + }, + { + "epoch": 1.05, + "grad_norm": 0.5720540039427114, + "learning_rate": 4.876737936186193e-06, + "loss": 0.4775, + "step": 8204 + }, + { + "epoch": 1.05, + "grad_norm": 0.6165947747468568, + "learning_rate": 4.875706666120269e-06, + "loss": 0.4801, + "step": 8205 + }, + { + "epoch": 1.05, + "grad_norm": 0.792961072769179, + "learning_rate": 4.874675401345116e-06, + "loss": 0.5268, + "step": 8206 + }, + { + "epoch": 1.05, + "grad_norm": 0.815287152901661, + "learning_rate": 4.8736441419046305e-06, + "loss": 0.4854, + "step": 8207 + }, + { + "epoch": 1.05, + "grad_norm": 0.9903397150894573, + "learning_rate": 4.87261288784271e-06, + "loss": 0.5335, + "step": 8208 + }, + { + "epoch": 1.05, + "grad_norm": 0.7595958288211314, + "learning_rate": 4.871581639203251e-06, + "loss": 0.5459, + "step": 8209 + }, + { + "epoch": 1.05, + "grad_norm": 0.6256464266848701, + "learning_rate": 4.8705503960301515e-06, + "loss": 0.4494, + "step": 8210 + }, + { + "epoch": 1.05, + "grad_norm": 0.6687912366964109, + "learning_rate": 4.869519158367308e-06, + "loss": 0.4776, + "step": 8211 + }, + { + "epoch": 1.05, + "grad_norm": 0.801744174634166, + "learning_rate": 4.8684879262586175e-06, + "loss": 0.5669, + "step": 8212 + }, + { + "epoch": 1.05, + "grad_norm": 0.7645578124566696, + "learning_rate": 4.867456699747975e-06, + "loss": 0.5607, + "step": 8213 + }, + { + "epoch": 1.05, + "grad_norm": 0.7216416974605512, + "learning_rate": 4.866425478879279e-06, + "loss": 0.5119, + "step": 8214 + }, + { + "epoch": 1.05, + "grad_norm": 0.7459689271359239, + "learning_rate": 4.86539426369642e-06, + "loss": 0.4735, + "step": 8215 + }, + { + "epoch": 1.05, + "grad_norm": 0.6405112781348862, + "learning_rate": 4.8643630542433005e-06, + "loss": 0.5296, + "step": 8216 + }, + { + "epoch": 1.05, + "grad_norm": 0.8086311427381001, + "learning_rate": 4.863331850563811e-06, + "loss": 0.5644, + "step": 8217 + }, + { + "epoch": 1.05, + "grad_norm": 1.0809850199777922, + "learning_rate": 4.8623006527018475e-06, + "loss": 0.5506, + "step": 8218 + }, + { + "epoch": 1.05, + "grad_norm": 0.8355964877091129, + "learning_rate": 4.861269460701306e-06, + "loss": 0.4848, + "step": 8219 + }, + { + "epoch": 1.05, + "grad_norm": 0.6083604806636115, + "learning_rate": 4.86023827460608e-06, + "loss": 0.4785, + "step": 8220 + }, + { + "epoch": 1.05, + "grad_norm": 0.8007060967062233, + "learning_rate": 4.859207094460065e-06, + "loss": 0.5167, + "step": 8221 + }, + { + "epoch": 1.05, + "grad_norm": 0.8139772792869403, + "learning_rate": 4.858175920307153e-06, + "loss": 0.5315, + "step": 8222 + }, + { + "epoch": 1.05, + "grad_norm": 0.6990414904321893, + "learning_rate": 4.857144752191238e-06, + "loss": 0.5069, + "step": 8223 + }, + { + "epoch": 1.05, + "grad_norm": 0.7490374445137454, + "learning_rate": 4.856113590156216e-06, + "loss": 0.5491, + "step": 8224 + }, + { + "epoch": 1.05, + "grad_norm": 0.7928538358900609, + "learning_rate": 4.855082434245978e-06, + "loss": 0.5334, + "step": 8225 + }, + { + "epoch": 1.05, + "grad_norm": 0.664875956858229, + "learning_rate": 4.854051284504418e-06, + "loss": 0.5, + "step": 8226 + }, + { + "epoch": 1.05, + "grad_norm": 0.8205903752505009, + "learning_rate": 4.8530201409754285e-06, + "loss": 0.5423, + "step": 8227 + }, + { + "epoch": 1.05, + "grad_norm": 0.8122183398227888, + "learning_rate": 4.8519890037029e-06, + "loss": 0.5103, + "step": 8228 + }, + { + "epoch": 1.05, + "grad_norm": 0.6594882528654233, + "learning_rate": 4.850957872730728e-06, + "loss": 0.4827, + "step": 8229 + }, + { + "epoch": 1.05, + "grad_norm": 0.6273417028965672, + "learning_rate": 4.849926748102803e-06, + "loss": 0.4645, + "step": 8230 + }, + { + "epoch": 1.05, + "grad_norm": 0.6912005844276828, + "learning_rate": 4.848895629863018e-06, + "loss": 0.4743, + "step": 8231 + }, + { + "epoch": 1.05, + "grad_norm": 0.7023085892697838, + "learning_rate": 4.847864518055261e-06, + "loss": 0.4735, + "step": 8232 + }, + { + "epoch": 1.05, + "grad_norm": 0.5654344139350759, + "learning_rate": 4.8468334127234275e-06, + "loss": 0.4346, + "step": 8233 + }, + { + "epoch": 1.05, + "grad_norm": 0.6681639496497829, + "learning_rate": 4.845802313911405e-06, + "loss": 0.4418, + "step": 8234 + }, + { + "epoch": 1.05, + "grad_norm": 0.6608505034354452, + "learning_rate": 4.844771221663086e-06, + "loss": 0.4885, + "step": 8235 + }, + { + "epoch": 1.05, + "grad_norm": 0.6250645790722317, + "learning_rate": 4.843740136022359e-06, + "loss": 0.5025, + "step": 8236 + }, + { + "epoch": 1.05, + "grad_norm": 0.7429447805268162, + "learning_rate": 4.842709057033116e-06, + "loss": 0.4715, + "step": 8237 + }, + { + "epoch": 1.05, + "grad_norm": 0.5633102581255295, + "learning_rate": 4.841677984739245e-06, + "loss": 0.4234, + "step": 8238 + }, + { + "epoch": 1.05, + "grad_norm": 0.5980258866552804, + "learning_rate": 4.8406469191846374e-06, + "loss": 0.4852, + "step": 8239 + }, + { + "epoch": 1.05, + "grad_norm": 0.7968220275413964, + "learning_rate": 4.839615860413178e-06, + "loss": 0.5105, + "step": 8240 + }, + { + "epoch": 1.05, + "grad_norm": 0.5617390826528584, + "learning_rate": 4.838584808468761e-06, + "loss": 0.4465, + "step": 8241 + }, + { + "epoch": 1.05, + "grad_norm": 0.6124345282934126, + "learning_rate": 4.837553763395274e-06, + "loss": 0.4664, + "step": 8242 + }, + { + "epoch": 1.05, + "grad_norm": 0.529315244232884, + "learning_rate": 4.836522725236604e-06, + "loss": 0.4149, + "step": 8243 + }, + { + "epoch": 1.05, + "grad_norm": 0.778035947403778, + "learning_rate": 4.835491694036638e-06, + "loss": 0.4814, + "step": 8244 + }, + { + "epoch": 1.05, + "grad_norm": 0.6791867705029339, + "learning_rate": 4.834460669839266e-06, + "loss": 0.5016, + "step": 8245 + }, + { + "epoch": 1.05, + "grad_norm": 0.5673865733446531, + "learning_rate": 4.833429652688374e-06, + "loss": 0.4406, + "step": 8246 + }, + { + "epoch": 1.05, + "grad_norm": 0.53186758935031, + "learning_rate": 4.832398642627849e-06, + "loss": 0.4186, + "step": 8247 + }, + { + "epoch": 1.05, + "grad_norm": 0.6744997106497785, + "learning_rate": 4.831367639701579e-06, + "loss": 0.4676, + "step": 8248 + }, + { + "epoch": 1.05, + "grad_norm": 0.7006821578003772, + "learning_rate": 4.830336643953449e-06, + "loss": 0.5783, + "step": 8249 + }, + { + "epoch": 1.05, + "grad_norm": 0.7494961788681345, + "learning_rate": 4.829305655427346e-06, + "loss": 0.4503, + "step": 8250 + }, + { + "epoch": 1.05, + "grad_norm": 0.7960997699671731, + "learning_rate": 4.828274674167156e-06, + "loss": 0.5512, + "step": 8251 + }, + { + "epoch": 1.05, + "grad_norm": 0.7252706700449726, + "learning_rate": 4.827243700216762e-06, + "loss": 0.5438, + "step": 8252 + }, + { + "epoch": 1.05, + "grad_norm": 0.6124893123667524, + "learning_rate": 4.826212733620054e-06, + "loss": 0.4649, + "step": 8253 + }, + { + "epoch": 1.05, + "grad_norm": 0.6201293585842711, + "learning_rate": 4.825181774420915e-06, + "loss": 0.4471, + "step": 8254 + }, + { + "epoch": 1.05, + "grad_norm": 0.6010911318106227, + "learning_rate": 4.8241508226632285e-06, + "loss": 0.4478, + "step": 8255 + }, + { + "epoch": 1.05, + "grad_norm": 0.6629589461654971, + "learning_rate": 4.82311987839088e-06, + "loss": 0.468, + "step": 8256 + }, + { + "epoch": 1.05, + "grad_norm": 0.7122458328776413, + "learning_rate": 4.822088941647753e-06, + "loss": 0.4981, + "step": 8257 + }, + { + "epoch": 1.05, + "grad_norm": 0.7225105454312586, + "learning_rate": 4.821058012477731e-06, + "loss": 0.4917, + "step": 8258 + }, + { + "epoch": 1.05, + "grad_norm": 0.6356762569082717, + "learning_rate": 4.820027090924698e-06, + "loss": 0.5199, + "step": 8259 + }, + { + "epoch": 1.05, + "grad_norm": 0.7390531461066636, + "learning_rate": 4.818996177032536e-06, + "loss": 0.4906, + "step": 8260 + }, + { + "epoch": 1.05, + "grad_norm": 0.6715414809108625, + "learning_rate": 4.817965270845129e-06, + "loss": 0.4907, + "step": 8261 + }, + { + "epoch": 1.05, + "grad_norm": 0.7872491401963878, + "learning_rate": 4.8169343724063574e-06, + "loss": 0.521, + "step": 8262 + }, + { + "epoch": 1.05, + "grad_norm": 0.8312153903219698, + "learning_rate": 4.8159034817601055e-06, + "loss": 0.5818, + "step": 8263 + }, + { + "epoch": 1.05, + "grad_norm": 0.7546787280603171, + "learning_rate": 4.814872598950255e-06, + "loss": 0.5496, + "step": 8264 + }, + { + "epoch": 1.05, + "grad_norm": 0.7716455672093985, + "learning_rate": 4.813841724020684e-06, + "loss": 0.5266, + "step": 8265 + }, + { + "epoch": 1.05, + "grad_norm": 0.7240966792837004, + "learning_rate": 4.812810857015278e-06, + "loss": 0.5492, + "step": 8266 + }, + { + "epoch": 1.05, + "grad_norm": 0.6694567122072812, + "learning_rate": 4.811779997977914e-06, + "loss": 0.4808, + "step": 8267 + }, + { + "epoch": 1.05, + "grad_norm": 0.8307680593191434, + "learning_rate": 4.8107491469524756e-06, + "loss": 0.5309, + "step": 8268 + }, + { + "epoch": 1.05, + "grad_norm": 0.6751536402282458, + "learning_rate": 4.809718303982841e-06, + "loss": 0.4805, + "step": 8269 + }, + { + "epoch": 1.05, + "grad_norm": 0.6384524355478715, + "learning_rate": 4.8086874691128896e-06, + "loss": 0.4779, + "step": 8270 + }, + { + "epoch": 1.05, + "grad_norm": 0.8092827043226495, + "learning_rate": 4.807656642386501e-06, + "loss": 0.5593, + "step": 8271 + }, + { + "epoch": 1.05, + "grad_norm": 0.6841599902671923, + "learning_rate": 4.806625823847555e-06, + "loss": 0.4846, + "step": 8272 + }, + { + "epoch": 1.05, + "grad_norm": 0.6388523843509164, + "learning_rate": 4.80559501353993e-06, + "loss": 0.4693, + "step": 8273 + }, + { + "epoch": 1.05, + "grad_norm": 0.686568112462965, + "learning_rate": 4.804564211507504e-06, + "loss": 0.4866, + "step": 8274 + }, + { + "epoch": 1.05, + "grad_norm": 0.7377607432941956, + "learning_rate": 4.803533417794155e-06, + "loss": 0.5379, + "step": 8275 + }, + { + "epoch": 1.05, + "grad_norm": 0.6125103250502069, + "learning_rate": 4.80250263244376e-06, + "loss": 0.45, + "step": 8276 + }, + { + "epoch": 1.05, + "grad_norm": 0.612603928040167, + "learning_rate": 4.8014718555001964e-06, + "loss": 0.5093, + "step": 8277 + }, + { + "epoch": 1.05, + "grad_norm": 0.6598029494031642, + "learning_rate": 4.800441087007342e-06, + "loss": 0.5087, + "step": 8278 + }, + { + "epoch": 1.05, + "grad_norm": 0.6311228028854338, + "learning_rate": 4.7994103270090735e-06, + "loss": 0.5057, + "step": 8279 + }, + { + "epoch": 1.05, + "grad_norm": 0.7268557525529523, + "learning_rate": 4.798379575549266e-06, + "loss": 0.5394, + "step": 8280 + }, + { + "epoch": 1.05, + "grad_norm": 0.5701984099904688, + "learning_rate": 4.797348832671796e-06, + "loss": 0.4342, + "step": 8281 + }, + { + "epoch": 1.06, + "grad_norm": 0.6421093650108104, + "learning_rate": 4.796318098420538e-06, + "loss": 0.4723, + "step": 8282 + }, + { + "epoch": 1.06, + "grad_norm": 0.8177789621404639, + "learning_rate": 4.795287372839368e-06, + "loss": 0.5494, + "step": 8283 + }, + { + "epoch": 1.06, + "grad_norm": 0.9362342834889982, + "learning_rate": 4.794256655972161e-06, + "loss": 0.5064, + "step": 8284 + }, + { + "epoch": 1.06, + "grad_norm": 0.7045524933158974, + "learning_rate": 4.79322594786279e-06, + "loss": 0.509, + "step": 8285 + }, + { + "epoch": 1.06, + "grad_norm": 0.8361017931108989, + "learning_rate": 4.7921952485551295e-06, + "loss": 0.5002, + "step": 8286 + }, + { + "epoch": 1.06, + "grad_norm": 0.6032002726730082, + "learning_rate": 4.791164558093054e-06, + "loss": 0.4493, + "step": 8287 + }, + { + "epoch": 1.06, + "grad_norm": 0.6165107148119306, + "learning_rate": 4.790133876520435e-06, + "loss": 0.4552, + "step": 8288 + }, + { + "epoch": 1.06, + "grad_norm": 0.721207995116974, + "learning_rate": 4.789103203881147e-06, + "loss": 0.5762, + "step": 8289 + }, + { + "epoch": 1.06, + "grad_norm": 0.7628121153743485, + "learning_rate": 4.7880725402190595e-06, + "loss": 0.5021, + "step": 8290 + }, + { + "epoch": 1.06, + "grad_norm": 0.8310262869235524, + "learning_rate": 4.787041885578048e-06, + "loss": 0.5734, + "step": 8291 + }, + { + "epoch": 1.06, + "grad_norm": 0.774089249774509, + "learning_rate": 4.7860112400019834e-06, + "loss": 0.5021, + "step": 8292 + }, + { + "epoch": 1.06, + "grad_norm": 0.6050785921716553, + "learning_rate": 4.784980603534737e-06, + "loss": 0.4829, + "step": 8293 + }, + { + "epoch": 1.06, + "grad_norm": 0.6850084566210694, + "learning_rate": 4.783949976220179e-06, + "loss": 0.5018, + "step": 8294 + }, + { + "epoch": 1.06, + "grad_norm": 1.0573215637059092, + "learning_rate": 4.782919358102179e-06, + "loss": 0.4795, + "step": 8295 + }, + { + "epoch": 1.06, + "grad_norm": 0.6547241669697361, + "learning_rate": 4.78188874922461e-06, + "loss": 0.4866, + "step": 8296 + }, + { + "epoch": 1.06, + "grad_norm": 0.6478676320357795, + "learning_rate": 4.7808581496313385e-06, + "loss": 0.4525, + "step": 8297 + }, + { + "epoch": 1.06, + "grad_norm": 0.6843452654307315, + "learning_rate": 4.779827559366236e-06, + "loss": 0.4987, + "step": 8298 + }, + { + "epoch": 1.06, + "grad_norm": 0.749033531854335, + "learning_rate": 4.778796978473171e-06, + "loss": 0.4797, + "step": 8299 + }, + { + "epoch": 1.06, + "grad_norm": 0.8176461849848516, + "learning_rate": 4.777766406996011e-06, + "loss": 0.5129, + "step": 8300 + }, + { + "epoch": 1.06, + "grad_norm": 0.5705261620536343, + "learning_rate": 4.776735844978626e-06, + "loss": 0.4314, + "step": 8301 + }, + { + "epoch": 1.06, + "grad_norm": 0.6700612230120924, + "learning_rate": 4.77570529246488e-06, + "loss": 0.4941, + "step": 8302 + }, + { + "epoch": 1.06, + "grad_norm": 0.6228444338642064, + "learning_rate": 4.774674749498645e-06, + "loss": 0.5081, + "step": 8303 + }, + { + "epoch": 1.06, + "grad_norm": 0.7172664407984548, + "learning_rate": 4.773644216123785e-06, + "loss": 0.5206, + "step": 8304 + }, + { + "epoch": 1.06, + "grad_norm": 0.7464573812938363, + "learning_rate": 4.772613692384168e-06, + "loss": 0.5402, + "step": 8305 + }, + { + "epoch": 1.06, + "grad_norm": 0.7071676505134432, + "learning_rate": 4.77158317832366e-06, + "loss": 0.5094, + "step": 8306 + }, + { + "epoch": 1.06, + "grad_norm": 0.5489632515712463, + "learning_rate": 4.770552673986125e-06, + "loss": 0.4181, + "step": 8307 + }, + { + "epoch": 1.06, + "grad_norm": 0.7151312126475369, + "learning_rate": 4.7695221794154315e-06, + "loss": 0.5255, + "step": 8308 + }, + { + "epoch": 1.06, + "grad_norm": 0.8168589284014309, + "learning_rate": 4.768491694655441e-06, + "loss": 0.5374, + "step": 8309 + }, + { + "epoch": 1.06, + "grad_norm": 0.7398942962946595, + "learning_rate": 4.7674612197500194e-06, + "loss": 0.5418, + "step": 8310 + }, + { + "epoch": 1.06, + "grad_norm": 0.6906132880351743, + "learning_rate": 4.76643075474303e-06, + "loss": 0.5171, + "step": 8311 + }, + { + "epoch": 1.06, + "grad_norm": 0.7496395408570655, + "learning_rate": 4.7654002996783375e-06, + "loss": 0.4947, + "step": 8312 + }, + { + "epoch": 1.06, + "grad_norm": 0.6958341591522413, + "learning_rate": 4.764369854599805e-06, + "loss": 0.5191, + "step": 8313 + }, + { + "epoch": 1.06, + "grad_norm": 0.7685756429343458, + "learning_rate": 4.763339419551292e-06, + "loss": 0.5214, + "step": 8314 + }, + { + "epoch": 1.06, + "grad_norm": 0.6372399625193196, + "learning_rate": 4.762308994576666e-06, + "loss": 0.4885, + "step": 8315 + }, + { + "epoch": 1.06, + "grad_norm": 0.6102657045220327, + "learning_rate": 4.7612785797197865e-06, + "loss": 0.4684, + "step": 8316 + }, + { + "epoch": 1.06, + "grad_norm": 0.6579304653084039, + "learning_rate": 4.760248175024515e-06, + "loss": 0.4952, + "step": 8317 + }, + { + "epoch": 1.06, + "grad_norm": 0.7343753218958654, + "learning_rate": 4.759217780534713e-06, + "loss": 0.4803, + "step": 8318 + }, + { + "epoch": 1.06, + "grad_norm": 0.7741027393340881, + "learning_rate": 4.758187396294241e-06, + "loss": 0.529, + "step": 8319 + }, + { + "epoch": 1.06, + "grad_norm": 0.7667035487416406, + "learning_rate": 4.7571570223469575e-06, + "loss": 0.5363, + "step": 8320 + }, + { + "epoch": 1.06, + "grad_norm": 0.6107481436895474, + "learning_rate": 4.756126658736725e-06, + "loss": 0.4528, + "step": 8321 + }, + { + "epoch": 1.06, + "grad_norm": 0.6737471703064847, + "learning_rate": 4.7550963055074e-06, + "loss": 0.4782, + "step": 8322 + }, + { + "epoch": 1.06, + "grad_norm": 0.8540866683262708, + "learning_rate": 4.754065962702843e-06, + "loss": 0.532, + "step": 8323 + }, + { + "epoch": 1.06, + "grad_norm": 1.1902404904276669, + "learning_rate": 4.753035630366913e-06, + "loss": 0.5155, + "step": 8324 + }, + { + "epoch": 1.06, + "grad_norm": 0.5897317747309352, + "learning_rate": 4.752005308543466e-06, + "loss": 0.4303, + "step": 8325 + }, + { + "epoch": 1.06, + "grad_norm": 0.7012971327486279, + "learning_rate": 4.750974997276361e-06, + "loss": 0.4871, + "step": 8326 + }, + { + "epoch": 1.06, + "grad_norm": 0.7640488877322428, + "learning_rate": 4.749944696609453e-06, + "loss": 0.541, + "step": 8327 + }, + { + "epoch": 1.06, + "grad_norm": 0.7737930288919439, + "learning_rate": 4.748914406586602e-06, + "loss": 0.5053, + "step": 8328 + }, + { + "epoch": 1.06, + "grad_norm": 0.5911253142058805, + "learning_rate": 4.7478841272516616e-06, + "loss": 0.4478, + "step": 8329 + }, + { + "epoch": 1.06, + "grad_norm": 0.5795290210295466, + "learning_rate": 4.746853858648489e-06, + "loss": 0.4683, + "step": 8330 + }, + { + "epoch": 1.06, + "grad_norm": 0.6489249567019832, + "learning_rate": 4.745823600820939e-06, + "loss": 0.4687, + "step": 8331 + }, + { + "epoch": 1.06, + "grad_norm": 0.7705164626644723, + "learning_rate": 4.7447933538128634e-06, + "loss": 0.5312, + "step": 8332 + }, + { + "epoch": 1.06, + "grad_norm": 0.6608120498356257, + "learning_rate": 4.743763117668121e-06, + "loss": 0.5045, + "step": 8333 + }, + { + "epoch": 1.06, + "grad_norm": 0.7930373929415717, + "learning_rate": 4.742732892430565e-06, + "loss": 0.5384, + "step": 8334 + }, + { + "epoch": 1.06, + "grad_norm": 0.7543329925328313, + "learning_rate": 4.741702678144047e-06, + "loss": 0.5222, + "step": 8335 + }, + { + "epoch": 1.06, + "grad_norm": 0.5520664241237638, + "learning_rate": 4.74067247485242e-06, + "loss": 0.4485, + "step": 8336 + }, + { + "epoch": 1.06, + "grad_norm": 0.7122324134902953, + "learning_rate": 4.739642282599538e-06, + "loss": 0.4918, + "step": 8337 + }, + { + "epoch": 1.06, + "grad_norm": 0.680478604032049, + "learning_rate": 4.7386121014292505e-06, + "loss": 0.5085, + "step": 8338 + }, + { + "epoch": 1.06, + "grad_norm": 0.5600107710486806, + "learning_rate": 4.737581931385411e-06, + "loss": 0.4584, + "step": 8339 + }, + { + "epoch": 1.06, + "grad_norm": 0.6896580366720484, + "learning_rate": 4.73655177251187e-06, + "loss": 0.4906, + "step": 8340 + }, + { + "epoch": 1.06, + "grad_norm": 0.963977787932417, + "learning_rate": 4.73552162485248e-06, + "loss": 0.5213, + "step": 8341 + }, + { + "epoch": 1.06, + "grad_norm": 0.7043908635505579, + "learning_rate": 4.734491488451087e-06, + "loss": 0.4862, + "step": 8342 + }, + { + "epoch": 1.06, + "grad_norm": 0.6881430654845729, + "learning_rate": 4.733461363351544e-06, + "loss": 0.5056, + "step": 8343 + }, + { + "epoch": 1.06, + "grad_norm": 0.7424707899379075, + "learning_rate": 4.7324312495976994e-06, + "loss": 0.5498, + "step": 8344 + }, + { + "epoch": 1.06, + "grad_norm": 0.6172766748002575, + "learning_rate": 4.731401147233402e-06, + "loss": 0.471, + "step": 8345 + }, + { + "epoch": 1.06, + "grad_norm": 0.6407102122432393, + "learning_rate": 4.730371056302498e-06, + "loss": 0.4529, + "step": 8346 + }, + { + "epoch": 1.06, + "grad_norm": 0.6719462297994628, + "learning_rate": 4.7293409768488365e-06, + "loss": 0.4964, + "step": 8347 + }, + { + "epoch": 1.06, + "grad_norm": 0.6584934755649497, + "learning_rate": 4.728310908916266e-06, + "loss": 0.4518, + "step": 8348 + }, + { + "epoch": 1.06, + "grad_norm": 0.662125686194116, + "learning_rate": 4.727280852548632e-06, + "loss": 0.4953, + "step": 8349 + }, + { + "epoch": 1.06, + "grad_norm": 0.6914782045059399, + "learning_rate": 4.726250807789779e-06, + "loss": 0.539, + "step": 8350 + }, + { + "epoch": 1.06, + "grad_norm": 0.6847308364637184, + "learning_rate": 4.725220774683555e-06, + "loss": 0.4303, + "step": 8351 + }, + { + "epoch": 1.06, + "grad_norm": 0.598398400830368, + "learning_rate": 4.724190753273803e-06, + "loss": 0.4903, + "step": 8352 + }, + { + "epoch": 1.06, + "grad_norm": 0.7146368773370222, + "learning_rate": 4.723160743604371e-06, + "loss": 0.492, + "step": 8353 + }, + { + "epoch": 1.06, + "grad_norm": 0.686039990221673, + "learning_rate": 4.7221307457191014e-06, + "loss": 0.4945, + "step": 8354 + }, + { + "epoch": 1.06, + "grad_norm": 0.6349715579352723, + "learning_rate": 4.721100759661838e-06, + "loss": 0.5377, + "step": 8355 + }, + { + "epoch": 1.06, + "grad_norm": 0.7781684646665405, + "learning_rate": 4.720070785476424e-06, + "loss": 0.5561, + "step": 8356 + }, + { + "epoch": 1.06, + "grad_norm": 0.8482692988535854, + "learning_rate": 4.719040823206702e-06, + "loss": 0.5792, + "step": 8357 + }, + { + "epoch": 1.06, + "grad_norm": 0.7842530178277606, + "learning_rate": 4.718010872896515e-06, + "loss": 0.5251, + "step": 8358 + }, + { + "epoch": 1.06, + "grad_norm": 0.7524235995930882, + "learning_rate": 4.716980934589703e-06, + "loss": 0.5123, + "step": 8359 + }, + { + "epoch": 1.07, + "grad_norm": 0.8094302828328926, + "learning_rate": 4.7159510083301095e-06, + "loss": 0.512, + "step": 8360 + }, + { + "epoch": 1.07, + "grad_norm": 0.5494737537226451, + "learning_rate": 4.714921094161573e-06, + "loss": 0.4264, + "step": 8361 + }, + { + "epoch": 1.07, + "grad_norm": 0.7242920027653952, + "learning_rate": 4.713891192127935e-06, + "loss": 0.538, + "step": 8362 + }, + { + "epoch": 1.07, + "grad_norm": 0.702317673340286, + "learning_rate": 4.712861302273034e-06, + "loss": 0.4911, + "step": 8363 + }, + { + "epoch": 1.07, + "grad_norm": 0.6389843628300853, + "learning_rate": 4.7118314246407084e-06, + "loss": 0.4368, + "step": 8364 + }, + { + "epoch": 1.07, + "grad_norm": 0.6049971019201188, + "learning_rate": 4.7108015592748005e-06, + "loss": 0.4483, + "step": 8365 + }, + { + "epoch": 1.07, + "grad_norm": 0.6903272201243086, + "learning_rate": 4.709771706219145e-06, + "loss": 0.4091, + "step": 8366 + }, + { + "epoch": 1.07, + "grad_norm": 0.7160418672888511, + "learning_rate": 4.708741865517581e-06, + "loss": 0.5074, + "step": 8367 + }, + { + "epoch": 1.07, + "grad_norm": 0.6924792251940831, + "learning_rate": 4.7077120372139455e-06, + "loss": 0.4431, + "step": 8368 + }, + { + "epoch": 1.07, + "grad_norm": 0.6828736714010408, + "learning_rate": 4.706682221352074e-06, + "loss": 0.4541, + "step": 8369 + }, + { + "epoch": 1.07, + "grad_norm": 0.7528016280901391, + "learning_rate": 4.705652417975803e-06, + "loss": 0.5315, + "step": 8370 + }, + { + "epoch": 1.07, + "grad_norm": 0.6899681482329726, + "learning_rate": 4.704622627128969e-06, + "loss": 0.541, + "step": 8371 + }, + { + "epoch": 1.07, + "grad_norm": 0.8629331194423644, + "learning_rate": 4.703592848855405e-06, + "loss": 0.6034, + "step": 8372 + }, + { + "epoch": 1.07, + "grad_norm": 0.765164791092945, + "learning_rate": 4.7025630831989465e-06, + "loss": 0.5053, + "step": 8373 + }, + { + "epoch": 1.07, + "grad_norm": 0.6877677701275337, + "learning_rate": 4.701533330203427e-06, + "loss": 0.5148, + "step": 8374 + }, + { + "epoch": 1.07, + "grad_norm": 0.7151067895192951, + "learning_rate": 4.70050358991268e-06, + "loss": 0.5062, + "step": 8375 + }, + { + "epoch": 1.07, + "grad_norm": 0.7062122950474459, + "learning_rate": 4.699473862370535e-06, + "loss": 0.4794, + "step": 8376 + }, + { + "epoch": 1.07, + "grad_norm": 0.5964550116294689, + "learning_rate": 4.698444147620831e-06, + "loss": 0.451, + "step": 8377 + }, + { + "epoch": 1.07, + "grad_norm": 0.5786820849146606, + "learning_rate": 4.697414445707395e-06, + "loss": 0.4311, + "step": 8378 + }, + { + "epoch": 1.07, + "grad_norm": 0.6020010219440008, + "learning_rate": 4.696384756674059e-06, + "loss": 0.519, + "step": 8379 + }, + { + "epoch": 1.07, + "grad_norm": 0.7530596084436874, + "learning_rate": 4.695355080564655e-06, + "loss": 0.523, + "step": 8380 + }, + { + "epoch": 1.07, + "grad_norm": 0.6512234287017661, + "learning_rate": 4.69432541742301e-06, + "loss": 0.4924, + "step": 8381 + }, + { + "epoch": 1.07, + "grad_norm": 0.7564569630147391, + "learning_rate": 4.6932957672929565e-06, + "loss": 0.5666, + "step": 8382 + }, + { + "epoch": 1.07, + "grad_norm": 1.384880958266251, + "learning_rate": 4.692266130218322e-06, + "loss": 0.4818, + "step": 8383 + }, + { + "epoch": 1.07, + "grad_norm": 0.6994452017916899, + "learning_rate": 4.6912365062429334e-06, + "loss": 0.4573, + "step": 8384 + }, + { + "epoch": 1.07, + "grad_norm": 0.6832932871007706, + "learning_rate": 4.690206895410622e-06, + "loss": 0.4641, + "step": 8385 + }, + { + "epoch": 1.07, + "grad_norm": 0.8228968802336903, + "learning_rate": 4.689177297765212e-06, + "loss": 0.5673, + "step": 8386 + }, + { + "epoch": 1.07, + "grad_norm": 0.754176995111916, + "learning_rate": 4.688147713350532e-06, + "loss": 0.5774, + "step": 8387 + }, + { + "epoch": 1.07, + "grad_norm": 0.8485384255773295, + "learning_rate": 4.687118142210407e-06, + "loss": 0.5713, + "step": 8388 + }, + { + "epoch": 1.07, + "grad_norm": 0.9218049433550063, + "learning_rate": 4.686088584388661e-06, + "loss": 0.5258, + "step": 8389 + }, + { + "epoch": 1.07, + "grad_norm": 0.7431226167112078, + "learning_rate": 4.685059039929123e-06, + "loss": 0.5334, + "step": 8390 + }, + { + "epoch": 1.07, + "grad_norm": 0.7642740832397854, + "learning_rate": 4.684029508875615e-06, + "loss": 0.5194, + "step": 8391 + }, + { + "epoch": 1.07, + "grad_norm": 0.6598599128505827, + "learning_rate": 4.682999991271961e-06, + "loss": 0.4935, + "step": 8392 + }, + { + "epoch": 1.07, + "grad_norm": 0.6323138682990631, + "learning_rate": 4.681970487161984e-06, + "loss": 0.496, + "step": 8393 + }, + { + "epoch": 1.07, + "grad_norm": 1.4095994915849337, + "learning_rate": 4.680940996589506e-06, + "loss": 0.5157, + "step": 8394 + }, + { + "epoch": 1.07, + "grad_norm": 0.7512926439974584, + "learning_rate": 4.6799115195983515e-06, + "loss": 0.48, + "step": 8395 + }, + { + "epoch": 1.07, + "grad_norm": 0.7554170976018972, + "learning_rate": 4.678882056232341e-06, + "loss": 0.5167, + "step": 8396 + }, + { + "epoch": 1.07, + "grad_norm": 0.6039114237446409, + "learning_rate": 4.677852606535295e-06, + "loss": 0.4556, + "step": 8397 + }, + { + "epoch": 1.07, + "grad_norm": 0.5463591241004426, + "learning_rate": 4.6768231705510346e-06, + "loss": 0.4468, + "step": 8398 + }, + { + "epoch": 1.07, + "grad_norm": 0.6700800719790695, + "learning_rate": 4.675793748323378e-06, + "loss": 0.4829, + "step": 8399 + }, + { + "epoch": 1.07, + "grad_norm": 0.5619466381369013, + "learning_rate": 4.6747643398961465e-06, + "loss": 0.4192, + "step": 8400 + }, + { + "epoch": 1.07, + "grad_norm": 0.660468492810452, + "learning_rate": 4.6737349453131556e-06, + "loss": 0.4634, + "step": 8401 + }, + { + "epoch": 1.07, + "grad_norm": 0.7212004286905599, + "learning_rate": 4.672705564618228e-06, + "loss": 0.5062, + "step": 8402 + }, + { + "epoch": 1.07, + "grad_norm": 0.571651013072029, + "learning_rate": 4.671676197855178e-06, + "loss": 0.4592, + "step": 8403 + }, + { + "epoch": 1.07, + "grad_norm": 0.6756887980190536, + "learning_rate": 4.670646845067823e-06, + "loss": 0.494, + "step": 8404 + }, + { + "epoch": 1.07, + "grad_norm": 0.7638395564295171, + "learning_rate": 4.669617506299979e-06, + "loss": 0.5114, + "step": 8405 + }, + { + "epoch": 1.07, + "grad_norm": 0.7078517964132525, + "learning_rate": 4.668588181595462e-06, + "loss": 0.4377, + "step": 8406 + }, + { + "epoch": 1.07, + "grad_norm": 0.8162889833163972, + "learning_rate": 4.667558870998088e-06, + "loss": 0.5008, + "step": 8407 + }, + { + "epoch": 1.07, + "grad_norm": 0.6307686552880518, + "learning_rate": 4.66652957455167e-06, + "loss": 0.5279, + "step": 8408 + }, + { + "epoch": 1.07, + "grad_norm": 0.8860161005715478, + "learning_rate": 4.665500292300022e-06, + "loss": 0.5348, + "step": 8409 + }, + { + "epoch": 1.07, + "grad_norm": 0.753833515737956, + "learning_rate": 4.6644710242869586e-06, + "loss": 0.5833, + "step": 8410 + }, + { + "epoch": 1.07, + "grad_norm": 0.7902452914431704, + "learning_rate": 4.66344177055629e-06, + "loss": 0.5909, + "step": 8411 + }, + { + "epoch": 1.07, + "grad_norm": 0.8910486117109179, + "learning_rate": 4.662412531151831e-06, + "loss": 0.5373, + "step": 8412 + }, + { + "epoch": 1.07, + "grad_norm": 0.7628166571938368, + "learning_rate": 4.661383306117392e-06, + "loss": 0.5369, + "step": 8413 + }, + { + "epoch": 1.07, + "grad_norm": 0.8626640001914128, + "learning_rate": 4.66035409549678e-06, + "loss": 0.5132, + "step": 8414 + }, + { + "epoch": 1.07, + "grad_norm": 0.6707463882512237, + "learning_rate": 4.6593248993338125e-06, + "loss": 0.5181, + "step": 8415 + }, + { + "epoch": 1.07, + "grad_norm": 0.7840943731201608, + "learning_rate": 4.658295717672295e-06, + "loss": 0.5049, + "step": 8416 + }, + { + "epoch": 1.07, + "grad_norm": 0.7362141479085723, + "learning_rate": 4.657266550556036e-06, + "loss": 0.5073, + "step": 8417 + }, + { + "epoch": 1.07, + "grad_norm": 0.7142310809842124, + "learning_rate": 4.656237398028846e-06, + "loss": 0.5513, + "step": 8418 + }, + { + "epoch": 1.07, + "grad_norm": 0.7563563098539968, + "learning_rate": 4.65520826013453e-06, + "loss": 0.5116, + "step": 8419 + }, + { + "epoch": 1.07, + "grad_norm": 0.643873280231301, + "learning_rate": 4.654179136916898e-06, + "loss": 0.4757, + "step": 8420 + }, + { + "epoch": 1.07, + "grad_norm": 0.6268166627092473, + "learning_rate": 4.653150028419754e-06, + "loss": 0.4558, + "step": 8421 + }, + { + "epoch": 1.07, + "grad_norm": 0.6315090494745018, + "learning_rate": 4.652120934686905e-06, + "loss": 0.4817, + "step": 8422 + }, + { + "epoch": 1.07, + "grad_norm": 0.730675884373918, + "learning_rate": 4.651091855762157e-06, + "loss": 0.5048, + "step": 8423 + }, + { + "epoch": 1.07, + "grad_norm": 0.9512091654455435, + "learning_rate": 4.650062791689313e-06, + "loss": 0.4413, + "step": 8424 + }, + { + "epoch": 1.07, + "grad_norm": 0.6870553837144314, + "learning_rate": 4.649033742512178e-06, + "loss": 0.5166, + "step": 8425 + }, + { + "epoch": 1.07, + "grad_norm": 0.6898670375512296, + "learning_rate": 4.648004708274553e-06, + "loss": 0.501, + "step": 8426 + }, + { + "epoch": 1.07, + "grad_norm": 0.6166967528623053, + "learning_rate": 4.646975689020244e-06, + "loss": 0.4447, + "step": 8427 + }, + { + "epoch": 1.07, + "grad_norm": 0.7622041988668076, + "learning_rate": 4.645946684793053e-06, + "loss": 0.5427, + "step": 8428 + }, + { + "epoch": 1.07, + "grad_norm": 0.9512668191239007, + "learning_rate": 4.64491769563678e-06, + "loss": 0.5007, + "step": 8429 + }, + { + "epoch": 1.07, + "grad_norm": 0.6550393821112371, + "learning_rate": 4.643888721595226e-06, + "loss": 0.4389, + "step": 8430 + }, + { + "epoch": 1.07, + "grad_norm": 0.5790761205366833, + "learning_rate": 4.64285976271219e-06, + "loss": 0.4697, + "step": 8431 + }, + { + "epoch": 1.07, + "grad_norm": 0.7332266616339383, + "learning_rate": 4.6418308190314735e-06, + "loss": 0.5028, + "step": 8432 + }, + { + "epoch": 1.07, + "grad_norm": 0.7020137781479109, + "learning_rate": 4.640801890596875e-06, + "loss": 0.4921, + "step": 8433 + }, + { + "epoch": 1.07, + "grad_norm": 0.8762488343565392, + "learning_rate": 4.639772977452192e-06, + "loss": 0.5679, + "step": 8434 + }, + { + "epoch": 1.07, + "grad_norm": 0.6958711154939297, + "learning_rate": 4.638744079641222e-06, + "loss": 0.492, + "step": 8435 + }, + { + "epoch": 1.07, + "grad_norm": 0.689114992115966, + "learning_rate": 4.6377151972077616e-06, + "loss": 0.4725, + "step": 8436 + }, + { + "epoch": 1.07, + "grad_norm": 0.7472976433703624, + "learning_rate": 4.636686330195608e-06, + "loss": 0.5361, + "step": 8437 + }, + { + "epoch": 1.07, + "grad_norm": 0.7625279445897821, + "learning_rate": 4.635657478648554e-06, + "loss": 0.5082, + "step": 8438 + }, + { + "epoch": 1.08, + "grad_norm": 0.7107890464207818, + "learning_rate": 4.6346286426104e-06, + "loss": 0.5421, + "step": 8439 + }, + { + "epoch": 1.08, + "grad_norm": 0.6283271478914435, + "learning_rate": 4.633599822124936e-06, + "loss": 0.4799, + "step": 8440 + }, + { + "epoch": 1.08, + "grad_norm": 0.7179127541771869, + "learning_rate": 4.632571017235958e-06, + "loss": 0.4939, + "step": 8441 + }, + { + "epoch": 1.08, + "grad_norm": 0.5929685599772619, + "learning_rate": 4.631542227987256e-06, + "loss": 0.4426, + "step": 8442 + }, + { + "epoch": 1.08, + "grad_norm": 0.7619415505757918, + "learning_rate": 4.630513454422625e-06, + "loss": 0.4697, + "step": 8443 + }, + { + "epoch": 1.08, + "grad_norm": 0.6145496912518909, + "learning_rate": 4.629484696585855e-06, + "loss": 0.4827, + "step": 8444 + }, + { + "epoch": 1.08, + "grad_norm": 0.7917648199349726, + "learning_rate": 4.628455954520737e-06, + "loss": 0.5028, + "step": 8445 + }, + { + "epoch": 1.08, + "grad_norm": 0.6429403948703459, + "learning_rate": 4.627427228271063e-06, + "loss": 0.4956, + "step": 8446 + }, + { + "epoch": 1.08, + "grad_norm": 0.7043783357359542, + "learning_rate": 4.626398517880621e-06, + "loss": 0.4971, + "step": 8447 + }, + { + "epoch": 1.08, + "grad_norm": 0.5603148106640026, + "learning_rate": 4.6253698233932e-06, + "loss": 0.4695, + "step": 8448 + }, + { + "epoch": 1.08, + "grad_norm": 0.7000625029904758, + "learning_rate": 4.624341144852589e-06, + "loss": 0.4331, + "step": 8449 + }, + { + "epoch": 1.08, + "grad_norm": 0.7235475734831217, + "learning_rate": 4.623312482302574e-06, + "loss": 0.4813, + "step": 8450 + }, + { + "epoch": 1.08, + "grad_norm": 0.7164336906123124, + "learning_rate": 4.622283835786942e-06, + "loss": 0.4594, + "step": 8451 + }, + { + "epoch": 1.08, + "grad_norm": 0.8073894237025039, + "learning_rate": 4.621255205349482e-06, + "loss": 0.569, + "step": 8452 + }, + { + "epoch": 1.08, + "grad_norm": 0.8459963819415339, + "learning_rate": 4.620226591033977e-06, + "loss": 0.5264, + "step": 8453 + }, + { + "epoch": 1.08, + "grad_norm": 0.7778030068358962, + "learning_rate": 4.619197992884213e-06, + "loss": 0.5397, + "step": 8454 + }, + { + "epoch": 1.08, + "grad_norm": 0.5668433521425842, + "learning_rate": 4.618169410943973e-06, + "loss": 0.4623, + "step": 8455 + }, + { + "epoch": 1.08, + "grad_norm": 0.6018822436727288, + "learning_rate": 4.61714084525704e-06, + "loss": 0.4821, + "step": 8456 + }, + { + "epoch": 1.08, + "grad_norm": 0.6914846077201479, + "learning_rate": 4.616112295867199e-06, + "loss": 0.4795, + "step": 8457 + }, + { + "epoch": 1.08, + "grad_norm": 0.626801691390319, + "learning_rate": 4.615083762818231e-06, + "loss": 0.4504, + "step": 8458 + }, + { + "epoch": 1.08, + "grad_norm": 0.5912242521165715, + "learning_rate": 4.614055246153916e-06, + "loss": 0.4401, + "step": 8459 + }, + { + "epoch": 1.08, + "grad_norm": 0.6375960596102452, + "learning_rate": 4.613026745918037e-06, + "loss": 0.4583, + "step": 8460 + }, + { + "epoch": 1.08, + "grad_norm": 0.7199161450730038, + "learning_rate": 4.611998262154373e-06, + "loss": 0.4799, + "step": 8461 + }, + { + "epoch": 1.08, + "grad_norm": 0.7765032892839526, + "learning_rate": 4.610969794906703e-06, + "loss": 0.5635, + "step": 8462 + }, + { + "epoch": 1.08, + "grad_norm": 0.7003572808161469, + "learning_rate": 4.609941344218804e-06, + "loss": 0.5241, + "step": 8463 + }, + { + "epoch": 1.08, + "grad_norm": 0.7201438386129125, + "learning_rate": 4.608912910134457e-06, + "loss": 0.5248, + "step": 8464 + }, + { + "epoch": 1.08, + "grad_norm": 0.631313299824022, + "learning_rate": 4.607884492697437e-06, + "loss": 0.4669, + "step": 8465 + }, + { + "epoch": 1.08, + "grad_norm": 0.8139530153427013, + "learning_rate": 4.606856091951523e-06, + "loss": 0.489, + "step": 8466 + }, + { + "epoch": 1.08, + "grad_norm": 0.5879956968665803, + "learning_rate": 4.605827707940488e-06, + "loss": 0.4634, + "step": 8467 + }, + { + "epoch": 1.08, + "grad_norm": 0.6685052116960838, + "learning_rate": 4.604799340708107e-06, + "loss": 0.541, + "step": 8468 + }, + { + "epoch": 1.08, + "grad_norm": 0.7104296776717307, + "learning_rate": 4.6037709902981555e-06, + "loss": 0.474, + "step": 8469 + }, + { + "epoch": 1.08, + "grad_norm": 0.6014689511117821, + "learning_rate": 4.602742656754407e-06, + "loss": 0.4802, + "step": 8470 + }, + { + "epoch": 1.08, + "grad_norm": 0.6157666300039946, + "learning_rate": 4.601714340120634e-06, + "loss": 0.4722, + "step": 8471 + }, + { + "epoch": 1.08, + "grad_norm": 0.656486953096236, + "learning_rate": 4.600686040440609e-06, + "loss": 0.4995, + "step": 8472 + }, + { + "epoch": 1.08, + "grad_norm": 0.7721062084650561, + "learning_rate": 4.599657757758103e-06, + "loss": 0.5929, + "step": 8473 + }, + { + "epoch": 1.08, + "grad_norm": 0.6894633827978588, + "learning_rate": 4.598629492116887e-06, + "loss": 0.4997, + "step": 8474 + }, + { + "epoch": 1.08, + "grad_norm": 0.8119950004487392, + "learning_rate": 4.597601243560731e-06, + "loss": 0.5158, + "step": 8475 + }, + { + "epoch": 1.08, + "grad_norm": 0.7332548052297658, + "learning_rate": 4.596573012133403e-06, + "loss": 0.5482, + "step": 8476 + }, + { + "epoch": 1.08, + "grad_norm": 0.6992240521993527, + "learning_rate": 4.595544797878673e-06, + "loss": 0.4908, + "step": 8477 + }, + { + "epoch": 1.08, + "grad_norm": 0.7172619869783801, + "learning_rate": 4.59451660084031e-06, + "loss": 0.4776, + "step": 8478 + }, + { + "epoch": 1.08, + "grad_norm": 0.5937967832691354, + "learning_rate": 4.593488421062079e-06, + "loss": 0.4863, + "step": 8479 + }, + { + "epoch": 1.08, + "grad_norm": 0.7490510484117048, + "learning_rate": 4.592460258587746e-06, + "loss": 0.4768, + "step": 8480 + }, + { + "epoch": 1.08, + "grad_norm": 0.6532429268903567, + "learning_rate": 4.591432113461079e-06, + "loss": 0.447, + "step": 8481 + }, + { + "epoch": 1.08, + "grad_norm": 0.6112587121051494, + "learning_rate": 4.59040398572584e-06, + "loss": 0.4357, + "step": 8482 + }, + { + "epoch": 1.08, + "grad_norm": 0.7077063572588193, + "learning_rate": 4.589375875425795e-06, + "loss": 0.4799, + "step": 8483 + }, + { + "epoch": 1.08, + "grad_norm": 0.6327200742886288, + "learning_rate": 4.5883477826047075e-06, + "loss": 0.4513, + "step": 8484 + }, + { + "epoch": 1.08, + "grad_norm": 0.6118384887184034, + "learning_rate": 4.5873197073063385e-06, + "loss": 0.428, + "step": 8485 + }, + { + "epoch": 1.08, + "grad_norm": 0.6382769090767949, + "learning_rate": 4.586291649574451e-06, + "loss": 0.4515, + "step": 8486 + }, + { + "epoch": 1.08, + "grad_norm": 0.63640148937763, + "learning_rate": 4.585263609452807e-06, + "loss": 0.5084, + "step": 8487 + }, + { + "epoch": 1.08, + "grad_norm": 1.6005979336395098, + "learning_rate": 4.584235586985162e-06, + "loss": 0.5446, + "step": 8488 + }, + { + "epoch": 1.08, + "grad_norm": 0.7239166760827016, + "learning_rate": 4.583207582215283e-06, + "loss": 0.5194, + "step": 8489 + }, + { + "epoch": 1.08, + "grad_norm": 0.6976519505597664, + "learning_rate": 4.582179595186925e-06, + "loss": 0.5144, + "step": 8490 + }, + { + "epoch": 1.08, + "grad_norm": 0.8564321157553273, + "learning_rate": 4.581151625943846e-06, + "loss": 0.5568, + "step": 8491 + }, + { + "epoch": 1.08, + "grad_norm": 1.019726722347934, + "learning_rate": 4.5801236745298035e-06, + "loss": 0.5156, + "step": 8492 + }, + { + "epoch": 1.08, + "grad_norm": 0.7631858368641522, + "learning_rate": 4.579095740988555e-06, + "loss": 0.5364, + "step": 8493 + }, + { + "epoch": 1.08, + "grad_norm": 0.7451487220324169, + "learning_rate": 4.5780678253638565e-06, + "loss": 0.5113, + "step": 8494 + }, + { + "epoch": 1.08, + "grad_norm": 0.5332420172698064, + "learning_rate": 4.577039927699461e-06, + "loss": 0.3696, + "step": 8495 + }, + { + "epoch": 1.08, + "grad_norm": 0.7023011347967351, + "learning_rate": 4.576012048039126e-06, + "loss": 0.445, + "step": 8496 + }, + { + "epoch": 1.08, + "grad_norm": 0.5533543310108555, + "learning_rate": 4.574984186426602e-06, + "loss": 0.4346, + "step": 8497 + }, + { + "epoch": 1.08, + "grad_norm": 0.5938299622887316, + "learning_rate": 4.573956342905643e-06, + "loss": 0.4728, + "step": 8498 + }, + { + "epoch": 1.08, + "grad_norm": 0.7108504577666083, + "learning_rate": 4.572928517520001e-06, + "loss": 0.5086, + "step": 8499 + }, + { + "epoch": 1.08, + "grad_norm": 0.7453909158545868, + "learning_rate": 4.5719007103134255e-06, + "loss": 0.5116, + "step": 8500 + }, + { + "epoch": 1.08, + "grad_norm": 1.1696602653173944, + "learning_rate": 4.570872921329671e-06, + "loss": 0.5146, + "step": 8501 + }, + { + "epoch": 1.08, + "grad_norm": 2.651065938209936, + "learning_rate": 4.569845150612485e-06, + "loss": 0.5385, + "step": 8502 + }, + { + "epoch": 1.08, + "grad_norm": 0.783421636126263, + "learning_rate": 4.5688173982056155e-06, + "loss": 0.4935, + "step": 8503 + }, + { + "epoch": 1.08, + "grad_norm": 0.6527657778411697, + "learning_rate": 4.567789664152812e-06, + "loss": 0.5161, + "step": 8504 + }, + { + "epoch": 1.08, + "grad_norm": 0.7547590276723879, + "learning_rate": 4.566761948497821e-06, + "loss": 0.4957, + "step": 8505 + }, + { + "epoch": 1.08, + "grad_norm": 0.6484318284195962, + "learning_rate": 4.5657342512843905e-06, + "loss": 0.5079, + "step": 8506 + }, + { + "epoch": 1.08, + "grad_norm": 0.6499076906790505, + "learning_rate": 4.5647065725562646e-06, + "loss": 0.4986, + "step": 8507 + }, + { + "epoch": 1.08, + "grad_norm": 0.6062918986988345, + "learning_rate": 4.563678912357189e-06, + "loss": 0.4167, + "step": 8508 + }, + { + "epoch": 1.08, + "grad_norm": 0.6235554844187612, + "learning_rate": 4.562651270730907e-06, + "loss": 0.4319, + "step": 8509 + }, + { + "epoch": 1.08, + "grad_norm": 0.6540367638124763, + "learning_rate": 4.561623647721164e-06, + "loss": 0.4961, + "step": 8510 + }, + { + "epoch": 1.08, + "grad_norm": 12.354005505526635, + "learning_rate": 4.560596043371701e-06, + "loss": 0.57, + "step": 8511 + }, + { + "epoch": 1.08, + "grad_norm": 0.6988677283713708, + "learning_rate": 4.55956845772626e-06, + "loss": 0.504, + "step": 8512 + }, + { + "epoch": 1.08, + "grad_norm": 0.7861868834231205, + "learning_rate": 4.5585408908285816e-06, + "loss": 0.5371, + "step": 8513 + }, + { + "epoch": 1.08, + "grad_norm": 0.8069722489047302, + "learning_rate": 4.557513342722408e-06, + "loss": 0.5275, + "step": 8514 + }, + { + "epoch": 1.08, + "grad_norm": 0.6365879697457588, + "learning_rate": 4.5564858134514775e-06, + "loss": 0.4497, + "step": 8515 + }, + { + "epoch": 1.08, + "grad_norm": 0.6273633559367239, + "learning_rate": 4.555458303059528e-06, + "loss": 0.4146, + "step": 8516 + }, + { + "epoch": 1.09, + "grad_norm": 0.8136841886984113, + "learning_rate": 4.554430811590298e-06, + "loss": 0.5138, + "step": 8517 + }, + { + "epoch": 1.09, + "grad_norm": 0.8264935032692735, + "learning_rate": 4.553403339087525e-06, + "loss": 0.487, + "step": 8518 + }, + { + "epoch": 1.09, + "grad_norm": 1.121747488940747, + "learning_rate": 4.5523758855949436e-06, + "loss": 0.4778, + "step": 8519 + }, + { + "epoch": 1.09, + "grad_norm": 0.700967586858247, + "learning_rate": 4.551348451156291e-06, + "loss": 0.492, + "step": 8520 + }, + { + "epoch": 1.09, + "grad_norm": 0.63724114384012, + "learning_rate": 4.550321035815302e-06, + "loss": 0.4543, + "step": 8521 + }, + { + "epoch": 1.09, + "grad_norm": 0.6270102790803425, + "learning_rate": 4.549293639615709e-06, + "loss": 0.4213, + "step": 8522 + }, + { + "epoch": 1.09, + "grad_norm": 0.582054691314822, + "learning_rate": 4.548266262601245e-06, + "loss": 0.4826, + "step": 8523 + }, + { + "epoch": 1.09, + "grad_norm": 0.6218453651697858, + "learning_rate": 4.5472389048156435e-06, + "loss": 0.4312, + "step": 8524 + }, + { + "epoch": 1.09, + "grad_norm": 0.6673240556469603, + "learning_rate": 4.5462115663026334e-06, + "loss": 0.466, + "step": 8525 + }, + { + "epoch": 1.09, + "grad_norm": 0.7173811025798682, + "learning_rate": 4.545184247105948e-06, + "loss": 0.5349, + "step": 8526 + }, + { + "epoch": 1.09, + "grad_norm": 0.7503162537633045, + "learning_rate": 4.544156947269316e-06, + "loss": 0.5353, + "step": 8527 + }, + { + "epoch": 1.09, + "grad_norm": 0.6175185389318006, + "learning_rate": 4.543129666836467e-06, + "loss": 0.4338, + "step": 8528 + }, + { + "epoch": 1.09, + "grad_norm": 0.5703059141701459, + "learning_rate": 4.542102405851127e-06, + "loss": 0.4122, + "step": 8529 + }, + { + "epoch": 1.09, + "grad_norm": 0.6690488920865705, + "learning_rate": 4.541075164357024e-06, + "loss": 0.4688, + "step": 8530 + }, + { + "epoch": 1.09, + "grad_norm": 1.0316807498732905, + "learning_rate": 4.5400479423978855e-06, + "loss": 0.4864, + "step": 8531 + }, + { + "epoch": 1.09, + "grad_norm": 3.04091603041933, + "learning_rate": 4.539020740017436e-06, + "loss": 0.5029, + "step": 8532 + }, + { + "epoch": 1.09, + "grad_norm": 0.6447024650700228, + "learning_rate": 4.5379935572594e-06, + "loss": 0.4826, + "step": 8533 + }, + { + "epoch": 1.09, + "grad_norm": 0.7603019123434447, + "learning_rate": 4.536966394167503e-06, + "loss": 0.5101, + "step": 8534 + }, + { + "epoch": 1.09, + "grad_norm": 0.5531883537578846, + "learning_rate": 4.535939250785466e-06, + "loss": 0.3976, + "step": 8535 + }, + { + "epoch": 1.09, + "grad_norm": 0.5895226896128738, + "learning_rate": 4.534912127157013e-06, + "loss": 0.4469, + "step": 8536 + }, + { + "epoch": 1.09, + "grad_norm": 0.8337964471556024, + "learning_rate": 4.533885023325864e-06, + "loss": 0.5188, + "step": 8537 + }, + { + "epoch": 1.09, + "grad_norm": 0.796924810287293, + "learning_rate": 4.532857939335738e-06, + "loss": 0.5579, + "step": 8538 + }, + { + "epoch": 1.09, + "grad_norm": 0.7125114184666226, + "learning_rate": 4.531830875230359e-06, + "loss": 0.5255, + "step": 8539 + }, + { + "epoch": 1.09, + "grad_norm": 0.7751007418744865, + "learning_rate": 4.5308038310534435e-06, + "loss": 0.512, + "step": 8540 + }, + { + "epoch": 1.09, + "grad_norm": 0.5971475669635828, + "learning_rate": 4.529776806848709e-06, + "loss": 0.4673, + "step": 8541 + }, + { + "epoch": 1.09, + "grad_norm": 0.7450481095365744, + "learning_rate": 4.528749802659875e-06, + "loss": 0.5264, + "step": 8542 + }, + { + "epoch": 1.09, + "grad_norm": 0.6659255352488508, + "learning_rate": 4.5277228185306545e-06, + "loss": 0.4933, + "step": 8543 + }, + { + "epoch": 1.09, + "grad_norm": 0.629523510727235, + "learning_rate": 4.5266958545047645e-06, + "loss": 0.4729, + "step": 8544 + }, + { + "epoch": 1.09, + "grad_norm": 0.8119753801888152, + "learning_rate": 4.52566891062592e-06, + "loss": 0.4691, + "step": 8545 + }, + { + "epoch": 1.09, + "grad_norm": 0.6606991694064694, + "learning_rate": 4.5246419869378335e-06, + "loss": 0.4831, + "step": 8546 + }, + { + "epoch": 1.09, + "grad_norm": 0.7158881713966089, + "learning_rate": 4.52361508348422e-06, + "loss": 0.5312, + "step": 8547 + }, + { + "epoch": 1.09, + "grad_norm": 1.2598959055127832, + "learning_rate": 4.522588200308789e-06, + "loss": 0.5205, + "step": 8548 + }, + { + "epoch": 1.09, + "grad_norm": 1.4816377323870071, + "learning_rate": 4.521561337455252e-06, + "loss": 0.5223, + "step": 8549 + }, + { + "epoch": 1.09, + "grad_norm": 0.6636007298329161, + "learning_rate": 4.520534494967319e-06, + "loss": 0.4837, + "step": 8550 + }, + { + "epoch": 1.09, + "grad_norm": 0.974156257685153, + "learning_rate": 4.519507672888703e-06, + "loss": 0.5202, + "step": 8551 + }, + { + "epoch": 1.09, + "grad_norm": 0.6543097079647269, + "learning_rate": 4.518480871263109e-06, + "loss": 0.4973, + "step": 8552 + }, + { + "epoch": 1.09, + "grad_norm": 0.7239561621348768, + "learning_rate": 4.517454090134246e-06, + "loss": 0.4602, + "step": 8553 + }, + { + "epoch": 1.09, + "grad_norm": 0.6288596350119882, + "learning_rate": 4.5164273295458196e-06, + "loss": 0.4684, + "step": 8554 + }, + { + "epoch": 1.09, + "grad_norm": 0.596747871792202, + "learning_rate": 4.515400589541537e-06, + "loss": 0.4458, + "step": 8555 + }, + { + "epoch": 1.09, + "grad_norm": 0.6388115211221093, + "learning_rate": 4.5143738701651035e-06, + "loss": 0.453, + "step": 8556 + }, + { + "epoch": 1.09, + "grad_norm": 0.8543492795767097, + "learning_rate": 4.5133471714602215e-06, + "loss": 0.5549, + "step": 8557 + }, + { + "epoch": 1.09, + "grad_norm": 0.910159438413612, + "learning_rate": 4.512320493470596e-06, + "loss": 0.4789, + "step": 8558 + }, + { + "epoch": 1.09, + "grad_norm": 0.5737940554333593, + "learning_rate": 4.5112938362399284e-06, + "loss": 0.4676, + "step": 8559 + }, + { + "epoch": 1.09, + "grad_norm": 0.7892243399036161, + "learning_rate": 4.51026719981192e-06, + "loss": 0.5219, + "step": 8560 + }, + { + "epoch": 1.09, + "grad_norm": 0.6176669981869405, + "learning_rate": 4.5092405842302725e-06, + "loss": 0.4941, + "step": 8561 + }, + { + "epoch": 1.09, + "grad_norm": 0.6739875231464935, + "learning_rate": 4.508213989538683e-06, + "loss": 0.4539, + "step": 8562 + }, + { + "epoch": 1.09, + "grad_norm": 0.7454513425658681, + "learning_rate": 4.507187415780855e-06, + "loss": 0.5714, + "step": 8563 + }, + { + "epoch": 1.09, + "grad_norm": 1.5481377961594522, + "learning_rate": 4.506160863000483e-06, + "loss": 0.5671, + "step": 8564 + }, + { + "epoch": 1.09, + "grad_norm": 0.687251456902005, + "learning_rate": 4.505134331241265e-06, + "loss": 0.4877, + "step": 8565 + }, + { + "epoch": 1.09, + "grad_norm": 0.7327951515727712, + "learning_rate": 4.504107820546898e-06, + "loss": 0.4892, + "step": 8566 + }, + { + "epoch": 1.09, + "grad_norm": 0.7368247642955208, + "learning_rate": 4.503081330961076e-06, + "loss": 0.5365, + "step": 8567 + }, + { + "epoch": 1.09, + "grad_norm": 0.8655420527301664, + "learning_rate": 4.502054862527494e-06, + "loss": 0.5401, + "step": 8568 + }, + { + "epoch": 1.09, + "grad_norm": 0.668336103426427, + "learning_rate": 4.5010284152898445e-06, + "loss": 0.487, + "step": 8569 + }, + { + "epoch": 1.09, + "grad_norm": 0.6543585134052369, + "learning_rate": 4.500001989291821e-06, + "loss": 0.4561, + "step": 8570 + }, + { + "epoch": 1.09, + "grad_norm": 0.616512254387168, + "learning_rate": 4.498975584577116e-06, + "loss": 0.4662, + "step": 8571 + }, + { + "epoch": 1.09, + "grad_norm": 0.5417851188518977, + "learning_rate": 4.497949201189418e-06, + "loss": 0.4544, + "step": 8572 + }, + { + "epoch": 1.09, + "grad_norm": 0.6382641415253936, + "learning_rate": 4.49692283917242e-06, + "loss": 0.4463, + "step": 8573 + }, + { + "epoch": 1.09, + "grad_norm": 0.6068944434643876, + "learning_rate": 4.495896498569807e-06, + "loss": 0.4646, + "step": 8574 + }, + { + "epoch": 1.09, + "grad_norm": 0.7079080744809629, + "learning_rate": 4.49487017942527e-06, + "loss": 0.4764, + "step": 8575 + }, + { + "epoch": 1.09, + "grad_norm": 0.5734771413038712, + "learning_rate": 4.493843881782495e-06, + "loss": 0.4756, + "step": 8576 + }, + { + "epoch": 1.09, + "grad_norm": 0.6236499671704739, + "learning_rate": 4.492817605685169e-06, + "loss": 0.4982, + "step": 8577 + }, + { + "epoch": 1.09, + "grad_norm": 0.9465062605165874, + "learning_rate": 4.491791351176977e-06, + "loss": 0.5133, + "step": 8578 + }, + { + "epoch": 1.09, + "grad_norm": 0.6049141820933159, + "learning_rate": 4.490765118301603e-06, + "loss": 0.4948, + "step": 8579 + }, + { + "epoch": 1.09, + "grad_norm": 0.7607790457656651, + "learning_rate": 4.48973890710273e-06, + "loss": 0.4705, + "step": 8580 + }, + { + "epoch": 1.09, + "grad_norm": 0.629780419140997, + "learning_rate": 4.4887127176240415e-06, + "loss": 0.493, + "step": 8581 + }, + { + "epoch": 1.09, + "grad_norm": 0.6243714315478958, + "learning_rate": 4.487686549909219e-06, + "loss": 0.4483, + "step": 8582 + }, + { + "epoch": 1.09, + "grad_norm": 0.6057055072199797, + "learning_rate": 4.486660404001944e-06, + "loss": 0.4305, + "step": 8583 + }, + { + "epoch": 1.09, + "grad_norm": 0.6615294020976404, + "learning_rate": 4.485634279945894e-06, + "loss": 0.4765, + "step": 8584 + }, + { + "epoch": 1.09, + "grad_norm": 0.6795424287282442, + "learning_rate": 4.484608177784749e-06, + "loss": 0.4713, + "step": 8585 + }, + { + "epoch": 1.09, + "grad_norm": 0.5962094773294149, + "learning_rate": 4.483582097562187e-06, + "loss": 0.4787, + "step": 8586 + }, + { + "epoch": 1.09, + "grad_norm": 0.7227475521486151, + "learning_rate": 4.482556039321884e-06, + "loss": 0.4962, + "step": 8587 + }, + { + "epoch": 1.09, + "grad_norm": 0.6434688661805869, + "learning_rate": 4.481530003107518e-06, + "loss": 0.4598, + "step": 8588 + }, + { + "epoch": 1.09, + "grad_norm": 0.5934801999275032, + "learning_rate": 4.480503988962762e-06, + "loss": 0.4845, + "step": 8589 + }, + { + "epoch": 1.09, + "grad_norm": 0.753699827132346, + "learning_rate": 4.479477996931293e-06, + "loss": 0.57, + "step": 8590 + }, + { + "epoch": 1.09, + "grad_norm": 0.7304870421957352, + "learning_rate": 4.478452027056781e-06, + "loss": 0.5929, + "step": 8591 + }, + { + "epoch": 1.09, + "grad_norm": 0.8228141212036288, + "learning_rate": 4.4774260793829e-06, + "loss": 0.5648, + "step": 8592 + }, + { + "epoch": 1.09, + "grad_norm": 0.5714437996989518, + "learning_rate": 4.476400153953321e-06, + "loss": 0.4417, + "step": 8593 + }, + { + "epoch": 1.09, + "grad_norm": 0.6184048249245075, + "learning_rate": 4.475374250811714e-06, + "loss": 0.4568, + "step": 8594 + }, + { + "epoch": 1.09, + "grad_norm": 0.772098798074029, + "learning_rate": 4.474348370001749e-06, + "loss": 0.5371, + "step": 8595 + }, + { + "epoch": 1.1, + "grad_norm": 0.9233128394480612, + "learning_rate": 4.473322511567094e-06, + "loss": 0.518, + "step": 8596 + }, + { + "epoch": 1.1, + "grad_norm": 0.7172785458518779, + "learning_rate": 4.472296675551416e-06, + "loss": 0.4532, + "step": 8597 + }, + { + "epoch": 1.1, + "grad_norm": 0.7916605861801453, + "learning_rate": 4.471270861998383e-06, + "loss": 0.542, + "step": 8598 + }, + { + "epoch": 1.1, + "grad_norm": 0.5829875274529767, + "learning_rate": 4.4702450709516595e-06, + "loss": 0.4628, + "step": 8599 + }, + { + "epoch": 1.1, + "grad_norm": 0.7499182173210165, + "learning_rate": 4.4692193024549095e-06, + "loss": 0.4486, + "step": 8600 + }, + { + "epoch": 1.1, + "grad_norm": 0.7169296975442507, + "learning_rate": 4.468193556551798e-06, + "loss": 0.4398, + "step": 8601 + }, + { + "epoch": 1.1, + "grad_norm": 0.6425119022213079, + "learning_rate": 4.467167833285988e-06, + "loss": 0.4515, + "step": 8602 + }, + { + "epoch": 1.1, + "grad_norm": 0.5763835112707627, + "learning_rate": 4.466142132701141e-06, + "loss": 0.4667, + "step": 8603 + }, + { + "epoch": 1.1, + "grad_norm": 0.6296111599683203, + "learning_rate": 4.465116454840918e-06, + "loss": 0.5095, + "step": 8604 + }, + { + "epoch": 1.1, + "grad_norm": 0.747651229603383, + "learning_rate": 4.4640907997489765e-06, + "loss": 0.5434, + "step": 8605 + }, + { + "epoch": 1.1, + "grad_norm": 0.7213999981081614, + "learning_rate": 4.4630651674689785e-06, + "loss": 0.4558, + "step": 8606 + }, + { + "epoch": 1.1, + "grad_norm": 0.7039531633040491, + "learning_rate": 4.4620395580445806e-06, + "loss": 0.48, + "step": 8607 + }, + { + "epoch": 1.1, + "grad_norm": 0.5914858093262314, + "learning_rate": 4.461013971519439e-06, + "loss": 0.4814, + "step": 8608 + }, + { + "epoch": 1.1, + "grad_norm": 0.7893659125193687, + "learning_rate": 4.45998840793721e-06, + "loss": 0.584, + "step": 8609 + }, + { + "epoch": 1.1, + "grad_norm": 0.8114567157705281, + "learning_rate": 4.45896286734155e-06, + "loss": 0.4689, + "step": 8610 + }, + { + "epoch": 1.1, + "grad_norm": 0.6929330862075997, + "learning_rate": 4.457937349776111e-06, + "loss": 0.4272, + "step": 8611 + }, + { + "epoch": 1.1, + "grad_norm": 0.7801562241755182, + "learning_rate": 4.456911855284545e-06, + "loss": 0.4953, + "step": 8612 + }, + { + "epoch": 1.1, + "grad_norm": 0.6751757870398382, + "learning_rate": 4.455886383910507e-06, + "loss": 0.5268, + "step": 8613 + }, + { + "epoch": 1.1, + "grad_norm": 0.7973034526726619, + "learning_rate": 4.454860935697648e-06, + "loss": 0.5348, + "step": 8614 + }, + { + "epoch": 1.1, + "grad_norm": 0.6372102847295429, + "learning_rate": 4.453835510689617e-06, + "loss": 0.4681, + "step": 8615 + }, + { + "epoch": 1.1, + "grad_norm": 0.6916366072561504, + "learning_rate": 4.452810108930063e-06, + "loss": 0.5278, + "step": 8616 + }, + { + "epoch": 1.1, + "grad_norm": 0.8292440576209625, + "learning_rate": 4.451784730462634e-06, + "loss": 0.5617, + "step": 8617 + }, + { + "epoch": 1.1, + "grad_norm": 0.6891838474682641, + "learning_rate": 4.450759375330977e-06, + "loss": 0.5236, + "step": 8618 + }, + { + "epoch": 1.1, + "grad_norm": 0.6294819532266165, + "learning_rate": 4.4497340435787385e-06, + "loss": 0.4739, + "step": 8619 + }, + { + "epoch": 1.1, + "grad_norm": 0.6667224319215789, + "learning_rate": 4.4487087352495625e-06, + "loss": 0.475, + "step": 8620 + }, + { + "epoch": 1.1, + "grad_norm": 0.6618942768878272, + "learning_rate": 4.447683450387095e-06, + "loss": 0.4744, + "step": 8621 + }, + { + "epoch": 1.1, + "grad_norm": 0.7378393210761782, + "learning_rate": 4.446658189034977e-06, + "loss": 0.5222, + "step": 8622 + }, + { + "epoch": 1.1, + "grad_norm": 0.6726329303074522, + "learning_rate": 4.4456329512368525e-06, + "loss": 0.4672, + "step": 8623 + }, + { + "epoch": 1.1, + "grad_norm": 0.7723465303164209, + "learning_rate": 4.44460773703636e-06, + "loss": 0.5219, + "step": 8624 + }, + { + "epoch": 1.1, + "grad_norm": 0.6935111887510977, + "learning_rate": 4.443582546477143e-06, + "loss": 0.4762, + "step": 8625 + }, + { + "epoch": 1.1, + "grad_norm": 0.565089202783496, + "learning_rate": 4.4425573796028385e-06, + "loss": 0.4188, + "step": 8626 + }, + { + "epoch": 1.1, + "grad_norm": 0.5791298537921312, + "learning_rate": 4.4415322364570854e-06, + "loss": 0.4689, + "step": 8627 + }, + { + "epoch": 1.1, + "grad_norm": 0.6218219346859978, + "learning_rate": 4.44050711708352e-06, + "loss": 0.4506, + "step": 8628 + }, + { + "epoch": 1.1, + "grad_norm": 0.6059194126133071, + "learning_rate": 4.43948202152578e-06, + "loss": 0.4636, + "step": 8629 + }, + { + "epoch": 1.1, + "grad_norm": 0.5934545801783437, + "learning_rate": 4.438456949827498e-06, + "loss": 0.4531, + "step": 8630 + }, + { + "epoch": 1.1, + "grad_norm": 0.6532553877571453, + "learning_rate": 4.43743190203231e-06, + "loss": 0.4845, + "step": 8631 + }, + { + "epoch": 1.1, + "grad_norm": 0.7466083408297207, + "learning_rate": 4.436406878183848e-06, + "loss": 0.5414, + "step": 8632 + }, + { + "epoch": 1.1, + "grad_norm": 0.7449060202951123, + "learning_rate": 4.435381878325744e-06, + "loss": 0.526, + "step": 8633 + }, + { + "epoch": 1.1, + "grad_norm": 0.6527145089285332, + "learning_rate": 4.434356902501629e-06, + "loss": 0.5192, + "step": 8634 + }, + { + "epoch": 1.1, + "grad_norm": 0.5614537143279134, + "learning_rate": 4.433331950755133e-06, + "loss": 0.4947, + "step": 8635 + }, + { + "epoch": 1.1, + "grad_norm": 0.6354571424910672, + "learning_rate": 4.432307023129885e-06, + "loss": 0.4495, + "step": 8636 + }, + { + "epoch": 1.1, + "grad_norm": 0.6750311823057951, + "learning_rate": 4.431282119669513e-06, + "loss": 0.4375, + "step": 8637 + }, + { + "epoch": 1.1, + "grad_norm": 0.6404922351551362, + "learning_rate": 4.430257240417644e-06, + "loss": 0.4317, + "step": 8638 + }, + { + "epoch": 1.1, + "grad_norm": 0.5444893108505425, + "learning_rate": 4.429232385417903e-06, + "loss": 0.3925, + "step": 8639 + }, + { + "epoch": 1.1, + "grad_norm": 0.61761876384027, + "learning_rate": 4.428207554713916e-06, + "loss": 0.4815, + "step": 8640 + }, + { + "epoch": 1.1, + "grad_norm": 0.7125928483156411, + "learning_rate": 4.4271827483493065e-06, + "loss": 0.5078, + "step": 8641 + }, + { + "epoch": 1.1, + "grad_norm": 0.7224533202582047, + "learning_rate": 4.4261579663676964e-06, + "loss": 0.5342, + "step": 8642 + }, + { + "epoch": 1.1, + "grad_norm": 0.8157519984574083, + "learning_rate": 4.425133208812708e-06, + "loss": 0.5587, + "step": 8643 + }, + { + "epoch": 1.1, + "grad_norm": 0.9054439381780619, + "learning_rate": 4.424108475727962e-06, + "loss": 0.5551, + "step": 8644 + }, + { + "epoch": 1.1, + "grad_norm": 0.6051133046357489, + "learning_rate": 4.423083767157078e-06, + "loss": 0.4632, + "step": 8645 + }, + { + "epoch": 1.1, + "grad_norm": 0.6810378920543833, + "learning_rate": 4.422059083143675e-06, + "loss": 0.5001, + "step": 8646 + }, + { + "epoch": 1.1, + "grad_norm": 0.8214147314192999, + "learning_rate": 4.4210344237313695e-06, + "loss": 0.5469, + "step": 8647 + }, + { + "epoch": 1.1, + "grad_norm": 0.7436719018597728, + "learning_rate": 4.420009788963779e-06, + "loss": 0.5448, + "step": 8648 + }, + { + "epoch": 1.1, + "grad_norm": 0.6598472872452492, + "learning_rate": 4.4189851788845165e-06, + "loss": 0.5042, + "step": 8649 + }, + { + "epoch": 1.1, + "grad_norm": 0.9718468638325388, + "learning_rate": 4.4179605935372e-06, + "loss": 0.47, + "step": 8650 + }, + { + "epoch": 1.1, + "grad_norm": 0.5969621057466895, + "learning_rate": 4.41693603296544e-06, + "loss": 0.453, + "step": 8651 + }, + { + "epoch": 1.1, + "grad_norm": 0.5265999909546717, + "learning_rate": 4.415911497212852e-06, + "loss": 0.394, + "step": 8652 + }, + { + "epoch": 1.1, + "grad_norm": 0.6303210903993633, + "learning_rate": 4.414886986323043e-06, + "loss": 0.4368, + "step": 8653 + }, + { + "epoch": 1.1, + "grad_norm": 0.6939423926267622, + "learning_rate": 4.4138625003396265e-06, + "loss": 0.4152, + "step": 8654 + }, + { + "epoch": 1.1, + "grad_norm": 0.626467544574878, + "learning_rate": 4.41283803930621e-06, + "loss": 0.5155, + "step": 8655 + }, + { + "epoch": 1.1, + "grad_norm": 0.7689550372434832, + "learning_rate": 4.411813603266401e-06, + "loss": 0.4649, + "step": 8656 + }, + { + "epoch": 1.1, + "grad_norm": 0.6866838720873619, + "learning_rate": 4.410789192263808e-06, + "loss": 0.526, + "step": 8657 + }, + { + "epoch": 1.1, + "grad_norm": 0.8654830526462901, + "learning_rate": 4.409764806342037e-06, + "loss": 0.5443, + "step": 8658 + }, + { + "epoch": 1.1, + "grad_norm": 0.5976793751090987, + "learning_rate": 4.408740445544691e-06, + "loss": 0.4873, + "step": 8659 + }, + { + "epoch": 1.1, + "grad_norm": 0.5784852569203252, + "learning_rate": 4.407716109915375e-06, + "loss": 0.4481, + "step": 8660 + }, + { + "epoch": 1.1, + "grad_norm": 0.6564506025264064, + "learning_rate": 4.406691799497692e-06, + "loss": 0.4941, + "step": 8661 + }, + { + "epoch": 1.1, + "grad_norm": 0.7154700971699651, + "learning_rate": 4.405667514335241e-06, + "loss": 0.5411, + "step": 8662 + }, + { + "epoch": 1.1, + "grad_norm": 0.6501901165115125, + "learning_rate": 4.404643254471626e-06, + "loss": 0.4738, + "step": 8663 + }, + { + "epoch": 1.1, + "grad_norm": 0.6076375323665985, + "learning_rate": 4.403619019950446e-06, + "loss": 0.4509, + "step": 8664 + }, + { + "epoch": 1.1, + "grad_norm": 0.6591207461291212, + "learning_rate": 4.4025948108152985e-06, + "loss": 0.4859, + "step": 8665 + }, + { + "epoch": 1.1, + "grad_norm": 0.6272720824264322, + "learning_rate": 4.401570627109781e-06, + "loss": 0.5301, + "step": 8666 + }, + { + "epoch": 1.1, + "grad_norm": 0.6913948497572918, + "learning_rate": 4.400546468877489e-06, + "loss": 0.4798, + "step": 8667 + }, + { + "epoch": 1.1, + "grad_norm": 0.6434516985941561, + "learning_rate": 4.3995223361620185e-06, + "loss": 0.4689, + "step": 8668 + }, + { + "epoch": 1.1, + "grad_norm": 0.5876805938368626, + "learning_rate": 4.398498229006963e-06, + "loss": 0.4696, + "step": 8669 + }, + { + "epoch": 1.1, + "grad_norm": 0.6057948307165987, + "learning_rate": 4.3974741474559165e-06, + "loss": 0.4372, + "step": 8670 + }, + { + "epoch": 1.1, + "grad_norm": 0.6715480883261464, + "learning_rate": 4.39645009155247e-06, + "loss": 0.456, + "step": 8671 + }, + { + "epoch": 1.1, + "grad_norm": 0.7285974130030384, + "learning_rate": 4.395426061340214e-06, + "loss": 0.5578, + "step": 8672 + }, + { + "epoch": 1.1, + "grad_norm": 0.6896630406295814, + "learning_rate": 4.394402056862739e-06, + "loss": 0.4578, + "step": 8673 + }, + { + "epoch": 1.11, + "grad_norm": 0.6618818964229133, + "learning_rate": 4.39337807816363e-06, + "loss": 0.4832, + "step": 8674 + }, + { + "epoch": 1.11, + "grad_norm": 0.7675535119380845, + "learning_rate": 4.392354125286482e-06, + "loss": 0.5144, + "step": 8675 + }, + { + "epoch": 1.11, + "grad_norm": 0.8639253036233522, + "learning_rate": 4.391330198274875e-06, + "loss": 0.4811, + "step": 8676 + }, + { + "epoch": 1.11, + "grad_norm": 0.6290323980018491, + "learning_rate": 4.390306297172398e-06, + "loss": 0.4495, + "step": 8677 + }, + { + "epoch": 1.11, + "grad_norm": 0.5589253284305435, + "learning_rate": 4.389282422022632e-06, + "loss": 0.4362, + "step": 8678 + }, + { + "epoch": 1.11, + "grad_norm": 0.6243316474311928, + "learning_rate": 4.388258572869163e-06, + "loss": 0.4681, + "step": 8679 + }, + { + "epoch": 1.11, + "grad_norm": 0.7325730737097648, + "learning_rate": 4.387234749755572e-06, + "loss": 0.486, + "step": 8680 + }, + { + "epoch": 1.11, + "grad_norm": 0.6590763989928909, + "learning_rate": 4.386210952725438e-06, + "loss": 0.4609, + "step": 8681 + }, + { + "epoch": 1.11, + "grad_norm": 0.6050061783622741, + "learning_rate": 4.3851871818223436e-06, + "loss": 0.4959, + "step": 8682 + }, + { + "epoch": 1.11, + "grad_norm": 0.8861113074397442, + "learning_rate": 4.3841634370898665e-06, + "loss": 0.5099, + "step": 8683 + }, + { + "epoch": 1.11, + "grad_norm": 0.6315380546199132, + "learning_rate": 4.383139718571583e-06, + "loss": 0.4538, + "step": 8684 + }, + { + "epoch": 1.11, + "grad_norm": 0.7651877298522781, + "learning_rate": 4.3821160263110706e-06, + "loss": 0.5382, + "step": 8685 + }, + { + "epoch": 1.11, + "grad_norm": 0.8077359680707338, + "learning_rate": 4.381092360351903e-06, + "loss": 0.5179, + "step": 8686 + }, + { + "epoch": 1.11, + "grad_norm": 0.6692613847095965, + "learning_rate": 4.380068720737657e-06, + "loss": 0.4142, + "step": 8687 + }, + { + "epoch": 1.11, + "grad_norm": 0.6390244607802591, + "learning_rate": 4.379045107511905e-06, + "loss": 0.4423, + "step": 8688 + }, + { + "epoch": 1.11, + "grad_norm": 0.5724714658015041, + "learning_rate": 4.378021520718219e-06, + "loss": 0.4826, + "step": 8689 + }, + { + "epoch": 1.11, + "grad_norm": 0.6262749841077924, + "learning_rate": 4.376997960400169e-06, + "loss": 0.472, + "step": 8690 + }, + { + "epoch": 1.11, + "grad_norm": 0.6323066172468694, + "learning_rate": 4.375974426601325e-06, + "loss": 0.4979, + "step": 8691 + }, + { + "epoch": 1.11, + "grad_norm": 0.7704174814894261, + "learning_rate": 4.374950919365256e-06, + "loss": 0.5666, + "step": 8692 + }, + { + "epoch": 1.11, + "grad_norm": 0.6165845872604485, + "learning_rate": 4.373927438735528e-06, + "loss": 0.4665, + "step": 8693 + }, + { + "epoch": 1.11, + "grad_norm": 0.7284894634413241, + "learning_rate": 4.3729039847557085e-06, + "loss": 0.4949, + "step": 8694 + }, + { + "epoch": 1.11, + "grad_norm": 0.7551244400144178, + "learning_rate": 4.371880557469363e-06, + "loss": 0.5939, + "step": 8695 + }, + { + "epoch": 1.11, + "grad_norm": 0.7800292378269323, + "learning_rate": 4.370857156920055e-06, + "loss": 0.5347, + "step": 8696 + }, + { + "epoch": 1.11, + "grad_norm": 0.7780927129731123, + "learning_rate": 4.369833783151347e-06, + "loss": 0.5968, + "step": 8697 + }, + { + "epoch": 1.11, + "grad_norm": 0.766192995022262, + "learning_rate": 4.368810436206801e-06, + "loss": 0.5426, + "step": 8698 + }, + { + "epoch": 1.11, + "grad_norm": 0.818234690258935, + "learning_rate": 4.3677871161299766e-06, + "loss": 0.5855, + "step": 8699 + }, + { + "epoch": 1.11, + "grad_norm": 0.7936641865945461, + "learning_rate": 4.366763822964436e-06, + "loss": 0.5746, + "step": 8700 + }, + { + "epoch": 1.11, + "grad_norm": 0.7858493847198313, + "learning_rate": 4.365740556753736e-06, + "loss": 0.531, + "step": 8701 + }, + { + "epoch": 1.11, + "grad_norm": 0.6234169750185313, + "learning_rate": 4.364717317541433e-06, + "loss": 0.4522, + "step": 8702 + }, + { + "epoch": 1.11, + "grad_norm": 0.5651259641054174, + "learning_rate": 4.363694105371085e-06, + "loss": 0.4544, + "step": 8703 + }, + { + "epoch": 1.11, + "grad_norm": 0.714957264872062, + "learning_rate": 4.362670920286243e-06, + "loss": 0.4892, + "step": 8704 + }, + { + "epoch": 1.11, + "grad_norm": 0.6221203498088445, + "learning_rate": 4.361647762330465e-06, + "loss": 0.4668, + "step": 8705 + }, + { + "epoch": 1.11, + "grad_norm": 0.8096269206852756, + "learning_rate": 4.360624631547301e-06, + "loss": 0.4826, + "step": 8706 + }, + { + "epoch": 1.11, + "grad_norm": 0.7454281668502041, + "learning_rate": 4.359601527980305e-06, + "loss": 0.4689, + "step": 8707 + }, + { + "epoch": 1.11, + "grad_norm": 0.623672530163333, + "learning_rate": 4.358578451673024e-06, + "loss": 0.4782, + "step": 8708 + }, + { + "epoch": 1.11, + "grad_norm": 0.6512129741626779, + "learning_rate": 4.35755540266901e-06, + "loss": 0.4442, + "step": 8709 + }, + { + "epoch": 1.11, + "grad_norm": 0.6338054233723615, + "learning_rate": 4.356532381011809e-06, + "loss": 0.444, + "step": 8710 + }, + { + "epoch": 1.11, + "grad_norm": 0.6410156350100498, + "learning_rate": 4.355509386744967e-06, + "loss": 0.4419, + "step": 8711 + }, + { + "epoch": 1.11, + "grad_norm": 0.8272582864463619, + "learning_rate": 4.354486419912032e-06, + "loss": 0.555, + "step": 8712 + }, + { + "epoch": 1.11, + "grad_norm": 1.0136044771380013, + "learning_rate": 4.353463480556548e-06, + "loss": 0.5477, + "step": 8713 + }, + { + "epoch": 1.11, + "grad_norm": 0.6316251371016259, + "learning_rate": 4.3524405687220564e-06, + "loss": 0.457, + "step": 8714 + }, + { + "epoch": 1.11, + "grad_norm": 0.5959454485535449, + "learning_rate": 4.351417684452101e-06, + "loss": 0.4442, + "step": 8715 + }, + { + "epoch": 1.11, + "grad_norm": 0.7357473253675132, + "learning_rate": 4.350394827790222e-06, + "loss": 0.4624, + "step": 8716 + }, + { + "epoch": 1.11, + "grad_norm": 0.708131638468915, + "learning_rate": 4.3493719987799595e-06, + "loss": 0.494, + "step": 8717 + }, + { + "epoch": 1.11, + "grad_norm": 0.7470162034134946, + "learning_rate": 4.3483491974648525e-06, + "loss": 0.5011, + "step": 8718 + }, + { + "epoch": 1.11, + "grad_norm": 0.7863145203388021, + "learning_rate": 4.347326423888438e-06, + "loss": 0.5582, + "step": 8719 + }, + { + "epoch": 1.11, + "grad_norm": 0.7020088784737551, + "learning_rate": 4.34630367809425e-06, + "loss": 0.4847, + "step": 8720 + }, + { + "epoch": 1.11, + "grad_norm": 0.6526452735990081, + "learning_rate": 4.345280960125828e-06, + "loss": 0.5078, + "step": 8721 + }, + { + "epoch": 1.11, + "grad_norm": 0.601958441186249, + "learning_rate": 4.344258270026702e-06, + "loss": 0.5041, + "step": 8722 + }, + { + "epoch": 1.11, + "grad_norm": 0.6923037317230399, + "learning_rate": 4.343235607840406e-06, + "loss": 0.5121, + "step": 8723 + }, + { + "epoch": 1.11, + "grad_norm": 0.5976230137798745, + "learning_rate": 4.342212973610469e-06, + "loss": 0.4259, + "step": 8724 + }, + { + "epoch": 1.11, + "grad_norm": 0.7412289909548726, + "learning_rate": 4.3411903673804266e-06, + "loss": 0.4473, + "step": 8725 + }, + { + "epoch": 1.11, + "grad_norm": 0.684473919796735, + "learning_rate": 4.340167789193805e-06, + "loss": 0.5068, + "step": 8726 + }, + { + "epoch": 1.11, + "grad_norm": 0.6539044304974383, + "learning_rate": 4.339145239094132e-06, + "loss": 0.4859, + "step": 8727 + }, + { + "epoch": 1.11, + "grad_norm": 0.8439199576465645, + "learning_rate": 4.338122717124934e-06, + "loss": 0.5716, + "step": 8728 + }, + { + "epoch": 1.11, + "grad_norm": 0.6855460364648731, + "learning_rate": 4.337100223329737e-06, + "loss": 0.5111, + "step": 8729 + }, + { + "epoch": 1.11, + "grad_norm": 0.6677128874231376, + "learning_rate": 4.336077757752066e-06, + "loss": 0.5386, + "step": 8730 + }, + { + "epoch": 1.11, + "grad_norm": 0.8486124846884892, + "learning_rate": 4.335055320435442e-06, + "loss": 0.5168, + "step": 8731 + }, + { + "epoch": 1.11, + "grad_norm": 0.8376220912232718, + "learning_rate": 4.3340329114233886e-06, + "loss": 0.4783, + "step": 8732 + }, + { + "epoch": 1.11, + "grad_norm": 0.6469251931162432, + "learning_rate": 4.333010530759426e-06, + "loss": 0.4524, + "step": 8733 + }, + { + "epoch": 1.11, + "grad_norm": 0.5702683842430247, + "learning_rate": 4.331988178487074e-06, + "loss": 0.4402, + "step": 8734 + }, + { + "epoch": 1.11, + "grad_norm": 0.6489907106342068, + "learning_rate": 4.330965854649851e-06, + "loss": 0.4388, + "step": 8735 + }, + { + "epoch": 1.11, + "grad_norm": 0.6926037390772614, + "learning_rate": 4.32994355929127e-06, + "loss": 0.4754, + "step": 8736 + }, + { + "epoch": 1.11, + "grad_norm": 0.8381220742630461, + "learning_rate": 4.328921292454854e-06, + "loss": 0.5717, + "step": 8737 + }, + { + "epoch": 1.11, + "grad_norm": 0.6811558963561367, + "learning_rate": 4.3278990541841135e-06, + "loss": 0.5004, + "step": 8738 + }, + { + "epoch": 1.11, + "grad_norm": 0.8772869906540514, + "learning_rate": 4.326876844522563e-06, + "loss": 0.5307, + "step": 8739 + }, + { + "epoch": 1.11, + "grad_norm": 0.6755541977106135, + "learning_rate": 4.3258546635137135e-06, + "loss": 0.5776, + "step": 8740 + }, + { + "epoch": 1.11, + "grad_norm": 0.7146363999171752, + "learning_rate": 4.3248325112010775e-06, + "loss": 0.5548, + "step": 8741 + }, + { + "epoch": 1.11, + "grad_norm": 0.7573293702064998, + "learning_rate": 4.3238103876281635e-06, + "loss": 0.5409, + "step": 8742 + }, + { + "epoch": 1.11, + "grad_norm": 0.7290973537859432, + "learning_rate": 4.32278829283848e-06, + "loss": 0.5383, + "step": 8743 + }, + { + "epoch": 1.11, + "grad_norm": 1.4555408990939345, + "learning_rate": 4.321766226875536e-06, + "loss": 0.4785, + "step": 8744 + }, + { + "epoch": 1.11, + "grad_norm": 0.788367839865495, + "learning_rate": 4.320744189782836e-06, + "loss": 0.5144, + "step": 8745 + }, + { + "epoch": 1.11, + "grad_norm": 0.58756658494428, + "learning_rate": 4.319722181603885e-06, + "loss": 0.4537, + "step": 8746 + }, + { + "epoch": 1.11, + "grad_norm": 0.7559035060987543, + "learning_rate": 4.3187002023821875e-06, + "loss": 0.4853, + "step": 8747 + }, + { + "epoch": 1.11, + "grad_norm": 0.8002218078278686, + "learning_rate": 4.317678252161242e-06, + "loss": 0.5377, + "step": 8748 + }, + { + "epoch": 1.11, + "grad_norm": 0.6308985445671924, + "learning_rate": 4.316656330984556e-06, + "loss": 0.5428, + "step": 8749 + }, + { + "epoch": 1.11, + "grad_norm": 0.7165376891723766, + "learning_rate": 4.315634438895626e-06, + "loss": 0.4991, + "step": 8750 + }, + { + "epoch": 1.11, + "grad_norm": 0.762359985958814, + "learning_rate": 4.314612575937951e-06, + "loss": 0.5207, + "step": 8751 + }, + { + "epoch": 1.11, + "grad_norm": 0.727014580873897, + "learning_rate": 4.313590742155028e-06, + "loss": 0.4511, + "step": 8752 + }, + { + "epoch": 1.12, + "grad_norm": 0.8891728874339988, + "learning_rate": 4.312568937590354e-06, + "loss": 0.5493, + "step": 8753 + }, + { + "epoch": 1.12, + "grad_norm": 0.7236852647320612, + "learning_rate": 4.311547162287424e-06, + "loss": 0.4813, + "step": 8754 + }, + { + "epoch": 1.12, + "grad_norm": 0.7870785375394218, + "learning_rate": 4.3105254162897305e-06, + "loss": 0.5121, + "step": 8755 + }, + { + "epoch": 1.12, + "grad_norm": 0.6218075671455104, + "learning_rate": 4.309503699640768e-06, + "loss": 0.473, + "step": 8756 + }, + { + "epoch": 1.12, + "grad_norm": 0.7602237625412849, + "learning_rate": 4.308482012384025e-06, + "loss": 0.5028, + "step": 8757 + }, + { + "epoch": 1.12, + "grad_norm": 0.754321941869743, + "learning_rate": 4.307460354562995e-06, + "loss": 0.4829, + "step": 8758 + }, + { + "epoch": 1.12, + "grad_norm": 0.5843259666928085, + "learning_rate": 4.306438726221163e-06, + "loss": 0.4733, + "step": 8759 + }, + { + "epoch": 1.12, + "grad_norm": 0.6226233407243121, + "learning_rate": 4.305417127402018e-06, + "loss": 0.4648, + "step": 8760 + }, + { + "epoch": 1.12, + "grad_norm": 0.5072978089465365, + "learning_rate": 4.304395558149046e-06, + "loss": 0.3804, + "step": 8761 + }, + { + "epoch": 1.12, + "grad_norm": 0.5761660222577973, + "learning_rate": 4.3033740185057335e-06, + "loss": 0.451, + "step": 8762 + }, + { + "epoch": 1.12, + "grad_norm": 0.5597541850739801, + "learning_rate": 4.3023525085155625e-06, + "loss": 0.458, + "step": 8763 + }, + { + "epoch": 1.12, + "grad_norm": 0.666625679379583, + "learning_rate": 4.301331028222016e-06, + "loss": 0.4437, + "step": 8764 + }, + { + "epoch": 1.12, + "grad_norm": 0.6442802108456889, + "learning_rate": 4.300309577668575e-06, + "loss": 0.4981, + "step": 8765 + }, + { + "epoch": 1.12, + "grad_norm": 0.5518759462909568, + "learning_rate": 4.2992881568987175e-06, + "loss": 0.4141, + "step": 8766 + }, + { + "epoch": 1.12, + "grad_norm": 0.7275051122919908, + "learning_rate": 4.298266765955926e-06, + "loss": 0.4797, + "step": 8767 + }, + { + "epoch": 1.12, + "grad_norm": 0.826745839680901, + "learning_rate": 4.297245404883676e-06, + "loss": 0.6154, + "step": 8768 + }, + { + "epoch": 1.12, + "grad_norm": 0.7263518334293324, + "learning_rate": 4.296224073725443e-06, + "loss": 0.52, + "step": 8769 + }, + { + "epoch": 1.12, + "grad_norm": 0.5940595349038942, + "learning_rate": 4.2952027725247025e-06, + "loss": 0.4792, + "step": 8770 + }, + { + "epoch": 1.12, + "grad_norm": 0.7666531327699513, + "learning_rate": 4.294181501324928e-06, + "loss": 0.5156, + "step": 8771 + }, + { + "epoch": 1.12, + "grad_norm": 0.5711495507530732, + "learning_rate": 4.293160260169591e-06, + "loss": 0.477, + "step": 8772 + }, + { + "epoch": 1.12, + "grad_norm": 0.6589047347320118, + "learning_rate": 4.292139049102162e-06, + "loss": 0.4651, + "step": 8773 + }, + { + "epoch": 1.12, + "grad_norm": 0.5847462779913895, + "learning_rate": 4.291117868166113e-06, + "loss": 0.5123, + "step": 8774 + }, + { + "epoch": 1.12, + "grad_norm": 0.7016365497518163, + "learning_rate": 4.29009671740491e-06, + "loss": 0.5135, + "step": 8775 + }, + { + "epoch": 1.12, + "grad_norm": 0.74615706784095, + "learning_rate": 4.289075596862022e-06, + "loss": 0.4529, + "step": 8776 + }, + { + "epoch": 1.12, + "grad_norm": 0.8234423431622463, + "learning_rate": 4.288054506580914e-06, + "loss": 0.5127, + "step": 8777 + }, + { + "epoch": 1.12, + "grad_norm": 0.7751321035269879, + "learning_rate": 4.287033446605051e-06, + "loss": 0.5716, + "step": 8778 + }, + { + "epoch": 1.12, + "grad_norm": 0.8007655043186032, + "learning_rate": 4.2860124169778964e-06, + "loss": 0.5661, + "step": 8779 + }, + { + "epoch": 1.12, + "grad_norm": 0.5711652050877859, + "learning_rate": 4.28499141774291e-06, + "loss": 0.4334, + "step": 8780 + }, + { + "epoch": 1.12, + "grad_norm": 0.7716805471039628, + "learning_rate": 4.283970448943555e-06, + "loss": 0.4357, + "step": 8781 + }, + { + "epoch": 1.12, + "grad_norm": 0.5555181498804138, + "learning_rate": 4.2829495106232895e-06, + "loss": 0.425, + "step": 8782 + }, + { + "epoch": 1.12, + "grad_norm": 0.5868305371839402, + "learning_rate": 4.281928602825573e-06, + "loss": 0.4803, + "step": 8783 + }, + { + "epoch": 1.12, + "grad_norm": 0.6788657402725402, + "learning_rate": 4.280907725593861e-06, + "loss": 0.4927, + "step": 8784 + }, + { + "epoch": 1.12, + "grad_norm": 0.7123124466256725, + "learning_rate": 4.279886878971608e-06, + "loss": 0.5404, + "step": 8785 + }, + { + "epoch": 1.12, + "grad_norm": 0.7126931129527746, + "learning_rate": 4.278866063002268e-06, + "loss": 0.5243, + "step": 8786 + }, + { + "epoch": 1.12, + "grad_norm": 0.6864404507368663, + "learning_rate": 4.277845277729298e-06, + "loss": 0.5005, + "step": 8787 + }, + { + "epoch": 1.12, + "grad_norm": 0.7678498373500076, + "learning_rate": 4.276824523196146e-06, + "loss": 0.5279, + "step": 8788 + }, + { + "epoch": 1.12, + "grad_norm": 0.6298741301068632, + "learning_rate": 4.275803799446263e-06, + "loss": 0.4684, + "step": 8789 + }, + { + "epoch": 1.12, + "grad_norm": 0.8026094621604701, + "learning_rate": 4.274783106523099e-06, + "loss": 0.573, + "step": 8790 + }, + { + "epoch": 1.12, + "grad_norm": 0.65629232752146, + "learning_rate": 4.2737624444701e-06, + "loss": 0.5351, + "step": 8791 + }, + { + "epoch": 1.12, + "grad_norm": 0.8464828505114691, + "learning_rate": 4.272741813330713e-06, + "loss": 0.5553, + "step": 8792 + }, + { + "epoch": 1.12, + "grad_norm": 0.756340470284226, + "learning_rate": 4.271721213148384e-06, + "loss": 0.5518, + "step": 8793 + }, + { + "epoch": 1.12, + "grad_norm": 0.7783301469637252, + "learning_rate": 4.270700643966555e-06, + "loss": 0.5199, + "step": 8794 + }, + { + "epoch": 1.12, + "grad_norm": 0.6560574216946661, + "learning_rate": 4.2696801058286685e-06, + "loss": 0.4563, + "step": 8795 + }, + { + "epoch": 1.12, + "grad_norm": 0.5899349082505061, + "learning_rate": 4.268659598778166e-06, + "loss": 0.4473, + "step": 8796 + }, + { + "epoch": 1.12, + "grad_norm": 0.6963688927216121, + "learning_rate": 4.267639122858488e-06, + "loss": 0.4467, + "step": 8797 + }, + { + "epoch": 1.12, + "grad_norm": 0.6126800030936337, + "learning_rate": 4.2666186781130706e-06, + "loss": 0.4689, + "step": 8798 + }, + { + "epoch": 1.12, + "grad_norm": 0.7855198376686179, + "learning_rate": 4.265598264585355e-06, + "loss": 0.5468, + "step": 8799 + }, + { + "epoch": 1.12, + "grad_norm": 0.9390285293092171, + "learning_rate": 4.264577882318773e-06, + "loss": 0.5517, + "step": 8800 + }, + { + "epoch": 1.12, + "grad_norm": 0.6855521345250943, + "learning_rate": 4.263557531356761e-06, + "loss": 0.4548, + "step": 8801 + }, + { + "epoch": 1.12, + "grad_norm": 0.5769063696116823, + "learning_rate": 4.262537211742752e-06, + "loss": 0.4425, + "step": 8802 + }, + { + "epoch": 1.12, + "grad_norm": 0.5732954391046863, + "learning_rate": 4.261516923520177e-06, + "loss": 0.4125, + "step": 8803 + }, + { + "epoch": 1.12, + "grad_norm": 0.5956499422494753, + "learning_rate": 4.260496666732466e-06, + "loss": 0.4549, + "step": 8804 + }, + { + "epoch": 1.12, + "grad_norm": 0.6353305042450674, + "learning_rate": 4.25947644142305e-06, + "loss": 0.4711, + "step": 8805 + }, + { + "epoch": 1.12, + "grad_norm": 0.9048638763083205, + "learning_rate": 4.2584562476353545e-06, + "loss": 0.5383, + "step": 8806 + }, + { + "epoch": 1.12, + "grad_norm": 0.7014172119541553, + "learning_rate": 4.257436085412807e-06, + "loss": 0.5191, + "step": 8807 + }, + { + "epoch": 1.12, + "grad_norm": 0.9372793001861245, + "learning_rate": 4.256415954798833e-06, + "loss": 0.4708, + "step": 8808 + }, + { + "epoch": 1.12, + "grad_norm": 0.6538557046016998, + "learning_rate": 4.255395855836856e-06, + "loss": 0.442, + "step": 8809 + }, + { + "epoch": 1.12, + "grad_norm": 0.6381707019285497, + "learning_rate": 4.2543757885702956e-06, + "loss": 0.4181, + "step": 8810 + }, + { + "epoch": 1.12, + "grad_norm": 0.580561300876263, + "learning_rate": 4.253355753042576e-06, + "loss": 0.4428, + "step": 8811 + }, + { + "epoch": 1.12, + "grad_norm": 0.5893239744886439, + "learning_rate": 4.252335749297117e-06, + "loss": 0.4486, + "step": 8812 + }, + { + "epoch": 1.12, + "grad_norm": 0.6898567139437685, + "learning_rate": 4.2513157773773365e-06, + "loss": 0.4652, + "step": 8813 + }, + { + "epoch": 1.12, + "grad_norm": 0.7056015871467056, + "learning_rate": 4.2502958373266504e-06, + "loss": 0.5056, + "step": 8814 + }, + { + "epoch": 1.12, + "grad_norm": 0.5974202773834117, + "learning_rate": 4.249275929188475e-06, + "loss": 0.4249, + "step": 8815 + }, + { + "epoch": 1.12, + "grad_norm": 0.6334414090393656, + "learning_rate": 4.248256053006224e-06, + "loss": 0.4754, + "step": 8816 + }, + { + "epoch": 1.12, + "grad_norm": 0.7799495036823637, + "learning_rate": 4.247236208823311e-06, + "loss": 0.4843, + "step": 8817 + }, + { + "epoch": 1.12, + "grad_norm": 0.5887747793367608, + "learning_rate": 4.246216396683147e-06, + "loss": 0.527, + "step": 8818 + }, + { + "epoch": 1.12, + "grad_norm": 1.3043297308638795, + "learning_rate": 4.245196616629142e-06, + "loss": 0.546, + "step": 8819 + }, + { + "epoch": 1.12, + "grad_norm": 0.8643685450581178, + "learning_rate": 4.2441768687047055e-06, + "loss": 0.5224, + "step": 8820 + }, + { + "epoch": 1.12, + "grad_norm": 0.6272675124275903, + "learning_rate": 4.243157152953244e-06, + "loss": 0.481, + "step": 8821 + }, + { + "epoch": 1.12, + "grad_norm": 1.1797957295211192, + "learning_rate": 4.242137469418164e-06, + "loss": 0.4391, + "step": 8822 + }, + { + "epoch": 1.12, + "grad_norm": 3.0286094590805708, + "learning_rate": 4.241117818142869e-06, + "loss": 0.5224, + "step": 8823 + }, + { + "epoch": 1.12, + "grad_norm": 0.7572385102685983, + "learning_rate": 4.2400981991707654e-06, + "loss": 0.4413, + "step": 8824 + }, + { + "epoch": 1.12, + "grad_norm": 0.6005078606790096, + "learning_rate": 4.239078612545252e-06, + "loss": 0.4571, + "step": 8825 + }, + { + "epoch": 1.12, + "grad_norm": 0.6661899109138069, + "learning_rate": 4.238059058309731e-06, + "loss": 0.444, + "step": 8826 + }, + { + "epoch": 1.12, + "grad_norm": 0.8352314843219187, + "learning_rate": 4.2370395365076e-06, + "loss": 0.5569, + "step": 8827 + }, + { + "epoch": 1.12, + "grad_norm": 0.709840176708013, + "learning_rate": 4.236020047182259e-06, + "loss": 0.5002, + "step": 8828 + }, + { + "epoch": 1.12, + "grad_norm": 0.5991316827714752, + "learning_rate": 4.235000590377103e-06, + "loss": 0.4729, + "step": 8829 + }, + { + "epoch": 1.12, + "grad_norm": 0.5708281681060035, + "learning_rate": 4.233981166135527e-06, + "loss": 0.456, + "step": 8830 + }, + { + "epoch": 1.13, + "grad_norm": 0.8995247446030765, + "learning_rate": 4.2329617745009255e-06, + "loss": 0.523, + "step": 8831 + }, + { + "epoch": 1.13, + "grad_norm": 0.6005560682566539, + "learning_rate": 4.2319424155166896e-06, + "loss": 0.5023, + "step": 8832 + }, + { + "epoch": 1.13, + "grad_norm": 0.6794776933384548, + "learning_rate": 4.230923089226212e-06, + "loss": 0.4945, + "step": 8833 + }, + { + "epoch": 1.13, + "grad_norm": 0.8680528274323568, + "learning_rate": 4.22990379567288e-06, + "loss": 0.5473, + "step": 8834 + }, + { + "epoch": 1.13, + "grad_norm": 0.7804034149022726, + "learning_rate": 4.22888453490008e-06, + "loss": 0.5322, + "step": 8835 + }, + { + "epoch": 1.13, + "grad_norm": 1.2199721106463128, + "learning_rate": 4.227865306951205e-06, + "loss": 0.4914, + "step": 8836 + }, + { + "epoch": 1.13, + "grad_norm": 0.7822910630972447, + "learning_rate": 4.226846111869634e-06, + "loss": 0.515, + "step": 8837 + }, + { + "epoch": 1.13, + "grad_norm": 0.5832904899114207, + "learning_rate": 4.225826949698756e-06, + "loss": 0.4443, + "step": 8838 + }, + { + "epoch": 1.13, + "grad_norm": 0.7241557654056496, + "learning_rate": 4.224807820481949e-06, + "loss": 0.4813, + "step": 8839 + }, + { + "epoch": 1.13, + "grad_norm": 0.7094004004828398, + "learning_rate": 4.223788724262597e-06, + "loss": 0.5581, + "step": 8840 + }, + { + "epoch": 1.13, + "grad_norm": 0.7708877712455543, + "learning_rate": 4.222769661084078e-06, + "loss": 0.5518, + "step": 8841 + }, + { + "epoch": 1.13, + "grad_norm": 0.6136329442266498, + "learning_rate": 4.22175063098977e-06, + "loss": 0.4447, + "step": 8842 + }, + { + "epoch": 1.13, + "grad_norm": 0.6334621032670936, + "learning_rate": 4.2207316340230514e-06, + "loss": 0.4978, + "step": 8843 + }, + { + "epoch": 1.13, + "grad_norm": 0.6974682168905553, + "learning_rate": 4.219712670227297e-06, + "loss": 0.4957, + "step": 8844 + }, + { + "epoch": 1.13, + "grad_norm": 0.6308555769922878, + "learning_rate": 4.218693739645881e-06, + "loss": 0.4816, + "step": 8845 + }, + { + "epoch": 1.13, + "grad_norm": 0.7268561989071667, + "learning_rate": 4.217674842322176e-06, + "loss": 0.5463, + "step": 8846 + }, + { + "epoch": 1.13, + "grad_norm": 0.5778709276870324, + "learning_rate": 4.216655978299552e-06, + "loss": 0.4358, + "step": 8847 + }, + { + "epoch": 1.13, + "grad_norm": 0.5998293710339382, + "learning_rate": 4.215637147621378e-06, + "loss": 0.4907, + "step": 8848 + }, + { + "epoch": 1.13, + "grad_norm": 0.719870876190144, + "learning_rate": 4.214618350331027e-06, + "loss": 0.5108, + "step": 8849 + }, + { + "epoch": 1.13, + "grad_norm": 0.7255538086626865, + "learning_rate": 4.213599586471863e-06, + "loss": 0.5299, + "step": 8850 + }, + { + "epoch": 1.13, + "grad_norm": 0.6374626330450108, + "learning_rate": 4.212580856087251e-06, + "loss": 0.451, + "step": 8851 + }, + { + "epoch": 1.13, + "grad_norm": 0.7172841179198404, + "learning_rate": 4.2115621592205565e-06, + "loss": 0.5047, + "step": 8852 + }, + { + "epoch": 1.13, + "grad_norm": 0.8912355896496387, + "learning_rate": 4.210543495915141e-06, + "loss": 0.5374, + "step": 8853 + }, + { + "epoch": 1.13, + "grad_norm": 0.6367142068190395, + "learning_rate": 4.2095248662143666e-06, + "loss": 0.4474, + "step": 8854 + }, + { + "epoch": 1.13, + "grad_norm": 0.6315193094392435, + "learning_rate": 4.2085062701615916e-06, + "loss": 0.4302, + "step": 8855 + }, + { + "epoch": 1.13, + "grad_norm": 0.7011549418738294, + "learning_rate": 4.207487707800176e-06, + "loss": 0.4922, + "step": 8856 + }, + { + "epoch": 1.13, + "grad_norm": 0.6902072334270198, + "learning_rate": 4.206469179173476e-06, + "loss": 0.4887, + "step": 8857 + }, + { + "epoch": 1.13, + "grad_norm": 0.5876436355786838, + "learning_rate": 4.205450684324847e-06, + "loss": 0.4639, + "step": 8858 + }, + { + "epoch": 1.13, + "grad_norm": 0.5818643807679521, + "learning_rate": 4.2044322232976444e-06, + "loss": 0.4423, + "step": 8859 + }, + { + "epoch": 1.13, + "grad_norm": 0.568966457609983, + "learning_rate": 4.203413796135217e-06, + "loss": 0.4479, + "step": 8860 + }, + { + "epoch": 1.13, + "grad_norm": 0.6786900564098186, + "learning_rate": 4.202395402880921e-06, + "loss": 0.477, + "step": 8861 + }, + { + "epoch": 1.13, + "grad_norm": 0.7166849866313526, + "learning_rate": 4.201377043578103e-06, + "loss": 0.4841, + "step": 8862 + }, + { + "epoch": 1.13, + "grad_norm": 0.5869151770572896, + "learning_rate": 4.200358718270114e-06, + "loss": 0.4616, + "step": 8863 + }, + { + "epoch": 1.13, + "grad_norm": 0.7906584046026495, + "learning_rate": 4.199340427000298e-06, + "loss": 0.5174, + "step": 8864 + }, + { + "epoch": 1.13, + "grad_norm": 0.7584836030933219, + "learning_rate": 4.198322169812002e-06, + "loss": 0.5124, + "step": 8865 + }, + { + "epoch": 1.13, + "grad_norm": 0.7853310210595729, + "learning_rate": 4.197303946748569e-06, + "loss": 0.4599, + "step": 8866 + }, + { + "epoch": 1.13, + "grad_norm": 0.677042962099927, + "learning_rate": 4.196285757853342e-06, + "loss": 0.4626, + "step": 8867 + }, + { + "epoch": 1.13, + "grad_norm": 0.589929344842882, + "learning_rate": 4.195267603169662e-06, + "loss": 0.4007, + "step": 8868 + }, + { + "epoch": 1.13, + "grad_norm": 0.5947996911596473, + "learning_rate": 4.1942494827408685e-06, + "loss": 0.4221, + "step": 8869 + }, + { + "epoch": 1.13, + "grad_norm": 0.6595734885032779, + "learning_rate": 4.1932313966103e-06, + "loss": 0.4976, + "step": 8870 + }, + { + "epoch": 1.13, + "grad_norm": 1.1396244402972773, + "learning_rate": 4.192213344821293e-06, + "loss": 0.4839, + "step": 8871 + }, + { + "epoch": 1.13, + "grad_norm": 0.588643884617622, + "learning_rate": 4.191195327417179e-06, + "loss": 0.4778, + "step": 8872 + }, + { + "epoch": 1.13, + "grad_norm": 0.5849963640387403, + "learning_rate": 4.190177344441299e-06, + "loss": 0.4616, + "step": 8873 + }, + { + "epoch": 1.13, + "grad_norm": 0.6118014658680612, + "learning_rate": 4.189159395936982e-06, + "loss": 0.4733, + "step": 8874 + }, + { + "epoch": 1.13, + "grad_norm": 0.5344916359361866, + "learning_rate": 4.188141481947558e-06, + "loss": 0.409, + "step": 8875 + }, + { + "epoch": 1.13, + "grad_norm": 0.5913422283984344, + "learning_rate": 4.1871236025163555e-06, + "loss": 0.503, + "step": 8876 + }, + { + "epoch": 1.13, + "grad_norm": 0.7905925305725864, + "learning_rate": 4.186105757686705e-06, + "loss": 0.5289, + "step": 8877 + }, + { + "epoch": 1.13, + "grad_norm": 0.6753084030534934, + "learning_rate": 4.185087947501931e-06, + "loss": 0.4665, + "step": 8878 + }, + { + "epoch": 1.13, + "grad_norm": 0.5571779321620085, + "learning_rate": 4.184070172005359e-06, + "loss": 0.4388, + "step": 8879 + }, + { + "epoch": 1.13, + "grad_norm": 0.6044655266484702, + "learning_rate": 4.183052431240312e-06, + "loss": 0.4475, + "step": 8880 + }, + { + "epoch": 1.13, + "grad_norm": 1.2723111294082243, + "learning_rate": 4.182034725250114e-06, + "loss": 0.4747, + "step": 8881 + }, + { + "epoch": 1.13, + "grad_norm": 1.0176464400152592, + "learning_rate": 4.1810170540780826e-06, + "loss": 0.4841, + "step": 8882 + }, + { + "epoch": 1.13, + "grad_norm": 0.7675081835243758, + "learning_rate": 4.179999417767539e-06, + "loss": 0.5361, + "step": 8883 + }, + { + "epoch": 1.13, + "grad_norm": 0.7027056699374664, + "learning_rate": 4.178981816361799e-06, + "loss": 0.4888, + "step": 8884 + }, + { + "epoch": 1.13, + "grad_norm": 0.6187308169977014, + "learning_rate": 4.177964249904179e-06, + "loss": 0.4736, + "step": 8885 + }, + { + "epoch": 1.13, + "grad_norm": 0.6993935695545946, + "learning_rate": 4.176946718437994e-06, + "loss": 0.5236, + "step": 8886 + }, + { + "epoch": 1.13, + "grad_norm": 0.8617606858911112, + "learning_rate": 4.175929222006558e-06, + "loss": 0.5814, + "step": 8887 + }, + { + "epoch": 1.13, + "grad_norm": 0.6173512784789451, + "learning_rate": 4.174911760653182e-06, + "loss": 0.4179, + "step": 8888 + }, + { + "epoch": 1.13, + "grad_norm": 0.6606884887732691, + "learning_rate": 4.1738943344211735e-06, + "loss": 0.4781, + "step": 8889 + }, + { + "epoch": 1.13, + "grad_norm": 0.9421626490039012, + "learning_rate": 4.172876943353845e-06, + "loss": 0.5081, + "step": 8890 + }, + { + "epoch": 1.13, + "grad_norm": 0.6840009487430139, + "learning_rate": 4.171859587494502e-06, + "loss": 0.4728, + "step": 8891 + }, + { + "epoch": 1.13, + "grad_norm": 0.6360795948164695, + "learning_rate": 4.17084226688645e-06, + "loss": 0.4867, + "step": 8892 + }, + { + "epoch": 1.13, + "grad_norm": 0.8072297640581896, + "learning_rate": 4.169824981572993e-06, + "loss": 0.5327, + "step": 8893 + }, + { + "epoch": 1.13, + "grad_norm": 0.6302079145475248, + "learning_rate": 4.168807731597434e-06, + "loss": 0.4216, + "step": 8894 + }, + { + "epoch": 1.13, + "grad_norm": 0.6265915277445072, + "learning_rate": 4.167790517003074e-06, + "loss": 0.4466, + "step": 8895 + }, + { + "epoch": 1.13, + "grad_norm": 0.8306004570823654, + "learning_rate": 4.166773337833212e-06, + "loss": 0.5239, + "step": 8896 + }, + { + "epoch": 1.13, + "grad_norm": 1.0538530611806352, + "learning_rate": 4.165756194131145e-06, + "loss": 0.51, + "step": 8897 + }, + { + "epoch": 1.13, + "grad_norm": 0.7418913940773039, + "learning_rate": 4.164739085940172e-06, + "loss": 0.5105, + "step": 8898 + }, + { + "epoch": 1.13, + "grad_norm": 0.6867392525629221, + "learning_rate": 4.163722013303587e-06, + "loss": 0.5311, + "step": 8899 + }, + { + "epoch": 1.13, + "grad_norm": 0.738323713708081, + "learning_rate": 4.1627049762646845e-06, + "loss": 0.5327, + "step": 8900 + }, + { + "epoch": 1.13, + "grad_norm": 0.6522210674657647, + "learning_rate": 4.1616879748667545e-06, + "loss": 0.494, + "step": 8901 + }, + { + "epoch": 1.13, + "grad_norm": 0.8231186804607882, + "learning_rate": 4.1606710091530885e-06, + "loss": 0.4958, + "step": 8902 + }, + { + "epoch": 1.13, + "grad_norm": 0.5755750166470923, + "learning_rate": 4.159654079166976e-06, + "loss": 0.4581, + "step": 8903 + }, + { + "epoch": 1.13, + "grad_norm": 0.7916153775811734, + "learning_rate": 4.158637184951704e-06, + "loss": 0.5453, + "step": 8904 + }, + { + "epoch": 1.13, + "grad_norm": 0.8241823371799982, + "learning_rate": 4.157620326550558e-06, + "loss": 0.575, + "step": 8905 + }, + { + "epoch": 1.13, + "grad_norm": 0.8173129061707199, + "learning_rate": 4.156603504006824e-06, + "loss": 0.5308, + "step": 8906 + }, + { + "epoch": 1.13, + "grad_norm": 0.5174368334878502, + "learning_rate": 4.1555867173637825e-06, + "loss": 0.4307, + "step": 8907 + }, + { + "epoch": 1.13, + "grad_norm": 0.6941899279197069, + "learning_rate": 4.154569966664716e-06, + "loss": 0.5053, + "step": 8908 + }, + { + "epoch": 1.13, + "grad_norm": 0.7752276093695819, + "learning_rate": 4.1535532519529056e-06, + "loss": 0.4935, + "step": 8909 + }, + { + "epoch": 1.14, + "grad_norm": 0.5709264687011458, + "learning_rate": 4.152536573271625e-06, + "loss": 0.4682, + "step": 8910 + }, + { + "epoch": 1.14, + "grad_norm": 0.8372322005711126, + "learning_rate": 4.151519930664158e-06, + "loss": 0.5747, + "step": 8911 + }, + { + "epoch": 1.14, + "grad_norm": 0.623406384653625, + "learning_rate": 4.150503324173775e-06, + "loss": 0.4411, + "step": 8912 + }, + { + "epoch": 1.14, + "grad_norm": 0.6222006459899796, + "learning_rate": 4.14948675384375e-06, + "loss": 0.4718, + "step": 8913 + }, + { + "epoch": 1.14, + "grad_norm": 0.6049654092042297, + "learning_rate": 4.148470219717358e-06, + "loss": 0.4993, + "step": 8914 + }, + { + "epoch": 1.14, + "grad_norm": 0.7745050340158272, + "learning_rate": 4.147453721837866e-06, + "loss": 0.5273, + "step": 8915 + }, + { + "epoch": 1.14, + "grad_norm": 0.6109987895486814, + "learning_rate": 4.146437260248546e-06, + "loss": 0.4503, + "step": 8916 + }, + { + "epoch": 1.14, + "grad_norm": 0.6793429391527532, + "learning_rate": 4.145420834992664e-06, + "loss": 0.5184, + "step": 8917 + }, + { + "epoch": 1.14, + "grad_norm": 0.8818497858691048, + "learning_rate": 4.144404446113486e-06, + "loss": 0.5765, + "step": 8918 + }, + { + "epoch": 1.14, + "grad_norm": 1.021306485742972, + "learning_rate": 4.143388093654276e-06, + "loss": 0.5162, + "step": 8919 + }, + { + "epoch": 1.14, + "grad_norm": 0.5859749858002101, + "learning_rate": 4.142371777658299e-06, + "loss": 0.4239, + "step": 8920 + }, + { + "epoch": 1.14, + "grad_norm": 0.6299049601946107, + "learning_rate": 4.141355498168813e-06, + "loss": 0.4581, + "step": 8921 + }, + { + "epoch": 1.14, + "grad_norm": 0.8455046076539325, + "learning_rate": 4.140339255229079e-06, + "loss": 0.5296, + "step": 8922 + }, + { + "epoch": 1.14, + "grad_norm": 1.136195459596776, + "learning_rate": 4.139323048882357e-06, + "loss": 0.5579, + "step": 8923 + }, + { + "epoch": 1.14, + "grad_norm": 0.7686804183218642, + "learning_rate": 4.138306879171903e-06, + "loss": 0.5021, + "step": 8924 + }, + { + "epoch": 1.14, + "grad_norm": 0.6137399353408208, + "learning_rate": 4.137290746140972e-06, + "loss": 0.4575, + "step": 8925 + }, + { + "epoch": 1.14, + "grad_norm": 0.8048452011345688, + "learning_rate": 4.136274649832816e-06, + "loss": 0.4721, + "step": 8926 + }, + { + "epoch": 1.14, + "grad_norm": 0.6495430245986274, + "learning_rate": 4.135258590290688e-06, + "loss": 0.4444, + "step": 8927 + }, + { + "epoch": 1.14, + "grad_norm": 0.847925245009506, + "learning_rate": 4.13424256755784e-06, + "loss": 0.5336, + "step": 8928 + }, + { + "epoch": 1.14, + "grad_norm": 0.6373667724772821, + "learning_rate": 4.133226581677518e-06, + "loss": 0.4617, + "step": 8929 + }, + { + "epoch": 1.14, + "grad_norm": 0.6544387934433573, + "learning_rate": 4.132210632692971e-06, + "loss": 0.4975, + "step": 8930 + }, + { + "epoch": 1.14, + "grad_norm": 0.952005884589329, + "learning_rate": 4.131194720647445e-06, + "loss": 0.5441, + "step": 8931 + }, + { + "epoch": 1.14, + "grad_norm": 0.7160581309498305, + "learning_rate": 4.130178845584183e-06, + "loss": 0.4449, + "step": 8932 + }, + { + "epoch": 1.14, + "grad_norm": 0.606984407071753, + "learning_rate": 4.129163007546427e-06, + "loss": 0.4834, + "step": 8933 + }, + { + "epoch": 1.14, + "grad_norm": 0.6049660768632178, + "learning_rate": 4.128147206577417e-06, + "loss": 0.4182, + "step": 8934 + }, + { + "epoch": 1.14, + "grad_norm": 1.016503241966969, + "learning_rate": 4.127131442720398e-06, + "loss": 0.4922, + "step": 8935 + }, + { + "epoch": 1.14, + "grad_norm": 0.5893463205746421, + "learning_rate": 4.126115716018603e-06, + "loss": 0.4684, + "step": 8936 + }, + { + "epoch": 1.14, + "grad_norm": 0.6885041023236379, + "learning_rate": 4.12510002651527e-06, + "loss": 0.4548, + "step": 8937 + }, + { + "epoch": 1.14, + "grad_norm": 0.6365596791560154, + "learning_rate": 4.124084374253633e-06, + "loss": 0.4616, + "step": 8938 + }, + { + "epoch": 1.14, + "grad_norm": 0.7166848715376369, + "learning_rate": 4.123068759276925e-06, + "loss": 0.5603, + "step": 8939 + }, + { + "epoch": 1.14, + "grad_norm": 0.8134642852047768, + "learning_rate": 4.1220531816283775e-06, + "loss": 0.5208, + "step": 8940 + }, + { + "epoch": 1.14, + "grad_norm": 0.9409062362021231, + "learning_rate": 4.121037641351221e-06, + "loss": 0.5247, + "step": 8941 + }, + { + "epoch": 1.14, + "grad_norm": 0.5593972316517262, + "learning_rate": 4.120022138488683e-06, + "loss": 0.4282, + "step": 8942 + }, + { + "epoch": 1.14, + "grad_norm": 0.6206467823627109, + "learning_rate": 4.119006673083991e-06, + "loss": 0.5663, + "step": 8943 + }, + { + "epoch": 1.14, + "grad_norm": 1.049580058284063, + "learning_rate": 4.117991245180369e-06, + "loss": 0.4577, + "step": 8944 + }, + { + "epoch": 1.14, + "grad_norm": 0.5608642755061072, + "learning_rate": 4.116975854821041e-06, + "loss": 0.4343, + "step": 8945 + }, + { + "epoch": 1.14, + "grad_norm": 0.6027099428056472, + "learning_rate": 4.11596050204923e-06, + "loss": 0.4791, + "step": 8946 + }, + { + "epoch": 1.14, + "grad_norm": 0.5888289668248263, + "learning_rate": 4.114945186908154e-06, + "loss": 0.4461, + "step": 8947 + }, + { + "epoch": 1.14, + "grad_norm": 0.5931289785112412, + "learning_rate": 4.113929909441034e-06, + "loss": 0.4489, + "step": 8948 + }, + { + "epoch": 1.14, + "grad_norm": 0.6288183169782241, + "learning_rate": 4.112914669691086e-06, + "loss": 0.466, + "step": 8949 + }, + { + "epoch": 1.14, + "grad_norm": 2.130370526799863, + "learning_rate": 4.1118994677015265e-06, + "loss": 0.5343, + "step": 8950 + }, + { + "epoch": 1.14, + "grad_norm": 0.6846604108310098, + "learning_rate": 4.110884303515568e-06, + "loss": 0.4754, + "step": 8951 + }, + { + "epoch": 1.14, + "grad_norm": 0.571214086548935, + "learning_rate": 4.109869177176424e-06, + "loss": 0.4288, + "step": 8952 + }, + { + "epoch": 1.14, + "grad_norm": 0.6284123393722311, + "learning_rate": 4.108854088727304e-06, + "loss": 0.4621, + "step": 8953 + }, + { + "epoch": 1.14, + "grad_norm": 0.597164421679138, + "learning_rate": 4.107839038211419e-06, + "loss": 0.5202, + "step": 8954 + }, + { + "epoch": 1.14, + "grad_norm": 0.7273766474979937, + "learning_rate": 4.1068240256719746e-06, + "loss": 0.4573, + "step": 8955 + }, + { + "epoch": 1.14, + "grad_norm": 0.6700952731868057, + "learning_rate": 4.105809051152177e-06, + "loss": 0.4877, + "step": 8956 + }, + { + "epoch": 1.14, + "grad_norm": 0.9662626445861817, + "learning_rate": 4.104794114695231e-06, + "loss": 0.5201, + "step": 8957 + }, + { + "epoch": 1.14, + "grad_norm": 0.6337353218064731, + "learning_rate": 4.103779216344339e-06, + "loss": 0.4806, + "step": 8958 + }, + { + "epoch": 1.14, + "grad_norm": 0.597347468555482, + "learning_rate": 4.1027643561427e-06, + "loss": 0.4529, + "step": 8959 + }, + { + "epoch": 1.14, + "grad_norm": 0.7032296208176871, + "learning_rate": 4.101749534133517e-06, + "loss": 0.5353, + "step": 8960 + }, + { + "epoch": 1.14, + "grad_norm": 0.8109327372478944, + "learning_rate": 4.100734750359986e-06, + "loss": 0.4953, + "step": 8961 + }, + { + "epoch": 1.14, + "grad_norm": 0.5384636345651621, + "learning_rate": 4.099720004865303e-06, + "loss": 0.4236, + "step": 8962 + }, + { + "epoch": 1.14, + "grad_norm": 0.5847172012755929, + "learning_rate": 4.098705297692662e-06, + "loss": 0.4492, + "step": 8963 + }, + { + "epoch": 1.14, + "grad_norm": 0.6224556095872419, + "learning_rate": 4.097690628885257e-06, + "loss": 0.4971, + "step": 8964 + }, + { + "epoch": 1.14, + "grad_norm": 0.8287472555661987, + "learning_rate": 4.096675998486278e-06, + "loss": 0.5301, + "step": 8965 + }, + { + "epoch": 1.14, + "grad_norm": 0.7877916671790234, + "learning_rate": 4.095661406538916e-06, + "loss": 0.5595, + "step": 8966 + }, + { + "epoch": 1.14, + "grad_norm": 0.6907176868516108, + "learning_rate": 4.094646853086357e-06, + "loss": 0.5248, + "step": 8967 + }, + { + "epoch": 1.14, + "grad_norm": 0.6387418884297833, + "learning_rate": 4.093632338171789e-06, + "loss": 0.5222, + "step": 8968 + }, + { + "epoch": 1.14, + "grad_norm": 0.6841759868077028, + "learning_rate": 4.092617861838396e-06, + "loss": 0.4741, + "step": 8969 + }, + { + "epoch": 1.14, + "grad_norm": 0.5932066233094653, + "learning_rate": 4.09160342412936e-06, + "loss": 0.4517, + "step": 8970 + }, + { + "epoch": 1.14, + "grad_norm": 0.5560793362032284, + "learning_rate": 4.090589025087864e-06, + "loss": 0.4687, + "step": 8971 + }, + { + "epoch": 1.14, + "grad_norm": 0.67358657501909, + "learning_rate": 4.089574664757086e-06, + "loss": 0.503, + "step": 8972 + }, + { + "epoch": 1.14, + "grad_norm": 0.5601399046352287, + "learning_rate": 4.088560343180207e-06, + "loss": 0.4659, + "step": 8973 + }, + { + "epoch": 1.14, + "grad_norm": 0.5806752798821936, + "learning_rate": 4.087546060400401e-06, + "loss": 0.4736, + "step": 8974 + }, + { + "epoch": 1.14, + "grad_norm": 0.890918080455701, + "learning_rate": 4.0865318164608435e-06, + "loss": 0.432, + "step": 8975 + }, + { + "epoch": 1.14, + "grad_norm": 0.6062355402362681, + "learning_rate": 4.085517611404708e-06, + "loss": 0.5202, + "step": 8976 + }, + { + "epoch": 1.14, + "grad_norm": 0.6896890431382324, + "learning_rate": 4.0845034452751656e-06, + "loss": 0.4868, + "step": 8977 + }, + { + "epoch": 1.14, + "grad_norm": 0.6243644930165145, + "learning_rate": 4.083489318115385e-06, + "loss": 0.4485, + "step": 8978 + }, + { + "epoch": 1.14, + "grad_norm": 0.6919195789594957, + "learning_rate": 4.082475229968536e-06, + "loss": 0.5431, + "step": 8979 + }, + { + "epoch": 1.14, + "grad_norm": 0.9770142347219757, + "learning_rate": 4.0814611808777845e-06, + "loss": 0.5338, + "step": 8980 + }, + { + "epoch": 1.14, + "grad_norm": 0.7213578510869765, + "learning_rate": 4.080447170886296e-06, + "loss": 0.481, + "step": 8981 + }, + { + "epoch": 1.14, + "grad_norm": 0.6027346606298167, + "learning_rate": 4.079433200037233e-06, + "loss": 0.4404, + "step": 8982 + }, + { + "epoch": 1.14, + "grad_norm": 0.6706439239375428, + "learning_rate": 4.078419268373756e-06, + "loss": 0.493, + "step": 8983 + }, + { + "epoch": 1.14, + "grad_norm": 1.4632090335888754, + "learning_rate": 4.077405375939023e-06, + "loss": 0.5167, + "step": 8984 + }, + { + "epoch": 1.14, + "grad_norm": 0.5743036022268803, + "learning_rate": 4.076391522776198e-06, + "loss": 0.4724, + "step": 8985 + }, + { + "epoch": 1.14, + "grad_norm": 1.000783417638258, + "learning_rate": 4.0753777089284345e-06, + "loss": 0.4938, + "step": 8986 + }, + { + "epoch": 1.14, + "grad_norm": 0.6789018943041462, + "learning_rate": 4.074363934438888e-06, + "loss": 0.4857, + "step": 8987 + }, + { + "epoch": 1.15, + "grad_norm": 1.0172097982351374, + "learning_rate": 4.07335019935071e-06, + "loss": 0.503, + "step": 8988 + }, + { + "epoch": 1.15, + "grad_norm": 0.5806497672232582, + "learning_rate": 4.072336503707053e-06, + "loss": 0.4173, + "step": 8989 + }, + { + "epoch": 1.15, + "grad_norm": 0.6914722764442861, + "learning_rate": 4.071322847551066e-06, + "loss": 0.4729, + "step": 8990 + }, + { + "epoch": 1.15, + "grad_norm": 0.7195830021633124, + "learning_rate": 4.070309230925898e-06, + "loss": 0.4907, + "step": 8991 + }, + { + "epoch": 1.15, + "grad_norm": 0.6125785274309221, + "learning_rate": 4.0692956538746966e-06, + "loss": 0.4695, + "step": 8992 + }, + { + "epoch": 1.15, + "grad_norm": 0.6989050923253964, + "learning_rate": 4.068282116440604e-06, + "loss": 0.4747, + "step": 8993 + }, + { + "epoch": 1.15, + "grad_norm": 0.5832176441382122, + "learning_rate": 4.067268618666764e-06, + "loss": 0.4553, + "step": 8994 + }, + { + "epoch": 1.15, + "grad_norm": 0.6889027638096344, + "learning_rate": 4.066255160596319e-06, + "loss": 0.5304, + "step": 8995 + }, + { + "epoch": 1.15, + "grad_norm": 0.761549075424479, + "learning_rate": 4.0652417422724054e-06, + "loss": 0.5378, + "step": 8996 + }, + { + "epoch": 1.15, + "grad_norm": 0.6770589232801114, + "learning_rate": 4.064228363738166e-06, + "loss": 0.5063, + "step": 8997 + }, + { + "epoch": 1.15, + "grad_norm": 0.6509939339017758, + "learning_rate": 4.063215025036735e-06, + "loss": 0.492, + "step": 8998 + }, + { + "epoch": 1.15, + "grad_norm": 0.8168004636442611, + "learning_rate": 4.062201726211247e-06, + "loss": 0.5236, + "step": 8999 + }, + { + "epoch": 1.15, + "grad_norm": 0.8033615596745731, + "learning_rate": 4.061188467304834e-06, + "loss": 0.5269, + "step": 9000 + }, + { + "epoch": 1.15, + "grad_norm": 0.638583581326899, + "learning_rate": 4.060175248360629e-06, + "loss": 0.4864, + "step": 9001 + }, + { + "epoch": 1.15, + "grad_norm": 0.7500171028160009, + "learning_rate": 4.05916206942176e-06, + "loss": 0.4886, + "step": 9002 + }, + { + "epoch": 1.15, + "grad_norm": 0.6828589949118525, + "learning_rate": 4.058148930531355e-06, + "loss": 0.5003, + "step": 9003 + }, + { + "epoch": 1.15, + "grad_norm": 0.7868472436310205, + "learning_rate": 4.05713583173254e-06, + "loss": 0.5041, + "step": 9004 + }, + { + "epoch": 1.15, + "grad_norm": 0.6050353891825929, + "learning_rate": 4.056122773068441e-06, + "loss": 0.4544, + "step": 9005 + }, + { + "epoch": 1.15, + "grad_norm": 0.8083388533830902, + "learning_rate": 4.055109754582178e-06, + "loss": 0.4887, + "step": 9006 + }, + { + "epoch": 1.15, + "grad_norm": 0.646573712630938, + "learning_rate": 4.054096776316874e-06, + "loss": 0.4645, + "step": 9007 + }, + { + "epoch": 1.15, + "grad_norm": 0.672270820341946, + "learning_rate": 4.053083838315647e-06, + "loss": 0.4538, + "step": 9008 + }, + { + "epoch": 1.15, + "grad_norm": 0.6628567131546937, + "learning_rate": 4.052070940621614e-06, + "loss": 0.432, + "step": 9009 + }, + { + "epoch": 1.15, + "grad_norm": 0.6478913514779144, + "learning_rate": 4.051058083277894e-06, + "loss": 0.4497, + "step": 9010 + }, + { + "epoch": 1.15, + "grad_norm": 0.5845577247889107, + "learning_rate": 4.050045266327598e-06, + "loss": 0.4767, + "step": 9011 + }, + { + "epoch": 1.15, + "grad_norm": 1.0573059363922057, + "learning_rate": 4.04903248981384e-06, + "loss": 0.5476, + "step": 9012 + }, + { + "epoch": 1.15, + "grad_norm": 0.6472651449472857, + "learning_rate": 4.0480197537797286e-06, + "loss": 0.4536, + "step": 9013 + }, + { + "epoch": 1.15, + "grad_norm": 0.7598329002180562, + "learning_rate": 4.047007058268376e-06, + "loss": 0.5106, + "step": 9014 + }, + { + "epoch": 1.15, + "grad_norm": 0.7673200262458939, + "learning_rate": 4.045994403322887e-06, + "loss": 0.5061, + "step": 9015 + }, + { + "epoch": 1.15, + "grad_norm": 0.9184684277275242, + "learning_rate": 4.044981788986367e-06, + "loss": 0.4862, + "step": 9016 + }, + { + "epoch": 1.15, + "grad_norm": 0.6969550444167093, + "learning_rate": 4.043969215301922e-06, + "loss": 0.5409, + "step": 9017 + }, + { + "epoch": 1.15, + "grad_norm": 0.6221736352650029, + "learning_rate": 4.042956682312651e-06, + "loss": 0.476, + "step": 9018 + }, + { + "epoch": 1.15, + "grad_norm": 0.6134582174370081, + "learning_rate": 4.041944190061656e-06, + "loss": 0.4461, + "step": 9019 + }, + { + "epoch": 1.15, + "grad_norm": 0.631557806281466, + "learning_rate": 4.040931738592036e-06, + "loss": 0.4557, + "step": 9020 + }, + { + "epoch": 1.15, + "grad_norm": 0.6065319619591251, + "learning_rate": 4.039919327946886e-06, + "loss": 0.4687, + "step": 9021 + }, + { + "epoch": 1.15, + "grad_norm": 0.7023971073960384, + "learning_rate": 4.038906958169303e-06, + "loss": 0.4781, + "step": 9022 + }, + { + "epoch": 1.15, + "grad_norm": 0.6913650205082353, + "learning_rate": 4.0378946293023796e-06, + "loss": 0.5183, + "step": 9023 + }, + { + "epoch": 1.15, + "grad_norm": 0.7805906534382324, + "learning_rate": 4.036882341389207e-06, + "loss": 0.443, + "step": 9024 + }, + { + "epoch": 1.15, + "grad_norm": 0.6412853220892517, + "learning_rate": 4.035870094472876e-06, + "loss": 0.4661, + "step": 9025 + }, + { + "epoch": 1.15, + "grad_norm": 0.5563209632543178, + "learning_rate": 4.034857888596474e-06, + "loss": 0.3932, + "step": 9026 + }, + { + "epoch": 1.15, + "grad_norm": 0.6111839346361122, + "learning_rate": 4.033845723803088e-06, + "loss": 0.4411, + "step": 9027 + }, + { + "epoch": 1.15, + "grad_norm": 0.6244846543392127, + "learning_rate": 4.032833600135802e-06, + "loss": 0.462, + "step": 9028 + }, + { + "epoch": 1.15, + "grad_norm": 0.6177985738441802, + "learning_rate": 4.031821517637698e-06, + "loss": 0.4518, + "step": 9029 + }, + { + "epoch": 1.15, + "grad_norm": 0.6938530739007701, + "learning_rate": 4.030809476351859e-06, + "loss": 0.4707, + "step": 9030 + }, + { + "epoch": 1.15, + "grad_norm": 0.6054436275252574, + "learning_rate": 4.029797476321363e-06, + "loss": 0.4833, + "step": 9031 + }, + { + "epoch": 1.15, + "grad_norm": 0.6034519654419125, + "learning_rate": 4.028785517589289e-06, + "loss": 0.4635, + "step": 9032 + }, + { + "epoch": 1.15, + "grad_norm": 0.8155015179816563, + "learning_rate": 4.027773600198711e-06, + "loss": 0.4319, + "step": 9033 + }, + { + "epoch": 1.15, + "grad_norm": 0.6682705107658354, + "learning_rate": 4.026761724192702e-06, + "loss": 0.4966, + "step": 9034 + }, + { + "epoch": 1.15, + "grad_norm": 0.7560401763168254, + "learning_rate": 4.02574988961434e-06, + "loss": 0.5027, + "step": 9035 + }, + { + "epoch": 1.15, + "grad_norm": 0.6071252331146583, + "learning_rate": 4.02473809650669e-06, + "loss": 0.4222, + "step": 9036 + }, + { + "epoch": 1.15, + "grad_norm": 0.5665434685751473, + "learning_rate": 4.023726344912824e-06, + "loss": 0.447, + "step": 9037 + }, + { + "epoch": 1.15, + "grad_norm": 0.5770185103714324, + "learning_rate": 4.0227146348758075e-06, + "loss": 0.5072, + "step": 9038 + }, + { + "epoch": 1.15, + "grad_norm": 0.5667922612344445, + "learning_rate": 4.021702966438705e-06, + "loss": 0.4384, + "step": 9039 + }, + { + "epoch": 1.15, + "grad_norm": 0.6097516268978369, + "learning_rate": 4.020691339644582e-06, + "loss": 0.492, + "step": 9040 + }, + { + "epoch": 1.15, + "grad_norm": 0.7460488808352627, + "learning_rate": 4.0196797545364995e-06, + "loss": 0.5028, + "step": 9041 + }, + { + "epoch": 1.15, + "grad_norm": 0.6187724371519613, + "learning_rate": 4.018668211157516e-06, + "loss": 0.4718, + "step": 9042 + }, + { + "epoch": 1.15, + "grad_norm": 0.7455951878050731, + "learning_rate": 4.017656709550693e-06, + "loss": 0.5081, + "step": 9043 + }, + { + "epoch": 1.15, + "grad_norm": 0.7889960882207994, + "learning_rate": 4.016645249759082e-06, + "loss": 0.4602, + "step": 9044 + }, + { + "epoch": 1.15, + "grad_norm": 0.6143357283415994, + "learning_rate": 4.0156338318257425e-06, + "loss": 0.4307, + "step": 9045 + }, + { + "epoch": 1.15, + "grad_norm": 0.5838916648152049, + "learning_rate": 4.014622455793723e-06, + "loss": 0.4506, + "step": 9046 + }, + { + "epoch": 1.15, + "grad_norm": 0.5911372939789319, + "learning_rate": 4.013611121706077e-06, + "loss": 0.5167, + "step": 9047 + }, + { + "epoch": 1.15, + "grad_norm": 0.8257912464304906, + "learning_rate": 4.012599829605855e-06, + "loss": 0.4686, + "step": 9048 + }, + { + "epoch": 1.15, + "grad_norm": 0.7150846440018089, + "learning_rate": 4.011588579536103e-06, + "loss": 0.5238, + "step": 9049 + }, + { + "epoch": 1.15, + "grad_norm": 0.8119351417033279, + "learning_rate": 4.010577371539867e-06, + "loss": 0.4893, + "step": 9050 + }, + { + "epoch": 1.15, + "grad_norm": 0.626663596403873, + "learning_rate": 4.009566205660189e-06, + "loss": 0.4948, + "step": 9051 + }, + { + "epoch": 1.15, + "grad_norm": 0.8603723504016599, + "learning_rate": 4.0085550819401134e-06, + "loss": 0.5311, + "step": 9052 + }, + { + "epoch": 1.15, + "grad_norm": 0.9767395437781328, + "learning_rate": 4.00754400042268e-06, + "loss": 0.5491, + "step": 9053 + }, + { + "epoch": 1.15, + "grad_norm": 0.8276632949212203, + "learning_rate": 4.006532961150927e-06, + "loss": 0.4969, + "step": 9054 + }, + { + "epoch": 1.15, + "grad_norm": 0.6460748235757424, + "learning_rate": 4.005521964167891e-06, + "loss": 0.4752, + "step": 9055 + }, + { + "epoch": 1.15, + "grad_norm": 0.6507742802554809, + "learning_rate": 4.004511009516607e-06, + "loss": 0.4548, + "step": 9056 + }, + { + "epoch": 1.15, + "grad_norm": 0.5536586439501516, + "learning_rate": 4.003500097240109e-06, + "loss": 0.4739, + "step": 9057 + }, + { + "epoch": 1.15, + "grad_norm": 0.6861642040176475, + "learning_rate": 4.002489227381425e-06, + "loss": 0.4602, + "step": 9058 + }, + { + "epoch": 1.15, + "grad_norm": 0.7315041685257905, + "learning_rate": 4.001478399983589e-06, + "loss": 0.4793, + "step": 9059 + }, + { + "epoch": 1.15, + "grad_norm": 0.5938745482269892, + "learning_rate": 4.000467615089626e-06, + "loss": 0.4507, + "step": 9060 + }, + { + "epoch": 1.15, + "grad_norm": 0.6054268412964247, + "learning_rate": 3.999456872742564e-06, + "loss": 0.4614, + "step": 9061 + }, + { + "epoch": 1.15, + "grad_norm": 0.8533290144798112, + "learning_rate": 3.9984461729854245e-06, + "loss": 0.4816, + "step": 9062 + }, + { + "epoch": 1.15, + "grad_norm": 0.7309372142370067, + "learning_rate": 3.997435515861231e-06, + "loss": 0.5166, + "step": 9063 + }, + { + "epoch": 1.15, + "grad_norm": 0.5973625572814367, + "learning_rate": 3.996424901413004e-06, + "loss": 0.4724, + "step": 9064 + }, + { + "epoch": 1.15, + "grad_norm": 0.7438344152897951, + "learning_rate": 3.995414329683762e-06, + "loss": 0.461, + "step": 9065 + }, + { + "epoch": 1.15, + "grad_norm": 0.5631106322421988, + "learning_rate": 3.994403800716523e-06, + "loss": 0.4277, + "step": 9066 + }, + { + "epoch": 1.16, + "grad_norm": 0.5838279354196818, + "learning_rate": 3.993393314554299e-06, + "loss": 0.4752, + "step": 9067 + }, + { + "epoch": 1.16, + "grad_norm": 0.7605875248332076, + "learning_rate": 3.992382871240106e-06, + "loss": 0.4666, + "step": 9068 + }, + { + "epoch": 1.16, + "grad_norm": 0.680995340606293, + "learning_rate": 3.991372470816954e-06, + "loss": 0.4718, + "step": 9069 + }, + { + "epoch": 1.16, + "grad_norm": 0.5152597336678216, + "learning_rate": 3.990362113327853e-06, + "loss": 0.3887, + "step": 9070 + }, + { + "epoch": 1.16, + "grad_norm": 0.6079573837752223, + "learning_rate": 3.989351798815808e-06, + "loss": 0.4939, + "step": 9071 + }, + { + "epoch": 1.16, + "grad_norm": 0.8132652654058701, + "learning_rate": 3.98834152732383e-06, + "loss": 0.4989, + "step": 9072 + }, + { + "epoch": 1.16, + "grad_norm": 0.5725514724098311, + "learning_rate": 3.98733129889492e-06, + "loss": 0.4218, + "step": 9073 + }, + { + "epoch": 1.16, + "grad_norm": 0.5594307124063322, + "learning_rate": 3.98632111357208e-06, + "loss": 0.465, + "step": 9074 + }, + { + "epoch": 1.16, + "grad_norm": 0.5908550158977094, + "learning_rate": 3.98531097139831e-06, + "loss": 0.4296, + "step": 9075 + }, + { + "epoch": 1.16, + "grad_norm": 0.7342108400684618, + "learning_rate": 3.984300872416612e-06, + "loss": 0.552, + "step": 9076 + }, + { + "epoch": 1.16, + "grad_norm": 0.8040761807166786, + "learning_rate": 3.983290816669979e-06, + "loss": 0.5334, + "step": 9077 + }, + { + "epoch": 1.16, + "grad_norm": 0.7062037775856744, + "learning_rate": 3.982280804201407e-06, + "loss": 0.4834, + "step": 9078 + }, + { + "epoch": 1.16, + "grad_norm": 0.7694091459615919, + "learning_rate": 3.981270835053889e-06, + "loss": 0.474, + "step": 9079 + }, + { + "epoch": 1.16, + "grad_norm": 0.6418411681188796, + "learning_rate": 3.980260909270416e-06, + "loss": 0.4525, + "step": 9080 + }, + { + "epoch": 1.16, + "grad_norm": 0.5865001399814008, + "learning_rate": 3.979251026893977e-06, + "loss": 0.4726, + "step": 9081 + }, + { + "epoch": 1.16, + "grad_norm": 0.7546366528818873, + "learning_rate": 3.978241187967561e-06, + "loss": 0.5606, + "step": 9082 + }, + { + "epoch": 1.16, + "grad_norm": 0.7466575105866234, + "learning_rate": 3.977231392534152e-06, + "loss": 0.5011, + "step": 9083 + }, + { + "epoch": 1.16, + "grad_norm": 0.5675433021653249, + "learning_rate": 3.976221640636734e-06, + "loss": 0.4636, + "step": 9084 + }, + { + "epoch": 1.16, + "grad_norm": 0.6245798588680239, + "learning_rate": 3.9752119323182906e-06, + "loss": 0.4712, + "step": 9085 + }, + { + "epoch": 1.16, + "grad_norm": 0.8089754289607965, + "learning_rate": 3.974202267621799e-06, + "loss": 0.5458, + "step": 9086 + }, + { + "epoch": 1.16, + "grad_norm": 0.7455968135757399, + "learning_rate": 3.973192646590239e-06, + "loss": 0.5792, + "step": 9087 + }, + { + "epoch": 1.16, + "grad_norm": 0.80761384602154, + "learning_rate": 3.972183069266588e-06, + "loss": 0.5336, + "step": 9088 + }, + { + "epoch": 1.16, + "grad_norm": 0.7661715973771724, + "learning_rate": 3.971173535693819e-06, + "loss": 0.4915, + "step": 9089 + }, + { + "epoch": 1.16, + "grad_norm": 0.6546664827639979, + "learning_rate": 3.9701640459149045e-06, + "loss": 0.471, + "step": 9090 + }, + { + "epoch": 1.16, + "grad_norm": 0.736428828910854, + "learning_rate": 3.969154599972815e-06, + "loss": 0.449, + "step": 9091 + }, + { + "epoch": 1.16, + "grad_norm": 0.7050627946695277, + "learning_rate": 3.968145197910521e-06, + "loss": 0.5227, + "step": 9092 + }, + { + "epoch": 1.16, + "grad_norm": 0.8317858416370292, + "learning_rate": 3.967135839770989e-06, + "loss": 0.5211, + "step": 9093 + }, + { + "epoch": 1.16, + "grad_norm": 0.6304214899789228, + "learning_rate": 3.966126525597182e-06, + "loss": 0.4257, + "step": 9094 + }, + { + "epoch": 1.16, + "grad_norm": 0.5291623505855394, + "learning_rate": 3.965117255432067e-06, + "loss": 0.4246, + "step": 9095 + }, + { + "epoch": 1.16, + "grad_norm": 0.5870379339359486, + "learning_rate": 3.9641080293186005e-06, + "loss": 0.4106, + "step": 9096 + }, + { + "epoch": 1.16, + "grad_norm": 0.5793816523184112, + "learning_rate": 3.963098847299746e-06, + "loss": 0.4289, + "step": 9097 + }, + { + "epoch": 1.16, + "grad_norm": 0.613706039667801, + "learning_rate": 3.962089709418463e-06, + "loss": 0.4932, + "step": 9098 + }, + { + "epoch": 1.16, + "grad_norm": 0.963706234550816, + "learning_rate": 3.961080615717702e-06, + "loss": 0.4763, + "step": 9099 + }, + { + "epoch": 1.16, + "grad_norm": 0.7744957939857139, + "learning_rate": 3.96007156624042e-06, + "loss": 0.5497, + "step": 9100 + }, + { + "epoch": 1.16, + "grad_norm": 0.7003710811321566, + "learning_rate": 3.959062561029567e-06, + "loss": 0.5037, + "step": 9101 + }, + { + "epoch": 1.16, + "grad_norm": 0.6349705294957484, + "learning_rate": 3.9580536001280965e-06, + "loss": 0.4975, + "step": 9102 + }, + { + "epoch": 1.16, + "grad_norm": 0.6906702390940155, + "learning_rate": 3.957044683578953e-06, + "loss": 0.4872, + "step": 9103 + }, + { + "epoch": 1.16, + "grad_norm": 0.6341404581796939, + "learning_rate": 3.9560358114250855e-06, + "loss": 0.4845, + "step": 9104 + }, + { + "epoch": 1.16, + "grad_norm": 0.7150037390349223, + "learning_rate": 3.955026983709437e-06, + "loss": 0.4792, + "step": 9105 + }, + { + "epoch": 1.16, + "grad_norm": 0.7174431303440013, + "learning_rate": 3.954018200474951e-06, + "loss": 0.48, + "step": 9106 + }, + { + "epoch": 1.16, + "grad_norm": 0.7808515385455057, + "learning_rate": 3.953009461764568e-06, + "loss": 0.5819, + "step": 9107 + }, + { + "epoch": 1.16, + "grad_norm": 0.6942780320023652, + "learning_rate": 3.952000767621224e-06, + "loss": 0.4963, + "step": 9108 + }, + { + "epoch": 1.16, + "grad_norm": 0.657704003477227, + "learning_rate": 3.95099211808786e-06, + "loss": 0.478, + "step": 9109 + }, + { + "epoch": 1.16, + "grad_norm": 0.7712681903048891, + "learning_rate": 3.94998351320741e-06, + "loss": 0.4958, + "step": 9110 + }, + { + "epoch": 1.16, + "grad_norm": 0.7358224316257641, + "learning_rate": 3.948974953022805e-06, + "loss": 0.5168, + "step": 9111 + }, + { + "epoch": 1.16, + "grad_norm": 0.6010775462509673, + "learning_rate": 3.94796643757698e-06, + "loss": 0.4673, + "step": 9112 + }, + { + "epoch": 1.16, + "grad_norm": 0.819647934147269, + "learning_rate": 3.94695796691286e-06, + "loss": 0.5547, + "step": 9113 + }, + { + "epoch": 1.16, + "grad_norm": 0.7964932874161982, + "learning_rate": 3.945949541073376e-06, + "loss": 0.4886, + "step": 9114 + }, + { + "epoch": 1.16, + "grad_norm": 0.602785597339483, + "learning_rate": 3.94494116010145e-06, + "loss": 0.4701, + "step": 9115 + }, + { + "epoch": 1.16, + "grad_norm": 0.665245239858181, + "learning_rate": 3.943932824040009e-06, + "loss": 0.489, + "step": 9116 + }, + { + "epoch": 1.16, + "grad_norm": 0.668834178179799, + "learning_rate": 3.942924532931971e-06, + "loss": 0.4795, + "step": 9117 + }, + { + "epoch": 1.16, + "grad_norm": 0.5957240798870477, + "learning_rate": 3.94191628682026e-06, + "loss": 0.4317, + "step": 9118 + }, + { + "epoch": 1.16, + "grad_norm": 0.5440271368903683, + "learning_rate": 3.94090808574779e-06, + "loss": 0.4569, + "step": 9119 + }, + { + "epoch": 1.16, + "grad_norm": 0.6572196855398421, + "learning_rate": 3.939899929757477e-06, + "loss": 0.4942, + "step": 9120 + }, + { + "epoch": 1.16, + "grad_norm": 0.9205188706946809, + "learning_rate": 3.938891818892238e-06, + "loss": 0.5233, + "step": 9121 + }, + { + "epoch": 1.16, + "grad_norm": 0.6757955028486089, + "learning_rate": 3.9378837531949834e-06, + "loss": 0.4662, + "step": 9122 + }, + { + "epoch": 1.16, + "grad_norm": 0.6469780203792238, + "learning_rate": 3.9368757327086235e-06, + "loss": 0.4821, + "step": 9123 + }, + { + "epoch": 1.16, + "grad_norm": 0.7221028375024128, + "learning_rate": 3.935867757476067e-06, + "loss": 0.5425, + "step": 9124 + }, + { + "epoch": 1.16, + "grad_norm": 0.8028123857059496, + "learning_rate": 3.934859827540219e-06, + "loss": 0.5813, + "step": 9125 + }, + { + "epoch": 1.16, + "grad_norm": 2.0441537245520633, + "learning_rate": 3.9338519429439844e-06, + "loss": 0.5078, + "step": 9126 + }, + { + "epoch": 1.16, + "grad_norm": 0.6024806031923067, + "learning_rate": 3.932844103730266e-06, + "loss": 0.4135, + "step": 9127 + }, + { + "epoch": 1.16, + "grad_norm": 0.5841228906941894, + "learning_rate": 3.931836309941964e-06, + "loss": 0.4907, + "step": 9128 + }, + { + "epoch": 1.16, + "grad_norm": 0.7006906670491828, + "learning_rate": 3.930828561621977e-06, + "loss": 0.509, + "step": 9129 + }, + { + "epoch": 1.16, + "grad_norm": 0.6420572105852815, + "learning_rate": 3.929820858813201e-06, + "loss": 0.4455, + "step": 9130 + }, + { + "epoch": 1.16, + "grad_norm": 0.6647095300568276, + "learning_rate": 3.928813201558531e-06, + "loss": 0.4785, + "step": 9131 + }, + { + "epoch": 1.16, + "grad_norm": 0.6692811303006562, + "learning_rate": 3.927805589900861e-06, + "loss": 0.5052, + "step": 9132 + }, + { + "epoch": 1.16, + "grad_norm": 0.7395503856659327, + "learning_rate": 3.926798023883079e-06, + "loss": 0.4945, + "step": 9133 + }, + { + "epoch": 1.16, + "grad_norm": 0.7046937754212744, + "learning_rate": 3.925790503548077e-06, + "loss": 0.5408, + "step": 9134 + }, + { + "epoch": 1.16, + "grad_norm": 0.7632893059144262, + "learning_rate": 3.92478302893874e-06, + "loss": 0.5102, + "step": 9135 + }, + { + "epoch": 1.16, + "grad_norm": 0.6120191728587682, + "learning_rate": 3.923775600097954e-06, + "loss": 0.4171, + "step": 9136 + }, + { + "epoch": 1.16, + "grad_norm": 0.6654767368305994, + "learning_rate": 3.922768217068601e-06, + "loss": 0.4044, + "step": 9137 + }, + { + "epoch": 1.16, + "grad_norm": 0.5990582930444863, + "learning_rate": 3.921760879893563e-06, + "loss": 0.4307, + "step": 9138 + }, + { + "epoch": 1.16, + "grad_norm": 0.6657303033587134, + "learning_rate": 3.92075358861572e-06, + "loss": 0.4622, + "step": 9139 + }, + { + "epoch": 1.16, + "grad_norm": 2.0007860509307496, + "learning_rate": 3.919746343277947e-06, + "loss": 0.5107, + "step": 9140 + }, + { + "epoch": 1.16, + "grad_norm": 0.5423788817410247, + "learning_rate": 3.91873914392312e-06, + "loss": 0.4407, + "step": 9141 + }, + { + "epoch": 1.16, + "grad_norm": 0.5877952194569003, + "learning_rate": 3.917731990594112e-06, + "loss": 0.4764, + "step": 9142 + }, + { + "epoch": 1.16, + "grad_norm": 0.8564332508298254, + "learning_rate": 3.916724883333796e-06, + "loss": 0.5384, + "step": 9143 + }, + { + "epoch": 1.16, + "grad_norm": 0.6409153378132896, + "learning_rate": 3.9157178221850395e-06, + "loss": 0.4464, + "step": 9144 + }, + { + "epoch": 1.17, + "grad_norm": 0.6126027797882431, + "learning_rate": 3.914710807190709e-06, + "loss": 0.4721, + "step": 9145 + }, + { + "epoch": 1.17, + "grad_norm": 0.6639521049575524, + "learning_rate": 3.913703838393673e-06, + "loss": 0.4303, + "step": 9146 + }, + { + "epoch": 1.17, + "grad_norm": 0.7090278906173495, + "learning_rate": 3.912696915836794e-06, + "loss": 0.4937, + "step": 9147 + }, + { + "epoch": 1.17, + "grad_norm": 0.621386579863087, + "learning_rate": 3.911690039562931e-06, + "loss": 0.4277, + "step": 9148 + }, + { + "epoch": 1.17, + "grad_norm": 0.5783861492710738, + "learning_rate": 3.910683209614946e-06, + "loss": 0.4439, + "step": 9149 + }, + { + "epoch": 1.17, + "grad_norm": 0.6201388495073825, + "learning_rate": 3.909676426035696e-06, + "loss": 0.4661, + "step": 9150 + }, + { + "epoch": 1.17, + "grad_norm": 0.6921068944146899, + "learning_rate": 3.9086696888680365e-06, + "loss": 0.5297, + "step": 9151 + }, + { + "epoch": 1.17, + "grad_norm": 0.697164699611184, + "learning_rate": 3.90766299815482e-06, + "loss": 0.4584, + "step": 9152 + }, + { + "epoch": 1.17, + "grad_norm": 0.8065126838874935, + "learning_rate": 3.9066563539389e-06, + "loss": 0.5355, + "step": 9153 + }, + { + "epoch": 1.17, + "grad_norm": 0.6212464792366286, + "learning_rate": 3.905649756263125e-06, + "loss": 0.4656, + "step": 9154 + }, + { + "epoch": 1.17, + "grad_norm": 0.5768977727785792, + "learning_rate": 3.904643205170343e-06, + "loss": 0.435, + "step": 9155 + }, + { + "epoch": 1.17, + "grad_norm": 0.6472479220189596, + "learning_rate": 3.903636700703399e-06, + "loss": 0.4406, + "step": 9156 + }, + { + "epoch": 1.17, + "grad_norm": 0.8617600728263312, + "learning_rate": 3.902630242905138e-06, + "loss": 0.5178, + "step": 9157 + }, + { + "epoch": 1.17, + "grad_norm": 0.7154616592155875, + "learning_rate": 3.901623831818398e-06, + "loss": 0.5068, + "step": 9158 + }, + { + "epoch": 1.17, + "grad_norm": 0.766981964880095, + "learning_rate": 3.9006174674860245e-06, + "loss": 0.4771, + "step": 9159 + }, + { + "epoch": 1.17, + "grad_norm": 0.7299910472352982, + "learning_rate": 3.899611149950851e-06, + "loss": 0.4874, + "step": 9160 + }, + { + "epoch": 1.17, + "grad_norm": 0.6244080099514171, + "learning_rate": 3.898604879255715e-06, + "loss": 0.5004, + "step": 9161 + }, + { + "epoch": 1.17, + "grad_norm": 0.7549839214226711, + "learning_rate": 3.89759865544345e-06, + "loss": 0.5253, + "step": 9162 + }, + { + "epoch": 1.17, + "grad_norm": 0.6385097459167149, + "learning_rate": 3.896592478556888e-06, + "loss": 0.4846, + "step": 9163 + }, + { + "epoch": 1.17, + "grad_norm": 0.8405263514369358, + "learning_rate": 3.895586348638857e-06, + "loss": 0.4935, + "step": 9164 + }, + { + "epoch": 1.17, + "grad_norm": 0.7399865673635089, + "learning_rate": 3.894580265732187e-06, + "loss": 0.538, + "step": 9165 + }, + { + "epoch": 1.17, + "grad_norm": 0.6681394900829308, + "learning_rate": 3.893574229879702e-06, + "loss": 0.5062, + "step": 9166 + }, + { + "epoch": 1.17, + "grad_norm": 0.6105976858829424, + "learning_rate": 3.892568241124227e-06, + "loss": 0.4575, + "step": 9167 + }, + { + "epoch": 1.17, + "grad_norm": 0.5858101445608391, + "learning_rate": 3.891562299508582e-06, + "loss": 0.5049, + "step": 9168 + }, + { + "epoch": 1.17, + "grad_norm": 0.6234820883859475, + "learning_rate": 3.890556405075589e-06, + "loss": 0.4336, + "step": 9169 + }, + { + "epoch": 1.17, + "grad_norm": 0.7714491599082312, + "learning_rate": 3.889550557868062e-06, + "loss": 0.4954, + "step": 9170 + }, + { + "epoch": 1.17, + "grad_norm": 0.7779044214780643, + "learning_rate": 3.888544757928821e-06, + "loss": 0.5294, + "step": 9171 + }, + { + "epoch": 1.17, + "grad_norm": 0.7271296509389674, + "learning_rate": 3.887539005300679e-06, + "loss": 0.5461, + "step": 9172 + }, + { + "epoch": 1.17, + "grad_norm": 0.7822856196753434, + "learning_rate": 3.886533300026446e-06, + "loss": 0.573, + "step": 9173 + }, + { + "epoch": 1.17, + "grad_norm": 0.733223695507951, + "learning_rate": 3.885527642148932e-06, + "loss": 0.4985, + "step": 9174 + }, + { + "epoch": 1.17, + "grad_norm": 0.7246966885432375, + "learning_rate": 3.884522031710946e-06, + "loss": 0.4717, + "step": 9175 + }, + { + "epoch": 1.17, + "grad_norm": 0.6986339145254724, + "learning_rate": 3.883516468755291e-06, + "loss": 0.5033, + "step": 9176 + }, + { + "epoch": 1.17, + "grad_norm": 0.6089393751958287, + "learning_rate": 3.882510953324773e-06, + "loss": 0.5221, + "step": 9177 + }, + { + "epoch": 1.17, + "grad_norm": 2.3038668661326622, + "learning_rate": 3.881505485462192e-06, + "loss": 0.5175, + "step": 9178 + }, + { + "epoch": 1.17, + "grad_norm": 0.6827360566959162, + "learning_rate": 3.88050006521035e-06, + "loss": 0.5045, + "step": 9179 + }, + { + "epoch": 1.17, + "grad_norm": 0.5906429046719849, + "learning_rate": 3.879494692612041e-06, + "loss": 0.4377, + "step": 9180 + }, + { + "epoch": 1.17, + "grad_norm": 0.7615171443245319, + "learning_rate": 3.878489367710063e-06, + "loss": 0.5454, + "step": 9181 + }, + { + "epoch": 1.17, + "grad_norm": 0.7664159803596445, + "learning_rate": 3.877484090547207e-06, + "loss": 0.5827, + "step": 9182 + }, + { + "epoch": 1.17, + "grad_norm": 0.6761907687636795, + "learning_rate": 3.876478861166269e-06, + "loss": 0.5063, + "step": 9183 + }, + { + "epoch": 1.17, + "grad_norm": 0.7629195482273305, + "learning_rate": 3.875473679610034e-06, + "loss": 0.503, + "step": 9184 + }, + { + "epoch": 1.17, + "grad_norm": 0.6107465754141232, + "learning_rate": 3.874468545921292e-06, + "loss": 0.4959, + "step": 9185 + }, + { + "epoch": 1.17, + "grad_norm": 0.5943484056256512, + "learning_rate": 3.873463460142827e-06, + "loss": 0.476, + "step": 9186 + }, + { + "epoch": 1.17, + "grad_norm": 15.456930780097185, + "learning_rate": 3.872458422317422e-06, + "loss": 0.5605, + "step": 9187 + }, + { + "epoch": 1.17, + "grad_norm": 0.6893922390539317, + "learning_rate": 3.871453432487859e-06, + "loss": 0.5372, + "step": 9188 + }, + { + "epoch": 1.17, + "grad_norm": 0.7219061276696814, + "learning_rate": 3.870448490696918e-06, + "loss": 0.5353, + "step": 9189 + }, + { + "epoch": 1.17, + "grad_norm": 0.6049954437753662, + "learning_rate": 3.869443596987374e-06, + "loss": 0.4549, + "step": 9190 + }, + { + "epoch": 1.17, + "grad_norm": 0.6344372661780289, + "learning_rate": 3.868438751402005e-06, + "loss": 0.5042, + "step": 9191 + }, + { + "epoch": 1.17, + "grad_norm": 0.6941526922751737, + "learning_rate": 3.867433953983582e-06, + "loss": 0.5014, + "step": 9192 + }, + { + "epoch": 1.17, + "grad_norm": 0.6881241446546059, + "learning_rate": 3.8664292047748755e-06, + "loss": 0.5136, + "step": 9193 + }, + { + "epoch": 1.17, + "grad_norm": 0.6162241616900622, + "learning_rate": 3.8654245038186556e-06, + "loss": 0.4564, + "step": 9194 + }, + { + "epoch": 1.17, + "grad_norm": 0.6233690103539943, + "learning_rate": 3.864419851157688e-06, + "loss": 0.4384, + "step": 9195 + }, + { + "epoch": 1.17, + "grad_norm": 0.9418688162080785, + "learning_rate": 3.86341524683474e-06, + "loss": 0.5472, + "step": 9196 + }, + { + "epoch": 1.17, + "grad_norm": 0.8691137996594228, + "learning_rate": 3.862410690892572e-06, + "loss": 0.5088, + "step": 9197 + }, + { + "epoch": 1.17, + "grad_norm": 0.8002155652223986, + "learning_rate": 3.8614061833739465e-06, + "loss": 0.4869, + "step": 9198 + }, + { + "epoch": 1.17, + "grad_norm": 0.7042408303696801, + "learning_rate": 3.86040172432162e-06, + "loss": 0.5247, + "step": 9199 + }, + { + "epoch": 1.17, + "grad_norm": 0.5657596663045861, + "learning_rate": 3.859397313778352e-06, + "loss": 0.4336, + "step": 9200 + }, + { + "epoch": 1.17, + "grad_norm": 0.6596114493968095, + "learning_rate": 3.858392951786895e-06, + "loss": 0.4478, + "step": 9201 + }, + { + "epoch": 1.17, + "grad_norm": 0.6281678243658037, + "learning_rate": 3.857388638390001e-06, + "loss": 0.4427, + "step": 9202 + }, + { + "epoch": 1.17, + "grad_norm": 0.6567881099577151, + "learning_rate": 3.856384373630424e-06, + "loss": 0.471, + "step": 9203 + }, + { + "epoch": 1.17, + "grad_norm": 0.6881564327469285, + "learning_rate": 3.855380157550907e-06, + "loss": 0.495, + "step": 9204 + }, + { + "epoch": 1.17, + "grad_norm": 0.7740229586344956, + "learning_rate": 3.854375990194201e-06, + "loss": 0.5614, + "step": 9205 + }, + { + "epoch": 1.17, + "grad_norm": 0.7221591605665109, + "learning_rate": 3.853371871603048e-06, + "loss": 0.5433, + "step": 9206 + }, + { + "epoch": 1.17, + "grad_norm": 0.6764043248639546, + "learning_rate": 3.852367801820188e-06, + "loss": 0.519, + "step": 9207 + }, + { + "epoch": 1.17, + "grad_norm": 0.8623854662145037, + "learning_rate": 3.851363780888365e-06, + "loss": 0.5967, + "step": 9208 + }, + { + "epoch": 1.17, + "grad_norm": 0.7071161215088375, + "learning_rate": 3.8503598088503155e-06, + "loss": 0.5319, + "step": 9209 + }, + { + "epoch": 1.17, + "grad_norm": 0.7849210922790041, + "learning_rate": 3.849355885748775e-06, + "loss": 0.4994, + "step": 9210 + }, + { + "epoch": 1.17, + "grad_norm": 0.5831460860868768, + "learning_rate": 3.848352011626478e-06, + "loss": 0.4645, + "step": 9211 + }, + { + "epoch": 1.17, + "grad_norm": 0.5863534498201394, + "learning_rate": 3.847348186526156e-06, + "loss": 0.4421, + "step": 9212 + }, + { + "epoch": 1.17, + "grad_norm": 0.5815739507435796, + "learning_rate": 3.846344410490538e-06, + "loss": 0.4289, + "step": 9213 + }, + { + "epoch": 1.17, + "grad_norm": 0.5997127376286291, + "learning_rate": 3.845340683562352e-06, + "loss": 0.4683, + "step": 9214 + }, + { + "epoch": 1.17, + "grad_norm": 0.5790066090762571, + "learning_rate": 3.844337005784322e-06, + "loss": 0.432, + "step": 9215 + }, + { + "epoch": 1.17, + "grad_norm": 0.6239345958018846, + "learning_rate": 3.843333377199173e-06, + "loss": 0.4789, + "step": 9216 + }, + { + "epoch": 1.17, + "grad_norm": 0.8485273250638531, + "learning_rate": 3.8423297978496274e-06, + "loss": 0.5108, + "step": 9217 + }, + { + "epoch": 1.17, + "grad_norm": 0.5536674072142241, + "learning_rate": 3.841326267778403e-06, + "loss": 0.4159, + "step": 9218 + }, + { + "epoch": 1.17, + "grad_norm": 0.6455557634131391, + "learning_rate": 3.840322787028216e-06, + "loss": 0.4017, + "step": 9219 + }, + { + "epoch": 1.17, + "grad_norm": 0.617889907250975, + "learning_rate": 3.83931935564178e-06, + "loss": 0.4648, + "step": 9220 + }, + { + "epoch": 1.17, + "grad_norm": 0.7682929414251215, + "learning_rate": 3.838315973661812e-06, + "loss": 0.5482, + "step": 9221 + }, + { + "epoch": 1.17, + "grad_norm": 0.7356949883681463, + "learning_rate": 3.83731264113102e-06, + "loss": 0.5363, + "step": 9222 + }, + { + "epoch": 1.17, + "grad_norm": 0.7679844367820574, + "learning_rate": 3.836309358092115e-06, + "loss": 0.5493, + "step": 9223 + }, + { + "epoch": 1.18, + "grad_norm": 0.8391703098611941, + "learning_rate": 3.835306124587801e-06, + "loss": 0.5955, + "step": 9224 + }, + { + "epoch": 1.18, + "grad_norm": 0.6816100992227841, + "learning_rate": 3.834302940660784e-06, + "loss": 0.5169, + "step": 9225 + }, + { + "epoch": 1.18, + "grad_norm": 0.6352445804077987, + "learning_rate": 3.8332998063537656e-06, + "loss": 0.4862, + "step": 9226 + }, + { + "epoch": 1.18, + "grad_norm": 0.7416147191657123, + "learning_rate": 3.832296721709447e-06, + "loss": 0.4999, + "step": 9227 + }, + { + "epoch": 1.18, + "grad_norm": 0.6579608237460424, + "learning_rate": 3.831293686770524e-06, + "loss": 0.4468, + "step": 9228 + }, + { + "epoch": 1.18, + "grad_norm": 0.6927788150118354, + "learning_rate": 3.830290701579695e-06, + "loss": 0.4558, + "step": 9229 + }, + { + "epoch": 1.18, + "grad_norm": 0.7305368181712655, + "learning_rate": 3.829287766179653e-06, + "loss": 0.4848, + "step": 9230 + }, + { + "epoch": 1.18, + "grad_norm": 0.809619839555621, + "learning_rate": 3.82828488061309e-06, + "loss": 0.4916, + "step": 9231 + }, + { + "epoch": 1.18, + "grad_norm": 1.2811635489446969, + "learning_rate": 3.827282044922692e-06, + "loss": 0.4731, + "step": 9232 + }, + { + "epoch": 1.18, + "grad_norm": 0.6003447374299552, + "learning_rate": 3.826279259151153e-06, + "loss": 0.4821, + "step": 9233 + }, + { + "epoch": 1.18, + "grad_norm": 0.7247723980490272, + "learning_rate": 3.8252765233411545e-06, + "loss": 0.445, + "step": 9234 + }, + { + "epoch": 1.18, + "grad_norm": 0.6432466770235389, + "learning_rate": 3.824273837535381e-06, + "loss": 0.4595, + "step": 9235 + }, + { + "epoch": 1.18, + "grad_norm": 0.8148596041799379, + "learning_rate": 3.8232712017765136e-06, + "loss": 0.569, + "step": 9236 + }, + { + "epoch": 1.18, + "grad_norm": 0.8379815591291666, + "learning_rate": 3.822268616107231e-06, + "loss": 0.4709, + "step": 9237 + }, + { + "epoch": 1.18, + "grad_norm": 0.6596493905406737, + "learning_rate": 3.821266080570209e-06, + "loss": 0.453, + "step": 9238 + }, + { + "epoch": 1.18, + "grad_norm": 0.7008662707900963, + "learning_rate": 3.8202635952081235e-06, + "loss": 0.4747, + "step": 9239 + }, + { + "epoch": 1.18, + "grad_norm": 0.6988174692851142, + "learning_rate": 3.8192611600636475e-06, + "loss": 0.4747, + "step": 9240 + }, + { + "epoch": 1.18, + "grad_norm": 0.7049989135272603, + "learning_rate": 3.81825877517945e-06, + "loss": 0.5187, + "step": 9241 + }, + { + "epoch": 1.18, + "grad_norm": 0.6617713091278571, + "learning_rate": 3.8172564405982e-06, + "loss": 0.5134, + "step": 9242 + }, + { + "epoch": 1.18, + "grad_norm": 0.6510452903445328, + "learning_rate": 3.816254156362565e-06, + "loss": 0.5014, + "step": 9243 + }, + { + "epoch": 1.18, + "grad_norm": 0.7906560317435627, + "learning_rate": 3.815251922515205e-06, + "loss": 0.5015, + "step": 9244 + }, + { + "epoch": 1.18, + "grad_norm": 0.8100131437901296, + "learning_rate": 3.814249739098787e-06, + "loss": 0.5174, + "step": 9245 + }, + { + "epoch": 1.18, + "grad_norm": 0.5886019392354486, + "learning_rate": 3.8132476061559683e-06, + "loss": 0.4444, + "step": 9246 + }, + { + "epoch": 1.18, + "grad_norm": 0.6982257396631043, + "learning_rate": 3.8122455237294065e-06, + "loss": 0.4943, + "step": 9247 + }, + { + "epoch": 1.18, + "grad_norm": 0.7774025952607653, + "learning_rate": 3.811243491861758e-06, + "loss": 0.5106, + "step": 9248 + }, + { + "epoch": 1.18, + "grad_norm": 0.8108360143504353, + "learning_rate": 3.8102415105956746e-06, + "loss": 0.5042, + "step": 9249 + }, + { + "epoch": 1.18, + "grad_norm": 0.7335364714964374, + "learning_rate": 3.8092395799738084e-06, + "loss": 0.5216, + "step": 9250 + }, + { + "epoch": 1.18, + "grad_norm": 0.7635895452982631, + "learning_rate": 3.8082377000388083e-06, + "loss": 0.447, + "step": 9251 + }, + { + "epoch": 1.18, + "grad_norm": 0.7662973614194442, + "learning_rate": 3.807235870833321e-06, + "loss": 0.5446, + "step": 9252 + }, + { + "epoch": 1.18, + "grad_norm": 0.7325856723178641, + "learning_rate": 3.8062340923999906e-06, + "loss": 0.5315, + "step": 9253 + }, + { + "epoch": 1.18, + "grad_norm": 0.7968720456675714, + "learning_rate": 3.8052323647814604e-06, + "loss": 0.5503, + "step": 9254 + }, + { + "epoch": 1.18, + "grad_norm": 0.7279809041346655, + "learning_rate": 3.8042306880203706e-06, + "loss": 0.4815, + "step": 9255 + }, + { + "epoch": 1.18, + "grad_norm": 0.6143506890845476, + "learning_rate": 3.803229062159359e-06, + "loss": 0.4539, + "step": 9256 + }, + { + "epoch": 1.18, + "grad_norm": 0.6321351488723453, + "learning_rate": 3.8022274872410614e-06, + "loss": 0.5123, + "step": 9257 + }, + { + "epoch": 1.18, + "grad_norm": 0.7688371835938946, + "learning_rate": 3.8012259633081133e-06, + "loss": 0.5241, + "step": 9258 + }, + { + "epoch": 1.18, + "grad_norm": 0.655250172919114, + "learning_rate": 3.800224490403145e-06, + "loss": 0.4987, + "step": 9259 + }, + { + "epoch": 1.18, + "grad_norm": 0.7011900613464648, + "learning_rate": 3.7992230685687845e-06, + "loss": 0.5243, + "step": 9260 + }, + { + "epoch": 1.18, + "grad_norm": 0.7145118081784876, + "learning_rate": 3.798221697847663e-06, + "loss": 0.4575, + "step": 9261 + }, + { + "epoch": 1.18, + "grad_norm": 0.5398549303443795, + "learning_rate": 3.7972203782824023e-06, + "loss": 0.4333, + "step": 9262 + }, + { + "epoch": 1.18, + "grad_norm": 0.8049330864013907, + "learning_rate": 3.7962191099156277e-06, + "loss": 0.4919, + "step": 9263 + }, + { + "epoch": 1.18, + "grad_norm": 0.7253450753326646, + "learning_rate": 3.795217892789958e-06, + "loss": 0.5375, + "step": 9264 + }, + { + "epoch": 1.18, + "grad_norm": 0.7160461422226329, + "learning_rate": 3.7942167269480128e-06, + "loss": 0.5535, + "step": 9265 + }, + { + "epoch": 1.18, + "grad_norm": 0.7149180236707086, + "learning_rate": 3.793215612432409e-06, + "loss": 0.5485, + "step": 9266 + }, + { + "epoch": 1.18, + "grad_norm": 0.7221360989074198, + "learning_rate": 3.7922145492857597e-06, + "loss": 0.5037, + "step": 9267 + }, + { + "epoch": 1.18, + "grad_norm": 0.6515305820180698, + "learning_rate": 3.7912135375506774e-06, + "loss": 0.5286, + "step": 9268 + }, + { + "epoch": 1.18, + "grad_norm": 0.7626597267477976, + "learning_rate": 3.790212577269772e-06, + "loss": 0.5919, + "step": 9269 + }, + { + "epoch": 1.18, + "grad_norm": 0.6999679969840675, + "learning_rate": 3.789211668485651e-06, + "loss": 0.5081, + "step": 9270 + }, + { + "epoch": 1.18, + "grad_norm": 0.6184696543695805, + "learning_rate": 3.788210811240921e-06, + "loss": 0.4932, + "step": 9271 + }, + { + "epoch": 1.18, + "grad_norm": 0.8672299006600014, + "learning_rate": 3.787210005578184e-06, + "loss": 0.5358, + "step": 9272 + }, + { + "epoch": 1.18, + "grad_norm": 1.1937749189019775, + "learning_rate": 3.7862092515400416e-06, + "loss": 0.5308, + "step": 9273 + }, + { + "epoch": 1.18, + "grad_norm": 0.7309519734392892, + "learning_rate": 3.7852085491690925e-06, + "loss": 0.5436, + "step": 9274 + }, + { + "epoch": 1.18, + "grad_norm": 1.2516075414710495, + "learning_rate": 3.784207898507934e-06, + "loss": 0.4916, + "step": 9275 + }, + { + "epoch": 1.18, + "grad_norm": 0.6548281765815485, + "learning_rate": 3.7832072995991597e-06, + "loss": 0.4572, + "step": 9276 + }, + { + "epoch": 1.18, + "grad_norm": 0.6568133177718134, + "learning_rate": 3.782206752485363e-06, + "loss": 0.44, + "step": 9277 + }, + { + "epoch": 1.18, + "grad_norm": 0.580457937503196, + "learning_rate": 3.781206257209133e-06, + "loss": 0.4221, + "step": 9278 + }, + { + "epoch": 1.18, + "grad_norm": 0.5952395096909264, + "learning_rate": 3.7802058138130577e-06, + "loss": 0.4786, + "step": 9279 + }, + { + "epoch": 1.18, + "grad_norm": 0.7074837466214744, + "learning_rate": 3.779205422339723e-06, + "loss": 0.4749, + "step": 9280 + }, + { + "epoch": 1.18, + "grad_norm": 0.7039139181821368, + "learning_rate": 3.7782050828317123e-06, + "loss": 0.471, + "step": 9281 + }, + { + "epoch": 1.18, + "grad_norm": 0.8467539821181148, + "learning_rate": 3.777204795331605e-06, + "loss": 0.5201, + "step": 9282 + }, + { + "epoch": 1.18, + "grad_norm": 0.6330137298086674, + "learning_rate": 3.776204559881984e-06, + "loss": 0.4785, + "step": 9283 + }, + { + "epoch": 1.18, + "grad_norm": 0.7968895662099147, + "learning_rate": 3.7752043765254248e-06, + "loss": 0.5121, + "step": 9284 + }, + { + "epoch": 1.18, + "grad_norm": 0.7022791635208689, + "learning_rate": 3.7742042453045014e-06, + "loss": 0.4924, + "step": 9285 + }, + { + "epoch": 1.18, + "grad_norm": 0.779389833041283, + "learning_rate": 3.773204166261785e-06, + "loss": 0.4963, + "step": 9286 + }, + { + "epoch": 1.18, + "grad_norm": 0.6689910998117216, + "learning_rate": 3.772204139439848e-06, + "loss": 0.5007, + "step": 9287 + }, + { + "epoch": 1.18, + "grad_norm": 0.6497281707322332, + "learning_rate": 3.771204164881257e-06, + "loss": 0.5289, + "step": 9288 + }, + { + "epoch": 1.18, + "grad_norm": 0.7494177434850453, + "learning_rate": 3.7702042426285783e-06, + "loss": 0.5086, + "step": 9289 + }, + { + "epoch": 1.18, + "grad_norm": 0.5489349441227107, + "learning_rate": 3.769204372724375e-06, + "loss": 0.4177, + "step": 9290 + }, + { + "epoch": 1.18, + "grad_norm": 0.7714326818527153, + "learning_rate": 3.7682045552112084e-06, + "loss": 0.4482, + "step": 9291 + }, + { + "epoch": 1.18, + "grad_norm": 1.1313778681595092, + "learning_rate": 3.7672047901316377e-06, + "loss": 0.5338, + "step": 9292 + }, + { + "epoch": 1.18, + "grad_norm": 0.7377633912869004, + "learning_rate": 3.766205077528219e-06, + "loss": 0.5001, + "step": 9293 + }, + { + "epoch": 1.18, + "grad_norm": 0.7411550229133614, + "learning_rate": 3.7652054174435065e-06, + "loss": 0.5159, + "step": 9294 + }, + { + "epoch": 1.18, + "grad_norm": 0.9335382124106479, + "learning_rate": 3.7642058099200553e-06, + "loss": 0.5174, + "step": 9295 + }, + { + "epoch": 1.18, + "grad_norm": 0.6597320538383452, + "learning_rate": 3.7632062550004135e-06, + "loss": 0.4275, + "step": 9296 + }, + { + "epoch": 1.18, + "grad_norm": 0.6055237739251296, + "learning_rate": 3.7622067527271288e-06, + "loss": 0.4492, + "step": 9297 + }, + { + "epoch": 1.18, + "grad_norm": 0.8273284377787128, + "learning_rate": 3.7612073031427484e-06, + "loss": 0.5092, + "step": 9298 + }, + { + "epoch": 1.18, + "grad_norm": 0.6844810617536631, + "learning_rate": 3.7602079062898133e-06, + "loss": 0.5355, + "step": 9299 + }, + { + "epoch": 1.18, + "grad_norm": 0.8157952071011444, + "learning_rate": 3.7592085622108666e-06, + "loss": 0.501, + "step": 9300 + }, + { + "epoch": 1.18, + "grad_norm": 0.5685104765345936, + "learning_rate": 3.758209270948446e-06, + "loss": 0.4738, + "step": 9301 + }, + { + "epoch": 1.19, + "grad_norm": 0.6627901912611, + "learning_rate": 3.7572100325450883e-06, + "loss": 0.5089, + "step": 9302 + }, + { + "epoch": 1.19, + "grad_norm": 0.7594495353184083, + "learning_rate": 3.756210847043329e-06, + "loss": 0.5367, + "step": 9303 + }, + { + "epoch": 1.19, + "grad_norm": 0.7573227987303697, + "learning_rate": 3.755211714485698e-06, + "loss": 0.5125, + "step": 9304 + }, + { + "epoch": 1.19, + "grad_norm": 0.8435916604362016, + "learning_rate": 3.7542126349147277e-06, + "loss": 0.5543, + "step": 9305 + }, + { + "epoch": 1.19, + "grad_norm": 0.5922881331706207, + "learning_rate": 3.753213608372942e-06, + "loss": 0.4347, + "step": 9306 + }, + { + "epoch": 1.19, + "grad_norm": 0.8723259026302791, + "learning_rate": 3.752214634902871e-06, + "loss": 0.4632, + "step": 9307 + }, + { + "epoch": 1.19, + "grad_norm": 1.1746982021610626, + "learning_rate": 3.751215714547036e-06, + "loss": 0.5416, + "step": 9308 + }, + { + "epoch": 1.19, + "grad_norm": 0.7410483229901984, + "learning_rate": 3.7502168473479572e-06, + "loss": 0.4885, + "step": 9309 + }, + { + "epoch": 1.19, + "grad_norm": 0.6948772900516106, + "learning_rate": 3.749218033348153e-06, + "loss": 0.4579, + "step": 9310 + }, + { + "epoch": 1.19, + "grad_norm": 0.9095252001287359, + "learning_rate": 3.748219272590141e-06, + "loss": 0.5086, + "step": 9311 + }, + { + "epoch": 1.19, + "grad_norm": 0.6682306627158009, + "learning_rate": 3.747220565116434e-06, + "loss": 0.4828, + "step": 9312 + }, + { + "epoch": 1.19, + "grad_norm": 0.6265989931863598, + "learning_rate": 3.7462219109695443e-06, + "loss": 0.4547, + "step": 9313 + }, + { + "epoch": 1.19, + "grad_norm": 0.6501863023514232, + "learning_rate": 3.7452233101919815e-06, + "loss": 0.5018, + "step": 9314 + }, + { + "epoch": 1.19, + "grad_norm": 0.6557278141796754, + "learning_rate": 3.744224762826253e-06, + "loss": 0.4884, + "step": 9315 + }, + { + "epoch": 1.19, + "grad_norm": 0.5893417255077114, + "learning_rate": 3.7432262689148625e-06, + "loss": 0.4312, + "step": 9316 + }, + { + "epoch": 1.19, + "grad_norm": 0.5455050742315564, + "learning_rate": 3.7422278285003145e-06, + "loss": 0.4005, + "step": 9317 + }, + { + "epoch": 1.19, + "grad_norm": 0.6535238794781585, + "learning_rate": 3.7412294416251084e-06, + "loss": 0.4468, + "step": 9318 + }, + { + "epoch": 1.19, + "grad_norm": 0.6805247278074009, + "learning_rate": 3.7402311083317413e-06, + "loss": 0.4741, + "step": 9319 + }, + { + "epoch": 1.19, + "grad_norm": 0.8638228911588726, + "learning_rate": 3.739232828662711e-06, + "loss": 0.563, + "step": 9320 + }, + { + "epoch": 1.19, + "grad_norm": 0.8193523934847567, + "learning_rate": 3.7382346026605108e-06, + "loss": 0.5108, + "step": 9321 + }, + { + "epoch": 1.19, + "grad_norm": 0.8829775620531921, + "learning_rate": 3.73723643036763e-06, + "loss": 0.4882, + "step": 9322 + }, + { + "epoch": 1.19, + "grad_norm": 0.5870229389234177, + "learning_rate": 3.736238311826561e-06, + "loss": 0.4887, + "step": 9323 + }, + { + "epoch": 1.19, + "grad_norm": 0.6344721944487332, + "learning_rate": 3.735240247079788e-06, + "loss": 0.505, + "step": 9324 + }, + { + "epoch": 1.19, + "grad_norm": 0.6926947312885636, + "learning_rate": 3.7342422361697967e-06, + "loss": 0.5369, + "step": 9325 + }, + { + "epoch": 1.19, + "grad_norm": 0.7269810014722693, + "learning_rate": 3.733244279139068e-06, + "loss": 0.4839, + "step": 9326 + }, + { + "epoch": 1.19, + "grad_norm": 0.610876673986871, + "learning_rate": 3.732246376030083e-06, + "loss": 0.5129, + "step": 9327 + }, + { + "epoch": 1.19, + "grad_norm": 0.7641989057294315, + "learning_rate": 3.7312485268853184e-06, + "loss": 0.5836, + "step": 9328 + }, + { + "epoch": 1.19, + "grad_norm": 0.6870809711148789, + "learning_rate": 3.7302507317472493e-06, + "loss": 0.4601, + "step": 9329 + }, + { + "epoch": 1.19, + "grad_norm": 0.6202871951475808, + "learning_rate": 3.7292529906583487e-06, + "loss": 0.5205, + "step": 9330 + }, + { + "epoch": 1.19, + "grad_norm": 0.7142741815292583, + "learning_rate": 3.728255303661087e-06, + "loss": 0.5074, + "step": 9331 + }, + { + "epoch": 1.19, + "grad_norm": 0.5518207667321977, + "learning_rate": 3.7272576707979345e-06, + "loss": 0.4166, + "step": 9332 + }, + { + "epoch": 1.19, + "grad_norm": 0.6242109176807179, + "learning_rate": 3.7262600921113555e-06, + "loss": 0.5183, + "step": 9333 + }, + { + "epoch": 1.19, + "grad_norm": 0.7938759588234392, + "learning_rate": 3.7252625676438136e-06, + "loss": 0.5432, + "step": 9334 + }, + { + "epoch": 1.19, + "grad_norm": 0.7667275698528835, + "learning_rate": 3.7242650974377716e-06, + "loss": 0.5116, + "step": 9335 + }, + { + "epoch": 1.19, + "grad_norm": 0.651678924316612, + "learning_rate": 3.7232676815356873e-06, + "loss": 0.4523, + "step": 9336 + }, + { + "epoch": 1.19, + "grad_norm": 0.8319730542316909, + "learning_rate": 3.722270319980018e-06, + "loss": 0.4918, + "step": 9337 + }, + { + "epoch": 1.19, + "grad_norm": 0.6840768630249385, + "learning_rate": 3.721273012813218e-06, + "loss": 0.525, + "step": 9338 + }, + { + "epoch": 1.19, + "grad_norm": 0.6739512175000334, + "learning_rate": 3.72027576007774e-06, + "loss": 0.4692, + "step": 9339 + }, + { + "epoch": 1.19, + "grad_norm": 0.6622551503471159, + "learning_rate": 3.7192785618160334e-06, + "loss": 0.4986, + "step": 9340 + }, + { + "epoch": 1.19, + "grad_norm": 0.7407529112361289, + "learning_rate": 3.7182814180705458e-06, + "loss": 0.5132, + "step": 9341 + }, + { + "epoch": 1.19, + "grad_norm": 0.6822790479885465, + "learning_rate": 3.7172843288837225e-06, + "loss": 0.4565, + "step": 9342 + }, + { + "epoch": 1.19, + "grad_norm": 0.6566713989972134, + "learning_rate": 3.7162872942980054e-06, + "loss": 0.4618, + "step": 9343 + }, + { + "epoch": 1.19, + "grad_norm": 0.6037750803989895, + "learning_rate": 3.7152903143558348e-06, + "loss": 0.4513, + "step": 9344 + }, + { + "epoch": 1.19, + "grad_norm": 0.7822450616837964, + "learning_rate": 3.7142933890996524e-06, + "loss": 0.4982, + "step": 9345 + }, + { + "epoch": 1.19, + "grad_norm": 1.129460569369605, + "learning_rate": 3.7132965185718916e-06, + "loss": 0.5558, + "step": 9346 + }, + { + "epoch": 1.19, + "grad_norm": 0.628171536754497, + "learning_rate": 3.7122997028149867e-06, + "loss": 0.4233, + "step": 9347 + }, + { + "epoch": 1.19, + "grad_norm": 0.6393485393375593, + "learning_rate": 3.7113029418713677e-06, + "loss": 0.4723, + "step": 9348 + }, + { + "epoch": 1.19, + "grad_norm": 0.5747234858157932, + "learning_rate": 3.710306235783465e-06, + "loss": 0.4561, + "step": 9349 + }, + { + "epoch": 1.19, + "grad_norm": 0.59850375244888, + "learning_rate": 3.7093095845937043e-06, + "loss": 0.4571, + "step": 9350 + }, + { + "epoch": 1.19, + "grad_norm": 0.7406742826051419, + "learning_rate": 3.70831298834451e-06, + "loss": 0.4595, + "step": 9351 + }, + { + "epoch": 1.19, + "grad_norm": 0.8151994076648557, + "learning_rate": 3.7073164470783053e-06, + "loss": 0.49, + "step": 9352 + }, + { + "epoch": 1.19, + "grad_norm": 0.6471894018644406, + "learning_rate": 3.706319960837507e-06, + "loss": 0.4833, + "step": 9353 + }, + { + "epoch": 1.19, + "grad_norm": 0.7934390458550817, + "learning_rate": 3.705323529664535e-06, + "loss": 0.5053, + "step": 9354 + }, + { + "epoch": 1.19, + "grad_norm": 0.6241388647738333, + "learning_rate": 3.7043271536018033e-06, + "loss": 0.4895, + "step": 9355 + }, + { + "epoch": 1.19, + "grad_norm": 0.6094969019526522, + "learning_rate": 3.7033308326917216e-06, + "loss": 0.5041, + "step": 9356 + }, + { + "epoch": 1.19, + "grad_norm": 0.6697576398262278, + "learning_rate": 3.7023345669767047e-06, + "loss": 0.468, + "step": 9357 + }, + { + "epoch": 1.19, + "grad_norm": 0.5401047491143914, + "learning_rate": 3.701338356499159e-06, + "loss": 0.4228, + "step": 9358 + }, + { + "epoch": 1.19, + "grad_norm": 0.583904784802937, + "learning_rate": 3.7003422013014887e-06, + "loss": 0.4863, + "step": 9359 + }, + { + "epoch": 1.19, + "grad_norm": 0.9277342719980841, + "learning_rate": 3.699346101426099e-06, + "loss": 0.53, + "step": 9360 + }, + { + "epoch": 1.19, + "grad_norm": 0.7715251346295013, + "learning_rate": 3.698350056915389e-06, + "loss": 0.5672, + "step": 9361 + }, + { + "epoch": 1.19, + "grad_norm": 1.4160447402707952, + "learning_rate": 3.6973540678117577e-06, + "loss": 0.5241, + "step": 9362 + }, + { + "epoch": 1.19, + "grad_norm": 0.5779429888948027, + "learning_rate": 3.6963581341576005e-06, + "loss": 0.4098, + "step": 9363 + }, + { + "epoch": 1.19, + "grad_norm": 0.6637529487488397, + "learning_rate": 3.695362255995312e-06, + "loss": 0.5328, + "step": 9364 + }, + { + "epoch": 1.19, + "grad_norm": 0.7283515188383624, + "learning_rate": 3.6943664333672835e-06, + "loss": 0.5162, + "step": 9365 + }, + { + "epoch": 1.19, + "grad_norm": 0.6114779598947504, + "learning_rate": 3.693370666315904e-06, + "loss": 0.4714, + "step": 9366 + }, + { + "epoch": 1.19, + "grad_norm": 0.6264892646274346, + "learning_rate": 3.692374954883559e-06, + "loss": 0.4412, + "step": 9367 + }, + { + "epoch": 1.19, + "grad_norm": 0.5449127386935234, + "learning_rate": 3.691379299112632e-06, + "loss": 0.4392, + "step": 9368 + }, + { + "epoch": 1.19, + "grad_norm": 0.6486514898573553, + "learning_rate": 3.6903836990455087e-06, + "loss": 0.449, + "step": 9369 + }, + { + "epoch": 1.19, + "grad_norm": 0.7254549508570198, + "learning_rate": 3.6893881547245658e-06, + "loss": 0.4963, + "step": 9370 + }, + { + "epoch": 1.19, + "grad_norm": 0.7918417743413897, + "learning_rate": 3.688392666192182e-06, + "loss": 0.4872, + "step": 9371 + }, + { + "epoch": 1.19, + "grad_norm": 0.5722716888119294, + "learning_rate": 3.6873972334907303e-06, + "loss": 0.4483, + "step": 9372 + }, + { + "epoch": 1.19, + "grad_norm": 0.6883569104658412, + "learning_rate": 3.686401856662584e-06, + "loss": 0.4523, + "step": 9373 + }, + { + "epoch": 1.19, + "grad_norm": 0.6141976950426694, + "learning_rate": 3.6854065357501133e-06, + "loss": 0.4703, + "step": 9374 + }, + { + "epoch": 1.19, + "grad_norm": 0.7885949112541654, + "learning_rate": 3.6844112707956852e-06, + "loss": 0.5138, + "step": 9375 + }, + { + "epoch": 1.19, + "grad_norm": 1.5029589723466967, + "learning_rate": 3.683416061841665e-06, + "loss": 0.5571, + "step": 9376 + }, + { + "epoch": 1.19, + "grad_norm": 0.6642345677986007, + "learning_rate": 3.6824209089304163e-06, + "loss": 0.5051, + "step": 9377 + }, + { + "epoch": 1.19, + "grad_norm": 0.6578556018806611, + "learning_rate": 3.6814258121042995e-06, + "loss": 0.5162, + "step": 9378 + }, + { + "epoch": 1.19, + "grad_norm": 0.6995339114569948, + "learning_rate": 3.680430771405672e-06, + "loss": 0.4896, + "step": 9379 + }, + { + "epoch": 1.19, + "grad_norm": 0.5122522184510796, + "learning_rate": 3.6794357868768895e-06, + "loss": 0.4089, + "step": 9380 + }, + { + "epoch": 1.2, + "grad_norm": 0.5171705227254032, + "learning_rate": 3.6784408585603047e-06, + "loss": 0.4283, + "step": 9381 + }, + { + "epoch": 1.2, + "grad_norm": 0.578098014850233, + "learning_rate": 3.677445986498271e-06, + "loss": 0.4353, + "step": 9382 + }, + { + "epoch": 1.2, + "grad_norm": 0.6636459750018662, + "learning_rate": 3.6764511707331354e-06, + "loss": 0.4648, + "step": 9383 + }, + { + "epoch": 1.2, + "grad_norm": 0.7143341456500879, + "learning_rate": 3.6754564113072423e-06, + "loss": 0.541, + "step": 9384 + }, + { + "epoch": 1.2, + "grad_norm": 0.623908847572535, + "learning_rate": 3.6744617082629387e-06, + "loss": 0.4844, + "step": 9385 + }, + { + "epoch": 1.2, + "grad_norm": 0.7033864445488113, + "learning_rate": 3.6734670616425638e-06, + "loss": 0.551, + "step": 9386 + }, + { + "epoch": 1.2, + "grad_norm": 0.7838962325925771, + "learning_rate": 3.6724724714884574e-06, + "loss": 0.5106, + "step": 9387 + }, + { + "epoch": 1.2, + "grad_norm": 0.6528243369407437, + "learning_rate": 3.6714779378429556e-06, + "loss": 0.4701, + "step": 9388 + }, + { + "epoch": 1.2, + "grad_norm": 0.659762743757758, + "learning_rate": 3.670483460748393e-06, + "loss": 0.4813, + "step": 9389 + }, + { + "epoch": 1.2, + "grad_norm": 0.7722851731699196, + "learning_rate": 3.6694890402471005e-06, + "loss": 0.5959, + "step": 9390 + }, + { + "epoch": 1.2, + "grad_norm": 0.8287668910847963, + "learning_rate": 3.6684946763814078e-06, + "loss": 0.5985, + "step": 9391 + }, + { + "epoch": 1.2, + "grad_norm": 0.7381844742000775, + "learning_rate": 3.6675003691936423e-06, + "loss": 0.5472, + "step": 9392 + }, + { + "epoch": 1.2, + "grad_norm": 0.7228108698755362, + "learning_rate": 3.666506118726127e-06, + "loss": 0.4872, + "step": 9393 + }, + { + "epoch": 1.2, + "grad_norm": 0.6189659019990906, + "learning_rate": 3.665511925021186e-06, + "loss": 0.5008, + "step": 9394 + }, + { + "epoch": 1.2, + "grad_norm": 0.6769923148269256, + "learning_rate": 3.6645177881211375e-06, + "loss": 0.476, + "step": 9395 + }, + { + "epoch": 1.2, + "grad_norm": 0.7445779339363285, + "learning_rate": 3.6635237080682994e-06, + "loss": 0.4593, + "step": 9396 + }, + { + "epoch": 1.2, + "grad_norm": 0.5909218732290121, + "learning_rate": 3.6625296849049863e-06, + "loss": 0.4433, + "step": 9397 + }, + { + "epoch": 1.2, + "grad_norm": 0.7741790992713794, + "learning_rate": 3.661535718673511e-06, + "loss": 0.5006, + "step": 9398 + }, + { + "epoch": 1.2, + "grad_norm": 0.6337812616324129, + "learning_rate": 3.660541809416182e-06, + "loss": 0.527, + "step": 9399 + }, + { + "epoch": 1.2, + "grad_norm": 0.8041060394041263, + "learning_rate": 3.659547957175309e-06, + "loss": 0.4815, + "step": 9400 + }, + { + "epoch": 1.2, + "grad_norm": 0.6972959503201985, + "learning_rate": 3.6585541619931953e-06, + "loss": 0.5485, + "step": 9401 + }, + { + "epoch": 1.2, + "grad_norm": 0.5640436797815905, + "learning_rate": 3.6575604239121444e-06, + "loss": 0.4168, + "step": 9402 + }, + { + "epoch": 1.2, + "grad_norm": 0.8431772662329683, + "learning_rate": 3.6565667429744565e-06, + "loss": 0.4642, + "step": 9403 + }, + { + "epoch": 1.2, + "grad_norm": 0.787329924089397, + "learning_rate": 3.6555731192224294e-06, + "loss": 0.5255, + "step": 9404 + }, + { + "epoch": 1.2, + "grad_norm": 0.7807554087944258, + "learning_rate": 3.6545795526983584e-06, + "loss": 0.562, + "step": 9405 + }, + { + "epoch": 1.2, + "grad_norm": 0.5846642178958515, + "learning_rate": 3.6535860434445343e-06, + "loss": 0.462, + "step": 9406 + }, + { + "epoch": 1.2, + "grad_norm": 0.5956953735044148, + "learning_rate": 3.652592591503252e-06, + "loss": 0.5001, + "step": 9407 + }, + { + "epoch": 1.2, + "grad_norm": 0.6812276395809893, + "learning_rate": 3.6515991969167975e-06, + "loss": 0.4857, + "step": 9408 + }, + { + "epoch": 1.2, + "grad_norm": 0.6365016702927165, + "learning_rate": 3.6506058597274556e-06, + "loss": 0.4019, + "step": 9409 + }, + { + "epoch": 1.2, + "grad_norm": 0.5736335716129107, + "learning_rate": 3.6496125799775113e-06, + "loss": 0.4804, + "step": 9410 + }, + { + "epoch": 1.2, + "grad_norm": 0.7305932920688418, + "learning_rate": 3.648619357709243e-06, + "loss": 0.4882, + "step": 9411 + }, + { + "epoch": 1.2, + "grad_norm": 0.6085317276675052, + "learning_rate": 3.6476261929649316e-06, + "loss": 0.4291, + "step": 9412 + }, + { + "epoch": 1.2, + "grad_norm": 0.6984835121280041, + "learning_rate": 3.646633085786851e-06, + "loss": 0.508, + "step": 9413 + }, + { + "epoch": 1.2, + "grad_norm": 0.8424748932724646, + "learning_rate": 3.6456400362172752e-06, + "loss": 0.5327, + "step": 9414 + }, + { + "epoch": 1.2, + "grad_norm": 0.7425466890467683, + "learning_rate": 3.644647044298475e-06, + "loss": 0.5255, + "step": 9415 + }, + { + "epoch": 1.2, + "grad_norm": 0.7057164369499038, + "learning_rate": 3.643654110072719e-06, + "loss": 0.5572, + "step": 9416 + }, + { + "epoch": 1.2, + "grad_norm": 0.6238592867457389, + "learning_rate": 3.6426612335822735e-06, + "loss": 0.4244, + "step": 9417 + }, + { + "epoch": 1.2, + "grad_norm": 0.6145404264522862, + "learning_rate": 3.641668414869399e-06, + "loss": 0.4653, + "step": 9418 + }, + { + "epoch": 1.2, + "grad_norm": 0.7842686843129191, + "learning_rate": 3.6406756539763633e-06, + "loss": 0.5227, + "step": 9419 + }, + { + "epoch": 1.2, + "grad_norm": 0.8000686490823119, + "learning_rate": 3.6396829509454203e-06, + "loss": 0.4634, + "step": 9420 + }, + { + "epoch": 1.2, + "grad_norm": 0.6307034047154679, + "learning_rate": 3.638690305818826e-06, + "loss": 0.4187, + "step": 9421 + }, + { + "epoch": 1.2, + "grad_norm": 0.6217178830330693, + "learning_rate": 3.637697718638837e-06, + "loss": 0.4792, + "step": 9422 + }, + { + "epoch": 1.2, + "grad_norm": 0.7085604471968104, + "learning_rate": 3.6367051894477017e-06, + "loss": 0.4821, + "step": 9423 + }, + { + "epoch": 1.2, + "grad_norm": 0.624567134615195, + "learning_rate": 3.6357127182876705e-06, + "loss": 0.4685, + "step": 9424 + }, + { + "epoch": 1.2, + "grad_norm": 0.583034659519317, + "learning_rate": 3.6347203052009895e-06, + "loss": 0.4229, + "step": 9425 + }, + { + "epoch": 1.2, + "grad_norm": 0.5609245545654077, + "learning_rate": 3.6337279502299017e-06, + "loss": 0.4346, + "step": 9426 + }, + { + "epoch": 1.2, + "grad_norm": 0.6057397309858307, + "learning_rate": 3.632735653416649e-06, + "loss": 0.4746, + "step": 9427 + }, + { + "epoch": 1.2, + "grad_norm": 0.7008415882385254, + "learning_rate": 3.6317434148034714e-06, + "loss": 0.4976, + "step": 9428 + }, + { + "epoch": 1.2, + "grad_norm": 0.600971276732898, + "learning_rate": 3.6307512344326034e-06, + "loss": 0.4568, + "step": 9429 + }, + { + "epoch": 1.2, + "grad_norm": 0.6374964222218062, + "learning_rate": 3.629759112346278e-06, + "loss": 0.3921, + "step": 9430 + }, + { + "epoch": 1.2, + "grad_norm": 0.6207756351778158, + "learning_rate": 3.6287670485867306e-06, + "loss": 0.4926, + "step": 9431 + }, + { + "epoch": 1.2, + "grad_norm": 0.854422474203202, + "learning_rate": 3.6277750431961877e-06, + "loss": 0.5187, + "step": 9432 + }, + { + "epoch": 1.2, + "grad_norm": 0.9067806553141458, + "learning_rate": 3.626783096216877e-06, + "loss": 0.5027, + "step": 9433 + }, + { + "epoch": 1.2, + "grad_norm": 0.7618917669033753, + "learning_rate": 3.6257912076910214e-06, + "loss": 0.5116, + "step": 9434 + }, + { + "epoch": 1.2, + "grad_norm": 0.7451529824388118, + "learning_rate": 3.6247993776608426e-06, + "loss": 0.4833, + "step": 9435 + }, + { + "epoch": 1.2, + "grad_norm": 0.5961462216264772, + "learning_rate": 3.6238076061685593e-06, + "loss": 0.4642, + "step": 9436 + }, + { + "epoch": 1.2, + "grad_norm": 0.7265659719787936, + "learning_rate": 3.6228158932563895e-06, + "loss": 0.5014, + "step": 9437 + }, + { + "epoch": 1.2, + "grad_norm": 0.7626185257371304, + "learning_rate": 3.6218242389665457e-06, + "loss": 0.5255, + "step": 9438 + }, + { + "epoch": 1.2, + "grad_norm": 0.8052022781133142, + "learning_rate": 3.62083264334124e-06, + "loss": 0.5387, + "step": 9439 + }, + { + "epoch": 1.2, + "grad_norm": 0.7209810536499134, + "learning_rate": 3.619841106422682e-06, + "loss": 0.5311, + "step": 9440 + }, + { + "epoch": 1.2, + "grad_norm": 0.7741278287168436, + "learning_rate": 3.6188496282530774e-06, + "loss": 0.481, + "step": 9441 + }, + { + "epoch": 1.2, + "grad_norm": 0.7068216646728024, + "learning_rate": 3.6178582088746307e-06, + "loss": 0.4893, + "step": 9442 + }, + { + "epoch": 1.2, + "grad_norm": 0.8401408212994929, + "learning_rate": 3.616866848329542e-06, + "loss": 0.4868, + "step": 9443 + }, + { + "epoch": 1.2, + "grad_norm": 0.7424489850453143, + "learning_rate": 3.615875546660013e-06, + "loss": 0.4622, + "step": 9444 + }, + { + "epoch": 1.2, + "grad_norm": 0.5707105923105024, + "learning_rate": 3.6148843039082394e-06, + "loss": 0.4654, + "step": 9445 + }, + { + "epoch": 1.2, + "grad_norm": 0.5480934382977639, + "learning_rate": 3.613893120116413e-06, + "loss": 0.4318, + "step": 9446 + }, + { + "epoch": 1.2, + "grad_norm": 0.6719126192826061, + "learning_rate": 3.6129019953267285e-06, + "loss": 0.4529, + "step": 9447 + }, + { + "epoch": 1.2, + "grad_norm": 0.577864508267278, + "learning_rate": 3.611910929581373e-06, + "loss": 0.4608, + "step": 9448 + }, + { + "epoch": 1.2, + "grad_norm": 0.7205605247021227, + "learning_rate": 3.6109199229225346e-06, + "loss": 0.4669, + "step": 9449 + }, + { + "epoch": 1.2, + "grad_norm": 0.9227561583055726, + "learning_rate": 3.609928975392395e-06, + "loss": 0.5241, + "step": 9450 + }, + { + "epoch": 1.2, + "grad_norm": 0.8264774168956006, + "learning_rate": 3.6089380870331377e-06, + "loss": 0.5459, + "step": 9451 + }, + { + "epoch": 1.2, + "grad_norm": 0.6348407077098837, + "learning_rate": 3.6079472578869405e-06, + "loss": 0.4165, + "step": 9452 + }, + { + "epoch": 1.2, + "grad_norm": 0.6035479530103958, + "learning_rate": 3.6069564879959805e-06, + "loss": 0.4698, + "step": 9453 + }, + { + "epoch": 1.2, + "grad_norm": 1.0151899137145066, + "learning_rate": 3.605965777402431e-06, + "loss": 0.5063, + "step": 9454 + }, + { + "epoch": 1.2, + "grad_norm": 0.7272700147929374, + "learning_rate": 3.6049751261484624e-06, + "loss": 0.5454, + "step": 9455 + }, + { + "epoch": 1.2, + "grad_norm": 0.8989338736016839, + "learning_rate": 3.6039845342762464e-06, + "loss": 0.5116, + "step": 9456 + }, + { + "epoch": 1.2, + "grad_norm": 0.6306512603646491, + "learning_rate": 3.6029940018279474e-06, + "loss": 0.4634, + "step": 9457 + }, + { + "epoch": 1.2, + "grad_norm": 0.6193870711223556, + "learning_rate": 3.60200352884573e-06, + "loss": 0.5318, + "step": 9458 + }, + { + "epoch": 1.21, + "grad_norm": 0.5828212196482788, + "learning_rate": 3.6010131153717544e-06, + "loss": 0.4574, + "step": 9459 + }, + { + "epoch": 1.21, + "grad_norm": 0.5727501490341228, + "learning_rate": 3.6000227614481797e-06, + "loss": 0.5265, + "step": 9460 + }, + { + "epoch": 1.21, + "grad_norm": 0.678357157869081, + "learning_rate": 3.599032467117163e-06, + "loss": 0.5516, + "step": 9461 + }, + { + "epoch": 1.21, + "grad_norm": 0.5644482035468292, + "learning_rate": 3.5980422324208576e-06, + "loss": 0.4234, + "step": 9462 + }, + { + "epoch": 1.21, + "grad_norm": 0.6079344633391223, + "learning_rate": 3.597052057401414e-06, + "loss": 0.4265, + "step": 9463 + }, + { + "epoch": 1.21, + "grad_norm": 0.5825564950202775, + "learning_rate": 3.596061942100981e-06, + "loss": 0.4027, + "step": 9464 + }, + { + "epoch": 1.21, + "grad_norm": 0.6088844888324194, + "learning_rate": 3.5950718865617052e-06, + "loss": 0.4967, + "step": 9465 + }, + { + "epoch": 1.21, + "grad_norm": 0.6896780299535039, + "learning_rate": 3.594081890825729e-06, + "loss": 0.4752, + "step": 9466 + }, + { + "epoch": 1.21, + "grad_norm": 0.7094657172600143, + "learning_rate": 3.593091954935194e-06, + "loss": 0.5189, + "step": 9467 + }, + { + "epoch": 1.21, + "grad_norm": 0.7255489766048862, + "learning_rate": 3.592102078932237e-06, + "loss": 0.6105, + "step": 9468 + }, + { + "epoch": 1.21, + "grad_norm": 0.5335851202317338, + "learning_rate": 3.5911122628589977e-06, + "loss": 0.4162, + "step": 9469 + }, + { + "epoch": 1.21, + "grad_norm": 0.958267793174056, + "learning_rate": 3.5901225067576073e-06, + "loss": 0.469, + "step": 9470 + }, + { + "epoch": 1.21, + "grad_norm": 0.8552163739072568, + "learning_rate": 3.589132810670196e-06, + "loss": 0.5468, + "step": 9471 + }, + { + "epoch": 1.21, + "grad_norm": 1.1334336176113844, + "learning_rate": 3.588143174638893e-06, + "loss": 0.5105, + "step": 9472 + }, + { + "epoch": 1.21, + "grad_norm": 0.6796844741501856, + "learning_rate": 3.5871535987058226e-06, + "loss": 0.4622, + "step": 9473 + }, + { + "epoch": 1.21, + "grad_norm": 0.6019998325886428, + "learning_rate": 3.5861640829131096e-06, + "loss": 0.4492, + "step": 9474 + }, + { + "epoch": 1.21, + "grad_norm": 0.6671691601084976, + "learning_rate": 3.585174627302873e-06, + "loss": 0.4612, + "step": 9475 + }, + { + "epoch": 1.21, + "grad_norm": 0.6137077765714484, + "learning_rate": 3.584185231917231e-06, + "loss": 0.4189, + "step": 9476 + }, + { + "epoch": 1.21, + "grad_norm": 0.6408745021382739, + "learning_rate": 3.5831958967983004e-06, + "loss": 0.4773, + "step": 9477 + }, + { + "epoch": 1.21, + "grad_norm": 0.9151327707390334, + "learning_rate": 3.5822066219881922e-06, + "loss": 0.5135, + "step": 9478 + }, + { + "epoch": 1.21, + "grad_norm": 0.7123509014075077, + "learning_rate": 3.5812174075290185e-06, + "loss": 0.4742, + "step": 9479 + }, + { + "epoch": 1.21, + "grad_norm": 0.6151227884910276, + "learning_rate": 3.580228253462883e-06, + "loss": 0.4843, + "step": 9480 + }, + { + "epoch": 1.21, + "grad_norm": 0.6466972396294765, + "learning_rate": 3.579239159831896e-06, + "loss": 0.494, + "step": 9481 + }, + { + "epoch": 1.21, + "grad_norm": 0.5525207693083822, + "learning_rate": 3.5782501266781577e-06, + "loss": 0.4292, + "step": 9482 + }, + { + "epoch": 1.21, + "grad_norm": 0.7272640659632693, + "learning_rate": 3.5772611540437687e-06, + "loss": 0.476, + "step": 9483 + }, + { + "epoch": 1.21, + "grad_norm": 0.6582388715087351, + "learning_rate": 3.5762722419708255e-06, + "loss": 0.4672, + "step": 9484 + }, + { + "epoch": 1.21, + "grad_norm": 1.0332531210569487, + "learning_rate": 3.575283390501424e-06, + "loss": 0.5439, + "step": 9485 + }, + { + "epoch": 1.21, + "grad_norm": 0.7006529815804571, + "learning_rate": 3.574294599677656e-06, + "loss": 0.5591, + "step": 9486 + }, + { + "epoch": 1.21, + "grad_norm": 0.9239951531892011, + "learning_rate": 3.57330586954161e-06, + "loss": 0.5313, + "step": 9487 + }, + { + "epoch": 1.21, + "grad_norm": 0.6797135075251783, + "learning_rate": 3.5723172001353747e-06, + "loss": 0.4539, + "step": 9488 + }, + { + "epoch": 1.21, + "grad_norm": 0.9071326519953101, + "learning_rate": 3.571328591501033e-06, + "loss": 0.5236, + "step": 9489 + }, + { + "epoch": 1.21, + "grad_norm": 0.8655001204343843, + "learning_rate": 3.570340043680669e-06, + "loss": 0.4913, + "step": 9490 + }, + { + "epoch": 1.21, + "grad_norm": 0.547057778314284, + "learning_rate": 3.5693515567163604e-06, + "loss": 0.4886, + "step": 9491 + }, + { + "epoch": 1.21, + "grad_norm": 0.7234796173785958, + "learning_rate": 3.568363130650182e-06, + "loss": 0.4973, + "step": 9492 + }, + { + "epoch": 1.21, + "grad_norm": 0.5463726485089478, + "learning_rate": 3.567374765524213e-06, + "loss": 0.4064, + "step": 9493 + }, + { + "epoch": 1.21, + "grad_norm": 0.708796163515261, + "learning_rate": 3.5663864613805217e-06, + "loss": 0.5194, + "step": 9494 + }, + { + "epoch": 1.21, + "grad_norm": 1.8742522625023077, + "learning_rate": 3.565398218261178e-06, + "loss": 0.5662, + "step": 9495 + }, + { + "epoch": 1.21, + "grad_norm": 0.7488353337072939, + "learning_rate": 3.564410036208247e-06, + "loss": 0.4824, + "step": 9496 + }, + { + "epoch": 1.21, + "grad_norm": 0.6227279932724672, + "learning_rate": 3.563421915263794e-06, + "loss": 0.407, + "step": 9497 + }, + { + "epoch": 1.21, + "grad_norm": 0.7032691500929044, + "learning_rate": 3.562433855469879e-06, + "loss": 0.4756, + "step": 9498 + }, + { + "epoch": 1.21, + "grad_norm": 0.7684014194535747, + "learning_rate": 3.561445856868561e-06, + "loss": 0.5294, + "step": 9499 + }, + { + "epoch": 1.21, + "grad_norm": 0.6712242771991961, + "learning_rate": 3.5604579195018962e-06, + "loss": 0.4962, + "step": 9500 + }, + { + "epoch": 1.21, + "grad_norm": 0.7469355859788023, + "learning_rate": 3.5594700434119378e-06, + "loss": 0.4495, + "step": 9501 + }, + { + "epoch": 1.21, + "grad_norm": 0.6225801155766425, + "learning_rate": 3.558482228640736e-06, + "loss": 0.4564, + "step": 9502 + }, + { + "epoch": 1.21, + "grad_norm": 0.692915274564329, + "learning_rate": 3.5574944752303394e-06, + "loss": 0.4773, + "step": 9503 + }, + { + "epoch": 1.21, + "grad_norm": 0.7203387464232403, + "learning_rate": 3.5565067832227933e-06, + "loss": 0.5009, + "step": 9504 + }, + { + "epoch": 1.21, + "grad_norm": 0.6163578035778042, + "learning_rate": 3.5555191526601395e-06, + "loss": 0.5272, + "step": 9505 + }, + { + "epoch": 1.21, + "grad_norm": 0.7545949123118146, + "learning_rate": 3.554531583584421e-06, + "loss": 0.5605, + "step": 9506 + }, + { + "epoch": 1.21, + "grad_norm": 0.7096062047494669, + "learning_rate": 3.5535440760376736e-06, + "loss": 0.5372, + "step": 9507 + }, + { + "epoch": 1.21, + "grad_norm": 0.7589371207711612, + "learning_rate": 3.5525566300619318e-06, + "loss": 0.5539, + "step": 9508 + }, + { + "epoch": 1.21, + "grad_norm": 0.740793068235596, + "learning_rate": 3.55156924569923e-06, + "loss": 0.4934, + "step": 9509 + }, + { + "epoch": 1.21, + "grad_norm": 0.7266898947552103, + "learning_rate": 3.5505819229915963e-06, + "loss": 0.5067, + "step": 9510 + }, + { + "epoch": 1.21, + "grad_norm": 0.676615527553606, + "learning_rate": 3.549594661981059e-06, + "loss": 0.4779, + "step": 9511 + }, + { + "epoch": 1.21, + "grad_norm": 0.7122453572063476, + "learning_rate": 3.5486074627096425e-06, + "loss": 0.4317, + "step": 9512 + }, + { + "epoch": 1.21, + "grad_norm": 0.6194319823175132, + "learning_rate": 3.5476203252193676e-06, + "loss": 0.5079, + "step": 9513 + }, + { + "epoch": 1.21, + "grad_norm": 0.59752804298602, + "learning_rate": 3.5466332495522543e-06, + "loss": 0.4442, + "step": 9514 + }, + { + "epoch": 1.21, + "grad_norm": 0.6544476110689336, + "learning_rate": 3.5456462357503197e-06, + "loss": 0.4124, + "step": 9515 + }, + { + "epoch": 1.21, + "grad_norm": 0.6783805184502651, + "learning_rate": 3.5446592838555773e-06, + "loss": 0.4609, + "step": 9516 + }, + { + "epoch": 1.21, + "grad_norm": 0.5867172345646141, + "learning_rate": 3.543672393910037e-06, + "loss": 0.4424, + "step": 9517 + }, + { + "epoch": 1.21, + "grad_norm": 0.6976572604775314, + "learning_rate": 3.5426855659557107e-06, + "loss": 0.5671, + "step": 9518 + }, + { + "epoch": 1.21, + "grad_norm": 0.7744659246456838, + "learning_rate": 3.541698800034603e-06, + "loss": 0.5592, + "step": 9519 + }, + { + "epoch": 1.21, + "grad_norm": 0.7010569671479242, + "learning_rate": 3.540712096188717e-06, + "loss": 0.5154, + "step": 9520 + }, + { + "epoch": 1.21, + "grad_norm": 0.6287833384926903, + "learning_rate": 3.5397254544600545e-06, + "loss": 0.4634, + "step": 9521 + }, + { + "epoch": 1.21, + "grad_norm": 0.6728368009414035, + "learning_rate": 3.5387388748906122e-06, + "loss": 0.5306, + "step": 9522 + }, + { + "epoch": 1.21, + "grad_norm": 0.9248970475385573, + "learning_rate": 3.537752357522387e-06, + "loss": 0.5508, + "step": 9523 + }, + { + "epoch": 1.21, + "grad_norm": 0.8055062852865371, + "learning_rate": 3.536765902397371e-06, + "loss": 0.557, + "step": 9524 + }, + { + "epoch": 1.21, + "grad_norm": 0.6662121900971837, + "learning_rate": 3.535779509557555e-06, + "loss": 0.4929, + "step": 9525 + }, + { + "epoch": 1.21, + "grad_norm": 0.6850244133045399, + "learning_rate": 3.5347931790449257e-06, + "loss": 0.4481, + "step": 9526 + }, + { + "epoch": 1.21, + "grad_norm": 0.7584437978991218, + "learning_rate": 3.5338069109014694e-06, + "loss": 0.5271, + "step": 9527 + }, + { + "epoch": 1.21, + "grad_norm": 0.8100403308718558, + "learning_rate": 3.5328207051691672e-06, + "loss": 0.5173, + "step": 9528 + }, + { + "epoch": 1.21, + "grad_norm": 0.8487650941987739, + "learning_rate": 3.5318345618899992e-06, + "loss": 0.5182, + "step": 9529 + }, + { + "epoch": 1.21, + "grad_norm": 0.6133629067743392, + "learning_rate": 3.5308484811059402e-06, + "loss": 0.4626, + "step": 9530 + }, + { + "epoch": 1.21, + "grad_norm": 0.942036253091359, + "learning_rate": 3.5298624628589683e-06, + "loss": 0.5166, + "step": 9531 + }, + { + "epoch": 1.21, + "grad_norm": 0.6445150671411881, + "learning_rate": 3.528876507191055e-06, + "loss": 0.4719, + "step": 9532 + }, + { + "epoch": 1.21, + "grad_norm": 0.5909108258872108, + "learning_rate": 3.527890614144166e-06, + "loss": 0.4, + "step": 9533 + }, + { + "epoch": 1.21, + "grad_norm": 0.5798541954088046, + "learning_rate": 3.52690478376027e-06, + "loss": 0.4077, + "step": 9534 + }, + { + "epoch": 1.21, + "grad_norm": 0.6778919284185424, + "learning_rate": 3.52591901608133e-06, + "loss": 0.502, + "step": 9535 + }, + { + "epoch": 1.21, + "grad_norm": 0.820877798191278, + "learning_rate": 3.5249333111493066e-06, + "loss": 0.5297, + "step": 9536 + }, + { + "epoch": 1.21, + "grad_norm": 0.697150675883807, + "learning_rate": 3.523947669006159e-06, + "loss": 0.4982, + "step": 9537 + }, + { + "epoch": 1.22, + "grad_norm": 0.6344091185942939, + "learning_rate": 3.522962089693843e-06, + "loss": 0.4921, + "step": 9538 + }, + { + "epoch": 1.22, + "grad_norm": 1.3954342319233473, + "learning_rate": 3.5219765732543097e-06, + "loss": 0.5411, + "step": 9539 + }, + { + "epoch": 1.22, + "grad_norm": 0.7648751669554349, + "learning_rate": 3.520991119729511e-06, + "loss": 0.5726, + "step": 9540 + }, + { + "epoch": 1.22, + "grad_norm": 0.7401215041280158, + "learning_rate": 3.520005729161394e-06, + "loss": 0.5643, + "step": 9541 + }, + { + "epoch": 1.22, + "grad_norm": 0.6102587041592459, + "learning_rate": 3.5190204015919023e-06, + "loss": 0.4681, + "step": 9542 + }, + { + "epoch": 1.22, + "grad_norm": 0.6515046034696517, + "learning_rate": 3.518035137062981e-06, + "loss": 0.4295, + "step": 9543 + }, + { + "epoch": 1.22, + "grad_norm": 0.6335726288547892, + "learning_rate": 3.5170499356165688e-06, + "loss": 0.4862, + "step": 9544 + }, + { + "epoch": 1.22, + "grad_norm": 0.931279516981549, + "learning_rate": 3.5160647972946016e-06, + "loss": 0.4985, + "step": 9545 + }, + { + "epoch": 1.22, + "grad_norm": 0.6541495643489148, + "learning_rate": 3.5150797221390137e-06, + "loss": 0.4314, + "step": 9546 + }, + { + "epoch": 1.22, + "grad_norm": 0.6574708678092356, + "learning_rate": 3.514094710191738e-06, + "loss": 0.4937, + "step": 9547 + }, + { + "epoch": 1.22, + "grad_norm": 0.7812278288880611, + "learning_rate": 3.5131097614947007e-06, + "loss": 0.4697, + "step": 9548 + }, + { + "epoch": 1.22, + "grad_norm": 0.6514992822042747, + "learning_rate": 3.51212487608983e-06, + "loss": 0.4267, + "step": 9549 + }, + { + "epoch": 1.22, + "grad_norm": 0.6239247188201337, + "learning_rate": 3.5111400540190493e-06, + "loss": 0.4916, + "step": 9550 + }, + { + "epoch": 1.22, + "grad_norm": 0.87093789534363, + "learning_rate": 3.5101552953242785e-06, + "loss": 0.5558, + "step": 9551 + }, + { + "epoch": 1.22, + "grad_norm": 0.7011620251504134, + "learning_rate": 3.509170600047436e-06, + "loss": 0.5146, + "step": 9552 + }, + { + "epoch": 1.22, + "grad_norm": 0.7252922378649093, + "learning_rate": 3.5081859682304366e-06, + "loss": 0.5136, + "step": 9553 + }, + { + "epoch": 1.22, + "grad_norm": 0.6490925619006583, + "learning_rate": 3.5072013999151917e-06, + "loss": 0.4826, + "step": 9554 + }, + { + "epoch": 1.22, + "grad_norm": 0.6454691033825337, + "learning_rate": 3.506216895143615e-06, + "loss": 0.4714, + "step": 9555 + }, + { + "epoch": 1.22, + "grad_norm": 0.6220440688297076, + "learning_rate": 3.505232453957611e-06, + "loss": 0.4995, + "step": 9556 + }, + { + "epoch": 1.22, + "grad_norm": 0.793677132118531, + "learning_rate": 3.5042480763990857e-06, + "loss": 0.5655, + "step": 9557 + }, + { + "epoch": 1.22, + "grad_norm": 0.8110021035452885, + "learning_rate": 3.5032637625099397e-06, + "loss": 0.5614, + "step": 9558 + }, + { + "epoch": 1.22, + "grad_norm": 0.8370877753247526, + "learning_rate": 3.5022795123320718e-06, + "loss": 0.5392, + "step": 9559 + }, + { + "epoch": 1.22, + "grad_norm": 1.7992332193530127, + "learning_rate": 3.50129532590738e-06, + "loss": 0.5056, + "step": 9560 + }, + { + "epoch": 1.22, + "grad_norm": 0.6268060304286129, + "learning_rate": 3.5003112032777563e-06, + "loss": 0.4374, + "step": 9561 + }, + { + "epoch": 1.22, + "grad_norm": 0.7427240184302516, + "learning_rate": 3.499327144485093e-06, + "loss": 0.5067, + "step": 9562 + }, + { + "epoch": 1.22, + "grad_norm": 0.6031566811450092, + "learning_rate": 3.4983431495712773e-06, + "loss": 0.4219, + "step": 9563 + }, + { + "epoch": 1.22, + "grad_norm": 0.692715235767375, + "learning_rate": 3.4973592185781955e-06, + "loss": 0.4432, + "step": 9564 + }, + { + "epoch": 1.22, + "grad_norm": 0.6455174718618477, + "learning_rate": 3.4963753515477294e-06, + "loss": 0.4991, + "step": 9565 + }, + { + "epoch": 1.22, + "grad_norm": 0.6618973069986885, + "learning_rate": 3.4953915485217603e-06, + "loss": 0.4734, + "step": 9566 + }, + { + "epoch": 1.22, + "grad_norm": 0.6091413365808238, + "learning_rate": 3.4944078095421636e-06, + "loss": 0.4451, + "step": 9567 + }, + { + "epoch": 1.22, + "grad_norm": 0.6361375464784068, + "learning_rate": 3.4934241346508165e-06, + "loss": 0.4555, + "step": 9568 + }, + { + "epoch": 1.22, + "grad_norm": 0.5552947658644004, + "learning_rate": 3.492440523889589e-06, + "loss": 0.437, + "step": 9569 + }, + { + "epoch": 1.22, + "grad_norm": 0.638820309850292, + "learning_rate": 3.4914569773003503e-06, + "loss": 0.4663, + "step": 9570 + }, + { + "epoch": 1.22, + "grad_norm": 0.6409077448931367, + "learning_rate": 3.4904734949249685e-06, + "loss": 0.4226, + "step": 9571 + }, + { + "epoch": 1.22, + "grad_norm": 0.6524659815371653, + "learning_rate": 3.489490076805306e-06, + "loss": 0.4889, + "step": 9572 + }, + { + "epoch": 1.22, + "grad_norm": 0.8104513560359653, + "learning_rate": 3.4885067229832246e-06, + "loss": 0.5174, + "step": 9573 + }, + { + "epoch": 1.22, + "grad_norm": 0.573260450430759, + "learning_rate": 3.487523433500582e-06, + "loss": 0.4272, + "step": 9574 + }, + { + "epoch": 1.22, + "grad_norm": 0.5968621977917752, + "learning_rate": 3.4865402083992335e-06, + "loss": 0.4538, + "step": 9575 + }, + { + "epoch": 1.22, + "grad_norm": 0.753852619431407, + "learning_rate": 3.485557047721032e-06, + "loss": 0.3875, + "step": 9576 + }, + { + "epoch": 1.22, + "grad_norm": 0.6648529597676575, + "learning_rate": 3.484573951507827e-06, + "loss": 0.4558, + "step": 9577 + }, + { + "epoch": 1.22, + "grad_norm": 0.6038583918411186, + "learning_rate": 3.4835909198014674e-06, + "loss": 0.4875, + "step": 9578 + }, + { + "epoch": 1.22, + "grad_norm": 0.6562597198557258, + "learning_rate": 3.482607952643794e-06, + "loss": 0.4694, + "step": 9579 + }, + { + "epoch": 1.22, + "grad_norm": 0.637734725565027, + "learning_rate": 3.4816250500766534e-06, + "loss": 0.4651, + "step": 9580 + }, + { + "epoch": 1.22, + "grad_norm": 0.6243187144183077, + "learning_rate": 3.480642212141882e-06, + "loss": 0.4524, + "step": 9581 + }, + { + "epoch": 1.22, + "grad_norm": 0.6205104751826146, + "learning_rate": 3.4796594388813165e-06, + "loss": 0.4633, + "step": 9582 + }, + { + "epoch": 1.22, + "grad_norm": 0.7537233492004364, + "learning_rate": 3.4786767303367904e-06, + "loss": 0.5545, + "step": 9583 + }, + { + "epoch": 1.22, + "grad_norm": 0.8048135126357497, + "learning_rate": 3.477694086550134e-06, + "loss": 0.5571, + "step": 9584 + }, + { + "epoch": 1.22, + "grad_norm": 0.7253749724185615, + "learning_rate": 3.476711507563176e-06, + "loss": 0.5059, + "step": 9585 + }, + { + "epoch": 1.22, + "grad_norm": 0.836737129944647, + "learning_rate": 3.475728993417742e-06, + "loss": 0.5169, + "step": 9586 + }, + { + "epoch": 1.22, + "grad_norm": 0.7551192186028224, + "learning_rate": 3.474746544155653e-06, + "loss": 0.4917, + "step": 9587 + }, + { + "epoch": 1.22, + "grad_norm": 0.7168967448397087, + "learning_rate": 3.47376415981873e-06, + "loss": 0.445, + "step": 9588 + }, + { + "epoch": 1.22, + "grad_norm": 0.5533768624867513, + "learning_rate": 3.47278184044879e-06, + "loss": 0.4699, + "step": 9589 + }, + { + "epoch": 1.22, + "grad_norm": 0.6531325892846266, + "learning_rate": 3.4717995860876462e-06, + "loss": 0.4578, + "step": 9590 + }, + { + "epoch": 1.22, + "grad_norm": 0.7436235037554774, + "learning_rate": 3.4708173967771107e-06, + "loss": 0.488, + "step": 9591 + }, + { + "epoch": 1.22, + "grad_norm": 0.8112057817512172, + "learning_rate": 3.46983527255899e-06, + "loss": 0.5133, + "step": 9592 + }, + { + "epoch": 1.22, + "grad_norm": 0.6072924295617089, + "learning_rate": 3.468853213475094e-06, + "loss": 0.4526, + "step": 9593 + }, + { + "epoch": 1.22, + "grad_norm": 0.6088393253612352, + "learning_rate": 3.467871219567224e-06, + "loss": 0.4677, + "step": 9594 + }, + { + "epoch": 1.22, + "grad_norm": 0.6354448602537883, + "learning_rate": 3.4668892908771802e-06, + "loss": 0.4607, + "step": 9595 + }, + { + "epoch": 1.22, + "grad_norm": 0.588668623792249, + "learning_rate": 3.4659074274467596e-06, + "loss": 0.4394, + "step": 9596 + }, + { + "epoch": 1.22, + "grad_norm": 0.6483942601325419, + "learning_rate": 3.464925629317758e-06, + "loss": 0.4328, + "step": 9597 + }, + { + "epoch": 1.22, + "grad_norm": 0.6266912586228107, + "learning_rate": 3.463943896531967e-06, + "loss": 0.4791, + "step": 9598 + }, + { + "epoch": 1.22, + "grad_norm": 0.6582908632337373, + "learning_rate": 3.462962229131176e-06, + "loss": 0.4797, + "step": 9599 + }, + { + "epoch": 1.22, + "grad_norm": 0.8104118433322373, + "learning_rate": 3.461980627157171e-06, + "loss": 0.4799, + "step": 9600 + }, + { + "epoch": 1.22, + "grad_norm": 0.6214936350332427, + "learning_rate": 3.4609990906517367e-06, + "loss": 0.47, + "step": 9601 + }, + { + "epoch": 1.22, + "grad_norm": 0.8602054275217241, + "learning_rate": 3.460017619656652e-06, + "loss": 0.5482, + "step": 9602 + }, + { + "epoch": 1.22, + "grad_norm": 0.8458931340294048, + "learning_rate": 3.459036214213697e-06, + "loss": 0.5587, + "step": 9603 + }, + { + "epoch": 1.22, + "grad_norm": 0.7366719521818139, + "learning_rate": 3.458054874364643e-06, + "loss": 0.531, + "step": 9604 + }, + { + "epoch": 1.22, + "grad_norm": 0.7087381750490165, + "learning_rate": 3.4570736001512685e-06, + "loss": 0.5459, + "step": 9605 + }, + { + "epoch": 1.22, + "grad_norm": 0.7783415464754193, + "learning_rate": 3.45609239161534e-06, + "loss": 0.4991, + "step": 9606 + }, + { + "epoch": 1.22, + "grad_norm": 0.6350011996685322, + "learning_rate": 3.455111248798625e-06, + "loss": 0.481, + "step": 9607 + }, + { + "epoch": 1.22, + "grad_norm": 0.7195693241568325, + "learning_rate": 3.4541301717428877e-06, + "loss": 0.4741, + "step": 9608 + }, + { + "epoch": 1.22, + "grad_norm": 0.7088978698882897, + "learning_rate": 3.4531491604898888e-06, + "loss": 0.464, + "step": 9609 + }, + { + "epoch": 1.22, + "grad_norm": 0.721564441042376, + "learning_rate": 3.452168215081387e-06, + "loss": 0.5201, + "step": 9610 + }, + { + "epoch": 1.22, + "grad_norm": 0.7811436353412271, + "learning_rate": 3.451187335559138e-06, + "loss": 0.5216, + "step": 9611 + }, + { + "epoch": 1.22, + "grad_norm": 0.6132511808358145, + "learning_rate": 3.450206521964895e-06, + "loss": 0.4325, + "step": 9612 + }, + { + "epoch": 1.22, + "grad_norm": 0.6925964234648885, + "learning_rate": 3.4492257743404078e-06, + "loss": 0.4474, + "step": 9613 + }, + { + "epoch": 1.22, + "grad_norm": 0.6562312598877206, + "learning_rate": 3.4482450927274242e-06, + "loss": 0.4717, + "step": 9614 + }, + { + "epoch": 1.22, + "grad_norm": 1.0228066324862592, + "learning_rate": 3.4472644771676878e-06, + "loss": 0.5333, + "step": 9615 + }, + { + "epoch": 1.23, + "grad_norm": 0.5807906563440279, + "learning_rate": 3.4462839277029393e-06, + "loss": 0.4398, + "step": 9616 + }, + { + "epoch": 1.23, + "grad_norm": 0.6340389532708894, + "learning_rate": 3.445303444374921e-06, + "loss": 0.4507, + "step": 9617 + }, + { + "epoch": 1.23, + "grad_norm": 0.7427823046939891, + "learning_rate": 3.4443230272253662e-06, + "loss": 0.5257, + "step": 9618 + }, + { + "epoch": 1.23, + "grad_norm": 0.6269877363995071, + "learning_rate": 3.44334267629601e-06, + "loss": 0.477, + "step": 9619 + }, + { + "epoch": 1.23, + "grad_norm": 0.743625191470743, + "learning_rate": 3.442362391628581e-06, + "loss": 0.5226, + "step": 9620 + }, + { + "epoch": 1.23, + "grad_norm": 0.6038063353448906, + "learning_rate": 3.4413821732648077e-06, + "loss": 0.4348, + "step": 9621 + }, + { + "epoch": 1.23, + "grad_norm": 0.7309311624728724, + "learning_rate": 3.4404020212464147e-06, + "loss": 0.4612, + "step": 9622 + }, + { + "epoch": 1.23, + "grad_norm": 0.6570554203276916, + "learning_rate": 3.439421935615124e-06, + "loss": 0.4936, + "step": 9623 + }, + { + "epoch": 1.23, + "grad_norm": 0.7456937621873247, + "learning_rate": 3.4384419164126537e-06, + "loss": 0.5251, + "step": 9624 + }, + { + "epoch": 1.23, + "grad_norm": 0.6025230415683878, + "learning_rate": 3.4374619636807217e-06, + "loss": 0.4733, + "step": 9625 + }, + { + "epoch": 1.23, + "grad_norm": 0.7239543169751721, + "learning_rate": 3.4364820774610406e-06, + "loss": 0.5405, + "step": 9626 + }, + { + "epoch": 1.23, + "grad_norm": 0.6155250039409523, + "learning_rate": 3.4355022577953216e-06, + "loss": 0.4676, + "step": 9627 + }, + { + "epoch": 1.23, + "grad_norm": 1.212115193194442, + "learning_rate": 3.4345225047252718e-06, + "loss": 0.515, + "step": 9628 + }, + { + "epoch": 1.23, + "grad_norm": 0.5883823266749431, + "learning_rate": 3.4335428182925956e-06, + "loss": 0.4876, + "step": 9629 + }, + { + "epoch": 1.23, + "grad_norm": 0.6486140846555637, + "learning_rate": 3.4325631985389964e-06, + "loss": 0.4339, + "step": 9630 + }, + { + "epoch": 1.23, + "grad_norm": 0.6670372602311031, + "learning_rate": 3.4315836455061735e-06, + "loss": 0.4921, + "step": 9631 + }, + { + "epoch": 1.23, + "grad_norm": 0.6278492413799993, + "learning_rate": 3.4306041592358218e-06, + "loss": 0.5041, + "step": 9632 + }, + { + "epoch": 1.23, + "grad_norm": 0.5915076063710176, + "learning_rate": 3.429624739769637e-06, + "loss": 0.465, + "step": 9633 + }, + { + "epoch": 1.23, + "grad_norm": 0.7136725725353825, + "learning_rate": 3.4286453871493087e-06, + "loss": 0.5453, + "step": 9634 + }, + { + "epoch": 1.23, + "grad_norm": 1.3624164864499388, + "learning_rate": 3.4276661014165246e-06, + "loss": 0.5368, + "step": 9635 + }, + { + "epoch": 1.23, + "grad_norm": 0.747894990043783, + "learning_rate": 3.426686882612971e-06, + "loss": 0.5623, + "step": 9636 + }, + { + "epoch": 1.23, + "grad_norm": 0.7980325720582407, + "learning_rate": 3.4257077307803286e-06, + "loss": 0.562, + "step": 9637 + }, + { + "epoch": 1.23, + "grad_norm": 0.712089915023937, + "learning_rate": 3.4247286459602775e-06, + "loss": 0.4962, + "step": 9638 + }, + { + "epoch": 1.23, + "grad_norm": 0.5620463504382522, + "learning_rate": 3.4237496281944936e-06, + "loss": 0.3795, + "step": 9639 + }, + { + "epoch": 1.23, + "grad_norm": 0.6172752564627064, + "learning_rate": 3.4227706775246517e-06, + "loss": 0.4245, + "step": 9640 + }, + { + "epoch": 1.23, + "grad_norm": 0.609990838409253, + "learning_rate": 3.4217917939924204e-06, + "loss": 0.5065, + "step": 9641 + }, + { + "epoch": 1.23, + "grad_norm": 0.8053919550094233, + "learning_rate": 3.4208129776394712e-06, + "loss": 0.5541, + "step": 9642 + }, + { + "epoch": 1.23, + "grad_norm": 1.120625348506504, + "learning_rate": 3.4198342285074667e-06, + "loss": 0.5251, + "step": 9643 + }, + { + "epoch": 1.23, + "grad_norm": 0.9142736406870288, + "learning_rate": 3.418855546638069e-06, + "loss": 0.4557, + "step": 9644 + }, + { + "epoch": 1.23, + "grad_norm": 0.5845516104725537, + "learning_rate": 3.417876932072939e-06, + "loss": 0.4323, + "step": 9645 + }, + { + "epoch": 1.23, + "grad_norm": 0.7049217338081998, + "learning_rate": 3.4168983848537316e-06, + "loss": 0.4971, + "step": 9646 + }, + { + "epoch": 1.23, + "grad_norm": 0.7026243089837511, + "learning_rate": 3.4159199050221015e-06, + "loss": 0.4888, + "step": 9647 + }, + { + "epoch": 1.23, + "grad_norm": 0.6423647309746382, + "learning_rate": 3.414941492619699e-06, + "loss": 0.4698, + "step": 9648 + }, + { + "epoch": 1.23, + "grad_norm": 0.6964945513856139, + "learning_rate": 3.413963147688172e-06, + "loss": 0.5212, + "step": 9649 + }, + { + "epoch": 1.23, + "grad_norm": 0.6214252614056979, + "learning_rate": 3.4129848702691648e-06, + "loss": 0.4758, + "step": 9650 + }, + { + "epoch": 1.23, + "grad_norm": 0.7473654028416774, + "learning_rate": 3.412006660404321e-06, + "loss": 0.4981, + "step": 9651 + }, + { + "epoch": 1.23, + "grad_norm": 0.7338716613941693, + "learning_rate": 3.4110285181352787e-06, + "loss": 0.4483, + "step": 9652 + }, + { + "epoch": 1.23, + "grad_norm": 0.5979132549143013, + "learning_rate": 3.410050443503675e-06, + "loss": 0.4381, + "step": 9653 + }, + { + "epoch": 1.23, + "grad_norm": 0.6845570601229272, + "learning_rate": 3.409072436551142e-06, + "loss": 0.4958, + "step": 9654 + }, + { + "epoch": 1.23, + "grad_norm": 0.5776470081109305, + "learning_rate": 3.4080944973193127e-06, + "loss": 0.4978, + "step": 9655 + }, + { + "epoch": 1.23, + "grad_norm": 0.6818693753463683, + "learning_rate": 3.4071166258498134e-06, + "loss": 0.5582, + "step": 9656 + }, + { + "epoch": 1.23, + "grad_norm": 0.7405322976684284, + "learning_rate": 3.4061388221842694e-06, + "loss": 0.5394, + "step": 9657 + }, + { + "epoch": 1.23, + "grad_norm": 0.7630809837510244, + "learning_rate": 3.4051610863643026e-06, + "loss": 0.5184, + "step": 9658 + }, + { + "epoch": 1.23, + "grad_norm": 0.6240005113126648, + "learning_rate": 3.4041834184315325e-06, + "loss": 0.5185, + "step": 9659 + }, + { + "epoch": 1.23, + "grad_norm": 0.738748319660092, + "learning_rate": 3.403205818427574e-06, + "loss": 0.5198, + "step": 9660 + }, + { + "epoch": 1.23, + "grad_norm": 0.85820358644243, + "learning_rate": 3.4022282863940415e-06, + "loss": 0.5498, + "step": 9661 + }, + { + "epoch": 1.23, + "grad_norm": 0.7723464552274426, + "learning_rate": 3.401250822372545e-06, + "loss": 0.5401, + "step": 9662 + }, + { + "epoch": 1.23, + "grad_norm": 0.9412500161288903, + "learning_rate": 3.4002734264046923e-06, + "loss": 0.4923, + "step": 9663 + }, + { + "epoch": 1.23, + "grad_norm": 0.6688416186565718, + "learning_rate": 3.3992960985320877e-06, + "loss": 0.4637, + "step": 9664 + }, + { + "epoch": 1.23, + "grad_norm": 0.6901652374288079, + "learning_rate": 3.3983188387963334e-06, + "loss": 0.4753, + "step": 9665 + }, + { + "epoch": 1.23, + "grad_norm": 0.6530480862320649, + "learning_rate": 3.397341647239025e-06, + "loss": 0.5008, + "step": 9666 + }, + { + "epoch": 1.23, + "grad_norm": 0.7884568324014779, + "learning_rate": 3.3963645239017644e-06, + "loss": 0.5431, + "step": 9667 + }, + { + "epoch": 1.23, + "grad_norm": 0.8573859812149741, + "learning_rate": 3.3953874688261413e-06, + "loss": 0.49, + "step": 9668 + }, + { + "epoch": 1.23, + "grad_norm": 1.2110211371568325, + "learning_rate": 3.3944104820537453e-06, + "loss": 0.5374, + "step": 9669 + }, + { + "epoch": 1.23, + "grad_norm": 0.6887016607639445, + "learning_rate": 3.393433563626165e-06, + "loss": 0.4968, + "step": 9670 + }, + { + "epoch": 1.23, + "grad_norm": 0.754896727447843, + "learning_rate": 3.392456713584984e-06, + "loss": 0.5469, + "step": 9671 + }, + { + "epoch": 1.23, + "grad_norm": 0.7532981159357144, + "learning_rate": 3.3914799319717837e-06, + "loss": 0.5089, + "step": 9672 + }, + { + "epoch": 1.23, + "grad_norm": 0.7393912423406087, + "learning_rate": 3.390503218828143e-06, + "loss": 0.5416, + "step": 9673 + }, + { + "epoch": 1.23, + "grad_norm": 0.8085606629083892, + "learning_rate": 3.389526574195636e-06, + "loss": 0.5285, + "step": 9674 + }, + { + "epoch": 1.23, + "grad_norm": 0.6091851025976537, + "learning_rate": 3.388549998115837e-06, + "loss": 0.4232, + "step": 9675 + }, + { + "epoch": 1.23, + "grad_norm": 0.5707818476742368, + "learning_rate": 3.3875734906303146e-06, + "loss": 0.4511, + "step": 9676 + }, + { + "epoch": 1.23, + "grad_norm": 0.7227538980858315, + "learning_rate": 3.386597051780637e-06, + "loss": 0.4839, + "step": 9677 + }, + { + "epoch": 1.23, + "grad_norm": 0.6104563736198384, + "learning_rate": 3.3856206816083647e-06, + "loss": 0.4512, + "step": 9678 + }, + { + "epoch": 1.23, + "grad_norm": 0.7664101677574381, + "learning_rate": 3.384644380155063e-06, + "loss": 0.4882, + "step": 9679 + }, + { + "epoch": 1.23, + "grad_norm": 0.7209889836951127, + "learning_rate": 3.383668147462289e-06, + "loss": 0.5079, + "step": 9680 + }, + { + "epoch": 1.23, + "grad_norm": 0.668217285319877, + "learning_rate": 3.3826919835715965e-06, + "loss": 0.4372, + "step": 9681 + }, + { + "epoch": 1.23, + "grad_norm": 0.6373655781152588, + "learning_rate": 3.3817158885245376e-06, + "loss": 0.5166, + "step": 9682 + }, + { + "epoch": 1.23, + "grad_norm": 0.9074841743386916, + "learning_rate": 3.380739862362662e-06, + "loss": 0.5054, + "step": 9683 + }, + { + "epoch": 1.23, + "grad_norm": 0.7556160265804783, + "learning_rate": 3.3797639051275166e-06, + "loss": 0.5402, + "step": 9684 + }, + { + "epoch": 1.23, + "grad_norm": 0.7500841576864383, + "learning_rate": 3.3787880168606442e-06, + "loss": 0.4938, + "step": 9685 + }, + { + "epoch": 1.23, + "grad_norm": 0.6651220283663039, + "learning_rate": 3.377812197603585e-06, + "loss": 0.4682, + "step": 9686 + }, + { + "epoch": 1.23, + "grad_norm": 0.6644771417163498, + "learning_rate": 3.3768364473978766e-06, + "loss": 0.4449, + "step": 9687 + }, + { + "epoch": 1.23, + "grad_norm": 0.5885818079786101, + "learning_rate": 3.375860766285054e-06, + "loss": 0.438, + "step": 9688 + }, + { + "epoch": 1.23, + "grad_norm": 0.6493892725602527, + "learning_rate": 3.3748851543066486e-06, + "loss": 0.4498, + "step": 9689 + }, + { + "epoch": 1.23, + "grad_norm": 0.5903591044995382, + "learning_rate": 3.373909611504189e-06, + "loss": 0.4344, + "step": 9690 + }, + { + "epoch": 1.23, + "grad_norm": 0.8475285021136163, + "learning_rate": 3.3729341379192006e-06, + "loss": 0.4975, + "step": 9691 + }, + { + "epoch": 1.23, + "grad_norm": 0.6043653900253609, + "learning_rate": 3.371958733593207e-06, + "loss": 0.4319, + "step": 9692 + }, + { + "epoch": 1.23, + "grad_norm": 0.5772567621284745, + "learning_rate": 3.370983398567727e-06, + "loss": 0.4342, + "step": 9693 + }, + { + "epoch": 1.23, + "grad_norm": 0.6100995651157373, + "learning_rate": 3.370008132884278e-06, + "loss": 0.4757, + "step": 9694 + }, + { + "epoch": 1.24, + "grad_norm": 0.5980289280900019, + "learning_rate": 3.3690329365843743e-06, + "loss": 0.4743, + "step": 9695 + }, + { + "epoch": 1.24, + "grad_norm": 0.8002261870219475, + "learning_rate": 3.3680578097095275e-06, + "loss": 0.4867, + "step": 9696 + }, + { + "epoch": 1.24, + "grad_norm": 0.7843917144754352, + "learning_rate": 3.3670827523012443e-06, + "loss": 0.4666, + "step": 9697 + }, + { + "epoch": 1.24, + "grad_norm": 0.8952009648199384, + "learning_rate": 3.3661077644010308e-06, + "loss": 0.5205, + "step": 9698 + }, + { + "epoch": 1.24, + "grad_norm": 1.006639819442437, + "learning_rate": 3.3651328460503884e-06, + "loss": 0.5611, + "step": 9699 + }, + { + "epoch": 1.24, + "grad_norm": 0.8166682729650561, + "learning_rate": 3.364157997290816e-06, + "loss": 0.5293, + "step": 9700 + }, + { + "epoch": 1.24, + "grad_norm": 0.6763966588639446, + "learning_rate": 3.363183218163811e-06, + "loss": 0.5146, + "step": 9701 + }, + { + "epoch": 1.24, + "grad_norm": 0.7478200352706385, + "learning_rate": 3.3622085087108653e-06, + "loss": 0.5068, + "step": 9702 + }, + { + "epoch": 1.24, + "grad_norm": 0.674491227335443, + "learning_rate": 3.3612338689734693e-06, + "loss": 0.444, + "step": 9703 + }, + { + "epoch": 1.24, + "grad_norm": 0.618147605645662, + "learning_rate": 3.360259298993111e-06, + "loss": 0.4791, + "step": 9704 + }, + { + "epoch": 1.24, + "grad_norm": 0.8377174390100891, + "learning_rate": 3.359284798811275e-06, + "loss": 0.5463, + "step": 9705 + }, + { + "epoch": 1.24, + "grad_norm": 0.5628948708384927, + "learning_rate": 3.3583103684694418e-06, + "loss": 0.4061, + "step": 9706 + }, + { + "epoch": 1.24, + "grad_norm": 0.654613670938272, + "learning_rate": 3.35733600800909e-06, + "loss": 0.3981, + "step": 9707 + }, + { + "epoch": 1.24, + "grad_norm": 0.6085584585489507, + "learning_rate": 3.3563617174716954e-06, + "loss": 0.4479, + "step": 9708 + }, + { + "epoch": 1.24, + "grad_norm": 0.6910659131927186, + "learning_rate": 3.3553874968987304e-06, + "loss": 0.4891, + "step": 9709 + }, + { + "epoch": 1.24, + "grad_norm": 0.7340759927453796, + "learning_rate": 3.354413346331664e-06, + "loss": 0.5156, + "step": 9710 + }, + { + "epoch": 1.24, + "grad_norm": 0.7055322321284114, + "learning_rate": 3.353439265811963e-06, + "loss": 0.5611, + "step": 9711 + }, + { + "epoch": 1.24, + "grad_norm": 0.7413780243438631, + "learning_rate": 3.352465255381091e-06, + "loss": 0.5525, + "step": 9712 + }, + { + "epoch": 1.24, + "grad_norm": 0.7415240845122817, + "learning_rate": 3.351491315080508e-06, + "loss": 0.5088, + "step": 9713 + }, + { + "epoch": 1.24, + "grad_norm": 0.6062353174867325, + "learning_rate": 3.350517444951672e-06, + "loss": 0.4666, + "step": 9714 + }, + { + "epoch": 1.24, + "grad_norm": 0.5959660815935528, + "learning_rate": 3.3495436450360375e-06, + "loss": 0.5561, + "step": 9715 + }, + { + "epoch": 1.24, + "grad_norm": 0.795469707114058, + "learning_rate": 3.348569915375053e-06, + "loss": 0.6284, + "step": 9716 + }, + { + "epoch": 1.24, + "grad_norm": 1.122140338181277, + "learning_rate": 3.3475962560101728e-06, + "loss": 0.5122, + "step": 9717 + }, + { + "epoch": 1.24, + "grad_norm": 0.7428464969622799, + "learning_rate": 3.3466226669828396e-06, + "loss": 0.5342, + "step": 9718 + }, + { + "epoch": 1.24, + "grad_norm": 0.5913019605528251, + "learning_rate": 3.345649148334496e-06, + "loss": 0.4627, + "step": 9719 + }, + { + "epoch": 1.24, + "grad_norm": 0.7312336653461774, + "learning_rate": 3.3446757001065816e-06, + "loss": 0.5261, + "step": 9720 + }, + { + "epoch": 1.24, + "grad_norm": 0.7278513619582827, + "learning_rate": 3.343702322340533e-06, + "loss": 0.5114, + "step": 9721 + }, + { + "epoch": 1.24, + "grad_norm": 0.7438463711809817, + "learning_rate": 3.3427290150777836e-06, + "loss": 0.548, + "step": 9722 + }, + { + "epoch": 1.24, + "grad_norm": 0.6437042828354095, + "learning_rate": 3.3417557783597643e-06, + "loss": 0.4669, + "step": 9723 + }, + { + "epoch": 1.24, + "grad_norm": 0.7360270876315178, + "learning_rate": 3.340782612227903e-06, + "loss": 0.5099, + "step": 9724 + }, + { + "epoch": 1.24, + "grad_norm": 0.838277362309171, + "learning_rate": 3.339809516723623e-06, + "loss": 0.4796, + "step": 9725 + }, + { + "epoch": 1.24, + "grad_norm": 0.5744679583378896, + "learning_rate": 3.338836491888347e-06, + "loss": 0.4733, + "step": 9726 + }, + { + "epoch": 1.24, + "grad_norm": 0.6886928918535186, + "learning_rate": 3.3378635377634927e-06, + "loss": 0.5025, + "step": 9727 + }, + { + "epoch": 1.24, + "grad_norm": 0.8364803837127899, + "learning_rate": 3.336890654390475e-06, + "loss": 0.4694, + "step": 9728 + }, + { + "epoch": 1.24, + "grad_norm": 0.6527493447813238, + "learning_rate": 3.335917841810709e-06, + "loss": 0.435, + "step": 9729 + }, + { + "epoch": 1.24, + "grad_norm": 0.6360823743572565, + "learning_rate": 3.334945100065603e-06, + "loss": 0.4681, + "step": 9730 + }, + { + "epoch": 1.24, + "grad_norm": 0.7511257876206643, + "learning_rate": 3.3339724291965637e-06, + "loss": 0.5205, + "step": 9731 + }, + { + "epoch": 1.24, + "grad_norm": 0.6934778456312952, + "learning_rate": 3.332999829244994e-06, + "loss": 0.5217, + "step": 9732 + }, + { + "epoch": 1.24, + "grad_norm": 0.7043104721963191, + "learning_rate": 3.3320273002522947e-06, + "loss": 0.4897, + "step": 9733 + }, + { + "epoch": 1.24, + "grad_norm": 0.5824895233477628, + "learning_rate": 3.331054842259863e-06, + "loss": 0.4755, + "step": 9734 + }, + { + "epoch": 1.24, + "grad_norm": 0.5520456878069466, + "learning_rate": 3.3300824553090934e-06, + "loss": 0.4655, + "step": 9735 + }, + { + "epoch": 1.24, + "grad_norm": 0.6506681488520155, + "learning_rate": 3.329110139441377e-06, + "loss": 0.4351, + "step": 9736 + }, + { + "epoch": 1.24, + "grad_norm": 0.5785868482380239, + "learning_rate": 3.3281378946981034e-06, + "loss": 0.4369, + "step": 9737 + }, + { + "epoch": 1.24, + "grad_norm": 0.5986856199645442, + "learning_rate": 3.3271657211206566e-06, + "loss": 0.4277, + "step": 9738 + }, + { + "epoch": 1.24, + "grad_norm": 0.5732413644115761, + "learning_rate": 3.3261936187504195e-06, + "loss": 0.424, + "step": 9739 + }, + { + "epoch": 1.24, + "grad_norm": 0.6397259322602841, + "learning_rate": 3.3252215876287695e-06, + "loss": 0.4793, + "step": 9740 + }, + { + "epoch": 1.24, + "grad_norm": 0.7759409451260584, + "learning_rate": 3.3242496277970872e-06, + "loss": 0.4811, + "step": 9741 + }, + { + "epoch": 1.24, + "grad_norm": 0.6419254316919619, + "learning_rate": 3.3232777392967417e-06, + "loss": 0.4914, + "step": 9742 + }, + { + "epoch": 1.24, + "grad_norm": 0.6695568145934685, + "learning_rate": 3.3223059221691063e-06, + "loss": 0.4765, + "step": 9743 + }, + { + "epoch": 1.24, + "grad_norm": 0.6953302181373358, + "learning_rate": 3.3213341764555463e-06, + "loss": 0.5149, + "step": 9744 + }, + { + "epoch": 1.24, + "grad_norm": 1.2427653108082664, + "learning_rate": 3.320362502197426e-06, + "loss": 0.5687, + "step": 9745 + }, + { + "epoch": 1.24, + "grad_norm": 0.6219528949720443, + "learning_rate": 3.319390899436107e-06, + "loss": 0.4848, + "step": 9746 + }, + { + "epoch": 1.24, + "grad_norm": 0.5871854366235867, + "learning_rate": 3.318419368212946e-06, + "loss": 0.4381, + "step": 9747 + }, + { + "epoch": 1.24, + "grad_norm": 0.6486799695735382, + "learning_rate": 3.3174479085692997e-06, + "loss": 0.4834, + "step": 9748 + }, + { + "epoch": 1.24, + "grad_norm": 0.7546057165321423, + "learning_rate": 3.3164765205465187e-06, + "loss": 0.5192, + "step": 9749 + }, + { + "epoch": 1.24, + "grad_norm": 1.1427536234375142, + "learning_rate": 3.315505204185953e-06, + "loss": 0.5416, + "step": 9750 + }, + { + "epoch": 1.24, + "grad_norm": 0.717578958570655, + "learning_rate": 3.314533959528948e-06, + "loss": 0.522, + "step": 9751 + }, + { + "epoch": 1.24, + "grad_norm": 0.7899855094305798, + "learning_rate": 3.3135627866168465e-06, + "loss": 0.5027, + "step": 9752 + }, + { + "epoch": 1.24, + "grad_norm": 0.7555347788382839, + "learning_rate": 3.312591685490986e-06, + "loss": 0.5021, + "step": 9753 + }, + { + "epoch": 1.24, + "grad_norm": 0.591234827869523, + "learning_rate": 3.3116206561927074e-06, + "loss": 0.4164, + "step": 9754 + }, + { + "epoch": 1.24, + "grad_norm": 0.5936487336478009, + "learning_rate": 3.3106496987633408e-06, + "loss": 0.4967, + "step": 9755 + }, + { + "epoch": 1.24, + "grad_norm": 0.8057824222276049, + "learning_rate": 3.3096788132442186e-06, + "loss": 0.5461, + "step": 9756 + }, + { + "epoch": 1.24, + "grad_norm": 0.6994944841147624, + "learning_rate": 3.308707999676668e-06, + "loss": 0.5093, + "step": 9757 + }, + { + "epoch": 1.24, + "grad_norm": 0.6242269304173834, + "learning_rate": 3.307737258102014e-06, + "loss": 0.4843, + "step": 9758 + }, + { + "epoch": 1.24, + "grad_norm": 0.6617062699804608, + "learning_rate": 3.3067665885615762e-06, + "loss": 0.4927, + "step": 9759 + }, + { + "epoch": 1.24, + "grad_norm": 0.692744029806587, + "learning_rate": 3.3057959910966754e-06, + "loss": 0.5064, + "step": 9760 + }, + { + "epoch": 1.24, + "grad_norm": 0.7759133672417525, + "learning_rate": 3.304825465748624e-06, + "loss": 0.5223, + "step": 9761 + }, + { + "epoch": 1.24, + "grad_norm": 0.7373400225765957, + "learning_rate": 3.303855012558736e-06, + "loss": 0.4861, + "step": 9762 + }, + { + "epoch": 1.24, + "grad_norm": 0.5928736253734711, + "learning_rate": 3.30288463156832e-06, + "loss": 0.4668, + "step": 9763 + }, + { + "epoch": 1.24, + "grad_norm": 0.6188388607574133, + "learning_rate": 3.3019143228186825e-06, + "loss": 0.4836, + "step": 9764 + }, + { + "epoch": 1.24, + "grad_norm": 0.5747153962949537, + "learning_rate": 3.300944086351124e-06, + "loss": 0.4496, + "step": 9765 + }, + { + "epoch": 1.24, + "grad_norm": 0.5840292329822466, + "learning_rate": 3.2999739222069477e-06, + "loss": 0.4171, + "step": 9766 + }, + { + "epoch": 1.24, + "grad_norm": 0.6775475721623886, + "learning_rate": 3.299003830427449e-06, + "loss": 0.5136, + "step": 9767 + }, + { + "epoch": 1.24, + "grad_norm": 0.8411364293719691, + "learning_rate": 3.2980338110539225e-06, + "loss": 0.495, + "step": 9768 + }, + { + "epoch": 1.24, + "grad_norm": 0.6453161014766059, + "learning_rate": 3.2970638641276563e-06, + "loss": 0.4473, + "step": 9769 + }, + { + "epoch": 1.24, + "grad_norm": 0.5595071340611592, + "learning_rate": 3.296093989689941e-06, + "loss": 0.4707, + "step": 9770 + }, + { + "epoch": 1.24, + "grad_norm": 0.7997718759840764, + "learning_rate": 3.2951241877820594e-06, + "loss": 0.5121, + "step": 9771 + }, + { + "epoch": 1.24, + "grad_norm": 0.5763401014143, + "learning_rate": 3.2941544584452928e-06, + "loss": 0.4461, + "step": 9772 + }, + { + "epoch": 1.25, + "grad_norm": 0.6561222903062404, + "learning_rate": 3.2931848017209202e-06, + "loss": 0.4714, + "step": 9773 + }, + { + "epoch": 1.25, + "grad_norm": 0.5806328696962216, + "learning_rate": 3.292215217650215e-06, + "loss": 0.47, + "step": 9774 + }, + { + "epoch": 1.25, + "grad_norm": 0.6387932215629026, + "learning_rate": 3.2912457062744526e-06, + "loss": 0.4732, + "step": 9775 + }, + { + "epoch": 1.25, + "grad_norm": 0.6183663877210778, + "learning_rate": 3.290276267634899e-06, + "loss": 0.4013, + "step": 9776 + }, + { + "epoch": 1.25, + "grad_norm": 0.6911524512861753, + "learning_rate": 3.289306901772822e-06, + "loss": 0.4614, + "step": 9777 + }, + { + "epoch": 1.25, + "grad_norm": 0.6957098179567416, + "learning_rate": 3.2883376087294804e-06, + "loss": 0.4842, + "step": 9778 + }, + { + "epoch": 1.25, + "grad_norm": 0.8892080228449, + "learning_rate": 3.28736838854614e-06, + "loss": 0.5612, + "step": 9779 + }, + { + "epoch": 1.25, + "grad_norm": 0.6049457145813931, + "learning_rate": 3.2863992412640546e-06, + "loss": 0.449, + "step": 9780 + }, + { + "epoch": 1.25, + "grad_norm": 0.6271762013024282, + "learning_rate": 3.285430166924477e-06, + "loss": 0.4191, + "step": 9781 + }, + { + "epoch": 1.25, + "grad_norm": 0.610450083761308, + "learning_rate": 3.2844611655686586e-06, + "loss": 0.5055, + "step": 9782 + }, + { + "epoch": 1.25, + "grad_norm": 0.8269049664396605, + "learning_rate": 3.2834922372378465e-06, + "loss": 0.5623, + "step": 9783 + }, + { + "epoch": 1.25, + "grad_norm": 0.7516688504708258, + "learning_rate": 3.282523381973284e-06, + "loss": 0.5023, + "step": 9784 + }, + { + "epoch": 1.25, + "grad_norm": 0.617465779449233, + "learning_rate": 3.2815545998162134e-06, + "loss": 0.4418, + "step": 9785 + }, + { + "epoch": 1.25, + "grad_norm": 0.7218667417046295, + "learning_rate": 3.280585890807872e-06, + "loss": 0.5132, + "step": 9786 + }, + { + "epoch": 1.25, + "grad_norm": 0.7065332850665154, + "learning_rate": 3.2796172549894946e-06, + "loss": 0.5278, + "step": 9787 + }, + { + "epoch": 1.25, + "grad_norm": 0.7764326359182376, + "learning_rate": 3.278648692402313e-06, + "loss": 0.4591, + "step": 9788 + }, + { + "epoch": 1.25, + "grad_norm": 0.8430399155602538, + "learning_rate": 3.2776802030875564e-06, + "loss": 0.5827, + "step": 9789 + }, + { + "epoch": 1.25, + "grad_norm": 0.858285110893183, + "learning_rate": 3.2767117870864473e-06, + "loss": 0.4886, + "step": 9790 + }, + { + "epoch": 1.25, + "grad_norm": 0.6045502596390667, + "learning_rate": 3.275743444440213e-06, + "loss": 0.4728, + "step": 9791 + }, + { + "epoch": 1.25, + "grad_norm": 0.6783595771771905, + "learning_rate": 3.27477517519007e-06, + "loss": 0.5057, + "step": 9792 + }, + { + "epoch": 1.25, + "grad_norm": 2.369519675583337, + "learning_rate": 3.2738069793772353e-06, + "loss": 0.523, + "step": 9793 + }, + { + "epoch": 1.25, + "grad_norm": 0.8034220648985635, + "learning_rate": 3.272838857042921e-06, + "loss": 0.503, + "step": 9794 + }, + { + "epoch": 1.25, + "grad_norm": 0.5758599652496723, + "learning_rate": 3.2718708082283367e-06, + "loss": 0.4371, + "step": 9795 + }, + { + "epoch": 1.25, + "grad_norm": 0.6405399937475216, + "learning_rate": 3.2709028329746905e-06, + "loss": 0.4621, + "step": 9796 + }, + { + "epoch": 1.25, + "grad_norm": 0.8655253624930225, + "learning_rate": 3.2699349313231854e-06, + "loss": 0.5093, + "step": 9797 + }, + { + "epoch": 1.25, + "grad_norm": 0.6035259467451702, + "learning_rate": 3.2689671033150215e-06, + "loss": 0.465, + "step": 9798 + }, + { + "epoch": 1.25, + "grad_norm": 0.6813695313711089, + "learning_rate": 3.267999348991397e-06, + "loss": 0.4885, + "step": 9799 + }, + { + "epoch": 1.25, + "grad_norm": 0.7322371401325936, + "learning_rate": 3.2670316683935047e-06, + "loss": 0.5128, + "step": 9800 + }, + { + "epoch": 1.25, + "grad_norm": 0.5816729474457779, + "learning_rate": 3.2660640615625374e-06, + "loss": 0.4603, + "step": 9801 + }, + { + "epoch": 1.25, + "grad_norm": 0.621793815698565, + "learning_rate": 3.26509652853968e-06, + "loss": 0.5114, + "step": 9802 + }, + { + "epoch": 1.25, + "grad_norm": 0.6550650746031575, + "learning_rate": 3.26412906936612e-06, + "loss": 0.5285, + "step": 9803 + }, + { + "epoch": 1.25, + "grad_norm": 0.744118183620498, + "learning_rate": 3.263161684083039e-06, + "loss": 0.5931, + "step": 9804 + }, + { + "epoch": 1.25, + "grad_norm": 0.7480824190284789, + "learning_rate": 3.2621943727316164e-06, + "loss": 0.5122, + "step": 9805 + }, + { + "epoch": 1.25, + "grad_norm": 0.6299208935569788, + "learning_rate": 3.261227135353025e-06, + "loss": 0.4177, + "step": 9806 + }, + { + "epoch": 1.25, + "grad_norm": 0.6400414694783045, + "learning_rate": 3.260259971988438e-06, + "loss": 0.4496, + "step": 9807 + }, + { + "epoch": 1.25, + "grad_norm": 0.5860832921444632, + "learning_rate": 3.259292882679025e-06, + "loss": 0.421, + "step": 9808 + }, + { + "epoch": 1.25, + "grad_norm": 0.620047916547074, + "learning_rate": 3.2583258674659513e-06, + "loss": 0.4473, + "step": 9809 + }, + { + "epoch": 1.25, + "grad_norm": 0.5974931480253365, + "learning_rate": 3.2573589263903803e-06, + "loss": 0.4503, + "step": 9810 + }, + { + "epoch": 1.25, + "grad_norm": 0.6033952382299103, + "learning_rate": 3.2563920594934708e-06, + "loss": 0.4677, + "step": 9811 + }, + { + "epoch": 1.25, + "grad_norm": 0.7231217390976792, + "learning_rate": 3.255425266816379e-06, + "loss": 0.4535, + "step": 9812 + }, + { + "epoch": 1.25, + "grad_norm": 0.7748083867869024, + "learning_rate": 3.254458548400259e-06, + "loss": 0.5228, + "step": 9813 + }, + { + "epoch": 1.25, + "grad_norm": 0.6604880883838006, + "learning_rate": 3.2534919042862613e-06, + "loss": 0.4374, + "step": 9814 + }, + { + "epoch": 1.25, + "grad_norm": 0.7983307918453107, + "learning_rate": 3.2525253345155305e-06, + "loss": 0.5569, + "step": 9815 + }, + { + "epoch": 1.25, + "grad_norm": 0.7923631059098096, + "learning_rate": 3.251558839129213e-06, + "loss": 0.5344, + "step": 9816 + }, + { + "epoch": 1.25, + "grad_norm": 0.6550914907641446, + "learning_rate": 3.250592418168448e-06, + "loss": 0.434, + "step": 9817 + }, + { + "epoch": 1.25, + "grad_norm": 0.576047407488201, + "learning_rate": 3.249626071674374e-06, + "loss": 0.4108, + "step": 9818 + }, + { + "epoch": 1.25, + "grad_norm": 0.6019748511060291, + "learning_rate": 3.2486597996881252e-06, + "loss": 0.479, + "step": 9819 + }, + { + "epoch": 1.25, + "grad_norm": 0.8879563645186039, + "learning_rate": 3.247693602250832e-06, + "loss": 0.5408, + "step": 9820 + }, + { + "epoch": 1.25, + "grad_norm": 0.5803954889613333, + "learning_rate": 3.246727479403623e-06, + "loss": 0.4688, + "step": 9821 + }, + { + "epoch": 1.25, + "grad_norm": 0.7346962074704275, + "learning_rate": 3.245761431187622e-06, + "loss": 0.5561, + "step": 9822 + }, + { + "epoch": 1.25, + "grad_norm": 2.6534656074696925, + "learning_rate": 3.244795457643951e-06, + "loss": 0.5511, + "step": 9823 + }, + { + "epoch": 1.25, + "grad_norm": 0.6963013815243809, + "learning_rate": 3.2438295588137293e-06, + "loss": 0.5389, + "step": 9824 + }, + { + "epoch": 1.25, + "grad_norm": 0.7117631549899102, + "learning_rate": 3.2428637347380705e-06, + "loss": 0.5052, + "step": 9825 + }, + { + "epoch": 1.25, + "grad_norm": 0.6773070660701833, + "learning_rate": 3.2418979854580877e-06, + "loss": 0.489, + "step": 9826 + }, + { + "epoch": 1.25, + "grad_norm": 0.6941230596410103, + "learning_rate": 3.240932311014889e-06, + "loss": 0.4813, + "step": 9827 + }, + { + "epoch": 1.25, + "grad_norm": 0.6645045544506429, + "learning_rate": 3.2399667114495813e-06, + "loss": 0.467, + "step": 9828 + }, + { + "epoch": 1.25, + "grad_norm": 0.8965847741915173, + "learning_rate": 3.239001186803267e-06, + "loss": 0.5227, + "step": 9829 + }, + { + "epoch": 1.25, + "grad_norm": 0.8943685918586383, + "learning_rate": 3.238035737117044e-06, + "loss": 0.5711, + "step": 9830 + }, + { + "epoch": 1.25, + "grad_norm": 0.8220622806689967, + "learning_rate": 3.23707036243201e-06, + "loss": 0.5216, + "step": 9831 + }, + { + "epoch": 1.25, + "grad_norm": 1.226829334532521, + "learning_rate": 3.2361050627892566e-06, + "loss": 0.5305, + "step": 9832 + }, + { + "epoch": 1.25, + "grad_norm": 0.7892365585912237, + "learning_rate": 3.2351398382298738e-06, + "loss": 0.4717, + "step": 9833 + }, + { + "epoch": 1.25, + "grad_norm": 0.6274119973413937, + "learning_rate": 3.234174688794949e-06, + "loss": 0.4727, + "step": 9834 + }, + { + "epoch": 1.25, + "grad_norm": 0.8326464561556434, + "learning_rate": 3.2332096145255646e-06, + "loss": 0.4679, + "step": 9835 + }, + { + "epoch": 1.25, + "grad_norm": 0.6593812566326863, + "learning_rate": 3.2322446154628013e-06, + "loss": 0.4642, + "step": 9836 + }, + { + "epoch": 1.25, + "grad_norm": 0.6184796831804252, + "learning_rate": 3.231279691647736e-06, + "loss": 0.4214, + "step": 9837 + }, + { + "epoch": 1.25, + "grad_norm": 0.644655182878305, + "learning_rate": 3.2303148431214416e-06, + "loss": 0.4533, + "step": 9838 + }, + { + "epoch": 1.25, + "grad_norm": 0.6518594687625032, + "learning_rate": 3.229350069924989e-06, + "loss": 0.497, + "step": 9839 + }, + { + "epoch": 1.25, + "grad_norm": 0.6960857625066758, + "learning_rate": 3.2283853720994444e-06, + "loss": 0.4705, + "step": 9840 + }, + { + "epoch": 1.25, + "grad_norm": 0.6121445164106195, + "learning_rate": 3.2274207496858756e-06, + "loss": 0.4997, + "step": 9841 + }, + { + "epoch": 1.25, + "grad_norm": 0.7472159977548641, + "learning_rate": 3.2264562027253402e-06, + "loss": 0.4947, + "step": 9842 + }, + { + "epoch": 1.25, + "grad_norm": 0.6558324530578811, + "learning_rate": 3.2254917312588974e-06, + "loss": 0.4478, + "step": 9843 + }, + { + "epoch": 1.25, + "grad_norm": 0.7160388078735583, + "learning_rate": 3.224527335327601e-06, + "loss": 0.51, + "step": 9844 + }, + { + "epoch": 1.25, + "grad_norm": 0.7537417454398606, + "learning_rate": 3.223563014972502e-06, + "loss": 0.5289, + "step": 9845 + }, + { + "epoch": 1.25, + "grad_norm": 0.6576783491403214, + "learning_rate": 3.2225987702346496e-06, + "loss": 0.5203, + "step": 9846 + }, + { + "epoch": 1.25, + "grad_norm": 0.7367553386767534, + "learning_rate": 3.221634601155087e-06, + "loss": 0.5654, + "step": 9847 + }, + { + "epoch": 1.25, + "grad_norm": 0.8067918691951811, + "learning_rate": 3.2206705077748566e-06, + "loss": 0.5808, + "step": 9848 + }, + { + "epoch": 1.25, + "grad_norm": 0.7643784840076346, + "learning_rate": 3.2197064901349975e-06, + "loss": 0.5053, + "step": 9849 + }, + { + "epoch": 1.25, + "grad_norm": 0.7753689071548957, + "learning_rate": 3.2187425482765435e-06, + "loss": 0.4938, + "step": 9850 + }, + { + "epoch": 1.25, + "grad_norm": 0.8013427070149979, + "learning_rate": 3.2177786822405267e-06, + "loss": 0.5056, + "step": 9851 + }, + { + "epoch": 1.26, + "grad_norm": 0.6690309837945828, + "learning_rate": 3.216814892067975e-06, + "loss": 0.4412, + "step": 9852 + }, + { + "epoch": 1.26, + "grad_norm": 0.6420970393285401, + "learning_rate": 3.215851177799917e-06, + "loss": 0.4808, + "step": 9853 + }, + { + "epoch": 1.26, + "grad_norm": 0.6855795059049861, + "learning_rate": 3.2148875394773727e-06, + "loss": 0.5056, + "step": 9854 + }, + { + "epoch": 1.26, + "grad_norm": 0.713746628553741, + "learning_rate": 3.2139239771413616e-06, + "loss": 0.4596, + "step": 9855 + }, + { + "epoch": 1.26, + "grad_norm": 0.7462333994610545, + "learning_rate": 3.2129604908328994e-06, + "loss": 0.5092, + "step": 9856 + }, + { + "epoch": 1.26, + "grad_norm": 0.7056654666830612, + "learning_rate": 3.2119970805929984e-06, + "loss": 0.4592, + "step": 9857 + }, + { + "epoch": 1.26, + "grad_norm": 0.5793630850885867, + "learning_rate": 3.2110337464626685e-06, + "loss": 0.4909, + "step": 9858 + }, + { + "epoch": 1.26, + "grad_norm": 0.628451931771063, + "learning_rate": 3.2100704884829147e-06, + "loss": 0.4842, + "step": 9859 + }, + { + "epoch": 1.26, + "grad_norm": 0.6442462346622816, + "learning_rate": 3.209107306694741e-06, + "loss": 0.5274, + "step": 9860 + }, + { + "epoch": 1.26, + "grad_norm": 1.6686654545925466, + "learning_rate": 3.2081442011391455e-06, + "loss": 0.5206, + "step": 9861 + }, + { + "epoch": 1.26, + "grad_norm": 0.5829889021833834, + "learning_rate": 3.207181171857126e-06, + "loss": 0.491, + "step": 9862 + }, + { + "epoch": 1.26, + "grad_norm": 0.7432435731302618, + "learning_rate": 3.2062182188896753e-06, + "loss": 0.533, + "step": 9863 + }, + { + "epoch": 1.26, + "grad_norm": 0.7329897402527397, + "learning_rate": 3.20525534227778e-06, + "loss": 0.5442, + "step": 9864 + }, + { + "epoch": 1.26, + "grad_norm": 0.7453179893366322, + "learning_rate": 3.204292542062432e-06, + "loss": 0.5029, + "step": 9865 + }, + { + "epoch": 1.26, + "grad_norm": 0.6339286054803679, + "learning_rate": 3.2033298182846122e-06, + "loss": 0.4973, + "step": 9866 + }, + { + "epoch": 1.26, + "grad_norm": 0.7891009532643618, + "learning_rate": 3.202367170985301e-06, + "loss": 0.4952, + "step": 9867 + }, + { + "epoch": 1.26, + "grad_norm": 0.5489119562476963, + "learning_rate": 3.201404600205475e-06, + "loss": 0.4355, + "step": 9868 + }, + { + "epoch": 1.26, + "grad_norm": 0.697021613953802, + "learning_rate": 3.2004421059861073e-06, + "loss": 0.4533, + "step": 9869 + }, + { + "epoch": 1.26, + "grad_norm": 0.5738472127371834, + "learning_rate": 3.199479688368169e-06, + "loss": 0.4254, + "step": 9870 + }, + { + "epoch": 1.26, + "grad_norm": 0.7585878860833483, + "learning_rate": 3.1985173473926263e-06, + "loss": 0.5234, + "step": 9871 + }, + { + "epoch": 1.26, + "grad_norm": 0.7739078301838713, + "learning_rate": 3.197555083100443e-06, + "loss": 0.5388, + "step": 9872 + }, + { + "epoch": 1.26, + "grad_norm": 0.6693094729705964, + "learning_rate": 3.196592895532581e-06, + "loss": 0.4792, + "step": 9873 + }, + { + "epoch": 1.26, + "grad_norm": 0.5724523216615389, + "learning_rate": 3.195630784729996e-06, + "loss": 0.4429, + "step": 9874 + }, + { + "epoch": 1.26, + "grad_norm": 0.6009674516129981, + "learning_rate": 3.1946687507336428e-06, + "loss": 0.5281, + "step": 9875 + }, + { + "epoch": 1.26, + "grad_norm": 0.8148113779397542, + "learning_rate": 3.1937067935844718e-06, + "loss": 0.553, + "step": 9876 + }, + { + "epoch": 1.26, + "grad_norm": 0.7831115003604711, + "learning_rate": 3.1927449133234295e-06, + "loss": 0.5515, + "step": 9877 + }, + { + "epoch": 1.26, + "grad_norm": 0.6437030484691998, + "learning_rate": 3.191783109991462e-06, + "loss": 0.4482, + "step": 9878 + }, + { + "epoch": 1.26, + "grad_norm": 0.5867952603655965, + "learning_rate": 3.1908213836295083e-06, + "loss": 0.4757, + "step": 9879 + }, + { + "epoch": 1.26, + "grad_norm": 0.596781784339686, + "learning_rate": 3.1898597342785074e-06, + "loss": 0.4097, + "step": 9880 + }, + { + "epoch": 1.26, + "grad_norm": 0.7115799260134911, + "learning_rate": 3.1888981619793946e-06, + "loss": 0.5045, + "step": 9881 + }, + { + "epoch": 1.26, + "grad_norm": 0.7465189151037821, + "learning_rate": 3.1879366667730986e-06, + "loss": 0.5245, + "step": 9882 + }, + { + "epoch": 1.26, + "grad_norm": 0.7501013878291433, + "learning_rate": 3.1869752487005485e-06, + "loss": 0.4648, + "step": 9883 + }, + { + "epoch": 1.26, + "grad_norm": 0.6384203260918596, + "learning_rate": 3.1860139078026686e-06, + "loss": 0.4206, + "step": 9884 + }, + { + "epoch": 1.26, + "grad_norm": 0.5984569149862544, + "learning_rate": 3.1850526441203794e-06, + "loss": 0.5041, + "step": 9885 + }, + { + "epoch": 1.26, + "grad_norm": 0.7459038381909749, + "learning_rate": 3.1840914576945996e-06, + "loss": 0.5156, + "step": 9886 + }, + { + "epoch": 1.26, + "grad_norm": 0.5813552143409968, + "learning_rate": 3.183130348566244e-06, + "loss": 0.4494, + "step": 9887 + }, + { + "epoch": 1.26, + "grad_norm": 0.7442813452056939, + "learning_rate": 3.1821693167762234e-06, + "loss": 0.4581, + "step": 9888 + }, + { + "epoch": 1.26, + "grad_norm": 0.732492200114443, + "learning_rate": 3.181208362365445e-06, + "loss": 0.5736, + "step": 9889 + }, + { + "epoch": 1.26, + "grad_norm": 0.7917636361984168, + "learning_rate": 3.1802474853748157e-06, + "loss": 0.5086, + "step": 9890 + }, + { + "epoch": 1.26, + "grad_norm": 0.665647202621241, + "learning_rate": 3.179286685845235e-06, + "loss": 0.4554, + "step": 9891 + }, + { + "epoch": 1.26, + "grad_norm": 0.563271799879308, + "learning_rate": 3.178325963817603e-06, + "loss": 0.4229, + "step": 9892 + }, + { + "epoch": 1.26, + "grad_norm": 0.7276607686629478, + "learning_rate": 3.1773653193328136e-06, + "loss": 0.467, + "step": 9893 + }, + { + "epoch": 1.26, + "grad_norm": 0.6964931286636661, + "learning_rate": 3.1764047524317578e-06, + "loss": 0.4941, + "step": 9894 + }, + { + "epoch": 1.26, + "grad_norm": 0.7717033031170665, + "learning_rate": 3.175444263155324e-06, + "loss": 0.5581, + "step": 9895 + }, + { + "epoch": 1.26, + "grad_norm": 0.5420426352527334, + "learning_rate": 3.1744838515443976e-06, + "loss": 0.4318, + "step": 9896 + }, + { + "epoch": 1.26, + "grad_norm": 0.7670061772366024, + "learning_rate": 3.1735235176398604e-06, + "loss": 0.4788, + "step": 9897 + }, + { + "epoch": 1.26, + "grad_norm": 0.7983310279508355, + "learning_rate": 3.17256326148259e-06, + "loss": 0.5207, + "step": 9898 + }, + { + "epoch": 1.26, + "grad_norm": 0.8397255911643894, + "learning_rate": 3.1716030831134627e-06, + "loss": 0.4903, + "step": 9899 + }, + { + "epoch": 1.26, + "grad_norm": 0.7256707202020258, + "learning_rate": 3.1706429825733486e-06, + "loss": 0.4844, + "step": 9900 + }, + { + "epoch": 1.26, + "grad_norm": 0.5689093431868085, + "learning_rate": 3.1696829599031176e-06, + "loss": 0.4266, + "step": 9901 + }, + { + "epoch": 1.26, + "grad_norm": 0.6262273340470185, + "learning_rate": 3.1687230151436322e-06, + "loss": 0.4583, + "step": 9902 + }, + { + "epoch": 1.26, + "grad_norm": 0.670657683213048, + "learning_rate": 3.1677631483357572e-06, + "loss": 0.4738, + "step": 9903 + }, + { + "epoch": 1.26, + "grad_norm": 0.6337897363984735, + "learning_rate": 3.1668033595203505e-06, + "loss": 0.4749, + "step": 9904 + }, + { + "epoch": 1.26, + "grad_norm": 0.6891063871269649, + "learning_rate": 3.165843648738267e-06, + "loss": 0.4953, + "step": 9905 + }, + { + "epoch": 1.26, + "grad_norm": 0.5968432218174194, + "learning_rate": 3.1648840160303585e-06, + "loss": 0.4714, + "step": 9906 + }, + { + "epoch": 1.26, + "grad_norm": 0.8001986283879055, + "learning_rate": 3.163924461437472e-06, + "loss": 0.5573, + "step": 9907 + }, + { + "epoch": 1.26, + "grad_norm": 0.7215817195717713, + "learning_rate": 3.1629649850004553e-06, + "loss": 0.5606, + "step": 9908 + }, + { + "epoch": 1.26, + "grad_norm": 0.6206732567557802, + "learning_rate": 3.1620055867601487e-06, + "loss": 0.4517, + "step": 9909 + }, + { + "epoch": 1.26, + "grad_norm": 0.7091691165945644, + "learning_rate": 3.16104626675739e-06, + "loss": 0.5157, + "step": 9910 + }, + { + "epoch": 1.26, + "grad_norm": 0.7731843108203841, + "learning_rate": 3.1600870250330155e-06, + "loss": 0.5627, + "step": 9911 + }, + { + "epoch": 1.26, + "grad_norm": 0.8672547607346605, + "learning_rate": 3.1591278616278574e-06, + "loss": 0.5336, + "step": 9912 + }, + { + "epoch": 1.26, + "grad_norm": 0.795869415216268, + "learning_rate": 3.158168776582743e-06, + "loss": 0.5074, + "step": 9913 + }, + { + "epoch": 1.26, + "grad_norm": 0.6715420976532219, + "learning_rate": 3.157209769938496e-06, + "loss": 0.4916, + "step": 9914 + }, + { + "epoch": 1.26, + "grad_norm": 0.607640260436024, + "learning_rate": 3.1562508417359433e-06, + "loss": 0.462, + "step": 9915 + }, + { + "epoch": 1.26, + "grad_norm": 0.7417548998133111, + "learning_rate": 3.1552919920159e-06, + "loss": 0.5082, + "step": 9916 + }, + { + "epoch": 1.26, + "grad_norm": 0.6717004473914451, + "learning_rate": 3.154333220819182e-06, + "loss": 0.5019, + "step": 9917 + }, + { + "epoch": 1.26, + "grad_norm": 0.6181879367924511, + "learning_rate": 3.1533745281866e-06, + "loss": 0.4976, + "step": 9918 + }, + { + "epoch": 1.26, + "grad_norm": 0.7119358765643872, + "learning_rate": 3.1524159141589644e-06, + "loss": 0.4902, + "step": 9919 + }, + { + "epoch": 1.26, + "grad_norm": 0.7530063034176033, + "learning_rate": 3.1514573787770786e-06, + "loss": 0.5051, + "step": 9920 + }, + { + "epoch": 1.26, + "grad_norm": 0.7154305445386362, + "learning_rate": 3.1504989220817457e-06, + "loss": 0.5079, + "step": 9921 + }, + { + "epoch": 1.26, + "grad_norm": 0.7660282096299629, + "learning_rate": 3.1495405441137627e-06, + "loss": 0.5076, + "step": 9922 + }, + { + "epoch": 1.26, + "grad_norm": 0.6026131679881637, + "learning_rate": 3.1485822449139263e-06, + "loss": 0.4755, + "step": 9923 + }, + { + "epoch": 1.26, + "grad_norm": 0.7836022467326218, + "learning_rate": 3.1476240245230272e-06, + "loss": 0.5113, + "step": 9924 + }, + { + "epoch": 1.26, + "grad_norm": 0.6876687248022918, + "learning_rate": 3.1466658829818543e-06, + "loss": 0.4817, + "step": 9925 + }, + { + "epoch": 1.26, + "grad_norm": 0.6415848110272832, + "learning_rate": 3.1457078203311897e-06, + "loss": 0.4566, + "step": 9926 + }, + { + "epoch": 1.26, + "grad_norm": 0.7808714798928587, + "learning_rate": 3.14474983661182e-06, + "loss": 0.5687, + "step": 9927 + }, + { + "epoch": 1.26, + "grad_norm": 0.7250699538423234, + "learning_rate": 3.1437919318645216e-06, + "loss": 0.5137, + "step": 9928 + }, + { + "epoch": 1.26, + "grad_norm": 0.7075694266545647, + "learning_rate": 3.1428341061300687e-06, + "loss": 0.4797, + "step": 9929 + }, + { + "epoch": 1.27, + "grad_norm": 0.5473386506540765, + "learning_rate": 3.141876359449233e-06, + "loss": 0.4402, + "step": 9930 + }, + { + "epoch": 1.27, + "grad_norm": 0.7825510789290535, + "learning_rate": 3.140918691862783e-06, + "loss": 0.5629, + "step": 9931 + }, + { + "epoch": 1.27, + "grad_norm": 0.880621228415058, + "learning_rate": 3.139961103411484e-06, + "loss": 0.4939, + "step": 9932 + }, + { + "epoch": 1.27, + "grad_norm": 1.1042619234925284, + "learning_rate": 3.1390035941360975e-06, + "loss": 0.5651, + "step": 9933 + }, + { + "epoch": 1.27, + "grad_norm": 0.6925194000489686, + "learning_rate": 3.1380461640773803e-06, + "loss": 0.5025, + "step": 9934 + }, + { + "epoch": 1.27, + "grad_norm": 0.6254911281763132, + "learning_rate": 3.137088813276088e-06, + "loss": 0.4415, + "step": 9935 + }, + { + "epoch": 1.27, + "grad_norm": 0.6445112189690168, + "learning_rate": 3.136131541772972e-06, + "loss": 0.4999, + "step": 9936 + }, + { + "epoch": 1.27, + "grad_norm": 0.5818518250465591, + "learning_rate": 3.13517434960878e-06, + "loss": 0.4901, + "step": 9937 + }, + { + "epoch": 1.27, + "grad_norm": 0.7642835612508834, + "learning_rate": 3.134217236824257e-06, + "loss": 0.5598, + "step": 9938 + }, + { + "epoch": 1.27, + "grad_norm": 0.663430748715996, + "learning_rate": 3.133260203460143e-06, + "loss": 0.5314, + "step": 9939 + }, + { + "epoch": 1.27, + "grad_norm": 0.623174695249876, + "learning_rate": 3.1323032495571777e-06, + "loss": 0.4703, + "step": 9940 + }, + { + "epoch": 1.27, + "grad_norm": 0.6523327564812906, + "learning_rate": 3.1313463751560935e-06, + "loss": 0.4644, + "step": 9941 + }, + { + "epoch": 1.27, + "grad_norm": 1.0993333002394308, + "learning_rate": 3.1303895802976245e-06, + "loss": 0.5285, + "step": 9942 + }, + { + "epoch": 1.27, + "grad_norm": 0.6651099934805494, + "learning_rate": 3.1294328650224965e-06, + "loss": 0.506, + "step": 9943 + }, + { + "epoch": 1.27, + "grad_norm": 0.6657817185498772, + "learning_rate": 3.128476229371433e-06, + "loss": 0.4638, + "step": 9944 + }, + { + "epoch": 1.27, + "grad_norm": 0.6483782927121519, + "learning_rate": 3.1275196733851575e-06, + "loss": 0.4324, + "step": 9945 + }, + { + "epoch": 1.27, + "grad_norm": 0.7462880908031876, + "learning_rate": 3.1265631971043854e-06, + "loss": 0.5101, + "step": 9946 + }, + { + "epoch": 1.27, + "grad_norm": 0.7695180921923955, + "learning_rate": 3.1256068005698303e-06, + "loss": 0.4851, + "step": 9947 + }, + { + "epoch": 1.27, + "grad_norm": 0.5995232463677926, + "learning_rate": 3.1246504838222047e-06, + "loss": 0.447, + "step": 9948 + }, + { + "epoch": 1.27, + "grad_norm": 0.7071339803035661, + "learning_rate": 3.123694246902216e-06, + "loss": 0.4382, + "step": 9949 + }, + { + "epoch": 1.27, + "grad_norm": 0.8026421358645626, + "learning_rate": 3.1227380898505665e-06, + "loss": 0.499, + "step": 9950 + }, + { + "epoch": 1.27, + "grad_norm": 0.5760689060702163, + "learning_rate": 3.121782012707957e-06, + "loss": 0.4422, + "step": 9951 + }, + { + "epoch": 1.27, + "grad_norm": 0.6093306136261478, + "learning_rate": 3.120826015515086e-06, + "loss": 0.4871, + "step": 9952 + }, + { + "epoch": 1.27, + "grad_norm": 0.7926944690816904, + "learning_rate": 3.1198700983126464e-06, + "loss": 0.5273, + "step": 9953 + }, + { + "epoch": 1.27, + "grad_norm": 0.6148644236290115, + "learning_rate": 3.1189142611413286e-06, + "loss": 0.469, + "step": 9954 + }, + { + "epoch": 1.27, + "grad_norm": 0.827937456644679, + "learning_rate": 3.1179585040418204e-06, + "loss": 0.4898, + "step": 9955 + }, + { + "epoch": 1.27, + "grad_norm": 0.60952679725037, + "learning_rate": 3.1170028270548036e-06, + "loss": 0.44, + "step": 9956 + }, + { + "epoch": 1.27, + "grad_norm": 0.6197475712043369, + "learning_rate": 3.11604723022096e-06, + "loss": 0.4278, + "step": 9957 + }, + { + "epoch": 1.27, + "grad_norm": 0.5446321286627389, + "learning_rate": 3.1150917135809654e-06, + "loss": 0.4496, + "step": 9958 + }, + { + "epoch": 1.27, + "grad_norm": 0.9763700108494906, + "learning_rate": 3.114136277175493e-06, + "loss": 0.4882, + "step": 9959 + }, + { + "epoch": 1.27, + "grad_norm": 0.5451786417747946, + "learning_rate": 3.113180921045213e-06, + "loss": 0.4078, + "step": 9960 + }, + { + "epoch": 1.27, + "grad_norm": 0.5711121002153202, + "learning_rate": 3.112225645230792e-06, + "loss": 0.4433, + "step": 9961 + }, + { + "epoch": 1.27, + "grad_norm": 0.6564735920284965, + "learning_rate": 3.111270449772892e-06, + "loss": 0.4842, + "step": 9962 + }, + { + "epoch": 1.27, + "grad_norm": 0.720590846859446, + "learning_rate": 3.1103153347121743e-06, + "loss": 0.5187, + "step": 9963 + }, + { + "epoch": 1.27, + "grad_norm": 0.6874548830380423, + "learning_rate": 3.1093603000892923e-06, + "loss": 0.5016, + "step": 9964 + }, + { + "epoch": 1.27, + "grad_norm": 0.7900460478602355, + "learning_rate": 3.1084053459449025e-06, + "loss": 0.5182, + "step": 9965 + }, + { + "epoch": 1.27, + "grad_norm": 0.7711855452182235, + "learning_rate": 3.107450472319652e-06, + "loss": 0.5315, + "step": 9966 + }, + { + "epoch": 1.27, + "grad_norm": 0.612830530676528, + "learning_rate": 3.1064956792541877e-06, + "loss": 0.4986, + "step": 9967 + }, + { + "epoch": 1.27, + "grad_norm": 0.574623794892652, + "learning_rate": 3.1055409667891513e-06, + "loss": 0.4765, + "step": 9968 + }, + { + "epoch": 1.27, + "grad_norm": 0.6902338288936075, + "learning_rate": 3.1045863349651827e-06, + "loss": 0.4469, + "step": 9969 + }, + { + "epoch": 1.27, + "grad_norm": 0.5685269148314458, + "learning_rate": 3.1036317838229162e-06, + "loss": 0.4537, + "step": 9970 + }, + { + "epoch": 1.27, + "grad_norm": 0.5456361567034992, + "learning_rate": 3.1026773134029854e-06, + "loss": 0.4541, + "step": 9971 + }, + { + "epoch": 1.27, + "grad_norm": 0.6217646636411817, + "learning_rate": 3.1017229237460176e-06, + "loss": 0.4837, + "step": 9972 + }, + { + "epoch": 1.27, + "grad_norm": 1.2924020366665425, + "learning_rate": 3.1007686148926396e-06, + "loss": 0.4837, + "step": 9973 + }, + { + "epoch": 1.27, + "grad_norm": 0.6089581590918436, + "learning_rate": 3.0998143868834717e-06, + "loss": 0.4703, + "step": 9974 + }, + { + "epoch": 1.27, + "grad_norm": 0.7653251331614317, + "learning_rate": 3.0988602397591338e-06, + "loss": 0.482, + "step": 9975 + }, + { + "epoch": 1.27, + "grad_norm": 0.5371536887762518, + "learning_rate": 3.097906173560239e-06, + "loss": 0.4225, + "step": 9976 + }, + { + "epoch": 1.27, + "grad_norm": 0.7761111294942714, + "learning_rate": 3.096952188327401e-06, + "loss": 0.4499, + "step": 9977 + }, + { + "epoch": 1.27, + "grad_norm": 0.6802528396942026, + "learning_rate": 3.0959982841012283e-06, + "loss": 0.4741, + "step": 9978 + }, + { + "epoch": 1.27, + "grad_norm": 0.6928241219339091, + "learning_rate": 3.095044460922323e-06, + "loss": 0.4772, + "step": 9979 + }, + { + "epoch": 1.27, + "grad_norm": 0.7108420253509063, + "learning_rate": 3.0940907188312885e-06, + "loss": 0.4493, + "step": 9980 + }, + { + "epoch": 1.27, + "grad_norm": 0.602090956177948, + "learning_rate": 3.0931370578687214e-06, + "loss": 0.4959, + "step": 9981 + }, + { + "epoch": 1.27, + "grad_norm": 0.6395492350721127, + "learning_rate": 3.0921834780752163e-06, + "loss": 0.5153, + "step": 9982 + }, + { + "epoch": 1.27, + "grad_norm": 0.601243451801059, + "learning_rate": 3.0912299794913636e-06, + "loss": 0.5149, + "step": 9983 + }, + { + "epoch": 1.27, + "grad_norm": 0.956703778109413, + "learning_rate": 3.090276562157752e-06, + "loss": 0.5875, + "step": 9984 + }, + { + "epoch": 1.27, + "grad_norm": 0.7156525862103044, + "learning_rate": 3.089323226114964e-06, + "loss": 0.5059, + "step": 9985 + }, + { + "epoch": 1.27, + "grad_norm": 0.6957222003615686, + "learning_rate": 3.0883699714035807e-06, + "loss": 0.4119, + "step": 9986 + }, + { + "epoch": 1.27, + "grad_norm": 0.583525071761603, + "learning_rate": 3.087416798064179e-06, + "loss": 0.4592, + "step": 9987 + }, + { + "epoch": 1.27, + "grad_norm": 0.6550509721118933, + "learning_rate": 3.0864637061373306e-06, + "loss": 0.4695, + "step": 9988 + }, + { + "epoch": 1.27, + "grad_norm": 0.6899522372675596, + "learning_rate": 3.0855106956636095e-06, + "loss": 0.5119, + "step": 9989 + }, + { + "epoch": 1.27, + "grad_norm": 0.6310101863850072, + "learning_rate": 3.08455776668358e-06, + "loss": 0.4036, + "step": 9990 + }, + { + "epoch": 1.27, + "grad_norm": 0.6086763255538692, + "learning_rate": 3.083604919237806e-06, + "loss": 0.4114, + "step": 9991 + }, + { + "epoch": 1.27, + "grad_norm": 0.6767013629384223, + "learning_rate": 3.082652153366846e-06, + "loss": 0.4504, + "step": 9992 + }, + { + "epoch": 1.27, + "grad_norm": 0.6882547989452196, + "learning_rate": 3.0816994691112567e-06, + "loss": 0.4779, + "step": 9993 + }, + { + "epoch": 1.27, + "grad_norm": 0.7646097099019413, + "learning_rate": 3.0807468665115913e-06, + "loss": 0.5501, + "step": 9994 + }, + { + "epoch": 1.27, + "grad_norm": 0.5742321009665137, + "learning_rate": 3.079794345608399e-06, + "loss": 0.4549, + "step": 9995 + }, + { + "epoch": 1.27, + "grad_norm": 0.5053944630526432, + "learning_rate": 3.0788419064422256e-06, + "loss": 0.3972, + "step": 9996 + }, + { + "epoch": 1.27, + "grad_norm": 0.5703044952933406, + "learning_rate": 3.0778895490536124e-06, + "loss": 0.4103, + "step": 9997 + }, + { + "epoch": 1.27, + "grad_norm": 0.7749884170205388, + "learning_rate": 3.0769372734830994e-06, + "loss": 0.5116, + "step": 9998 + }, + { + "epoch": 1.27, + "grad_norm": 0.8219816101359045, + "learning_rate": 3.075985079771221e-06, + "loss": 0.5343, + "step": 9999 + }, + { + "epoch": 1.27, + "grad_norm": 0.7689351467617338, + "learning_rate": 3.07503296795851e-06, + "loss": 0.5247, + "step": 10000 + }, + { + "epoch": 1.27, + "grad_norm": 0.7387239325911442, + "learning_rate": 3.074080938085493e-06, + "loss": 0.4952, + "step": 10001 + }, + { + "epoch": 1.27, + "grad_norm": 1.0615549005643687, + "learning_rate": 3.0731289901926964e-06, + "loss": 0.5688, + "step": 10002 + }, + { + "epoch": 1.27, + "grad_norm": 0.7533877985632432, + "learning_rate": 3.072177124320641e-06, + "loss": 0.5156, + "step": 10003 + }, + { + "epoch": 1.27, + "grad_norm": 0.7618768628850636, + "learning_rate": 3.0712253405098456e-06, + "loss": 0.5744, + "step": 10004 + }, + { + "epoch": 1.27, + "grad_norm": 0.7585952723183147, + "learning_rate": 3.0702736388008247e-06, + "loss": 0.5398, + "step": 10005 + }, + { + "epoch": 1.27, + "grad_norm": 0.7553428966500044, + "learning_rate": 3.0693220192340876e-06, + "loss": 0.5206, + "step": 10006 + }, + { + "epoch": 1.27, + "grad_norm": 0.6108437038033367, + "learning_rate": 3.0683704818501438e-06, + "loss": 0.48, + "step": 10007 + }, + { + "epoch": 1.27, + "grad_norm": 0.8480712547236272, + "learning_rate": 3.067419026689495e-06, + "loss": 0.5152, + "step": 10008 + }, + { + "epoch": 1.28, + "grad_norm": 0.7075026472470567, + "learning_rate": 3.066467653792643e-06, + "loss": 0.4912, + "step": 10009 + }, + { + "epoch": 1.28, + "grad_norm": 0.7245669215115459, + "learning_rate": 3.065516363200084e-06, + "loss": 0.4735, + "step": 10010 + }, + { + "epoch": 1.28, + "grad_norm": 0.7473986085675763, + "learning_rate": 3.0645651549523114e-06, + "loss": 0.5026, + "step": 10011 + }, + { + "epoch": 1.28, + "grad_norm": 0.5899756990735727, + "learning_rate": 3.0636140290898166e-06, + "loss": 0.4779, + "step": 10012 + }, + { + "epoch": 1.28, + "grad_norm": 0.8824406665943079, + "learning_rate": 3.062662985653082e-06, + "loss": 0.518, + "step": 10013 + }, + { + "epoch": 1.28, + "grad_norm": 0.713828159259407, + "learning_rate": 3.061712024682596e-06, + "loss": 0.5213, + "step": 10014 + }, + { + "epoch": 1.28, + "grad_norm": 0.7137738184450169, + "learning_rate": 3.0607611462188342e-06, + "loss": 0.4667, + "step": 10015 + }, + { + "epoch": 1.28, + "grad_norm": 0.6984211698335809, + "learning_rate": 3.0598103503022734e-06, + "loss": 0.4949, + "step": 10016 + }, + { + "epoch": 1.28, + "grad_norm": 0.5803382187767104, + "learning_rate": 3.0588596369733863e-06, + "loss": 0.4371, + "step": 10017 + }, + { + "epoch": 1.28, + "grad_norm": 0.704751079108312, + "learning_rate": 3.0579090062726415e-06, + "loss": 0.5002, + "step": 10018 + }, + { + "epoch": 1.28, + "grad_norm": 0.7865243535100296, + "learning_rate": 3.0569584582405044e-06, + "loss": 0.5106, + "step": 10019 + }, + { + "epoch": 1.28, + "grad_norm": 0.6013805575762681, + "learning_rate": 3.056007992917437e-06, + "loss": 0.4607, + "step": 10020 + }, + { + "epoch": 1.28, + "grad_norm": 0.6338076601598049, + "learning_rate": 3.0550576103438966e-06, + "loss": 0.4666, + "step": 10021 + }, + { + "epoch": 1.28, + "grad_norm": 0.8577282948578187, + "learning_rate": 3.0541073105603387e-06, + "loss": 0.5421, + "step": 10022 + }, + { + "epoch": 1.28, + "grad_norm": 0.7320387177694757, + "learning_rate": 3.053157093607214e-06, + "loss": 0.5511, + "step": 10023 + }, + { + "epoch": 1.28, + "grad_norm": 0.6883125579858914, + "learning_rate": 3.0522069595249717e-06, + "loss": 0.4824, + "step": 10024 + }, + { + "epoch": 1.28, + "grad_norm": 0.6183742614961625, + "learning_rate": 3.0512569083540537e-06, + "loss": 0.4445, + "step": 10025 + }, + { + "epoch": 1.28, + "grad_norm": 0.7261391838529112, + "learning_rate": 3.0503069401349006e-06, + "loss": 0.5036, + "step": 10026 + }, + { + "epoch": 1.28, + "grad_norm": 0.651606948081618, + "learning_rate": 3.0493570549079522e-06, + "loss": 0.4829, + "step": 10027 + }, + { + "epoch": 1.28, + "grad_norm": 0.676641055208218, + "learning_rate": 3.048407252713641e-06, + "loss": 0.513, + "step": 10028 + }, + { + "epoch": 1.28, + "grad_norm": 0.7831625782380829, + "learning_rate": 3.0474575335923967e-06, + "loss": 0.5288, + "step": 10029 + }, + { + "epoch": 1.28, + "grad_norm": 0.7136661129400382, + "learning_rate": 3.046507897584645e-06, + "loss": 0.5211, + "step": 10030 + }, + { + "epoch": 1.28, + "grad_norm": 0.8144461603238857, + "learning_rate": 3.0455583447308105e-06, + "loss": 0.4883, + "step": 10031 + }, + { + "epoch": 1.28, + "grad_norm": 0.7931373990728641, + "learning_rate": 3.0446088750713123e-06, + "loss": 0.5091, + "step": 10032 + }, + { + "epoch": 1.28, + "grad_norm": 0.5697133546601547, + "learning_rate": 3.043659488646564e-06, + "loss": 0.4389, + "step": 10033 + }, + { + "epoch": 1.28, + "grad_norm": 0.5604022302195278, + "learning_rate": 3.042710185496981e-06, + "loss": 0.437, + "step": 10034 + }, + { + "epoch": 1.28, + "grad_norm": 0.641352831044806, + "learning_rate": 3.041760965662971e-06, + "loss": 0.4454, + "step": 10035 + }, + { + "epoch": 1.28, + "grad_norm": 0.8715159717674851, + "learning_rate": 3.040811829184938e-06, + "loss": 0.5269, + "step": 10036 + }, + { + "epoch": 1.28, + "grad_norm": 0.8903074305787414, + "learning_rate": 3.039862776103285e-06, + "loss": 0.4897, + "step": 10037 + }, + { + "epoch": 1.28, + "grad_norm": 0.9696778903792544, + "learning_rate": 3.0389138064584084e-06, + "loss": 0.5127, + "step": 10038 + }, + { + "epoch": 1.28, + "grad_norm": 0.6020712900758779, + "learning_rate": 3.037964920290706e-06, + "loss": 0.4542, + "step": 10039 + }, + { + "epoch": 1.28, + "grad_norm": 0.7057177378866583, + "learning_rate": 3.0370161176405676e-06, + "loss": 0.4966, + "step": 10040 + }, + { + "epoch": 1.28, + "grad_norm": 0.8297266839166492, + "learning_rate": 3.0360673985483803e-06, + "loss": 0.5246, + "step": 10041 + }, + { + "epoch": 1.28, + "grad_norm": 0.5756645173131348, + "learning_rate": 3.0351187630545277e-06, + "loss": 0.4836, + "step": 10042 + }, + { + "epoch": 1.28, + "grad_norm": 0.7288508495495228, + "learning_rate": 3.03417021119939e-06, + "loss": 0.5216, + "step": 10043 + }, + { + "epoch": 1.28, + "grad_norm": 0.6407747002090415, + "learning_rate": 3.0332217430233453e-06, + "loss": 0.4732, + "step": 10044 + }, + { + "epoch": 1.28, + "grad_norm": 0.8804538125993043, + "learning_rate": 3.0322733585667653e-06, + "loss": 0.4997, + "step": 10045 + }, + { + "epoch": 1.28, + "grad_norm": 0.6284289269867019, + "learning_rate": 3.0313250578700217e-06, + "loss": 0.4698, + "step": 10046 + }, + { + "epoch": 1.28, + "grad_norm": 0.7854980850057762, + "learning_rate": 3.030376840973478e-06, + "loss": 0.5258, + "step": 10047 + }, + { + "epoch": 1.28, + "grad_norm": 0.7068188670250871, + "learning_rate": 3.0294287079174987e-06, + "loss": 0.5143, + "step": 10048 + }, + { + "epoch": 1.28, + "grad_norm": 0.7952001354525379, + "learning_rate": 3.0284806587424424e-06, + "loss": 0.5979, + "step": 10049 + }, + { + "epoch": 1.28, + "grad_norm": 0.7092844791672158, + "learning_rate": 3.0275326934886628e-06, + "loss": 0.4552, + "step": 10050 + }, + { + "epoch": 1.28, + "grad_norm": 0.6795123172734648, + "learning_rate": 3.0265848121965145e-06, + "loss": 0.507, + "step": 10051 + }, + { + "epoch": 1.28, + "grad_norm": 0.6838918044613908, + "learning_rate": 3.0256370149063453e-06, + "loss": 0.5623, + "step": 10052 + }, + { + "epoch": 1.28, + "grad_norm": 0.8283735266829618, + "learning_rate": 3.0246893016584987e-06, + "loss": 0.525, + "step": 10053 + }, + { + "epoch": 1.28, + "grad_norm": 0.8431487495445705, + "learning_rate": 3.0237416724933166e-06, + "loss": 0.5078, + "step": 10054 + }, + { + "epoch": 1.28, + "grad_norm": 0.618514115349961, + "learning_rate": 3.022794127451136e-06, + "loss": 0.4861, + "step": 10055 + }, + { + "epoch": 1.28, + "grad_norm": 0.7774048537524458, + "learning_rate": 3.021846666572291e-06, + "loss": 0.507, + "step": 10056 + }, + { + "epoch": 1.28, + "grad_norm": 0.6343040788754329, + "learning_rate": 3.020899289897113e-06, + "loss": 0.4459, + "step": 10057 + }, + { + "epoch": 1.28, + "grad_norm": 0.6149030932862146, + "learning_rate": 3.0199519974659276e-06, + "loss": 0.4797, + "step": 10058 + }, + { + "epoch": 1.28, + "grad_norm": 0.8142902832891326, + "learning_rate": 3.0190047893190575e-06, + "loss": 0.5064, + "step": 10059 + }, + { + "epoch": 1.28, + "grad_norm": 0.5875697442357543, + "learning_rate": 3.0180576654968242e-06, + "loss": 0.4749, + "step": 10060 + }, + { + "epoch": 1.28, + "grad_norm": 0.5807271333060996, + "learning_rate": 3.017110626039542e-06, + "loss": 0.4328, + "step": 10061 + }, + { + "epoch": 1.28, + "grad_norm": 0.8156702292407569, + "learning_rate": 3.0161636709875245e-06, + "loss": 0.4791, + "step": 10062 + }, + { + "epoch": 1.28, + "grad_norm": 0.8883279805108075, + "learning_rate": 3.0152168003810788e-06, + "loss": 0.5289, + "step": 10063 + }, + { + "epoch": 1.28, + "grad_norm": 0.7966769391439963, + "learning_rate": 3.014270014260513e-06, + "loss": 0.5244, + "step": 10064 + }, + { + "epoch": 1.28, + "grad_norm": 1.303583516155905, + "learning_rate": 3.013323312666126e-06, + "loss": 0.4876, + "step": 10065 + }, + { + "epoch": 1.28, + "grad_norm": 0.58675524657906, + "learning_rate": 3.0123766956382177e-06, + "loss": 0.4469, + "step": 10066 + }, + { + "epoch": 1.28, + "grad_norm": 0.7589867412843092, + "learning_rate": 3.011430163217082e-06, + "loss": 0.586, + "step": 10067 + }, + { + "epoch": 1.28, + "grad_norm": 0.7693628082801335, + "learning_rate": 3.01048371544301e-06, + "loss": 0.5209, + "step": 10068 + }, + { + "epoch": 1.28, + "grad_norm": 0.5944715712428372, + "learning_rate": 3.009537352356289e-06, + "loss": 0.4545, + "step": 10069 + }, + { + "epoch": 1.28, + "grad_norm": 0.7005635876539956, + "learning_rate": 3.0085910739972013e-06, + "loss": 0.4282, + "step": 10070 + }, + { + "epoch": 1.28, + "grad_norm": 0.8225769212919801, + "learning_rate": 3.0076448804060286e-06, + "loss": 0.5229, + "step": 10071 + }, + { + "epoch": 1.28, + "grad_norm": 1.0036227321852822, + "learning_rate": 3.006698771623046e-06, + "loss": 0.472, + "step": 10072 + }, + { + "epoch": 1.28, + "grad_norm": 0.5925436969464989, + "learning_rate": 3.005752747688528e-06, + "loss": 0.44, + "step": 10073 + }, + { + "epoch": 1.28, + "grad_norm": 0.6921521343201524, + "learning_rate": 3.004806808642742e-06, + "loss": 0.463, + "step": 10074 + }, + { + "epoch": 1.28, + "grad_norm": 0.60239773074461, + "learning_rate": 3.003860954525954e-06, + "loss": 0.4626, + "step": 10075 + }, + { + "epoch": 1.28, + "grad_norm": 0.7077316777280128, + "learning_rate": 3.002915185378427e-06, + "loss": 0.5023, + "step": 10076 + }, + { + "epoch": 1.28, + "grad_norm": 0.7522666144284539, + "learning_rate": 3.0019695012404193e-06, + "loss": 0.5559, + "step": 10077 + }, + { + "epoch": 1.28, + "grad_norm": 0.8185355937073601, + "learning_rate": 3.001023902152185e-06, + "loss": 0.5093, + "step": 10078 + }, + { + "epoch": 1.28, + "grad_norm": 0.9198465109077911, + "learning_rate": 3.0000783881539747e-06, + "loss": 0.5868, + "step": 10079 + }, + { + "epoch": 1.28, + "grad_norm": 0.8550110783014858, + "learning_rate": 2.9991329592860376e-06, + "loss": 0.5043, + "step": 10080 + }, + { + "epoch": 1.28, + "grad_norm": 0.734583464318067, + "learning_rate": 2.998187615588616e-06, + "loss": 0.4729, + "step": 10081 + }, + { + "epoch": 1.28, + "grad_norm": 0.6177460097427718, + "learning_rate": 2.9972423571019503e-06, + "loss": 0.4996, + "step": 10082 + }, + { + "epoch": 1.28, + "grad_norm": 0.9354631800349099, + "learning_rate": 2.996297183866278e-06, + "loss": 0.5243, + "step": 10083 + }, + { + "epoch": 1.28, + "grad_norm": 0.9388552029761114, + "learning_rate": 2.995352095921832e-06, + "loss": 0.4607, + "step": 10084 + }, + { + "epoch": 1.28, + "grad_norm": 0.673897873608996, + "learning_rate": 2.9944070933088408e-06, + "loss": 0.5089, + "step": 10085 + }, + { + "epoch": 1.28, + "grad_norm": 0.7970155534274046, + "learning_rate": 2.9934621760675308e-06, + "loss": 0.5325, + "step": 10086 + }, + { + "epoch": 1.29, + "grad_norm": 0.6616889414459125, + "learning_rate": 2.992517344238124e-06, + "loss": 0.4741, + "step": 10087 + }, + { + "epoch": 1.29, + "grad_norm": 0.6796734364489979, + "learning_rate": 2.991572597860837e-06, + "loss": 0.4344, + "step": 10088 + }, + { + "epoch": 1.29, + "grad_norm": 0.7216796163782366, + "learning_rate": 2.990627936975888e-06, + "loss": 0.4914, + "step": 10089 + }, + { + "epoch": 1.29, + "grad_norm": 0.7493563605490247, + "learning_rate": 2.989683361623488e-06, + "loss": 0.4512, + "step": 10090 + }, + { + "epoch": 1.29, + "grad_norm": 0.6755188370213615, + "learning_rate": 2.988738871843842e-06, + "loss": 0.4529, + "step": 10091 + }, + { + "epoch": 1.29, + "grad_norm": 0.9182093900054465, + "learning_rate": 2.9877944676771554e-06, + "loss": 0.5545, + "step": 10092 + }, + { + "epoch": 1.29, + "grad_norm": 1.1928959872543183, + "learning_rate": 2.986850149163628e-06, + "loss": 0.5176, + "step": 10093 + }, + { + "epoch": 1.29, + "grad_norm": 0.737471400068593, + "learning_rate": 2.985905916343458e-06, + "loss": 0.5067, + "step": 10094 + }, + { + "epoch": 1.29, + "grad_norm": 0.6022972409017776, + "learning_rate": 2.9849617692568356e-06, + "loss": 0.4609, + "step": 10095 + }, + { + "epoch": 1.29, + "grad_norm": 0.8303501827301287, + "learning_rate": 2.984017707943952e-06, + "loss": 0.5094, + "step": 10096 + }, + { + "epoch": 1.29, + "grad_norm": 0.876549585807127, + "learning_rate": 2.983073732444992e-06, + "loss": 0.5175, + "step": 10097 + }, + { + "epoch": 1.29, + "grad_norm": 0.594992338486311, + "learning_rate": 2.982129842800139e-06, + "loss": 0.4495, + "step": 10098 + }, + { + "epoch": 1.29, + "grad_norm": 0.7201306166707563, + "learning_rate": 2.9811860390495694e-06, + "loss": 0.4328, + "step": 10099 + }, + { + "epoch": 1.29, + "grad_norm": 0.8887686741218395, + "learning_rate": 2.9802423212334575e-06, + "loss": 0.4785, + "step": 10100 + }, + { + "epoch": 1.29, + "grad_norm": 0.8867668237412709, + "learning_rate": 2.9792986893919774e-06, + "loss": 0.5371, + "step": 10101 + }, + { + "epoch": 1.29, + "grad_norm": 0.9830089174735464, + "learning_rate": 2.9783551435652947e-06, + "loss": 0.467, + "step": 10102 + }, + { + "epoch": 1.29, + "grad_norm": 0.5411161149393208, + "learning_rate": 2.977411683793574e-06, + "loss": 0.4248, + "step": 10103 + }, + { + "epoch": 1.29, + "grad_norm": 0.6652680659069923, + "learning_rate": 2.9764683101169746e-06, + "loss": 0.4859, + "step": 10104 + }, + { + "epoch": 1.29, + "grad_norm": 0.8077801746704958, + "learning_rate": 2.9755250225756525e-06, + "loss": 0.5204, + "step": 10105 + }, + { + "epoch": 1.29, + "grad_norm": 0.7764389497501268, + "learning_rate": 2.974581821209761e-06, + "loss": 0.5202, + "step": 10106 + }, + { + "epoch": 1.29, + "grad_norm": 0.7197404009164483, + "learning_rate": 2.9736387060594484e-06, + "loss": 0.4562, + "step": 10107 + }, + { + "epoch": 1.29, + "grad_norm": 0.8357037768159049, + "learning_rate": 2.9726956771648616e-06, + "loss": 0.5501, + "step": 10108 + }, + { + "epoch": 1.29, + "grad_norm": 0.67724049421269, + "learning_rate": 2.971752734566142e-06, + "loss": 0.5183, + "step": 10109 + }, + { + "epoch": 1.29, + "grad_norm": 0.7044068443668652, + "learning_rate": 2.9708098783034263e-06, + "loss": 0.4752, + "step": 10110 + }, + { + "epoch": 1.29, + "grad_norm": 0.7033922958840514, + "learning_rate": 2.9698671084168497e-06, + "loss": 0.5317, + "step": 10111 + }, + { + "epoch": 1.29, + "grad_norm": 0.7709944479576237, + "learning_rate": 2.968924424946541e-06, + "loss": 0.5065, + "step": 10112 + }, + { + "epoch": 1.29, + "grad_norm": 0.6444397679009273, + "learning_rate": 2.967981827932631e-06, + "loss": 0.4504, + "step": 10113 + }, + { + "epoch": 1.29, + "grad_norm": 0.6266647373921675, + "learning_rate": 2.9670393174152417e-06, + "loss": 0.4897, + "step": 10114 + }, + { + "epoch": 1.29, + "grad_norm": 0.7758081475425422, + "learning_rate": 2.966096893434492e-06, + "loss": 0.5538, + "step": 10115 + }, + { + "epoch": 1.29, + "grad_norm": 0.639264055117079, + "learning_rate": 2.9651545560304986e-06, + "loss": 0.4487, + "step": 10116 + }, + { + "epoch": 1.29, + "grad_norm": 0.7670169816847134, + "learning_rate": 2.9642123052433725e-06, + "loss": 0.5225, + "step": 10117 + }, + { + "epoch": 1.29, + "grad_norm": 0.765958751322079, + "learning_rate": 2.963270141113223e-06, + "loss": 0.5261, + "step": 10118 + }, + { + "epoch": 1.29, + "grad_norm": 0.7417938385491667, + "learning_rate": 2.9623280636801554e-06, + "loss": 0.4577, + "step": 10119 + }, + { + "epoch": 1.29, + "grad_norm": 0.6330606280845634, + "learning_rate": 2.9613860729842714e-06, + "loss": 0.4686, + "step": 10120 + }, + { + "epoch": 1.29, + "grad_norm": 0.8042711924059736, + "learning_rate": 2.960444169065667e-06, + "loss": 0.5317, + "step": 10121 + }, + { + "epoch": 1.29, + "grad_norm": 0.6931330767737331, + "learning_rate": 2.959502351964437e-06, + "loss": 0.4783, + "step": 10122 + }, + { + "epoch": 1.29, + "grad_norm": 0.6227064661592011, + "learning_rate": 2.9585606217206714e-06, + "loss": 0.4434, + "step": 10123 + }, + { + "epoch": 1.29, + "grad_norm": 0.6326883520728792, + "learning_rate": 2.957618978374457e-06, + "loss": 0.4877, + "step": 10124 + }, + { + "epoch": 1.29, + "grad_norm": 0.7003053110487208, + "learning_rate": 2.9566774219658744e-06, + "loss": 0.447, + "step": 10125 + }, + { + "epoch": 1.29, + "grad_norm": 0.6386031879147133, + "learning_rate": 2.955735952535006e-06, + "loss": 0.4468, + "step": 10126 + }, + { + "epoch": 1.29, + "grad_norm": 0.7669863806583794, + "learning_rate": 2.9547945701219245e-06, + "loss": 0.4719, + "step": 10127 + }, + { + "epoch": 1.29, + "grad_norm": 0.6717987016775734, + "learning_rate": 2.9538532747667036e-06, + "loss": 0.4692, + "step": 10128 + }, + { + "epoch": 1.29, + "grad_norm": 0.8611211247390582, + "learning_rate": 2.9529120665094103e-06, + "loss": 0.5389, + "step": 10129 + }, + { + "epoch": 1.29, + "grad_norm": 0.5387871274834539, + "learning_rate": 2.951970945390108e-06, + "loss": 0.4386, + "step": 10130 + }, + { + "epoch": 1.29, + "grad_norm": 0.6376877981854637, + "learning_rate": 2.9510299114488592e-06, + "loss": 0.4805, + "step": 10131 + }, + { + "epoch": 1.29, + "grad_norm": 0.6348081336631767, + "learning_rate": 2.9500889647257193e-06, + "loss": 0.5089, + "step": 10132 + }, + { + "epoch": 1.29, + "grad_norm": 0.645884238847916, + "learning_rate": 2.9491481052607406e-06, + "loss": 0.5079, + "step": 10133 + }, + { + "epoch": 1.29, + "grad_norm": 0.6452682051769816, + "learning_rate": 2.9482073330939743e-06, + "loss": 0.4876, + "step": 10134 + }, + { + "epoch": 1.29, + "grad_norm": 0.8376356155256386, + "learning_rate": 2.947266648265465e-06, + "loss": 0.5016, + "step": 10135 + }, + { + "epoch": 1.29, + "grad_norm": 0.6292333787968463, + "learning_rate": 2.9463260508152547e-06, + "loss": 0.4734, + "step": 10136 + }, + { + "epoch": 1.29, + "grad_norm": 0.8373799011921564, + "learning_rate": 2.945385540783381e-06, + "loss": 0.5214, + "step": 10137 + }, + { + "epoch": 1.29, + "grad_norm": 0.6615495878018989, + "learning_rate": 2.9444451182098805e-06, + "loss": 0.5056, + "step": 10138 + }, + { + "epoch": 1.29, + "grad_norm": 1.201925206034663, + "learning_rate": 2.9435047831347824e-06, + "loss": 0.5585, + "step": 10139 + }, + { + "epoch": 1.29, + "grad_norm": 1.1188336960899259, + "learning_rate": 2.9425645355981143e-06, + "loss": 0.5157, + "step": 10140 + }, + { + "epoch": 1.29, + "grad_norm": 0.7477213854610992, + "learning_rate": 2.9416243756398986e-06, + "loss": 0.4936, + "step": 10141 + }, + { + "epoch": 1.29, + "grad_norm": 0.7815082644484085, + "learning_rate": 2.9406843033001563e-06, + "loss": 0.4832, + "step": 10142 + }, + { + "epoch": 1.29, + "grad_norm": 0.8209542514004152, + "learning_rate": 2.9397443186189024e-06, + "loss": 0.5489, + "step": 10143 + }, + { + "epoch": 1.29, + "grad_norm": 0.6637044558774606, + "learning_rate": 2.9388044216361492e-06, + "loss": 0.4937, + "step": 10144 + }, + { + "epoch": 1.29, + "grad_norm": 0.5880107320278892, + "learning_rate": 2.9378646123919054e-06, + "loss": 0.4471, + "step": 10145 + }, + { + "epoch": 1.29, + "grad_norm": 0.8429328183645565, + "learning_rate": 2.936924890926175e-06, + "loss": 0.5619, + "step": 10146 + }, + { + "epoch": 1.29, + "grad_norm": 0.7516802532245696, + "learning_rate": 2.9359852572789594e-06, + "loss": 0.5099, + "step": 10147 + }, + { + "epoch": 1.29, + "grad_norm": 0.6508521883211092, + "learning_rate": 2.9350457114902565e-06, + "loss": 0.479, + "step": 10148 + }, + { + "epoch": 1.29, + "grad_norm": 0.6544020286912982, + "learning_rate": 2.934106253600058e-06, + "loss": 0.4783, + "step": 10149 + }, + { + "epoch": 1.29, + "grad_norm": 0.6214310053089499, + "learning_rate": 2.933166883648353e-06, + "loss": 0.5004, + "step": 10150 + }, + { + "epoch": 1.29, + "grad_norm": 1.2185275914777103, + "learning_rate": 2.932227601675131e-06, + "loss": 0.5462, + "step": 10151 + }, + { + "epoch": 1.29, + "grad_norm": 0.7644958094851502, + "learning_rate": 2.9312884077203725e-06, + "loss": 0.4934, + "step": 10152 + }, + { + "epoch": 1.29, + "grad_norm": 0.5779097297825884, + "learning_rate": 2.930349301824056e-06, + "loss": 0.4473, + "step": 10153 + }, + { + "epoch": 1.29, + "grad_norm": 0.6559143254130309, + "learning_rate": 2.929410284026156e-06, + "loss": 0.5137, + "step": 10154 + }, + { + "epoch": 1.29, + "grad_norm": 0.725209318894737, + "learning_rate": 2.9284713543666434e-06, + "loss": 0.528, + "step": 10155 + }, + { + "epoch": 1.29, + "grad_norm": 0.7034797640431257, + "learning_rate": 2.9275325128854854e-06, + "loss": 0.5068, + "step": 10156 + }, + { + "epoch": 1.29, + "grad_norm": 0.5858274836647018, + "learning_rate": 2.926593759622646e-06, + "loss": 0.4194, + "step": 10157 + }, + { + "epoch": 1.29, + "grad_norm": 0.6500672658007484, + "learning_rate": 2.925655094618084e-06, + "loss": 0.4793, + "step": 10158 + }, + { + "epoch": 1.29, + "grad_norm": 0.7217885520479481, + "learning_rate": 2.9247165179117567e-06, + "loss": 0.4786, + "step": 10159 + }, + { + "epoch": 1.29, + "grad_norm": 0.634553578264273, + "learning_rate": 2.9237780295436153e-06, + "loss": 0.4835, + "step": 10160 + }, + { + "epoch": 1.29, + "grad_norm": 0.8408847210221582, + "learning_rate": 2.9228396295536083e-06, + "loss": 0.5354, + "step": 10161 + }, + { + "epoch": 1.29, + "grad_norm": 0.7644156960060315, + "learning_rate": 2.921901317981679e-06, + "loss": 0.5325, + "step": 10162 + }, + { + "epoch": 1.29, + "grad_norm": 0.8285579352659967, + "learning_rate": 2.9209630948677715e-06, + "loss": 0.5264, + "step": 10163 + }, + { + "epoch": 1.29, + "grad_norm": 0.7487038187038251, + "learning_rate": 2.9200249602518215e-06, + "loss": 0.5284, + "step": 10164 + }, + { + "epoch": 1.29, + "grad_norm": 0.6425912550011786, + "learning_rate": 2.9190869141737628e-06, + "loss": 0.5401, + "step": 10165 + }, + { + "epoch": 1.3, + "grad_norm": 0.7359199837551312, + "learning_rate": 2.9181489566735256e-06, + "loss": 0.5379, + "step": 10166 + }, + { + "epoch": 1.3, + "grad_norm": 0.8129476332300664, + "learning_rate": 2.917211087791032e-06, + "loss": 0.5472, + "step": 10167 + }, + { + "epoch": 1.3, + "grad_norm": 0.6923659456726867, + "learning_rate": 2.9162733075662087e-06, + "loss": 0.4581, + "step": 10168 + }, + { + "epoch": 1.3, + "grad_norm": 0.7804021163281288, + "learning_rate": 2.91533561603897e-06, + "loss": 0.5254, + "step": 10169 + }, + { + "epoch": 1.3, + "grad_norm": 0.8436480174237487, + "learning_rate": 2.9143980132492334e-06, + "loss": 0.5892, + "step": 10170 + }, + { + "epoch": 1.3, + "grad_norm": 0.8545459823577821, + "learning_rate": 2.9134604992369097e-06, + "loss": 0.536, + "step": 10171 + }, + { + "epoch": 1.3, + "grad_norm": 0.654194378378166, + "learning_rate": 2.9125230740419037e-06, + "loss": 0.4624, + "step": 10172 + }, + { + "epoch": 1.3, + "grad_norm": 0.6705653229842632, + "learning_rate": 2.911585737704121e-06, + "loss": 0.4559, + "step": 10173 + }, + { + "epoch": 1.3, + "grad_norm": 0.6687289069006598, + "learning_rate": 2.910648490263459e-06, + "loss": 0.4759, + "step": 10174 + }, + { + "epoch": 1.3, + "grad_norm": 0.6442381443013449, + "learning_rate": 2.9097113317598135e-06, + "loss": 0.4706, + "step": 10175 + }, + { + "epoch": 1.3, + "grad_norm": 0.6667896788204748, + "learning_rate": 2.908774262233079e-06, + "loss": 0.4529, + "step": 10176 + }, + { + "epoch": 1.3, + "grad_norm": 0.5904164099469337, + "learning_rate": 2.907837281723139e-06, + "loss": 0.4583, + "step": 10177 + }, + { + "epoch": 1.3, + "grad_norm": 0.55932948275627, + "learning_rate": 2.9069003902698833e-06, + "loss": 0.4731, + "step": 10178 + }, + { + "epoch": 1.3, + "grad_norm": 0.8565273952319515, + "learning_rate": 2.905963587913187e-06, + "loss": 0.4672, + "step": 10179 + }, + { + "epoch": 1.3, + "grad_norm": 0.7750285684129028, + "learning_rate": 2.905026874692931e-06, + "loss": 0.5748, + "step": 10180 + }, + { + "epoch": 1.3, + "grad_norm": 0.7166605566136042, + "learning_rate": 2.904090250648985e-06, + "loss": 0.5358, + "step": 10181 + }, + { + "epoch": 1.3, + "grad_norm": 0.6339450470409902, + "learning_rate": 2.903153715821221e-06, + "loss": 0.4702, + "step": 10182 + }, + { + "epoch": 1.3, + "grad_norm": 0.9731304868139452, + "learning_rate": 2.9022172702495004e-06, + "loss": 0.5219, + "step": 10183 + }, + { + "epoch": 1.3, + "grad_norm": 0.7514882638573377, + "learning_rate": 2.9012809139736895e-06, + "loss": 0.4829, + "step": 10184 + }, + { + "epoch": 1.3, + "grad_norm": 0.7475335600527702, + "learning_rate": 2.9003446470336415e-06, + "loss": 0.5333, + "step": 10185 + }, + { + "epoch": 1.3, + "grad_norm": 0.6985193720968823, + "learning_rate": 2.899408469469214e-06, + "loss": 0.5511, + "step": 10186 + }, + { + "epoch": 1.3, + "grad_norm": 0.8727280185459015, + "learning_rate": 2.898472381320252e-06, + "loss": 0.5722, + "step": 10187 + }, + { + "epoch": 1.3, + "grad_norm": 0.7734805303035921, + "learning_rate": 2.897536382626609e-06, + "loss": 0.4853, + "step": 10188 + }, + { + "epoch": 1.3, + "grad_norm": 0.6444911244666087, + "learning_rate": 2.8966004734281216e-06, + "loss": 0.4859, + "step": 10189 + }, + { + "epoch": 1.3, + "grad_norm": 0.7303950249005075, + "learning_rate": 2.895664653764633e-06, + "loss": 0.5195, + "step": 10190 + }, + { + "epoch": 1.3, + "grad_norm": 0.6402140605767778, + "learning_rate": 2.8947289236759725e-06, + "loss": 0.4515, + "step": 10191 + }, + { + "epoch": 1.3, + "grad_norm": 0.6763932478440389, + "learning_rate": 2.8937932832019767e-06, + "loss": 0.4783, + "step": 10192 + }, + { + "epoch": 1.3, + "grad_norm": 0.6579200449093628, + "learning_rate": 2.8928577323824687e-06, + "loss": 0.4732, + "step": 10193 + }, + { + "epoch": 1.3, + "grad_norm": 0.7027925633443014, + "learning_rate": 2.8919222712572755e-06, + "loss": 0.4976, + "step": 10194 + }, + { + "epoch": 1.3, + "grad_norm": 0.5671219139174667, + "learning_rate": 2.8909868998662127e-06, + "loss": 0.4658, + "step": 10195 + }, + { + "epoch": 1.3, + "grad_norm": 0.9705835662620246, + "learning_rate": 2.8900516182491002e-06, + "loss": 0.5174, + "step": 10196 + }, + { + "epoch": 1.3, + "grad_norm": 0.7697562743226838, + "learning_rate": 2.8891164264457457e-06, + "loss": 0.4861, + "step": 10197 + }, + { + "epoch": 1.3, + "grad_norm": 0.6050093154754461, + "learning_rate": 2.8881813244959612e-06, + "loss": 0.5307, + "step": 10198 + }, + { + "epoch": 1.3, + "grad_norm": 0.8398249656435676, + "learning_rate": 2.8872463124395457e-06, + "loss": 0.5081, + "step": 10199 + }, + { + "epoch": 1.3, + "grad_norm": 0.5772250485459917, + "learning_rate": 2.8863113903163078e-06, + "loss": 0.4704, + "step": 10200 + }, + { + "epoch": 1.3, + "grad_norm": 0.679098520169189, + "learning_rate": 2.8853765581660365e-06, + "loss": 0.5039, + "step": 10201 + }, + { + "epoch": 1.3, + "grad_norm": 0.6360127132472283, + "learning_rate": 2.8844418160285305e-06, + "loss": 0.4763, + "step": 10202 + }, + { + "epoch": 1.3, + "grad_norm": 0.7243562471193774, + "learning_rate": 2.883507163943574e-06, + "loss": 0.5064, + "step": 10203 + }, + { + "epoch": 1.3, + "grad_norm": 0.7163903348118532, + "learning_rate": 2.882572601950956e-06, + "loss": 0.4583, + "step": 10204 + }, + { + "epoch": 1.3, + "grad_norm": 0.5711941198400342, + "learning_rate": 2.881638130090454e-06, + "loss": 0.4582, + "step": 10205 + }, + { + "epoch": 1.3, + "grad_norm": 0.6274230845694725, + "learning_rate": 2.8807037484018496e-06, + "loss": 0.4164, + "step": 10206 + }, + { + "epoch": 1.3, + "grad_norm": 0.6260335194664035, + "learning_rate": 2.8797694569249122e-06, + "loss": 0.5005, + "step": 10207 + }, + { + "epoch": 1.3, + "grad_norm": 0.7128907202421917, + "learning_rate": 2.878835255699416e-06, + "loss": 0.533, + "step": 10208 + }, + { + "epoch": 1.3, + "grad_norm": 0.7845323777474015, + "learning_rate": 2.8779011447651227e-06, + "loss": 0.5281, + "step": 10209 + }, + { + "epoch": 1.3, + "grad_norm": 0.747650927201233, + "learning_rate": 2.876967124161798e-06, + "loss": 0.5459, + "step": 10210 + }, + { + "epoch": 1.3, + "grad_norm": 0.7193081240913091, + "learning_rate": 2.876033193929197e-06, + "loss": 0.5552, + "step": 10211 + }, + { + "epoch": 1.3, + "grad_norm": 0.7914127652799089, + "learning_rate": 2.8750993541070753e-06, + "loss": 0.5435, + "step": 10212 + }, + { + "epoch": 1.3, + "grad_norm": 0.7084305546304169, + "learning_rate": 2.874165604735184e-06, + "loss": 0.5492, + "step": 10213 + }, + { + "epoch": 1.3, + "grad_norm": 0.5880303783712654, + "learning_rate": 2.8732319458532722e-06, + "loss": 0.4228, + "step": 10214 + }, + { + "epoch": 1.3, + "grad_norm": 0.6229318713636494, + "learning_rate": 2.8722983775010775e-06, + "loss": 0.4712, + "step": 10215 + }, + { + "epoch": 1.3, + "grad_norm": 0.6756154003694903, + "learning_rate": 2.871364899718344e-06, + "loss": 0.503, + "step": 10216 + }, + { + "epoch": 1.3, + "grad_norm": 0.7218600006010616, + "learning_rate": 2.8704315125448027e-06, + "loss": 0.5003, + "step": 10217 + }, + { + "epoch": 1.3, + "grad_norm": 0.6322725850395896, + "learning_rate": 2.869498216020189e-06, + "loss": 0.4387, + "step": 10218 + }, + { + "epoch": 1.3, + "grad_norm": 0.6896654022079284, + "learning_rate": 2.8685650101842257e-06, + "loss": 0.4967, + "step": 10219 + }, + { + "epoch": 1.3, + "grad_norm": 0.7053372394926651, + "learning_rate": 2.8676318950766413e-06, + "loss": 0.4903, + "step": 10220 + }, + { + "epoch": 1.3, + "grad_norm": 0.6199428687595725, + "learning_rate": 2.8666988707371507e-06, + "loss": 0.4807, + "step": 10221 + }, + { + "epoch": 1.3, + "grad_norm": 0.6769955706031948, + "learning_rate": 2.8657659372054743e-06, + "loss": 0.4677, + "step": 10222 + }, + { + "epoch": 1.3, + "grad_norm": 0.675305212036312, + "learning_rate": 2.86483309452132e-06, + "loss": 0.5068, + "step": 10223 + }, + { + "epoch": 1.3, + "grad_norm": 0.8707066448333448, + "learning_rate": 2.8639003427243974e-06, + "loss": 0.4643, + "step": 10224 + }, + { + "epoch": 1.3, + "grad_norm": 0.6364961778623395, + "learning_rate": 2.8629676818544135e-06, + "loss": 0.4276, + "step": 10225 + }, + { + "epoch": 1.3, + "grad_norm": 0.6801140971692162, + "learning_rate": 2.862035111951064e-06, + "loss": 0.4505, + "step": 10226 + }, + { + "epoch": 1.3, + "grad_norm": 0.6604427994928885, + "learning_rate": 2.86110263305405e-06, + "loss": 0.4693, + "step": 10227 + }, + { + "epoch": 1.3, + "grad_norm": 0.7986591803117429, + "learning_rate": 2.860170245203061e-06, + "loss": 0.4842, + "step": 10228 + }, + { + "epoch": 1.3, + "grad_norm": 0.5920876835762492, + "learning_rate": 2.8592379484377874e-06, + "loss": 0.4548, + "step": 10229 + }, + { + "epoch": 1.3, + "grad_norm": 0.5540399558151131, + "learning_rate": 2.8583057427979123e-06, + "loss": 0.4252, + "step": 10230 + }, + { + "epoch": 1.3, + "grad_norm": 0.6078661459183704, + "learning_rate": 2.8573736283231172e-06, + "loss": 0.4432, + "step": 10231 + }, + { + "epoch": 1.3, + "grad_norm": 0.6682072685001961, + "learning_rate": 2.8564416050530826e-06, + "loss": 0.4582, + "step": 10232 + }, + { + "epoch": 1.3, + "grad_norm": 0.5689592425546806, + "learning_rate": 2.8555096730274768e-06, + "loss": 0.4613, + "step": 10233 + }, + { + "epoch": 1.3, + "grad_norm": 0.7428411694563717, + "learning_rate": 2.854577832285973e-06, + "loss": 0.4917, + "step": 10234 + }, + { + "epoch": 1.3, + "grad_norm": 0.7442687694616843, + "learning_rate": 2.8536460828682335e-06, + "loss": 0.5116, + "step": 10235 + }, + { + "epoch": 1.3, + "grad_norm": 0.848214475523199, + "learning_rate": 2.852714424813921e-06, + "loss": 0.5252, + "step": 10236 + }, + { + "epoch": 1.3, + "grad_norm": 0.6479163829099377, + "learning_rate": 2.851782858162696e-06, + "loss": 0.5247, + "step": 10237 + }, + { + "epoch": 1.3, + "grad_norm": 0.7847061284805814, + "learning_rate": 2.8508513829542086e-06, + "loss": 0.5009, + "step": 10238 + }, + { + "epoch": 1.3, + "grad_norm": 0.832324596194084, + "learning_rate": 2.8499199992281113e-06, + "loss": 0.5098, + "step": 10239 + }, + { + "epoch": 1.3, + "grad_norm": 0.5739694357939171, + "learning_rate": 2.848988707024047e-06, + "loss": 0.4637, + "step": 10240 + }, + { + "epoch": 1.3, + "grad_norm": 1.022437639239965, + "learning_rate": 2.8480575063816617e-06, + "loss": 0.5215, + "step": 10241 + }, + { + "epoch": 1.3, + "grad_norm": 0.6254561500669832, + "learning_rate": 2.8471263973405904e-06, + "loss": 0.4251, + "step": 10242 + }, + { + "epoch": 1.3, + "grad_norm": 0.8074913719827148, + "learning_rate": 2.8461953799404702e-06, + "loss": 0.4465, + "step": 10243 + }, + { + "epoch": 1.31, + "grad_norm": 0.7669159011448926, + "learning_rate": 2.8452644542209283e-06, + "loss": 0.4949, + "step": 10244 + }, + { + "epoch": 1.31, + "grad_norm": 0.6114192213671742, + "learning_rate": 2.8443336202215953e-06, + "loss": 0.5023, + "step": 10245 + }, + { + "epoch": 1.31, + "grad_norm": 0.7261979701571393, + "learning_rate": 2.843402877982089e-06, + "loss": 0.5405, + "step": 10246 + }, + { + "epoch": 1.31, + "grad_norm": 0.7405986778326757, + "learning_rate": 2.842472227542033e-06, + "loss": 0.5474, + "step": 10247 + }, + { + "epoch": 1.31, + "grad_norm": 0.7283534771526289, + "learning_rate": 2.841541668941038e-06, + "loss": 0.551, + "step": 10248 + }, + { + "epoch": 1.31, + "grad_norm": 0.606013040166489, + "learning_rate": 2.840611202218716e-06, + "loss": 0.4986, + "step": 10249 + }, + { + "epoch": 1.31, + "grad_norm": 0.6612878555801732, + "learning_rate": 2.8396808274146757e-06, + "loss": 0.5319, + "step": 10250 + }, + { + "epoch": 1.31, + "grad_norm": 0.8857413887674892, + "learning_rate": 2.8387505445685205e-06, + "loss": 0.5321, + "step": 10251 + }, + { + "epoch": 1.31, + "grad_norm": 0.5987038658197168, + "learning_rate": 2.837820353719847e-06, + "loss": 0.452, + "step": 10252 + }, + { + "epoch": 1.31, + "grad_norm": 0.6192824010431363, + "learning_rate": 2.836890254908252e-06, + "loss": 0.5001, + "step": 10253 + }, + { + "epoch": 1.31, + "grad_norm": 0.7636759672701975, + "learning_rate": 2.835960248173326e-06, + "loss": 0.5391, + "step": 10254 + }, + { + "epoch": 1.31, + "grad_norm": 0.6316826054431608, + "learning_rate": 2.835030333554659e-06, + "loss": 0.4614, + "step": 10255 + }, + { + "epoch": 1.31, + "grad_norm": 0.6172647901463921, + "learning_rate": 2.8341005110918298e-06, + "loss": 0.4436, + "step": 10256 + }, + { + "epoch": 1.31, + "grad_norm": 0.6877274755177915, + "learning_rate": 2.833170780824423e-06, + "loss": 0.498, + "step": 10257 + }, + { + "epoch": 1.31, + "grad_norm": 0.5982828188157019, + "learning_rate": 2.83224114279201e-06, + "loss": 0.4566, + "step": 10258 + }, + { + "epoch": 1.31, + "grad_norm": 0.792733103944545, + "learning_rate": 2.831311597034166e-06, + "loss": 0.4908, + "step": 10259 + }, + { + "epoch": 1.31, + "grad_norm": 0.5374601529386345, + "learning_rate": 2.8303821435904554e-06, + "loss": 0.4447, + "step": 10260 + }, + { + "epoch": 1.31, + "grad_norm": 0.627016646477181, + "learning_rate": 2.829452782500444e-06, + "loss": 0.4607, + "step": 10261 + }, + { + "epoch": 1.31, + "grad_norm": 0.7128397596507715, + "learning_rate": 2.828523513803691e-06, + "loss": 0.5326, + "step": 10262 + }, + { + "epoch": 1.31, + "grad_norm": 0.6797515649418353, + "learning_rate": 2.827594337539755e-06, + "loss": 0.4838, + "step": 10263 + }, + { + "epoch": 1.31, + "grad_norm": 0.6103839008157308, + "learning_rate": 2.8266652537481842e-06, + "loss": 0.4159, + "step": 10264 + }, + { + "epoch": 1.31, + "grad_norm": 0.6107744884089784, + "learning_rate": 2.8257362624685316e-06, + "loss": 0.4653, + "step": 10265 + }, + { + "epoch": 1.31, + "grad_norm": 0.6060263568691121, + "learning_rate": 2.824807363740335e-06, + "loss": 0.4345, + "step": 10266 + }, + { + "epoch": 1.31, + "grad_norm": 0.6151861276284115, + "learning_rate": 2.82387855760314e-06, + "loss": 0.4539, + "step": 10267 + }, + { + "epoch": 1.31, + "grad_norm": 0.719312302955807, + "learning_rate": 2.8229498440964797e-06, + "loss": 0.5098, + "step": 10268 + }, + { + "epoch": 1.31, + "grad_norm": 0.7787548917253838, + "learning_rate": 2.82202122325989e-06, + "loss": 0.4903, + "step": 10269 + }, + { + "epoch": 1.31, + "grad_norm": 0.6801256709394332, + "learning_rate": 2.821092695132894e-06, + "loss": 0.4544, + "step": 10270 + }, + { + "epoch": 1.31, + "grad_norm": 0.8934962559810521, + "learning_rate": 2.8201642597550216e-06, + "loss": 0.4587, + "step": 10271 + }, + { + "epoch": 1.31, + "grad_norm": 0.5458415199796832, + "learning_rate": 2.8192359171657883e-06, + "loss": 0.4295, + "step": 10272 + }, + { + "epoch": 1.31, + "grad_norm": 0.5869432881126402, + "learning_rate": 2.818307667404716e-06, + "loss": 0.4087, + "step": 10273 + }, + { + "epoch": 1.31, + "grad_norm": 0.8465861078488011, + "learning_rate": 2.8173795105113105e-06, + "loss": 0.4923, + "step": 10274 + }, + { + "epoch": 1.31, + "grad_norm": 0.6617769884192014, + "learning_rate": 2.8164514465250887e-06, + "loss": 0.4302, + "step": 10275 + }, + { + "epoch": 1.31, + "grad_norm": 0.7092001774204404, + "learning_rate": 2.8155234754855486e-06, + "loss": 0.5092, + "step": 10276 + }, + { + "epoch": 1.31, + "grad_norm": 0.7022471518079075, + "learning_rate": 2.8145955974321957e-06, + "loss": 0.5295, + "step": 10277 + }, + { + "epoch": 1.31, + "grad_norm": 0.8104323622871926, + "learning_rate": 2.8136678124045234e-06, + "loss": 0.4892, + "step": 10278 + }, + { + "epoch": 1.31, + "grad_norm": 0.6373869737959377, + "learning_rate": 2.8127401204420276e-06, + "loss": 0.4946, + "step": 10279 + }, + { + "epoch": 1.31, + "grad_norm": 0.7387560399374357, + "learning_rate": 2.811812521584193e-06, + "loss": 0.4795, + "step": 10280 + }, + { + "epoch": 1.31, + "grad_norm": 0.5850546118255726, + "learning_rate": 2.8108850158705093e-06, + "loss": 0.5128, + "step": 10281 + }, + { + "epoch": 1.31, + "grad_norm": 0.6728292164626909, + "learning_rate": 2.8099576033404524e-06, + "loss": 0.4763, + "step": 10282 + }, + { + "epoch": 1.31, + "grad_norm": 0.684555096851381, + "learning_rate": 2.8090302840335043e-06, + "loss": 0.4436, + "step": 10283 + }, + { + "epoch": 1.31, + "grad_norm": 0.6571009118436286, + "learning_rate": 2.808103057989134e-06, + "loss": 0.4849, + "step": 10284 + }, + { + "epoch": 1.31, + "grad_norm": 0.7087608217287694, + "learning_rate": 2.8071759252468138e-06, + "loss": 0.5212, + "step": 10285 + }, + { + "epoch": 1.31, + "grad_norm": 0.8403310059335515, + "learning_rate": 2.8062488858460056e-06, + "loss": 0.5864, + "step": 10286 + }, + { + "epoch": 1.31, + "grad_norm": 0.7314393986426435, + "learning_rate": 2.8053219398261715e-06, + "loss": 0.4602, + "step": 10287 + }, + { + "epoch": 1.31, + "grad_norm": 0.7083839069407044, + "learning_rate": 2.8043950872267717e-06, + "loss": 0.4476, + "step": 10288 + }, + { + "epoch": 1.31, + "grad_norm": 0.6700185312252959, + "learning_rate": 2.8034683280872544e-06, + "loss": 0.4606, + "step": 10289 + }, + { + "epoch": 1.31, + "grad_norm": 0.6161550710054887, + "learning_rate": 2.8025416624470737e-06, + "loss": 0.4463, + "step": 10290 + }, + { + "epoch": 1.31, + "grad_norm": 0.5871924184064675, + "learning_rate": 2.8016150903456706e-06, + "loss": 0.4657, + "step": 10291 + }, + { + "epoch": 1.31, + "grad_norm": 0.7160656083847337, + "learning_rate": 2.8006886118224898e-06, + "loss": 0.5152, + "step": 10292 + }, + { + "epoch": 1.31, + "grad_norm": 0.8708033483477031, + "learning_rate": 2.7997622269169643e-06, + "loss": 0.5324, + "step": 10293 + }, + { + "epoch": 1.31, + "grad_norm": 0.8495864015237135, + "learning_rate": 2.7988359356685307e-06, + "loss": 0.5304, + "step": 10294 + }, + { + "epoch": 1.31, + "grad_norm": 0.8758556575715707, + "learning_rate": 2.7979097381166185e-06, + "loss": 0.483, + "step": 10295 + }, + { + "epoch": 1.31, + "grad_norm": 0.6023690437847421, + "learning_rate": 2.7969836343006507e-06, + "loss": 0.4535, + "step": 10296 + }, + { + "epoch": 1.31, + "grad_norm": 0.7341064892883139, + "learning_rate": 2.796057624260051e-06, + "loss": 0.5287, + "step": 10297 + }, + { + "epoch": 1.31, + "grad_norm": 0.6943803505952622, + "learning_rate": 2.7951317080342345e-06, + "loss": 0.4895, + "step": 10298 + }, + { + "epoch": 1.31, + "grad_norm": 0.5712222322452283, + "learning_rate": 2.7942058856626147e-06, + "loss": 0.3958, + "step": 10299 + }, + { + "epoch": 1.31, + "grad_norm": 0.680760703570845, + "learning_rate": 2.7932801571846035e-06, + "loss": 0.5178, + "step": 10300 + }, + { + "epoch": 1.31, + "grad_norm": 0.7785017709815346, + "learning_rate": 2.792354522639603e-06, + "loss": 0.5255, + "step": 10301 + }, + { + "epoch": 1.31, + "grad_norm": 0.8857959021537309, + "learning_rate": 2.791428982067017e-06, + "loss": 0.4876, + "step": 10302 + }, + { + "epoch": 1.31, + "grad_norm": 0.7026159846106256, + "learning_rate": 2.7905035355062403e-06, + "loss": 0.5265, + "step": 10303 + }, + { + "epoch": 1.31, + "grad_norm": 0.6851347398625993, + "learning_rate": 2.7895781829966696e-06, + "loss": 0.5018, + "step": 10304 + }, + { + "epoch": 1.31, + "grad_norm": 0.5777702955082746, + "learning_rate": 2.7886529245776896e-06, + "loss": 0.4829, + "step": 10305 + }, + { + "epoch": 1.31, + "grad_norm": 0.6512647504553011, + "learning_rate": 2.7877277602886908e-06, + "loss": 0.5077, + "step": 10306 + }, + { + "epoch": 1.31, + "grad_norm": 0.8217845487809845, + "learning_rate": 2.786802690169049e-06, + "loss": 0.5281, + "step": 10307 + }, + { + "epoch": 1.31, + "grad_norm": 0.6647988542616354, + "learning_rate": 2.7858777142581474e-06, + "loss": 0.5418, + "step": 10308 + }, + { + "epoch": 1.31, + "grad_norm": 0.8704886978408509, + "learning_rate": 2.7849528325953532e-06, + "loss": 0.5389, + "step": 10309 + }, + { + "epoch": 1.31, + "grad_norm": 0.7667554399995513, + "learning_rate": 2.784028045220041e-06, + "loss": 0.5095, + "step": 10310 + }, + { + "epoch": 1.31, + "grad_norm": 0.5799480689458069, + "learning_rate": 2.78310335217157e-06, + "loss": 0.4272, + "step": 10311 + }, + { + "epoch": 1.31, + "grad_norm": 0.6003253809909085, + "learning_rate": 2.7821787534893086e-06, + "loss": 0.4406, + "step": 10312 + }, + { + "epoch": 1.31, + "grad_norm": 0.6632551367750247, + "learning_rate": 2.781254249212609e-06, + "loss": 0.4135, + "step": 10313 + }, + { + "epoch": 1.31, + "grad_norm": 0.6204520004420718, + "learning_rate": 2.7803298393808275e-06, + "loss": 0.5279, + "step": 10314 + }, + { + "epoch": 1.31, + "grad_norm": 0.718102852184217, + "learning_rate": 2.7794055240333097e-06, + "loss": 0.5138, + "step": 10315 + }, + { + "epoch": 1.31, + "grad_norm": 0.7621876313546597, + "learning_rate": 2.778481303209405e-06, + "loss": 0.4995, + "step": 10316 + }, + { + "epoch": 1.31, + "grad_norm": 0.6308741026634868, + "learning_rate": 2.77755717694845e-06, + "loss": 0.477, + "step": 10317 + }, + { + "epoch": 1.31, + "grad_norm": 0.5793751279011736, + "learning_rate": 2.7766331452897866e-06, + "loss": 0.4422, + "step": 10318 + }, + { + "epoch": 1.31, + "grad_norm": 0.7060617440529559, + "learning_rate": 2.775709208272742e-06, + "loss": 0.4876, + "step": 10319 + }, + { + "epoch": 1.31, + "grad_norm": 0.9337510585974735, + "learning_rate": 2.774785365936652e-06, + "loss": 0.5147, + "step": 10320 + }, + { + "epoch": 1.31, + "grad_norm": 0.5594443465244961, + "learning_rate": 2.7738616183208355e-06, + "loss": 0.4208, + "step": 10321 + }, + { + "epoch": 1.31, + "grad_norm": 0.5780470233219226, + "learning_rate": 2.772937965464619e-06, + "loss": 0.4714, + "step": 10322 + }, + { + "epoch": 1.32, + "grad_norm": 0.5957072811076571, + "learning_rate": 2.772014407407312e-06, + "loss": 0.447, + "step": 10323 + }, + { + "epoch": 1.32, + "grad_norm": 0.6632208242494634, + "learning_rate": 2.771090944188236e-06, + "loss": 0.5158, + "step": 10324 + }, + { + "epoch": 1.32, + "grad_norm": 0.7936907643249319, + "learning_rate": 2.770167575846694e-06, + "loss": 0.5607, + "step": 10325 + }, + { + "epoch": 1.32, + "grad_norm": 2.0528812035680306, + "learning_rate": 2.769244302421995e-06, + "loss": 0.528, + "step": 10326 + }, + { + "epoch": 1.32, + "grad_norm": 0.5810315354662225, + "learning_rate": 2.7683211239534346e-06, + "loss": 0.4219, + "step": 10327 + }, + { + "epoch": 1.32, + "grad_norm": 0.6969253167692392, + "learning_rate": 2.7673980404803156e-06, + "loss": 0.5126, + "step": 10328 + }, + { + "epoch": 1.32, + "grad_norm": 0.7726218100733003, + "learning_rate": 2.766475052041926e-06, + "loss": 0.5371, + "step": 10329 + }, + { + "epoch": 1.32, + "grad_norm": 0.7099240571205144, + "learning_rate": 2.765552158677557e-06, + "loss": 0.4905, + "step": 10330 + }, + { + "epoch": 1.32, + "grad_norm": 0.619004841127661, + "learning_rate": 2.7646293604264908e-06, + "loss": 0.4465, + "step": 10331 + }, + { + "epoch": 1.32, + "grad_norm": 0.6415019938912052, + "learning_rate": 2.763706657328011e-06, + "loss": 0.5054, + "step": 10332 + }, + { + "epoch": 1.32, + "grad_norm": 0.6756060090921461, + "learning_rate": 2.7627840494213914e-06, + "loss": 0.4842, + "step": 10333 + }, + { + "epoch": 1.32, + "grad_norm": 0.6233721176824821, + "learning_rate": 2.761861536745908e-06, + "loss": 0.5191, + "step": 10334 + }, + { + "epoch": 1.32, + "grad_norm": 0.6540391906862236, + "learning_rate": 2.7609391193408243e-06, + "loss": 0.4916, + "step": 10335 + }, + { + "epoch": 1.32, + "grad_norm": 0.5785833943689869, + "learning_rate": 2.760016797245407e-06, + "loss": 0.4602, + "step": 10336 + }, + { + "epoch": 1.32, + "grad_norm": 0.6214001281557133, + "learning_rate": 2.7590945704989168e-06, + "loss": 0.4682, + "step": 10337 + }, + { + "epoch": 1.32, + "grad_norm": 0.7143726147005993, + "learning_rate": 2.758172439140612e-06, + "loss": 0.515, + "step": 10338 + }, + { + "epoch": 1.32, + "grad_norm": 0.6172944417762497, + "learning_rate": 2.7572504032097406e-06, + "loss": 0.4129, + "step": 10339 + }, + { + "epoch": 1.32, + "grad_norm": 0.6894994342783959, + "learning_rate": 2.7563284627455545e-06, + "loss": 0.4387, + "step": 10340 + }, + { + "epoch": 1.32, + "grad_norm": 0.6526816237412422, + "learning_rate": 2.7554066177872948e-06, + "loss": 0.4414, + "step": 10341 + }, + { + "epoch": 1.32, + "grad_norm": 0.6287719420341326, + "learning_rate": 2.754484868374204e-06, + "loss": 0.4557, + "step": 10342 + }, + { + "epoch": 1.32, + "grad_norm": 0.8484841580775698, + "learning_rate": 2.753563214545515e-06, + "loss": 0.4992, + "step": 10343 + }, + { + "epoch": 1.32, + "grad_norm": 0.6082974126964179, + "learning_rate": 2.752641656340463e-06, + "loss": 0.4208, + "step": 10344 + }, + { + "epoch": 1.32, + "grad_norm": 0.594648707224296, + "learning_rate": 2.7517201937982724e-06, + "loss": 0.4778, + "step": 10345 + }, + { + "epoch": 1.32, + "grad_norm": 0.7926787184939269, + "learning_rate": 2.750798826958171e-06, + "loss": 0.4674, + "step": 10346 + }, + { + "epoch": 1.32, + "grad_norm": 0.6785821081690722, + "learning_rate": 2.749877555859373e-06, + "loss": 0.4938, + "step": 10347 + }, + { + "epoch": 1.32, + "grad_norm": 0.720702003143994, + "learning_rate": 2.748956380541098e-06, + "loss": 0.5642, + "step": 10348 + }, + { + "epoch": 1.32, + "grad_norm": 0.7127434240465371, + "learning_rate": 2.7480353010425586e-06, + "loss": 0.5575, + "step": 10349 + }, + { + "epoch": 1.32, + "grad_norm": 0.8407082801634858, + "learning_rate": 2.747114317402958e-06, + "loss": 0.4769, + "step": 10350 + }, + { + "epoch": 1.32, + "grad_norm": 0.7015039866122491, + "learning_rate": 2.7461934296615034e-06, + "loss": 0.4897, + "step": 10351 + }, + { + "epoch": 1.32, + "grad_norm": 0.7008876470538534, + "learning_rate": 2.7452726378573913e-06, + "loss": 0.4816, + "step": 10352 + }, + { + "epoch": 1.32, + "grad_norm": 0.6092498228414746, + "learning_rate": 2.7443519420298192e-06, + "loss": 0.4398, + "step": 10353 + }, + { + "epoch": 1.32, + "grad_norm": 0.6386698963862778, + "learning_rate": 2.743431342217975e-06, + "loss": 0.4287, + "step": 10354 + }, + { + "epoch": 1.32, + "grad_norm": 0.6112131601942666, + "learning_rate": 2.742510838461048e-06, + "loss": 0.4682, + "step": 10355 + }, + { + "epoch": 1.32, + "grad_norm": 0.6570679633884028, + "learning_rate": 2.741590430798222e-06, + "loss": 0.4796, + "step": 10356 + }, + { + "epoch": 1.32, + "grad_norm": 0.569073328216951, + "learning_rate": 2.7406701192686725e-06, + "loss": 0.4368, + "step": 10357 + }, + { + "epoch": 1.32, + "grad_norm": 0.5715520359315882, + "learning_rate": 2.7397499039115786e-06, + "loss": 0.4126, + "step": 10358 + }, + { + "epoch": 1.32, + "grad_norm": 0.7368551643100076, + "learning_rate": 2.7388297847661066e-06, + "loss": 0.5256, + "step": 10359 + }, + { + "epoch": 1.32, + "grad_norm": 0.7238767532920479, + "learning_rate": 2.737909761871425e-06, + "loss": 0.5203, + "step": 10360 + }, + { + "epoch": 1.32, + "grad_norm": 0.5591126185576063, + "learning_rate": 2.736989835266698e-06, + "loss": 0.4331, + "step": 10361 + }, + { + "epoch": 1.32, + "grad_norm": 0.879536262743953, + "learning_rate": 2.7360700049910806e-06, + "loss": 0.5727, + "step": 10362 + }, + { + "epoch": 1.32, + "grad_norm": 0.6834681890321648, + "learning_rate": 2.7351502710837303e-06, + "loss": 0.4925, + "step": 10363 + }, + { + "epoch": 1.32, + "grad_norm": 0.8807061460864501, + "learning_rate": 2.734230633583793e-06, + "loss": 0.5137, + "step": 10364 + }, + { + "epoch": 1.32, + "grad_norm": 0.7100398944654575, + "learning_rate": 2.7333110925304207e-06, + "loss": 0.5044, + "step": 10365 + }, + { + "epoch": 1.32, + "grad_norm": 0.8074920784042978, + "learning_rate": 2.732391647962749e-06, + "loss": 0.548, + "step": 10366 + }, + { + "epoch": 1.32, + "grad_norm": 0.772978060469139, + "learning_rate": 2.7314722999199206e-06, + "loss": 0.51, + "step": 10367 + }, + { + "epoch": 1.32, + "grad_norm": 0.6521215880523424, + "learning_rate": 2.7305530484410646e-06, + "loss": 0.5046, + "step": 10368 + }, + { + "epoch": 1.32, + "grad_norm": 0.6954414989450015, + "learning_rate": 2.7296338935653167e-06, + "loss": 0.4769, + "step": 10369 + }, + { + "epoch": 1.32, + "grad_norm": 0.8740772687049388, + "learning_rate": 2.728714835331796e-06, + "loss": 0.5171, + "step": 10370 + }, + { + "epoch": 1.32, + "grad_norm": 0.7581098288126822, + "learning_rate": 2.7277958737796283e-06, + "loss": 0.5144, + "step": 10371 + }, + { + "epoch": 1.32, + "grad_norm": 0.8589557568648777, + "learning_rate": 2.726877008947928e-06, + "loss": 0.5091, + "step": 10372 + }, + { + "epoch": 1.32, + "grad_norm": 0.6110068588269921, + "learning_rate": 2.7259582408758096e-06, + "loss": 0.4446, + "step": 10373 + }, + { + "epoch": 1.32, + "grad_norm": 0.7147663911311242, + "learning_rate": 2.725039569602382e-06, + "loss": 0.4835, + "step": 10374 + }, + { + "epoch": 1.32, + "grad_norm": 0.7413469584473779, + "learning_rate": 2.724120995166752e-06, + "loss": 0.544, + "step": 10375 + }, + { + "epoch": 1.32, + "grad_norm": 0.8872715134898821, + "learning_rate": 2.723202517608017e-06, + "loss": 0.5739, + "step": 10376 + }, + { + "epoch": 1.32, + "grad_norm": 0.6907243464323122, + "learning_rate": 2.7222841369652764e-06, + "loss": 0.4808, + "step": 10377 + }, + { + "epoch": 1.32, + "grad_norm": 0.5595479299385279, + "learning_rate": 2.7213658532776197e-06, + "loss": 0.4381, + "step": 10378 + }, + { + "epoch": 1.32, + "grad_norm": 0.6354036481681429, + "learning_rate": 2.7204476665841395e-06, + "loss": 0.4569, + "step": 10379 + }, + { + "epoch": 1.32, + "grad_norm": 0.6061294230134897, + "learning_rate": 2.719529576923916e-06, + "loss": 0.4636, + "step": 10380 + }, + { + "epoch": 1.32, + "grad_norm": 0.6723032050636789, + "learning_rate": 2.7186115843360326e-06, + "loss": 0.4954, + "step": 10381 + }, + { + "epoch": 1.32, + "grad_norm": 0.6208787356448009, + "learning_rate": 2.7176936888595625e-06, + "loss": 0.488, + "step": 10382 + }, + { + "epoch": 1.32, + "grad_norm": 0.6094514859095329, + "learning_rate": 2.71677589053358e-06, + "loss": 0.424, + "step": 10383 + }, + { + "epoch": 1.32, + "grad_norm": 0.7362013402079931, + "learning_rate": 2.7158581893971504e-06, + "loss": 0.4554, + "step": 10384 + }, + { + "epoch": 1.32, + "grad_norm": 0.7158430907069553, + "learning_rate": 2.714940585489338e-06, + "loss": 0.541, + "step": 10385 + }, + { + "epoch": 1.32, + "grad_norm": 0.7508972989594184, + "learning_rate": 2.714023078849203e-06, + "loss": 0.5434, + "step": 10386 + }, + { + "epoch": 1.32, + "grad_norm": 0.6837608999964359, + "learning_rate": 2.7131056695158033e-06, + "loss": 0.493, + "step": 10387 + }, + { + "epoch": 1.32, + "grad_norm": 0.8794385664082642, + "learning_rate": 2.7121883575281844e-06, + "loss": 0.5156, + "step": 10388 + }, + { + "epoch": 1.32, + "grad_norm": 0.8270004047901359, + "learning_rate": 2.7112711429253987e-06, + "loss": 0.5802, + "step": 10389 + }, + { + "epoch": 1.32, + "grad_norm": 0.852907495806386, + "learning_rate": 2.7103540257464845e-06, + "loss": 0.519, + "step": 10390 + }, + { + "epoch": 1.32, + "grad_norm": 0.6275563166804927, + "learning_rate": 2.709437006030485e-06, + "loss": 0.5077, + "step": 10391 + }, + { + "epoch": 1.32, + "grad_norm": 0.7216492105876493, + "learning_rate": 2.7085200838164306e-06, + "loss": 0.5058, + "step": 10392 + }, + { + "epoch": 1.32, + "grad_norm": 0.7367917113004402, + "learning_rate": 2.7076032591433554e-06, + "loss": 0.5284, + "step": 10393 + }, + { + "epoch": 1.32, + "grad_norm": 1.3768037595849074, + "learning_rate": 2.706686532050282e-06, + "loss": 0.527, + "step": 10394 + }, + { + "epoch": 1.32, + "grad_norm": 0.7261575510917068, + "learning_rate": 2.7057699025762372e-06, + "loss": 0.5497, + "step": 10395 + }, + { + "epoch": 1.32, + "grad_norm": 0.7461490466559105, + "learning_rate": 2.7048533707602343e-06, + "loss": 0.5054, + "step": 10396 + }, + { + "epoch": 1.32, + "grad_norm": 0.5935209370164812, + "learning_rate": 2.7039369366412906e-06, + "loss": 0.3943, + "step": 10397 + }, + { + "epoch": 1.32, + "grad_norm": 0.5945092786602868, + "learning_rate": 2.7030206002584102e-06, + "loss": 0.4446, + "step": 10398 + }, + { + "epoch": 1.32, + "grad_norm": 1.3263798578125456, + "learning_rate": 2.7021043616506073e-06, + "loss": 0.5199, + "step": 10399 + }, + { + "epoch": 1.32, + "grad_norm": 0.6861515727606337, + "learning_rate": 2.7011882208568764e-06, + "loss": 0.4987, + "step": 10400 + }, + { + "epoch": 1.33, + "grad_norm": 0.7205811719774767, + "learning_rate": 2.7002721779162194e-06, + "loss": 0.5265, + "step": 10401 + }, + { + "epoch": 1.33, + "grad_norm": 0.7385668990841435, + "learning_rate": 2.6993562328676247e-06, + "loss": 0.5232, + "step": 10402 + }, + { + "epoch": 1.33, + "grad_norm": 0.73489183386022, + "learning_rate": 2.698440385750085e-06, + "loss": 0.4837, + "step": 10403 + }, + { + "epoch": 1.33, + "grad_norm": 0.595810960905717, + "learning_rate": 2.6975246366025822e-06, + "loss": 0.4371, + "step": 10404 + }, + { + "epoch": 1.33, + "grad_norm": 0.60407533029049, + "learning_rate": 2.6966089854640987e-06, + "loss": 0.4683, + "step": 10405 + }, + { + "epoch": 1.33, + "grad_norm": 0.6359714362799845, + "learning_rate": 2.695693432373609e-06, + "loss": 0.42, + "step": 10406 + }, + { + "epoch": 1.33, + "grad_norm": 0.6396203805091751, + "learning_rate": 2.694777977370088e-06, + "loss": 0.4969, + "step": 10407 + }, + { + "epoch": 1.33, + "grad_norm": 0.7495227349500316, + "learning_rate": 2.6938626204925e-06, + "loss": 0.5232, + "step": 10408 + }, + { + "epoch": 1.33, + "grad_norm": 0.6957898811146143, + "learning_rate": 2.692947361779813e-06, + "loss": 0.4519, + "step": 10409 + }, + { + "epoch": 1.33, + "grad_norm": 0.6135722733042351, + "learning_rate": 2.6920322012709832e-06, + "loss": 0.5092, + "step": 10410 + }, + { + "epoch": 1.33, + "grad_norm": 0.8106406537348229, + "learning_rate": 2.691117139004966e-06, + "loss": 0.5211, + "step": 10411 + }, + { + "epoch": 1.33, + "grad_norm": 0.7911049720073707, + "learning_rate": 2.6902021750207176e-06, + "loss": 0.5253, + "step": 10412 + }, + { + "epoch": 1.33, + "grad_norm": 0.590970404960969, + "learning_rate": 2.689287309357179e-06, + "loss": 0.4332, + "step": 10413 + }, + { + "epoch": 1.33, + "grad_norm": 0.6897002819401162, + "learning_rate": 2.688372542053297e-06, + "loss": 0.4909, + "step": 10414 + }, + { + "epoch": 1.33, + "grad_norm": 0.6949444258759422, + "learning_rate": 2.6874578731480072e-06, + "loss": 0.4939, + "step": 10415 + }, + { + "epoch": 1.33, + "grad_norm": 0.5653657736103492, + "learning_rate": 2.6865433026802485e-06, + "loss": 0.4331, + "step": 10416 + }, + { + "epoch": 1.33, + "grad_norm": 0.6137579517920647, + "learning_rate": 2.6856288306889465e-06, + "loss": 0.4235, + "step": 10417 + }, + { + "epoch": 1.33, + "grad_norm": 0.5642045045712887, + "learning_rate": 2.684714457213029e-06, + "loss": 0.4155, + "step": 10418 + }, + { + "epoch": 1.33, + "grad_norm": 0.6865290335860689, + "learning_rate": 2.6838001822914205e-06, + "loss": 0.4674, + "step": 10419 + }, + { + "epoch": 1.33, + "grad_norm": 0.6881986360685356, + "learning_rate": 2.6828860059630355e-06, + "loss": 0.4622, + "step": 10420 + }, + { + "epoch": 1.33, + "grad_norm": 0.6616896634586286, + "learning_rate": 2.68197192826679e-06, + "loss": 0.4406, + "step": 10421 + }, + { + "epoch": 1.33, + "grad_norm": 0.6154911590713704, + "learning_rate": 2.6810579492415906e-06, + "loss": 0.4032, + "step": 10422 + }, + { + "epoch": 1.33, + "grad_norm": 0.6174447590466567, + "learning_rate": 2.6801440689263438e-06, + "loss": 0.4651, + "step": 10423 + }, + { + "epoch": 1.33, + "grad_norm": 0.6742644993561953, + "learning_rate": 2.679230287359953e-06, + "loss": 0.4749, + "step": 10424 + }, + { + "epoch": 1.33, + "grad_norm": 0.6392911119217446, + "learning_rate": 2.67831660458131e-06, + "loss": 0.4367, + "step": 10425 + }, + { + "epoch": 1.33, + "grad_norm": 0.6761821235044461, + "learning_rate": 2.6774030206293132e-06, + "loss": 0.4819, + "step": 10426 + }, + { + "epoch": 1.33, + "grad_norm": 0.5601652227907855, + "learning_rate": 2.6764895355428456e-06, + "loss": 0.4231, + "step": 10427 + }, + { + "epoch": 1.33, + "grad_norm": 1.1323072996383916, + "learning_rate": 2.675576149360795e-06, + "loss": 0.498, + "step": 10428 + }, + { + "epoch": 1.33, + "grad_norm": 0.6688631409101287, + "learning_rate": 2.674662862122038e-06, + "loss": 0.4573, + "step": 10429 + }, + { + "epoch": 1.33, + "grad_norm": 0.5925262351078397, + "learning_rate": 2.673749673865455e-06, + "loss": 0.4537, + "step": 10430 + }, + { + "epoch": 1.33, + "grad_norm": 0.6078301519512345, + "learning_rate": 2.672836584629912e-06, + "loss": 0.4297, + "step": 10431 + }, + { + "epoch": 1.33, + "grad_norm": 0.6043560196132154, + "learning_rate": 2.6719235944542816e-06, + "loss": 0.4425, + "step": 10432 + }, + { + "epoch": 1.33, + "grad_norm": 0.6145893193107074, + "learning_rate": 2.6710107033774225e-06, + "loss": 0.4801, + "step": 10433 + }, + { + "epoch": 1.33, + "grad_norm": 0.7046347501259793, + "learning_rate": 2.6700979114381966e-06, + "loss": 0.5022, + "step": 10434 + }, + { + "epoch": 1.33, + "grad_norm": 0.955439941567224, + "learning_rate": 2.6691852186754535e-06, + "loss": 0.5311, + "step": 10435 + }, + { + "epoch": 1.33, + "grad_norm": 0.7776049542735206, + "learning_rate": 2.6682726251280515e-06, + "loss": 0.5564, + "step": 10436 + }, + { + "epoch": 1.33, + "grad_norm": 0.6946366134677137, + "learning_rate": 2.6673601308348307e-06, + "loss": 0.5111, + "step": 10437 + }, + { + "epoch": 1.33, + "grad_norm": 0.6056262640099894, + "learning_rate": 2.6664477358346375e-06, + "loss": 0.4693, + "step": 10438 + }, + { + "epoch": 1.33, + "grad_norm": 0.702928538440643, + "learning_rate": 2.6655354401663047e-06, + "loss": 0.464, + "step": 10439 + }, + { + "epoch": 1.33, + "grad_norm": 0.7428536543476963, + "learning_rate": 2.6646232438686704e-06, + "loss": 0.5165, + "step": 10440 + }, + { + "epoch": 1.33, + "grad_norm": 0.7889657852679707, + "learning_rate": 2.66371114698056e-06, + "loss": 0.5534, + "step": 10441 + }, + { + "epoch": 1.33, + "grad_norm": 0.6419378921457135, + "learning_rate": 2.6627991495408025e-06, + "loss": 0.4272, + "step": 10442 + }, + { + "epoch": 1.33, + "grad_norm": 0.5456671136530137, + "learning_rate": 2.6618872515882143e-06, + "loss": 0.4286, + "step": 10443 + }, + { + "epoch": 1.33, + "grad_norm": 0.6160738899883795, + "learning_rate": 2.6609754531616162e-06, + "loss": 0.4285, + "step": 10444 + }, + { + "epoch": 1.33, + "grad_norm": 0.6922031132322712, + "learning_rate": 2.6600637542998164e-06, + "loss": 0.4647, + "step": 10445 + }, + { + "epoch": 1.33, + "grad_norm": 0.6074556864594911, + "learning_rate": 2.6591521550416275e-06, + "loss": 0.4475, + "step": 10446 + }, + { + "epoch": 1.33, + "grad_norm": 0.5960269225081608, + "learning_rate": 2.658240655425847e-06, + "loss": 0.4518, + "step": 10447 + }, + { + "epoch": 1.33, + "grad_norm": 0.6564091672395928, + "learning_rate": 2.6573292554912832e-06, + "loss": 0.486, + "step": 10448 + }, + { + "epoch": 1.33, + "grad_norm": 0.7316632297416248, + "learning_rate": 2.6564179552767245e-06, + "loss": 0.4529, + "step": 10449 + }, + { + "epoch": 1.33, + "grad_norm": 0.6384750104167783, + "learning_rate": 2.6555067548209655e-06, + "loss": 0.4736, + "step": 10450 + }, + { + "epoch": 1.33, + "grad_norm": 0.9465589381693489, + "learning_rate": 2.6545956541627913e-06, + "loss": 0.5254, + "step": 10451 + }, + { + "epoch": 1.33, + "grad_norm": 0.7444444643752165, + "learning_rate": 2.6536846533409865e-06, + "loss": 0.4599, + "step": 10452 + }, + { + "epoch": 1.33, + "grad_norm": 0.540997955708366, + "learning_rate": 2.6527737523943264e-06, + "loss": 0.3976, + "step": 10453 + }, + { + "epoch": 1.33, + "grad_norm": 0.6451066744950943, + "learning_rate": 2.6518629513615894e-06, + "loss": 0.4628, + "step": 10454 + }, + { + "epoch": 1.33, + "grad_norm": 0.7762395218882295, + "learning_rate": 2.650952250281541e-06, + "loss": 0.4863, + "step": 10455 + }, + { + "epoch": 1.33, + "grad_norm": 0.6056665589043336, + "learning_rate": 2.6500416491929505e-06, + "loss": 0.4591, + "step": 10456 + }, + { + "epoch": 1.33, + "grad_norm": 0.9170622477340048, + "learning_rate": 2.6491311481345763e-06, + "loss": 0.5517, + "step": 10457 + }, + { + "epoch": 1.33, + "grad_norm": 0.5882766029963012, + "learning_rate": 2.6482207471451775e-06, + "loss": 0.4447, + "step": 10458 + }, + { + "epoch": 1.33, + "grad_norm": 0.790139805967078, + "learning_rate": 2.6473104462635048e-06, + "loss": 0.468, + "step": 10459 + }, + { + "epoch": 1.33, + "grad_norm": 0.5435428189331143, + "learning_rate": 2.646400245528309e-06, + "loss": 0.4369, + "step": 10460 + }, + { + "epoch": 1.33, + "grad_norm": 0.6846943788527304, + "learning_rate": 2.6454901449783334e-06, + "loss": 0.4924, + "step": 10461 + }, + { + "epoch": 1.33, + "grad_norm": 0.9569522481544581, + "learning_rate": 2.64458014465232e-06, + "loss": 0.545, + "step": 10462 + }, + { + "epoch": 1.33, + "grad_norm": 0.804079115785682, + "learning_rate": 2.6436702445890007e-06, + "loss": 0.5081, + "step": 10463 + }, + { + "epoch": 1.33, + "grad_norm": 0.5884263214599849, + "learning_rate": 2.6427604448271116e-06, + "loss": 0.4055, + "step": 10464 + }, + { + "epoch": 1.33, + "grad_norm": 0.5927593392228628, + "learning_rate": 2.6418507454053764e-06, + "loss": 0.4662, + "step": 10465 + }, + { + "epoch": 1.33, + "grad_norm": 0.7954120688325812, + "learning_rate": 2.6409411463625207e-06, + "loss": 0.4887, + "step": 10466 + }, + { + "epoch": 1.33, + "grad_norm": 0.7019531549203019, + "learning_rate": 2.64003164773726e-06, + "loss": 0.4677, + "step": 10467 + }, + { + "epoch": 1.33, + "grad_norm": 0.7026126361040785, + "learning_rate": 2.6391222495683134e-06, + "loss": 0.4784, + "step": 10468 + }, + { + "epoch": 1.33, + "grad_norm": 0.8387632757186848, + "learning_rate": 2.6382129518943856e-06, + "loss": 0.4801, + "step": 10469 + }, + { + "epoch": 1.33, + "grad_norm": 0.8084044790296943, + "learning_rate": 2.6373037547541867e-06, + "loss": 0.5229, + "step": 10470 + }, + { + "epoch": 1.33, + "grad_norm": 0.6055974131143065, + "learning_rate": 2.6363946581864156e-06, + "loss": 0.4626, + "step": 10471 + }, + { + "epoch": 1.33, + "grad_norm": 0.7762961284619238, + "learning_rate": 2.6354856622297707e-06, + "loss": 0.5326, + "step": 10472 + }, + { + "epoch": 1.33, + "grad_norm": 0.6834070519293799, + "learning_rate": 2.6345767669229482e-06, + "loss": 0.4981, + "step": 10473 + }, + { + "epoch": 1.33, + "grad_norm": 0.653968676520899, + "learning_rate": 2.633667972304631e-06, + "loss": 0.435, + "step": 10474 + }, + { + "epoch": 1.33, + "grad_norm": 0.6717962390423486, + "learning_rate": 2.6327592784135082e-06, + "loss": 0.4299, + "step": 10475 + }, + { + "epoch": 1.33, + "grad_norm": 0.699017160924299, + "learning_rate": 2.631850685288257e-06, + "loss": 0.4893, + "step": 10476 + }, + { + "epoch": 1.33, + "grad_norm": 0.6701807020173425, + "learning_rate": 2.6309421929675562e-06, + "loss": 0.4929, + "step": 10477 + }, + { + "epoch": 1.33, + "grad_norm": 0.6229388228504292, + "learning_rate": 2.630033801490074e-06, + "loss": 0.4395, + "step": 10478 + }, + { + "epoch": 1.33, + "grad_norm": 0.5556153721486646, + "learning_rate": 2.6291255108944794e-06, + "loss": 0.4388, + "step": 10479 + }, + { + "epoch": 1.34, + "grad_norm": 0.8806362729927469, + "learning_rate": 2.628217321219438e-06, + "loss": 0.5149, + "step": 10480 + }, + { + "epoch": 1.34, + "grad_norm": 0.748900621539695, + "learning_rate": 2.6273092325036037e-06, + "loss": 0.5093, + "step": 10481 + }, + { + "epoch": 1.34, + "grad_norm": 0.7987917057961558, + "learning_rate": 2.6264012447856356e-06, + "loss": 0.5104, + "step": 10482 + }, + { + "epoch": 1.34, + "grad_norm": 0.5360859916938244, + "learning_rate": 2.625493358104179e-06, + "loss": 0.4201, + "step": 10483 + }, + { + "epoch": 1.34, + "grad_norm": 0.698113049700735, + "learning_rate": 2.624585572497883e-06, + "loss": 0.4428, + "step": 10484 + }, + { + "epoch": 1.34, + "grad_norm": 0.6595825637544322, + "learning_rate": 2.6236778880053903e-06, + "loss": 0.4381, + "step": 10485 + }, + { + "epoch": 1.34, + "grad_norm": 0.6149852131666086, + "learning_rate": 2.622770304665334e-06, + "loss": 0.5021, + "step": 10486 + }, + { + "epoch": 1.34, + "grad_norm": 0.6687978346064894, + "learning_rate": 2.621862822516351e-06, + "loss": 0.4944, + "step": 10487 + }, + { + "epoch": 1.34, + "grad_norm": 0.5596785788185218, + "learning_rate": 2.6209554415970668e-06, + "loss": 0.4276, + "step": 10488 + }, + { + "epoch": 1.34, + "grad_norm": 0.6159920787070838, + "learning_rate": 2.6200481619461087e-06, + "loss": 0.5368, + "step": 10489 + }, + { + "epoch": 1.34, + "grad_norm": 0.803250683156127, + "learning_rate": 2.619140983602093e-06, + "loss": 0.5126, + "step": 10490 + }, + { + "epoch": 1.34, + "grad_norm": 0.7072484586432487, + "learning_rate": 2.61823390660364e-06, + "loss": 0.5449, + "step": 10491 + }, + { + "epoch": 1.34, + "grad_norm": 0.7122089858160523, + "learning_rate": 2.617326930989357e-06, + "loss": 0.5384, + "step": 10492 + }, + { + "epoch": 1.34, + "grad_norm": 0.764709926973671, + "learning_rate": 2.6164200567978538e-06, + "loss": 0.5251, + "step": 10493 + }, + { + "epoch": 1.34, + "grad_norm": 0.7965000517741182, + "learning_rate": 2.61551328406773e-06, + "loss": 0.5426, + "step": 10494 + }, + { + "epoch": 1.34, + "grad_norm": 0.5722800226789765, + "learning_rate": 2.614606612837588e-06, + "loss": 0.4667, + "step": 10495 + }, + { + "epoch": 1.34, + "grad_norm": 0.6066833520036666, + "learning_rate": 2.613700043146018e-06, + "loss": 0.5074, + "step": 10496 + }, + { + "epoch": 1.34, + "grad_norm": 0.540690502361817, + "learning_rate": 2.612793575031611e-06, + "loss": 0.4146, + "step": 10497 + }, + { + "epoch": 1.34, + "grad_norm": 0.5691137692350555, + "learning_rate": 2.6118872085329545e-06, + "loss": 0.4446, + "step": 10498 + }, + { + "epoch": 1.34, + "grad_norm": 0.660356675828652, + "learning_rate": 2.610980943688629e-06, + "loss": 0.4777, + "step": 10499 + }, + { + "epoch": 1.34, + "grad_norm": 0.8759596562439051, + "learning_rate": 2.6100747805372097e-06, + "loss": 0.5395, + "step": 10500 + }, + { + "epoch": 1.34, + "grad_norm": 0.5968988089047996, + "learning_rate": 2.609168719117271e-06, + "loss": 0.526, + "step": 10501 + }, + { + "epoch": 1.34, + "grad_norm": 0.9312721545499884, + "learning_rate": 2.6082627594673777e-06, + "loss": 0.542, + "step": 10502 + }, + { + "epoch": 1.34, + "grad_norm": 0.5907621184336649, + "learning_rate": 2.607356901626098e-06, + "loss": 0.4629, + "step": 10503 + }, + { + "epoch": 1.34, + "grad_norm": 0.7301453061227231, + "learning_rate": 2.6064511456319874e-06, + "loss": 0.4549, + "step": 10504 + }, + { + "epoch": 1.34, + "grad_norm": 0.6367422057780036, + "learning_rate": 2.605545491523605e-06, + "loss": 0.4091, + "step": 10505 + }, + { + "epoch": 1.34, + "grad_norm": 0.6060024480535015, + "learning_rate": 2.6046399393394973e-06, + "loss": 0.4459, + "step": 10506 + }, + { + "epoch": 1.34, + "grad_norm": 0.603006322669303, + "learning_rate": 2.6037344891182147e-06, + "loss": 0.4441, + "step": 10507 + }, + { + "epoch": 1.34, + "grad_norm": 0.7097251674959564, + "learning_rate": 2.602829140898296e-06, + "loss": 0.4737, + "step": 10508 + }, + { + "epoch": 1.34, + "grad_norm": 0.6045866969731865, + "learning_rate": 2.6019238947182802e-06, + "loss": 0.5126, + "step": 10509 + }, + { + "epoch": 1.34, + "grad_norm": 0.6806143131924589, + "learning_rate": 2.6010187506167008e-06, + "loss": 0.5375, + "step": 10510 + }, + { + "epoch": 1.34, + "grad_norm": 0.735141985519335, + "learning_rate": 2.6001137086320895e-06, + "loss": 0.5032, + "step": 10511 + }, + { + "epoch": 1.34, + "grad_norm": 0.6999838503513939, + "learning_rate": 2.5992087688029666e-06, + "loss": 0.5049, + "step": 10512 + }, + { + "epoch": 1.34, + "grad_norm": 0.6874803323596783, + "learning_rate": 2.5983039311678567e-06, + "loss": 0.4751, + "step": 10513 + }, + { + "epoch": 1.34, + "grad_norm": 0.5926923896376749, + "learning_rate": 2.597399195765271e-06, + "loss": 0.4414, + "step": 10514 + }, + { + "epoch": 1.34, + "grad_norm": 0.5425670683386655, + "learning_rate": 2.5964945626337268e-06, + "loss": 0.4205, + "step": 10515 + }, + { + "epoch": 1.34, + "grad_norm": 0.6970937596512157, + "learning_rate": 2.5955900318117266e-06, + "loss": 0.4235, + "step": 10516 + }, + { + "epoch": 1.34, + "grad_norm": 0.6205585218731938, + "learning_rate": 2.5946856033377767e-06, + "loss": 0.4693, + "step": 10517 + }, + { + "epoch": 1.34, + "grad_norm": 0.6938979631909559, + "learning_rate": 2.593781277250372e-06, + "loss": 0.4685, + "step": 10518 + }, + { + "epoch": 1.34, + "grad_norm": 0.7968496125303536, + "learning_rate": 2.5928770535880122e-06, + "loss": 0.5244, + "step": 10519 + }, + { + "epoch": 1.34, + "grad_norm": 0.7664858655971052, + "learning_rate": 2.591972932389182e-06, + "loss": 0.5162, + "step": 10520 + }, + { + "epoch": 1.34, + "grad_norm": 0.6411154348649356, + "learning_rate": 2.591068913692371e-06, + "loss": 0.4921, + "step": 10521 + }, + { + "epoch": 1.34, + "grad_norm": 0.8168666097855334, + "learning_rate": 2.5901649975360543e-06, + "loss": 0.5143, + "step": 10522 + }, + { + "epoch": 1.34, + "grad_norm": 0.7723265278626535, + "learning_rate": 2.5892611839587175e-06, + "loss": 0.5433, + "step": 10523 + }, + { + "epoch": 1.34, + "grad_norm": 0.8534329527893085, + "learning_rate": 2.588357472998826e-06, + "loss": 0.4499, + "step": 10524 + }, + { + "epoch": 1.34, + "grad_norm": 0.8478258952328214, + "learning_rate": 2.587453864694852e-06, + "loss": 0.5397, + "step": 10525 + }, + { + "epoch": 1.34, + "grad_norm": 0.6214870427952506, + "learning_rate": 2.586550359085256e-06, + "loss": 0.4357, + "step": 10526 + }, + { + "epoch": 1.34, + "grad_norm": 0.6422511873552887, + "learning_rate": 2.5856469562085013e-06, + "loss": 0.4906, + "step": 10527 + }, + { + "epoch": 1.34, + "grad_norm": 0.7478310730735366, + "learning_rate": 2.584743656103038e-06, + "loss": 0.4967, + "step": 10528 + }, + { + "epoch": 1.34, + "grad_norm": 0.6021550539295655, + "learning_rate": 2.5838404588073212e-06, + "loss": 0.4701, + "step": 10529 + }, + { + "epoch": 1.34, + "grad_norm": 0.5786767914169088, + "learning_rate": 2.5829373643597932e-06, + "loss": 0.4634, + "step": 10530 + }, + { + "epoch": 1.34, + "grad_norm": 0.5903349405229537, + "learning_rate": 2.5820343727989e-06, + "loss": 0.4783, + "step": 10531 + }, + { + "epoch": 1.34, + "grad_norm": 0.7419883312138332, + "learning_rate": 2.5811314841630746e-06, + "loss": 0.5394, + "step": 10532 + }, + { + "epoch": 1.34, + "grad_norm": 0.642325443778341, + "learning_rate": 2.5802286984907544e-06, + "loss": 0.4568, + "step": 10533 + }, + { + "epoch": 1.34, + "grad_norm": 0.6013737178069675, + "learning_rate": 2.5793260158203637e-06, + "loss": 0.4121, + "step": 10534 + }, + { + "epoch": 1.34, + "grad_norm": 0.6248434686109372, + "learning_rate": 2.57842343619033e-06, + "loss": 0.47, + "step": 10535 + }, + { + "epoch": 1.34, + "grad_norm": 0.6827723721043608, + "learning_rate": 2.577520959639074e-06, + "loss": 0.5715, + "step": 10536 + }, + { + "epoch": 1.34, + "grad_norm": 0.7917541185057193, + "learning_rate": 2.576618586205007e-06, + "loss": 0.5129, + "step": 10537 + }, + { + "epoch": 1.34, + "grad_norm": 0.6355583024877731, + "learning_rate": 2.5757163159265454e-06, + "loss": 0.4837, + "step": 10538 + }, + { + "epoch": 1.34, + "grad_norm": 0.6567955558681297, + "learning_rate": 2.574814148842091e-06, + "loss": 0.4935, + "step": 10539 + }, + { + "epoch": 1.34, + "grad_norm": 0.8557123190373086, + "learning_rate": 2.5739120849900488e-06, + "loss": 0.4954, + "step": 10540 + }, + { + "epoch": 1.34, + "grad_norm": 0.7187077331673724, + "learning_rate": 2.573010124408818e-06, + "loss": 0.4767, + "step": 10541 + }, + { + "epoch": 1.34, + "grad_norm": 0.7783758985801082, + "learning_rate": 2.572108267136789e-06, + "loss": 0.5312, + "step": 10542 + }, + { + "epoch": 1.34, + "grad_norm": 0.7522295070272268, + "learning_rate": 2.5712065132123544e-06, + "loss": 0.5364, + "step": 10543 + }, + { + "epoch": 1.34, + "grad_norm": 0.6587259348762554, + "learning_rate": 2.570304862673896e-06, + "loss": 0.4672, + "step": 10544 + }, + { + "epoch": 1.34, + "grad_norm": 0.8102807501813335, + "learning_rate": 2.5694033155597962e-06, + "loss": 0.4768, + "step": 10545 + }, + { + "epoch": 1.34, + "grad_norm": 0.7204510924799229, + "learning_rate": 2.5685018719084286e-06, + "loss": 0.5349, + "step": 10546 + }, + { + "epoch": 1.34, + "grad_norm": 0.6724871736736747, + "learning_rate": 2.567600531758167e-06, + "loss": 0.4485, + "step": 10547 + }, + { + "epoch": 1.34, + "grad_norm": 0.8287670774381877, + "learning_rate": 2.566699295147379e-06, + "loss": 0.5701, + "step": 10548 + }, + { + "epoch": 1.34, + "grad_norm": 0.7504033198606173, + "learning_rate": 2.5657981621144252e-06, + "loss": 0.5133, + "step": 10549 + }, + { + "epoch": 1.34, + "grad_norm": 0.6510095384940816, + "learning_rate": 2.5648971326976667e-06, + "loss": 0.4398, + "step": 10550 + }, + { + "epoch": 1.34, + "grad_norm": 0.7776999421851082, + "learning_rate": 2.5639962069354534e-06, + "loss": 0.5608, + "step": 10551 + }, + { + "epoch": 1.34, + "grad_norm": 0.8928014711124543, + "learning_rate": 2.56309538486614e-06, + "loss": 0.5259, + "step": 10552 + }, + { + "epoch": 1.34, + "grad_norm": 0.6441973623516792, + "learning_rate": 2.5621946665280663e-06, + "loss": 0.5076, + "step": 10553 + }, + { + "epoch": 1.34, + "grad_norm": 0.7301758034332348, + "learning_rate": 2.5612940519595774e-06, + "loss": 0.4537, + "step": 10554 + }, + { + "epoch": 1.34, + "grad_norm": 0.6560502718525917, + "learning_rate": 2.5603935411990056e-06, + "loss": 0.4863, + "step": 10555 + }, + { + "epoch": 1.34, + "grad_norm": 1.0806354582198998, + "learning_rate": 2.5594931342846874e-06, + "loss": 0.5566, + "step": 10556 + }, + { + "epoch": 1.34, + "grad_norm": 0.9032036502313324, + "learning_rate": 2.558592831254946e-06, + "loss": 0.5064, + "step": 10557 + }, + { + "epoch": 1.35, + "grad_norm": 0.6583914367167848, + "learning_rate": 2.557692632148107e-06, + "loss": 0.4844, + "step": 10558 + }, + { + "epoch": 1.35, + "grad_norm": 0.55842124426622, + "learning_rate": 2.556792537002485e-06, + "loss": 0.4443, + "step": 10559 + }, + { + "epoch": 1.35, + "grad_norm": 0.7925442216745742, + "learning_rate": 2.555892545856401e-06, + "loss": 0.4937, + "step": 10560 + }, + { + "epoch": 1.35, + "grad_norm": 0.6272378172920655, + "learning_rate": 2.5549926587481596e-06, + "loss": 0.4897, + "step": 10561 + }, + { + "epoch": 1.35, + "grad_norm": 0.5655246931149864, + "learning_rate": 2.554092875716069e-06, + "loss": 0.4677, + "step": 10562 + }, + { + "epoch": 1.35, + "grad_norm": 0.683200082924845, + "learning_rate": 2.553193196798427e-06, + "loss": 0.5002, + "step": 10563 + }, + { + "epoch": 1.35, + "grad_norm": 0.6882389962346854, + "learning_rate": 2.5522936220335337e-06, + "loss": 0.4934, + "step": 10564 + }, + { + "epoch": 1.35, + "grad_norm": 0.7629533289317983, + "learning_rate": 2.5513941514596766e-06, + "loss": 0.5429, + "step": 10565 + }, + { + "epoch": 1.35, + "grad_norm": 0.7484197632077519, + "learning_rate": 2.5504947851151486e-06, + "loss": 0.5382, + "step": 10566 + }, + { + "epoch": 1.35, + "grad_norm": 0.7874615109126454, + "learning_rate": 2.5495955230382275e-06, + "loss": 0.5133, + "step": 10567 + }, + { + "epoch": 1.35, + "grad_norm": 0.6060312838473805, + "learning_rate": 2.548696365267197e-06, + "loss": 0.4946, + "step": 10568 + }, + { + "epoch": 1.35, + "grad_norm": 0.6841542014755738, + "learning_rate": 2.547797311840327e-06, + "loss": 0.5041, + "step": 10569 + }, + { + "epoch": 1.35, + "grad_norm": 0.635146392666409, + "learning_rate": 2.546898362795891e-06, + "loss": 0.4888, + "step": 10570 + }, + { + "epoch": 1.35, + "grad_norm": 0.5969763884589779, + "learning_rate": 2.5459995181721493e-06, + "loss": 0.4823, + "step": 10571 + }, + { + "epoch": 1.35, + "grad_norm": 0.6279444187968927, + "learning_rate": 2.54510077800737e-06, + "loss": 0.4376, + "step": 10572 + }, + { + "epoch": 1.35, + "grad_norm": 0.7474066497095003, + "learning_rate": 2.5442021423398034e-06, + "loss": 0.4676, + "step": 10573 + }, + { + "epoch": 1.35, + "grad_norm": 0.8083824948346612, + "learning_rate": 2.5433036112077064e-06, + "loss": 0.4662, + "step": 10574 + }, + { + "epoch": 1.35, + "grad_norm": 0.6992772934404534, + "learning_rate": 2.5424051846493225e-06, + "loss": 0.4861, + "step": 10575 + }, + { + "epoch": 1.35, + "grad_norm": 0.8115358118946803, + "learning_rate": 2.541506862702898e-06, + "loss": 0.5079, + "step": 10576 + }, + { + "epoch": 1.35, + "grad_norm": 0.5898076156590896, + "learning_rate": 2.540608645406668e-06, + "loss": 0.4876, + "step": 10577 + }, + { + "epoch": 1.35, + "grad_norm": 0.6591181353526833, + "learning_rate": 2.539710532798871e-06, + "loss": 0.432, + "step": 10578 + }, + { + "epoch": 1.35, + "grad_norm": 0.6483357111405805, + "learning_rate": 2.538812524917733e-06, + "loss": 0.4906, + "step": 10579 + }, + { + "epoch": 1.35, + "grad_norm": 0.7201600310027874, + "learning_rate": 2.5379146218014828e-06, + "loss": 0.5091, + "step": 10580 + }, + { + "epoch": 1.35, + "grad_norm": 0.7601836067444325, + "learning_rate": 2.5370168234883375e-06, + "loss": 0.5357, + "step": 10581 + }, + { + "epoch": 1.35, + "grad_norm": 0.6314016482768607, + "learning_rate": 2.536119130016518e-06, + "loss": 0.51, + "step": 10582 + }, + { + "epoch": 1.35, + "grad_norm": 0.6448711450255533, + "learning_rate": 2.535221541424231e-06, + "loss": 0.5138, + "step": 10583 + }, + { + "epoch": 1.35, + "grad_norm": 0.7122061061104349, + "learning_rate": 2.5343240577496864e-06, + "loss": 0.5563, + "step": 10584 + }, + { + "epoch": 1.35, + "grad_norm": 0.8436835601495344, + "learning_rate": 2.533426679031088e-06, + "loss": 0.5324, + "step": 10585 + }, + { + "epoch": 1.35, + "grad_norm": 0.5453104602450376, + "learning_rate": 2.532529405306636e-06, + "loss": 0.4452, + "step": 10586 + }, + { + "epoch": 1.35, + "grad_norm": 0.6346023638934389, + "learning_rate": 2.5316322366145195e-06, + "loss": 0.4616, + "step": 10587 + }, + { + "epoch": 1.35, + "grad_norm": 0.7173474582293952, + "learning_rate": 2.5307351729929334e-06, + "loss": 0.4771, + "step": 10588 + }, + { + "epoch": 1.35, + "grad_norm": 0.5371831843226925, + "learning_rate": 2.5298382144800583e-06, + "loss": 0.4291, + "step": 10589 + }, + { + "epoch": 1.35, + "grad_norm": 0.7471570013389153, + "learning_rate": 2.528941361114079e-06, + "loss": 0.5323, + "step": 10590 + }, + { + "epoch": 1.35, + "grad_norm": 0.7050985320255504, + "learning_rate": 2.5280446129331675e-06, + "loss": 0.4929, + "step": 10591 + }, + { + "epoch": 1.35, + "grad_norm": 0.6876108715515993, + "learning_rate": 2.5271479699754996e-06, + "loss": 0.487, + "step": 10592 + }, + { + "epoch": 1.35, + "grad_norm": 0.8497204672920528, + "learning_rate": 2.5262514322792387e-06, + "loss": 0.5103, + "step": 10593 + }, + { + "epoch": 1.35, + "grad_norm": 0.7966053083109497, + "learning_rate": 2.525354999882551e-06, + "loss": 0.5476, + "step": 10594 + }, + { + "epoch": 1.35, + "grad_norm": 0.6943922207606278, + "learning_rate": 2.524458672823592e-06, + "loss": 0.5145, + "step": 10595 + }, + { + "epoch": 1.35, + "grad_norm": 0.7621090276845341, + "learning_rate": 2.5235624511405153e-06, + "loss": 0.5201, + "step": 10596 + }, + { + "epoch": 1.35, + "grad_norm": 0.6480220236340348, + "learning_rate": 2.522666334871474e-06, + "loss": 0.4302, + "step": 10597 + }, + { + "epoch": 1.35, + "grad_norm": 0.5352004794161997, + "learning_rate": 2.5217703240546087e-06, + "loss": 0.4377, + "step": 10598 + }, + { + "epoch": 1.35, + "grad_norm": 0.6158427016130312, + "learning_rate": 2.520874418728063e-06, + "loss": 0.4572, + "step": 10599 + }, + { + "epoch": 1.35, + "grad_norm": 0.6290232580546541, + "learning_rate": 2.5199786189299703e-06, + "loss": 0.4678, + "step": 10600 + }, + { + "epoch": 1.35, + "grad_norm": 0.8433508395950833, + "learning_rate": 2.519082924698464e-06, + "loss": 0.5943, + "step": 10601 + }, + { + "epoch": 1.35, + "grad_norm": 0.7075211559000786, + "learning_rate": 2.5181873360716667e-06, + "loss": 0.5505, + "step": 10602 + }, + { + "epoch": 1.35, + "grad_norm": 0.7983628484356027, + "learning_rate": 2.5172918530877048e-06, + "loss": 0.5217, + "step": 10603 + }, + { + "epoch": 1.35, + "grad_norm": 0.6361455888035236, + "learning_rate": 2.516396475784697e-06, + "loss": 0.4441, + "step": 10604 + }, + { + "epoch": 1.35, + "grad_norm": 0.6358294421365027, + "learning_rate": 2.515501204200753e-06, + "loss": 0.4485, + "step": 10605 + }, + { + "epoch": 1.35, + "grad_norm": 0.6299572756032057, + "learning_rate": 2.514606038373985e-06, + "loss": 0.4884, + "step": 10606 + }, + { + "epoch": 1.35, + "grad_norm": 0.7507582170250209, + "learning_rate": 2.513710978342494e-06, + "loss": 0.4692, + "step": 10607 + }, + { + "epoch": 1.35, + "grad_norm": 0.6674577672579849, + "learning_rate": 2.512816024144382e-06, + "loss": 0.4979, + "step": 10608 + }, + { + "epoch": 1.35, + "grad_norm": 0.7385789629712277, + "learning_rate": 2.5119211758177454e-06, + "loss": 0.5023, + "step": 10609 + }, + { + "epoch": 1.35, + "grad_norm": 0.7671958718927128, + "learning_rate": 2.5110264334006714e-06, + "loss": 0.524, + "step": 10610 + }, + { + "epoch": 1.35, + "grad_norm": 1.1830286879294056, + "learning_rate": 2.5101317969312505e-06, + "loss": 0.4849, + "step": 10611 + }, + { + "epoch": 1.35, + "grad_norm": 0.6926632994338192, + "learning_rate": 2.5092372664475607e-06, + "loss": 0.4315, + "step": 10612 + }, + { + "epoch": 1.35, + "grad_norm": 0.6506380234794855, + "learning_rate": 2.5083428419876833e-06, + "loss": 0.4335, + "step": 10613 + }, + { + "epoch": 1.35, + "grad_norm": 0.6677770486213096, + "learning_rate": 2.5074485235896873e-06, + "loss": 0.4615, + "step": 10614 + }, + { + "epoch": 1.35, + "grad_norm": 0.6422924304526997, + "learning_rate": 2.5065543112916434e-06, + "loss": 0.396, + "step": 10615 + }, + { + "epoch": 1.35, + "grad_norm": 0.6540009378319542, + "learning_rate": 2.5056602051316136e-06, + "loss": 0.4973, + "step": 10616 + }, + { + "epoch": 1.35, + "grad_norm": 0.7575700931995678, + "learning_rate": 2.5047662051476597e-06, + "loss": 0.5024, + "step": 10617 + }, + { + "epoch": 1.35, + "grad_norm": 0.6582059655711523, + "learning_rate": 2.503872311377833e-06, + "loss": 0.4495, + "step": 10618 + }, + { + "epoch": 1.35, + "grad_norm": 0.6696795396076224, + "learning_rate": 2.502978523860187e-06, + "loss": 0.4741, + "step": 10619 + }, + { + "epoch": 1.35, + "grad_norm": 0.6435913695776708, + "learning_rate": 2.502084842632765e-06, + "loss": 0.483, + "step": 10620 + }, + { + "epoch": 1.35, + "grad_norm": 0.730353290183102, + "learning_rate": 2.501191267733608e-06, + "loss": 0.4844, + "step": 10621 + }, + { + "epoch": 1.35, + "grad_norm": 0.6335069027394884, + "learning_rate": 2.5002977992007538e-06, + "loss": 0.4521, + "step": 10622 + }, + { + "epoch": 1.35, + "grad_norm": 0.6154280973499405, + "learning_rate": 2.499404437072237e-06, + "loss": 0.4934, + "step": 10623 + }, + { + "epoch": 1.35, + "grad_norm": 0.5850241018083346, + "learning_rate": 2.4985111813860803e-06, + "loss": 0.4782, + "step": 10624 + }, + { + "epoch": 1.35, + "grad_norm": 0.6933493158022256, + "learning_rate": 2.4976180321803107e-06, + "loss": 0.4494, + "step": 10625 + }, + { + "epoch": 1.35, + "grad_norm": 0.6635587136374863, + "learning_rate": 2.4967249894929436e-06, + "loss": 0.4941, + "step": 10626 + }, + { + "epoch": 1.35, + "grad_norm": 0.625311579433706, + "learning_rate": 2.4958320533619956e-06, + "loss": 0.4488, + "step": 10627 + }, + { + "epoch": 1.35, + "grad_norm": 0.6327476384586453, + "learning_rate": 2.494939223825473e-06, + "loss": 0.4513, + "step": 10628 + }, + { + "epoch": 1.35, + "grad_norm": 0.773105806783291, + "learning_rate": 2.494046500921385e-06, + "loss": 0.5474, + "step": 10629 + }, + { + "epoch": 1.35, + "grad_norm": 0.6091867867165777, + "learning_rate": 2.4931538846877278e-06, + "loss": 0.4876, + "step": 10630 + }, + { + "epoch": 1.35, + "grad_norm": 0.7491279739456798, + "learning_rate": 2.4922613751625003e-06, + "loss": 0.4838, + "step": 10631 + }, + { + "epoch": 1.35, + "grad_norm": 0.6898837828551352, + "learning_rate": 2.4913689723836908e-06, + "loss": 0.4972, + "step": 10632 + }, + { + "epoch": 1.35, + "grad_norm": 0.6778226639889613, + "learning_rate": 2.490476676389288e-06, + "loss": 0.5227, + "step": 10633 + }, + { + "epoch": 1.35, + "grad_norm": 0.7018927983205488, + "learning_rate": 2.4895844872172737e-06, + "loss": 0.5119, + "step": 10634 + }, + { + "epoch": 1.35, + "grad_norm": 0.8155583147329938, + "learning_rate": 2.4886924049056273e-06, + "loss": 0.5657, + "step": 10635 + }, + { + "epoch": 1.35, + "grad_norm": 0.8061910783214565, + "learning_rate": 2.4878004294923187e-06, + "loss": 0.4525, + "step": 10636 + }, + { + "epoch": 1.36, + "grad_norm": 0.5574081462277762, + "learning_rate": 2.4869085610153193e-06, + "loss": 0.4306, + "step": 10637 + }, + { + "epoch": 1.36, + "grad_norm": 0.6670760635317258, + "learning_rate": 2.48601679951259e-06, + "loss": 0.4747, + "step": 10638 + }, + { + "epoch": 1.36, + "grad_norm": 0.570575085827016, + "learning_rate": 2.485125145022094e-06, + "loss": 0.4678, + "step": 10639 + }, + { + "epoch": 1.36, + "grad_norm": 0.7311714371413807, + "learning_rate": 2.4842335975817825e-06, + "loss": 0.5276, + "step": 10640 + }, + { + "epoch": 1.36, + "grad_norm": 0.7817605699699901, + "learning_rate": 2.4833421572296094e-06, + "loss": 0.5376, + "step": 10641 + }, + { + "epoch": 1.36, + "grad_norm": 0.7084791512606515, + "learning_rate": 2.4824508240035167e-06, + "loss": 0.459, + "step": 10642 + }, + { + "epoch": 1.36, + "grad_norm": 0.5426925526301956, + "learning_rate": 2.481559597941449e-06, + "loss": 0.4346, + "step": 10643 + }, + { + "epoch": 1.36, + "grad_norm": 0.7486718158103108, + "learning_rate": 2.48066847908134e-06, + "loss": 0.4588, + "step": 10644 + }, + { + "epoch": 1.36, + "grad_norm": 0.7632464265884233, + "learning_rate": 2.479777467461125e-06, + "loss": 0.4987, + "step": 10645 + }, + { + "epoch": 1.36, + "grad_norm": 0.6019727398459266, + "learning_rate": 2.478886563118726e-06, + "loss": 0.4955, + "step": 10646 + }, + { + "epoch": 1.36, + "grad_norm": 0.5631170630230982, + "learning_rate": 2.477995766092073e-06, + "loss": 0.4184, + "step": 10647 + }, + { + "epoch": 1.36, + "grad_norm": 0.6501602303015634, + "learning_rate": 2.4771050764190795e-06, + "loss": 0.4438, + "step": 10648 + }, + { + "epoch": 1.36, + "grad_norm": 0.6297624260295119, + "learning_rate": 2.4762144941376627e-06, + "loss": 0.457, + "step": 10649 + }, + { + "epoch": 1.36, + "grad_norm": 0.604528205843405, + "learning_rate": 2.475324019285728e-06, + "loss": 0.4524, + "step": 10650 + }, + { + "epoch": 1.36, + "grad_norm": 0.5504661044380628, + "learning_rate": 2.4744336519011848e-06, + "loss": 0.4308, + "step": 10651 + }, + { + "epoch": 1.36, + "grad_norm": 0.7918434571931015, + "learning_rate": 2.4735433920219275e-06, + "loss": 0.4656, + "step": 10652 + }, + { + "epoch": 1.36, + "grad_norm": 0.5493917510346389, + "learning_rate": 2.472653239685857e-06, + "loss": 0.3887, + "step": 10653 + }, + { + "epoch": 1.36, + "grad_norm": 0.5945150597257793, + "learning_rate": 2.4717631949308603e-06, + "loss": 0.4757, + "step": 10654 + }, + { + "epoch": 1.36, + "grad_norm": 0.7060958545896473, + "learning_rate": 2.4708732577948263e-06, + "loss": 0.5, + "step": 10655 + }, + { + "epoch": 1.36, + "grad_norm": 0.530397584421947, + "learning_rate": 2.4699834283156342e-06, + "loss": 0.4229, + "step": 10656 + }, + { + "epoch": 1.36, + "grad_norm": 0.7672624101885411, + "learning_rate": 2.4690937065311647e-06, + "loss": 0.506, + "step": 10657 + }, + { + "epoch": 1.36, + "grad_norm": 0.7874569311621559, + "learning_rate": 2.468204092479287e-06, + "loss": 0.5733, + "step": 10658 + }, + { + "epoch": 1.36, + "grad_norm": 0.8460276648134271, + "learning_rate": 2.4673145861978705e-06, + "loss": 0.5846, + "step": 10659 + }, + { + "epoch": 1.36, + "grad_norm": 0.7828687831319213, + "learning_rate": 2.466425187724781e-06, + "loss": 0.5239, + "step": 10660 + }, + { + "epoch": 1.36, + "grad_norm": 0.7700224106343307, + "learning_rate": 2.465535897097872e-06, + "loss": 0.4734, + "step": 10661 + }, + { + "epoch": 1.36, + "grad_norm": 0.5346197722840267, + "learning_rate": 2.464646714355004e-06, + "loss": 0.4442, + "step": 10662 + }, + { + "epoch": 1.36, + "grad_norm": 0.6108654957329758, + "learning_rate": 2.4637576395340214e-06, + "loss": 0.4543, + "step": 10663 + }, + { + "epoch": 1.36, + "grad_norm": 0.7150591385887055, + "learning_rate": 2.4628686726727708e-06, + "loss": 0.5846, + "step": 10664 + }, + { + "epoch": 1.36, + "grad_norm": 0.7584122496864705, + "learning_rate": 2.4619798138090954e-06, + "loss": 0.5119, + "step": 10665 + }, + { + "epoch": 1.36, + "grad_norm": 0.7356999300457018, + "learning_rate": 2.4610910629808273e-06, + "loss": 0.4591, + "step": 10666 + }, + { + "epoch": 1.36, + "grad_norm": 0.6135349839298728, + "learning_rate": 2.4602024202258013e-06, + "loss": 0.5139, + "step": 10667 + }, + { + "epoch": 1.36, + "grad_norm": 0.801710744306865, + "learning_rate": 2.4593138855818407e-06, + "loss": 0.5061, + "step": 10668 + }, + { + "epoch": 1.36, + "grad_norm": 0.6001626031433973, + "learning_rate": 2.4584254590867702e-06, + "loss": 0.5057, + "step": 10669 + }, + { + "epoch": 1.36, + "grad_norm": 0.7235302494936137, + "learning_rate": 2.457537140778405e-06, + "loss": 0.5199, + "step": 10670 + }, + { + "epoch": 1.36, + "grad_norm": 0.603529320577914, + "learning_rate": 2.456648930694559e-06, + "loss": 0.4864, + "step": 10671 + }, + { + "epoch": 1.36, + "grad_norm": 0.6812204286898121, + "learning_rate": 2.4557608288730424e-06, + "loss": 0.5462, + "step": 10672 + }, + { + "epoch": 1.36, + "grad_norm": 0.7765607089730986, + "learning_rate": 2.454872835351655e-06, + "loss": 0.4848, + "step": 10673 + }, + { + "epoch": 1.36, + "grad_norm": 0.7113788664381167, + "learning_rate": 2.4539849501682e-06, + "loss": 0.4204, + "step": 10674 + }, + { + "epoch": 1.36, + "grad_norm": 0.6469614434228078, + "learning_rate": 2.453097173360468e-06, + "loss": 0.452, + "step": 10675 + }, + { + "epoch": 1.36, + "grad_norm": 0.6888606628738865, + "learning_rate": 2.4522095049662526e-06, + "loss": 0.4846, + "step": 10676 + }, + { + "epoch": 1.36, + "grad_norm": 0.7524738566659702, + "learning_rate": 2.451321945023335e-06, + "loss": 0.5014, + "step": 10677 + }, + { + "epoch": 1.36, + "grad_norm": 0.6921572770796952, + "learning_rate": 2.4504344935694996e-06, + "loss": 0.5261, + "step": 10678 + }, + { + "epoch": 1.36, + "grad_norm": 0.7199160789820862, + "learning_rate": 2.4495471506425183e-06, + "loss": 0.4785, + "step": 10679 + }, + { + "epoch": 1.36, + "grad_norm": 0.6071135882596962, + "learning_rate": 2.4486599162801674e-06, + "loss": 0.4319, + "step": 10680 + }, + { + "epoch": 1.36, + "grad_norm": 0.5683515544675386, + "learning_rate": 2.4477727905202085e-06, + "loss": 0.3997, + "step": 10681 + }, + { + "epoch": 1.36, + "grad_norm": 0.5807853982951335, + "learning_rate": 2.446885773400408e-06, + "loss": 0.4772, + "step": 10682 + }, + { + "epoch": 1.36, + "grad_norm": 0.6791670099245443, + "learning_rate": 2.4459988649585182e-06, + "loss": 0.4622, + "step": 10683 + }, + { + "epoch": 1.36, + "grad_norm": 0.6014193527221852, + "learning_rate": 2.445112065232299e-06, + "loss": 0.4926, + "step": 10684 + }, + { + "epoch": 1.36, + "grad_norm": 0.6813237596867474, + "learning_rate": 2.444225374259493e-06, + "loss": 0.4753, + "step": 10685 + }, + { + "epoch": 1.36, + "grad_norm": 0.8235857899859498, + "learning_rate": 2.4433387920778475e-06, + "loss": 0.5193, + "step": 10686 + }, + { + "epoch": 1.36, + "grad_norm": 0.7172004316474528, + "learning_rate": 2.4424523187250974e-06, + "loss": 0.4809, + "step": 10687 + }, + { + "epoch": 1.36, + "grad_norm": 1.4542885268455246, + "learning_rate": 2.441565954238982e-06, + "loss": 0.4996, + "step": 10688 + }, + { + "epoch": 1.36, + "grad_norm": 0.7140273438417358, + "learning_rate": 2.4406796986572256e-06, + "loss": 0.5256, + "step": 10689 + }, + { + "epoch": 1.36, + "grad_norm": 0.5816882506093489, + "learning_rate": 2.439793552017559e-06, + "loss": 0.4375, + "step": 10690 + }, + { + "epoch": 1.36, + "grad_norm": 0.6688746931103752, + "learning_rate": 2.438907514357697e-06, + "loss": 0.4691, + "step": 10691 + }, + { + "epoch": 1.36, + "grad_norm": 0.7911759252836437, + "learning_rate": 2.43802158571536e-06, + "loss": 0.4808, + "step": 10692 + }, + { + "epoch": 1.36, + "grad_norm": 0.6997345922810209, + "learning_rate": 2.437135766128255e-06, + "loss": 0.441, + "step": 10693 + }, + { + "epoch": 1.36, + "grad_norm": 0.7697127989619299, + "learning_rate": 2.4362500556340936e-06, + "loss": 0.5065, + "step": 10694 + }, + { + "epoch": 1.36, + "grad_norm": 0.76261356575478, + "learning_rate": 2.4353644542705703e-06, + "loss": 0.4966, + "step": 10695 + }, + { + "epoch": 1.36, + "grad_norm": 0.5992984772511316, + "learning_rate": 2.43447896207539e-06, + "loss": 0.5088, + "step": 10696 + }, + { + "epoch": 1.36, + "grad_norm": 0.7693345937281759, + "learning_rate": 2.4335935790862406e-06, + "loss": 0.5733, + "step": 10697 + }, + { + "epoch": 1.36, + "grad_norm": 0.7374141415594203, + "learning_rate": 2.432708305340814e-06, + "loss": 0.5348, + "step": 10698 + }, + { + "epoch": 1.36, + "grad_norm": 0.6752949821992592, + "learning_rate": 2.431823140876788e-06, + "loss": 0.4986, + "step": 10699 + }, + { + "epoch": 1.36, + "grad_norm": 0.5494342031098339, + "learning_rate": 2.430938085731847e-06, + "loss": 0.4069, + "step": 10700 + }, + { + "epoch": 1.36, + "grad_norm": 0.6345041032562396, + "learning_rate": 2.43005313994366e-06, + "loss": 0.4874, + "step": 10701 + }, + { + "epoch": 1.36, + "grad_norm": 0.7599971000903185, + "learning_rate": 2.4291683035499004e-06, + "loss": 0.5238, + "step": 10702 + }, + { + "epoch": 1.36, + "grad_norm": 0.7380009544892384, + "learning_rate": 2.4282835765882286e-06, + "loss": 0.5222, + "step": 10703 + }, + { + "epoch": 1.36, + "grad_norm": 0.7670832887018235, + "learning_rate": 2.4273989590963093e-06, + "loss": 0.5281, + "step": 10704 + }, + { + "epoch": 1.36, + "grad_norm": 0.5904091697497313, + "learning_rate": 2.426514451111793e-06, + "loss": 0.473, + "step": 10705 + }, + { + "epoch": 1.36, + "grad_norm": 0.6331370955412378, + "learning_rate": 2.4256300526723352e-06, + "loss": 0.4478, + "step": 10706 + }, + { + "epoch": 1.36, + "grad_norm": 0.6468386643533353, + "learning_rate": 2.4247457638155775e-06, + "loss": 0.4845, + "step": 10707 + }, + { + "epoch": 1.36, + "grad_norm": 0.6339230424769525, + "learning_rate": 2.4238615845791636e-06, + "loss": 0.5139, + "step": 10708 + }, + { + "epoch": 1.36, + "grad_norm": 0.8816016484557027, + "learning_rate": 2.4229775150007295e-06, + "loss": 0.5033, + "step": 10709 + }, + { + "epoch": 1.36, + "grad_norm": 0.6510093955007307, + "learning_rate": 2.422093555117909e-06, + "loss": 0.4568, + "step": 10710 + }, + { + "epoch": 1.36, + "grad_norm": 0.5889514282845364, + "learning_rate": 2.421209704968327e-06, + "loss": 0.4352, + "step": 10711 + }, + { + "epoch": 1.36, + "grad_norm": 0.6342082783199369, + "learning_rate": 2.420325964589609e-06, + "loss": 0.4202, + "step": 10712 + }, + { + "epoch": 1.36, + "grad_norm": 0.6272517155245021, + "learning_rate": 2.4194423340193696e-06, + "loss": 0.5029, + "step": 10713 + }, + { + "epoch": 1.36, + "grad_norm": 0.7838106229972467, + "learning_rate": 2.418558813295225e-06, + "loss": 0.484, + "step": 10714 + }, + { + "epoch": 1.37, + "grad_norm": 0.939730119450188, + "learning_rate": 2.417675402454781e-06, + "loss": 0.5929, + "step": 10715 + }, + { + "epoch": 1.37, + "grad_norm": 0.730871296229932, + "learning_rate": 2.416792101535645e-06, + "loss": 0.5235, + "step": 10716 + }, + { + "epoch": 1.37, + "grad_norm": 0.5420394027454228, + "learning_rate": 2.4159089105754123e-06, + "loss": 0.4323, + "step": 10717 + }, + { + "epoch": 1.37, + "grad_norm": 0.5699369533277572, + "learning_rate": 2.4150258296116825e-06, + "loss": 0.4385, + "step": 10718 + }, + { + "epoch": 1.37, + "grad_norm": 0.6008010964580018, + "learning_rate": 2.41414285868204e-06, + "loss": 0.456, + "step": 10719 + }, + { + "epoch": 1.37, + "grad_norm": 0.6243355450781678, + "learning_rate": 2.4132599978240727e-06, + "loss": 0.4591, + "step": 10720 + }, + { + "epoch": 1.37, + "grad_norm": 1.0238150269553674, + "learning_rate": 2.4123772470753636e-06, + "loss": 0.5906, + "step": 10721 + }, + { + "epoch": 1.37, + "grad_norm": 0.7473902968138395, + "learning_rate": 2.4114946064734845e-06, + "loss": 0.4959, + "step": 10722 + }, + { + "epoch": 1.37, + "grad_norm": 0.6653613205142198, + "learning_rate": 2.4106120760560105e-06, + "loss": 0.49, + "step": 10723 + }, + { + "epoch": 1.37, + "grad_norm": 0.7572531889262955, + "learning_rate": 2.4097296558605034e-06, + "loss": 0.4649, + "step": 10724 + }, + { + "epoch": 1.37, + "grad_norm": 0.5732860034865839, + "learning_rate": 2.408847345924529e-06, + "loss": 0.4335, + "step": 10725 + }, + { + "epoch": 1.37, + "grad_norm": 0.5664822620728239, + "learning_rate": 2.407965146285642e-06, + "loss": 0.4193, + "step": 10726 + }, + { + "epoch": 1.37, + "grad_norm": 0.6099452083868128, + "learning_rate": 2.4070830569813954e-06, + "loss": 0.4613, + "step": 10727 + }, + { + "epoch": 1.37, + "grad_norm": 0.6531283756369385, + "learning_rate": 2.406201078049339e-06, + "loss": 0.4591, + "step": 10728 + }, + { + "epoch": 1.37, + "grad_norm": 0.6071837708400514, + "learning_rate": 2.4053192095270122e-06, + "loss": 0.4863, + "step": 10729 + }, + { + "epoch": 1.37, + "grad_norm": 0.726671450738999, + "learning_rate": 2.4044374514519575e-06, + "loss": 0.5041, + "step": 10730 + }, + { + "epoch": 1.37, + "grad_norm": 0.6753116282711898, + "learning_rate": 2.403555803861704e-06, + "loss": 0.5246, + "step": 10731 + }, + { + "epoch": 1.37, + "grad_norm": 0.7845088803810617, + "learning_rate": 2.402674266793783e-06, + "loss": 0.5073, + "step": 10732 + }, + { + "epoch": 1.37, + "grad_norm": 0.5858019626031771, + "learning_rate": 2.4017928402857206e-06, + "loss": 0.4947, + "step": 10733 + }, + { + "epoch": 1.37, + "grad_norm": 0.7594251192336049, + "learning_rate": 2.400911524375032e-06, + "loss": 0.5059, + "step": 10734 + }, + { + "epoch": 1.37, + "grad_norm": 0.6436294907042814, + "learning_rate": 2.400030319099237e-06, + "loss": 0.4427, + "step": 10735 + }, + { + "epoch": 1.37, + "grad_norm": 0.665587087036889, + "learning_rate": 2.3991492244958403e-06, + "loss": 0.4547, + "step": 10736 + }, + { + "epoch": 1.37, + "grad_norm": 0.7935765655517953, + "learning_rate": 2.398268240602352e-06, + "loss": 0.4977, + "step": 10737 + }, + { + "epoch": 1.37, + "grad_norm": 0.5973192112742913, + "learning_rate": 2.3973873674562693e-06, + "loss": 0.4672, + "step": 10738 + }, + { + "epoch": 1.37, + "grad_norm": 0.6393252440505999, + "learning_rate": 2.3965066050950913e-06, + "loss": 0.4601, + "step": 10739 + }, + { + "epoch": 1.37, + "grad_norm": 0.6370672488453737, + "learning_rate": 2.3956259535563054e-06, + "loss": 0.4552, + "step": 10740 + }, + { + "epoch": 1.37, + "grad_norm": 0.6482178678751401, + "learning_rate": 2.394745412877403e-06, + "loss": 0.4786, + "step": 10741 + }, + { + "epoch": 1.37, + "grad_norm": 0.6032793464429704, + "learning_rate": 2.3938649830958606e-06, + "loss": 0.5129, + "step": 10742 + }, + { + "epoch": 1.37, + "grad_norm": 0.8323148011681242, + "learning_rate": 2.3929846642491595e-06, + "loss": 0.4937, + "step": 10743 + }, + { + "epoch": 1.37, + "grad_norm": 0.5627431230471072, + "learning_rate": 2.3921044563747685e-06, + "loss": 0.428, + "step": 10744 + }, + { + "epoch": 1.37, + "grad_norm": 0.6393055905578594, + "learning_rate": 2.3912243595101574e-06, + "loss": 0.4671, + "step": 10745 + }, + { + "epoch": 1.37, + "grad_norm": 0.5901338716140343, + "learning_rate": 2.3903443736927897e-06, + "loss": 0.4859, + "step": 10746 + }, + { + "epoch": 1.37, + "grad_norm": 0.7812220224675036, + "learning_rate": 2.3894644989601236e-06, + "loss": 0.526, + "step": 10747 + }, + { + "epoch": 1.37, + "grad_norm": 0.5606310442516241, + "learning_rate": 2.38858473534961e-06, + "loss": 0.4667, + "step": 10748 + }, + { + "epoch": 1.37, + "grad_norm": 0.693223459316275, + "learning_rate": 2.3877050828987016e-06, + "loss": 0.4727, + "step": 10749 + }, + { + "epoch": 1.37, + "grad_norm": 0.6675015607182109, + "learning_rate": 2.3868255416448377e-06, + "loss": 0.5087, + "step": 10750 + }, + { + "epoch": 1.37, + "grad_norm": 0.6886864857442677, + "learning_rate": 2.385946111625462e-06, + "loss": 0.46, + "step": 10751 + }, + { + "epoch": 1.37, + "grad_norm": 0.6102865259761342, + "learning_rate": 2.385066792878005e-06, + "loss": 0.4672, + "step": 10752 + }, + { + "epoch": 1.37, + "grad_norm": 0.7018802541894525, + "learning_rate": 2.384187585439901e-06, + "loss": 0.4813, + "step": 10753 + }, + { + "epoch": 1.37, + "grad_norm": 0.6890841022637195, + "learning_rate": 2.3833084893485705e-06, + "loss": 0.4756, + "step": 10754 + }, + { + "epoch": 1.37, + "grad_norm": 0.7642755706983619, + "learning_rate": 2.382429504641437e-06, + "loss": 0.4847, + "step": 10755 + }, + { + "epoch": 1.37, + "grad_norm": 0.8206867294764637, + "learning_rate": 2.381550631355914e-06, + "loss": 0.469, + "step": 10756 + }, + { + "epoch": 1.37, + "grad_norm": 2.058137301693994, + "learning_rate": 2.3806718695294124e-06, + "loss": 0.4359, + "step": 10757 + }, + { + "epoch": 1.37, + "grad_norm": 0.7446157661237356, + "learning_rate": 2.3797932191993396e-06, + "loss": 0.4761, + "step": 10758 + }, + { + "epoch": 1.37, + "grad_norm": 0.7050987456196054, + "learning_rate": 2.3789146804030984e-06, + "loss": 0.4839, + "step": 10759 + }, + { + "epoch": 1.37, + "grad_norm": 0.7544500047765144, + "learning_rate": 2.3780362531780815e-06, + "loss": 0.509, + "step": 10760 + }, + { + "epoch": 1.37, + "grad_norm": 0.8407991862856738, + "learning_rate": 2.377157937561684e-06, + "loss": 0.5663, + "step": 10761 + }, + { + "epoch": 1.37, + "grad_norm": 0.7558720128445902, + "learning_rate": 2.376279733591291e-06, + "loss": 0.4997, + "step": 10762 + }, + { + "epoch": 1.37, + "grad_norm": 0.6258939976241674, + "learning_rate": 2.375401641304287e-06, + "loss": 0.4836, + "step": 10763 + }, + { + "epoch": 1.37, + "grad_norm": 0.7867419421900359, + "learning_rate": 2.374523660738046e-06, + "loss": 0.5363, + "step": 10764 + }, + { + "epoch": 1.37, + "grad_norm": 0.7395039651205694, + "learning_rate": 2.3736457919299446e-06, + "loss": 0.4981, + "step": 10765 + }, + { + "epoch": 1.37, + "grad_norm": 0.7405534995697715, + "learning_rate": 2.3727680349173475e-06, + "loss": 0.5776, + "step": 10766 + }, + { + "epoch": 1.37, + "grad_norm": 0.7104559397257032, + "learning_rate": 2.3718903897376216e-06, + "loss": 0.4762, + "step": 10767 + }, + { + "epoch": 1.37, + "grad_norm": 0.6542246828416545, + "learning_rate": 2.371012856428122e-06, + "loss": 0.4525, + "step": 10768 + }, + { + "epoch": 1.37, + "grad_norm": 0.5609690670930079, + "learning_rate": 2.370135435026206e-06, + "loss": 0.4263, + "step": 10769 + }, + { + "epoch": 1.37, + "grad_norm": 0.8051835397134139, + "learning_rate": 2.3692581255692167e-06, + "loss": 0.5298, + "step": 10770 + }, + { + "epoch": 1.37, + "grad_norm": 0.7743788436099871, + "learning_rate": 2.368380928094506e-06, + "loss": 0.5792, + "step": 10771 + }, + { + "epoch": 1.37, + "grad_norm": 0.6776847729356127, + "learning_rate": 2.3675038426394078e-06, + "loss": 0.5239, + "step": 10772 + }, + { + "epoch": 1.37, + "grad_norm": 0.6080921980201661, + "learning_rate": 2.3666268692412603e-06, + "loss": 0.4805, + "step": 10773 + }, + { + "epoch": 1.37, + "grad_norm": 0.6871087491999675, + "learning_rate": 2.365750007937391e-06, + "loss": 0.4787, + "step": 10774 + }, + { + "epoch": 1.37, + "grad_norm": 0.7243648822500646, + "learning_rate": 2.3648732587651268e-06, + "loss": 0.5107, + "step": 10775 + }, + { + "epoch": 1.37, + "grad_norm": 0.6034248568715802, + "learning_rate": 2.3639966217617868e-06, + "loss": 0.5041, + "step": 10776 + }, + { + "epoch": 1.37, + "grad_norm": 1.0120335372650437, + "learning_rate": 2.363120096964688e-06, + "loss": 0.5738, + "step": 10777 + }, + { + "epoch": 1.37, + "grad_norm": 0.7415451815977266, + "learning_rate": 2.3622436844111384e-06, + "loss": 0.556, + "step": 10778 + }, + { + "epoch": 1.37, + "grad_norm": 0.8256111580997718, + "learning_rate": 2.361367384138448e-06, + "loss": 0.5658, + "step": 10779 + }, + { + "epoch": 1.37, + "grad_norm": 0.719510323500556, + "learning_rate": 2.360491196183915e-06, + "loss": 0.5292, + "step": 10780 + }, + { + "epoch": 1.37, + "grad_norm": 1.004692350210955, + "learning_rate": 2.359615120584839e-06, + "loss": 0.559, + "step": 10781 + }, + { + "epoch": 1.37, + "grad_norm": 0.7024380001621336, + "learning_rate": 2.3587391573785073e-06, + "loss": 0.533, + "step": 10782 + }, + { + "epoch": 1.37, + "grad_norm": 0.5796511462605038, + "learning_rate": 2.35786330660221e-06, + "loss": 0.4513, + "step": 10783 + }, + { + "epoch": 1.37, + "grad_norm": 0.8258690913604891, + "learning_rate": 2.3569875682932304e-06, + "loss": 0.5415, + "step": 10784 + }, + { + "epoch": 1.37, + "grad_norm": 0.6129396432400224, + "learning_rate": 2.356111942488842e-06, + "loss": 0.5035, + "step": 10785 + }, + { + "epoch": 1.37, + "grad_norm": 0.7709876421865782, + "learning_rate": 2.3552364292263215e-06, + "loss": 0.6035, + "step": 10786 + }, + { + "epoch": 1.37, + "grad_norm": 0.7091236168630716, + "learning_rate": 2.3543610285429332e-06, + "loss": 0.4916, + "step": 10787 + }, + { + "epoch": 1.37, + "grad_norm": 0.5715637774864741, + "learning_rate": 2.3534857404759414e-06, + "loss": 0.478, + "step": 10788 + }, + { + "epoch": 1.37, + "grad_norm": 0.6525523934717318, + "learning_rate": 2.352610565062606e-06, + "loss": 0.4162, + "step": 10789 + }, + { + "epoch": 1.37, + "grad_norm": 0.6228413227067008, + "learning_rate": 2.351735502340178e-06, + "loss": 0.5088, + "step": 10790 + }, + { + "epoch": 1.37, + "grad_norm": 0.729817446892084, + "learning_rate": 2.350860552345908e-06, + "loss": 0.5746, + "step": 10791 + }, + { + "epoch": 1.37, + "grad_norm": 0.6946337814457026, + "learning_rate": 2.349985715117038e-06, + "loss": 0.5064, + "step": 10792 + }, + { + "epoch": 1.37, + "grad_norm": 0.5995985235027571, + "learning_rate": 2.3491109906908095e-06, + "loss": 0.4355, + "step": 10793 + }, + { + "epoch": 1.38, + "grad_norm": 0.6098114528755813, + "learning_rate": 2.348236379104453e-06, + "loss": 0.453, + "step": 10794 + }, + { + "epoch": 1.38, + "grad_norm": 0.5293180168757409, + "learning_rate": 2.3473618803951997e-06, + "loss": 0.4282, + "step": 10795 + }, + { + "epoch": 1.38, + "grad_norm": 0.6461010280129098, + "learning_rate": 2.3464874946002762e-06, + "loss": 0.4835, + "step": 10796 + }, + { + "epoch": 1.38, + "grad_norm": 0.604742512563059, + "learning_rate": 2.3456132217568994e-06, + "loss": 0.4334, + "step": 10797 + }, + { + "epoch": 1.38, + "grad_norm": 0.6728208997366567, + "learning_rate": 2.3447390619022873e-06, + "loss": 0.4376, + "step": 10798 + }, + { + "epoch": 1.38, + "grad_norm": 0.6260716571252182, + "learning_rate": 2.3438650150736465e-06, + "loss": 0.438, + "step": 10799 + }, + { + "epoch": 1.38, + "grad_norm": 0.6411162499271806, + "learning_rate": 2.342991081308186e-06, + "loss": 0.5091, + "step": 10800 + }, + { + "epoch": 1.38, + "grad_norm": 0.7894618104541057, + "learning_rate": 2.342117260643103e-06, + "loss": 0.4988, + "step": 10801 + }, + { + "epoch": 1.38, + "grad_norm": 0.7268751297462787, + "learning_rate": 2.3412435531155963e-06, + "loss": 0.4491, + "step": 10802 + }, + { + "epoch": 1.38, + "grad_norm": 0.8370397054839325, + "learning_rate": 2.3403699587628537e-06, + "loss": 0.507, + "step": 10803 + }, + { + "epoch": 1.38, + "grad_norm": 0.6221575888969254, + "learning_rate": 2.339496477622065e-06, + "loss": 0.4539, + "step": 10804 + }, + { + "epoch": 1.38, + "grad_norm": 0.7669903378821644, + "learning_rate": 2.3386231097304073e-06, + "loss": 0.5337, + "step": 10805 + }, + { + "epoch": 1.38, + "grad_norm": 0.7556400172219717, + "learning_rate": 2.3377498551250612e-06, + "loss": 0.5698, + "step": 10806 + }, + { + "epoch": 1.38, + "grad_norm": 0.5907499487355397, + "learning_rate": 2.3368767138431926e-06, + "loss": 0.4793, + "step": 10807 + }, + { + "epoch": 1.38, + "grad_norm": 0.9718417545714607, + "learning_rate": 2.3360036859219758e-06, + "loss": 0.5496, + "step": 10808 + }, + { + "epoch": 1.38, + "grad_norm": 0.80538505610934, + "learning_rate": 2.3351307713985678e-06, + "loss": 0.4847, + "step": 10809 + }, + { + "epoch": 1.38, + "grad_norm": 0.6634063433384094, + "learning_rate": 2.334257970310129e-06, + "loss": 0.4551, + "step": 10810 + }, + { + "epoch": 1.38, + "grad_norm": 0.7258577887381235, + "learning_rate": 2.3333852826938082e-06, + "loss": 0.5441, + "step": 10811 + }, + { + "epoch": 1.38, + "grad_norm": 0.6722498850302644, + "learning_rate": 2.3325127085867554e-06, + "loss": 0.4432, + "step": 10812 + }, + { + "epoch": 1.38, + "grad_norm": 0.7829308135547088, + "learning_rate": 2.331640248026112e-06, + "loss": 0.4876, + "step": 10813 + }, + { + "epoch": 1.38, + "grad_norm": 0.7334023455009091, + "learning_rate": 2.3307679010490175e-06, + "loss": 0.5017, + "step": 10814 + }, + { + "epoch": 1.38, + "grad_norm": 0.6371490893324795, + "learning_rate": 2.3298956676926023e-06, + "loss": 0.4521, + "step": 10815 + }, + { + "epoch": 1.38, + "grad_norm": 0.6348561939785046, + "learning_rate": 2.329023547993998e-06, + "loss": 0.453, + "step": 10816 + }, + { + "epoch": 1.38, + "grad_norm": 0.7845462444401949, + "learning_rate": 2.3281515419903233e-06, + "loss": 0.4941, + "step": 10817 + }, + { + "epoch": 1.38, + "grad_norm": 0.7707273584196197, + "learning_rate": 2.3272796497187023e-06, + "loss": 0.5087, + "step": 10818 + }, + { + "epoch": 1.38, + "grad_norm": 0.691895113348418, + "learning_rate": 2.3264078712162413e-06, + "loss": 0.4724, + "step": 10819 + }, + { + "epoch": 1.38, + "grad_norm": 0.6067262766921584, + "learning_rate": 2.325536206520058e-06, + "loss": 0.442, + "step": 10820 + }, + { + "epoch": 1.38, + "grad_norm": 0.6007288267040728, + "learning_rate": 2.32466465566725e-06, + "loss": 0.4777, + "step": 10821 + }, + { + "epoch": 1.38, + "grad_norm": 0.8321389327862625, + "learning_rate": 2.32379321869492e-06, + "loss": 0.5049, + "step": 10822 + }, + { + "epoch": 1.38, + "grad_norm": 0.7320317635548815, + "learning_rate": 2.3229218956401593e-06, + "loss": 0.5155, + "step": 10823 + }, + { + "epoch": 1.38, + "grad_norm": 0.6482921480648263, + "learning_rate": 2.3220506865400604e-06, + "loss": 0.4684, + "step": 10824 + }, + { + "epoch": 1.38, + "grad_norm": 0.8111920894798678, + "learning_rate": 2.321179591431705e-06, + "loss": 0.4723, + "step": 10825 + }, + { + "epoch": 1.38, + "grad_norm": 0.6701617418798336, + "learning_rate": 2.3203086103521753e-06, + "loss": 0.4953, + "step": 10826 + }, + { + "epoch": 1.38, + "grad_norm": 0.8428668487849036, + "learning_rate": 2.3194377433385434e-06, + "loss": 0.5295, + "step": 10827 + }, + { + "epoch": 1.38, + "grad_norm": 0.727807295443101, + "learning_rate": 2.318566990427883e-06, + "loss": 0.5495, + "step": 10828 + }, + { + "epoch": 1.38, + "grad_norm": 0.6395711581650799, + "learning_rate": 2.3176963516572546e-06, + "loss": 0.4289, + "step": 10829 + }, + { + "epoch": 1.38, + "grad_norm": 0.6438211565663589, + "learning_rate": 2.3168258270637233e-06, + "loss": 0.4228, + "step": 10830 + }, + { + "epoch": 1.38, + "grad_norm": 0.618310023751601, + "learning_rate": 2.3159554166843408e-06, + "loss": 0.4546, + "step": 10831 + }, + { + "epoch": 1.38, + "grad_norm": 0.7663220610981202, + "learning_rate": 2.315085120556159e-06, + "loss": 0.5122, + "step": 10832 + }, + { + "epoch": 1.38, + "grad_norm": 0.5277908735189812, + "learning_rate": 2.314214938716224e-06, + "loss": 0.4127, + "step": 10833 + }, + { + "epoch": 1.38, + "grad_norm": 0.6815308361486874, + "learning_rate": 2.3133448712015784e-06, + "loss": 0.4891, + "step": 10834 + }, + { + "epoch": 1.38, + "grad_norm": 0.6643055000468925, + "learning_rate": 2.312474918049254e-06, + "loss": 0.4648, + "step": 10835 + }, + { + "epoch": 1.38, + "grad_norm": 0.7628619037188938, + "learning_rate": 2.311605079296286e-06, + "loss": 0.5443, + "step": 10836 + }, + { + "epoch": 1.38, + "grad_norm": 0.697087369735636, + "learning_rate": 2.310735354979698e-06, + "loss": 0.4966, + "step": 10837 + }, + { + "epoch": 1.38, + "grad_norm": 0.63360614192725, + "learning_rate": 2.309865745136513e-06, + "loss": 0.4964, + "step": 10838 + }, + { + "epoch": 1.38, + "grad_norm": 0.5240209848865982, + "learning_rate": 2.3089962498037454e-06, + "loss": 0.386, + "step": 10839 + }, + { + "epoch": 1.38, + "grad_norm": 0.784939710411291, + "learning_rate": 2.3081268690184106e-06, + "loss": 0.4941, + "step": 10840 + }, + { + "epoch": 1.38, + "grad_norm": 0.8528359232804006, + "learning_rate": 2.307257602817511e-06, + "loss": 0.5312, + "step": 10841 + }, + { + "epoch": 1.38, + "grad_norm": 0.8980076187595706, + "learning_rate": 2.306388451238052e-06, + "loss": 0.5149, + "step": 10842 + }, + { + "epoch": 1.38, + "grad_norm": 0.6039802533407461, + "learning_rate": 2.3055194143170268e-06, + "loss": 0.4484, + "step": 10843 + }, + { + "epoch": 1.38, + "grad_norm": 0.673218966898378, + "learning_rate": 2.30465049209143e-06, + "loss": 0.5021, + "step": 10844 + }, + { + "epoch": 1.38, + "grad_norm": 0.71350439495478, + "learning_rate": 2.3037816845982512e-06, + "loss": 0.5002, + "step": 10845 + }, + { + "epoch": 1.38, + "grad_norm": 0.6059695953100277, + "learning_rate": 2.302912991874468e-06, + "loss": 0.4107, + "step": 10846 + }, + { + "epoch": 1.38, + "grad_norm": 0.629674748262165, + "learning_rate": 2.302044413957062e-06, + "loss": 0.5115, + "step": 10847 + }, + { + "epoch": 1.38, + "grad_norm": 0.6957572241525941, + "learning_rate": 2.3011759508830017e-06, + "loss": 0.5358, + "step": 10848 + }, + { + "epoch": 1.38, + "grad_norm": 0.7079104543439373, + "learning_rate": 2.300307602689259e-06, + "loss": 0.5064, + "step": 10849 + }, + { + "epoch": 1.38, + "grad_norm": 0.6748084773974228, + "learning_rate": 2.2994393694127925e-06, + "loss": 0.4545, + "step": 10850 + }, + { + "epoch": 1.38, + "grad_norm": 0.8198493326666778, + "learning_rate": 2.298571251090563e-06, + "loss": 0.5341, + "step": 10851 + }, + { + "epoch": 1.38, + "grad_norm": 0.8354673963447915, + "learning_rate": 2.297703247759524e-06, + "loss": 0.485, + "step": 10852 + }, + { + "epoch": 1.38, + "grad_norm": 0.609875454634327, + "learning_rate": 2.2968353594566204e-06, + "loss": 0.4674, + "step": 10853 + }, + { + "epoch": 1.38, + "grad_norm": 0.8023069330239143, + "learning_rate": 2.2959675862188e-06, + "loss": 0.5317, + "step": 10854 + }, + { + "epoch": 1.38, + "grad_norm": 0.646843375110653, + "learning_rate": 2.2950999280829965e-06, + "loss": 0.4665, + "step": 10855 + }, + { + "epoch": 1.38, + "grad_norm": 0.8594025282298801, + "learning_rate": 2.294232385086146e-06, + "loss": 0.5593, + "step": 10856 + }, + { + "epoch": 1.38, + "grad_norm": 0.6386188645289982, + "learning_rate": 2.2933649572651774e-06, + "loss": 0.4471, + "step": 10857 + }, + { + "epoch": 1.38, + "grad_norm": 0.595628600036395, + "learning_rate": 2.2924976446570117e-06, + "loss": 0.434, + "step": 10858 + }, + { + "epoch": 1.38, + "grad_norm": 0.6302137959335479, + "learning_rate": 2.2916304472985717e-06, + "loss": 0.4214, + "step": 10859 + }, + { + "epoch": 1.38, + "grad_norm": 0.7256589824548828, + "learning_rate": 2.2907633652267665e-06, + "loss": 0.4948, + "step": 10860 + }, + { + "epoch": 1.38, + "grad_norm": 0.5784285660717894, + "learning_rate": 2.2898963984785094e-06, + "loss": 0.4217, + "step": 10861 + }, + { + "epoch": 1.38, + "grad_norm": 0.6399792271673397, + "learning_rate": 2.2890295470907004e-06, + "loss": 0.5031, + "step": 10862 + }, + { + "epoch": 1.38, + "grad_norm": 0.8377256946518645, + "learning_rate": 2.288162811100243e-06, + "loss": 0.5681, + "step": 10863 + }, + { + "epoch": 1.38, + "grad_norm": 0.7997655840390412, + "learning_rate": 2.287296190544026e-06, + "loss": 0.5771, + "step": 10864 + }, + { + "epoch": 1.38, + "grad_norm": 0.6840156624585454, + "learning_rate": 2.2864296854589447e-06, + "loss": 0.5083, + "step": 10865 + }, + { + "epoch": 1.38, + "grad_norm": 0.7734226611635323, + "learning_rate": 2.2855632958818775e-06, + "loss": 0.5284, + "step": 10866 + }, + { + "epoch": 1.38, + "grad_norm": 0.7179434223053618, + "learning_rate": 2.2846970218497087e-06, + "loss": 0.5535, + "step": 10867 + }, + { + "epoch": 1.38, + "grad_norm": 0.853643224698523, + "learning_rate": 2.2838308633993094e-06, + "loss": 0.562, + "step": 10868 + }, + { + "epoch": 1.38, + "grad_norm": 0.7325780615361955, + "learning_rate": 2.2829648205675505e-06, + "loss": 0.5552, + "step": 10869 + }, + { + "epoch": 1.38, + "grad_norm": 0.7512348455354795, + "learning_rate": 2.282098893391297e-06, + "loss": 0.5048, + "step": 10870 + }, + { + "epoch": 1.38, + "grad_norm": 0.6871867274601527, + "learning_rate": 2.2812330819074098e-06, + "loss": 0.4601, + "step": 10871 + }, + { + "epoch": 1.39, + "grad_norm": 0.6898907884307547, + "learning_rate": 2.2803673861527402e-06, + "loss": 0.4737, + "step": 10872 + }, + { + "epoch": 1.39, + "grad_norm": 0.7204682129670187, + "learning_rate": 2.2795018061641426e-06, + "loss": 0.4918, + "step": 10873 + }, + { + "epoch": 1.39, + "grad_norm": 0.7429270747384472, + "learning_rate": 2.278636341978458e-06, + "loss": 0.5492, + "step": 10874 + }, + { + "epoch": 1.39, + "grad_norm": 0.7478198372537292, + "learning_rate": 2.27777099363253e-06, + "loss": 0.4986, + "step": 10875 + }, + { + "epoch": 1.39, + "grad_norm": 0.6253559315027629, + "learning_rate": 2.2769057611631896e-06, + "loss": 0.4849, + "step": 10876 + }, + { + "epoch": 1.39, + "grad_norm": 0.7917345277229931, + "learning_rate": 2.276040644607271e-06, + "loss": 0.539, + "step": 10877 + }, + { + "epoch": 1.39, + "grad_norm": 1.0129410880127359, + "learning_rate": 2.275175644001596e-06, + "loss": 0.529, + "step": 10878 + }, + { + "epoch": 1.39, + "grad_norm": 0.6416267711648092, + "learning_rate": 2.2743107593829882e-06, + "loss": 0.5089, + "step": 10879 + }, + { + "epoch": 1.39, + "grad_norm": 0.7531020512628711, + "learning_rate": 2.2734459907882593e-06, + "loss": 0.5236, + "step": 10880 + }, + { + "epoch": 1.39, + "grad_norm": 0.6191266285390059, + "learning_rate": 2.272581338254222e-06, + "loss": 0.4809, + "step": 10881 + }, + { + "epoch": 1.39, + "grad_norm": 0.7061625663095266, + "learning_rate": 2.2717168018176816e-06, + "loss": 0.4906, + "step": 10882 + }, + { + "epoch": 1.39, + "grad_norm": 0.6615318409787045, + "learning_rate": 2.270852381515441e-06, + "loss": 0.4588, + "step": 10883 + }, + { + "epoch": 1.39, + "grad_norm": 0.5958957323794096, + "learning_rate": 2.269988077384291e-06, + "loss": 0.4467, + "step": 10884 + }, + { + "epoch": 1.39, + "grad_norm": 0.7140301508599692, + "learning_rate": 2.2691238894610262e-06, + "loss": 0.5212, + "step": 10885 + }, + { + "epoch": 1.39, + "grad_norm": 0.70847909896908, + "learning_rate": 2.2682598177824295e-06, + "loss": 0.4742, + "step": 10886 + }, + { + "epoch": 1.39, + "grad_norm": 0.7408743619337694, + "learning_rate": 2.2673958623852845e-06, + "loss": 0.5357, + "step": 10887 + }, + { + "epoch": 1.39, + "grad_norm": 0.624318605854862, + "learning_rate": 2.266532023306364e-06, + "loss": 0.4861, + "step": 10888 + }, + { + "epoch": 1.39, + "grad_norm": 0.6330453646878533, + "learning_rate": 2.2656683005824414e-06, + "loss": 0.4498, + "step": 10889 + }, + { + "epoch": 1.39, + "grad_norm": 0.830984947084523, + "learning_rate": 2.26480469425028e-06, + "loss": 0.5654, + "step": 10890 + }, + { + "epoch": 1.39, + "grad_norm": 1.1469468777747078, + "learning_rate": 2.263941204346645e-06, + "loss": 0.5465, + "step": 10891 + }, + { + "epoch": 1.39, + "grad_norm": 0.7971891625506593, + "learning_rate": 2.263077830908287e-06, + "loss": 0.5525, + "step": 10892 + }, + { + "epoch": 1.39, + "grad_norm": 0.7687905450681011, + "learning_rate": 2.2622145739719618e-06, + "loss": 0.5422, + "step": 10893 + }, + { + "epoch": 1.39, + "grad_norm": 0.7859554968939656, + "learning_rate": 2.26135143357441e-06, + "loss": 0.5408, + "step": 10894 + }, + { + "epoch": 1.39, + "grad_norm": 0.7897704926868802, + "learning_rate": 2.26048840975238e-06, + "loss": 0.5684, + "step": 10895 + }, + { + "epoch": 1.39, + "grad_norm": 0.6984672281073324, + "learning_rate": 2.2596255025426016e-06, + "loss": 0.557, + "step": 10896 + }, + { + "epoch": 1.39, + "grad_norm": 0.7968758589073045, + "learning_rate": 2.2587627119818114e-06, + "loss": 0.5242, + "step": 10897 + }, + { + "epoch": 1.39, + "grad_norm": 0.7185412552560613, + "learning_rate": 2.25790003810673e-06, + "loss": 0.4765, + "step": 10898 + }, + { + "epoch": 1.39, + "grad_norm": 0.6079780037116617, + "learning_rate": 2.257037480954084e-06, + "loss": 0.4408, + "step": 10899 + }, + { + "epoch": 1.39, + "grad_norm": 0.6468975121598799, + "learning_rate": 2.2561750405605852e-06, + "loss": 0.4928, + "step": 10900 + }, + { + "epoch": 1.39, + "grad_norm": 0.7641646389324386, + "learning_rate": 2.2553127169629483e-06, + "loss": 0.4848, + "step": 10901 + }, + { + "epoch": 1.39, + "grad_norm": 0.6023575145887837, + "learning_rate": 2.2544505101978775e-06, + "loss": 0.4138, + "step": 10902 + }, + { + "epoch": 1.39, + "grad_norm": 0.6652981432363783, + "learning_rate": 2.253588420302076e-06, + "loss": 0.4721, + "step": 10903 + }, + { + "epoch": 1.39, + "grad_norm": 0.5903932658563311, + "learning_rate": 2.2527264473122375e-06, + "loss": 0.4271, + "step": 10904 + }, + { + "epoch": 1.39, + "grad_norm": 0.5738573114569012, + "learning_rate": 2.2518645912650565e-06, + "loss": 0.4052, + "step": 10905 + }, + { + "epoch": 1.39, + "grad_norm": 0.6396189298051217, + "learning_rate": 2.251002852197217e-06, + "loss": 0.4674, + "step": 10906 + }, + { + "epoch": 1.39, + "grad_norm": 0.8670799584371012, + "learning_rate": 2.2501412301454014e-06, + "loss": 0.5947, + "step": 10907 + }, + { + "epoch": 1.39, + "grad_norm": 0.8304689547212774, + "learning_rate": 2.249279725146288e-06, + "loss": 0.4901, + "step": 10908 + }, + { + "epoch": 1.39, + "grad_norm": 0.776941287475717, + "learning_rate": 2.248418337236545e-06, + "loss": 0.4466, + "step": 10909 + }, + { + "epoch": 1.39, + "grad_norm": 0.6066677428957276, + "learning_rate": 2.247557066452842e-06, + "loss": 0.4477, + "step": 10910 + }, + { + "epoch": 1.39, + "grad_norm": 0.5851247005247016, + "learning_rate": 2.246695912831837e-06, + "loss": 0.4324, + "step": 10911 + }, + { + "epoch": 1.39, + "grad_norm": 0.6741998445693816, + "learning_rate": 2.2458348764101893e-06, + "loss": 0.5196, + "step": 10912 + }, + { + "epoch": 1.39, + "grad_norm": 0.6836705911603268, + "learning_rate": 2.2449739572245522e-06, + "loss": 0.4922, + "step": 10913 + }, + { + "epoch": 1.39, + "grad_norm": 0.5734677035468869, + "learning_rate": 2.244113155311567e-06, + "loss": 0.4967, + "step": 10914 + }, + { + "epoch": 1.39, + "grad_norm": 0.6037662181064927, + "learning_rate": 2.2432524707078806e-06, + "loss": 0.4282, + "step": 10915 + }, + { + "epoch": 1.39, + "grad_norm": 0.6634103154685594, + "learning_rate": 2.242391903450126e-06, + "loss": 0.511, + "step": 10916 + }, + { + "epoch": 1.39, + "grad_norm": 0.6992329402401665, + "learning_rate": 2.2415314535749364e-06, + "loss": 0.5075, + "step": 10917 + }, + { + "epoch": 1.39, + "grad_norm": 0.5164375626469455, + "learning_rate": 2.2406711211189376e-06, + "loss": 0.4375, + "step": 10918 + }, + { + "epoch": 1.39, + "grad_norm": 0.7710943939776963, + "learning_rate": 2.2398109061187507e-06, + "loss": 0.5269, + "step": 10919 + }, + { + "epoch": 1.39, + "grad_norm": 0.7076536057752935, + "learning_rate": 2.2389508086109953e-06, + "loss": 0.5515, + "step": 10920 + }, + { + "epoch": 1.39, + "grad_norm": 0.5653953532490982, + "learning_rate": 2.2380908286322796e-06, + "loss": 0.498, + "step": 10921 + }, + { + "epoch": 1.39, + "grad_norm": 0.7356905916586278, + "learning_rate": 2.237230966219213e-06, + "loss": 0.4974, + "step": 10922 + }, + { + "epoch": 1.39, + "grad_norm": 0.6122952244606756, + "learning_rate": 2.236371221408393e-06, + "loss": 0.4574, + "step": 10923 + }, + { + "epoch": 1.39, + "grad_norm": 1.6170098335556904, + "learning_rate": 2.2355115942364212e-06, + "loss": 0.5068, + "step": 10924 + }, + { + "epoch": 1.39, + "grad_norm": 0.716916126601903, + "learning_rate": 2.2346520847398856e-06, + "loss": 0.5341, + "step": 10925 + }, + { + "epoch": 1.39, + "grad_norm": 0.7358376455793131, + "learning_rate": 2.233792692955375e-06, + "loss": 0.4922, + "step": 10926 + }, + { + "epoch": 1.39, + "grad_norm": 0.7445956345088092, + "learning_rate": 2.2329334189194684e-06, + "loss": 0.484, + "step": 10927 + }, + { + "epoch": 1.39, + "grad_norm": 0.7791986312983723, + "learning_rate": 2.232074262668746e-06, + "loss": 0.5081, + "step": 10928 + }, + { + "epoch": 1.39, + "grad_norm": 0.6554900309954337, + "learning_rate": 2.2312152242397748e-06, + "loss": 0.5061, + "step": 10929 + }, + { + "epoch": 1.39, + "grad_norm": 0.9264581091645545, + "learning_rate": 2.2303563036691262e-06, + "loss": 0.5572, + "step": 10930 + }, + { + "epoch": 1.39, + "grad_norm": 0.7857140826550714, + "learning_rate": 2.229497500993355e-06, + "loss": 0.4902, + "step": 10931 + }, + { + "epoch": 1.39, + "grad_norm": 0.6708662137603609, + "learning_rate": 2.228638816249026e-06, + "loss": 0.4536, + "step": 10932 + }, + { + "epoch": 1.39, + "grad_norm": 0.7342422332790777, + "learning_rate": 2.227780249472684e-06, + "loss": 0.4723, + "step": 10933 + }, + { + "epoch": 1.39, + "grad_norm": 0.5941908653503654, + "learning_rate": 2.226921800700881e-06, + "loss": 0.4013, + "step": 10934 + }, + { + "epoch": 1.39, + "grad_norm": 0.5758168265318097, + "learning_rate": 2.2260634699701527e-06, + "loss": 0.4454, + "step": 10935 + }, + { + "epoch": 1.39, + "grad_norm": 0.6809199870318094, + "learning_rate": 2.225205257317041e-06, + "loss": 0.4624, + "step": 10936 + }, + { + "epoch": 1.39, + "grad_norm": 0.7105686784561956, + "learning_rate": 2.2243471627780716e-06, + "loss": 0.491, + "step": 10937 + }, + { + "epoch": 1.39, + "grad_norm": 0.6869496746710227, + "learning_rate": 2.223489186389776e-06, + "loss": 0.4574, + "step": 10938 + }, + { + "epoch": 1.39, + "grad_norm": 0.6150084343203703, + "learning_rate": 2.2226313281886718e-06, + "loss": 0.4991, + "step": 10939 + }, + { + "epoch": 1.39, + "grad_norm": 0.6877291962032914, + "learning_rate": 2.2217735882112784e-06, + "loss": 0.4972, + "step": 10940 + }, + { + "epoch": 1.39, + "grad_norm": 0.8457120854417003, + "learning_rate": 2.2209159664941034e-06, + "loss": 0.45, + "step": 10941 + }, + { + "epoch": 1.39, + "grad_norm": 0.6081577001832593, + "learning_rate": 2.220058463073657e-06, + "loss": 0.4321, + "step": 10942 + }, + { + "epoch": 1.39, + "grad_norm": 0.624573766215268, + "learning_rate": 2.219201077986435e-06, + "loss": 0.4753, + "step": 10943 + }, + { + "epoch": 1.39, + "grad_norm": 0.7625434569000092, + "learning_rate": 2.21834381126894e-06, + "loss": 0.5031, + "step": 10944 + }, + { + "epoch": 1.39, + "grad_norm": 0.7393329927868117, + "learning_rate": 2.2174866629576585e-06, + "loss": 0.5064, + "step": 10945 + }, + { + "epoch": 1.39, + "grad_norm": 0.6317837046751107, + "learning_rate": 2.2166296330890802e-06, + "loss": 0.4425, + "step": 10946 + }, + { + "epoch": 1.39, + "grad_norm": 0.6471990765402897, + "learning_rate": 2.2157727216996823e-06, + "loss": 0.4404, + "step": 10947 + }, + { + "epoch": 1.39, + "grad_norm": 0.6592984573783606, + "learning_rate": 2.2149159288259435e-06, + "loss": 0.4622, + "step": 10948 + }, + { + "epoch": 1.39, + "grad_norm": 0.541674743752519, + "learning_rate": 2.2140592545043326e-06, + "loss": 0.4044, + "step": 10949 + }, + { + "epoch": 1.39, + "grad_norm": 0.6076175752250428, + "learning_rate": 2.2132026987713185e-06, + "loss": 0.457, + "step": 10950 + }, + { + "epoch": 1.4, + "grad_norm": 0.5562338123275646, + "learning_rate": 2.2123462616633585e-06, + "loss": 0.451, + "step": 10951 + }, + { + "epoch": 1.4, + "grad_norm": 1.6705609066312985, + "learning_rate": 2.211489943216911e-06, + "loss": 0.5134, + "step": 10952 + }, + { + "epoch": 1.4, + "grad_norm": 0.7642569088347329, + "learning_rate": 2.210633743468425e-06, + "loss": 0.5315, + "step": 10953 + }, + { + "epoch": 1.4, + "grad_norm": 0.841355749038376, + "learning_rate": 2.209777662454348e-06, + "loss": 0.5276, + "step": 10954 + }, + { + "epoch": 1.4, + "grad_norm": 1.0077373634766211, + "learning_rate": 2.2089217002111184e-06, + "loss": 0.5449, + "step": 10955 + }, + { + "epoch": 1.4, + "grad_norm": 0.584543446062315, + "learning_rate": 2.2080658567751727e-06, + "loss": 0.5077, + "step": 10956 + }, + { + "epoch": 1.4, + "grad_norm": 0.7767332844256136, + "learning_rate": 2.207210132182942e-06, + "loss": 0.5309, + "step": 10957 + }, + { + "epoch": 1.4, + "grad_norm": 0.6926558720085354, + "learning_rate": 2.2063545264708535e-06, + "loss": 0.479, + "step": 10958 + }, + { + "epoch": 1.4, + "grad_norm": 0.6884824516269165, + "learning_rate": 2.2054990396753236e-06, + "loss": 0.4546, + "step": 10959 + }, + { + "epoch": 1.4, + "grad_norm": 0.6168371490437435, + "learning_rate": 2.204643671832771e-06, + "loss": 0.4368, + "step": 10960 + }, + { + "epoch": 1.4, + "grad_norm": 0.7702975252598642, + "learning_rate": 2.203788422979603e-06, + "loss": 0.5454, + "step": 10961 + }, + { + "epoch": 1.4, + "grad_norm": 0.8119574575472767, + "learning_rate": 2.2029332931522284e-06, + "loss": 0.5198, + "step": 10962 + }, + { + "epoch": 1.4, + "grad_norm": 0.9934127618301879, + "learning_rate": 2.202078282387044e-06, + "loss": 0.4838, + "step": 10963 + }, + { + "epoch": 1.4, + "grad_norm": 0.6712955357044467, + "learning_rate": 2.2012233907204474e-06, + "loss": 0.4438, + "step": 10964 + }, + { + "epoch": 1.4, + "grad_norm": 0.5654238662769213, + "learning_rate": 2.2003686181888257e-06, + "loss": 0.4054, + "step": 10965 + }, + { + "epoch": 1.4, + "grad_norm": 0.60106904253335, + "learning_rate": 2.1995139648285673e-06, + "loss": 0.4268, + "step": 10966 + }, + { + "epoch": 1.4, + "grad_norm": 0.6669297735871007, + "learning_rate": 2.198659430676049e-06, + "loss": 0.4651, + "step": 10967 + }, + { + "epoch": 1.4, + "grad_norm": 0.7193298588316885, + "learning_rate": 2.1978050157676465e-06, + "loss": 0.4746, + "step": 10968 + }, + { + "epoch": 1.4, + "grad_norm": 0.6321282919246608, + "learning_rate": 2.1969507201397323e-06, + "loss": 0.4319, + "step": 10969 + }, + { + "epoch": 1.4, + "grad_norm": 1.8279561336571408, + "learning_rate": 2.196096543828666e-06, + "loss": 0.5255, + "step": 10970 + }, + { + "epoch": 1.4, + "grad_norm": 0.8142394644349056, + "learning_rate": 2.195242486870812e-06, + "loss": 0.4659, + "step": 10971 + }, + { + "epoch": 1.4, + "grad_norm": 0.7207852920771451, + "learning_rate": 2.194388549302521e-06, + "loss": 0.4924, + "step": 10972 + }, + { + "epoch": 1.4, + "grad_norm": 0.6702533859239594, + "learning_rate": 2.193534731160144e-06, + "loss": 0.5134, + "step": 10973 + }, + { + "epoch": 1.4, + "grad_norm": 0.7363740120200745, + "learning_rate": 2.1926810324800275e-06, + "loss": 0.4671, + "step": 10974 + }, + { + "epoch": 1.4, + "grad_norm": 0.6283988046886836, + "learning_rate": 2.191827453298506e-06, + "loss": 0.4751, + "step": 10975 + }, + { + "epoch": 1.4, + "grad_norm": 0.6227413034732902, + "learning_rate": 2.190973993651918e-06, + "loss": 0.4862, + "step": 10976 + }, + { + "epoch": 1.4, + "grad_norm": 0.5495609392943195, + "learning_rate": 2.1901206535765896e-06, + "loss": 0.4458, + "step": 10977 + }, + { + "epoch": 1.4, + "grad_norm": 0.6314091810114084, + "learning_rate": 2.189267433108847e-06, + "loss": 0.4807, + "step": 10978 + }, + { + "epoch": 1.4, + "grad_norm": 0.7020886683319169, + "learning_rate": 2.1884143322850067e-06, + "loss": 0.4894, + "step": 10979 + }, + { + "epoch": 1.4, + "grad_norm": 0.6301172822728136, + "learning_rate": 2.1875613511413835e-06, + "loss": 0.4169, + "step": 10980 + }, + { + "epoch": 1.4, + "grad_norm": 0.6803946015984277, + "learning_rate": 2.1867084897142876e-06, + "loss": 0.5174, + "step": 10981 + }, + { + "epoch": 1.4, + "grad_norm": 0.8329509918098765, + "learning_rate": 2.18585574804002e-06, + "loss": 0.5136, + "step": 10982 + }, + { + "epoch": 1.4, + "grad_norm": 0.6443847846269465, + "learning_rate": 2.1850031261548825e-06, + "loss": 0.4595, + "step": 10983 + }, + { + "epoch": 1.4, + "grad_norm": 0.5957642063948049, + "learning_rate": 2.1841506240951644e-06, + "loss": 0.4559, + "step": 10984 + }, + { + "epoch": 1.4, + "grad_norm": 0.7878583552992771, + "learning_rate": 2.183298241897158e-06, + "loss": 0.4749, + "step": 10985 + }, + { + "epoch": 1.4, + "grad_norm": 0.6979600090124072, + "learning_rate": 2.1824459795971426e-06, + "loss": 0.4922, + "step": 10986 + }, + { + "epoch": 1.4, + "grad_norm": 0.8209454940473095, + "learning_rate": 2.1815938372314e-06, + "loss": 0.5243, + "step": 10987 + }, + { + "epoch": 1.4, + "grad_norm": 0.6023625779386415, + "learning_rate": 2.1807418148362e-06, + "loss": 0.4576, + "step": 10988 + }, + { + "epoch": 1.4, + "grad_norm": 0.615940588367025, + "learning_rate": 2.1798899124478135e-06, + "loss": 0.4706, + "step": 10989 + }, + { + "epoch": 1.4, + "grad_norm": 0.61390045257212, + "learning_rate": 2.1790381301024993e-06, + "loss": 0.4279, + "step": 10990 + }, + { + "epoch": 1.4, + "grad_norm": 0.555138987211361, + "learning_rate": 2.17818646783652e-06, + "loss": 0.4089, + "step": 10991 + }, + { + "epoch": 1.4, + "grad_norm": 0.5683507604892607, + "learning_rate": 2.177334925686123e-06, + "loss": 0.499, + "step": 10992 + }, + { + "epoch": 1.4, + "grad_norm": 0.7343151709091383, + "learning_rate": 2.1764835036875585e-06, + "loss": 0.4961, + "step": 10993 + }, + { + "epoch": 1.4, + "grad_norm": 0.7442754463906057, + "learning_rate": 2.1756322018770685e-06, + "loss": 0.481, + "step": 10994 + }, + { + "epoch": 1.4, + "grad_norm": 0.5611698795914328, + "learning_rate": 2.1747810202908924e-06, + "loss": 0.4069, + "step": 10995 + }, + { + "epoch": 1.4, + "grad_norm": 0.6391799824310542, + "learning_rate": 2.1739299589652585e-06, + "loss": 0.4793, + "step": 10996 + }, + { + "epoch": 1.4, + "grad_norm": 1.0105235284549798, + "learning_rate": 2.173079017936397e-06, + "loss": 0.5128, + "step": 10997 + }, + { + "epoch": 1.4, + "grad_norm": 0.6317468501220947, + "learning_rate": 2.1722281972405263e-06, + "loss": 0.4406, + "step": 10998 + }, + { + "epoch": 1.4, + "grad_norm": 0.5781795300100511, + "learning_rate": 2.171377496913867e-06, + "loss": 0.4915, + "step": 10999 + }, + { + "epoch": 1.4, + "grad_norm": 0.8039107377411533, + "learning_rate": 2.1705269169926265e-06, + "loss": 0.5057, + "step": 11000 + }, + { + "epoch": 1.4, + "grad_norm": 0.647062086882824, + "learning_rate": 2.1696764575130155e-06, + "loss": 0.4705, + "step": 11001 + }, + { + "epoch": 1.4, + "grad_norm": 0.6032750648057195, + "learning_rate": 2.1688261185112314e-06, + "loss": 0.5155, + "step": 11002 + }, + { + "epoch": 1.4, + "grad_norm": 0.8725981161136266, + "learning_rate": 2.167975900023474e-06, + "loss": 0.5371, + "step": 11003 + }, + { + "epoch": 1.4, + "grad_norm": 0.7612226804291636, + "learning_rate": 2.167125802085931e-06, + "loss": 0.5239, + "step": 11004 + }, + { + "epoch": 1.4, + "grad_norm": 0.7044912290668476, + "learning_rate": 2.16627582473479e-06, + "loss": 0.5128, + "step": 11005 + }, + { + "epoch": 1.4, + "grad_norm": 0.6814329210257032, + "learning_rate": 2.165425968006232e-06, + "loss": 0.4399, + "step": 11006 + }, + { + "epoch": 1.4, + "grad_norm": 0.6221781101872258, + "learning_rate": 2.1645762319364346e-06, + "loss": 0.4438, + "step": 11007 + }, + { + "epoch": 1.4, + "grad_norm": 0.7546992385737619, + "learning_rate": 2.163726616561564e-06, + "loss": 0.4564, + "step": 11008 + }, + { + "epoch": 1.4, + "grad_norm": 0.6794415969231482, + "learning_rate": 2.16287712191779e-06, + "loss": 0.4865, + "step": 11009 + }, + { + "epoch": 1.4, + "grad_norm": 0.6438880633732903, + "learning_rate": 2.1620277480412684e-06, + "loss": 0.473, + "step": 11010 + }, + { + "epoch": 1.4, + "grad_norm": 0.7669935149515065, + "learning_rate": 2.1611784949681585e-06, + "loss": 0.5343, + "step": 11011 + }, + { + "epoch": 1.4, + "grad_norm": 0.8208067960589795, + "learning_rate": 2.1603293627346074e-06, + "loss": 0.5343, + "step": 11012 + }, + { + "epoch": 1.4, + "grad_norm": 0.5818989720137931, + "learning_rate": 2.1594803513767626e-06, + "loss": 0.4405, + "step": 11013 + }, + { + "epoch": 1.4, + "grad_norm": 0.7124391133639624, + "learning_rate": 2.15863146093076e-06, + "loss": 0.4811, + "step": 11014 + }, + { + "epoch": 1.4, + "grad_norm": 0.6852734525096708, + "learning_rate": 2.157782691432739e-06, + "loss": 0.5071, + "step": 11015 + }, + { + "epoch": 1.4, + "grad_norm": 0.6114566431044908, + "learning_rate": 2.156934042918824e-06, + "loss": 0.4313, + "step": 11016 + }, + { + "epoch": 1.4, + "grad_norm": 0.6343984270692141, + "learning_rate": 2.1560855154251433e-06, + "loss": 0.4384, + "step": 11017 + }, + { + "epoch": 1.4, + "grad_norm": 0.8647573277877423, + "learning_rate": 2.155237108987811e-06, + "loss": 0.4109, + "step": 11018 + }, + { + "epoch": 1.4, + "grad_norm": 0.6361098323114568, + "learning_rate": 2.1543888236429483e-06, + "loss": 0.4358, + "step": 11019 + }, + { + "epoch": 1.4, + "grad_norm": 0.7087256061379335, + "learning_rate": 2.1535406594266577e-06, + "loss": 0.4257, + "step": 11020 + }, + { + "epoch": 1.4, + "grad_norm": 0.8394145324181174, + "learning_rate": 2.1526926163750476e-06, + "loss": 0.4702, + "step": 11021 + }, + { + "epoch": 1.4, + "grad_norm": 0.6080438234027822, + "learning_rate": 2.151844694524212e-06, + "loss": 0.4134, + "step": 11022 + }, + { + "epoch": 1.4, + "grad_norm": 0.6440188023966902, + "learning_rate": 2.150996893910248e-06, + "loss": 0.4978, + "step": 11023 + }, + { + "epoch": 1.4, + "grad_norm": 0.8782516481647507, + "learning_rate": 2.150149214569241e-06, + "loss": 0.5174, + "step": 11024 + }, + { + "epoch": 1.4, + "grad_norm": 0.5621256069500332, + "learning_rate": 2.1493016565372764e-06, + "loss": 0.4472, + "step": 11025 + }, + { + "epoch": 1.4, + "grad_norm": 0.5990425563435783, + "learning_rate": 2.1484542198504294e-06, + "loss": 0.4666, + "step": 11026 + }, + { + "epoch": 1.4, + "grad_norm": 0.7703837477702888, + "learning_rate": 2.1476069045447756e-06, + "loss": 0.5427, + "step": 11027 + }, + { + "epoch": 1.4, + "grad_norm": 0.6400067325818959, + "learning_rate": 2.1467597106563786e-06, + "loss": 0.4565, + "step": 11028 + }, + { + "epoch": 1.41, + "grad_norm": 0.6537877982122922, + "learning_rate": 2.145912638221305e-06, + "loss": 0.5026, + "step": 11029 + }, + { + "epoch": 1.41, + "grad_norm": 0.7826774851046876, + "learning_rate": 2.1450656872756086e-06, + "loss": 0.5036, + "step": 11030 + }, + { + "epoch": 1.41, + "grad_norm": 0.6172469896270728, + "learning_rate": 2.144218857855343e-06, + "loss": 0.4702, + "step": 11031 + }, + { + "epoch": 1.41, + "grad_norm": 0.6895217052906698, + "learning_rate": 2.1433721499965565e-06, + "loss": 0.4968, + "step": 11032 + }, + { + "epoch": 1.41, + "grad_norm": 0.6139194578998511, + "learning_rate": 2.142525563735287e-06, + "loss": 0.4577, + "step": 11033 + }, + { + "epoch": 1.41, + "grad_norm": 0.7523131711554061, + "learning_rate": 2.141679099107575e-06, + "loss": 0.5359, + "step": 11034 + }, + { + "epoch": 1.41, + "grad_norm": 0.8977456841423956, + "learning_rate": 2.1408327561494485e-06, + "loss": 0.5379, + "step": 11035 + }, + { + "epoch": 1.41, + "grad_norm": 0.5891850449584672, + "learning_rate": 2.1399865348969345e-06, + "loss": 0.4571, + "step": 11036 + }, + { + "epoch": 1.41, + "grad_norm": 0.5442590572440746, + "learning_rate": 2.1391404353860562e-06, + "loss": 0.4469, + "step": 11037 + }, + { + "epoch": 1.41, + "grad_norm": 0.7141649666533418, + "learning_rate": 2.1382944576528257e-06, + "loss": 0.5459, + "step": 11038 + }, + { + "epoch": 1.41, + "grad_norm": 0.7935313665736218, + "learning_rate": 2.137448601733258e-06, + "loss": 0.5358, + "step": 11039 + }, + { + "epoch": 1.41, + "grad_norm": 0.849258668729484, + "learning_rate": 2.1366028676633534e-06, + "loss": 0.5159, + "step": 11040 + }, + { + "epoch": 1.41, + "grad_norm": 0.7024594752468707, + "learning_rate": 2.1357572554791166e-06, + "loss": 0.4733, + "step": 11041 + }, + { + "epoch": 1.41, + "grad_norm": 0.602915841238694, + "learning_rate": 2.1349117652165384e-06, + "loss": 0.4291, + "step": 11042 + }, + { + "epoch": 1.41, + "grad_norm": 0.6994290517955272, + "learning_rate": 2.1340663969116117e-06, + "loss": 0.4836, + "step": 11043 + }, + { + "epoch": 1.41, + "grad_norm": 0.7290551463236109, + "learning_rate": 2.1332211506003215e-06, + "loss": 0.5759, + "step": 11044 + }, + { + "epoch": 1.41, + "grad_norm": 0.7593836673540703, + "learning_rate": 2.1323760263186442e-06, + "loss": 0.5106, + "step": 11045 + }, + { + "epoch": 1.41, + "grad_norm": 1.569954565746356, + "learning_rate": 2.1315310241025576e-06, + "loss": 0.481, + "step": 11046 + }, + { + "epoch": 1.41, + "grad_norm": 0.7105864612712174, + "learning_rate": 2.130686143988027e-06, + "loss": 0.4888, + "step": 11047 + }, + { + "epoch": 1.41, + "grad_norm": 0.7373290698642639, + "learning_rate": 2.1298413860110207e-06, + "loss": 0.4938, + "step": 11048 + }, + { + "epoch": 1.41, + "grad_norm": 0.6343770581486133, + "learning_rate": 2.1289967502074927e-06, + "loss": 0.4557, + "step": 11049 + }, + { + "epoch": 1.41, + "grad_norm": 0.5774901260631604, + "learning_rate": 2.1281522366134007e-06, + "loss": 0.4714, + "step": 11050 + }, + { + "epoch": 1.41, + "grad_norm": 0.6454348805802032, + "learning_rate": 2.127307845264689e-06, + "loss": 0.5188, + "step": 11051 + }, + { + "epoch": 1.41, + "grad_norm": 1.109335767711921, + "learning_rate": 2.126463576197304e-06, + "loss": 0.4535, + "step": 11052 + }, + { + "epoch": 1.41, + "grad_norm": 0.6783969564126169, + "learning_rate": 2.1256194294471806e-06, + "loss": 0.4939, + "step": 11053 + }, + { + "epoch": 1.41, + "grad_norm": 0.766607386119204, + "learning_rate": 2.1247754050502545e-06, + "loss": 0.5071, + "step": 11054 + }, + { + "epoch": 1.41, + "grad_norm": 0.6764714887012292, + "learning_rate": 2.123931503042448e-06, + "loss": 0.4915, + "step": 11055 + }, + { + "epoch": 1.41, + "grad_norm": 0.67241679643197, + "learning_rate": 2.1230877234596904e-06, + "loss": 0.4741, + "step": 11056 + }, + { + "epoch": 1.41, + "grad_norm": 0.6111770577975992, + "learning_rate": 2.122244066337893e-06, + "loss": 0.437, + "step": 11057 + }, + { + "epoch": 1.41, + "grad_norm": 0.6379618823997341, + "learning_rate": 2.1214005317129714e-06, + "loss": 0.4311, + "step": 11058 + }, + { + "epoch": 1.41, + "grad_norm": 0.6652716344344426, + "learning_rate": 2.120557119620829e-06, + "loss": 0.5059, + "step": 11059 + }, + { + "epoch": 1.41, + "grad_norm": 0.6668437273619943, + "learning_rate": 2.119713830097371e-06, + "loss": 0.5124, + "step": 11060 + }, + { + "epoch": 1.41, + "grad_norm": 0.7391629689550185, + "learning_rate": 2.118870663178488e-06, + "loss": 0.5147, + "step": 11061 + }, + { + "epoch": 1.41, + "grad_norm": 0.7573888819210415, + "learning_rate": 2.118027618900077e-06, + "loss": 0.557, + "step": 11062 + }, + { + "epoch": 1.41, + "grad_norm": 0.5660830013577599, + "learning_rate": 2.1171846972980185e-06, + "loss": 0.455, + "step": 11063 + }, + { + "epoch": 1.41, + "grad_norm": 0.5414229853801957, + "learning_rate": 2.116341898408198e-06, + "loss": 0.4536, + "step": 11064 + }, + { + "epoch": 1.41, + "grad_norm": 0.656530204708304, + "learning_rate": 2.1154992222664857e-06, + "loss": 0.4856, + "step": 11065 + }, + { + "epoch": 1.41, + "grad_norm": 0.5606991332043892, + "learning_rate": 2.114656668908755e-06, + "loss": 0.4488, + "step": 11066 + }, + { + "epoch": 1.41, + "grad_norm": 0.6274954726238137, + "learning_rate": 2.113814238370867e-06, + "loss": 0.4774, + "step": 11067 + }, + { + "epoch": 1.41, + "grad_norm": 0.7467665757531308, + "learning_rate": 2.112971930688687e-06, + "loss": 0.5125, + "step": 11068 + }, + { + "epoch": 1.41, + "grad_norm": 0.6857575984261297, + "learning_rate": 2.112129745898065e-06, + "loss": 0.4938, + "step": 11069 + }, + { + "epoch": 1.41, + "grad_norm": 0.8540164108230414, + "learning_rate": 2.1112876840348527e-06, + "loss": 0.5532, + "step": 11070 + }, + { + "epoch": 1.41, + "grad_norm": 0.7300724933284729, + "learning_rate": 2.110445745134891e-06, + "loss": 0.5386, + "step": 11071 + }, + { + "epoch": 1.41, + "grad_norm": 0.6618732898394578, + "learning_rate": 2.109603929234022e-06, + "loss": 0.4534, + "step": 11072 + }, + { + "epoch": 1.41, + "grad_norm": 0.7967401665876949, + "learning_rate": 2.108762236368076e-06, + "loss": 0.4595, + "step": 11073 + }, + { + "epoch": 1.41, + "grad_norm": 0.6252598075712752, + "learning_rate": 2.1079206665728835e-06, + "loss": 0.4535, + "step": 11074 + }, + { + "epoch": 1.41, + "grad_norm": 0.658759871928441, + "learning_rate": 2.1070792198842654e-06, + "loss": 0.4671, + "step": 11075 + }, + { + "epoch": 1.41, + "grad_norm": 0.7109059281672796, + "learning_rate": 2.106237896338042e-06, + "loss": 0.5544, + "step": 11076 + }, + { + "epoch": 1.41, + "grad_norm": 0.825859182314527, + "learning_rate": 2.1053966959700223e-06, + "loss": 0.5077, + "step": 11077 + }, + { + "epoch": 1.41, + "grad_norm": 0.6402100340838714, + "learning_rate": 2.104555618816018e-06, + "loss": 0.4498, + "step": 11078 + }, + { + "epoch": 1.41, + "grad_norm": 0.7458329944559269, + "learning_rate": 2.1037146649118256e-06, + "loss": 0.5309, + "step": 11079 + }, + { + "epoch": 1.41, + "grad_norm": 0.7952858746286077, + "learning_rate": 2.1028738342932453e-06, + "loss": 0.5535, + "step": 11080 + }, + { + "epoch": 1.41, + "grad_norm": 0.8016345419733305, + "learning_rate": 2.1020331269960676e-06, + "loss": 0.4869, + "step": 11081 + }, + { + "epoch": 1.41, + "grad_norm": 0.7268533641558307, + "learning_rate": 2.101192543056081e-06, + "loss": 0.492, + "step": 11082 + }, + { + "epoch": 1.41, + "grad_norm": 0.7324600575608072, + "learning_rate": 2.100352082509063e-06, + "loss": 0.5313, + "step": 11083 + }, + { + "epoch": 1.41, + "grad_norm": 0.8198207889099383, + "learning_rate": 2.0995117453907927e-06, + "loss": 0.5854, + "step": 11084 + }, + { + "epoch": 1.41, + "grad_norm": 0.74181732426242, + "learning_rate": 2.098671531737037e-06, + "loss": 0.5267, + "step": 11085 + }, + { + "epoch": 1.41, + "grad_norm": 0.8123453098551288, + "learning_rate": 2.0978314415835643e-06, + "loss": 0.5246, + "step": 11086 + }, + { + "epoch": 1.41, + "grad_norm": 0.663872066440253, + "learning_rate": 2.096991474966132e-06, + "loss": 0.52, + "step": 11087 + }, + { + "epoch": 1.41, + "grad_norm": 0.5752085595551722, + "learning_rate": 2.0961516319204973e-06, + "loss": 0.4402, + "step": 11088 + }, + { + "epoch": 1.41, + "grad_norm": 0.5324093230432396, + "learning_rate": 2.095311912482406e-06, + "loss": 0.4791, + "step": 11089 + }, + { + "epoch": 1.41, + "grad_norm": 0.7119248843311933, + "learning_rate": 2.094472316687607e-06, + "loss": 0.4677, + "step": 11090 + }, + { + "epoch": 1.41, + "grad_norm": 0.7149301728187657, + "learning_rate": 2.0936328445718348e-06, + "loss": 0.5514, + "step": 11091 + }, + { + "epoch": 1.41, + "grad_norm": 0.7072166361555027, + "learning_rate": 2.0927934961708248e-06, + "loss": 0.5195, + "step": 11092 + }, + { + "epoch": 1.41, + "grad_norm": 0.6971844451125663, + "learning_rate": 2.0919542715203072e-06, + "loss": 0.5274, + "step": 11093 + }, + { + "epoch": 1.41, + "grad_norm": 0.5999122124993302, + "learning_rate": 2.0911151706560024e-06, + "loss": 0.4643, + "step": 11094 + }, + { + "epoch": 1.41, + "grad_norm": 0.5763592098279886, + "learning_rate": 2.0902761936136305e-06, + "loss": 0.4263, + "step": 11095 + }, + { + "epoch": 1.41, + "grad_norm": 0.5607236319769412, + "learning_rate": 2.0894373404289013e-06, + "loss": 0.445, + "step": 11096 + }, + { + "epoch": 1.41, + "grad_norm": 0.5850345854981462, + "learning_rate": 2.0885986111375234e-06, + "loss": 0.4734, + "step": 11097 + }, + { + "epoch": 1.41, + "grad_norm": 0.8683882082768954, + "learning_rate": 2.0877600057752014e-06, + "loss": 0.5556, + "step": 11098 + }, + { + "epoch": 1.41, + "grad_norm": 0.6896501195309843, + "learning_rate": 2.0869215243776276e-06, + "loss": 0.5399, + "step": 11099 + }, + { + "epoch": 1.41, + "grad_norm": 0.7537990629141019, + "learning_rate": 2.0860831669804978e-06, + "loss": 0.5455, + "step": 11100 + }, + { + "epoch": 1.41, + "grad_norm": 0.7181973726146353, + "learning_rate": 2.0852449336194943e-06, + "loss": 0.5286, + "step": 11101 + }, + { + "epoch": 1.41, + "grad_norm": 0.7061407701195933, + "learning_rate": 2.0844068243303006e-06, + "loss": 0.4676, + "step": 11102 + }, + { + "epoch": 1.41, + "grad_norm": 0.657599665368253, + "learning_rate": 2.0835688391485903e-06, + "loss": 0.4559, + "step": 11103 + }, + { + "epoch": 1.41, + "grad_norm": 0.738946946170603, + "learning_rate": 2.0827309781100354e-06, + "loss": 0.4881, + "step": 11104 + }, + { + "epoch": 1.41, + "grad_norm": 0.5833938737411153, + "learning_rate": 2.081893241250302e-06, + "loss": 0.4426, + "step": 11105 + }, + { + "epoch": 1.41, + "grad_norm": 0.6785732203369954, + "learning_rate": 2.081055628605046e-06, + "loss": 0.4466, + "step": 11106 + }, + { + "epoch": 1.41, + "grad_norm": 0.6813897254589607, + "learning_rate": 2.080218140209927e-06, + "loss": 0.4964, + "step": 11107 + }, + { + "epoch": 1.42, + "grad_norm": 0.6696063341332344, + "learning_rate": 2.079380776100589e-06, + "loss": 0.4914, + "step": 11108 + }, + { + "epoch": 1.42, + "grad_norm": 0.8094250466191285, + "learning_rate": 2.0785435363126806e-06, + "loss": 0.5014, + "step": 11109 + }, + { + "epoch": 1.42, + "grad_norm": 0.6141977248208548, + "learning_rate": 2.077706420881836e-06, + "loss": 0.5114, + "step": 11110 + }, + { + "epoch": 1.42, + "grad_norm": 0.7190070221409822, + "learning_rate": 2.076869429843693e-06, + "loss": 0.4776, + "step": 11111 + }, + { + "epoch": 1.42, + "grad_norm": 0.6745433180701251, + "learning_rate": 2.0760325632338753e-06, + "loss": 0.528, + "step": 11112 + }, + { + "epoch": 1.42, + "grad_norm": 0.7104035344480847, + "learning_rate": 2.0751958210880097e-06, + "loss": 0.5019, + "step": 11113 + }, + { + "epoch": 1.42, + "grad_norm": 0.602695777505395, + "learning_rate": 2.07435920344171e-06, + "loss": 0.4308, + "step": 11114 + }, + { + "epoch": 1.42, + "grad_norm": 0.5926985940921856, + "learning_rate": 2.073522710330591e-06, + "loss": 0.5058, + "step": 11115 + }, + { + "epoch": 1.42, + "grad_norm": 0.7151406497260272, + "learning_rate": 2.0726863417902576e-06, + "loss": 0.5797, + "step": 11116 + }, + { + "epoch": 1.42, + "grad_norm": 0.712438332720332, + "learning_rate": 2.0718500978563115e-06, + "loss": 0.4697, + "step": 11117 + }, + { + "epoch": 1.42, + "grad_norm": 0.5642295050730727, + "learning_rate": 2.0710139785643508e-06, + "loss": 0.4312, + "step": 11118 + }, + { + "epoch": 1.42, + "grad_norm": 0.6154997107422874, + "learning_rate": 2.0701779839499665e-06, + "loss": 0.4491, + "step": 11119 + }, + { + "epoch": 1.42, + "grad_norm": 0.8665712240284359, + "learning_rate": 2.069342114048741e-06, + "loss": 0.5318, + "step": 11120 + }, + { + "epoch": 1.42, + "grad_norm": 0.664008458206658, + "learning_rate": 2.0685063688962587e-06, + "loss": 0.5069, + "step": 11121 + }, + { + "epoch": 1.42, + "grad_norm": 0.7600195362182983, + "learning_rate": 2.067670748528091e-06, + "loss": 0.4687, + "step": 11122 + }, + { + "epoch": 1.42, + "grad_norm": 0.6821995629661899, + "learning_rate": 2.0668352529798107e-06, + "loss": 0.4949, + "step": 11123 + }, + { + "epoch": 1.42, + "grad_norm": 0.6727349306651959, + "learning_rate": 2.0659998822869786e-06, + "loss": 0.4401, + "step": 11124 + }, + { + "epoch": 1.42, + "grad_norm": 0.6592809234476772, + "learning_rate": 2.065164636485158e-06, + "loss": 0.4829, + "step": 11125 + }, + { + "epoch": 1.42, + "grad_norm": 0.7127709604367063, + "learning_rate": 2.0643295156098988e-06, + "loss": 0.4981, + "step": 11126 + }, + { + "epoch": 1.42, + "grad_norm": 0.6462875817956245, + "learning_rate": 2.063494519696752e-06, + "loss": 0.4907, + "step": 11127 + }, + { + "epoch": 1.42, + "grad_norm": 0.7877815350100381, + "learning_rate": 2.062659648781259e-06, + "loss": 0.5668, + "step": 11128 + }, + { + "epoch": 1.42, + "grad_norm": 0.7131156581817333, + "learning_rate": 2.061824902898958e-06, + "loss": 0.4769, + "step": 11129 + }, + { + "epoch": 1.42, + "grad_norm": 0.5986380935803617, + "learning_rate": 2.0609902820853813e-06, + "loss": 0.4739, + "step": 11130 + }, + { + "epoch": 1.42, + "grad_norm": 0.7528864095873341, + "learning_rate": 2.060155786376059e-06, + "loss": 0.512, + "step": 11131 + }, + { + "epoch": 1.42, + "grad_norm": 0.6173057949871051, + "learning_rate": 2.0593214158065086e-06, + "loss": 0.4628, + "step": 11132 + }, + { + "epoch": 1.42, + "grad_norm": 0.6378019275944758, + "learning_rate": 2.0584871704122494e-06, + "loss": 0.4462, + "step": 11133 + }, + { + "epoch": 1.42, + "grad_norm": 0.7292249256808478, + "learning_rate": 2.0576530502287908e-06, + "loss": 0.4834, + "step": 11134 + }, + { + "epoch": 1.42, + "grad_norm": 0.707451097709552, + "learning_rate": 2.056819055291641e-06, + "loss": 0.4663, + "step": 11135 + }, + { + "epoch": 1.42, + "grad_norm": 0.5947493844942456, + "learning_rate": 2.0559851856362973e-06, + "loss": 0.4755, + "step": 11136 + }, + { + "epoch": 1.42, + "grad_norm": 0.7038553669745728, + "learning_rate": 2.0551514412982583e-06, + "loss": 0.4593, + "step": 11137 + }, + { + "epoch": 1.42, + "grad_norm": 0.6810359109290924, + "learning_rate": 2.05431782231301e-06, + "loss": 0.4577, + "step": 11138 + }, + { + "epoch": 1.42, + "grad_norm": 0.8146909236585748, + "learning_rate": 2.0534843287160404e-06, + "loss": 0.4712, + "step": 11139 + }, + { + "epoch": 1.42, + "grad_norm": 0.7802695331532603, + "learning_rate": 2.0526509605428264e-06, + "loss": 0.5225, + "step": 11140 + }, + { + "epoch": 1.42, + "grad_norm": 0.7135752475066814, + "learning_rate": 2.0518177178288435e-06, + "loss": 0.4605, + "step": 11141 + }, + { + "epoch": 1.42, + "grad_norm": 0.5737366886981783, + "learning_rate": 2.0509846006095562e-06, + "loss": 0.4439, + "step": 11142 + }, + { + "epoch": 1.42, + "grad_norm": 0.5678085890581392, + "learning_rate": 2.0501516089204344e-06, + "loss": 0.445, + "step": 11143 + }, + { + "epoch": 1.42, + "grad_norm": 0.7178302723252884, + "learning_rate": 2.04931874279693e-06, + "loss": 0.5514, + "step": 11144 + }, + { + "epoch": 1.42, + "grad_norm": 0.7097429745694298, + "learning_rate": 2.0484860022745003e-06, + "loss": 0.4709, + "step": 11145 + }, + { + "epoch": 1.42, + "grad_norm": 0.6620569937354427, + "learning_rate": 2.0476533873885873e-06, + "loss": 0.4507, + "step": 11146 + }, + { + "epoch": 1.42, + "grad_norm": 0.6340940340417469, + "learning_rate": 2.046820898174637e-06, + "loss": 0.4386, + "step": 11147 + }, + { + "epoch": 1.42, + "grad_norm": 0.6325414899016335, + "learning_rate": 2.0459885346680825e-06, + "loss": 0.5396, + "step": 11148 + }, + { + "epoch": 1.42, + "grad_norm": 0.7611898111676605, + "learning_rate": 2.0451562969043582e-06, + "loss": 0.5393, + "step": 11149 + }, + { + "epoch": 1.42, + "grad_norm": 0.7448090154732241, + "learning_rate": 2.044324184918886e-06, + "loss": 0.5038, + "step": 11150 + }, + { + "epoch": 1.42, + "grad_norm": 0.6488845329716524, + "learning_rate": 2.04349219874709e-06, + "loss": 0.4581, + "step": 11151 + }, + { + "epoch": 1.42, + "grad_norm": 0.6951950446896673, + "learning_rate": 2.042660338424382e-06, + "loss": 0.4794, + "step": 11152 + }, + { + "epoch": 1.42, + "grad_norm": 1.1265786487627243, + "learning_rate": 2.0418286039861752e-06, + "loss": 0.5148, + "step": 11153 + }, + { + "epoch": 1.42, + "grad_norm": 0.7184494276937451, + "learning_rate": 2.0409969954678695e-06, + "loss": 0.4835, + "step": 11154 + }, + { + "epoch": 1.42, + "grad_norm": 0.6291749993280014, + "learning_rate": 2.040165512904866e-06, + "loss": 0.4895, + "step": 11155 + }, + { + "epoch": 1.42, + "grad_norm": 0.5923525579805895, + "learning_rate": 2.0393341563325603e-06, + "loss": 0.421, + "step": 11156 + }, + { + "epoch": 1.42, + "grad_norm": 0.6847813843617118, + "learning_rate": 2.038502925786337e-06, + "loss": 0.464, + "step": 11157 + }, + { + "epoch": 1.42, + "grad_norm": 0.6029038851321042, + "learning_rate": 2.0376718213015824e-06, + "loss": 0.4372, + "step": 11158 + }, + { + "epoch": 1.42, + "grad_norm": 0.7022769911202208, + "learning_rate": 2.0368408429136697e-06, + "loss": 0.5035, + "step": 11159 + }, + { + "epoch": 1.42, + "grad_norm": 0.6747759573489848, + "learning_rate": 2.036009990657974e-06, + "loss": 0.4732, + "step": 11160 + }, + { + "epoch": 1.42, + "grad_norm": 0.7743521332684044, + "learning_rate": 2.035179264569863e-06, + "loss": 0.4949, + "step": 11161 + }, + { + "epoch": 1.42, + "grad_norm": 0.7944481008965669, + "learning_rate": 2.0343486646846945e-06, + "loss": 0.5759, + "step": 11162 + }, + { + "epoch": 1.42, + "grad_norm": 0.7901209155580213, + "learning_rate": 2.0335181910378286e-06, + "loss": 0.5553, + "step": 11163 + }, + { + "epoch": 1.42, + "grad_norm": 0.6756152331872225, + "learning_rate": 2.0326878436646118e-06, + "loss": 0.4994, + "step": 11164 + }, + { + "epoch": 1.42, + "grad_norm": 0.7415721565195527, + "learning_rate": 2.0318576226003932e-06, + "loss": 0.5608, + "step": 11165 + }, + { + "epoch": 1.42, + "grad_norm": 0.7322104642579267, + "learning_rate": 2.031027527880509e-06, + "loss": 0.4858, + "step": 11166 + }, + { + "epoch": 1.42, + "grad_norm": 0.7065383463815319, + "learning_rate": 2.0301975595402954e-06, + "loss": 0.4636, + "step": 11167 + }, + { + "epoch": 1.42, + "grad_norm": 0.5976634669660313, + "learning_rate": 2.029367717615084e-06, + "loss": 0.4799, + "step": 11168 + }, + { + "epoch": 1.42, + "grad_norm": 0.8025401793153247, + "learning_rate": 2.0285380021401934e-06, + "loss": 0.5876, + "step": 11169 + }, + { + "epoch": 1.42, + "grad_norm": 0.671937427847929, + "learning_rate": 2.0277084131509468e-06, + "loss": 0.4363, + "step": 11170 + }, + { + "epoch": 1.42, + "grad_norm": 0.7865110059584666, + "learning_rate": 2.026878950682653e-06, + "loss": 0.5837, + "step": 11171 + }, + { + "epoch": 1.42, + "grad_norm": 0.8299314315854112, + "learning_rate": 2.0260496147706233e-06, + "loss": 0.5397, + "step": 11172 + }, + { + "epoch": 1.42, + "grad_norm": 0.7134080059598646, + "learning_rate": 2.025220405450157e-06, + "loss": 0.4863, + "step": 11173 + }, + { + "epoch": 1.42, + "grad_norm": 0.6397844984212663, + "learning_rate": 2.0243913227565526e-06, + "loss": 0.5193, + "step": 11174 + }, + { + "epoch": 1.42, + "grad_norm": 0.9453151750863827, + "learning_rate": 2.0235623667250996e-06, + "loss": 0.5469, + "step": 11175 + }, + { + "epoch": 1.42, + "grad_norm": 0.7859150893954437, + "learning_rate": 2.022733537391088e-06, + "loss": 0.5461, + "step": 11176 + }, + { + "epoch": 1.42, + "grad_norm": 0.827419865591755, + "learning_rate": 2.021904834789793e-06, + "loss": 0.546, + "step": 11177 + }, + { + "epoch": 1.42, + "grad_norm": 0.7229751042227601, + "learning_rate": 2.0210762589564942e-06, + "loss": 0.5172, + "step": 11178 + }, + { + "epoch": 1.42, + "grad_norm": 0.6197193337111703, + "learning_rate": 2.0202478099264565e-06, + "loss": 0.4905, + "step": 11179 + }, + { + "epoch": 1.42, + "grad_norm": 0.7552588795551329, + "learning_rate": 2.0194194877349516e-06, + "loss": 0.4981, + "step": 11180 + }, + { + "epoch": 1.42, + "grad_norm": 0.8029246219288291, + "learning_rate": 2.0185912924172325e-06, + "loss": 0.5319, + "step": 11181 + }, + { + "epoch": 1.42, + "grad_norm": 0.6915913682263175, + "learning_rate": 2.017763224008556e-06, + "loss": 0.5061, + "step": 11182 + }, + { + "epoch": 1.42, + "grad_norm": 0.7326998335946943, + "learning_rate": 2.016935282544168e-06, + "loss": 0.5124, + "step": 11183 + }, + { + "epoch": 1.42, + "grad_norm": 0.7469400014112825, + "learning_rate": 2.0161074680593144e-06, + "loss": 0.5211, + "step": 11184 + }, + { + "epoch": 1.42, + "grad_norm": 0.6264426768938137, + "learning_rate": 2.0152797805892286e-06, + "loss": 0.4671, + "step": 11185 + }, + { + "epoch": 1.43, + "grad_norm": 0.5935187660313015, + "learning_rate": 2.014452220169147e-06, + "loss": 0.4217, + "step": 11186 + }, + { + "epoch": 1.43, + "grad_norm": 0.782048419085864, + "learning_rate": 2.013624786834292e-06, + "loss": 0.5533, + "step": 11187 + }, + { + "epoch": 1.43, + "grad_norm": 0.856726064302872, + "learning_rate": 2.012797480619888e-06, + "loss": 0.5277, + "step": 11188 + }, + { + "epoch": 1.43, + "grad_norm": 0.9184237425366315, + "learning_rate": 2.011970301561148e-06, + "loss": 0.4777, + "step": 11189 + }, + { + "epoch": 1.43, + "grad_norm": 0.6465520845948611, + "learning_rate": 2.011143249693286e-06, + "loss": 0.4747, + "step": 11190 + }, + { + "epoch": 1.43, + "grad_norm": 0.6003114960282584, + "learning_rate": 2.0103163250515007e-06, + "loss": 0.4776, + "step": 11191 + }, + { + "epoch": 1.43, + "grad_norm": 0.7659828507649711, + "learning_rate": 2.009489527671e-06, + "loss": 0.5272, + "step": 11192 + }, + { + "epoch": 1.43, + "grad_norm": 0.6893242718725018, + "learning_rate": 2.0086628575869716e-06, + "loss": 0.449, + "step": 11193 + }, + { + "epoch": 1.43, + "grad_norm": 0.6823328161925785, + "learning_rate": 2.007836314834608e-06, + "loss": 0.4649, + "step": 11194 + }, + { + "epoch": 1.43, + "grad_norm": 0.630098347005655, + "learning_rate": 2.0070098994490893e-06, + "loss": 0.4504, + "step": 11195 + }, + { + "epoch": 1.43, + "grad_norm": 0.6010447452932413, + "learning_rate": 2.006183611465597e-06, + "loss": 0.4465, + "step": 11196 + }, + { + "epoch": 1.43, + "grad_norm": 0.6568021747081857, + "learning_rate": 2.0053574509192996e-06, + "loss": 0.4214, + "step": 11197 + }, + { + "epoch": 1.43, + "grad_norm": 0.6959581616117472, + "learning_rate": 2.004531417845368e-06, + "loss": 0.5096, + "step": 11198 + }, + { + "epoch": 1.43, + "grad_norm": 0.6687655739177488, + "learning_rate": 2.0037055122789594e-06, + "loss": 0.518, + "step": 11199 + }, + { + "epoch": 1.43, + "grad_norm": 0.8096255649297281, + "learning_rate": 2.0028797342552346e-06, + "loss": 0.4608, + "step": 11200 + }, + { + "epoch": 1.43, + "grad_norm": 0.5901434615746427, + "learning_rate": 2.00205408380934e-06, + "loss": 0.4439, + "step": 11201 + }, + { + "epoch": 1.43, + "grad_norm": 0.6712716973862201, + "learning_rate": 2.0012285609764255e-06, + "loss": 0.4618, + "step": 11202 + }, + { + "epoch": 1.43, + "grad_norm": 0.5695432627942246, + "learning_rate": 2.0004031657916254e-06, + "loss": 0.3809, + "step": 11203 + }, + { + "epoch": 1.43, + "grad_norm": 0.5874289189650648, + "learning_rate": 1.9995778982900783e-06, + "loss": 0.4209, + "step": 11204 + }, + { + "epoch": 1.43, + "grad_norm": 0.5770341747333945, + "learning_rate": 1.9987527585069117e-06, + "loss": 0.3952, + "step": 11205 + }, + { + "epoch": 1.43, + "grad_norm": 0.5818497800879188, + "learning_rate": 1.997927746477251e-06, + "loss": 0.4737, + "step": 11206 + }, + { + "epoch": 1.43, + "grad_norm": 0.7840189837384709, + "learning_rate": 1.9971028622362116e-06, + "loss": 0.5592, + "step": 11207 + }, + { + "epoch": 1.43, + "grad_norm": 0.652611209058209, + "learning_rate": 1.996278105818909e-06, + "loss": 0.4959, + "step": 11208 + }, + { + "epoch": 1.43, + "grad_norm": 0.7051516265529143, + "learning_rate": 1.9954534772604473e-06, + "loss": 0.5215, + "step": 11209 + }, + { + "epoch": 1.43, + "grad_norm": 0.7113256398763603, + "learning_rate": 1.994628976595932e-06, + "loss": 0.565, + "step": 11210 + }, + { + "epoch": 1.43, + "grad_norm": 0.6076305234717412, + "learning_rate": 1.993804603860455e-06, + "loss": 0.4279, + "step": 11211 + }, + { + "epoch": 1.43, + "grad_norm": 0.5894178483904796, + "learning_rate": 1.9929803590891115e-06, + "loss": 0.4303, + "step": 11212 + }, + { + "epoch": 1.43, + "grad_norm": 0.6180690102402564, + "learning_rate": 1.992156242316984e-06, + "loss": 0.4802, + "step": 11213 + }, + { + "epoch": 1.43, + "grad_norm": 0.8086175978414927, + "learning_rate": 1.991332253579154e-06, + "loss": 0.5127, + "step": 11214 + }, + { + "epoch": 1.43, + "grad_norm": 0.6086401440154163, + "learning_rate": 1.990508392910695e-06, + "loss": 0.4643, + "step": 11215 + }, + { + "epoch": 1.43, + "grad_norm": 0.7020827946931136, + "learning_rate": 1.9896846603466762e-06, + "loss": 0.4307, + "step": 11216 + }, + { + "epoch": 1.43, + "grad_norm": 0.6059436710261036, + "learning_rate": 1.988861055922164e-06, + "loss": 0.5241, + "step": 11217 + }, + { + "epoch": 1.43, + "grad_norm": 0.7541097747004496, + "learning_rate": 1.9880375796722116e-06, + "loss": 0.4994, + "step": 11218 + }, + { + "epoch": 1.43, + "grad_norm": 0.56993310040587, + "learning_rate": 1.9872142316318775e-06, + "loss": 0.4395, + "step": 11219 + }, + { + "epoch": 1.43, + "grad_norm": 0.7569533892047728, + "learning_rate": 1.986391011836204e-06, + "loss": 0.5588, + "step": 11220 + }, + { + "epoch": 1.43, + "grad_norm": 0.6036030514550667, + "learning_rate": 1.985567920320235e-06, + "loss": 0.4207, + "step": 11221 + }, + { + "epoch": 1.43, + "grad_norm": 0.580993050558816, + "learning_rate": 1.9847449571190087e-06, + "loss": 0.4573, + "step": 11222 + }, + { + "epoch": 1.43, + "grad_norm": 0.7089009313971928, + "learning_rate": 1.983922122267552e-06, + "loss": 0.4194, + "step": 11223 + }, + { + "epoch": 1.43, + "grad_norm": 0.6714513655693373, + "learning_rate": 1.983099415800895e-06, + "loss": 0.4894, + "step": 11224 + }, + { + "epoch": 1.43, + "grad_norm": 0.6338727037309118, + "learning_rate": 1.982276837754053e-06, + "loss": 0.504, + "step": 11225 + }, + { + "epoch": 1.43, + "grad_norm": 0.705206243507172, + "learning_rate": 1.9814543881620447e-06, + "loss": 0.4273, + "step": 11226 + }, + { + "epoch": 1.43, + "grad_norm": 0.657490437945099, + "learning_rate": 1.980632067059875e-06, + "loss": 0.4972, + "step": 11227 + }, + { + "epoch": 1.43, + "grad_norm": 0.7166418058104774, + "learning_rate": 1.97980987448255e-06, + "loss": 0.5376, + "step": 11228 + }, + { + "epoch": 1.43, + "grad_norm": 0.8066864628986806, + "learning_rate": 1.9789878104650694e-06, + "loss": 0.5398, + "step": 11229 + }, + { + "epoch": 1.43, + "grad_norm": 0.6102186772099529, + "learning_rate": 1.978165875042422e-06, + "loss": 0.4401, + "step": 11230 + }, + { + "epoch": 1.43, + "grad_norm": 0.6344805523278496, + "learning_rate": 1.977344068249599e-06, + "loss": 0.4665, + "step": 11231 + }, + { + "epoch": 1.43, + "grad_norm": 0.5943139674369584, + "learning_rate": 1.976522390121578e-06, + "loss": 0.4631, + "step": 11232 + }, + { + "epoch": 1.43, + "grad_norm": 0.7749393310510425, + "learning_rate": 1.9757008406933385e-06, + "loss": 0.546, + "step": 11233 + }, + { + "epoch": 1.43, + "grad_norm": 0.8174546481772259, + "learning_rate": 1.974879419999849e-06, + "loss": 0.5262, + "step": 11234 + }, + { + "epoch": 1.43, + "grad_norm": 0.7244544058728543, + "learning_rate": 1.9740581280760767e-06, + "loss": 0.5439, + "step": 11235 + }, + { + "epoch": 1.43, + "grad_norm": 0.7589498656694413, + "learning_rate": 1.9732369649569794e-06, + "loss": 0.5254, + "step": 11236 + }, + { + "epoch": 1.43, + "grad_norm": 0.6228498479634623, + "learning_rate": 1.9724159306775136e-06, + "loss": 0.4858, + "step": 11237 + }, + { + "epoch": 1.43, + "grad_norm": 0.7168217106689247, + "learning_rate": 1.971595025272625e-06, + "loss": 0.4725, + "step": 11238 + }, + { + "epoch": 1.43, + "grad_norm": 0.7366200843304248, + "learning_rate": 1.9707742487772607e-06, + "loss": 0.4036, + "step": 11239 + }, + { + "epoch": 1.43, + "grad_norm": 0.6148224541790992, + "learning_rate": 1.9699536012263546e-06, + "loss": 0.472, + "step": 11240 + }, + { + "epoch": 1.43, + "grad_norm": 0.7975209229405948, + "learning_rate": 1.9691330826548415e-06, + "loss": 0.5936, + "step": 11241 + }, + { + "epoch": 1.43, + "grad_norm": 0.7679398132762768, + "learning_rate": 1.9683126930976477e-06, + "loss": 0.5078, + "step": 11242 + }, + { + "epoch": 1.43, + "grad_norm": 0.7457473173654783, + "learning_rate": 1.967492432589696e-06, + "loss": 0.5207, + "step": 11243 + }, + { + "epoch": 1.43, + "grad_norm": 0.8982439231148711, + "learning_rate": 1.9666723011659e-06, + "loss": 0.5527, + "step": 11244 + }, + { + "epoch": 1.43, + "grad_norm": 0.7989003897825304, + "learning_rate": 1.965852298861172e-06, + "loss": 0.5473, + "step": 11245 + }, + { + "epoch": 1.43, + "grad_norm": 0.7330452786755091, + "learning_rate": 1.965032425710414e-06, + "loss": 0.5118, + "step": 11246 + }, + { + "epoch": 1.43, + "grad_norm": 0.6442375600589755, + "learning_rate": 1.9642126817485297e-06, + "loss": 0.4427, + "step": 11247 + }, + { + "epoch": 1.43, + "grad_norm": 0.6781118787117099, + "learning_rate": 1.9633930670104084e-06, + "loss": 0.4561, + "step": 11248 + }, + { + "epoch": 1.43, + "grad_norm": 0.5674129129708307, + "learning_rate": 1.9625735815309425e-06, + "loss": 0.4501, + "step": 11249 + }, + { + "epoch": 1.43, + "grad_norm": 0.8035437835222323, + "learning_rate": 1.9617542253450116e-06, + "loss": 0.5097, + "step": 11250 + }, + { + "epoch": 1.43, + "grad_norm": 0.6751218774101819, + "learning_rate": 1.960934998487496e-06, + "loss": 0.5032, + "step": 11251 + }, + { + "epoch": 1.43, + "grad_norm": 0.7490261223356152, + "learning_rate": 1.960115900993264e-06, + "loss": 0.5509, + "step": 11252 + }, + { + "epoch": 1.43, + "grad_norm": 1.2838419789777458, + "learning_rate": 1.9592969328971844e-06, + "loss": 0.5794, + "step": 11253 + }, + { + "epoch": 1.43, + "grad_norm": 0.7122731410537773, + "learning_rate": 1.9584780942341177e-06, + "loss": 0.4885, + "step": 11254 + }, + { + "epoch": 1.43, + "grad_norm": 0.6522230509906171, + "learning_rate": 1.95765938503892e-06, + "loss": 0.436, + "step": 11255 + }, + { + "epoch": 1.43, + "grad_norm": 0.7400595324739233, + "learning_rate": 1.95684080534644e-06, + "loss": 0.5197, + "step": 11256 + }, + { + "epoch": 1.43, + "grad_norm": 0.648825901543588, + "learning_rate": 1.956022355191523e-06, + "loss": 0.4806, + "step": 11257 + }, + { + "epoch": 1.43, + "grad_norm": 0.7492382274165247, + "learning_rate": 1.955204034609006e-06, + "loss": 0.5588, + "step": 11258 + }, + { + "epoch": 1.43, + "grad_norm": 0.7775395558613977, + "learning_rate": 1.9543858436337254e-06, + "loss": 0.5107, + "step": 11259 + }, + { + "epoch": 1.43, + "grad_norm": 0.6228703595391435, + "learning_rate": 1.953567782300505e-06, + "loss": 0.4804, + "step": 11260 + }, + { + "epoch": 1.43, + "grad_norm": 0.8130732016887502, + "learning_rate": 1.9527498506441704e-06, + "loss": 0.5168, + "step": 11261 + }, + { + "epoch": 1.43, + "grad_norm": 0.6351959209709755, + "learning_rate": 1.9519320486995362e-06, + "loss": 0.4897, + "step": 11262 + }, + { + "epoch": 1.43, + "grad_norm": 0.7671735089353404, + "learning_rate": 1.951114376501416e-06, + "loss": 0.4768, + "step": 11263 + }, + { + "epoch": 1.43, + "grad_norm": 0.6259057856947268, + "learning_rate": 1.9502968340846118e-06, + "loss": 0.4697, + "step": 11264 + }, + { + "epoch": 1.44, + "grad_norm": 0.7246265908714671, + "learning_rate": 1.9494794214839276e-06, + "loss": 0.5399, + "step": 11265 + }, + { + "epoch": 1.44, + "grad_norm": 0.9907033676110238, + "learning_rate": 1.9486621387341533e-06, + "loss": 0.5314, + "step": 11266 + }, + { + "epoch": 1.44, + "grad_norm": 0.7606284064873436, + "learning_rate": 1.9478449858700843e-06, + "loss": 0.5115, + "step": 11267 + }, + { + "epoch": 1.44, + "grad_norm": 0.7989713122800921, + "learning_rate": 1.947027962926499e-06, + "loss": 0.4791, + "step": 11268 + }, + { + "epoch": 1.44, + "grad_norm": 0.5817381244530586, + "learning_rate": 1.94621106993818e-06, + "loss": 0.4393, + "step": 11269 + }, + { + "epoch": 1.44, + "grad_norm": 0.7096321112885071, + "learning_rate": 1.9453943069398944e-06, + "loss": 0.4987, + "step": 11270 + }, + { + "epoch": 1.44, + "grad_norm": 0.876752355322076, + "learning_rate": 1.944577673966415e-06, + "loss": 0.531, + "step": 11271 + }, + { + "epoch": 1.44, + "grad_norm": 0.7602148269568045, + "learning_rate": 1.943761171052497e-06, + "loss": 0.5576, + "step": 11272 + }, + { + "epoch": 1.44, + "grad_norm": 0.8329920170953107, + "learning_rate": 1.9429447982329024e-06, + "loss": 0.5284, + "step": 11273 + }, + { + "epoch": 1.44, + "grad_norm": 0.684407161842788, + "learning_rate": 1.9421285555423767e-06, + "loss": 0.491, + "step": 11274 + }, + { + "epoch": 1.44, + "grad_norm": 0.7134658758564494, + "learning_rate": 1.9413124430156687e-06, + "loss": 0.4671, + "step": 11275 + }, + { + "epoch": 1.44, + "grad_norm": 0.72173377341069, + "learning_rate": 1.9404964606875144e-06, + "loss": 0.5359, + "step": 11276 + }, + { + "epoch": 1.44, + "grad_norm": 0.7418848216152063, + "learning_rate": 1.93968060859265e-06, + "loss": 0.5247, + "step": 11277 + }, + { + "epoch": 1.44, + "grad_norm": 0.6680440353861876, + "learning_rate": 1.9388648867658017e-06, + "loss": 0.4564, + "step": 11278 + }, + { + "epoch": 1.44, + "grad_norm": 0.7389863164262619, + "learning_rate": 1.938049295241693e-06, + "loss": 0.4955, + "step": 11279 + }, + { + "epoch": 1.44, + "grad_norm": 0.5829233732668375, + "learning_rate": 1.9372338340550427e-06, + "loss": 0.4627, + "step": 11280 + }, + { + "epoch": 1.44, + "grad_norm": 0.8067337924033198, + "learning_rate": 1.936418503240559e-06, + "loss": 0.5247, + "step": 11281 + }, + { + "epoch": 1.44, + "grad_norm": 0.6413165594994173, + "learning_rate": 1.9356033028329524e-06, + "loss": 0.4939, + "step": 11282 + }, + { + "epoch": 1.44, + "grad_norm": 0.7492170614952702, + "learning_rate": 1.934788232866918e-06, + "loss": 0.5465, + "step": 11283 + }, + { + "epoch": 1.44, + "grad_norm": 0.5525736106000592, + "learning_rate": 1.933973293377154e-06, + "loss": 0.4346, + "step": 11284 + }, + { + "epoch": 1.44, + "grad_norm": 0.7094188651103669, + "learning_rate": 1.9331584843983513e-06, + "loss": 0.477, + "step": 11285 + }, + { + "epoch": 1.44, + "grad_norm": 0.5852625217234446, + "learning_rate": 1.9323438059651895e-06, + "loss": 0.4597, + "step": 11286 + }, + { + "epoch": 1.44, + "grad_norm": 0.5883877451196234, + "learning_rate": 1.9315292581123514e-06, + "loss": 0.462, + "step": 11287 + }, + { + "epoch": 1.44, + "grad_norm": 0.6345828121336127, + "learning_rate": 1.930714840874505e-06, + "loss": 0.487, + "step": 11288 + }, + { + "epoch": 1.44, + "grad_norm": 0.7379326876688959, + "learning_rate": 1.9299005542863213e-06, + "loss": 0.5374, + "step": 11289 + }, + { + "epoch": 1.44, + "grad_norm": 0.5923668077726157, + "learning_rate": 1.9290863983824594e-06, + "loss": 0.4956, + "step": 11290 + }, + { + "epoch": 1.44, + "grad_norm": 1.449338058742408, + "learning_rate": 1.9282723731975754e-06, + "loss": 0.507, + "step": 11291 + }, + { + "epoch": 1.44, + "grad_norm": 0.6693495554699556, + "learning_rate": 1.927458478766323e-06, + "loss": 0.4585, + "step": 11292 + }, + { + "epoch": 1.44, + "grad_norm": 0.9453083517928659, + "learning_rate": 1.9266447151233432e-06, + "loss": 0.53, + "step": 11293 + }, + { + "epoch": 1.44, + "grad_norm": 0.6765940905558698, + "learning_rate": 1.925831082303278e-06, + "loss": 0.5118, + "step": 11294 + }, + { + "epoch": 1.44, + "grad_norm": 0.7102414801322271, + "learning_rate": 1.925017580340759e-06, + "loss": 0.4205, + "step": 11295 + }, + { + "epoch": 1.44, + "grad_norm": 0.6814300141816841, + "learning_rate": 1.924204209270416e-06, + "loss": 0.4422, + "step": 11296 + }, + { + "epoch": 1.44, + "grad_norm": 0.6169935658259402, + "learning_rate": 1.9233909691268705e-06, + "loss": 0.5047, + "step": 11297 + }, + { + "epoch": 1.44, + "grad_norm": 0.6935651286225928, + "learning_rate": 1.922577859944741e-06, + "loss": 0.5163, + "step": 11298 + }, + { + "epoch": 1.44, + "grad_norm": 0.6450800111967042, + "learning_rate": 1.921764881758636e-06, + "loss": 0.4421, + "step": 11299 + }, + { + "epoch": 1.44, + "grad_norm": 0.620304074890015, + "learning_rate": 1.9209520346031656e-06, + "loss": 0.4345, + "step": 11300 + }, + { + "epoch": 1.44, + "grad_norm": 0.7315790932606675, + "learning_rate": 1.9201393185129256e-06, + "loss": 0.4929, + "step": 11301 + }, + { + "epoch": 1.44, + "grad_norm": 0.7004323220145465, + "learning_rate": 1.919326733522515e-06, + "loss": 0.5108, + "step": 11302 + }, + { + "epoch": 1.44, + "grad_norm": 0.6271859644453363, + "learning_rate": 1.918514279666518e-06, + "loss": 0.451, + "step": 11303 + }, + { + "epoch": 1.44, + "grad_norm": 0.6497526757629732, + "learning_rate": 1.9177019569795235e-06, + "loss": 0.4695, + "step": 11304 + }, + { + "epoch": 1.44, + "grad_norm": 0.7336140160469972, + "learning_rate": 1.916889765496106e-06, + "loss": 0.534, + "step": 11305 + }, + { + "epoch": 1.44, + "grad_norm": 0.662297858676538, + "learning_rate": 1.9160777052508404e-06, + "loss": 0.46, + "step": 11306 + }, + { + "epoch": 1.44, + "grad_norm": 0.6510159096884519, + "learning_rate": 1.915265776278291e-06, + "loss": 0.5155, + "step": 11307 + }, + { + "epoch": 1.44, + "grad_norm": 0.6823491584859847, + "learning_rate": 1.9144539786130212e-06, + "loss": 0.4816, + "step": 11308 + }, + { + "epoch": 1.44, + "grad_norm": 0.7294871088530409, + "learning_rate": 1.913642312289584e-06, + "loss": 0.5134, + "step": 11309 + }, + { + "epoch": 1.44, + "grad_norm": 0.5819265618427858, + "learning_rate": 1.912830777342533e-06, + "loss": 0.4401, + "step": 11310 + }, + { + "epoch": 1.44, + "grad_norm": 0.6078669650032759, + "learning_rate": 1.9120193738064088e-06, + "loss": 0.4367, + "step": 11311 + }, + { + "epoch": 1.44, + "grad_norm": 0.9069761228682602, + "learning_rate": 1.9112081017157535e-06, + "loss": 0.5096, + "step": 11312 + }, + { + "epoch": 1.44, + "grad_norm": 0.6029898006326574, + "learning_rate": 1.910396961105097e-06, + "loss": 0.4402, + "step": 11313 + }, + { + "epoch": 1.44, + "grad_norm": 0.6438412119564003, + "learning_rate": 1.909585952008971e-06, + "loss": 0.4172, + "step": 11314 + }, + { + "epoch": 1.44, + "grad_norm": 0.5404246754231471, + "learning_rate": 1.908775074461892e-06, + "loss": 0.4303, + "step": 11315 + }, + { + "epoch": 1.44, + "grad_norm": 0.5388010883116349, + "learning_rate": 1.9079643284983836e-06, + "loss": 0.3819, + "step": 11316 + }, + { + "epoch": 1.44, + "grad_norm": 0.605711647623545, + "learning_rate": 1.9071537141529512e-06, + "loss": 0.4336, + "step": 11317 + }, + { + "epoch": 1.44, + "grad_norm": 0.6829792860415591, + "learning_rate": 1.9063432314601033e-06, + "loss": 0.5251, + "step": 11318 + }, + { + "epoch": 1.44, + "grad_norm": 0.5740065416349589, + "learning_rate": 1.9055328804543367e-06, + "loss": 0.4714, + "step": 11319 + }, + { + "epoch": 1.44, + "grad_norm": 0.716931513077479, + "learning_rate": 1.904722661170148e-06, + "loss": 0.5264, + "step": 11320 + }, + { + "epoch": 1.44, + "grad_norm": 0.671637368678671, + "learning_rate": 1.9039125736420233e-06, + "loss": 0.4914, + "step": 11321 + }, + { + "epoch": 1.44, + "grad_norm": 0.620428701543499, + "learning_rate": 1.9031026179044488e-06, + "loss": 0.4891, + "step": 11322 + }, + { + "epoch": 1.44, + "grad_norm": 0.768085628408398, + "learning_rate": 1.9022927939918972e-06, + "loss": 0.5227, + "step": 11323 + }, + { + "epoch": 1.44, + "grad_norm": 0.6760972529286847, + "learning_rate": 1.9014831019388447e-06, + "loss": 0.4718, + "step": 11324 + }, + { + "epoch": 1.44, + "grad_norm": 0.6478800558768326, + "learning_rate": 1.9006735417797528e-06, + "loss": 0.4785, + "step": 11325 + }, + { + "epoch": 1.44, + "grad_norm": 0.647028749615993, + "learning_rate": 1.8998641135490863e-06, + "loss": 0.4857, + "step": 11326 + }, + { + "epoch": 1.44, + "grad_norm": 0.6887815457774414, + "learning_rate": 1.899054817281296e-06, + "loss": 0.4534, + "step": 11327 + }, + { + "epoch": 1.44, + "grad_norm": 0.6492984440097781, + "learning_rate": 1.898245653010833e-06, + "loss": 0.4748, + "step": 11328 + }, + { + "epoch": 1.44, + "grad_norm": 0.585218737768615, + "learning_rate": 1.8974366207721407e-06, + "loss": 0.4185, + "step": 11329 + }, + { + "epoch": 1.44, + "grad_norm": 0.6147387354800559, + "learning_rate": 1.8966277205996592e-06, + "loss": 0.4824, + "step": 11330 + }, + { + "epoch": 1.44, + "grad_norm": 0.5514378080388994, + "learning_rate": 1.8958189525278164e-06, + "loss": 0.4462, + "step": 11331 + }, + { + "epoch": 1.44, + "grad_norm": 0.6715572472281072, + "learning_rate": 1.8950103165910433e-06, + "loss": 0.425, + "step": 11332 + }, + { + "epoch": 1.44, + "grad_norm": 0.6701754334618644, + "learning_rate": 1.8942018128237571e-06, + "loss": 0.4713, + "step": 11333 + }, + { + "epoch": 1.44, + "grad_norm": 0.7383832353037361, + "learning_rate": 1.8933934412603766e-06, + "loss": 0.493, + "step": 11334 + }, + { + "epoch": 1.44, + "grad_norm": 0.7137313548136808, + "learning_rate": 1.892585201935308e-06, + "loss": 0.4885, + "step": 11335 + }, + { + "epoch": 1.44, + "grad_norm": 0.654454215507374, + "learning_rate": 1.89177709488296e-06, + "loss": 0.4481, + "step": 11336 + }, + { + "epoch": 1.44, + "grad_norm": 0.6013489032851089, + "learning_rate": 1.890969120137726e-06, + "loss": 0.4469, + "step": 11337 + }, + { + "epoch": 1.44, + "grad_norm": 0.6710988442836858, + "learning_rate": 1.890161277734004e-06, + "loss": 0.5161, + "step": 11338 + }, + { + "epoch": 1.44, + "grad_norm": 0.7589133859914434, + "learning_rate": 1.889353567706177e-06, + "loss": 0.4744, + "step": 11339 + }, + { + "epoch": 1.44, + "grad_norm": 0.5587001051874372, + "learning_rate": 1.8885459900886282e-06, + "loss": 0.4116, + "step": 11340 + }, + { + "epoch": 1.44, + "grad_norm": 0.6221389837172623, + "learning_rate": 1.8877385449157354e-06, + "loss": 0.4751, + "step": 11341 + }, + { + "epoch": 1.44, + "grad_norm": 0.7048839085214085, + "learning_rate": 1.8869312322218658e-06, + "loss": 0.4974, + "step": 11342 + }, + { + "epoch": 1.45, + "grad_norm": 0.6847705843990224, + "learning_rate": 1.8861240520413876e-06, + "loss": 0.5328, + "step": 11343 + }, + { + "epoch": 1.45, + "grad_norm": 0.6374377065226372, + "learning_rate": 1.8853170044086562e-06, + "loss": 0.4914, + "step": 11344 + }, + { + "epoch": 1.45, + "grad_norm": 0.8486749525142614, + "learning_rate": 1.8845100893580264e-06, + "loss": 0.524, + "step": 11345 + }, + { + "epoch": 1.45, + "grad_norm": 0.6157314279655526, + "learning_rate": 1.8837033069238487e-06, + "loss": 0.4885, + "step": 11346 + }, + { + "epoch": 1.45, + "grad_norm": 0.7449133394383595, + "learning_rate": 1.8828966571404606e-06, + "loss": 0.5735, + "step": 11347 + }, + { + "epoch": 1.45, + "grad_norm": 0.7700475046969686, + "learning_rate": 1.8820901400422032e-06, + "loss": 0.4702, + "step": 11348 + }, + { + "epoch": 1.45, + "grad_norm": 0.6070447399975536, + "learning_rate": 1.8812837556634033e-06, + "loss": 0.4492, + "step": 11349 + }, + { + "epoch": 1.45, + "grad_norm": 0.6149711047935372, + "learning_rate": 1.8804775040383894e-06, + "loss": 0.4682, + "step": 11350 + }, + { + "epoch": 1.45, + "grad_norm": 0.6203758570810054, + "learning_rate": 1.879671385201478e-06, + "loss": 0.4655, + "step": 11351 + }, + { + "epoch": 1.45, + "grad_norm": 0.5958757086161411, + "learning_rate": 1.878865399186985e-06, + "loss": 0.4392, + "step": 11352 + }, + { + "epoch": 1.45, + "grad_norm": 0.7035927179298433, + "learning_rate": 1.8780595460292195e-06, + "loss": 0.5019, + "step": 11353 + }, + { + "epoch": 1.45, + "grad_norm": 0.6023035148596293, + "learning_rate": 1.8772538257624812e-06, + "loss": 0.4745, + "step": 11354 + }, + { + "epoch": 1.45, + "grad_norm": 0.5122257746860032, + "learning_rate": 1.8764482384210703e-06, + "loss": 0.4242, + "step": 11355 + }, + { + "epoch": 1.45, + "grad_norm": 0.6360494899344628, + "learning_rate": 1.8756427840392749e-06, + "loss": 0.4676, + "step": 11356 + }, + { + "epoch": 1.45, + "grad_norm": 0.787876292155766, + "learning_rate": 1.8748374626513838e-06, + "loss": 0.517, + "step": 11357 + }, + { + "epoch": 1.45, + "grad_norm": 0.7996470719930312, + "learning_rate": 1.8740322742916738e-06, + "loss": 0.4522, + "step": 11358 + }, + { + "epoch": 1.45, + "grad_norm": 0.7036255965940895, + "learning_rate": 1.8732272189944224e-06, + "loss": 0.5299, + "step": 11359 + }, + { + "epoch": 1.45, + "grad_norm": 0.759481931586932, + "learning_rate": 1.8724222967938944e-06, + "loss": 0.5199, + "step": 11360 + }, + { + "epoch": 1.45, + "grad_norm": 0.695578461214734, + "learning_rate": 1.871617507724357e-06, + "loss": 0.5408, + "step": 11361 + }, + { + "epoch": 1.45, + "grad_norm": 0.7121033471142931, + "learning_rate": 1.8708128518200635e-06, + "loss": 0.4968, + "step": 11362 + }, + { + "epoch": 1.45, + "grad_norm": 0.695666574665368, + "learning_rate": 1.8700083291152692e-06, + "loss": 0.4235, + "step": 11363 + }, + { + "epoch": 1.45, + "grad_norm": 0.6810425887666096, + "learning_rate": 1.8692039396442169e-06, + "loss": 0.5027, + "step": 11364 + }, + { + "epoch": 1.45, + "grad_norm": 0.8222239671783307, + "learning_rate": 1.8683996834411483e-06, + "loss": 0.5252, + "step": 11365 + }, + { + "epoch": 1.45, + "grad_norm": 0.8050408408150342, + "learning_rate": 1.8675955605402979e-06, + "loss": 0.5058, + "step": 11366 + }, + { + "epoch": 1.45, + "grad_norm": 0.8433880324569696, + "learning_rate": 1.866791570975896e-06, + "loss": 0.5451, + "step": 11367 + }, + { + "epoch": 1.45, + "grad_norm": 0.8072636368134379, + "learning_rate": 1.8659877147821637e-06, + "loss": 0.5163, + "step": 11368 + }, + { + "epoch": 1.45, + "grad_norm": 0.6638410952524344, + "learning_rate": 1.8651839919933206e-06, + "loss": 0.544, + "step": 11369 + }, + { + "epoch": 1.45, + "grad_norm": 0.7098494588444912, + "learning_rate": 1.8643804026435758e-06, + "loss": 0.528, + "step": 11370 + }, + { + "epoch": 1.45, + "grad_norm": 0.5987182590874014, + "learning_rate": 1.8635769467671394e-06, + "loss": 0.4282, + "step": 11371 + }, + { + "epoch": 1.45, + "grad_norm": 0.6676643531043296, + "learning_rate": 1.862773624398208e-06, + "loss": 0.4383, + "step": 11372 + }, + { + "epoch": 1.45, + "grad_norm": 1.8879658039018834, + "learning_rate": 1.8619704355709795e-06, + "loss": 0.5059, + "step": 11373 + }, + { + "epoch": 1.45, + "grad_norm": 0.8757993116504977, + "learning_rate": 1.8611673803196407e-06, + "loss": 0.5329, + "step": 11374 + }, + { + "epoch": 1.45, + "grad_norm": 0.8558627979305645, + "learning_rate": 1.8603644586783775e-06, + "loss": 0.5098, + "step": 11375 + }, + { + "epoch": 1.45, + "grad_norm": 0.6379557817780248, + "learning_rate": 1.8595616706813652e-06, + "loss": 0.4782, + "step": 11376 + }, + { + "epoch": 1.45, + "grad_norm": 0.6029648904436841, + "learning_rate": 1.8587590163627773e-06, + "loss": 0.4839, + "step": 11377 + }, + { + "epoch": 1.45, + "grad_norm": 0.805259381730763, + "learning_rate": 1.8579564957567791e-06, + "loss": 0.5867, + "step": 11378 + }, + { + "epoch": 1.45, + "grad_norm": 0.7387227870112327, + "learning_rate": 1.8571541088975347e-06, + "loss": 0.5359, + "step": 11379 + }, + { + "epoch": 1.45, + "grad_norm": 0.6835881395539167, + "learning_rate": 1.8563518558191957e-06, + "loss": 0.4796, + "step": 11380 + }, + { + "epoch": 1.45, + "grad_norm": 0.7993258912990056, + "learning_rate": 1.8555497365559133e-06, + "loss": 0.5667, + "step": 11381 + }, + { + "epoch": 1.45, + "grad_norm": 0.6453699321625503, + "learning_rate": 1.8547477511418288e-06, + "loss": 0.422, + "step": 11382 + }, + { + "epoch": 1.45, + "grad_norm": 0.6184484823844142, + "learning_rate": 1.8539458996110838e-06, + "loss": 0.4446, + "step": 11383 + }, + { + "epoch": 1.45, + "grad_norm": 0.7343103680563909, + "learning_rate": 1.8531441819978069e-06, + "loss": 0.5102, + "step": 11384 + }, + { + "epoch": 1.45, + "grad_norm": 0.5681145579544644, + "learning_rate": 1.8523425983361276e-06, + "loss": 0.438, + "step": 11385 + }, + { + "epoch": 1.45, + "grad_norm": 0.6409367061892022, + "learning_rate": 1.8515411486601637e-06, + "loss": 0.5028, + "step": 11386 + }, + { + "epoch": 1.45, + "grad_norm": 0.7225861684427305, + "learning_rate": 1.8507398330040343e-06, + "loss": 0.4309, + "step": 11387 + }, + { + "epoch": 1.45, + "grad_norm": 0.6597519877532663, + "learning_rate": 1.849938651401844e-06, + "loss": 0.4718, + "step": 11388 + }, + { + "epoch": 1.45, + "grad_norm": 0.6714276098028268, + "learning_rate": 1.849137603887702e-06, + "loss": 0.513, + "step": 11389 + }, + { + "epoch": 1.45, + "grad_norm": 0.5961933529361594, + "learning_rate": 1.848336690495699e-06, + "loss": 0.4208, + "step": 11390 + }, + { + "epoch": 1.45, + "grad_norm": 0.6045272297501535, + "learning_rate": 1.8475359112599362e-06, + "loss": 0.3906, + "step": 11391 + }, + { + "epoch": 1.45, + "grad_norm": 0.8572888042345367, + "learning_rate": 1.8467352662144934e-06, + "loss": 0.4952, + "step": 11392 + }, + { + "epoch": 1.45, + "grad_norm": 0.6472699547003887, + "learning_rate": 1.8459347553934559e-06, + "loss": 0.4775, + "step": 11393 + }, + { + "epoch": 1.45, + "grad_norm": 0.6696770115024709, + "learning_rate": 1.8451343788308952e-06, + "loss": 0.5135, + "step": 11394 + }, + { + "epoch": 1.45, + "grad_norm": 0.8168211517611075, + "learning_rate": 1.844334136560884e-06, + "loss": 0.5001, + "step": 11395 + }, + { + "epoch": 1.45, + "grad_norm": 0.6928336793428703, + "learning_rate": 1.843534028617483e-06, + "loss": 0.4166, + "step": 11396 + }, + { + "epoch": 1.45, + "grad_norm": 0.6208105925383309, + "learning_rate": 1.8427340550347533e-06, + "loss": 0.4772, + "step": 11397 + }, + { + "epoch": 1.45, + "grad_norm": 0.6133442560191401, + "learning_rate": 1.8419342158467441e-06, + "loss": 0.4637, + "step": 11398 + }, + { + "epoch": 1.45, + "grad_norm": 0.5850717184361476, + "learning_rate": 1.8411345110875057e-06, + "loss": 0.4364, + "step": 11399 + }, + { + "epoch": 1.45, + "grad_norm": 0.6289716505473774, + "learning_rate": 1.8403349407910742e-06, + "loss": 0.4291, + "step": 11400 + }, + { + "epoch": 1.45, + "grad_norm": 0.6269852948282604, + "learning_rate": 1.8395355049914897e-06, + "loss": 0.435, + "step": 11401 + }, + { + "epoch": 1.45, + "grad_norm": 0.5992514811941154, + "learning_rate": 1.8387362037227779e-06, + "loss": 0.4228, + "step": 11402 + }, + { + "epoch": 1.45, + "grad_norm": 0.6669948651458143, + "learning_rate": 1.8379370370189636e-06, + "loss": 0.4457, + "step": 11403 + }, + { + "epoch": 1.45, + "grad_norm": 0.5847273644145587, + "learning_rate": 1.8371380049140675e-06, + "loss": 0.458, + "step": 11404 + }, + { + "epoch": 1.45, + "grad_norm": 0.7427856648280324, + "learning_rate": 1.8363391074420972e-06, + "loss": 0.5212, + "step": 11405 + }, + { + "epoch": 1.45, + "grad_norm": 0.8804268564421519, + "learning_rate": 1.8355403446370612e-06, + "loss": 0.5632, + "step": 11406 + }, + { + "epoch": 1.45, + "grad_norm": 0.7269331058491464, + "learning_rate": 1.8347417165329628e-06, + "loss": 0.5342, + "step": 11407 + }, + { + "epoch": 1.45, + "grad_norm": 0.7447227093176003, + "learning_rate": 1.8339432231637928e-06, + "loss": 0.4994, + "step": 11408 + }, + { + "epoch": 1.45, + "grad_norm": 0.6879438327484899, + "learning_rate": 1.8331448645635441e-06, + "loss": 0.4818, + "step": 11409 + }, + { + "epoch": 1.45, + "grad_norm": 0.6820188400244761, + "learning_rate": 1.8323466407661972e-06, + "loss": 0.4584, + "step": 11410 + }, + { + "epoch": 1.45, + "grad_norm": 0.790304865458631, + "learning_rate": 1.8315485518057335e-06, + "loss": 0.4753, + "step": 11411 + }, + { + "epoch": 1.45, + "grad_norm": 0.8620632386124969, + "learning_rate": 1.8307505977161206e-06, + "loss": 0.4327, + "step": 11412 + }, + { + "epoch": 1.45, + "grad_norm": 0.814672743559231, + "learning_rate": 1.8299527785313292e-06, + "loss": 0.5779, + "step": 11413 + }, + { + "epoch": 1.45, + "grad_norm": 0.7305306394057026, + "learning_rate": 1.829155094285317e-06, + "loss": 0.5228, + "step": 11414 + }, + { + "epoch": 1.45, + "grad_norm": 0.6052248339321922, + "learning_rate": 1.8283575450120389e-06, + "loss": 0.4842, + "step": 11415 + }, + { + "epoch": 1.45, + "grad_norm": 0.7594348748008597, + "learning_rate": 1.827560130745447e-06, + "loss": 0.566, + "step": 11416 + }, + { + "epoch": 1.45, + "grad_norm": 0.7507693321932712, + "learning_rate": 1.8267628515194813e-06, + "loss": 0.5104, + "step": 11417 + }, + { + "epoch": 1.45, + "grad_norm": 0.7326304713664651, + "learning_rate": 1.8259657073680826e-06, + "loss": 0.5434, + "step": 11418 + }, + { + "epoch": 1.45, + "grad_norm": 0.6851379089020471, + "learning_rate": 1.8251686983251788e-06, + "loss": 0.5361, + "step": 11419 + }, + { + "epoch": 1.45, + "grad_norm": 0.7546095013146231, + "learning_rate": 1.8243718244247e-06, + "loss": 0.5257, + "step": 11420 + }, + { + "epoch": 1.45, + "grad_norm": 0.7441539723313164, + "learning_rate": 1.8235750857005629e-06, + "loss": 0.5138, + "step": 11421 + }, + { + "epoch": 1.46, + "grad_norm": 0.6026725448776842, + "learning_rate": 1.822778482186686e-06, + "loss": 0.4859, + "step": 11422 + }, + { + "epoch": 1.46, + "grad_norm": 0.752596168333072, + "learning_rate": 1.8219820139169741e-06, + "loss": 0.5233, + "step": 11423 + }, + { + "epoch": 1.46, + "grad_norm": 0.6210477534583401, + "learning_rate": 1.8211856809253348e-06, + "loss": 0.4774, + "step": 11424 + }, + { + "epoch": 1.46, + "grad_norm": 0.7279576260642828, + "learning_rate": 1.820389483245661e-06, + "loss": 0.4624, + "step": 11425 + }, + { + "epoch": 1.46, + "grad_norm": 0.6632562080822728, + "learning_rate": 1.8195934209118477e-06, + "loss": 0.4616, + "step": 11426 + }, + { + "epoch": 1.46, + "grad_norm": 0.6595600738049575, + "learning_rate": 1.8187974939577763e-06, + "loss": 0.4518, + "step": 11427 + }, + { + "epoch": 1.46, + "grad_norm": 0.6129363296267166, + "learning_rate": 1.818001702417333e-06, + "loss": 0.4411, + "step": 11428 + }, + { + "epoch": 1.46, + "grad_norm": 0.5890918786735774, + "learning_rate": 1.8172060463243874e-06, + "loss": 0.445, + "step": 11429 + }, + { + "epoch": 1.46, + "grad_norm": 0.7662057066416829, + "learning_rate": 1.816410525712811e-06, + "loss": 0.5797, + "step": 11430 + }, + { + "epoch": 1.46, + "grad_norm": 0.7353500142376335, + "learning_rate": 1.8156151406164645e-06, + "loss": 0.5474, + "step": 11431 + }, + { + "epoch": 1.46, + "grad_norm": 0.7961389928726098, + "learning_rate": 1.8148198910692066e-06, + "loss": 0.5096, + "step": 11432 + }, + { + "epoch": 1.46, + "grad_norm": 0.8054147583020271, + "learning_rate": 1.8140247771048863e-06, + "loss": 0.5398, + "step": 11433 + }, + { + "epoch": 1.46, + "grad_norm": 0.6711575202310909, + "learning_rate": 1.8132297987573516e-06, + "loss": 0.4428, + "step": 11434 + }, + { + "epoch": 1.46, + "grad_norm": 0.5596974355278252, + "learning_rate": 1.8124349560604394e-06, + "loss": 0.4393, + "step": 11435 + }, + { + "epoch": 1.46, + "grad_norm": 0.6128956121929855, + "learning_rate": 1.8116402490479867e-06, + "loss": 0.427, + "step": 11436 + }, + { + "epoch": 1.46, + "grad_norm": 0.6819978671117611, + "learning_rate": 1.8108456777538185e-06, + "loss": 0.4981, + "step": 11437 + }, + { + "epoch": 1.46, + "grad_norm": 0.7662783071339638, + "learning_rate": 1.8100512422117606e-06, + "loss": 0.5706, + "step": 11438 + }, + { + "epoch": 1.46, + "grad_norm": 0.685410470491554, + "learning_rate": 1.809256942455624e-06, + "loss": 0.5081, + "step": 11439 + }, + { + "epoch": 1.46, + "grad_norm": 0.6449733075945355, + "learning_rate": 1.8084627785192265e-06, + "loss": 0.4794, + "step": 11440 + }, + { + "epoch": 1.46, + "grad_norm": 0.7363273460430979, + "learning_rate": 1.8076687504363683e-06, + "loss": 0.5689, + "step": 11441 + }, + { + "epoch": 1.46, + "grad_norm": 0.8143814366111622, + "learning_rate": 1.8068748582408518e-06, + "loss": 0.4884, + "step": 11442 + }, + { + "epoch": 1.46, + "grad_norm": 0.615905581501184, + "learning_rate": 1.806081101966467e-06, + "loss": 0.4912, + "step": 11443 + }, + { + "epoch": 1.46, + "grad_norm": 0.6070018823252646, + "learning_rate": 1.8052874816470056e-06, + "loss": 0.5057, + "step": 11444 + }, + { + "epoch": 1.46, + "grad_norm": 0.7045085296061119, + "learning_rate": 1.804493997316245e-06, + "loss": 0.5331, + "step": 11445 + }, + { + "epoch": 1.46, + "grad_norm": 0.6514146328919574, + "learning_rate": 1.8037006490079656e-06, + "loss": 0.4335, + "step": 11446 + }, + { + "epoch": 1.46, + "grad_norm": 0.7368975197273516, + "learning_rate": 1.8029074367559335e-06, + "loss": 0.4163, + "step": 11447 + }, + { + "epoch": 1.46, + "grad_norm": 0.6413597586439923, + "learning_rate": 1.8021143605939173e-06, + "loss": 0.4392, + "step": 11448 + }, + { + "epoch": 1.46, + "grad_norm": 0.6562755475241322, + "learning_rate": 1.8013214205556722e-06, + "loss": 0.4934, + "step": 11449 + }, + { + "epoch": 1.46, + "grad_norm": 0.5744958712650743, + "learning_rate": 1.8005286166749548e-06, + "loss": 0.4319, + "step": 11450 + }, + { + "epoch": 1.46, + "grad_norm": 0.5898944792749423, + "learning_rate": 1.7997359489855083e-06, + "loss": 0.4334, + "step": 11451 + }, + { + "epoch": 1.46, + "grad_norm": 0.5474595776835868, + "learning_rate": 1.798943417521075e-06, + "loss": 0.4424, + "step": 11452 + }, + { + "epoch": 1.46, + "grad_norm": 0.7212489705428717, + "learning_rate": 1.7981510223153919e-06, + "loss": 0.4679, + "step": 11453 + }, + { + "epoch": 1.46, + "grad_norm": 0.6659344670590249, + "learning_rate": 1.79735876340219e-06, + "loss": 0.4797, + "step": 11454 + }, + { + "epoch": 1.46, + "grad_norm": 0.5789524318917341, + "learning_rate": 1.7965666408151894e-06, + "loss": 0.4792, + "step": 11455 + }, + { + "epoch": 1.46, + "grad_norm": 0.7589410901965923, + "learning_rate": 1.7957746545881121e-06, + "loss": 0.5447, + "step": 11456 + }, + { + "epoch": 1.46, + "grad_norm": 1.0046614548571045, + "learning_rate": 1.7949828047546668e-06, + "loss": 0.5296, + "step": 11457 + }, + { + "epoch": 1.46, + "grad_norm": 0.7933823037260812, + "learning_rate": 1.7941910913485633e-06, + "loss": 0.5279, + "step": 11458 + }, + { + "epoch": 1.46, + "grad_norm": 0.7876437667701834, + "learning_rate": 1.7933995144034994e-06, + "loss": 0.4971, + "step": 11459 + }, + { + "epoch": 1.46, + "grad_norm": 0.5719763621423096, + "learning_rate": 1.7926080739531727e-06, + "loss": 0.4383, + "step": 11460 + }, + { + "epoch": 1.46, + "grad_norm": 0.6202015258602679, + "learning_rate": 1.7918167700312695e-06, + "loss": 0.4718, + "step": 11461 + }, + { + "epoch": 1.46, + "grad_norm": 0.5853265472101226, + "learning_rate": 1.7910256026714756e-06, + "loss": 0.5024, + "step": 11462 + }, + { + "epoch": 1.46, + "grad_norm": 0.797943078973722, + "learning_rate": 1.790234571907466e-06, + "loss": 0.5613, + "step": 11463 + }, + { + "epoch": 1.46, + "grad_norm": 0.831896870677325, + "learning_rate": 1.789443677772914e-06, + "loss": 0.5199, + "step": 11464 + }, + { + "epoch": 1.46, + "grad_norm": 0.607534823891539, + "learning_rate": 1.7886529203014864e-06, + "loss": 0.4062, + "step": 11465 + }, + { + "epoch": 1.46, + "grad_norm": 0.6825610866584609, + "learning_rate": 1.7878622995268407e-06, + "loss": 0.5457, + "step": 11466 + }, + { + "epoch": 1.46, + "grad_norm": 0.7460391223936172, + "learning_rate": 1.7870718154826338e-06, + "loss": 0.54, + "step": 11467 + }, + { + "epoch": 1.46, + "grad_norm": 0.8683466754217286, + "learning_rate": 1.7862814682025115e-06, + "loss": 0.4967, + "step": 11468 + }, + { + "epoch": 1.46, + "grad_norm": 0.8502756211600191, + "learning_rate": 1.7854912577201167e-06, + "loss": 0.5297, + "step": 11469 + }, + { + "epoch": 1.46, + "grad_norm": 0.7780664606821395, + "learning_rate": 1.7847011840690892e-06, + "loss": 0.5598, + "step": 11470 + }, + { + "epoch": 1.46, + "grad_norm": 0.7337613261122693, + "learning_rate": 1.7839112472830562e-06, + "loss": 0.4994, + "step": 11471 + }, + { + "epoch": 1.46, + "grad_norm": 0.8168542737263976, + "learning_rate": 1.7831214473956454e-06, + "loss": 0.5212, + "step": 11472 + }, + { + "epoch": 1.46, + "grad_norm": 0.8352738925400326, + "learning_rate": 1.782331784440473e-06, + "loss": 0.5519, + "step": 11473 + }, + { + "epoch": 1.46, + "grad_norm": 0.8352580204131053, + "learning_rate": 1.7815422584511566e-06, + "loss": 0.5121, + "step": 11474 + }, + { + "epoch": 1.46, + "grad_norm": 0.7001900334173825, + "learning_rate": 1.7807528694612996e-06, + "loss": 0.4768, + "step": 11475 + }, + { + "epoch": 1.46, + "grad_norm": 0.8445488655757379, + "learning_rate": 1.779963617504506e-06, + "loss": 0.5117, + "step": 11476 + }, + { + "epoch": 1.46, + "grad_norm": 0.7406429898031691, + "learning_rate": 1.779174502614373e-06, + "loss": 0.498, + "step": 11477 + }, + { + "epoch": 1.46, + "grad_norm": 0.7893810111282417, + "learning_rate": 1.7783855248244875e-06, + "loss": 0.5518, + "step": 11478 + }, + { + "epoch": 1.46, + "grad_norm": 0.7855546277686893, + "learning_rate": 1.7775966841684367e-06, + "loss": 0.4456, + "step": 11479 + }, + { + "epoch": 1.46, + "grad_norm": 0.6855011200289312, + "learning_rate": 1.7768079806797967e-06, + "loss": 0.4914, + "step": 11480 + }, + { + "epoch": 1.46, + "grad_norm": 0.6938643355402148, + "learning_rate": 1.7760194143921423e-06, + "loss": 0.4794, + "step": 11481 + }, + { + "epoch": 1.46, + "grad_norm": 0.6574024281054431, + "learning_rate": 1.7752309853390377e-06, + "loss": 0.428, + "step": 11482 + }, + { + "epoch": 1.46, + "grad_norm": 0.5673485561577188, + "learning_rate": 1.7744426935540464e-06, + "loss": 0.4476, + "step": 11483 + }, + { + "epoch": 1.46, + "grad_norm": 0.6271509276827958, + "learning_rate": 1.7736545390707205e-06, + "loss": 0.5151, + "step": 11484 + }, + { + "epoch": 1.46, + "grad_norm": 0.6918206223414506, + "learning_rate": 1.7728665219226133e-06, + "loss": 0.5171, + "step": 11485 + }, + { + "epoch": 1.46, + "grad_norm": 0.6263608028548525, + "learning_rate": 1.7720786421432635e-06, + "loss": 0.5046, + "step": 11486 + }, + { + "epoch": 1.46, + "grad_norm": 0.6465377676486046, + "learning_rate": 1.771290899766213e-06, + "loss": 0.4291, + "step": 11487 + }, + { + "epoch": 1.46, + "grad_norm": 0.6907595096503566, + "learning_rate": 1.7705032948249895e-06, + "loss": 0.4566, + "step": 11488 + }, + { + "epoch": 1.46, + "grad_norm": 0.7075940045692216, + "learning_rate": 1.7697158273531207e-06, + "loss": 0.5063, + "step": 11489 + }, + { + "epoch": 1.46, + "grad_norm": 0.6813332020786361, + "learning_rate": 1.768928497384127e-06, + "loss": 0.5052, + "step": 11490 + }, + { + "epoch": 1.46, + "grad_norm": 0.7446828431298997, + "learning_rate": 1.7681413049515233e-06, + "loss": 0.4687, + "step": 11491 + }, + { + "epoch": 1.46, + "grad_norm": 0.5743777001582568, + "learning_rate": 1.7673542500888157e-06, + "loss": 0.4542, + "step": 11492 + }, + { + "epoch": 1.46, + "grad_norm": 0.797217248313285, + "learning_rate": 1.7665673328295085e-06, + "loss": 0.5179, + "step": 11493 + }, + { + "epoch": 1.46, + "grad_norm": 0.760027149228795, + "learning_rate": 1.7657805532070966e-06, + "loss": 0.485, + "step": 11494 + }, + { + "epoch": 1.46, + "grad_norm": 0.7939131495353275, + "learning_rate": 1.7649939112550724e-06, + "loss": 0.5341, + "step": 11495 + }, + { + "epoch": 1.46, + "grad_norm": 0.7116622065175608, + "learning_rate": 1.7642074070069182e-06, + "loss": 0.4707, + "step": 11496 + }, + { + "epoch": 1.46, + "grad_norm": 0.5997526212237774, + "learning_rate": 1.7634210404961165e-06, + "loss": 0.4579, + "step": 11497 + }, + { + "epoch": 1.46, + "grad_norm": 0.6592923954663346, + "learning_rate": 1.762634811756137e-06, + "loss": 0.4747, + "step": 11498 + }, + { + "epoch": 1.46, + "grad_norm": 0.5720989469457686, + "learning_rate": 1.7618487208204498e-06, + "loss": 0.4382, + "step": 11499 + }, + { + "epoch": 1.47, + "grad_norm": 0.5637432305131495, + "learning_rate": 1.7610627677225129e-06, + "loss": 0.4027, + "step": 11500 + }, + { + "epoch": 1.47, + "grad_norm": 0.6003834310857484, + "learning_rate": 1.7602769524957842e-06, + "loss": 0.4462, + "step": 11501 + }, + { + "epoch": 1.47, + "grad_norm": 0.678516916390727, + "learning_rate": 1.7594912751737132e-06, + "loss": 0.4399, + "step": 11502 + }, + { + "epoch": 1.47, + "grad_norm": 0.609378781142807, + "learning_rate": 1.7587057357897447e-06, + "loss": 0.44, + "step": 11503 + }, + { + "epoch": 1.47, + "grad_norm": 0.7350987040078207, + "learning_rate": 1.7579203343773138e-06, + "loss": 0.4426, + "step": 11504 + }, + { + "epoch": 1.47, + "grad_norm": 0.690981948533854, + "learning_rate": 1.757135070969856e-06, + "loss": 0.494, + "step": 11505 + }, + { + "epoch": 1.47, + "grad_norm": 0.6602228679499301, + "learning_rate": 1.756349945600793e-06, + "loss": 0.464, + "step": 11506 + }, + { + "epoch": 1.47, + "grad_norm": 0.6912428253486794, + "learning_rate": 1.75556495830355e-06, + "loss": 0.4572, + "step": 11507 + }, + { + "epoch": 1.47, + "grad_norm": 0.6757033083216948, + "learning_rate": 1.7547801091115374e-06, + "loss": 0.4481, + "step": 11508 + }, + { + "epoch": 1.47, + "grad_norm": 0.6004194983371172, + "learning_rate": 1.7539953980581665e-06, + "loss": 0.411, + "step": 11509 + }, + { + "epoch": 1.47, + "grad_norm": 0.6629981984011224, + "learning_rate": 1.753210825176837e-06, + "loss": 0.4508, + "step": 11510 + }, + { + "epoch": 1.47, + "grad_norm": 0.5526800456809966, + "learning_rate": 1.7524263905009499e-06, + "loss": 0.4468, + "step": 11511 + }, + { + "epoch": 1.47, + "grad_norm": 0.5990447698490425, + "learning_rate": 1.7516420940638912e-06, + "loss": 0.4835, + "step": 11512 + }, + { + "epoch": 1.47, + "grad_norm": 0.6169888985794139, + "learning_rate": 1.7508579358990501e-06, + "loss": 0.4477, + "step": 11513 + }, + { + "epoch": 1.47, + "grad_norm": 0.5892362134564003, + "learning_rate": 1.7500739160398006e-06, + "loss": 0.4263, + "step": 11514 + }, + { + "epoch": 1.47, + "grad_norm": 0.5534155623237008, + "learning_rate": 1.749290034519523e-06, + "loss": 0.4296, + "step": 11515 + }, + { + "epoch": 1.47, + "grad_norm": 0.701035013101386, + "learning_rate": 1.7485062913715784e-06, + "loss": 0.5216, + "step": 11516 + }, + { + "epoch": 1.47, + "grad_norm": 0.7680250293511391, + "learning_rate": 1.747722686629333e-06, + "loss": 0.5052, + "step": 11517 + }, + { + "epoch": 1.47, + "grad_norm": 0.7417850421576466, + "learning_rate": 1.746939220326138e-06, + "loss": 0.555, + "step": 11518 + }, + { + "epoch": 1.47, + "grad_norm": 0.7258841808079408, + "learning_rate": 1.746155892495347e-06, + "loss": 0.5377, + "step": 11519 + }, + { + "epoch": 1.47, + "grad_norm": 0.7553149431842945, + "learning_rate": 1.7453727031703e-06, + "loss": 0.5544, + "step": 11520 + }, + { + "epoch": 1.47, + "grad_norm": 0.8202925738083765, + "learning_rate": 1.7445896523843386e-06, + "loss": 0.5436, + "step": 11521 + }, + { + "epoch": 1.47, + "grad_norm": 0.7932840073403957, + "learning_rate": 1.7438067401707909e-06, + "loss": 0.5222, + "step": 11522 + }, + { + "epoch": 1.47, + "grad_norm": 0.6139123674514075, + "learning_rate": 1.7430239665629866e-06, + "loss": 0.5099, + "step": 11523 + }, + { + "epoch": 1.47, + "grad_norm": 1.3698117525881095, + "learning_rate": 1.7422413315942433e-06, + "loss": 0.5158, + "step": 11524 + }, + { + "epoch": 1.47, + "grad_norm": 0.604257915599999, + "learning_rate": 1.7414588352978774e-06, + "loss": 0.4342, + "step": 11525 + }, + { + "epoch": 1.47, + "grad_norm": 0.7645491208958822, + "learning_rate": 1.7406764777071938e-06, + "loss": 0.4932, + "step": 11526 + }, + { + "epoch": 1.47, + "grad_norm": 0.6857563844302612, + "learning_rate": 1.739894258855498e-06, + "loss": 0.5499, + "step": 11527 + }, + { + "epoch": 1.47, + "grad_norm": 0.5865634869951317, + "learning_rate": 1.7391121787760873e-06, + "loss": 0.4653, + "step": 11528 + }, + { + "epoch": 1.47, + "grad_norm": 0.6614811942942823, + "learning_rate": 1.7383302375022493e-06, + "loss": 0.5516, + "step": 11529 + }, + { + "epoch": 1.47, + "grad_norm": 0.7539716103220729, + "learning_rate": 1.73754843506727e-06, + "loss": 0.4831, + "step": 11530 + }, + { + "epoch": 1.47, + "grad_norm": 0.5865375857353755, + "learning_rate": 1.73676677150443e-06, + "loss": 0.4301, + "step": 11531 + }, + { + "epoch": 1.47, + "grad_norm": 0.6760179459749036, + "learning_rate": 1.7359852468469995e-06, + "loss": 0.4368, + "step": 11532 + }, + { + "epoch": 1.47, + "grad_norm": 0.7220857390748701, + "learning_rate": 1.7352038611282485e-06, + "loss": 0.5184, + "step": 11533 + }, + { + "epoch": 1.47, + "grad_norm": 0.7569950375840301, + "learning_rate": 1.7344226143814341e-06, + "loss": 0.506, + "step": 11534 + }, + { + "epoch": 1.47, + "grad_norm": 0.5477283439534922, + "learning_rate": 1.733641506639816e-06, + "loss": 0.4, + "step": 11535 + }, + { + "epoch": 1.47, + "grad_norm": 0.640102976398902, + "learning_rate": 1.7328605379366391e-06, + "loss": 0.4001, + "step": 11536 + }, + { + "epoch": 1.47, + "grad_norm": 0.5938063790849741, + "learning_rate": 1.7320797083051506e-06, + "loss": 0.4166, + "step": 11537 + }, + { + "epoch": 1.47, + "grad_norm": 0.6173801546358957, + "learning_rate": 1.7312990177785848e-06, + "loss": 0.4189, + "step": 11538 + }, + { + "epoch": 1.47, + "grad_norm": 0.5318305032781782, + "learning_rate": 1.7305184663901741e-06, + "loss": 0.4228, + "step": 11539 + }, + { + "epoch": 1.47, + "grad_norm": 0.7169237283214026, + "learning_rate": 1.7297380541731461e-06, + "loss": 0.4193, + "step": 11540 + }, + { + "epoch": 1.47, + "grad_norm": 0.5883173176085361, + "learning_rate": 1.7289577811607179e-06, + "loss": 0.4777, + "step": 11541 + }, + { + "epoch": 1.47, + "grad_norm": 0.6366062492789052, + "learning_rate": 1.728177647386105e-06, + "loss": 0.4791, + "step": 11542 + }, + { + "epoch": 1.47, + "grad_norm": 0.6983595428373511, + "learning_rate": 1.7273976528825132e-06, + "loss": 0.5244, + "step": 11543 + }, + { + "epoch": 1.47, + "grad_norm": 0.7089669432113576, + "learning_rate": 1.726617797683147e-06, + "loss": 0.5168, + "step": 11544 + }, + { + "epoch": 1.47, + "grad_norm": 0.7120056115174487, + "learning_rate": 1.7258380818211995e-06, + "loss": 0.5295, + "step": 11545 + }, + { + "epoch": 1.47, + "grad_norm": 0.7434373498383949, + "learning_rate": 1.725058505329864e-06, + "loss": 0.483, + "step": 11546 + }, + { + "epoch": 1.47, + "grad_norm": 0.6767554191941588, + "learning_rate": 1.7242790682423205e-06, + "loss": 0.5294, + "step": 11547 + }, + { + "epoch": 1.47, + "grad_norm": 0.7655589063541344, + "learning_rate": 1.723499770591751e-06, + "loss": 0.5271, + "step": 11548 + }, + { + "epoch": 1.47, + "grad_norm": 0.6070402806373606, + "learning_rate": 1.722720612411325e-06, + "loss": 0.5066, + "step": 11549 + }, + { + "epoch": 1.47, + "grad_norm": 0.706250517347182, + "learning_rate": 1.7219415937342115e-06, + "loss": 0.5281, + "step": 11550 + }, + { + "epoch": 1.47, + "grad_norm": 0.6880917583980143, + "learning_rate": 1.7211627145935655e-06, + "loss": 0.4951, + "step": 11551 + }, + { + "epoch": 1.47, + "grad_norm": 0.6322374933541931, + "learning_rate": 1.720383975022548e-06, + "loss": 0.4784, + "step": 11552 + }, + { + "epoch": 1.47, + "grad_norm": 0.673863985497878, + "learning_rate": 1.7196053750543034e-06, + "loss": 0.4706, + "step": 11553 + }, + { + "epoch": 1.47, + "grad_norm": 0.7028437483328559, + "learning_rate": 1.7188269147219772e-06, + "loss": 0.5012, + "step": 11554 + }, + { + "epoch": 1.47, + "grad_norm": 0.6344949671979806, + "learning_rate": 1.7180485940587021e-06, + "loss": 0.4543, + "step": 11555 + }, + { + "epoch": 1.47, + "grad_norm": 0.6669204747579709, + "learning_rate": 1.7172704130976125e-06, + "loss": 0.4921, + "step": 11556 + }, + { + "epoch": 1.47, + "grad_norm": 0.7643036261885616, + "learning_rate": 1.7164923718718296e-06, + "loss": 0.5387, + "step": 11557 + }, + { + "epoch": 1.47, + "grad_norm": 0.6034249820064262, + "learning_rate": 1.7157144704144758e-06, + "loss": 0.4326, + "step": 11558 + }, + { + "epoch": 1.47, + "grad_norm": 0.835448185065465, + "learning_rate": 1.71493670875866e-06, + "loss": 0.4862, + "step": 11559 + }, + { + "epoch": 1.47, + "grad_norm": 0.6776718149410594, + "learning_rate": 1.7141590869374925e-06, + "loss": 0.475, + "step": 11560 + }, + { + "epoch": 1.47, + "grad_norm": 0.6357531812001118, + "learning_rate": 1.7133816049840708e-06, + "loss": 0.4488, + "step": 11561 + }, + { + "epoch": 1.47, + "grad_norm": 0.6449287159431228, + "learning_rate": 1.7126042629314937e-06, + "loss": 0.4583, + "step": 11562 + }, + { + "epoch": 1.47, + "grad_norm": 0.650441838256173, + "learning_rate": 1.7118270608128446e-06, + "loss": 0.4691, + "step": 11563 + }, + { + "epoch": 1.47, + "grad_norm": 0.5524188236511227, + "learning_rate": 1.7110499986612133e-06, + "loss": 0.4289, + "step": 11564 + }, + { + "epoch": 1.47, + "grad_norm": 0.600231352496326, + "learning_rate": 1.710273076509672e-06, + "loss": 0.4067, + "step": 11565 + }, + { + "epoch": 1.47, + "grad_norm": 0.6465559373369545, + "learning_rate": 1.7094962943912951e-06, + "loss": 0.4678, + "step": 11566 + }, + { + "epoch": 1.47, + "grad_norm": 0.5671943370014317, + "learning_rate": 1.7087196523391441e-06, + "loss": 0.4676, + "step": 11567 + }, + { + "epoch": 1.47, + "grad_norm": 0.6431900325287857, + "learning_rate": 1.707943150386282e-06, + "loss": 0.458, + "step": 11568 + }, + { + "epoch": 1.47, + "grad_norm": 0.8405237219699712, + "learning_rate": 1.7071667885657585e-06, + "loss": 0.514, + "step": 11569 + }, + { + "epoch": 1.47, + "grad_norm": 0.6587106144242421, + "learning_rate": 1.706390566910624e-06, + "loss": 0.4035, + "step": 11570 + }, + { + "epoch": 1.47, + "grad_norm": 0.6780986998984287, + "learning_rate": 1.7056144854539164e-06, + "loss": 0.4827, + "step": 11571 + }, + { + "epoch": 1.47, + "grad_norm": 0.5870662835236904, + "learning_rate": 1.7048385442286741e-06, + "loss": 0.4463, + "step": 11572 + }, + { + "epoch": 1.47, + "grad_norm": 0.645092100606818, + "learning_rate": 1.7040627432679241e-06, + "loss": 0.4913, + "step": 11573 + }, + { + "epoch": 1.47, + "grad_norm": 0.7125845990609546, + "learning_rate": 1.7032870826046922e-06, + "loss": 0.5073, + "step": 11574 + }, + { + "epoch": 1.47, + "grad_norm": 0.853295875398317, + "learning_rate": 1.7025115622719929e-06, + "loss": 0.4943, + "step": 11575 + }, + { + "epoch": 1.47, + "grad_norm": 0.7416814189030903, + "learning_rate": 1.701736182302839e-06, + "loss": 0.6, + "step": 11576 + }, + { + "epoch": 1.47, + "grad_norm": 0.7131880675678445, + "learning_rate": 1.7009609427302359e-06, + "loss": 0.5232, + "step": 11577 + }, + { + "epoch": 1.47, + "grad_norm": 0.8572876704747294, + "learning_rate": 1.7001858435871855e-06, + "loss": 0.5248, + "step": 11578 + }, + { + "epoch": 1.48, + "grad_norm": 0.7649702228976412, + "learning_rate": 1.699410884906677e-06, + "loss": 0.4519, + "step": 11579 + }, + { + "epoch": 1.48, + "grad_norm": 0.5346343170959683, + "learning_rate": 1.6986360667217016e-06, + "loss": 0.4721, + "step": 11580 + }, + { + "epoch": 1.48, + "grad_norm": 0.713307221952802, + "learning_rate": 1.6978613890652373e-06, + "loss": 0.4939, + "step": 11581 + }, + { + "epoch": 1.48, + "grad_norm": 0.7546086763042094, + "learning_rate": 1.6970868519702638e-06, + "loss": 0.5017, + "step": 11582 + }, + { + "epoch": 1.48, + "grad_norm": 0.6932047398458963, + "learning_rate": 1.6963124554697464e-06, + "loss": 0.4684, + "step": 11583 + }, + { + "epoch": 1.48, + "grad_norm": 0.6003799855910703, + "learning_rate": 1.6955381995966524e-06, + "loss": 0.4724, + "step": 11584 + }, + { + "epoch": 1.48, + "grad_norm": 0.6089361341662528, + "learning_rate": 1.694764084383936e-06, + "loss": 0.425, + "step": 11585 + }, + { + "epoch": 1.48, + "grad_norm": 0.6919833018782288, + "learning_rate": 1.6939901098645517e-06, + "loss": 0.4615, + "step": 11586 + }, + { + "epoch": 1.48, + "grad_norm": 0.6286409655726857, + "learning_rate": 1.6932162760714433e-06, + "loss": 0.5113, + "step": 11587 + }, + { + "epoch": 1.48, + "grad_norm": 0.7297549599010655, + "learning_rate": 1.6924425830375501e-06, + "loss": 0.5702, + "step": 11588 + }, + { + "epoch": 1.48, + "grad_norm": 0.7785206896209446, + "learning_rate": 1.691669030795809e-06, + "loss": 0.5451, + "step": 11589 + }, + { + "epoch": 1.48, + "grad_norm": 0.6674820695033553, + "learning_rate": 1.6908956193791432e-06, + "loss": 0.4699, + "step": 11590 + }, + { + "epoch": 1.48, + "grad_norm": 0.7202785789230283, + "learning_rate": 1.6901223488204781e-06, + "loss": 0.5083, + "step": 11591 + }, + { + "epoch": 1.48, + "grad_norm": 0.6012149512688153, + "learning_rate": 1.6893492191527266e-06, + "loss": 0.4301, + "step": 11592 + }, + { + "epoch": 1.48, + "grad_norm": 0.6552170568313688, + "learning_rate": 1.6885762304087995e-06, + "loss": 0.5229, + "step": 11593 + }, + { + "epoch": 1.48, + "grad_norm": 0.7565381505453234, + "learning_rate": 1.6878033826216016e-06, + "loss": 0.4909, + "step": 11594 + }, + { + "epoch": 1.48, + "grad_norm": 1.6545339847546898, + "learning_rate": 1.6870306758240285e-06, + "loss": 0.4842, + "step": 11595 + }, + { + "epoch": 1.48, + "grad_norm": 0.7039311974330968, + "learning_rate": 1.686258110048974e-06, + "loss": 0.5023, + "step": 11596 + }, + { + "epoch": 1.48, + "grad_norm": 0.6477100130079324, + "learning_rate": 1.6854856853293212e-06, + "loss": 0.4484, + "step": 11597 + }, + { + "epoch": 1.48, + "grad_norm": 0.763500242358217, + "learning_rate": 1.6847134016979521e-06, + "loss": 0.4686, + "step": 11598 + }, + { + "epoch": 1.48, + "grad_norm": 1.3907942308871668, + "learning_rate": 1.683941259187738e-06, + "loss": 0.516, + "step": 11599 + }, + { + "epoch": 1.48, + "grad_norm": 0.6250893047220832, + "learning_rate": 1.6831692578315483e-06, + "loss": 0.4873, + "step": 11600 + }, + { + "epoch": 1.48, + "grad_norm": 0.6227874024729992, + "learning_rate": 1.6823973976622455e-06, + "loss": 0.533, + "step": 11601 + }, + { + "epoch": 1.48, + "grad_norm": 0.7012333196607653, + "learning_rate": 1.6816256787126822e-06, + "loss": 0.5406, + "step": 11602 + }, + { + "epoch": 1.48, + "grad_norm": 0.7024384664349265, + "learning_rate": 1.6808541010157115e-06, + "loss": 0.4305, + "step": 11603 + }, + { + "epoch": 1.48, + "grad_norm": 0.6349009486137714, + "learning_rate": 1.680082664604174e-06, + "loss": 0.4381, + "step": 11604 + }, + { + "epoch": 1.48, + "grad_norm": 0.6092294486809704, + "learning_rate": 1.6793113695109097e-06, + "loss": 0.4124, + "step": 11605 + }, + { + "epoch": 1.48, + "grad_norm": 0.6150864069303005, + "learning_rate": 1.6785402157687474e-06, + "loss": 0.4274, + "step": 11606 + }, + { + "epoch": 1.48, + "grad_norm": 0.6286914796107257, + "learning_rate": 1.6777692034105158e-06, + "loss": 0.4893, + "step": 11607 + }, + { + "epoch": 1.48, + "grad_norm": 0.7341163536527543, + "learning_rate": 1.6769983324690315e-06, + "loss": 0.4637, + "step": 11608 + }, + { + "epoch": 1.48, + "grad_norm": 0.6791581357361317, + "learning_rate": 1.6762276029771113e-06, + "loss": 0.5236, + "step": 11609 + }, + { + "epoch": 1.48, + "grad_norm": 0.9183999407586751, + "learning_rate": 1.6754570149675586e-06, + "loss": 0.5465, + "step": 11610 + }, + { + "epoch": 1.48, + "grad_norm": 0.8161592850414057, + "learning_rate": 1.6746865684731789e-06, + "loss": 0.5096, + "step": 11611 + }, + { + "epoch": 1.48, + "grad_norm": 0.6323710622298931, + "learning_rate": 1.6739162635267643e-06, + "loss": 0.4427, + "step": 11612 + }, + { + "epoch": 1.48, + "grad_norm": 0.6072191275980428, + "learning_rate": 1.6731461001611055e-06, + "loss": 0.4738, + "step": 11613 + }, + { + "epoch": 1.48, + "grad_norm": 0.5889060218495732, + "learning_rate": 1.672376078408986e-06, + "loss": 0.4159, + "step": 11614 + }, + { + "epoch": 1.48, + "grad_norm": 0.6562959582773006, + "learning_rate": 1.6716061983031851e-06, + "loss": 0.4423, + "step": 11615 + }, + { + "epoch": 1.48, + "grad_norm": 0.6160393703774925, + "learning_rate": 1.6708364598764703e-06, + "loss": 0.5242, + "step": 11616 + }, + { + "epoch": 1.48, + "grad_norm": 0.6883351946369045, + "learning_rate": 1.6700668631616112e-06, + "loss": 0.4514, + "step": 11617 + }, + { + "epoch": 1.48, + "grad_norm": 0.6254235523005447, + "learning_rate": 1.6692974081913627e-06, + "loss": 0.4591, + "step": 11618 + }, + { + "epoch": 1.48, + "grad_norm": 0.7676230268042148, + "learning_rate": 1.6685280949984823e-06, + "loss": 0.451, + "step": 11619 + }, + { + "epoch": 1.48, + "grad_norm": 0.5594107990452631, + "learning_rate": 1.6677589236157126e-06, + "loss": 0.4797, + "step": 11620 + }, + { + "epoch": 1.48, + "grad_norm": 0.6960349098074288, + "learning_rate": 1.6669898940757996e-06, + "loss": 0.5289, + "step": 11621 + }, + { + "epoch": 1.48, + "grad_norm": 0.5472261529464447, + "learning_rate": 1.6662210064114742e-06, + "loss": 0.4429, + "step": 11622 + }, + { + "epoch": 1.48, + "grad_norm": 0.5724791718624103, + "learning_rate": 1.6654522606554684e-06, + "loss": 0.4138, + "step": 11623 + }, + { + "epoch": 1.48, + "grad_norm": 0.7021882781477281, + "learning_rate": 1.6646836568405033e-06, + "loss": 0.4889, + "step": 11624 + }, + { + "epoch": 1.48, + "grad_norm": 0.6197098990816787, + "learning_rate": 1.6639151949992966e-06, + "loss": 0.4532, + "step": 11625 + }, + { + "epoch": 1.48, + "grad_norm": 0.688513090820751, + "learning_rate": 1.6631468751645591e-06, + "loss": 0.5016, + "step": 11626 + }, + { + "epoch": 1.48, + "grad_norm": 0.6051277156718824, + "learning_rate": 1.6623786973689981e-06, + "loss": 0.4522, + "step": 11627 + }, + { + "epoch": 1.48, + "grad_norm": 0.5521555833063058, + "learning_rate": 1.6616106616453082e-06, + "loss": 0.4509, + "step": 11628 + }, + { + "epoch": 1.48, + "grad_norm": 0.7601612657721504, + "learning_rate": 1.6608427680261868e-06, + "loss": 0.5949, + "step": 11629 + }, + { + "epoch": 1.48, + "grad_norm": 0.827222430387003, + "learning_rate": 1.6600750165443163e-06, + "loss": 0.5038, + "step": 11630 + }, + { + "epoch": 1.48, + "grad_norm": 0.5697281018012231, + "learning_rate": 1.6593074072323812e-06, + "loss": 0.4075, + "step": 11631 + }, + { + "epoch": 1.48, + "grad_norm": 0.5691538339339532, + "learning_rate": 1.6585399401230528e-06, + "loss": 0.4276, + "step": 11632 + }, + { + "epoch": 1.48, + "grad_norm": 0.6779596515561276, + "learning_rate": 1.657772615249003e-06, + "loss": 0.4723, + "step": 11633 + }, + { + "epoch": 1.48, + "grad_norm": 0.6491495786981641, + "learning_rate": 1.6570054326428914e-06, + "loss": 0.5221, + "step": 11634 + }, + { + "epoch": 1.48, + "grad_norm": 0.8610529673071755, + "learning_rate": 1.656238392337377e-06, + "loss": 0.5459, + "step": 11635 + }, + { + "epoch": 1.48, + "grad_norm": 0.7328895451009377, + "learning_rate": 1.6554714943651078e-06, + "loss": 0.4883, + "step": 11636 + }, + { + "epoch": 1.48, + "grad_norm": 0.6491557838985147, + "learning_rate": 1.654704738758731e-06, + "loss": 0.4438, + "step": 11637 + }, + { + "epoch": 1.48, + "grad_norm": 0.660849923400595, + "learning_rate": 1.6539381255508802e-06, + "loss": 0.4893, + "step": 11638 + }, + { + "epoch": 1.48, + "grad_norm": 0.7055581571554427, + "learning_rate": 1.6531716547741945e-06, + "loss": 0.5549, + "step": 11639 + }, + { + "epoch": 1.48, + "grad_norm": 2.4094232215773728, + "learning_rate": 1.6524053264612945e-06, + "loss": 0.574, + "step": 11640 + }, + { + "epoch": 1.48, + "grad_norm": 0.6809876669518666, + "learning_rate": 1.651639140644804e-06, + "loss": 0.5384, + "step": 11641 + }, + { + "epoch": 1.48, + "grad_norm": 0.9028688813525833, + "learning_rate": 1.6508730973573338e-06, + "loss": 0.4999, + "step": 11642 + }, + { + "epoch": 1.48, + "grad_norm": 0.7819055999206425, + "learning_rate": 1.6501071966314952e-06, + "loss": 0.4797, + "step": 11643 + }, + { + "epoch": 1.48, + "grad_norm": 0.7903758568525185, + "learning_rate": 1.6493414384998873e-06, + "loss": 0.4718, + "step": 11644 + }, + { + "epoch": 1.48, + "grad_norm": 1.1374231005083308, + "learning_rate": 1.6485758229951082e-06, + "loss": 0.5196, + "step": 11645 + }, + { + "epoch": 1.48, + "grad_norm": 0.8436919990140662, + "learning_rate": 1.6478103501497455e-06, + "loss": 0.5309, + "step": 11646 + }, + { + "epoch": 1.48, + "grad_norm": 0.762438017786036, + "learning_rate": 1.6470450199963856e-06, + "loss": 0.4766, + "step": 11647 + }, + { + "epoch": 1.48, + "grad_norm": 0.780788225908962, + "learning_rate": 1.6462798325676033e-06, + "loss": 0.5233, + "step": 11648 + }, + { + "epoch": 1.48, + "grad_norm": 0.6717274026315277, + "learning_rate": 1.6455147878959727e-06, + "loss": 0.5177, + "step": 11649 + }, + { + "epoch": 1.48, + "grad_norm": 0.8202416257120945, + "learning_rate": 1.644749886014057e-06, + "loss": 0.5928, + "step": 11650 + }, + { + "epoch": 1.48, + "grad_norm": 0.671860262074767, + "learning_rate": 1.6439851269544165e-06, + "loss": 0.4137, + "step": 11651 + }, + { + "epoch": 1.48, + "grad_norm": 0.62919415610537, + "learning_rate": 1.6432205107496069e-06, + "loss": 0.4568, + "step": 11652 + }, + { + "epoch": 1.48, + "grad_norm": 0.6233772364002257, + "learning_rate": 1.6424560374321713e-06, + "loss": 0.4446, + "step": 11653 + }, + { + "epoch": 1.48, + "grad_norm": 0.6564702793842595, + "learning_rate": 1.6416917070346534e-06, + "loss": 0.5211, + "step": 11654 + }, + { + "epoch": 1.48, + "grad_norm": 0.756661432533553, + "learning_rate": 1.6409275195895892e-06, + "loss": 0.5062, + "step": 11655 + }, + { + "epoch": 1.48, + "grad_norm": 0.6899665087428536, + "learning_rate": 1.6401634751295047e-06, + "loss": 0.5137, + "step": 11656 + }, + { + "epoch": 1.49, + "grad_norm": 0.7044619148972615, + "learning_rate": 1.6393995736869267e-06, + "loss": 0.477, + "step": 11657 + }, + { + "epoch": 1.49, + "grad_norm": 0.6246292546745654, + "learning_rate": 1.6386358152943682e-06, + "loss": 0.4564, + "step": 11658 + }, + { + "epoch": 1.49, + "grad_norm": 0.5545051837420987, + "learning_rate": 1.6378721999843428e-06, + "loss": 0.4251, + "step": 11659 + }, + { + "epoch": 1.49, + "grad_norm": 0.5675673493529451, + "learning_rate": 1.6371087277893527e-06, + "loss": 0.4294, + "step": 11660 + }, + { + "epoch": 1.49, + "grad_norm": 0.6752120206646379, + "learning_rate": 1.6363453987418997e-06, + "loss": 0.487, + "step": 11661 + }, + { + "epoch": 1.49, + "grad_norm": 0.7313314388976673, + "learning_rate": 1.6355822128744725e-06, + "loss": 0.546, + "step": 11662 + }, + { + "epoch": 1.49, + "grad_norm": 0.7567602038825845, + "learning_rate": 1.6348191702195593e-06, + "loss": 0.5424, + "step": 11663 + }, + { + "epoch": 1.49, + "grad_norm": 0.7669195468100269, + "learning_rate": 1.6340562708096419e-06, + "loss": 0.5364, + "step": 11664 + }, + { + "epoch": 1.49, + "grad_norm": 0.7715835948915516, + "learning_rate": 1.6332935146771917e-06, + "loss": 0.5629, + "step": 11665 + }, + { + "epoch": 1.49, + "grad_norm": 0.6390565392200357, + "learning_rate": 1.6325309018546798e-06, + "loss": 0.458, + "step": 11666 + }, + { + "epoch": 1.49, + "grad_norm": 0.7582914358538736, + "learning_rate": 1.6317684323745648e-06, + "loss": 0.4995, + "step": 11667 + }, + { + "epoch": 1.49, + "grad_norm": 0.5913005676358329, + "learning_rate": 1.6310061062693061e-06, + "loss": 0.4314, + "step": 11668 + }, + { + "epoch": 1.49, + "grad_norm": 0.6120843872338068, + "learning_rate": 1.6302439235713496e-06, + "loss": 0.4709, + "step": 11669 + }, + { + "epoch": 1.49, + "grad_norm": 0.7248412225181978, + "learning_rate": 1.6294818843131432e-06, + "loss": 0.4963, + "step": 11670 + }, + { + "epoch": 1.49, + "grad_norm": 0.6743556696793921, + "learning_rate": 1.6287199885271205e-06, + "loss": 0.4737, + "step": 11671 + }, + { + "epoch": 1.49, + "grad_norm": 0.706868681575387, + "learning_rate": 1.6279582362457165e-06, + "loss": 0.5278, + "step": 11672 + }, + { + "epoch": 1.49, + "grad_norm": 0.618794612795796, + "learning_rate": 1.6271966275013529e-06, + "loss": 0.4242, + "step": 11673 + }, + { + "epoch": 1.49, + "grad_norm": 0.6514651043071733, + "learning_rate": 1.626435162326453e-06, + "loss": 0.4208, + "step": 11674 + }, + { + "epoch": 1.49, + "grad_norm": 0.6757377179089523, + "learning_rate": 1.6256738407534245e-06, + "loss": 0.5056, + "step": 11675 + }, + { + "epoch": 1.49, + "grad_norm": 1.6786881160375586, + "learning_rate": 1.624912662814681e-06, + "loss": 0.5572, + "step": 11676 + }, + { + "epoch": 1.49, + "grad_norm": 0.7211068813967096, + "learning_rate": 1.6241516285426183e-06, + "loss": 0.5078, + "step": 11677 + }, + { + "epoch": 1.49, + "grad_norm": 0.6589763639653017, + "learning_rate": 1.623390737969635e-06, + "loss": 0.4542, + "step": 11678 + }, + { + "epoch": 1.49, + "grad_norm": 0.5719843392074726, + "learning_rate": 1.6226299911281163e-06, + "loss": 0.4577, + "step": 11679 + }, + { + "epoch": 1.49, + "grad_norm": 0.7336040203804371, + "learning_rate": 1.6218693880504482e-06, + "loss": 0.5266, + "step": 11680 + }, + { + "epoch": 1.49, + "grad_norm": 0.6448540986105957, + "learning_rate": 1.6211089287690034e-06, + "loss": 0.4804, + "step": 11681 + }, + { + "epoch": 1.49, + "grad_norm": 0.7955898616096426, + "learning_rate": 1.6203486133161555e-06, + "loss": 0.5364, + "step": 11682 + }, + { + "epoch": 1.49, + "grad_norm": 0.7468945140372171, + "learning_rate": 1.6195884417242663e-06, + "loss": 0.4587, + "step": 11683 + }, + { + "epoch": 1.49, + "grad_norm": 0.989954219237031, + "learning_rate": 1.618828414025696e-06, + "loss": 0.5095, + "step": 11684 + }, + { + "epoch": 1.49, + "grad_norm": 0.7369868046092862, + "learning_rate": 1.6180685302527938e-06, + "loss": 0.5236, + "step": 11685 + }, + { + "epoch": 1.49, + "grad_norm": 0.7431057810762721, + "learning_rate": 1.6173087904379091e-06, + "loss": 0.5428, + "step": 11686 + }, + { + "epoch": 1.49, + "grad_norm": 0.678718643119146, + "learning_rate": 1.6165491946133766e-06, + "loss": 0.4309, + "step": 11687 + }, + { + "epoch": 1.49, + "grad_norm": 0.656017728927584, + "learning_rate": 1.6157897428115354e-06, + "loss": 0.4556, + "step": 11688 + }, + { + "epoch": 1.49, + "grad_norm": 0.6436879738085989, + "learning_rate": 1.6150304350647095e-06, + "loss": 0.4862, + "step": 11689 + }, + { + "epoch": 1.49, + "grad_norm": 0.7194150380350957, + "learning_rate": 1.6142712714052223e-06, + "loss": 0.4799, + "step": 11690 + }, + { + "epoch": 1.49, + "grad_norm": 1.0333355860260767, + "learning_rate": 1.6135122518653868e-06, + "loss": 0.4995, + "step": 11691 + }, + { + "epoch": 1.49, + "grad_norm": 0.8593372914682009, + "learning_rate": 1.6127533764775143e-06, + "loss": 0.5552, + "step": 11692 + }, + { + "epoch": 1.49, + "grad_norm": 0.7344956407632456, + "learning_rate": 1.611994645273905e-06, + "loss": 0.5142, + "step": 11693 + }, + { + "epoch": 1.49, + "grad_norm": 0.618078695798422, + "learning_rate": 1.6112360582868586e-06, + "loss": 0.4296, + "step": 11694 + }, + { + "epoch": 1.49, + "grad_norm": 0.616708423446725, + "learning_rate": 1.6104776155486623e-06, + "loss": 0.4659, + "step": 11695 + }, + { + "epoch": 1.49, + "grad_norm": 0.6221525712560697, + "learning_rate": 1.6097193170916048e-06, + "loss": 0.5269, + "step": 11696 + }, + { + "epoch": 1.49, + "grad_norm": 0.8076474980628993, + "learning_rate": 1.6089611629479602e-06, + "loss": 0.5579, + "step": 11697 + }, + { + "epoch": 1.49, + "grad_norm": 0.7357107454652001, + "learning_rate": 1.6082031531500047e-06, + "loss": 0.4817, + "step": 11698 + }, + { + "epoch": 1.49, + "grad_norm": 0.5882972688449865, + "learning_rate": 1.60744528773e-06, + "loss": 0.4384, + "step": 11699 + }, + { + "epoch": 1.49, + "grad_norm": 0.5990767493900242, + "learning_rate": 1.6066875667202093e-06, + "loss": 0.4412, + "step": 11700 + }, + { + "epoch": 1.49, + "grad_norm": 0.7346472202613129, + "learning_rate": 1.6059299901528847e-06, + "loss": 0.4271, + "step": 11701 + }, + { + "epoch": 1.49, + "grad_norm": 0.7010105433833499, + "learning_rate": 1.6051725580602762e-06, + "loss": 0.4774, + "step": 11702 + }, + { + "epoch": 1.49, + "grad_norm": 0.6691420285863616, + "learning_rate": 1.6044152704746224e-06, + "loss": 0.4907, + "step": 11703 + }, + { + "epoch": 1.49, + "grad_norm": 0.7181996896352522, + "learning_rate": 1.603658127428161e-06, + "loss": 0.523, + "step": 11704 + }, + { + "epoch": 1.49, + "grad_norm": 0.642430804792435, + "learning_rate": 1.6029011289531183e-06, + "loss": 0.4818, + "step": 11705 + }, + { + "epoch": 1.49, + "grad_norm": 0.7907351539196825, + "learning_rate": 1.6021442750817208e-06, + "loss": 0.5945, + "step": 11706 + }, + { + "epoch": 1.49, + "grad_norm": 0.7397152098901558, + "learning_rate": 1.601387565846182e-06, + "loss": 0.5181, + "step": 11707 + }, + { + "epoch": 1.49, + "grad_norm": 0.6363073864898816, + "learning_rate": 1.6006310012787156e-06, + "loss": 0.4721, + "step": 11708 + }, + { + "epoch": 1.49, + "grad_norm": 0.9345310392348628, + "learning_rate": 1.599874581411523e-06, + "loss": 0.5058, + "step": 11709 + }, + { + "epoch": 1.49, + "grad_norm": 0.7084655441965263, + "learning_rate": 1.5991183062768057e-06, + "loss": 0.4809, + "step": 11710 + }, + { + "epoch": 1.49, + "grad_norm": 0.7529786534217483, + "learning_rate": 1.598362175906753e-06, + "loss": 0.4894, + "step": 11711 + }, + { + "epoch": 1.49, + "grad_norm": 0.7560265271118195, + "learning_rate": 1.5976061903335526e-06, + "loss": 0.5485, + "step": 11712 + }, + { + "epoch": 1.49, + "grad_norm": 0.7445073342863389, + "learning_rate": 1.5968503495893855e-06, + "loss": 0.5085, + "step": 11713 + }, + { + "epoch": 1.49, + "grad_norm": 0.7716263320602863, + "learning_rate": 1.5960946537064226e-06, + "loss": 0.511, + "step": 11714 + }, + { + "epoch": 1.49, + "grad_norm": 0.5971113743480645, + "learning_rate": 1.5953391027168342e-06, + "loss": 0.4205, + "step": 11715 + }, + { + "epoch": 1.49, + "grad_norm": 0.6234449235150987, + "learning_rate": 1.5945836966527794e-06, + "loss": 0.5052, + "step": 11716 + }, + { + "epoch": 1.49, + "grad_norm": 0.7259830708394077, + "learning_rate": 1.593828435546414e-06, + "loss": 0.4817, + "step": 11717 + }, + { + "epoch": 1.49, + "grad_norm": 0.6201405993008865, + "learning_rate": 1.5930733194298897e-06, + "loss": 0.4132, + "step": 11718 + }, + { + "epoch": 1.49, + "grad_norm": 0.6548015460817552, + "learning_rate": 1.5923183483353455e-06, + "loss": 0.4445, + "step": 11719 + }, + { + "epoch": 1.49, + "grad_norm": 0.5760114762894553, + "learning_rate": 1.5915635222949212e-06, + "loss": 0.4694, + "step": 11720 + }, + { + "epoch": 1.49, + "grad_norm": 0.7039268430095816, + "learning_rate": 1.590808841340744e-06, + "loss": 0.5053, + "step": 11721 + }, + { + "epoch": 1.49, + "grad_norm": 0.5600014365888779, + "learning_rate": 1.5900543055049421e-06, + "loss": 0.4094, + "step": 11722 + }, + { + "epoch": 1.49, + "grad_norm": 0.6084876811002675, + "learning_rate": 1.5892999148196302e-06, + "loss": 0.4377, + "step": 11723 + }, + { + "epoch": 1.49, + "grad_norm": 0.6953852629916997, + "learning_rate": 1.5885456693169221e-06, + "loss": 0.4994, + "step": 11724 + }, + { + "epoch": 1.49, + "grad_norm": 0.6795727701064831, + "learning_rate": 1.5877915690289252e-06, + "loss": 0.477, + "step": 11725 + }, + { + "epoch": 1.49, + "grad_norm": 0.5947685028672042, + "learning_rate": 1.5870376139877353e-06, + "loss": 0.4172, + "step": 11726 + }, + { + "epoch": 1.49, + "grad_norm": 0.7346432079184089, + "learning_rate": 1.5862838042254498e-06, + "loss": 0.4619, + "step": 11727 + }, + { + "epoch": 1.49, + "grad_norm": 0.8121603884955976, + "learning_rate": 1.5855301397741518e-06, + "loss": 0.5134, + "step": 11728 + }, + { + "epoch": 1.49, + "grad_norm": 0.658093977326385, + "learning_rate": 1.584776620665927e-06, + "loss": 0.4814, + "step": 11729 + }, + { + "epoch": 1.49, + "grad_norm": 0.6034520885958667, + "learning_rate": 1.584023246932846e-06, + "loss": 0.4365, + "step": 11730 + }, + { + "epoch": 1.49, + "grad_norm": 0.5632148660641273, + "learning_rate": 1.5832700186069815e-06, + "loss": 0.3963, + "step": 11731 + }, + { + "epoch": 1.49, + "grad_norm": 0.6144137899006358, + "learning_rate": 1.5825169357203918e-06, + "loss": 0.4759, + "step": 11732 + }, + { + "epoch": 1.49, + "grad_norm": 0.688181547907297, + "learning_rate": 1.5817639983051376e-06, + "loss": 0.5318, + "step": 11733 + }, + { + "epoch": 1.49, + "grad_norm": 0.7524833399109788, + "learning_rate": 1.5810112063932649e-06, + "loss": 0.5195, + "step": 11734 + }, + { + "epoch": 1.49, + "grad_norm": 0.6532304991786136, + "learning_rate": 1.5802585600168214e-06, + "loss": 0.455, + "step": 11735 + }, + { + "epoch": 1.5, + "grad_norm": 0.5896838061345844, + "learning_rate": 1.5795060592078415e-06, + "loss": 0.47, + "step": 11736 + }, + { + "epoch": 1.5, + "grad_norm": 0.6407654360264194, + "learning_rate": 1.578753703998358e-06, + "loss": 0.4353, + "step": 11737 + }, + { + "epoch": 1.5, + "grad_norm": 0.6875393592874859, + "learning_rate": 1.578001494420397e-06, + "loss": 0.4363, + "step": 11738 + }, + { + "epoch": 1.5, + "grad_norm": 0.6161792859934612, + "learning_rate": 1.5772494305059788e-06, + "loss": 0.4831, + "step": 11739 + }, + { + "epoch": 1.5, + "grad_norm": 0.6526223812951869, + "learning_rate": 1.576497512287113e-06, + "loss": 0.5036, + "step": 11740 + }, + { + "epoch": 1.5, + "grad_norm": 0.6491918024259562, + "learning_rate": 1.5757457397958097e-06, + "loss": 0.4523, + "step": 11741 + }, + { + "epoch": 1.5, + "grad_norm": 0.7540643503895571, + "learning_rate": 1.574994113064066e-06, + "loss": 0.4948, + "step": 11742 + }, + { + "epoch": 1.5, + "grad_norm": 0.5779330418048424, + "learning_rate": 1.5742426321238806e-06, + "loss": 0.441, + "step": 11743 + }, + { + "epoch": 1.5, + "grad_norm": 0.7225246613868435, + "learning_rate": 1.573491297007237e-06, + "loss": 0.453, + "step": 11744 + }, + { + "epoch": 1.5, + "grad_norm": 0.7379107104095686, + "learning_rate": 1.5727401077461214e-06, + "loss": 0.5373, + "step": 11745 + }, + { + "epoch": 1.5, + "grad_norm": 0.7396805372132097, + "learning_rate": 1.5719890643725055e-06, + "loss": 0.5861, + "step": 11746 + }, + { + "epoch": 1.5, + "grad_norm": 0.7411962699226632, + "learning_rate": 1.5712381669183625e-06, + "loss": 0.527, + "step": 11747 + }, + { + "epoch": 1.5, + "grad_norm": 0.7519719020075472, + "learning_rate": 1.5704874154156518e-06, + "loss": 0.5686, + "step": 11748 + }, + { + "epoch": 1.5, + "grad_norm": 0.968156842929008, + "learning_rate": 1.5697368098963334e-06, + "loss": 0.5331, + "step": 11749 + }, + { + "epoch": 1.5, + "grad_norm": 0.7384914180403779, + "learning_rate": 1.5689863503923568e-06, + "loss": 0.4788, + "step": 11750 + }, + { + "epoch": 1.5, + "grad_norm": 0.5429395082431272, + "learning_rate": 1.5682360369356698e-06, + "loss": 0.402, + "step": 11751 + }, + { + "epoch": 1.5, + "grad_norm": 0.5750268987020277, + "learning_rate": 1.5674858695582063e-06, + "loss": 0.4096, + "step": 11752 + }, + { + "epoch": 1.5, + "grad_norm": 0.654286318930066, + "learning_rate": 1.5667358482919022e-06, + "loss": 0.438, + "step": 11753 + }, + { + "epoch": 1.5, + "grad_norm": 0.6134140235453149, + "learning_rate": 1.56598597316868e-06, + "loss": 0.4763, + "step": 11754 + }, + { + "epoch": 1.5, + "grad_norm": 0.7039122258646313, + "learning_rate": 1.5652362442204632e-06, + "loss": 0.4914, + "step": 11755 + }, + { + "epoch": 1.5, + "grad_norm": 0.6116849157370886, + "learning_rate": 1.564486661479162e-06, + "loss": 0.4096, + "step": 11756 + }, + { + "epoch": 1.5, + "grad_norm": 0.7165708027256114, + "learning_rate": 1.5637372249766869e-06, + "loss": 0.4889, + "step": 11757 + }, + { + "epoch": 1.5, + "grad_norm": 0.6299564560444582, + "learning_rate": 1.5629879347449362e-06, + "loss": 0.4847, + "step": 11758 + }, + { + "epoch": 1.5, + "grad_norm": 0.5888890148812191, + "learning_rate": 1.562238790815807e-06, + "loss": 0.4118, + "step": 11759 + }, + { + "epoch": 1.5, + "grad_norm": 0.7052670554341612, + "learning_rate": 1.5614897932211854e-06, + "loss": 0.5085, + "step": 11760 + }, + { + "epoch": 1.5, + "grad_norm": 0.8093611104067335, + "learning_rate": 1.5607409419929576e-06, + "loss": 0.5263, + "step": 11761 + }, + { + "epoch": 1.5, + "grad_norm": 0.6389577030413439, + "learning_rate": 1.559992237162994e-06, + "loss": 0.4857, + "step": 11762 + }, + { + "epoch": 1.5, + "grad_norm": 0.5891752639213359, + "learning_rate": 1.559243678763171e-06, + "loss": 0.456, + "step": 11763 + }, + { + "epoch": 1.5, + "grad_norm": 0.6419411020546483, + "learning_rate": 1.5584952668253484e-06, + "loss": 0.4509, + "step": 11764 + }, + { + "epoch": 1.5, + "grad_norm": 0.6148366827308623, + "learning_rate": 1.5577470013813862e-06, + "loss": 0.4905, + "step": 11765 + }, + { + "epoch": 1.5, + "grad_norm": 0.7111504954333165, + "learning_rate": 1.5569988824631327e-06, + "loss": 0.5505, + "step": 11766 + }, + { + "epoch": 1.5, + "grad_norm": 0.7666176035982654, + "learning_rate": 1.556250910102436e-06, + "loss": 0.5043, + "step": 11767 + }, + { + "epoch": 1.5, + "grad_norm": 0.5837982535188084, + "learning_rate": 1.555503084331132e-06, + "loss": 0.4352, + "step": 11768 + }, + { + "epoch": 1.5, + "grad_norm": 0.6010695453524361, + "learning_rate": 1.5547554051810565e-06, + "loss": 0.4371, + "step": 11769 + }, + { + "epoch": 1.5, + "grad_norm": 0.6264840395814519, + "learning_rate": 1.5540078726840319e-06, + "loss": 0.4703, + "step": 11770 + }, + { + "epoch": 1.5, + "grad_norm": 0.68022669008752, + "learning_rate": 1.553260486871882e-06, + "loss": 0.514, + "step": 11771 + }, + { + "epoch": 1.5, + "grad_norm": 0.7956379103897336, + "learning_rate": 1.5525132477764176e-06, + "loss": 0.5121, + "step": 11772 + }, + { + "epoch": 1.5, + "grad_norm": 0.7154696553375018, + "learning_rate": 1.5517661554294494e-06, + "loss": 0.4875, + "step": 11773 + }, + { + "epoch": 1.5, + "grad_norm": 0.6287273317750448, + "learning_rate": 1.5510192098627752e-06, + "loss": 0.4581, + "step": 11774 + }, + { + "epoch": 1.5, + "grad_norm": 0.6814051416071311, + "learning_rate": 1.550272411108192e-06, + "loss": 0.478, + "step": 11775 + }, + { + "epoch": 1.5, + "grad_norm": 0.6542467001588551, + "learning_rate": 1.5495257591974894e-06, + "loss": 0.4654, + "step": 11776 + }, + { + "epoch": 1.5, + "grad_norm": 0.653667677085657, + "learning_rate": 1.548779254162448e-06, + "loss": 0.5436, + "step": 11777 + }, + { + "epoch": 1.5, + "grad_norm": 0.802816849316158, + "learning_rate": 1.5480328960348456e-06, + "loss": 0.4777, + "step": 11778 + }, + { + "epoch": 1.5, + "grad_norm": 0.5988588495379926, + "learning_rate": 1.547286684846453e-06, + "loss": 0.4143, + "step": 11779 + }, + { + "epoch": 1.5, + "grad_norm": 0.6680500243335126, + "learning_rate": 1.5465406206290318e-06, + "loss": 0.5123, + "step": 11780 + }, + { + "epoch": 1.5, + "grad_norm": 0.8222643093206636, + "learning_rate": 1.5457947034143417e-06, + "loss": 0.5485, + "step": 11781 + }, + { + "epoch": 1.5, + "grad_norm": 0.6696696806370672, + "learning_rate": 1.5450489332341317e-06, + "loss": 0.4826, + "step": 11782 + }, + { + "epoch": 1.5, + "grad_norm": 0.771063794677532, + "learning_rate": 1.5443033101201498e-06, + "loss": 0.5505, + "step": 11783 + }, + { + "epoch": 1.5, + "grad_norm": 0.6418626006642414, + "learning_rate": 1.5435578341041313e-06, + "loss": 0.4411, + "step": 11784 + }, + { + "epoch": 1.5, + "grad_norm": 0.5483683083025235, + "learning_rate": 1.5428125052178128e-06, + "loss": 0.4549, + "step": 11785 + }, + { + "epoch": 1.5, + "grad_norm": 0.7608689896695447, + "learning_rate": 1.5420673234929162e-06, + "loss": 0.523, + "step": 11786 + }, + { + "epoch": 1.5, + "grad_norm": 0.7668921287249512, + "learning_rate": 1.541322288961164e-06, + "loss": 0.5205, + "step": 11787 + }, + { + "epoch": 1.5, + "grad_norm": 0.8093078290301612, + "learning_rate": 1.5405774016542707e-06, + "loss": 0.5532, + "step": 11788 + }, + { + "epoch": 1.5, + "grad_norm": 0.8690668151779515, + "learning_rate": 1.5398326616039415e-06, + "loss": 0.5387, + "step": 11789 + }, + { + "epoch": 1.5, + "grad_norm": 0.6192319007616564, + "learning_rate": 1.5390880688418808e-06, + "loss": 0.47, + "step": 11790 + }, + { + "epoch": 1.5, + "grad_norm": 0.6493168836358363, + "learning_rate": 1.538343623399779e-06, + "loss": 0.4736, + "step": 11791 + }, + { + "epoch": 1.5, + "grad_norm": 0.7391252325169428, + "learning_rate": 1.5375993253093298e-06, + "loss": 0.4758, + "step": 11792 + }, + { + "epoch": 1.5, + "grad_norm": 0.6755645686739995, + "learning_rate": 1.5368551746022109e-06, + "loss": 0.486, + "step": 11793 + }, + { + "epoch": 1.5, + "grad_norm": 0.7293311495375693, + "learning_rate": 1.5361111713101017e-06, + "loss": 0.5299, + "step": 11794 + }, + { + "epoch": 1.5, + "grad_norm": 0.7938046028945703, + "learning_rate": 1.5353673154646703e-06, + "loss": 0.4883, + "step": 11795 + }, + { + "epoch": 1.5, + "grad_norm": 0.7300984710314583, + "learning_rate": 1.5346236070975823e-06, + "loss": 0.5228, + "step": 11796 + }, + { + "epoch": 1.5, + "grad_norm": 0.8588919596365745, + "learning_rate": 1.5338800462404918e-06, + "loss": 0.5292, + "step": 11797 + }, + { + "epoch": 1.5, + "grad_norm": 0.7817866249813722, + "learning_rate": 1.533136632925053e-06, + "loss": 0.5224, + "step": 11798 + }, + { + "epoch": 1.5, + "grad_norm": 0.6312476615578303, + "learning_rate": 1.532393367182906e-06, + "loss": 0.4807, + "step": 11799 + }, + { + "epoch": 1.5, + "grad_norm": 0.6032819000982859, + "learning_rate": 1.5316502490456963e-06, + "loss": 0.4443, + "step": 11800 + }, + { + "epoch": 1.5, + "grad_norm": 0.7333570353653186, + "learning_rate": 1.5309072785450502e-06, + "loss": 0.5031, + "step": 11801 + }, + { + "epoch": 1.5, + "grad_norm": 0.65520263007275, + "learning_rate": 1.5301644557125973e-06, + "loss": 0.4434, + "step": 11802 + }, + { + "epoch": 1.5, + "grad_norm": 0.6385059594453464, + "learning_rate": 1.5294217805799539e-06, + "loss": 0.4637, + "step": 11803 + }, + { + "epoch": 1.5, + "grad_norm": 0.7016805321237948, + "learning_rate": 1.5286792531787365e-06, + "loss": 0.4837, + "step": 11804 + }, + { + "epoch": 1.5, + "grad_norm": 0.8813445668346723, + "learning_rate": 1.5279368735405487e-06, + "loss": 0.5593, + "step": 11805 + }, + { + "epoch": 1.5, + "grad_norm": 0.6998779931818744, + "learning_rate": 1.5271946416969945e-06, + "loss": 0.4976, + "step": 11806 + }, + { + "epoch": 1.5, + "grad_norm": 0.75638765995227, + "learning_rate": 1.5264525576796663e-06, + "loss": 0.5477, + "step": 11807 + }, + { + "epoch": 1.5, + "grad_norm": 0.7014983236205438, + "learning_rate": 1.5257106215201533e-06, + "loss": 0.4773, + "step": 11808 + }, + { + "epoch": 1.5, + "grad_norm": 0.5784815778331148, + "learning_rate": 1.5249688332500362e-06, + "loss": 0.4077, + "step": 11809 + }, + { + "epoch": 1.5, + "grad_norm": 0.6860376601882198, + "learning_rate": 1.5242271929008923e-06, + "loss": 0.5176, + "step": 11810 + }, + { + "epoch": 1.5, + "grad_norm": 0.708522576587283, + "learning_rate": 1.523485700504287e-06, + "loss": 0.4995, + "step": 11811 + }, + { + "epoch": 1.5, + "grad_norm": 0.8573932136254214, + "learning_rate": 1.5227443560917894e-06, + "loss": 0.5143, + "step": 11812 + }, + { + "epoch": 1.5, + "grad_norm": 0.8141553424477549, + "learning_rate": 1.5220031596949508e-06, + "loss": 0.47, + "step": 11813 + }, + { + "epoch": 1.51, + "grad_norm": 0.799925582347533, + "learning_rate": 1.5212621113453252e-06, + "loss": 0.5356, + "step": 11814 + }, + { + "epoch": 1.51, + "grad_norm": 0.735865505739735, + "learning_rate": 1.5205212110744538e-06, + "loss": 0.5113, + "step": 11815 + }, + { + "epoch": 1.51, + "grad_norm": 0.5881047136717945, + "learning_rate": 1.519780458913877e-06, + "loss": 0.4616, + "step": 11816 + }, + { + "epoch": 1.51, + "grad_norm": 0.6602627792109644, + "learning_rate": 1.519039854895123e-06, + "loss": 0.5027, + "step": 11817 + }, + { + "epoch": 1.51, + "grad_norm": 0.8064955582453905, + "learning_rate": 1.5182993990497208e-06, + "loss": 0.4981, + "step": 11818 + }, + { + "epoch": 1.51, + "grad_norm": 0.6263308430897092, + "learning_rate": 1.5175590914091853e-06, + "loss": 0.5091, + "step": 11819 + }, + { + "epoch": 1.51, + "grad_norm": 0.7712133292642409, + "learning_rate": 1.516818932005033e-06, + "loss": 0.5092, + "step": 11820 + }, + { + "epoch": 1.51, + "grad_norm": 0.589759177810315, + "learning_rate": 1.5160789208687665e-06, + "loss": 0.4382, + "step": 11821 + }, + { + "epoch": 1.51, + "grad_norm": 0.5582565081809934, + "learning_rate": 1.5153390580318888e-06, + "loss": 0.438, + "step": 11822 + }, + { + "epoch": 1.51, + "grad_norm": 0.6389886936717366, + "learning_rate": 1.5145993435258905e-06, + "loss": 0.4795, + "step": 11823 + }, + { + "epoch": 1.51, + "grad_norm": 0.716650432743136, + "learning_rate": 1.5138597773822606e-06, + "loss": 0.464, + "step": 11824 + }, + { + "epoch": 1.51, + "grad_norm": 0.6478239783158753, + "learning_rate": 1.5131203596324795e-06, + "loss": 0.5096, + "step": 11825 + }, + { + "epoch": 1.51, + "grad_norm": 0.6454641410197964, + "learning_rate": 1.5123810903080239e-06, + "loss": 0.4651, + "step": 11826 + }, + { + "epoch": 1.51, + "grad_norm": 0.6202316761037979, + "learning_rate": 1.5116419694403588e-06, + "loss": 0.4546, + "step": 11827 + }, + { + "epoch": 1.51, + "grad_norm": 0.6202174516241951, + "learning_rate": 1.5109029970609495e-06, + "loss": 0.4357, + "step": 11828 + }, + { + "epoch": 1.51, + "grad_norm": 0.6908969314066419, + "learning_rate": 1.5101641732012484e-06, + "loss": 0.5059, + "step": 11829 + }, + { + "epoch": 1.51, + "grad_norm": 0.7185467599864933, + "learning_rate": 1.509425497892708e-06, + "loss": 0.513, + "step": 11830 + }, + { + "epoch": 1.51, + "grad_norm": 0.7064557693082774, + "learning_rate": 1.5086869711667684e-06, + "loss": 0.4665, + "step": 11831 + }, + { + "epoch": 1.51, + "grad_norm": 0.578798687162687, + "learning_rate": 1.50794859305487e-06, + "loss": 0.4932, + "step": 11832 + }, + { + "epoch": 1.51, + "grad_norm": 0.6905663030653478, + "learning_rate": 1.5072103635884384e-06, + "loss": 0.5603, + "step": 11833 + }, + { + "epoch": 1.51, + "grad_norm": 0.7510664241129372, + "learning_rate": 1.5064722827989026e-06, + "loss": 0.4585, + "step": 11834 + }, + { + "epoch": 1.51, + "grad_norm": 0.6273943909358254, + "learning_rate": 1.505734350717676e-06, + "loss": 0.4298, + "step": 11835 + }, + { + "epoch": 1.51, + "grad_norm": 0.6419214079985587, + "learning_rate": 1.504996567376172e-06, + "loss": 0.4343, + "step": 11836 + }, + { + "epoch": 1.51, + "grad_norm": 0.678229937919406, + "learning_rate": 1.5042589328057977e-06, + "loss": 0.4462, + "step": 11837 + }, + { + "epoch": 1.51, + "grad_norm": 0.6283421918971349, + "learning_rate": 1.5035214470379478e-06, + "loss": 0.455, + "step": 11838 + }, + { + "epoch": 1.51, + "grad_norm": 0.6304615540000009, + "learning_rate": 1.5027841101040186e-06, + "loss": 0.4386, + "step": 11839 + }, + { + "epoch": 1.51, + "grad_norm": 0.703904198555036, + "learning_rate": 1.5020469220353928e-06, + "loss": 0.4178, + "step": 11840 + }, + { + "epoch": 1.51, + "grad_norm": 0.6089485199679933, + "learning_rate": 1.5013098828634514e-06, + "loss": 0.4245, + "step": 11841 + }, + { + "epoch": 1.51, + "grad_norm": 0.7898261139824619, + "learning_rate": 1.50057299261957e-06, + "loss": 0.5052, + "step": 11842 + }, + { + "epoch": 1.51, + "grad_norm": 0.7025556079504152, + "learning_rate": 1.4998362513351118e-06, + "loss": 0.5037, + "step": 11843 + }, + { + "epoch": 1.51, + "grad_norm": 0.5482683859555755, + "learning_rate": 1.4990996590414408e-06, + "loss": 0.438, + "step": 11844 + }, + { + "epoch": 1.51, + "grad_norm": 0.5988178399422383, + "learning_rate": 1.4983632157699091e-06, + "loss": 0.4193, + "step": 11845 + }, + { + "epoch": 1.51, + "grad_norm": 0.6101592647269215, + "learning_rate": 1.4976269215518667e-06, + "loss": 0.4284, + "step": 11846 + }, + { + "epoch": 1.51, + "grad_norm": 0.6584350770773381, + "learning_rate": 1.4968907764186535e-06, + "loss": 0.4462, + "step": 11847 + }, + { + "epoch": 1.51, + "grad_norm": 0.644409796959749, + "learning_rate": 1.4961547804016046e-06, + "loss": 0.4285, + "step": 11848 + }, + { + "epoch": 1.51, + "grad_norm": 0.7596332590127007, + "learning_rate": 1.4954189335320524e-06, + "loss": 0.5522, + "step": 11849 + }, + { + "epoch": 1.51, + "grad_norm": 0.7126415012322277, + "learning_rate": 1.4946832358413154e-06, + "loss": 0.4777, + "step": 11850 + }, + { + "epoch": 1.51, + "grad_norm": 0.8145319375764453, + "learning_rate": 1.4939476873607129e-06, + "loss": 0.5306, + "step": 11851 + }, + { + "epoch": 1.51, + "grad_norm": 0.5730804762496607, + "learning_rate": 1.4932122881215522e-06, + "loss": 0.4582, + "step": 11852 + }, + { + "epoch": 1.51, + "grad_norm": 0.7628296180292785, + "learning_rate": 1.49247703815514e-06, + "loss": 0.5631, + "step": 11853 + }, + { + "epoch": 1.51, + "grad_norm": 0.7769824708820705, + "learning_rate": 1.49174193749277e-06, + "loss": 0.5013, + "step": 11854 + }, + { + "epoch": 1.51, + "grad_norm": 0.6436061963935515, + "learning_rate": 1.4910069861657367e-06, + "loss": 0.4509, + "step": 11855 + }, + { + "epoch": 1.51, + "grad_norm": 0.6284176954190089, + "learning_rate": 1.4902721842053215e-06, + "loss": 0.4675, + "step": 11856 + }, + { + "epoch": 1.51, + "grad_norm": 0.857460364630511, + "learning_rate": 1.489537531642805e-06, + "loss": 0.5037, + "step": 11857 + }, + { + "epoch": 1.51, + "grad_norm": 1.4153407418352368, + "learning_rate": 1.4888030285094569e-06, + "loss": 0.5133, + "step": 11858 + }, + { + "epoch": 1.51, + "grad_norm": 0.7606255532426416, + "learning_rate": 1.4880686748365453e-06, + "loss": 0.5251, + "step": 11859 + }, + { + "epoch": 1.51, + "grad_norm": 0.7559046403360551, + "learning_rate": 1.4873344706553255e-06, + "loss": 0.4628, + "step": 11860 + }, + { + "epoch": 1.51, + "grad_norm": 0.7370367427270319, + "learning_rate": 1.4866004159970527e-06, + "loss": 0.5052, + "step": 11861 + }, + { + "epoch": 1.51, + "grad_norm": 0.8421025089555793, + "learning_rate": 1.4858665108929732e-06, + "loss": 0.5515, + "step": 11862 + }, + { + "epoch": 1.51, + "grad_norm": 0.7308373939186672, + "learning_rate": 1.4851327553743283e-06, + "loss": 0.528, + "step": 11863 + }, + { + "epoch": 1.51, + "grad_norm": 0.7749088901483504, + "learning_rate": 1.484399149472348e-06, + "loss": 0.4869, + "step": 11864 + }, + { + "epoch": 1.51, + "grad_norm": 0.5833079333836493, + "learning_rate": 1.4836656932182635e-06, + "loss": 0.4123, + "step": 11865 + }, + { + "epoch": 1.51, + "grad_norm": 0.7145071771547642, + "learning_rate": 1.4829323866432927e-06, + "loss": 0.5104, + "step": 11866 + }, + { + "epoch": 1.51, + "grad_norm": 0.7416386903058149, + "learning_rate": 1.4821992297786524e-06, + "loss": 0.5, + "step": 11867 + }, + { + "epoch": 1.51, + "grad_norm": 0.717791106770553, + "learning_rate": 1.4814662226555482e-06, + "loss": 0.4784, + "step": 11868 + }, + { + "epoch": 1.51, + "grad_norm": 0.7302412626467278, + "learning_rate": 1.4807333653051848e-06, + "loss": 0.5481, + "step": 11869 + }, + { + "epoch": 1.51, + "grad_norm": 0.7592125182075823, + "learning_rate": 1.4800006577587545e-06, + "loss": 0.5285, + "step": 11870 + }, + { + "epoch": 1.51, + "grad_norm": 0.6763326314552282, + "learning_rate": 1.4792681000474496e-06, + "loss": 0.4864, + "step": 11871 + }, + { + "epoch": 1.51, + "grad_norm": 0.6598942039187701, + "learning_rate": 1.4785356922024491e-06, + "loss": 0.4778, + "step": 11872 + }, + { + "epoch": 1.51, + "grad_norm": 0.575117547414073, + "learning_rate": 1.4778034342549313e-06, + "loss": 0.4069, + "step": 11873 + }, + { + "epoch": 1.51, + "grad_norm": 0.643936184193435, + "learning_rate": 1.4770713262360664e-06, + "loss": 0.47, + "step": 11874 + }, + { + "epoch": 1.51, + "grad_norm": 0.5687847044527354, + "learning_rate": 1.476339368177019e-06, + "loss": 0.4304, + "step": 11875 + }, + { + "epoch": 1.51, + "grad_norm": 0.6134730889666818, + "learning_rate": 1.4756075601089426e-06, + "loss": 0.4582, + "step": 11876 + }, + { + "epoch": 1.51, + "grad_norm": 0.574258200621919, + "learning_rate": 1.4748759020629916e-06, + "loss": 0.4603, + "step": 11877 + }, + { + "epoch": 1.51, + "grad_norm": 0.7289514713332602, + "learning_rate": 1.4741443940703076e-06, + "loss": 0.4448, + "step": 11878 + }, + { + "epoch": 1.51, + "grad_norm": 0.6754957264099952, + "learning_rate": 1.4734130361620308e-06, + "loss": 0.4588, + "step": 11879 + }, + { + "epoch": 1.51, + "grad_norm": 0.5779789544473176, + "learning_rate": 1.4726818283692907e-06, + "loss": 0.4729, + "step": 11880 + }, + { + "epoch": 1.51, + "grad_norm": 0.7579048188425673, + "learning_rate": 1.4719507707232145e-06, + "loss": 0.5336, + "step": 11881 + }, + { + "epoch": 1.51, + "grad_norm": 0.7298442253391793, + "learning_rate": 1.471219863254919e-06, + "loss": 0.5254, + "step": 11882 + }, + { + "epoch": 1.51, + "grad_norm": 0.6527989485826166, + "learning_rate": 1.4704891059955184e-06, + "loss": 0.4636, + "step": 11883 + }, + { + "epoch": 1.51, + "grad_norm": 0.6276337234324153, + "learning_rate": 1.469758498976117e-06, + "loss": 0.4765, + "step": 11884 + }, + { + "epoch": 1.51, + "grad_norm": 0.7632291157898848, + "learning_rate": 1.4690280422278164e-06, + "loss": 0.5636, + "step": 11885 + }, + { + "epoch": 1.51, + "grad_norm": 0.7063143936721762, + "learning_rate": 1.4682977357817057e-06, + "loss": 0.5329, + "step": 11886 + }, + { + "epoch": 1.51, + "grad_norm": 0.6768998483318774, + "learning_rate": 1.4675675796688777e-06, + "loss": 0.4854, + "step": 11887 + }, + { + "epoch": 1.51, + "grad_norm": 0.6882104109259862, + "learning_rate": 1.4668375739204083e-06, + "loss": 0.4776, + "step": 11888 + }, + { + "epoch": 1.51, + "grad_norm": 0.7901680166928836, + "learning_rate": 1.4661077185673745e-06, + "loss": 0.578, + "step": 11889 + }, + { + "epoch": 1.51, + "grad_norm": 0.8936742457878368, + "learning_rate": 1.4653780136408407e-06, + "loss": 0.5461, + "step": 11890 + }, + { + "epoch": 1.51, + "grad_norm": 0.7593327976144598, + "learning_rate": 1.4646484591718717e-06, + "loss": 0.4654, + "step": 11891 + }, + { + "epoch": 1.52, + "grad_norm": 0.6599733636141957, + "learning_rate": 1.4639190551915189e-06, + "loss": 0.4189, + "step": 11892 + }, + { + "epoch": 1.52, + "grad_norm": 0.7696281045056309, + "learning_rate": 1.4631898017308338e-06, + "loss": 0.4771, + "step": 11893 + }, + { + "epoch": 1.52, + "grad_norm": 0.8125166737913756, + "learning_rate": 1.462460698820855e-06, + "loss": 0.5116, + "step": 11894 + }, + { + "epoch": 1.52, + "grad_norm": 0.760748975049582, + "learning_rate": 1.4617317464926217e-06, + "loss": 0.5034, + "step": 11895 + }, + { + "epoch": 1.52, + "grad_norm": 0.5917640577108612, + "learning_rate": 1.4610029447771595e-06, + "loss": 0.4541, + "step": 11896 + }, + { + "epoch": 1.52, + "grad_norm": 0.6254715694753664, + "learning_rate": 1.4602742937054942e-06, + "loss": 0.4268, + "step": 11897 + }, + { + "epoch": 1.52, + "grad_norm": 0.6612754649362964, + "learning_rate": 1.45954579330864e-06, + "loss": 0.5101, + "step": 11898 + }, + { + "epoch": 1.52, + "grad_norm": 0.7639911655799347, + "learning_rate": 1.4588174436176073e-06, + "loss": 0.5033, + "step": 11899 + }, + { + "epoch": 1.52, + "grad_norm": 0.6107291144120331, + "learning_rate": 1.4580892446634015e-06, + "loss": 0.4513, + "step": 11900 + }, + { + "epoch": 1.52, + "grad_norm": 0.7302142075022269, + "learning_rate": 1.4573611964770168e-06, + "loss": 0.4629, + "step": 11901 + }, + { + "epoch": 1.52, + "grad_norm": 0.6985277886307565, + "learning_rate": 1.4566332990894456e-06, + "loss": 0.4996, + "step": 11902 + }, + { + "epoch": 1.52, + "grad_norm": 0.7858628190559802, + "learning_rate": 1.455905552531673e-06, + "loss": 0.53, + "step": 11903 + }, + { + "epoch": 1.52, + "grad_norm": 0.6550350598902999, + "learning_rate": 1.455177956834674e-06, + "loss": 0.484, + "step": 11904 + }, + { + "epoch": 1.52, + "grad_norm": 0.7473365611735782, + "learning_rate": 1.4544505120294239e-06, + "loss": 0.5013, + "step": 11905 + }, + { + "epoch": 1.52, + "grad_norm": 0.7978067914947106, + "learning_rate": 1.4537232181468835e-06, + "loss": 0.5719, + "step": 11906 + }, + { + "epoch": 1.52, + "grad_norm": 0.6288148890051874, + "learning_rate": 1.4529960752180156e-06, + "loss": 0.4793, + "step": 11907 + }, + { + "epoch": 1.52, + "grad_norm": 0.6194600075601276, + "learning_rate": 1.4522690832737679e-06, + "loss": 0.4572, + "step": 11908 + }, + { + "epoch": 1.52, + "grad_norm": 0.7936754033642945, + "learning_rate": 1.4515422423450902e-06, + "loss": 0.5541, + "step": 11909 + }, + { + "epoch": 1.52, + "grad_norm": 0.6286049158787025, + "learning_rate": 1.4508155524629186e-06, + "loss": 0.4746, + "step": 11910 + }, + { + "epoch": 1.52, + "grad_norm": 0.7241940144325015, + "learning_rate": 1.4500890136581875e-06, + "loss": 0.516, + "step": 11911 + }, + { + "epoch": 1.52, + "grad_norm": 0.5373397971462176, + "learning_rate": 1.4493626259618242e-06, + "loss": 0.3974, + "step": 11912 + }, + { + "epoch": 1.52, + "grad_norm": 0.6185760583103315, + "learning_rate": 1.4486363894047468e-06, + "loss": 0.4874, + "step": 11913 + }, + { + "epoch": 1.52, + "grad_norm": 0.7521964921008295, + "learning_rate": 1.447910304017871e-06, + "loss": 0.4835, + "step": 11914 + }, + { + "epoch": 1.52, + "grad_norm": 0.6329308429095662, + "learning_rate": 1.4471843698321009e-06, + "loss": 0.4769, + "step": 11915 + }, + { + "epoch": 1.52, + "grad_norm": 0.7325882968022589, + "learning_rate": 1.446458586878341e-06, + "loss": 0.4859, + "step": 11916 + }, + { + "epoch": 1.52, + "grad_norm": 0.6366233076546924, + "learning_rate": 1.4457329551874816e-06, + "loss": 0.4526, + "step": 11917 + }, + { + "epoch": 1.52, + "grad_norm": 0.6325091054991279, + "learning_rate": 1.4450074747904142e-06, + "loss": 0.4685, + "step": 11918 + }, + { + "epoch": 1.52, + "grad_norm": 0.554474044348401, + "learning_rate": 1.4442821457180167e-06, + "loss": 0.4191, + "step": 11919 + }, + { + "epoch": 1.52, + "grad_norm": 0.5816684748831272, + "learning_rate": 1.443556968001168e-06, + "loss": 0.4071, + "step": 11920 + }, + { + "epoch": 1.52, + "grad_norm": 0.8849647868166594, + "learning_rate": 1.4428319416707327e-06, + "loss": 0.4318, + "step": 11921 + }, + { + "epoch": 1.52, + "grad_norm": 0.748242272080491, + "learning_rate": 1.4421070667575754e-06, + "loss": 0.5431, + "step": 11922 + }, + { + "epoch": 1.52, + "grad_norm": 0.9153034293356291, + "learning_rate": 1.4413823432925483e-06, + "loss": 0.5074, + "step": 11923 + }, + { + "epoch": 1.52, + "grad_norm": 0.622396773485618, + "learning_rate": 1.4406577713065061e-06, + "loss": 0.4936, + "step": 11924 + }, + { + "epoch": 1.52, + "grad_norm": 0.7407669943325329, + "learning_rate": 1.4399333508302871e-06, + "loss": 0.5255, + "step": 11925 + }, + { + "epoch": 1.52, + "grad_norm": 0.7967093278333663, + "learning_rate": 1.4392090818947308e-06, + "loss": 0.5271, + "step": 11926 + }, + { + "epoch": 1.52, + "grad_norm": 0.6152531122051506, + "learning_rate": 1.4384849645306631e-06, + "loss": 0.4728, + "step": 11927 + }, + { + "epoch": 1.52, + "grad_norm": 0.8028136064244589, + "learning_rate": 1.4377609987689117e-06, + "loss": 0.551, + "step": 11928 + }, + { + "epoch": 1.52, + "grad_norm": 0.6163416288227633, + "learning_rate": 1.437037184640289e-06, + "loss": 0.4352, + "step": 11929 + }, + { + "epoch": 1.52, + "grad_norm": 0.9569357815659029, + "learning_rate": 1.4363135221756097e-06, + "loss": 0.4821, + "step": 11930 + }, + { + "epoch": 1.52, + "grad_norm": 0.649481211374458, + "learning_rate": 1.4355900114056743e-06, + "loss": 0.4788, + "step": 11931 + }, + { + "epoch": 1.52, + "grad_norm": 0.673899160816937, + "learning_rate": 1.4348666523612837e-06, + "loss": 0.4911, + "step": 11932 + }, + { + "epoch": 1.52, + "grad_norm": 0.8855748910508573, + "learning_rate": 1.434143445073225e-06, + "loss": 0.5546, + "step": 11933 + }, + { + "epoch": 1.52, + "grad_norm": 0.7517144380153586, + "learning_rate": 1.433420389572287e-06, + "loss": 0.491, + "step": 11934 + }, + { + "epoch": 1.52, + "grad_norm": 0.6736783602357297, + "learning_rate": 1.4326974858892429e-06, + "loss": 0.481, + "step": 11935 + }, + { + "epoch": 1.52, + "grad_norm": 0.7518184370772593, + "learning_rate": 1.43197473405487e-06, + "loss": 0.4954, + "step": 11936 + }, + { + "epoch": 1.52, + "grad_norm": 0.684843127693918, + "learning_rate": 1.4312521340999297e-06, + "loss": 0.4079, + "step": 11937 + }, + { + "epoch": 1.52, + "grad_norm": 0.7028455966540011, + "learning_rate": 1.4305296860551831e-06, + "loss": 0.4909, + "step": 11938 + }, + { + "epoch": 1.52, + "grad_norm": 0.7536326138616248, + "learning_rate": 1.4298073899513803e-06, + "loss": 0.4917, + "step": 11939 + }, + { + "epoch": 1.52, + "grad_norm": 0.5923788123906366, + "learning_rate": 1.4290852458192695e-06, + "loss": 0.4344, + "step": 11940 + }, + { + "epoch": 1.52, + "grad_norm": 0.5748647703219977, + "learning_rate": 1.428363253689587e-06, + "loss": 0.4234, + "step": 11941 + }, + { + "epoch": 1.52, + "grad_norm": 0.5871307327750466, + "learning_rate": 1.4276414135930695e-06, + "loss": 0.4203, + "step": 11942 + }, + { + "epoch": 1.52, + "grad_norm": 0.6646586427845788, + "learning_rate": 1.4269197255604395e-06, + "loss": 0.4819, + "step": 11943 + }, + { + "epoch": 1.52, + "grad_norm": 0.7639983366964239, + "learning_rate": 1.4261981896224209e-06, + "loss": 0.4926, + "step": 11944 + }, + { + "epoch": 1.52, + "grad_norm": 0.5927902522432147, + "learning_rate": 1.4254768058097228e-06, + "loss": 0.4491, + "step": 11945 + }, + { + "epoch": 1.52, + "grad_norm": 0.7494902058550442, + "learning_rate": 1.4247555741530566e-06, + "loss": 0.4972, + "step": 11946 + }, + { + "epoch": 1.52, + "grad_norm": 0.824565801744356, + "learning_rate": 1.4240344946831192e-06, + "loss": 0.4909, + "step": 11947 + }, + { + "epoch": 1.52, + "grad_norm": 0.6245604221413591, + "learning_rate": 1.423313567430606e-06, + "loss": 0.4654, + "step": 11948 + }, + { + "epoch": 1.52, + "grad_norm": 0.5799736849700223, + "learning_rate": 1.422592792426205e-06, + "loss": 0.4471, + "step": 11949 + }, + { + "epoch": 1.52, + "grad_norm": 0.7624688173524956, + "learning_rate": 1.4218721697005984e-06, + "loss": 0.4586, + "step": 11950 + }, + { + "epoch": 1.52, + "grad_norm": 0.6614371141124121, + "learning_rate": 1.421151699284458e-06, + "loss": 0.5309, + "step": 11951 + }, + { + "epoch": 1.52, + "grad_norm": 0.746829092014637, + "learning_rate": 1.4204313812084552e-06, + "loss": 0.4954, + "step": 11952 + }, + { + "epoch": 1.52, + "grad_norm": 0.6761957546420123, + "learning_rate": 1.4197112155032484e-06, + "loss": 0.504, + "step": 11953 + }, + { + "epoch": 1.52, + "grad_norm": 0.6896137298686484, + "learning_rate": 1.418991202199495e-06, + "loss": 0.4449, + "step": 11954 + }, + { + "epoch": 1.52, + "grad_norm": 0.5946682175975541, + "learning_rate": 1.4182713413278421e-06, + "loss": 0.4231, + "step": 11955 + }, + { + "epoch": 1.52, + "grad_norm": 0.6378199333418438, + "learning_rate": 1.4175516329189336e-06, + "loss": 0.5385, + "step": 11956 + }, + { + "epoch": 1.52, + "grad_norm": 0.8033376375797316, + "learning_rate": 1.4168320770034027e-06, + "loss": 0.5389, + "step": 11957 + }, + { + "epoch": 1.52, + "grad_norm": 0.8003877712300571, + "learning_rate": 1.4161126736118818e-06, + "loss": 0.5532, + "step": 11958 + }, + { + "epoch": 1.52, + "grad_norm": 0.7013408170494367, + "learning_rate": 1.4153934227749906e-06, + "loss": 0.5007, + "step": 11959 + }, + { + "epoch": 1.52, + "grad_norm": 0.5777901475393556, + "learning_rate": 1.4146743245233463e-06, + "loss": 0.445, + "step": 11960 + }, + { + "epoch": 1.52, + "grad_norm": 0.60618280587829, + "learning_rate": 1.413955378887561e-06, + "loss": 0.4237, + "step": 11961 + }, + { + "epoch": 1.52, + "grad_norm": 0.6977927192262073, + "learning_rate": 1.4132365858982339e-06, + "loss": 0.4372, + "step": 11962 + }, + { + "epoch": 1.52, + "grad_norm": 0.5529689275852305, + "learning_rate": 1.4125179455859634e-06, + "loss": 0.4435, + "step": 11963 + }, + { + "epoch": 1.52, + "grad_norm": 0.6465283117409542, + "learning_rate": 1.4117994579813421e-06, + "loss": 0.4524, + "step": 11964 + }, + { + "epoch": 1.52, + "grad_norm": 0.6324985273586289, + "learning_rate": 1.4110811231149495e-06, + "loss": 0.4954, + "step": 11965 + }, + { + "epoch": 1.52, + "grad_norm": 0.6556563841419782, + "learning_rate": 1.410362941017367e-06, + "loss": 0.4788, + "step": 11966 + }, + { + "epoch": 1.52, + "grad_norm": 0.6874829131602086, + "learning_rate": 1.4096449117191619e-06, + "loss": 0.3874, + "step": 11967 + }, + { + "epoch": 1.52, + "grad_norm": 0.659648034006389, + "learning_rate": 1.408927035250901e-06, + "loss": 0.4688, + "step": 11968 + }, + { + "epoch": 1.52, + "grad_norm": 0.760973846093577, + "learning_rate": 1.408209311643139e-06, + "loss": 0.4835, + "step": 11969 + }, + { + "epoch": 1.52, + "grad_norm": 0.7310589069948038, + "learning_rate": 1.4074917409264304e-06, + "loss": 0.4855, + "step": 11970 + }, + { + "epoch": 1.53, + "grad_norm": 0.683464203817682, + "learning_rate": 1.4067743231313174e-06, + "loss": 0.5, + "step": 11971 + }, + { + "epoch": 1.53, + "grad_norm": 0.596613413010919, + "learning_rate": 1.4060570582883388e-06, + "loss": 0.4422, + "step": 11972 + }, + { + "epoch": 1.53, + "grad_norm": 0.6278333890724107, + "learning_rate": 1.4053399464280282e-06, + "loss": 0.4571, + "step": 11973 + }, + { + "epoch": 1.53, + "grad_norm": 0.7844696714547017, + "learning_rate": 1.4046229875809076e-06, + "loss": 0.5401, + "step": 11974 + }, + { + "epoch": 1.53, + "grad_norm": 0.6894544135343352, + "learning_rate": 1.403906181777499e-06, + "loss": 0.5196, + "step": 11975 + }, + { + "epoch": 1.53, + "grad_norm": 0.715277767498412, + "learning_rate": 1.4031895290483112e-06, + "loss": 0.5197, + "step": 11976 + }, + { + "epoch": 1.53, + "grad_norm": 0.8171187872423455, + "learning_rate": 1.4024730294238525e-06, + "loss": 0.4879, + "step": 11977 + }, + { + "epoch": 1.53, + "grad_norm": 0.6373667151281672, + "learning_rate": 1.4017566829346196e-06, + "loss": 0.4866, + "step": 11978 + }, + { + "epoch": 1.53, + "grad_norm": 0.5903546033363626, + "learning_rate": 1.401040489611108e-06, + "loss": 0.4915, + "step": 11979 + }, + { + "epoch": 1.53, + "grad_norm": 0.6854696279920867, + "learning_rate": 1.4003244494838003e-06, + "loss": 0.5144, + "step": 11980 + }, + { + "epoch": 1.53, + "grad_norm": 0.5707116265821914, + "learning_rate": 1.3996085625831801e-06, + "loss": 0.4466, + "step": 11981 + }, + { + "epoch": 1.53, + "grad_norm": 0.559962507230934, + "learning_rate": 1.3988928289397158e-06, + "loss": 0.4623, + "step": 11982 + }, + { + "epoch": 1.53, + "grad_norm": 0.6003526781341626, + "learning_rate": 1.3981772485838785e-06, + "loss": 0.4422, + "step": 11983 + }, + { + "epoch": 1.53, + "grad_norm": 0.6364568543294888, + "learning_rate": 1.397461821546124e-06, + "loss": 0.4571, + "step": 11984 + }, + { + "epoch": 1.53, + "grad_norm": 0.752513548557085, + "learning_rate": 1.3967465478569075e-06, + "loss": 0.4927, + "step": 11985 + }, + { + "epoch": 1.53, + "grad_norm": 0.6143365125200207, + "learning_rate": 1.3960314275466768e-06, + "loss": 0.4491, + "step": 11986 + }, + { + "epoch": 1.53, + "grad_norm": 0.7371996943701631, + "learning_rate": 1.3953164606458724e-06, + "loss": 0.5085, + "step": 11987 + }, + { + "epoch": 1.53, + "grad_norm": 0.776388144686789, + "learning_rate": 1.3946016471849267e-06, + "loss": 0.5261, + "step": 11988 + }, + { + "epoch": 1.53, + "grad_norm": 0.5953005002791292, + "learning_rate": 1.3938869871942683e-06, + "loss": 0.4367, + "step": 11989 + }, + { + "epoch": 1.53, + "grad_norm": 0.6058115728198914, + "learning_rate": 1.3931724807043161e-06, + "loss": 0.4764, + "step": 11990 + }, + { + "epoch": 1.53, + "grad_norm": 0.630541521661115, + "learning_rate": 1.3924581277454873e-06, + "loss": 0.4654, + "step": 11991 + }, + { + "epoch": 1.53, + "grad_norm": 0.7763509580356418, + "learning_rate": 1.3917439283481865e-06, + "loss": 0.5119, + "step": 11992 + }, + { + "epoch": 1.53, + "grad_norm": 0.6724843055458318, + "learning_rate": 1.3910298825428175e-06, + "loss": 0.482, + "step": 11993 + }, + { + "epoch": 1.53, + "grad_norm": 0.6716993553319668, + "learning_rate": 1.3903159903597728e-06, + "loss": 0.4846, + "step": 11994 + }, + { + "epoch": 1.53, + "grad_norm": 0.9784978355260202, + "learning_rate": 1.3896022518294428e-06, + "loss": 0.5521, + "step": 11995 + }, + { + "epoch": 1.53, + "grad_norm": 0.7067033772717441, + "learning_rate": 1.388888666982206e-06, + "loss": 0.5196, + "step": 11996 + }, + { + "epoch": 1.53, + "grad_norm": 0.7225292070001212, + "learning_rate": 1.388175235848439e-06, + "loss": 0.4127, + "step": 11997 + }, + { + "epoch": 1.53, + "grad_norm": 0.5543762154454209, + "learning_rate": 1.387461958458511e-06, + "loss": 0.3994, + "step": 11998 + }, + { + "epoch": 1.53, + "grad_norm": 0.599151813254321, + "learning_rate": 1.386748834842785e-06, + "loss": 0.4379, + "step": 11999 + }, + { + "epoch": 1.53, + "grad_norm": 0.5865043355783855, + "learning_rate": 1.386035865031613e-06, + "loss": 0.4301, + "step": 12000 + } + ], + "logging_steps": 1.0, + "max_steps": 15698, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "total_flos": 5811613688332288.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}