diff --git "a/parm/trainer_state.json" "b/parm/trainer_state.json" new file mode 100644--- /dev/null +++ "b/parm/trainer_state.json" @@ -0,0 +1,35280 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999006852716258, + "eval_steps": 500, + "global_step": 5034, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019862945674843578, + "grad_norm": 33.76269020949264, + "learning_rate": 6.578947368421053e-08, + "loss": 0.2359, + "step": 1 + }, + { + "epoch": 0.00039725891349687157, + "grad_norm": 34.06001813445965, + "learning_rate": 1.3157894736842107e-07, + "loss": 0.2755, + "step": 2 + }, + { + "epoch": 0.0005958883702453074, + "grad_norm": 35.82083527820956, + "learning_rate": 1.9736842105263157e-07, + "loss": 0.285, + "step": 3 + }, + { + "epoch": 0.0007945178269937431, + "grad_norm": 31.578743559628386, + "learning_rate": 2.6315789473684213e-07, + "loss": 0.2475, + "step": 4 + }, + { + "epoch": 0.000993147283742179, + "grad_norm": 30.615127259734468, + "learning_rate": 3.2894736842105264e-07, + "loss": 0.2229, + "step": 5 + }, + { + "epoch": 0.0011917767404906149, + "grad_norm": 44.64811115111308, + "learning_rate": 3.9473684210526315e-07, + "loss": 0.3314, + "step": 6 + }, + { + "epoch": 0.0013904061972390505, + "grad_norm": 44.34169533744506, + "learning_rate": 4.605263157894737e-07, + "loss": 0.2423, + "step": 7 + }, + { + "epoch": 0.0015890356539874863, + "grad_norm": 40.11109275785226, + "learning_rate": 5.263157894736843e-07, + "loss": 0.2239, + "step": 8 + }, + { + "epoch": 0.001787665110735922, + "grad_norm": 35.00420920021999, + "learning_rate": 5.921052631578947e-07, + "loss": 0.2018, + "step": 9 + }, + { + "epoch": 0.001986294567484358, + "grad_norm": 29.808999667191497, + "learning_rate": 6.578947368421053e-07, + "loss": 0.107, + "step": 10 + }, + { + "epoch": 0.0021849240242327937, + "grad_norm": 17.62819439139421, + "learning_rate": 7.236842105263158e-07, + "loss": 0.0941, + "step": 11 + }, + { + "epoch": 0.0023835534809812297, + "grad_norm": 12.95448401579392, + "learning_rate": 7.894736842105263e-07, + "loss": 0.0777, + "step": 12 + }, + { + "epoch": 0.0025821829377296653, + "grad_norm": 6.615340288787496, + "learning_rate": 8.55263157894737e-07, + "loss": 0.0557, + "step": 13 + }, + { + "epoch": 0.002780812394478101, + "grad_norm": 8.20493935951563, + "learning_rate": 9.210526315789474e-07, + "loss": 0.0618, + "step": 14 + }, + { + "epoch": 0.002979441851226537, + "grad_norm": 6.4854536788019495, + "learning_rate": 9.86842105263158e-07, + "loss": 0.0597, + "step": 15 + }, + { + "epoch": 0.0031780713079749725, + "grad_norm": 5.948598869884694, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.057, + "step": 16 + }, + { + "epoch": 0.0033767007647234086, + "grad_norm": 1.505342665823492, + "learning_rate": 1.118421052631579e-06, + "loss": 0.0389, + "step": 17 + }, + { + "epoch": 0.003575330221471844, + "grad_norm": 4.357709266078771, + "learning_rate": 1.1842105263157894e-06, + "loss": 0.0477, + "step": 18 + }, + { + "epoch": 0.00377395967822028, + "grad_norm": 4.173575780842314, + "learning_rate": 1.25e-06, + "loss": 0.0425, + "step": 19 + }, + { + "epoch": 0.003972589134968716, + "grad_norm": 4.034253079966795, + "learning_rate": 1.3157894736842106e-06, + "loss": 0.0433, + "step": 20 + }, + { + "epoch": 0.004171218591717152, + "grad_norm": 3.815661808462217, + "learning_rate": 1.3815789473684212e-06, + "loss": 0.0417, + "step": 21 + }, + { + "epoch": 0.004369848048465587, + "grad_norm": 1.9849545321722113, + "learning_rate": 1.4473684210526317e-06, + "loss": 0.0349, + "step": 22 + }, + { + "epoch": 0.004568477505214023, + "grad_norm": 0.704960225012627, + "learning_rate": 1.5131578947368421e-06, + "loss": 0.0355, + "step": 23 + }, + { + "epoch": 0.0047671069619624595, + "grad_norm": 2.738263124201555, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.0351, + "step": 24 + }, + { + "epoch": 0.004965736418710895, + "grad_norm": 3.7531322321385994, + "learning_rate": 1.6447368421052635e-06, + "loss": 0.0337, + "step": 25 + }, + { + "epoch": 0.005164365875459331, + "grad_norm": 3.3650206269293643, + "learning_rate": 1.710526315789474e-06, + "loss": 0.0403, + "step": 26 + }, + { + "epoch": 0.005362995332207766, + "grad_norm": 1.1524716199784135, + "learning_rate": 1.7763157894736844e-06, + "loss": 0.0188, + "step": 27 + }, + { + "epoch": 0.005561624788956202, + "grad_norm": 0.9219916829535164, + "learning_rate": 1.8421052631578948e-06, + "loss": 0.0292, + "step": 28 + }, + { + "epoch": 0.005760254245704638, + "grad_norm": 1.5976055474296411, + "learning_rate": 1.9078947368421057e-06, + "loss": 0.0311, + "step": 29 + }, + { + "epoch": 0.005958883702453074, + "grad_norm": 1.7121779218172795, + "learning_rate": 1.973684210526316e-06, + "loss": 0.0267, + "step": 30 + }, + { + "epoch": 0.0061575131592015095, + "grad_norm": 1.4089365136243617, + "learning_rate": 2.0394736842105266e-06, + "loss": 0.0271, + "step": 31 + }, + { + "epoch": 0.006356142615949945, + "grad_norm": 1.2326619516469113, + "learning_rate": 2.105263157894737e-06, + "loss": 0.0213, + "step": 32 + }, + { + "epoch": 0.0065547720726983815, + "grad_norm": 2.5437119531443457, + "learning_rate": 2.1710526315789475e-06, + "loss": 0.0358, + "step": 33 + }, + { + "epoch": 0.006753401529446817, + "grad_norm": 3.097227411879207, + "learning_rate": 2.236842105263158e-06, + "loss": 0.026, + "step": 34 + }, + { + "epoch": 0.006952030986195253, + "grad_norm": 1.5225819387838984, + "learning_rate": 2.3026315789473684e-06, + "loss": 0.0352, + "step": 35 + }, + { + "epoch": 0.007150660442943688, + "grad_norm": 1.9028774569921925, + "learning_rate": 2.368421052631579e-06, + "loss": 0.0249, + "step": 36 + }, + { + "epoch": 0.007349289899692124, + "grad_norm": 1.3413799412407437, + "learning_rate": 2.4342105263157898e-06, + "loss": 0.023, + "step": 37 + }, + { + "epoch": 0.00754791935644056, + "grad_norm": 0.7403867166496259, + "learning_rate": 2.5e-06, + "loss": 0.0179, + "step": 38 + }, + { + "epoch": 0.007746548813188996, + "grad_norm": 1.5339334077472897, + "learning_rate": 2.565789473684211e-06, + "loss": 0.026, + "step": 39 + }, + { + "epoch": 0.007945178269937432, + "grad_norm": 1.4657589777219318, + "learning_rate": 2.631578947368421e-06, + "loss": 0.0195, + "step": 40 + }, + { + "epoch": 0.008143807726685867, + "grad_norm": 1.3319716700261164, + "learning_rate": 2.697368421052632e-06, + "loss": 0.0291, + "step": 41 + }, + { + "epoch": 0.008342437183434304, + "grad_norm": 1.1874773600556614, + "learning_rate": 2.7631578947368424e-06, + "loss": 0.0259, + "step": 42 + }, + { + "epoch": 0.008541066640182738, + "grad_norm": 0.6674176354529916, + "learning_rate": 2.828947368421053e-06, + "loss": 0.0203, + "step": 43 + }, + { + "epoch": 0.008739696096931175, + "grad_norm": 1.1733771223638585, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.026, + "step": 44 + }, + { + "epoch": 0.008938325553679611, + "grad_norm": 1.7753106520406625, + "learning_rate": 2.960526315789474e-06, + "loss": 0.0247, + "step": 45 + }, + { + "epoch": 0.009136955010428046, + "grad_norm": 0.9432853830345364, + "learning_rate": 3.0263157894736843e-06, + "loss": 0.0257, + "step": 46 + }, + { + "epoch": 0.009335584467176482, + "grad_norm": 3.5238268219753857, + "learning_rate": 3.092105263157895e-06, + "loss": 0.024, + "step": 47 + }, + { + "epoch": 0.009534213923924919, + "grad_norm": 4.0482869582706975, + "learning_rate": 3.157894736842105e-06, + "loss": 0.0279, + "step": 48 + }, + { + "epoch": 0.009732843380673354, + "grad_norm": 1.9309599368634551, + "learning_rate": 3.223684210526316e-06, + "loss": 0.0307, + "step": 49 + }, + { + "epoch": 0.00993147283742179, + "grad_norm": 2.2369624906638355, + "learning_rate": 3.289473684210527e-06, + "loss": 0.0192, + "step": 50 + }, + { + "epoch": 0.010130102294170225, + "grad_norm": 1.9485240974125297, + "learning_rate": 3.355263157894737e-06, + "loss": 0.0286, + "step": 51 + }, + { + "epoch": 0.010328731750918661, + "grad_norm": 2.7566490930720473, + "learning_rate": 3.421052631578948e-06, + "loss": 0.0327, + "step": 52 + }, + { + "epoch": 0.010527361207667098, + "grad_norm": 0.9111515455287649, + "learning_rate": 3.486842105263158e-06, + "loss": 0.0266, + "step": 53 + }, + { + "epoch": 0.010725990664415532, + "grad_norm": 1.0182592431220034, + "learning_rate": 3.5526315789473687e-06, + "loss": 0.0159, + "step": 54 + }, + { + "epoch": 0.010924620121163969, + "grad_norm": 1.3689812247740223, + "learning_rate": 3.618421052631579e-06, + "loss": 0.0236, + "step": 55 + }, + { + "epoch": 0.011123249577912404, + "grad_norm": 1.415669213506178, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.0192, + "step": 56 + }, + { + "epoch": 0.01132187903466084, + "grad_norm": 2.727013850743499, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0271, + "step": 57 + }, + { + "epoch": 0.011520508491409277, + "grad_norm": 21.142839252245647, + "learning_rate": 3.815789473684211e-06, + "loss": 0.0233, + "step": 58 + }, + { + "epoch": 0.011719137948157711, + "grad_norm": 1.1229291902544376, + "learning_rate": 3.8815789473684214e-06, + "loss": 0.0183, + "step": 59 + }, + { + "epoch": 0.011917767404906148, + "grad_norm": 1.196900422199011, + "learning_rate": 3.947368421052632e-06, + "loss": 0.0245, + "step": 60 + }, + { + "epoch": 0.012116396861654583, + "grad_norm": 1.356686303430086, + "learning_rate": 4.013157894736842e-06, + "loss": 0.0104, + "step": 61 + }, + { + "epoch": 0.012315026318403019, + "grad_norm": 1.0363309431722993, + "learning_rate": 4.078947368421053e-06, + "loss": 0.0162, + "step": 62 + }, + { + "epoch": 0.012513655775151455, + "grad_norm": 1.068841018945078, + "learning_rate": 4.144736842105263e-06, + "loss": 0.0215, + "step": 63 + }, + { + "epoch": 0.01271228523189989, + "grad_norm": 1.3644470124019088, + "learning_rate": 4.210526315789474e-06, + "loss": 0.0308, + "step": 64 + }, + { + "epoch": 0.012910914688648327, + "grad_norm": 1.4539722940904474, + "learning_rate": 4.276315789473684e-06, + "loss": 0.0234, + "step": 65 + }, + { + "epoch": 0.013109544145396763, + "grad_norm": 1.8771694709283084, + "learning_rate": 4.342105263157895e-06, + "loss": 0.0279, + "step": 66 + }, + { + "epoch": 0.013308173602145198, + "grad_norm": 1.56691962640963, + "learning_rate": 4.407894736842105e-06, + "loss": 0.0224, + "step": 67 + }, + { + "epoch": 0.013506803058893634, + "grad_norm": 1.0323510479136595, + "learning_rate": 4.473684210526316e-06, + "loss": 0.0223, + "step": 68 + }, + { + "epoch": 0.013705432515642069, + "grad_norm": 2.329638995795258, + "learning_rate": 4.539473684210527e-06, + "loss": 0.0278, + "step": 69 + }, + { + "epoch": 0.013904061972390505, + "grad_norm": 1.1328939658071375, + "learning_rate": 4.605263157894737e-06, + "loss": 0.0262, + "step": 70 + }, + { + "epoch": 0.014102691429138942, + "grad_norm": 1.5789619228369665, + "learning_rate": 4.671052631578948e-06, + "loss": 0.0226, + "step": 71 + }, + { + "epoch": 0.014301320885887377, + "grad_norm": 0.590549283191568, + "learning_rate": 4.736842105263158e-06, + "loss": 0.0222, + "step": 72 + }, + { + "epoch": 0.014499950342635813, + "grad_norm": 2.9484330972444797, + "learning_rate": 4.802631578947369e-06, + "loss": 0.0289, + "step": 73 + }, + { + "epoch": 0.014698579799384248, + "grad_norm": 1.7556389255244587, + "learning_rate": 4.8684210526315795e-06, + "loss": 0.0253, + "step": 74 + }, + { + "epoch": 0.014897209256132684, + "grad_norm": 0.8175815824911603, + "learning_rate": 4.9342105263157895e-06, + "loss": 0.023, + "step": 75 + }, + { + "epoch": 0.01509583871288112, + "grad_norm": 1.9543147397030654, + "learning_rate": 5e-06, + "loss": 0.024, + "step": 76 + }, + { + "epoch": 0.015294468169629555, + "grad_norm": 1.411220111931974, + "learning_rate": 5.0657894736842104e-06, + "loss": 0.0213, + "step": 77 + }, + { + "epoch": 0.015493097626377992, + "grad_norm": 2.5787589856867474, + "learning_rate": 5.131578947368422e-06, + "loss": 0.0194, + "step": 78 + }, + { + "epoch": 0.01569172708312643, + "grad_norm": 1.1433613974937111, + "learning_rate": 5.197368421052632e-06, + "loss": 0.0203, + "step": 79 + }, + { + "epoch": 0.015890356539874865, + "grad_norm": 2.943413275721806, + "learning_rate": 5.263157894736842e-06, + "loss": 0.0202, + "step": 80 + }, + { + "epoch": 0.016088985996623298, + "grad_norm": 1.2895517559346665, + "learning_rate": 5.328947368421054e-06, + "loss": 0.0175, + "step": 81 + }, + { + "epoch": 0.016287615453371734, + "grad_norm": 1.2902481789966305, + "learning_rate": 5.394736842105264e-06, + "loss": 0.0273, + "step": 82 + }, + { + "epoch": 0.01648624491012017, + "grad_norm": 1.0623933737227271, + "learning_rate": 5.460526315789474e-06, + "loss": 0.0226, + "step": 83 + }, + { + "epoch": 0.016684874366868607, + "grad_norm": 1.551352914024002, + "learning_rate": 5.526315789473685e-06, + "loss": 0.0227, + "step": 84 + }, + { + "epoch": 0.016883503823617044, + "grad_norm": 1.35068564721811, + "learning_rate": 5.592105263157896e-06, + "loss": 0.0176, + "step": 85 + }, + { + "epoch": 0.017082133280365477, + "grad_norm": 0.7178211749767736, + "learning_rate": 5.657894736842106e-06, + "loss": 0.0161, + "step": 86 + }, + { + "epoch": 0.017280762737113913, + "grad_norm": 1.3876820110580605, + "learning_rate": 5.723684210526316e-06, + "loss": 0.0211, + "step": 87 + }, + { + "epoch": 0.01747939219386235, + "grad_norm": 0.9950100462947198, + "learning_rate": 5.789473684210527e-06, + "loss": 0.0236, + "step": 88 + }, + { + "epoch": 0.017678021650610786, + "grad_norm": 2.4968422535588806, + "learning_rate": 5.855263157894738e-06, + "loss": 0.0288, + "step": 89 + }, + { + "epoch": 0.017876651107359223, + "grad_norm": 2.6434743230683515, + "learning_rate": 5.921052631578948e-06, + "loss": 0.03, + "step": 90 + }, + { + "epoch": 0.018075280564107656, + "grad_norm": 0.7671799725100394, + "learning_rate": 5.9868421052631585e-06, + "loss": 0.0303, + "step": 91 + }, + { + "epoch": 0.018273910020856092, + "grad_norm": 1.4218642540045263, + "learning_rate": 6.0526315789473685e-06, + "loss": 0.0265, + "step": 92 + }, + { + "epoch": 0.01847253947760453, + "grad_norm": 1.1607669862480212, + "learning_rate": 6.118421052631579e-06, + "loss": 0.0273, + "step": 93 + }, + { + "epoch": 0.018671168934352965, + "grad_norm": 0.5272958683372639, + "learning_rate": 6.18421052631579e-06, + "loss": 0.0208, + "step": 94 + }, + { + "epoch": 0.0188697983911014, + "grad_norm": 0.6384994881210974, + "learning_rate": 6.25e-06, + "loss": 0.0193, + "step": 95 + }, + { + "epoch": 0.019068427847849838, + "grad_norm": 0.4305083797700706, + "learning_rate": 6.31578947368421e-06, + "loss": 0.0155, + "step": 96 + }, + { + "epoch": 0.01926705730459827, + "grad_norm": 1.5613150520545582, + "learning_rate": 6.381578947368422e-06, + "loss": 0.0287, + "step": 97 + }, + { + "epoch": 0.019465686761346707, + "grad_norm": 1.207629220791821, + "learning_rate": 6.447368421052632e-06, + "loss": 0.0126, + "step": 98 + }, + { + "epoch": 0.019664316218095144, + "grad_norm": 2.0635349948166946, + "learning_rate": 6.513157894736842e-06, + "loss": 0.0267, + "step": 99 + }, + { + "epoch": 0.01986294567484358, + "grad_norm": 1.725733944785907, + "learning_rate": 6.578947368421054e-06, + "loss": 0.0307, + "step": 100 + }, + { + "epoch": 0.020061575131592017, + "grad_norm": 0.7480249394267019, + "learning_rate": 6.644736842105264e-06, + "loss": 0.0299, + "step": 101 + }, + { + "epoch": 0.02026020458834045, + "grad_norm": 3.894533969012179, + "learning_rate": 6.710526315789474e-06, + "loss": 0.036, + "step": 102 + }, + { + "epoch": 0.020458834045088886, + "grad_norm": 1.9782432860217083, + "learning_rate": 6.776315789473686e-06, + "loss": 0.019, + "step": 103 + }, + { + "epoch": 0.020657463501837323, + "grad_norm": 2.1605252856710577, + "learning_rate": 6.842105263157896e-06, + "loss": 0.0147, + "step": 104 + }, + { + "epoch": 0.02085609295858576, + "grad_norm": 1.4740136865902298, + "learning_rate": 6.907894736842106e-06, + "loss": 0.0225, + "step": 105 + }, + { + "epoch": 0.021054722415334196, + "grad_norm": 0.6354162758488423, + "learning_rate": 6.973684210526316e-06, + "loss": 0.0248, + "step": 106 + }, + { + "epoch": 0.02125335187208263, + "grad_norm": 2.7836091692257434, + "learning_rate": 7.0394736842105274e-06, + "loss": 0.0239, + "step": 107 + }, + { + "epoch": 0.021451981328831065, + "grad_norm": 1.1359242167543848, + "learning_rate": 7.1052631578947375e-06, + "loss": 0.0202, + "step": 108 + }, + { + "epoch": 0.0216506107855795, + "grad_norm": 0.9730451625121374, + "learning_rate": 7.1710526315789475e-06, + "loss": 0.0164, + "step": 109 + }, + { + "epoch": 0.021849240242327938, + "grad_norm": 0.73973440231317, + "learning_rate": 7.236842105263158e-06, + "loss": 0.0208, + "step": 110 + }, + { + "epoch": 0.022047869699076374, + "grad_norm": 2.699646318968425, + "learning_rate": 7.302631578947369e-06, + "loss": 0.0233, + "step": 111 + }, + { + "epoch": 0.022246499155824807, + "grad_norm": 1.2218290054724747, + "learning_rate": 7.368421052631579e-06, + "loss": 0.0194, + "step": 112 + }, + { + "epoch": 0.022445128612573244, + "grad_norm": 3.2868701187822915, + "learning_rate": 7.43421052631579e-06, + "loss": 0.0339, + "step": 113 + }, + { + "epoch": 0.02264375806932168, + "grad_norm": 2.5641439239722814, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0286, + "step": 114 + }, + { + "epoch": 0.022842387526070117, + "grad_norm": 0.6952134707398119, + "learning_rate": 7.565789473684211e-06, + "loss": 0.0212, + "step": 115 + }, + { + "epoch": 0.023041016982818553, + "grad_norm": 2.011954955233892, + "learning_rate": 7.631578947368423e-06, + "loss": 0.0259, + "step": 116 + }, + { + "epoch": 0.023239646439566986, + "grad_norm": 0.7803532263465452, + "learning_rate": 7.697368421052632e-06, + "loss": 0.016, + "step": 117 + }, + { + "epoch": 0.023438275896315423, + "grad_norm": 1.9847980365877933, + "learning_rate": 7.763157894736843e-06, + "loss": 0.0195, + "step": 118 + }, + { + "epoch": 0.02363690535306386, + "grad_norm": 1.1441961651491355, + "learning_rate": 7.828947368421054e-06, + "loss": 0.0211, + "step": 119 + }, + { + "epoch": 0.023835534809812296, + "grad_norm": 0.6232275208284637, + "learning_rate": 7.894736842105265e-06, + "loss": 0.0155, + "step": 120 + }, + { + "epoch": 0.024034164266560732, + "grad_norm": 1.8988678704709767, + "learning_rate": 7.960526315789474e-06, + "loss": 0.0146, + "step": 121 + }, + { + "epoch": 0.024232793723309165, + "grad_norm": 2.105837941919798, + "learning_rate": 8.026315789473685e-06, + "loss": 0.0295, + "step": 122 + }, + { + "epoch": 0.0244314231800576, + "grad_norm": 1.3171076410792975, + "learning_rate": 8.092105263157896e-06, + "loss": 0.0264, + "step": 123 + }, + { + "epoch": 0.024630052636806038, + "grad_norm": 1.5991904078839954, + "learning_rate": 8.157894736842106e-06, + "loss": 0.0244, + "step": 124 + }, + { + "epoch": 0.024828682093554474, + "grad_norm": 1.1618318045886942, + "learning_rate": 8.223684210526316e-06, + "loss": 0.0201, + "step": 125 + }, + { + "epoch": 0.02502731155030291, + "grad_norm": 0.800623192132822, + "learning_rate": 8.289473684210526e-06, + "loss": 0.0177, + "step": 126 + }, + { + "epoch": 0.025225941007051347, + "grad_norm": 1.6211464121453392, + "learning_rate": 8.355263157894737e-06, + "loss": 0.0155, + "step": 127 + }, + { + "epoch": 0.02542457046379978, + "grad_norm": 0.7412717741680597, + "learning_rate": 8.421052631578948e-06, + "loss": 0.0118, + "step": 128 + }, + { + "epoch": 0.025623199920548217, + "grad_norm": 0.7011545084516744, + "learning_rate": 8.486842105263159e-06, + "loss": 0.0197, + "step": 129 + }, + { + "epoch": 0.025821829377296653, + "grad_norm": 1.5277548264021221, + "learning_rate": 8.552631578947368e-06, + "loss": 0.0139, + "step": 130 + }, + { + "epoch": 0.02602045883404509, + "grad_norm": 0.8807370373367641, + "learning_rate": 8.61842105263158e-06, + "loss": 0.0311, + "step": 131 + }, + { + "epoch": 0.026219088290793526, + "grad_norm": 1.07819195884304, + "learning_rate": 8.68421052631579e-06, + "loss": 0.0318, + "step": 132 + }, + { + "epoch": 0.02641771774754196, + "grad_norm": 0.696442786177907, + "learning_rate": 8.750000000000001e-06, + "loss": 0.0234, + "step": 133 + }, + { + "epoch": 0.026616347204290396, + "grad_norm": 0.748931405611534, + "learning_rate": 8.81578947368421e-06, + "loss": 0.0216, + "step": 134 + }, + { + "epoch": 0.026814976661038832, + "grad_norm": 0.8573350069478229, + "learning_rate": 8.881578947368423e-06, + "loss": 0.0291, + "step": 135 + }, + { + "epoch": 0.02701360611778727, + "grad_norm": 0.9745147210519538, + "learning_rate": 8.947368421052632e-06, + "loss": 0.0239, + "step": 136 + }, + { + "epoch": 0.027212235574535705, + "grad_norm": 0.9802008111815383, + "learning_rate": 9.013157894736843e-06, + "loss": 0.0209, + "step": 137 + }, + { + "epoch": 0.027410865031284138, + "grad_norm": 2.330229398267877, + "learning_rate": 9.078947368421054e-06, + "loss": 0.0244, + "step": 138 + }, + { + "epoch": 0.027609494488032574, + "grad_norm": 1.0656034579377593, + "learning_rate": 9.144736842105264e-06, + "loss": 0.0159, + "step": 139 + }, + { + "epoch": 0.02780812394478101, + "grad_norm": 2.0077938210120796, + "learning_rate": 9.210526315789474e-06, + "loss": 0.0199, + "step": 140 + }, + { + "epoch": 0.028006753401529447, + "grad_norm": 2.635982865291799, + "learning_rate": 9.276315789473686e-06, + "loss": 0.0276, + "step": 141 + }, + { + "epoch": 0.028205382858277884, + "grad_norm": 1.3234128327058614, + "learning_rate": 9.342105263157895e-06, + "loss": 0.0197, + "step": 142 + }, + { + "epoch": 0.028404012315026317, + "grad_norm": 2.8381965627330588, + "learning_rate": 9.407894736842106e-06, + "loss": 0.0256, + "step": 143 + }, + { + "epoch": 0.028602641771774753, + "grad_norm": 2.2353630682958237, + "learning_rate": 9.473684210526315e-06, + "loss": 0.0263, + "step": 144 + }, + { + "epoch": 0.02880127122852319, + "grad_norm": 2.609402334542464, + "learning_rate": 9.539473684210528e-06, + "loss": 0.0208, + "step": 145 + }, + { + "epoch": 0.028999900685271626, + "grad_norm": 1.6433963174070783, + "learning_rate": 9.605263157894737e-06, + "loss": 0.0156, + "step": 146 + }, + { + "epoch": 0.029198530142020063, + "grad_norm": 1.2120241439087565, + "learning_rate": 9.671052631578948e-06, + "loss": 0.0247, + "step": 147 + }, + { + "epoch": 0.029397159598768496, + "grad_norm": 2.393880841544898, + "learning_rate": 9.736842105263159e-06, + "loss": 0.0199, + "step": 148 + }, + { + "epoch": 0.029595789055516932, + "grad_norm": 1.147929304198294, + "learning_rate": 9.80263157894737e-06, + "loss": 0.0142, + "step": 149 + }, + { + "epoch": 0.02979441851226537, + "grad_norm": 0.7301143117977119, + "learning_rate": 9.868421052631579e-06, + "loss": 0.0268, + "step": 150 + }, + { + "epoch": 0.029993047969013805, + "grad_norm": 0.7620302081891749, + "learning_rate": 9.93421052631579e-06, + "loss": 0.0209, + "step": 151 + }, + { + "epoch": 0.03019167742576224, + "grad_norm": 1.6325286682577373, + "learning_rate": 1e-05, + "loss": 0.0165, + "step": 152 + }, + { + "epoch": 0.030390306882510678, + "grad_norm": 0.8520608847697756, + "learning_rate": 9.999998964752504e-06, + "loss": 0.023, + "step": 153 + }, + { + "epoch": 0.03058893633925911, + "grad_norm": 0.7134823765486196, + "learning_rate": 9.999995859010444e-06, + "loss": 0.0213, + "step": 154 + }, + { + "epoch": 0.030787565796007547, + "grad_norm": 1.4140312748946944, + "learning_rate": 9.999990682775105e-06, + "loss": 0.0273, + "step": 155 + }, + { + "epoch": 0.030986195252755984, + "grad_norm": 0.9491683828558153, + "learning_rate": 9.999983436048632e-06, + "loss": 0.0222, + "step": 156 + }, + { + "epoch": 0.03118482470950442, + "grad_norm": 1.3782834621848912, + "learning_rate": 9.999974118834025e-06, + "loss": 0.0162, + "step": 157 + }, + { + "epoch": 0.03138345416625286, + "grad_norm": 0.9830166709134077, + "learning_rate": 9.999962731135142e-06, + "loss": 0.017, + "step": 158 + }, + { + "epoch": 0.03158208362300129, + "grad_norm": 2.218321788640668, + "learning_rate": 9.999949272956699e-06, + "loss": 0.0184, + "step": 159 + }, + { + "epoch": 0.03178071307974973, + "grad_norm": 3.6396168680949224, + "learning_rate": 9.99993374430427e-06, + "loss": 0.033, + "step": 160 + }, + { + "epoch": 0.03197934253649816, + "grad_norm": 1.701075218546414, + "learning_rate": 9.999916145184286e-06, + "loss": 0.0191, + "step": 161 + }, + { + "epoch": 0.032177971993246596, + "grad_norm": 1.0244474776048476, + "learning_rate": 9.999896475604029e-06, + "loss": 0.0164, + "step": 162 + }, + { + "epoch": 0.03237660144999503, + "grad_norm": 1.7222754194926513, + "learning_rate": 9.999874735571652e-06, + "loss": 0.0385, + "step": 163 + }, + { + "epoch": 0.03257523090674347, + "grad_norm": 2.5845547766369417, + "learning_rate": 9.999850925096153e-06, + "loss": 0.0331, + "step": 164 + }, + { + "epoch": 0.032773860363491905, + "grad_norm": 1.878569691120879, + "learning_rate": 9.999825044187392e-06, + "loss": 0.0254, + "step": 165 + }, + { + "epoch": 0.03297248982024034, + "grad_norm": 0.9378834395475585, + "learning_rate": 9.999797092856089e-06, + "loss": 0.0262, + "step": 166 + }, + { + "epoch": 0.03317111927698878, + "grad_norm": 0.9618425366151521, + "learning_rate": 9.999767071113815e-06, + "loss": 0.017, + "step": 167 + }, + { + "epoch": 0.033369748733737215, + "grad_norm": 1.7287253946197272, + "learning_rate": 9.999734978973006e-06, + "loss": 0.0218, + "step": 168 + }, + { + "epoch": 0.03356837819048565, + "grad_norm": 0.25580217975967984, + "learning_rate": 9.999700816446947e-06, + "loss": 0.0104, + "step": 169 + }, + { + "epoch": 0.03376700764723409, + "grad_norm": 0.7079216618175301, + "learning_rate": 9.999664583549788e-06, + "loss": 0.0198, + "step": 170 + }, + { + "epoch": 0.033965637103982524, + "grad_norm": 0.5742676426294199, + "learning_rate": 9.99962628029653e-06, + "loss": 0.0261, + "step": 171 + }, + { + "epoch": 0.03416426656073095, + "grad_norm": 0.4760990748227476, + "learning_rate": 9.999585906703038e-06, + "loss": 0.0223, + "step": 172 + }, + { + "epoch": 0.03436289601747939, + "grad_norm": 0.6609769994751641, + "learning_rate": 9.999543462786028e-06, + "loss": 0.0194, + "step": 173 + }, + { + "epoch": 0.034561525474227826, + "grad_norm": 1.0985318270522382, + "learning_rate": 9.999498948563076e-06, + "loss": 0.0153, + "step": 174 + }, + { + "epoch": 0.03476015493097626, + "grad_norm": 0.8636922880587703, + "learning_rate": 9.999452364052618e-06, + "loss": 0.0243, + "step": 175 + }, + { + "epoch": 0.0349587843877247, + "grad_norm": 0.5255774215285303, + "learning_rate": 9.99940370927394e-06, + "loss": 0.0253, + "step": 176 + }, + { + "epoch": 0.035157413844473136, + "grad_norm": 0.9472944515341319, + "learning_rate": 9.999352984247196e-06, + "loss": 0.0172, + "step": 177 + }, + { + "epoch": 0.03535604330122157, + "grad_norm": 1.7710708337619419, + "learning_rate": 9.999300188993384e-06, + "loss": 0.0209, + "step": 178 + }, + { + "epoch": 0.03555467275797001, + "grad_norm": 1.62355032296528, + "learning_rate": 9.999245323534372e-06, + "loss": 0.0183, + "step": 179 + }, + { + "epoch": 0.035753302214718445, + "grad_norm": 1.6780041194910134, + "learning_rate": 9.999188387892878e-06, + "loss": 0.0184, + "step": 180 + }, + { + "epoch": 0.03595193167146688, + "grad_norm": 0.960332672150208, + "learning_rate": 9.999129382092478e-06, + "loss": 0.0121, + "step": 181 + }, + { + "epoch": 0.03615056112821531, + "grad_norm": 0.9539342476315544, + "learning_rate": 9.999068306157607e-06, + "loss": 0.0134, + "step": 182 + }, + { + "epoch": 0.03634919058496375, + "grad_norm": 1.6183931386612114, + "learning_rate": 9.999005160113558e-06, + "loss": 0.0168, + "step": 183 + }, + { + "epoch": 0.036547820041712184, + "grad_norm": 0.9338458758519154, + "learning_rate": 9.998939943986476e-06, + "loss": 0.022, + "step": 184 + }, + { + "epoch": 0.03674644949846062, + "grad_norm": 1.9345051819920607, + "learning_rate": 9.998872657803371e-06, + "loss": 0.0206, + "step": 185 + }, + { + "epoch": 0.03694507895520906, + "grad_norm": 1.1603558183362468, + "learning_rate": 9.998803301592105e-06, + "loss": 0.014, + "step": 186 + }, + { + "epoch": 0.03714370841195749, + "grad_norm": 0.9129479190883742, + "learning_rate": 9.998731875381398e-06, + "loss": 0.0288, + "step": 187 + }, + { + "epoch": 0.03734233786870593, + "grad_norm": 0.9875394679980506, + "learning_rate": 9.998658379200826e-06, + "loss": 0.0302, + "step": 188 + }, + { + "epoch": 0.037540967325454366, + "grad_norm": 1.0324491864294387, + "learning_rate": 9.998582813080824e-06, + "loss": 0.0157, + "step": 189 + }, + { + "epoch": 0.0377395967822028, + "grad_norm": 2.5399121017352098, + "learning_rate": 9.998505177052686e-06, + "loss": 0.0183, + "step": 190 + }, + { + "epoch": 0.03793822623895124, + "grad_norm": 1.0608993845566375, + "learning_rate": 9.99842547114856e-06, + "loss": 0.0142, + "step": 191 + }, + { + "epoch": 0.038136855695699676, + "grad_norm": 0.3873255475411284, + "learning_rate": 9.99834369540145e-06, + "loss": 0.0179, + "step": 192 + }, + { + "epoch": 0.038335485152448105, + "grad_norm": 0.580154672924063, + "learning_rate": 9.998259849845224e-06, + "loss": 0.0163, + "step": 193 + }, + { + "epoch": 0.03853411460919654, + "grad_norm": 0.5232842757349848, + "learning_rate": 9.9981739345146e-06, + "loss": 0.021, + "step": 194 + }, + { + "epoch": 0.03873274406594498, + "grad_norm": 0.5816313182605903, + "learning_rate": 9.998085949445154e-06, + "loss": 0.0122, + "step": 195 + }, + { + "epoch": 0.038931373522693415, + "grad_norm": 2.219159268458382, + "learning_rate": 9.99799589467332e-06, + "loss": 0.0192, + "step": 196 + }, + { + "epoch": 0.03913000297944185, + "grad_norm": 2.0126869985493956, + "learning_rate": 9.997903770236393e-06, + "loss": 0.0222, + "step": 197 + }, + { + "epoch": 0.03932863243619029, + "grad_norm": 0.8427003722810853, + "learning_rate": 9.99780957617252e-06, + "loss": 0.0155, + "step": 198 + }, + { + "epoch": 0.039527261892938724, + "grad_norm": 1.839988273195168, + "learning_rate": 9.997713312520703e-06, + "loss": 0.0177, + "step": 199 + }, + { + "epoch": 0.03972589134968716, + "grad_norm": 2.200388293444912, + "learning_rate": 9.99761497932081e-06, + "loss": 0.0313, + "step": 200 + }, + { + "epoch": 0.0399245208064356, + "grad_norm": 0.9877299401379769, + "learning_rate": 9.997514576613561e-06, + "loss": 0.0157, + "step": 201 + }, + { + "epoch": 0.04012315026318403, + "grad_norm": 1.1088588253099985, + "learning_rate": 9.99741210444053e-06, + "loss": 0.0198, + "step": 202 + }, + { + "epoch": 0.04032177971993246, + "grad_norm": 1.7028868040419975, + "learning_rate": 9.997307562844148e-06, + "loss": 0.0196, + "step": 203 + }, + { + "epoch": 0.0405204091766809, + "grad_norm": 1.2881131253386358, + "learning_rate": 9.997200951867711e-06, + "loss": 0.022, + "step": 204 + }, + { + "epoch": 0.040719038633429336, + "grad_norm": 1.1206602404111856, + "learning_rate": 9.997092271555364e-06, + "loss": 0.03, + "step": 205 + }, + { + "epoch": 0.04091766809017777, + "grad_norm": 0.9055361093963828, + "learning_rate": 9.996981521952111e-06, + "loss": 0.0166, + "step": 206 + }, + { + "epoch": 0.04111629754692621, + "grad_norm": 2.5005202676664315, + "learning_rate": 9.996868703103815e-06, + "loss": 0.0267, + "step": 207 + }, + { + "epoch": 0.041314927003674645, + "grad_norm": 1.6337249581649744, + "learning_rate": 9.996753815057191e-06, + "loss": 0.0191, + "step": 208 + }, + { + "epoch": 0.04151355646042308, + "grad_norm": 0.7101831965292786, + "learning_rate": 9.996636857859818e-06, + "loss": 0.0314, + "step": 209 + }, + { + "epoch": 0.04171218591717152, + "grad_norm": 1.5966574187467166, + "learning_rate": 9.996517831560123e-06, + "loss": 0.0264, + "step": 210 + }, + { + "epoch": 0.041910815373919955, + "grad_norm": 1.8007214636848605, + "learning_rate": 9.9963967362074e-06, + "loss": 0.0231, + "step": 211 + }, + { + "epoch": 0.04210944483066839, + "grad_norm": 3.752484634788327, + "learning_rate": 9.996273571851793e-06, + "loss": 0.0257, + "step": 212 + }, + { + "epoch": 0.04230807428741682, + "grad_norm": 2.618494261051629, + "learning_rate": 9.996148338544302e-06, + "loss": 0.0238, + "step": 213 + }, + { + "epoch": 0.04250670374416526, + "grad_norm": 1.9116871575698093, + "learning_rate": 9.996021036336786e-06, + "loss": 0.0211, + "step": 214 + }, + { + "epoch": 0.042705333200913694, + "grad_norm": 0.5453225145113627, + "learning_rate": 9.995891665281965e-06, + "loss": 0.02, + "step": 215 + }, + { + "epoch": 0.04290396265766213, + "grad_norm": 1.6984059813637056, + "learning_rate": 9.995760225433407e-06, + "loss": 0.0235, + "step": 216 + }, + { + "epoch": 0.043102592114410566, + "grad_norm": 1.4880871765884156, + "learning_rate": 9.995626716845541e-06, + "loss": 0.0168, + "step": 217 + }, + { + "epoch": 0.043301221571159, + "grad_norm": 1.4047456117126256, + "learning_rate": 9.995491139573657e-06, + "loss": 0.0316, + "step": 218 + }, + { + "epoch": 0.04349985102790744, + "grad_norm": 1.2909976897550328, + "learning_rate": 9.995353493673892e-06, + "loss": 0.0137, + "step": 219 + }, + { + "epoch": 0.043698480484655876, + "grad_norm": 0.6501623439351141, + "learning_rate": 9.99521377920325e-06, + "loss": 0.0161, + "step": 220 + }, + { + "epoch": 0.04389710994140431, + "grad_norm": 0.8580190311656244, + "learning_rate": 9.995071996219584e-06, + "loss": 0.018, + "step": 221 + }, + { + "epoch": 0.04409573939815275, + "grad_norm": 1.2921080572385664, + "learning_rate": 9.994928144781607e-06, + "loss": 0.0129, + "step": 222 + }, + { + "epoch": 0.044294368854901185, + "grad_norm": 1.8388869881713987, + "learning_rate": 9.994782224948885e-06, + "loss": 0.0244, + "step": 223 + }, + { + "epoch": 0.044492998311649615, + "grad_norm": 0.8904350028720864, + "learning_rate": 9.994634236781845e-06, + "loss": 0.0285, + "step": 224 + }, + { + "epoch": 0.04469162776839805, + "grad_norm": 1.0351479493656937, + "learning_rate": 9.994484180341773e-06, + "loss": 0.0129, + "step": 225 + }, + { + "epoch": 0.04489025722514649, + "grad_norm": 1.6356872953946395, + "learning_rate": 9.994332055690801e-06, + "loss": 0.0172, + "step": 226 + }, + { + "epoch": 0.045088886681894924, + "grad_norm": 2.4427456672158785, + "learning_rate": 9.994177862891927e-06, + "loss": 0.0262, + "step": 227 + }, + { + "epoch": 0.04528751613864336, + "grad_norm": 1.1833855069108572, + "learning_rate": 9.994021602009001e-06, + "loss": 0.0212, + "step": 228 + }, + { + "epoch": 0.0454861455953918, + "grad_norm": 1.3424482002706264, + "learning_rate": 9.99386327310673e-06, + "loss": 0.013, + "step": 229 + }, + { + "epoch": 0.045684775052140233, + "grad_norm": 1.9959735911469776, + "learning_rate": 9.99370287625068e-06, + "loss": 0.0136, + "step": 230 + }, + { + "epoch": 0.04588340450888867, + "grad_norm": 1.0872172455494484, + "learning_rate": 9.99354041150727e-06, + "loss": 0.0189, + "step": 231 + }, + { + "epoch": 0.046082033965637106, + "grad_norm": 0.8886861060907549, + "learning_rate": 9.993375878943775e-06, + "loss": 0.0248, + "step": 232 + }, + { + "epoch": 0.04628066342238554, + "grad_norm": 1.1111175252979908, + "learning_rate": 9.99320927862833e-06, + "loss": 0.0207, + "step": 233 + }, + { + "epoch": 0.04647929287913397, + "grad_norm": 0.351030054160361, + "learning_rate": 9.993040610629923e-06, + "loss": 0.0169, + "step": 234 + }, + { + "epoch": 0.04667792233588241, + "grad_norm": 1.3626687901401482, + "learning_rate": 9.992869875018398e-06, + "loss": 0.0176, + "step": 235 + }, + { + "epoch": 0.046876551792630845, + "grad_norm": 1.3745225587842778, + "learning_rate": 9.992697071864459e-06, + "loss": 0.02, + "step": 236 + }, + { + "epoch": 0.04707518124937928, + "grad_norm": 0.680922388085837, + "learning_rate": 9.992522201239661e-06, + "loss": 0.0167, + "step": 237 + }, + { + "epoch": 0.04727381070612772, + "grad_norm": 0.6405373508660324, + "learning_rate": 9.99234526321642e-06, + "loss": 0.0134, + "step": 238 + }, + { + "epoch": 0.047472440162876155, + "grad_norm": 1.5525613206844335, + "learning_rate": 9.992166257868006e-06, + "loss": 0.0244, + "step": 239 + }, + { + "epoch": 0.04767106961962459, + "grad_norm": 0.5871522946780782, + "learning_rate": 9.991985185268543e-06, + "loss": 0.0165, + "step": 240 + }, + { + "epoch": 0.04786969907637303, + "grad_norm": 0.8140407938901834, + "learning_rate": 9.991802045493013e-06, + "loss": 0.0108, + "step": 241 + }, + { + "epoch": 0.048068328533121464, + "grad_norm": 1.7987643886031939, + "learning_rate": 9.991616838617255e-06, + "loss": 0.0247, + "step": 242 + }, + { + "epoch": 0.0482669579898699, + "grad_norm": 1.4927221184896913, + "learning_rate": 9.991429564717964e-06, + "loss": 0.0243, + "step": 243 + }, + { + "epoch": 0.04846558744661833, + "grad_norm": 1.1469800583144492, + "learning_rate": 9.99124022387269e-06, + "loss": 0.0265, + "step": 244 + }, + { + "epoch": 0.04866421690336677, + "grad_norm": 0.6033563935180898, + "learning_rate": 9.991048816159834e-06, + "loss": 0.0211, + "step": 245 + }, + { + "epoch": 0.0488628463601152, + "grad_norm": 0.9368709020872853, + "learning_rate": 9.990855341658662e-06, + "loss": 0.023, + "step": 246 + }, + { + "epoch": 0.04906147581686364, + "grad_norm": 0.8399028095373245, + "learning_rate": 9.990659800449293e-06, + "loss": 0.0123, + "step": 247 + }, + { + "epoch": 0.049260105273612076, + "grad_norm": 0.4780920805092138, + "learning_rate": 9.990462192612698e-06, + "loss": 0.0197, + "step": 248 + }, + { + "epoch": 0.04945873473036051, + "grad_norm": 0.7218014814045688, + "learning_rate": 9.990262518230706e-06, + "loss": 0.0213, + "step": 249 + }, + { + "epoch": 0.04965736418710895, + "grad_norm": 1.1716975326112173, + "learning_rate": 9.990060777386004e-06, + "loss": 0.0206, + "step": 250 + }, + { + "epoch": 0.049855993643857385, + "grad_norm": 0.4751820690979132, + "learning_rate": 9.98985697016213e-06, + "loss": 0.026, + "step": 251 + }, + { + "epoch": 0.05005462310060582, + "grad_norm": 0.9003900691985227, + "learning_rate": 9.989651096643482e-06, + "loss": 0.0128, + "step": 252 + }, + { + "epoch": 0.05025325255735426, + "grad_norm": 0.9111928166207327, + "learning_rate": 9.98944315691531e-06, + "loss": 0.0232, + "step": 253 + }, + { + "epoch": 0.050451882014102695, + "grad_norm": 0.5545254300785284, + "learning_rate": 9.989233151063726e-06, + "loss": 0.0172, + "step": 254 + }, + { + "epoch": 0.050650511470851124, + "grad_norm": 0.40224669906766397, + "learning_rate": 9.98902107917569e-06, + "loss": 0.0233, + "step": 255 + }, + { + "epoch": 0.05084914092759956, + "grad_norm": 0.5824400395204213, + "learning_rate": 9.98880694133902e-06, + "loss": 0.0217, + "step": 256 + }, + { + "epoch": 0.051047770384348, + "grad_norm": 1.1709383260744466, + "learning_rate": 9.988590737642392e-06, + "loss": 0.0135, + "step": 257 + }, + { + "epoch": 0.051246399841096434, + "grad_norm": 1.5439449052427077, + "learning_rate": 9.988372468175335e-06, + "loss": 0.0271, + "step": 258 + }, + { + "epoch": 0.05144502929784487, + "grad_norm": 0.8457319931931899, + "learning_rate": 9.988152133028234e-06, + "loss": 0.024, + "step": 259 + }, + { + "epoch": 0.051643658754593307, + "grad_norm": 1.3872763327296354, + "learning_rate": 9.98792973229233e-06, + "loss": 0.0215, + "step": 260 + }, + { + "epoch": 0.05184228821134174, + "grad_norm": 2.1654406629105702, + "learning_rate": 9.987705266059721e-06, + "loss": 0.0227, + "step": 261 + }, + { + "epoch": 0.05204091766809018, + "grad_norm": 1.5458480467902418, + "learning_rate": 9.987478734423355e-06, + "loss": 0.0218, + "step": 262 + }, + { + "epoch": 0.052239547124838616, + "grad_norm": 0.37405484795190086, + "learning_rate": 9.987250137477039e-06, + "loss": 0.0173, + "step": 263 + }, + { + "epoch": 0.05243817658158705, + "grad_norm": 0.7336264315402546, + "learning_rate": 9.987019475315437e-06, + "loss": 0.018, + "step": 264 + }, + { + "epoch": 0.05263680603833548, + "grad_norm": 1.2158389596763395, + "learning_rate": 9.986786748034061e-06, + "loss": 0.0147, + "step": 265 + }, + { + "epoch": 0.05283543549508392, + "grad_norm": 1.130772342954925, + "learning_rate": 9.986551955729288e-06, + "loss": 0.0263, + "step": 266 + }, + { + "epoch": 0.053034064951832355, + "grad_norm": 0.6317791909189016, + "learning_rate": 9.986315098498345e-06, + "loss": 0.0202, + "step": 267 + }, + { + "epoch": 0.05323269440858079, + "grad_norm": 0.44226726309173453, + "learning_rate": 9.986076176439313e-06, + "loss": 0.0184, + "step": 268 + }, + { + "epoch": 0.05343132386532923, + "grad_norm": 0.5474445679262542, + "learning_rate": 9.98583518965113e-06, + "loss": 0.0095, + "step": 269 + }, + { + "epoch": 0.053629953322077664, + "grad_norm": 1.7508003601141047, + "learning_rate": 9.985592138233586e-06, + "loss": 0.0247, + "step": 270 + }, + { + "epoch": 0.0538285827788261, + "grad_norm": 1.5983209020045153, + "learning_rate": 9.98534702228733e-06, + "loss": 0.0259, + "step": 271 + }, + { + "epoch": 0.05402721223557454, + "grad_norm": 1.7251758674302107, + "learning_rate": 9.985099841913867e-06, + "loss": 0.0107, + "step": 272 + }, + { + "epoch": 0.054225841692322974, + "grad_norm": 0.4747190048835185, + "learning_rate": 9.984850597215551e-06, + "loss": 0.0135, + "step": 273 + }, + { + "epoch": 0.05442447114907141, + "grad_norm": 0.9285709587358324, + "learning_rate": 9.984599288295593e-06, + "loss": 0.017, + "step": 274 + }, + { + "epoch": 0.054623100605819846, + "grad_norm": 1.6254232235138268, + "learning_rate": 9.984345915258063e-06, + "loss": 0.0177, + "step": 275 + }, + { + "epoch": 0.054821730062568276, + "grad_norm": 0.8282505450106532, + "learning_rate": 9.98409047820788e-06, + "loss": 0.0188, + "step": 276 + }, + { + "epoch": 0.05502035951931671, + "grad_norm": 1.10441200036311, + "learning_rate": 9.983832977250822e-06, + "loss": 0.0334, + "step": 277 + }, + { + "epoch": 0.05521898897606515, + "grad_norm": 0.4476905849331348, + "learning_rate": 9.983573412493519e-06, + "loss": 0.0123, + "step": 278 + }, + { + "epoch": 0.055417618432813585, + "grad_norm": 0.41407971370056407, + "learning_rate": 9.983311784043457e-06, + "loss": 0.0163, + "step": 279 + }, + { + "epoch": 0.05561624788956202, + "grad_norm": 1.733780241117457, + "learning_rate": 9.983048092008973e-06, + "loss": 0.0204, + "step": 280 + }, + { + "epoch": 0.05581487734631046, + "grad_norm": 0.7561713460915898, + "learning_rate": 9.982782336499267e-06, + "loss": 0.0191, + "step": 281 + }, + { + "epoch": 0.056013506803058895, + "grad_norm": 0.8952250678333094, + "learning_rate": 9.982514517624385e-06, + "loss": 0.0195, + "step": 282 + }, + { + "epoch": 0.05621213625980733, + "grad_norm": 0.33837803654892235, + "learning_rate": 9.982244635495232e-06, + "loss": 0.0128, + "step": 283 + }, + { + "epoch": 0.05641076571655577, + "grad_norm": 0.3491614108577355, + "learning_rate": 9.981972690223561e-06, + "loss": 0.0104, + "step": 284 + }, + { + "epoch": 0.056609395173304204, + "grad_norm": 0.3885685718301126, + "learning_rate": 9.98169868192199e-06, + "loss": 0.014, + "step": 285 + }, + { + "epoch": 0.056808024630052634, + "grad_norm": 1.907227579601552, + "learning_rate": 9.981422610703983e-06, + "loss": 0.0226, + "step": 286 + }, + { + "epoch": 0.05700665408680107, + "grad_norm": 0.6383147141508525, + "learning_rate": 9.981144476683863e-06, + "loss": 0.0235, + "step": 287 + }, + { + "epoch": 0.05720528354354951, + "grad_norm": 1.0623634554315362, + "learning_rate": 9.980864279976803e-06, + "loss": 0.0199, + "step": 288 + }, + { + "epoch": 0.05740391300029794, + "grad_norm": 0.5121142099458341, + "learning_rate": 9.980582020698832e-06, + "loss": 0.0161, + "step": 289 + }, + { + "epoch": 0.05760254245704638, + "grad_norm": 0.8603769100451026, + "learning_rate": 9.980297698966835e-06, + "loss": 0.0124, + "step": 290 + }, + { + "epoch": 0.057801171913794816, + "grad_norm": 1.3969334526557426, + "learning_rate": 9.980011314898546e-06, + "loss": 0.0175, + "step": 291 + }, + { + "epoch": 0.05799980137054325, + "grad_norm": 0.5708269650845633, + "learning_rate": 9.97972286861256e-06, + "loss": 0.0148, + "step": 292 + }, + { + "epoch": 0.05819843082729169, + "grad_norm": 0.6229104672334838, + "learning_rate": 9.979432360228322e-06, + "loss": 0.0168, + "step": 293 + }, + { + "epoch": 0.058397060284040125, + "grad_norm": 0.615731320985735, + "learning_rate": 9.979139789866129e-06, + "loss": 0.0134, + "step": 294 + }, + { + "epoch": 0.05859568974078856, + "grad_norm": 0.625395130998787, + "learning_rate": 9.978845157647136e-06, + "loss": 0.0097, + "step": 295 + }, + { + "epoch": 0.05879431919753699, + "grad_norm": 0.8790404848653711, + "learning_rate": 9.978548463693348e-06, + "loss": 0.0166, + "step": 296 + }, + { + "epoch": 0.05899294865428543, + "grad_norm": 1.4772807736065832, + "learning_rate": 9.978249708127627e-06, + "loss": 0.0123, + "step": 297 + }, + { + "epoch": 0.059191578111033864, + "grad_norm": 0.5627615829863155, + "learning_rate": 9.977948891073688e-06, + "loss": 0.0151, + "step": 298 + }, + { + "epoch": 0.0593902075677823, + "grad_norm": 0.917332306356399, + "learning_rate": 9.977646012656099e-06, + "loss": 0.0151, + "step": 299 + }, + { + "epoch": 0.05958883702453074, + "grad_norm": 0.8670957566639708, + "learning_rate": 9.977341073000278e-06, + "loss": 0.0167, + "step": 300 + }, + { + "epoch": 0.059787466481279174, + "grad_norm": 1.1414866808620485, + "learning_rate": 9.977034072232506e-06, + "loss": 0.0203, + "step": 301 + }, + { + "epoch": 0.05998609593802761, + "grad_norm": 1.1251288241040425, + "learning_rate": 9.976725010479907e-06, + "loss": 0.0194, + "step": 302 + }, + { + "epoch": 0.06018472539477605, + "grad_norm": 0.683122296823212, + "learning_rate": 9.976413887870466e-06, + "loss": 0.016, + "step": 303 + }, + { + "epoch": 0.06038335485152448, + "grad_norm": 0.8996616395705849, + "learning_rate": 9.976100704533018e-06, + "loss": 0.0223, + "step": 304 + }, + { + "epoch": 0.06058198430827292, + "grad_norm": 0.6428548080123838, + "learning_rate": 9.97578546059725e-06, + "loss": 0.015, + "step": 305 + }, + { + "epoch": 0.060780613765021356, + "grad_norm": 1.940990193850758, + "learning_rate": 9.975468156193706e-06, + "loss": 0.028, + "step": 306 + }, + { + "epoch": 0.060979243221769786, + "grad_norm": 1.176442185876376, + "learning_rate": 9.975148791453781e-06, + "loss": 0.021, + "step": 307 + }, + { + "epoch": 0.06117787267851822, + "grad_norm": 0.5811349856371228, + "learning_rate": 9.974827366509725e-06, + "loss": 0.0164, + "step": 308 + }, + { + "epoch": 0.06137650213526666, + "grad_norm": 0.668480148637617, + "learning_rate": 9.974503881494638e-06, + "loss": 0.0144, + "step": 309 + }, + { + "epoch": 0.061575131592015095, + "grad_norm": 1.1370989077273732, + "learning_rate": 9.974178336542473e-06, + "loss": 0.0163, + "step": 310 + }, + { + "epoch": 0.06177376104876353, + "grad_norm": 1.2252608949279702, + "learning_rate": 9.973850731788041e-06, + "loss": 0.028, + "step": 311 + }, + { + "epoch": 0.06197239050551197, + "grad_norm": 1.091045811315918, + "learning_rate": 9.973521067367005e-06, + "loss": 0.0194, + "step": 312 + }, + { + "epoch": 0.062171019962260404, + "grad_norm": 0.5664431447755225, + "learning_rate": 9.973189343415872e-06, + "loss": 0.0207, + "step": 313 + }, + { + "epoch": 0.06236964941900884, + "grad_norm": 0.7809890625347147, + "learning_rate": 9.972855560072014e-06, + "loss": 0.0186, + "step": 314 + }, + { + "epoch": 0.06256827887575728, + "grad_norm": 0.4914092006339943, + "learning_rate": 9.972519717473647e-06, + "loss": 0.0208, + "step": 315 + }, + { + "epoch": 0.06276690833250571, + "grad_norm": 1.352749398041195, + "learning_rate": 9.972181815759848e-06, + "loss": 0.0178, + "step": 316 + }, + { + "epoch": 0.06296553778925415, + "grad_norm": 1.6842469979290473, + "learning_rate": 9.971841855070535e-06, + "loss": 0.0212, + "step": 317 + }, + { + "epoch": 0.06316416724600259, + "grad_norm": 1.181704020131392, + "learning_rate": 9.97149983554649e-06, + "loss": 0.0249, + "step": 318 + }, + { + "epoch": 0.06336279670275102, + "grad_norm": 0.47050532143708484, + "learning_rate": 9.971155757329341e-06, + "loss": 0.0171, + "step": 319 + }, + { + "epoch": 0.06356142615949946, + "grad_norm": 0.8319540289077562, + "learning_rate": 9.970809620561573e-06, + "loss": 0.0197, + "step": 320 + }, + { + "epoch": 0.0637600556162479, + "grad_norm": 2.00049297860715, + "learning_rate": 9.970461425386518e-06, + "loss": 0.0233, + "step": 321 + }, + { + "epoch": 0.06395868507299632, + "grad_norm": 0.6876204488927999, + "learning_rate": 9.970111171948362e-06, + "loss": 0.0218, + "step": 322 + }, + { + "epoch": 0.06415731452974476, + "grad_norm": 1.1508387565108984, + "learning_rate": 9.969758860392148e-06, + "loss": 0.0254, + "step": 323 + }, + { + "epoch": 0.06435594398649319, + "grad_norm": 1.0175823754757551, + "learning_rate": 9.96940449086377e-06, + "loss": 0.0207, + "step": 324 + }, + { + "epoch": 0.06455457344324163, + "grad_norm": 1.342558930748065, + "learning_rate": 9.969048063509965e-06, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.06475320289999006, + "grad_norm": 1.618591294235281, + "learning_rate": 9.968689578478334e-06, + "loss": 0.017, + "step": 326 + }, + { + "epoch": 0.0649518323567385, + "grad_norm": 0.8029753682738998, + "learning_rate": 9.968329035917326e-06, + "loss": 0.0134, + "step": 327 + }, + { + "epoch": 0.06515046181348694, + "grad_norm": 1.4258924678587896, + "learning_rate": 9.967966435976237e-06, + "loss": 0.0209, + "step": 328 + }, + { + "epoch": 0.06534909127023537, + "grad_norm": 0.6264481978575003, + "learning_rate": 9.967601778805225e-06, + "loss": 0.022, + "step": 329 + }, + { + "epoch": 0.06554772072698381, + "grad_norm": 0.9417193977601217, + "learning_rate": 9.967235064555289e-06, + "loss": 0.0182, + "step": 330 + }, + { + "epoch": 0.06574635018373225, + "grad_norm": 1.4600362140153005, + "learning_rate": 9.966866293378287e-06, + "loss": 0.0169, + "step": 331 + }, + { + "epoch": 0.06594497964048068, + "grad_norm": 0.6608946589876135, + "learning_rate": 9.966495465426927e-06, + "loss": 0.0166, + "step": 332 + }, + { + "epoch": 0.06614360909722912, + "grad_norm": 0.5334956998951328, + "learning_rate": 9.96612258085477e-06, + "loss": 0.0106, + "step": 333 + }, + { + "epoch": 0.06634223855397756, + "grad_norm": 0.7085312138032311, + "learning_rate": 9.965747639816224e-06, + "loss": 0.0147, + "step": 334 + }, + { + "epoch": 0.06654086801072599, + "grad_norm": 1.5381074566151718, + "learning_rate": 9.965370642466551e-06, + "loss": 0.0139, + "step": 335 + }, + { + "epoch": 0.06673949746747443, + "grad_norm": 0.6811831504303671, + "learning_rate": 9.96499158896187e-06, + "loss": 0.0238, + "step": 336 + }, + { + "epoch": 0.06693812692422287, + "grad_norm": 1.5626101417205556, + "learning_rate": 9.964610479459144e-06, + "loss": 0.0229, + "step": 337 + }, + { + "epoch": 0.0671367563809713, + "grad_norm": 0.5600383522887019, + "learning_rate": 9.964227314116191e-06, + "loss": 0.0171, + "step": 338 + }, + { + "epoch": 0.06733538583771974, + "grad_norm": 0.8665657897799597, + "learning_rate": 9.963842093091678e-06, + "loss": 0.0212, + "step": 339 + }, + { + "epoch": 0.06753401529446817, + "grad_norm": 0.6444012965528136, + "learning_rate": 9.963454816545124e-06, + "loss": 0.0162, + "step": 340 + }, + { + "epoch": 0.06773264475121661, + "grad_norm": 1.0179480233604894, + "learning_rate": 9.963065484636901e-06, + "loss": 0.0159, + "step": 341 + }, + { + "epoch": 0.06793127420796505, + "grad_norm": 0.33885609792917437, + "learning_rate": 9.962674097528232e-06, + "loss": 0.0105, + "step": 342 + }, + { + "epoch": 0.06812990366471347, + "grad_norm": 0.31682099229115473, + "learning_rate": 9.962280655381189e-06, + "loss": 0.0139, + "step": 343 + }, + { + "epoch": 0.0683285331214619, + "grad_norm": 1.261532302914878, + "learning_rate": 9.961885158358696e-06, + "loss": 0.013, + "step": 344 + }, + { + "epoch": 0.06852716257821034, + "grad_norm": 1.6126011541924115, + "learning_rate": 9.961487606624526e-06, + "loss": 0.021, + "step": 345 + }, + { + "epoch": 0.06872579203495878, + "grad_norm": 0.5677592432568079, + "learning_rate": 9.961088000343308e-06, + "loss": 0.0182, + "step": 346 + }, + { + "epoch": 0.06892442149170722, + "grad_norm": 0.6292493886004704, + "learning_rate": 9.960686339680518e-06, + "loss": 0.0177, + "step": 347 + }, + { + "epoch": 0.06912305094845565, + "grad_norm": 1.9905966048832493, + "learning_rate": 9.960282624802482e-06, + "loss": 0.0206, + "step": 348 + }, + { + "epoch": 0.06932168040520409, + "grad_norm": 0.6186635733129662, + "learning_rate": 9.959876855876378e-06, + "loss": 0.0162, + "step": 349 + }, + { + "epoch": 0.06952030986195253, + "grad_norm": 1.9229044356070553, + "learning_rate": 9.959469033070235e-06, + "loss": 0.0248, + "step": 350 + }, + { + "epoch": 0.06971893931870096, + "grad_norm": 1.3711744907425025, + "learning_rate": 9.959059156552931e-06, + "loss": 0.0235, + "step": 351 + }, + { + "epoch": 0.0699175687754494, + "grad_norm": 0.39607802774515655, + "learning_rate": 9.958647226494198e-06, + "loss": 0.0132, + "step": 352 + }, + { + "epoch": 0.07011619823219783, + "grad_norm": 1.7233078150248982, + "learning_rate": 9.958233243064614e-06, + "loss": 0.0168, + "step": 353 + }, + { + "epoch": 0.07031482768894627, + "grad_norm": 1.6165097055887545, + "learning_rate": 9.95781720643561e-06, + "loss": 0.022, + "step": 354 + }, + { + "epoch": 0.07051345714569471, + "grad_norm": 1.6300475923467623, + "learning_rate": 9.957399116779463e-06, + "loss": 0.0155, + "step": 355 + }, + { + "epoch": 0.07071208660244314, + "grad_norm": 1.3293283054851412, + "learning_rate": 9.95697897426931e-06, + "loss": 0.0251, + "step": 356 + }, + { + "epoch": 0.07091071605919158, + "grad_norm": 0.7311582517038507, + "learning_rate": 9.956556779079124e-06, + "loss": 0.0201, + "step": 357 + }, + { + "epoch": 0.07110934551594002, + "grad_norm": 1.0352583458625484, + "learning_rate": 9.956132531383741e-06, + "loss": 0.0179, + "step": 358 + }, + { + "epoch": 0.07130797497268845, + "grad_norm": 0.6749175905417537, + "learning_rate": 9.95570623135884e-06, + "loss": 0.0122, + "step": 359 + }, + { + "epoch": 0.07150660442943689, + "grad_norm": 0.7936842430096778, + "learning_rate": 9.955277879180951e-06, + "loss": 0.0156, + "step": 360 + }, + { + "epoch": 0.07170523388618533, + "grad_norm": 0.6066214646810274, + "learning_rate": 9.954847475027456e-06, + "loss": 0.0257, + "step": 361 + }, + { + "epoch": 0.07190386334293376, + "grad_norm": 0.5483587007588743, + "learning_rate": 9.954415019076581e-06, + "loss": 0.0195, + "step": 362 + }, + { + "epoch": 0.0721024927996822, + "grad_norm": 1.8751031257546584, + "learning_rate": 9.95398051150741e-06, + "loss": 0.0263, + "step": 363 + }, + { + "epoch": 0.07230112225643062, + "grad_norm": 0.5121194780452556, + "learning_rate": 9.95354395249987e-06, + "loss": 0.014, + "step": 364 + }, + { + "epoch": 0.07249975171317906, + "grad_norm": 0.4817927132351728, + "learning_rate": 9.95310534223474e-06, + "loss": 0.0122, + "step": 365 + }, + { + "epoch": 0.0726983811699275, + "grad_norm": 0.9420406818271349, + "learning_rate": 9.952664680893647e-06, + "loss": 0.0265, + "step": 366 + }, + { + "epoch": 0.07289701062667593, + "grad_norm": 0.700810731765772, + "learning_rate": 9.95222196865907e-06, + "loss": 0.0224, + "step": 367 + }, + { + "epoch": 0.07309564008342437, + "grad_norm": 1.1831981421496827, + "learning_rate": 9.951777205714335e-06, + "loss": 0.0157, + "step": 368 + }, + { + "epoch": 0.0732942695401728, + "grad_norm": 0.844174382418765, + "learning_rate": 9.951330392243619e-06, + "loss": 0.0208, + "step": 369 + }, + { + "epoch": 0.07349289899692124, + "grad_norm": 0.6648604009959039, + "learning_rate": 9.950881528431944e-06, + "loss": 0.0201, + "step": 370 + }, + { + "epoch": 0.07369152845366968, + "grad_norm": 0.9101607178957163, + "learning_rate": 9.950430614465187e-06, + "loss": 0.0238, + "step": 371 + }, + { + "epoch": 0.07389015791041811, + "grad_norm": 1.1877944371561115, + "learning_rate": 9.94997765053007e-06, + "loss": 0.0229, + "step": 372 + }, + { + "epoch": 0.07408878736716655, + "grad_norm": 1.1414413397286445, + "learning_rate": 9.949522636814166e-06, + "loss": 0.025, + "step": 373 + }, + { + "epoch": 0.07428741682391499, + "grad_norm": 1.0250917508528674, + "learning_rate": 9.949065573505894e-06, + "loss": 0.0223, + "step": 374 + }, + { + "epoch": 0.07448604628066342, + "grad_norm": 1.1184758483425372, + "learning_rate": 9.948606460794524e-06, + "loss": 0.0164, + "step": 375 + }, + { + "epoch": 0.07468467573741186, + "grad_norm": 1.191964024912497, + "learning_rate": 9.948145298870173e-06, + "loss": 0.0218, + "step": 376 + }, + { + "epoch": 0.0748833051941603, + "grad_norm": 1.0225207706121757, + "learning_rate": 9.94768208792381e-06, + "loss": 0.014, + "step": 377 + }, + { + "epoch": 0.07508193465090873, + "grad_norm": 0.8774375657656637, + "learning_rate": 9.947216828147249e-06, + "loss": 0.0187, + "step": 378 + }, + { + "epoch": 0.07528056410765717, + "grad_norm": 0.3736459392817161, + "learning_rate": 9.946749519733155e-06, + "loss": 0.0086, + "step": 379 + }, + { + "epoch": 0.0754791935644056, + "grad_norm": 0.8238665843850358, + "learning_rate": 9.946280162875036e-06, + "loss": 0.0157, + "step": 380 + }, + { + "epoch": 0.07567782302115404, + "grad_norm": 0.45936683210469975, + "learning_rate": 9.945808757767256e-06, + "loss": 0.0099, + "step": 381 + }, + { + "epoch": 0.07587645247790248, + "grad_norm": 2.0051899829645343, + "learning_rate": 9.945335304605023e-06, + "loss": 0.023, + "step": 382 + }, + { + "epoch": 0.07607508193465091, + "grad_norm": 0.6435406420149826, + "learning_rate": 9.944859803584392e-06, + "loss": 0.0114, + "step": 383 + }, + { + "epoch": 0.07627371139139935, + "grad_norm": 0.9399241604433789, + "learning_rate": 9.94438225490227e-06, + "loss": 0.0127, + "step": 384 + }, + { + "epoch": 0.07647234084814777, + "grad_norm": 0.6944344878094227, + "learning_rate": 9.943902658756405e-06, + "loss": 0.0197, + "step": 385 + }, + { + "epoch": 0.07667097030489621, + "grad_norm": 0.551483947123314, + "learning_rate": 9.9434210153454e-06, + "loss": 0.009, + "step": 386 + }, + { + "epoch": 0.07686959976164465, + "grad_norm": 1.5968174034926244, + "learning_rate": 9.942937324868706e-06, + "loss": 0.0233, + "step": 387 + }, + { + "epoch": 0.07706822921839308, + "grad_norm": 0.5848570587556811, + "learning_rate": 9.942451587526614e-06, + "loss": 0.0145, + "step": 388 + }, + { + "epoch": 0.07726685867514152, + "grad_norm": 0.9128747168133394, + "learning_rate": 9.94196380352027e-06, + "loss": 0.0227, + "step": 389 + }, + { + "epoch": 0.07746548813188996, + "grad_norm": 1.7264670251438237, + "learning_rate": 9.941473973051662e-06, + "loss": 0.0295, + "step": 390 + }, + { + "epoch": 0.07766411758863839, + "grad_norm": 0.4937037314512115, + "learning_rate": 9.94098209632363e-06, + "loss": 0.0097, + "step": 391 + }, + { + "epoch": 0.07786274704538683, + "grad_norm": 0.775354918247013, + "learning_rate": 9.940488173539863e-06, + "loss": 0.0211, + "step": 392 + }, + { + "epoch": 0.07806137650213527, + "grad_norm": 0.7131682892913337, + "learning_rate": 9.93999220490489e-06, + "loss": 0.0199, + "step": 393 + }, + { + "epoch": 0.0782600059588837, + "grad_norm": 0.6154638513132034, + "learning_rate": 9.93949419062409e-06, + "loss": 0.0132, + "step": 394 + }, + { + "epoch": 0.07845863541563214, + "grad_norm": 1.1169763475776913, + "learning_rate": 9.938994130903693e-06, + "loss": 0.0217, + "step": 395 + }, + { + "epoch": 0.07865726487238058, + "grad_norm": 0.8043725514476527, + "learning_rate": 9.938492025950772e-06, + "loss": 0.0095, + "step": 396 + }, + { + "epoch": 0.07885589432912901, + "grad_norm": 1.3173221094506762, + "learning_rate": 9.937987875973249e-06, + "loss": 0.0171, + "step": 397 + }, + { + "epoch": 0.07905452378587745, + "grad_norm": 0.5812582606522358, + "learning_rate": 9.937481681179892e-06, + "loss": 0.0171, + "step": 398 + }, + { + "epoch": 0.07925315324262588, + "grad_norm": 0.2793393345374467, + "learning_rate": 9.936973441780316e-06, + "loss": 0.0135, + "step": 399 + }, + { + "epoch": 0.07945178269937432, + "grad_norm": 0.34243922533366083, + "learning_rate": 9.936463157984981e-06, + "loss": 0.014, + "step": 400 + }, + { + "epoch": 0.07965041215612276, + "grad_norm": 0.3322421246762658, + "learning_rate": 9.935950830005197e-06, + "loss": 0.014, + "step": 401 + }, + { + "epoch": 0.0798490416128712, + "grad_norm": 0.3204643392343696, + "learning_rate": 9.935436458053115e-06, + "loss": 0.0117, + "step": 402 + }, + { + "epoch": 0.08004767106961963, + "grad_norm": 0.47957220575662524, + "learning_rate": 9.934920042341739e-06, + "loss": 0.0101, + "step": 403 + }, + { + "epoch": 0.08024630052636807, + "grad_norm": 0.5433074802734997, + "learning_rate": 9.934401583084916e-06, + "loss": 0.0133, + "step": 404 + }, + { + "epoch": 0.08044492998311649, + "grad_norm": 0.983630868002418, + "learning_rate": 9.933881080497339e-06, + "loss": 0.0128, + "step": 405 + }, + { + "epoch": 0.08064355943986493, + "grad_norm": 0.8766875713340728, + "learning_rate": 9.933358534794547e-06, + "loss": 0.0188, + "step": 406 + }, + { + "epoch": 0.08084218889661336, + "grad_norm": 0.7878723380271294, + "learning_rate": 9.932833946192925e-06, + "loss": 0.0144, + "step": 407 + }, + { + "epoch": 0.0810408183533618, + "grad_norm": 0.9810561422094143, + "learning_rate": 9.932307314909708e-06, + "loss": 0.0207, + "step": 408 + }, + { + "epoch": 0.08123944781011024, + "grad_norm": 0.7377525283562892, + "learning_rate": 9.93177864116297e-06, + "loss": 0.0121, + "step": 409 + }, + { + "epoch": 0.08143807726685867, + "grad_norm": 0.46490373025726456, + "learning_rate": 9.931247925171636e-06, + "loss": 0.0069, + "step": 410 + }, + { + "epoch": 0.08163670672360711, + "grad_norm": 0.44936367192605653, + "learning_rate": 9.930715167155473e-06, + "loss": 0.0107, + "step": 411 + }, + { + "epoch": 0.08183533618035554, + "grad_norm": 1.0432452279852802, + "learning_rate": 9.930180367335098e-06, + "loss": 0.0079, + "step": 412 + }, + { + "epoch": 0.08203396563710398, + "grad_norm": 1.8118427025754065, + "learning_rate": 9.929643525931971e-06, + "loss": 0.0209, + "step": 413 + }, + { + "epoch": 0.08223259509385242, + "grad_norm": 2.1277356119975, + "learning_rate": 9.929104643168396e-06, + "loss": 0.0196, + "step": 414 + }, + { + "epoch": 0.08243122455060085, + "grad_norm": 1.0672236447535246, + "learning_rate": 9.928563719267525e-06, + "loss": 0.0251, + "step": 415 + }, + { + "epoch": 0.08262985400734929, + "grad_norm": 0.6372995337769211, + "learning_rate": 9.92802075445335e-06, + "loss": 0.011, + "step": 416 + }, + { + "epoch": 0.08282848346409773, + "grad_norm": 2.6818339589471565, + "learning_rate": 9.92747574895072e-06, + "loss": 0.0234, + "step": 417 + }, + { + "epoch": 0.08302711292084616, + "grad_norm": 1.8534077650526866, + "learning_rate": 9.926928702985314e-06, + "loss": 0.0191, + "step": 418 + }, + { + "epoch": 0.0832257423775946, + "grad_norm": 0.6069645126286759, + "learning_rate": 9.926379616783667e-06, + "loss": 0.0108, + "step": 419 + }, + { + "epoch": 0.08342437183434304, + "grad_norm": 1.3113122989349562, + "learning_rate": 9.925828490573156e-06, + "loss": 0.0207, + "step": 420 + }, + { + "epoch": 0.08362300129109147, + "grad_norm": 0.5645438911441669, + "learning_rate": 9.925275324581998e-06, + "loss": 0.0099, + "step": 421 + }, + { + "epoch": 0.08382163074783991, + "grad_norm": 0.5245065449924216, + "learning_rate": 9.92472011903926e-06, + "loss": 0.0117, + "step": 422 + }, + { + "epoch": 0.08402026020458835, + "grad_norm": 0.851720051198573, + "learning_rate": 9.924162874174854e-06, + "loss": 0.0196, + "step": 423 + }, + { + "epoch": 0.08421888966133678, + "grad_norm": 0.6220395812315587, + "learning_rate": 9.923603590219531e-06, + "loss": 0.0207, + "step": 424 + }, + { + "epoch": 0.08441751911808522, + "grad_norm": 0.8302098723285082, + "learning_rate": 9.923042267404893e-06, + "loss": 0.0144, + "step": 425 + }, + { + "epoch": 0.08461614857483364, + "grad_norm": 0.44409974197466384, + "learning_rate": 9.922478905963383e-06, + "loss": 0.0139, + "step": 426 + }, + { + "epoch": 0.08481477803158208, + "grad_norm": 1.3240937379268272, + "learning_rate": 9.921913506128286e-06, + "loss": 0.0154, + "step": 427 + }, + { + "epoch": 0.08501340748833051, + "grad_norm": 0.526570789542041, + "learning_rate": 9.921346068133736e-06, + "loss": 0.015, + "step": 428 + }, + { + "epoch": 0.08521203694507895, + "grad_norm": 0.35386778919766676, + "learning_rate": 9.920776592214707e-06, + "loss": 0.0123, + "step": 429 + }, + { + "epoch": 0.08541066640182739, + "grad_norm": 0.4338611448698274, + "learning_rate": 9.92020507860702e-06, + "loss": 0.0107, + "step": 430 + }, + { + "epoch": 0.08560929585857582, + "grad_norm": 1.170842642025001, + "learning_rate": 9.919631527547336e-06, + "loss": 0.0151, + "step": 431 + }, + { + "epoch": 0.08580792531532426, + "grad_norm": 0.7363695886885039, + "learning_rate": 9.919055939273163e-06, + "loss": 0.0152, + "step": 432 + }, + { + "epoch": 0.0860065547720727, + "grad_norm": 1.019101053836205, + "learning_rate": 9.918478314022852e-06, + "loss": 0.0232, + "step": 433 + }, + { + "epoch": 0.08620518422882113, + "grad_norm": 0.782125020982536, + "learning_rate": 9.917898652035595e-06, + "loss": 0.0244, + "step": 434 + }, + { + "epoch": 0.08640381368556957, + "grad_norm": 0.6310871428581489, + "learning_rate": 9.917316953551434e-06, + "loss": 0.0225, + "step": 435 + }, + { + "epoch": 0.086602443142318, + "grad_norm": 0.43083561752173805, + "learning_rate": 9.916733218811246e-06, + "loss": 0.0136, + "step": 436 + }, + { + "epoch": 0.08680107259906644, + "grad_norm": 0.8104722545750283, + "learning_rate": 9.916147448056755e-06, + "loss": 0.0144, + "step": 437 + }, + { + "epoch": 0.08699970205581488, + "grad_norm": 0.4778684037166286, + "learning_rate": 9.91555964153053e-06, + "loss": 0.0128, + "step": 438 + }, + { + "epoch": 0.08719833151256332, + "grad_norm": 0.8825546091074031, + "learning_rate": 9.914969799475978e-06, + "loss": 0.0145, + "step": 439 + }, + { + "epoch": 0.08739696096931175, + "grad_norm": 0.600585587690455, + "learning_rate": 9.914377922137356e-06, + "loss": 0.0114, + "step": 440 + }, + { + "epoch": 0.08759559042606019, + "grad_norm": 0.9380929174319077, + "learning_rate": 9.913784009759757e-06, + "loss": 0.0184, + "step": 441 + }, + { + "epoch": 0.08779421988280862, + "grad_norm": 0.7171398264697423, + "learning_rate": 9.91318806258912e-06, + "loss": 0.0187, + "step": 442 + }, + { + "epoch": 0.08799284933955706, + "grad_norm": 0.7288666746333805, + "learning_rate": 9.912590080872227e-06, + "loss": 0.017, + "step": 443 + }, + { + "epoch": 0.0881914787963055, + "grad_norm": 0.8521776900166558, + "learning_rate": 9.911990064856703e-06, + "loss": 0.0223, + "step": 444 + }, + { + "epoch": 0.08839010825305393, + "grad_norm": 0.6528440880111782, + "learning_rate": 9.91138801479101e-06, + "loss": 0.0183, + "step": 445 + }, + { + "epoch": 0.08858873770980237, + "grad_norm": 0.4836930626793002, + "learning_rate": 9.910783930924458e-06, + "loss": 0.0098, + "step": 446 + }, + { + "epoch": 0.08878736716655079, + "grad_norm": 0.998927006114887, + "learning_rate": 9.910177813507198e-06, + "loss": 0.014, + "step": 447 + }, + { + "epoch": 0.08898599662329923, + "grad_norm": 0.8181775259423208, + "learning_rate": 9.909569662790224e-06, + "loss": 0.0192, + "step": 448 + }, + { + "epoch": 0.08918462608004767, + "grad_norm": 0.5583401583915387, + "learning_rate": 9.908959479025369e-06, + "loss": 0.0119, + "step": 449 + }, + { + "epoch": 0.0893832555367961, + "grad_norm": 0.8838426600045396, + "learning_rate": 9.908347262465308e-06, + "loss": 0.0213, + "step": 450 + }, + { + "epoch": 0.08958188499354454, + "grad_norm": 0.7498887472731395, + "learning_rate": 9.907733013363563e-06, + "loss": 0.0142, + "step": 451 + }, + { + "epoch": 0.08978051445029298, + "grad_norm": 0.8966575656361611, + "learning_rate": 9.907116731974491e-06, + "loss": 0.0193, + "step": 452 + }, + { + "epoch": 0.08997914390704141, + "grad_norm": 0.5073403960586594, + "learning_rate": 9.906498418553295e-06, + "loss": 0.0137, + "step": 453 + }, + { + "epoch": 0.09017777336378985, + "grad_norm": 1.1137939580467742, + "learning_rate": 9.905878073356015e-06, + "loss": 0.0192, + "step": 454 + }, + { + "epoch": 0.09037640282053828, + "grad_norm": 1.1344716602210057, + "learning_rate": 9.90525569663954e-06, + "loss": 0.0161, + "step": 455 + }, + { + "epoch": 0.09057503227728672, + "grad_norm": 0.7220695206586469, + "learning_rate": 9.904631288661593e-06, + "loss": 0.0182, + "step": 456 + }, + { + "epoch": 0.09077366173403516, + "grad_norm": 0.5923800565042013, + "learning_rate": 9.904004849680741e-06, + "loss": 0.0174, + "step": 457 + }, + { + "epoch": 0.0909722911907836, + "grad_norm": 1.0827911084412167, + "learning_rate": 9.903376379956392e-06, + "loss": 0.0129, + "step": 458 + }, + { + "epoch": 0.09117092064753203, + "grad_norm": 0.5158773883497091, + "learning_rate": 9.902745879748793e-06, + "loss": 0.0159, + "step": 459 + }, + { + "epoch": 0.09136955010428047, + "grad_norm": 1.0459539619692417, + "learning_rate": 9.902113349319035e-06, + "loss": 0.0253, + "step": 460 + }, + { + "epoch": 0.0915681795610289, + "grad_norm": 1.2859832032862175, + "learning_rate": 9.90147878892905e-06, + "loss": 0.0192, + "step": 461 + }, + { + "epoch": 0.09176680901777734, + "grad_norm": 0.5360090137854244, + "learning_rate": 9.900842198841606e-06, + "loss": 0.0153, + "step": 462 + }, + { + "epoch": 0.09196543847452578, + "grad_norm": 0.4013944962969106, + "learning_rate": 9.900203579320316e-06, + "loss": 0.0154, + "step": 463 + }, + { + "epoch": 0.09216406793127421, + "grad_norm": 0.41838257115387206, + "learning_rate": 9.89956293062963e-06, + "loss": 0.0128, + "step": 464 + }, + { + "epoch": 0.09236269738802265, + "grad_norm": 0.713293197602063, + "learning_rate": 9.898920253034841e-06, + "loss": 0.0148, + "step": 465 + }, + { + "epoch": 0.09256132684477109, + "grad_norm": 0.5124949670835264, + "learning_rate": 9.898275546802082e-06, + "loss": 0.0182, + "step": 466 + }, + { + "epoch": 0.09275995630151952, + "grad_norm": 0.7347929074207598, + "learning_rate": 9.897628812198324e-06, + "loss": 0.018, + "step": 467 + }, + { + "epoch": 0.09295858575826794, + "grad_norm": 0.3702075450464756, + "learning_rate": 9.896980049491378e-06, + "loss": 0.0153, + "step": 468 + }, + { + "epoch": 0.09315721521501638, + "grad_norm": 0.8668215363272558, + "learning_rate": 9.896329258949898e-06, + "loss": 0.0186, + "step": 469 + }, + { + "epoch": 0.09335584467176482, + "grad_norm": 0.9267636519344175, + "learning_rate": 9.895676440843376e-06, + "loss": 0.02, + "step": 470 + }, + { + "epoch": 0.09355447412851325, + "grad_norm": 1.975894079341777, + "learning_rate": 9.895021595442143e-06, + "loss": 0.0153, + "step": 471 + }, + { + "epoch": 0.09375310358526169, + "grad_norm": 0.9664233797506984, + "learning_rate": 9.894364723017369e-06, + "loss": 0.0248, + "step": 472 + }, + { + "epoch": 0.09395173304201013, + "grad_norm": 0.7298025816829822, + "learning_rate": 9.893705823841063e-06, + "loss": 0.0136, + "step": 473 + }, + { + "epoch": 0.09415036249875856, + "grad_norm": 0.5028928969765695, + "learning_rate": 9.893044898186077e-06, + "loss": 0.0132, + "step": 474 + }, + { + "epoch": 0.094348991955507, + "grad_norm": 0.5343396584732334, + "learning_rate": 9.8923819463261e-06, + "loss": 0.0114, + "step": 475 + }, + { + "epoch": 0.09454762141225544, + "grad_norm": 1.043121510446111, + "learning_rate": 9.891716968535655e-06, + "loss": 0.0233, + "step": 476 + }, + { + "epoch": 0.09474625086900387, + "grad_norm": 0.9607158804060207, + "learning_rate": 9.891049965090116e-06, + "loss": 0.0184, + "step": 477 + }, + { + "epoch": 0.09494488032575231, + "grad_norm": 2.04441514855444, + "learning_rate": 9.890380936265683e-06, + "loss": 0.0261, + "step": 478 + }, + { + "epoch": 0.09514350978250075, + "grad_norm": 0.44891259364731184, + "learning_rate": 9.8897098823394e-06, + "loss": 0.0161, + "step": 479 + }, + { + "epoch": 0.09534213923924918, + "grad_norm": 0.20556262152171734, + "learning_rate": 9.889036803589154e-06, + "loss": 0.0066, + "step": 480 + }, + { + "epoch": 0.09554076869599762, + "grad_norm": 0.62530012349153, + "learning_rate": 9.888361700293663e-06, + "loss": 0.013, + "step": 481 + }, + { + "epoch": 0.09573939815274606, + "grad_norm": 0.5060323442897047, + "learning_rate": 9.887684572732486e-06, + "loss": 0.0176, + "step": 482 + }, + { + "epoch": 0.09593802760949449, + "grad_norm": 0.6697767814686015, + "learning_rate": 9.887005421186022e-06, + "loss": 0.0128, + "step": 483 + }, + { + "epoch": 0.09613665706624293, + "grad_norm": 0.49988533088265713, + "learning_rate": 9.886324245935508e-06, + "loss": 0.0123, + "step": 484 + }, + { + "epoch": 0.09633528652299136, + "grad_norm": 0.5219089565515956, + "learning_rate": 9.885641047263017e-06, + "loss": 0.0245, + "step": 485 + }, + { + "epoch": 0.0965339159797398, + "grad_norm": 0.566826196384926, + "learning_rate": 9.88495582545146e-06, + "loss": 0.0145, + "step": 486 + }, + { + "epoch": 0.09673254543648824, + "grad_norm": 0.959759790672288, + "learning_rate": 9.88426858078459e-06, + "loss": 0.0194, + "step": 487 + }, + { + "epoch": 0.09693117489323666, + "grad_norm": 0.9036100999985963, + "learning_rate": 9.88357931354699e-06, + "loss": 0.0129, + "step": 488 + }, + { + "epoch": 0.0971298043499851, + "grad_norm": 0.5669073431377971, + "learning_rate": 9.882888024024086e-06, + "loss": 0.0093, + "step": 489 + }, + { + "epoch": 0.09732843380673353, + "grad_norm": 1.0332312565203494, + "learning_rate": 9.882194712502142e-06, + "loss": 0.0263, + "step": 490 + }, + { + "epoch": 0.09752706326348197, + "grad_norm": 1.4518945147996904, + "learning_rate": 9.881499379268258e-06, + "loss": 0.0165, + "step": 491 + }, + { + "epoch": 0.0977256927202304, + "grad_norm": 0.7165588378220921, + "learning_rate": 9.880802024610367e-06, + "loss": 0.0261, + "step": 492 + }, + { + "epoch": 0.09792432217697884, + "grad_norm": 0.33507821178982644, + "learning_rate": 9.880102648817249e-06, + "loss": 0.018, + "step": 493 + }, + { + "epoch": 0.09812295163372728, + "grad_norm": 1.0953941169607475, + "learning_rate": 9.879401252178508e-06, + "loss": 0.0163, + "step": 494 + }, + { + "epoch": 0.09832158109047572, + "grad_norm": 1.1884751994563965, + "learning_rate": 9.878697834984596e-06, + "loss": 0.017, + "step": 495 + }, + { + "epoch": 0.09852021054722415, + "grad_norm": 1.3895562605165783, + "learning_rate": 9.877992397526795e-06, + "loss": 0.0178, + "step": 496 + }, + { + "epoch": 0.09871884000397259, + "grad_norm": 1.0077625272176394, + "learning_rate": 9.877284940097229e-06, + "loss": 0.014, + "step": 497 + }, + { + "epoch": 0.09891746946072102, + "grad_norm": 1.1176332665051876, + "learning_rate": 9.876575462988852e-06, + "loss": 0.0176, + "step": 498 + }, + { + "epoch": 0.09911609891746946, + "grad_norm": 0.9197946583172847, + "learning_rate": 9.87586396649546e-06, + "loss": 0.0186, + "step": 499 + }, + { + "epoch": 0.0993147283742179, + "grad_norm": 0.664357412818359, + "learning_rate": 9.875150450911682e-06, + "loss": 0.0078, + "step": 500 + }, + { + "epoch": 0.09951335783096633, + "grad_norm": 0.9879849373646336, + "learning_rate": 9.874434916532984e-06, + "loss": 0.0174, + "step": 501 + }, + { + "epoch": 0.09971198728771477, + "grad_norm": 0.8015238994270766, + "learning_rate": 9.87371736365567e-06, + "loss": 0.0182, + "step": 502 + }, + { + "epoch": 0.09991061674446321, + "grad_norm": 2.5006213146117324, + "learning_rate": 9.872997792576876e-06, + "loss": 0.0192, + "step": 503 + }, + { + "epoch": 0.10010924620121164, + "grad_norm": 2.0986832355357263, + "learning_rate": 9.872276203594575e-06, + "loss": 0.015, + "step": 504 + }, + { + "epoch": 0.10030787565796008, + "grad_norm": 1.7712689606880594, + "learning_rate": 9.871552597007577e-06, + "loss": 0.0162, + "step": 505 + }, + { + "epoch": 0.10050650511470852, + "grad_norm": 0.6358423775925522, + "learning_rate": 9.870826973115528e-06, + "loss": 0.0088, + "step": 506 + }, + { + "epoch": 0.10070513457145695, + "grad_norm": 1.2361002395299776, + "learning_rate": 9.870099332218908e-06, + "loss": 0.0132, + "step": 507 + }, + { + "epoch": 0.10090376402820539, + "grad_norm": 0.3894121263307642, + "learning_rate": 9.869369674619031e-06, + "loss": 0.009, + "step": 508 + }, + { + "epoch": 0.10110239348495381, + "grad_norm": 0.6814889911507336, + "learning_rate": 9.868638000618047e-06, + "loss": 0.018, + "step": 509 + }, + { + "epoch": 0.10130102294170225, + "grad_norm": 1.5433233594496414, + "learning_rate": 9.867904310518943e-06, + "loss": 0.0169, + "step": 510 + }, + { + "epoch": 0.10149965239845068, + "grad_norm": 2.086256409582492, + "learning_rate": 9.867168604625538e-06, + "loss": 0.0272, + "step": 511 + }, + { + "epoch": 0.10169828185519912, + "grad_norm": 1.1008955506156466, + "learning_rate": 9.86643088324249e-06, + "loss": 0.0197, + "step": 512 + }, + { + "epoch": 0.10189691131194756, + "grad_norm": 0.6668853742795092, + "learning_rate": 9.865691146675286e-06, + "loss": 0.014, + "step": 513 + }, + { + "epoch": 0.102095540768696, + "grad_norm": 0.6058575326394435, + "learning_rate": 9.86494939523025e-06, + "loss": 0.0107, + "step": 514 + }, + { + "epoch": 0.10229417022544443, + "grad_norm": 0.9452200105677793, + "learning_rate": 9.864205629214542e-06, + "loss": 0.0224, + "step": 515 + }, + { + "epoch": 0.10249279968219287, + "grad_norm": 1.0087103985746844, + "learning_rate": 9.863459848936155e-06, + "loss": 0.0175, + "step": 516 + }, + { + "epoch": 0.1026914291389413, + "grad_norm": 1.4576341787329083, + "learning_rate": 9.862712054703913e-06, + "loss": 0.017, + "step": 517 + }, + { + "epoch": 0.10289005859568974, + "grad_norm": 1.007654984553146, + "learning_rate": 9.861962246827479e-06, + "loss": 0.0197, + "step": 518 + }, + { + "epoch": 0.10308868805243818, + "grad_norm": 1.1768748633973105, + "learning_rate": 9.861210425617348e-06, + "loss": 0.0146, + "step": 519 + }, + { + "epoch": 0.10328731750918661, + "grad_norm": 0.5946983343781779, + "learning_rate": 9.860456591384849e-06, + "loss": 0.0161, + "step": 520 + }, + { + "epoch": 0.10348594696593505, + "grad_norm": 0.8631523397738572, + "learning_rate": 9.85970074444214e-06, + "loss": 0.0169, + "step": 521 + }, + { + "epoch": 0.10368457642268349, + "grad_norm": 0.6524946202638839, + "learning_rate": 9.858942885102221e-06, + "loss": 0.0231, + "step": 522 + }, + { + "epoch": 0.10388320587943192, + "grad_norm": 0.5099019794008004, + "learning_rate": 9.85818301367892e-06, + "loss": 0.0181, + "step": 523 + }, + { + "epoch": 0.10408183533618036, + "grad_norm": 0.503120599970783, + "learning_rate": 9.857421130486897e-06, + "loss": 0.0172, + "step": 524 + }, + { + "epoch": 0.1042804647929288, + "grad_norm": 0.5964107917213637, + "learning_rate": 9.85665723584165e-06, + "loss": 0.0175, + "step": 525 + }, + { + "epoch": 0.10447909424967723, + "grad_norm": 0.6031856884274558, + "learning_rate": 9.855891330059502e-06, + "loss": 0.0174, + "step": 526 + }, + { + "epoch": 0.10467772370642567, + "grad_norm": 0.47774054717960784, + "learning_rate": 9.85512341345762e-06, + "loss": 0.02, + "step": 527 + }, + { + "epoch": 0.1048763531631741, + "grad_norm": 0.47747803803414446, + "learning_rate": 9.854353486353994e-06, + "loss": 0.0176, + "step": 528 + }, + { + "epoch": 0.10507498261992254, + "grad_norm": 0.4118060996615957, + "learning_rate": 9.853581549067449e-06, + "loss": 0.011, + "step": 529 + }, + { + "epoch": 0.10527361207667096, + "grad_norm": 1.7006846993605238, + "learning_rate": 9.852807601917647e-06, + "loss": 0.0245, + "step": 530 + }, + { + "epoch": 0.1054722415334194, + "grad_norm": 1.319886511097232, + "learning_rate": 9.852031645225075e-06, + "loss": 0.0133, + "step": 531 + }, + { + "epoch": 0.10567087099016784, + "grad_norm": 0.5419706523360872, + "learning_rate": 9.851253679311059e-06, + "loss": 0.0115, + "step": 532 + }, + { + "epoch": 0.10586950044691627, + "grad_norm": 0.4436521767303458, + "learning_rate": 9.850473704497752e-06, + "loss": 0.0173, + "step": 533 + }, + { + "epoch": 0.10606812990366471, + "grad_norm": 0.9414894737976166, + "learning_rate": 9.84969172110814e-06, + "loss": 0.0125, + "step": 534 + }, + { + "epoch": 0.10626675936041315, + "grad_norm": 0.5777200514403674, + "learning_rate": 9.848907729466045e-06, + "loss": 0.0157, + "step": 535 + }, + { + "epoch": 0.10646538881716158, + "grad_norm": 0.5350127642077067, + "learning_rate": 9.848121729896114e-06, + "loss": 0.0109, + "step": 536 + }, + { + "epoch": 0.10666401827391002, + "grad_norm": 0.6968674417806208, + "learning_rate": 9.84733372272383e-06, + "loss": 0.0238, + "step": 537 + }, + { + "epoch": 0.10686264773065846, + "grad_norm": 0.6180948340768215, + "learning_rate": 9.846543708275507e-06, + "loss": 0.0226, + "step": 538 + }, + { + "epoch": 0.10706127718740689, + "grad_norm": 0.7782144147736434, + "learning_rate": 9.845751686878286e-06, + "loss": 0.021, + "step": 539 + }, + { + "epoch": 0.10725990664415533, + "grad_norm": 0.8480072268503545, + "learning_rate": 9.844957658860143e-06, + "loss": 0.0189, + "step": 540 + }, + { + "epoch": 0.10745853610090376, + "grad_norm": 0.48593638963667746, + "learning_rate": 9.844161624549889e-06, + "loss": 0.0148, + "step": 541 + }, + { + "epoch": 0.1076571655576522, + "grad_norm": 1.0293729094433886, + "learning_rate": 9.843363584277154e-06, + "loss": 0.0148, + "step": 542 + }, + { + "epoch": 0.10785579501440064, + "grad_norm": 0.5138840542732326, + "learning_rate": 9.84256353837241e-06, + "loss": 0.0199, + "step": 543 + }, + { + "epoch": 0.10805442447114907, + "grad_norm": 0.9923481664493533, + "learning_rate": 9.841761487166955e-06, + "loss": 0.0209, + "step": 544 + }, + { + "epoch": 0.10825305392789751, + "grad_norm": 0.499458338835493, + "learning_rate": 9.840957430992917e-06, + "loss": 0.0195, + "step": 545 + }, + { + "epoch": 0.10845168338464595, + "grad_norm": 0.40579211347456434, + "learning_rate": 9.840151370183253e-06, + "loss": 0.0153, + "step": 546 + }, + { + "epoch": 0.10865031284139438, + "grad_norm": 0.5908120798309358, + "learning_rate": 9.839343305071755e-06, + "loss": 0.0121, + "step": 547 + }, + { + "epoch": 0.10884894229814282, + "grad_norm": 0.8968327120573901, + "learning_rate": 9.838533235993041e-06, + "loss": 0.0238, + "step": 548 + }, + { + "epoch": 0.10904757175489126, + "grad_norm": 1.01470974568391, + "learning_rate": 9.837721163282558e-06, + "loss": 0.0168, + "step": 549 + }, + { + "epoch": 0.10924620121163969, + "grad_norm": 0.3781128929172408, + "learning_rate": 9.836907087276587e-06, + "loss": 0.0188, + "step": 550 + }, + { + "epoch": 0.10944483066838812, + "grad_norm": 0.3520969782131752, + "learning_rate": 9.836091008312233e-06, + "loss": 0.0144, + "step": 551 + }, + { + "epoch": 0.10964346012513655, + "grad_norm": 1.206009624884685, + "learning_rate": 9.835272926727439e-06, + "loss": 0.0169, + "step": 552 + }, + { + "epoch": 0.10984208958188499, + "grad_norm": 0.5829489199164067, + "learning_rate": 9.834452842860967e-06, + "loss": 0.0141, + "step": 553 + }, + { + "epoch": 0.11004071903863342, + "grad_norm": 0.7985221379687066, + "learning_rate": 9.833630757052413e-06, + "loss": 0.0176, + "step": 554 + }, + { + "epoch": 0.11023934849538186, + "grad_norm": 0.7533308379120465, + "learning_rate": 9.832806669642203e-06, + "loss": 0.0151, + "step": 555 + }, + { + "epoch": 0.1104379779521303, + "grad_norm": 1.08877862811918, + "learning_rate": 9.831980580971591e-06, + "loss": 0.0196, + "step": 556 + }, + { + "epoch": 0.11063660740887873, + "grad_norm": 0.8723157647635501, + "learning_rate": 9.831152491382658e-06, + "loss": 0.0138, + "step": 557 + }, + { + "epoch": 0.11083523686562717, + "grad_norm": 1.2697238695545063, + "learning_rate": 9.83032240121832e-06, + "loss": 0.0225, + "step": 558 + }, + { + "epoch": 0.11103386632237561, + "grad_norm": 0.4616713641640445, + "learning_rate": 9.82949031082231e-06, + "loss": 0.0156, + "step": 559 + }, + { + "epoch": 0.11123249577912404, + "grad_norm": 0.349449094735304, + "learning_rate": 9.8286562205392e-06, + "loss": 0.012, + "step": 560 + }, + { + "epoch": 0.11143112523587248, + "grad_norm": 1.1595881223097364, + "learning_rate": 9.827820130714383e-06, + "loss": 0.0177, + "step": 561 + }, + { + "epoch": 0.11162975469262092, + "grad_norm": 0.5231453160678909, + "learning_rate": 9.826982041694086e-06, + "loss": 0.0086, + "step": 562 + }, + { + "epoch": 0.11182838414936935, + "grad_norm": 0.7972550695997788, + "learning_rate": 9.826141953825358e-06, + "loss": 0.0154, + "step": 563 + }, + { + "epoch": 0.11202701360611779, + "grad_norm": 0.4601783531819776, + "learning_rate": 9.825299867456082e-06, + "loss": 0.0186, + "step": 564 + }, + { + "epoch": 0.11222564306286623, + "grad_norm": 0.7214422323486367, + "learning_rate": 9.824455782934961e-06, + "loss": 0.011, + "step": 565 + }, + { + "epoch": 0.11242427251961466, + "grad_norm": 1.680930925926308, + "learning_rate": 9.823609700611534e-06, + "loss": 0.0181, + "step": 566 + }, + { + "epoch": 0.1126229019763631, + "grad_norm": 0.5298368232940289, + "learning_rate": 9.822761620836158e-06, + "loss": 0.0177, + "step": 567 + }, + { + "epoch": 0.11282153143311154, + "grad_norm": 0.7938863386225405, + "learning_rate": 9.821911543960025e-06, + "loss": 0.0117, + "step": 568 + }, + { + "epoch": 0.11302016088985997, + "grad_norm": 0.8596537266570184, + "learning_rate": 9.82105947033515e-06, + "loss": 0.0072, + "step": 569 + }, + { + "epoch": 0.11321879034660841, + "grad_norm": 0.4173258192767521, + "learning_rate": 9.820205400314378e-06, + "loss": 0.0145, + "step": 570 + }, + { + "epoch": 0.11341741980335683, + "grad_norm": 0.6115201580608163, + "learning_rate": 9.819349334251376e-06, + "loss": 0.0203, + "step": 571 + }, + { + "epoch": 0.11361604926010527, + "grad_norm": 0.30050628933401596, + "learning_rate": 9.818491272500641e-06, + "loss": 0.0107, + "step": 572 + }, + { + "epoch": 0.1138146787168537, + "grad_norm": 1.0979707173146247, + "learning_rate": 9.817631215417496e-06, + "loss": 0.0223, + "step": 573 + }, + { + "epoch": 0.11401330817360214, + "grad_norm": 0.46276625355749734, + "learning_rate": 9.816769163358087e-06, + "loss": 0.0125, + "step": 574 + }, + { + "epoch": 0.11421193763035058, + "grad_norm": 1.0534382953635288, + "learning_rate": 9.815905116679394e-06, + "loss": 0.0239, + "step": 575 + }, + { + "epoch": 0.11441056708709901, + "grad_norm": 0.4199398914552984, + "learning_rate": 9.815039075739213e-06, + "loss": 0.0113, + "step": 576 + }, + { + "epoch": 0.11460919654384745, + "grad_norm": 1.1636761751492717, + "learning_rate": 9.814171040896173e-06, + "loss": 0.0208, + "step": 577 + }, + { + "epoch": 0.11480782600059589, + "grad_norm": 0.5065188833689531, + "learning_rate": 9.813301012509725e-06, + "loss": 0.0081, + "step": 578 + }, + { + "epoch": 0.11500645545734432, + "grad_norm": 1.1440102400398982, + "learning_rate": 9.812428990940151e-06, + "loss": 0.0201, + "step": 579 + }, + { + "epoch": 0.11520508491409276, + "grad_norm": 0.5140689221203485, + "learning_rate": 9.811554976548547e-06, + "loss": 0.0154, + "step": 580 + }, + { + "epoch": 0.1154037143708412, + "grad_norm": 0.8604730664877108, + "learning_rate": 9.810678969696849e-06, + "loss": 0.0216, + "step": 581 + }, + { + "epoch": 0.11560234382758963, + "grad_norm": 1.3627495098285676, + "learning_rate": 9.809800970747805e-06, + "loss": 0.0146, + "step": 582 + }, + { + "epoch": 0.11580097328433807, + "grad_norm": 1.5215024670202477, + "learning_rate": 9.808920980064998e-06, + "loss": 0.0187, + "step": 583 + }, + { + "epoch": 0.1159996027410865, + "grad_norm": 0.6734406716826478, + "learning_rate": 9.808038998012828e-06, + "loss": 0.0174, + "step": 584 + }, + { + "epoch": 0.11619823219783494, + "grad_norm": 1.1041085950914342, + "learning_rate": 9.807155024956523e-06, + "loss": 0.0151, + "step": 585 + }, + { + "epoch": 0.11639686165458338, + "grad_norm": 1.5387484405002336, + "learning_rate": 9.806269061262135e-06, + "loss": 0.0149, + "step": 586 + }, + { + "epoch": 0.11659549111133181, + "grad_norm": 0.5875043750348269, + "learning_rate": 9.805381107296544e-06, + "loss": 0.0143, + "step": 587 + }, + { + "epoch": 0.11679412056808025, + "grad_norm": 0.8389398799982417, + "learning_rate": 9.80449116342745e-06, + "loss": 0.0255, + "step": 588 + }, + { + "epoch": 0.11699275002482869, + "grad_norm": 0.30889573926063935, + "learning_rate": 9.803599230023373e-06, + "loss": 0.0103, + "step": 589 + }, + { + "epoch": 0.11719137948157712, + "grad_norm": 0.7219078584006366, + "learning_rate": 9.802705307453667e-06, + "loss": 0.0125, + "step": 590 + }, + { + "epoch": 0.11739000893832556, + "grad_norm": 0.5466172579541949, + "learning_rate": 9.801809396088501e-06, + "loss": 0.0116, + "step": 591 + }, + { + "epoch": 0.11758863839507398, + "grad_norm": 1.0372207668042581, + "learning_rate": 9.800911496298875e-06, + "loss": 0.0123, + "step": 592 + }, + { + "epoch": 0.11778726785182242, + "grad_norm": 0.7559866086265034, + "learning_rate": 9.800011608456607e-06, + "loss": 0.0201, + "step": 593 + }, + { + "epoch": 0.11798589730857086, + "grad_norm": 0.3188709050740313, + "learning_rate": 9.799109732934338e-06, + "loss": 0.0111, + "step": 594 + }, + { + "epoch": 0.11818452676531929, + "grad_norm": 0.5314042057902478, + "learning_rate": 9.798205870105533e-06, + "loss": 0.0125, + "step": 595 + }, + { + "epoch": 0.11838315622206773, + "grad_norm": 1.8474069548887428, + "learning_rate": 9.797300020344483e-06, + "loss": 0.0231, + "step": 596 + }, + { + "epoch": 0.11858178567881617, + "grad_norm": 0.42452537891107434, + "learning_rate": 9.796392184026298e-06, + "loss": 0.0108, + "step": 597 + }, + { + "epoch": 0.1187804151355646, + "grad_norm": 0.5783590129051792, + "learning_rate": 9.795482361526915e-06, + "loss": 0.0116, + "step": 598 + }, + { + "epoch": 0.11897904459231304, + "grad_norm": 0.38163168855460666, + "learning_rate": 9.794570553223087e-06, + "loss": 0.009, + "step": 599 + }, + { + "epoch": 0.11917767404906147, + "grad_norm": 1.4917576733893148, + "learning_rate": 9.793656759492394e-06, + "loss": 0.0155, + "step": 600 + }, + { + "epoch": 0.11937630350580991, + "grad_norm": 1.0584955957981348, + "learning_rate": 9.792740980713238e-06, + "loss": 0.0207, + "step": 601 + }, + { + "epoch": 0.11957493296255835, + "grad_norm": 0.694913429075098, + "learning_rate": 9.791823217264842e-06, + "loss": 0.0161, + "step": 602 + }, + { + "epoch": 0.11977356241930678, + "grad_norm": 0.6695725185833449, + "learning_rate": 9.790903469527249e-06, + "loss": 0.0199, + "step": 603 + }, + { + "epoch": 0.11997219187605522, + "grad_norm": 2.000789888070613, + "learning_rate": 9.789981737881326e-06, + "loss": 0.039, + "step": 604 + }, + { + "epoch": 0.12017082133280366, + "grad_norm": 0.48955671291310315, + "learning_rate": 9.789058022708765e-06, + "loss": 0.0142, + "step": 605 + }, + { + "epoch": 0.1203694507895521, + "grad_norm": 1.0363261312441616, + "learning_rate": 9.788132324392072e-06, + "loss": 0.0159, + "step": 606 + }, + { + "epoch": 0.12056808024630053, + "grad_norm": 0.4884275959706906, + "learning_rate": 9.787204643314577e-06, + "loss": 0.0167, + "step": 607 + }, + { + "epoch": 0.12076670970304897, + "grad_norm": 0.7350563044633582, + "learning_rate": 9.786274979860434e-06, + "loss": 0.0158, + "step": 608 + }, + { + "epoch": 0.1209653391597974, + "grad_norm": 0.9892060951239566, + "learning_rate": 9.785343334414615e-06, + "loss": 0.0206, + "step": 609 + }, + { + "epoch": 0.12116396861654584, + "grad_norm": 0.37127352309370415, + "learning_rate": 9.784409707362913e-06, + "loss": 0.0183, + "step": 610 + }, + { + "epoch": 0.12136259807329428, + "grad_norm": 0.650383126477588, + "learning_rate": 9.783474099091943e-06, + "loss": 0.0176, + "step": 611 + }, + { + "epoch": 0.12156122753004271, + "grad_norm": 0.4119293084929478, + "learning_rate": 9.78253650998914e-06, + "loss": 0.0117, + "step": 612 + }, + { + "epoch": 0.12175985698679113, + "grad_norm": 0.829964935039823, + "learning_rate": 9.781596940442755e-06, + "loss": 0.0153, + "step": 613 + }, + { + "epoch": 0.12195848644353957, + "grad_norm": 1.7129449944097686, + "learning_rate": 9.780655390841867e-06, + "loss": 0.0136, + "step": 614 + }, + { + "epoch": 0.12215711590028801, + "grad_norm": 0.5565425099390852, + "learning_rate": 9.779711861576368e-06, + "loss": 0.0215, + "step": 615 + }, + { + "epoch": 0.12235574535703644, + "grad_norm": 0.7788538582287662, + "learning_rate": 9.778766353036975e-06, + "loss": 0.0218, + "step": 616 + }, + { + "epoch": 0.12255437481378488, + "grad_norm": 0.33444210785480794, + "learning_rate": 9.777818865615221e-06, + "loss": 0.0136, + "step": 617 + }, + { + "epoch": 0.12275300427053332, + "grad_norm": 0.400994084696202, + "learning_rate": 9.776869399703458e-06, + "loss": 0.0154, + "step": 618 + }, + { + "epoch": 0.12295163372728175, + "grad_norm": 0.7796516460912207, + "learning_rate": 9.77591795569486e-06, + "loss": 0.013, + "step": 619 + }, + { + "epoch": 0.12315026318403019, + "grad_norm": 0.536772489299672, + "learning_rate": 9.774964533983421e-06, + "loss": 0.0188, + "step": 620 + }, + { + "epoch": 0.12334889264077863, + "grad_norm": 1.3303202817826634, + "learning_rate": 9.774009134963949e-06, + "loss": 0.0193, + "step": 621 + }, + { + "epoch": 0.12354752209752706, + "grad_norm": 0.7037843214385814, + "learning_rate": 9.773051759032074e-06, + "loss": 0.0175, + "step": 622 + }, + { + "epoch": 0.1237461515542755, + "grad_norm": 0.4308203918294435, + "learning_rate": 9.772092406584248e-06, + "loss": 0.0166, + "step": 623 + }, + { + "epoch": 0.12394478101102394, + "grad_norm": 1.1697494943206082, + "learning_rate": 9.771131078017735e-06, + "loss": 0.0193, + "step": 624 + }, + { + "epoch": 0.12414341046777237, + "grad_norm": 0.5812375266471298, + "learning_rate": 9.77016777373062e-06, + "loss": 0.0146, + "step": 625 + }, + { + "epoch": 0.12434203992452081, + "grad_norm": 0.524198461309644, + "learning_rate": 9.769202494121806e-06, + "loss": 0.0167, + "step": 626 + }, + { + "epoch": 0.12454066938126925, + "grad_norm": 1.5285962472865722, + "learning_rate": 9.768235239591016e-06, + "loss": 0.0207, + "step": 627 + }, + { + "epoch": 0.12473929883801768, + "grad_norm": 0.7396830974883195, + "learning_rate": 9.767266010538788e-06, + "loss": 0.0148, + "step": 628 + }, + { + "epoch": 0.12493792829476612, + "grad_norm": 0.2274647843251192, + "learning_rate": 9.76629480736648e-06, + "loss": 0.0107, + "step": 629 + }, + { + "epoch": 0.12513655775151455, + "grad_norm": 0.28859531186159604, + "learning_rate": 9.765321630476264e-06, + "loss": 0.0166, + "step": 630 + }, + { + "epoch": 0.12533518720826298, + "grad_norm": 0.35440823267343124, + "learning_rate": 9.764346480271132e-06, + "loss": 0.0168, + "step": 631 + }, + { + "epoch": 0.12553381666501143, + "grad_norm": 0.5712874545330476, + "learning_rate": 9.763369357154895e-06, + "loss": 0.0101, + "step": 632 + }, + { + "epoch": 0.12573244612175985, + "grad_norm": 1.3923546120285113, + "learning_rate": 9.762390261532177e-06, + "loss": 0.0168, + "step": 633 + }, + { + "epoch": 0.1259310755785083, + "grad_norm": 0.47850667989496215, + "learning_rate": 9.761409193808422e-06, + "loss": 0.009, + "step": 634 + }, + { + "epoch": 0.12612970503525672, + "grad_norm": 0.4045494692733913, + "learning_rate": 9.760426154389888e-06, + "loss": 0.0127, + "step": 635 + }, + { + "epoch": 0.12632833449200517, + "grad_norm": 0.949458135135656, + "learning_rate": 9.75944114368365e-06, + "loss": 0.0245, + "step": 636 + }, + { + "epoch": 0.1265269639487536, + "grad_norm": 0.4744219220416367, + "learning_rate": 9.7584541620976e-06, + "loss": 0.01, + "step": 637 + }, + { + "epoch": 0.12672559340550205, + "grad_norm": 0.6580474069905136, + "learning_rate": 9.757465210040447e-06, + "loss": 0.0116, + "step": 638 + }, + { + "epoch": 0.12692422286225047, + "grad_norm": 1.2746603803218375, + "learning_rate": 9.756474287921716e-06, + "loss": 0.0173, + "step": 639 + }, + { + "epoch": 0.12712285231899892, + "grad_norm": 1.9417494348440496, + "learning_rate": 9.755481396151744e-06, + "loss": 0.0324, + "step": 640 + }, + { + "epoch": 0.12732148177574734, + "grad_norm": 0.618531205863786, + "learning_rate": 9.75448653514169e-06, + "loss": 0.0142, + "step": 641 + }, + { + "epoch": 0.1275201112324958, + "grad_norm": 0.9971146664077777, + "learning_rate": 9.75348970530352e-06, + "loss": 0.018, + "step": 642 + }, + { + "epoch": 0.12771874068924421, + "grad_norm": 0.8389497013907814, + "learning_rate": 9.752490907050027e-06, + "loss": 0.0151, + "step": 643 + }, + { + "epoch": 0.12791737014599264, + "grad_norm": 1.0391641435545595, + "learning_rate": 9.751490140794807e-06, + "loss": 0.0234, + "step": 644 + }, + { + "epoch": 0.1281159996027411, + "grad_norm": 0.7118837598081941, + "learning_rate": 9.750487406952277e-06, + "loss": 0.0165, + "step": 645 + }, + { + "epoch": 0.1283146290594895, + "grad_norm": 0.3126685428380962, + "learning_rate": 9.74948270593767e-06, + "loss": 0.0104, + "step": 646 + }, + { + "epoch": 0.12851325851623796, + "grad_norm": 0.6400960063705387, + "learning_rate": 9.74847603816703e-06, + "loss": 0.0146, + "step": 647 + }, + { + "epoch": 0.12871188797298638, + "grad_norm": 0.6080027304759789, + "learning_rate": 9.747467404057219e-06, + "loss": 0.0132, + "step": 648 + }, + { + "epoch": 0.12891051742973483, + "grad_norm": 0.5969440680204644, + "learning_rate": 9.74645680402591e-06, + "loss": 0.0165, + "step": 649 + }, + { + "epoch": 0.12910914688648326, + "grad_norm": 0.4844450633614373, + "learning_rate": 9.74544423849159e-06, + "loss": 0.008, + "step": 650 + }, + { + "epoch": 0.1293077763432317, + "grad_norm": 0.5731462801181816, + "learning_rate": 9.744429707873564e-06, + "loss": 0.014, + "step": 651 + }, + { + "epoch": 0.12950640579998013, + "grad_norm": 0.9087700604835134, + "learning_rate": 9.743413212591949e-06, + "loss": 0.0213, + "step": 652 + }, + { + "epoch": 0.12970503525672858, + "grad_norm": 0.7602480663063869, + "learning_rate": 9.742394753067671e-06, + "loss": 0.0158, + "step": 653 + }, + { + "epoch": 0.129903664713477, + "grad_norm": 0.6559403776033135, + "learning_rate": 9.741374329722474e-06, + "loss": 0.0126, + "step": 654 + }, + { + "epoch": 0.13010229417022545, + "grad_norm": 0.5644269287564567, + "learning_rate": 9.740351942978919e-06, + "loss": 0.0141, + "step": 655 + }, + { + "epoch": 0.13030092362697387, + "grad_norm": 0.5606392709106318, + "learning_rate": 9.739327593260367e-06, + "loss": 0.0221, + "step": 656 + }, + { + "epoch": 0.13049955308372232, + "grad_norm": 0.6807347976418896, + "learning_rate": 9.738301280991007e-06, + "loss": 0.0151, + "step": 657 + }, + { + "epoch": 0.13069818254047075, + "grad_norm": 0.8462663632413303, + "learning_rate": 9.737273006595832e-06, + "loss": 0.0194, + "step": 658 + }, + { + "epoch": 0.1308968119972192, + "grad_norm": 0.3894055488420919, + "learning_rate": 9.736242770500647e-06, + "loss": 0.0092, + "step": 659 + }, + { + "epoch": 0.13109544145396762, + "grad_norm": 0.7904884629984615, + "learning_rate": 9.735210573132078e-06, + "loss": 0.0162, + "step": 660 + }, + { + "epoch": 0.13129407091071607, + "grad_norm": 0.9674854160105776, + "learning_rate": 9.734176414917548e-06, + "loss": 0.0196, + "step": 661 + }, + { + "epoch": 0.1314927003674645, + "grad_norm": 0.33200341815567036, + "learning_rate": 9.733140296285307e-06, + "loss": 0.0093, + "step": 662 + }, + { + "epoch": 0.13169132982421294, + "grad_norm": 0.8720779831059587, + "learning_rate": 9.73210221766441e-06, + "loss": 0.0171, + "step": 663 + }, + { + "epoch": 0.13188995928096137, + "grad_norm": 0.48679544383358947, + "learning_rate": 9.731062179484723e-06, + "loss": 0.0222, + "step": 664 + }, + { + "epoch": 0.1320885887377098, + "grad_norm": 1.6442374114676053, + "learning_rate": 9.730020182176925e-06, + "loss": 0.0201, + "step": 665 + }, + { + "epoch": 0.13228721819445824, + "grad_norm": 0.6211726152538897, + "learning_rate": 9.728976226172507e-06, + "loss": 0.0115, + "step": 666 + }, + { + "epoch": 0.13248584765120666, + "grad_norm": 1.0316344031063414, + "learning_rate": 9.727930311903768e-06, + "loss": 0.0138, + "step": 667 + }, + { + "epoch": 0.1326844771079551, + "grad_norm": 0.6585612814093009, + "learning_rate": 9.726882439803822e-06, + "loss": 0.0221, + "step": 668 + }, + { + "epoch": 0.13288310656470353, + "grad_norm": 1.583096886947926, + "learning_rate": 9.725832610306592e-06, + "loss": 0.0236, + "step": 669 + }, + { + "epoch": 0.13308173602145199, + "grad_norm": 0.58785647706293, + "learning_rate": 9.72478082384681e-06, + "loss": 0.0133, + "step": 670 + }, + { + "epoch": 0.1332803654782004, + "grad_norm": 0.4215741160538269, + "learning_rate": 9.723727080860022e-06, + "loss": 0.0117, + "step": 671 + }, + { + "epoch": 0.13347899493494886, + "grad_norm": 1.1714886610581292, + "learning_rate": 9.722671381782577e-06, + "loss": 0.0289, + "step": 672 + }, + { + "epoch": 0.13367762439169728, + "grad_norm": 0.849483842098239, + "learning_rate": 9.721613727051646e-06, + "loss": 0.017, + "step": 673 + }, + { + "epoch": 0.13387625384844573, + "grad_norm": 1.4736074773129628, + "learning_rate": 9.720554117105197e-06, + "loss": 0.0132, + "step": 674 + }, + { + "epoch": 0.13407488330519415, + "grad_norm": 2.2121758689867344, + "learning_rate": 9.719492552382015e-06, + "loss": 0.0268, + "step": 675 + }, + { + "epoch": 0.1342735127619426, + "grad_norm": 2.2633647738559546, + "learning_rate": 9.718429033321693e-06, + "loss": 0.0279, + "step": 676 + }, + { + "epoch": 0.13447214221869103, + "grad_norm": 0.5748397779881033, + "learning_rate": 9.717363560364634e-06, + "loss": 0.0198, + "step": 677 + }, + { + "epoch": 0.13467077167543948, + "grad_norm": 0.7319060757746011, + "learning_rate": 9.71629613395205e-06, + "loss": 0.015, + "step": 678 + }, + { + "epoch": 0.1348694011321879, + "grad_norm": 0.8521409511987582, + "learning_rate": 9.71522675452596e-06, + "loss": 0.0141, + "step": 679 + }, + { + "epoch": 0.13506803058893635, + "grad_norm": 0.910962721954014, + "learning_rate": 9.714155422529192e-06, + "loss": 0.016, + "step": 680 + }, + { + "epoch": 0.13526666004568477, + "grad_norm": 0.3260897607925365, + "learning_rate": 9.713082138405383e-06, + "loss": 0.0105, + "step": 681 + }, + { + "epoch": 0.13546528950243322, + "grad_norm": 0.2535946790266379, + "learning_rate": 9.712006902598982e-06, + "loss": 0.0112, + "step": 682 + }, + { + "epoch": 0.13566391895918165, + "grad_norm": 0.4071046693159764, + "learning_rate": 9.710929715555241e-06, + "loss": 0.0171, + "step": 683 + }, + { + "epoch": 0.1358625484159301, + "grad_norm": 0.6842960302160229, + "learning_rate": 9.709850577720223e-06, + "loss": 0.0123, + "step": 684 + }, + { + "epoch": 0.13606117787267852, + "grad_norm": 0.35348541207601764, + "learning_rate": 9.708769489540796e-06, + "loss": 0.0127, + "step": 685 + }, + { + "epoch": 0.13625980732942694, + "grad_norm": 0.5960992242072756, + "learning_rate": 9.70768645146464e-06, + "loss": 0.0152, + "step": 686 + }, + { + "epoch": 0.1364584367861754, + "grad_norm": 0.706647286718095, + "learning_rate": 9.706601463940237e-06, + "loss": 0.0092, + "step": 687 + }, + { + "epoch": 0.1366570662429238, + "grad_norm": 0.8393587614699919, + "learning_rate": 9.705514527416885e-06, + "loss": 0.0167, + "step": 688 + }, + { + "epoch": 0.13685569569967226, + "grad_norm": 0.8895837213581118, + "learning_rate": 9.704425642344674e-06, + "loss": 0.017, + "step": 689 + }, + { + "epoch": 0.1370543251564207, + "grad_norm": 0.6798384157696477, + "learning_rate": 9.703334809174519e-06, + "loss": 0.0117, + "step": 690 + }, + { + "epoch": 0.13725295461316914, + "grad_norm": 0.7053098557106127, + "learning_rate": 9.70224202835813e-06, + "loss": 0.0131, + "step": 691 + }, + { + "epoch": 0.13745158406991756, + "grad_norm": 1.1725400512260025, + "learning_rate": 9.701147300348025e-06, + "loss": 0.0133, + "step": 692 + }, + { + "epoch": 0.137650213526666, + "grad_norm": 0.4166073254333175, + "learning_rate": 9.70005062559753e-06, + "loss": 0.0124, + "step": 693 + }, + { + "epoch": 0.13784884298341443, + "grad_norm": 0.5431686497693445, + "learning_rate": 9.69895200456078e-06, + "loss": 0.0079, + "step": 694 + }, + { + "epoch": 0.13804747244016288, + "grad_norm": 0.5724175575189289, + "learning_rate": 9.697851437692708e-06, + "loss": 0.0172, + "step": 695 + }, + { + "epoch": 0.1382461018969113, + "grad_norm": 1.243225503850031, + "learning_rate": 9.696748925449061e-06, + "loss": 0.0225, + "step": 696 + }, + { + "epoch": 0.13844473135365976, + "grad_norm": 0.5911282831149057, + "learning_rate": 9.69564446828639e-06, + "loss": 0.0167, + "step": 697 + }, + { + "epoch": 0.13864336081040818, + "grad_norm": 0.7689274476457225, + "learning_rate": 9.694538066662043e-06, + "loss": 0.0165, + "step": 698 + }, + { + "epoch": 0.13884199026715663, + "grad_norm": 0.9634756763078826, + "learning_rate": 9.693429721034186e-06, + "loss": 0.0205, + "step": 699 + }, + { + "epoch": 0.13904061972390505, + "grad_norm": 0.9777879206227595, + "learning_rate": 9.69231943186178e-06, + "loss": 0.0138, + "step": 700 + }, + { + "epoch": 0.1392392491806535, + "grad_norm": 0.5707455498722125, + "learning_rate": 9.691207199604599e-06, + "loss": 0.0191, + "step": 701 + }, + { + "epoch": 0.13943787863740192, + "grad_norm": 1.3726637583023185, + "learning_rate": 9.690093024723213e-06, + "loss": 0.0165, + "step": 702 + }, + { + "epoch": 0.13963650809415037, + "grad_norm": 1.1668665383346433, + "learning_rate": 9.688976907679001e-06, + "loss": 0.0174, + "step": 703 + }, + { + "epoch": 0.1398351375508988, + "grad_norm": 1.0967041976393477, + "learning_rate": 9.68785884893415e-06, + "loss": 0.0161, + "step": 704 + }, + { + "epoch": 0.14003376700764725, + "grad_norm": 0.3589397157463669, + "learning_rate": 9.686738848951642e-06, + "loss": 0.0226, + "step": 705 + }, + { + "epoch": 0.14023239646439567, + "grad_norm": 0.37386431956811017, + "learning_rate": 9.68561690819527e-06, + "loss": 0.012, + "step": 706 + }, + { + "epoch": 0.1404310259211441, + "grad_norm": 1.3981607309328512, + "learning_rate": 9.68449302712963e-06, + "loss": 0.0167, + "step": 707 + }, + { + "epoch": 0.14062965537789254, + "grad_norm": 1.1763685062897045, + "learning_rate": 9.683367206220118e-06, + "loss": 0.0224, + "step": 708 + }, + { + "epoch": 0.14082828483464097, + "grad_norm": 1.943944629520424, + "learning_rate": 9.682239445932937e-06, + "loss": 0.0272, + "step": 709 + }, + { + "epoch": 0.14102691429138942, + "grad_norm": 0.6898744285476524, + "learning_rate": 9.681109746735089e-06, + "loss": 0.0142, + "step": 710 + }, + { + "epoch": 0.14122554374813784, + "grad_norm": 0.5450517944671093, + "learning_rate": 9.679978109094383e-06, + "loss": 0.0169, + "step": 711 + }, + { + "epoch": 0.1414241732048863, + "grad_norm": 0.44279377105851336, + "learning_rate": 9.678844533479427e-06, + "loss": 0.013, + "step": 712 + }, + { + "epoch": 0.1416228026616347, + "grad_norm": 1.6342810458603632, + "learning_rate": 9.677709020359638e-06, + "loss": 0.0254, + "step": 713 + }, + { + "epoch": 0.14182143211838316, + "grad_norm": 2.0291941877579065, + "learning_rate": 9.676571570205227e-06, + "loss": 0.0119, + "step": 714 + }, + { + "epoch": 0.14202006157513158, + "grad_norm": 1.5157380559444824, + "learning_rate": 9.675432183487211e-06, + "loss": 0.0162, + "step": 715 + }, + { + "epoch": 0.14221869103188003, + "grad_norm": 1.0513407088044642, + "learning_rate": 9.67429086067741e-06, + "loss": 0.0202, + "step": 716 + }, + { + "epoch": 0.14241732048862846, + "grad_norm": 0.8011495579478646, + "learning_rate": 9.673147602248448e-06, + "loss": 0.0165, + "step": 717 + }, + { + "epoch": 0.1426159499453769, + "grad_norm": 0.41050875738974985, + "learning_rate": 9.67200240867374e-06, + "loss": 0.0145, + "step": 718 + }, + { + "epoch": 0.14281457940212533, + "grad_norm": 1.2640264053922001, + "learning_rate": 9.670855280427514e-06, + "loss": 0.0157, + "step": 719 + }, + { + "epoch": 0.14301320885887378, + "grad_norm": 0.7857274636818972, + "learning_rate": 9.669706217984793e-06, + "loss": 0.0105, + "step": 720 + }, + { + "epoch": 0.1432118383156222, + "grad_norm": 0.6636140885305981, + "learning_rate": 9.668555221821404e-06, + "loss": 0.0132, + "step": 721 + }, + { + "epoch": 0.14341046777237065, + "grad_norm": 0.6949080113239285, + "learning_rate": 9.667402292413975e-06, + "loss": 0.0131, + "step": 722 + }, + { + "epoch": 0.14360909722911908, + "grad_norm": 0.7348391840249473, + "learning_rate": 9.66624743023993e-06, + "loss": 0.0135, + "step": 723 + }, + { + "epoch": 0.14380772668586753, + "grad_norm": 0.515623939890902, + "learning_rate": 9.665090635777497e-06, + "loss": 0.0171, + "step": 724 + }, + { + "epoch": 0.14400635614261595, + "grad_norm": 0.8344063986031619, + "learning_rate": 9.663931909505702e-06, + "loss": 0.023, + "step": 725 + }, + { + "epoch": 0.1442049855993644, + "grad_norm": 1.6382892707257288, + "learning_rate": 9.662771251904375e-06, + "loss": 0.0214, + "step": 726 + }, + { + "epoch": 0.14440361505611282, + "grad_norm": 0.6588029713890065, + "learning_rate": 9.66160866345414e-06, + "loss": 0.0092, + "step": 727 + }, + { + "epoch": 0.14460224451286124, + "grad_norm": 0.5348463144307005, + "learning_rate": 9.660444144636429e-06, + "loss": 0.0084, + "step": 728 + }, + { + "epoch": 0.1448008739696097, + "grad_norm": 0.765885295893399, + "learning_rate": 9.659277695933462e-06, + "loss": 0.0175, + "step": 729 + }, + { + "epoch": 0.14499950342635812, + "grad_norm": 0.8497299979434756, + "learning_rate": 9.658109317828267e-06, + "loss": 0.019, + "step": 730 + }, + { + "epoch": 0.14519813288310657, + "grad_norm": 0.523328490867962, + "learning_rate": 9.656939010804672e-06, + "loss": 0.0216, + "step": 731 + }, + { + "epoch": 0.145396762339855, + "grad_norm": 0.6009169743511245, + "learning_rate": 9.655766775347292e-06, + "loss": 0.0125, + "step": 732 + }, + { + "epoch": 0.14559539179660344, + "grad_norm": 0.6220910948352019, + "learning_rate": 9.654592611941555e-06, + "loss": 0.014, + "step": 733 + }, + { + "epoch": 0.14579402125335186, + "grad_norm": 0.7443834281999502, + "learning_rate": 9.653416521073678e-06, + "loss": 0.0118, + "step": 734 + }, + { + "epoch": 0.1459926507101003, + "grad_norm": 0.3777328532342268, + "learning_rate": 9.65223850323068e-06, + "loss": 0.016, + "step": 735 + }, + { + "epoch": 0.14619128016684874, + "grad_norm": 0.36374649077029947, + "learning_rate": 9.651058558900375e-06, + "loss": 0.0134, + "step": 736 + }, + { + "epoch": 0.1463899096235972, + "grad_norm": 0.343215847483735, + "learning_rate": 9.64987668857138e-06, + "loss": 0.0175, + "step": 737 + }, + { + "epoch": 0.1465885390803456, + "grad_norm": 0.38715395916519696, + "learning_rate": 9.648692892733105e-06, + "loss": 0.0085, + "step": 738 + }, + { + "epoch": 0.14678716853709406, + "grad_norm": 0.27505352925401316, + "learning_rate": 9.647507171875758e-06, + "loss": 0.0064, + "step": 739 + }, + { + "epoch": 0.14698579799384248, + "grad_norm": 0.7665979379551244, + "learning_rate": 9.646319526490345e-06, + "loss": 0.0185, + "step": 740 + }, + { + "epoch": 0.14718442745059093, + "grad_norm": 0.4034771698141865, + "learning_rate": 9.64512995706867e-06, + "loss": 0.0114, + "step": 741 + }, + { + "epoch": 0.14738305690733935, + "grad_norm": 0.6233753352700463, + "learning_rate": 9.643938464103331e-06, + "loss": 0.0205, + "step": 742 + }, + { + "epoch": 0.1475816863640878, + "grad_norm": 0.6387978330419878, + "learning_rate": 9.642745048087724e-06, + "loss": 0.0122, + "step": 743 + }, + { + "epoch": 0.14778031582083623, + "grad_norm": 0.6530117233777584, + "learning_rate": 9.641549709516042e-06, + "loss": 0.0242, + "step": 744 + }, + { + "epoch": 0.14797894527758468, + "grad_norm": 0.6726282197782968, + "learning_rate": 9.640352448883273e-06, + "loss": 0.0207, + "step": 745 + }, + { + "epoch": 0.1481775747343331, + "grad_norm": 0.3645759942166644, + "learning_rate": 9.639153266685204e-06, + "loss": 0.0145, + "step": 746 + }, + { + "epoch": 0.14837620419108155, + "grad_norm": 0.4159658079352318, + "learning_rate": 9.63795216341841e-06, + "loss": 0.0167, + "step": 747 + }, + { + "epoch": 0.14857483364782997, + "grad_norm": 0.4271126375806547, + "learning_rate": 9.636749139580272e-06, + "loss": 0.0138, + "step": 748 + }, + { + "epoch": 0.1487734631045784, + "grad_norm": 0.659027659352402, + "learning_rate": 9.635544195668958e-06, + "loss": 0.0156, + "step": 749 + }, + { + "epoch": 0.14897209256132685, + "grad_norm": 0.42458273885035736, + "learning_rate": 9.634337332183435e-06, + "loss": 0.0193, + "step": 750 + }, + { + "epoch": 0.14917072201807527, + "grad_norm": 0.8691539370190741, + "learning_rate": 9.633128549623463e-06, + "loss": 0.0159, + "step": 751 + }, + { + "epoch": 0.14936935147482372, + "grad_norm": 0.8763737396333379, + "learning_rate": 9.6319178484896e-06, + "loss": 0.0174, + "step": 752 + }, + { + "epoch": 0.14956798093157214, + "grad_norm": 0.8359131444320231, + "learning_rate": 9.630705229283192e-06, + "loss": 0.0162, + "step": 753 + }, + { + "epoch": 0.1497666103883206, + "grad_norm": 0.3581978948784169, + "learning_rate": 9.629490692506386e-06, + "loss": 0.0171, + "step": 754 + }, + { + "epoch": 0.14996523984506901, + "grad_norm": 0.7735573422661927, + "learning_rate": 9.628274238662124e-06, + "loss": 0.0177, + "step": 755 + }, + { + "epoch": 0.15016386930181747, + "grad_norm": 0.6569012717678822, + "learning_rate": 9.627055868254131e-06, + "loss": 0.0165, + "step": 756 + }, + { + "epoch": 0.1503624987585659, + "grad_norm": 0.8802984939901024, + "learning_rate": 9.625835581786937e-06, + "loss": 0.0167, + "step": 757 + }, + { + "epoch": 0.15056112821531434, + "grad_norm": 0.42465561361686077, + "learning_rate": 9.624613379765863e-06, + "loss": 0.0112, + "step": 758 + }, + { + "epoch": 0.15075975767206276, + "grad_norm": 0.5587945392733407, + "learning_rate": 9.623389262697018e-06, + "loss": 0.0146, + "step": 759 + }, + { + "epoch": 0.1509583871288112, + "grad_norm": 0.6613542353383493, + "learning_rate": 9.62216323108731e-06, + "loss": 0.0091, + "step": 760 + }, + { + "epoch": 0.15115701658555963, + "grad_norm": 0.6745543946977648, + "learning_rate": 9.620935285444435e-06, + "loss": 0.0102, + "step": 761 + }, + { + "epoch": 0.15135564604230808, + "grad_norm": 1.0899874100297828, + "learning_rate": 9.619705426276887e-06, + "loss": 0.0101, + "step": 762 + }, + { + "epoch": 0.1515542754990565, + "grad_norm": 0.9734431138507607, + "learning_rate": 9.61847365409395e-06, + "loss": 0.0201, + "step": 763 + }, + { + "epoch": 0.15175290495580496, + "grad_norm": 0.6620377606505773, + "learning_rate": 9.617239969405696e-06, + "loss": 0.0108, + "step": 764 + }, + { + "epoch": 0.15195153441255338, + "grad_norm": 0.6543556889174846, + "learning_rate": 9.616004372722993e-06, + "loss": 0.0089, + "step": 765 + }, + { + "epoch": 0.15215016386930183, + "grad_norm": 1.1176456576818978, + "learning_rate": 9.614766864557505e-06, + "loss": 0.0149, + "step": 766 + }, + { + "epoch": 0.15234879332605025, + "grad_norm": 1.4536512494283313, + "learning_rate": 9.613527445421678e-06, + "loss": 0.0236, + "step": 767 + }, + { + "epoch": 0.1525474227827987, + "grad_norm": 0.9953239414430083, + "learning_rate": 9.612286115828757e-06, + "loss": 0.017, + "step": 768 + }, + { + "epoch": 0.15274605223954713, + "grad_norm": 0.9938930956416363, + "learning_rate": 9.611042876292774e-06, + "loss": 0.0167, + "step": 769 + }, + { + "epoch": 0.15294468169629555, + "grad_norm": 1.1047738551365187, + "learning_rate": 9.609797727328553e-06, + "loss": 0.0218, + "step": 770 + }, + { + "epoch": 0.153143311153044, + "grad_norm": 0.6023972990227837, + "learning_rate": 9.608550669451709e-06, + "loss": 0.006, + "step": 771 + }, + { + "epoch": 0.15334194060979242, + "grad_norm": 0.5085628473524939, + "learning_rate": 9.607301703178648e-06, + "loss": 0.0237, + "step": 772 + }, + { + "epoch": 0.15354057006654087, + "grad_norm": 0.7921029494776086, + "learning_rate": 9.606050829026568e-06, + "loss": 0.013, + "step": 773 + }, + { + "epoch": 0.1537391995232893, + "grad_norm": 0.3846454240514661, + "learning_rate": 9.604798047513449e-06, + "loss": 0.01, + "step": 774 + }, + { + "epoch": 0.15393782898003774, + "grad_norm": 0.8493324370898936, + "learning_rate": 9.603543359158071e-06, + "loss": 0.0206, + "step": 775 + }, + { + "epoch": 0.15413645843678617, + "grad_norm": 1.3234666263493877, + "learning_rate": 9.60228676448e-06, + "loss": 0.0173, + "step": 776 + }, + { + "epoch": 0.15433508789353462, + "grad_norm": 0.4104770870479379, + "learning_rate": 9.601028263999585e-06, + "loss": 0.019, + "step": 777 + }, + { + "epoch": 0.15453371735028304, + "grad_norm": 1.0206867249184952, + "learning_rate": 9.599767858237976e-06, + "loss": 0.0162, + "step": 778 + }, + { + "epoch": 0.1547323468070315, + "grad_norm": 0.6045402333325749, + "learning_rate": 9.598505547717103e-06, + "loss": 0.0171, + "step": 779 + }, + { + "epoch": 0.1549309762637799, + "grad_norm": 0.2885932547248781, + "learning_rate": 9.597241332959687e-06, + "loss": 0.0093, + "step": 780 + }, + { + "epoch": 0.15512960572052836, + "grad_norm": 1.0510497521650195, + "learning_rate": 9.59597521448924e-06, + "loss": 0.0136, + "step": 781 + }, + { + "epoch": 0.15532823517727679, + "grad_norm": 0.4857875665691372, + "learning_rate": 9.59470719283006e-06, + "loss": 0.0151, + "step": 782 + }, + { + "epoch": 0.15552686463402524, + "grad_norm": 0.21785943423231574, + "learning_rate": 9.59343726850723e-06, + "loss": 0.0094, + "step": 783 + }, + { + "epoch": 0.15572549409077366, + "grad_norm": 0.365806832334753, + "learning_rate": 9.592165442046628e-06, + "loss": 0.0141, + "step": 784 + }, + { + "epoch": 0.1559241235475221, + "grad_norm": 0.5929826587521397, + "learning_rate": 9.590891713974917e-06, + "loss": 0.0129, + "step": 785 + }, + { + "epoch": 0.15612275300427053, + "grad_norm": 0.7523823133800354, + "learning_rate": 9.589616084819542e-06, + "loss": 0.0219, + "step": 786 + }, + { + "epoch": 0.15632138246101898, + "grad_norm": 0.387882993262946, + "learning_rate": 9.588338555108744e-06, + "loss": 0.0209, + "step": 787 + }, + { + "epoch": 0.1565200119177674, + "grad_norm": 0.45441023735943464, + "learning_rate": 9.587059125371545e-06, + "loss": 0.0137, + "step": 788 + }, + { + "epoch": 0.15671864137451583, + "grad_norm": 0.7600639793039875, + "learning_rate": 9.585777796137756e-06, + "loss": 0.0114, + "step": 789 + }, + { + "epoch": 0.15691727083126428, + "grad_norm": 0.6247443320410846, + "learning_rate": 9.584494567937973e-06, + "loss": 0.0196, + "step": 790 + }, + { + "epoch": 0.1571159002880127, + "grad_norm": 0.2926748180028533, + "learning_rate": 9.58320944130358e-06, + "loss": 0.0067, + "step": 791 + }, + { + "epoch": 0.15731452974476115, + "grad_norm": 0.9470618869437955, + "learning_rate": 9.581922416766748e-06, + "loss": 0.0207, + "step": 792 + }, + { + "epoch": 0.15751315920150957, + "grad_norm": 0.7327000387245999, + "learning_rate": 9.580633494860432e-06, + "loss": 0.0185, + "step": 793 + }, + { + "epoch": 0.15771178865825802, + "grad_norm": 0.5905549844276268, + "learning_rate": 9.579342676118373e-06, + "loss": 0.015, + "step": 794 + }, + { + "epoch": 0.15791041811500645, + "grad_norm": 0.2330283466633709, + "learning_rate": 9.578049961075098e-06, + "loss": 0.0101, + "step": 795 + }, + { + "epoch": 0.1581090475717549, + "grad_norm": 1.4809497082727017, + "learning_rate": 9.576755350265918e-06, + "loss": 0.0228, + "step": 796 + }, + { + "epoch": 0.15830767702850332, + "grad_norm": 0.3008493915594875, + "learning_rate": 9.57545884422693e-06, + "loss": 0.0078, + "step": 797 + }, + { + "epoch": 0.15850630648525177, + "grad_norm": 0.7392324826529579, + "learning_rate": 9.574160443495017e-06, + "loss": 0.0201, + "step": 798 + }, + { + "epoch": 0.1587049359420002, + "grad_norm": 0.546883075423793, + "learning_rate": 9.572860148607846e-06, + "loss": 0.0173, + "step": 799 + }, + { + "epoch": 0.15890356539874864, + "grad_norm": 0.8293724216580041, + "learning_rate": 9.571557960103867e-06, + "loss": 0.0139, + "step": 800 + }, + { + "epoch": 0.15910219485549706, + "grad_norm": 0.9771544832714681, + "learning_rate": 9.570253878522314e-06, + "loss": 0.0123, + "step": 801 + }, + { + "epoch": 0.15930082431224551, + "grad_norm": 0.5950320778242356, + "learning_rate": 9.568947904403208e-06, + "loss": 0.0177, + "step": 802 + }, + { + "epoch": 0.15949945376899394, + "grad_norm": 1.2291334278846218, + "learning_rate": 9.567640038287349e-06, + "loss": 0.0215, + "step": 803 + }, + { + "epoch": 0.1596980832257424, + "grad_norm": 1.1112781913894665, + "learning_rate": 9.566330280716323e-06, + "loss": 0.0171, + "step": 804 + }, + { + "epoch": 0.1598967126824908, + "grad_norm": 0.8798985320697517, + "learning_rate": 9.565018632232502e-06, + "loss": 0.0245, + "step": 805 + }, + { + "epoch": 0.16009534213923926, + "grad_norm": 0.5515295292743605, + "learning_rate": 9.563705093379036e-06, + "loss": 0.0245, + "step": 806 + }, + { + "epoch": 0.16029397159598768, + "grad_norm": 0.6661943953386937, + "learning_rate": 9.562389664699863e-06, + "loss": 0.0159, + "step": 807 + }, + { + "epoch": 0.16049260105273613, + "grad_norm": 0.9464937907087646, + "learning_rate": 9.561072346739697e-06, + "loss": 0.0148, + "step": 808 + }, + { + "epoch": 0.16069123050948456, + "grad_norm": 0.39866831976662964, + "learning_rate": 9.55975314004404e-06, + "loss": 0.0177, + "step": 809 + }, + { + "epoch": 0.16088985996623298, + "grad_norm": 0.6113943733065699, + "learning_rate": 9.558432045159174e-06, + "loss": 0.0147, + "step": 810 + }, + { + "epoch": 0.16108848942298143, + "grad_norm": 0.6538607257959484, + "learning_rate": 9.557109062632164e-06, + "loss": 0.0226, + "step": 811 + }, + { + "epoch": 0.16128711887972985, + "grad_norm": 0.2911939937872243, + "learning_rate": 9.555784193010854e-06, + "loss": 0.0093, + "step": 812 + }, + { + "epoch": 0.1614857483364783, + "grad_norm": 0.43968491191440634, + "learning_rate": 9.554457436843872e-06, + "loss": 0.0148, + "step": 813 + }, + { + "epoch": 0.16168437779322672, + "grad_norm": 1.1183514039831923, + "learning_rate": 9.553128794680626e-06, + "loss": 0.017, + "step": 814 + }, + { + "epoch": 0.16188300724997517, + "grad_norm": 0.6035034096507715, + "learning_rate": 9.551798267071308e-06, + "loss": 0.0069, + "step": 815 + }, + { + "epoch": 0.1620816367067236, + "grad_norm": 0.6846614549208714, + "learning_rate": 9.550465854566884e-06, + "loss": 0.013, + "step": 816 + }, + { + "epoch": 0.16228026616347205, + "grad_norm": 0.5513653606442389, + "learning_rate": 9.549131557719106e-06, + "loss": 0.0137, + "step": 817 + }, + { + "epoch": 0.16247889562022047, + "grad_norm": 0.7496807116130924, + "learning_rate": 9.547795377080506e-06, + "loss": 0.0062, + "step": 818 + }, + { + "epoch": 0.16267752507696892, + "grad_norm": 1.1195510094425656, + "learning_rate": 9.546457313204395e-06, + "loss": 0.0186, + "step": 819 + }, + { + "epoch": 0.16287615453371734, + "grad_norm": 0.4918213514406738, + "learning_rate": 9.545117366644863e-06, + "loss": 0.0082, + "step": 820 + }, + { + "epoch": 0.1630747839904658, + "grad_norm": 0.6864874480930118, + "learning_rate": 9.543775537956781e-06, + "loss": 0.0165, + "step": 821 + }, + { + "epoch": 0.16327341344721422, + "grad_norm": 0.9104337265905692, + "learning_rate": 9.5424318276958e-06, + "loss": 0.0202, + "step": 822 + }, + { + "epoch": 0.16347204290396267, + "grad_norm": 1.3936891736313024, + "learning_rate": 9.541086236418348e-06, + "loss": 0.0139, + "step": 823 + }, + { + "epoch": 0.1636706723607111, + "grad_norm": 1.0710993378946527, + "learning_rate": 9.539738764681633e-06, + "loss": 0.0207, + "step": 824 + }, + { + "epoch": 0.16386930181745954, + "grad_norm": 0.7599724122468066, + "learning_rate": 9.538389413043641e-06, + "loss": 0.0194, + "step": 825 + }, + { + "epoch": 0.16406793127420796, + "grad_norm": 0.4138470932916712, + "learning_rate": 9.537038182063138e-06, + "loss": 0.009, + "step": 826 + }, + { + "epoch": 0.1642665607309564, + "grad_norm": 1.0081381826347446, + "learning_rate": 9.535685072299668e-06, + "loss": 0.0172, + "step": 827 + }, + { + "epoch": 0.16446519018770484, + "grad_norm": 1.0432367672352847, + "learning_rate": 9.53433008431355e-06, + "loss": 0.0161, + "step": 828 + }, + { + "epoch": 0.16466381964445329, + "grad_norm": 0.7812431690369702, + "learning_rate": 9.532973218665887e-06, + "loss": 0.0237, + "step": 829 + }, + { + "epoch": 0.1648624491012017, + "grad_norm": 0.4534298510315967, + "learning_rate": 9.531614475918552e-06, + "loss": 0.0138, + "step": 830 + }, + { + "epoch": 0.16506107855795013, + "grad_norm": 0.521000256028603, + "learning_rate": 9.530253856634202e-06, + "loss": 0.014, + "step": 831 + }, + { + "epoch": 0.16525970801469858, + "grad_norm": 0.4111263927903728, + "learning_rate": 9.528891361376265e-06, + "loss": 0.0102, + "step": 832 + }, + { + "epoch": 0.165458337471447, + "grad_norm": 0.6400161006473448, + "learning_rate": 9.527526990708952e-06, + "loss": 0.0135, + "step": 833 + }, + { + "epoch": 0.16565696692819545, + "grad_norm": 1.0992489588141927, + "learning_rate": 9.526160745197247e-06, + "loss": 0.022, + "step": 834 + }, + { + "epoch": 0.16585559638494388, + "grad_norm": 0.2568053298935382, + "learning_rate": 9.524792625406908e-06, + "loss": 0.0133, + "step": 835 + }, + { + "epoch": 0.16605422584169233, + "grad_norm": 0.6766945867655767, + "learning_rate": 9.523422631904473e-06, + "loss": 0.0104, + "step": 836 + }, + { + "epoch": 0.16625285529844075, + "grad_norm": 0.5619582062884672, + "learning_rate": 9.522050765257257e-06, + "loss": 0.0253, + "step": 837 + }, + { + "epoch": 0.1664514847551892, + "grad_norm": 0.4221715846394782, + "learning_rate": 9.52067702603335e-06, + "loss": 0.0151, + "step": 838 + }, + { + "epoch": 0.16665011421193762, + "grad_norm": 0.22009747488862705, + "learning_rate": 9.519301414801612e-06, + "loss": 0.0089, + "step": 839 + }, + { + "epoch": 0.16684874366868607, + "grad_norm": 0.6287660735362872, + "learning_rate": 9.517923932131685e-06, + "loss": 0.0216, + "step": 840 + }, + { + "epoch": 0.1670473731254345, + "grad_norm": 0.4170499590382144, + "learning_rate": 9.516544578593981e-06, + "loss": 0.0108, + "step": 841 + }, + { + "epoch": 0.16724600258218295, + "grad_norm": 0.9421745387040598, + "learning_rate": 9.51516335475969e-06, + "loss": 0.0177, + "step": 842 + }, + { + "epoch": 0.16744463203893137, + "grad_norm": 0.4757860318481506, + "learning_rate": 9.513780261200774e-06, + "loss": 0.011, + "step": 843 + }, + { + "epoch": 0.16764326149567982, + "grad_norm": 0.7429722294449872, + "learning_rate": 9.512395298489974e-06, + "loss": 0.0148, + "step": 844 + }, + { + "epoch": 0.16784189095242824, + "grad_norm": 0.5206174876646718, + "learning_rate": 9.511008467200798e-06, + "loss": 0.0156, + "step": 845 + }, + { + "epoch": 0.1680405204091767, + "grad_norm": 1.3998688162229813, + "learning_rate": 9.509619767907534e-06, + "loss": 0.0239, + "step": 846 + }, + { + "epoch": 0.16823914986592511, + "grad_norm": 0.8208188686925355, + "learning_rate": 9.508229201185242e-06, + "loss": 0.0135, + "step": 847 + }, + { + "epoch": 0.16843777932267356, + "grad_norm": 0.3774258541529112, + "learning_rate": 9.506836767609751e-06, + "loss": 0.014, + "step": 848 + }, + { + "epoch": 0.168636408779422, + "grad_norm": 0.668337123256141, + "learning_rate": 9.505442467757666e-06, + "loss": 0.013, + "step": 849 + }, + { + "epoch": 0.16883503823617044, + "grad_norm": 0.4690654187501145, + "learning_rate": 9.504046302206368e-06, + "loss": 0.0229, + "step": 850 + }, + { + "epoch": 0.16903366769291886, + "grad_norm": 0.9382417671995416, + "learning_rate": 9.50264827153401e-06, + "loss": 0.0201, + "step": 851 + }, + { + "epoch": 0.16923229714966728, + "grad_norm": 0.5705256377710812, + "learning_rate": 9.501248376319508e-06, + "loss": 0.0128, + "step": 852 + }, + { + "epoch": 0.16943092660641573, + "grad_norm": 0.48645671259102613, + "learning_rate": 9.499846617142563e-06, + "loss": 0.014, + "step": 853 + }, + { + "epoch": 0.16962955606316416, + "grad_norm": 1.4022653145077122, + "learning_rate": 9.498442994583639e-06, + "loss": 0.0174, + "step": 854 + }, + { + "epoch": 0.1698281855199126, + "grad_norm": 0.7638533431862135, + "learning_rate": 9.497037509223977e-06, + "loss": 0.0149, + "step": 855 + }, + { + "epoch": 0.17002681497666103, + "grad_norm": 1.519768782139612, + "learning_rate": 9.495630161645584e-06, + "loss": 0.0255, + "step": 856 + }, + { + "epoch": 0.17022544443340948, + "grad_norm": 0.44744869619282945, + "learning_rate": 9.494220952431243e-06, + "loss": 0.0132, + "step": 857 + }, + { + "epoch": 0.1704240738901579, + "grad_norm": 0.5073442816162103, + "learning_rate": 9.492809882164509e-06, + "loss": 0.0219, + "step": 858 + }, + { + "epoch": 0.17062270334690635, + "grad_norm": 0.6578886831791138, + "learning_rate": 9.491396951429698e-06, + "loss": 0.0118, + "step": 859 + }, + { + "epoch": 0.17082133280365477, + "grad_norm": 0.440174527734574, + "learning_rate": 9.48998216081191e-06, + "loss": 0.023, + "step": 860 + }, + { + "epoch": 0.17101996226040322, + "grad_norm": 1.2110068679815877, + "learning_rate": 9.488565510897006e-06, + "loss": 0.0213, + "step": 861 + }, + { + "epoch": 0.17121859171715165, + "grad_norm": 0.34225622494013835, + "learning_rate": 9.487147002271618e-06, + "loss": 0.0218, + "step": 862 + }, + { + "epoch": 0.1714172211739001, + "grad_norm": 0.9248969475348255, + "learning_rate": 9.48572663552315e-06, + "loss": 0.0172, + "step": 863 + }, + { + "epoch": 0.17161585063064852, + "grad_norm": 0.3111389224565366, + "learning_rate": 9.484304411239774e-06, + "loss": 0.0194, + "step": 864 + }, + { + "epoch": 0.17181448008739697, + "grad_norm": 0.5426022611346796, + "learning_rate": 9.482880330010434e-06, + "loss": 0.0177, + "step": 865 + }, + { + "epoch": 0.1720131095441454, + "grad_norm": 0.9989174638535595, + "learning_rate": 9.481454392424836e-06, + "loss": 0.0179, + "step": 866 + }, + { + "epoch": 0.17221173900089384, + "grad_norm": 1.0032440886756715, + "learning_rate": 9.480026599073463e-06, + "loss": 0.0214, + "step": 867 + }, + { + "epoch": 0.17241036845764227, + "grad_norm": 0.29351054118887016, + "learning_rate": 9.478596950547561e-06, + "loss": 0.0084, + "step": 868 + }, + { + "epoch": 0.17260899791439072, + "grad_norm": 0.5469059731359652, + "learning_rate": 9.477165447439148e-06, + "loss": 0.0228, + "step": 869 + }, + { + "epoch": 0.17280762737113914, + "grad_norm": 0.5151774165372478, + "learning_rate": 9.475732090341006e-06, + "loss": 0.017, + "step": 870 + }, + { + "epoch": 0.1730062568278876, + "grad_norm": 0.3849241059477399, + "learning_rate": 9.474296879846688e-06, + "loss": 0.0149, + "step": 871 + }, + { + "epoch": 0.173204886284636, + "grad_norm": 0.2846207510615562, + "learning_rate": 9.47285981655051e-06, + "loss": 0.0134, + "step": 872 + }, + { + "epoch": 0.17340351574138443, + "grad_norm": 0.3822669264577504, + "learning_rate": 9.471420901047564e-06, + "loss": 0.0126, + "step": 873 + }, + { + "epoch": 0.17360214519813288, + "grad_norm": 0.5809048365250173, + "learning_rate": 9.469980133933701e-06, + "loss": 0.0186, + "step": 874 + }, + { + "epoch": 0.1738007746548813, + "grad_norm": 0.8463872528550324, + "learning_rate": 9.46853751580554e-06, + "loss": 0.0195, + "step": 875 + }, + { + "epoch": 0.17399940411162976, + "grad_norm": 0.4607531534725536, + "learning_rate": 9.467093047260468e-06, + "loss": 0.0151, + "step": 876 + }, + { + "epoch": 0.17419803356837818, + "grad_norm": 1.1647174596925964, + "learning_rate": 9.465646728896641e-06, + "loss": 0.0138, + "step": 877 + }, + { + "epoch": 0.17439666302512663, + "grad_norm": 0.6812902706877884, + "learning_rate": 9.464198561312972e-06, + "loss": 0.0146, + "step": 878 + }, + { + "epoch": 0.17459529248187505, + "grad_norm": 1.118536537307821, + "learning_rate": 9.462748545109152e-06, + "loss": 0.0144, + "step": 879 + }, + { + "epoch": 0.1747939219386235, + "grad_norm": 0.6063904311802616, + "learning_rate": 9.461296680885628e-06, + "loss": 0.0073, + "step": 880 + }, + { + "epoch": 0.17499255139537193, + "grad_norm": 1.5335959139523947, + "learning_rate": 9.459842969243615e-06, + "loss": 0.0237, + "step": 881 + }, + { + "epoch": 0.17519118085212038, + "grad_norm": 0.4453751489154979, + "learning_rate": 9.458387410785096e-06, + "loss": 0.0129, + "step": 882 + }, + { + "epoch": 0.1753898103088688, + "grad_norm": 0.5312389194281991, + "learning_rate": 9.456930006112814e-06, + "loss": 0.0231, + "step": 883 + }, + { + "epoch": 0.17558843976561725, + "grad_norm": 0.5246562462701451, + "learning_rate": 9.45547075583028e-06, + "loss": 0.0141, + "step": 884 + }, + { + "epoch": 0.17578706922236567, + "grad_norm": 0.3711081470488199, + "learning_rate": 9.454009660541769e-06, + "loss": 0.0126, + "step": 885 + }, + { + "epoch": 0.17598569867911412, + "grad_norm": 0.4214526210893554, + "learning_rate": 9.452546720852317e-06, + "loss": 0.0098, + "step": 886 + }, + { + "epoch": 0.17618432813586254, + "grad_norm": 0.730638217900042, + "learning_rate": 9.451081937367725e-06, + "loss": 0.0155, + "step": 887 + }, + { + "epoch": 0.176382957592611, + "grad_norm": 0.5304446182299737, + "learning_rate": 9.449615310694563e-06, + "loss": 0.0158, + "step": 888 + }, + { + "epoch": 0.17658158704935942, + "grad_norm": 0.4787768591240991, + "learning_rate": 9.448146841440156e-06, + "loss": 0.011, + "step": 889 + }, + { + "epoch": 0.17678021650610787, + "grad_norm": 0.6573768696874656, + "learning_rate": 9.446676530212596e-06, + "loss": 0.0121, + "step": 890 + }, + { + "epoch": 0.1769788459628563, + "grad_norm": 0.8738613272343506, + "learning_rate": 9.445204377620739e-06, + "loss": 0.0218, + "step": 891 + }, + { + "epoch": 0.17717747541960474, + "grad_norm": 0.6468029328396412, + "learning_rate": 9.443730384274199e-06, + "loss": 0.0126, + "step": 892 + }, + { + "epoch": 0.17737610487635316, + "grad_norm": 0.6916082592292682, + "learning_rate": 9.442254550783357e-06, + "loss": 0.0165, + "step": 893 + }, + { + "epoch": 0.17757473433310159, + "grad_norm": 0.4406671563052852, + "learning_rate": 9.440776877759354e-06, + "loss": 0.0115, + "step": 894 + }, + { + "epoch": 0.17777336378985004, + "grad_norm": 1.0537465676266842, + "learning_rate": 9.439297365814095e-06, + "loss": 0.0174, + "step": 895 + }, + { + "epoch": 0.17797199324659846, + "grad_norm": 1.2213613739765894, + "learning_rate": 9.437816015560241e-06, + "loss": 0.0183, + "step": 896 + }, + { + "epoch": 0.1781706227033469, + "grad_norm": 0.4783500290161957, + "learning_rate": 9.43633282761122e-06, + "loss": 0.0092, + "step": 897 + }, + { + "epoch": 0.17836925216009533, + "grad_norm": 0.43432963042155853, + "learning_rate": 9.434847802581216e-06, + "loss": 0.0203, + "step": 898 + }, + { + "epoch": 0.17856788161684378, + "grad_norm": 0.7897317815340374, + "learning_rate": 9.43336094108518e-06, + "loss": 0.0108, + "step": 899 + }, + { + "epoch": 0.1787665110735922, + "grad_norm": 0.3008681013432827, + "learning_rate": 9.431872243738817e-06, + "loss": 0.0137, + "step": 900 + }, + { + "epoch": 0.17896514053034066, + "grad_norm": 0.6771184704989937, + "learning_rate": 9.430381711158597e-06, + "loss": 0.0262, + "step": 901 + }, + { + "epoch": 0.17916376998708908, + "grad_norm": 0.4029134503869768, + "learning_rate": 9.428889343961745e-06, + "loss": 0.0082, + "step": 902 + }, + { + "epoch": 0.17936239944383753, + "grad_norm": 0.6499915119874066, + "learning_rate": 9.427395142766253e-06, + "loss": 0.0104, + "step": 903 + }, + { + "epoch": 0.17956102890058595, + "grad_norm": 1.2873816596565066, + "learning_rate": 9.425899108190866e-06, + "loss": 0.014, + "step": 904 + }, + { + "epoch": 0.1797596583573344, + "grad_norm": 0.917742932464249, + "learning_rate": 9.42440124085509e-06, + "loss": 0.0098, + "step": 905 + }, + { + "epoch": 0.17995828781408282, + "grad_norm": 0.6917074788463903, + "learning_rate": 9.42290154137919e-06, + "loss": 0.0111, + "step": 906 + }, + { + "epoch": 0.18015691727083127, + "grad_norm": 0.4982836344620947, + "learning_rate": 9.421400010384191e-06, + "loss": 0.0145, + "step": 907 + }, + { + "epoch": 0.1803555467275797, + "grad_norm": 1.086400992320905, + "learning_rate": 9.419896648491875e-06, + "loss": 0.0174, + "step": 908 + }, + { + "epoch": 0.18055417618432815, + "grad_norm": 0.8193422731728996, + "learning_rate": 9.418391456324785e-06, + "loss": 0.0105, + "step": 909 + }, + { + "epoch": 0.18075280564107657, + "grad_norm": 0.7103752650742982, + "learning_rate": 9.416884434506217e-06, + "loss": 0.0139, + "step": 910 + }, + { + "epoch": 0.18095143509782502, + "grad_norm": 0.6962334695928468, + "learning_rate": 9.415375583660227e-06, + "loss": 0.0128, + "step": 911 + }, + { + "epoch": 0.18115006455457344, + "grad_norm": 0.6806563393000292, + "learning_rate": 9.41386490441163e-06, + "loss": 0.0151, + "step": 912 + }, + { + "epoch": 0.1813486940113219, + "grad_norm": 0.477963378101734, + "learning_rate": 9.412352397385997e-06, + "loss": 0.0156, + "step": 913 + }, + { + "epoch": 0.18154732346807032, + "grad_norm": 1.377024847688744, + "learning_rate": 9.410838063209653e-06, + "loss": 0.0137, + "step": 914 + }, + { + "epoch": 0.18174595292481874, + "grad_norm": 0.840278914999452, + "learning_rate": 9.409321902509686e-06, + "loss": 0.0179, + "step": 915 + }, + { + "epoch": 0.1819445823815672, + "grad_norm": 1.2080657080079076, + "learning_rate": 9.407803915913934e-06, + "loss": 0.0142, + "step": 916 + }, + { + "epoch": 0.1821432118383156, + "grad_norm": 0.6470778636335309, + "learning_rate": 9.406284104050994e-06, + "loss": 0.0153, + "step": 917 + }, + { + "epoch": 0.18234184129506406, + "grad_norm": 0.6448817205204879, + "learning_rate": 9.40476246755022e-06, + "loss": 0.0147, + "step": 918 + }, + { + "epoch": 0.18254047075181248, + "grad_norm": 0.8677307803730453, + "learning_rate": 9.403239007041719e-06, + "loss": 0.0224, + "step": 919 + }, + { + "epoch": 0.18273910020856093, + "grad_norm": 1.0883235135924039, + "learning_rate": 9.401713723156355e-06, + "loss": 0.0156, + "step": 920 + }, + { + "epoch": 0.18293772966530936, + "grad_norm": 1.378109947347798, + "learning_rate": 9.400186616525747e-06, + "loss": 0.0214, + "step": 921 + }, + { + "epoch": 0.1831363591220578, + "grad_norm": 0.45828575967420676, + "learning_rate": 9.398657687782264e-06, + "loss": 0.0143, + "step": 922 + }, + { + "epoch": 0.18333498857880623, + "grad_norm": 0.4067470046392445, + "learning_rate": 9.397126937559041e-06, + "loss": 0.0106, + "step": 923 + }, + { + "epoch": 0.18353361803555468, + "grad_norm": 0.8655229021701957, + "learning_rate": 9.395594366489956e-06, + "loss": 0.0164, + "step": 924 + }, + { + "epoch": 0.1837322474923031, + "grad_norm": 0.6544805313594291, + "learning_rate": 9.394059975209644e-06, + "loss": 0.0161, + "step": 925 + }, + { + "epoch": 0.18393087694905155, + "grad_norm": 0.5260990122458291, + "learning_rate": 9.392523764353497e-06, + "loss": 0.0121, + "step": 926 + }, + { + "epoch": 0.18412950640579998, + "grad_norm": 0.49273647924970304, + "learning_rate": 9.390985734557659e-06, + "loss": 0.0137, + "step": 927 + }, + { + "epoch": 0.18432813586254843, + "grad_norm": 0.5039646852586156, + "learning_rate": 9.389445886459026e-06, + "loss": 0.0118, + "step": 928 + }, + { + "epoch": 0.18452676531929685, + "grad_norm": 0.6159677479137697, + "learning_rate": 9.387904220695245e-06, + "loss": 0.0096, + "step": 929 + }, + { + "epoch": 0.1847253947760453, + "grad_norm": 0.4398053369763513, + "learning_rate": 9.386360737904722e-06, + "loss": 0.0082, + "step": 930 + }, + { + "epoch": 0.18492402423279372, + "grad_norm": 0.5382584798731853, + "learning_rate": 9.384815438726608e-06, + "loss": 0.0143, + "step": 931 + }, + { + "epoch": 0.18512265368954217, + "grad_norm": 0.7022731175340937, + "learning_rate": 9.383268323800815e-06, + "loss": 0.0231, + "step": 932 + }, + { + "epoch": 0.1853212831462906, + "grad_norm": 0.45625159282982203, + "learning_rate": 9.381719393767998e-06, + "loss": 0.018, + "step": 933 + }, + { + "epoch": 0.18551991260303904, + "grad_norm": 0.5960828207873747, + "learning_rate": 9.380168649269566e-06, + "loss": 0.0147, + "step": 934 + }, + { + "epoch": 0.18571854205978747, + "grad_norm": 0.8282241450571749, + "learning_rate": 9.378616090947685e-06, + "loss": 0.0215, + "step": 935 + }, + { + "epoch": 0.1859171715165359, + "grad_norm": 0.4426456726841343, + "learning_rate": 9.377061719445264e-06, + "loss": 0.0229, + "step": 936 + }, + { + "epoch": 0.18611580097328434, + "grad_norm": 0.5920397094573076, + "learning_rate": 9.375505535405969e-06, + "loss": 0.0173, + "step": 937 + }, + { + "epoch": 0.18631443043003276, + "grad_norm": 0.2981282792836881, + "learning_rate": 9.373947539474212e-06, + "loss": 0.0112, + "step": 938 + }, + { + "epoch": 0.1865130598867812, + "grad_norm": 0.7856812810221333, + "learning_rate": 9.372387732295162e-06, + "loss": 0.0119, + "step": 939 + }, + { + "epoch": 0.18671168934352964, + "grad_norm": 0.7020478883034995, + "learning_rate": 9.370826114514729e-06, + "loss": 0.0138, + "step": 940 + }, + { + "epoch": 0.18691031880027809, + "grad_norm": 0.3879630472447985, + "learning_rate": 9.369262686779578e-06, + "loss": 0.0098, + "step": 941 + }, + { + "epoch": 0.1871089482570265, + "grad_norm": 0.26290487954249336, + "learning_rate": 9.367697449737126e-06, + "loss": 0.0086, + "step": 942 + }, + { + "epoch": 0.18730757771377496, + "grad_norm": 0.8158600730845823, + "learning_rate": 9.366130404035533e-06, + "loss": 0.0127, + "step": 943 + }, + { + "epoch": 0.18750620717052338, + "grad_norm": 0.43011489476490694, + "learning_rate": 9.364561550323711e-06, + "loss": 0.0138, + "step": 944 + }, + { + "epoch": 0.18770483662727183, + "grad_norm": 0.6422387414139117, + "learning_rate": 9.362990889251325e-06, + "loss": 0.0183, + "step": 945 + }, + { + "epoch": 0.18790346608402025, + "grad_norm": 0.8940322891219529, + "learning_rate": 9.361418421468777e-06, + "loss": 0.0187, + "step": 946 + }, + { + "epoch": 0.1881020955407687, + "grad_norm": 1.4463527888545427, + "learning_rate": 9.359844147627231e-06, + "loss": 0.0214, + "step": 947 + }, + { + "epoch": 0.18830072499751713, + "grad_norm": 0.4102746620553028, + "learning_rate": 9.358268068378589e-06, + "loss": 0.0085, + "step": 948 + }, + { + "epoch": 0.18849935445426558, + "grad_norm": 0.6618029175469268, + "learning_rate": 9.356690184375504e-06, + "loss": 0.0168, + "step": 949 + }, + { + "epoch": 0.188697983911014, + "grad_norm": 0.5539961291378328, + "learning_rate": 9.355110496271376e-06, + "loss": 0.0118, + "step": 950 + }, + { + "epoch": 0.18889661336776245, + "grad_norm": 0.8598663158866856, + "learning_rate": 9.353529004720354e-06, + "loss": 0.0119, + "step": 951 + }, + { + "epoch": 0.18909524282451087, + "grad_norm": 0.5890081936780608, + "learning_rate": 9.35194571037733e-06, + "loss": 0.0199, + "step": 952 + }, + { + "epoch": 0.18929387228125932, + "grad_norm": 1.2607952878561444, + "learning_rate": 9.350360613897945e-06, + "loss": 0.021, + "step": 953 + }, + { + "epoch": 0.18949250173800775, + "grad_norm": 0.5805466556115377, + "learning_rate": 9.348773715938587e-06, + "loss": 0.0103, + "step": 954 + }, + { + "epoch": 0.18969113119475617, + "grad_norm": 0.6262277582379767, + "learning_rate": 9.347185017156388e-06, + "loss": 0.014, + "step": 955 + }, + { + "epoch": 0.18988976065150462, + "grad_norm": 0.8635643634403196, + "learning_rate": 9.345594518209227e-06, + "loss": 0.02, + "step": 956 + }, + { + "epoch": 0.19008839010825304, + "grad_norm": 0.4163572607700282, + "learning_rate": 9.344002219755728e-06, + "loss": 0.0099, + "step": 957 + }, + { + "epoch": 0.1902870195650015, + "grad_norm": 0.5499649933024405, + "learning_rate": 9.34240812245526e-06, + "loss": 0.0123, + "step": 958 + }, + { + "epoch": 0.19048564902174991, + "grad_norm": 0.4621935969511572, + "learning_rate": 9.340812226967936e-06, + "loss": 0.0141, + "step": 959 + }, + { + "epoch": 0.19068427847849836, + "grad_norm": 0.6322621882728303, + "learning_rate": 9.339214533954618e-06, + "loss": 0.0138, + "step": 960 + }, + { + "epoch": 0.1908829079352468, + "grad_norm": 0.5315385506545031, + "learning_rate": 9.337615044076906e-06, + "loss": 0.0156, + "step": 961 + }, + { + "epoch": 0.19108153739199524, + "grad_norm": 0.8395315780253293, + "learning_rate": 9.336013757997147e-06, + "loss": 0.014, + "step": 962 + }, + { + "epoch": 0.19128016684874366, + "grad_norm": 0.515182257587864, + "learning_rate": 9.334410676378433e-06, + "loss": 0.0151, + "step": 963 + }, + { + "epoch": 0.1914787963054921, + "grad_norm": 0.7503607628649587, + "learning_rate": 9.3328057998846e-06, + "loss": 0.0181, + "step": 964 + }, + { + "epoch": 0.19167742576224053, + "grad_norm": 0.6292960608616276, + "learning_rate": 9.331199129180224e-06, + "loss": 0.0256, + "step": 965 + }, + { + "epoch": 0.19187605521898898, + "grad_norm": 0.657172738095144, + "learning_rate": 9.329590664930625e-06, + "loss": 0.0131, + "step": 966 + }, + { + "epoch": 0.1920746846757374, + "grad_norm": 0.6758585625636307, + "learning_rate": 9.32798040780187e-06, + "loss": 0.0233, + "step": 967 + }, + { + "epoch": 0.19227331413248586, + "grad_norm": 1.138944554532619, + "learning_rate": 9.326368358460757e-06, + "loss": 0.0166, + "step": 968 + }, + { + "epoch": 0.19247194358923428, + "grad_norm": 0.9844802183689632, + "learning_rate": 9.324754517574844e-06, + "loss": 0.0147, + "step": 969 + }, + { + "epoch": 0.19267057304598273, + "grad_norm": 0.36727739091125694, + "learning_rate": 9.323138885812416e-06, + "loss": 0.0157, + "step": 970 + }, + { + "epoch": 0.19286920250273115, + "grad_norm": 0.39266585464073445, + "learning_rate": 9.3215214638425e-06, + "loss": 0.0147, + "step": 971 + }, + { + "epoch": 0.1930678319594796, + "grad_norm": 0.694569390497426, + "learning_rate": 9.319902252334878e-06, + "loss": 0.0132, + "step": 972 + }, + { + "epoch": 0.19326646141622802, + "grad_norm": 0.8122752981584541, + "learning_rate": 9.318281251960059e-06, + "loss": 0.0152, + "step": 973 + }, + { + "epoch": 0.19346509087297648, + "grad_norm": 1.356751012776408, + "learning_rate": 9.316658463389296e-06, + "loss": 0.0159, + "step": 974 + }, + { + "epoch": 0.1936637203297249, + "grad_norm": 0.4514146325071209, + "learning_rate": 9.315033887294588e-06, + "loss": 0.0085, + "step": 975 + }, + { + "epoch": 0.19386234978647332, + "grad_norm": 0.4492629322519748, + "learning_rate": 9.313407524348667e-06, + "loss": 0.0121, + "step": 976 + }, + { + "epoch": 0.19406097924322177, + "grad_norm": 0.42766136851441033, + "learning_rate": 9.311779375225012e-06, + "loss": 0.0113, + "step": 977 + }, + { + "epoch": 0.1942596086999702, + "grad_norm": 0.3782072201329815, + "learning_rate": 9.310149440597833e-06, + "loss": 0.006, + "step": 978 + }, + { + "epoch": 0.19445823815671864, + "grad_norm": 1.3010134952460966, + "learning_rate": 9.308517721142088e-06, + "loss": 0.0219, + "step": 979 + }, + { + "epoch": 0.19465686761346707, + "grad_norm": 1.7188646382221526, + "learning_rate": 9.30688421753347e-06, + "loss": 0.021, + "step": 980 + }, + { + "epoch": 0.19485549707021552, + "grad_norm": 1.113751111511706, + "learning_rate": 9.30524893044841e-06, + "loss": 0.0213, + "step": 981 + }, + { + "epoch": 0.19505412652696394, + "grad_norm": 0.33969922760265947, + "learning_rate": 9.303611860564079e-06, + "loss": 0.0088, + "step": 982 + }, + { + "epoch": 0.1952527559837124, + "grad_norm": 0.6797604296474096, + "learning_rate": 9.301973008558387e-06, + "loss": 0.0187, + "step": 983 + }, + { + "epoch": 0.1954513854404608, + "grad_norm": 0.7594684777223187, + "learning_rate": 9.30033237510998e-06, + "loss": 0.0195, + "step": 984 + }, + { + "epoch": 0.19565001489720926, + "grad_norm": 1.3637727775160045, + "learning_rate": 9.298689960898242e-06, + "loss": 0.0126, + "step": 985 + }, + { + "epoch": 0.19584864435395768, + "grad_norm": 1.0630474301871424, + "learning_rate": 9.297045766603297e-06, + "loss": 0.0141, + "step": 986 + }, + { + "epoch": 0.19604727381070614, + "grad_norm": 0.6542337994520813, + "learning_rate": 9.295399792906002e-06, + "loss": 0.021, + "step": 987 + }, + { + "epoch": 0.19624590326745456, + "grad_norm": 0.5665306835890809, + "learning_rate": 9.293752040487956e-06, + "loss": 0.0142, + "step": 988 + }, + { + "epoch": 0.196444532724203, + "grad_norm": 0.7553552246755394, + "learning_rate": 9.292102510031488e-06, + "loss": 0.0132, + "step": 989 + }, + { + "epoch": 0.19664316218095143, + "grad_norm": 1.470905402981988, + "learning_rate": 9.29045120221967e-06, + "loss": 0.0256, + "step": 990 + }, + { + "epoch": 0.19684179163769988, + "grad_norm": 0.5202547467174007, + "learning_rate": 9.288798117736307e-06, + "loss": 0.0128, + "step": 991 + }, + { + "epoch": 0.1970404210944483, + "grad_norm": 0.48524424654365406, + "learning_rate": 9.287143257265936e-06, + "loss": 0.0135, + "step": 992 + }, + { + "epoch": 0.19723905055119675, + "grad_norm": 0.6718955314932935, + "learning_rate": 9.285486621493836e-06, + "loss": 0.0125, + "step": 993 + }, + { + "epoch": 0.19743768000794518, + "grad_norm": 0.8034984592157217, + "learning_rate": 9.283828211106019e-06, + "loss": 0.0199, + "step": 994 + }, + { + "epoch": 0.19763630946469363, + "grad_norm": 0.5390976895123648, + "learning_rate": 9.28216802678923e-06, + "loss": 0.0131, + "step": 995 + }, + { + "epoch": 0.19783493892144205, + "grad_norm": 0.6418655391021179, + "learning_rate": 9.280506069230945e-06, + "loss": 0.0139, + "step": 996 + }, + { + "epoch": 0.19803356837819047, + "grad_norm": 0.6491724713923708, + "learning_rate": 9.278842339119388e-06, + "loss": 0.0159, + "step": 997 + }, + { + "epoch": 0.19823219783493892, + "grad_norm": 0.40209317799866046, + "learning_rate": 9.277176837143501e-06, + "loss": 0.0074, + "step": 998 + }, + { + "epoch": 0.19843082729168735, + "grad_norm": 0.5920855302131218, + "learning_rate": 9.27550956399297e-06, + "loss": 0.023, + "step": 999 + }, + { + "epoch": 0.1986294567484358, + "grad_norm": 0.37861800255788186, + "learning_rate": 9.27384052035821e-06, + "loss": 0.0131, + "step": 1000 + }, + { + "epoch": 0.19882808620518422, + "grad_norm": 0.4811422211153491, + "learning_rate": 9.27216970693037e-06, + "loss": 0.0152, + "step": 1001 + }, + { + "epoch": 0.19902671566193267, + "grad_norm": 0.6166117246588404, + "learning_rate": 9.270497124401332e-06, + "loss": 0.0159, + "step": 1002 + }, + { + "epoch": 0.1992253451186811, + "grad_norm": 0.6776544917793748, + "learning_rate": 9.268822773463715e-06, + "loss": 0.014, + "step": 1003 + }, + { + "epoch": 0.19942397457542954, + "grad_norm": 0.3906605633332408, + "learning_rate": 9.267146654810859e-06, + "loss": 0.0134, + "step": 1004 + }, + { + "epoch": 0.19962260403217796, + "grad_norm": 0.3221183282074231, + "learning_rate": 9.265468769136847e-06, + "loss": 0.0117, + "step": 1005 + }, + { + "epoch": 0.19982123348892641, + "grad_norm": 0.3942134464598503, + "learning_rate": 9.26378911713649e-06, + "loss": 0.0114, + "step": 1006 + }, + { + "epoch": 0.20001986294567484, + "grad_norm": 0.875008215072221, + "learning_rate": 9.262107699505329e-06, + "loss": 0.0226, + "step": 1007 + }, + { + "epoch": 0.2002184924024233, + "grad_norm": 0.3797025965662609, + "learning_rate": 9.260424516939636e-06, + "loss": 0.0122, + "step": 1008 + }, + { + "epoch": 0.2004171218591717, + "grad_norm": 0.7343719704793019, + "learning_rate": 9.25873957013642e-06, + "loss": 0.0145, + "step": 1009 + }, + { + "epoch": 0.20061575131592016, + "grad_norm": 0.536506479455833, + "learning_rate": 9.257052859793412e-06, + "loss": 0.0143, + "step": 1010 + }, + { + "epoch": 0.20081438077266858, + "grad_norm": 0.5622360593336543, + "learning_rate": 9.255364386609077e-06, + "loss": 0.0147, + "step": 1011 + }, + { + "epoch": 0.20101301022941703, + "grad_norm": 0.4163313975119902, + "learning_rate": 9.253674151282612e-06, + "loss": 0.0127, + "step": 1012 + }, + { + "epoch": 0.20121163968616546, + "grad_norm": 0.5537476076401138, + "learning_rate": 9.25198215451394e-06, + "loss": 0.0142, + "step": 1013 + }, + { + "epoch": 0.2014102691429139, + "grad_norm": 0.6048080720095486, + "learning_rate": 9.250288397003715e-06, + "loss": 0.0128, + "step": 1014 + }, + { + "epoch": 0.20160889859966233, + "grad_norm": 0.9243481861891079, + "learning_rate": 9.248592879453323e-06, + "loss": 0.024, + "step": 1015 + }, + { + "epoch": 0.20180752805641078, + "grad_norm": 0.3405567732176348, + "learning_rate": 9.246895602564874e-06, + "loss": 0.0085, + "step": 1016 + }, + { + "epoch": 0.2020061575131592, + "grad_norm": 0.8790274897891412, + "learning_rate": 9.245196567041207e-06, + "loss": 0.0128, + "step": 1017 + }, + { + "epoch": 0.20220478696990762, + "grad_norm": 0.6058890979082735, + "learning_rate": 9.243495773585896e-06, + "loss": 0.0165, + "step": 1018 + }, + { + "epoch": 0.20240341642665607, + "grad_norm": 0.3990747368828865, + "learning_rate": 9.241793222903233e-06, + "loss": 0.0131, + "step": 1019 + }, + { + "epoch": 0.2026020458834045, + "grad_norm": 0.4972091523991518, + "learning_rate": 9.240088915698243e-06, + "loss": 0.0152, + "step": 1020 + }, + { + "epoch": 0.20280067534015295, + "grad_norm": 0.9145107437455913, + "learning_rate": 9.238382852676679e-06, + "loss": 0.013, + "step": 1021 + }, + { + "epoch": 0.20299930479690137, + "grad_norm": 0.7916389183212782, + "learning_rate": 9.236675034545022e-06, + "loss": 0.0158, + "step": 1022 + }, + { + "epoch": 0.20319793425364982, + "grad_norm": 1.312567937111962, + "learning_rate": 9.234965462010475e-06, + "loss": 0.0235, + "step": 1023 + }, + { + "epoch": 0.20339656371039824, + "grad_norm": 0.7879443556012908, + "learning_rate": 9.233254135780973e-06, + "loss": 0.0134, + "step": 1024 + }, + { + "epoch": 0.2035951931671467, + "grad_norm": 0.7144955186930526, + "learning_rate": 9.23154105656517e-06, + "loss": 0.0185, + "step": 1025 + }, + { + "epoch": 0.20379382262389512, + "grad_norm": 0.8772618199422003, + "learning_rate": 9.229826225072455e-06, + "loss": 0.0115, + "step": 1026 + }, + { + "epoch": 0.20399245208064357, + "grad_norm": 0.3508519745307009, + "learning_rate": 9.228109642012934e-06, + "loss": 0.0161, + "step": 1027 + }, + { + "epoch": 0.204191081537392, + "grad_norm": 1.142282664161455, + "learning_rate": 9.226391308097446e-06, + "loss": 0.0219, + "step": 1028 + }, + { + "epoch": 0.20438971099414044, + "grad_norm": 0.6781354611906164, + "learning_rate": 9.22467122403755e-06, + "loss": 0.0236, + "step": 1029 + }, + { + "epoch": 0.20458834045088886, + "grad_norm": 0.7331441338639288, + "learning_rate": 9.22294939054553e-06, + "loss": 0.0144, + "step": 1030 + }, + { + "epoch": 0.2047869699076373, + "grad_norm": 0.5178653186934357, + "learning_rate": 9.221225808334396e-06, + "loss": 0.0142, + "step": 1031 + }, + { + "epoch": 0.20498559936438573, + "grad_norm": 0.2608341992646028, + "learning_rate": 9.219500478117883e-06, + "loss": 0.0155, + "step": 1032 + }, + { + "epoch": 0.20518422882113418, + "grad_norm": 1.2106616895089297, + "learning_rate": 9.217773400610447e-06, + "loss": 0.0175, + "step": 1033 + }, + { + "epoch": 0.2053828582778826, + "grad_norm": 0.28032336350734227, + "learning_rate": 9.21604457652727e-06, + "loss": 0.0128, + "step": 1034 + }, + { + "epoch": 0.20558148773463106, + "grad_norm": 0.6118204505736901, + "learning_rate": 9.214314006584256e-06, + "loss": 0.0181, + "step": 1035 + }, + { + "epoch": 0.20578011719137948, + "grad_norm": 0.579822360863569, + "learning_rate": 9.21258169149803e-06, + "loss": 0.0183, + "step": 1036 + }, + { + "epoch": 0.20597874664812793, + "grad_norm": 0.4047362585138213, + "learning_rate": 9.210847631985946e-06, + "loss": 0.0095, + "step": 1037 + }, + { + "epoch": 0.20617737610487635, + "grad_norm": 0.9852053076273041, + "learning_rate": 9.209111828766075e-06, + "loss": 0.0219, + "step": 1038 + }, + { + "epoch": 0.20637600556162478, + "grad_norm": 0.31297991161612315, + "learning_rate": 9.207374282557211e-06, + "loss": 0.0117, + "step": 1039 + }, + { + "epoch": 0.20657463501837323, + "grad_norm": 0.4164929500440738, + "learning_rate": 9.20563499407887e-06, + "loss": 0.0077, + "step": 1040 + }, + { + "epoch": 0.20677326447512165, + "grad_norm": 0.3180467674371223, + "learning_rate": 9.203893964051287e-06, + "loss": 0.011, + "step": 1041 + }, + { + "epoch": 0.2069718939318701, + "grad_norm": 1.0524726224029894, + "learning_rate": 9.202151193195426e-06, + "loss": 0.0217, + "step": 1042 + }, + { + "epoch": 0.20717052338861852, + "grad_norm": 0.8350304497147369, + "learning_rate": 9.200406682232962e-06, + "loss": 0.0206, + "step": 1043 + }, + { + "epoch": 0.20736915284536697, + "grad_norm": 0.5099630567366636, + "learning_rate": 9.198660431886299e-06, + "loss": 0.0114, + "step": 1044 + }, + { + "epoch": 0.2075677823021154, + "grad_norm": 0.45140417770852326, + "learning_rate": 9.196912442878555e-06, + "loss": 0.0187, + "step": 1045 + }, + { + "epoch": 0.20776641175886384, + "grad_norm": 0.3877058148364568, + "learning_rate": 9.195162715933573e-06, + "loss": 0.0146, + "step": 1046 + }, + { + "epoch": 0.20796504121561227, + "grad_norm": 0.5643704108621737, + "learning_rate": 9.19341125177591e-06, + "loss": 0.0204, + "step": 1047 + }, + { + "epoch": 0.20816367067236072, + "grad_norm": 1.1260233639536537, + "learning_rate": 9.191658051130845e-06, + "loss": 0.0176, + "step": 1048 + }, + { + "epoch": 0.20836230012910914, + "grad_norm": 0.8511847732487132, + "learning_rate": 9.189903114724382e-06, + "loss": 0.0185, + "step": 1049 + }, + { + "epoch": 0.2085609295858576, + "grad_norm": 1.1694023155755944, + "learning_rate": 9.188146443283233e-06, + "loss": 0.02, + "step": 1050 + }, + { + "epoch": 0.208759559042606, + "grad_norm": 0.5304190834875754, + "learning_rate": 9.186388037534836e-06, + "loss": 0.0124, + "step": 1051 + }, + { + "epoch": 0.20895818849935446, + "grad_norm": 0.9189842414829346, + "learning_rate": 9.184627898207346e-06, + "loss": 0.0145, + "step": 1052 + }, + { + "epoch": 0.2091568179561029, + "grad_norm": 0.6809255153940301, + "learning_rate": 9.182866026029633e-06, + "loss": 0.0188, + "step": 1053 + }, + { + "epoch": 0.20935544741285134, + "grad_norm": 0.36494308828456623, + "learning_rate": 9.181102421731289e-06, + "loss": 0.0146, + "step": 1054 + }, + { + "epoch": 0.20955407686959976, + "grad_norm": 0.734816285805914, + "learning_rate": 9.179337086042618e-06, + "loss": 0.018, + "step": 1055 + }, + { + "epoch": 0.2097527063263482, + "grad_norm": 0.734510701009242, + "learning_rate": 9.177570019694646e-06, + "loss": 0.0149, + "step": 1056 + }, + { + "epoch": 0.20995133578309663, + "grad_norm": 0.3903628569085709, + "learning_rate": 9.175801223419111e-06, + "loss": 0.0095, + "step": 1057 + }, + { + "epoch": 0.21014996523984508, + "grad_norm": 0.18988569699286045, + "learning_rate": 9.174030697948472e-06, + "loss": 0.0053, + "step": 1058 + }, + { + "epoch": 0.2103485946965935, + "grad_norm": 0.22752236325582623, + "learning_rate": 9.1722584440159e-06, + "loss": 0.0103, + "step": 1059 + }, + { + "epoch": 0.21054722415334193, + "grad_norm": 0.842296179765435, + "learning_rate": 9.170484462355287e-06, + "loss": 0.0154, + "step": 1060 + }, + { + "epoch": 0.21074585361009038, + "grad_norm": 0.4916092773194938, + "learning_rate": 9.168708753701232e-06, + "loss": 0.0156, + "step": 1061 + }, + { + "epoch": 0.2109444830668388, + "grad_norm": 0.619936716278479, + "learning_rate": 9.166931318789058e-06, + "loss": 0.0122, + "step": 1062 + }, + { + "epoch": 0.21114311252358725, + "grad_norm": 0.7419851034682392, + "learning_rate": 9.165152158354797e-06, + "loss": 0.0168, + "step": 1063 + }, + { + "epoch": 0.21134174198033567, + "grad_norm": 0.5313205892992046, + "learning_rate": 9.163371273135198e-06, + "loss": 0.016, + "step": 1064 + }, + { + "epoch": 0.21154037143708412, + "grad_norm": 1.029560156219283, + "learning_rate": 9.161588663867725e-06, + "loss": 0.0215, + "step": 1065 + }, + { + "epoch": 0.21173900089383255, + "grad_norm": 0.9348963919399064, + "learning_rate": 9.159804331290553e-06, + "loss": 0.0188, + "step": 1066 + }, + { + "epoch": 0.211937630350581, + "grad_norm": 0.3743440188348788, + "learning_rate": 9.158018276142573e-06, + "loss": 0.0098, + "step": 1067 + }, + { + "epoch": 0.21213625980732942, + "grad_norm": 1.169169214182386, + "learning_rate": 9.15623049916339e-06, + "loss": 0.0166, + "step": 1068 + }, + { + "epoch": 0.21233488926407787, + "grad_norm": 1.946374912712508, + "learning_rate": 9.15444100109332e-06, + "loss": 0.0123, + "step": 1069 + }, + { + "epoch": 0.2125335187208263, + "grad_norm": 0.3756066632440553, + "learning_rate": 9.15264978267339e-06, + "loss": 0.017, + "step": 1070 + }, + { + "epoch": 0.21273214817757474, + "grad_norm": 0.615631571698432, + "learning_rate": 9.150856844645345e-06, + "loss": 0.0148, + "step": 1071 + }, + { + "epoch": 0.21293077763432317, + "grad_norm": 0.5711931753500064, + "learning_rate": 9.149062187751635e-06, + "loss": 0.0135, + "step": 1072 + }, + { + "epoch": 0.21312940709107162, + "grad_norm": 0.3195420308665485, + "learning_rate": 9.14726581273543e-06, + "loss": 0.0106, + "step": 1073 + }, + { + "epoch": 0.21332803654782004, + "grad_norm": 0.7675454724598806, + "learning_rate": 9.145467720340607e-06, + "loss": 0.0186, + "step": 1074 + }, + { + "epoch": 0.2135266660045685, + "grad_norm": 0.7176331742224797, + "learning_rate": 9.143667911311748e-06, + "loss": 0.0155, + "step": 1075 + }, + { + "epoch": 0.2137252954613169, + "grad_norm": 0.3307945238844646, + "learning_rate": 9.14186638639416e-06, + "loss": 0.0162, + "step": 1076 + }, + { + "epoch": 0.21392392491806536, + "grad_norm": 0.8244909859270414, + "learning_rate": 9.140063146333849e-06, + "loss": 0.0124, + "step": 1077 + }, + { + "epoch": 0.21412255437481378, + "grad_norm": 0.8379862052945656, + "learning_rate": 9.138258191877534e-06, + "loss": 0.0182, + "step": 1078 + }, + { + "epoch": 0.21432118383156223, + "grad_norm": 0.31645715871594615, + "learning_rate": 9.136451523772644e-06, + "loss": 0.0155, + "step": 1079 + }, + { + "epoch": 0.21451981328831066, + "grad_norm": 0.573196275828462, + "learning_rate": 9.134643142767324e-06, + "loss": 0.011, + "step": 1080 + }, + { + "epoch": 0.21471844274505908, + "grad_norm": 0.7554029631929728, + "learning_rate": 9.132833049610417e-06, + "loss": 0.0152, + "step": 1081 + }, + { + "epoch": 0.21491707220180753, + "grad_norm": 0.45963932693629606, + "learning_rate": 9.131021245051482e-06, + "loss": 0.0086, + "step": 1082 + }, + { + "epoch": 0.21511570165855595, + "grad_norm": 0.3667521583026654, + "learning_rate": 9.129207729840787e-06, + "loss": 0.0151, + "step": 1083 + }, + { + "epoch": 0.2153143311153044, + "grad_norm": 0.7758394373843591, + "learning_rate": 9.127392504729308e-06, + "loss": 0.0119, + "step": 1084 + }, + { + "epoch": 0.21551296057205283, + "grad_norm": 0.8577031176131009, + "learning_rate": 9.125575570468726e-06, + "loss": 0.0189, + "step": 1085 + }, + { + "epoch": 0.21571159002880128, + "grad_norm": 0.4101225829842196, + "learning_rate": 9.123756927811429e-06, + "loss": 0.0147, + "step": 1086 + }, + { + "epoch": 0.2159102194855497, + "grad_norm": 0.20450257055095583, + "learning_rate": 9.12193657751052e-06, + "loss": 0.0097, + "step": 1087 + }, + { + "epoch": 0.21610884894229815, + "grad_norm": 0.7261120497720621, + "learning_rate": 9.120114520319801e-06, + "loss": 0.0134, + "step": 1088 + }, + { + "epoch": 0.21630747839904657, + "grad_norm": 1.0613311521030548, + "learning_rate": 9.118290756993787e-06, + "loss": 0.0204, + "step": 1089 + }, + { + "epoch": 0.21650610785579502, + "grad_norm": 0.852239674073818, + "learning_rate": 9.116465288287693e-06, + "loss": 0.0327, + "step": 1090 + }, + { + "epoch": 0.21670473731254344, + "grad_norm": 0.682837045981, + "learning_rate": 9.114638114957444e-06, + "loss": 0.0151, + "step": 1091 + }, + { + "epoch": 0.2169033667692919, + "grad_norm": 0.5746264192915811, + "learning_rate": 9.112809237759675e-06, + "loss": 0.0167, + "step": 1092 + }, + { + "epoch": 0.21710199622604032, + "grad_norm": 0.5489395595406013, + "learning_rate": 9.110978657451716e-06, + "loss": 0.0104, + "step": 1093 + }, + { + "epoch": 0.21730062568278877, + "grad_norm": 1.0914265746523228, + "learning_rate": 9.109146374791615e-06, + "loss": 0.0195, + "step": 1094 + }, + { + "epoch": 0.2174992551395372, + "grad_norm": 1.041328850309126, + "learning_rate": 9.107312390538114e-06, + "loss": 0.0177, + "step": 1095 + }, + { + "epoch": 0.21769788459628564, + "grad_norm": 1.047327530765551, + "learning_rate": 9.105476705450666e-06, + "loss": 0.0151, + "step": 1096 + }, + { + "epoch": 0.21789651405303406, + "grad_norm": 0.7382313477764061, + "learning_rate": 9.103639320289424e-06, + "loss": 0.0116, + "step": 1097 + }, + { + "epoch": 0.2180951435097825, + "grad_norm": 0.28713859066703235, + "learning_rate": 9.10180023581525e-06, + "loss": 0.0136, + "step": 1098 + }, + { + "epoch": 0.21829377296653094, + "grad_norm": 1.132089412775031, + "learning_rate": 9.099959452789706e-06, + "loss": 0.0177, + "step": 1099 + }, + { + "epoch": 0.21849240242327939, + "grad_norm": 0.708856856834622, + "learning_rate": 9.098116971975058e-06, + "loss": 0.0123, + "step": 1100 + }, + { + "epoch": 0.2186910318800278, + "grad_norm": 0.6004021170139507, + "learning_rate": 9.096272794134276e-06, + "loss": 0.0214, + "step": 1101 + }, + { + "epoch": 0.21888966133677623, + "grad_norm": 1.5930865385924153, + "learning_rate": 9.094426920031033e-06, + "loss": 0.0292, + "step": 1102 + }, + { + "epoch": 0.21908829079352468, + "grad_norm": 0.7947683989906318, + "learning_rate": 9.092579350429703e-06, + "loss": 0.014, + "step": 1103 + }, + { + "epoch": 0.2192869202502731, + "grad_norm": 0.9955767036580443, + "learning_rate": 9.090730086095359e-06, + "loss": 0.0161, + "step": 1104 + }, + { + "epoch": 0.21948554970702155, + "grad_norm": 0.3082258407552644, + "learning_rate": 9.088879127793786e-06, + "loss": 0.0102, + "step": 1105 + }, + { + "epoch": 0.21968417916376998, + "grad_norm": 0.7479627577909793, + "learning_rate": 9.08702647629146e-06, + "loss": 0.01, + "step": 1106 + }, + { + "epoch": 0.21988280862051843, + "grad_norm": 1.2508502872987448, + "learning_rate": 9.085172132355563e-06, + "loss": 0.0181, + "step": 1107 + }, + { + "epoch": 0.22008143807726685, + "grad_norm": 1.3082638036499745, + "learning_rate": 9.083316096753979e-06, + "loss": 0.0182, + "step": 1108 + }, + { + "epoch": 0.2202800675340153, + "grad_norm": 0.48875456172981985, + "learning_rate": 9.081458370255285e-06, + "loss": 0.0155, + "step": 1109 + }, + { + "epoch": 0.22047869699076372, + "grad_norm": 0.43843962541786946, + "learning_rate": 9.079598953628769e-06, + "loss": 0.0118, + "step": 1110 + }, + { + "epoch": 0.22067732644751217, + "grad_norm": 0.692522124440947, + "learning_rate": 9.077737847644411e-06, + "loss": 0.0168, + "step": 1111 + }, + { + "epoch": 0.2208759559042606, + "grad_norm": 0.5909955076987183, + "learning_rate": 9.075875053072895e-06, + "loss": 0.0146, + "step": 1112 + }, + { + "epoch": 0.22107458536100905, + "grad_norm": 0.8872938011038043, + "learning_rate": 9.0740105706856e-06, + "loss": 0.0168, + "step": 1113 + }, + { + "epoch": 0.22127321481775747, + "grad_norm": 0.44692462034582914, + "learning_rate": 9.072144401254607e-06, + "loss": 0.0096, + "step": 1114 + }, + { + "epoch": 0.22147184427450592, + "grad_norm": 0.8271646623948226, + "learning_rate": 9.070276545552696e-06, + "loss": 0.0113, + "step": 1115 + }, + { + "epoch": 0.22167047373125434, + "grad_norm": 0.46981013701744095, + "learning_rate": 9.068407004353346e-06, + "loss": 0.0091, + "step": 1116 + }, + { + "epoch": 0.2218691031880028, + "grad_norm": 0.6096189754072621, + "learning_rate": 9.066535778430727e-06, + "loss": 0.0106, + "step": 1117 + }, + { + "epoch": 0.22206773264475121, + "grad_norm": 1.2355353818301986, + "learning_rate": 9.064662868559714e-06, + "loss": 0.0177, + "step": 1118 + }, + { + "epoch": 0.22226636210149966, + "grad_norm": 0.7921361207312948, + "learning_rate": 9.062788275515878e-06, + "loss": 0.0117, + "step": 1119 + }, + { + "epoch": 0.2224649915582481, + "grad_norm": 0.621118395225743, + "learning_rate": 9.060912000075489e-06, + "loss": 0.0184, + "step": 1120 + }, + { + "epoch": 0.22266362101499654, + "grad_norm": 1.2199946768983834, + "learning_rate": 9.059034043015505e-06, + "loss": 0.022, + "step": 1121 + }, + { + "epoch": 0.22286225047174496, + "grad_norm": 1.200785379496156, + "learning_rate": 9.057154405113588e-06, + "loss": 0.0146, + "step": 1122 + }, + { + "epoch": 0.22306087992849338, + "grad_norm": 0.7879812928114914, + "learning_rate": 9.055273087148095e-06, + "loss": 0.0154, + "step": 1123 + }, + { + "epoch": 0.22325950938524183, + "grad_norm": 0.865383817784091, + "learning_rate": 9.053390089898078e-06, + "loss": 0.0151, + "step": 1124 + }, + { + "epoch": 0.22345813884199026, + "grad_norm": 0.6021432751244368, + "learning_rate": 9.051505414143283e-06, + "loss": 0.0213, + "step": 1125 + }, + { + "epoch": 0.2236567682987387, + "grad_norm": 0.7551185452295719, + "learning_rate": 9.049619060664155e-06, + "loss": 0.0234, + "step": 1126 + }, + { + "epoch": 0.22385539775548713, + "grad_norm": 0.9095260202243808, + "learning_rate": 9.047731030241827e-06, + "loss": 0.0119, + "step": 1127 + }, + { + "epoch": 0.22405402721223558, + "grad_norm": 0.7060999713583611, + "learning_rate": 9.045841323658136e-06, + "loss": 0.0139, + "step": 1128 + }, + { + "epoch": 0.224252656668984, + "grad_norm": 0.611769741942482, + "learning_rate": 9.043949941695602e-06, + "loss": 0.0135, + "step": 1129 + }, + { + "epoch": 0.22445128612573245, + "grad_norm": 0.6041679576728929, + "learning_rate": 9.042056885137447e-06, + "loss": 0.0208, + "step": 1130 + }, + { + "epoch": 0.22464991558248087, + "grad_norm": 0.4868808950406833, + "learning_rate": 9.040162154767585e-06, + "loss": 0.0139, + "step": 1131 + }, + { + "epoch": 0.22484854503922933, + "grad_norm": 0.37898547751832734, + "learning_rate": 9.03826575137062e-06, + "loss": 0.0134, + "step": 1132 + }, + { + "epoch": 0.22504717449597775, + "grad_norm": 0.3871203326790516, + "learning_rate": 9.036367675731852e-06, + "loss": 0.014, + "step": 1133 + }, + { + "epoch": 0.2252458039527262, + "grad_norm": 0.4158985639649831, + "learning_rate": 9.03446792863727e-06, + "loss": 0.0155, + "step": 1134 + }, + { + "epoch": 0.22544443340947462, + "grad_norm": 0.8248065463728428, + "learning_rate": 9.03256651087356e-06, + "loss": 0.0191, + "step": 1135 + }, + { + "epoch": 0.22564306286622307, + "grad_norm": 0.6997112013650323, + "learning_rate": 9.030663423228096e-06, + "loss": 0.0119, + "step": 1136 + }, + { + "epoch": 0.2258416923229715, + "grad_norm": 0.40813675543898414, + "learning_rate": 9.028758666488946e-06, + "loss": 0.0106, + "step": 1137 + }, + { + "epoch": 0.22604032177971994, + "grad_norm": 0.27264796677090325, + "learning_rate": 9.026852241444865e-06, + "loss": 0.0117, + "step": 1138 + }, + { + "epoch": 0.22623895123646837, + "grad_norm": 0.7634799874549393, + "learning_rate": 9.024944148885305e-06, + "loss": 0.0137, + "step": 1139 + }, + { + "epoch": 0.22643758069321682, + "grad_norm": 0.37311413583560454, + "learning_rate": 9.023034389600403e-06, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.22663621014996524, + "grad_norm": 1.1650537636675808, + "learning_rate": 9.021122964380988e-06, + "loss": 0.0149, + "step": 1141 + }, + { + "epoch": 0.22683483960671366, + "grad_norm": 0.6052730792928078, + "learning_rate": 9.019209874018581e-06, + "loss": 0.0153, + "step": 1142 + }, + { + "epoch": 0.2270334690634621, + "grad_norm": 0.6627925620828036, + "learning_rate": 9.01729511930539e-06, + "loss": 0.0132, + "step": 1143 + }, + { + "epoch": 0.22723209852021053, + "grad_norm": 1.831246147791384, + "learning_rate": 9.015378701034315e-06, + "loss": 0.0222, + "step": 1144 + }, + { + "epoch": 0.22743072797695899, + "grad_norm": 0.6830687198229002, + "learning_rate": 9.013460619998937e-06, + "loss": 0.0095, + "step": 1145 + }, + { + "epoch": 0.2276293574337074, + "grad_norm": 0.6897632451876908, + "learning_rate": 9.011540876993539e-06, + "loss": 0.0145, + "step": 1146 + }, + { + "epoch": 0.22782798689045586, + "grad_norm": 0.45414406716937405, + "learning_rate": 9.00961947281308e-06, + "loss": 0.0135, + "step": 1147 + }, + { + "epoch": 0.22802661634720428, + "grad_norm": 0.6724858979266666, + "learning_rate": 9.007696408253212e-06, + "loss": 0.0176, + "step": 1148 + }, + { + "epoch": 0.22822524580395273, + "grad_norm": 0.5585341592638718, + "learning_rate": 9.005771684110275e-06, + "loss": 0.0142, + "step": 1149 + }, + { + "epoch": 0.22842387526070115, + "grad_norm": 0.33631448656179974, + "learning_rate": 9.003845301181296e-06, + "loss": 0.0104, + "step": 1150 + }, + { + "epoch": 0.2286225047174496, + "grad_norm": 0.936608731900455, + "learning_rate": 9.001917260263986e-06, + "loss": 0.0176, + "step": 1151 + }, + { + "epoch": 0.22882113417419803, + "grad_norm": 0.450082775257085, + "learning_rate": 8.999987562156747e-06, + "loss": 0.0108, + "step": 1152 + }, + { + "epoch": 0.22901976363094648, + "grad_norm": 0.6502188418132533, + "learning_rate": 8.998056207658662e-06, + "loss": 0.0197, + "step": 1153 + }, + { + "epoch": 0.2292183930876949, + "grad_norm": 0.7819061233264434, + "learning_rate": 8.996123197569508e-06, + "loss": 0.0116, + "step": 1154 + }, + { + "epoch": 0.22941702254444335, + "grad_norm": 0.8372844196296657, + "learning_rate": 8.994188532689739e-06, + "loss": 0.0124, + "step": 1155 + }, + { + "epoch": 0.22961565200119177, + "grad_norm": 0.4402518430330382, + "learning_rate": 8.992252213820498e-06, + "loss": 0.0117, + "step": 1156 + }, + { + "epoch": 0.22981428145794022, + "grad_norm": 0.3109619518398801, + "learning_rate": 8.990314241763614e-06, + "loss": 0.0105, + "step": 1157 + }, + { + "epoch": 0.23001291091468865, + "grad_norm": 0.940067780887533, + "learning_rate": 8.988374617321597e-06, + "loss": 0.0186, + "step": 1158 + }, + { + "epoch": 0.2302115403714371, + "grad_norm": 1.0503338550839172, + "learning_rate": 8.986433341297646e-06, + "loss": 0.0221, + "step": 1159 + }, + { + "epoch": 0.23041016982818552, + "grad_norm": 0.6966779242628477, + "learning_rate": 8.984490414495642e-06, + "loss": 0.0104, + "step": 1160 + }, + { + "epoch": 0.23060879928493397, + "grad_norm": 0.29127162723599453, + "learning_rate": 8.982545837720148e-06, + "loss": 0.008, + "step": 1161 + }, + { + "epoch": 0.2308074287416824, + "grad_norm": 0.7864175276944835, + "learning_rate": 8.980599611776408e-06, + "loss": 0.0203, + "step": 1162 + }, + { + "epoch": 0.2310060581984308, + "grad_norm": 0.6641316865126137, + "learning_rate": 8.978651737470354e-06, + "loss": 0.016, + "step": 1163 + }, + { + "epoch": 0.23120468765517926, + "grad_norm": 0.9355779041765688, + "learning_rate": 8.976702215608603e-06, + "loss": 0.0196, + "step": 1164 + }, + { + "epoch": 0.2314033171119277, + "grad_norm": 0.5906387589001846, + "learning_rate": 8.974751046998445e-06, + "loss": 0.0188, + "step": 1165 + }, + { + "epoch": 0.23160194656867614, + "grad_norm": 1.0575477626994387, + "learning_rate": 8.97279823244786e-06, + "loss": 0.0118, + "step": 1166 + }, + { + "epoch": 0.23180057602542456, + "grad_norm": 0.2202881228331673, + "learning_rate": 8.970843772765505e-06, + "loss": 0.0091, + "step": 1167 + }, + { + "epoch": 0.231999205482173, + "grad_norm": 0.7075656613422315, + "learning_rate": 8.968887668760719e-06, + "loss": 0.0158, + "step": 1168 + }, + { + "epoch": 0.23219783493892143, + "grad_norm": 0.2683433024299603, + "learning_rate": 8.966929921243526e-06, + "loss": 0.012, + "step": 1169 + }, + { + "epoch": 0.23239646439566988, + "grad_norm": 0.42237177538366844, + "learning_rate": 8.964970531024624e-06, + "loss": 0.0131, + "step": 1170 + }, + { + "epoch": 0.2325950938524183, + "grad_norm": 0.594094475622576, + "learning_rate": 8.963009498915396e-06, + "loss": 0.0119, + "step": 1171 + }, + { + "epoch": 0.23279372330916676, + "grad_norm": 0.406385112438069, + "learning_rate": 8.961046825727904e-06, + "loss": 0.0113, + "step": 1172 + }, + { + "epoch": 0.23299235276591518, + "grad_norm": 0.8033200609983158, + "learning_rate": 8.959082512274885e-06, + "loss": 0.0123, + "step": 1173 + }, + { + "epoch": 0.23319098222266363, + "grad_norm": 0.47018932211309783, + "learning_rate": 8.957116559369767e-06, + "loss": 0.0144, + "step": 1174 + }, + { + "epoch": 0.23338961167941205, + "grad_norm": 0.6077335509097652, + "learning_rate": 8.955148967826642e-06, + "loss": 0.0191, + "step": 1175 + }, + { + "epoch": 0.2335882411361605, + "grad_norm": 0.8460343255845043, + "learning_rate": 8.95317973846029e-06, + "loss": 0.0231, + "step": 1176 + }, + { + "epoch": 0.23378687059290892, + "grad_norm": 0.568144005560869, + "learning_rate": 8.951208872086166e-06, + "loss": 0.0148, + "step": 1177 + }, + { + "epoch": 0.23398550004965737, + "grad_norm": 0.6107991897583083, + "learning_rate": 8.949236369520406e-06, + "loss": 0.0184, + "step": 1178 + }, + { + "epoch": 0.2341841295064058, + "grad_norm": 0.42161119081345094, + "learning_rate": 8.947262231579822e-06, + "loss": 0.0119, + "step": 1179 + }, + { + "epoch": 0.23438275896315425, + "grad_norm": 0.40846557188713406, + "learning_rate": 8.945286459081899e-06, + "loss": 0.0137, + "step": 1180 + }, + { + "epoch": 0.23458138841990267, + "grad_norm": 0.763309241318152, + "learning_rate": 8.943309052844806e-06, + "loss": 0.0134, + "step": 1181 + }, + { + "epoch": 0.23478001787665112, + "grad_norm": 0.5803819518194214, + "learning_rate": 8.941330013687382e-06, + "loss": 0.0179, + "step": 1182 + }, + { + "epoch": 0.23497864733339954, + "grad_norm": 1.039753700867902, + "learning_rate": 8.939349342429144e-06, + "loss": 0.0152, + "step": 1183 + }, + { + "epoch": 0.23517727679014797, + "grad_norm": 0.5593224702420636, + "learning_rate": 8.937367039890291e-06, + "loss": 0.0138, + "step": 1184 + }, + { + "epoch": 0.23537590624689642, + "grad_norm": 0.6151456252694801, + "learning_rate": 8.93538310689169e-06, + "loss": 0.0202, + "step": 1185 + }, + { + "epoch": 0.23557453570364484, + "grad_norm": 0.2979372338609462, + "learning_rate": 8.933397544254884e-06, + "loss": 0.0121, + "step": 1186 + }, + { + "epoch": 0.2357731651603933, + "grad_norm": 1.520553652264108, + "learning_rate": 8.931410352802095e-06, + "loss": 0.0222, + "step": 1187 + }, + { + "epoch": 0.2359717946171417, + "grad_norm": 0.504693883508692, + "learning_rate": 8.929421533356215e-06, + "loss": 0.0155, + "step": 1188 + }, + { + "epoch": 0.23617042407389016, + "grad_norm": 0.7873301547010652, + "learning_rate": 8.927431086740814e-06, + "loss": 0.0253, + "step": 1189 + }, + { + "epoch": 0.23636905353063858, + "grad_norm": 0.3650087454226798, + "learning_rate": 8.925439013780131e-06, + "loss": 0.0129, + "step": 1190 + }, + { + "epoch": 0.23656768298738703, + "grad_norm": 0.8064537953076155, + "learning_rate": 8.923445315299085e-06, + "loss": 0.0157, + "step": 1191 + }, + { + "epoch": 0.23676631244413546, + "grad_norm": 0.37122234928734354, + "learning_rate": 8.921449992123264e-06, + "loss": 0.0116, + "step": 1192 + }, + { + "epoch": 0.2369649419008839, + "grad_norm": 1.2902752115216132, + "learning_rate": 8.919453045078927e-06, + "loss": 0.0227, + "step": 1193 + }, + { + "epoch": 0.23716357135763233, + "grad_norm": 1.7179509273444584, + "learning_rate": 8.917454474993008e-06, + "loss": 0.0245, + "step": 1194 + }, + { + "epoch": 0.23736220081438078, + "grad_norm": 0.38492446902743044, + "learning_rate": 8.915454282693116e-06, + "loss": 0.0145, + "step": 1195 + }, + { + "epoch": 0.2375608302711292, + "grad_norm": 0.4447455285152336, + "learning_rate": 8.913452469007526e-06, + "loss": 0.0125, + "step": 1196 + }, + { + "epoch": 0.23775945972787765, + "grad_norm": 0.5145154810241487, + "learning_rate": 8.911449034765186e-06, + "loss": 0.0117, + "step": 1197 + }, + { + "epoch": 0.23795808918462608, + "grad_norm": 0.3081471801818194, + "learning_rate": 8.90944398079572e-06, + "loss": 0.0184, + "step": 1198 + }, + { + "epoch": 0.23815671864137453, + "grad_norm": 0.4384280059834026, + "learning_rate": 8.907437307929416e-06, + "loss": 0.0098, + "step": 1199 + }, + { + "epoch": 0.23835534809812295, + "grad_norm": 0.3173624393698063, + "learning_rate": 8.905429016997236e-06, + "loss": 0.0154, + "step": 1200 + }, + { + "epoch": 0.2385539775548714, + "grad_norm": 0.4241866381288965, + "learning_rate": 8.903419108830808e-06, + "loss": 0.0121, + "step": 1201 + }, + { + "epoch": 0.23875260701161982, + "grad_norm": 1.6418273480323602, + "learning_rate": 8.901407584262441e-06, + "loss": 0.0217, + "step": 1202 + }, + { + "epoch": 0.23895123646836827, + "grad_norm": 0.6482978467087832, + "learning_rate": 8.899394444125097e-06, + "loss": 0.0119, + "step": 1203 + }, + { + "epoch": 0.2391498659251167, + "grad_norm": 0.49425790948254467, + "learning_rate": 8.897379689252418e-06, + "loss": 0.0105, + "step": 1204 + }, + { + "epoch": 0.23934849538186512, + "grad_norm": 0.7361793243822061, + "learning_rate": 8.895363320478715e-06, + "loss": 0.0167, + "step": 1205 + }, + { + "epoch": 0.23954712483861357, + "grad_norm": 0.5247879216412412, + "learning_rate": 8.893345338638961e-06, + "loss": 0.0128, + "step": 1206 + }, + { + "epoch": 0.239745754295362, + "grad_norm": 0.7004974169321737, + "learning_rate": 8.891325744568802e-06, + "loss": 0.0124, + "step": 1207 + }, + { + "epoch": 0.23994438375211044, + "grad_norm": 0.5994164703367848, + "learning_rate": 8.889304539104549e-06, + "loss": 0.0082, + "step": 1208 + }, + { + "epoch": 0.24014301320885886, + "grad_norm": 0.6187312745317394, + "learning_rate": 8.887281723083179e-06, + "loss": 0.0143, + "step": 1209 + }, + { + "epoch": 0.2403416426656073, + "grad_norm": 0.3401870907046762, + "learning_rate": 8.885257297342343e-06, + "loss": 0.0092, + "step": 1210 + }, + { + "epoch": 0.24054027212235574, + "grad_norm": 0.702714768582494, + "learning_rate": 8.883231262720348e-06, + "loss": 0.0149, + "step": 1211 + }, + { + "epoch": 0.2407389015791042, + "grad_norm": 0.31545699277173606, + "learning_rate": 8.881203620056178e-06, + "loss": 0.0071, + "step": 1212 + }, + { + "epoch": 0.2409375310358526, + "grad_norm": 0.9671126025588995, + "learning_rate": 8.879174370189475e-06, + "loss": 0.0139, + "step": 1213 + }, + { + "epoch": 0.24113616049260106, + "grad_norm": 0.9187540597662088, + "learning_rate": 8.87714351396055e-06, + "loss": 0.0264, + "step": 1214 + }, + { + "epoch": 0.24133478994934948, + "grad_norm": 0.9928211950549544, + "learning_rate": 8.875111052210378e-06, + "loss": 0.0151, + "step": 1215 + }, + { + "epoch": 0.24153341940609793, + "grad_norm": 2.2372121346405853, + "learning_rate": 8.873076985780602e-06, + "loss": 0.0327, + "step": 1216 + }, + { + "epoch": 0.24173204886284635, + "grad_norm": 0.6013185780237169, + "learning_rate": 8.871041315513523e-06, + "loss": 0.0203, + "step": 1217 + }, + { + "epoch": 0.2419306783195948, + "grad_norm": 0.4762488349863747, + "learning_rate": 8.869004042252111e-06, + "loss": 0.0185, + "step": 1218 + }, + { + "epoch": 0.24212930777634323, + "grad_norm": 0.5238008487757791, + "learning_rate": 8.866965166840003e-06, + "loss": 0.0121, + "step": 1219 + }, + { + "epoch": 0.24232793723309168, + "grad_norm": 0.8737365406833077, + "learning_rate": 8.864924690121489e-06, + "loss": 0.0154, + "step": 1220 + }, + { + "epoch": 0.2425265666898401, + "grad_norm": 0.8165887277343837, + "learning_rate": 8.862882612941532e-06, + "loss": 0.0136, + "step": 1221 + }, + { + "epoch": 0.24272519614658855, + "grad_norm": 0.6144569559576514, + "learning_rate": 8.860838936145754e-06, + "loss": 0.0134, + "step": 1222 + }, + { + "epoch": 0.24292382560333697, + "grad_norm": 0.7154975367837693, + "learning_rate": 8.858793660580438e-06, + "loss": 0.0164, + "step": 1223 + }, + { + "epoch": 0.24312245506008542, + "grad_norm": 0.5286069304960067, + "learning_rate": 8.856746787092532e-06, + "loss": 0.0172, + "step": 1224 + }, + { + "epoch": 0.24332108451683385, + "grad_norm": 0.6339057196599259, + "learning_rate": 8.854698316529642e-06, + "loss": 0.0202, + "step": 1225 + }, + { + "epoch": 0.24351971397358227, + "grad_norm": 0.671220701801568, + "learning_rate": 8.852648249740041e-06, + "loss": 0.0151, + "step": 1226 + }, + { + "epoch": 0.24371834343033072, + "grad_norm": 1.1322941892865441, + "learning_rate": 8.850596587572658e-06, + "loss": 0.0153, + "step": 1227 + }, + { + "epoch": 0.24391697288707914, + "grad_norm": 0.5378549330329825, + "learning_rate": 8.848543330877084e-06, + "loss": 0.0186, + "step": 1228 + }, + { + "epoch": 0.2441156023438276, + "grad_norm": 0.5673634305039723, + "learning_rate": 8.84648848050357e-06, + "loss": 0.0197, + "step": 1229 + }, + { + "epoch": 0.24431423180057601, + "grad_norm": 0.3534387414527403, + "learning_rate": 8.84443203730303e-06, + "loss": 0.0165, + "step": 1230 + }, + { + "epoch": 0.24451286125732447, + "grad_norm": 0.5672990685019907, + "learning_rate": 8.842374002127033e-06, + "loss": 0.0136, + "step": 1231 + }, + { + "epoch": 0.2447114907140729, + "grad_norm": 0.8327923542087138, + "learning_rate": 8.840314375827808e-06, + "loss": 0.0145, + "step": 1232 + }, + { + "epoch": 0.24491012017082134, + "grad_norm": 0.3379005386095217, + "learning_rate": 8.838253159258245e-06, + "loss": 0.0112, + "step": 1233 + }, + { + "epoch": 0.24510874962756976, + "grad_norm": 0.8689521331181729, + "learning_rate": 8.836190353271894e-06, + "loss": 0.024, + "step": 1234 + }, + { + "epoch": 0.2453073790843182, + "grad_norm": 0.42828866756204587, + "learning_rate": 8.834125958722958e-06, + "loss": 0.0115, + "step": 1235 + }, + { + "epoch": 0.24550600854106663, + "grad_norm": 0.5156919400962039, + "learning_rate": 8.832059976466305e-06, + "loss": 0.0175, + "step": 1236 + }, + { + "epoch": 0.24570463799781508, + "grad_norm": 0.6054495643078346, + "learning_rate": 8.82999240735745e-06, + "loss": 0.0098, + "step": 1237 + }, + { + "epoch": 0.2459032674545635, + "grad_norm": 0.7588058992250312, + "learning_rate": 8.827923252252577e-06, + "loss": 0.0143, + "step": 1238 + }, + { + "epoch": 0.24610189691131196, + "grad_norm": 0.38121018989369687, + "learning_rate": 8.825852512008518e-06, + "loss": 0.0122, + "step": 1239 + }, + { + "epoch": 0.24630052636806038, + "grad_norm": 0.6129760460462393, + "learning_rate": 8.823780187482764e-06, + "loss": 0.0113, + "step": 1240 + }, + { + "epoch": 0.24649915582480883, + "grad_norm": 1.2945184977220947, + "learning_rate": 8.821706279533465e-06, + "loss": 0.0135, + "step": 1241 + }, + { + "epoch": 0.24669778528155725, + "grad_norm": 0.5977689562617747, + "learning_rate": 8.819630789019422e-06, + "loss": 0.0093, + "step": 1242 + }, + { + "epoch": 0.2468964147383057, + "grad_norm": 0.2571681909339781, + "learning_rate": 8.817553716800095e-06, + "loss": 0.0046, + "step": 1243 + }, + { + "epoch": 0.24709504419505413, + "grad_norm": 0.8248908002719997, + "learning_rate": 8.815475063735596e-06, + "loss": 0.0113, + "step": 1244 + }, + { + "epoch": 0.24729367365180258, + "grad_norm": 0.584105982723087, + "learning_rate": 8.813394830686695e-06, + "loss": 0.012, + "step": 1245 + }, + { + "epoch": 0.247492303108551, + "grad_norm": 0.6285446237309048, + "learning_rate": 8.811313018514812e-06, + "loss": 0.0189, + "step": 1246 + }, + { + "epoch": 0.24769093256529942, + "grad_norm": 0.7020367426867679, + "learning_rate": 8.809229628082025e-06, + "loss": 0.0223, + "step": 1247 + }, + { + "epoch": 0.24788956202204787, + "grad_norm": 0.7721912844272995, + "learning_rate": 8.807144660251065e-06, + "loss": 0.0201, + "step": 1248 + }, + { + "epoch": 0.2480881914787963, + "grad_norm": 0.6697889970539281, + "learning_rate": 8.805058115885313e-06, + "loss": 0.0188, + "step": 1249 + }, + { + "epoch": 0.24828682093554474, + "grad_norm": 0.5381683696890166, + "learning_rate": 8.802969995848807e-06, + "loss": 0.0061, + "step": 1250 + }, + { + "epoch": 0.24848545039229317, + "grad_norm": 0.20343883973471363, + "learning_rate": 8.800880301006232e-06, + "loss": 0.0043, + "step": 1251 + }, + { + "epoch": 0.24868407984904162, + "grad_norm": 1.0199510305812602, + "learning_rate": 8.798789032222932e-06, + "loss": 0.0165, + "step": 1252 + }, + { + "epoch": 0.24888270930579004, + "grad_norm": 0.6275045781700873, + "learning_rate": 8.796696190364897e-06, + "loss": 0.0221, + "step": 1253 + }, + { + "epoch": 0.2490813387625385, + "grad_norm": 1.0129544827919168, + "learning_rate": 8.794601776298772e-06, + "loss": 0.0155, + "step": 1254 + }, + { + "epoch": 0.2492799682192869, + "grad_norm": 1.1428308322663727, + "learning_rate": 8.792505790891852e-06, + "loss": 0.0135, + "step": 1255 + }, + { + "epoch": 0.24947859767603536, + "grad_norm": 0.39150197901895456, + "learning_rate": 8.790408235012081e-06, + "loss": 0.0156, + "step": 1256 + }, + { + "epoch": 0.24967722713278379, + "grad_norm": 0.5483610810377085, + "learning_rate": 8.788309109528057e-06, + "loss": 0.0113, + "step": 1257 + }, + { + "epoch": 0.24987585658953224, + "grad_norm": 0.27854942135066457, + "learning_rate": 8.786208415309023e-06, + "loss": 0.0132, + "step": 1258 + }, + { + "epoch": 0.2500744860462807, + "grad_norm": 0.36486249941255533, + "learning_rate": 8.784106153224876e-06, + "loss": 0.0111, + "step": 1259 + }, + { + "epoch": 0.2502731155030291, + "grad_norm": 0.5102349401683092, + "learning_rate": 8.782002324146162e-06, + "loss": 0.0131, + "step": 1260 + }, + { + "epoch": 0.25047174495977753, + "grad_norm": 0.590643281645968, + "learning_rate": 8.779896928944072e-06, + "loss": 0.0192, + "step": 1261 + }, + { + "epoch": 0.25067037441652595, + "grad_norm": 0.6155294741908522, + "learning_rate": 8.777789968490449e-06, + "loss": 0.0108, + "step": 1262 + }, + { + "epoch": 0.25086900387327443, + "grad_norm": 0.8591597492277506, + "learning_rate": 8.775681443657781e-06, + "loss": 0.0134, + "step": 1263 + }, + { + "epoch": 0.25106763333002285, + "grad_norm": 0.36267619775257987, + "learning_rate": 8.773571355319213e-06, + "loss": 0.0091, + "step": 1264 + }, + { + "epoch": 0.2512662627867713, + "grad_norm": 0.545487632579775, + "learning_rate": 8.771459704348521e-06, + "loss": 0.0152, + "step": 1265 + }, + { + "epoch": 0.2514648922435197, + "grad_norm": 0.4679522780645009, + "learning_rate": 8.769346491620145e-06, + "loss": 0.0125, + "step": 1266 + }, + { + "epoch": 0.2516635217002681, + "grad_norm": 0.5309851493131132, + "learning_rate": 8.767231718009161e-06, + "loss": 0.0105, + "step": 1267 + }, + { + "epoch": 0.2518621511570166, + "grad_norm": 1.162362377224985, + "learning_rate": 8.765115384391296e-06, + "loss": 0.015, + "step": 1268 + }, + { + "epoch": 0.252060780613765, + "grad_norm": 0.45539747042204515, + "learning_rate": 8.76299749164292e-06, + "loss": 0.0141, + "step": 1269 + }, + { + "epoch": 0.25225941007051345, + "grad_norm": 0.5999868464347882, + "learning_rate": 8.76087804064105e-06, + "loss": 0.0171, + "step": 1270 + }, + { + "epoch": 0.25245803952726187, + "grad_norm": 0.5339189795562727, + "learning_rate": 8.75875703226335e-06, + "loss": 0.0162, + "step": 1271 + }, + { + "epoch": 0.25265666898401035, + "grad_norm": 0.859130519663411, + "learning_rate": 8.756634467388128e-06, + "loss": 0.0165, + "step": 1272 + }, + { + "epoch": 0.25285529844075877, + "grad_norm": 0.21076209185559133, + "learning_rate": 8.754510346894334e-06, + "loss": 0.0044, + "step": 1273 + }, + { + "epoch": 0.2530539278975072, + "grad_norm": 0.7939050635268597, + "learning_rate": 8.752384671661566e-06, + "loss": 0.0182, + "step": 1274 + }, + { + "epoch": 0.2532525573542556, + "grad_norm": 0.5766317392642453, + "learning_rate": 8.750257442570064e-06, + "loss": 0.0095, + "step": 1275 + }, + { + "epoch": 0.2534511868110041, + "grad_norm": 0.7303525194562475, + "learning_rate": 8.74812866050071e-06, + "loss": 0.0117, + "step": 1276 + }, + { + "epoch": 0.2536498162677525, + "grad_norm": 0.40272585071340505, + "learning_rate": 8.74599832633503e-06, + "loss": 0.0107, + "step": 1277 + }, + { + "epoch": 0.25384844572450094, + "grad_norm": 0.9286068222372685, + "learning_rate": 8.743866440955196e-06, + "loss": 0.0207, + "step": 1278 + }, + { + "epoch": 0.25404707518124936, + "grad_norm": 0.6122165638925774, + "learning_rate": 8.741733005244016e-06, + "loss": 0.0162, + "step": 1279 + }, + { + "epoch": 0.25424570463799784, + "grad_norm": 0.590624454464984, + "learning_rate": 8.739598020084947e-06, + "loss": 0.0087, + "step": 1280 + }, + { + "epoch": 0.25444433409474626, + "grad_norm": 1.016687681475864, + "learning_rate": 8.737461486362082e-06, + "loss": 0.0161, + "step": 1281 + }, + { + "epoch": 0.2546429635514947, + "grad_norm": 0.8291330430883908, + "learning_rate": 8.735323404960159e-06, + "loss": 0.0209, + "step": 1282 + }, + { + "epoch": 0.2548415930082431, + "grad_norm": 1.2088406629766484, + "learning_rate": 8.733183776764556e-06, + "loss": 0.016, + "step": 1283 + }, + { + "epoch": 0.2550402224649916, + "grad_norm": 0.903068606712378, + "learning_rate": 8.731042602661289e-06, + "loss": 0.0201, + "step": 1284 + }, + { + "epoch": 0.25523885192174, + "grad_norm": 0.6420259885722851, + "learning_rate": 8.728899883537014e-06, + "loss": 0.0146, + "step": 1285 + }, + { + "epoch": 0.25543748137848843, + "grad_norm": 0.4337787015933656, + "learning_rate": 8.726755620279033e-06, + "loss": 0.0121, + "step": 1286 + }, + { + "epoch": 0.25563611083523685, + "grad_norm": 0.9125000801904051, + "learning_rate": 8.724609813775282e-06, + "loss": 0.0174, + "step": 1287 + }, + { + "epoch": 0.2558347402919853, + "grad_norm": 0.4601365566071554, + "learning_rate": 8.722462464914337e-06, + "loss": 0.0091, + "step": 1288 + }, + { + "epoch": 0.25603336974873375, + "grad_norm": 0.6681352622131371, + "learning_rate": 8.720313574585412e-06, + "loss": 0.0168, + "step": 1289 + }, + { + "epoch": 0.2562319992054822, + "grad_norm": 0.3959505420693962, + "learning_rate": 8.718163143678365e-06, + "loss": 0.0128, + "step": 1290 + }, + { + "epoch": 0.2564306286622306, + "grad_norm": 0.23256713254644923, + "learning_rate": 8.716011173083679e-06, + "loss": 0.0117, + "step": 1291 + }, + { + "epoch": 0.256629258118979, + "grad_norm": 0.2541092240532925, + "learning_rate": 8.713857663692492e-06, + "loss": 0.0086, + "step": 1292 + }, + { + "epoch": 0.2568278875757275, + "grad_norm": 0.6179349251330739, + "learning_rate": 8.711702616396562e-06, + "loss": 0.014, + "step": 1293 + }, + { + "epoch": 0.2570265170324759, + "grad_norm": 0.3453897821620428, + "learning_rate": 8.709546032088296e-06, + "loss": 0.0128, + "step": 1294 + }, + { + "epoch": 0.25722514648922434, + "grad_norm": 0.5847778245454376, + "learning_rate": 8.707387911660735e-06, + "loss": 0.0119, + "step": 1295 + }, + { + "epoch": 0.25742377594597277, + "grad_norm": 0.2704171725983602, + "learning_rate": 8.705228256007549e-06, + "loss": 0.0115, + "step": 1296 + }, + { + "epoch": 0.25762240540272124, + "grad_norm": 0.6897068421679288, + "learning_rate": 8.703067066023055e-06, + "loss": 0.0112, + "step": 1297 + }, + { + "epoch": 0.25782103485946967, + "grad_norm": 0.40321138772863513, + "learning_rate": 8.700904342602197e-06, + "loss": 0.0056, + "step": 1298 + }, + { + "epoch": 0.2580196643162181, + "grad_norm": 0.7069935344664505, + "learning_rate": 8.698740086640559e-06, + "loss": 0.0194, + "step": 1299 + }, + { + "epoch": 0.2582182937729665, + "grad_norm": 0.9773159570445462, + "learning_rate": 8.696574299034351e-06, + "loss": 0.0236, + "step": 1300 + }, + { + "epoch": 0.258416923229715, + "grad_norm": 1.029306053307389, + "learning_rate": 8.69440698068043e-06, + "loss": 0.0202, + "step": 1301 + }, + { + "epoch": 0.2586155526864634, + "grad_norm": 0.59016206926503, + "learning_rate": 8.692238132476278e-06, + "loss": 0.0099, + "step": 1302 + }, + { + "epoch": 0.25881418214321184, + "grad_norm": 0.516731967140984, + "learning_rate": 8.690067755320012e-06, + "loss": 0.0094, + "step": 1303 + }, + { + "epoch": 0.25901281159996026, + "grad_norm": 0.3353964879934747, + "learning_rate": 8.687895850110386e-06, + "loss": 0.0167, + "step": 1304 + }, + { + "epoch": 0.25921144105670874, + "grad_norm": 0.9274750807980529, + "learning_rate": 8.68572241774678e-06, + "loss": 0.0192, + "step": 1305 + }, + { + "epoch": 0.25941007051345716, + "grad_norm": 0.6824972255867494, + "learning_rate": 8.683547459129211e-06, + "loss": 0.0168, + "step": 1306 + }, + { + "epoch": 0.2596086999702056, + "grad_norm": 0.6136495451069736, + "learning_rate": 8.681370975158328e-06, + "loss": 0.0108, + "step": 1307 + }, + { + "epoch": 0.259807329426954, + "grad_norm": 0.37108024187396627, + "learning_rate": 8.679192966735413e-06, + "loss": 0.0109, + "step": 1308 + }, + { + "epoch": 0.2600059588837024, + "grad_norm": 0.725413502338458, + "learning_rate": 8.677013434762373e-06, + "loss": 0.0255, + "step": 1309 + }, + { + "epoch": 0.2602045883404509, + "grad_norm": 0.48143907389589863, + "learning_rate": 8.674832380141754e-06, + "loss": 0.0171, + "step": 1310 + }, + { + "epoch": 0.2604032177971993, + "grad_norm": 0.34879379609000266, + "learning_rate": 8.672649803776724e-06, + "loss": 0.0106, + "step": 1311 + }, + { + "epoch": 0.26060184725394775, + "grad_norm": 0.5825209707500266, + "learning_rate": 8.670465706571088e-06, + "loss": 0.0177, + "step": 1312 + }, + { + "epoch": 0.26080047671069617, + "grad_norm": 0.46030714480671636, + "learning_rate": 8.66828008942928e-06, + "loss": 0.0098, + "step": 1313 + }, + { + "epoch": 0.26099910616744465, + "grad_norm": 0.7703888628916056, + "learning_rate": 8.66609295325636e-06, + "loss": 0.0171, + "step": 1314 + }, + { + "epoch": 0.2611977356241931, + "grad_norm": 0.738989269533668, + "learning_rate": 8.663904298958018e-06, + "loss": 0.0157, + "step": 1315 + }, + { + "epoch": 0.2613963650809415, + "grad_norm": 0.6362768126248891, + "learning_rate": 8.661714127440578e-06, + "loss": 0.0106, + "step": 1316 + }, + { + "epoch": 0.2615949945376899, + "grad_norm": 0.5455888714935196, + "learning_rate": 8.659522439610983e-06, + "loss": 0.0142, + "step": 1317 + }, + { + "epoch": 0.2617936239944384, + "grad_norm": 0.23611279379951763, + "learning_rate": 8.657329236376811e-06, + "loss": 0.0067, + "step": 1318 + }, + { + "epoch": 0.2619922534511868, + "grad_norm": 0.31209832142135896, + "learning_rate": 8.655134518646264e-06, + "loss": 0.0133, + "step": 1319 + }, + { + "epoch": 0.26219088290793524, + "grad_norm": 0.2910883987846584, + "learning_rate": 8.652938287328174e-06, + "loss": 0.0084, + "step": 1320 + }, + { + "epoch": 0.26238951236468366, + "grad_norm": 0.46180981742302674, + "learning_rate": 8.650740543331997e-06, + "loss": 0.0094, + "step": 1321 + }, + { + "epoch": 0.26258814182143214, + "grad_norm": 1.0186730371322448, + "learning_rate": 8.648541287567817e-06, + "loss": 0.0176, + "step": 1322 + }, + { + "epoch": 0.26278677127818056, + "grad_norm": 0.7996672389811389, + "learning_rate": 8.646340520946343e-06, + "loss": 0.0241, + "step": 1323 + }, + { + "epoch": 0.262985400734929, + "grad_norm": 0.40791342137010506, + "learning_rate": 8.644138244378912e-06, + "loss": 0.0109, + "step": 1324 + }, + { + "epoch": 0.2631840301916774, + "grad_norm": 0.5936153143977514, + "learning_rate": 8.641934458777482e-06, + "loss": 0.0118, + "step": 1325 + }, + { + "epoch": 0.2633826596484259, + "grad_norm": 0.4056311995150495, + "learning_rate": 8.63972916505464e-06, + "loss": 0.0081, + "step": 1326 + }, + { + "epoch": 0.2635812891051743, + "grad_norm": 0.7376187474937147, + "learning_rate": 8.637522364123596e-06, + "loss": 0.0114, + "step": 1327 + }, + { + "epoch": 0.26377991856192273, + "grad_norm": 0.3672522299105733, + "learning_rate": 8.635314056898185e-06, + "loss": 0.0089, + "step": 1328 + }, + { + "epoch": 0.26397854801867116, + "grad_norm": 0.7185888449952436, + "learning_rate": 8.633104244292862e-06, + "loss": 0.0204, + "step": 1329 + }, + { + "epoch": 0.2641771774754196, + "grad_norm": 0.6555374705786724, + "learning_rate": 8.630892927222709e-06, + "loss": 0.0153, + "step": 1330 + }, + { + "epoch": 0.26437580693216806, + "grad_norm": 0.4167555548956689, + "learning_rate": 8.628680106603433e-06, + "loss": 0.0149, + "step": 1331 + }, + { + "epoch": 0.2645744363889165, + "grad_norm": 0.6242106937112943, + "learning_rate": 8.626465783351357e-06, + "loss": 0.0184, + "step": 1332 + }, + { + "epoch": 0.2647730658456649, + "grad_norm": 0.42946885487707775, + "learning_rate": 8.624249958383433e-06, + "loss": 0.0126, + "step": 1333 + }, + { + "epoch": 0.2649716953024133, + "grad_norm": 0.9955126087048953, + "learning_rate": 8.62203263261723e-06, + "loss": 0.0123, + "step": 1334 + }, + { + "epoch": 0.2651703247591618, + "grad_norm": 0.3025009366483527, + "learning_rate": 8.61981380697094e-06, + "loss": 0.0106, + "step": 1335 + }, + { + "epoch": 0.2653689542159102, + "grad_norm": 0.3621342251640229, + "learning_rate": 8.617593482363379e-06, + "loss": 0.0089, + "step": 1336 + }, + { + "epoch": 0.26556758367265865, + "grad_norm": 0.9828115025424791, + "learning_rate": 8.615371659713979e-06, + "loss": 0.0202, + "step": 1337 + }, + { + "epoch": 0.26576621312940707, + "grad_norm": 1.4452351020519076, + "learning_rate": 8.613148339942796e-06, + "loss": 0.0197, + "step": 1338 + }, + { + "epoch": 0.26596484258615555, + "grad_norm": 0.6375350442126254, + "learning_rate": 8.610923523970502e-06, + "loss": 0.0165, + "step": 1339 + }, + { + "epoch": 0.26616347204290397, + "grad_norm": 0.4494514278658364, + "learning_rate": 8.608697212718396e-06, + "loss": 0.0095, + "step": 1340 + }, + { + "epoch": 0.2663621014996524, + "grad_norm": 0.8983564965166115, + "learning_rate": 8.606469407108385e-06, + "loss": 0.0165, + "step": 1341 + }, + { + "epoch": 0.2665607309564008, + "grad_norm": 0.6516352105484908, + "learning_rate": 8.604240108063004e-06, + "loss": 0.0148, + "step": 1342 + }, + { + "epoch": 0.2667593604131493, + "grad_norm": 0.5267184317342489, + "learning_rate": 8.602009316505407e-06, + "loss": 0.0124, + "step": 1343 + }, + { + "epoch": 0.2669579898698977, + "grad_norm": 0.47381761284604557, + "learning_rate": 8.599777033359355e-06, + "loss": 0.0101, + "step": 1344 + }, + { + "epoch": 0.26715661932664614, + "grad_norm": 1.6669999572861662, + "learning_rate": 8.597543259549241e-06, + "loss": 0.0245, + "step": 1345 + }, + { + "epoch": 0.26735524878339456, + "grad_norm": 0.4832507507850066, + "learning_rate": 8.595307996000066e-06, + "loss": 0.0134, + "step": 1346 + }, + { + "epoch": 0.26755387824014304, + "grad_norm": 0.5156754052203035, + "learning_rate": 8.59307124363745e-06, + "loss": 0.0258, + "step": 1347 + }, + { + "epoch": 0.26775250769689146, + "grad_norm": 0.8536512969498504, + "learning_rate": 8.590833003387628e-06, + "loss": 0.0258, + "step": 1348 + }, + { + "epoch": 0.2679511371536399, + "grad_norm": 0.9428969265848054, + "learning_rate": 8.588593276177458e-06, + "loss": 0.019, + "step": 1349 + }, + { + "epoch": 0.2681497666103883, + "grad_norm": 0.7301869629392804, + "learning_rate": 8.586352062934404e-06, + "loss": 0.0145, + "step": 1350 + }, + { + "epoch": 0.26834839606713673, + "grad_norm": 0.4718543613104909, + "learning_rate": 8.584109364586554e-06, + "loss": 0.0091, + "step": 1351 + }, + { + "epoch": 0.2685470255238852, + "grad_norm": 0.271372733060467, + "learning_rate": 8.581865182062606e-06, + "loss": 0.0094, + "step": 1352 + }, + { + "epoch": 0.26874565498063363, + "grad_norm": 0.42823082027908455, + "learning_rate": 8.57961951629187e-06, + "loss": 0.0117, + "step": 1353 + }, + { + "epoch": 0.26894428443738205, + "grad_norm": 0.572008987695907, + "learning_rate": 8.57737236820428e-06, + "loss": 0.019, + "step": 1354 + }, + { + "epoch": 0.2691429138941305, + "grad_norm": 1.2931428416128332, + "learning_rate": 8.575123738730373e-06, + "loss": 0.0184, + "step": 1355 + }, + { + "epoch": 0.26934154335087895, + "grad_norm": 0.4474034296024569, + "learning_rate": 8.572873628801305e-06, + "loss": 0.0101, + "step": 1356 + }, + { + "epoch": 0.2695401728076274, + "grad_norm": 0.8648867742502259, + "learning_rate": 8.570622039348849e-06, + "loss": 0.0132, + "step": 1357 + }, + { + "epoch": 0.2697388022643758, + "grad_norm": 0.771431838314264, + "learning_rate": 8.56836897130538e-06, + "loss": 0.0176, + "step": 1358 + }, + { + "epoch": 0.2699374317211242, + "grad_norm": 0.44094886621911206, + "learning_rate": 8.566114425603892e-06, + "loss": 0.0126, + "step": 1359 + }, + { + "epoch": 0.2701360611778727, + "grad_norm": 0.6577711334649768, + "learning_rate": 8.563858403177994e-06, + "loss": 0.0185, + "step": 1360 + }, + { + "epoch": 0.2703346906346211, + "grad_norm": 0.39262415058239325, + "learning_rate": 8.5616009049619e-06, + "loss": 0.0155, + "step": 1361 + }, + { + "epoch": 0.27053332009136954, + "grad_norm": 1.140595423731314, + "learning_rate": 8.559341931890436e-06, + "loss": 0.0251, + "step": 1362 + }, + { + "epoch": 0.27073194954811797, + "grad_norm": 0.4989543570417254, + "learning_rate": 8.557081484899043e-06, + "loss": 0.016, + "step": 1363 + }, + { + "epoch": 0.27093057900486645, + "grad_norm": 0.9607995698578123, + "learning_rate": 8.55481956492377e-06, + "loss": 0.0186, + "step": 1364 + }, + { + "epoch": 0.27112920846161487, + "grad_norm": 0.5416529886893404, + "learning_rate": 8.552556172901276e-06, + "loss": 0.0172, + "step": 1365 + }, + { + "epoch": 0.2713278379183633, + "grad_norm": 0.6712403945203463, + "learning_rate": 8.550291309768826e-06, + "loss": 0.0112, + "step": 1366 + }, + { + "epoch": 0.2715264673751117, + "grad_norm": 0.34387714192881513, + "learning_rate": 8.548024976464302e-06, + "loss": 0.0094, + "step": 1367 + }, + { + "epoch": 0.2717250968318602, + "grad_norm": 0.4152557312415485, + "learning_rate": 8.545757173926187e-06, + "loss": 0.0148, + "step": 1368 + }, + { + "epoch": 0.2719237262886086, + "grad_norm": 0.5580718606691937, + "learning_rate": 8.543487903093577e-06, + "loss": 0.0112, + "step": 1369 + }, + { + "epoch": 0.27212235574535704, + "grad_norm": 0.6980069434101902, + "learning_rate": 8.541217164906177e-06, + "loss": 0.0197, + "step": 1370 + }, + { + "epoch": 0.27232098520210546, + "grad_norm": 0.6713064539961933, + "learning_rate": 8.538944960304292e-06, + "loss": 0.0237, + "step": 1371 + }, + { + "epoch": 0.2725196146588539, + "grad_norm": 0.2791585912778628, + "learning_rate": 8.536671290228846e-06, + "loss": 0.0096, + "step": 1372 + }, + { + "epoch": 0.27271824411560236, + "grad_norm": 1.7948572479774836, + "learning_rate": 8.534396155621358e-06, + "loss": 0.0206, + "step": 1373 + }, + { + "epoch": 0.2729168735723508, + "grad_norm": 0.5146160931306909, + "learning_rate": 8.532119557423964e-06, + "loss": 0.0078, + "step": 1374 + }, + { + "epoch": 0.2731155030290992, + "grad_norm": 0.6336391927705046, + "learning_rate": 8.529841496579396e-06, + "loss": 0.0154, + "step": 1375 + }, + { + "epoch": 0.2733141324858476, + "grad_norm": 1.0228381650063942, + "learning_rate": 8.527561974031e-06, + "loss": 0.0125, + "step": 1376 + }, + { + "epoch": 0.2735127619425961, + "grad_norm": 0.5621590918716233, + "learning_rate": 8.525280990722723e-06, + "loss": 0.0152, + "step": 1377 + }, + { + "epoch": 0.27371139139934453, + "grad_norm": 0.5992082774576349, + "learning_rate": 8.52299854759912e-06, + "loss": 0.015, + "step": 1378 + }, + { + "epoch": 0.27391002085609295, + "grad_norm": 0.5427316635640782, + "learning_rate": 8.520714645605344e-06, + "loss": 0.0217, + "step": 1379 + }, + { + "epoch": 0.2741086503128414, + "grad_norm": 0.7839896301325817, + "learning_rate": 8.51842928568716e-06, + "loss": 0.0117, + "step": 1380 + }, + { + "epoch": 0.27430727976958985, + "grad_norm": 0.4131385983667191, + "learning_rate": 8.516142468790931e-06, + "loss": 0.0159, + "step": 1381 + }, + { + "epoch": 0.2745059092263383, + "grad_norm": 0.4081999784122884, + "learning_rate": 8.513854195863629e-06, + "loss": 0.0099, + "step": 1382 + }, + { + "epoch": 0.2747045386830867, + "grad_norm": 0.4673684154571544, + "learning_rate": 8.511564467852822e-06, + "loss": 0.0077, + "step": 1383 + }, + { + "epoch": 0.2749031681398351, + "grad_norm": 0.5713154043757909, + "learning_rate": 8.509273285706686e-06, + "loss": 0.0195, + "step": 1384 + }, + { + "epoch": 0.2751017975965836, + "grad_norm": 0.46380108544256105, + "learning_rate": 8.506980650373995e-06, + "loss": 0.0142, + "step": 1385 + }, + { + "epoch": 0.275300427053332, + "grad_norm": 1.0792577996847905, + "learning_rate": 8.50468656280413e-06, + "loss": 0.0222, + "step": 1386 + }, + { + "epoch": 0.27549905651008044, + "grad_norm": 0.42746699373310537, + "learning_rate": 8.50239102394707e-06, + "loss": 0.0146, + "step": 1387 + }, + { + "epoch": 0.27569768596682886, + "grad_norm": 1.7457011754660738, + "learning_rate": 8.500094034753393e-06, + "loss": 0.0197, + "step": 1388 + }, + { + "epoch": 0.27589631542357734, + "grad_norm": 0.6820989464273655, + "learning_rate": 8.49779559617428e-06, + "loss": 0.0134, + "step": 1389 + }, + { + "epoch": 0.27609494488032577, + "grad_norm": 0.5308996170906365, + "learning_rate": 8.495495709161516e-06, + "loss": 0.0196, + "step": 1390 + }, + { + "epoch": 0.2762935743370742, + "grad_norm": 0.6992766539498697, + "learning_rate": 8.49319437466748e-06, + "loss": 0.0097, + "step": 1391 + }, + { + "epoch": 0.2764922037938226, + "grad_norm": 0.2293267653263177, + "learning_rate": 8.49089159364515e-06, + "loss": 0.006, + "step": 1392 + }, + { + "epoch": 0.27669083325057103, + "grad_norm": 0.32951425806657547, + "learning_rate": 8.488587367048105e-06, + "loss": 0.0073, + "step": 1393 + }, + { + "epoch": 0.2768894627073195, + "grad_norm": 1.6153168594424325, + "learning_rate": 8.486281695830527e-06, + "loss": 0.0223, + "step": 1394 + }, + { + "epoch": 0.27708809216406793, + "grad_norm": 1.864778170921737, + "learning_rate": 8.483974580947189e-06, + "loss": 0.0329, + "step": 1395 + }, + { + "epoch": 0.27728672162081636, + "grad_norm": 1.7739863390597264, + "learning_rate": 8.481666023353468e-06, + "loss": 0.0263, + "step": 1396 + }, + { + "epoch": 0.2774853510775648, + "grad_norm": 0.5128932906013113, + "learning_rate": 8.479356024005332e-06, + "loss": 0.0113, + "step": 1397 + }, + { + "epoch": 0.27768398053431326, + "grad_norm": 0.820357228467481, + "learning_rate": 8.47704458385935e-06, + "loss": 0.014, + "step": 1398 + }, + { + "epoch": 0.2778826099910617, + "grad_norm": 0.6925209483529865, + "learning_rate": 8.47473170387269e-06, + "loss": 0.0148, + "step": 1399 + }, + { + "epoch": 0.2780812394478101, + "grad_norm": 0.5888881159884609, + "learning_rate": 8.472417385003109e-06, + "loss": 0.019, + "step": 1400 + }, + { + "epoch": 0.2782798689045585, + "grad_norm": 0.8199869161478691, + "learning_rate": 8.470101628208966e-06, + "loss": 0.0135, + "step": 1401 + }, + { + "epoch": 0.278478498361307, + "grad_norm": 0.7210167222946081, + "learning_rate": 8.467784434449216e-06, + "loss": 0.0172, + "step": 1402 + }, + { + "epoch": 0.2786771278180554, + "grad_norm": 0.4196353511380168, + "learning_rate": 8.465465804683404e-06, + "loss": 0.0155, + "step": 1403 + }, + { + "epoch": 0.27887575727480385, + "grad_norm": 0.6034473650484623, + "learning_rate": 8.463145739871672e-06, + "loss": 0.0096, + "step": 1404 + }, + { + "epoch": 0.27907438673155227, + "grad_norm": 0.36016231959989076, + "learning_rate": 8.460824240974757e-06, + "loss": 0.0103, + "step": 1405 + }, + { + "epoch": 0.27927301618830075, + "grad_norm": 0.5475792609192718, + "learning_rate": 8.458501308953988e-06, + "loss": 0.0154, + "step": 1406 + }, + { + "epoch": 0.27947164564504917, + "grad_norm": 0.5922387773813316, + "learning_rate": 8.456176944771293e-06, + "loss": 0.0152, + "step": 1407 + }, + { + "epoch": 0.2796702751017976, + "grad_norm": 0.9948119748458698, + "learning_rate": 8.453851149389185e-06, + "loss": 0.0245, + "step": 1408 + }, + { + "epoch": 0.279868904558546, + "grad_norm": 0.8243450247239436, + "learning_rate": 8.451523923770776e-06, + "loss": 0.013, + "step": 1409 + }, + { + "epoch": 0.2800675340152945, + "grad_norm": 0.7318369517598357, + "learning_rate": 8.449195268879767e-06, + "loss": 0.0133, + "step": 1410 + }, + { + "epoch": 0.2802661634720429, + "grad_norm": 0.5419561316890904, + "learning_rate": 8.446865185680448e-06, + "loss": 0.0093, + "step": 1411 + }, + { + "epoch": 0.28046479292879134, + "grad_norm": 0.5265876588995337, + "learning_rate": 8.44453367513771e-06, + "loss": 0.0171, + "step": 1412 + }, + { + "epoch": 0.28066342238553976, + "grad_norm": 0.6859813997920388, + "learning_rate": 8.442200738217025e-06, + "loss": 0.0095, + "step": 1413 + }, + { + "epoch": 0.2808620518422882, + "grad_norm": 0.5649693302253302, + "learning_rate": 8.439866375884464e-06, + "loss": 0.0212, + "step": 1414 + }, + { + "epoch": 0.28106068129903666, + "grad_norm": 0.5862281313195347, + "learning_rate": 8.437530589106679e-06, + "loss": 0.0221, + "step": 1415 + }, + { + "epoch": 0.2812593107557851, + "grad_norm": 0.33080120075065145, + "learning_rate": 8.435193378850921e-06, + "loss": 0.0117, + "step": 1416 + }, + { + "epoch": 0.2814579402125335, + "grad_norm": 0.9234190022508963, + "learning_rate": 8.432854746085024e-06, + "loss": 0.0173, + "step": 1417 + }, + { + "epoch": 0.28165656966928193, + "grad_norm": 0.8430616129766179, + "learning_rate": 8.430514691777415e-06, + "loss": 0.0145, + "step": 1418 + }, + { + "epoch": 0.2818551991260304, + "grad_norm": 0.8475901106471433, + "learning_rate": 8.428173216897107e-06, + "loss": 0.0159, + "step": 1419 + }, + { + "epoch": 0.28205382858277883, + "grad_norm": 0.5735491390921624, + "learning_rate": 8.425830322413703e-06, + "loss": 0.0097, + "step": 1420 + }, + { + "epoch": 0.28225245803952725, + "grad_norm": 0.5266507760548224, + "learning_rate": 8.423486009297394e-06, + "loss": 0.0109, + "step": 1421 + }, + { + "epoch": 0.2824510874962757, + "grad_norm": 0.36869636591820587, + "learning_rate": 8.421140278518955e-06, + "loss": 0.0085, + "step": 1422 + }, + { + "epoch": 0.28264971695302415, + "grad_norm": 1.2202844836952085, + "learning_rate": 8.418793131049757e-06, + "loss": 0.0266, + "step": 1423 + }, + { + "epoch": 0.2828483464097726, + "grad_norm": 0.6504661966699112, + "learning_rate": 8.416444567861742e-06, + "loss": 0.0106, + "step": 1424 + }, + { + "epoch": 0.283046975866521, + "grad_norm": 1.5018651106882912, + "learning_rate": 8.414094589927455e-06, + "loss": 0.018, + "step": 1425 + }, + { + "epoch": 0.2832456053232694, + "grad_norm": 0.48052848756832556, + "learning_rate": 8.411743198220016e-06, + "loss": 0.0106, + "step": 1426 + }, + { + "epoch": 0.2834442347800179, + "grad_norm": 0.7465489964730609, + "learning_rate": 8.409390393713139e-06, + "loss": 0.0118, + "step": 1427 + }, + { + "epoch": 0.2836428642367663, + "grad_norm": 0.49935340474238005, + "learning_rate": 8.407036177381111e-06, + "loss": 0.0068, + "step": 1428 + }, + { + "epoch": 0.28384149369351475, + "grad_norm": 0.6438837817206333, + "learning_rate": 8.404680550198814e-06, + "loss": 0.0112, + "step": 1429 + }, + { + "epoch": 0.28404012315026317, + "grad_norm": 0.46587307611736145, + "learning_rate": 8.40232351314171e-06, + "loss": 0.0083, + "step": 1430 + }, + { + "epoch": 0.28423875260701165, + "grad_norm": 1.2474325824762231, + "learning_rate": 8.399965067185849e-06, + "loss": 0.0155, + "step": 1431 + }, + { + "epoch": 0.28443738206376007, + "grad_norm": 1.0245682468407986, + "learning_rate": 8.397605213307858e-06, + "loss": 0.0185, + "step": 1432 + }, + { + "epoch": 0.2846360115205085, + "grad_norm": 0.7066781397822776, + "learning_rate": 8.395243952484949e-06, + "loss": 0.0107, + "step": 1433 + }, + { + "epoch": 0.2848346409772569, + "grad_norm": 0.7550032030601233, + "learning_rate": 8.392881285694918e-06, + "loss": 0.015, + "step": 1434 + }, + { + "epoch": 0.28503327043400534, + "grad_norm": 0.6874113816132006, + "learning_rate": 8.390517213916147e-06, + "loss": 0.0142, + "step": 1435 + }, + { + "epoch": 0.2852318998907538, + "grad_norm": 1.2976316129854208, + "learning_rate": 8.388151738127592e-06, + "loss": 0.0181, + "step": 1436 + }, + { + "epoch": 0.28543052934750224, + "grad_norm": 1.326879456283606, + "learning_rate": 8.385784859308796e-06, + "loss": 0.0166, + "step": 1437 + }, + { + "epoch": 0.28562915880425066, + "grad_norm": 1.086178549836567, + "learning_rate": 8.383416578439881e-06, + "loss": 0.0232, + "step": 1438 + }, + { + "epoch": 0.2858277882609991, + "grad_norm": 0.6673231654729204, + "learning_rate": 8.381046896501547e-06, + "loss": 0.0112, + "step": 1439 + }, + { + "epoch": 0.28602641771774756, + "grad_norm": 0.530064954612848, + "learning_rate": 8.378675814475081e-06, + "loss": 0.0062, + "step": 1440 + }, + { + "epoch": 0.286225047174496, + "grad_norm": 1.1957721069947402, + "learning_rate": 8.376303333342342e-06, + "loss": 0.0206, + "step": 1441 + }, + { + "epoch": 0.2864236766312444, + "grad_norm": 0.45312372809065454, + "learning_rate": 8.373929454085775e-06, + "loss": 0.0085, + "step": 1442 + }, + { + "epoch": 0.28662230608799283, + "grad_norm": 0.332625950245216, + "learning_rate": 8.371554177688399e-06, + "loss": 0.0087, + "step": 1443 + }, + { + "epoch": 0.2868209355447413, + "grad_norm": 0.6970930696458172, + "learning_rate": 8.369177505133814e-06, + "loss": 0.0131, + "step": 1444 + }, + { + "epoch": 0.28701956500148973, + "grad_norm": 1.1105719027213687, + "learning_rate": 8.3667994374062e-06, + "loss": 0.0239, + "step": 1445 + }, + { + "epoch": 0.28721819445823815, + "grad_norm": 1.0006129734400397, + "learning_rate": 8.36441997549031e-06, + "loss": 0.0174, + "step": 1446 + }, + { + "epoch": 0.2874168239149866, + "grad_norm": 1.4747499745629573, + "learning_rate": 8.362039120371475e-06, + "loss": 0.0159, + "step": 1447 + }, + { + "epoch": 0.28761545337173505, + "grad_norm": 1.0411053353229378, + "learning_rate": 8.35965687303561e-06, + "loss": 0.0188, + "step": 1448 + }, + { + "epoch": 0.2878140828284835, + "grad_norm": 0.9172298750699612, + "learning_rate": 8.357273234469196e-06, + "loss": 0.0153, + "step": 1449 + }, + { + "epoch": 0.2880127122852319, + "grad_norm": 0.3913985500318929, + "learning_rate": 8.354888205659299e-06, + "loss": 0.0079, + "step": 1450 + }, + { + "epoch": 0.2882113417419803, + "grad_norm": 1.374573151737498, + "learning_rate": 8.352501787593557e-06, + "loss": 0.0281, + "step": 1451 + }, + { + "epoch": 0.2884099711987288, + "grad_norm": 0.5566702813433198, + "learning_rate": 8.35011398126018e-06, + "loss": 0.0173, + "step": 1452 + }, + { + "epoch": 0.2886086006554772, + "grad_norm": 1.199039497209081, + "learning_rate": 8.347724787647959e-06, + "loss": 0.0248, + "step": 1453 + }, + { + "epoch": 0.28880723011222564, + "grad_norm": 0.9629869712993756, + "learning_rate": 8.345334207746256e-06, + "loss": 0.0203, + "step": 1454 + }, + { + "epoch": 0.28900585956897407, + "grad_norm": 0.7850010176416995, + "learning_rate": 8.342942242545007e-06, + "loss": 0.0141, + "step": 1455 + }, + { + "epoch": 0.2892044890257225, + "grad_norm": 0.8834250404360167, + "learning_rate": 8.340548893034723e-06, + "loss": 0.0182, + "step": 1456 + }, + { + "epoch": 0.28940311848247097, + "grad_norm": 0.7477391277298652, + "learning_rate": 8.338154160206489e-06, + "loss": 0.0126, + "step": 1457 + }, + { + "epoch": 0.2896017479392194, + "grad_norm": 0.991856579976285, + "learning_rate": 8.335758045051959e-06, + "loss": 0.0191, + "step": 1458 + }, + { + "epoch": 0.2898003773959678, + "grad_norm": 1.5855642011770337, + "learning_rate": 8.333360548563363e-06, + "loss": 0.0247, + "step": 1459 + }, + { + "epoch": 0.28999900685271623, + "grad_norm": 1.3241126782557957, + "learning_rate": 8.330961671733503e-06, + "loss": 0.0165, + "step": 1460 + }, + { + "epoch": 0.2901976363094647, + "grad_norm": 1.4795489122284058, + "learning_rate": 8.32856141555575e-06, + "loss": 0.024, + "step": 1461 + }, + { + "epoch": 0.29039626576621314, + "grad_norm": 0.3260206114711344, + "learning_rate": 8.326159781024049e-06, + "loss": 0.0108, + "step": 1462 + }, + { + "epoch": 0.29059489522296156, + "grad_norm": 0.3190347690618339, + "learning_rate": 8.32375676913291e-06, + "loss": 0.016, + "step": 1463 + }, + { + "epoch": 0.29079352467971, + "grad_norm": 0.5725996431341029, + "learning_rate": 8.321352380877426e-06, + "loss": 0.017, + "step": 1464 + }, + { + "epoch": 0.29099215413645846, + "grad_norm": 0.767026255292494, + "learning_rate": 8.318946617253244e-06, + "loss": 0.017, + "step": 1465 + }, + { + "epoch": 0.2911907835932069, + "grad_norm": 0.5629140122364568, + "learning_rate": 8.316539479256594e-06, + "loss": 0.015, + "step": 1466 + }, + { + "epoch": 0.2913894130499553, + "grad_norm": 0.7358139874860172, + "learning_rate": 8.314130967884263e-06, + "loss": 0.0129, + "step": 1467 + }, + { + "epoch": 0.2915880425067037, + "grad_norm": 0.8209343112526951, + "learning_rate": 8.311721084133622e-06, + "loss": 0.0246, + "step": 1468 + }, + { + "epoch": 0.2917866719634522, + "grad_norm": 0.7719668586694085, + "learning_rate": 8.309309829002594e-06, + "loss": 0.016, + "step": 1469 + }, + { + "epoch": 0.2919853014202006, + "grad_norm": 0.41468903664501805, + "learning_rate": 8.30689720348968e-06, + "loss": 0.0096, + "step": 1470 + }, + { + "epoch": 0.29218393087694905, + "grad_norm": 0.3738228357367781, + "learning_rate": 8.304483208593944e-06, + "loss": 0.0095, + "step": 1471 + }, + { + "epoch": 0.29238256033369747, + "grad_norm": 0.24029576983374756, + "learning_rate": 8.302067845315023e-06, + "loss": 0.0097, + "step": 1472 + }, + { + "epoch": 0.29258118979044595, + "grad_norm": 0.5705732197317682, + "learning_rate": 8.299651114653113e-06, + "loss": 0.0214, + "step": 1473 + }, + { + "epoch": 0.2927798192471944, + "grad_norm": 0.922298441563043, + "learning_rate": 8.29723301760898e-06, + "loss": 0.0185, + "step": 1474 + }, + { + "epoch": 0.2929784487039428, + "grad_norm": 0.8041508059426072, + "learning_rate": 8.294813555183959e-06, + "loss": 0.0157, + "step": 1475 + }, + { + "epoch": 0.2931770781606912, + "grad_norm": 0.8308220892287086, + "learning_rate": 8.29239272837994e-06, + "loss": 0.0208, + "step": 1476 + }, + { + "epoch": 0.29337570761743964, + "grad_norm": 0.41812007821452235, + "learning_rate": 8.289970538199391e-06, + "loss": 0.016, + "step": 1477 + }, + { + "epoch": 0.2935743370741881, + "grad_norm": 0.9237991915282489, + "learning_rate": 8.28754698564534e-06, + "loss": 0.0138, + "step": 1478 + }, + { + "epoch": 0.29377296653093654, + "grad_norm": 0.28751992841811247, + "learning_rate": 8.285122071721373e-06, + "loss": 0.0107, + "step": 1479 + }, + { + "epoch": 0.29397159598768496, + "grad_norm": 0.8535192526289043, + "learning_rate": 8.282695797431644e-06, + "loss": 0.0254, + "step": 1480 + }, + { + "epoch": 0.2941702254444334, + "grad_norm": 0.5712038029199457, + "learning_rate": 8.280268163780873e-06, + "loss": 0.0075, + "step": 1481 + }, + { + "epoch": 0.29436885490118186, + "grad_norm": 0.4432551734371519, + "learning_rate": 8.27783917177434e-06, + "loss": 0.0095, + "step": 1482 + }, + { + "epoch": 0.2945674843579303, + "grad_norm": 0.26503631565417796, + "learning_rate": 8.27540882241789e-06, + "loss": 0.005, + "step": 1483 + }, + { + "epoch": 0.2947661138146787, + "grad_norm": 0.864720411309366, + "learning_rate": 8.272977116717925e-06, + "loss": 0.0164, + "step": 1484 + }, + { + "epoch": 0.29496474327142713, + "grad_norm": 0.6284726332727625, + "learning_rate": 8.270544055681415e-06, + "loss": 0.0089, + "step": 1485 + }, + { + "epoch": 0.2951633727281756, + "grad_norm": 0.508270650687714, + "learning_rate": 8.268109640315887e-06, + "loss": 0.0172, + "step": 1486 + }, + { + "epoch": 0.29536200218492403, + "grad_norm": 0.6269790340357052, + "learning_rate": 8.26567387162943e-06, + "loss": 0.0179, + "step": 1487 + }, + { + "epoch": 0.29556063164167246, + "grad_norm": 0.7297545166439348, + "learning_rate": 8.263236750630692e-06, + "loss": 0.0219, + "step": 1488 + }, + { + "epoch": 0.2957592610984209, + "grad_norm": 0.6898505446448555, + "learning_rate": 8.260798278328884e-06, + "loss": 0.0108, + "step": 1489 + }, + { + "epoch": 0.29595789055516936, + "grad_norm": 0.6708782293191564, + "learning_rate": 8.258358455733774e-06, + "loss": 0.0084, + "step": 1490 + }, + { + "epoch": 0.2961565200119178, + "grad_norm": 0.6386660284055108, + "learning_rate": 8.25591728385569e-06, + "loss": 0.0113, + "step": 1491 + }, + { + "epoch": 0.2963551494686662, + "grad_norm": 1.0471163969424355, + "learning_rate": 8.25347476370552e-06, + "loss": 0.0187, + "step": 1492 + }, + { + "epoch": 0.2965537789254146, + "grad_norm": 0.8613429954652995, + "learning_rate": 8.251030896294708e-06, + "loss": 0.0146, + "step": 1493 + }, + { + "epoch": 0.2967524083821631, + "grad_norm": 0.4725238292843531, + "learning_rate": 8.248585682635258e-06, + "loss": 0.0162, + "step": 1494 + }, + { + "epoch": 0.2969510378389115, + "grad_norm": 0.7080357972606163, + "learning_rate": 8.246139123739729e-06, + "loss": 0.0277, + "step": 1495 + }, + { + "epoch": 0.29714966729565995, + "grad_norm": 0.353145384888274, + "learning_rate": 8.243691220621241e-06, + "loss": 0.0082, + "step": 1496 + }, + { + "epoch": 0.29734829675240837, + "grad_norm": 0.5979605491522597, + "learning_rate": 8.241241974293466e-06, + "loss": 0.0116, + "step": 1497 + }, + { + "epoch": 0.2975469262091568, + "grad_norm": 0.7513238451070279, + "learning_rate": 8.238791385770638e-06, + "loss": 0.0246, + "step": 1498 + }, + { + "epoch": 0.29774555566590527, + "grad_norm": 0.33294751800259487, + "learning_rate": 8.236339456067538e-06, + "loss": 0.0164, + "step": 1499 + }, + { + "epoch": 0.2979441851226537, + "grad_norm": 0.7051678355474877, + "learning_rate": 8.233886186199508e-06, + "loss": 0.0137, + "step": 1500 + }, + { + "epoch": 0.2981428145794021, + "grad_norm": 0.5236577910644429, + "learning_rate": 8.231431577182452e-06, + "loss": 0.0132, + "step": 1501 + }, + { + "epoch": 0.29834144403615054, + "grad_norm": 0.47643786878372224, + "learning_rate": 8.228975630032812e-06, + "loss": 0.0072, + "step": 1502 + }, + { + "epoch": 0.298540073492899, + "grad_norm": 0.5218883992156759, + "learning_rate": 8.226518345767598e-06, + "loss": 0.0172, + "step": 1503 + }, + { + "epoch": 0.29873870294964744, + "grad_norm": 0.4673001168694523, + "learning_rate": 8.224059725404369e-06, + "loss": 0.0125, + "step": 1504 + }, + { + "epoch": 0.29893733240639586, + "grad_norm": 0.34307612038889873, + "learning_rate": 8.221599769961235e-06, + "loss": 0.0156, + "step": 1505 + }, + { + "epoch": 0.2991359618631443, + "grad_norm": 1.3513076887859734, + "learning_rate": 8.219138480456864e-06, + "loss": 0.0209, + "step": 1506 + }, + { + "epoch": 0.29933459131989276, + "grad_norm": 0.532303893440917, + "learning_rate": 8.21667585791047e-06, + "loss": 0.0181, + "step": 1507 + }, + { + "epoch": 0.2995332207766412, + "grad_norm": 0.4536212237431732, + "learning_rate": 8.214211903341826e-06, + "loss": 0.0139, + "step": 1508 + }, + { + "epoch": 0.2997318502333896, + "grad_norm": 0.4663375260088001, + "learning_rate": 8.211746617771253e-06, + "loss": 0.0106, + "step": 1509 + }, + { + "epoch": 0.29993047969013803, + "grad_norm": 0.38114356426993984, + "learning_rate": 8.209280002219619e-06, + "loss": 0.0157, + "step": 1510 + }, + { + "epoch": 0.3001291091468865, + "grad_norm": 0.5218807737983572, + "learning_rate": 8.206812057708352e-06, + "loss": 0.017, + "step": 1511 + }, + { + "epoch": 0.30032773860363493, + "grad_norm": 0.8868790496523004, + "learning_rate": 8.204342785259423e-06, + "loss": 0.0159, + "step": 1512 + }, + { + "epoch": 0.30052636806038335, + "grad_norm": 0.3421070765069182, + "learning_rate": 8.201872185895355e-06, + "loss": 0.0116, + "step": 1513 + }, + { + "epoch": 0.3007249975171318, + "grad_norm": 0.9202280991148633, + "learning_rate": 8.19940026063922e-06, + "loss": 0.0175, + "step": 1514 + }, + { + "epoch": 0.30092362697388025, + "grad_norm": 0.3751885600682715, + "learning_rate": 8.196927010514642e-06, + "loss": 0.0101, + "step": 1515 + }, + { + "epoch": 0.3011222564306287, + "grad_norm": 0.5044522661178389, + "learning_rate": 8.194452436545792e-06, + "loss": 0.0099, + "step": 1516 + }, + { + "epoch": 0.3013208858873771, + "grad_norm": 0.9821726841889677, + "learning_rate": 8.191976539757385e-06, + "loss": 0.0204, + "step": 1517 + }, + { + "epoch": 0.3015195153441255, + "grad_norm": 0.33670074524178356, + "learning_rate": 8.18949932117469e-06, + "loss": 0.0103, + "step": 1518 + }, + { + "epoch": 0.30171814480087394, + "grad_norm": 0.42279805484652505, + "learning_rate": 8.18702078182352e-06, + "loss": 0.0241, + "step": 1519 + }, + { + "epoch": 0.3019167742576224, + "grad_norm": 0.4322955622464677, + "learning_rate": 8.184540922730237e-06, + "loss": 0.0149, + "step": 1520 + }, + { + "epoch": 0.30211540371437084, + "grad_norm": 1.2923583198099244, + "learning_rate": 8.182059744921745e-06, + "loss": 0.0179, + "step": 1521 + }, + { + "epoch": 0.30231403317111927, + "grad_norm": 1.0983654911483387, + "learning_rate": 8.179577249425501e-06, + "loss": 0.0132, + "step": 1522 + }, + { + "epoch": 0.3025126626278677, + "grad_norm": 0.6994601865320401, + "learning_rate": 8.177093437269503e-06, + "loss": 0.0159, + "step": 1523 + }, + { + "epoch": 0.30271129208461617, + "grad_norm": 0.9946803548598048, + "learning_rate": 8.174608309482293e-06, + "loss": 0.0112, + "step": 1524 + }, + { + "epoch": 0.3029099215413646, + "grad_norm": 0.5054848367639058, + "learning_rate": 8.17212186709296e-06, + "loss": 0.0079, + "step": 1525 + }, + { + "epoch": 0.303108550998113, + "grad_norm": 1.1552485716095362, + "learning_rate": 8.16963411113114e-06, + "loss": 0.0139, + "step": 1526 + }, + { + "epoch": 0.30330718045486144, + "grad_norm": 0.6284110896879137, + "learning_rate": 8.167145042627007e-06, + "loss": 0.011, + "step": 1527 + }, + { + "epoch": 0.3035058099116099, + "grad_norm": 0.381511628810998, + "learning_rate": 8.164654662611285e-06, + "loss": 0.0112, + "step": 1528 + }, + { + "epoch": 0.30370443936835834, + "grad_norm": 0.9578113568076267, + "learning_rate": 8.162162972115235e-06, + "loss": 0.0165, + "step": 1529 + }, + { + "epoch": 0.30390306882510676, + "grad_norm": 0.3108981457607181, + "learning_rate": 8.159669972170667e-06, + "loss": 0.0041, + "step": 1530 + }, + { + "epoch": 0.3041016982818552, + "grad_norm": 0.387381343425436, + "learning_rate": 8.157175663809926e-06, + "loss": 0.0119, + "step": 1531 + }, + { + "epoch": 0.30430032773860366, + "grad_norm": 0.7862216295997828, + "learning_rate": 8.154680048065905e-06, + "loss": 0.0224, + "step": 1532 + }, + { + "epoch": 0.3044989571953521, + "grad_norm": 0.9430871632430338, + "learning_rate": 8.152183125972036e-06, + "loss": 0.0301, + "step": 1533 + }, + { + "epoch": 0.3046975866521005, + "grad_norm": 0.532176799345402, + "learning_rate": 8.149684898562289e-06, + "loss": 0.0102, + "step": 1534 + }, + { + "epoch": 0.3048962161088489, + "grad_norm": 1.2585493832335066, + "learning_rate": 8.147185366871182e-06, + "loss": 0.0168, + "step": 1535 + }, + { + "epoch": 0.3050948455655974, + "grad_norm": 0.5332469990576837, + "learning_rate": 8.144684531933765e-06, + "loss": 0.0187, + "step": 1536 + }, + { + "epoch": 0.30529347502234583, + "grad_norm": 0.6578476556556282, + "learning_rate": 8.142182394785633e-06, + "loss": 0.0098, + "step": 1537 + }, + { + "epoch": 0.30549210447909425, + "grad_norm": 0.9710766601800311, + "learning_rate": 8.139678956462917e-06, + "loss": 0.0147, + "step": 1538 + }, + { + "epoch": 0.3056907339358427, + "grad_norm": 0.3280996551077205, + "learning_rate": 8.13717421800229e-06, + "loss": 0.0114, + "step": 1539 + }, + { + "epoch": 0.3058893633925911, + "grad_norm": 0.4189662797300033, + "learning_rate": 8.134668180440962e-06, + "loss": 0.0121, + "step": 1540 + }, + { + "epoch": 0.3060879928493396, + "grad_norm": 0.9921003354040996, + "learning_rate": 8.13216084481668e-06, + "loss": 0.0136, + "step": 1541 + }, + { + "epoch": 0.306286622306088, + "grad_norm": 1.00118711821387, + "learning_rate": 8.129652212167725e-06, + "loss": 0.0126, + "step": 1542 + }, + { + "epoch": 0.3064852517628364, + "grad_norm": 1.1824434262355368, + "learning_rate": 8.127142283532926e-06, + "loss": 0.0206, + "step": 1543 + }, + { + "epoch": 0.30668388121958484, + "grad_norm": 0.3324696515714837, + "learning_rate": 8.124631059951638e-06, + "loss": 0.0117, + "step": 1544 + }, + { + "epoch": 0.3068825106763333, + "grad_norm": 0.2722477636883906, + "learning_rate": 8.122118542463758e-06, + "loss": 0.0069, + "step": 1545 + }, + { + "epoch": 0.30708114013308174, + "grad_norm": 0.38654260321116607, + "learning_rate": 8.119604732109716e-06, + "loss": 0.0128, + "step": 1546 + }, + { + "epoch": 0.30727976958983017, + "grad_norm": 0.4093726871049893, + "learning_rate": 8.117089629930479e-06, + "loss": 0.0094, + "step": 1547 + }, + { + "epoch": 0.3074783990465786, + "grad_norm": 1.471363664471753, + "learning_rate": 8.114573236967546e-06, + "loss": 0.0154, + "step": 1548 + }, + { + "epoch": 0.30767702850332707, + "grad_norm": 0.8945599914230751, + "learning_rate": 8.112055554262956e-06, + "loss": 0.0225, + "step": 1549 + }, + { + "epoch": 0.3078756579600755, + "grad_norm": 0.5070483754063576, + "learning_rate": 8.109536582859276e-06, + "loss": 0.0139, + "step": 1550 + }, + { + "epoch": 0.3080742874168239, + "grad_norm": 0.4124395849167642, + "learning_rate": 8.107016323799612e-06, + "loss": 0.0157, + "step": 1551 + }, + { + "epoch": 0.30827291687357233, + "grad_norm": 0.7163339426933371, + "learning_rate": 8.1044947781276e-06, + "loss": 0.0188, + "step": 1552 + }, + { + "epoch": 0.3084715463303208, + "grad_norm": 0.663951545919708, + "learning_rate": 8.10197194688741e-06, + "loss": 0.0095, + "step": 1553 + }, + { + "epoch": 0.30867017578706923, + "grad_norm": 0.4033057526388188, + "learning_rate": 8.099447831123742e-06, + "loss": 0.0115, + "step": 1554 + }, + { + "epoch": 0.30886880524381766, + "grad_norm": 0.5134664101495625, + "learning_rate": 8.09692243188183e-06, + "loss": 0.0146, + "step": 1555 + }, + { + "epoch": 0.3090674347005661, + "grad_norm": 0.3272561207723999, + "learning_rate": 8.094395750207443e-06, + "loss": 0.0066, + "step": 1556 + }, + { + "epoch": 0.30926606415731456, + "grad_norm": 0.47522492760295715, + "learning_rate": 8.091867787146874e-06, + "loss": 0.0175, + "step": 1557 + }, + { + "epoch": 0.309464693614063, + "grad_norm": 0.28831801397227647, + "learning_rate": 8.08933854374695e-06, + "loss": 0.0087, + "step": 1558 + }, + { + "epoch": 0.3096633230708114, + "grad_norm": 0.728136763673717, + "learning_rate": 8.086808021055029e-06, + "loss": 0.0146, + "step": 1559 + }, + { + "epoch": 0.3098619525275598, + "grad_norm": 0.5645255218117098, + "learning_rate": 8.084276220118997e-06, + "loss": 0.0131, + "step": 1560 + }, + { + "epoch": 0.31006058198430825, + "grad_norm": 0.9114867700044899, + "learning_rate": 8.081743141987271e-06, + "loss": 0.0172, + "step": 1561 + }, + { + "epoch": 0.3102592114410567, + "grad_norm": 0.5128080694665796, + "learning_rate": 8.079208787708797e-06, + "loss": 0.0181, + "step": 1562 + }, + { + "epoch": 0.31045784089780515, + "grad_norm": 0.7452682016764512, + "learning_rate": 8.076673158333046e-06, + "loss": 0.0131, + "step": 1563 + }, + { + "epoch": 0.31065647035455357, + "grad_norm": 0.5224597185446708, + "learning_rate": 8.074136254910022e-06, + "loss": 0.0172, + "step": 1564 + }, + { + "epoch": 0.310855099811302, + "grad_norm": 0.2271594138378722, + "learning_rate": 8.071598078490254e-06, + "loss": 0.0061, + "step": 1565 + }, + { + "epoch": 0.31105372926805047, + "grad_norm": 0.3208897701348875, + "learning_rate": 8.069058630124798e-06, + "loss": 0.0087, + "step": 1566 + }, + { + "epoch": 0.3112523587247989, + "grad_norm": 0.4311676394198718, + "learning_rate": 8.066517910865235e-06, + "loss": 0.0127, + "step": 1567 + }, + { + "epoch": 0.3114509881815473, + "grad_norm": 0.3605691884352593, + "learning_rate": 8.063975921763675e-06, + "loss": 0.0109, + "step": 1568 + }, + { + "epoch": 0.31164961763829574, + "grad_norm": 0.5511650816451228, + "learning_rate": 8.061432663872757e-06, + "loss": 0.014, + "step": 1569 + }, + { + "epoch": 0.3118482470950442, + "grad_norm": 0.4705883402855219, + "learning_rate": 8.058888138245639e-06, + "loss": 0.018, + "step": 1570 + }, + { + "epoch": 0.31204687655179264, + "grad_norm": 0.427333107085936, + "learning_rate": 8.056342345936005e-06, + "loss": 0.013, + "step": 1571 + }, + { + "epoch": 0.31224550600854106, + "grad_norm": 0.9899499461011341, + "learning_rate": 8.053795287998065e-06, + "loss": 0.0104, + "step": 1572 + }, + { + "epoch": 0.3124441354652895, + "grad_norm": 0.4747912372785505, + "learning_rate": 8.051246965486557e-06, + "loss": 0.0073, + "step": 1573 + }, + { + "epoch": 0.31264276492203796, + "grad_norm": 0.441160461346295, + "learning_rate": 8.048697379456733e-06, + "loss": 0.0159, + "step": 1574 + }, + { + "epoch": 0.3128413943787864, + "grad_norm": 0.34014297933534443, + "learning_rate": 8.04614653096438e-06, + "loss": 0.0097, + "step": 1575 + }, + { + "epoch": 0.3130400238355348, + "grad_norm": 1.2792886913012314, + "learning_rate": 8.043594421065796e-06, + "loss": 0.017, + "step": 1576 + }, + { + "epoch": 0.31323865329228323, + "grad_norm": 0.6529038511786388, + "learning_rate": 8.041041050817813e-06, + "loss": 0.0155, + "step": 1577 + }, + { + "epoch": 0.31343728274903165, + "grad_norm": 0.4600809221001575, + "learning_rate": 8.038486421277775e-06, + "loss": 0.0093, + "step": 1578 + }, + { + "epoch": 0.31363591220578013, + "grad_norm": 0.5257994692013639, + "learning_rate": 8.035930533503554e-06, + "loss": 0.012, + "step": 1579 + }, + { + "epoch": 0.31383454166252855, + "grad_norm": 1.482595268996986, + "learning_rate": 8.033373388553538e-06, + "loss": 0.0126, + "step": 1580 + }, + { + "epoch": 0.314033171119277, + "grad_norm": 0.697904773183433, + "learning_rate": 8.030814987486639e-06, + "loss": 0.0129, + "step": 1581 + }, + { + "epoch": 0.3142318005760254, + "grad_norm": 1.1437565550456443, + "learning_rate": 8.028255331362292e-06, + "loss": 0.0153, + "step": 1582 + }, + { + "epoch": 0.3144304300327739, + "grad_norm": 0.7568186293758398, + "learning_rate": 8.025694421240442e-06, + "loss": 0.0098, + "step": 1583 + }, + { + "epoch": 0.3146290594895223, + "grad_norm": 1.2713687932469087, + "learning_rate": 8.023132258181563e-06, + "loss": 0.0233, + "step": 1584 + }, + { + "epoch": 0.3148276889462707, + "grad_norm": 0.6183628181054079, + "learning_rate": 8.020568843246642e-06, + "loss": 0.0174, + "step": 1585 + }, + { + "epoch": 0.31502631840301915, + "grad_norm": 0.38716038710078116, + "learning_rate": 8.01800417749719e-06, + "loss": 0.0072, + "step": 1586 + }, + { + "epoch": 0.3152249478597676, + "grad_norm": 0.43109273141996973, + "learning_rate": 8.015438261995229e-06, + "loss": 0.0085, + "step": 1587 + }, + { + "epoch": 0.31542357731651605, + "grad_norm": 0.72428449579244, + "learning_rate": 8.012871097803303e-06, + "loss": 0.0134, + "step": 1588 + }, + { + "epoch": 0.31562220677326447, + "grad_norm": 0.4444888127096748, + "learning_rate": 8.010302685984473e-06, + "loss": 0.0118, + "step": 1589 + }, + { + "epoch": 0.3158208362300129, + "grad_norm": 0.5112452513036903, + "learning_rate": 8.007733027602315e-06, + "loss": 0.0085, + "step": 1590 + }, + { + "epoch": 0.31601946568676137, + "grad_norm": 0.5630116305420747, + "learning_rate": 8.005162123720924e-06, + "loss": 0.0096, + "step": 1591 + }, + { + "epoch": 0.3162180951435098, + "grad_norm": 0.7437776670920336, + "learning_rate": 8.002589975404907e-06, + "loss": 0.0144, + "step": 1592 + }, + { + "epoch": 0.3164167246002582, + "grad_norm": 0.970854724356747, + "learning_rate": 8.000016583719386e-06, + "loss": 0.0215, + "step": 1593 + }, + { + "epoch": 0.31661535405700664, + "grad_norm": 0.8828896563831873, + "learning_rate": 7.997441949730003e-06, + "loss": 0.0133, + "step": 1594 + }, + { + "epoch": 0.3168139835137551, + "grad_norm": 0.6160271865646878, + "learning_rate": 7.994866074502911e-06, + "loss": 0.0135, + "step": 1595 + }, + { + "epoch": 0.31701261297050354, + "grad_norm": 0.5364322361987068, + "learning_rate": 7.992288959104776e-06, + "loss": 0.0169, + "step": 1596 + }, + { + "epoch": 0.31721124242725196, + "grad_norm": 0.46568042207568855, + "learning_rate": 7.98971060460278e-06, + "loss": 0.0122, + "step": 1597 + }, + { + "epoch": 0.3174098718840004, + "grad_norm": 0.5264630544016791, + "learning_rate": 7.987131012064615e-06, + "loss": 0.0129, + "step": 1598 + }, + { + "epoch": 0.3176085013407488, + "grad_norm": 0.6009801290738634, + "learning_rate": 7.984550182558492e-06, + "loss": 0.0112, + "step": 1599 + }, + { + "epoch": 0.3178071307974973, + "grad_norm": 0.6884006171260445, + "learning_rate": 7.981968117153125e-06, + "loss": 0.0191, + "step": 1600 + }, + { + "epoch": 0.3180057602542457, + "grad_norm": 0.3982485689190538, + "learning_rate": 7.979384816917748e-06, + "loss": 0.0111, + "step": 1601 + }, + { + "epoch": 0.31820438971099413, + "grad_norm": 0.30136377831303135, + "learning_rate": 7.9768002829221e-06, + "loss": 0.0113, + "step": 1602 + }, + { + "epoch": 0.31840301916774255, + "grad_norm": 0.8655679733419154, + "learning_rate": 7.974214516236438e-06, + "loss": 0.0196, + "step": 1603 + }, + { + "epoch": 0.31860164862449103, + "grad_norm": 0.8439790204628401, + "learning_rate": 7.971627517931523e-06, + "loss": 0.0159, + "step": 1604 + }, + { + "epoch": 0.31880027808123945, + "grad_norm": 0.6162008342884321, + "learning_rate": 7.96903928907863e-06, + "loss": 0.0172, + "step": 1605 + }, + { + "epoch": 0.3189989075379879, + "grad_norm": 0.2737773167439073, + "learning_rate": 7.966449830749538e-06, + "loss": 0.0172, + "step": 1606 + }, + { + "epoch": 0.3191975369947363, + "grad_norm": 0.505600845809124, + "learning_rate": 7.963859144016544e-06, + "loss": 0.0164, + "step": 1607 + }, + { + "epoch": 0.3193961664514848, + "grad_norm": 0.796834862743995, + "learning_rate": 7.961267229952444e-06, + "loss": 0.011, + "step": 1608 + }, + { + "epoch": 0.3195947959082332, + "grad_norm": 1.2150625750472108, + "learning_rate": 7.958674089630551e-06, + "loss": 0.0147, + "step": 1609 + }, + { + "epoch": 0.3197934253649816, + "grad_norm": 0.3751880775430107, + "learning_rate": 7.956079724124681e-06, + "loss": 0.0131, + "step": 1610 + }, + { + "epoch": 0.31999205482173004, + "grad_norm": 0.2392680116727292, + "learning_rate": 7.953484134509158e-06, + "loss": 0.0075, + "step": 1611 + }, + { + "epoch": 0.3201906842784785, + "grad_norm": 0.3297290154412873, + "learning_rate": 7.950887321858811e-06, + "loss": 0.0084, + "step": 1612 + }, + { + "epoch": 0.32038931373522694, + "grad_norm": 0.6362428644715254, + "learning_rate": 7.948289287248979e-06, + "loss": 0.0113, + "step": 1613 + }, + { + "epoch": 0.32058794319197537, + "grad_norm": 0.703329270973474, + "learning_rate": 7.945690031755506e-06, + "loss": 0.0121, + "step": 1614 + }, + { + "epoch": 0.3207865726487238, + "grad_norm": 1.1210492893128057, + "learning_rate": 7.943089556454742e-06, + "loss": 0.0161, + "step": 1615 + }, + { + "epoch": 0.32098520210547227, + "grad_norm": 0.7398560335941972, + "learning_rate": 7.940487862423538e-06, + "loss": 0.0153, + "step": 1616 + }, + { + "epoch": 0.3211838315622207, + "grad_norm": 0.592690898155172, + "learning_rate": 7.937884950739255e-06, + "loss": 0.0126, + "step": 1617 + }, + { + "epoch": 0.3213824610189691, + "grad_norm": 0.3885094064525805, + "learning_rate": 7.935280822479758e-06, + "loss": 0.0126, + "step": 1618 + }, + { + "epoch": 0.32158109047571753, + "grad_norm": 0.37641651576137636, + "learning_rate": 7.93267547872341e-06, + "loss": 0.0158, + "step": 1619 + }, + { + "epoch": 0.32177971993246596, + "grad_norm": 0.5131102552628134, + "learning_rate": 7.930068920549084e-06, + "loss": 0.0077, + "step": 1620 + }, + { + "epoch": 0.32197834938921444, + "grad_norm": 0.6672345424407569, + "learning_rate": 7.927461149036153e-06, + "loss": 0.0154, + "step": 1621 + }, + { + "epoch": 0.32217697884596286, + "grad_norm": 0.7516794337115568, + "learning_rate": 7.924852165264491e-06, + "loss": 0.0119, + "step": 1622 + }, + { + "epoch": 0.3223756083027113, + "grad_norm": 0.3990098639952792, + "learning_rate": 7.922241970314475e-06, + "loss": 0.0089, + "step": 1623 + }, + { + "epoch": 0.3225742377594597, + "grad_norm": 0.48114802980288773, + "learning_rate": 7.919630565266987e-06, + "loss": 0.0123, + "step": 1624 + }, + { + "epoch": 0.3227728672162082, + "grad_norm": 0.5316138344004034, + "learning_rate": 7.917017951203406e-06, + "loss": 0.0064, + "step": 1625 + }, + { + "epoch": 0.3229714966729566, + "grad_norm": 0.30950798522402956, + "learning_rate": 7.91440412920561e-06, + "loss": 0.004, + "step": 1626 + }, + { + "epoch": 0.323170126129705, + "grad_norm": 0.8119698454248738, + "learning_rate": 7.911789100355985e-06, + "loss": 0.017, + "step": 1627 + }, + { + "epoch": 0.32336875558645345, + "grad_norm": 0.3266822113853905, + "learning_rate": 7.90917286573741e-06, + "loss": 0.0112, + "step": 1628 + }, + { + "epoch": 0.3235673850432019, + "grad_norm": 0.5213109216327177, + "learning_rate": 7.906555426433264e-06, + "loss": 0.0084, + "step": 1629 + }, + { + "epoch": 0.32376601449995035, + "grad_norm": 1.0671826630070904, + "learning_rate": 7.903936783527425e-06, + "loss": 0.0198, + "step": 1630 + }, + { + "epoch": 0.3239646439566988, + "grad_norm": 1.0574028610546085, + "learning_rate": 7.901316938104275e-06, + "loss": 0.0148, + "step": 1631 + }, + { + "epoch": 0.3241632734134472, + "grad_norm": 0.46764434451459613, + "learning_rate": 7.898695891248685e-06, + "loss": 0.0064, + "step": 1632 + }, + { + "epoch": 0.3243619028701957, + "grad_norm": 0.8539693335630968, + "learning_rate": 7.896073644046028e-06, + "loss": 0.0135, + "step": 1633 + }, + { + "epoch": 0.3245605323269441, + "grad_norm": 0.8064389257572152, + "learning_rate": 7.893450197582178e-06, + "loss": 0.0117, + "step": 1634 + }, + { + "epoch": 0.3247591617836925, + "grad_norm": 0.34763349103097907, + "learning_rate": 7.890825552943495e-06, + "loss": 0.0075, + "step": 1635 + }, + { + "epoch": 0.32495779124044094, + "grad_norm": 0.48244504223164625, + "learning_rate": 7.888199711216848e-06, + "loss": 0.0138, + "step": 1636 + }, + { + "epoch": 0.3251564206971894, + "grad_norm": 0.5896875038752724, + "learning_rate": 7.885572673489592e-06, + "loss": 0.0126, + "step": 1637 + }, + { + "epoch": 0.32535505015393784, + "grad_norm": 0.6571748595454835, + "learning_rate": 7.882944440849582e-06, + "loss": 0.0117, + "step": 1638 + }, + { + "epoch": 0.32555367961068626, + "grad_norm": 0.31453304815725347, + "learning_rate": 7.880315014385166e-06, + "loss": 0.0048, + "step": 1639 + }, + { + "epoch": 0.3257523090674347, + "grad_norm": 1.1519220230010478, + "learning_rate": 7.877684395185187e-06, + "loss": 0.0127, + "step": 1640 + }, + { + "epoch": 0.3259509385241831, + "grad_norm": 0.770459673164255, + "learning_rate": 7.875052584338983e-06, + "loss": 0.0206, + "step": 1641 + }, + { + "epoch": 0.3261495679809316, + "grad_norm": 0.7015223884365756, + "learning_rate": 7.872419582936382e-06, + "loss": 0.0175, + "step": 1642 + }, + { + "epoch": 0.32634819743768, + "grad_norm": 0.6126064018057952, + "learning_rate": 7.869785392067707e-06, + "loss": 0.0171, + "step": 1643 + }, + { + "epoch": 0.32654682689442843, + "grad_norm": 0.43187448867276834, + "learning_rate": 7.867150012823777e-06, + "loss": 0.0134, + "step": 1644 + }, + { + "epoch": 0.32674545635117686, + "grad_norm": 0.9447876200418266, + "learning_rate": 7.864513446295896e-06, + "loss": 0.0143, + "step": 1645 + }, + { + "epoch": 0.32694408580792533, + "grad_norm": 0.5123267111128992, + "learning_rate": 7.861875693575866e-06, + "loss": 0.0092, + "step": 1646 + }, + { + "epoch": 0.32714271526467376, + "grad_norm": 0.8471560674383536, + "learning_rate": 7.859236755755978e-06, + "loss": 0.0166, + "step": 1647 + }, + { + "epoch": 0.3273413447214222, + "grad_norm": 1.1733673039140131, + "learning_rate": 7.856596633929012e-06, + "loss": 0.0225, + "step": 1648 + }, + { + "epoch": 0.3275399741781706, + "grad_norm": 0.5145085603141826, + "learning_rate": 7.85395532918824e-06, + "loss": 0.0169, + "step": 1649 + }, + { + "epoch": 0.3277386036349191, + "grad_norm": 0.4349990032596344, + "learning_rate": 7.851312842627426e-06, + "loss": 0.0179, + "step": 1650 + }, + { + "epoch": 0.3279372330916675, + "grad_norm": 0.6127954085037567, + "learning_rate": 7.848669175340818e-06, + "loss": 0.0163, + "step": 1651 + }, + { + "epoch": 0.3281358625484159, + "grad_norm": 0.6906096363587637, + "learning_rate": 7.846024328423157e-06, + "loss": 0.0136, + "step": 1652 + }, + { + "epoch": 0.32833449200516435, + "grad_norm": 0.3831242188980579, + "learning_rate": 7.843378302969674e-06, + "loss": 0.0115, + "step": 1653 + }, + { + "epoch": 0.3285331214619128, + "grad_norm": 0.31854883023090935, + "learning_rate": 7.840731100076081e-06, + "loss": 0.0119, + "step": 1654 + }, + { + "epoch": 0.32873175091866125, + "grad_norm": 0.6135826551661053, + "learning_rate": 7.838082720838585e-06, + "loss": 0.0116, + "step": 1655 + }, + { + "epoch": 0.32893038037540967, + "grad_norm": 0.6788478557117993, + "learning_rate": 7.835433166353876e-06, + "loss": 0.0196, + "step": 1656 + }, + { + "epoch": 0.3291290098321581, + "grad_norm": 0.5204341679322476, + "learning_rate": 7.832782437719132e-06, + "loss": 0.0176, + "step": 1657 + }, + { + "epoch": 0.32932763928890657, + "grad_norm": 0.7020429742008044, + "learning_rate": 7.830130536032017e-06, + "loss": 0.0101, + "step": 1658 + }, + { + "epoch": 0.329526268745655, + "grad_norm": 0.782017865564206, + "learning_rate": 7.827477462390683e-06, + "loss": 0.0156, + "step": 1659 + }, + { + "epoch": 0.3297248982024034, + "grad_norm": 0.7872235943247256, + "learning_rate": 7.824823217893762e-06, + "loss": 0.0183, + "step": 1660 + }, + { + "epoch": 0.32992352765915184, + "grad_norm": 0.9577077977386234, + "learning_rate": 7.822167803640375e-06, + "loss": 0.017, + "step": 1661 + }, + { + "epoch": 0.33012215711590026, + "grad_norm": 0.3392933900875892, + "learning_rate": 7.819511220730127e-06, + "loss": 0.0069, + "step": 1662 + }, + { + "epoch": 0.33032078657264874, + "grad_norm": 0.769515283319616, + "learning_rate": 7.816853470263107e-06, + "loss": 0.0132, + "step": 1663 + }, + { + "epoch": 0.33051941602939716, + "grad_norm": 0.41436036382418884, + "learning_rate": 7.814194553339884e-06, + "loss": 0.0093, + "step": 1664 + }, + { + "epoch": 0.3307180454861456, + "grad_norm": 0.7489421767929256, + "learning_rate": 7.811534471061516e-06, + "loss": 0.015, + "step": 1665 + }, + { + "epoch": 0.330916674942894, + "grad_norm": 0.5296856540549193, + "learning_rate": 7.80887322452954e-06, + "loss": 0.0173, + "step": 1666 + }, + { + "epoch": 0.3311153043996425, + "grad_norm": 0.5649510594604672, + "learning_rate": 7.806210814845974e-06, + "loss": 0.0102, + "step": 1667 + }, + { + "epoch": 0.3313139338563909, + "grad_norm": 0.4243877853275429, + "learning_rate": 7.803547243113319e-06, + "loss": 0.0108, + "step": 1668 + }, + { + "epoch": 0.33151256331313933, + "grad_norm": 0.6981372474644114, + "learning_rate": 7.800882510434559e-06, + "loss": 0.013, + "step": 1669 + }, + { + "epoch": 0.33171119276988775, + "grad_norm": 0.5550131373261892, + "learning_rate": 7.798216617913155e-06, + "loss": 0.0213, + "step": 1670 + }, + { + "epoch": 0.33190982222663623, + "grad_norm": 0.867353442467047, + "learning_rate": 7.795549566653054e-06, + "loss": 0.0116, + "step": 1671 + }, + { + "epoch": 0.33210845168338465, + "grad_norm": 0.587120077251956, + "learning_rate": 7.792881357758674e-06, + "loss": 0.0177, + "step": 1672 + }, + { + "epoch": 0.3323070811401331, + "grad_norm": 0.5107619089303768, + "learning_rate": 7.790211992334923e-06, + "loss": 0.0184, + "step": 1673 + }, + { + "epoch": 0.3325057105968815, + "grad_norm": 0.9529228340714601, + "learning_rate": 7.787541471487178e-06, + "loss": 0.0116, + "step": 1674 + }, + { + "epoch": 0.33270434005363, + "grad_norm": 0.32762475045703154, + "learning_rate": 7.784869796321302e-06, + "loss": 0.0111, + "step": 1675 + }, + { + "epoch": 0.3329029695103784, + "grad_norm": 1.0160627610759643, + "learning_rate": 7.782196967943633e-06, + "loss": 0.0174, + "step": 1676 + }, + { + "epoch": 0.3331015989671268, + "grad_norm": 0.8755877388840554, + "learning_rate": 7.779522987460985e-06, + "loss": 0.0241, + "step": 1677 + }, + { + "epoch": 0.33330022842387524, + "grad_norm": 0.4517222646647724, + "learning_rate": 7.776847855980653e-06, + "loss": 0.0105, + "step": 1678 + }, + { + "epoch": 0.3334988578806237, + "grad_norm": 0.4723207594634246, + "learning_rate": 7.774171574610404e-06, + "loss": 0.0156, + "step": 1679 + }, + { + "epoch": 0.33369748733737215, + "grad_norm": 0.5220346418239198, + "learning_rate": 7.771494144458483e-06, + "loss": 0.0116, + "step": 1680 + }, + { + "epoch": 0.33389611679412057, + "grad_norm": 0.2821678482563408, + "learning_rate": 7.768815566633612e-06, + "loss": 0.0082, + "step": 1681 + }, + { + "epoch": 0.334094746250869, + "grad_norm": 1.42187836812918, + "learning_rate": 7.766135842244988e-06, + "loss": 0.0241, + "step": 1682 + }, + { + "epoch": 0.3342933757076174, + "grad_norm": 0.6174532778541035, + "learning_rate": 7.763454972402282e-06, + "loss": 0.0128, + "step": 1683 + }, + { + "epoch": 0.3344920051643659, + "grad_norm": 0.5883317240537213, + "learning_rate": 7.76077295821564e-06, + "loss": 0.0144, + "step": 1684 + }, + { + "epoch": 0.3346906346211143, + "grad_norm": 0.3588975768808438, + "learning_rate": 7.75808980079568e-06, + "loss": 0.0097, + "step": 1685 + }, + { + "epoch": 0.33488926407786274, + "grad_norm": 0.5875630793918583, + "learning_rate": 7.755405501253496e-06, + "loss": 0.0157, + "step": 1686 + }, + { + "epoch": 0.33508789353461116, + "grad_norm": 0.7281884043493897, + "learning_rate": 7.752720060700652e-06, + "loss": 0.02, + "step": 1687 + }, + { + "epoch": 0.33528652299135964, + "grad_norm": 0.6001706983891172, + "learning_rate": 7.750033480249188e-06, + "loss": 0.0155, + "step": 1688 + }, + { + "epoch": 0.33548515244810806, + "grad_norm": 0.2997842597354729, + "learning_rate": 7.747345761011616e-06, + "loss": 0.0135, + "step": 1689 + }, + { + "epoch": 0.3356837819048565, + "grad_norm": 0.5174410489177036, + "learning_rate": 7.744656904100913e-06, + "loss": 0.0093, + "step": 1690 + }, + { + "epoch": 0.3358824113616049, + "grad_norm": 0.5480320708273444, + "learning_rate": 7.741966910630536e-06, + "loss": 0.0172, + "step": 1691 + }, + { + "epoch": 0.3360810408183534, + "grad_norm": 0.4344915109534724, + "learning_rate": 7.739275781714405e-06, + "loss": 0.0129, + "step": 1692 + }, + { + "epoch": 0.3362796702751018, + "grad_norm": 0.8908712492790886, + "learning_rate": 7.736583518466919e-06, + "loss": 0.0127, + "step": 1693 + }, + { + "epoch": 0.33647829973185023, + "grad_norm": 0.849787263342614, + "learning_rate": 7.733890122002936e-06, + "loss": 0.0119, + "step": 1694 + }, + { + "epoch": 0.33667692918859865, + "grad_norm": 0.5855213279534514, + "learning_rate": 7.731195593437793e-06, + "loss": 0.012, + "step": 1695 + }, + { + "epoch": 0.33687555864534713, + "grad_norm": 0.6288057804890751, + "learning_rate": 7.728499933887288e-06, + "loss": 0.0142, + "step": 1696 + }, + { + "epoch": 0.33707418810209555, + "grad_norm": 0.43026765663973393, + "learning_rate": 7.725803144467695e-06, + "loss": 0.0116, + "step": 1697 + }, + { + "epoch": 0.337272817558844, + "grad_norm": 0.36782345928729954, + "learning_rate": 7.723105226295749e-06, + "loss": 0.0086, + "step": 1698 + }, + { + "epoch": 0.3374714470155924, + "grad_norm": 0.9472170162174446, + "learning_rate": 7.720406180488655e-06, + "loss": 0.0262, + "step": 1699 + }, + { + "epoch": 0.3376700764723409, + "grad_norm": 0.3625055211307673, + "learning_rate": 7.717706008164085e-06, + "loss": 0.0079, + "step": 1700 + }, + { + "epoch": 0.3378687059290893, + "grad_norm": 0.2153089494894364, + "learning_rate": 7.715004710440181e-06, + "loss": 0.0045, + "step": 1701 + }, + { + "epoch": 0.3380673353858377, + "grad_norm": 1.055365098303852, + "learning_rate": 7.712302288435545e-06, + "loss": 0.0146, + "step": 1702 + }, + { + "epoch": 0.33826596484258614, + "grad_norm": 1.593570003099904, + "learning_rate": 7.709598743269246e-06, + "loss": 0.0264, + "step": 1703 + }, + { + "epoch": 0.33846459429933456, + "grad_norm": 0.8049164644185485, + "learning_rate": 7.70689407606082e-06, + "loss": 0.0127, + "step": 1704 + }, + { + "epoch": 0.33866322375608304, + "grad_norm": 1.02619924035175, + "learning_rate": 7.70418828793027e-06, + "loss": 0.0222, + "step": 1705 + }, + { + "epoch": 0.33886185321283147, + "grad_norm": 0.3728588121113957, + "learning_rate": 7.701481379998057e-06, + "loss": 0.0108, + "step": 1706 + }, + { + "epoch": 0.3390604826695799, + "grad_norm": 0.6574219495167848, + "learning_rate": 7.698773353385111e-06, + "loss": 0.0121, + "step": 1707 + }, + { + "epoch": 0.3392591121263283, + "grad_norm": 0.38503544600597717, + "learning_rate": 7.696064209212822e-06, + "loss": 0.0112, + "step": 1708 + }, + { + "epoch": 0.3394577415830768, + "grad_norm": 0.558349825474669, + "learning_rate": 7.693353948603041e-06, + "loss": 0.008, + "step": 1709 + }, + { + "epoch": 0.3396563710398252, + "grad_norm": 2.1539383186369774, + "learning_rate": 7.69064257267809e-06, + "loss": 0.0121, + "step": 1710 + }, + { + "epoch": 0.33985500049657363, + "grad_norm": 0.41496259994783097, + "learning_rate": 7.687930082560744e-06, + "loss": 0.0127, + "step": 1711 + }, + { + "epoch": 0.34005362995332206, + "grad_norm": 0.48924590860014766, + "learning_rate": 7.685216479374242e-06, + "loss": 0.0137, + "step": 1712 + }, + { + "epoch": 0.34025225941007053, + "grad_norm": 0.45250983111826154, + "learning_rate": 7.682501764242284e-06, + "loss": 0.0089, + "step": 1713 + }, + { + "epoch": 0.34045088886681896, + "grad_norm": 0.45711497918986704, + "learning_rate": 7.679785938289032e-06, + "loss": 0.0132, + "step": 1714 + }, + { + "epoch": 0.3406495183235674, + "grad_norm": 0.44128917687358954, + "learning_rate": 7.677069002639109e-06, + "loss": 0.014, + "step": 1715 + }, + { + "epoch": 0.3408481477803158, + "grad_norm": 0.7869148221018819, + "learning_rate": 7.674350958417589e-06, + "loss": 0.018, + "step": 1716 + }, + { + "epoch": 0.3410467772370643, + "grad_norm": 0.4703664064866459, + "learning_rate": 7.671631806750018e-06, + "loss": 0.0144, + "step": 1717 + }, + { + "epoch": 0.3412454066938127, + "grad_norm": 0.42705552055783985, + "learning_rate": 7.66891154876239e-06, + "loss": 0.0067, + "step": 1718 + }, + { + "epoch": 0.3414440361505611, + "grad_norm": 1.4239786275656603, + "learning_rate": 7.666190185581164e-06, + "loss": 0.0215, + "step": 1719 + }, + { + "epoch": 0.34164266560730955, + "grad_norm": 0.8964200804878224, + "learning_rate": 7.66346771833325e-06, + "loss": 0.0105, + "step": 1720 + }, + { + "epoch": 0.341841295064058, + "grad_norm": 0.3234278375891176, + "learning_rate": 7.660744148146022e-06, + "loss": 0.0204, + "step": 1721 + }, + { + "epoch": 0.34203992452080645, + "grad_norm": 0.6931183761335391, + "learning_rate": 7.658019476147307e-06, + "loss": 0.0201, + "step": 1722 + }, + { + "epoch": 0.34223855397755487, + "grad_norm": 0.7400781197903014, + "learning_rate": 7.65529370346539e-06, + "loss": 0.0164, + "step": 1723 + }, + { + "epoch": 0.3424371834343033, + "grad_norm": 0.4335482304440916, + "learning_rate": 7.652566831229007e-06, + "loss": 0.0091, + "step": 1724 + }, + { + "epoch": 0.3426358128910517, + "grad_norm": 0.1969898697189572, + "learning_rate": 7.649838860567356e-06, + "loss": 0.0038, + "step": 1725 + }, + { + "epoch": 0.3428344423478002, + "grad_norm": 0.6032444248728773, + "learning_rate": 7.647109792610087e-06, + "loss": 0.0094, + "step": 1726 + }, + { + "epoch": 0.3430330718045486, + "grad_norm": 0.4008378028364625, + "learning_rate": 7.644379628487305e-06, + "loss": 0.0116, + "step": 1727 + }, + { + "epoch": 0.34323170126129704, + "grad_norm": 0.45794703272131426, + "learning_rate": 7.641648369329566e-06, + "loss": 0.0081, + "step": 1728 + }, + { + "epoch": 0.34343033071804546, + "grad_norm": 0.8915628644941473, + "learning_rate": 7.638916016267884e-06, + "loss": 0.0136, + "step": 1729 + }, + { + "epoch": 0.34362896017479394, + "grad_norm": 0.4072293871212264, + "learning_rate": 7.63618257043372e-06, + "loss": 0.0135, + "step": 1730 + }, + { + "epoch": 0.34382758963154236, + "grad_norm": 0.48094267145550845, + "learning_rate": 7.633448032958994e-06, + "loss": 0.0169, + "step": 1731 + }, + { + "epoch": 0.3440262190882908, + "grad_norm": 0.546937299007096, + "learning_rate": 7.630712404976075e-06, + "loss": 0.0113, + "step": 1732 + }, + { + "epoch": 0.3442248485450392, + "grad_norm": 0.595885300176229, + "learning_rate": 7.6279756876177835e-06, + "loss": 0.0156, + "step": 1733 + }, + { + "epoch": 0.3444234780017877, + "grad_norm": 0.3227233399899523, + "learning_rate": 7.6252378820173915e-06, + "loss": 0.0064, + "step": 1734 + }, + { + "epoch": 0.3446221074585361, + "grad_norm": 0.6990835307410963, + "learning_rate": 7.622498989308622e-06, + "loss": 0.0197, + "step": 1735 + }, + { + "epoch": 0.34482073691528453, + "grad_norm": 0.67866449998373, + "learning_rate": 7.619759010625647e-06, + "loss": 0.0113, + "step": 1736 + }, + { + "epoch": 0.34501936637203295, + "grad_norm": 0.7839634602784449, + "learning_rate": 7.617017947103089e-06, + "loss": 0.0128, + "step": 1737 + }, + { + "epoch": 0.34521799582878143, + "grad_norm": 0.2870079380203802, + "learning_rate": 7.614275799876021e-06, + "loss": 0.0045, + "step": 1738 + }, + { + "epoch": 0.34541662528552985, + "grad_norm": 0.4689823234820678, + "learning_rate": 7.61153257007996e-06, + "loss": 0.0104, + "step": 1739 + }, + { + "epoch": 0.3456152547422783, + "grad_norm": 0.4687644833338894, + "learning_rate": 7.608788258850879e-06, + "loss": 0.0116, + "step": 1740 + }, + { + "epoch": 0.3458138841990267, + "grad_norm": 0.5581341559370254, + "learning_rate": 7.6060428673251915e-06, + "loss": 0.0102, + "step": 1741 + }, + { + "epoch": 0.3460125136557752, + "grad_norm": 0.5745297067207734, + "learning_rate": 7.603296396639763e-06, + "loss": 0.0129, + "step": 1742 + }, + { + "epoch": 0.3462111431125236, + "grad_norm": 0.5785097862293327, + "learning_rate": 7.600548847931903e-06, + "loss": 0.0101, + "step": 1743 + }, + { + "epoch": 0.346409772569272, + "grad_norm": 0.6057076832337547, + "learning_rate": 7.597800222339371e-06, + "loss": 0.0113, + "step": 1744 + }, + { + "epoch": 0.34660840202602045, + "grad_norm": 0.7114365255059848, + "learning_rate": 7.595050521000367e-06, + "loss": 0.0147, + "step": 1745 + }, + { + "epoch": 0.34680703148276887, + "grad_norm": 0.9549334149084857, + "learning_rate": 7.5922997450535405e-06, + "loss": 0.0079, + "step": 1746 + }, + { + "epoch": 0.34700566093951735, + "grad_norm": 0.9194225699846286, + "learning_rate": 7.589547895637987e-06, + "loss": 0.0176, + "step": 1747 + }, + { + "epoch": 0.34720429039626577, + "grad_norm": 0.317751182542115, + "learning_rate": 7.586794973893241e-06, + "loss": 0.0062, + "step": 1748 + }, + { + "epoch": 0.3474029198530142, + "grad_norm": 0.7272366693065906, + "learning_rate": 7.584040980959288e-06, + "loss": 0.0114, + "step": 1749 + }, + { + "epoch": 0.3476015493097626, + "grad_norm": 0.5400337381754199, + "learning_rate": 7.5812859179765555e-06, + "loss": 0.0084, + "step": 1750 + }, + { + "epoch": 0.3478001787665111, + "grad_norm": 1.0261218838104105, + "learning_rate": 7.578529786085904e-06, + "loss": 0.0144, + "step": 1751 + }, + { + "epoch": 0.3479988082232595, + "grad_norm": 0.5694769536171905, + "learning_rate": 7.5757725864286536e-06, + "loss": 0.0104, + "step": 1752 + }, + { + "epoch": 0.34819743768000794, + "grad_norm": 0.521723957173526, + "learning_rate": 7.573014320146554e-06, + "loss": 0.0119, + "step": 1753 + }, + { + "epoch": 0.34839606713675636, + "grad_norm": 0.6919236767327321, + "learning_rate": 7.570254988381801e-06, + "loss": 0.0134, + "step": 1754 + }, + { + "epoch": 0.34859469659350484, + "grad_norm": 0.5814649266815375, + "learning_rate": 7.567494592277031e-06, + "loss": 0.017, + "step": 1755 + }, + { + "epoch": 0.34879332605025326, + "grad_norm": 0.5249271812496219, + "learning_rate": 7.564733132975321e-06, + "loss": 0.011, + "step": 1756 + }, + { + "epoch": 0.3489919555070017, + "grad_norm": 0.3963080625064441, + "learning_rate": 7.561970611620191e-06, + "loss": 0.0111, + "step": 1757 + }, + { + "epoch": 0.3491905849637501, + "grad_norm": 0.7223775018258682, + "learning_rate": 7.559207029355593e-06, + "loss": 0.0173, + "step": 1758 + }, + { + "epoch": 0.3493892144204986, + "grad_norm": 0.50134557033894, + "learning_rate": 7.5564423873259306e-06, + "loss": 0.0104, + "step": 1759 + }, + { + "epoch": 0.349587843877247, + "grad_norm": 0.6488452488216098, + "learning_rate": 7.553676686676034e-06, + "loss": 0.0175, + "step": 1760 + }, + { + "epoch": 0.34978647333399543, + "grad_norm": 0.49384847104699575, + "learning_rate": 7.5509099285511775e-06, + "loss": 0.0141, + "step": 1761 + }, + { + "epoch": 0.34998510279074385, + "grad_norm": 0.3953416148883208, + "learning_rate": 7.548142114097077e-06, + "loss": 0.012, + "step": 1762 + }, + { + "epoch": 0.35018373224749233, + "grad_norm": 0.666969527433996, + "learning_rate": 7.545373244459877e-06, + "loss": 0.0129, + "step": 1763 + }, + { + "epoch": 0.35038236170424075, + "grad_norm": 0.561354252191077, + "learning_rate": 7.542603320786166e-06, + "loss": 0.0208, + "step": 1764 + }, + { + "epoch": 0.3505809911609892, + "grad_norm": 0.6255119965679132, + "learning_rate": 7.539832344222966e-06, + "loss": 0.0152, + "step": 1765 + }, + { + "epoch": 0.3507796206177376, + "grad_norm": 0.4306557341184174, + "learning_rate": 7.537060315917734e-06, + "loss": 0.0211, + "step": 1766 + }, + { + "epoch": 0.350978250074486, + "grad_norm": 0.5528048136181183, + "learning_rate": 7.53428723701837e-06, + "loss": 0.0238, + "step": 1767 + }, + { + "epoch": 0.3511768795312345, + "grad_norm": 0.5116386343446967, + "learning_rate": 7.531513108673196e-06, + "loss": 0.0172, + "step": 1768 + }, + { + "epoch": 0.3513755089879829, + "grad_norm": 0.4279479662462199, + "learning_rate": 7.528737932030978e-06, + "loss": 0.0134, + "step": 1769 + }, + { + "epoch": 0.35157413844473134, + "grad_norm": 0.3728795654509069, + "learning_rate": 7.5259617082409165e-06, + "loss": 0.016, + "step": 1770 + }, + { + "epoch": 0.35177276790147977, + "grad_norm": 0.3025444908515605, + "learning_rate": 7.52318443845264e-06, + "loss": 0.0122, + "step": 1771 + }, + { + "epoch": 0.35197139735822824, + "grad_norm": 0.5225986004045365, + "learning_rate": 7.520406123816215e-06, + "loss": 0.0126, + "step": 1772 + }, + { + "epoch": 0.35217002681497667, + "grad_norm": 0.22475056903545595, + "learning_rate": 7.517626765482139e-06, + "loss": 0.0075, + "step": 1773 + }, + { + "epoch": 0.3523686562717251, + "grad_norm": 0.6644702771927852, + "learning_rate": 7.5148463646013405e-06, + "loss": 0.0201, + "step": 1774 + }, + { + "epoch": 0.3525672857284735, + "grad_norm": 0.21757342206133162, + "learning_rate": 7.512064922325179e-06, + "loss": 0.007, + "step": 1775 + }, + { + "epoch": 0.352765915185222, + "grad_norm": 0.7067021760903283, + "learning_rate": 7.50928243980545e-06, + "loss": 0.014, + "step": 1776 + }, + { + "epoch": 0.3529645446419704, + "grad_norm": 0.6595482172027973, + "learning_rate": 7.506498918194376e-06, + "loss": 0.0117, + "step": 1777 + }, + { + "epoch": 0.35316317409871884, + "grad_norm": 0.6788932987088132, + "learning_rate": 7.5037143586446095e-06, + "loss": 0.0179, + "step": 1778 + }, + { + "epoch": 0.35336180355546726, + "grad_norm": 0.28064297644399, + "learning_rate": 7.500928762309234e-06, + "loss": 0.0086, + "step": 1779 + }, + { + "epoch": 0.35356043301221574, + "grad_norm": 0.5596372651009542, + "learning_rate": 7.498142130341764e-06, + "loss": 0.0166, + "step": 1780 + }, + { + "epoch": 0.35375906246896416, + "grad_norm": 0.7073177849186396, + "learning_rate": 7.495354463896137e-06, + "loss": 0.0095, + "step": 1781 + }, + { + "epoch": 0.3539576919257126, + "grad_norm": 0.5066305988153633, + "learning_rate": 7.492565764126728e-06, + "loss": 0.0104, + "step": 1782 + }, + { + "epoch": 0.354156321382461, + "grad_norm": 0.44054068142958164, + "learning_rate": 7.4897760321883295e-06, + "loss": 0.0085, + "step": 1783 + }, + { + "epoch": 0.3543549508392095, + "grad_norm": 0.4553697052742875, + "learning_rate": 7.486985269236171e-06, + "loss": 0.0113, + "step": 1784 + }, + { + "epoch": 0.3545535802959579, + "grad_norm": 0.5212306828856347, + "learning_rate": 7.4841934764259025e-06, + "loss": 0.0182, + "step": 1785 + }, + { + "epoch": 0.3547522097527063, + "grad_norm": 0.31537340873334424, + "learning_rate": 7.481400654913606e-06, + "loss": 0.0095, + "step": 1786 + }, + { + "epoch": 0.35495083920945475, + "grad_norm": 0.7414978066014976, + "learning_rate": 7.47860680585578e-06, + "loss": 0.0207, + "step": 1787 + }, + { + "epoch": 0.35514946866620317, + "grad_norm": 0.6019848486242421, + "learning_rate": 7.475811930409359e-06, + "loss": 0.0157, + "step": 1788 + }, + { + "epoch": 0.35534809812295165, + "grad_norm": 0.3030605279151099, + "learning_rate": 7.473016029731696e-06, + "loss": 0.0067, + "step": 1789 + }, + { + "epoch": 0.3555467275797001, + "grad_norm": 1.415851763954458, + "learning_rate": 7.470219104980572e-06, + "loss": 0.0157, + "step": 1790 + }, + { + "epoch": 0.3557453570364485, + "grad_norm": 0.3333734372525097, + "learning_rate": 7.467421157314191e-06, + "loss": 0.0102, + "step": 1791 + }, + { + "epoch": 0.3559439864931969, + "grad_norm": 0.3295733399959379, + "learning_rate": 7.464622187891179e-06, + "loss": 0.0093, + "step": 1792 + }, + { + "epoch": 0.3561426159499454, + "grad_norm": 0.5348317682346659, + "learning_rate": 7.4618221978705875e-06, + "loss": 0.0225, + "step": 1793 + }, + { + "epoch": 0.3563412454066938, + "grad_norm": 0.4680465985095257, + "learning_rate": 7.45902118841189e-06, + "loss": 0.0098, + "step": 1794 + }, + { + "epoch": 0.35653987486344224, + "grad_norm": 0.5680929119198581, + "learning_rate": 7.45621916067498e-06, + "loss": 0.0154, + "step": 1795 + }, + { + "epoch": 0.35673850432019066, + "grad_norm": 0.7557185174597132, + "learning_rate": 7.453416115820173e-06, + "loss": 0.0157, + "step": 1796 + }, + { + "epoch": 0.35693713377693914, + "grad_norm": 0.6641326101462152, + "learning_rate": 7.4506120550082125e-06, + "loss": 0.0158, + "step": 1797 + }, + { + "epoch": 0.35713576323368756, + "grad_norm": 0.7831104663272818, + "learning_rate": 7.447806979400255e-06, + "loss": 0.0126, + "step": 1798 + }, + { + "epoch": 0.357334392690436, + "grad_norm": 0.4053205882012716, + "learning_rate": 7.445000890157876e-06, + "loss": 0.0114, + "step": 1799 + }, + { + "epoch": 0.3575330221471844, + "grad_norm": 0.5298271130207239, + "learning_rate": 7.442193788443078e-06, + "loss": 0.0094, + "step": 1800 + }, + { + "epoch": 0.3577316516039329, + "grad_norm": 0.9078634591414197, + "learning_rate": 7.439385675418278e-06, + "loss": 0.0128, + "step": 1801 + }, + { + "epoch": 0.3579302810606813, + "grad_norm": 0.4697586290419777, + "learning_rate": 7.436576552246312e-06, + "loss": 0.0103, + "step": 1802 + }, + { + "epoch": 0.35812891051742973, + "grad_norm": 0.9172712379027379, + "learning_rate": 7.433766420090436e-06, + "loss": 0.0198, + "step": 1803 + }, + { + "epoch": 0.35832753997417816, + "grad_norm": 0.45567523468910553, + "learning_rate": 7.430955280114322e-06, + "loss": 0.0119, + "step": 1804 + }, + { + "epoch": 0.35852616943092663, + "grad_norm": 0.7802133005429396, + "learning_rate": 7.428143133482063e-06, + "loss": 0.009, + "step": 1805 + }, + { + "epoch": 0.35872479888767506, + "grad_norm": 0.6423209564757857, + "learning_rate": 7.425329981358163e-06, + "loss": 0.0214, + "step": 1806 + }, + { + "epoch": 0.3589234283444235, + "grad_norm": 0.49872407430644544, + "learning_rate": 7.422515824907546e-06, + "loss": 0.0155, + "step": 1807 + }, + { + "epoch": 0.3591220578011719, + "grad_norm": 0.40918220122528187, + "learning_rate": 7.419700665295551e-06, + "loss": 0.0113, + "step": 1808 + }, + { + "epoch": 0.3593206872579203, + "grad_norm": 0.4717139361458346, + "learning_rate": 7.416884503687936e-06, + "loss": 0.0089, + "step": 1809 + }, + { + "epoch": 0.3595193167146688, + "grad_norm": 0.7744599282851102, + "learning_rate": 7.414067341250868e-06, + "loss": 0.0175, + "step": 1810 + }, + { + "epoch": 0.3597179461714172, + "grad_norm": 0.7426285798479693, + "learning_rate": 7.41124917915093e-06, + "loss": 0.018, + "step": 1811 + }, + { + "epoch": 0.35991657562816565, + "grad_norm": 0.3054174866414779, + "learning_rate": 7.408430018555122e-06, + "loss": 0.0108, + "step": 1812 + }, + { + "epoch": 0.36011520508491407, + "grad_norm": 0.44806274127264745, + "learning_rate": 7.405609860630855e-06, + "loss": 0.0159, + "step": 1813 + }, + { + "epoch": 0.36031383454166255, + "grad_norm": 0.45456734941331584, + "learning_rate": 7.402788706545953e-06, + "loss": 0.0114, + "step": 1814 + }, + { + "epoch": 0.36051246399841097, + "grad_norm": 0.41653981798920153, + "learning_rate": 7.3999665574686566e-06, + "loss": 0.0167, + "step": 1815 + }, + { + "epoch": 0.3607110934551594, + "grad_norm": 0.42591262940535, + "learning_rate": 7.39714341456761e-06, + "loss": 0.012, + "step": 1816 + }, + { + "epoch": 0.3609097229119078, + "grad_norm": 0.5559851848169303, + "learning_rate": 7.394319279011877e-06, + "loss": 0.016, + "step": 1817 + }, + { + "epoch": 0.3611083523686563, + "grad_norm": 0.3959904663844897, + "learning_rate": 7.391494151970928e-06, + "loss": 0.0123, + "step": 1818 + }, + { + "epoch": 0.3613069818254047, + "grad_norm": 0.6179966278625685, + "learning_rate": 7.388668034614645e-06, + "loss": 0.0135, + "step": 1819 + }, + { + "epoch": 0.36150561128215314, + "grad_norm": 0.37571678679572235, + "learning_rate": 7.385840928113321e-06, + "loss": 0.01, + "step": 1820 + }, + { + "epoch": 0.36170424073890156, + "grad_norm": 0.4874416376117431, + "learning_rate": 7.383012833637657e-06, + "loss": 0.0111, + "step": 1821 + }, + { + "epoch": 0.36190287019565004, + "grad_norm": 0.4693254096635751, + "learning_rate": 7.380183752358768e-06, + "loss": 0.0154, + "step": 1822 + }, + { + "epoch": 0.36210149965239846, + "grad_norm": 0.6119621192596436, + "learning_rate": 7.37735368544817e-06, + "loss": 0.0187, + "step": 1823 + }, + { + "epoch": 0.3623001291091469, + "grad_norm": 0.35374113186809264, + "learning_rate": 7.37452263407779e-06, + "loss": 0.0102, + "step": 1824 + }, + { + "epoch": 0.3624987585658953, + "grad_norm": 0.7498478793443032, + "learning_rate": 7.371690599419965e-06, + "loss": 0.0147, + "step": 1825 + }, + { + "epoch": 0.3626973880226438, + "grad_norm": 0.9295665798522207, + "learning_rate": 7.3688575826474385e-06, + "loss": 0.026, + "step": 1826 + }, + { + "epoch": 0.3628960174793922, + "grad_norm": 0.523390784697274, + "learning_rate": 7.3660235849333594e-06, + "loss": 0.0169, + "step": 1827 + }, + { + "epoch": 0.36309464693614063, + "grad_norm": 0.4162831159506049, + "learning_rate": 7.363188607451283e-06, + "loss": 0.0132, + "step": 1828 + }, + { + "epoch": 0.36329327639288905, + "grad_norm": 0.6182223304228157, + "learning_rate": 7.360352651375171e-06, + "loss": 0.0138, + "step": 1829 + }, + { + "epoch": 0.3634919058496375, + "grad_norm": 0.43001898999202587, + "learning_rate": 7.35751571787939e-06, + "loss": 0.0137, + "step": 1830 + }, + { + "epoch": 0.36369053530638595, + "grad_norm": 0.5552465012209282, + "learning_rate": 7.35467780813871e-06, + "loss": 0.0103, + "step": 1831 + }, + { + "epoch": 0.3638891647631344, + "grad_norm": 0.5016437757951848, + "learning_rate": 7.3518389233283095e-06, + "loss": 0.0094, + "step": 1832 + }, + { + "epoch": 0.3640877942198828, + "grad_norm": 0.18076930934715546, + "learning_rate": 7.348999064623763e-06, + "loss": 0.0059, + "step": 1833 + }, + { + "epoch": 0.3642864236766312, + "grad_norm": 0.817939106262419, + "learning_rate": 7.34615823320106e-06, + "loss": 0.021, + "step": 1834 + }, + { + "epoch": 0.3644850531333797, + "grad_norm": 0.517907031178373, + "learning_rate": 7.34331643023658e-06, + "loss": 0.0129, + "step": 1835 + }, + { + "epoch": 0.3646836825901281, + "grad_norm": 0.4513320397295181, + "learning_rate": 7.340473656907113e-06, + "loss": 0.0127, + "step": 1836 + }, + { + "epoch": 0.36488231204687654, + "grad_norm": 0.7014290051670106, + "learning_rate": 7.33762991438985e-06, + "loss": 0.0126, + "step": 1837 + }, + { + "epoch": 0.36508094150362497, + "grad_norm": 0.34486163648433743, + "learning_rate": 7.334785203862378e-06, + "loss": 0.0131, + "step": 1838 + }, + { + "epoch": 0.36527957096037345, + "grad_norm": 0.42658500377007785, + "learning_rate": 7.331939526502692e-06, + "loss": 0.0148, + "step": 1839 + }, + { + "epoch": 0.36547820041712187, + "grad_norm": 0.4012284928050507, + "learning_rate": 7.329092883489184e-06, + "loss": 0.0103, + "step": 1840 + }, + { + "epoch": 0.3656768298738703, + "grad_norm": 0.6923841780226727, + "learning_rate": 7.326245276000645e-06, + "loss": 0.0173, + "step": 1841 + }, + { + "epoch": 0.3658754593306187, + "grad_norm": 0.6204656956557247, + "learning_rate": 7.323396705216267e-06, + "loss": 0.0149, + "step": 1842 + }, + { + "epoch": 0.3660740887873672, + "grad_norm": 0.43915467134030417, + "learning_rate": 7.320547172315639e-06, + "loss": 0.0115, + "step": 1843 + }, + { + "epoch": 0.3662727182441156, + "grad_norm": 0.9453561263857528, + "learning_rate": 7.317696678478752e-06, + "loss": 0.0147, + "step": 1844 + }, + { + "epoch": 0.36647134770086404, + "grad_norm": 0.4116346164383005, + "learning_rate": 7.314845224885992e-06, + "loss": 0.0132, + "step": 1845 + }, + { + "epoch": 0.36666997715761246, + "grad_norm": 0.6010586200072675, + "learning_rate": 7.31199281271814e-06, + "loss": 0.016, + "step": 1846 + }, + { + "epoch": 0.36686860661436094, + "grad_norm": 0.379756980977871, + "learning_rate": 7.309139443156382e-06, + "loss": 0.0084, + "step": 1847 + }, + { + "epoch": 0.36706723607110936, + "grad_norm": 0.32996713284661944, + "learning_rate": 7.306285117382292e-06, + "loss": 0.0078, + "step": 1848 + }, + { + "epoch": 0.3672658655278578, + "grad_norm": 0.3528111052658405, + "learning_rate": 7.3034298365778455e-06, + "loss": 0.0105, + "step": 1849 + }, + { + "epoch": 0.3674644949846062, + "grad_norm": 0.32743299157906436, + "learning_rate": 7.300573601925409e-06, + "loss": 0.0129, + "step": 1850 + }, + { + "epoch": 0.3676631244413546, + "grad_norm": 0.40089814984231775, + "learning_rate": 7.297716414607747e-06, + "loss": 0.0074, + "step": 1851 + }, + { + "epoch": 0.3678617538981031, + "grad_norm": 0.8879963171017308, + "learning_rate": 7.294858275808021e-06, + "loss": 0.0158, + "step": 1852 + }, + { + "epoch": 0.36806038335485153, + "grad_norm": 1.1591210573488786, + "learning_rate": 7.29199918670978e-06, + "loss": 0.0179, + "step": 1853 + }, + { + "epoch": 0.36825901281159995, + "grad_norm": 0.5766334292979398, + "learning_rate": 7.289139148496971e-06, + "loss": 0.0134, + "step": 1854 + }, + { + "epoch": 0.3684576422683484, + "grad_norm": 0.5074699980363011, + "learning_rate": 7.286278162353934e-06, + "loss": 0.0087, + "step": 1855 + }, + { + "epoch": 0.36865627172509685, + "grad_norm": 0.5200342371854682, + "learning_rate": 7.283416229465399e-06, + "loss": 0.0154, + "step": 1856 + }, + { + "epoch": 0.3688549011818453, + "grad_norm": 0.4688805862662405, + "learning_rate": 7.280553351016489e-06, + "loss": 0.0083, + "step": 1857 + }, + { + "epoch": 0.3690535306385937, + "grad_norm": 0.5049930399031888, + "learning_rate": 7.277689528192722e-06, + "loss": 0.0113, + "step": 1858 + }, + { + "epoch": 0.3692521600953421, + "grad_norm": 0.8576327838457353, + "learning_rate": 7.2748247621800005e-06, + "loss": 0.0151, + "step": 1859 + }, + { + "epoch": 0.3694507895520906, + "grad_norm": 0.5565380252033648, + "learning_rate": 7.271959054164623e-06, + "loss": 0.0117, + "step": 1860 + }, + { + "epoch": 0.369649419008839, + "grad_norm": 1.193299984883163, + "learning_rate": 7.269092405333278e-06, + "loss": 0.0153, + "step": 1861 + }, + { + "epoch": 0.36984804846558744, + "grad_norm": 0.8109235962868375, + "learning_rate": 7.26622481687304e-06, + "loss": 0.0166, + "step": 1862 + }, + { + "epoch": 0.37004667792233586, + "grad_norm": 0.7632333804661434, + "learning_rate": 7.263356289971374e-06, + "loss": 0.0169, + "step": 1863 + }, + { + "epoch": 0.37024530737908434, + "grad_norm": 0.8232197885172541, + "learning_rate": 7.260486825816134e-06, + "loss": 0.0165, + "step": 1864 + }, + { + "epoch": 0.37044393683583277, + "grad_norm": 0.4215445953865816, + "learning_rate": 7.257616425595564e-06, + "loss": 0.0146, + "step": 1865 + }, + { + "epoch": 0.3706425662925812, + "grad_norm": 0.6077339267039444, + "learning_rate": 7.254745090498294e-06, + "loss": 0.0179, + "step": 1866 + }, + { + "epoch": 0.3708411957493296, + "grad_norm": 0.8156039287786392, + "learning_rate": 7.251872821713339e-06, + "loss": 0.015, + "step": 1867 + }, + { + "epoch": 0.3710398252060781, + "grad_norm": 0.4884625901216402, + "learning_rate": 7.248999620430104e-06, + "loss": 0.0076, + "step": 1868 + }, + { + "epoch": 0.3712384546628265, + "grad_norm": 0.47130775104617345, + "learning_rate": 7.246125487838378e-06, + "loss": 0.012, + "step": 1869 + }, + { + "epoch": 0.37143708411957493, + "grad_norm": 0.3343798674016324, + "learning_rate": 7.243250425128337e-06, + "loss": 0.0082, + "step": 1870 + }, + { + "epoch": 0.37163571357632336, + "grad_norm": 0.2962776732176791, + "learning_rate": 7.240374433490542e-06, + "loss": 0.0092, + "step": 1871 + }, + { + "epoch": 0.3718343430330718, + "grad_norm": 0.6443601303540771, + "learning_rate": 7.237497514115937e-06, + "loss": 0.0149, + "step": 1872 + }, + { + "epoch": 0.37203297248982026, + "grad_norm": 0.6090312171198041, + "learning_rate": 7.234619668195853e-06, + "loss": 0.022, + "step": 1873 + }, + { + "epoch": 0.3722316019465687, + "grad_norm": 1.008498309205369, + "learning_rate": 7.231740896922e-06, + "loss": 0.0181, + "step": 1874 + }, + { + "epoch": 0.3724302314033171, + "grad_norm": 0.2263113111319505, + "learning_rate": 7.228861201486479e-06, + "loss": 0.0107, + "step": 1875 + }, + { + "epoch": 0.3726288608600655, + "grad_norm": 1.0824047330601112, + "learning_rate": 7.225980583081764e-06, + "loss": 0.015, + "step": 1876 + }, + { + "epoch": 0.372827490316814, + "grad_norm": 0.4519637815818529, + "learning_rate": 7.2230990429007205e-06, + "loss": 0.0167, + "step": 1877 + }, + { + "epoch": 0.3730261197735624, + "grad_norm": 0.3996641096482069, + "learning_rate": 7.2202165821365884e-06, + "loss": 0.0099, + "step": 1878 + }, + { + "epoch": 0.37322474923031085, + "grad_norm": 0.5247153139606248, + "learning_rate": 7.217333201982994e-06, + "loss": 0.0112, + "step": 1879 + }, + { + "epoch": 0.37342337868705927, + "grad_norm": 0.6322765470012125, + "learning_rate": 7.2144489036339414e-06, + "loss": 0.01, + "step": 1880 + }, + { + "epoch": 0.37362200814380775, + "grad_norm": 0.7784772761848201, + "learning_rate": 7.211563688283815e-06, + "loss": 0.0093, + "step": 1881 + }, + { + "epoch": 0.37382063760055617, + "grad_norm": 0.489780522096192, + "learning_rate": 7.20867755712738e-06, + "loss": 0.0107, + "step": 1882 + }, + { + "epoch": 0.3740192670573046, + "grad_norm": 0.4222831983087626, + "learning_rate": 7.20579051135978e-06, + "loss": 0.0133, + "step": 1883 + }, + { + "epoch": 0.374217896514053, + "grad_norm": 0.39180257048360806, + "learning_rate": 7.2029025521765395e-06, + "loss": 0.0084, + "step": 1884 + }, + { + "epoch": 0.3744165259708015, + "grad_norm": 0.328589453430018, + "learning_rate": 7.200013680773556e-06, + "loss": 0.0086, + "step": 1885 + }, + { + "epoch": 0.3746151554275499, + "grad_norm": 0.8840476580346277, + "learning_rate": 7.197123898347113e-06, + "loss": 0.0197, + "step": 1886 + }, + { + "epoch": 0.37481378488429834, + "grad_norm": 0.7445076848470832, + "learning_rate": 7.194233206093862e-06, + "loss": 0.0155, + "step": 1887 + }, + { + "epoch": 0.37501241434104676, + "grad_norm": 0.823712336571771, + "learning_rate": 7.1913416052108385e-06, + "loss": 0.0138, + "step": 1888 + }, + { + "epoch": 0.37521104379779524, + "grad_norm": 0.5026615807876057, + "learning_rate": 7.18844909689545e-06, + "loss": 0.0127, + "step": 1889 + }, + { + "epoch": 0.37540967325454366, + "grad_norm": 0.5235102842153762, + "learning_rate": 7.185555682345483e-06, + "loss": 0.0167, + "step": 1890 + }, + { + "epoch": 0.3756083027112921, + "grad_norm": 0.38122765851314344, + "learning_rate": 7.182661362759096e-06, + "loss": 0.016, + "step": 1891 + }, + { + "epoch": 0.3758069321680405, + "grad_norm": 0.659952705620418, + "learning_rate": 7.179766139334825e-06, + "loss": 0.0217, + "step": 1892 + }, + { + "epoch": 0.37600556162478893, + "grad_norm": 0.3460455608781431, + "learning_rate": 7.1768700132715785e-06, + "loss": 0.0122, + "step": 1893 + }, + { + "epoch": 0.3762041910815374, + "grad_norm": 0.5470309221958665, + "learning_rate": 7.173972985768639e-06, + "loss": 0.0159, + "step": 1894 + }, + { + "epoch": 0.37640282053828583, + "grad_norm": 0.609575691356759, + "learning_rate": 7.171075058025664e-06, + "loss": 0.015, + "step": 1895 + }, + { + "epoch": 0.37660144999503425, + "grad_norm": 0.6256172821754186, + "learning_rate": 7.168176231242681e-06, + "loss": 0.0091, + "step": 1896 + }, + { + "epoch": 0.3768000794517827, + "grad_norm": 0.43669928645159256, + "learning_rate": 7.165276506620092e-06, + "loss": 0.0108, + "step": 1897 + }, + { + "epoch": 0.37699870890853115, + "grad_norm": 0.452629904550346, + "learning_rate": 7.16237588535867e-06, + "loss": 0.0131, + "step": 1898 + }, + { + "epoch": 0.3771973383652796, + "grad_norm": 0.5306623947422398, + "learning_rate": 7.159474368659559e-06, + "loss": 0.0101, + "step": 1899 + }, + { + "epoch": 0.377395967822028, + "grad_norm": 0.877007063554911, + "learning_rate": 7.156571957724275e-06, + "loss": 0.0221, + "step": 1900 + }, + { + "epoch": 0.3775945972787764, + "grad_norm": 0.5158241756842965, + "learning_rate": 7.153668653754702e-06, + "loss": 0.0129, + "step": 1901 + }, + { + "epoch": 0.3777932267355249, + "grad_norm": 0.3126658544481895, + "learning_rate": 7.150764457953096e-06, + "loss": 0.0131, + "step": 1902 + }, + { + "epoch": 0.3779918561922733, + "grad_norm": 0.8119239874875152, + "learning_rate": 7.147859371522083e-06, + "loss": 0.0149, + "step": 1903 + }, + { + "epoch": 0.37819048564902175, + "grad_norm": 0.37864224886279996, + "learning_rate": 7.1449533956646555e-06, + "loss": 0.0166, + "step": 1904 + }, + { + "epoch": 0.37838911510577017, + "grad_norm": 0.31041396376686636, + "learning_rate": 7.142046531584176e-06, + "loss": 0.0077, + "step": 1905 + }, + { + "epoch": 0.37858774456251865, + "grad_norm": 0.44678399616930514, + "learning_rate": 7.139138780484371e-06, + "loss": 0.0147, + "step": 1906 + }, + { + "epoch": 0.37878637401926707, + "grad_norm": 0.6814421963342644, + "learning_rate": 7.136230143569338e-06, + "loss": 0.0092, + "step": 1907 + }, + { + "epoch": 0.3789850034760155, + "grad_norm": 0.5699394737446986, + "learning_rate": 7.133320622043544e-06, + "loss": 0.0095, + "step": 1908 + }, + { + "epoch": 0.3791836329327639, + "grad_norm": 0.5118628899874634, + "learning_rate": 7.1304102171118165e-06, + "loss": 0.018, + "step": 1909 + }, + { + "epoch": 0.37938226238951234, + "grad_norm": 0.8450993817197867, + "learning_rate": 7.12749892997935e-06, + "loss": 0.0152, + "step": 1910 + }, + { + "epoch": 0.3795808918462608, + "grad_norm": 0.5936810791362285, + "learning_rate": 7.124586761851709e-06, + "loss": 0.0137, + "step": 1911 + }, + { + "epoch": 0.37977952130300924, + "grad_norm": 0.7147959398627135, + "learning_rate": 7.121673713934816e-06, + "loss": 0.0075, + "step": 1912 + }, + { + "epoch": 0.37997815075975766, + "grad_norm": 0.3268090768205218, + "learning_rate": 7.1187597874349635e-06, + "loss": 0.0086, + "step": 1913 + }, + { + "epoch": 0.3801767802165061, + "grad_norm": 0.8429071661543824, + "learning_rate": 7.115844983558804e-06, + "loss": 0.0164, + "step": 1914 + }, + { + "epoch": 0.38037540967325456, + "grad_norm": 0.5508174982198967, + "learning_rate": 7.112929303513356e-06, + "loss": 0.0185, + "step": 1915 + }, + { + "epoch": 0.380574039130003, + "grad_norm": 0.8107547717459483, + "learning_rate": 7.110012748506e-06, + "loss": 0.0191, + "step": 1916 + }, + { + "epoch": 0.3807726685867514, + "grad_norm": 0.4591386080076129, + "learning_rate": 7.107095319744479e-06, + "loss": 0.0108, + "step": 1917 + }, + { + "epoch": 0.38097129804349983, + "grad_norm": 0.7765034782212996, + "learning_rate": 7.1041770184368945e-06, + "loss": 0.0119, + "step": 1918 + }, + { + "epoch": 0.3811699275002483, + "grad_norm": 0.5274182540525633, + "learning_rate": 7.101257845791714e-06, + "loss": 0.0141, + "step": 1919 + }, + { + "epoch": 0.38136855695699673, + "grad_norm": 0.3603680268183808, + "learning_rate": 7.098337803017763e-06, + "loss": 0.0087, + "step": 1920 + }, + { + "epoch": 0.38156718641374515, + "grad_norm": 0.47632384862279487, + "learning_rate": 7.095416891324231e-06, + "loss": 0.0083, + "step": 1921 + }, + { + "epoch": 0.3817658158704936, + "grad_norm": 0.6379591206386355, + "learning_rate": 7.0924951119206605e-06, + "loss": 0.0113, + "step": 1922 + }, + { + "epoch": 0.38196444532724205, + "grad_norm": 0.5608777505358337, + "learning_rate": 7.0895724660169615e-06, + "loss": 0.0159, + "step": 1923 + }, + { + "epoch": 0.3821630747839905, + "grad_norm": 0.45075216436612026, + "learning_rate": 7.086648954823396e-06, + "loss": 0.0103, + "step": 1924 + }, + { + "epoch": 0.3823617042407389, + "grad_norm": 0.8387316921379768, + "learning_rate": 7.083724579550588e-06, + "loss": 0.0248, + "step": 1925 + }, + { + "epoch": 0.3825603336974873, + "grad_norm": 0.504954461339415, + "learning_rate": 7.080799341409518e-06, + "loss": 0.0162, + "step": 1926 + }, + { + "epoch": 0.3827589631542358, + "grad_norm": 0.633175612122431, + "learning_rate": 7.077873241611525e-06, + "loss": 0.0134, + "step": 1927 + }, + { + "epoch": 0.3829575926109842, + "grad_norm": 0.5726947868855516, + "learning_rate": 7.074946281368304e-06, + "loss": 0.0154, + "step": 1928 + }, + { + "epoch": 0.38315622206773264, + "grad_norm": 0.27366180430338843, + "learning_rate": 7.072018461891906e-06, + "loss": 0.0122, + "step": 1929 + }, + { + "epoch": 0.38335485152448107, + "grad_norm": 0.3847532093947302, + "learning_rate": 7.069089784394737e-06, + "loss": 0.01, + "step": 1930 + }, + { + "epoch": 0.3835534809812295, + "grad_norm": 0.40845059746079004, + "learning_rate": 7.066160250089561e-06, + "loss": 0.0101, + "step": 1931 + }, + { + "epoch": 0.38375211043797797, + "grad_norm": 0.4485746408020155, + "learning_rate": 7.063229860189493e-06, + "loss": 0.0105, + "step": 1932 + }, + { + "epoch": 0.3839507398947264, + "grad_norm": 0.9400839412508927, + "learning_rate": 7.060298615908006e-06, + "loss": 0.0153, + "step": 1933 + }, + { + "epoch": 0.3841493693514748, + "grad_norm": 0.3182534498151112, + "learning_rate": 7.057366518458928e-06, + "loss": 0.0111, + "step": 1934 + }, + { + "epoch": 0.38434799880822323, + "grad_norm": 0.4800246818941992, + "learning_rate": 7.0544335690564334e-06, + "loss": 0.0209, + "step": 1935 + }, + { + "epoch": 0.3845466282649717, + "grad_norm": 0.46973294531377413, + "learning_rate": 7.051499768915056e-06, + "loss": 0.0114, + "step": 1936 + }, + { + "epoch": 0.38474525772172014, + "grad_norm": 0.7859855280724575, + "learning_rate": 7.048565119249677e-06, + "loss": 0.0141, + "step": 1937 + }, + { + "epoch": 0.38494388717846856, + "grad_norm": 0.6876110737606786, + "learning_rate": 7.0456296212755344e-06, + "loss": 0.0128, + "step": 1938 + }, + { + "epoch": 0.385142516635217, + "grad_norm": 0.46899691256126286, + "learning_rate": 7.0426932762082135e-06, + "loss": 0.0093, + "step": 1939 + }, + { + "epoch": 0.38534114609196546, + "grad_norm": 0.3513359380168546, + "learning_rate": 7.039756085263654e-06, + "loss": 0.0072, + "step": 1940 + }, + { + "epoch": 0.3855397755487139, + "grad_norm": 0.6075133338918507, + "learning_rate": 7.036818049658143e-06, + "loss": 0.0143, + "step": 1941 + }, + { + "epoch": 0.3857384050054623, + "grad_norm": 0.7662029488077239, + "learning_rate": 7.0338791706083155e-06, + "loss": 0.0132, + "step": 1942 + }, + { + "epoch": 0.3859370344622107, + "grad_norm": 0.628109956535636, + "learning_rate": 7.03093944933116e-06, + "loss": 0.016, + "step": 1943 + }, + { + "epoch": 0.3861356639189592, + "grad_norm": 0.19803867516817825, + "learning_rate": 7.027998887044013e-06, + "loss": 0.0052, + "step": 1944 + }, + { + "epoch": 0.3863342933757076, + "grad_norm": 0.662751247765471, + "learning_rate": 7.025057484964558e-06, + "loss": 0.0104, + "step": 1945 + }, + { + "epoch": 0.38653292283245605, + "grad_norm": 0.6819086619122386, + "learning_rate": 7.022115244310826e-06, + "loss": 0.0113, + "step": 1946 + }, + { + "epoch": 0.38673155228920447, + "grad_norm": 1.5798621153907724, + "learning_rate": 7.019172166301197e-06, + "loss": 0.0223, + "step": 1947 + }, + { + "epoch": 0.38693018174595295, + "grad_norm": 0.5987945163370364, + "learning_rate": 7.016228252154396e-06, + "loss": 0.0121, + "step": 1948 + }, + { + "epoch": 0.3871288112027014, + "grad_norm": 0.9981941212502284, + "learning_rate": 7.013283503089494e-06, + "loss": 0.0115, + "step": 1949 + }, + { + "epoch": 0.3873274406594498, + "grad_norm": 1.0406382038091888, + "learning_rate": 7.01033792032591e-06, + "loss": 0.0248, + "step": 1950 + }, + { + "epoch": 0.3875260701161982, + "grad_norm": 0.41775245079601303, + "learning_rate": 7.007391505083405e-06, + "loss": 0.0104, + "step": 1951 + }, + { + "epoch": 0.38772469957294664, + "grad_norm": 0.5898166538524512, + "learning_rate": 7.004444258582091e-06, + "loss": 0.0129, + "step": 1952 + }, + { + "epoch": 0.3879233290296951, + "grad_norm": 0.7611476458360803, + "learning_rate": 7.001496182042416e-06, + "loss": 0.0157, + "step": 1953 + }, + { + "epoch": 0.38812195848644354, + "grad_norm": 0.33055910901797114, + "learning_rate": 6.998547276685175e-06, + "loss": 0.0107, + "step": 1954 + }, + { + "epoch": 0.38832058794319196, + "grad_norm": 0.48354068516638127, + "learning_rate": 6.995597543731509e-06, + "loss": 0.0085, + "step": 1955 + }, + { + "epoch": 0.3885192173999404, + "grad_norm": 1.3859038603039011, + "learning_rate": 6.992646984402898e-06, + "loss": 0.022, + "step": 1956 + }, + { + "epoch": 0.38871784685668886, + "grad_norm": 0.9177731485118693, + "learning_rate": 6.989695599921166e-06, + "loss": 0.0192, + "step": 1957 + }, + { + "epoch": 0.3889164763134373, + "grad_norm": 0.6987553756434028, + "learning_rate": 6.986743391508479e-06, + "loss": 0.0182, + "step": 1958 + }, + { + "epoch": 0.3891151057701857, + "grad_norm": 0.40997833208509904, + "learning_rate": 6.983790360387344e-06, + "loss": 0.0088, + "step": 1959 + }, + { + "epoch": 0.38931373522693413, + "grad_norm": 0.34916159941815383, + "learning_rate": 6.980836507780606e-06, + "loss": 0.0128, + "step": 1960 + }, + { + "epoch": 0.3895123646836826, + "grad_norm": 0.7820977108647957, + "learning_rate": 6.977881834911455e-06, + "loss": 0.0122, + "step": 1961 + }, + { + "epoch": 0.38971099414043103, + "grad_norm": 0.9305707403285578, + "learning_rate": 6.974926343003416e-06, + "loss": 0.0128, + "step": 1962 + }, + { + "epoch": 0.38990962359717946, + "grad_norm": 0.682174849117662, + "learning_rate": 6.971970033280356e-06, + "loss": 0.0108, + "step": 1963 + }, + { + "epoch": 0.3901082530539279, + "grad_norm": 0.5350787998659173, + "learning_rate": 6.96901290696648e-06, + "loss": 0.0154, + "step": 1964 + }, + { + "epoch": 0.39030688251067636, + "grad_norm": 0.3819759718576897, + "learning_rate": 6.966054965286333e-06, + "loss": 0.0098, + "step": 1965 + }, + { + "epoch": 0.3905055119674248, + "grad_norm": 0.5388430417097986, + "learning_rate": 6.963096209464793e-06, + "loss": 0.0144, + "step": 1966 + }, + { + "epoch": 0.3907041414241732, + "grad_norm": 0.5604706168748357, + "learning_rate": 6.9601366407270766e-06, + "loss": 0.0157, + "step": 1967 + }, + { + "epoch": 0.3909027708809216, + "grad_norm": 0.5074402312211344, + "learning_rate": 6.957176260298742e-06, + "loss": 0.008, + "step": 1968 + }, + { + "epoch": 0.3911014003376701, + "grad_norm": 0.37696620230600386, + "learning_rate": 6.954215069405677e-06, + "loss": 0.0164, + "step": 1969 + }, + { + "epoch": 0.3913000297944185, + "grad_norm": 0.7845133541601853, + "learning_rate": 6.951253069274109e-06, + "loss": 0.013, + "step": 1970 + }, + { + "epoch": 0.39149865925116695, + "grad_norm": 0.6377358194064576, + "learning_rate": 6.948290261130598e-06, + "loss": 0.0108, + "step": 1971 + }, + { + "epoch": 0.39169728870791537, + "grad_norm": 0.37305516874735534, + "learning_rate": 6.945326646202042e-06, + "loss": 0.0118, + "step": 1972 + }, + { + "epoch": 0.3918959181646638, + "grad_norm": 0.6101758865406128, + "learning_rate": 6.942362225715671e-06, + "loss": 0.0072, + "step": 1973 + }, + { + "epoch": 0.39209454762141227, + "grad_norm": 0.6669153608848816, + "learning_rate": 6.939397000899046e-06, + "loss": 0.0164, + "step": 1974 + }, + { + "epoch": 0.3922931770781607, + "grad_norm": 0.4434397317662605, + "learning_rate": 6.936430972980066e-06, + "loss": 0.0142, + "step": 1975 + }, + { + "epoch": 0.3924918065349091, + "grad_norm": 1.1486290060634483, + "learning_rate": 6.933464143186958e-06, + "loss": 0.0246, + "step": 1976 + }, + { + "epoch": 0.39269043599165754, + "grad_norm": 0.4402208692427919, + "learning_rate": 6.930496512748286e-06, + "loss": 0.0142, + "step": 1977 + }, + { + "epoch": 0.392889065448406, + "grad_norm": 0.8605918213090531, + "learning_rate": 6.927528082892941e-06, + "loss": 0.0162, + "step": 1978 + }, + { + "epoch": 0.39308769490515444, + "grad_norm": 0.602808108046716, + "learning_rate": 6.924558854850146e-06, + "loss": 0.0202, + "step": 1979 + }, + { + "epoch": 0.39328632436190286, + "grad_norm": 1.044860369928853, + "learning_rate": 6.921588829849458e-06, + "loss": 0.0239, + "step": 1980 + }, + { + "epoch": 0.3934849538186513, + "grad_norm": 1.0031230137699616, + "learning_rate": 6.918618009120759e-06, + "loss": 0.021, + "step": 1981 + }, + { + "epoch": 0.39368358327539976, + "grad_norm": 0.4596310481079393, + "learning_rate": 6.915646393894263e-06, + "loss": 0.0126, + "step": 1982 + }, + { + "epoch": 0.3938822127321482, + "grad_norm": 0.48189428114732397, + "learning_rate": 6.912673985400515e-06, + "loss": 0.0119, + "step": 1983 + }, + { + "epoch": 0.3940808421888966, + "grad_norm": 0.4738322562132466, + "learning_rate": 6.909700784870384e-06, + "loss": 0.021, + "step": 1984 + }, + { + "epoch": 0.39427947164564503, + "grad_norm": 0.41216128439817645, + "learning_rate": 6.906726793535072e-06, + "loss": 0.0141, + "step": 1985 + }, + { + "epoch": 0.3944781011023935, + "grad_norm": 0.33966794763975494, + "learning_rate": 6.903752012626104e-06, + "loss": 0.012, + "step": 1986 + }, + { + "epoch": 0.39467673055914193, + "grad_norm": 0.2801511238879822, + "learning_rate": 6.9007764433753324e-06, + "loss": 0.0155, + "step": 1987 + }, + { + "epoch": 0.39487536001589035, + "grad_norm": 0.16933671446526258, + "learning_rate": 6.897800087014939e-06, + "loss": 0.0064, + "step": 1988 + }, + { + "epoch": 0.3950739894726388, + "grad_norm": 0.8536198892078616, + "learning_rate": 6.894822944777433e-06, + "loss": 0.0156, + "step": 1989 + }, + { + "epoch": 0.39527261892938725, + "grad_norm": 0.2747101542433025, + "learning_rate": 6.891845017895641e-06, + "loss": 0.0069, + "step": 1990 + }, + { + "epoch": 0.3954712483861357, + "grad_norm": 0.5826546759975959, + "learning_rate": 6.888866307602722e-06, + "loss": 0.0206, + "step": 1991 + }, + { + "epoch": 0.3956698778428841, + "grad_norm": 0.5732473635409778, + "learning_rate": 6.885886815132156e-06, + "loss": 0.0144, + "step": 1992 + }, + { + "epoch": 0.3958685072996325, + "grad_norm": 0.25716654176152753, + "learning_rate": 6.882906541717749e-06, + "loss": 0.0091, + "step": 1993 + }, + { + "epoch": 0.39606713675638094, + "grad_norm": 0.3022114556941632, + "learning_rate": 6.879925488593629e-06, + "loss": 0.009, + "step": 1994 + }, + { + "epoch": 0.3962657662131294, + "grad_norm": 0.4902950717878552, + "learning_rate": 6.876943656994246e-06, + "loss": 0.0113, + "step": 1995 + }, + { + "epoch": 0.39646439566987784, + "grad_norm": 0.46038753699336915, + "learning_rate": 6.873961048154374e-06, + "loss": 0.0146, + "step": 1996 + }, + { + "epoch": 0.39666302512662627, + "grad_norm": 0.6323728442617919, + "learning_rate": 6.8709776633091085e-06, + "loss": 0.011, + "step": 1997 + }, + { + "epoch": 0.3968616545833747, + "grad_norm": 0.4427297026630537, + "learning_rate": 6.867993503693868e-06, + "loss": 0.0168, + "step": 1998 + }, + { + "epoch": 0.39706028404012317, + "grad_norm": 0.4675745693620594, + "learning_rate": 6.865008570544387e-06, + "loss": 0.0155, + "step": 1999 + }, + { + "epoch": 0.3972589134968716, + "grad_norm": 1.0376746039699174, + "learning_rate": 6.862022865096724e-06, + "loss": 0.0198, + "step": 2000 + }, + { + "epoch": 0.39745754295362, + "grad_norm": 0.8009967163692941, + "learning_rate": 6.859036388587259e-06, + "loss": 0.0097, + "step": 2001 + }, + { + "epoch": 0.39765617241036844, + "grad_norm": 0.6394354987730926, + "learning_rate": 6.856049142252687e-06, + "loss": 0.011, + "step": 2002 + }, + { + "epoch": 0.3978548018671169, + "grad_norm": 1.2279342276594911, + "learning_rate": 6.853061127330023e-06, + "loss": 0.0207, + "step": 2003 + }, + { + "epoch": 0.39805343132386534, + "grad_norm": 0.5253473078917413, + "learning_rate": 6.850072345056602e-06, + "loss": 0.0119, + "step": 2004 + }, + { + "epoch": 0.39825206078061376, + "grad_norm": 0.4416957674480141, + "learning_rate": 6.847082796670075e-06, + "loss": 0.0116, + "step": 2005 + }, + { + "epoch": 0.3984506902373622, + "grad_norm": 0.36052345349554943, + "learning_rate": 6.844092483408411e-06, + "loss": 0.0119, + "step": 2006 + }, + { + "epoch": 0.39864931969411066, + "grad_norm": 0.5852938989301314, + "learning_rate": 6.8411014065098976e-06, + "loss": 0.0155, + "step": 2007 + }, + { + "epoch": 0.3988479491508591, + "grad_norm": 0.7611535304666339, + "learning_rate": 6.838109567213136e-06, + "loss": 0.011, + "step": 2008 + }, + { + "epoch": 0.3990465786076075, + "grad_norm": 0.6832215844317394, + "learning_rate": 6.835116966757042e-06, + "loss": 0.0128, + "step": 2009 + }, + { + "epoch": 0.3992452080643559, + "grad_norm": 0.35682225109643834, + "learning_rate": 6.832123606380852e-06, + "loss": 0.0058, + "step": 2010 + }, + { + "epoch": 0.3994438375211044, + "grad_norm": 0.4423053994022838, + "learning_rate": 6.82912948732411e-06, + "loss": 0.0129, + "step": 2011 + }, + { + "epoch": 0.39964246697785283, + "grad_norm": 0.8397841328233963, + "learning_rate": 6.826134610826678e-06, + "loss": 0.0142, + "step": 2012 + }, + { + "epoch": 0.39984109643460125, + "grad_norm": 0.7620187002515791, + "learning_rate": 6.8231389781287334e-06, + "loss": 0.0191, + "step": 2013 + }, + { + "epoch": 0.4000397258913497, + "grad_norm": 0.41386810405085306, + "learning_rate": 6.820142590470764e-06, + "loss": 0.0129, + "step": 2014 + }, + { + "epoch": 0.4002383553480981, + "grad_norm": 0.7141109172857413, + "learning_rate": 6.81714544909357e-06, + "loss": 0.0183, + "step": 2015 + }, + { + "epoch": 0.4004369848048466, + "grad_norm": 0.3552446795117014, + "learning_rate": 6.8141475552382665e-06, + "loss": 0.0094, + "step": 2016 + }, + { + "epoch": 0.400635614261595, + "grad_norm": 0.44775694205320965, + "learning_rate": 6.8111489101462755e-06, + "loss": 0.0126, + "step": 2017 + }, + { + "epoch": 0.4008342437183434, + "grad_norm": 0.5103022028781918, + "learning_rate": 6.808149515059336e-06, + "loss": 0.015, + "step": 2018 + }, + { + "epoch": 0.40103287317509184, + "grad_norm": 1.1270462570715771, + "learning_rate": 6.805149371219491e-06, + "loss": 0.0214, + "step": 2019 + }, + { + "epoch": 0.4012315026318403, + "grad_norm": 0.49338047514807803, + "learning_rate": 6.8021484798691005e-06, + "loss": 0.0094, + "step": 2020 + }, + { + "epoch": 0.40143013208858874, + "grad_norm": 0.4300331782209231, + "learning_rate": 6.799146842250829e-06, + "loss": 0.0163, + "step": 2021 + }, + { + "epoch": 0.40162876154533717, + "grad_norm": 0.33283287964762853, + "learning_rate": 6.796144459607652e-06, + "loss": 0.0094, + "step": 2022 + }, + { + "epoch": 0.4018273910020856, + "grad_norm": 0.3659723339747389, + "learning_rate": 6.793141333182853e-06, + "loss": 0.0111, + "step": 2023 + }, + { + "epoch": 0.40202602045883407, + "grad_norm": 0.49565704935423, + "learning_rate": 6.7901374642200235e-06, + "loss": 0.0164, + "step": 2024 + }, + { + "epoch": 0.4022246499155825, + "grad_norm": 0.6254189475535632, + "learning_rate": 6.787132853963063e-06, + "loss": 0.0145, + "step": 2025 + }, + { + "epoch": 0.4024232793723309, + "grad_norm": 0.39002228377365256, + "learning_rate": 6.784127503656177e-06, + "loss": 0.0087, + "step": 2026 + }, + { + "epoch": 0.40262190882907933, + "grad_norm": 0.2860968118458668, + "learning_rate": 6.781121414543878e-06, + "loss": 0.0076, + "step": 2027 + }, + { + "epoch": 0.4028205382858278, + "grad_norm": 0.37170294728652653, + "learning_rate": 6.778114587870985e-06, + "loss": 0.0104, + "step": 2028 + }, + { + "epoch": 0.40301916774257623, + "grad_norm": 0.4036659979664484, + "learning_rate": 6.775107024882623e-06, + "loss": 0.007, + "step": 2029 + }, + { + "epoch": 0.40321779719932466, + "grad_norm": 0.5549451713316448, + "learning_rate": 6.772098726824219e-06, + "loss": 0.0119, + "step": 2030 + }, + { + "epoch": 0.4034164266560731, + "grad_norm": 0.9605322431392109, + "learning_rate": 6.769089694941506e-06, + "loss": 0.0165, + "step": 2031 + }, + { + "epoch": 0.40361505611282156, + "grad_norm": 0.21818067243379816, + "learning_rate": 6.766079930480523e-06, + "loss": 0.0055, + "step": 2032 + }, + { + "epoch": 0.40381368556957, + "grad_norm": 0.394154840294968, + "learning_rate": 6.763069434687608e-06, + "loss": 0.0177, + "step": 2033 + }, + { + "epoch": 0.4040123150263184, + "grad_norm": 0.4819653179358817, + "learning_rate": 6.760058208809407e-06, + "loss": 0.0123, + "step": 2034 + }, + { + "epoch": 0.4042109444830668, + "grad_norm": 1.280679469593761, + "learning_rate": 6.757046254092865e-06, + "loss": 0.016, + "step": 2035 + }, + { + "epoch": 0.40440957393981525, + "grad_norm": 0.5941803654709766, + "learning_rate": 6.754033571785227e-06, + "loss": 0.0104, + "step": 2036 + }, + { + "epoch": 0.4046082033965637, + "grad_norm": 0.5054580109994721, + "learning_rate": 6.7510201631340445e-06, + "loss": 0.0165, + "step": 2037 + }, + { + "epoch": 0.40480683285331215, + "grad_norm": 0.5427400354352391, + "learning_rate": 6.748006029387165e-06, + "loss": 0.0102, + "step": 2038 + }, + { + "epoch": 0.40500546231006057, + "grad_norm": 0.8181768765515004, + "learning_rate": 6.744991171792741e-06, + "loss": 0.0104, + "step": 2039 + }, + { + "epoch": 0.405204091766809, + "grad_norm": 0.6029402222723133, + "learning_rate": 6.741975591599219e-06, + "loss": 0.0115, + "step": 2040 + }, + { + "epoch": 0.40540272122355747, + "grad_norm": 0.5908122684007071, + "learning_rate": 6.738959290055349e-06, + "loss": 0.014, + "step": 2041 + }, + { + "epoch": 0.4056013506803059, + "grad_norm": 0.2594953161338611, + "learning_rate": 6.735942268410178e-06, + "loss": 0.009, + "step": 2042 + }, + { + "epoch": 0.4057999801370543, + "grad_norm": 0.9374727512612783, + "learning_rate": 6.7329245279130525e-06, + "loss": 0.0129, + "step": 2043 + }, + { + "epoch": 0.40599860959380274, + "grad_norm": 0.978985312338324, + "learning_rate": 6.7299060698136146e-06, + "loss": 0.0153, + "step": 2044 + }, + { + "epoch": 0.4061972390505512, + "grad_norm": 0.4019327020061037, + "learning_rate": 6.726886895361807e-06, + "loss": 0.014, + "step": 2045 + }, + { + "epoch": 0.40639586850729964, + "grad_norm": 0.7831143997306873, + "learning_rate": 6.723867005807865e-06, + "loss": 0.0185, + "step": 2046 + }, + { + "epoch": 0.40659449796404806, + "grad_norm": 0.5998403552907489, + "learning_rate": 6.720846402402321e-06, + "loss": 0.0133, + "step": 2047 + }, + { + "epoch": 0.4067931274207965, + "grad_norm": 0.6325931920573962, + "learning_rate": 6.717825086396007e-06, + "loss": 0.0091, + "step": 2048 + }, + { + "epoch": 0.40699175687754496, + "grad_norm": 1.0061561033266881, + "learning_rate": 6.714803059040043e-06, + "loss": 0.0142, + "step": 2049 + }, + { + "epoch": 0.4071903863342934, + "grad_norm": 0.6985348942673328, + "learning_rate": 6.711780321585851e-06, + "loss": 0.0118, + "step": 2050 + }, + { + "epoch": 0.4073890157910418, + "grad_norm": 1.2385741562172954, + "learning_rate": 6.708756875285143e-06, + "loss": 0.0112, + "step": 2051 + }, + { + "epoch": 0.40758764524779023, + "grad_norm": 0.3580125091627547, + "learning_rate": 6.705732721389922e-06, + "loss": 0.0089, + "step": 2052 + }, + { + "epoch": 0.4077862747045387, + "grad_norm": 0.5168739359216307, + "learning_rate": 6.7027078611524895e-06, + "loss": 0.012, + "step": 2053 + }, + { + "epoch": 0.40798490416128713, + "grad_norm": 1.176391662569905, + "learning_rate": 6.699682295825438e-06, + "loss": 0.0159, + "step": 2054 + }, + { + "epoch": 0.40818353361803555, + "grad_norm": 0.32971423902812974, + "learning_rate": 6.69665602666165e-06, + "loss": 0.0095, + "step": 2055 + }, + { + "epoch": 0.408382163074784, + "grad_norm": 0.7928255269070492, + "learning_rate": 6.693629054914298e-06, + "loss": 0.0061, + "step": 2056 + }, + { + "epoch": 0.4085807925315324, + "grad_norm": 0.35670943004316913, + "learning_rate": 6.690601381836852e-06, + "loss": 0.0096, + "step": 2057 + }, + { + "epoch": 0.4087794219882809, + "grad_norm": 0.7975764075408337, + "learning_rate": 6.687573008683067e-06, + "loss": 0.015, + "step": 2058 + }, + { + "epoch": 0.4089780514450293, + "grad_norm": 0.8319504575889517, + "learning_rate": 6.684543936706989e-06, + "loss": 0.0094, + "step": 2059 + }, + { + "epoch": 0.4091766809017777, + "grad_norm": 0.7205610429728381, + "learning_rate": 6.681514167162954e-06, + "loss": 0.0217, + "step": 2060 + }, + { + "epoch": 0.40937531035852615, + "grad_norm": 0.5035140786062322, + "learning_rate": 6.678483701305587e-06, + "loss": 0.0143, + "step": 2061 + }, + { + "epoch": 0.4095739398152746, + "grad_norm": 0.9123728525009268, + "learning_rate": 6.675452540389799e-06, + "loss": 0.0146, + "step": 2062 + }, + { + "epoch": 0.40977256927202305, + "grad_norm": 1.2095776877970332, + "learning_rate": 6.672420685670791e-06, + "loss": 0.0232, + "step": 2063 + }, + { + "epoch": 0.40997119872877147, + "grad_norm": 0.44951274178922074, + "learning_rate": 6.669388138404053e-06, + "loss": 0.011, + "step": 2064 + }, + { + "epoch": 0.4101698281855199, + "grad_norm": 1.2023015543359805, + "learning_rate": 6.666354899845359e-06, + "loss": 0.0202, + "step": 2065 + }, + { + "epoch": 0.41036845764226837, + "grad_norm": 1.221622773848238, + "learning_rate": 6.6633209712507685e-06, + "loss": 0.0185, + "step": 2066 + }, + { + "epoch": 0.4105670870990168, + "grad_norm": 1.245700627421625, + "learning_rate": 6.66028635387663e-06, + "loss": 0.0187, + "step": 2067 + }, + { + "epoch": 0.4107657165557652, + "grad_norm": 1.3483216195815648, + "learning_rate": 6.657251048979576e-06, + "loss": 0.0159, + "step": 2068 + }, + { + "epoch": 0.41096434601251364, + "grad_norm": 0.782416570047913, + "learning_rate": 6.654215057816521e-06, + "loss": 0.0172, + "step": 2069 + }, + { + "epoch": 0.4111629754692621, + "grad_norm": 0.3826135913453981, + "learning_rate": 6.651178381644668e-06, + "loss": 0.0147, + "step": 2070 + }, + { + "epoch": 0.41136160492601054, + "grad_norm": 0.4239048220757515, + "learning_rate": 6.648141021721499e-06, + "loss": 0.0131, + "step": 2071 + }, + { + "epoch": 0.41156023438275896, + "grad_norm": 0.20186798617854174, + "learning_rate": 6.645102979304785e-06, + "loss": 0.0067, + "step": 2072 + }, + { + "epoch": 0.4117588638395074, + "grad_norm": 0.44996535792808406, + "learning_rate": 6.642064255652576e-06, + "loss": 0.0134, + "step": 2073 + }, + { + "epoch": 0.41195749329625586, + "grad_norm": 0.7266208044448531, + "learning_rate": 6.6390248520232005e-06, + "loss": 0.0104, + "step": 2074 + }, + { + "epoch": 0.4121561227530043, + "grad_norm": 0.8966072482079172, + "learning_rate": 6.6359847696752765e-06, + "loss": 0.019, + "step": 2075 + }, + { + "epoch": 0.4123547522097527, + "grad_norm": 0.39163729067805625, + "learning_rate": 6.6329440098676975e-06, + "loss": 0.0125, + "step": 2076 + }, + { + "epoch": 0.41255338166650113, + "grad_norm": 0.5443082189044185, + "learning_rate": 6.629902573859639e-06, + "loss": 0.0127, + "step": 2077 + }, + { + "epoch": 0.41275201112324955, + "grad_norm": 0.8156259321153082, + "learning_rate": 6.626860462910557e-06, + "loss": 0.0175, + "step": 2078 + }, + { + "epoch": 0.41295064057999803, + "grad_norm": 0.7619649973471931, + "learning_rate": 6.623817678280187e-06, + "loss": 0.0136, + "step": 2079 + }, + { + "epoch": 0.41314927003674645, + "grad_norm": 0.779374490528315, + "learning_rate": 6.620774221228542e-06, + "loss": 0.0155, + "step": 2080 + }, + { + "epoch": 0.4133478994934949, + "grad_norm": 0.3046402817702944, + "learning_rate": 6.617730093015915e-06, + "loss": 0.0108, + "step": 2081 + }, + { + "epoch": 0.4135465289502433, + "grad_norm": 0.442045865809681, + "learning_rate": 6.614685294902876e-06, + "loss": 0.0121, + "step": 2082 + }, + { + "epoch": 0.4137451584069918, + "grad_norm": 0.5778707285503455, + "learning_rate": 6.611639828150273e-06, + "loss": 0.0133, + "step": 2083 + }, + { + "epoch": 0.4139437878637402, + "grad_norm": 0.8172818474796, + "learning_rate": 6.608593694019233e-06, + "loss": 0.0118, + "step": 2084 + }, + { + "epoch": 0.4141424173204886, + "grad_norm": 1.0008201942397041, + "learning_rate": 6.605546893771155e-06, + "loss": 0.0099, + "step": 2085 + }, + { + "epoch": 0.41434104677723704, + "grad_norm": 0.32404676420516487, + "learning_rate": 6.602499428667717e-06, + "loss": 0.008, + "step": 2086 + }, + { + "epoch": 0.4145396762339855, + "grad_norm": 0.9189946664738823, + "learning_rate": 6.599451299970867e-06, + "loss": 0.0102, + "step": 2087 + }, + { + "epoch": 0.41473830569073394, + "grad_norm": 0.5740526296204994, + "learning_rate": 6.596402508942838e-06, + "loss": 0.0186, + "step": 2088 + }, + { + "epoch": 0.41493693514748237, + "grad_norm": 0.4517267054867236, + "learning_rate": 6.593353056846128e-06, + "loss": 0.017, + "step": 2089 + }, + { + "epoch": 0.4151355646042308, + "grad_norm": 0.6912662883562258, + "learning_rate": 6.590302944943513e-06, + "loss": 0.0149, + "step": 2090 + }, + { + "epoch": 0.41533419406097927, + "grad_norm": 0.7887240811333215, + "learning_rate": 6.58725217449804e-06, + "loss": 0.0118, + "step": 2091 + }, + { + "epoch": 0.4155328235177277, + "grad_norm": 1.0223004745638395, + "learning_rate": 6.584200746773033e-06, + "loss": 0.0222, + "step": 2092 + }, + { + "epoch": 0.4157314529744761, + "grad_norm": 0.4963546873514082, + "learning_rate": 6.581148663032082e-06, + "loss": 0.0154, + "step": 2093 + }, + { + "epoch": 0.41593008243122453, + "grad_norm": 0.9897376857036329, + "learning_rate": 6.5780959245390516e-06, + "loss": 0.0132, + "step": 2094 + }, + { + "epoch": 0.416128711887973, + "grad_norm": 0.5755955971080043, + "learning_rate": 6.5750425325580815e-06, + "loss": 0.0164, + "step": 2095 + }, + { + "epoch": 0.41632734134472144, + "grad_norm": 0.4895034942702204, + "learning_rate": 6.571988488353574e-06, + "loss": 0.0096, + "step": 2096 + }, + { + "epoch": 0.41652597080146986, + "grad_norm": 0.8734463704411604, + "learning_rate": 6.5689337931902106e-06, + "loss": 0.0139, + "step": 2097 + }, + { + "epoch": 0.4167246002582183, + "grad_norm": 0.3329657332391327, + "learning_rate": 6.565878448332933e-06, + "loss": 0.0093, + "step": 2098 + }, + { + "epoch": 0.4169232297149667, + "grad_norm": 0.40090511983879706, + "learning_rate": 6.562822455046958e-06, + "loss": 0.0076, + "step": 2099 + }, + { + "epoch": 0.4171218591717152, + "grad_norm": 1.05951571631075, + "learning_rate": 6.559765814597768e-06, + "loss": 0.0173, + "step": 2100 + }, + { + "epoch": 0.4173204886284636, + "grad_norm": 0.7267645054655884, + "learning_rate": 6.556708528251119e-06, + "loss": 0.0166, + "step": 2101 + }, + { + "epoch": 0.417519118085212, + "grad_norm": 0.4476817973691744, + "learning_rate": 6.553650597273025e-06, + "loss": 0.0098, + "step": 2102 + }, + { + "epoch": 0.41771774754196045, + "grad_norm": 0.8482543101543035, + "learning_rate": 6.550592022929776e-06, + "loss": 0.0206, + "step": 2103 + }, + { + "epoch": 0.4179163769987089, + "grad_norm": 0.3610267762233538, + "learning_rate": 6.5475328064879225e-06, + "loss": 0.0103, + "step": 2104 + }, + { + "epoch": 0.41811500645545735, + "grad_norm": 0.8566164925435632, + "learning_rate": 6.544472949214283e-06, + "loss": 0.0127, + "step": 2105 + }, + { + "epoch": 0.4183136359122058, + "grad_norm": 0.33243771966184194, + "learning_rate": 6.5414124523759425e-06, + "loss": 0.0098, + "step": 2106 + }, + { + "epoch": 0.4185122653689542, + "grad_norm": 0.380868671779114, + "learning_rate": 6.538351317240249e-06, + "loss": 0.0131, + "step": 2107 + }, + { + "epoch": 0.4187108948257027, + "grad_norm": 0.5135335899779155, + "learning_rate": 6.5352895450748155e-06, + "loss": 0.0067, + "step": 2108 + }, + { + "epoch": 0.4189095242824511, + "grad_norm": 0.38253084605058807, + "learning_rate": 6.53222713714752e-06, + "loss": 0.0096, + "step": 2109 + }, + { + "epoch": 0.4191081537391995, + "grad_norm": 0.6100179434270661, + "learning_rate": 6.5291640947265015e-06, + "loss": 0.0116, + "step": 2110 + }, + { + "epoch": 0.41930678319594794, + "grad_norm": 1.412550135096153, + "learning_rate": 6.5261004190801615e-06, + "loss": 0.0201, + "step": 2111 + }, + { + "epoch": 0.4195054126526964, + "grad_norm": 0.26253036733279284, + "learning_rate": 6.523036111477165e-06, + "loss": 0.0057, + "step": 2112 + }, + { + "epoch": 0.41970404210944484, + "grad_norm": 0.7640094825754855, + "learning_rate": 6.519971173186441e-06, + "loss": 0.0129, + "step": 2113 + }, + { + "epoch": 0.41990267156619326, + "grad_norm": 0.41106174134380463, + "learning_rate": 6.516905605477177e-06, + "loss": 0.0081, + "step": 2114 + }, + { + "epoch": 0.4201013010229417, + "grad_norm": 0.45933590667057456, + "learning_rate": 6.513839409618821e-06, + "loss": 0.0078, + "step": 2115 + }, + { + "epoch": 0.42029993047969016, + "grad_norm": 0.7965887185750786, + "learning_rate": 6.510772586881081e-06, + "loss": 0.0179, + "step": 2116 + }, + { + "epoch": 0.4204985599364386, + "grad_norm": 0.59035117918782, + "learning_rate": 6.5077051385339266e-06, + "loss": 0.0096, + "step": 2117 + }, + { + "epoch": 0.420697189393187, + "grad_norm": 0.2143236615237929, + "learning_rate": 6.504637065847583e-06, + "loss": 0.0045, + "step": 2118 + }, + { + "epoch": 0.42089581884993543, + "grad_norm": 0.7334121261956231, + "learning_rate": 6.501568370092536e-06, + "loss": 0.0147, + "step": 2119 + }, + { + "epoch": 0.42109444830668386, + "grad_norm": 0.7597269250272318, + "learning_rate": 6.498499052539533e-06, + "loss": 0.0092, + "step": 2120 + }, + { + "epoch": 0.42129307776343233, + "grad_norm": 0.47333533855501414, + "learning_rate": 6.495429114459573e-06, + "loss": 0.0065, + "step": 2121 + }, + { + "epoch": 0.42149170722018076, + "grad_norm": 1.330156731921058, + "learning_rate": 6.4923585571239135e-06, + "loss": 0.0182, + "step": 2122 + }, + { + "epoch": 0.4216903366769292, + "grad_norm": 0.6948668654706006, + "learning_rate": 6.48928738180407e-06, + "loss": 0.0085, + "step": 2123 + }, + { + "epoch": 0.4218889661336776, + "grad_norm": 0.7890280489148934, + "learning_rate": 6.486215589771812e-06, + "loss": 0.015, + "step": 2124 + }, + { + "epoch": 0.4220875955904261, + "grad_norm": 0.452242433827678, + "learning_rate": 6.483143182299168e-06, + "loss": 0.0051, + "step": 2125 + }, + { + "epoch": 0.4222862250471745, + "grad_norm": 0.5748220685978489, + "learning_rate": 6.480070160658416e-06, + "loss": 0.0185, + "step": 2126 + }, + { + "epoch": 0.4224848545039229, + "grad_norm": 1.0878583524025376, + "learning_rate": 6.476996526122095e-06, + "loss": 0.0176, + "step": 2127 + }, + { + "epoch": 0.42268348396067135, + "grad_norm": 1.0214588977314998, + "learning_rate": 6.47392227996299e-06, + "loss": 0.0229, + "step": 2128 + }, + { + "epoch": 0.4228821134174198, + "grad_norm": 0.8368364297826585, + "learning_rate": 6.470847423454145e-06, + "loss": 0.0119, + "step": 2129 + }, + { + "epoch": 0.42308074287416825, + "grad_norm": 1.5630364581414389, + "learning_rate": 6.467771957868856e-06, + "loss": 0.0279, + "step": 2130 + }, + { + "epoch": 0.42327937233091667, + "grad_norm": 0.5265919309050093, + "learning_rate": 6.464695884480666e-06, + "loss": 0.0066, + "step": 2131 + }, + { + "epoch": 0.4234780017876651, + "grad_norm": 0.5458391814360123, + "learning_rate": 6.461619204563379e-06, + "loss": 0.0078, + "step": 2132 + }, + { + "epoch": 0.42367663124441357, + "grad_norm": 0.5629575812962587, + "learning_rate": 6.458541919391046e-06, + "loss": 0.0164, + "step": 2133 + }, + { + "epoch": 0.423875260701162, + "grad_norm": 1.0217593729153478, + "learning_rate": 6.455464030237961e-06, + "loss": 0.0132, + "step": 2134 + }, + { + "epoch": 0.4240738901579104, + "grad_norm": 0.3745957899282288, + "learning_rate": 6.452385538378681e-06, + "loss": 0.0078, + "step": 2135 + }, + { + "epoch": 0.42427251961465884, + "grad_norm": 0.4083516790878554, + "learning_rate": 6.4493064450880014e-06, + "loss": 0.0102, + "step": 2136 + }, + { + "epoch": 0.4244711490714073, + "grad_norm": 1.0042440949633358, + "learning_rate": 6.446226751640976e-06, + "loss": 0.0146, + "step": 2137 + }, + { + "epoch": 0.42466977852815574, + "grad_norm": 0.6514777945792959, + "learning_rate": 6.4431464593129e-06, + "loss": 0.0108, + "step": 2138 + }, + { + "epoch": 0.42486840798490416, + "grad_norm": 0.8367257022237136, + "learning_rate": 6.440065569379321e-06, + "loss": 0.0221, + "step": 2139 + }, + { + "epoch": 0.4250670374416526, + "grad_norm": 1.226693427969879, + "learning_rate": 6.436984083116032e-06, + "loss": 0.0229, + "step": 2140 + }, + { + "epoch": 0.425265666898401, + "grad_norm": 0.5627979518455061, + "learning_rate": 6.433902001799074e-06, + "loss": 0.012, + "step": 2141 + }, + { + "epoch": 0.4254642963551495, + "grad_norm": 0.7446434572287394, + "learning_rate": 6.430819326704732e-06, + "loss": 0.0133, + "step": 2142 + }, + { + "epoch": 0.4256629258118979, + "grad_norm": 0.5119208241159949, + "learning_rate": 6.427736059109539e-06, + "loss": 0.0134, + "step": 2143 + }, + { + "epoch": 0.42586155526864633, + "grad_norm": 0.43798331629976245, + "learning_rate": 6.424652200290275e-06, + "loss": 0.0116, + "step": 2144 + }, + { + "epoch": 0.42606018472539475, + "grad_norm": 0.8345104818237911, + "learning_rate": 6.421567751523962e-06, + "loss": 0.0146, + "step": 2145 + }, + { + "epoch": 0.42625881418214323, + "grad_norm": 0.5168528918687458, + "learning_rate": 6.4184827140878655e-06, + "loss": 0.0164, + "step": 2146 + }, + { + "epoch": 0.42645744363889165, + "grad_norm": 0.769263430644991, + "learning_rate": 6.415397089259497e-06, + "loss": 0.011, + "step": 2147 + }, + { + "epoch": 0.4266560730956401, + "grad_norm": 1.2006056414732815, + "learning_rate": 6.412310878316612e-06, + "loss": 0.0199, + "step": 2148 + }, + { + "epoch": 0.4268547025523885, + "grad_norm": 0.36140757435107507, + "learning_rate": 6.409224082537206e-06, + "loss": 0.0095, + "step": 2149 + }, + { + "epoch": 0.427053332009137, + "grad_norm": 0.4887806589406039, + "learning_rate": 6.4061367031995205e-06, + "loss": 0.0137, + "step": 2150 + }, + { + "epoch": 0.4272519614658854, + "grad_norm": 0.4372668846370759, + "learning_rate": 6.403048741582033e-06, + "loss": 0.0143, + "step": 2151 + }, + { + "epoch": 0.4274505909226338, + "grad_norm": 0.4264517517122641, + "learning_rate": 6.3999601989634665e-06, + "loss": 0.011, + "step": 2152 + }, + { + "epoch": 0.42764922037938224, + "grad_norm": 0.5939697458462438, + "learning_rate": 6.396871076622784e-06, + "loss": 0.0153, + "step": 2153 + }, + { + "epoch": 0.4278478498361307, + "grad_norm": 0.8720261216914841, + "learning_rate": 6.393781375839187e-06, + "loss": 0.0133, + "step": 2154 + }, + { + "epoch": 0.42804647929287915, + "grad_norm": 0.39205208543034786, + "learning_rate": 6.390691097892119e-06, + "loss": 0.0122, + "step": 2155 + }, + { + "epoch": 0.42824510874962757, + "grad_norm": 0.3140641676904937, + "learning_rate": 6.387600244061258e-06, + "loss": 0.0074, + "step": 2156 + }, + { + "epoch": 0.428443738206376, + "grad_norm": 0.5950724392389523, + "learning_rate": 6.384508815626529e-06, + "loss": 0.0119, + "step": 2157 + }, + { + "epoch": 0.42864236766312447, + "grad_norm": 1.1164418050217393, + "learning_rate": 6.381416813868085e-06, + "loss": 0.0168, + "step": 2158 + }, + { + "epoch": 0.4288409971198729, + "grad_norm": 1.0956077716877128, + "learning_rate": 6.37832424006632e-06, + "loss": 0.0246, + "step": 2159 + }, + { + "epoch": 0.4290396265766213, + "grad_norm": 0.5788002270708077, + "learning_rate": 6.375231095501868e-06, + "loss": 0.016, + "step": 2160 + }, + { + "epoch": 0.42923825603336974, + "grad_norm": 0.634313578842633, + "learning_rate": 6.3721373814555965e-06, + "loss": 0.0083, + "step": 2161 + }, + { + "epoch": 0.42943688549011816, + "grad_norm": 0.3256946524279106, + "learning_rate": 6.369043099208609e-06, + "loss": 0.0151, + "step": 2162 + }, + { + "epoch": 0.42963551494686664, + "grad_norm": 1.4280037849128235, + "learning_rate": 6.365948250042246e-06, + "loss": 0.0127, + "step": 2163 + }, + { + "epoch": 0.42983414440361506, + "grad_norm": 0.34022719740651086, + "learning_rate": 6.3628528352380804e-06, + "loss": 0.0073, + "step": 2164 + }, + { + "epoch": 0.4300327738603635, + "grad_norm": 0.5438789476793532, + "learning_rate": 6.359756856077921e-06, + "loss": 0.0109, + "step": 2165 + }, + { + "epoch": 0.4302314033171119, + "grad_norm": 0.9150416889469685, + "learning_rate": 6.3566603138438074e-06, + "loss": 0.0176, + "step": 2166 + }, + { + "epoch": 0.4304300327738604, + "grad_norm": 0.37232770382572944, + "learning_rate": 6.353563209818019e-06, + "loss": 0.0108, + "step": 2167 + }, + { + "epoch": 0.4306286622306088, + "grad_norm": 0.5476731401751236, + "learning_rate": 6.35046554528306e-06, + "loss": 0.0097, + "step": 2168 + }, + { + "epoch": 0.43082729168735723, + "grad_norm": 0.996056551510471, + "learning_rate": 6.347367321521671e-06, + "loss": 0.0164, + "step": 2169 + }, + { + "epoch": 0.43102592114410565, + "grad_norm": 0.7735149654496623, + "learning_rate": 6.3442685398168226e-06, + "loss": 0.0174, + "step": 2170 + }, + { + "epoch": 0.43122455060085413, + "grad_norm": 0.5452867098400445, + "learning_rate": 6.3411692014517175e-06, + "loss": 0.0129, + "step": 2171 + }, + { + "epoch": 0.43142318005760255, + "grad_norm": 0.4948507173958259, + "learning_rate": 6.338069307709791e-06, + "loss": 0.0154, + "step": 2172 + }, + { + "epoch": 0.431621809514351, + "grad_norm": 0.612880978569296, + "learning_rate": 6.334968859874704e-06, + "loss": 0.0177, + "step": 2173 + }, + { + "epoch": 0.4318204389710994, + "grad_norm": 0.6349394426674493, + "learning_rate": 6.331867859230347e-06, + "loss": 0.0123, + "step": 2174 + }, + { + "epoch": 0.4320190684278479, + "grad_norm": 0.44612232259436474, + "learning_rate": 6.328766307060845e-06, + "loss": 0.015, + "step": 2175 + }, + { + "epoch": 0.4322176978845963, + "grad_norm": 0.8671464943443922, + "learning_rate": 6.3256642046505455e-06, + "loss": 0.0155, + "step": 2176 + }, + { + "epoch": 0.4324163273413447, + "grad_norm": 0.35014656407021144, + "learning_rate": 6.322561553284027e-06, + "loss": 0.0049, + "step": 2177 + }, + { + "epoch": 0.43261495679809314, + "grad_norm": 1.0746580549448301, + "learning_rate": 6.319458354246093e-06, + "loss": 0.0128, + "step": 2178 + }, + { + "epoch": 0.4328135862548416, + "grad_norm": 0.7495218834739403, + "learning_rate": 6.316354608821775e-06, + "loss": 0.0167, + "step": 2179 + }, + { + "epoch": 0.43301221571159004, + "grad_norm": 0.6790113545596657, + "learning_rate": 6.313250318296333e-06, + "loss": 0.0071, + "step": 2180 + }, + { + "epoch": 0.43321084516833847, + "grad_norm": 0.5279299100486009, + "learning_rate": 6.31014548395525e-06, + "loss": 0.0125, + "step": 2181 + }, + { + "epoch": 0.4334094746250869, + "grad_norm": 0.3722980884712896, + "learning_rate": 6.307040107084232e-06, + "loss": 0.0067, + "step": 2182 + }, + { + "epoch": 0.4336081040818353, + "grad_norm": 0.5681920025290187, + "learning_rate": 6.3039341889692164e-06, + "loss": 0.0168, + "step": 2183 + }, + { + "epoch": 0.4338067335385838, + "grad_norm": 0.7370282183909415, + "learning_rate": 6.300827730896359e-06, + "loss": 0.013, + "step": 2184 + }, + { + "epoch": 0.4340053629953322, + "grad_norm": 1.0952586557587631, + "learning_rate": 6.29772073415204e-06, + "loss": 0.0246, + "step": 2185 + }, + { + "epoch": 0.43420399245208063, + "grad_norm": 0.36852862876653647, + "learning_rate": 6.294613200022865e-06, + "loss": 0.0073, + "step": 2186 + }, + { + "epoch": 0.43440262190882906, + "grad_norm": 0.5506267036624556, + "learning_rate": 6.291505129795659e-06, + "loss": 0.0078, + "step": 2187 + }, + { + "epoch": 0.43460125136557753, + "grad_norm": 0.642929767010988, + "learning_rate": 6.288396524757473e-06, + "loss": 0.02, + "step": 2188 + }, + { + "epoch": 0.43479988082232596, + "grad_norm": 0.4663773137832936, + "learning_rate": 6.285287386195577e-06, + "loss": 0.0096, + "step": 2189 + }, + { + "epoch": 0.4349985102790744, + "grad_norm": 0.899128172317911, + "learning_rate": 6.28217771539746e-06, + "loss": 0.0165, + "step": 2190 + }, + { + "epoch": 0.4351971397358228, + "grad_norm": 0.5828032708204945, + "learning_rate": 6.279067513650837e-06, + "loss": 0.0138, + "step": 2191 + }, + { + "epoch": 0.4353957691925713, + "grad_norm": 0.5191364681793799, + "learning_rate": 6.275956782243637e-06, + "loss": 0.0117, + "step": 2192 + }, + { + "epoch": 0.4355943986493197, + "grad_norm": 0.5129148486056785, + "learning_rate": 6.272845522464011e-06, + "loss": 0.0125, + "step": 2193 + }, + { + "epoch": 0.4357930281060681, + "grad_norm": 0.5310527010612295, + "learning_rate": 6.269733735600328e-06, + "loss": 0.0106, + "step": 2194 + }, + { + "epoch": 0.43599165756281655, + "grad_norm": 0.4310848203884204, + "learning_rate": 6.266621422941177e-06, + "loss": 0.0124, + "step": 2195 + }, + { + "epoch": 0.436190287019565, + "grad_norm": 0.9514944377133658, + "learning_rate": 6.263508585775364e-06, + "loss": 0.0127, + "step": 2196 + }, + { + "epoch": 0.43638891647631345, + "grad_norm": 0.7287942888398268, + "learning_rate": 6.260395225391911e-06, + "loss": 0.0107, + "step": 2197 + }, + { + "epoch": 0.43658754593306187, + "grad_norm": 0.3824710094767856, + "learning_rate": 6.2572813430800565e-06, + "loss": 0.0131, + "step": 2198 + }, + { + "epoch": 0.4367861753898103, + "grad_norm": 0.4588648637376047, + "learning_rate": 6.254166940129256e-06, + "loss": 0.0152, + "step": 2199 + }, + { + "epoch": 0.43698480484655877, + "grad_norm": 0.7375069280141057, + "learning_rate": 6.251052017829183e-06, + "loss": 0.0132, + "step": 2200 + }, + { + "epoch": 0.4371834343033072, + "grad_norm": 0.6624079769881998, + "learning_rate": 6.247936577469722e-06, + "loss": 0.0131, + "step": 2201 + }, + { + "epoch": 0.4373820637600556, + "grad_norm": 0.2923891307524442, + "learning_rate": 6.244820620340974e-06, + "loss": 0.0098, + "step": 2202 + }, + { + "epoch": 0.43758069321680404, + "grad_norm": 1.2709855509660637, + "learning_rate": 6.241704147733254e-06, + "loss": 0.0213, + "step": 2203 + }, + { + "epoch": 0.43777932267355246, + "grad_norm": 0.593801147450338, + "learning_rate": 6.2385871609370885e-06, + "loss": 0.0172, + "step": 2204 + }, + { + "epoch": 0.43797795213030094, + "grad_norm": 0.9452658486066292, + "learning_rate": 6.235469661243222e-06, + "loss": 0.0191, + "step": 2205 + }, + { + "epoch": 0.43817658158704936, + "grad_norm": 0.46873430033477437, + "learning_rate": 6.2323516499426055e-06, + "loss": 0.017, + "step": 2206 + }, + { + "epoch": 0.4383752110437978, + "grad_norm": 0.934149350494909, + "learning_rate": 6.229233128326404e-06, + "loss": 0.0168, + "step": 2207 + }, + { + "epoch": 0.4385738405005462, + "grad_norm": 0.7694849628244852, + "learning_rate": 6.226114097685996e-06, + "loss": 0.0106, + "step": 2208 + }, + { + "epoch": 0.4387724699572947, + "grad_norm": 0.36801759673916484, + "learning_rate": 6.222994559312967e-06, + "loss": 0.0058, + "step": 2209 + }, + { + "epoch": 0.4389710994140431, + "grad_norm": 0.6695445746957661, + "learning_rate": 6.219874514499116e-06, + "loss": 0.0267, + "step": 2210 + }, + { + "epoch": 0.43916972887079153, + "grad_norm": 0.38996735486229944, + "learning_rate": 6.21675396453645e-06, + "loss": 0.009, + "step": 2211 + }, + { + "epoch": 0.43936835832753995, + "grad_norm": 0.25082029907152686, + "learning_rate": 6.213632910717186e-06, + "loss": 0.0089, + "step": 2212 + }, + { + "epoch": 0.43956698778428843, + "grad_norm": 0.30323741324880354, + "learning_rate": 6.210511354333749e-06, + "loss": 0.0114, + "step": 2213 + }, + { + "epoch": 0.43976561724103685, + "grad_norm": 0.611598681425317, + "learning_rate": 6.207389296678773e-06, + "loss": 0.0237, + "step": 2214 + }, + { + "epoch": 0.4399642466977853, + "grad_norm": 0.5733866943683243, + "learning_rate": 6.2042667390450975e-06, + "loss": 0.0097, + "step": 2215 + }, + { + "epoch": 0.4401628761545337, + "grad_norm": 0.79670133428446, + "learning_rate": 6.201143682725772e-06, + "loss": 0.0176, + "step": 2216 + }, + { + "epoch": 0.4403615056112822, + "grad_norm": 0.9551481409610042, + "learning_rate": 6.19802012901405e-06, + "loss": 0.0143, + "step": 2217 + }, + { + "epoch": 0.4405601350680306, + "grad_norm": 0.5431923190522518, + "learning_rate": 6.1948960792033926e-06, + "loss": 0.0231, + "step": 2218 + }, + { + "epoch": 0.440758764524779, + "grad_norm": 1.0130427924002454, + "learning_rate": 6.191771534587465e-06, + "loss": 0.0125, + "step": 2219 + }, + { + "epoch": 0.44095739398152745, + "grad_norm": 0.5857570298238707, + "learning_rate": 6.188646496460139e-06, + "loss": 0.0135, + "step": 2220 + }, + { + "epoch": 0.4411560234382759, + "grad_norm": 0.9795520817741151, + "learning_rate": 6.185520966115489e-06, + "loss": 0.0154, + "step": 2221 + }, + { + "epoch": 0.44135465289502435, + "grad_norm": 0.7885367572135659, + "learning_rate": 6.182394944847794e-06, + "loss": 0.0173, + "step": 2222 + }, + { + "epoch": 0.44155328235177277, + "grad_norm": 0.21893643286317785, + "learning_rate": 6.179268433951536e-06, + "loss": 0.0058, + "step": 2223 + }, + { + "epoch": 0.4417519118085212, + "grad_norm": 0.7459506265711627, + "learning_rate": 6.1761414347213995e-06, + "loss": 0.0089, + "step": 2224 + }, + { + "epoch": 0.4419505412652696, + "grad_norm": 0.27792088533372517, + "learning_rate": 6.173013948452274e-06, + "loss": 0.0098, + "step": 2225 + }, + { + "epoch": 0.4421491707220181, + "grad_norm": 1.2345740081065522, + "learning_rate": 6.1698859764392475e-06, + "loss": 0.0143, + "step": 2226 + }, + { + "epoch": 0.4423478001787665, + "grad_norm": 0.3732015361677137, + "learning_rate": 6.1667575199776096e-06, + "loss": 0.0131, + "step": 2227 + }, + { + "epoch": 0.44254642963551494, + "grad_norm": 0.5343008204160175, + "learning_rate": 6.163628580362851e-06, + "loss": 0.0089, + "step": 2228 + }, + { + "epoch": 0.44274505909226336, + "grad_norm": 0.5522418996318227, + "learning_rate": 6.160499158890664e-06, + "loss": 0.0106, + "step": 2229 + }, + { + "epoch": 0.44294368854901184, + "grad_norm": 0.6254354935039006, + "learning_rate": 6.157369256856934e-06, + "loss": 0.0167, + "step": 2230 + }, + { + "epoch": 0.44314231800576026, + "grad_norm": 0.42379511270760073, + "learning_rate": 6.154238875557755e-06, + "loss": 0.0131, + "step": 2231 + }, + { + "epoch": 0.4433409474625087, + "grad_norm": 0.6058120408697395, + "learning_rate": 6.151108016289416e-06, + "loss": 0.0096, + "step": 2232 + }, + { + "epoch": 0.4435395769192571, + "grad_norm": 0.25542634551541554, + "learning_rate": 6.147976680348398e-06, + "loss": 0.0072, + "step": 2233 + }, + { + "epoch": 0.4437382063760056, + "grad_norm": 0.2899452724247626, + "learning_rate": 6.144844869031385e-06, + "loss": 0.006, + "step": 2234 + }, + { + "epoch": 0.443936835832754, + "grad_norm": 0.7491935474647948, + "learning_rate": 6.141712583635261e-06, + "loss": 0.0145, + "step": 2235 + }, + { + "epoch": 0.44413546528950243, + "grad_norm": 0.4422482327524406, + "learning_rate": 6.138579825457098e-06, + "loss": 0.0092, + "step": 2236 + }, + { + "epoch": 0.44433409474625085, + "grad_norm": 0.21771092100081543, + "learning_rate": 6.135446595794169e-06, + "loss": 0.0085, + "step": 2237 + }, + { + "epoch": 0.44453272420299933, + "grad_norm": 1.4002752559333222, + "learning_rate": 6.132312895943942e-06, + "loss": 0.0207, + "step": 2238 + }, + { + "epoch": 0.44473135365974775, + "grad_norm": 0.7190190028304118, + "learning_rate": 6.129178727204079e-06, + "loss": 0.0161, + "step": 2239 + }, + { + "epoch": 0.4449299831164962, + "grad_norm": 0.42822061766771496, + "learning_rate": 6.126044090872437e-06, + "loss": 0.0124, + "step": 2240 + }, + { + "epoch": 0.4451286125732446, + "grad_norm": 0.677711220443176, + "learning_rate": 6.1229089882470645e-06, + "loss": 0.0101, + "step": 2241 + }, + { + "epoch": 0.4453272420299931, + "grad_norm": 1.059203479885282, + "learning_rate": 6.119773420626203e-06, + "loss": 0.0325, + "step": 2242 + }, + { + "epoch": 0.4455258714867415, + "grad_norm": 0.5706231879648851, + "learning_rate": 6.1166373893082895e-06, + "loss": 0.0106, + "step": 2243 + }, + { + "epoch": 0.4457245009434899, + "grad_norm": 0.995491363379659, + "learning_rate": 6.113500895591953e-06, + "loss": 0.0167, + "step": 2244 + }, + { + "epoch": 0.44592313040023834, + "grad_norm": 0.29779451432903564, + "learning_rate": 6.110363940776008e-06, + "loss": 0.0092, + "step": 2245 + }, + { + "epoch": 0.44612175985698677, + "grad_norm": 0.5762002893389644, + "learning_rate": 6.10722652615947e-06, + "loss": 0.0143, + "step": 2246 + }, + { + "epoch": 0.44632038931373524, + "grad_norm": 0.7215045066061389, + "learning_rate": 6.104088653041534e-06, + "loss": 0.0174, + "step": 2247 + }, + { + "epoch": 0.44651901877048367, + "grad_norm": 0.32513640970058094, + "learning_rate": 6.100950322721594e-06, + "loss": 0.0041, + "step": 2248 + }, + { + "epoch": 0.4467176482272321, + "grad_norm": 0.5763190265316815, + "learning_rate": 6.097811536499227e-06, + "loss": 0.018, + "step": 2249 + }, + { + "epoch": 0.4469162776839805, + "grad_norm": 0.8055544487235922, + "learning_rate": 6.094672295674202e-06, + "loss": 0.0155, + "step": 2250 + }, + { + "epoch": 0.447114907140729, + "grad_norm": 0.992649917079707, + "learning_rate": 6.091532601546476e-06, + "loss": 0.0161, + "step": 2251 + }, + { + "epoch": 0.4473135365974774, + "grad_norm": 0.4457080027649029, + "learning_rate": 6.088392455416194e-06, + "loss": 0.0073, + "step": 2252 + }, + { + "epoch": 0.44751216605422584, + "grad_norm": 0.2552247040191699, + "learning_rate": 6.085251858583685e-06, + "loss": 0.0092, + "step": 2253 + }, + { + "epoch": 0.44771079551097426, + "grad_norm": 0.590972028978692, + "learning_rate": 6.082110812349468e-06, + "loss": 0.0109, + "step": 2254 + }, + { + "epoch": 0.44790942496772274, + "grad_norm": 0.4091054658923765, + "learning_rate": 6.078969318014246e-06, + "loss": 0.0124, + "step": 2255 + }, + { + "epoch": 0.44810805442447116, + "grad_norm": 0.7196903790599142, + "learning_rate": 6.075827376878911e-06, + "loss": 0.0148, + "step": 2256 + }, + { + "epoch": 0.4483066838812196, + "grad_norm": 0.3261047677288729, + "learning_rate": 6.072684990244537e-06, + "loss": 0.0098, + "step": 2257 + }, + { + "epoch": 0.448505313337968, + "grad_norm": 0.29300124021584395, + "learning_rate": 6.069542159412384e-06, + "loss": 0.0122, + "step": 2258 + }, + { + "epoch": 0.4487039427947165, + "grad_norm": 0.978139352013834, + "learning_rate": 6.066398885683892e-06, + "loss": 0.0144, + "step": 2259 + }, + { + "epoch": 0.4489025722514649, + "grad_norm": 0.7871382091207846, + "learning_rate": 6.06325517036069e-06, + "loss": 0.0155, + "step": 2260 + }, + { + "epoch": 0.4491012017082133, + "grad_norm": 1.1988153830297887, + "learning_rate": 6.060111014744586e-06, + "loss": 0.0114, + "step": 2261 + }, + { + "epoch": 0.44929983116496175, + "grad_norm": 0.5272884595442661, + "learning_rate": 6.056966420137572e-06, + "loss": 0.013, + "step": 2262 + }, + { + "epoch": 0.44949846062171017, + "grad_norm": 0.5853642484110945, + "learning_rate": 6.053821387841823e-06, + "loss": 0.0107, + "step": 2263 + }, + { + "epoch": 0.44969709007845865, + "grad_norm": 0.3163330948551929, + "learning_rate": 6.0506759191596944e-06, + "loss": 0.0122, + "step": 2264 + }, + { + "epoch": 0.4498957195352071, + "grad_norm": 0.43256342417833826, + "learning_rate": 6.047530015393718e-06, + "loss": 0.0081, + "step": 2265 + }, + { + "epoch": 0.4500943489919555, + "grad_norm": 0.8015470911760131, + "learning_rate": 6.04438367784661e-06, + "loss": 0.012, + "step": 2266 + }, + { + "epoch": 0.4502929784487039, + "grad_norm": 0.40936392070753425, + "learning_rate": 6.041236907821267e-06, + "loss": 0.0098, + "step": 2267 + }, + { + "epoch": 0.4504916079054524, + "grad_norm": 1.3184403290579898, + "learning_rate": 6.0380897066207654e-06, + "loss": 0.0159, + "step": 2268 + }, + { + "epoch": 0.4506902373622008, + "grad_norm": 0.36040823408450195, + "learning_rate": 6.034942075548355e-06, + "loss": 0.0063, + "step": 2269 + }, + { + "epoch": 0.45088886681894924, + "grad_norm": 0.5708270902664976, + "learning_rate": 6.0317940159074675e-06, + "loss": 0.0203, + "step": 2270 + }, + { + "epoch": 0.45108749627569766, + "grad_norm": 0.4185899585374011, + "learning_rate": 6.028645529001711e-06, + "loss": 0.009, + "step": 2271 + }, + { + "epoch": 0.45128612573244614, + "grad_norm": 0.5374794859076718, + "learning_rate": 6.02549661613487e-06, + "loss": 0.0184, + "step": 2272 + }, + { + "epoch": 0.45148475518919456, + "grad_norm": 0.3634143098642771, + "learning_rate": 6.0223472786109095e-06, + "loss": 0.0101, + "step": 2273 + }, + { + "epoch": 0.451683384645943, + "grad_norm": 0.42164752734379324, + "learning_rate": 6.019197517733963e-06, + "loss": 0.0124, + "step": 2274 + }, + { + "epoch": 0.4518820141026914, + "grad_norm": 0.2932256480659242, + "learning_rate": 6.016047334808345e-06, + "loss": 0.0056, + "step": 2275 + }, + { + "epoch": 0.4520806435594399, + "grad_norm": 0.950748673633275, + "learning_rate": 6.012896731138545e-06, + "loss": 0.0131, + "step": 2276 + }, + { + "epoch": 0.4522792730161883, + "grad_norm": 0.5735740533696118, + "learning_rate": 6.0097457080292206e-06, + "loss": 0.0158, + "step": 2277 + }, + { + "epoch": 0.45247790247293673, + "grad_norm": 0.6300357309402919, + "learning_rate": 6.006594266785212e-06, + "loss": 0.0198, + "step": 2278 + }, + { + "epoch": 0.45267653192968516, + "grad_norm": 0.45140542258277755, + "learning_rate": 6.003442408711524e-06, + "loss": 0.0103, + "step": 2279 + }, + { + "epoch": 0.45287516138643363, + "grad_norm": 0.31203226057353545, + "learning_rate": 6.000290135113338e-06, + "loss": 0.0072, + "step": 2280 + }, + { + "epoch": 0.45307379084318206, + "grad_norm": 0.5315768399241099, + "learning_rate": 5.997137447296011e-06, + "loss": 0.0189, + "step": 2281 + }, + { + "epoch": 0.4532724202999305, + "grad_norm": 0.7502568266180624, + "learning_rate": 5.993984346565065e-06, + "loss": 0.0168, + "step": 2282 + }, + { + "epoch": 0.4534710497566789, + "grad_norm": 0.7922773049664108, + "learning_rate": 5.990830834226197e-06, + "loss": 0.0197, + "step": 2283 + }, + { + "epoch": 0.4536696792134273, + "grad_norm": 0.5256093593559897, + "learning_rate": 5.987676911585273e-06, + "loss": 0.0118, + "step": 2284 + }, + { + "epoch": 0.4538683086701758, + "grad_norm": 0.4936069122490565, + "learning_rate": 5.984522579948329e-06, + "loss": 0.0119, + "step": 2285 + }, + { + "epoch": 0.4540669381269242, + "grad_norm": 0.6240742984791606, + "learning_rate": 5.981367840621569e-06, + "loss": 0.0103, + "step": 2286 + }, + { + "epoch": 0.45426556758367265, + "grad_norm": 0.4580728528848644, + "learning_rate": 5.97821269491137e-06, + "loss": 0.0089, + "step": 2287 + }, + { + "epoch": 0.45446419704042107, + "grad_norm": 1.0311027752364688, + "learning_rate": 5.975057144124274e-06, + "loss": 0.0109, + "step": 2288 + }, + { + "epoch": 0.45466282649716955, + "grad_norm": 0.47911929621431937, + "learning_rate": 5.971901189566991e-06, + "loss": 0.0127, + "step": 2289 + }, + { + "epoch": 0.45486145595391797, + "grad_norm": 0.31717175346501003, + "learning_rate": 5.968744832546398e-06, + "loss": 0.0131, + "step": 2290 + }, + { + "epoch": 0.4550600854106664, + "grad_norm": 0.5084576453823957, + "learning_rate": 5.965588074369541e-06, + "loss": 0.0108, + "step": 2291 + }, + { + "epoch": 0.4552587148674148, + "grad_norm": 0.4321949217258047, + "learning_rate": 5.962430916343627e-06, + "loss": 0.0073, + "step": 2292 + }, + { + "epoch": 0.4554573443241633, + "grad_norm": 0.3587917028460577, + "learning_rate": 5.959273359776037e-06, + "loss": 0.0095, + "step": 2293 + }, + { + "epoch": 0.4556559737809117, + "grad_norm": 0.39093795226899214, + "learning_rate": 5.956115405974308e-06, + "loss": 0.0067, + "step": 2294 + }, + { + "epoch": 0.45585460323766014, + "grad_norm": 1.022914745931775, + "learning_rate": 5.952957056246147e-06, + "loss": 0.0163, + "step": 2295 + }, + { + "epoch": 0.45605323269440856, + "grad_norm": 0.35063776749240805, + "learning_rate": 5.949798311899424e-06, + "loss": 0.0096, + "step": 2296 + }, + { + "epoch": 0.45625186215115704, + "grad_norm": 0.34523519354493915, + "learning_rate": 5.94663917424217e-06, + "loss": 0.0114, + "step": 2297 + }, + { + "epoch": 0.45645049160790546, + "grad_norm": 0.4356127664422994, + "learning_rate": 5.9434796445825835e-06, + "loss": 0.0114, + "step": 2298 + }, + { + "epoch": 0.4566491210646539, + "grad_norm": 0.34697933488413446, + "learning_rate": 5.940319724229019e-06, + "loss": 0.007, + "step": 2299 + }, + { + "epoch": 0.4568477505214023, + "grad_norm": 0.45136277819236226, + "learning_rate": 5.937159414490001e-06, + "loss": 0.0141, + "step": 2300 + }, + { + "epoch": 0.4570463799781508, + "grad_norm": 0.9498700129253315, + "learning_rate": 5.933998716674206e-06, + "loss": 0.0113, + "step": 2301 + }, + { + "epoch": 0.4572450094348992, + "grad_norm": 1.0293691155517941, + "learning_rate": 5.930837632090479e-06, + "loss": 0.0221, + "step": 2302 + }, + { + "epoch": 0.45744363889164763, + "grad_norm": 0.3086672015952359, + "learning_rate": 5.92767616204782e-06, + "loss": 0.0081, + "step": 2303 + }, + { + "epoch": 0.45764226834839605, + "grad_norm": 0.3479833097855302, + "learning_rate": 5.92451430785539e-06, + "loss": 0.0062, + "step": 2304 + }, + { + "epoch": 0.4578408978051445, + "grad_norm": 0.5660458407661002, + "learning_rate": 5.921352070822513e-06, + "loss": 0.0141, + "step": 2305 + }, + { + "epoch": 0.45803952726189295, + "grad_norm": 1.281389383626456, + "learning_rate": 5.918189452258665e-06, + "loss": 0.0192, + "step": 2306 + }, + { + "epoch": 0.4582381567186414, + "grad_norm": 0.6209042441203885, + "learning_rate": 5.915026453473485e-06, + "loss": 0.0237, + "step": 2307 + }, + { + "epoch": 0.4584367861753898, + "grad_norm": 0.372320922451502, + "learning_rate": 5.9118630757767666e-06, + "loss": 0.0078, + "step": 2308 + }, + { + "epoch": 0.4586354156321382, + "grad_norm": 0.5819030047417552, + "learning_rate": 5.908699320478462e-06, + "loss": 0.0073, + "step": 2309 + }, + { + "epoch": 0.4588340450888867, + "grad_norm": 0.7061784530868636, + "learning_rate": 5.905535188888678e-06, + "loss": 0.0129, + "step": 2310 + }, + { + "epoch": 0.4590326745456351, + "grad_norm": 0.6778701936405351, + "learning_rate": 5.9023706823176795e-06, + "loss": 0.0114, + "step": 2311 + }, + { + "epoch": 0.45923130400238354, + "grad_norm": 0.7264468288248946, + "learning_rate": 5.8992058020758855e-06, + "loss": 0.0231, + "step": 2312 + }, + { + "epoch": 0.45942993345913197, + "grad_norm": 0.6212393297209905, + "learning_rate": 5.89604054947387e-06, + "loss": 0.0143, + "step": 2313 + }, + { + "epoch": 0.45962856291588045, + "grad_norm": 0.35909749045150213, + "learning_rate": 5.892874925822359e-06, + "loss": 0.0122, + "step": 2314 + }, + { + "epoch": 0.45982719237262887, + "grad_norm": 0.3641939953347697, + "learning_rate": 5.889708932432235e-06, + "loss": 0.0105, + "step": 2315 + }, + { + "epoch": 0.4600258218293773, + "grad_norm": 0.48230559364020364, + "learning_rate": 5.8865425706145355e-06, + "loss": 0.0127, + "step": 2316 + }, + { + "epoch": 0.4602244512861257, + "grad_norm": 0.46675212979529435, + "learning_rate": 5.883375841680442e-06, + "loss": 0.0124, + "step": 2317 + }, + { + "epoch": 0.4604230807428742, + "grad_norm": 0.3866132495591225, + "learning_rate": 5.880208746941299e-06, + "loss": 0.0073, + "step": 2318 + }, + { + "epoch": 0.4606217101996226, + "grad_norm": 0.439249606081369, + "learning_rate": 5.877041287708595e-06, + "loss": 0.0137, + "step": 2319 + }, + { + "epoch": 0.46082033965637104, + "grad_norm": 0.9131678817021807, + "learning_rate": 5.8738734652939725e-06, + "loss": 0.0144, + "step": 2320 + }, + { + "epoch": 0.46101896911311946, + "grad_norm": 0.4209332036129976, + "learning_rate": 5.870705281009222e-06, + "loss": 0.011, + "step": 2321 + }, + { + "epoch": 0.46121759856986794, + "grad_norm": 0.5289405995879706, + "learning_rate": 5.867536736166287e-06, + "loss": 0.0141, + "step": 2322 + }, + { + "epoch": 0.46141622802661636, + "grad_norm": 0.4232956656008544, + "learning_rate": 5.864367832077259e-06, + "loss": 0.0118, + "step": 2323 + }, + { + "epoch": 0.4616148574833648, + "grad_norm": 0.38126143858947364, + "learning_rate": 5.861198570054377e-06, + "loss": 0.0094, + "step": 2324 + }, + { + "epoch": 0.4618134869401132, + "grad_norm": 0.5109437768251036, + "learning_rate": 5.858028951410029e-06, + "loss": 0.019, + "step": 2325 + }, + { + "epoch": 0.4620121163968616, + "grad_norm": 0.9866712175295075, + "learning_rate": 5.854858977456753e-06, + "loss": 0.011, + "step": 2326 + }, + { + "epoch": 0.4622107458536101, + "grad_norm": 0.5509612624995485, + "learning_rate": 5.85168864950723e-06, + "loss": 0.0146, + "step": 2327 + }, + { + "epoch": 0.46240937531035853, + "grad_norm": 0.6697882679503572, + "learning_rate": 5.8485179688742896e-06, + "loss": 0.0156, + "step": 2328 + }, + { + "epoch": 0.46260800476710695, + "grad_norm": 0.25998329362845135, + "learning_rate": 5.845346936870907e-06, + "loss": 0.009, + "step": 2329 + }, + { + "epoch": 0.4628066342238554, + "grad_norm": 0.445245200827973, + "learning_rate": 5.8421755548102054e-06, + "loss": 0.0107, + "step": 2330 + }, + { + "epoch": 0.46300526368060385, + "grad_norm": 0.30406825765433, + "learning_rate": 5.8390038240054505e-06, + "loss": 0.0087, + "step": 2331 + }, + { + "epoch": 0.4632038931373523, + "grad_norm": 0.5121143693890272, + "learning_rate": 5.835831745770052e-06, + "loss": 0.0121, + "step": 2332 + }, + { + "epoch": 0.4634025225941007, + "grad_norm": 0.6199846252981046, + "learning_rate": 5.832659321417565e-06, + "loss": 0.0112, + "step": 2333 + }, + { + "epoch": 0.4636011520508491, + "grad_norm": 0.48346062372856696, + "learning_rate": 5.829486552261685e-06, + "loss": 0.0131, + "step": 2334 + }, + { + "epoch": 0.4637997815075976, + "grad_norm": 0.8757641479679567, + "learning_rate": 5.826313439616256e-06, + "loss": 0.0156, + "step": 2335 + }, + { + "epoch": 0.463998410964346, + "grad_norm": 0.43652114504307693, + "learning_rate": 5.8231399847952585e-06, + "loss": 0.0114, + "step": 2336 + }, + { + "epoch": 0.46419704042109444, + "grad_norm": 0.47839962377266626, + "learning_rate": 5.819966189112818e-06, + "loss": 0.0102, + "step": 2337 + }, + { + "epoch": 0.46439566987784286, + "grad_norm": 1.2127218820606134, + "learning_rate": 5.8167920538832e-06, + "loss": 0.014, + "step": 2338 + }, + { + "epoch": 0.46459429933459134, + "grad_norm": 0.769446434011527, + "learning_rate": 5.8136175804208106e-06, + "loss": 0.0114, + "step": 2339 + }, + { + "epoch": 0.46479292879133977, + "grad_norm": 0.34763955904579885, + "learning_rate": 5.8104427700401945e-06, + "loss": 0.015, + "step": 2340 + }, + { + "epoch": 0.4649915582480882, + "grad_norm": 0.5597168255035329, + "learning_rate": 5.80726762405604e-06, + "loss": 0.014, + "step": 2341 + }, + { + "epoch": 0.4651901877048366, + "grad_norm": 0.9135852247092504, + "learning_rate": 5.80409214378317e-06, + "loss": 0.0199, + "step": 2342 + }, + { + "epoch": 0.4653888171615851, + "grad_norm": 0.6722348948820552, + "learning_rate": 5.800916330536549e-06, + "loss": 0.0111, + "step": 2343 + }, + { + "epoch": 0.4655874466183335, + "grad_norm": 0.2794802612129201, + "learning_rate": 5.7977401856312775e-06, + "loss": 0.0081, + "step": 2344 + }, + { + "epoch": 0.46578607607508193, + "grad_norm": 0.4778286582110137, + "learning_rate": 5.794563710382595e-06, + "loss": 0.0113, + "step": 2345 + }, + { + "epoch": 0.46598470553183036, + "grad_norm": 0.3799982420436121, + "learning_rate": 5.791386906105875e-06, + "loss": 0.0112, + "step": 2346 + }, + { + "epoch": 0.4661833349885788, + "grad_norm": 0.5794990884883824, + "learning_rate": 5.788209774116629e-06, + "loss": 0.0167, + "step": 2347 + }, + { + "epoch": 0.46638196444532726, + "grad_norm": 0.9348715395945014, + "learning_rate": 5.785032315730506e-06, + "loss": 0.0248, + "step": 2348 + }, + { + "epoch": 0.4665805939020757, + "grad_norm": 0.5055416806585602, + "learning_rate": 5.781854532263287e-06, + "loss": 0.0158, + "step": 2349 + }, + { + "epoch": 0.4667792233588241, + "grad_norm": 0.6028354260098756, + "learning_rate": 5.778676425030888e-06, + "loss": 0.0103, + "step": 2350 + }, + { + "epoch": 0.4669778528155725, + "grad_norm": 0.42355897766306966, + "learning_rate": 5.775497995349363e-06, + "loss": 0.0058, + "step": 2351 + }, + { + "epoch": 0.467176482272321, + "grad_norm": 0.30607023801120187, + "learning_rate": 5.772319244534893e-06, + "loss": 0.0116, + "step": 2352 + }, + { + "epoch": 0.4673751117290694, + "grad_norm": 0.8992242787812346, + "learning_rate": 5.769140173903799e-06, + "loss": 0.0221, + "step": 2353 + }, + { + "epoch": 0.46757374118581785, + "grad_norm": 0.8268467494450803, + "learning_rate": 5.765960784772527e-06, + "loss": 0.01, + "step": 2354 + }, + { + "epoch": 0.46777237064256627, + "grad_norm": 0.31652768813051085, + "learning_rate": 5.762781078457662e-06, + "loss": 0.0071, + "step": 2355 + }, + { + "epoch": 0.46797100009931475, + "grad_norm": 0.5660464775694583, + "learning_rate": 5.759601056275916e-06, + "loss": 0.0099, + "step": 2356 + }, + { + "epoch": 0.46816962955606317, + "grad_norm": 0.6382863746802383, + "learning_rate": 5.756420719544135e-06, + "loss": 0.0161, + "step": 2357 + }, + { + "epoch": 0.4683682590128116, + "grad_norm": 0.6570237568395549, + "learning_rate": 5.75324006957929e-06, + "loss": 0.0151, + "step": 2358 + }, + { + "epoch": 0.46856688846956, + "grad_norm": 0.455513564680463, + "learning_rate": 5.7500591076984865e-06, + "loss": 0.0137, + "step": 2359 + }, + { + "epoch": 0.4687655179263085, + "grad_norm": 0.3782815896106589, + "learning_rate": 5.746877835218959e-06, + "loss": 0.0082, + "step": 2360 + }, + { + "epoch": 0.4689641473830569, + "grad_norm": 0.25011529623111944, + "learning_rate": 5.7436962534580665e-06, + "loss": 0.0099, + "step": 2361 + }, + { + "epoch": 0.46916277683980534, + "grad_norm": 0.2755162028039524, + "learning_rate": 5.7405143637333e-06, + "loss": 0.0074, + "step": 2362 + }, + { + "epoch": 0.46936140629655376, + "grad_norm": 0.4887019159723483, + "learning_rate": 5.737332167362278e-06, + "loss": 0.0123, + "step": 2363 + }, + { + "epoch": 0.46956003575330224, + "grad_norm": 0.6804047453581535, + "learning_rate": 5.734149665662744e-06, + "loss": 0.0099, + "step": 2364 + }, + { + "epoch": 0.46975866521005066, + "grad_norm": 0.8398249561968816, + "learning_rate": 5.730966859952568e-06, + "loss": 0.0193, + "step": 2365 + }, + { + "epoch": 0.4699572946667991, + "grad_norm": 0.30808476371188004, + "learning_rate": 5.727783751549748e-06, + "loss": 0.0101, + "step": 2366 + }, + { + "epoch": 0.4701559241235475, + "grad_norm": 0.40427106496463305, + "learning_rate": 5.724600341772404e-06, + "loss": 0.0072, + "step": 2367 + }, + { + "epoch": 0.47035455358029593, + "grad_norm": 0.5655774899898666, + "learning_rate": 5.721416631938785e-06, + "loss": 0.0161, + "step": 2368 + }, + { + "epoch": 0.4705531830370444, + "grad_norm": 0.6325786828061096, + "learning_rate": 5.718232623367262e-06, + "loss": 0.0183, + "step": 2369 + }, + { + "epoch": 0.47075181249379283, + "grad_norm": 0.5808533874064912, + "learning_rate": 5.715048317376327e-06, + "loss": 0.0074, + "step": 2370 + }, + { + "epoch": 0.47095044195054125, + "grad_norm": 0.8445148294332612, + "learning_rate": 5.7118637152846015e-06, + "loss": 0.0161, + "step": 2371 + }, + { + "epoch": 0.4711490714072897, + "grad_norm": 0.4846773932797683, + "learning_rate": 5.708678818410823e-06, + "loss": 0.0057, + "step": 2372 + }, + { + "epoch": 0.47134770086403815, + "grad_norm": 0.4942960720168355, + "learning_rate": 5.705493628073856e-06, + "loss": 0.0163, + "step": 2373 + }, + { + "epoch": 0.4715463303207866, + "grad_norm": 0.2926160774657436, + "learning_rate": 5.702308145592684e-06, + "loss": 0.0084, + "step": 2374 + }, + { + "epoch": 0.471744959777535, + "grad_norm": 0.36092426480142087, + "learning_rate": 5.699122372286413e-06, + "loss": 0.0156, + "step": 2375 + }, + { + "epoch": 0.4719435892342834, + "grad_norm": 0.971391873556755, + "learning_rate": 5.6959363094742684e-06, + "loss": 0.0209, + "step": 2376 + }, + { + "epoch": 0.4721422186910319, + "grad_norm": 0.5192492237273335, + "learning_rate": 5.692749958475593e-06, + "loss": 0.0158, + "step": 2377 + }, + { + "epoch": 0.4723408481477803, + "grad_norm": 0.37516394436059797, + "learning_rate": 5.689563320609854e-06, + "loss": 0.012, + "step": 2378 + }, + { + "epoch": 0.47253947760452875, + "grad_norm": 0.7321340081454777, + "learning_rate": 5.686376397196635e-06, + "loss": 0.0095, + "step": 2379 + }, + { + "epoch": 0.47273810706127717, + "grad_norm": 0.6380956047969817, + "learning_rate": 5.683189189555637e-06, + "loss": 0.0094, + "step": 2380 + }, + { + "epoch": 0.47293673651802565, + "grad_norm": 0.5714109992216438, + "learning_rate": 5.68000169900668e-06, + "loss": 0.008, + "step": 2381 + }, + { + "epoch": 0.47313536597477407, + "grad_norm": 0.773380440894765, + "learning_rate": 5.6768139268697e-06, + "loss": 0.0152, + "step": 2382 + }, + { + "epoch": 0.4733339954315225, + "grad_norm": 0.6960015446023513, + "learning_rate": 5.673625874464751e-06, + "loss": 0.0217, + "step": 2383 + }, + { + "epoch": 0.4735326248882709, + "grad_norm": 0.6037591137291943, + "learning_rate": 5.670437543112003e-06, + "loss": 0.0136, + "step": 2384 + }, + { + "epoch": 0.4737312543450194, + "grad_norm": 0.541539265791468, + "learning_rate": 5.667248934131739e-06, + "loss": 0.0087, + "step": 2385 + }, + { + "epoch": 0.4739298838017678, + "grad_norm": 0.4801580538978032, + "learning_rate": 5.6640600488443585e-06, + "loss": 0.0106, + "step": 2386 + }, + { + "epoch": 0.47412851325851624, + "grad_norm": 0.4106221925173567, + "learning_rate": 5.660870888570378e-06, + "loss": 0.0121, + "step": 2387 + }, + { + "epoch": 0.47432714271526466, + "grad_norm": 1.2008332110233264, + "learning_rate": 5.657681454630424e-06, + "loss": 0.0164, + "step": 2388 + }, + { + "epoch": 0.4745257721720131, + "grad_norm": 0.46847613164327123, + "learning_rate": 5.654491748345238e-06, + "loss": 0.0112, + "step": 2389 + }, + { + "epoch": 0.47472440162876156, + "grad_norm": 0.4866905116366235, + "learning_rate": 5.651301771035675e-06, + "loss": 0.0123, + "step": 2390 + }, + { + "epoch": 0.47492303108551, + "grad_norm": 0.8229938698433211, + "learning_rate": 5.6481115240227005e-06, + "loss": 0.0144, + "step": 2391 + }, + { + "epoch": 0.4751216605422584, + "grad_norm": 0.9137037442295208, + "learning_rate": 5.644921008627391e-06, + "loss": 0.0151, + "step": 2392 + }, + { + "epoch": 0.47532028999900683, + "grad_norm": 0.4876172441311117, + "learning_rate": 5.6417302261709404e-06, + "loss": 0.0095, + "step": 2393 + }, + { + "epoch": 0.4755189194557553, + "grad_norm": 0.37524536677463444, + "learning_rate": 5.638539177974645e-06, + "loss": 0.0121, + "step": 2394 + }, + { + "epoch": 0.47571754891250373, + "grad_norm": 0.7581160723528595, + "learning_rate": 5.635347865359915e-06, + "loss": 0.0107, + "step": 2395 + }, + { + "epoch": 0.47591617836925215, + "grad_norm": 0.4697124876851263, + "learning_rate": 5.632156289648272e-06, + "loss": 0.0141, + "step": 2396 + }, + { + "epoch": 0.4761148078260006, + "grad_norm": 1.2004601762194573, + "learning_rate": 5.62896445216134e-06, + "loss": 0.0178, + "step": 2397 + }, + { + "epoch": 0.47631343728274905, + "grad_norm": 0.44792347789247144, + "learning_rate": 5.6257723542208595e-06, + "loss": 0.0094, + "step": 2398 + }, + { + "epoch": 0.4765120667394975, + "grad_norm": 0.6253105011573149, + "learning_rate": 5.622579997148674e-06, + "loss": 0.0104, + "step": 2399 + }, + { + "epoch": 0.4767106961962459, + "grad_norm": 0.38725763342264224, + "learning_rate": 5.619387382266734e-06, + "loss": 0.0181, + "step": 2400 + }, + { + "epoch": 0.4769093256529943, + "grad_norm": 0.5602962469904611, + "learning_rate": 5.6161945108971005e-06, + "loss": 0.0195, + "step": 2401 + }, + { + "epoch": 0.4771079551097428, + "grad_norm": 0.48104235373399207, + "learning_rate": 5.6130013843619366e-06, + "loss": 0.0197, + "step": 2402 + }, + { + "epoch": 0.4773065845664912, + "grad_norm": 0.3039832053300856, + "learning_rate": 5.609808003983513e-06, + "loss": 0.0067, + "step": 2403 + }, + { + "epoch": 0.47750521402323964, + "grad_norm": 0.7235246350240075, + "learning_rate": 5.606614371084206e-06, + "loss": 0.0193, + "step": 2404 + }, + { + "epoch": 0.47770384347998807, + "grad_norm": 0.547167273803981, + "learning_rate": 5.603420486986495e-06, + "loss": 0.0059, + "step": 2405 + }, + { + "epoch": 0.47790247293673654, + "grad_norm": 0.5020367397204322, + "learning_rate": 5.600226353012965e-06, + "loss": 0.0124, + "step": 2406 + }, + { + "epoch": 0.47810110239348497, + "grad_norm": 0.29441250473742775, + "learning_rate": 5.597031970486304e-06, + "loss": 0.0089, + "step": 2407 + }, + { + "epoch": 0.4782997318502334, + "grad_norm": 0.3051620947194149, + "learning_rate": 5.593837340729302e-06, + "loss": 0.0102, + "step": 2408 + }, + { + "epoch": 0.4784983613069818, + "grad_norm": 0.46639149386889667, + "learning_rate": 5.590642465064852e-06, + "loss": 0.0154, + "step": 2409 + }, + { + "epoch": 0.47869699076373023, + "grad_norm": 0.6356240089361235, + "learning_rate": 5.587447344815946e-06, + "loss": 0.0158, + "step": 2410 + }, + { + "epoch": 0.4788956202204787, + "grad_norm": 0.7929409757443711, + "learning_rate": 5.584251981305685e-06, + "loss": 0.016, + "step": 2411 + }, + { + "epoch": 0.47909424967722714, + "grad_norm": 0.7886868857089763, + "learning_rate": 5.581056375857263e-06, + "loss": 0.0158, + "step": 2412 + }, + { + "epoch": 0.47929287913397556, + "grad_norm": 0.6545632498798621, + "learning_rate": 5.577860529793978e-06, + "loss": 0.0142, + "step": 2413 + }, + { + "epoch": 0.479491508590724, + "grad_norm": 0.6445078069693473, + "learning_rate": 5.574664444439226e-06, + "loss": 0.0128, + "step": 2414 + }, + { + "epoch": 0.47969013804747246, + "grad_norm": 0.4114117395918578, + "learning_rate": 5.571468121116504e-06, + "loss": 0.0118, + "step": 2415 + }, + { + "epoch": 0.4798887675042209, + "grad_norm": 0.49308068014298745, + "learning_rate": 5.568271561149404e-06, + "loss": 0.0199, + "step": 2416 + }, + { + "epoch": 0.4800873969609693, + "grad_norm": 0.8300192404106222, + "learning_rate": 5.56507476586162e-06, + "loss": 0.0163, + "step": 2417 + }, + { + "epoch": 0.4802860264177177, + "grad_norm": 0.7203031117955251, + "learning_rate": 5.561877736576942e-06, + "loss": 0.0192, + "step": 2418 + }, + { + "epoch": 0.4804846558744662, + "grad_norm": 0.63747435264011, + "learning_rate": 5.558680474619255e-06, + "loss": 0.0116, + "step": 2419 + }, + { + "epoch": 0.4806832853312146, + "grad_norm": 0.6686966563642719, + "learning_rate": 5.5554829813125446e-06, + "loss": 0.0137, + "step": 2420 + }, + { + "epoch": 0.48088191478796305, + "grad_norm": 0.7236264839110779, + "learning_rate": 5.552285257980887e-06, + "loss": 0.0118, + "step": 2421 + }, + { + "epoch": 0.48108054424471147, + "grad_norm": 0.3489337234437465, + "learning_rate": 5.549087305948455e-06, + "loss": 0.0125, + "step": 2422 + }, + { + "epoch": 0.48127917370145995, + "grad_norm": 0.6402623274118869, + "learning_rate": 5.545889126539522e-06, + "loss": 0.0158, + "step": 2423 + }, + { + "epoch": 0.4814778031582084, + "grad_norm": 0.6755621454969479, + "learning_rate": 5.542690721078447e-06, + "loss": 0.018, + "step": 2424 + }, + { + "epoch": 0.4816764326149568, + "grad_norm": 0.3813390848817102, + "learning_rate": 5.5394920908896895e-06, + "loss": 0.0082, + "step": 2425 + }, + { + "epoch": 0.4818750620717052, + "grad_norm": 0.5944323310591813, + "learning_rate": 5.536293237297796e-06, + "loss": 0.0086, + "step": 2426 + }, + { + "epoch": 0.4820736915284537, + "grad_norm": 0.5823022738844985, + "learning_rate": 5.533094161627412e-06, + "loss": 0.0129, + "step": 2427 + }, + { + "epoch": 0.4822723209852021, + "grad_norm": 0.24109484982366644, + "learning_rate": 5.529894865203267e-06, + "loss": 0.0059, + "step": 2428 + }, + { + "epoch": 0.48247095044195054, + "grad_norm": 0.6279027842239673, + "learning_rate": 5.52669534935019e-06, + "loss": 0.0151, + "step": 2429 + }, + { + "epoch": 0.48266957989869896, + "grad_norm": 0.8846786775845136, + "learning_rate": 5.523495615393095e-06, + "loss": 0.0208, + "step": 2430 + }, + { + "epoch": 0.4828682093554474, + "grad_norm": 0.38091540228676024, + "learning_rate": 5.520295664656992e-06, + "loss": 0.0069, + "step": 2431 + }, + { + "epoch": 0.48306683881219586, + "grad_norm": 0.30556096556441176, + "learning_rate": 5.517095498466976e-06, + "loss": 0.0061, + "step": 2432 + }, + { + "epoch": 0.4832654682689443, + "grad_norm": 0.8509282833418818, + "learning_rate": 5.51389511814823e-06, + "loss": 0.0135, + "step": 2433 + }, + { + "epoch": 0.4834640977256927, + "grad_norm": 0.37578690343483734, + "learning_rate": 5.51069452502603e-06, + "loss": 0.0091, + "step": 2434 + }, + { + "epoch": 0.48366272718244113, + "grad_norm": 0.46966875266445063, + "learning_rate": 5.5074937204257385e-06, + "loss": 0.0147, + "step": 2435 + }, + { + "epoch": 0.4838613566391896, + "grad_norm": 0.509057592751693, + "learning_rate": 5.504292705672807e-06, + "loss": 0.0142, + "step": 2436 + }, + { + "epoch": 0.48405998609593803, + "grad_norm": 0.472350938168894, + "learning_rate": 5.5010914820927695e-06, + "loss": 0.01, + "step": 2437 + }, + { + "epoch": 0.48425861555268646, + "grad_norm": 0.4293231269108412, + "learning_rate": 5.497890051011252e-06, + "loss": 0.009, + "step": 2438 + }, + { + "epoch": 0.4844572450094349, + "grad_norm": 0.6277901027183694, + "learning_rate": 5.494688413753964e-06, + "loss": 0.0085, + "step": 2439 + }, + { + "epoch": 0.48465587446618336, + "grad_norm": 0.4248433246133367, + "learning_rate": 5.491486571646698e-06, + "loss": 0.0103, + "step": 2440 + }, + { + "epoch": 0.4848545039229318, + "grad_norm": 0.7745270788422335, + "learning_rate": 5.488284526015335e-06, + "loss": 0.0169, + "step": 2441 + }, + { + "epoch": 0.4850531333796802, + "grad_norm": 0.9595839872373445, + "learning_rate": 5.485082278185838e-06, + "loss": 0.0123, + "step": 2442 + }, + { + "epoch": 0.4852517628364286, + "grad_norm": 0.24937919516680546, + "learning_rate": 5.481879829484256e-06, + "loss": 0.0072, + "step": 2443 + }, + { + "epoch": 0.4854503922931771, + "grad_norm": 0.5052250608119354, + "learning_rate": 5.47867718123672e-06, + "loss": 0.0126, + "step": 2444 + }, + { + "epoch": 0.4856490217499255, + "grad_norm": 0.6095834648470921, + "learning_rate": 5.475474334769443e-06, + "loss": 0.0127, + "step": 2445 + }, + { + "epoch": 0.48584765120667395, + "grad_norm": 0.5239132674664736, + "learning_rate": 5.47227129140872e-06, + "loss": 0.0109, + "step": 2446 + }, + { + "epoch": 0.48604628066342237, + "grad_norm": 0.40049557705857963, + "learning_rate": 5.4690680524809246e-06, + "loss": 0.0072, + "step": 2447 + }, + { + "epoch": 0.48624491012017085, + "grad_norm": 0.6583755900248895, + "learning_rate": 5.465864619312522e-06, + "loss": 0.0112, + "step": 2448 + }, + { + "epoch": 0.48644353957691927, + "grad_norm": 0.511543136523891, + "learning_rate": 5.462660993230045e-06, + "loss": 0.0111, + "step": 2449 + }, + { + "epoch": 0.4866421690336677, + "grad_norm": 0.6333349669994685, + "learning_rate": 5.459457175560117e-06, + "loss": 0.0142, + "step": 2450 + }, + { + "epoch": 0.4868407984904161, + "grad_norm": 0.4081955737456362, + "learning_rate": 5.456253167629431e-06, + "loss": 0.0106, + "step": 2451 + }, + { + "epoch": 0.48703942794716454, + "grad_norm": 0.45569800327311816, + "learning_rate": 5.4530489707647646e-06, + "loss": 0.0122, + "step": 2452 + }, + { + "epoch": 0.487238057403913, + "grad_norm": 0.49468712119644775, + "learning_rate": 5.449844586292974e-06, + "loss": 0.0163, + "step": 2453 + }, + { + "epoch": 0.48743668686066144, + "grad_norm": 1.3504172026817023, + "learning_rate": 5.446640015540989e-06, + "loss": 0.0165, + "step": 2454 + }, + { + "epoch": 0.48763531631740986, + "grad_norm": 0.48452241295226994, + "learning_rate": 5.443435259835822e-06, + "loss": 0.0116, + "step": 2455 + }, + { + "epoch": 0.4878339457741583, + "grad_norm": 0.5427977276769387, + "learning_rate": 5.44023032050456e-06, + "loss": 0.0129, + "step": 2456 + }, + { + "epoch": 0.48803257523090676, + "grad_norm": 0.3796715316602044, + "learning_rate": 5.437025198874361e-06, + "loss": 0.0107, + "step": 2457 + }, + { + "epoch": 0.4882312046876552, + "grad_norm": 0.9111058024326351, + "learning_rate": 5.433819896272464e-06, + "loss": 0.0157, + "step": 2458 + }, + { + "epoch": 0.4884298341444036, + "grad_norm": 0.490889205825391, + "learning_rate": 5.4306144140261845e-06, + "loss": 0.009, + "step": 2459 + }, + { + "epoch": 0.48862846360115203, + "grad_norm": 0.6591819950885145, + "learning_rate": 5.427408753462905e-06, + "loss": 0.0151, + "step": 2460 + }, + { + "epoch": 0.4888270930579005, + "grad_norm": 0.3574688903762408, + "learning_rate": 5.42420291591009e-06, + "loss": 0.0145, + "step": 2461 + }, + { + "epoch": 0.48902572251464893, + "grad_norm": 0.7539287197857844, + "learning_rate": 5.420996902695273e-06, + "loss": 0.0134, + "step": 2462 + }, + { + "epoch": 0.48922435197139735, + "grad_norm": 0.6981359145878261, + "learning_rate": 5.4177907151460585e-06, + "loss": 0.0119, + "step": 2463 + }, + { + "epoch": 0.4894229814281458, + "grad_norm": 0.606244215044152, + "learning_rate": 5.414584354590129e-06, + "loss": 0.0111, + "step": 2464 + }, + { + "epoch": 0.48962161088489425, + "grad_norm": 0.5459437232022195, + "learning_rate": 5.411377822355233e-06, + "loss": 0.0091, + "step": 2465 + }, + { + "epoch": 0.4898202403416427, + "grad_norm": 0.5279168001785495, + "learning_rate": 5.408171119769192e-06, + "loss": 0.0112, + "step": 2466 + }, + { + "epoch": 0.4900188697983911, + "grad_norm": 0.4695727703267886, + "learning_rate": 5.4049642481598985e-06, + "loss": 0.0117, + "step": 2467 + }, + { + "epoch": 0.4902174992551395, + "grad_norm": 0.43550533403024305, + "learning_rate": 5.401757208855317e-06, + "loss": 0.0048, + "step": 2468 + }, + { + "epoch": 0.490416128711888, + "grad_norm": 0.42677326587153497, + "learning_rate": 5.398550003183478e-06, + "loss": 0.0081, + "step": 2469 + }, + { + "epoch": 0.4906147581686364, + "grad_norm": 0.6348135613049047, + "learning_rate": 5.39534263247248e-06, + "loss": 0.0209, + "step": 2470 + }, + { + "epoch": 0.49081338762538484, + "grad_norm": 0.5524103691775981, + "learning_rate": 5.392135098050495e-06, + "loss": 0.0119, + "step": 2471 + }, + { + "epoch": 0.49101201708213327, + "grad_norm": 0.7108743625403123, + "learning_rate": 5.388927401245757e-06, + "loss": 0.0176, + "step": 2472 + }, + { + "epoch": 0.4912106465388817, + "grad_norm": 0.4353791961565994, + "learning_rate": 5.385719543386574e-06, + "loss": 0.0106, + "step": 2473 + }, + { + "epoch": 0.49140927599563017, + "grad_norm": 0.6306640021625024, + "learning_rate": 5.3825115258013126e-06, + "loss": 0.0103, + "step": 2474 + }, + { + "epoch": 0.4916079054523786, + "grad_norm": 0.5210772415889535, + "learning_rate": 5.379303349818412e-06, + "loss": 0.009, + "step": 2475 + }, + { + "epoch": 0.491806534909127, + "grad_norm": 0.414410686373316, + "learning_rate": 5.376095016766374e-06, + "loss": 0.0089, + "step": 2476 + }, + { + "epoch": 0.49200516436587544, + "grad_norm": 0.4551862364482468, + "learning_rate": 5.372886527973767e-06, + "loss": 0.0082, + "step": 2477 + }, + { + "epoch": 0.4922037938226239, + "grad_norm": 0.7775904217757333, + "learning_rate": 5.369677884769221e-06, + "loss": 0.0177, + "step": 2478 + }, + { + "epoch": 0.49240242327937234, + "grad_norm": 1.0123752948840063, + "learning_rate": 5.366469088481433e-06, + "loss": 0.0137, + "step": 2479 + }, + { + "epoch": 0.49260105273612076, + "grad_norm": 0.4494991849679043, + "learning_rate": 5.3632601404391635e-06, + "loss": 0.0094, + "step": 2480 + }, + { + "epoch": 0.4927996821928692, + "grad_norm": 0.7550282831087276, + "learning_rate": 5.360051041971234e-06, + "loss": 0.014, + "step": 2481 + }, + { + "epoch": 0.49299831164961766, + "grad_norm": 0.46682384084711037, + "learning_rate": 5.356841794406527e-06, + "loss": 0.0151, + "step": 2482 + }, + { + "epoch": 0.4931969411063661, + "grad_norm": 0.4115591498845582, + "learning_rate": 5.353632399073991e-06, + "loss": 0.0113, + "step": 2483 + }, + { + "epoch": 0.4933955705631145, + "grad_norm": 0.7247787357221698, + "learning_rate": 5.350422857302633e-06, + "loss": 0.0134, + "step": 2484 + }, + { + "epoch": 0.4935942000198629, + "grad_norm": 0.8578245479997127, + "learning_rate": 5.347213170421519e-06, + "loss": 0.0148, + "step": 2485 + }, + { + "epoch": 0.4937928294766114, + "grad_norm": 0.4699237127768561, + "learning_rate": 5.344003339759781e-06, + "loss": 0.0097, + "step": 2486 + }, + { + "epoch": 0.49399145893335983, + "grad_norm": 0.4744010503049874, + "learning_rate": 5.340793366646604e-06, + "loss": 0.0115, + "step": 2487 + }, + { + "epoch": 0.49419008839010825, + "grad_norm": 0.636008252077129, + "learning_rate": 5.337583252411235e-06, + "loss": 0.0102, + "step": 2488 + }, + { + "epoch": 0.4943887178468567, + "grad_norm": 0.47474845684362593, + "learning_rate": 5.33437299838298e-06, + "loss": 0.0124, + "step": 2489 + }, + { + "epoch": 0.49458734730360515, + "grad_norm": 1.0566024501358895, + "learning_rate": 5.3311626058911994e-06, + "loss": 0.0151, + "step": 2490 + }, + { + "epoch": 0.4947859767603536, + "grad_norm": 0.36678370138916344, + "learning_rate": 5.327952076265317e-06, + "loss": 0.0147, + "step": 2491 + }, + { + "epoch": 0.494984606217102, + "grad_norm": 0.25272585797585356, + "learning_rate": 5.324741410834807e-06, + "loss": 0.0067, + "step": 2492 + }, + { + "epoch": 0.4951832356738504, + "grad_norm": 0.6048164575103134, + "learning_rate": 5.321530610929204e-06, + "loss": 0.0113, + "step": 2493 + }, + { + "epoch": 0.49538186513059884, + "grad_norm": 0.7695263702174417, + "learning_rate": 5.318319677878098e-06, + "loss": 0.0142, + "step": 2494 + }, + { + "epoch": 0.4955804945873473, + "grad_norm": 0.33525412117269704, + "learning_rate": 5.315108613011132e-06, + "loss": 0.0124, + "step": 2495 + }, + { + "epoch": 0.49577912404409574, + "grad_norm": 0.7878389738793319, + "learning_rate": 5.311897417658005e-06, + "loss": 0.0151, + "step": 2496 + }, + { + "epoch": 0.49597775350084417, + "grad_norm": 0.4578947485727716, + "learning_rate": 5.308686093148467e-06, + "loss": 0.0115, + "step": 2497 + }, + { + "epoch": 0.4961763829575926, + "grad_norm": 1.030479841136134, + "learning_rate": 5.305474640812331e-06, + "loss": 0.0139, + "step": 2498 + }, + { + "epoch": 0.49637501241434107, + "grad_norm": 0.796961987633012, + "learning_rate": 5.30226306197945e-06, + "loss": 0.0109, + "step": 2499 + }, + { + "epoch": 0.4965736418710895, + "grad_norm": 0.782382140972235, + "learning_rate": 5.299051357979738e-06, + "loss": 0.0187, + "step": 2500 + }, + { + "epoch": 0.4967722713278379, + "grad_norm": 1.121114404342789, + "learning_rate": 5.295839530143159e-06, + "loss": 0.0178, + "step": 2501 + }, + { + "epoch": 0.49697090078458633, + "grad_norm": 0.7261947822033853, + "learning_rate": 5.2926275797997255e-06, + "loss": 0.0188, + "step": 2502 + }, + { + "epoch": 0.4971695302413348, + "grad_norm": 0.46923022153419347, + "learning_rate": 5.289415508279505e-06, + "loss": 0.0073, + "step": 2503 + }, + { + "epoch": 0.49736815969808323, + "grad_norm": 0.7182836113385063, + "learning_rate": 5.286203316912613e-06, + "loss": 0.015, + "step": 2504 + }, + { + "epoch": 0.49756678915483166, + "grad_norm": 0.603411625751702, + "learning_rate": 5.282991007029213e-06, + "loss": 0.0106, + "step": 2505 + }, + { + "epoch": 0.4977654186115801, + "grad_norm": 0.8174796617956737, + "learning_rate": 5.27977857995952e-06, + "loss": 0.0149, + "step": 2506 + }, + { + "epoch": 0.49796404806832856, + "grad_norm": 1.0146932905608117, + "learning_rate": 5.276566037033798e-06, + "loss": 0.0141, + "step": 2507 + }, + { + "epoch": 0.498162677525077, + "grad_norm": 0.4163900253441471, + "learning_rate": 5.273353379582357e-06, + "loss": 0.0108, + "step": 2508 + }, + { + "epoch": 0.4983613069818254, + "grad_norm": 0.763281658551113, + "learning_rate": 5.270140608935555e-06, + "loss": 0.015, + "step": 2509 + }, + { + "epoch": 0.4985599364385738, + "grad_norm": 0.22259719035648037, + "learning_rate": 5.266927726423797e-06, + "loss": 0.0087, + "step": 2510 + }, + { + "epoch": 0.4987585658953223, + "grad_norm": 0.42395383851091467, + "learning_rate": 5.263714733377535e-06, + "loss": 0.0133, + "step": 2511 + }, + { + "epoch": 0.4989571953520707, + "grad_norm": 0.5328797312516291, + "learning_rate": 5.260501631127266e-06, + "loss": 0.012, + "step": 2512 + }, + { + "epoch": 0.49915582480881915, + "grad_norm": 0.4743705135796904, + "learning_rate": 5.257288421003534e-06, + "loss": 0.0196, + "step": 2513 + }, + { + "epoch": 0.49935445426556757, + "grad_norm": 0.5093401512061038, + "learning_rate": 5.254075104336922e-06, + "loss": 0.0082, + "step": 2514 + }, + { + "epoch": 0.499553083722316, + "grad_norm": 0.9029692999709029, + "learning_rate": 5.250861682458066e-06, + "loss": 0.0249, + "step": 2515 + }, + { + "epoch": 0.49975171317906447, + "grad_norm": 1.777870716049857, + "learning_rate": 5.247648156697637e-06, + "loss": 0.0258, + "step": 2516 + }, + { + "epoch": 0.4999503426358129, + "grad_norm": 0.5892960441699706, + "learning_rate": 5.2444345283863555e-06, + "loss": 0.008, + "step": 2517 + }, + { + "epoch": 0.5001489720925614, + "grad_norm": 0.44368783450479404, + "learning_rate": 5.241220798854979e-06, + "loss": 0.0097, + "step": 2518 + }, + { + "epoch": 0.5003476015493098, + "grad_norm": 0.4921667565769311, + "learning_rate": 5.238006969434313e-06, + "loss": 0.0111, + "step": 2519 + }, + { + "epoch": 0.5005462310060582, + "grad_norm": 0.5508976317775016, + "learning_rate": 5.234793041455199e-06, + "loss": 0.0143, + "step": 2520 + }, + { + "epoch": 0.5007448604628066, + "grad_norm": 0.7142397409065645, + "learning_rate": 5.2315790162485204e-06, + "loss": 0.0141, + "step": 2521 + }, + { + "epoch": 0.5009434899195551, + "grad_norm": 0.47566862647539554, + "learning_rate": 5.228364895145203e-06, + "loss": 0.0162, + "step": 2522 + }, + { + "epoch": 0.5011421193763035, + "grad_norm": 0.9705471048491111, + "learning_rate": 5.225150679476212e-06, + "loss": 0.0165, + "step": 2523 + }, + { + "epoch": 0.5013407488330519, + "grad_norm": 0.5751231880679607, + "learning_rate": 5.22193637057255e-06, + "loss": 0.0138, + "step": 2524 + }, + { + "epoch": 0.5015393782898003, + "grad_norm": 0.40697823166729785, + "learning_rate": 5.218721969765259e-06, + "loss": 0.0055, + "step": 2525 + }, + { + "epoch": 0.5017380077465489, + "grad_norm": 0.4094426383199233, + "learning_rate": 5.21550747838542e-06, + "loss": 0.0122, + "step": 2526 + }, + { + "epoch": 0.5019366372032973, + "grad_norm": 0.4031192231606706, + "learning_rate": 5.21229289776415e-06, + "loss": 0.0092, + "step": 2527 + }, + { + "epoch": 0.5021352666600457, + "grad_norm": 0.35911635894078986, + "learning_rate": 5.209078229232603e-06, + "loss": 0.0068, + "step": 2528 + }, + { + "epoch": 0.5023338961167941, + "grad_norm": 0.7098839446706783, + "learning_rate": 5.205863474121972e-06, + "loss": 0.0152, + "step": 2529 + }, + { + "epoch": 0.5025325255735426, + "grad_norm": 0.3375041096332709, + "learning_rate": 5.2026486337634804e-06, + "loss": 0.0089, + "step": 2530 + }, + { + "epoch": 0.502731155030291, + "grad_norm": 0.41527798701142316, + "learning_rate": 5.199433709488394e-06, + "loss": 0.0141, + "step": 2531 + }, + { + "epoch": 0.5029297844870394, + "grad_norm": 0.39038459676259, + "learning_rate": 5.196218702628008e-06, + "loss": 0.0115, + "step": 2532 + }, + { + "epoch": 0.5031284139437878, + "grad_norm": 0.6528062096390664, + "learning_rate": 5.193003614513653e-06, + "loss": 0.0231, + "step": 2533 + }, + { + "epoch": 0.5033270434005362, + "grad_norm": 1.7917958287890257, + "learning_rate": 5.189788446476695e-06, + "loss": 0.0243, + "step": 2534 + }, + { + "epoch": 0.5035256728572848, + "grad_norm": 0.44912076622692515, + "learning_rate": 5.186573199848532e-06, + "loss": 0.0106, + "step": 2535 + }, + { + "epoch": 0.5037243023140332, + "grad_norm": 0.40776135095164723, + "learning_rate": 5.183357875960592e-06, + "loss": 0.0122, + "step": 2536 + }, + { + "epoch": 0.5039229317707816, + "grad_norm": 0.5810563884951965, + "learning_rate": 5.1801424761443404e-06, + "loss": 0.0164, + "step": 2537 + }, + { + "epoch": 0.50412156122753, + "grad_norm": 0.7568118365890487, + "learning_rate": 5.17692700173127e-06, + "loss": 0.0133, + "step": 2538 + }, + { + "epoch": 0.5043201906842785, + "grad_norm": 0.42687995689269276, + "learning_rate": 5.173711454052905e-06, + "loss": 0.0095, + "step": 2539 + }, + { + "epoch": 0.5045188201410269, + "grad_norm": 0.4374548051838743, + "learning_rate": 5.170495834440802e-06, + "loss": 0.0121, + "step": 2540 + }, + { + "epoch": 0.5047174495977753, + "grad_norm": 0.4622644074029037, + "learning_rate": 5.167280144226543e-06, + "loss": 0.0118, + "step": 2541 + }, + { + "epoch": 0.5049160790545237, + "grad_norm": 0.7577713982849161, + "learning_rate": 5.164064384741745e-06, + "loss": 0.0126, + "step": 2542 + }, + { + "epoch": 0.5051147085112723, + "grad_norm": 0.3726201408129837, + "learning_rate": 5.160848557318049e-06, + "loss": 0.0127, + "step": 2543 + }, + { + "epoch": 0.5053133379680207, + "grad_norm": 0.797286065504275, + "learning_rate": 5.157632663287126e-06, + "loss": 0.0126, + "step": 2544 + }, + { + "epoch": 0.5055119674247691, + "grad_norm": 0.4918743375069235, + "learning_rate": 5.1544167039806755e-06, + "loss": 0.0082, + "step": 2545 + }, + { + "epoch": 0.5057105968815175, + "grad_norm": 1.8250694853729397, + "learning_rate": 5.1512006807304235e-06, + "loss": 0.0155, + "step": 2546 + }, + { + "epoch": 0.505909226338266, + "grad_norm": 0.6131857955146129, + "learning_rate": 5.1479845948681185e-06, + "loss": 0.0169, + "step": 2547 + }, + { + "epoch": 0.5061078557950144, + "grad_norm": 0.48035054079034195, + "learning_rate": 5.144768447725544e-06, + "loss": 0.009, + "step": 2548 + }, + { + "epoch": 0.5063064852517628, + "grad_norm": 0.3214285229524216, + "learning_rate": 5.1415522406344976e-06, + "loss": 0.0101, + "step": 2549 + }, + { + "epoch": 0.5065051147085112, + "grad_norm": 0.7075802214664444, + "learning_rate": 5.1383359749268114e-06, + "loss": 0.0066, + "step": 2550 + }, + { + "epoch": 0.5067037441652598, + "grad_norm": 0.5521771461028792, + "learning_rate": 5.135119651934337e-06, + "loss": 0.0089, + "step": 2551 + }, + { + "epoch": 0.5069023736220082, + "grad_norm": 0.25160655768692114, + "learning_rate": 5.131903272988951e-06, + "loss": 0.0114, + "step": 2552 + }, + { + "epoch": 0.5071010030787566, + "grad_norm": 0.49214589306265416, + "learning_rate": 5.128686839422548e-06, + "loss": 0.0139, + "step": 2553 + }, + { + "epoch": 0.507299632535505, + "grad_norm": 0.3311557204020224, + "learning_rate": 5.125470352567057e-06, + "loss": 0.0085, + "step": 2554 + }, + { + "epoch": 0.5074982619922535, + "grad_norm": 0.5102290741270068, + "learning_rate": 5.122253813754418e-06, + "loss": 0.0135, + "step": 2555 + }, + { + "epoch": 0.5076968914490019, + "grad_norm": 0.47249684319774726, + "learning_rate": 5.119037224316596e-06, + "loss": 0.0142, + "step": 2556 + }, + { + "epoch": 0.5078955209057503, + "grad_norm": 0.605698552614918, + "learning_rate": 5.115820585585579e-06, + "loss": 0.0113, + "step": 2557 + }, + { + "epoch": 0.5080941503624987, + "grad_norm": 0.6910282577333509, + "learning_rate": 5.1126038988933745e-06, + "loss": 0.0098, + "step": 2558 + }, + { + "epoch": 0.5082927798192471, + "grad_norm": 0.3860296102305923, + "learning_rate": 5.109387165572007e-06, + "loss": 0.0149, + "step": 2559 + }, + { + "epoch": 0.5084914092759957, + "grad_norm": 0.47180358188811444, + "learning_rate": 5.106170386953524e-06, + "loss": 0.0099, + "step": 2560 + }, + { + "epoch": 0.5086900387327441, + "grad_norm": 0.30576774774595783, + "learning_rate": 5.10295356436999e-06, + "loss": 0.0077, + "step": 2561 + }, + { + "epoch": 0.5088886681894925, + "grad_norm": 0.30760305826870893, + "learning_rate": 5.099736699153489e-06, + "loss": 0.0076, + "step": 2562 + }, + { + "epoch": 0.5090872976462409, + "grad_norm": 0.3838863165764435, + "learning_rate": 5.09651979263612e-06, + "loss": 0.0117, + "step": 2563 + }, + { + "epoch": 0.5092859271029894, + "grad_norm": 0.7031651252049763, + "learning_rate": 5.093302846150003e-06, + "loss": 0.0124, + "step": 2564 + }, + { + "epoch": 0.5094845565597378, + "grad_norm": 0.2976625692750354, + "learning_rate": 5.0900858610272686e-06, + "loss": 0.0081, + "step": 2565 + }, + { + "epoch": 0.5096831860164862, + "grad_norm": 0.7678894575982894, + "learning_rate": 5.0868688386000705e-06, + "loss": 0.0082, + "step": 2566 + }, + { + "epoch": 0.5098818154732346, + "grad_norm": 1.202459097306564, + "learning_rate": 5.083651780200573e-06, + "loss": 0.0143, + "step": 2567 + }, + { + "epoch": 0.5100804449299832, + "grad_norm": 0.7918825373152566, + "learning_rate": 5.080434687160958e-06, + "loss": 0.0194, + "step": 2568 + }, + { + "epoch": 0.5102790743867316, + "grad_norm": 0.35999767545572886, + "learning_rate": 5.077217560813419e-06, + "loss": 0.0131, + "step": 2569 + }, + { + "epoch": 0.51047770384348, + "grad_norm": 0.6835356858570736, + "learning_rate": 5.074000402490166e-06, + "loss": 0.0125, + "step": 2570 + }, + { + "epoch": 0.5106763333002284, + "grad_norm": 0.8376402761684941, + "learning_rate": 5.0707832135234196e-06, + "loss": 0.0111, + "step": 2571 + }, + { + "epoch": 0.5108749627569769, + "grad_norm": 0.4974372233299198, + "learning_rate": 5.067565995245413e-06, + "loss": 0.0094, + "step": 2572 + }, + { + "epoch": 0.5110735922137253, + "grad_norm": 0.661553749690577, + "learning_rate": 5.0643487489883995e-06, + "loss": 0.0107, + "step": 2573 + }, + { + "epoch": 0.5112722216704737, + "grad_norm": 0.6776284432083003, + "learning_rate": 5.061131476084632e-06, + "loss": 0.0138, + "step": 2574 + }, + { + "epoch": 0.5114708511272221, + "grad_norm": 0.6037483195136601, + "learning_rate": 5.057914177866381e-06, + "loss": 0.0122, + "step": 2575 + }, + { + "epoch": 0.5116694805839705, + "grad_norm": 0.5131467239719532, + "learning_rate": 5.054696855665928e-06, + "loss": 0.0103, + "step": 2576 + }, + { + "epoch": 0.5118681100407191, + "grad_norm": 0.43566722341066894, + "learning_rate": 5.051479510815561e-06, + "loss": 0.0098, + "step": 2577 + }, + { + "epoch": 0.5120667394974675, + "grad_norm": 0.45003515001134503, + "learning_rate": 5.04826214464758e-06, + "loss": 0.0122, + "step": 2578 + }, + { + "epoch": 0.5122653689542159, + "grad_norm": 0.750722907248036, + "learning_rate": 5.0450447584942945e-06, + "loss": 0.0112, + "step": 2579 + }, + { + "epoch": 0.5124639984109643, + "grad_norm": 0.6004993496158042, + "learning_rate": 5.041827353688018e-06, + "loss": 0.0185, + "step": 2580 + }, + { + "epoch": 0.5126626278677128, + "grad_norm": 3.1872299490121687, + "learning_rate": 5.038609931561077e-06, + "loss": 0.0182, + "step": 2581 + }, + { + "epoch": 0.5128612573244612, + "grad_norm": 0.57628867563776, + "learning_rate": 5.035392493445802e-06, + "loss": 0.0133, + "step": 2582 + }, + { + "epoch": 0.5130598867812096, + "grad_norm": 0.4439739183814535, + "learning_rate": 5.03217504067453e-06, + "loss": 0.0083, + "step": 2583 + }, + { + "epoch": 0.513258516237958, + "grad_norm": 0.29526566381496966, + "learning_rate": 5.028957574579607e-06, + "loss": 0.0122, + "step": 2584 + }, + { + "epoch": 0.5134571456947066, + "grad_norm": 0.5866932164892713, + "learning_rate": 5.02574009649338e-06, + "loss": 0.0173, + "step": 2585 + }, + { + "epoch": 0.513655775151455, + "grad_norm": 0.6315522926249754, + "learning_rate": 5.0225226077482055e-06, + "loss": 0.0196, + "step": 2586 + }, + { + "epoch": 0.5138544046082034, + "grad_norm": 0.430607086857732, + "learning_rate": 5.019305109676443e-06, + "loss": 0.0118, + "step": 2587 + }, + { + "epoch": 0.5140530340649518, + "grad_norm": 0.6939527777771256, + "learning_rate": 5.016087603610454e-06, + "loss": 0.013, + "step": 2588 + }, + { + "epoch": 0.5142516635217003, + "grad_norm": 0.43910241298197344, + "learning_rate": 5.012870090882604e-06, + "loss": 0.0118, + "step": 2589 + }, + { + "epoch": 0.5144502929784487, + "grad_norm": 0.49333214750546195, + "learning_rate": 5.00965257282526e-06, + "loss": 0.0104, + "step": 2590 + }, + { + "epoch": 0.5146489224351971, + "grad_norm": 0.800628768602885, + "learning_rate": 5.006435050770797e-06, + "loss": 0.0112, + "step": 2591 + }, + { + "epoch": 0.5148475518919455, + "grad_norm": 0.5851511459639136, + "learning_rate": 5.003217526051586e-06, + "loss": 0.012, + "step": 2592 + }, + { + "epoch": 0.5150461813486941, + "grad_norm": 0.47041917943176503, + "learning_rate": 5e-06, + "loss": 0.0168, + "step": 2593 + }, + { + "epoch": 0.5152448108054425, + "grad_norm": 0.47959696982675354, + "learning_rate": 4.996782473948416e-06, + "loss": 0.0132, + "step": 2594 + }, + { + "epoch": 0.5154434402621909, + "grad_norm": 0.47813958473253837, + "learning_rate": 4.993564949229204e-06, + "loss": 0.0108, + "step": 2595 + }, + { + "epoch": 0.5156420697189393, + "grad_norm": 0.3852993385433787, + "learning_rate": 4.990347427174742e-06, + "loss": 0.0089, + "step": 2596 + }, + { + "epoch": 0.5158406991756878, + "grad_norm": 0.6016579451534518, + "learning_rate": 4.987129909117398e-06, + "loss": 0.0175, + "step": 2597 + }, + { + "epoch": 0.5160393286324362, + "grad_norm": 0.45074106282002463, + "learning_rate": 4.983912396389547e-06, + "loss": 0.0094, + "step": 2598 + }, + { + "epoch": 0.5162379580891846, + "grad_norm": 0.7101086980066497, + "learning_rate": 4.980694890323558e-06, + "loss": 0.0171, + "step": 2599 + }, + { + "epoch": 0.516436587545933, + "grad_norm": 0.4153502818151871, + "learning_rate": 4.977477392251794e-06, + "loss": 0.0176, + "step": 2600 + }, + { + "epoch": 0.5166352170026814, + "grad_norm": 0.9128192784068005, + "learning_rate": 4.97425990350662e-06, + "loss": 0.0119, + "step": 2601 + }, + { + "epoch": 0.51683384645943, + "grad_norm": 0.532406897997197, + "learning_rate": 4.971042425420394e-06, + "loss": 0.0098, + "step": 2602 + }, + { + "epoch": 0.5170324759161784, + "grad_norm": 0.6432723125325949, + "learning_rate": 4.96782495932547e-06, + "loss": 0.0125, + "step": 2603 + }, + { + "epoch": 0.5172311053729268, + "grad_norm": 0.4441861132175126, + "learning_rate": 4.964607506554199e-06, + "loss": 0.0088, + "step": 2604 + }, + { + "epoch": 0.5174297348296752, + "grad_norm": 0.6071744574706942, + "learning_rate": 4.961390068438926e-06, + "loss": 0.0125, + "step": 2605 + }, + { + "epoch": 0.5176283642864237, + "grad_norm": 0.5144396277013178, + "learning_rate": 4.958172646311985e-06, + "loss": 0.0108, + "step": 2606 + }, + { + "epoch": 0.5178269937431721, + "grad_norm": 0.27271582525812527, + "learning_rate": 4.954955241505709e-06, + "loss": 0.0057, + "step": 2607 + }, + { + "epoch": 0.5180256231999205, + "grad_norm": 0.41602838022157057, + "learning_rate": 4.951737855352422e-06, + "loss": 0.014, + "step": 2608 + }, + { + "epoch": 0.5182242526566689, + "grad_norm": 0.8548531908521733, + "learning_rate": 4.948520489184441e-06, + "loss": 0.0192, + "step": 2609 + }, + { + "epoch": 0.5184228821134175, + "grad_norm": 0.7707419054286566, + "learning_rate": 4.945303144334074e-06, + "loss": 0.0156, + "step": 2610 + }, + { + "epoch": 0.5186215115701659, + "grad_norm": 0.9002762306327531, + "learning_rate": 4.94208582213362e-06, + "loss": 0.0109, + "step": 2611 + }, + { + "epoch": 0.5188201410269143, + "grad_norm": 0.5969255787289416, + "learning_rate": 4.9388685239153696e-06, + "loss": 0.013, + "step": 2612 + }, + { + "epoch": 0.5190187704836627, + "grad_norm": 1.321061946411639, + "learning_rate": 4.935651251011602e-06, + "loss": 0.0265, + "step": 2613 + }, + { + "epoch": 0.5192173999404112, + "grad_norm": 0.9544926651703668, + "learning_rate": 4.9324340047545874e-06, + "loss": 0.0232, + "step": 2614 + }, + { + "epoch": 0.5194160293971596, + "grad_norm": 0.8724356174787311, + "learning_rate": 4.929216786476582e-06, + "loss": 0.0172, + "step": 2615 + }, + { + "epoch": 0.519614658853908, + "grad_norm": 0.5481496452758235, + "learning_rate": 4.925999597509836e-06, + "loss": 0.0105, + "step": 2616 + }, + { + "epoch": 0.5198132883106564, + "grad_norm": 0.5924502586289119, + "learning_rate": 4.922782439186583e-06, + "loss": 0.0126, + "step": 2617 + }, + { + "epoch": 0.5200119177674049, + "grad_norm": 1.097058049346646, + "learning_rate": 4.9195653128390436e-06, + "loss": 0.0194, + "step": 2618 + }, + { + "epoch": 0.5202105472241534, + "grad_norm": 0.34572029206305155, + "learning_rate": 4.9163482197994275e-06, + "loss": 0.0114, + "step": 2619 + }, + { + "epoch": 0.5204091766809018, + "grad_norm": 0.6478854101066456, + "learning_rate": 4.913131161399931e-06, + "loss": 0.0124, + "step": 2620 + }, + { + "epoch": 0.5206078061376502, + "grad_norm": 0.8509089389165168, + "learning_rate": 4.909914138972733e-06, + "loss": 0.0152, + "step": 2621 + }, + { + "epoch": 0.5208064355943987, + "grad_norm": 0.43045550772154256, + "learning_rate": 4.9066971538499985e-06, + "loss": 0.0058, + "step": 2622 + }, + { + "epoch": 0.5210050650511471, + "grad_norm": 0.2811848823106481, + "learning_rate": 4.903480207363881e-06, + "loss": 0.0078, + "step": 2623 + }, + { + "epoch": 0.5212036945078955, + "grad_norm": 0.17886282113428878, + "learning_rate": 4.900263300846512e-06, + "loss": 0.0045, + "step": 2624 + }, + { + "epoch": 0.5214023239646439, + "grad_norm": 1.0905060510323827, + "learning_rate": 4.89704643563001e-06, + "loss": 0.021, + "step": 2625 + }, + { + "epoch": 0.5216009534213923, + "grad_norm": 0.52942449810658, + "learning_rate": 4.893829613046476e-06, + "loss": 0.0105, + "step": 2626 + }, + { + "epoch": 0.5217995828781409, + "grad_norm": 0.38872032429287773, + "learning_rate": 4.890612834427994e-06, + "loss": 0.0119, + "step": 2627 + }, + { + "epoch": 0.5219982123348893, + "grad_norm": 0.4144058112551697, + "learning_rate": 4.887396101106627e-06, + "loss": 0.0122, + "step": 2628 + }, + { + "epoch": 0.5221968417916377, + "grad_norm": 0.7047127790035546, + "learning_rate": 4.884179414414423e-06, + "loss": 0.013, + "step": 2629 + }, + { + "epoch": 0.5223954712483861, + "grad_norm": 0.41046615550881144, + "learning_rate": 4.880962775683406e-06, + "loss": 0.0143, + "step": 2630 + }, + { + "epoch": 0.5225941007051346, + "grad_norm": 0.2756154625309793, + "learning_rate": 4.877746186245586e-06, + "loss": 0.0114, + "step": 2631 + }, + { + "epoch": 0.522792730161883, + "grad_norm": 0.8783246622943536, + "learning_rate": 4.874529647432946e-06, + "loss": 0.0111, + "step": 2632 + }, + { + "epoch": 0.5229913596186314, + "grad_norm": 0.35351879117875357, + "learning_rate": 4.871313160577453e-06, + "loss": 0.0114, + "step": 2633 + }, + { + "epoch": 0.5231899890753798, + "grad_norm": 0.3497304664011684, + "learning_rate": 4.868096727011052e-06, + "loss": 0.0117, + "step": 2634 + }, + { + "epoch": 0.5233886185321284, + "grad_norm": 0.38460261605226964, + "learning_rate": 4.864880348065665e-06, + "loss": 0.0099, + "step": 2635 + }, + { + "epoch": 0.5235872479888768, + "grad_norm": 0.34385257857225565, + "learning_rate": 4.86166402507319e-06, + "loss": 0.0131, + "step": 2636 + }, + { + "epoch": 0.5237858774456252, + "grad_norm": 0.7154889328263488, + "learning_rate": 4.858447759365504e-06, + "loss": 0.0185, + "step": 2637 + }, + { + "epoch": 0.5239845069023736, + "grad_norm": 0.45589891363247864, + "learning_rate": 4.855231552274459e-06, + "loss": 0.0089, + "step": 2638 + }, + { + "epoch": 0.5241831363591221, + "grad_norm": 0.3846830282655784, + "learning_rate": 4.852015405131882e-06, + "loss": 0.018, + "step": 2639 + }, + { + "epoch": 0.5243817658158705, + "grad_norm": 0.2940469069583859, + "learning_rate": 4.848799319269578e-06, + "loss": 0.0064, + "step": 2640 + }, + { + "epoch": 0.5245803952726189, + "grad_norm": 0.2149661207674454, + "learning_rate": 4.845583296019325e-06, + "loss": 0.0061, + "step": 2641 + }, + { + "epoch": 0.5247790247293673, + "grad_norm": 0.3446058201626362, + "learning_rate": 4.8423673367128745e-06, + "loss": 0.0072, + "step": 2642 + }, + { + "epoch": 0.5249776541861157, + "grad_norm": 0.9384941741122217, + "learning_rate": 4.839151442681952e-06, + "loss": 0.014, + "step": 2643 + }, + { + "epoch": 0.5251762836428643, + "grad_norm": 0.7648356533107046, + "learning_rate": 4.835935615258257e-06, + "loss": 0.0112, + "step": 2644 + }, + { + "epoch": 0.5253749130996127, + "grad_norm": 0.5947850726160407, + "learning_rate": 4.832719855773458e-06, + "loss": 0.0237, + "step": 2645 + }, + { + "epoch": 0.5255735425563611, + "grad_norm": 0.5069947415968155, + "learning_rate": 4.8295041655592e-06, + "loss": 0.019, + "step": 2646 + }, + { + "epoch": 0.5257721720131096, + "grad_norm": 0.530771872211901, + "learning_rate": 4.826288545947095e-06, + "loss": 0.0167, + "step": 2647 + }, + { + "epoch": 0.525970801469858, + "grad_norm": 0.7343719370370166, + "learning_rate": 4.82307299826873e-06, + "loss": 0.0153, + "step": 2648 + }, + { + "epoch": 0.5261694309266064, + "grad_norm": 0.43053313989705466, + "learning_rate": 4.81985752385566e-06, + "loss": 0.0113, + "step": 2649 + }, + { + "epoch": 0.5263680603833548, + "grad_norm": 1.0486479094087207, + "learning_rate": 4.816642124039408e-06, + "loss": 0.0225, + "step": 2650 + }, + { + "epoch": 0.5265666898401032, + "grad_norm": 1.204001697819146, + "learning_rate": 4.813426800151469e-06, + "loss": 0.0119, + "step": 2651 + }, + { + "epoch": 0.5267653192968518, + "grad_norm": 0.38383932212493976, + "learning_rate": 4.8102115535233054e-06, + "loss": 0.0108, + "step": 2652 + }, + { + "epoch": 0.5269639487536002, + "grad_norm": 0.7521314041717428, + "learning_rate": 4.806996385486349e-06, + "loss": 0.0115, + "step": 2653 + }, + { + "epoch": 0.5271625782103486, + "grad_norm": 0.35629755731123935, + "learning_rate": 4.803781297371995e-06, + "loss": 0.0073, + "step": 2654 + }, + { + "epoch": 0.527361207667097, + "grad_norm": 0.4526855571654173, + "learning_rate": 4.8005662905116085e-06, + "loss": 0.0092, + "step": 2655 + }, + { + "epoch": 0.5275598371238455, + "grad_norm": 0.3182080909457293, + "learning_rate": 4.797351366236522e-06, + "loss": 0.0132, + "step": 2656 + }, + { + "epoch": 0.5277584665805939, + "grad_norm": 0.6049331198936719, + "learning_rate": 4.794136525878032e-06, + "loss": 0.016, + "step": 2657 + }, + { + "epoch": 0.5279570960373423, + "grad_norm": 0.4754537156663623, + "learning_rate": 4.7909217707673984e-06, + "loss": 0.0168, + "step": 2658 + }, + { + "epoch": 0.5281557254940907, + "grad_norm": 0.998499211825424, + "learning_rate": 4.787707102235852e-06, + "loss": 0.0176, + "step": 2659 + }, + { + "epoch": 0.5283543549508392, + "grad_norm": 0.46739699878852936, + "learning_rate": 4.784492521614582e-06, + "loss": 0.01, + "step": 2660 + }, + { + "epoch": 0.5285529844075877, + "grad_norm": 0.4320350647544851, + "learning_rate": 4.781278030234742e-06, + "loss": 0.0078, + "step": 2661 + }, + { + "epoch": 0.5287516138643361, + "grad_norm": 0.6016783731374997, + "learning_rate": 4.778063629427451e-06, + "loss": 0.0127, + "step": 2662 + }, + { + "epoch": 0.5289502433210845, + "grad_norm": 0.49313103783452733, + "learning_rate": 4.77484932052379e-06, + "loss": 0.0146, + "step": 2663 + }, + { + "epoch": 0.529148872777833, + "grad_norm": 0.40985189124421423, + "learning_rate": 4.771635104854799e-06, + "loss": 0.0138, + "step": 2664 + }, + { + "epoch": 0.5293475022345814, + "grad_norm": 0.3135781617489447, + "learning_rate": 4.768420983751481e-06, + "loss": 0.0105, + "step": 2665 + }, + { + "epoch": 0.5295461316913298, + "grad_norm": 0.41689021986146035, + "learning_rate": 4.765206958544803e-06, + "loss": 0.0154, + "step": 2666 + }, + { + "epoch": 0.5297447611480782, + "grad_norm": 0.2399269526721442, + "learning_rate": 4.761993030565688e-06, + "loss": 0.0076, + "step": 2667 + }, + { + "epoch": 0.5299433906048266, + "grad_norm": 0.49800855497998864, + "learning_rate": 4.758779201145022e-06, + "loss": 0.02, + "step": 2668 + }, + { + "epoch": 0.5301420200615752, + "grad_norm": 0.3540794930683414, + "learning_rate": 4.755565471613646e-06, + "loss": 0.0074, + "step": 2669 + }, + { + "epoch": 0.5303406495183236, + "grad_norm": 0.37327188769326325, + "learning_rate": 4.752351843302364e-06, + "loss": 0.0093, + "step": 2670 + }, + { + "epoch": 0.530539278975072, + "grad_norm": 0.6218524242302055, + "learning_rate": 4.749138317541936e-06, + "loss": 0.0114, + "step": 2671 + }, + { + "epoch": 0.5307379084318204, + "grad_norm": 0.47629545880771196, + "learning_rate": 4.745924895663078e-06, + "loss": 0.0095, + "step": 2672 + }, + { + "epoch": 0.5309365378885689, + "grad_norm": 0.41948688036972176, + "learning_rate": 4.742711578996467e-06, + "loss": 0.0128, + "step": 2673 + }, + { + "epoch": 0.5311351673453173, + "grad_norm": 0.26707306675736925, + "learning_rate": 4.739498368872734e-06, + "loss": 0.0114, + "step": 2674 + }, + { + "epoch": 0.5313337968020657, + "grad_norm": 0.4462577977563272, + "learning_rate": 4.7362852666224654e-06, + "loss": 0.022, + "step": 2675 + }, + { + "epoch": 0.5315324262588141, + "grad_norm": 0.42752531113748565, + "learning_rate": 4.733072273576204e-06, + "loss": 0.0128, + "step": 2676 + }, + { + "epoch": 0.5317310557155627, + "grad_norm": 0.1988579553958363, + "learning_rate": 4.729859391064447e-06, + "loss": 0.0066, + "step": 2677 + }, + { + "epoch": 0.5319296851723111, + "grad_norm": 0.3623897142655911, + "learning_rate": 4.726646620417646e-06, + "loss": 0.0082, + "step": 2678 + }, + { + "epoch": 0.5321283146290595, + "grad_norm": 0.5471633235503327, + "learning_rate": 4.723433962966204e-06, + "loss": 0.0099, + "step": 2679 + }, + { + "epoch": 0.5323269440858079, + "grad_norm": 0.30009321328868405, + "learning_rate": 4.720221420040483e-06, + "loss": 0.0084, + "step": 2680 + }, + { + "epoch": 0.5325255735425564, + "grad_norm": 0.36399418358138586, + "learning_rate": 4.71700899297079e-06, + "loss": 0.0081, + "step": 2681 + }, + { + "epoch": 0.5327242029993048, + "grad_norm": 0.4970147699892617, + "learning_rate": 4.7137966830873905e-06, + "loss": 0.0113, + "step": 2682 + }, + { + "epoch": 0.5329228324560532, + "grad_norm": 0.5953544717285123, + "learning_rate": 4.710584491720496e-06, + "loss": 0.0182, + "step": 2683 + }, + { + "epoch": 0.5331214619128016, + "grad_norm": 0.4802266373130576, + "learning_rate": 4.707372420200275e-06, + "loss": 0.0124, + "step": 2684 + }, + { + "epoch": 0.53332009136955, + "grad_norm": 0.6689446433586343, + "learning_rate": 4.7041604698568436e-06, + "loss": 0.0196, + "step": 2685 + }, + { + "epoch": 0.5335187208262986, + "grad_norm": 0.5247125265711512, + "learning_rate": 4.700948642020263e-06, + "loss": 0.0108, + "step": 2686 + }, + { + "epoch": 0.533717350283047, + "grad_norm": 0.4728274310812671, + "learning_rate": 4.6977369380205514e-06, + "loss": 0.0123, + "step": 2687 + }, + { + "epoch": 0.5339159797397954, + "grad_norm": 0.60501199620917, + "learning_rate": 4.694525359187671e-06, + "loss": 0.0071, + "step": 2688 + }, + { + "epoch": 0.5341146091965439, + "grad_norm": 0.36671802535764086, + "learning_rate": 4.691313906851534e-06, + "loss": 0.0119, + "step": 2689 + }, + { + "epoch": 0.5343132386532923, + "grad_norm": 1.0242303491155098, + "learning_rate": 4.688102582341997e-06, + "loss": 0.0172, + "step": 2690 + }, + { + "epoch": 0.5345118681100407, + "grad_norm": 0.7058136287291272, + "learning_rate": 4.684891386988869e-06, + "loss": 0.014, + "step": 2691 + }, + { + "epoch": 0.5347104975667891, + "grad_norm": 1.1395881555357235, + "learning_rate": 4.681680322121903e-06, + "loss": 0.0197, + "step": 2692 + }, + { + "epoch": 0.5349091270235375, + "grad_norm": 0.8146146171242227, + "learning_rate": 4.678469389070797e-06, + "loss": 0.0135, + "step": 2693 + }, + { + "epoch": 0.5351077564802861, + "grad_norm": 0.4681663478402444, + "learning_rate": 4.675258589165194e-06, + "loss": 0.0061, + "step": 2694 + }, + { + "epoch": 0.5353063859370345, + "grad_norm": 0.7888374699099346, + "learning_rate": 4.672047923734685e-06, + "loss": 0.0126, + "step": 2695 + }, + { + "epoch": 0.5355050153937829, + "grad_norm": 0.26698990733974676, + "learning_rate": 4.668837394108801e-06, + "loss": 0.0081, + "step": 2696 + }, + { + "epoch": 0.5357036448505313, + "grad_norm": 0.6481355354112126, + "learning_rate": 4.665627001617021e-06, + "loss": 0.0106, + "step": 2697 + }, + { + "epoch": 0.5359022743072798, + "grad_norm": 0.9141985443318152, + "learning_rate": 4.662416747588765e-06, + "loss": 0.019, + "step": 2698 + }, + { + "epoch": 0.5361009037640282, + "grad_norm": 0.345436456845145, + "learning_rate": 4.6592066333533966e-06, + "loss": 0.0101, + "step": 2699 + }, + { + "epoch": 0.5362995332207766, + "grad_norm": 0.5340280696377245, + "learning_rate": 4.6559966602402195e-06, + "loss": 0.0132, + "step": 2700 + }, + { + "epoch": 0.536498162677525, + "grad_norm": 0.495489992727139, + "learning_rate": 4.652786829578482e-06, + "loss": 0.0108, + "step": 2701 + }, + { + "epoch": 0.5366967921342735, + "grad_norm": 0.48163074793680033, + "learning_rate": 4.649577142697369e-06, + "loss": 0.0089, + "step": 2702 + }, + { + "epoch": 0.536895421591022, + "grad_norm": 0.6202192336463302, + "learning_rate": 4.6463676009260115e-06, + "loss": 0.0128, + "step": 2703 + }, + { + "epoch": 0.5370940510477704, + "grad_norm": 0.8438747669064174, + "learning_rate": 4.643158205593475e-06, + "loss": 0.0207, + "step": 2704 + }, + { + "epoch": 0.5372926805045188, + "grad_norm": 0.5444604148537907, + "learning_rate": 4.639948958028769e-06, + "loss": 0.011, + "step": 2705 + }, + { + "epoch": 0.5374913099612673, + "grad_norm": 0.40458432983482345, + "learning_rate": 4.636739859560839e-06, + "loss": 0.0137, + "step": 2706 + }, + { + "epoch": 0.5376899394180157, + "grad_norm": 0.38258143370225456, + "learning_rate": 4.633530911518569e-06, + "loss": 0.0121, + "step": 2707 + }, + { + "epoch": 0.5378885688747641, + "grad_norm": 0.3338515448391374, + "learning_rate": 4.63032211523078e-06, + "loss": 0.0081, + "step": 2708 + }, + { + "epoch": 0.5380871983315125, + "grad_norm": 0.2721756551155944, + "learning_rate": 4.627113472026235e-06, + "loss": 0.0114, + "step": 2709 + }, + { + "epoch": 0.538285827788261, + "grad_norm": 0.8199742148753297, + "learning_rate": 4.623904983233628e-06, + "loss": 0.0194, + "step": 2710 + }, + { + "epoch": 0.5384844572450095, + "grad_norm": 0.6501960596074884, + "learning_rate": 4.6206966501815895e-06, + "loss": 0.0105, + "step": 2711 + }, + { + "epoch": 0.5386830867017579, + "grad_norm": 0.3113077005159778, + "learning_rate": 4.617488474198689e-06, + "loss": 0.0066, + "step": 2712 + }, + { + "epoch": 0.5388817161585063, + "grad_norm": 0.46285719527025915, + "learning_rate": 4.614280456613428e-06, + "loss": 0.015, + "step": 2713 + }, + { + "epoch": 0.5390803456152548, + "grad_norm": 0.9043967381501636, + "learning_rate": 4.6110725987542436e-06, + "loss": 0.0161, + "step": 2714 + }, + { + "epoch": 0.5392789750720032, + "grad_norm": 0.6074770026179739, + "learning_rate": 4.607864901949506e-06, + "loss": 0.0085, + "step": 2715 + }, + { + "epoch": 0.5394776045287516, + "grad_norm": 0.5580886715434685, + "learning_rate": 4.6046573675275204e-06, + "loss": 0.0124, + "step": 2716 + }, + { + "epoch": 0.5396762339855, + "grad_norm": 0.4824079804144595, + "learning_rate": 4.601449996816524e-06, + "loss": 0.0099, + "step": 2717 + }, + { + "epoch": 0.5398748634422484, + "grad_norm": 0.48189458946683134, + "learning_rate": 4.598242791144684e-06, + "loss": 0.0126, + "step": 2718 + }, + { + "epoch": 0.540073492898997, + "grad_norm": 1.2183320489020404, + "learning_rate": 4.5950357518401015e-06, + "loss": 0.0142, + "step": 2719 + }, + { + "epoch": 0.5402721223557454, + "grad_norm": 0.30529970276531804, + "learning_rate": 4.591828880230809e-06, + "loss": 0.0101, + "step": 2720 + }, + { + "epoch": 0.5404707518124938, + "grad_norm": 0.23725652114441975, + "learning_rate": 4.588622177644769e-06, + "loss": 0.0052, + "step": 2721 + }, + { + "epoch": 0.5406693812692422, + "grad_norm": 0.49526465559783595, + "learning_rate": 4.585415645409872e-06, + "loss": 0.0101, + "step": 2722 + }, + { + "epoch": 0.5408680107259907, + "grad_norm": 0.629120686573954, + "learning_rate": 4.5822092848539415e-06, + "loss": 0.0178, + "step": 2723 + }, + { + "epoch": 0.5410666401827391, + "grad_norm": 0.6034085054382206, + "learning_rate": 4.579003097304728e-06, + "loss": 0.0152, + "step": 2724 + }, + { + "epoch": 0.5412652696394875, + "grad_norm": 0.32942180161724255, + "learning_rate": 4.575797084089912e-06, + "loss": 0.0093, + "step": 2725 + }, + { + "epoch": 0.5414638990962359, + "grad_norm": 0.42192709837470616, + "learning_rate": 4.572591246537097e-06, + "loss": 0.0112, + "step": 2726 + }, + { + "epoch": 0.5416625285529844, + "grad_norm": 0.6217230182883882, + "learning_rate": 4.569385585973818e-06, + "loss": 0.01, + "step": 2727 + }, + { + "epoch": 0.5418611580097329, + "grad_norm": 0.6470622146574528, + "learning_rate": 4.566180103727538e-06, + "loss": 0.0172, + "step": 2728 + }, + { + "epoch": 0.5420597874664813, + "grad_norm": 0.15106597972191477, + "learning_rate": 4.562974801125642e-06, + "loss": 0.0053, + "step": 2729 + }, + { + "epoch": 0.5422584169232297, + "grad_norm": 0.6869662187626556, + "learning_rate": 4.559769679495443e-06, + "loss": 0.0138, + "step": 2730 + }, + { + "epoch": 0.5424570463799782, + "grad_norm": 0.34343413551532936, + "learning_rate": 4.55656474016418e-06, + "loss": 0.0077, + "step": 2731 + }, + { + "epoch": 0.5426556758367266, + "grad_norm": 0.6162031016897686, + "learning_rate": 4.553359984459012e-06, + "loss": 0.0146, + "step": 2732 + }, + { + "epoch": 0.542854305293475, + "grad_norm": 0.33775023474661575, + "learning_rate": 4.550155413707028e-06, + "loss": 0.0112, + "step": 2733 + }, + { + "epoch": 0.5430529347502234, + "grad_norm": 0.5286312992072117, + "learning_rate": 4.546951029235237e-06, + "loss": 0.0089, + "step": 2734 + }, + { + "epoch": 0.5432515642069718, + "grad_norm": 0.44205639734611607, + "learning_rate": 4.543746832370572e-06, + "loss": 0.0101, + "step": 2735 + }, + { + "epoch": 0.5434501936637204, + "grad_norm": 0.7046118783607311, + "learning_rate": 4.540542824439885e-06, + "loss": 0.0133, + "step": 2736 + }, + { + "epoch": 0.5436488231204688, + "grad_norm": 0.2920099315721433, + "learning_rate": 4.5373390067699555e-06, + "loss": 0.0075, + "step": 2737 + }, + { + "epoch": 0.5438474525772172, + "grad_norm": 0.40661921340738155, + "learning_rate": 4.53413538068748e-06, + "loss": 0.0118, + "step": 2738 + }, + { + "epoch": 0.5440460820339657, + "grad_norm": 0.7267749331431869, + "learning_rate": 4.530931947519076e-06, + "loss": 0.0148, + "step": 2739 + }, + { + "epoch": 0.5442447114907141, + "grad_norm": 0.3581174389427978, + "learning_rate": 4.527728708591283e-06, + "loss": 0.0107, + "step": 2740 + }, + { + "epoch": 0.5444433409474625, + "grad_norm": 0.41211789962071976, + "learning_rate": 4.524525665230559e-06, + "loss": 0.0129, + "step": 2741 + }, + { + "epoch": 0.5446419704042109, + "grad_norm": 0.9758329205888799, + "learning_rate": 4.521322818763281e-06, + "loss": 0.0179, + "step": 2742 + }, + { + "epoch": 0.5448405998609593, + "grad_norm": 0.8121007154836791, + "learning_rate": 4.518120170515744e-06, + "loss": 0.018, + "step": 2743 + }, + { + "epoch": 0.5450392293177078, + "grad_norm": 0.2556833343570392, + "learning_rate": 4.514917721814163e-06, + "loss": 0.0062, + "step": 2744 + }, + { + "epoch": 0.5452378587744563, + "grad_norm": 0.49149727060895204, + "learning_rate": 4.5117154739846665e-06, + "loss": 0.0176, + "step": 2745 + }, + { + "epoch": 0.5454364882312047, + "grad_norm": 1.0477647176321618, + "learning_rate": 4.5085134283533035e-06, + "loss": 0.0168, + "step": 2746 + }, + { + "epoch": 0.5456351176879531, + "grad_norm": 0.3199467470120506, + "learning_rate": 4.505311586246037e-06, + "loss": 0.0087, + "step": 2747 + }, + { + "epoch": 0.5458337471447016, + "grad_norm": 0.4189748938840971, + "learning_rate": 4.502109948988748e-06, + "loss": 0.0122, + "step": 2748 + }, + { + "epoch": 0.54603237660145, + "grad_norm": 0.43642675146362775, + "learning_rate": 4.498908517907232e-06, + "loss": 0.0065, + "step": 2749 + }, + { + "epoch": 0.5462310060581984, + "grad_norm": 1.5799130468680098, + "learning_rate": 4.4957072943271965e-06, + "loss": 0.0129, + "step": 2750 + }, + { + "epoch": 0.5464296355149468, + "grad_norm": 0.3761584834352775, + "learning_rate": 4.492506279574262e-06, + "loss": 0.0124, + "step": 2751 + }, + { + "epoch": 0.5466282649716953, + "grad_norm": 0.7976727590944505, + "learning_rate": 4.4893054749739715e-06, + "loss": 0.0131, + "step": 2752 + }, + { + "epoch": 0.5468268944284438, + "grad_norm": 1.038660703724904, + "learning_rate": 4.4861048818517725e-06, + "loss": 0.0164, + "step": 2753 + }, + { + "epoch": 0.5470255238851922, + "grad_norm": 0.8037713236955946, + "learning_rate": 4.482904501533027e-06, + "loss": 0.0148, + "step": 2754 + }, + { + "epoch": 0.5472241533419406, + "grad_norm": 0.3649529151254279, + "learning_rate": 4.47970433534301e-06, + "loss": 0.0109, + "step": 2755 + }, + { + "epoch": 0.5474227827986891, + "grad_norm": 0.539372194994621, + "learning_rate": 4.476504384606906e-06, + "loss": 0.0128, + "step": 2756 + }, + { + "epoch": 0.5476214122554375, + "grad_norm": 0.7258264788596848, + "learning_rate": 4.473304650649812e-06, + "loss": 0.0146, + "step": 2757 + }, + { + "epoch": 0.5478200417121859, + "grad_norm": 0.28150530699276105, + "learning_rate": 4.470105134796734e-06, + "loss": 0.0065, + "step": 2758 + }, + { + "epoch": 0.5480186711689343, + "grad_norm": 0.7482265228868586, + "learning_rate": 4.466905838372591e-06, + "loss": 0.0179, + "step": 2759 + }, + { + "epoch": 0.5482173006256827, + "grad_norm": 0.6119275773400306, + "learning_rate": 4.463706762702205e-06, + "loss": 0.0176, + "step": 2760 + }, + { + "epoch": 0.5484159300824313, + "grad_norm": 0.4938248337453184, + "learning_rate": 4.460507909110312e-06, + "loss": 0.0106, + "step": 2761 + }, + { + "epoch": 0.5486145595391797, + "grad_norm": 0.27287982790646315, + "learning_rate": 4.457309278921554e-06, + "loss": 0.0054, + "step": 2762 + }, + { + "epoch": 0.5488131889959281, + "grad_norm": 0.7541141348539235, + "learning_rate": 4.4541108734604795e-06, + "loss": 0.0108, + "step": 2763 + }, + { + "epoch": 0.5490118184526765, + "grad_norm": 0.7194720755343721, + "learning_rate": 4.450912694051546e-06, + "loss": 0.0131, + "step": 2764 + }, + { + "epoch": 0.549210447909425, + "grad_norm": 0.36732971751956595, + "learning_rate": 4.447714742019115e-06, + "loss": 0.0114, + "step": 2765 + }, + { + "epoch": 0.5494090773661734, + "grad_norm": 0.6386170728830878, + "learning_rate": 4.444517018687457e-06, + "loss": 0.009, + "step": 2766 + }, + { + "epoch": 0.5496077068229218, + "grad_norm": 0.39169752604182223, + "learning_rate": 4.441319525380745e-06, + "loss": 0.0133, + "step": 2767 + }, + { + "epoch": 0.5498063362796702, + "grad_norm": 0.569101509077439, + "learning_rate": 4.438122263423059e-06, + "loss": 0.0131, + "step": 2768 + }, + { + "epoch": 0.5500049657364187, + "grad_norm": 0.7374889233619399, + "learning_rate": 4.434925234138381e-06, + "loss": 0.0154, + "step": 2769 + }, + { + "epoch": 0.5502035951931672, + "grad_norm": 0.27469994570416645, + "learning_rate": 4.431728438850597e-06, + "loss": 0.01, + "step": 2770 + }, + { + "epoch": 0.5504022246499156, + "grad_norm": 0.5480817776491356, + "learning_rate": 4.4285318788834976e-06, + "loss": 0.0112, + "step": 2771 + }, + { + "epoch": 0.550600854106664, + "grad_norm": 0.4678729381775268, + "learning_rate": 4.425335555560773e-06, + "loss": 0.0112, + "step": 2772 + }, + { + "epoch": 0.5507994835634125, + "grad_norm": 0.6929526581318076, + "learning_rate": 4.422139470206024e-06, + "loss": 0.0147, + "step": 2773 + }, + { + "epoch": 0.5509981130201609, + "grad_norm": 0.5590965545680023, + "learning_rate": 4.4189436241427395e-06, + "loss": 0.0128, + "step": 2774 + }, + { + "epoch": 0.5511967424769093, + "grad_norm": 0.4299925224605084, + "learning_rate": 4.415748018694317e-06, + "loss": 0.0078, + "step": 2775 + }, + { + "epoch": 0.5513953719336577, + "grad_norm": 0.30426919330239416, + "learning_rate": 4.412552655184055e-06, + "loss": 0.0071, + "step": 2776 + }, + { + "epoch": 0.5515940013904062, + "grad_norm": 1.7960268104440869, + "learning_rate": 4.409357534935151e-06, + "loss": 0.0276, + "step": 2777 + }, + { + "epoch": 0.5517926308471547, + "grad_norm": 2.6449701275835187, + "learning_rate": 4.4061626592707e-06, + "loss": 0.0182, + "step": 2778 + }, + { + "epoch": 0.5519912603039031, + "grad_norm": 0.22507808021610604, + "learning_rate": 4.4029680295136975e-06, + "loss": 0.0056, + "step": 2779 + }, + { + "epoch": 0.5521898897606515, + "grad_norm": 0.7832386144210031, + "learning_rate": 4.399773646987036e-06, + "loss": 0.0183, + "step": 2780 + }, + { + "epoch": 0.5523885192174, + "grad_norm": 0.632793978123542, + "learning_rate": 4.396579513013506e-06, + "loss": 0.0148, + "step": 2781 + }, + { + "epoch": 0.5525871486741484, + "grad_norm": 0.5274197472069998, + "learning_rate": 4.393385628915795e-06, + "loss": 0.0132, + "step": 2782 + }, + { + "epoch": 0.5527857781308968, + "grad_norm": 0.6239929821382354, + "learning_rate": 4.390191996016488e-06, + "loss": 0.0139, + "step": 2783 + }, + { + "epoch": 0.5529844075876452, + "grad_norm": 0.5616827331201757, + "learning_rate": 4.386998615638064e-06, + "loss": 0.0099, + "step": 2784 + }, + { + "epoch": 0.5531830370443936, + "grad_norm": 0.5048749306077575, + "learning_rate": 4.383805489102901e-06, + "loss": 0.0098, + "step": 2785 + }, + { + "epoch": 0.5533816665011421, + "grad_norm": 0.3937045403697834, + "learning_rate": 4.380612617733267e-06, + "loss": 0.0106, + "step": 2786 + }, + { + "epoch": 0.5535802959578906, + "grad_norm": 0.4297000120260985, + "learning_rate": 4.3774200028513275e-06, + "loss": 0.0085, + "step": 2787 + }, + { + "epoch": 0.553778925414639, + "grad_norm": 0.7721765597312535, + "learning_rate": 4.374227645779142e-06, + "loss": 0.0146, + "step": 2788 + }, + { + "epoch": 0.5539775548713874, + "grad_norm": 0.6219951031088091, + "learning_rate": 4.371035547838661e-06, + "loss": 0.0123, + "step": 2789 + }, + { + "epoch": 0.5541761843281359, + "grad_norm": 0.6784123591321396, + "learning_rate": 4.36784371035173e-06, + "loss": 0.0119, + "step": 2790 + }, + { + "epoch": 0.5543748137848843, + "grad_norm": 0.4407590870450644, + "learning_rate": 4.364652134640085e-06, + "loss": 0.0105, + "step": 2791 + }, + { + "epoch": 0.5545734432416327, + "grad_norm": 0.5200832710265008, + "learning_rate": 4.361460822025356e-06, + "loss": 0.01, + "step": 2792 + }, + { + "epoch": 0.5547720726983811, + "grad_norm": 0.5895406136146861, + "learning_rate": 4.35826977382906e-06, + "loss": 0.0082, + "step": 2793 + }, + { + "epoch": 0.5549707021551296, + "grad_norm": 0.3137853635557372, + "learning_rate": 4.355078991372609e-06, + "loss": 0.0062, + "step": 2794 + }, + { + "epoch": 0.5551693316118781, + "grad_norm": 0.3276947209644395, + "learning_rate": 4.351888475977302e-06, + "loss": 0.0062, + "step": 2795 + }, + { + "epoch": 0.5553679610686265, + "grad_norm": 0.49943742040860356, + "learning_rate": 4.348698228964327e-06, + "loss": 0.0077, + "step": 2796 + }, + { + "epoch": 0.5555665905253749, + "grad_norm": 0.46359295428980324, + "learning_rate": 4.345508251654765e-06, + "loss": 0.0068, + "step": 2797 + }, + { + "epoch": 0.5557652199821234, + "grad_norm": 1.1215529581908548, + "learning_rate": 4.34231854536958e-06, + "loss": 0.0217, + "step": 2798 + }, + { + "epoch": 0.5559638494388718, + "grad_norm": 0.6207216506521158, + "learning_rate": 4.339129111429625e-06, + "loss": 0.0096, + "step": 2799 + }, + { + "epoch": 0.5561624788956202, + "grad_norm": 0.3028484585994268, + "learning_rate": 4.335939951155644e-06, + "loss": 0.0053, + "step": 2800 + }, + { + "epoch": 0.5563611083523686, + "grad_norm": 1.030312295449838, + "learning_rate": 4.332751065868264e-06, + "loss": 0.0145, + "step": 2801 + }, + { + "epoch": 0.556559737809117, + "grad_norm": 0.4829944470180176, + "learning_rate": 4.329562456888e-06, + "loss": 0.0117, + "step": 2802 + }, + { + "epoch": 0.5567583672658655, + "grad_norm": 0.3015442911784513, + "learning_rate": 4.32637412553525e-06, + "loss": 0.0066, + "step": 2803 + }, + { + "epoch": 0.556956996722614, + "grad_norm": 0.49410995263115004, + "learning_rate": 4.323186073130302e-06, + "loss": 0.0134, + "step": 2804 + }, + { + "epoch": 0.5571556261793624, + "grad_norm": 1.1059775844917823, + "learning_rate": 4.319998300993322e-06, + "loss": 0.0167, + "step": 2805 + }, + { + "epoch": 0.5573542556361109, + "grad_norm": 0.5759009436326822, + "learning_rate": 4.316810810444365e-06, + "loss": 0.0121, + "step": 2806 + }, + { + "epoch": 0.5575528850928593, + "grad_norm": 0.41776379254655316, + "learning_rate": 4.313623602803367e-06, + "loss": 0.0048, + "step": 2807 + }, + { + "epoch": 0.5577515145496077, + "grad_norm": 0.5034867323356977, + "learning_rate": 4.310436679390147e-06, + "loss": 0.0121, + "step": 2808 + }, + { + "epoch": 0.5579501440063561, + "grad_norm": 0.41318019364973024, + "learning_rate": 4.307250041524408e-06, + "loss": 0.0064, + "step": 2809 + }, + { + "epoch": 0.5581487734631045, + "grad_norm": 1.0091161435064295, + "learning_rate": 4.304063690525734e-06, + "loss": 0.0137, + "step": 2810 + }, + { + "epoch": 0.558347402919853, + "grad_norm": 0.6276179334811938, + "learning_rate": 4.300877627713588e-06, + "loss": 0.0113, + "step": 2811 + }, + { + "epoch": 0.5585460323766015, + "grad_norm": 0.8074924839212643, + "learning_rate": 4.297691854407317e-06, + "loss": 0.0175, + "step": 2812 + }, + { + "epoch": 0.5587446618333499, + "grad_norm": 1.2308453281661025, + "learning_rate": 4.294506371926145e-06, + "loss": 0.0256, + "step": 2813 + }, + { + "epoch": 0.5589432912900983, + "grad_norm": 0.8890857016385656, + "learning_rate": 4.291321181589179e-06, + "loss": 0.0138, + "step": 2814 + }, + { + "epoch": 0.5591419207468468, + "grad_norm": 0.42316267167957217, + "learning_rate": 4.288136284715399e-06, + "loss": 0.0061, + "step": 2815 + }, + { + "epoch": 0.5593405502035952, + "grad_norm": 0.9297151487475857, + "learning_rate": 4.284951682623674e-06, + "loss": 0.0149, + "step": 2816 + }, + { + "epoch": 0.5595391796603436, + "grad_norm": 0.4544147000009162, + "learning_rate": 4.281767376632739e-06, + "loss": 0.0071, + "step": 2817 + }, + { + "epoch": 0.559737809117092, + "grad_norm": 0.6237208093686428, + "learning_rate": 4.278583368061216e-06, + "loss": 0.0225, + "step": 2818 + }, + { + "epoch": 0.5599364385738405, + "grad_norm": 0.39156008750074794, + "learning_rate": 4.275399658227596e-06, + "loss": 0.0123, + "step": 2819 + }, + { + "epoch": 0.560135068030589, + "grad_norm": 0.41234897144867805, + "learning_rate": 4.272216248450253e-06, + "loss": 0.0072, + "step": 2820 + }, + { + "epoch": 0.5603336974873374, + "grad_norm": 0.4726168304536307, + "learning_rate": 4.2690331400474335e-06, + "loss": 0.0095, + "step": 2821 + }, + { + "epoch": 0.5605323269440858, + "grad_norm": 0.36137404100034154, + "learning_rate": 4.265850334337258e-06, + "loss": 0.0106, + "step": 2822 + }, + { + "epoch": 0.5607309564008343, + "grad_norm": 0.41668059273849295, + "learning_rate": 4.262667832637724e-06, + "loss": 0.0085, + "step": 2823 + }, + { + "epoch": 0.5609295858575827, + "grad_norm": 0.547862940561988, + "learning_rate": 4.2594856362667015e-06, + "loss": 0.0129, + "step": 2824 + }, + { + "epoch": 0.5611282153143311, + "grad_norm": 0.40441643120996035, + "learning_rate": 4.256303746541936e-06, + "loss": 0.0102, + "step": 2825 + }, + { + "epoch": 0.5613268447710795, + "grad_norm": 0.5954924239361523, + "learning_rate": 4.253122164781043e-06, + "loss": 0.0143, + "step": 2826 + }, + { + "epoch": 0.561525474227828, + "grad_norm": 0.7813919479034077, + "learning_rate": 4.249940892301514e-06, + "loss": 0.009, + "step": 2827 + }, + { + "epoch": 0.5617241036845764, + "grad_norm": 0.4565410720159291, + "learning_rate": 4.246759930420711e-06, + "loss": 0.0124, + "step": 2828 + }, + { + "epoch": 0.5619227331413249, + "grad_norm": 0.5694645470193745, + "learning_rate": 4.243579280455867e-06, + "loss": 0.0108, + "step": 2829 + }, + { + "epoch": 0.5621213625980733, + "grad_norm": 0.8304296127324543, + "learning_rate": 4.240398943724085e-06, + "loss": 0.0101, + "step": 2830 + }, + { + "epoch": 0.5623199920548217, + "grad_norm": 0.7021520729713497, + "learning_rate": 4.237218921542339e-06, + "loss": 0.0177, + "step": 2831 + }, + { + "epoch": 0.5625186215115702, + "grad_norm": 0.6305791789892291, + "learning_rate": 4.234039215227474e-06, + "loss": 0.009, + "step": 2832 + }, + { + "epoch": 0.5627172509683186, + "grad_norm": 0.9230478228209408, + "learning_rate": 4.230859826096203e-06, + "loss": 0.0147, + "step": 2833 + }, + { + "epoch": 0.562915880425067, + "grad_norm": 0.4300837811713559, + "learning_rate": 4.2276807554651074e-06, + "loss": 0.0099, + "step": 2834 + }, + { + "epoch": 0.5631145098818154, + "grad_norm": 0.5975455251314918, + "learning_rate": 4.224502004650639e-06, + "loss": 0.0138, + "step": 2835 + }, + { + "epoch": 0.5633131393385639, + "grad_norm": 0.5208038837005452, + "learning_rate": 4.221323574969113e-06, + "loss": 0.0129, + "step": 2836 + }, + { + "epoch": 0.5635117687953124, + "grad_norm": 0.4113017440579882, + "learning_rate": 4.218145467736715e-06, + "loss": 0.013, + "step": 2837 + }, + { + "epoch": 0.5637103982520608, + "grad_norm": 0.8453183610999317, + "learning_rate": 4.214967684269495e-06, + "loss": 0.0069, + "step": 2838 + }, + { + "epoch": 0.5639090277088092, + "grad_norm": 0.3829389981415355, + "learning_rate": 4.211790225883372e-06, + "loss": 0.007, + "step": 2839 + }, + { + "epoch": 0.5641076571655577, + "grad_norm": 0.45669163404263635, + "learning_rate": 4.208613093894126e-06, + "loss": 0.0141, + "step": 2840 + }, + { + "epoch": 0.5643062866223061, + "grad_norm": 0.833349864065106, + "learning_rate": 4.205436289617406e-06, + "loss": 0.0169, + "step": 2841 + }, + { + "epoch": 0.5645049160790545, + "grad_norm": 0.6077279896370484, + "learning_rate": 4.2022598143687224e-06, + "loss": 0.0123, + "step": 2842 + }, + { + "epoch": 0.5647035455358029, + "grad_norm": 0.38738975198937353, + "learning_rate": 4.199083669463452e-06, + "loss": 0.0072, + "step": 2843 + }, + { + "epoch": 0.5649021749925514, + "grad_norm": 0.38114824416375975, + "learning_rate": 4.195907856216831e-06, + "loss": 0.0092, + "step": 2844 + }, + { + "epoch": 0.5651008044492998, + "grad_norm": 0.8831215464756551, + "learning_rate": 4.192732375943962e-06, + "loss": 0.0167, + "step": 2845 + }, + { + "epoch": 0.5652994339060483, + "grad_norm": 0.3030833462369814, + "learning_rate": 4.189557229959807e-06, + "loss": 0.0077, + "step": 2846 + }, + { + "epoch": 0.5654980633627967, + "grad_norm": 1.0605500779156578, + "learning_rate": 4.186382419579193e-06, + "loss": 0.0132, + "step": 2847 + }, + { + "epoch": 0.5656966928195452, + "grad_norm": 0.6755075790503954, + "learning_rate": 4.183207946116802e-06, + "loss": 0.0182, + "step": 2848 + }, + { + "epoch": 0.5658953222762936, + "grad_norm": 0.9423178403605319, + "learning_rate": 4.180033810887184e-06, + "loss": 0.0147, + "step": 2849 + }, + { + "epoch": 0.566093951733042, + "grad_norm": 0.6355943716774898, + "learning_rate": 4.176860015204743e-06, + "loss": 0.0122, + "step": 2850 + }, + { + "epoch": 0.5662925811897904, + "grad_norm": 0.46477508165506526, + "learning_rate": 4.173686560383745e-06, + "loss": 0.0143, + "step": 2851 + }, + { + "epoch": 0.5664912106465388, + "grad_norm": 0.489449386116713, + "learning_rate": 4.170513447738316e-06, + "loss": 0.015, + "step": 2852 + }, + { + "epoch": 0.5666898401032873, + "grad_norm": 0.475425921121026, + "learning_rate": 4.167340678582437e-06, + "loss": 0.0132, + "step": 2853 + }, + { + "epoch": 0.5668884695600358, + "grad_norm": 0.5947470913713117, + "learning_rate": 4.16416825422995e-06, + "loss": 0.0144, + "step": 2854 + }, + { + "epoch": 0.5670870990167842, + "grad_norm": 0.8415609819118625, + "learning_rate": 4.160996175994551e-06, + "loss": 0.0167, + "step": 2855 + }, + { + "epoch": 0.5672857284735326, + "grad_norm": 0.9514310803640764, + "learning_rate": 4.157824445189796e-06, + "loss": 0.0103, + "step": 2856 + }, + { + "epoch": 0.5674843579302811, + "grad_norm": 0.6091553295909007, + "learning_rate": 4.1546530631290945e-06, + "loss": 0.0125, + "step": 2857 + }, + { + "epoch": 0.5676829873870295, + "grad_norm": 0.4674705036869269, + "learning_rate": 4.151482031125712e-06, + "loss": 0.0158, + "step": 2858 + }, + { + "epoch": 0.5678816168437779, + "grad_norm": 0.7383391584281234, + "learning_rate": 4.148311350492772e-06, + "loss": 0.0128, + "step": 2859 + }, + { + "epoch": 0.5680802463005263, + "grad_norm": 1.9683596728322885, + "learning_rate": 4.145141022543248e-06, + "loss": 0.0126, + "step": 2860 + }, + { + "epoch": 0.5682788757572748, + "grad_norm": 0.717958085427442, + "learning_rate": 4.1419710485899715e-06, + "loss": 0.0106, + "step": 2861 + }, + { + "epoch": 0.5684775052140233, + "grad_norm": 0.7306809802401357, + "learning_rate": 4.138801429945624e-06, + "loss": 0.0088, + "step": 2862 + }, + { + "epoch": 0.5686761346707717, + "grad_norm": 0.3406143943303101, + "learning_rate": 4.135632167922742e-06, + "loss": 0.0104, + "step": 2863 + }, + { + "epoch": 0.5688747641275201, + "grad_norm": 0.35075777130300395, + "learning_rate": 4.1324632638337134e-06, + "loss": 0.0071, + "step": 2864 + }, + { + "epoch": 0.5690733935842686, + "grad_norm": 3.6206297212120857, + "learning_rate": 4.129294718990779e-06, + "loss": 0.0089, + "step": 2865 + }, + { + "epoch": 0.569272023041017, + "grad_norm": 0.6092937120736366, + "learning_rate": 4.126126534706028e-06, + "loss": 0.0129, + "step": 2866 + }, + { + "epoch": 0.5694706524977654, + "grad_norm": 0.6599744938071138, + "learning_rate": 4.122958712291406e-06, + "loss": 0.0132, + "step": 2867 + }, + { + "epoch": 0.5696692819545138, + "grad_norm": 0.40817723392650745, + "learning_rate": 4.119791253058701e-06, + "loss": 0.0155, + "step": 2868 + }, + { + "epoch": 0.5698679114112623, + "grad_norm": 0.7207327641411628, + "learning_rate": 4.1166241583195596e-06, + "loss": 0.0149, + "step": 2869 + }, + { + "epoch": 0.5700665408680107, + "grad_norm": 0.5234949232683128, + "learning_rate": 4.113457429385468e-06, + "loss": 0.0089, + "step": 2870 + }, + { + "epoch": 0.5702651703247592, + "grad_norm": 0.5297434498649813, + "learning_rate": 4.110291067567766e-06, + "loss": 0.0107, + "step": 2871 + }, + { + "epoch": 0.5704637997815076, + "grad_norm": 0.5678519577896356, + "learning_rate": 4.107125074177643e-06, + "loss": 0.0092, + "step": 2872 + }, + { + "epoch": 0.570662429238256, + "grad_norm": 0.35857068739907116, + "learning_rate": 4.103959450526133e-06, + "loss": 0.0089, + "step": 2873 + }, + { + "epoch": 0.5708610586950045, + "grad_norm": 0.7083827991263594, + "learning_rate": 4.100794197924117e-06, + "loss": 0.0118, + "step": 2874 + }, + { + "epoch": 0.5710596881517529, + "grad_norm": 0.5189059502478018, + "learning_rate": 4.097629317682322e-06, + "loss": 0.008, + "step": 2875 + }, + { + "epoch": 0.5712583176085013, + "grad_norm": 1.0428771302955275, + "learning_rate": 4.094464811111323e-06, + "loss": 0.0139, + "step": 2876 + }, + { + "epoch": 0.5714569470652497, + "grad_norm": 0.711904256313604, + "learning_rate": 4.091300679521539e-06, + "loss": 0.0162, + "step": 2877 + }, + { + "epoch": 0.5716555765219982, + "grad_norm": 0.32525294522509485, + "learning_rate": 4.088136924223235e-06, + "loss": 0.0106, + "step": 2878 + }, + { + "epoch": 0.5718542059787467, + "grad_norm": 0.6302841695249592, + "learning_rate": 4.084973546526517e-06, + "loss": 0.0115, + "step": 2879 + }, + { + "epoch": 0.5720528354354951, + "grad_norm": 0.828179414805218, + "learning_rate": 4.081810547741336e-06, + "loss": 0.0161, + "step": 2880 + }, + { + "epoch": 0.5722514648922435, + "grad_norm": 0.4625165253053684, + "learning_rate": 4.078647929177489e-06, + "loss": 0.0064, + "step": 2881 + }, + { + "epoch": 0.572450094348992, + "grad_norm": 0.6812644619662183, + "learning_rate": 4.075485692144611e-06, + "loss": 0.0173, + "step": 2882 + }, + { + "epoch": 0.5726487238057404, + "grad_norm": 0.8313655313026633, + "learning_rate": 4.072323837952181e-06, + "loss": 0.0144, + "step": 2883 + }, + { + "epoch": 0.5728473532624888, + "grad_norm": 0.45743710231818924, + "learning_rate": 4.069162367909522e-06, + "loss": 0.0113, + "step": 2884 + }, + { + "epoch": 0.5730459827192372, + "grad_norm": 0.9030179886207947, + "learning_rate": 4.0660012833257945e-06, + "loss": 0.0153, + "step": 2885 + }, + { + "epoch": 0.5732446121759857, + "grad_norm": 0.7782720108694118, + "learning_rate": 4.062840585510001e-06, + "loss": 0.01, + "step": 2886 + }, + { + "epoch": 0.5734432416327341, + "grad_norm": 0.40236050134983353, + "learning_rate": 4.05968027577098e-06, + "loss": 0.0071, + "step": 2887 + }, + { + "epoch": 0.5736418710894826, + "grad_norm": 0.6486277794089854, + "learning_rate": 4.056520355417418e-06, + "loss": 0.01, + "step": 2888 + }, + { + "epoch": 0.573840500546231, + "grad_norm": 0.5243182245221714, + "learning_rate": 4.053360825757831e-06, + "loss": 0.0125, + "step": 2889 + }, + { + "epoch": 0.5740391300029795, + "grad_norm": 0.6340711859635116, + "learning_rate": 4.050201688100577e-06, + "loss": 0.0114, + "step": 2890 + }, + { + "epoch": 0.5742377594597279, + "grad_norm": 0.44387658891634135, + "learning_rate": 4.047042943753853e-06, + "loss": 0.0091, + "step": 2891 + }, + { + "epoch": 0.5744363889164763, + "grad_norm": 0.2538412173316712, + "learning_rate": 4.043884594025692e-06, + "loss": 0.0054, + "step": 2892 + }, + { + "epoch": 0.5746350183732247, + "grad_norm": 0.6434174001298162, + "learning_rate": 4.040726640223967e-06, + "loss": 0.0163, + "step": 2893 + }, + { + "epoch": 0.5748336478299731, + "grad_norm": 0.4516311123319802, + "learning_rate": 4.037569083656374e-06, + "loss": 0.0088, + "step": 2894 + }, + { + "epoch": 0.5750322772867216, + "grad_norm": 0.6032522782543573, + "learning_rate": 4.034411925630462e-06, + "loss": 0.0184, + "step": 2895 + }, + { + "epoch": 0.5752309067434701, + "grad_norm": 0.5036736272759907, + "learning_rate": 4.031255167453604e-06, + "loss": 0.006, + "step": 2896 + }, + { + "epoch": 0.5754295362002185, + "grad_norm": 0.5616796264869481, + "learning_rate": 4.028098810433012e-06, + "loss": 0.02, + "step": 2897 + }, + { + "epoch": 0.575628165656967, + "grad_norm": 0.7195746914339065, + "learning_rate": 4.024942855875728e-06, + "loss": 0.0204, + "step": 2898 + }, + { + "epoch": 0.5758267951137154, + "grad_norm": 0.681543673069867, + "learning_rate": 4.021787305088633e-06, + "loss": 0.0143, + "step": 2899 + }, + { + "epoch": 0.5760254245704638, + "grad_norm": 0.4669150650713162, + "learning_rate": 4.0186321593784325e-06, + "loss": 0.0089, + "step": 2900 + }, + { + "epoch": 0.5762240540272122, + "grad_norm": 0.33038421962080927, + "learning_rate": 4.015477420051673e-06, + "loss": 0.0096, + "step": 2901 + }, + { + "epoch": 0.5764226834839606, + "grad_norm": 0.3634840974532087, + "learning_rate": 4.012323088414729e-06, + "loss": 0.0104, + "step": 2902 + }, + { + "epoch": 0.5766213129407091, + "grad_norm": 0.8343922468702314, + "learning_rate": 4.009169165773804e-06, + "loss": 0.0117, + "step": 2903 + }, + { + "epoch": 0.5768199423974576, + "grad_norm": 0.6495727500722914, + "learning_rate": 4.0060156534349355e-06, + "loss": 0.0125, + "step": 2904 + }, + { + "epoch": 0.577018571854206, + "grad_norm": 0.6135698010073438, + "learning_rate": 4.00286255270399e-06, + "loss": 0.0146, + "step": 2905 + }, + { + "epoch": 0.5772172013109544, + "grad_norm": 0.5573911672484038, + "learning_rate": 3.9997098648866624e-06, + "loss": 0.0124, + "step": 2906 + }, + { + "epoch": 0.5774158307677029, + "grad_norm": 0.40620283454662026, + "learning_rate": 3.996557591288477e-06, + "loss": 0.0106, + "step": 2907 + }, + { + "epoch": 0.5776144602244513, + "grad_norm": 0.7192076124521594, + "learning_rate": 3.99340573321479e-06, + "loss": 0.0102, + "step": 2908 + }, + { + "epoch": 0.5778130896811997, + "grad_norm": 0.4360510467100798, + "learning_rate": 3.99025429197078e-06, + "loss": 0.0141, + "step": 2909 + }, + { + "epoch": 0.5780117191379481, + "grad_norm": 0.463212787813459, + "learning_rate": 3.987103268861457e-06, + "loss": 0.0111, + "step": 2910 + }, + { + "epoch": 0.5782103485946966, + "grad_norm": 0.8251451590325685, + "learning_rate": 3.983952665191656e-06, + "loss": 0.0142, + "step": 2911 + }, + { + "epoch": 0.578408978051445, + "grad_norm": 0.761966750523954, + "learning_rate": 3.980802482266038e-06, + "loss": 0.0163, + "step": 2912 + }, + { + "epoch": 0.5786076075081935, + "grad_norm": 0.6924212159837746, + "learning_rate": 3.977652721389092e-06, + "loss": 0.0099, + "step": 2913 + }, + { + "epoch": 0.5788062369649419, + "grad_norm": 0.3979725594269708, + "learning_rate": 3.97450338386513e-06, + "loss": 0.0085, + "step": 2914 + }, + { + "epoch": 0.5790048664216904, + "grad_norm": 0.5618585012904748, + "learning_rate": 3.97135447099829e-06, + "loss": 0.0127, + "step": 2915 + }, + { + "epoch": 0.5792034958784388, + "grad_norm": 0.7497853838343431, + "learning_rate": 3.968205984092533e-06, + "loss": 0.0108, + "step": 2916 + }, + { + "epoch": 0.5794021253351872, + "grad_norm": 0.5120214709576271, + "learning_rate": 3.965057924451648e-06, + "loss": 0.0131, + "step": 2917 + }, + { + "epoch": 0.5796007547919356, + "grad_norm": 0.6871961008819738, + "learning_rate": 3.961910293379236e-06, + "loss": 0.0144, + "step": 2918 + }, + { + "epoch": 0.579799384248684, + "grad_norm": 0.41734721830693366, + "learning_rate": 3.958763092178734e-06, + "loss": 0.01, + "step": 2919 + }, + { + "epoch": 0.5799980137054325, + "grad_norm": 0.4036222914847742, + "learning_rate": 3.955616322153391e-06, + "loss": 0.0105, + "step": 2920 + }, + { + "epoch": 0.580196643162181, + "grad_norm": 0.6130853508919999, + "learning_rate": 3.952469984606285e-06, + "loss": 0.0134, + "step": 2921 + }, + { + "epoch": 0.5803952726189294, + "grad_norm": 0.3509331583951365, + "learning_rate": 3.949324080840309e-06, + "loss": 0.0098, + "step": 2922 + }, + { + "epoch": 0.5805939020756778, + "grad_norm": 0.37833538757507784, + "learning_rate": 3.946178612158178e-06, + "loss": 0.0134, + "step": 2923 + }, + { + "epoch": 0.5807925315324263, + "grad_norm": 0.46980032869709154, + "learning_rate": 3.94303357986243e-06, + "loss": 0.0093, + "step": 2924 + }, + { + "epoch": 0.5809911609891747, + "grad_norm": 0.47678450473727596, + "learning_rate": 3.939888985255415e-06, + "loss": 0.0156, + "step": 2925 + }, + { + "epoch": 0.5811897904459231, + "grad_norm": 0.48897265097095066, + "learning_rate": 3.9367448296393115e-06, + "loss": 0.0082, + "step": 2926 + }, + { + "epoch": 0.5813884199026715, + "grad_norm": 0.39022191785178684, + "learning_rate": 3.93360111431611e-06, + "loss": 0.0097, + "step": 2927 + }, + { + "epoch": 0.58158704935942, + "grad_norm": 0.9510163155213661, + "learning_rate": 3.930457840587618e-06, + "loss": 0.0189, + "step": 2928 + }, + { + "epoch": 0.5817856788161684, + "grad_norm": 0.4727876728395284, + "learning_rate": 3.927315009755464e-06, + "loss": 0.0142, + "step": 2929 + }, + { + "epoch": 0.5819843082729169, + "grad_norm": 0.7381075193685331, + "learning_rate": 3.92417262312109e-06, + "loss": 0.0111, + "step": 2930 + }, + { + "epoch": 0.5821829377296653, + "grad_norm": 0.6627709815131074, + "learning_rate": 3.921030681985755e-06, + "loss": 0.0108, + "step": 2931 + }, + { + "epoch": 0.5823815671864138, + "grad_norm": 0.49816760107685104, + "learning_rate": 3.917889187650533e-06, + "loss": 0.012, + "step": 2932 + }, + { + "epoch": 0.5825801966431622, + "grad_norm": 0.7768642446479597, + "learning_rate": 3.914748141416317e-06, + "loss": 0.0104, + "step": 2933 + }, + { + "epoch": 0.5827788260999106, + "grad_norm": 0.2884247629212007, + "learning_rate": 3.9116075445838075e-06, + "loss": 0.0094, + "step": 2934 + }, + { + "epoch": 0.582977455556659, + "grad_norm": 0.47011413952328, + "learning_rate": 3.908467398453524e-06, + "loss": 0.0066, + "step": 2935 + }, + { + "epoch": 0.5831760850134075, + "grad_norm": 0.4443819609591886, + "learning_rate": 3.905327704325799e-06, + "loss": 0.0104, + "step": 2936 + }, + { + "epoch": 0.5833747144701559, + "grad_norm": 0.3839458518134734, + "learning_rate": 3.902188463500774e-06, + "loss": 0.0119, + "step": 2937 + }, + { + "epoch": 0.5835733439269044, + "grad_norm": 0.6111807494928548, + "learning_rate": 3.899049677278407e-06, + "loss": 0.013, + "step": 2938 + }, + { + "epoch": 0.5837719733836528, + "grad_norm": 0.60033600270167, + "learning_rate": 3.895911346958466e-06, + "loss": 0.013, + "step": 2939 + }, + { + "epoch": 0.5839706028404013, + "grad_norm": 0.2970921298615577, + "learning_rate": 3.892773473840531e-06, + "loss": 0.0074, + "step": 2940 + }, + { + "epoch": 0.5841692322971497, + "grad_norm": 0.610226222580878, + "learning_rate": 3.889636059223993e-06, + "loss": 0.0167, + "step": 2941 + }, + { + "epoch": 0.5843678617538981, + "grad_norm": 0.6862312895558023, + "learning_rate": 3.886499104408051e-06, + "loss": 0.0121, + "step": 2942 + }, + { + "epoch": 0.5845664912106465, + "grad_norm": 0.4404684118718429, + "learning_rate": 3.883362610691711e-06, + "loss": 0.0142, + "step": 2943 + }, + { + "epoch": 0.5847651206673949, + "grad_norm": 0.9493244733608, + "learning_rate": 3.880226579373799e-06, + "loss": 0.0111, + "step": 2944 + }, + { + "epoch": 0.5849637501241434, + "grad_norm": 0.7593496884130583, + "learning_rate": 3.877091011752938e-06, + "loss": 0.015, + "step": 2945 + }, + { + "epoch": 0.5851623795808919, + "grad_norm": 0.6721422032614358, + "learning_rate": 3.8739559091275646e-06, + "loss": 0.0139, + "step": 2946 + }, + { + "epoch": 0.5853610090376403, + "grad_norm": 0.48578862136014994, + "learning_rate": 3.870821272795922e-06, + "loss": 0.0087, + "step": 2947 + }, + { + "epoch": 0.5855596384943887, + "grad_norm": 0.3783706141485433, + "learning_rate": 3.867687104056059e-06, + "loss": 0.0068, + "step": 2948 + }, + { + "epoch": 0.5857582679511372, + "grad_norm": 0.7342026067918515, + "learning_rate": 3.864553404205833e-06, + "loss": 0.0154, + "step": 2949 + }, + { + "epoch": 0.5859568974078856, + "grad_norm": 0.45238663502264936, + "learning_rate": 3.861420174542903e-06, + "loss": 0.0132, + "step": 2950 + }, + { + "epoch": 0.586155526864634, + "grad_norm": 0.5760123588598605, + "learning_rate": 3.85828741636474e-06, + "loss": 0.0088, + "step": 2951 + }, + { + "epoch": 0.5863541563213824, + "grad_norm": 0.37486015440914744, + "learning_rate": 3.855155130968616e-06, + "loss": 0.0137, + "step": 2952 + }, + { + "epoch": 0.5865527857781309, + "grad_norm": 0.4390439990790977, + "learning_rate": 3.852023319651605e-06, + "loss": 0.011, + "step": 2953 + }, + { + "epoch": 0.5867514152348793, + "grad_norm": 0.9686179462089476, + "learning_rate": 3.848891983710587e-06, + "loss": 0.0108, + "step": 2954 + }, + { + "epoch": 0.5869500446916278, + "grad_norm": 0.7875155365033674, + "learning_rate": 3.845761124442246e-06, + "loss": 0.0238, + "step": 2955 + }, + { + "epoch": 0.5871486741483762, + "grad_norm": 0.38882678317489044, + "learning_rate": 3.842630743143068e-06, + "loss": 0.0119, + "step": 2956 + }, + { + "epoch": 0.5873473036051247, + "grad_norm": 0.6336228129818564, + "learning_rate": 3.839500841109338e-06, + "loss": 0.0104, + "step": 2957 + }, + { + "epoch": 0.5875459330618731, + "grad_norm": 0.5805224655559802, + "learning_rate": 3.836371419637149e-06, + "loss": 0.014, + "step": 2958 + }, + { + "epoch": 0.5877445625186215, + "grad_norm": 0.27676008745737823, + "learning_rate": 3.833242480022391e-06, + "loss": 0.0075, + "step": 2959 + }, + { + "epoch": 0.5879431919753699, + "grad_norm": 0.30567765332023794, + "learning_rate": 3.8301140235607525e-06, + "loss": 0.0074, + "step": 2960 + }, + { + "epoch": 0.5881418214321184, + "grad_norm": 0.3651536661757168, + "learning_rate": 3.826986051547726e-06, + "loss": 0.008, + "step": 2961 + }, + { + "epoch": 0.5883404508888668, + "grad_norm": 0.5291250242062171, + "learning_rate": 3.8238585652786004e-06, + "loss": 0.0109, + "step": 2962 + }, + { + "epoch": 0.5885390803456153, + "grad_norm": 0.3137663400127696, + "learning_rate": 3.820731566048466e-06, + "loss": 0.0087, + "step": 2963 + }, + { + "epoch": 0.5887377098023637, + "grad_norm": 2.235523320234743, + "learning_rate": 3.817605055152208e-06, + "loss": 0.012, + "step": 2964 + }, + { + "epoch": 0.5889363392591122, + "grad_norm": 0.3358171761925821, + "learning_rate": 3.814479033884514e-06, + "loss": 0.008, + "step": 2965 + }, + { + "epoch": 0.5891349687158606, + "grad_norm": 0.987905521258979, + "learning_rate": 3.8113535035398637e-06, + "loss": 0.0168, + "step": 2966 + }, + { + "epoch": 0.589333598172609, + "grad_norm": 0.6892134567235605, + "learning_rate": 3.8082284654125373e-06, + "loss": 0.0164, + "step": 2967 + }, + { + "epoch": 0.5895322276293574, + "grad_norm": 0.987943395384034, + "learning_rate": 3.805103920796609e-06, + "loss": 0.0133, + "step": 2968 + }, + { + "epoch": 0.5897308570861058, + "grad_norm": 0.8509307481721003, + "learning_rate": 3.8019798709859512e-06, + "loss": 0.0116, + "step": 2969 + }, + { + "epoch": 0.5899294865428543, + "grad_norm": 0.759142605103621, + "learning_rate": 3.79885631727423e-06, + "loss": 0.0115, + "step": 2970 + }, + { + "epoch": 0.5901281159996027, + "grad_norm": 0.24316763299724442, + "learning_rate": 3.7957332609549037e-06, + "loss": 0.0069, + "step": 2971 + }, + { + "epoch": 0.5903267454563512, + "grad_norm": 0.4485372299375667, + "learning_rate": 3.792610703321229e-06, + "loss": 0.0086, + "step": 2972 + }, + { + "epoch": 0.5905253749130996, + "grad_norm": 0.588418063546346, + "learning_rate": 3.789488645666253e-06, + "loss": 0.0127, + "step": 2973 + }, + { + "epoch": 0.5907240043698481, + "grad_norm": 0.36059543710830055, + "learning_rate": 3.7863670892828156e-06, + "loss": 0.012, + "step": 2974 + }, + { + "epoch": 0.5909226338265965, + "grad_norm": 0.902669268812175, + "learning_rate": 3.783246035463551e-06, + "loss": 0.014, + "step": 2975 + }, + { + "epoch": 0.5911212632833449, + "grad_norm": 0.31958820455962267, + "learning_rate": 3.780125485500885e-06, + "loss": 0.0087, + "step": 2976 + }, + { + "epoch": 0.5913198927400933, + "grad_norm": 0.4986550681651737, + "learning_rate": 3.777005440687035e-06, + "loss": 0.0098, + "step": 2977 + }, + { + "epoch": 0.5915185221968418, + "grad_norm": 0.7580605603680164, + "learning_rate": 3.773885902314006e-06, + "loss": 0.0241, + "step": 2978 + }, + { + "epoch": 0.5917171516535902, + "grad_norm": 0.2330131586895299, + "learning_rate": 3.770766871673598e-06, + "loss": 0.0069, + "step": 2979 + }, + { + "epoch": 0.5919157811103387, + "grad_norm": 0.2919054898745666, + "learning_rate": 3.7676483500573966e-06, + "loss": 0.0046, + "step": 2980 + }, + { + "epoch": 0.5921144105670871, + "grad_norm": 0.38396343904172237, + "learning_rate": 3.76453033875678e-06, + "loss": 0.0065, + "step": 2981 + }, + { + "epoch": 0.5923130400238356, + "grad_norm": 0.8594455212714647, + "learning_rate": 3.761412839062911e-06, + "loss": 0.0133, + "step": 2982 + }, + { + "epoch": 0.592511669480584, + "grad_norm": 0.20526215304091466, + "learning_rate": 3.7582958522667466e-06, + "loss": 0.0043, + "step": 2983 + }, + { + "epoch": 0.5927102989373324, + "grad_norm": 0.3004063766309997, + "learning_rate": 3.7551793796590263e-06, + "loss": 0.0062, + "step": 2984 + }, + { + "epoch": 0.5929089283940808, + "grad_norm": 0.41297527826978053, + "learning_rate": 3.7520634225302788e-06, + "loss": 0.0105, + "step": 2985 + }, + { + "epoch": 0.5931075578508292, + "grad_norm": 0.8190693978061193, + "learning_rate": 3.7489479821708173e-06, + "loss": 0.0171, + "step": 2986 + }, + { + "epoch": 0.5933061873075777, + "grad_norm": 0.4697013617601209, + "learning_rate": 3.7458330598707443e-06, + "loss": 0.0065, + "step": 2987 + }, + { + "epoch": 0.5935048167643262, + "grad_norm": 0.5384383900255248, + "learning_rate": 3.7427186569199456e-06, + "loss": 0.0128, + "step": 2988 + }, + { + "epoch": 0.5937034462210746, + "grad_norm": 0.753459781552131, + "learning_rate": 3.739604774608092e-06, + "loss": 0.02, + "step": 2989 + }, + { + "epoch": 0.593902075677823, + "grad_norm": 0.49028554186530215, + "learning_rate": 3.7364914142246383e-06, + "loss": 0.0101, + "step": 2990 + }, + { + "epoch": 0.5941007051345715, + "grad_norm": 0.7809148931549772, + "learning_rate": 3.733378577058825e-06, + "loss": 0.0146, + "step": 2991 + }, + { + "epoch": 0.5942993345913199, + "grad_norm": 0.7722259717302931, + "learning_rate": 3.7302662643996747e-06, + "loss": 0.0141, + "step": 2992 + }, + { + "epoch": 0.5944979640480683, + "grad_norm": 0.6140405841322572, + "learning_rate": 3.7271544775359906e-06, + "loss": 0.0162, + "step": 2993 + }, + { + "epoch": 0.5946965935048167, + "grad_norm": 0.7586513477389154, + "learning_rate": 3.7240432177563646e-06, + "loss": 0.0132, + "step": 2994 + }, + { + "epoch": 0.5948952229615652, + "grad_norm": 0.6630658842640412, + "learning_rate": 3.720932486349165e-06, + "loss": 0.0153, + "step": 2995 + }, + { + "epoch": 0.5950938524183136, + "grad_norm": 0.6232367511499796, + "learning_rate": 3.7178222846025404e-06, + "loss": 0.0097, + "step": 2996 + }, + { + "epoch": 0.5952924818750621, + "grad_norm": 0.38137155843187615, + "learning_rate": 3.7147126138044243e-06, + "loss": 0.0072, + "step": 2997 + }, + { + "epoch": 0.5954911113318105, + "grad_norm": 0.6383755710136322, + "learning_rate": 3.7116034752425277e-06, + "loss": 0.0124, + "step": 2998 + }, + { + "epoch": 0.595689740788559, + "grad_norm": 0.5596917796188703, + "learning_rate": 3.708494870204342e-06, + "loss": 0.0158, + "step": 2999 + }, + { + "epoch": 0.5958883702453074, + "grad_norm": 0.42835170745607165, + "learning_rate": 3.7053867999771366e-06, + "loss": 0.0125, + "step": 3000 + }, + { + "epoch": 0.5960869997020558, + "grad_norm": 0.39430591078922234, + "learning_rate": 3.702279265847961e-06, + "loss": 0.0099, + "step": 3001 + }, + { + "epoch": 0.5962856291588042, + "grad_norm": 0.509207291742738, + "learning_rate": 3.6991722691036423e-06, + "loss": 0.0104, + "step": 3002 + }, + { + "epoch": 0.5964842586155527, + "grad_norm": 0.8424408408284407, + "learning_rate": 3.6960658110307844e-06, + "loss": 0.0109, + "step": 3003 + }, + { + "epoch": 0.5966828880723011, + "grad_norm": 0.7528576265240631, + "learning_rate": 3.6929598929157682e-06, + "loss": 0.0151, + "step": 3004 + }, + { + "epoch": 0.5968815175290496, + "grad_norm": 0.45476504766802717, + "learning_rate": 3.689854516044752e-06, + "loss": 0.0143, + "step": 3005 + }, + { + "epoch": 0.597080146985798, + "grad_norm": 0.44031126237277396, + "learning_rate": 3.6867496817036674e-06, + "loss": 0.0138, + "step": 3006 + }, + { + "epoch": 0.5972787764425465, + "grad_norm": 0.5965406726879514, + "learning_rate": 3.6836453911782244e-06, + "loss": 0.0097, + "step": 3007 + }, + { + "epoch": 0.5974774058992949, + "grad_norm": 0.32200058430663714, + "learning_rate": 3.680541645753908e-06, + "loss": 0.0065, + "step": 3008 + }, + { + "epoch": 0.5976760353560433, + "grad_norm": 0.35025081668798586, + "learning_rate": 3.677438446715974e-06, + "loss": 0.0148, + "step": 3009 + }, + { + "epoch": 0.5978746648127917, + "grad_norm": 0.732734140663941, + "learning_rate": 3.6743357953494554e-06, + "loss": 0.0232, + "step": 3010 + }, + { + "epoch": 0.5980732942695401, + "grad_norm": 0.9147359044098355, + "learning_rate": 3.6712336929391558e-06, + "loss": 0.0171, + "step": 3011 + }, + { + "epoch": 0.5982719237262886, + "grad_norm": 0.47195978016132334, + "learning_rate": 3.6681321407696546e-06, + "loss": 0.0108, + "step": 3012 + }, + { + "epoch": 0.598470553183037, + "grad_norm": 0.3255368283156676, + "learning_rate": 3.665031140125299e-06, + "loss": 0.0082, + "step": 3013 + }, + { + "epoch": 0.5986691826397855, + "grad_norm": 0.4043095691857067, + "learning_rate": 3.661930692290211e-06, + "loss": 0.0111, + "step": 3014 + }, + { + "epoch": 0.598867812096534, + "grad_norm": 0.8606549728317674, + "learning_rate": 3.658830798548284e-06, + "loss": 0.0208, + "step": 3015 + }, + { + "epoch": 0.5990664415532824, + "grad_norm": 0.31791862321187564, + "learning_rate": 3.6557314601831804e-06, + "loss": 0.0075, + "step": 3016 + }, + { + "epoch": 0.5992650710100308, + "grad_norm": 0.61615355943961, + "learning_rate": 3.6526326784783328e-06, + "loss": 0.0188, + "step": 3017 + }, + { + "epoch": 0.5994637004667792, + "grad_norm": 0.3392910567038716, + "learning_rate": 3.649534454716942e-06, + "loss": 0.0071, + "step": 3018 + }, + { + "epoch": 0.5996623299235276, + "grad_norm": 0.3407654018111544, + "learning_rate": 3.646436790181983e-06, + "loss": 0.0091, + "step": 3019 + }, + { + "epoch": 0.5998609593802761, + "grad_norm": 0.43481756324822807, + "learning_rate": 3.643339686156193e-06, + "loss": 0.007, + "step": 3020 + }, + { + "epoch": 0.6000595888370245, + "grad_norm": 0.42659281115467734, + "learning_rate": 3.6402431439220807e-06, + "loss": 0.0081, + "step": 3021 + }, + { + "epoch": 0.600258218293773, + "grad_norm": 0.7240516093008215, + "learning_rate": 3.6371471647619212e-06, + "loss": 0.0172, + "step": 3022 + }, + { + "epoch": 0.6004568477505214, + "grad_norm": 0.2238508808138508, + "learning_rate": 3.6340517499577552e-06, + "loss": 0.0068, + "step": 3023 + }, + { + "epoch": 0.6006554772072699, + "grad_norm": 0.641644301510662, + "learning_rate": 3.6309569007913926e-06, + "loss": 0.0192, + "step": 3024 + }, + { + "epoch": 0.6008541066640183, + "grad_norm": 0.6086067117270952, + "learning_rate": 3.6278626185444043e-06, + "loss": 0.0146, + "step": 3025 + }, + { + "epoch": 0.6010527361207667, + "grad_norm": 0.562268484312575, + "learning_rate": 3.624768904498133e-06, + "loss": 0.0127, + "step": 3026 + }, + { + "epoch": 0.6012513655775151, + "grad_norm": 0.5503772462099394, + "learning_rate": 3.6216757599336817e-06, + "loss": 0.0122, + "step": 3027 + }, + { + "epoch": 0.6014499950342636, + "grad_norm": 0.3503335156258801, + "learning_rate": 3.6185831861319175e-06, + "loss": 0.0096, + "step": 3028 + }, + { + "epoch": 0.601648624491012, + "grad_norm": 0.4135501081556177, + "learning_rate": 3.6154911843734726e-06, + "loss": 0.0098, + "step": 3029 + }, + { + "epoch": 0.6018472539477605, + "grad_norm": 0.3319077445993011, + "learning_rate": 3.612399755938741e-06, + "loss": 0.0095, + "step": 3030 + }, + { + "epoch": 0.6020458834045089, + "grad_norm": 0.4357565471543654, + "learning_rate": 3.609308902107882e-06, + "loss": 0.0093, + "step": 3031 + }, + { + "epoch": 0.6022445128612574, + "grad_norm": 0.3754875478839089, + "learning_rate": 3.6062186241608127e-06, + "loss": 0.0067, + "step": 3032 + }, + { + "epoch": 0.6024431423180058, + "grad_norm": 0.4655009047495604, + "learning_rate": 3.603128923377216e-06, + "loss": 0.0104, + "step": 3033 + }, + { + "epoch": 0.6026417717747542, + "grad_norm": 0.37526412933260067, + "learning_rate": 3.6000398010365335e-06, + "loss": 0.0102, + "step": 3034 + }, + { + "epoch": 0.6028404012315026, + "grad_norm": 0.37466649162476384, + "learning_rate": 3.5969512584179676e-06, + "loss": 0.0062, + "step": 3035 + }, + { + "epoch": 0.603039030688251, + "grad_norm": 0.6376237051267966, + "learning_rate": 3.5938632968004816e-06, + "loss": 0.007, + "step": 3036 + }, + { + "epoch": 0.6032376601449995, + "grad_norm": 0.5472784553576067, + "learning_rate": 3.590775917462795e-06, + "loss": 0.0087, + "step": 3037 + }, + { + "epoch": 0.6034362896017479, + "grad_norm": 0.35009623082706764, + "learning_rate": 3.5876891216833898e-06, + "loss": 0.0111, + "step": 3038 + }, + { + "epoch": 0.6036349190584964, + "grad_norm": 0.46974213511879326, + "learning_rate": 3.5846029107405043e-06, + "loss": 0.0095, + "step": 3039 + }, + { + "epoch": 0.6038335485152448, + "grad_norm": 0.7485818959883949, + "learning_rate": 3.581517285912137e-06, + "loss": 0.0173, + "step": 3040 + }, + { + "epoch": 0.6040321779719933, + "grad_norm": 0.38815346960654723, + "learning_rate": 3.578432248476041e-06, + "loss": 0.0086, + "step": 3041 + }, + { + "epoch": 0.6042308074287417, + "grad_norm": 0.5263522252501535, + "learning_rate": 3.5753477997097274e-06, + "loss": 0.0089, + "step": 3042 + }, + { + "epoch": 0.6044294368854901, + "grad_norm": 0.5860186189203216, + "learning_rate": 3.5722639408904613e-06, + "loss": 0.0121, + "step": 3043 + }, + { + "epoch": 0.6046280663422385, + "grad_norm": 0.5281102457455624, + "learning_rate": 3.5691806732952695e-06, + "loss": 0.0121, + "step": 3044 + }, + { + "epoch": 0.604826695798987, + "grad_norm": 0.6500090193718753, + "learning_rate": 3.5660979982009283e-06, + "loss": 0.0136, + "step": 3045 + }, + { + "epoch": 0.6050253252557354, + "grad_norm": 0.39939760825863435, + "learning_rate": 3.563015916883969e-06, + "loss": 0.0085, + "step": 3046 + }, + { + "epoch": 0.6052239547124839, + "grad_norm": 0.347841392980447, + "learning_rate": 3.5599344306206797e-06, + "loss": 0.0106, + "step": 3047 + }, + { + "epoch": 0.6054225841692323, + "grad_norm": 0.9314772406942418, + "learning_rate": 3.5568535406871006e-06, + "loss": 0.0122, + "step": 3048 + }, + { + "epoch": 0.6056212136259808, + "grad_norm": 0.4754175168943589, + "learning_rate": 3.553773248359026e-06, + "loss": 0.0114, + "step": 3049 + }, + { + "epoch": 0.6058198430827292, + "grad_norm": 0.3708403797175746, + "learning_rate": 3.5506935549119994e-06, + "loss": 0.0091, + "step": 3050 + }, + { + "epoch": 0.6060184725394776, + "grad_norm": 1.7346629552296347, + "learning_rate": 3.547614461621321e-06, + "loss": 0.013, + "step": 3051 + }, + { + "epoch": 0.606217101996226, + "grad_norm": 0.40695318664902086, + "learning_rate": 3.5445359697620396e-06, + "loss": 0.0092, + "step": 3052 + }, + { + "epoch": 0.6064157314529744, + "grad_norm": 0.7125792193716718, + "learning_rate": 3.541458080608956e-06, + "loss": 0.0119, + "step": 3053 + }, + { + "epoch": 0.6066143609097229, + "grad_norm": 0.6915939424187886, + "learning_rate": 3.5383807954366207e-06, + "loss": 0.0168, + "step": 3054 + }, + { + "epoch": 0.6068129903664713, + "grad_norm": 1.730306247282683, + "learning_rate": 3.5353041155193333e-06, + "loss": 0.0073, + "step": 3055 + }, + { + "epoch": 0.6070116198232198, + "grad_norm": 0.438350129352838, + "learning_rate": 3.5322280421311462e-06, + "loss": 0.0095, + "step": 3056 + }, + { + "epoch": 0.6072102492799683, + "grad_norm": 1.0041964816668, + "learning_rate": 3.5291525765458555e-06, + "loss": 0.0138, + "step": 3057 + }, + { + "epoch": 0.6074088787367167, + "grad_norm": 0.48577362615681124, + "learning_rate": 3.5260777200370108e-06, + "loss": 0.0109, + "step": 3058 + }, + { + "epoch": 0.6076075081934651, + "grad_norm": 0.5217130396983579, + "learning_rate": 3.5230034738779062e-06, + "loss": 0.0166, + "step": 3059 + }, + { + "epoch": 0.6078061376502135, + "grad_norm": 0.519867329149172, + "learning_rate": 3.519929839341586e-06, + "loss": 0.0135, + "step": 3060 + }, + { + "epoch": 0.6080047671069619, + "grad_norm": 0.5343544411448541, + "learning_rate": 3.5168568177008343e-06, + "loss": 0.0121, + "step": 3061 + }, + { + "epoch": 0.6082033965637104, + "grad_norm": 0.5921739219433378, + "learning_rate": 3.51378441022819e-06, + "loss": 0.0217, + "step": 3062 + }, + { + "epoch": 0.6084020260204588, + "grad_norm": 0.4319692092553288, + "learning_rate": 3.5107126181959326e-06, + "loss": 0.0087, + "step": 3063 + }, + { + "epoch": 0.6086006554772073, + "grad_norm": 0.7628603176542459, + "learning_rate": 3.507641442876089e-06, + "loss": 0.0079, + "step": 3064 + }, + { + "epoch": 0.6087992849339557, + "grad_norm": 0.5049220741692332, + "learning_rate": 3.50457088554043e-06, + "loss": 0.0047, + "step": 3065 + }, + { + "epoch": 0.6089979143907042, + "grad_norm": 1.412081883923703, + "learning_rate": 3.5015009474604687e-06, + "loss": 0.0155, + "step": 3066 + }, + { + "epoch": 0.6091965438474526, + "grad_norm": 1.8218808527341421, + "learning_rate": 3.498431629907465e-06, + "loss": 0.0156, + "step": 3067 + }, + { + "epoch": 0.609395173304201, + "grad_norm": 0.3862970491125843, + "learning_rate": 3.4953629341524185e-06, + "loss": 0.0083, + "step": 3068 + }, + { + "epoch": 0.6095938027609494, + "grad_norm": 0.5034340958370394, + "learning_rate": 3.4922948614660755e-06, + "loss": 0.0106, + "step": 3069 + }, + { + "epoch": 0.6097924322176979, + "grad_norm": 0.6806056800334587, + "learning_rate": 3.4892274131189203e-06, + "loss": 0.0091, + "step": 3070 + }, + { + "epoch": 0.6099910616744463, + "grad_norm": 1.6108884326279738, + "learning_rate": 3.4861605903811802e-06, + "loss": 0.0169, + "step": 3071 + }, + { + "epoch": 0.6101896911311948, + "grad_norm": 0.4012224150673856, + "learning_rate": 3.4830943945228243e-06, + "loss": 0.0086, + "step": 3072 + }, + { + "epoch": 0.6103883205879432, + "grad_norm": 0.5349335803458497, + "learning_rate": 3.4800288268135598e-06, + "loss": 0.0174, + "step": 3073 + }, + { + "epoch": 0.6105869500446917, + "grad_norm": 0.4841940483295753, + "learning_rate": 3.4769638885228364e-06, + "loss": 0.0168, + "step": 3074 + }, + { + "epoch": 0.6107855795014401, + "grad_norm": 0.6441740428833413, + "learning_rate": 3.47389958091984e-06, + "loss": 0.015, + "step": 3075 + }, + { + "epoch": 0.6109842089581885, + "grad_norm": 0.357406903865928, + "learning_rate": 3.4708359052735006e-06, + "loss": 0.0082, + "step": 3076 + }, + { + "epoch": 0.6111828384149369, + "grad_norm": 0.5834140528899641, + "learning_rate": 3.4677728628524807e-06, + "loss": 0.0115, + "step": 3077 + }, + { + "epoch": 0.6113814678716853, + "grad_norm": 0.4928712484829136, + "learning_rate": 3.464710454925184e-06, + "loss": 0.0132, + "step": 3078 + }, + { + "epoch": 0.6115800973284338, + "grad_norm": 0.5979492434923857, + "learning_rate": 3.461648682759752e-06, + "loss": 0.0095, + "step": 3079 + }, + { + "epoch": 0.6117787267851822, + "grad_norm": 0.5154017529353565, + "learning_rate": 3.4585875476240584e-06, + "loss": 0.0131, + "step": 3080 + }, + { + "epoch": 0.6119773562419307, + "grad_norm": 1.054941339463001, + "learning_rate": 3.4555270507857174e-06, + "loss": 0.0082, + "step": 3081 + }, + { + "epoch": 0.6121759856986791, + "grad_norm": 0.7383146639599456, + "learning_rate": 3.452467193512078e-06, + "loss": 0.0086, + "step": 3082 + }, + { + "epoch": 0.6123746151554276, + "grad_norm": 0.46223660532581257, + "learning_rate": 3.449407977070225e-06, + "loss": 0.008, + "step": 3083 + }, + { + "epoch": 0.612573244612176, + "grad_norm": 0.6168524172398883, + "learning_rate": 3.4463494027269772e-06, + "loss": 0.0111, + "step": 3084 + }, + { + "epoch": 0.6127718740689244, + "grad_norm": 0.443610681926009, + "learning_rate": 3.443291471748884e-06, + "loss": 0.0088, + "step": 3085 + }, + { + "epoch": 0.6129705035256728, + "grad_norm": 0.42211292893459496, + "learning_rate": 3.4402341854022326e-06, + "loss": 0.0115, + "step": 3086 + }, + { + "epoch": 0.6131691329824213, + "grad_norm": 0.38959656912953755, + "learning_rate": 3.4371775449530444e-06, + "loss": 0.0118, + "step": 3087 + }, + { + "epoch": 0.6133677624391697, + "grad_norm": 0.482420771604837, + "learning_rate": 3.434121551667069e-06, + "loss": 0.0124, + "step": 3088 + }, + { + "epoch": 0.6135663918959182, + "grad_norm": 0.9713114677647458, + "learning_rate": 3.4310662068097915e-06, + "loss": 0.0177, + "step": 3089 + }, + { + "epoch": 0.6137650213526666, + "grad_norm": 0.41367064378744073, + "learning_rate": 3.4280115116464263e-06, + "loss": 0.0137, + "step": 3090 + }, + { + "epoch": 0.6139636508094151, + "grad_norm": 0.3784820028429733, + "learning_rate": 3.4249574674419206e-06, + "loss": 0.0163, + "step": 3091 + }, + { + "epoch": 0.6141622802661635, + "grad_norm": 0.4813484208222651, + "learning_rate": 3.4219040754609497e-06, + "loss": 0.0141, + "step": 3092 + }, + { + "epoch": 0.6143609097229119, + "grad_norm": 0.28494432612150494, + "learning_rate": 3.41885133696792e-06, + "loss": 0.0054, + "step": 3093 + }, + { + "epoch": 0.6145595391796603, + "grad_norm": 0.5830609541475335, + "learning_rate": 3.415799253226969e-06, + "loss": 0.0095, + "step": 3094 + }, + { + "epoch": 0.6147581686364088, + "grad_norm": 0.32540607111091596, + "learning_rate": 3.4127478255019607e-06, + "loss": 0.0107, + "step": 3095 + }, + { + "epoch": 0.6149567980931572, + "grad_norm": 0.35280809057789947, + "learning_rate": 3.409697055056489e-06, + "loss": 0.0105, + "step": 3096 + }, + { + "epoch": 0.6151554275499056, + "grad_norm": 0.7711115094509354, + "learning_rate": 3.406646943153874e-06, + "loss": 0.0131, + "step": 3097 + }, + { + "epoch": 0.6153540570066541, + "grad_norm": 0.6585705185875225, + "learning_rate": 3.4035974910571635e-06, + "loss": 0.0089, + "step": 3098 + }, + { + "epoch": 0.6155526864634026, + "grad_norm": 0.8014653930270288, + "learning_rate": 3.4005487000291336e-06, + "loss": 0.0094, + "step": 3099 + }, + { + "epoch": 0.615751315920151, + "grad_norm": 0.6004322261525369, + "learning_rate": 3.3975005713322852e-06, + "loss": 0.0183, + "step": 3100 + }, + { + "epoch": 0.6159499453768994, + "grad_norm": 0.5039813081737613, + "learning_rate": 3.3944531062288456e-06, + "loss": 0.0117, + "step": 3101 + }, + { + "epoch": 0.6161485748336478, + "grad_norm": 0.32998773181787877, + "learning_rate": 3.391406305980767e-06, + "loss": 0.0072, + "step": 3102 + }, + { + "epoch": 0.6163472042903962, + "grad_norm": 0.39531334946742824, + "learning_rate": 3.388360171849726e-06, + "loss": 0.0131, + "step": 3103 + }, + { + "epoch": 0.6165458337471447, + "grad_norm": 1.4423036226921424, + "learning_rate": 3.3853147050971245e-06, + "loss": 0.028, + "step": 3104 + }, + { + "epoch": 0.6167444632038931, + "grad_norm": 0.6108617381540014, + "learning_rate": 3.382269906984086e-06, + "loss": 0.0083, + "step": 3105 + }, + { + "epoch": 0.6169430926606416, + "grad_norm": 0.5450037541557885, + "learning_rate": 3.3792257787714593e-06, + "loss": 0.0192, + "step": 3106 + }, + { + "epoch": 0.61714172211739, + "grad_norm": 1.7949375217276946, + "learning_rate": 3.376182321719813e-06, + "loss": 0.0241, + "step": 3107 + }, + { + "epoch": 0.6173403515741385, + "grad_norm": 0.47593830021557054, + "learning_rate": 3.3731395370894447e-06, + "loss": 0.0137, + "step": 3108 + }, + { + "epoch": 0.6175389810308869, + "grad_norm": 0.5799733370382245, + "learning_rate": 3.370097426140363e-06, + "loss": 0.0119, + "step": 3109 + }, + { + "epoch": 0.6177376104876353, + "grad_norm": 0.4307995841914232, + "learning_rate": 3.3670559901323054e-06, + "loss": 0.0081, + "step": 3110 + }, + { + "epoch": 0.6179362399443837, + "grad_norm": 1.0801970697450218, + "learning_rate": 3.364015230324725e-06, + "loss": 0.0237, + "step": 3111 + }, + { + "epoch": 0.6181348694011322, + "grad_norm": 0.4367611467960593, + "learning_rate": 3.3609751479768003e-06, + "loss": 0.0098, + "step": 3112 + }, + { + "epoch": 0.6183334988578806, + "grad_norm": 0.3381363192748383, + "learning_rate": 3.3579357443474264e-06, + "loss": 0.0083, + "step": 3113 + }, + { + "epoch": 0.6185321283146291, + "grad_norm": 0.31029202975354897, + "learning_rate": 3.354897020695216e-06, + "loss": 0.0069, + "step": 3114 + }, + { + "epoch": 0.6187307577713775, + "grad_norm": 0.36141553067284127, + "learning_rate": 3.3518589782785016e-06, + "loss": 0.0053, + "step": 3115 + }, + { + "epoch": 0.618929387228126, + "grad_norm": 0.7205196066259474, + "learning_rate": 3.348821618355334e-06, + "loss": 0.0174, + "step": 3116 + }, + { + "epoch": 0.6191280166848744, + "grad_norm": 0.392173406386398, + "learning_rate": 3.345784942183481e-06, + "loss": 0.0118, + "step": 3117 + }, + { + "epoch": 0.6193266461416228, + "grad_norm": 0.3923813550404335, + "learning_rate": 3.342748951020425e-06, + "loss": 0.0072, + "step": 3118 + }, + { + "epoch": 0.6195252755983712, + "grad_norm": 0.9318581326956561, + "learning_rate": 3.3397136461233705e-06, + "loss": 0.0158, + "step": 3119 + }, + { + "epoch": 0.6197239050551197, + "grad_norm": 0.4305556661534459, + "learning_rate": 3.3366790287492323e-06, + "loss": 0.01, + "step": 3120 + }, + { + "epoch": 0.6199225345118681, + "grad_norm": 0.494409947487882, + "learning_rate": 3.3336451001546422e-06, + "loss": 0.0114, + "step": 3121 + }, + { + "epoch": 0.6201211639686165, + "grad_norm": 0.41455988706859453, + "learning_rate": 3.3306118615959483e-06, + "loss": 0.006, + "step": 3122 + }, + { + "epoch": 0.620319793425365, + "grad_norm": 0.7803252545572082, + "learning_rate": 3.32757931432921e-06, + "loss": 0.0181, + "step": 3123 + }, + { + "epoch": 0.6205184228821135, + "grad_norm": 0.5185588948320111, + "learning_rate": 3.3245474596102034e-06, + "loss": 0.0094, + "step": 3124 + }, + { + "epoch": 0.6207170523388619, + "grad_norm": 0.35619054220848395, + "learning_rate": 3.3215162986944145e-06, + "loss": 0.0169, + "step": 3125 + }, + { + "epoch": 0.6209156817956103, + "grad_norm": 0.6897006764622771, + "learning_rate": 3.3184858328370464e-06, + "loss": 0.0112, + "step": 3126 + }, + { + "epoch": 0.6211143112523587, + "grad_norm": 0.579124162879879, + "learning_rate": 3.315456063293011e-06, + "loss": 0.0161, + "step": 3127 + }, + { + "epoch": 0.6213129407091071, + "grad_norm": 0.45691926633221563, + "learning_rate": 3.312426991316933e-06, + "loss": 0.0079, + "step": 3128 + }, + { + "epoch": 0.6215115701658556, + "grad_norm": 0.3550439857141177, + "learning_rate": 3.309398618163148e-06, + "loss": 0.01, + "step": 3129 + }, + { + "epoch": 0.621710199622604, + "grad_norm": 0.4261114797732587, + "learning_rate": 3.306370945085702e-06, + "loss": 0.0093, + "step": 3130 + }, + { + "epoch": 0.6219088290793525, + "grad_norm": 0.509512632829503, + "learning_rate": 3.303343973338352e-06, + "loss": 0.0149, + "step": 3131 + }, + { + "epoch": 0.6221074585361009, + "grad_norm": 0.33665150006855243, + "learning_rate": 3.3003177041745644e-06, + "loss": 0.0044, + "step": 3132 + }, + { + "epoch": 0.6223060879928494, + "grad_norm": 0.6093443769153506, + "learning_rate": 3.297292138847512e-06, + "loss": 0.0139, + "step": 3133 + }, + { + "epoch": 0.6225047174495978, + "grad_norm": 0.6356168599910234, + "learning_rate": 3.2942672786100806e-06, + "loss": 0.0119, + "step": 3134 + }, + { + "epoch": 0.6227033469063462, + "grad_norm": 0.4716855426406557, + "learning_rate": 3.2912431247148603e-06, + "loss": 0.0123, + "step": 3135 + }, + { + "epoch": 0.6229019763630946, + "grad_norm": 0.26872765852592456, + "learning_rate": 3.2882196784141497e-06, + "loss": 0.0087, + "step": 3136 + }, + { + "epoch": 0.6231006058198431, + "grad_norm": 0.9289996201648486, + "learning_rate": 3.285196940959957e-06, + "loss": 0.0128, + "step": 3137 + }, + { + "epoch": 0.6232992352765915, + "grad_norm": 0.5778170278567689, + "learning_rate": 3.2821749136039947e-06, + "loss": 0.015, + "step": 3138 + }, + { + "epoch": 0.6234978647333399, + "grad_norm": 0.44484550235368014, + "learning_rate": 3.2791535975976796e-06, + "loss": 0.0112, + "step": 3139 + }, + { + "epoch": 0.6236964941900884, + "grad_norm": 0.5650552894350845, + "learning_rate": 3.276132994192137e-06, + "loss": 0.0168, + "step": 3140 + }, + { + "epoch": 0.6238951236468369, + "grad_norm": 0.45586647670638364, + "learning_rate": 3.2731131046381946e-06, + "loss": 0.0165, + "step": 3141 + }, + { + "epoch": 0.6240937531035853, + "grad_norm": 0.3991031664463805, + "learning_rate": 3.2700939301863867e-06, + "loss": 0.0088, + "step": 3142 + }, + { + "epoch": 0.6242923825603337, + "grad_norm": 0.5441221777716391, + "learning_rate": 3.2670754720869483e-06, + "loss": 0.0091, + "step": 3143 + }, + { + "epoch": 0.6244910120170821, + "grad_norm": 0.4508262833394596, + "learning_rate": 3.2640577315898232e-06, + "loss": 0.0101, + "step": 3144 + }, + { + "epoch": 0.6246896414738305, + "grad_norm": 0.5522463595883214, + "learning_rate": 3.261040709944653e-06, + "loss": 0.0199, + "step": 3145 + }, + { + "epoch": 0.624888270930579, + "grad_norm": 0.5241449671995597, + "learning_rate": 3.258024408400783e-06, + "loss": 0.0155, + "step": 3146 + }, + { + "epoch": 0.6250869003873274, + "grad_norm": 0.5548789843023203, + "learning_rate": 3.2550088282072614e-06, + "loss": 0.0109, + "step": 3147 + }, + { + "epoch": 0.6252855298440759, + "grad_norm": 1.0504625440702693, + "learning_rate": 3.2519939706128354e-06, + "loss": 0.019, + "step": 3148 + }, + { + "epoch": 0.6254841593008243, + "grad_norm": 0.4102947730181052, + "learning_rate": 3.2489798368659568e-06, + "loss": 0.0106, + "step": 3149 + }, + { + "epoch": 0.6256827887575728, + "grad_norm": 0.6501870965913413, + "learning_rate": 3.245966428214773e-06, + "loss": 0.0102, + "step": 3150 + }, + { + "epoch": 0.6258814182143212, + "grad_norm": 0.542258565431279, + "learning_rate": 3.242953745907136e-06, + "loss": 0.0164, + "step": 3151 + }, + { + "epoch": 0.6260800476710696, + "grad_norm": 0.43709875345405597, + "learning_rate": 3.2399417911905928e-06, + "loss": 0.0099, + "step": 3152 + }, + { + "epoch": 0.626278677127818, + "grad_norm": 0.4169502805419377, + "learning_rate": 3.2369305653123918e-06, + "loss": 0.0161, + "step": 3153 + }, + { + "epoch": 0.6264773065845665, + "grad_norm": 0.7203211824671325, + "learning_rate": 3.2339200695194776e-06, + "loss": 0.0132, + "step": 3154 + }, + { + "epoch": 0.6266759360413149, + "grad_norm": 0.6337861054133493, + "learning_rate": 3.2309103050584943e-06, + "loss": 0.0155, + "step": 3155 + }, + { + "epoch": 0.6268745654980633, + "grad_norm": 0.5461213073528943, + "learning_rate": 3.227901273175783e-06, + "loss": 0.018, + "step": 3156 + }, + { + "epoch": 0.6270731949548118, + "grad_norm": 0.4288994784155615, + "learning_rate": 3.2248929751173796e-06, + "loss": 0.013, + "step": 3157 + }, + { + "epoch": 0.6272718244115603, + "grad_norm": 0.5336781946873922, + "learning_rate": 3.2218854121290167e-06, + "loss": 0.0125, + "step": 3158 + }, + { + "epoch": 0.6274704538683087, + "grad_norm": 0.39411430837415473, + "learning_rate": 3.218878585456124e-06, + "loss": 0.0155, + "step": 3159 + }, + { + "epoch": 0.6276690833250571, + "grad_norm": 0.7372205200276526, + "learning_rate": 3.215872496343826e-06, + "loss": 0.0109, + "step": 3160 + }, + { + "epoch": 0.6278677127818055, + "grad_norm": 0.31051639973025613, + "learning_rate": 3.212867146036939e-06, + "loss": 0.0112, + "step": 3161 + }, + { + "epoch": 0.628066342238554, + "grad_norm": 0.7517003368889213, + "learning_rate": 3.2098625357799777e-06, + "loss": 0.0152, + "step": 3162 + }, + { + "epoch": 0.6282649716953024, + "grad_norm": 0.4493855979835352, + "learning_rate": 3.2068586668171487e-06, + "loss": 0.0128, + "step": 3163 + }, + { + "epoch": 0.6284636011520508, + "grad_norm": 0.43259693880401673, + "learning_rate": 3.2038555403923495e-06, + "loss": 0.0105, + "step": 3164 + }, + { + "epoch": 0.6286622306087993, + "grad_norm": 0.6757306391799492, + "learning_rate": 3.2008531577491726e-06, + "loss": 0.015, + "step": 3165 + }, + { + "epoch": 0.6288608600655478, + "grad_norm": 0.7200622315806112, + "learning_rate": 3.197851520130901e-06, + "loss": 0.0127, + "step": 3166 + }, + { + "epoch": 0.6290594895222962, + "grad_norm": 0.30945546413393143, + "learning_rate": 3.1948506287805105e-06, + "loss": 0.0071, + "step": 3167 + }, + { + "epoch": 0.6292581189790446, + "grad_norm": 0.5184927164257134, + "learning_rate": 3.1918504849406655e-06, + "loss": 0.013, + "step": 3168 + }, + { + "epoch": 0.629456748435793, + "grad_norm": 0.37944118777802155, + "learning_rate": 3.188851089853725e-06, + "loss": 0.0121, + "step": 3169 + }, + { + "epoch": 0.6296553778925414, + "grad_norm": 0.6108105036736555, + "learning_rate": 3.185852444761735e-06, + "loss": 0.0175, + "step": 3170 + }, + { + "epoch": 0.6298540073492899, + "grad_norm": 0.45049388301001664, + "learning_rate": 3.182854550906431e-06, + "loss": 0.0119, + "step": 3171 + }, + { + "epoch": 0.6300526368060383, + "grad_norm": 0.5933853421541697, + "learning_rate": 3.1798574095292366e-06, + "loss": 0.0153, + "step": 3172 + }, + { + "epoch": 0.6302512662627868, + "grad_norm": 0.3057531950463691, + "learning_rate": 3.176861021871267e-06, + "loss": 0.0109, + "step": 3173 + }, + { + "epoch": 0.6304498957195352, + "grad_norm": 0.3058524461492556, + "learning_rate": 3.1738653891733228e-06, + "loss": 0.0102, + "step": 3174 + }, + { + "epoch": 0.6306485251762837, + "grad_norm": 0.297644692305566, + "learning_rate": 3.170870512675891e-06, + "loss": 0.0083, + "step": 3175 + }, + { + "epoch": 0.6308471546330321, + "grad_norm": 0.41503730094845476, + "learning_rate": 3.1678763936191493e-06, + "loss": 0.0071, + "step": 3176 + }, + { + "epoch": 0.6310457840897805, + "grad_norm": 0.43732569514354086, + "learning_rate": 3.1648830332429576e-06, + "loss": 0.0149, + "step": 3177 + }, + { + "epoch": 0.6312444135465289, + "grad_norm": 0.4053359605257468, + "learning_rate": 3.161890432786864e-06, + "loss": 0.0119, + "step": 3178 + }, + { + "epoch": 0.6314430430032774, + "grad_norm": 0.4018745491695557, + "learning_rate": 3.1588985934901024e-06, + "loss": 0.0102, + "step": 3179 + }, + { + "epoch": 0.6316416724600258, + "grad_norm": 0.7769830143664423, + "learning_rate": 3.1559075165915897e-06, + "loss": 0.0141, + "step": 3180 + }, + { + "epoch": 0.6318403019167742, + "grad_norm": 0.4333501607393468, + "learning_rate": 3.152917203329927e-06, + "loss": 0.0073, + "step": 3181 + }, + { + "epoch": 0.6320389313735227, + "grad_norm": 0.3780998689723311, + "learning_rate": 3.149927654943401e-06, + "loss": 0.0101, + "step": 3182 + }, + { + "epoch": 0.6322375608302712, + "grad_norm": 0.5924510687322293, + "learning_rate": 3.14693887266998e-06, + "loss": 0.015, + "step": 3183 + }, + { + "epoch": 0.6324361902870196, + "grad_norm": 0.7703336171257334, + "learning_rate": 3.143950857747316e-06, + "loss": 0.009, + "step": 3184 + }, + { + "epoch": 0.632634819743768, + "grad_norm": 0.47579749826164863, + "learning_rate": 3.1409636114127434e-06, + "loss": 0.0103, + "step": 3185 + }, + { + "epoch": 0.6328334492005164, + "grad_norm": 0.5329585694417279, + "learning_rate": 3.137977134903276e-06, + "loss": 0.015, + "step": 3186 + }, + { + "epoch": 0.6330320786572649, + "grad_norm": 0.3926580971839605, + "learning_rate": 3.1349914294556146e-06, + "loss": 0.0122, + "step": 3187 + }, + { + "epoch": 0.6332307081140133, + "grad_norm": 0.7331321468927112, + "learning_rate": 3.1320064963061335e-06, + "loss": 0.0179, + "step": 3188 + }, + { + "epoch": 0.6334293375707617, + "grad_norm": 0.7309606038405821, + "learning_rate": 3.1290223366908923e-06, + "loss": 0.0097, + "step": 3189 + }, + { + "epoch": 0.6336279670275102, + "grad_norm": 0.40442647112520747, + "learning_rate": 3.1260389518456275e-06, + "loss": 0.0081, + "step": 3190 + }, + { + "epoch": 0.6338265964842587, + "grad_norm": 0.23396485229178887, + "learning_rate": 3.123056343005756e-06, + "loss": 0.0042, + "step": 3191 + }, + { + "epoch": 0.6340252259410071, + "grad_norm": 0.31067689461138565, + "learning_rate": 3.1200745114063733e-06, + "loss": 0.0068, + "step": 3192 + }, + { + "epoch": 0.6342238553977555, + "grad_norm": 0.4863770217084106, + "learning_rate": 3.117093458282252e-06, + "loss": 0.0082, + "step": 3193 + }, + { + "epoch": 0.6344224848545039, + "grad_norm": 0.6185800085213491, + "learning_rate": 3.1141131848678453e-06, + "loss": 0.0113, + "step": 3194 + }, + { + "epoch": 0.6346211143112523, + "grad_norm": 0.4166395787409532, + "learning_rate": 3.111133692397279e-06, + "loss": 0.0072, + "step": 3195 + }, + { + "epoch": 0.6348197437680008, + "grad_norm": 0.6441347162448549, + "learning_rate": 3.10815498210436e-06, + "loss": 0.0191, + "step": 3196 + }, + { + "epoch": 0.6350183732247492, + "grad_norm": 0.5648143884299345, + "learning_rate": 3.105177055222569e-06, + "loss": 0.0188, + "step": 3197 + }, + { + "epoch": 0.6352170026814976, + "grad_norm": 0.2983740379639698, + "learning_rate": 3.102199912985061e-06, + "loss": 0.0074, + "step": 3198 + }, + { + "epoch": 0.6354156321382461, + "grad_norm": 0.4602927312462115, + "learning_rate": 3.099223556624669e-06, + "loss": 0.0139, + "step": 3199 + }, + { + "epoch": 0.6356142615949946, + "grad_norm": 0.697211094443911, + "learning_rate": 3.096247987373897e-06, + "loss": 0.0155, + "step": 3200 + }, + { + "epoch": 0.635812891051743, + "grad_norm": 0.7485575048841167, + "learning_rate": 3.0932732064649284e-06, + "loss": 0.016, + "step": 3201 + }, + { + "epoch": 0.6360115205084914, + "grad_norm": 0.78529100896383, + "learning_rate": 3.0902992151296156e-06, + "loss": 0.0226, + "step": 3202 + }, + { + "epoch": 0.6362101499652398, + "grad_norm": 0.41085026543693437, + "learning_rate": 3.0873260145994857e-06, + "loss": 0.0112, + "step": 3203 + }, + { + "epoch": 0.6364087794219883, + "grad_norm": 0.6494964040575566, + "learning_rate": 3.0843536061057378e-06, + "loss": 0.0121, + "step": 3204 + }, + { + "epoch": 0.6366074088787367, + "grad_norm": 0.6702929678649525, + "learning_rate": 3.081381990879243e-06, + "loss": 0.0129, + "step": 3205 + }, + { + "epoch": 0.6368060383354851, + "grad_norm": 0.6836348848424111, + "learning_rate": 3.078411170150545e-06, + "loss": 0.0116, + "step": 3206 + }, + { + "epoch": 0.6370046677922336, + "grad_norm": 0.393253021137455, + "learning_rate": 3.0754411451498557e-06, + "loss": 0.0145, + "step": 3207 + }, + { + "epoch": 0.6372032972489821, + "grad_norm": 1.4739247811366072, + "learning_rate": 3.0724719171070615e-06, + "loss": 0.0185, + "step": 3208 + }, + { + "epoch": 0.6374019267057305, + "grad_norm": 0.4801500029884138, + "learning_rate": 3.0695034872517166e-06, + "loss": 0.0075, + "step": 3209 + }, + { + "epoch": 0.6376005561624789, + "grad_norm": 0.7278188114507573, + "learning_rate": 3.066535856813044e-06, + "loss": 0.0134, + "step": 3210 + }, + { + "epoch": 0.6377991856192273, + "grad_norm": 1.2241754299310377, + "learning_rate": 3.063569027019936e-06, + "loss": 0.0163, + "step": 3211 + }, + { + "epoch": 0.6379978150759757, + "grad_norm": 0.2911942180277458, + "learning_rate": 3.0606029991009557e-06, + "loss": 0.0116, + "step": 3212 + }, + { + "epoch": 0.6381964445327242, + "grad_norm": 0.19274138226813456, + "learning_rate": 3.057637774284331e-06, + "loss": 0.0055, + "step": 3213 + }, + { + "epoch": 0.6383950739894726, + "grad_norm": 0.37853841414052497, + "learning_rate": 3.0546733537979588e-06, + "loss": 0.0141, + "step": 3214 + }, + { + "epoch": 0.6385937034462211, + "grad_norm": 0.3776259956184919, + "learning_rate": 3.051709738869403e-06, + "loss": 0.0083, + "step": 3215 + }, + { + "epoch": 0.6387923329029696, + "grad_norm": 0.560635040612453, + "learning_rate": 3.048746930725893e-06, + "loss": 0.011, + "step": 3216 + }, + { + "epoch": 0.638990962359718, + "grad_norm": 0.48130658527669296, + "learning_rate": 3.0457849305943256e-06, + "loss": 0.0118, + "step": 3217 + }, + { + "epoch": 0.6391895918164664, + "grad_norm": 0.4771547132379289, + "learning_rate": 3.04282373970126e-06, + "loss": 0.0104, + "step": 3218 + }, + { + "epoch": 0.6393882212732148, + "grad_norm": 0.3460116166064898, + "learning_rate": 3.0398633592729243e-06, + "loss": 0.0071, + "step": 3219 + }, + { + "epoch": 0.6395868507299632, + "grad_norm": 0.5824392990720808, + "learning_rate": 3.0369037905352093e-06, + "loss": 0.0088, + "step": 3220 + }, + { + "epoch": 0.6397854801867117, + "grad_norm": 0.20323244555014405, + "learning_rate": 3.033945034713669e-06, + "loss": 0.0064, + "step": 3221 + }, + { + "epoch": 0.6399841096434601, + "grad_norm": 0.638442739435459, + "learning_rate": 3.0309870930335204e-06, + "loss": 0.0137, + "step": 3222 + }, + { + "epoch": 0.6401827391002085, + "grad_norm": 0.77301437107863, + "learning_rate": 3.0280299667196444e-06, + "loss": 0.0087, + "step": 3223 + }, + { + "epoch": 0.640381368556957, + "grad_norm": 0.7822928728867742, + "learning_rate": 3.0250736569965857e-06, + "loss": 0.0181, + "step": 3224 + }, + { + "epoch": 0.6405799980137055, + "grad_norm": 0.33540923186909954, + "learning_rate": 3.0221181650885454e-06, + "loss": 0.0094, + "step": 3225 + }, + { + "epoch": 0.6407786274704539, + "grad_norm": 0.3931991439665449, + "learning_rate": 3.0191634922193946e-06, + "loss": 0.014, + "step": 3226 + }, + { + "epoch": 0.6409772569272023, + "grad_norm": 0.6751895750801706, + "learning_rate": 3.016209639612657e-06, + "loss": 0.0127, + "step": 3227 + }, + { + "epoch": 0.6411758863839507, + "grad_norm": 0.5026416514430642, + "learning_rate": 3.0132566084915236e-06, + "loss": 0.0133, + "step": 3228 + }, + { + "epoch": 0.6413745158406992, + "grad_norm": 0.8976233584442671, + "learning_rate": 3.0103044000788356e-06, + "loss": 0.0139, + "step": 3229 + }, + { + "epoch": 0.6415731452974476, + "grad_norm": 0.7798871995666934, + "learning_rate": 3.007353015597104e-06, + "loss": 0.0134, + "step": 3230 + }, + { + "epoch": 0.641771774754196, + "grad_norm": 0.5380549093775145, + "learning_rate": 3.0044024562684938e-06, + "loss": 0.0139, + "step": 3231 + }, + { + "epoch": 0.6419704042109445, + "grad_norm": 0.3154555145345817, + "learning_rate": 3.001452723314827e-06, + "loss": 0.0102, + "step": 3232 + }, + { + "epoch": 0.642169033667693, + "grad_norm": 0.3340853456189174, + "learning_rate": 2.998503817957587e-06, + "loss": 0.0058, + "step": 3233 + }, + { + "epoch": 0.6423676631244414, + "grad_norm": 0.2936801248452502, + "learning_rate": 2.9955557414179117e-06, + "loss": 0.0043, + "step": 3234 + }, + { + "epoch": 0.6425662925811898, + "grad_norm": 0.7864359116686, + "learning_rate": 2.9926084949165956e-06, + "loss": 0.0189, + "step": 3235 + }, + { + "epoch": 0.6427649220379382, + "grad_norm": 0.7167406061383401, + "learning_rate": 2.989662079674092e-06, + "loss": 0.0102, + "step": 3236 + }, + { + "epoch": 0.6429635514946866, + "grad_norm": 0.3351720288278042, + "learning_rate": 2.9867164969105073e-06, + "loss": 0.0062, + "step": 3237 + }, + { + "epoch": 0.6431621809514351, + "grad_norm": 0.40908005659181024, + "learning_rate": 2.983771747845606e-06, + "loss": 0.0111, + "step": 3238 + }, + { + "epoch": 0.6433608104081835, + "grad_norm": 0.3277290055237293, + "learning_rate": 2.9808278336988043e-06, + "loss": 0.0073, + "step": 3239 + }, + { + "epoch": 0.6435594398649319, + "grad_norm": 0.5048253802977973, + "learning_rate": 2.9778847556891754e-06, + "loss": 0.0077, + "step": 3240 + }, + { + "epoch": 0.6437580693216804, + "grad_norm": 0.3405168121598885, + "learning_rate": 2.974942515035444e-06, + "loss": 0.0135, + "step": 3241 + }, + { + "epoch": 0.6439566987784289, + "grad_norm": 0.6104144042070537, + "learning_rate": 2.972001112955989e-06, + "loss": 0.0141, + "step": 3242 + }, + { + "epoch": 0.6441553282351773, + "grad_norm": 0.3216837585137305, + "learning_rate": 2.969060550668841e-06, + "loss": 0.0045, + "step": 3243 + }, + { + "epoch": 0.6443539576919257, + "grad_norm": 0.4861048452267066, + "learning_rate": 2.966120829391686e-06, + "loss": 0.0073, + "step": 3244 + }, + { + "epoch": 0.6445525871486741, + "grad_norm": 0.4499553189439919, + "learning_rate": 2.963181950341859e-06, + "loss": 0.0089, + "step": 3245 + }, + { + "epoch": 0.6447512166054226, + "grad_norm": 0.435117209252951, + "learning_rate": 2.9602439147363472e-06, + "loss": 0.0101, + "step": 3246 + }, + { + "epoch": 0.644949846062171, + "grad_norm": 0.5927891171510099, + "learning_rate": 2.957306723791787e-06, + "loss": 0.0067, + "step": 3247 + }, + { + "epoch": 0.6451484755189194, + "grad_norm": 0.37326850014800456, + "learning_rate": 2.9543703787244672e-06, + "loss": 0.0064, + "step": 3248 + }, + { + "epoch": 0.6453471049756679, + "grad_norm": 0.5716478987296174, + "learning_rate": 2.9514348807503248e-06, + "loss": 0.0076, + "step": 3249 + }, + { + "epoch": 0.6455457344324164, + "grad_norm": 0.5615102052694282, + "learning_rate": 2.9485002310849454e-06, + "loss": 0.0106, + "step": 3250 + }, + { + "epoch": 0.6457443638891648, + "grad_norm": 0.47977193647612537, + "learning_rate": 2.9455664309435674e-06, + "loss": 0.0085, + "step": 3251 + }, + { + "epoch": 0.6459429933459132, + "grad_norm": 0.6327506375262189, + "learning_rate": 2.942633481541075e-06, + "loss": 0.0138, + "step": 3252 + }, + { + "epoch": 0.6461416228026616, + "grad_norm": 0.558340339517197, + "learning_rate": 2.9397013840919953e-06, + "loss": 0.0085, + "step": 3253 + }, + { + "epoch": 0.64634025225941, + "grad_norm": 0.6265421103163499, + "learning_rate": 2.9367701398105087e-06, + "loss": 0.0088, + "step": 3254 + }, + { + "epoch": 0.6465388817161585, + "grad_norm": 0.6651718817702527, + "learning_rate": 2.933839749910442e-06, + "loss": 0.011, + "step": 3255 + }, + { + "epoch": 0.6467375111729069, + "grad_norm": 0.7677766935582281, + "learning_rate": 2.930910215605265e-06, + "loss": 0.0126, + "step": 3256 + }, + { + "epoch": 0.6469361406296554, + "grad_norm": 0.6946854991421672, + "learning_rate": 2.9279815381080966e-06, + "loss": 0.018, + "step": 3257 + }, + { + "epoch": 0.6471347700864039, + "grad_norm": 0.9714783676460519, + "learning_rate": 2.9250537186316975e-06, + "loss": 0.0204, + "step": 3258 + }, + { + "epoch": 0.6473333995431523, + "grad_norm": 0.5991115101907825, + "learning_rate": 2.9221267583884762e-06, + "loss": 0.0127, + "step": 3259 + }, + { + "epoch": 0.6475320289999007, + "grad_norm": 0.5265824803297029, + "learning_rate": 2.919200658590483e-06, + "loss": 0.0098, + "step": 3260 + }, + { + "epoch": 0.6477306584566491, + "grad_norm": 0.4157857124333679, + "learning_rate": 2.9162754204494125e-06, + "loss": 0.0074, + "step": 3261 + }, + { + "epoch": 0.6479292879133975, + "grad_norm": 0.5129766838144425, + "learning_rate": 2.913351045176606e-06, + "loss": 0.0135, + "step": 3262 + }, + { + "epoch": 0.648127917370146, + "grad_norm": 0.614863644307128, + "learning_rate": 2.91042753398304e-06, + "loss": 0.0082, + "step": 3263 + }, + { + "epoch": 0.6483265468268944, + "grad_norm": 0.7205082964279731, + "learning_rate": 2.9075048880793395e-06, + "loss": 0.012, + "step": 3264 + }, + { + "epoch": 0.6485251762836428, + "grad_norm": 0.6148645605362121, + "learning_rate": 2.9045831086757716e-06, + "loss": 0.0157, + "step": 3265 + }, + { + "epoch": 0.6487238057403913, + "grad_norm": 1.7983560604746254, + "learning_rate": 2.9016621969822374e-06, + "loss": 0.0122, + "step": 3266 + }, + { + "epoch": 0.6489224351971398, + "grad_norm": 0.219136851702214, + "learning_rate": 2.8987421542082885e-06, + "loss": 0.0034, + "step": 3267 + }, + { + "epoch": 0.6491210646538882, + "grad_norm": 0.6386998265055096, + "learning_rate": 2.8958229815631068e-06, + "loss": 0.0098, + "step": 3268 + }, + { + "epoch": 0.6493196941106366, + "grad_norm": 0.5134293758082206, + "learning_rate": 2.892904680255524e-06, + "loss": 0.012, + "step": 3269 + }, + { + "epoch": 0.649518323567385, + "grad_norm": 0.5345560851450427, + "learning_rate": 2.889987251494e-06, + "loss": 0.0116, + "step": 3270 + }, + { + "epoch": 0.6497169530241335, + "grad_norm": 0.7400191306883878, + "learning_rate": 2.8870706964866436e-06, + "loss": 0.0143, + "step": 3271 + }, + { + "epoch": 0.6499155824808819, + "grad_norm": 0.4635388451227108, + "learning_rate": 2.8841550164411967e-06, + "loss": 0.0131, + "step": 3272 + }, + { + "epoch": 0.6501142119376303, + "grad_norm": 0.5109709585210221, + "learning_rate": 2.881240212565037e-06, + "loss": 0.0051, + "step": 3273 + }, + { + "epoch": 0.6503128413943788, + "grad_norm": 0.7108773032847435, + "learning_rate": 2.878326286065185e-06, + "loss": 0.0139, + "step": 3274 + }, + { + "epoch": 0.6505114708511273, + "grad_norm": 0.6626555461496284, + "learning_rate": 2.8754132381482926e-06, + "loss": 0.0119, + "step": 3275 + }, + { + "epoch": 0.6507101003078757, + "grad_norm": 0.5269547367333896, + "learning_rate": 2.8725010700206514e-06, + "loss": 0.0145, + "step": 3276 + }, + { + "epoch": 0.6509087297646241, + "grad_norm": 0.48445706983464565, + "learning_rate": 2.869589782888187e-06, + "loss": 0.0069, + "step": 3277 + }, + { + "epoch": 0.6511073592213725, + "grad_norm": 0.39715086701308067, + "learning_rate": 2.866679377956458e-06, + "loss": 0.0107, + "step": 3278 + }, + { + "epoch": 0.651305988678121, + "grad_norm": 0.6713147395317994, + "learning_rate": 2.8637698564306637e-06, + "loss": 0.0145, + "step": 3279 + }, + { + "epoch": 0.6515046181348694, + "grad_norm": 0.6460254311609365, + "learning_rate": 2.8608612195156318e-06, + "loss": 0.022, + "step": 3280 + }, + { + "epoch": 0.6517032475916178, + "grad_norm": 0.689742162700817, + "learning_rate": 2.8579534684158277e-06, + "loss": 0.0121, + "step": 3281 + }, + { + "epoch": 0.6519018770483662, + "grad_norm": 0.6959754949888912, + "learning_rate": 2.8550466043353453e-06, + "loss": 0.0175, + "step": 3282 + }, + { + "epoch": 0.6521005065051148, + "grad_norm": 0.47915768218904303, + "learning_rate": 2.852140628477916e-06, + "loss": 0.0153, + "step": 3283 + }, + { + "epoch": 0.6522991359618632, + "grad_norm": 0.6904735505096031, + "learning_rate": 2.849235542046904e-06, + "loss": 0.0146, + "step": 3284 + }, + { + "epoch": 0.6524977654186116, + "grad_norm": 0.8025885992496349, + "learning_rate": 2.846331346245298e-06, + "loss": 0.0239, + "step": 3285 + }, + { + "epoch": 0.65269639487536, + "grad_norm": 0.619059088683545, + "learning_rate": 2.843428042275727e-06, + "loss": 0.0113, + "step": 3286 + }, + { + "epoch": 0.6528950243321084, + "grad_norm": 0.8459065891949454, + "learning_rate": 2.8405256313404417e-06, + "loss": 0.0184, + "step": 3287 + }, + { + "epoch": 0.6530936537888569, + "grad_norm": 0.5453957141251398, + "learning_rate": 2.8376241146413324e-06, + "loss": 0.0132, + "step": 3288 + }, + { + "epoch": 0.6532922832456053, + "grad_norm": 0.377901914882823, + "learning_rate": 2.8347234933799097e-06, + "loss": 0.0104, + "step": 3289 + }, + { + "epoch": 0.6534909127023537, + "grad_norm": 0.38612479544338135, + "learning_rate": 2.831823768757319e-06, + "loss": 0.0108, + "step": 3290 + }, + { + "epoch": 0.6536895421591022, + "grad_norm": 0.7541376084135651, + "learning_rate": 2.8289249419743376e-06, + "loss": 0.0134, + "step": 3291 + }, + { + "epoch": 0.6538881716158507, + "grad_norm": 0.22674881778038922, + "learning_rate": 2.826027014231361e-06, + "loss": 0.0112, + "step": 3292 + }, + { + "epoch": 0.6540868010725991, + "grad_norm": 0.5466667057789234, + "learning_rate": 2.8231299867284228e-06, + "loss": 0.0126, + "step": 3293 + }, + { + "epoch": 0.6542854305293475, + "grad_norm": 0.5346884562381211, + "learning_rate": 2.820233860665175e-06, + "loss": 0.0085, + "step": 3294 + }, + { + "epoch": 0.6544840599860959, + "grad_norm": 0.3347131133980114, + "learning_rate": 2.817338637240905e-06, + "loss": 0.0091, + "step": 3295 + }, + { + "epoch": 0.6546826894428444, + "grad_norm": 0.6898205612146502, + "learning_rate": 2.814444317654518e-06, + "loss": 0.01, + "step": 3296 + }, + { + "epoch": 0.6548813188995928, + "grad_norm": 0.6113056037292642, + "learning_rate": 2.811550903104549e-06, + "loss": 0.0135, + "step": 3297 + }, + { + "epoch": 0.6550799483563412, + "grad_norm": 0.24086210100341407, + "learning_rate": 2.8086583947891623e-06, + "loss": 0.0098, + "step": 3298 + }, + { + "epoch": 0.6552785778130897, + "grad_norm": 0.29528694256953764, + "learning_rate": 2.8057667939061394e-06, + "loss": 0.0091, + "step": 3299 + }, + { + "epoch": 0.6554772072698382, + "grad_norm": 0.32915813162257146, + "learning_rate": 2.8028761016528882e-06, + "loss": 0.0112, + "step": 3300 + }, + { + "epoch": 0.6556758367265866, + "grad_norm": 0.5845599531100432, + "learning_rate": 2.7999863192264453e-06, + "loss": 0.0084, + "step": 3301 + }, + { + "epoch": 0.655874466183335, + "grad_norm": 0.5157143599088475, + "learning_rate": 2.7970974478234626e-06, + "loss": 0.0081, + "step": 3302 + }, + { + "epoch": 0.6560730956400834, + "grad_norm": 0.21072408406022408, + "learning_rate": 2.7942094886402214e-06, + "loss": 0.0057, + "step": 3303 + }, + { + "epoch": 0.6562717250968318, + "grad_norm": 0.4446472530319208, + "learning_rate": 2.7913224428726215e-06, + "loss": 0.0109, + "step": 3304 + }, + { + "epoch": 0.6564703545535803, + "grad_norm": 0.7754716523493385, + "learning_rate": 2.788436311716187e-06, + "loss": 0.0217, + "step": 3305 + }, + { + "epoch": 0.6566689840103287, + "grad_norm": 0.3370976421618834, + "learning_rate": 2.78555109636606e-06, + "loss": 0.01, + "step": 3306 + }, + { + "epoch": 0.6568676134670771, + "grad_norm": 0.5079691302186063, + "learning_rate": 2.7826667980170064e-06, + "loss": 0.0109, + "step": 3307 + }, + { + "epoch": 0.6570662429238256, + "grad_norm": 0.49334190860082194, + "learning_rate": 2.7797834178634124e-06, + "loss": 0.0094, + "step": 3308 + }, + { + "epoch": 0.6572648723805741, + "grad_norm": 0.5546559533772156, + "learning_rate": 2.77690095709928e-06, + "loss": 0.0143, + "step": 3309 + }, + { + "epoch": 0.6574635018373225, + "grad_norm": 0.4708980828292352, + "learning_rate": 2.7740194169182377e-06, + "loss": 0.0122, + "step": 3310 + }, + { + "epoch": 0.6576621312940709, + "grad_norm": 0.7391696704500784, + "learning_rate": 2.771138798513523e-06, + "loss": 0.0113, + "step": 3311 + }, + { + "epoch": 0.6578607607508193, + "grad_norm": 0.7253819426997582, + "learning_rate": 2.7682591030780014e-06, + "loss": 0.0114, + "step": 3312 + }, + { + "epoch": 0.6580593902075678, + "grad_norm": 0.6177374318507232, + "learning_rate": 2.7653803318041495e-06, + "loss": 0.0143, + "step": 3313 + }, + { + "epoch": 0.6582580196643162, + "grad_norm": 0.43170689684000746, + "learning_rate": 2.7625024858840634e-06, + "loss": 0.0082, + "step": 3314 + }, + { + "epoch": 0.6584566491210646, + "grad_norm": 0.4260173502271494, + "learning_rate": 2.7596255665094594e-06, + "loss": 0.0125, + "step": 3315 + }, + { + "epoch": 0.6586552785778131, + "grad_norm": 0.40502002471903215, + "learning_rate": 2.7567495748716632e-06, + "loss": 0.0127, + "step": 3316 + }, + { + "epoch": 0.6588539080345616, + "grad_norm": 0.523810770366039, + "learning_rate": 2.7538745121616235e-06, + "loss": 0.0111, + "step": 3317 + }, + { + "epoch": 0.65905253749131, + "grad_norm": 1.0090297301535212, + "learning_rate": 2.751000379569897e-06, + "loss": 0.0136, + "step": 3318 + }, + { + "epoch": 0.6592511669480584, + "grad_norm": 0.45840798674019173, + "learning_rate": 2.7481271782866623e-06, + "loss": 0.0045, + "step": 3319 + }, + { + "epoch": 0.6594497964048068, + "grad_norm": 0.7665200456822501, + "learning_rate": 2.7452549095017065e-06, + "loss": 0.0171, + "step": 3320 + }, + { + "epoch": 0.6596484258615553, + "grad_norm": 0.4338808266126319, + "learning_rate": 2.7423835744044346e-06, + "loss": 0.0074, + "step": 3321 + }, + { + "epoch": 0.6598470553183037, + "grad_norm": 0.36880213167051384, + "learning_rate": 2.7395131741838666e-06, + "loss": 0.006, + "step": 3322 + }, + { + "epoch": 0.6600456847750521, + "grad_norm": 0.3898475854075473, + "learning_rate": 2.7366437100286286e-06, + "loss": 0.0088, + "step": 3323 + }, + { + "epoch": 0.6602443142318005, + "grad_norm": 0.5107579237424404, + "learning_rate": 2.7337751831269637e-06, + "loss": 0.0132, + "step": 3324 + }, + { + "epoch": 0.6604429436885491, + "grad_norm": 0.40112286710842526, + "learning_rate": 2.730907594666724e-06, + "loss": 0.0077, + "step": 3325 + }, + { + "epoch": 0.6606415731452975, + "grad_norm": 0.5870623533419401, + "learning_rate": 2.7280409458353775e-06, + "loss": 0.0133, + "step": 3326 + }, + { + "epoch": 0.6608402026020459, + "grad_norm": 0.32740499713756716, + "learning_rate": 2.725175237820002e-06, + "loss": 0.0103, + "step": 3327 + }, + { + "epoch": 0.6610388320587943, + "grad_norm": 0.4982828855799693, + "learning_rate": 2.72231047180728e-06, + "loss": 0.0075, + "step": 3328 + }, + { + "epoch": 0.6612374615155427, + "grad_norm": 0.42218049160104415, + "learning_rate": 2.7194466489835132e-06, + "loss": 0.0128, + "step": 3329 + }, + { + "epoch": 0.6614360909722912, + "grad_norm": 0.6155184301745691, + "learning_rate": 2.7165837705346033e-06, + "loss": 0.0155, + "step": 3330 + }, + { + "epoch": 0.6616347204290396, + "grad_norm": 0.6852751161684462, + "learning_rate": 2.7137218376460683e-06, + "loss": 0.0093, + "step": 3331 + }, + { + "epoch": 0.661833349885788, + "grad_norm": 0.3782884028083007, + "learning_rate": 2.7108608515030297e-06, + "loss": 0.0086, + "step": 3332 + }, + { + "epoch": 0.6620319793425365, + "grad_norm": 0.4146005267325709, + "learning_rate": 2.70800081329022e-06, + "loss": 0.0101, + "step": 3333 + }, + { + "epoch": 0.662230608799285, + "grad_norm": 0.36972782718978237, + "learning_rate": 2.7051417241919808e-06, + "loss": 0.0096, + "step": 3334 + }, + { + "epoch": 0.6624292382560334, + "grad_norm": 0.6516852569514466, + "learning_rate": 2.7022835853922525e-06, + "loss": 0.0114, + "step": 3335 + }, + { + "epoch": 0.6626278677127818, + "grad_norm": 0.29398406434321445, + "learning_rate": 2.699426398074593e-06, + "loss": 0.005, + "step": 3336 + }, + { + "epoch": 0.6628264971695302, + "grad_norm": 0.3956847791060924, + "learning_rate": 2.6965701634221566e-06, + "loss": 0.0093, + "step": 3337 + }, + { + "epoch": 0.6630251266262787, + "grad_norm": 0.688077347911631, + "learning_rate": 2.6937148826177095e-06, + "loss": 0.0154, + "step": 3338 + }, + { + "epoch": 0.6632237560830271, + "grad_norm": 0.5079820743351281, + "learning_rate": 2.690860556843619e-06, + "loss": 0.0105, + "step": 3339 + }, + { + "epoch": 0.6634223855397755, + "grad_norm": 0.4440992131204025, + "learning_rate": 2.688007187281859e-06, + "loss": 0.0102, + "step": 3340 + }, + { + "epoch": 0.663621014996524, + "grad_norm": 0.49214330396861955, + "learning_rate": 2.68515477511401e-06, + "loss": 0.0118, + "step": 3341 + }, + { + "epoch": 0.6638196444532725, + "grad_norm": 1.0900456782512908, + "learning_rate": 2.6823033215212478e-06, + "loss": 0.022, + "step": 3342 + }, + { + "epoch": 0.6640182739100209, + "grad_norm": 0.5381851320515335, + "learning_rate": 2.679452827684362e-06, + "loss": 0.0101, + "step": 3343 + }, + { + "epoch": 0.6642169033667693, + "grad_norm": 0.4016766272309056, + "learning_rate": 2.676603294783734e-06, + "loss": 0.0087, + "step": 3344 + }, + { + "epoch": 0.6644155328235177, + "grad_norm": 0.6652421012666038, + "learning_rate": 2.6737547239993565e-06, + "loss": 0.0186, + "step": 3345 + }, + { + "epoch": 0.6646141622802662, + "grad_norm": 0.6608874683072442, + "learning_rate": 2.670907116510817e-06, + "loss": 0.0132, + "step": 3346 + }, + { + "epoch": 0.6648127917370146, + "grad_norm": 0.47308686328108424, + "learning_rate": 2.668060473497309e-06, + "loss": 0.0075, + "step": 3347 + }, + { + "epoch": 0.665011421193763, + "grad_norm": 0.5810128092238643, + "learning_rate": 2.6652147961376253e-06, + "loss": 0.0138, + "step": 3348 + }, + { + "epoch": 0.6652100506505114, + "grad_norm": 0.535033001330009, + "learning_rate": 2.662370085610153e-06, + "loss": 0.0094, + "step": 3349 + }, + { + "epoch": 0.66540868010726, + "grad_norm": 0.3123641456923419, + "learning_rate": 2.6595263430928874e-06, + "loss": 0.0086, + "step": 3350 + }, + { + "epoch": 0.6656073095640084, + "grad_norm": 0.8214580684647604, + "learning_rate": 2.656683569763422e-06, + "loss": 0.0191, + "step": 3351 + }, + { + "epoch": 0.6658059390207568, + "grad_norm": 0.8066452608717822, + "learning_rate": 2.6538417667989414e-06, + "loss": 0.0159, + "step": 3352 + }, + { + "epoch": 0.6660045684775052, + "grad_norm": 0.41875862549545895, + "learning_rate": 2.651000935376238e-06, + "loss": 0.0066, + "step": 3353 + }, + { + "epoch": 0.6662031979342536, + "grad_norm": 0.6547577856421022, + "learning_rate": 2.6481610766716926e-06, + "loss": 0.0115, + "step": 3354 + }, + { + "epoch": 0.6664018273910021, + "grad_norm": 0.6959330954668181, + "learning_rate": 2.6453221918612915e-06, + "loss": 0.014, + "step": 3355 + }, + { + "epoch": 0.6666004568477505, + "grad_norm": 0.5030885709048069, + "learning_rate": 2.642484282120612e-06, + "loss": 0.0095, + "step": 3356 + }, + { + "epoch": 0.6667990863044989, + "grad_norm": 0.4719568465097182, + "learning_rate": 2.63964734862483e-06, + "loss": 0.014, + "step": 3357 + }, + { + "epoch": 0.6669977157612474, + "grad_norm": 0.6796444534634053, + "learning_rate": 2.6368113925487182e-06, + "loss": 0.0091, + "step": 3358 + }, + { + "epoch": 0.6671963452179959, + "grad_norm": 0.6346635781019123, + "learning_rate": 2.6339764150666414e-06, + "loss": 0.0174, + "step": 3359 + }, + { + "epoch": 0.6673949746747443, + "grad_norm": 0.5601756578875082, + "learning_rate": 2.6311424173525636e-06, + "loss": 0.0081, + "step": 3360 + }, + { + "epoch": 0.6675936041314927, + "grad_norm": 0.4608080766660191, + "learning_rate": 2.628309400580036e-06, + "loss": 0.0105, + "step": 3361 + }, + { + "epoch": 0.6677922335882411, + "grad_norm": 0.8768255616150576, + "learning_rate": 2.6254773659222123e-06, + "loss": 0.0146, + "step": 3362 + }, + { + "epoch": 0.6679908630449896, + "grad_norm": 0.5125903666857362, + "learning_rate": 2.622646314551832e-06, + "loss": 0.017, + "step": 3363 + }, + { + "epoch": 0.668189492501738, + "grad_norm": 0.4250565635417071, + "learning_rate": 2.6198162476412324e-06, + "loss": 0.0168, + "step": 3364 + }, + { + "epoch": 0.6683881219584864, + "grad_norm": 0.5993238921833851, + "learning_rate": 2.6169871663623424e-06, + "loss": 0.0079, + "step": 3365 + }, + { + "epoch": 0.6685867514152348, + "grad_norm": 0.4691003256708194, + "learning_rate": 2.614159071886679e-06, + "loss": 0.0194, + "step": 3366 + }, + { + "epoch": 0.6687853808719834, + "grad_norm": 0.34795435504024236, + "learning_rate": 2.6113319653853565e-06, + "loss": 0.01, + "step": 3367 + }, + { + "epoch": 0.6689840103287318, + "grad_norm": 0.7334349474214593, + "learning_rate": 2.6085058480290724e-06, + "loss": 0.0168, + "step": 3368 + }, + { + "epoch": 0.6691826397854802, + "grad_norm": 0.3782004612022668, + "learning_rate": 2.6056807209881247e-06, + "loss": 0.0097, + "step": 3369 + }, + { + "epoch": 0.6693812692422286, + "grad_norm": 0.6207564025203681, + "learning_rate": 2.6028565854323905e-06, + "loss": 0.0145, + "step": 3370 + }, + { + "epoch": 0.669579898698977, + "grad_norm": 0.3664793557426668, + "learning_rate": 2.6000334425313455e-06, + "loss": 0.0094, + "step": 3371 + }, + { + "epoch": 0.6697785281557255, + "grad_norm": 0.7947514445887053, + "learning_rate": 2.5972112934540483e-06, + "loss": 0.0137, + "step": 3372 + }, + { + "epoch": 0.6699771576124739, + "grad_norm": 0.5244578943642776, + "learning_rate": 2.5943901393691463e-06, + "loss": 0.0146, + "step": 3373 + }, + { + "epoch": 0.6701757870692223, + "grad_norm": 0.31429941997048494, + "learning_rate": 2.5915699814448813e-06, + "loss": 0.0084, + "step": 3374 + }, + { + "epoch": 0.6703744165259709, + "grad_norm": 0.36243016062186306, + "learning_rate": 2.5887508208490726e-06, + "loss": 0.0089, + "step": 3375 + }, + { + "epoch": 0.6705730459827193, + "grad_norm": 0.5468920277671807, + "learning_rate": 2.585932658749134e-06, + "loss": 0.0128, + "step": 3376 + }, + { + "epoch": 0.6707716754394677, + "grad_norm": 0.4530603705172164, + "learning_rate": 2.583115496312066e-06, + "loss": 0.0095, + "step": 3377 + }, + { + "epoch": 0.6709703048962161, + "grad_norm": 0.6766180998941852, + "learning_rate": 2.5802993347044494e-06, + "loss": 0.0111, + "step": 3378 + }, + { + "epoch": 0.6711689343529645, + "grad_norm": 0.34837311844564506, + "learning_rate": 2.5774841750924564e-06, + "loss": 0.0118, + "step": 3379 + }, + { + "epoch": 0.671367563809713, + "grad_norm": 0.4237238100703997, + "learning_rate": 2.5746700186418388e-06, + "loss": 0.0094, + "step": 3380 + }, + { + "epoch": 0.6715661932664614, + "grad_norm": 0.2705398980966825, + "learning_rate": 2.5718568665179394e-06, + "loss": 0.0094, + "step": 3381 + }, + { + "epoch": 0.6717648227232098, + "grad_norm": 0.7486608005177356, + "learning_rate": 2.5690447198856784e-06, + "loss": 0.0155, + "step": 3382 + }, + { + "epoch": 0.6719634521799583, + "grad_norm": 0.38166376147207953, + "learning_rate": 2.566233579909564e-06, + "loss": 0.0082, + "step": 3383 + }, + { + "epoch": 0.6721620816367068, + "grad_norm": 0.7618562250325333, + "learning_rate": 2.5634234477536894e-06, + "loss": 0.0117, + "step": 3384 + }, + { + "epoch": 0.6723607110934552, + "grad_norm": 0.4237062506104705, + "learning_rate": 2.5606143245817227e-06, + "loss": 0.0159, + "step": 3385 + }, + { + "epoch": 0.6725593405502036, + "grad_norm": 0.4786823768082389, + "learning_rate": 2.5578062115569234e-06, + "loss": 0.0133, + "step": 3386 + }, + { + "epoch": 0.672757970006952, + "grad_norm": 0.9433799903017728, + "learning_rate": 2.5549991098421245e-06, + "loss": 0.0195, + "step": 3387 + }, + { + "epoch": 0.6729565994637005, + "grad_norm": 0.3981282726321889, + "learning_rate": 2.5521930205997476e-06, + "loss": 0.0109, + "step": 3388 + }, + { + "epoch": 0.6731552289204489, + "grad_norm": 0.5626283367420652, + "learning_rate": 2.5493879449917875e-06, + "loss": 0.014, + "step": 3389 + }, + { + "epoch": 0.6733538583771973, + "grad_norm": 1.4117855646706072, + "learning_rate": 2.546583884179825e-06, + "loss": 0.0172, + "step": 3390 + }, + { + "epoch": 0.6735524878339457, + "grad_norm": 0.6383343978861289, + "learning_rate": 2.543780839325022e-06, + "loss": 0.0106, + "step": 3391 + }, + { + "epoch": 0.6737511172906943, + "grad_norm": 0.7138217958976034, + "learning_rate": 2.540978811588111e-06, + "loss": 0.0106, + "step": 3392 + }, + { + "epoch": 0.6739497467474427, + "grad_norm": 0.34623408436167746, + "learning_rate": 2.5381778021294133e-06, + "loss": 0.0091, + "step": 3393 + }, + { + "epoch": 0.6741483762041911, + "grad_norm": 0.8557016580449002, + "learning_rate": 2.535377812108821e-06, + "loss": 0.0151, + "step": 3394 + }, + { + "epoch": 0.6743470056609395, + "grad_norm": 0.24818536799482643, + "learning_rate": 2.5325788426858106e-06, + "loss": 0.0073, + "step": 3395 + }, + { + "epoch": 0.674545635117688, + "grad_norm": 0.31460883387664695, + "learning_rate": 2.5297808950194303e-06, + "loss": 0.0092, + "step": 3396 + }, + { + "epoch": 0.6747442645744364, + "grad_norm": 0.9971267660879034, + "learning_rate": 2.526983970268305e-06, + "loss": 0.0162, + "step": 3397 + }, + { + "epoch": 0.6749428940311848, + "grad_norm": 0.9119079346987756, + "learning_rate": 2.524188069590644e-06, + "loss": 0.0153, + "step": 3398 + }, + { + "epoch": 0.6751415234879332, + "grad_norm": 0.7270396119139402, + "learning_rate": 2.521393194144222e-06, + "loss": 0.0131, + "step": 3399 + }, + { + "epoch": 0.6753401529446817, + "grad_norm": 0.8376328773880279, + "learning_rate": 2.518599345086396e-06, + "loss": 0.0145, + "step": 3400 + }, + { + "epoch": 0.6755387824014302, + "grad_norm": 0.5909829299184999, + "learning_rate": 2.515806523574098e-06, + "loss": 0.0141, + "step": 3401 + }, + { + "epoch": 0.6757374118581786, + "grad_norm": 0.7347277470642827, + "learning_rate": 2.513014730763829e-06, + "loss": 0.0136, + "step": 3402 + }, + { + "epoch": 0.675936041314927, + "grad_norm": 0.5930317321813116, + "learning_rate": 2.5102239678116714e-06, + "loss": 0.0177, + "step": 3403 + }, + { + "epoch": 0.6761346707716754, + "grad_norm": 0.3929167896627322, + "learning_rate": 2.507434235873274e-06, + "loss": 0.0087, + "step": 3404 + }, + { + "epoch": 0.6763333002284239, + "grad_norm": 0.33640967171465747, + "learning_rate": 2.5046455361038646e-06, + "loss": 0.0066, + "step": 3405 + }, + { + "epoch": 0.6765319296851723, + "grad_norm": 0.32477500635650375, + "learning_rate": 2.5018578696582387e-06, + "loss": 0.0108, + "step": 3406 + }, + { + "epoch": 0.6767305591419207, + "grad_norm": 0.6222872814600147, + "learning_rate": 2.499071237690766e-06, + "loss": 0.0153, + "step": 3407 + }, + { + "epoch": 0.6769291885986691, + "grad_norm": 0.3552835031167747, + "learning_rate": 2.496285641355392e-06, + "loss": 0.0054, + "step": 3408 + }, + { + "epoch": 0.6771278180554177, + "grad_norm": 0.4119535077595573, + "learning_rate": 2.493501081805625e-06, + "loss": 0.0088, + "step": 3409 + }, + { + "epoch": 0.6773264475121661, + "grad_norm": 0.6664231184464069, + "learning_rate": 2.490717560194551e-06, + "loss": 0.0107, + "step": 3410 + }, + { + "epoch": 0.6775250769689145, + "grad_norm": 0.6170792412387821, + "learning_rate": 2.4879350776748214e-06, + "loss": 0.013, + "step": 3411 + }, + { + "epoch": 0.6777237064256629, + "grad_norm": 0.44050710440567764, + "learning_rate": 2.485153635398662e-06, + "loss": 0.009, + "step": 3412 + }, + { + "epoch": 0.6779223358824114, + "grad_norm": 1.1119697405780344, + "learning_rate": 2.482373234517862e-06, + "loss": 0.0136, + "step": 3413 + }, + { + "epoch": 0.6781209653391598, + "grad_norm": 1.0238357497490542, + "learning_rate": 2.479593876183784e-06, + "loss": 0.0126, + "step": 3414 + }, + { + "epoch": 0.6783195947959082, + "grad_norm": 0.46374993699075306, + "learning_rate": 2.47681556154736e-06, + "loss": 0.014, + "step": 3415 + }, + { + "epoch": 0.6785182242526566, + "grad_norm": 0.7825050148026104, + "learning_rate": 2.4740382917590834e-06, + "loss": 0.0149, + "step": 3416 + }, + { + "epoch": 0.6787168537094052, + "grad_norm": 0.45223615016890584, + "learning_rate": 2.4712620679690223e-06, + "loss": 0.0082, + "step": 3417 + }, + { + "epoch": 0.6789154831661536, + "grad_norm": 0.5359050703153959, + "learning_rate": 2.468486891326805e-06, + "loss": 0.0103, + "step": 3418 + }, + { + "epoch": 0.679114112622902, + "grad_norm": 0.6388123076656673, + "learning_rate": 2.4657127629816323e-06, + "loss": 0.0128, + "step": 3419 + }, + { + "epoch": 0.6793127420796504, + "grad_norm": 0.624704597446212, + "learning_rate": 2.4629396840822665e-06, + "loss": 0.0112, + "step": 3420 + }, + { + "epoch": 0.6795113715363988, + "grad_norm": 0.8921444577470375, + "learning_rate": 2.4601676557770353e-06, + "loss": 0.0108, + "step": 3421 + }, + { + "epoch": 0.6797100009931473, + "grad_norm": 0.4586739844639518, + "learning_rate": 2.4573966792138363e-06, + "loss": 0.0054, + "step": 3422 + }, + { + "epoch": 0.6799086304498957, + "grad_norm": 0.3515731328172263, + "learning_rate": 2.4546267555401245e-06, + "loss": 0.0108, + "step": 3423 + }, + { + "epoch": 0.6801072599066441, + "grad_norm": 0.5037818639650333, + "learning_rate": 2.451857885902926e-06, + "loss": 0.0144, + "step": 3424 + }, + { + "epoch": 0.6803058893633926, + "grad_norm": 0.7695438647313355, + "learning_rate": 2.449090071448823e-06, + "loss": 0.0078, + "step": 3425 + }, + { + "epoch": 0.6805045188201411, + "grad_norm": 1.0299643704038546, + "learning_rate": 2.4463233133239668e-06, + "loss": 0.0113, + "step": 3426 + }, + { + "epoch": 0.6807031482768895, + "grad_norm": 0.43828329932828497, + "learning_rate": 2.443557612674071e-06, + "loss": 0.0075, + "step": 3427 + }, + { + "epoch": 0.6809017777336379, + "grad_norm": 0.35986529838291326, + "learning_rate": 2.4407929706444067e-06, + "loss": 0.0091, + "step": 3428 + }, + { + "epoch": 0.6811004071903863, + "grad_norm": 0.5825951519865635, + "learning_rate": 2.4380293883798118e-06, + "loss": 0.0132, + "step": 3429 + }, + { + "epoch": 0.6812990366471348, + "grad_norm": 0.7054553585094656, + "learning_rate": 2.435266867024679e-06, + "loss": 0.0132, + "step": 3430 + }, + { + "epoch": 0.6814976661038832, + "grad_norm": 0.5757263663466883, + "learning_rate": 2.432505407722971e-06, + "loss": 0.0135, + "step": 3431 + }, + { + "epoch": 0.6816962955606316, + "grad_norm": 0.5381046623539257, + "learning_rate": 2.4297450116182003e-06, + "loss": 0.0116, + "step": 3432 + }, + { + "epoch": 0.68189492501738, + "grad_norm": 0.20663755135436665, + "learning_rate": 2.4269856798534463e-06, + "loss": 0.0044, + "step": 3433 + }, + { + "epoch": 0.6820935544741286, + "grad_norm": 0.414472597291746, + "learning_rate": 2.4242274135713477e-06, + "loss": 0.0122, + "step": 3434 + }, + { + "epoch": 0.682292183930877, + "grad_norm": 0.4289247107743778, + "learning_rate": 2.4214702139140954e-06, + "loss": 0.0103, + "step": 3435 + }, + { + "epoch": 0.6824908133876254, + "grad_norm": 0.3110902803437075, + "learning_rate": 2.418714082023448e-06, + "loss": 0.0078, + "step": 3436 + }, + { + "epoch": 0.6826894428443738, + "grad_norm": 0.44667625859781807, + "learning_rate": 2.4159590190407113e-06, + "loss": 0.0091, + "step": 3437 + }, + { + "epoch": 0.6828880723011223, + "grad_norm": 0.6388858453233094, + "learning_rate": 2.4132050261067595e-06, + "loss": 0.0113, + "step": 3438 + }, + { + "epoch": 0.6830867017578707, + "grad_norm": 0.6488255956688261, + "learning_rate": 2.410452104362014e-06, + "loss": 0.0139, + "step": 3439 + }, + { + "epoch": 0.6832853312146191, + "grad_norm": 0.399390098306922, + "learning_rate": 2.407700254946459e-06, + "loss": 0.0106, + "step": 3440 + }, + { + "epoch": 0.6834839606713675, + "grad_norm": 0.42623686761820223, + "learning_rate": 2.404949478999634e-06, + "loss": 0.0114, + "step": 3441 + }, + { + "epoch": 0.683682590128116, + "grad_norm": 0.6585189499135641, + "learning_rate": 2.4021997776606294e-06, + "loss": 0.0164, + "step": 3442 + }, + { + "epoch": 0.6838812195848645, + "grad_norm": 0.5067746225527527, + "learning_rate": 2.3994511520680975e-06, + "loss": 0.0229, + "step": 3443 + }, + { + "epoch": 0.6840798490416129, + "grad_norm": 0.6378096505999058, + "learning_rate": 2.3967036033602388e-06, + "loss": 0.0119, + "step": 3444 + }, + { + "epoch": 0.6842784784983613, + "grad_norm": 0.5093697789433972, + "learning_rate": 2.393957132674809e-06, + "loss": 0.0111, + "step": 3445 + }, + { + "epoch": 0.6844771079551097, + "grad_norm": 0.8605508533457091, + "learning_rate": 2.3912117411491233e-06, + "loss": 0.0123, + "step": 3446 + }, + { + "epoch": 0.6846757374118582, + "grad_norm": 0.3992536191232021, + "learning_rate": 2.3884674299200404e-06, + "loss": 0.0073, + "step": 3447 + }, + { + "epoch": 0.6848743668686066, + "grad_norm": 0.3301132162498039, + "learning_rate": 2.3857242001239815e-06, + "loss": 0.0111, + "step": 3448 + }, + { + "epoch": 0.685072996325355, + "grad_norm": 0.47428147751600336, + "learning_rate": 2.3829820528969118e-06, + "loss": 0.0149, + "step": 3449 + }, + { + "epoch": 0.6852716257821034, + "grad_norm": 0.5910659829970917, + "learning_rate": 2.3802409893743534e-06, + "loss": 0.0128, + "step": 3450 + }, + { + "epoch": 0.685470255238852, + "grad_norm": 0.3809691879291683, + "learning_rate": 2.3775010106913794e-06, + "loss": 0.0128, + "step": 3451 + }, + { + "epoch": 0.6856688846956004, + "grad_norm": 0.5463503817648073, + "learning_rate": 2.3747621179826084e-06, + "loss": 0.0109, + "step": 3452 + }, + { + "epoch": 0.6858675141523488, + "grad_norm": 0.5779042575602321, + "learning_rate": 2.3720243123822177e-06, + "loss": 0.0154, + "step": 3453 + }, + { + "epoch": 0.6860661436090972, + "grad_norm": 0.5292906640923668, + "learning_rate": 2.3692875950239256e-06, + "loss": 0.012, + "step": 3454 + }, + { + "epoch": 0.6862647730658457, + "grad_norm": 0.7193268858083541, + "learning_rate": 2.366551967041008e-06, + "loss": 0.015, + "step": 3455 + }, + { + "epoch": 0.6864634025225941, + "grad_norm": 0.7981922366337875, + "learning_rate": 2.3638174295662815e-06, + "loss": 0.0149, + "step": 3456 + }, + { + "epoch": 0.6866620319793425, + "grad_norm": 1.8969690617361947, + "learning_rate": 2.3610839837321174e-06, + "loss": 0.009, + "step": 3457 + }, + { + "epoch": 0.6868606614360909, + "grad_norm": 1.1223983859989493, + "learning_rate": 2.3583516306704353e-06, + "loss": 0.0138, + "step": 3458 + }, + { + "epoch": 0.6870592908928395, + "grad_norm": 0.7263132364715444, + "learning_rate": 2.3556203715126953e-06, + "loss": 0.0209, + "step": 3459 + }, + { + "epoch": 0.6872579203495879, + "grad_norm": 0.4466445789942108, + "learning_rate": 2.352890207389913e-06, + "loss": 0.0097, + "step": 3460 + }, + { + "epoch": 0.6874565498063363, + "grad_norm": 0.9150117897195746, + "learning_rate": 2.3501611394326434e-06, + "loss": 0.0133, + "step": 3461 + }, + { + "epoch": 0.6876551792630847, + "grad_norm": 0.5168870032711956, + "learning_rate": 2.3474331687709937e-06, + "loss": 0.0136, + "step": 3462 + }, + { + "epoch": 0.6878538087198331, + "grad_norm": 0.31968699277456936, + "learning_rate": 2.3447062965346113e-06, + "loss": 0.0073, + "step": 3463 + }, + { + "epoch": 0.6880524381765816, + "grad_norm": 0.6109378846286353, + "learning_rate": 2.3419805238526923e-06, + "loss": 0.0179, + "step": 3464 + }, + { + "epoch": 0.68825106763333, + "grad_norm": 0.35760154395629234, + "learning_rate": 2.339255851853978e-06, + "loss": 0.0115, + "step": 3465 + }, + { + "epoch": 0.6884496970900784, + "grad_norm": 0.5708486697315434, + "learning_rate": 2.3365322816667497e-06, + "loss": 0.0132, + "step": 3466 + }, + { + "epoch": 0.688648326546827, + "grad_norm": 0.6178889637099267, + "learning_rate": 2.3338098144188394e-06, + "loss": 0.0093, + "step": 3467 + }, + { + "epoch": 0.6888469560035754, + "grad_norm": 0.16097938067312248, + "learning_rate": 2.3310884512376115e-06, + "loss": 0.0034, + "step": 3468 + }, + { + "epoch": 0.6890455854603238, + "grad_norm": 0.3961429560117535, + "learning_rate": 2.328368193249983e-06, + "loss": 0.0067, + "step": 3469 + }, + { + "epoch": 0.6892442149170722, + "grad_norm": 0.38510183274734977, + "learning_rate": 2.3256490415824124e-06, + "loss": 0.0118, + "step": 3470 + }, + { + "epoch": 0.6894428443738206, + "grad_norm": 0.9257497192314168, + "learning_rate": 2.322930997360894e-06, + "loss": 0.014, + "step": 3471 + }, + { + "epoch": 0.6896414738305691, + "grad_norm": 0.6459018691049223, + "learning_rate": 2.3202140617109697e-06, + "loss": 0.0098, + "step": 3472 + }, + { + "epoch": 0.6898401032873175, + "grad_norm": 1.043612774948338, + "learning_rate": 2.3174982357577174e-06, + "loss": 0.0099, + "step": 3473 + }, + { + "epoch": 0.6900387327440659, + "grad_norm": 0.2929171606322306, + "learning_rate": 2.3147835206257615e-06, + "loss": 0.0066, + "step": 3474 + }, + { + "epoch": 0.6902373622008143, + "grad_norm": 0.4205660357282004, + "learning_rate": 2.312069917439258e-06, + "loss": 0.0106, + "step": 3475 + }, + { + "epoch": 0.6904359916575629, + "grad_norm": 0.28539942824667136, + "learning_rate": 2.3093574273219103e-06, + "loss": 0.0107, + "step": 3476 + }, + { + "epoch": 0.6906346211143113, + "grad_norm": 0.38833687964976527, + "learning_rate": 2.30664605139696e-06, + "loss": 0.0107, + "step": 3477 + }, + { + "epoch": 0.6908332505710597, + "grad_norm": 0.40899264773370975, + "learning_rate": 2.30393579078718e-06, + "loss": 0.0044, + "step": 3478 + }, + { + "epoch": 0.6910318800278081, + "grad_norm": 0.5323829934882054, + "learning_rate": 2.3012266466148913e-06, + "loss": 0.0164, + "step": 3479 + }, + { + "epoch": 0.6912305094845566, + "grad_norm": 0.5241906848713265, + "learning_rate": 2.2985186200019434e-06, + "loss": 0.0091, + "step": 3480 + }, + { + "epoch": 0.691429138941305, + "grad_norm": 0.47844187118354103, + "learning_rate": 2.2958117120697322e-06, + "loss": 0.0088, + "step": 3481 + }, + { + "epoch": 0.6916277683980534, + "grad_norm": 0.5162909926785495, + "learning_rate": 2.29310592393918e-06, + "loss": 0.0079, + "step": 3482 + }, + { + "epoch": 0.6918263978548018, + "grad_norm": 0.4668928613690837, + "learning_rate": 2.290401256730755e-06, + "loss": 0.0087, + "step": 3483 + }, + { + "epoch": 0.6920250273115504, + "grad_norm": 0.47892066762124963, + "learning_rate": 2.2876977115644577e-06, + "loss": 0.0098, + "step": 3484 + }, + { + "epoch": 0.6922236567682988, + "grad_norm": 0.4015035321499451, + "learning_rate": 2.28499528955982e-06, + "loss": 0.0082, + "step": 3485 + }, + { + "epoch": 0.6924222862250472, + "grad_norm": 0.4774526229757127, + "learning_rate": 2.2822939918359157e-06, + "loss": 0.0131, + "step": 3486 + }, + { + "epoch": 0.6926209156817956, + "grad_norm": 0.3237800455442647, + "learning_rate": 2.279593819511346e-06, + "loss": 0.0094, + "step": 3487 + }, + { + "epoch": 0.692819545138544, + "grad_norm": 0.8338717940603173, + "learning_rate": 2.276894773704253e-06, + "loss": 0.0164, + "step": 3488 + }, + { + "epoch": 0.6930181745952925, + "grad_norm": 0.3897033888651404, + "learning_rate": 2.274196855532306e-06, + "loss": 0.0061, + "step": 3489 + }, + { + "epoch": 0.6932168040520409, + "grad_norm": 0.6207140161695185, + "learning_rate": 2.2715000661127107e-06, + "loss": 0.0168, + "step": 3490 + }, + { + "epoch": 0.6934154335087893, + "grad_norm": 0.36947534908833934, + "learning_rate": 2.26880440656221e-06, + "loss": 0.0092, + "step": 3491 + }, + { + "epoch": 0.6936140629655377, + "grad_norm": 0.5230707821198057, + "learning_rate": 2.2661098779970653e-06, + "loss": 0.0116, + "step": 3492 + }, + { + "epoch": 0.6938126924222863, + "grad_norm": 0.34175221266600403, + "learning_rate": 2.263416481533082e-06, + "loss": 0.0057, + "step": 3493 + }, + { + "epoch": 0.6940113218790347, + "grad_norm": 0.5044214335668877, + "learning_rate": 2.260724218285596e-06, + "loss": 0.0121, + "step": 3494 + }, + { + "epoch": 0.6942099513357831, + "grad_norm": 0.7343017142175041, + "learning_rate": 2.258033089369466e-06, + "loss": 0.0109, + "step": 3495 + }, + { + "epoch": 0.6944085807925315, + "grad_norm": 0.7027496441081319, + "learning_rate": 2.255343095899089e-06, + "loss": 0.0133, + "step": 3496 + }, + { + "epoch": 0.69460721024928, + "grad_norm": 0.515249317298665, + "learning_rate": 2.252654238988386e-06, + "loss": 0.0117, + "step": 3497 + }, + { + "epoch": 0.6948058397060284, + "grad_norm": 0.3406590742309431, + "learning_rate": 2.2499665197508137e-06, + "loss": 0.0073, + "step": 3498 + }, + { + "epoch": 0.6950044691627768, + "grad_norm": 1.054212903506879, + "learning_rate": 2.247279939299349e-06, + "loss": 0.0198, + "step": 3499 + }, + { + "epoch": 0.6952030986195252, + "grad_norm": 1.254784991849465, + "learning_rate": 2.244594498746505e-06, + "loss": 0.0131, + "step": 3500 + }, + { + "epoch": 0.6954017280762738, + "grad_norm": 0.5023270239108601, + "learning_rate": 2.241910199204321e-06, + "loss": 0.0068, + "step": 3501 + }, + { + "epoch": 0.6956003575330222, + "grad_norm": 0.33395472573545676, + "learning_rate": 2.239227041784361e-06, + "loss": 0.0054, + "step": 3502 + }, + { + "epoch": 0.6957989869897706, + "grad_norm": 1.1377173779793461, + "learning_rate": 2.236545027597719e-06, + "loss": 0.0275, + "step": 3503 + }, + { + "epoch": 0.695997616446519, + "grad_norm": 0.46319236840052935, + "learning_rate": 2.2338641577550124e-06, + "loss": 0.0162, + "step": 3504 + }, + { + "epoch": 0.6961962459032675, + "grad_norm": 0.3934534179163216, + "learning_rate": 2.2311844333663897e-06, + "loss": 0.0071, + "step": 3505 + }, + { + "epoch": 0.6963948753600159, + "grad_norm": 0.7869903742607097, + "learning_rate": 2.2285058555415184e-06, + "loss": 0.0174, + "step": 3506 + }, + { + "epoch": 0.6965935048167643, + "grad_norm": 0.5402812555281423, + "learning_rate": 2.2258284253895975e-06, + "loss": 0.0067, + "step": 3507 + }, + { + "epoch": 0.6967921342735127, + "grad_norm": 0.5141673771075058, + "learning_rate": 2.2231521440193486e-06, + "loss": 0.0141, + "step": 3508 + }, + { + "epoch": 0.6969907637302611, + "grad_norm": 0.7019261554336447, + "learning_rate": 2.2204770125390147e-06, + "loss": 0.0112, + "step": 3509 + }, + { + "epoch": 0.6971893931870097, + "grad_norm": 0.6811176170277796, + "learning_rate": 2.2178030320563677e-06, + "loss": 0.0106, + "step": 3510 + }, + { + "epoch": 0.6973880226437581, + "grad_norm": 0.39398429666162565, + "learning_rate": 2.2151302036786976e-06, + "loss": 0.0087, + "step": 3511 + }, + { + "epoch": 0.6975866521005065, + "grad_norm": 0.7331468400657002, + "learning_rate": 2.212458528512823e-06, + "loss": 0.0166, + "step": 3512 + }, + { + "epoch": 0.6977852815572549, + "grad_norm": 0.5535187539164983, + "learning_rate": 2.2097880076650786e-06, + "loss": 0.0096, + "step": 3513 + }, + { + "epoch": 0.6979839110140034, + "grad_norm": 0.3967575431172973, + "learning_rate": 2.2071186422413253e-06, + "loss": 0.0116, + "step": 3514 + }, + { + "epoch": 0.6981825404707518, + "grad_norm": 0.7538890683797367, + "learning_rate": 2.2044504333469497e-06, + "loss": 0.0134, + "step": 3515 + }, + { + "epoch": 0.6983811699275002, + "grad_norm": 0.3956918733926533, + "learning_rate": 2.2017833820868466e-06, + "loss": 0.0119, + "step": 3516 + }, + { + "epoch": 0.6985797993842486, + "grad_norm": 0.6929067669197901, + "learning_rate": 2.1991174895654445e-06, + "loss": 0.0131, + "step": 3517 + }, + { + "epoch": 0.6987784288409972, + "grad_norm": 3.3273773145583596, + "learning_rate": 2.1964527568866828e-06, + "loss": 0.013, + "step": 3518 + }, + { + "epoch": 0.6989770582977456, + "grad_norm": 0.9093484745445498, + "learning_rate": 2.193789185154027e-06, + "loss": 0.0223, + "step": 3519 + }, + { + "epoch": 0.699175687754494, + "grad_norm": 0.6123078414872011, + "learning_rate": 2.191126775470462e-06, + "loss": 0.0098, + "step": 3520 + }, + { + "epoch": 0.6993743172112424, + "grad_norm": 0.5104306223926839, + "learning_rate": 2.188465528938484e-06, + "loss": 0.0169, + "step": 3521 + }, + { + "epoch": 0.6995729466679909, + "grad_norm": 0.6464141115145141, + "learning_rate": 2.185805446660117e-06, + "loss": 0.0158, + "step": 3522 + }, + { + "epoch": 0.6997715761247393, + "grad_norm": 0.5539549495565598, + "learning_rate": 2.183146529736894e-06, + "loss": 0.0142, + "step": 3523 + }, + { + "epoch": 0.6999702055814877, + "grad_norm": 1.1776865246831185, + "learning_rate": 2.180488779269874e-06, + "loss": 0.0158, + "step": 3524 + }, + { + "epoch": 0.7001688350382361, + "grad_norm": 0.5566688947877497, + "learning_rate": 2.177832196359626e-06, + "loss": 0.0093, + "step": 3525 + }, + { + "epoch": 0.7003674644949847, + "grad_norm": 0.2616688972825115, + "learning_rate": 2.1751767821062385e-06, + "loss": 0.0077, + "step": 3526 + }, + { + "epoch": 0.7005660939517331, + "grad_norm": 0.8922237616752913, + "learning_rate": 2.172522537609319e-06, + "loss": 0.0136, + "step": 3527 + }, + { + "epoch": 0.7007647234084815, + "grad_norm": 0.42603522315846554, + "learning_rate": 2.169869463967983e-06, + "loss": 0.0178, + "step": 3528 + }, + { + "epoch": 0.7009633528652299, + "grad_norm": 0.4466000675946425, + "learning_rate": 2.16721756228087e-06, + "loss": 0.0137, + "step": 3529 + }, + { + "epoch": 0.7011619823219783, + "grad_norm": 0.6276370519416008, + "learning_rate": 2.164566833646125e-06, + "loss": 0.0192, + "step": 3530 + }, + { + "epoch": 0.7013606117787268, + "grad_norm": 0.5258723027999391, + "learning_rate": 2.1619172791614175e-06, + "loss": 0.0151, + "step": 3531 + }, + { + "epoch": 0.7015592412354752, + "grad_norm": 0.6959177505640589, + "learning_rate": 2.1592688999239204e-06, + "loss": 0.0166, + "step": 3532 + }, + { + "epoch": 0.7017578706922236, + "grad_norm": 0.6402652235456184, + "learning_rate": 2.156621697030327e-06, + "loss": 0.0114, + "step": 3533 + }, + { + "epoch": 0.701956500148972, + "grad_norm": 0.46310938800434814, + "learning_rate": 2.1539756715768434e-06, + "loss": 0.0097, + "step": 3534 + }, + { + "epoch": 0.7021551296057206, + "grad_norm": 0.44032965187225664, + "learning_rate": 2.151330824659182e-06, + "loss": 0.0097, + "step": 3535 + }, + { + "epoch": 0.702353759062469, + "grad_norm": 0.8920415210582037, + "learning_rate": 2.1486871573725752e-06, + "loss": 0.0143, + "step": 3536 + }, + { + "epoch": 0.7025523885192174, + "grad_norm": 0.37431884202742494, + "learning_rate": 2.1460446708117594e-06, + "loss": 0.0098, + "step": 3537 + }, + { + "epoch": 0.7027510179759658, + "grad_norm": 0.4891117814991983, + "learning_rate": 2.1434033660709896e-06, + "loss": 0.0131, + "step": 3538 + }, + { + "epoch": 0.7029496474327143, + "grad_norm": 1.047496707565232, + "learning_rate": 2.1407632442440247e-06, + "loss": 0.0132, + "step": 3539 + }, + { + "epoch": 0.7031482768894627, + "grad_norm": 0.4074516687437718, + "learning_rate": 2.138124306424136e-06, + "loss": 0.0096, + "step": 3540 + }, + { + "epoch": 0.7033469063462111, + "grad_norm": 0.6921084878895216, + "learning_rate": 2.135486553704107e-06, + "loss": 0.012, + "step": 3541 + }, + { + "epoch": 0.7035455358029595, + "grad_norm": 0.352632241603671, + "learning_rate": 2.132849987176226e-06, + "loss": 0.0085, + "step": 3542 + }, + { + "epoch": 0.7037441652597081, + "grad_norm": 0.5517973185214629, + "learning_rate": 2.1302146079322945e-06, + "loss": 0.0078, + "step": 3543 + }, + { + "epoch": 0.7039427947164565, + "grad_norm": 0.3723571720752435, + "learning_rate": 2.1275804170636213e-06, + "loss": 0.0087, + "step": 3544 + }, + { + "epoch": 0.7041414241732049, + "grad_norm": 0.57601818247446, + "learning_rate": 2.124947415661019e-06, + "loss": 0.0152, + "step": 3545 + }, + { + "epoch": 0.7043400536299533, + "grad_norm": 0.21023435616038813, + "learning_rate": 2.1223156048148146e-06, + "loss": 0.0068, + "step": 3546 + }, + { + "epoch": 0.7045386830867018, + "grad_norm": 0.33170514161187187, + "learning_rate": 2.119684985614835e-06, + "loss": 0.0091, + "step": 3547 + }, + { + "epoch": 0.7047373125434502, + "grad_norm": 0.6974394026163584, + "learning_rate": 2.1170555591504198e-06, + "loss": 0.0195, + "step": 3548 + }, + { + "epoch": 0.7049359420001986, + "grad_norm": 0.6571182045495615, + "learning_rate": 2.1144273265104088e-06, + "loss": 0.0166, + "step": 3549 + }, + { + "epoch": 0.705134571456947, + "grad_norm": 0.5118877638635031, + "learning_rate": 2.1118002887831523e-06, + "loss": 0.017, + "step": 3550 + }, + { + "epoch": 0.7053332009136954, + "grad_norm": 0.607122918396555, + "learning_rate": 2.1091744470565062e-06, + "loss": 0.0123, + "step": 3551 + }, + { + "epoch": 0.705531830370444, + "grad_norm": 0.46800730184021394, + "learning_rate": 2.1065498024178237e-06, + "loss": 0.0117, + "step": 3552 + }, + { + "epoch": 0.7057304598271924, + "grad_norm": 0.48441945139013526, + "learning_rate": 2.1039263559539737e-06, + "loss": 0.0137, + "step": 3553 + }, + { + "epoch": 0.7059290892839408, + "grad_norm": 0.6602368453331714, + "learning_rate": 2.1013041087513163e-06, + "loss": 0.0202, + "step": 3554 + }, + { + "epoch": 0.7061277187406892, + "grad_norm": 0.4140800156964033, + "learning_rate": 2.098683061895727e-06, + "loss": 0.013, + "step": 3555 + }, + { + "epoch": 0.7063263481974377, + "grad_norm": 0.39987172496263784, + "learning_rate": 2.0960632164725746e-06, + "loss": 0.009, + "step": 3556 + }, + { + "epoch": 0.7065249776541861, + "grad_norm": 0.42338870067574086, + "learning_rate": 2.0934445735667366e-06, + "loss": 0.012, + "step": 3557 + }, + { + "epoch": 0.7067236071109345, + "grad_norm": 0.5296392321338635, + "learning_rate": 2.0908271342625907e-06, + "loss": 0.0116, + "step": 3558 + }, + { + "epoch": 0.7069222365676829, + "grad_norm": 0.6562700338635507, + "learning_rate": 2.0882108996440144e-06, + "loss": 0.0162, + "step": 3559 + }, + { + "epoch": 0.7071208660244315, + "grad_norm": 0.45596472145739425, + "learning_rate": 2.0855958707943903e-06, + "loss": 0.0132, + "step": 3560 + }, + { + "epoch": 0.7073194954811799, + "grad_norm": 0.5851977975386901, + "learning_rate": 2.082982048796595e-06, + "loss": 0.0115, + "step": 3561 + }, + { + "epoch": 0.7075181249379283, + "grad_norm": 0.6246161154127471, + "learning_rate": 2.0803694347330146e-06, + "loss": 0.025, + "step": 3562 + }, + { + "epoch": 0.7077167543946767, + "grad_norm": 0.70364745633366, + "learning_rate": 2.077758029685527e-06, + "loss": 0.0059, + "step": 3563 + }, + { + "epoch": 0.7079153838514252, + "grad_norm": 1.2354575585632148, + "learning_rate": 2.0751478347355112e-06, + "loss": 0.0226, + "step": 3564 + }, + { + "epoch": 0.7081140133081736, + "grad_norm": 0.40712325234995933, + "learning_rate": 2.0725388509638504e-06, + "loss": 0.0098, + "step": 3565 + }, + { + "epoch": 0.708312642764922, + "grad_norm": 0.318136163341281, + "learning_rate": 2.0699310794509176e-06, + "loss": 0.0089, + "step": 3566 + }, + { + "epoch": 0.7085112722216704, + "grad_norm": 0.22133628429597416, + "learning_rate": 2.0673245212765923e-06, + "loss": 0.0054, + "step": 3567 + }, + { + "epoch": 0.708709901678419, + "grad_norm": 1.0519287362857404, + "learning_rate": 2.064719177520244e-06, + "loss": 0.0146, + "step": 3568 + }, + { + "epoch": 0.7089085311351674, + "grad_norm": 0.38952311086071517, + "learning_rate": 2.062115049260745e-06, + "loss": 0.0088, + "step": 3569 + }, + { + "epoch": 0.7091071605919158, + "grad_norm": 0.5339933404088719, + "learning_rate": 2.0595121375764638e-06, + "loss": 0.0148, + "step": 3570 + }, + { + "epoch": 0.7093057900486642, + "grad_norm": 0.35237702102373786, + "learning_rate": 2.0569104435452597e-06, + "loss": 0.0071, + "step": 3571 + }, + { + "epoch": 0.7095044195054127, + "grad_norm": 0.5657247740451654, + "learning_rate": 2.0543099682444958e-06, + "loss": 0.0107, + "step": 3572 + }, + { + "epoch": 0.7097030489621611, + "grad_norm": 0.5548852852278983, + "learning_rate": 2.0517107127510223e-06, + "loss": 0.0107, + "step": 3573 + }, + { + "epoch": 0.7099016784189095, + "grad_norm": 0.34677930544310487, + "learning_rate": 2.049112678141192e-06, + "loss": 0.0053, + "step": 3574 + }, + { + "epoch": 0.7101003078756579, + "grad_norm": 0.3763747072298554, + "learning_rate": 2.0465158654908447e-06, + "loss": 0.0092, + "step": 3575 + }, + { + "epoch": 0.7102989373324063, + "grad_norm": 0.4447085423083739, + "learning_rate": 2.0439202758753196e-06, + "loss": 0.0177, + "step": 3576 + }, + { + "epoch": 0.7104975667891549, + "grad_norm": 0.43408562131660483, + "learning_rate": 2.04132591036945e-06, + "loss": 0.011, + "step": 3577 + }, + { + "epoch": 0.7106961962459033, + "grad_norm": 0.7457101724605041, + "learning_rate": 2.0387327700475564e-06, + "loss": 0.0103, + "step": 3578 + }, + { + "epoch": 0.7108948257026517, + "grad_norm": 0.7667181305870763, + "learning_rate": 2.036140855983458e-06, + "loss": 0.0115, + "step": 3579 + }, + { + "epoch": 0.7110934551594001, + "grad_norm": 0.4608200956966256, + "learning_rate": 2.0335501692504624e-06, + "loss": 0.0138, + "step": 3580 + }, + { + "epoch": 0.7112920846161486, + "grad_norm": 0.43968224621643054, + "learning_rate": 2.0309607109213725e-06, + "loss": 0.0119, + "step": 3581 + }, + { + "epoch": 0.711490714072897, + "grad_norm": 0.38784857298098396, + "learning_rate": 2.028372482068477e-06, + "loss": 0.0086, + "step": 3582 + }, + { + "epoch": 0.7116893435296454, + "grad_norm": 0.4987232153713858, + "learning_rate": 2.025785483763561e-06, + "loss": 0.0195, + "step": 3583 + }, + { + "epoch": 0.7118879729863938, + "grad_norm": 0.7964544674594572, + "learning_rate": 2.0231997170778996e-06, + "loss": 0.0187, + "step": 3584 + }, + { + "epoch": 0.7120866024431424, + "grad_norm": 0.9392661611065467, + "learning_rate": 2.0206151830822523e-06, + "loss": 0.013, + "step": 3585 + }, + { + "epoch": 0.7122852318998908, + "grad_norm": 0.5308227215692699, + "learning_rate": 2.018031882846876e-06, + "loss": 0.0074, + "step": 3586 + }, + { + "epoch": 0.7124838613566392, + "grad_norm": 0.44551851990638935, + "learning_rate": 2.0154498174415104e-06, + "loss": 0.0069, + "step": 3587 + }, + { + "epoch": 0.7126824908133876, + "grad_norm": 0.4912170131917664, + "learning_rate": 2.0128689879353853e-06, + "loss": 0.0249, + "step": 3588 + }, + { + "epoch": 0.7128811202701361, + "grad_norm": 0.9819312671738274, + "learning_rate": 2.0102893953972223e-06, + "loss": 0.0155, + "step": 3589 + }, + { + "epoch": 0.7130797497268845, + "grad_norm": 0.6813312541973012, + "learning_rate": 2.0077110408952254e-06, + "loss": 0.0128, + "step": 3590 + }, + { + "epoch": 0.7132783791836329, + "grad_norm": 0.5331664640007728, + "learning_rate": 2.0051339254970912e-06, + "loss": 0.0146, + "step": 3591 + }, + { + "epoch": 0.7134770086403813, + "grad_norm": 0.2943537320564372, + "learning_rate": 2.0025580502699983e-06, + "loss": 0.0083, + "step": 3592 + }, + { + "epoch": 0.7136756380971297, + "grad_norm": 0.5213864974119058, + "learning_rate": 1.9999834162806143e-06, + "loss": 0.0119, + "step": 3593 + }, + { + "epoch": 0.7138742675538783, + "grad_norm": 0.46666853895153365, + "learning_rate": 1.997410024595095e-06, + "loss": 0.0091, + "step": 3594 + }, + { + "epoch": 0.7140728970106267, + "grad_norm": 0.5085939748759494, + "learning_rate": 1.9948378762790767e-06, + "loss": 0.0138, + "step": 3595 + }, + { + "epoch": 0.7142715264673751, + "grad_norm": 0.2990427145994346, + "learning_rate": 1.992266972397685e-06, + "loss": 0.0062, + "step": 3596 + }, + { + "epoch": 0.7144701559241236, + "grad_norm": 0.7704563352878734, + "learning_rate": 1.9896973140155274e-06, + "loss": 0.0136, + "step": 3597 + }, + { + "epoch": 0.714668785380872, + "grad_norm": 0.4896227179477158, + "learning_rate": 1.9871289021966984e-06, + "loss": 0.0091, + "step": 3598 + }, + { + "epoch": 0.7148674148376204, + "grad_norm": 0.5270626574804558, + "learning_rate": 1.9845617380047725e-06, + "loss": 0.0119, + "step": 3599 + }, + { + "epoch": 0.7150660442943688, + "grad_norm": 0.38674602009064785, + "learning_rate": 1.981995822502811e-06, + "loss": 0.008, + "step": 3600 + }, + { + "epoch": 0.7152646737511172, + "grad_norm": 0.291567327736593, + "learning_rate": 1.979431156753359e-06, + "loss": 0.0089, + "step": 3601 + }, + { + "epoch": 0.7154633032078658, + "grad_norm": 0.7286287787040973, + "learning_rate": 1.9768677418184382e-06, + "loss": 0.0097, + "step": 3602 + }, + { + "epoch": 0.7156619326646142, + "grad_norm": 0.3147595174206081, + "learning_rate": 1.97430557875956e-06, + "loss": 0.0074, + "step": 3603 + }, + { + "epoch": 0.7158605621213626, + "grad_norm": 0.557758074899719, + "learning_rate": 1.97174466863771e-06, + "loss": 0.014, + "step": 3604 + }, + { + "epoch": 0.716059191578111, + "grad_norm": 0.312647733026765, + "learning_rate": 1.9691850125133617e-06, + "loss": 0.0106, + "step": 3605 + }, + { + "epoch": 0.7162578210348595, + "grad_norm": 0.5740235026460454, + "learning_rate": 1.9666266114464626e-06, + "loss": 0.0105, + "step": 3606 + }, + { + "epoch": 0.7164564504916079, + "grad_norm": 0.42789475100988844, + "learning_rate": 1.964069466496446e-06, + "loss": 0.008, + "step": 3607 + }, + { + "epoch": 0.7166550799483563, + "grad_norm": 0.3997543712505119, + "learning_rate": 1.961513578722225e-06, + "loss": 0.0067, + "step": 3608 + }, + { + "epoch": 0.7168537094051047, + "grad_norm": 0.5950672430416363, + "learning_rate": 1.958958949182187e-06, + "loss": 0.012, + "step": 3609 + }, + { + "epoch": 0.7170523388618533, + "grad_norm": 0.6623144173230123, + "learning_rate": 1.956405578934204e-06, + "loss": 0.0094, + "step": 3610 + }, + { + "epoch": 0.7172509683186017, + "grad_norm": 0.6793926564221918, + "learning_rate": 1.9538534690356225e-06, + "loss": 0.014, + "step": 3611 + }, + { + "epoch": 0.7174495977753501, + "grad_norm": 0.5600807419635767, + "learning_rate": 1.951302620543268e-06, + "loss": 0.0142, + "step": 3612 + }, + { + "epoch": 0.7176482272320985, + "grad_norm": 0.4854484281401081, + "learning_rate": 1.9487530345134464e-06, + "loss": 0.01, + "step": 3613 + }, + { + "epoch": 0.717846856688847, + "grad_norm": 0.9611540312341723, + "learning_rate": 1.946204712001936e-06, + "loss": 0.0098, + "step": 3614 + }, + { + "epoch": 0.7180454861455954, + "grad_norm": 0.5517679891572496, + "learning_rate": 1.9436576540639985e-06, + "loss": 0.0096, + "step": 3615 + }, + { + "epoch": 0.7182441156023438, + "grad_norm": 0.41701208366022036, + "learning_rate": 1.9411118617543634e-06, + "loss": 0.0077, + "step": 3616 + }, + { + "epoch": 0.7184427450590922, + "grad_norm": 0.36515434166067073, + "learning_rate": 1.938567336127245e-06, + "loss": 0.0145, + "step": 3617 + }, + { + "epoch": 0.7186413745158406, + "grad_norm": 0.42121979670363874, + "learning_rate": 1.936024078236325e-06, + "loss": 0.0089, + "step": 3618 + }, + { + "epoch": 0.7188400039725892, + "grad_norm": 0.43493645872145054, + "learning_rate": 1.9334820891347663e-06, + "loss": 0.0106, + "step": 3619 + }, + { + "epoch": 0.7190386334293376, + "grad_norm": 0.39201007950116096, + "learning_rate": 1.930941369875205e-06, + "loss": 0.0066, + "step": 3620 + }, + { + "epoch": 0.719237262886086, + "grad_norm": 0.44616612890490304, + "learning_rate": 1.928401921509747e-06, + "loss": 0.0089, + "step": 3621 + }, + { + "epoch": 0.7194358923428344, + "grad_norm": 0.5005235650718418, + "learning_rate": 1.9258637450899796e-06, + "loss": 0.0082, + "step": 3622 + }, + { + "epoch": 0.7196345217995829, + "grad_norm": 0.9873137170116204, + "learning_rate": 1.9233268416669547e-06, + "loss": 0.0143, + "step": 3623 + }, + { + "epoch": 0.7198331512563313, + "grad_norm": 0.9359450207876253, + "learning_rate": 1.920791212291206e-06, + "loss": 0.0142, + "step": 3624 + }, + { + "epoch": 0.7200317807130797, + "grad_norm": 0.8723213929265855, + "learning_rate": 1.9182568580127304e-06, + "loss": 0.0147, + "step": 3625 + }, + { + "epoch": 0.7202304101698281, + "grad_norm": 0.7238675157448663, + "learning_rate": 1.9157237798810037e-06, + "loss": 0.0116, + "step": 3626 + }, + { + "epoch": 0.7204290396265767, + "grad_norm": 0.8076205888225351, + "learning_rate": 1.9131919789449733e-06, + "loss": 0.0177, + "step": 3627 + }, + { + "epoch": 0.7206276690833251, + "grad_norm": 1.0779343741041851, + "learning_rate": 1.910661456253051e-06, + "loss": 0.021, + "step": 3628 + }, + { + "epoch": 0.7208262985400735, + "grad_norm": 0.5096320669800932, + "learning_rate": 1.9081322128531277e-06, + "loss": 0.0114, + "step": 3629 + }, + { + "epoch": 0.7210249279968219, + "grad_norm": 0.6623908050965808, + "learning_rate": 1.905604249792557e-06, + "loss": 0.0052, + "step": 3630 + }, + { + "epoch": 0.7212235574535704, + "grad_norm": 0.47990832396620436, + "learning_rate": 1.9030775681181696e-06, + "loss": 0.0091, + "step": 3631 + }, + { + "epoch": 0.7214221869103188, + "grad_norm": 0.7334184988391341, + "learning_rate": 1.9005521688762585e-06, + "loss": 0.0159, + "step": 3632 + }, + { + "epoch": 0.7216208163670672, + "grad_norm": 0.4980418429190839, + "learning_rate": 1.89802805311259e-06, + "loss": 0.0128, + "step": 3633 + }, + { + "epoch": 0.7218194458238156, + "grad_norm": 0.47426037363700535, + "learning_rate": 1.8955052218724002e-06, + "loss": 0.0085, + "step": 3634 + }, + { + "epoch": 0.722018075280564, + "grad_norm": 0.5843280660499018, + "learning_rate": 1.892983676200389e-06, + "loss": 0.0122, + "step": 3635 + }, + { + "epoch": 0.7222167047373126, + "grad_norm": 0.8246276543714378, + "learning_rate": 1.8904634171407238e-06, + "loss": 0.0095, + "step": 3636 + }, + { + "epoch": 0.722415334194061, + "grad_norm": 1.1929203320268906, + "learning_rate": 1.887944445737046e-06, + "loss": 0.0182, + "step": 3637 + }, + { + "epoch": 0.7226139636508094, + "grad_norm": 0.46024686499789297, + "learning_rate": 1.8854267630324547e-06, + "loss": 0.0061, + "step": 3638 + }, + { + "epoch": 0.7228125931075579, + "grad_norm": 0.3041353357374024, + "learning_rate": 1.882910370069524e-06, + "loss": 0.006, + "step": 3639 + }, + { + "epoch": 0.7230112225643063, + "grad_norm": 0.566018431463613, + "learning_rate": 1.8803952678902853e-06, + "loss": 0.0125, + "step": 3640 + }, + { + "epoch": 0.7232098520210547, + "grad_norm": 0.3629859525601858, + "learning_rate": 1.877881457536244e-06, + "loss": 0.0135, + "step": 3641 + }, + { + "epoch": 0.7234084814778031, + "grad_norm": 0.49111992542952826, + "learning_rate": 1.8753689400483627e-06, + "loss": 0.0075, + "step": 3642 + }, + { + "epoch": 0.7236071109345515, + "grad_norm": 0.3138065124597806, + "learning_rate": 1.8728577164670747e-06, + "loss": 0.0043, + "step": 3643 + }, + { + "epoch": 0.7238057403913001, + "grad_norm": 0.9114034734955218, + "learning_rate": 1.8703477878322763e-06, + "loss": 0.0131, + "step": 3644 + }, + { + "epoch": 0.7240043698480485, + "grad_norm": 0.4633138637914843, + "learning_rate": 1.8678391551833225e-06, + "loss": 0.0076, + "step": 3645 + }, + { + "epoch": 0.7242029993047969, + "grad_norm": 0.5559865923945981, + "learning_rate": 1.8653318195590403e-06, + "loss": 0.0102, + "step": 3646 + }, + { + "epoch": 0.7244016287615453, + "grad_norm": 0.45909590775240494, + "learning_rate": 1.8628257819977102e-06, + "loss": 0.0114, + "step": 3647 + }, + { + "epoch": 0.7246002582182938, + "grad_norm": 0.5720442731394239, + "learning_rate": 1.8603210435370845e-06, + "loss": 0.0139, + "step": 3648 + }, + { + "epoch": 0.7247988876750422, + "grad_norm": 1.0088288086073944, + "learning_rate": 1.8578176052143682e-06, + "loss": 0.0105, + "step": 3649 + }, + { + "epoch": 0.7249975171317906, + "grad_norm": 0.341268216037555, + "learning_rate": 1.8553154680662355e-06, + "loss": 0.0102, + "step": 3650 + }, + { + "epoch": 0.725196146588539, + "grad_norm": 0.5430532731720251, + "learning_rate": 1.8528146331288198e-06, + "loss": 0.0056, + "step": 3651 + }, + { + "epoch": 0.7253947760452876, + "grad_norm": 0.6547103394450872, + "learning_rate": 1.8503151014377108e-06, + "loss": 0.0141, + "step": 3652 + }, + { + "epoch": 0.725593405502036, + "grad_norm": 0.7543034404187979, + "learning_rate": 1.8478168740279662e-06, + "loss": 0.0146, + "step": 3653 + }, + { + "epoch": 0.7257920349587844, + "grad_norm": 0.8527254403878096, + "learning_rate": 1.8453199519340959e-06, + "loss": 0.0108, + "step": 3654 + }, + { + "epoch": 0.7259906644155328, + "grad_norm": 0.8087062079395743, + "learning_rate": 1.8428243361900754e-06, + "loss": 0.0127, + "step": 3655 + }, + { + "epoch": 0.7261892938722813, + "grad_norm": 0.906253089444188, + "learning_rate": 1.840330027829334e-06, + "loss": 0.0245, + "step": 3656 + }, + { + "epoch": 0.7263879233290297, + "grad_norm": 0.6648053391954097, + "learning_rate": 1.8378370278847646e-06, + "loss": 0.012, + "step": 3657 + }, + { + "epoch": 0.7265865527857781, + "grad_norm": 0.8741980778803626, + "learning_rate": 1.835345337388718e-06, + "loss": 0.0136, + "step": 3658 + }, + { + "epoch": 0.7267851822425265, + "grad_norm": 0.637411767418505, + "learning_rate": 1.8328549573729948e-06, + "loss": 0.0148, + "step": 3659 + }, + { + "epoch": 0.726983811699275, + "grad_norm": 0.6026345033209373, + "learning_rate": 1.8303658888688642e-06, + "loss": 0.011, + "step": 3660 + }, + { + "epoch": 0.7271824411560235, + "grad_norm": 0.7594705123411722, + "learning_rate": 1.8278781329070422e-06, + "loss": 0.017, + "step": 3661 + }, + { + "epoch": 0.7273810706127719, + "grad_norm": 0.41929886968876007, + "learning_rate": 1.8253916905177093e-06, + "loss": 0.0134, + "step": 3662 + }, + { + "epoch": 0.7275797000695203, + "grad_norm": 1.0949746883385296, + "learning_rate": 1.8229065627305003e-06, + "loss": 0.0237, + "step": 3663 + }, + { + "epoch": 0.7277783295262688, + "grad_norm": 0.3118311043232577, + "learning_rate": 1.8204227505744998e-06, + "loss": 0.0068, + "step": 3664 + }, + { + "epoch": 0.7279769589830172, + "grad_norm": 0.659409007129231, + "learning_rate": 1.817940255078256e-06, + "loss": 0.01, + "step": 3665 + }, + { + "epoch": 0.7281755884397656, + "grad_norm": 0.5934643848129301, + "learning_rate": 1.8154590772697644e-06, + "loss": 0.0096, + "step": 3666 + }, + { + "epoch": 0.728374217896514, + "grad_norm": 0.4976392266560354, + "learning_rate": 1.8129792181764817e-06, + "loss": 0.013, + "step": 3667 + }, + { + "epoch": 0.7285728473532624, + "grad_norm": 0.47726296885447955, + "learning_rate": 1.810500678825311e-06, + "loss": 0.0144, + "step": 3668 + }, + { + "epoch": 0.728771476810011, + "grad_norm": 0.22945888004342208, + "learning_rate": 1.8080234602426155e-06, + "loss": 0.0047, + "step": 3669 + }, + { + "epoch": 0.7289701062667594, + "grad_norm": 0.5110342276655542, + "learning_rate": 1.8055475634542102e-06, + "loss": 0.0078, + "step": 3670 + }, + { + "epoch": 0.7291687357235078, + "grad_norm": 0.31793929875854604, + "learning_rate": 1.8030729894853582e-06, + "loss": 0.0134, + "step": 3671 + }, + { + "epoch": 0.7293673651802562, + "grad_norm": 0.7487761059663722, + "learning_rate": 1.8005997393607816e-06, + "loss": 0.0206, + "step": 3672 + }, + { + "epoch": 0.7295659946370047, + "grad_norm": 0.2538735229220153, + "learning_rate": 1.7981278141046472e-06, + "loss": 0.0088, + "step": 3673 + }, + { + "epoch": 0.7297646240937531, + "grad_norm": 0.4930426210508408, + "learning_rate": 1.7956572147405798e-06, + "loss": 0.0116, + "step": 3674 + }, + { + "epoch": 0.7299632535505015, + "grad_norm": 0.46080199402940236, + "learning_rate": 1.7931879422916499e-06, + "loss": 0.0054, + "step": 3675 + }, + { + "epoch": 0.7301618830072499, + "grad_norm": 0.44044922022937444, + "learning_rate": 1.7907199977803809e-06, + "loss": 0.0099, + "step": 3676 + }, + { + "epoch": 0.7303605124639984, + "grad_norm": 0.42505852730210714, + "learning_rate": 1.788253382228749e-06, + "loss": 0.0196, + "step": 3677 + }, + { + "epoch": 0.7305591419207469, + "grad_norm": 0.3898875170650358, + "learning_rate": 1.7857880966581738e-06, + "loss": 0.0051, + "step": 3678 + }, + { + "epoch": 0.7307577713774953, + "grad_norm": 0.49491000937059526, + "learning_rate": 1.7833241420895304e-06, + "loss": 0.011, + "step": 3679 + }, + { + "epoch": 0.7309564008342437, + "grad_norm": 0.8767113299077823, + "learning_rate": 1.7808615195431366e-06, + "loss": 0.0131, + "step": 3680 + }, + { + "epoch": 0.7311550302909922, + "grad_norm": 0.6506677708061251, + "learning_rate": 1.778400230038766e-06, + "loss": 0.0141, + "step": 3681 + }, + { + "epoch": 0.7313536597477406, + "grad_norm": 0.31762522470848703, + "learning_rate": 1.775940274595634e-06, + "loss": 0.0072, + "step": 3682 + }, + { + "epoch": 0.731552289204489, + "grad_norm": 0.3400734480718243, + "learning_rate": 1.7734816542324034e-06, + "loss": 0.0069, + "step": 3683 + }, + { + "epoch": 0.7317509186612374, + "grad_norm": 0.3523223550612423, + "learning_rate": 1.7710243699671908e-06, + "loss": 0.0089, + "step": 3684 + }, + { + "epoch": 0.7319495481179858, + "grad_norm": 0.26978468960487634, + "learning_rate": 1.768568422817551e-06, + "loss": 0.0061, + "step": 3685 + }, + { + "epoch": 0.7321481775747344, + "grad_norm": 0.5668983473598859, + "learning_rate": 1.7661138138004918e-06, + "loss": 0.0153, + "step": 3686 + }, + { + "epoch": 0.7323468070314828, + "grad_norm": 1.5335058937602353, + "learning_rate": 1.763660543932465e-06, + "loss": 0.0177, + "step": 3687 + }, + { + "epoch": 0.7325454364882312, + "grad_norm": 0.5330209491574611, + "learning_rate": 1.7612086142293643e-06, + "loss": 0.0097, + "step": 3688 + }, + { + "epoch": 0.7327440659449797, + "grad_norm": 0.5463971064029184, + "learning_rate": 1.758758025706535e-06, + "loss": 0.0125, + "step": 3689 + }, + { + "epoch": 0.7329426954017281, + "grad_norm": 0.4545424691631491, + "learning_rate": 1.7563087793787597e-06, + "loss": 0.0095, + "step": 3690 + }, + { + "epoch": 0.7331413248584765, + "grad_norm": 0.4004144836163272, + "learning_rate": 1.753860876260272e-06, + "loss": 0.0109, + "step": 3691 + }, + { + "epoch": 0.7333399543152249, + "grad_norm": 0.4843018902925179, + "learning_rate": 1.751414317364743e-06, + "loss": 0.0128, + "step": 3692 + }, + { + "epoch": 0.7335385837719733, + "grad_norm": 0.27511747492039373, + "learning_rate": 1.7489691037052925e-06, + "loss": 0.0068, + "step": 3693 + }, + { + "epoch": 0.7337372132287219, + "grad_norm": 0.36643270106143355, + "learning_rate": 1.7465252362944818e-06, + "loss": 0.0093, + "step": 3694 + }, + { + "epoch": 0.7339358426854703, + "grad_norm": 0.320728805385628, + "learning_rate": 1.7440827161443108e-06, + "loss": 0.0115, + "step": 3695 + }, + { + "epoch": 0.7341344721422187, + "grad_norm": 0.42125602343146334, + "learning_rate": 1.7416415442662283e-06, + "loss": 0.0135, + "step": 3696 + }, + { + "epoch": 0.7343331015989671, + "grad_norm": 0.418662760132342, + "learning_rate": 1.7392017216711178e-06, + "loss": 0.0113, + "step": 3697 + }, + { + "epoch": 0.7345317310557156, + "grad_norm": 0.29967942764047406, + "learning_rate": 1.7367632493693098e-06, + "loss": 0.0108, + "step": 3698 + }, + { + "epoch": 0.734730360512464, + "grad_norm": 0.683085498593672, + "learning_rate": 1.7343261283705714e-06, + "loss": 0.0108, + "step": 3699 + }, + { + "epoch": 0.7349289899692124, + "grad_norm": 0.3669500417485653, + "learning_rate": 1.731890359684113e-06, + "loss": 0.0086, + "step": 3700 + }, + { + "epoch": 0.7351276194259608, + "grad_norm": 0.7989281415130531, + "learning_rate": 1.7294559443185854e-06, + "loss": 0.0114, + "step": 3701 + }, + { + "epoch": 0.7353262488827093, + "grad_norm": 0.3908467521711616, + "learning_rate": 1.727022883282074e-06, + "loss": 0.0079, + "step": 3702 + }, + { + "epoch": 0.7355248783394578, + "grad_norm": 0.4811819714857938, + "learning_rate": 1.7245911775821112e-06, + "loss": 0.005, + "step": 3703 + }, + { + "epoch": 0.7357235077962062, + "grad_norm": 0.5482194796559414, + "learning_rate": 1.7221608282256597e-06, + "loss": 0.0088, + "step": 3704 + }, + { + "epoch": 0.7359221372529546, + "grad_norm": 0.3769237988240613, + "learning_rate": 1.7197318362191284e-06, + "loss": 0.0138, + "step": 3705 + }, + { + "epoch": 0.7361207667097031, + "grad_norm": 0.6344995974195727, + "learning_rate": 1.717304202568359e-06, + "loss": 0.0169, + "step": 3706 + }, + { + "epoch": 0.7363193961664515, + "grad_norm": 0.5850068173437899, + "learning_rate": 1.7148779282786305e-06, + "loss": 0.0138, + "step": 3707 + }, + { + "epoch": 0.7365180256231999, + "grad_norm": 0.7795863924661017, + "learning_rate": 1.712453014354663e-06, + "loss": 0.0122, + "step": 3708 + }, + { + "epoch": 0.7367166550799483, + "grad_norm": 0.35280976439404965, + "learning_rate": 1.7100294618006092e-06, + "loss": 0.0084, + "step": 3709 + }, + { + "epoch": 0.7369152845366967, + "grad_norm": 0.5700366010660607, + "learning_rate": 1.7076072716200616e-06, + "loss": 0.0155, + "step": 3710 + }, + { + "epoch": 0.7371139139934453, + "grad_norm": 0.40895343575304466, + "learning_rate": 1.7051864448160444e-06, + "loss": 0.0084, + "step": 3711 + }, + { + "epoch": 0.7373125434501937, + "grad_norm": 0.4305656202174194, + "learning_rate": 1.7027669823910208e-06, + "loss": 0.0111, + "step": 3712 + }, + { + "epoch": 0.7375111729069421, + "grad_norm": 0.984538631213734, + "learning_rate": 1.7003488853468897e-06, + "loss": 0.0131, + "step": 3713 + }, + { + "epoch": 0.7377098023636905, + "grad_norm": 0.5010493377039495, + "learning_rate": 1.6979321546849786e-06, + "loss": 0.0063, + "step": 3714 + }, + { + "epoch": 0.737908431820439, + "grad_norm": 0.45048026983636796, + "learning_rate": 1.6955167914060578e-06, + "loss": 0.012, + "step": 3715 + }, + { + "epoch": 0.7381070612771874, + "grad_norm": 0.2741879991054503, + "learning_rate": 1.6931027965103224e-06, + "loss": 0.0057, + "step": 3716 + }, + { + "epoch": 0.7383056907339358, + "grad_norm": 0.4130916332257633, + "learning_rate": 1.6906901709974093e-06, + "loss": 0.0115, + "step": 3717 + }, + { + "epoch": 0.7385043201906842, + "grad_norm": 0.561420411364561, + "learning_rate": 1.6882789158663803e-06, + "loss": 0.0128, + "step": 3718 + }, + { + "epoch": 0.7387029496474327, + "grad_norm": 0.22467074037808235, + "learning_rate": 1.6858690321157362e-06, + "loss": 0.0036, + "step": 3719 + }, + { + "epoch": 0.7389015791041812, + "grad_norm": 0.7303477211037094, + "learning_rate": 1.6834605207434084e-06, + "loss": 0.0167, + "step": 3720 + }, + { + "epoch": 0.7391002085609296, + "grad_norm": 0.5588024743579839, + "learning_rate": 1.6810533827467563e-06, + "loss": 0.0104, + "step": 3721 + }, + { + "epoch": 0.739298838017678, + "grad_norm": 0.2374237543068285, + "learning_rate": 1.6786476191225764e-06, + "loss": 0.0045, + "step": 3722 + }, + { + "epoch": 0.7394974674744265, + "grad_norm": 0.4732242064561535, + "learning_rate": 1.6762432308670895e-06, + "loss": 0.0095, + "step": 3723 + }, + { + "epoch": 0.7396960969311749, + "grad_norm": 1.3353741885909276, + "learning_rate": 1.6738402189759539e-06, + "loss": 0.0084, + "step": 3724 + }, + { + "epoch": 0.7398947263879233, + "grad_norm": 0.7402268023476685, + "learning_rate": 1.671438584444251e-06, + "loss": 0.0161, + "step": 3725 + }, + { + "epoch": 0.7400933558446717, + "grad_norm": 0.48374235673574467, + "learning_rate": 1.6690383282664975e-06, + "loss": 0.0117, + "step": 3726 + }, + { + "epoch": 0.7402919853014202, + "grad_norm": 0.3143118173710068, + "learning_rate": 1.6666394514366374e-06, + "loss": 0.0106, + "step": 3727 + }, + { + "epoch": 0.7404906147581687, + "grad_norm": 0.622043363621592, + "learning_rate": 1.6642419549480414e-06, + "loss": 0.0069, + "step": 3728 + }, + { + "epoch": 0.7406892442149171, + "grad_norm": 0.5599094194736531, + "learning_rate": 1.6618458397935128e-06, + "loss": 0.015, + "step": 3729 + }, + { + "epoch": 0.7408878736716655, + "grad_norm": 0.48331856066211576, + "learning_rate": 1.6594511069652786e-06, + "loss": 0.0066, + "step": 3730 + }, + { + "epoch": 0.741086503128414, + "grad_norm": 0.34166598414346067, + "learning_rate": 1.6570577574549945e-06, + "loss": 0.0066, + "step": 3731 + }, + { + "epoch": 0.7412851325851624, + "grad_norm": 0.427571456888473, + "learning_rate": 1.6546657922537467e-06, + "loss": 0.0069, + "step": 3732 + }, + { + "epoch": 0.7414837620419108, + "grad_norm": 0.3951042263387438, + "learning_rate": 1.6522752123520431e-06, + "loss": 0.0057, + "step": 3733 + }, + { + "epoch": 0.7416823914986592, + "grad_norm": 0.5078153456049683, + "learning_rate": 1.6498860187398225e-06, + "loss": 0.0113, + "step": 3734 + }, + { + "epoch": 0.7418810209554076, + "grad_norm": 0.8021479414729211, + "learning_rate": 1.6474982124064453e-06, + "loss": 0.0165, + "step": 3735 + }, + { + "epoch": 0.7420796504121562, + "grad_norm": 0.2070717809242952, + "learning_rate": 1.6451117943407014e-06, + "loss": 0.0068, + "step": 3736 + }, + { + "epoch": 0.7422782798689046, + "grad_norm": 1.1209489750991877, + "learning_rate": 1.6427267655308048e-06, + "loss": 0.0134, + "step": 3737 + }, + { + "epoch": 0.742476909325653, + "grad_norm": 0.9481691520290123, + "learning_rate": 1.6403431269643916e-06, + "loss": 0.0169, + "step": 3738 + }, + { + "epoch": 0.7426755387824014, + "grad_norm": 0.6585340668755794, + "learning_rate": 1.6379608796285263e-06, + "loss": 0.01, + "step": 3739 + }, + { + "epoch": 0.7428741682391499, + "grad_norm": 0.7454877441367229, + "learning_rate": 1.635580024509692e-06, + "loss": 0.0084, + "step": 3740 + }, + { + "epoch": 0.7430727976958983, + "grad_norm": 0.6830037620298528, + "learning_rate": 1.6332005625938025e-06, + "loss": 0.0139, + "step": 3741 + }, + { + "epoch": 0.7432714271526467, + "grad_norm": 0.40026913724686264, + "learning_rate": 1.6308224948661867e-06, + "loss": 0.0102, + "step": 3742 + }, + { + "epoch": 0.7434700566093951, + "grad_norm": 1.211034939945853, + "learning_rate": 1.6284458223116011e-06, + "loss": 0.0208, + "step": 3743 + }, + { + "epoch": 0.7436686860661436, + "grad_norm": 0.4950643350049753, + "learning_rate": 1.6260705459142268e-06, + "loss": 0.0099, + "step": 3744 + }, + { + "epoch": 0.7438673155228921, + "grad_norm": 0.7561528087411485, + "learning_rate": 1.6236966666576586e-06, + "loss": 0.015, + "step": 3745 + }, + { + "epoch": 0.7440659449796405, + "grad_norm": 0.4587296244272926, + "learning_rate": 1.6213241855249211e-06, + "loss": 0.0105, + "step": 3746 + }, + { + "epoch": 0.7442645744363889, + "grad_norm": 0.35825062637065686, + "learning_rate": 1.6189531034984534e-06, + "loss": 0.0071, + "step": 3747 + }, + { + "epoch": 0.7444632038931374, + "grad_norm": 0.1667528612749447, + "learning_rate": 1.616583421560121e-06, + "loss": 0.0034, + "step": 3748 + }, + { + "epoch": 0.7446618333498858, + "grad_norm": 1.4414518890486638, + "learning_rate": 1.6142151406912043e-06, + "loss": 0.0252, + "step": 3749 + }, + { + "epoch": 0.7448604628066342, + "grad_norm": 0.8832252217740378, + "learning_rate": 1.6118482618724073e-06, + "loss": 0.0266, + "step": 3750 + }, + { + "epoch": 0.7450590922633826, + "grad_norm": 0.76220401745124, + "learning_rate": 1.6094827860838535e-06, + "loss": 0.0106, + "step": 3751 + }, + { + "epoch": 0.745257721720131, + "grad_norm": 0.9771556397073955, + "learning_rate": 1.6071187143050809e-06, + "loss": 0.0122, + "step": 3752 + }, + { + "epoch": 0.7454563511768796, + "grad_norm": 0.3409011178631712, + "learning_rate": 1.6047560475150532e-06, + "loss": 0.0044, + "step": 3753 + }, + { + "epoch": 0.745654980633628, + "grad_norm": 1.2118712960840325, + "learning_rate": 1.6023947866921452e-06, + "loss": 0.0163, + "step": 3754 + }, + { + "epoch": 0.7458536100903764, + "grad_norm": 0.7180041710450296, + "learning_rate": 1.6000349328141528e-06, + "loss": 0.0171, + "step": 3755 + }, + { + "epoch": 0.7460522395471249, + "grad_norm": 0.8759814111147064, + "learning_rate": 1.597676486858291e-06, + "loss": 0.0129, + "step": 3756 + }, + { + "epoch": 0.7462508690038733, + "grad_norm": 0.3860414331142917, + "learning_rate": 1.5953194498011876e-06, + "loss": 0.0081, + "step": 3757 + }, + { + "epoch": 0.7464494984606217, + "grad_norm": 0.6321819223523495, + "learning_rate": 1.5929638226188915e-06, + "loss": 0.0144, + "step": 3758 + }, + { + "epoch": 0.7466481279173701, + "grad_norm": 0.21545187521317896, + "learning_rate": 1.5906096062868638e-06, + "loss": 0.0035, + "step": 3759 + }, + { + "epoch": 0.7468467573741185, + "grad_norm": 0.47253677184045745, + "learning_rate": 1.5882568017799848e-06, + "loss": 0.0099, + "step": 3760 + }, + { + "epoch": 0.747045386830867, + "grad_norm": 0.678212792886403, + "learning_rate": 1.5859054100725463e-06, + "loss": 0.0164, + "step": 3761 + }, + { + "epoch": 0.7472440162876155, + "grad_norm": 0.5852719573661888, + "learning_rate": 1.5835554321382585e-06, + "loss": 0.0111, + "step": 3762 + }, + { + "epoch": 0.7474426457443639, + "grad_norm": 0.46344554315419095, + "learning_rate": 1.5812068689502464e-06, + "loss": 0.0174, + "step": 3763 + }, + { + "epoch": 0.7476412752011123, + "grad_norm": 0.6136914065626226, + "learning_rate": 1.5788597214810448e-06, + "loss": 0.0084, + "step": 3764 + }, + { + "epoch": 0.7478399046578608, + "grad_norm": 0.6281579179223541, + "learning_rate": 1.576513990702608e-06, + "loss": 0.0083, + "step": 3765 + }, + { + "epoch": 0.7480385341146092, + "grad_norm": 0.543301875004366, + "learning_rate": 1.5741696775862975e-06, + "loss": 0.009, + "step": 3766 + }, + { + "epoch": 0.7482371635713576, + "grad_norm": 0.36572588305171494, + "learning_rate": 1.5718267831028944e-06, + "loss": 0.0119, + "step": 3767 + }, + { + "epoch": 0.748435793028106, + "grad_norm": 0.24068019330565446, + "learning_rate": 1.5694853082225859e-06, + "loss": 0.0055, + "step": 3768 + }, + { + "epoch": 0.7486344224848545, + "grad_norm": 0.41680849368459344, + "learning_rate": 1.5671452539149761e-06, + "loss": 0.0062, + "step": 3769 + }, + { + "epoch": 0.748833051941603, + "grad_norm": 0.8860868458069969, + "learning_rate": 1.5648066211490804e-06, + "loss": 0.0158, + "step": 3770 + }, + { + "epoch": 0.7490316813983514, + "grad_norm": 0.36239580091258583, + "learning_rate": 1.5624694108933208e-06, + "loss": 0.0079, + "step": 3771 + }, + { + "epoch": 0.7492303108550998, + "grad_norm": 0.855233140426005, + "learning_rate": 1.560133624115538e-06, + "loss": 0.0133, + "step": 3772 + }, + { + "epoch": 0.7494289403118483, + "grad_norm": 0.5163747324216429, + "learning_rate": 1.5577992617829745e-06, + "loss": 0.0119, + "step": 3773 + }, + { + "epoch": 0.7496275697685967, + "grad_norm": 4.943484511480794, + "learning_rate": 1.5554663248622914e-06, + "loss": 0.0085, + "step": 3774 + }, + { + "epoch": 0.7498261992253451, + "grad_norm": 0.6847981517474558, + "learning_rate": 1.5531348143195524e-06, + "loss": 0.0158, + "step": 3775 + }, + { + "epoch": 0.7500248286820935, + "grad_norm": 0.24726926855367248, + "learning_rate": 1.5508047311202346e-06, + "loss": 0.0063, + "step": 3776 + }, + { + "epoch": 0.750223458138842, + "grad_norm": 1.0653880290235649, + "learning_rate": 1.548476076229225e-06, + "loss": 0.0152, + "step": 3777 + }, + { + "epoch": 0.7504220875955905, + "grad_norm": 0.41225668837673585, + "learning_rate": 1.5461488506108163e-06, + "loss": 0.0079, + "step": 3778 + }, + { + "epoch": 0.7506207170523389, + "grad_norm": 0.39688034716378284, + "learning_rate": 1.5438230552287076e-06, + "loss": 0.0133, + "step": 3779 + }, + { + "epoch": 0.7508193465090873, + "grad_norm": 0.7160254723600693, + "learning_rate": 1.5414986910460127e-06, + "loss": 0.0119, + "step": 3780 + }, + { + "epoch": 0.7510179759658357, + "grad_norm": 0.5240990626907865, + "learning_rate": 1.5391757590252448e-06, + "loss": 0.0115, + "step": 3781 + }, + { + "epoch": 0.7512166054225842, + "grad_norm": 1.2549979086012635, + "learning_rate": 1.536854260128331e-06, + "loss": 0.0133, + "step": 3782 + }, + { + "epoch": 0.7514152348793326, + "grad_norm": 0.9872465010490661, + "learning_rate": 1.5345341953165982e-06, + "loss": 0.0137, + "step": 3783 + }, + { + "epoch": 0.751613864336081, + "grad_norm": 0.34007165388302485, + "learning_rate": 1.5322155655507859e-06, + "loss": 0.0084, + "step": 3784 + }, + { + "epoch": 0.7518124937928294, + "grad_norm": 0.494044558950559, + "learning_rate": 1.5298983717910342e-06, + "loss": 0.0127, + "step": 3785 + }, + { + "epoch": 0.7520111232495779, + "grad_norm": 0.8210240909130295, + "learning_rate": 1.5275826149968913e-06, + "loss": 0.0089, + "step": 3786 + }, + { + "epoch": 0.7522097527063264, + "grad_norm": 0.47775536674059704, + "learning_rate": 1.5252682961273125e-06, + "loss": 0.0107, + "step": 3787 + }, + { + "epoch": 0.7524083821630748, + "grad_norm": 0.42651256912812496, + "learning_rate": 1.5229554161406502e-06, + "loss": 0.0104, + "step": 3788 + }, + { + "epoch": 0.7526070116198232, + "grad_norm": 0.5814023196245535, + "learning_rate": 1.52064397599467e-06, + "loss": 0.0115, + "step": 3789 + }, + { + "epoch": 0.7528056410765717, + "grad_norm": 0.39860990655481554, + "learning_rate": 1.5183339766465332e-06, + "loss": 0.0108, + "step": 3790 + }, + { + "epoch": 0.7530042705333201, + "grad_norm": 0.3816121403616372, + "learning_rate": 1.5160254190528118e-06, + "loss": 0.0073, + "step": 3791 + }, + { + "epoch": 0.7532028999900685, + "grad_norm": 0.5562129339192128, + "learning_rate": 1.5137183041694736e-06, + "loss": 0.0145, + "step": 3792 + }, + { + "epoch": 0.7534015294468169, + "grad_norm": 0.35565917270513564, + "learning_rate": 1.5114126329518947e-06, + "loss": 0.0088, + "step": 3793 + }, + { + "epoch": 0.7536001589035654, + "grad_norm": 0.47105502946267286, + "learning_rate": 1.5091084063548527e-06, + "loss": 0.0126, + "step": 3794 + }, + { + "epoch": 0.7537987883603139, + "grad_norm": 0.6185552661380914, + "learning_rate": 1.506805625332522e-06, + "loss": 0.0123, + "step": 3795 + }, + { + "epoch": 0.7539974178170623, + "grad_norm": 0.40215289956017086, + "learning_rate": 1.504504290838485e-06, + "loss": 0.0075, + "step": 3796 + }, + { + "epoch": 0.7541960472738107, + "grad_norm": 0.4782485953056544, + "learning_rate": 1.5022044038257195e-06, + "loss": 0.0162, + "step": 3797 + }, + { + "epoch": 0.7543946767305592, + "grad_norm": 0.7324829878695893, + "learning_rate": 1.4999059652466085e-06, + "loss": 0.0105, + "step": 3798 + }, + { + "epoch": 0.7545933061873076, + "grad_norm": 0.47495420804446264, + "learning_rate": 1.4976089760529311e-06, + "loss": 0.0139, + "step": 3799 + }, + { + "epoch": 0.754791935644056, + "grad_norm": 0.30470882134001215, + "learning_rate": 1.4953134371958694e-06, + "loss": 0.0044, + "step": 3800 + }, + { + "epoch": 0.7549905651008044, + "grad_norm": 0.5558913333675957, + "learning_rate": 1.4930193496260053e-06, + "loss": 0.0136, + "step": 3801 + }, + { + "epoch": 0.7551891945575528, + "grad_norm": 0.5677602935945104, + "learning_rate": 1.4907267142933162e-06, + "loss": 0.007, + "step": 3802 + }, + { + "epoch": 0.7553878240143013, + "grad_norm": 0.44674486843299116, + "learning_rate": 1.488435532147181e-06, + "loss": 0.0097, + "step": 3803 + }, + { + "epoch": 0.7555864534710498, + "grad_norm": 0.24979032118916042, + "learning_rate": 1.4861458041363736e-06, + "loss": 0.0067, + "step": 3804 + }, + { + "epoch": 0.7557850829277982, + "grad_norm": 0.644903890844482, + "learning_rate": 1.4838575312090692e-06, + "loss": 0.0176, + "step": 3805 + }, + { + "epoch": 0.7559837123845466, + "grad_norm": 0.4250844938968267, + "learning_rate": 1.481570714312842e-06, + "loss": 0.0095, + "step": 3806 + }, + { + "epoch": 0.7561823418412951, + "grad_norm": 0.5450830869397256, + "learning_rate": 1.4792853543946572e-06, + "loss": 0.014, + "step": 3807 + }, + { + "epoch": 0.7563809712980435, + "grad_norm": 0.2125242305530338, + "learning_rate": 1.477001452400883e-06, + "loss": 0.0067, + "step": 3808 + }, + { + "epoch": 0.7565796007547919, + "grad_norm": 0.6807746018493928, + "learning_rate": 1.4747190092772774e-06, + "loss": 0.0151, + "step": 3809 + }, + { + "epoch": 0.7567782302115403, + "grad_norm": 0.42307268538948456, + "learning_rate": 1.4724380259690013e-06, + "loss": 0.0119, + "step": 3810 + }, + { + "epoch": 0.7569768596682888, + "grad_norm": 0.6930123507211039, + "learning_rate": 1.470158503420605e-06, + "loss": 0.0104, + "step": 3811 + }, + { + "epoch": 0.7571754891250373, + "grad_norm": 0.3363804661976727, + "learning_rate": 1.4678804425760374e-06, + "loss": 0.0049, + "step": 3812 + }, + { + "epoch": 0.7573741185817857, + "grad_norm": 0.5791696042890436, + "learning_rate": 1.4656038443786425e-06, + "loss": 0.0119, + "step": 3813 + }, + { + "epoch": 0.7575727480385341, + "grad_norm": 0.30695431877058926, + "learning_rate": 1.4633287097711552e-06, + "loss": 0.0062, + "step": 3814 + }, + { + "epoch": 0.7577713774952826, + "grad_norm": 0.5269032914372845, + "learning_rate": 1.4610550396957085e-06, + "loss": 0.0175, + "step": 3815 + }, + { + "epoch": 0.757970006952031, + "grad_norm": 0.38075958392152365, + "learning_rate": 1.458782835093825e-06, + "loss": 0.008, + "step": 3816 + }, + { + "epoch": 0.7581686364087794, + "grad_norm": 0.3840340193367981, + "learning_rate": 1.4565120969064222e-06, + "loss": 0.0062, + "step": 3817 + }, + { + "epoch": 0.7583672658655278, + "grad_norm": 0.44995171484252383, + "learning_rate": 1.454242826073814e-06, + "loss": 0.0105, + "step": 3818 + }, + { + "epoch": 0.7585658953222763, + "grad_norm": 0.755410175506028, + "learning_rate": 1.451975023535699e-06, + "loss": 0.0151, + "step": 3819 + }, + { + "epoch": 0.7587645247790247, + "grad_norm": 0.4925822601592495, + "learning_rate": 1.4497086902311746e-06, + "loss": 0.0118, + "step": 3820 + }, + { + "epoch": 0.7589631542357732, + "grad_norm": 0.33772272051963276, + "learning_rate": 1.4474438270987257e-06, + "loss": 0.0062, + "step": 3821 + }, + { + "epoch": 0.7591617836925216, + "grad_norm": 0.8290174834097047, + "learning_rate": 1.445180435076231e-06, + "loss": 0.0136, + "step": 3822 + }, + { + "epoch": 0.75936041314927, + "grad_norm": 0.43343542171958976, + "learning_rate": 1.4429185151009573e-06, + "loss": 0.0092, + "step": 3823 + }, + { + "epoch": 0.7595590426060185, + "grad_norm": 0.775111328446171, + "learning_rate": 1.4406580681095639e-06, + "loss": 0.0181, + "step": 3824 + }, + { + "epoch": 0.7597576720627669, + "grad_norm": 0.39166179536426354, + "learning_rate": 1.4383990950381022e-06, + "loss": 0.0088, + "step": 3825 + }, + { + "epoch": 0.7599563015195153, + "grad_norm": 0.4925837143073198, + "learning_rate": 1.4361415968220082e-06, + "loss": 0.0101, + "step": 3826 + }, + { + "epoch": 0.7601549309762637, + "grad_norm": 0.32696898445858813, + "learning_rate": 1.43388557439611e-06, + "loss": 0.0086, + "step": 3827 + }, + { + "epoch": 0.7603535604330122, + "grad_norm": 0.4958561019808758, + "learning_rate": 1.4316310286946228e-06, + "loss": 0.0137, + "step": 3828 + }, + { + "epoch": 0.7605521898897607, + "grad_norm": 0.4975604655384996, + "learning_rate": 1.4293779606511527e-06, + "loss": 0.0136, + "step": 3829 + }, + { + "epoch": 0.7607508193465091, + "grad_norm": 0.49887393821198955, + "learning_rate": 1.4271263711986954e-06, + "loss": 0.0103, + "step": 3830 + }, + { + "epoch": 0.7609494488032575, + "grad_norm": 0.3683363611647419, + "learning_rate": 1.4248762612696282e-06, + "loss": 0.0063, + "step": 3831 + }, + { + "epoch": 0.761148078260006, + "grad_norm": 0.3079985582051433, + "learning_rate": 1.4226276317957228e-06, + "loss": 0.0125, + "step": 3832 + }, + { + "epoch": 0.7613467077167544, + "grad_norm": 0.4074575929705946, + "learning_rate": 1.4203804837081308e-06, + "loss": 0.0075, + "step": 3833 + }, + { + "epoch": 0.7615453371735028, + "grad_norm": 0.49220265205108876, + "learning_rate": 1.4181348179373972e-06, + "loss": 0.0127, + "step": 3834 + }, + { + "epoch": 0.7617439666302512, + "grad_norm": 0.2819678338268758, + "learning_rate": 1.4158906354134472e-06, + "loss": 0.0082, + "step": 3835 + }, + { + "epoch": 0.7619425960869997, + "grad_norm": 0.5745246590432623, + "learning_rate": 1.413647937065596e-06, + "loss": 0.0149, + "step": 3836 + }, + { + "epoch": 0.7621412255437482, + "grad_norm": 0.6022591411514704, + "learning_rate": 1.4114067238225438e-06, + "loss": 0.017, + "step": 3837 + }, + { + "epoch": 0.7623398550004966, + "grad_norm": 0.5962439448569538, + "learning_rate": 1.4091669966123717e-06, + "loss": 0.0079, + "step": 3838 + }, + { + "epoch": 0.762538484457245, + "grad_norm": 0.48725897414015545, + "learning_rate": 1.4069287563625523e-06, + "loss": 0.0122, + "step": 3839 + }, + { + "epoch": 0.7627371139139935, + "grad_norm": 0.6291651873134785, + "learning_rate": 1.404692003999935e-06, + "loss": 0.0142, + "step": 3840 + }, + { + "epoch": 0.7629357433707419, + "grad_norm": 0.5770793807297394, + "learning_rate": 1.4024567404507606e-06, + "loss": 0.0129, + "step": 3841 + }, + { + "epoch": 0.7631343728274903, + "grad_norm": 0.4801313035665573, + "learning_rate": 1.4002229666406454e-06, + "loss": 0.0127, + "step": 3842 + }, + { + "epoch": 0.7633330022842387, + "grad_norm": 0.5099357827368747, + "learning_rate": 1.3979906834945944e-06, + "loss": 0.0139, + "step": 3843 + }, + { + "epoch": 0.7635316317409871, + "grad_norm": 0.3833860818815375, + "learning_rate": 1.3957598919369958e-06, + "loss": 0.0099, + "step": 3844 + }, + { + "epoch": 0.7637302611977356, + "grad_norm": 0.41162753016444315, + "learning_rate": 1.3935305928916154e-06, + "loss": 0.0108, + "step": 3845 + }, + { + "epoch": 0.7639288906544841, + "grad_norm": 0.3078560285036505, + "learning_rate": 1.3913027872816064e-06, + "loss": 0.0095, + "step": 3846 + }, + { + "epoch": 0.7641275201112325, + "grad_norm": 0.37973724509719714, + "learning_rate": 1.3890764760294979e-06, + "loss": 0.0074, + "step": 3847 + }, + { + "epoch": 0.764326149567981, + "grad_norm": 0.4498229089715063, + "learning_rate": 1.3868516600572056e-06, + "loss": 0.0091, + "step": 3848 + }, + { + "epoch": 0.7645247790247294, + "grad_norm": 0.37444629015372083, + "learning_rate": 1.3846283402860216e-06, + "loss": 0.0099, + "step": 3849 + }, + { + "epoch": 0.7647234084814778, + "grad_norm": 0.5009045045676753, + "learning_rate": 1.3824065176366225e-06, + "loss": 0.0105, + "step": 3850 + }, + { + "epoch": 0.7649220379382262, + "grad_norm": 0.4475657849726896, + "learning_rate": 1.3801861930290623e-06, + "loss": 0.0064, + "step": 3851 + }, + { + "epoch": 0.7651206673949746, + "grad_norm": 0.774374062344075, + "learning_rate": 1.377967367382772e-06, + "loss": 0.0182, + "step": 3852 + }, + { + "epoch": 0.7653192968517231, + "grad_norm": 0.4792793910315004, + "learning_rate": 1.3757500416165686e-06, + "loss": 0.012, + "step": 3853 + }, + { + "epoch": 0.7655179263084716, + "grad_norm": 0.6802435312134761, + "learning_rate": 1.3735342166486448e-06, + "loss": 0.0146, + "step": 3854 + }, + { + "epoch": 0.76571655576522, + "grad_norm": 0.5822617738126644, + "learning_rate": 1.3713198933965687e-06, + "loss": 0.0102, + "step": 3855 + }, + { + "epoch": 0.7659151852219684, + "grad_norm": 0.3228251141085814, + "learning_rate": 1.3691070727772927e-06, + "loss": 0.0058, + "step": 3856 + }, + { + "epoch": 0.7661138146787169, + "grad_norm": 0.5602496268212196, + "learning_rate": 1.36689575570714e-06, + "loss": 0.0138, + "step": 3857 + }, + { + "epoch": 0.7663124441354653, + "grad_norm": 0.8410546969132571, + "learning_rate": 1.3646859431018178e-06, + "loss": 0.0168, + "step": 3858 + }, + { + "epoch": 0.7665110735922137, + "grad_norm": 0.49153277899098013, + "learning_rate": 1.3624776358764046e-06, + "loss": 0.0101, + "step": 3859 + }, + { + "epoch": 0.7667097030489621, + "grad_norm": 0.4718983704981794, + "learning_rate": 1.3602708349453603e-06, + "loss": 0.0146, + "step": 3860 + }, + { + "epoch": 0.7669083325057106, + "grad_norm": 0.46323476790072704, + "learning_rate": 1.3580655412225192e-06, + "loss": 0.0141, + "step": 3861 + }, + { + "epoch": 0.767106961962459, + "grad_norm": 0.72794578293661, + "learning_rate": 1.3558617556210891e-06, + "loss": 0.0146, + "step": 3862 + }, + { + "epoch": 0.7673055914192075, + "grad_norm": 0.36007802301261976, + "learning_rate": 1.3536594790536584e-06, + "loss": 0.0073, + "step": 3863 + }, + { + "epoch": 0.7675042208759559, + "grad_norm": 0.6871366174736734, + "learning_rate": 1.3514587124321842e-06, + "loss": 0.0154, + "step": 3864 + }, + { + "epoch": 0.7677028503327044, + "grad_norm": 0.77934118848991, + "learning_rate": 1.3492594566680052e-06, + "loss": 0.0133, + "step": 3865 + }, + { + "epoch": 0.7679014797894528, + "grad_norm": 0.31816704466209683, + "learning_rate": 1.3470617126718272e-06, + "loss": 0.0114, + "step": 3866 + }, + { + "epoch": 0.7681001092462012, + "grad_norm": 0.39457341298241466, + "learning_rate": 1.344865481353736e-06, + "loss": 0.0088, + "step": 3867 + }, + { + "epoch": 0.7682987387029496, + "grad_norm": 0.3878354688307015, + "learning_rate": 1.34267076362319e-06, + "loss": 0.0051, + "step": 3868 + }, + { + "epoch": 0.768497368159698, + "grad_norm": 0.5176469767858003, + "learning_rate": 1.3404775603890175e-06, + "loss": 0.0102, + "step": 3869 + }, + { + "epoch": 0.7686959976164465, + "grad_norm": 0.6546341810751645, + "learning_rate": 1.3382858725594233e-06, + "loss": 0.016, + "step": 3870 + }, + { + "epoch": 0.768894627073195, + "grad_norm": 0.4077083813837435, + "learning_rate": 1.3360957010419813e-06, + "loss": 0.0127, + "step": 3871 + }, + { + "epoch": 0.7690932565299434, + "grad_norm": 1.0786672783122584, + "learning_rate": 1.333907046743641e-06, + "loss": 0.0139, + "step": 3872 + }, + { + "epoch": 0.7692918859866918, + "grad_norm": 0.620538079771535, + "learning_rate": 1.3317199105707207e-06, + "loss": 0.0119, + "step": 3873 + }, + { + "epoch": 0.7694905154434403, + "grad_norm": 0.7487056379611181, + "learning_rate": 1.3295342934289128e-06, + "loss": 0.0135, + "step": 3874 + }, + { + "epoch": 0.7696891449001887, + "grad_norm": 0.5658770968139971, + "learning_rate": 1.3273501962232787e-06, + "loss": 0.0136, + "step": 3875 + }, + { + "epoch": 0.7698877743569371, + "grad_norm": 0.5544123537115362, + "learning_rate": 1.3251676198582491e-06, + "loss": 0.0076, + "step": 3876 + }, + { + "epoch": 0.7700864038136855, + "grad_norm": 0.42604725459840853, + "learning_rate": 1.3229865652376295e-06, + "loss": 0.0092, + "step": 3877 + }, + { + "epoch": 0.770285033270434, + "grad_norm": 0.7356682784487021, + "learning_rate": 1.3208070332645889e-06, + "loss": 0.0091, + "step": 3878 + }, + { + "epoch": 0.7704836627271825, + "grad_norm": 0.2945392911865687, + "learning_rate": 1.3186290248416723e-06, + "loss": 0.0094, + "step": 3879 + }, + { + "epoch": 0.7706822921839309, + "grad_norm": 0.4400140082230475, + "learning_rate": 1.3164525408707908e-06, + "loss": 0.012, + "step": 3880 + }, + { + "epoch": 0.7708809216406793, + "grad_norm": 0.3677167541782022, + "learning_rate": 1.3142775822532216e-06, + "loss": 0.0068, + "step": 3881 + }, + { + "epoch": 0.7710795510974278, + "grad_norm": 0.393778558813364, + "learning_rate": 1.3121041498896165e-06, + "loss": 0.0093, + "step": 3882 + }, + { + "epoch": 0.7712781805541762, + "grad_norm": 0.41937295389208945, + "learning_rate": 1.3099322446799883e-06, + "loss": 0.0109, + "step": 3883 + }, + { + "epoch": 0.7714768100109246, + "grad_norm": 0.45418818468109395, + "learning_rate": 1.3077618675237235e-06, + "loss": 0.009, + "step": 3884 + }, + { + "epoch": 0.771675439467673, + "grad_norm": 0.5022438379809215, + "learning_rate": 1.305593019319571e-06, + "loss": 0.0133, + "step": 3885 + }, + { + "epoch": 0.7718740689244215, + "grad_norm": 0.7531062350453083, + "learning_rate": 1.3034257009656486e-06, + "loss": 0.0098, + "step": 3886 + }, + { + "epoch": 0.7720726983811699, + "grad_norm": 0.32685807203315603, + "learning_rate": 1.3012599133594438e-06, + "loss": 0.0122, + "step": 3887 + }, + { + "epoch": 0.7722713278379184, + "grad_norm": 0.6199595695269277, + "learning_rate": 1.299095657397803e-06, + "loss": 0.0137, + "step": 3888 + }, + { + "epoch": 0.7724699572946668, + "grad_norm": 0.918750119255603, + "learning_rate": 1.296932933976946e-06, + "loss": 0.0183, + "step": 3889 + }, + { + "epoch": 0.7726685867514153, + "grad_norm": 0.5007197845896081, + "learning_rate": 1.294771743992451e-06, + "loss": 0.01, + "step": 3890 + }, + { + "epoch": 0.7728672162081637, + "grad_norm": 0.4774983945538157, + "learning_rate": 1.292612088339268e-06, + "loss": 0.0112, + "step": 3891 + }, + { + "epoch": 0.7730658456649121, + "grad_norm": 0.40168632160434964, + "learning_rate": 1.2904539679117051e-06, + "loss": 0.0045, + "step": 3892 + }, + { + "epoch": 0.7732644751216605, + "grad_norm": 0.6613299836977145, + "learning_rate": 1.2882973836034391e-06, + "loss": 0.0173, + "step": 3893 + }, + { + "epoch": 0.7734631045784089, + "grad_norm": 2.4503594172896195, + "learning_rate": 1.286142336307511e-06, + "loss": 0.016, + "step": 3894 + }, + { + "epoch": 0.7736617340351574, + "grad_norm": 0.5175369687816846, + "learning_rate": 1.283988826916321e-06, + "loss": 0.0129, + "step": 3895 + }, + { + "epoch": 0.7738603634919059, + "grad_norm": 0.32059427002808744, + "learning_rate": 1.2818368563216377e-06, + "loss": 0.0094, + "step": 3896 + }, + { + "epoch": 0.7740589929486543, + "grad_norm": 0.45258914822238383, + "learning_rate": 1.2796864254145875e-06, + "loss": 0.0123, + "step": 3897 + }, + { + "epoch": 0.7742576224054027, + "grad_norm": 0.31073316498638376, + "learning_rate": 1.277537535085664e-06, + "loss": 0.0105, + "step": 3898 + }, + { + "epoch": 0.7744562518621512, + "grad_norm": 0.27654739609890827, + "learning_rate": 1.2753901862247198e-06, + "loss": 0.0068, + "step": 3899 + }, + { + "epoch": 0.7746548813188996, + "grad_norm": 0.7702240957803342, + "learning_rate": 1.2732443797209676e-06, + "loss": 0.014, + "step": 3900 + }, + { + "epoch": 0.774853510775648, + "grad_norm": 0.7888401795775241, + "learning_rate": 1.2711001164629878e-06, + "loss": 0.0107, + "step": 3901 + }, + { + "epoch": 0.7750521402323964, + "grad_norm": 0.5102152605501087, + "learning_rate": 1.2689573973387136e-06, + "loss": 0.0109, + "step": 3902 + }, + { + "epoch": 0.7752507696891449, + "grad_norm": 0.42949075002493076, + "learning_rate": 1.2668162232354453e-06, + "loss": 0.0102, + "step": 3903 + }, + { + "epoch": 0.7754493991458933, + "grad_norm": 0.51913928021783, + "learning_rate": 1.2646765950398415e-06, + "loss": 0.0152, + "step": 3904 + }, + { + "epoch": 0.7756480286026418, + "grad_norm": 0.5147647620976568, + "learning_rate": 1.2625385136379181e-06, + "loss": 0.0109, + "step": 3905 + }, + { + "epoch": 0.7758466580593902, + "grad_norm": 0.3620633765074665, + "learning_rate": 1.2604019799150547e-06, + "loss": 0.0099, + "step": 3906 + }, + { + "epoch": 0.7760452875161387, + "grad_norm": 0.623127129178747, + "learning_rate": 1.2582669947559845e-06, + "loss": 0.0142, + "step": 3907 + }, + { + "epoch": 0.7762439169728871, + "grad_norm": 0.8733934868624431, + "learning_rate": 1.2561335590448066e-06, + "loss": 0.0162, + "step": 3908 + }, + { + "epoch": 0.7764425464296355, + "grad_norm": 0.8813493665439702, + "learning_rate": 1.2540016736649713e-06, + "loss": 0.0126, + "step": 3909 + }, + { + "epoch": 0.7766411758863839, + "grad_norm": 0.27843671726361163, + "learning_rate": 1.2518713394992916e-06, + "loss": 0.0077, + "step": 3910 + }, + { + "epoch": 0.7768398053431324, + "grad_norm": 0.5249639405585835, + "learning_rate": 1.2497425574299376e-06, + "loss": 0.0139, + "step": 3911 + }, + { + "epoch": 0.7770384347998808, + "grad_norm": 0.4152029494822117, + "learning_rate": 1.247615328338434e-06, + "loss": 0.0069, + "step": 3912 + }, + { + "epoch": 0.7772370642566293, + "grad_norm": 0.9012103889077504, + "learning_rate": 1.2454896531056665e-06, + "loss": 0.012, + "step": 3913 + }, + { + "epoch": 0.7774356937133777, + "grad_norm": 0.42111435803255814, + "learning_rate": 1.2433655326118726e-06, + "loss": 0.0102, + "step": 3914 + }, + { + "epoch": 0.7776343231701262, + "grad_norm": 0.7641685303436699, + "learning_rate": 1.2412429677366512e-06, + "loss": 0.0118, + "step": 3915 + }, + { + "epoch": 0.7778329526268746, + "grad_norm": 0.5032971538363441, + "learning_rate": 1.239121959358951e-06, + "loss": 0.0073, + "step": 3916 + }, + { + "epoch": 0.778031582083623, + "grad_norm": 0.774857905002289, + "learning_rate": 1.2370025083570813e-06, + "loss": 0.0082, + "step": 3917 + }, + { + "epoch": 0.7782302115403714, + "grad_norm": 0.5279560951716294, + "learning_rate": 1.2348846156087058e-06, + "loss": 0.0139, + "step": 3918 + }, + { + "epoch": 0.7784288409971198, + "grad_norm": 0.4746514702620925, + "learning_rate": 1.2327682819908393e-06, + "loss": 0.0111, + "step": 3919 + }, + { + "epoch": 0.7786274704538683, + "grad_norm": 0.5401006357078507, + "learning_rate": 1.230653508379856e-06, + "loss": 0.0128, + "step": 3920 + }, + { + "epoch": 0.7788260999106168, + "grad_norm": 0.5347132085056566, + "learning_rate": 1.2285402956514786e-06, + "loss": 0.0094, + "step": 3921 + }, + { + "epoch": 0.7790247293673652, + "grad_norm": 0.8006086363779595, + "learning_rate": 1.2264286446807893e-06, + "loss": 0.0211, + "step": 3922 + }, + { + "epoch": 0.7792233588241136, + "grad_norm": 0.729596534109269, + "learning_rate": 1.2243185563422194e-06, + "loss": 0.0102, + "step": 3923 + }, + { + "epoch": 0.7794219882808621, + "grad_norm": 0.8162366400705077, + "learning_rate": 1.222210031509553e-06, + "loss": 0.0154, + "step": 3924 + }, + { + "epoch": 0.7796206177376105, + "grad_norm": 0.8298607900398938, + "learning_rate": 1.2201030710559309e-06, + "loss": 0.0154, + "step": 3925 + }, + { + "epoch": 0.7798192471943589, + "grad_norm": 0.9295004548979635, + "learning_rate": 1.2179976758538397e-06, + "loss": 0.0202, + "step": 3926 + }, + { + "epoch": 0.7800178766511073, + "grad_norm": 0.6284727316819763, + "learning_rate": 1.2158938467751258e-06, + "loss": 0.0144, + "step": 3927 + }, + { + "epoch": 0.7802165061078558, + "grad_norm": 0.31174985864230775, + "learning_rate": 1.213791584690978e-06, + "loss": 0.008, + "step": 3928 + }, + { + "epoch": 0.7804151355646042, + "grad_norm": 0.3904178709197262, + "learning_rate": 1.2116908904719443e-06, + "loss": 0.008, + "step": 3929 + }, + { + "epoch": 0.7806137650213527, + "grad_norm": 0.28385884511776377, + "learning_rate": 1.2095917649879202e-06, + "loss": 0.0072, + "step": 3930 + }, + { + "epoch": 0.7808123944781011, + "grad_norm": 0.8225239608519143, + "learning_rate": 1.2074942091081493e-06, + "loss": 0.0077, + "step": 3931 + }, + { + "epoch": 0.7810110239348496, + "grad_norm": 0.25070849880393475, + "learning_rate": 1.2053982237012295e-06, + "loss": 0.0067, + "step": 3932 + }, + { + "epoch": 0.781209653391598, + "grad_norm": 1.009173744589044, + "learning_rate": 1.2033038096351042e-06, + "loss": 0.0153, + "step": 3933 + }, + { + "epoch": 0.7814082828483464, + "grad_norm": 0.3001079077055634, + "learning_rate": 1.20121096777707e-06, + "loss": 0.0073, + "step": 3934 + }, + { + "epoch": 0.7816069123050948, + "grad_norm": 0.2675585992071396, + "learning_rate": 1.1991196989937693e-06, + "loss": 0.0077, + "step": 3935 + }, + { + "epoch": 0.7818055417618432, + "grad_norm": 0.36211169776646895, + "learning_rate": 1.1970300041511945e-06, + "loss": 0.0065, + "step": 3936 + }, + { + "epoch": 0.7820041712185917, + "grad_norm": 0.556461940995312, + "learning_rate": 1.1949418841146875e-06, + "loss": 0.0149, + "step": 3937 + }, + { + "epoch": 0.7822028006753402, + "grad_norm": 0.41740062523359517, + "learning_rate": 1.192855339748935e-06, + "loss": 0.0151, + "step": 3938 + }, + { + "epoch": 0.7824014301320886, + "grad_norm": 0.31475234587927653, + "learning_rate": 1.1907703719179752e-06, + "loss": 0.0064, + "step": 3939 + }, + { + "epoch": 0.782600059588837, + "grad_norm": 0.6800297738810858, + "learning_rate": 1.1886869814851881e-06, + "loss": 0.0138, + "step": 3940 + }, + { + "epoch": 0.7827986890455855, + "grad_norm": 0.5648602421057914, + "learning_rate": 1.186605169313307e-06, + "loss": 0.0143, + "step": 3941 + }, + { + "epoch": 0.7829973185023339, + "grad_norm": 0.32364785454126055, + "learning_rate": 1.1845249362644046e-06, + "loss": 0.0111, + "step": 3942 + }, + { + "epoch": 0.7831959479590823, + "grad_norm": 7.128471218729785, + "learning_rate": 1.1824462831999057e-06, + "loss": 0.018, + "step": 3943 + }, + { + "epoch": 0.7833945774158307, + "grad_norm": 0.46537352198306225, + "learning_rate": 1.1803692109805786e-06, + "loss": 0.0096, + "step": 3944 + }, + { + "epoch": 0.7835932068725792, + "grad_norm": 0.40237938194467165, + "learning_rate": 1.178293720466535e-06, + "loss": 0.006, + "step": 3945 + }, + { + "epoch": 0.7837918363293276, + "grad_norm": 0.47649481459898346, + "learning_rate": 1.1762198125172364e-06, + "loss": 0.0135, + "step": 3946 + }, + { + "epoch": 0.7839904657860761, + "grad_norm": 0.3889132630569039, + "learning_rate": 1.1741474879914837e-06, + "loss": 0.0091, + "step": 3947 + }, + { + "epoch": 0.7841890952428245, + "grad_norm": 0.3589855854840536, + "learning_rate": 1.1720767477474238e-06, + "loss": 0.0071, + "step": 3948 + }, + { + "epoch": 0.784387724699573, + "grad_norm": 0.6600060042525093, + "learning_rate": 1.1700075926425508e-06, + "loss": 0.0073, + "step": 3949 + }, + { + "epoch": 0.7845863541563214, + "grad_norm": 0.5605141682819981, + "learning_rate": 1.167940023533697e-06, + "loss": 0.0107, + "step": 3950 + }, + { + "epoch": 0.7847849836130698, + "grad_norm": 0.4276017759614575, + "learning_rate": 1.1658740412770426e-06, + "loss": 0.0108, + "step": 3951 + }, + { + "epoch": 0.7849836130698182, + "grad_norm": 0.5245144378224614, + "learning_rate": 1.1638096467281074e-06, + "loss": 0.0085, + "step": 3952 + }, + { + "epoch": 0.7851822425265667, + "grad_norm": 0.504334640539037, + "learning_rate": 1.1617468407417553e-06, + "loss": 0.0101, + "step": 3953 + }, + { + "epoch": 0.7853808719833151, + "grad_norm": 0.48205839823898566, + "learning_rate": 1.1596856241721944e-06, + "loss": 0.0116, + "step": 3954 + }, + { + "epoch": 0.7855795014400636, + "grad_norm": 0.6530170104490158, + "learning_rate": 1.1576259978729692e-06, + "loss": 0.0178, + "step": 3955 + }, + { + "epoch": 0.785778130896812, + "grad_norm": 0.710759087565475, + "learning_rate": 1.1555679626969724e-06, + "loss": 0.0144, + "step": 3956 + }, + { + "epoch": 0.7859767603535605, + "grad_norm": 0.5424274465145213, + "learning_rate": 1.1535115194964304e-06, + "loss": 0.0132, + "step": 3957 + }, + { + "epoch": 0.7861753898103089, + "grad_norm": 0.6661319005283131, + "learning_rate": 1.1514566691229178e-06, + "loss": 0.013, + "step": 3958 + }, + { + "epoch": 0.7863740192670573, + "grad_norm": 0.5573412948277443, + "learning_rate": 1.1494034124273428e-06, + "loss": 0.0133, + "step": 3959 + }, + { + "epoch": 0.7865726487238057, + "grad_norm": 0.4236751414096416, + "learning_rate": 1.147351750259959e-06, + "loss": 0.0079, + "step": 3960 + }, + { + "epoch": 0.7867712781805541, + "grad_norm": 0.6700103105511918, + "learning_rate": 1.1453016834703584e-06, + "loss": 0.0142, + "step": 3961 + }, + { + "epoch": 0.7869699076373026, + "grad_norm": 0.6722312403375201, + "learning_rate": 1.1432532129074692e-06, + "loss": 0.0175, + "step": 3962 + }, + { + "epoch": 0.7871685370940511, + "grad_norm": 0.29540238303248256, + "learning_rate": 1.1412063394195634e-06, + "loss": 0.0082, + "step": 3963 + }, + { + "epoch": 0.7873671665507995, + "grad_norm": 0.453040253508881, + "learning_rate": 1.1391610638542473e-06, + "loss": 0.0149, + "step": 3964 + }, + { + "epoch": 0.787565796007548, + "grad_norm": 0.3063401650275241, + "learning_rate": 1.1371173870584696e-06, + "loss": 0.0087, + "step": 3965 + }, + { + "epoch": 0.7877644254642964, + "grad_norm": 0.7536587487092659, + "learning_rate": 1.1350753098785117e-06, + "loss": 0.0112, + "step": 3966 + }, + { + "epoch": 0.7879630549210448, + "grad_norm": 0.31791285239524447, + "learning_rate": 1.1330348331599978e-06, + "loss": 0.0101, + "step": 3967 + }, + { + "epoch": 0.7881616843777932, + "grad_norm": 0.5345985537470007, + "learning_rate": 1.1309959577478885e-06, + "loss": 0.0138, + "step": 3968 + }, + { + "epoch": 0.7883603138345416, + "grad_norm": 0.39066543567981465, + "learning_rate": 1.128958684486477e-06, + "loss": 0.0105, + "step": 3969 + }, + { + "epoch": 0.7885589432912901, + "grad_norm": 0.6971256838903706, + "learning_rate": 1.1269230142194004e-06, + "loss": 0.0219, + "step": 3970 + }, + { + "epoch": 0.7887575727480385, + "grad_norm": 0.3025242348367761, + "learning_rate": 1.1248889477896224e-06, + "loss": 0.0064, + "step": 3971 + }, + { + "epoch": 0.788956202204787, + "grad_norm": 0.4101701372618817, + "learning_rate": 1.1228564860394508e-06, + "loss": 0.0085, + "step": 3972 + }, + { + "epoch": 0.7891548316615354, + "grad_norm": 1.342044070773984, + "learning_rate": 1.120825629810527e-06, + "loss": 0.0192, + "step": 3973 + }, + { + "epoch": 0.7893534611182839, + "grad_norm": 0.43993593424734334, + "learning_rate": 1.1187963799438235e-06, + "loss": 0.0124, + "step": 3974 + }, + { + "epoch": 0.7895520905750323, + "grad_norm": 0.8408300332951749, + "learning_rate": 1.1167687372796537e-06, + "loss": 0.0113, + "step": 3975 + }, + { + "epoch": 0.7897507200317807, + "grad_norm": 0.9328690964868325, + "learning_rate": 1.1147427026576597e-06, + "loss": 0.0184, + "step": 3976 + }, + { + "epoch": 0.7899493494885291, + "grad_norm": 0.41486356516047107, + "learning_rate": 1.1127182769168231e-06, + "loss": 0.0048, + "step": 3977 + }, + { + "epoch": 0.7901479789452776, + "grad_norm": 0.6855136367785546, + "learning_rate": 1.110695460895454e-06, + "loss": 0.0129, + "step": 3978 + }, + { + "epoch": 0.790346608402026, + "grad_norm": 0.28701281574074733, + "learning_rate": 1.108674255431199e-06, + "loss": 0.0041, + "step": 3979 + }, + { + "epoch": 0.7905452378587745, + "grad_norm": 0.4540954582400495, + "learning_rate": 1.10665466136104e-06, + "loss": 0.0152, + "step": 3980 + }, + { + "epoch": 0.7907438673155229, + "grad_norm": 0.7368225481887282, + "learning_rate": 1.1046366795212854e-06, + "loss": 0.0125, + "step": 3981 + }, + { + "epoch": 0.7909424967722714, + "grad_norm": 0.9802282075745697, + "learning_rate": 1.1026203107475824e-06, + "loss": 0.0145, + "step": 3982 + }, + { + "epoch": 0.7911411262290198, + "grad_norm": 1.0658263094138696, + "learning_rate": 1.100605555874904e-06, + "loss": 0.0151, + "step": 3983 + }, + { + "epoch": 0.7913397556857682, + "grad_norm": 0.35002483862277856, + "learning_rate": 1.0985924157375616e-06, + "loss": 0.0088, + "step": 3984 + }, + { + "epoch": 0.7915383851425166, + "grad_norm": 0.3244457738853698, + "learning_rate": 1.0965808911691917e-06, + "loss": 0.0066, + "step": 3985 + }, + { + "epoch": 0.791737014599265, + "grad_norm": 0.6796365282260834, + "learning_rate": 1.0945709830027657e-06, + "loss": 0.0094, + "step": 3986 + }, + { + "epoch": 0.7919356440560135, + "grad_norm": 0.45854564583815816, + "learning_rate": 1.0925626920705857e-06, + "loss": 0.0098, + "step": 3987 + }, + { + "epoch": 0.7921342735127619, + "grad_norm": 0.5264577666006467, + "learning_rate": 1.0905560192042808e-06, + "loss": 0.0071, + "step": 3988 + }, + { + "epoch": 0.7923329029695104, + "grad_norm": 0.3097781743722543, + "learning_rate": 1.0885509652348142e-06, + "loss": 0.008, + "step": 3989 + }, + { + "epoch": 0.7925315324262588, + "grad_norm": 0.4200986146469633, + "learning_rate": 1.086547530992475e-06, + "loss": 0.0089, + "step": 3990 + }, + { + "epoch": 0.7927301618830073, + "grad_norm": 0.6691123935864665, + "learning_rate": 1.0845457173068858e-06, + "loss": 0.0151, + "step": 3991 + }, + { + "epoch": 0.7929287913397557, + "grad_norm": 0.3305478813634996, + "learning_rate": 1.0825455250069921e-06, + "loss": 0.005, + "step": 3992 + }, + { + "epoch": 0.7931274207965041, + "grad_norm": 0.33748170186551923, + "learning_rate": 1.080546954921075e-06, + "loss": 0.0062, + "step": 3993 + }, + { + "epoch": 0.7933260502532525, + "grad_norm": 0.3245694682203549, + "learning_rate": 1.0785500078767392e-06, + "loss": 0.0057, + "step": 3994 + }, + { + "epoch": 0.793524679710001, + "grad_norm": 0.4644644236708028, + "learning_rate": 1.076554684700916e-06, + "loss": 0.0089, + "step": 3995 + }, + { + "epoch": 0.7937233091667494, + "grad_norm": 0.5038171184311694, + "learning_rate": 1.0745609862198692e-06, + "loss": 0.0116, + "step": 3996 + }, + { + "epoch": 0.7939219386234979, + "grad_norm": 0.6885486523828257, + "learning_rate": 1.0725689132591888e-06, + "loss": 0.0163, + "step": 3997 + }, + { + "epoch": 0.7941205680802463, + "grad_norm": 0.3900499706842763, + "learning_rate": 1.0705784666437863e-06, + "loss": 0.011, + "step": 3998 + }, + { + "epoch": 0.7943191975369948, + "grad_norm": 0.39376689041252166, + "learning_rate": 1.0685896471979074e-06, + "loss": 0.0072, + "step": 3999 + }, + { + "epoch": 0.7945178269937432, + "grad_norm": 0.472464666055705, + "learning_rate": 1.066602455745117e-06, + "loss": 0.0113, + "step": 4000 + }, + { + "epoch": 0.7947164564504916, + "grad_norm": 0.7011028914294587, + "learning_rate": 1.0646168931083123e-06, + "loss": 0.0159, + "step": 4001 + }, + { + "epoch": 0.79491508590724, + "grad_norm": 0.5949751520146886, + "learning_rate": 1.0626329601097102e-06, + "loss": 0.0188, + "step": 4002 + }, + { + "epoch": 0.7951137153639884, + "grad_norm": 0.5403625268434289, + "learning_rate": 1.0606506575708559e-06, + "loss": 0.0153, + "step": 4003 + }, + { + "epoch": 0.7953123448207369, + "grad_norm": 0.32460874730935, + "learning_rate": 1.0586699863126205e-06, + "loss": 0.0074, + "step": 4004 + }, + { + "epoch": 0.7955109742774854, + "grad_norm": 0.5225527629713734, + "learning_rate": 1.0566909471551956e-06, + "loss": 0.011, + "step": 4005 + }, + { + "epoch": 0.7957096037342338, + "grad_norm": 0.23452137300486403, + "learning_rate": 1.054713540918102e-06, + "loss": 0.006, + "step": 4006 + }, + { + "epoch": 0.7959082331909823, + "grad_norm": 0.9970244401372391, + "learning_rate": 1.0527377684201788e-06, + "loss": 0.0139, + "step": 4007 + }, + { + "epoch": 0.7961068626477307, + "grad_norm": 0.5496516753977624, + "learning_rate": 1.0507636304795942e-06, + "loss": 0.0113, + "step": 4008 + }, + { + "epoch": 0.7963054921044791, + "grad_norm": 0.40426703473069714, + "learning_rate": 1.0487911279138341e-06, + "loss": 0.0115, + "step": 4009 + }, + { + "epoch": 0.7965041215612275, + "grad_norm": 0.4432391943593047, + "learning_rate": 1.046820261539711e-06, + "loss": 0.0087, + "step": 4010 + }, + { + "epoch": 0.7967027510179759, + "grad_norm": 0.5434341498818477, + "learning_rate": 1.04485103217336e-06, + "loss": 0.0133, + "step": 4011 + }, + { + "epoch": 0.7969013804747244, + "grad_norm": 0.6869762130705815, + "learning_rate": 1.0428834406302345e-06, + "loss": 0.0172, + "step": 4012 + }, + { + "epoch": 0.7971000099314728, + "grad_norm": 0.3868532857982542, + "learning_rate": 1.040917487725115e-06, + "loss": 0.0107, + "step": 4013 + }, + { + "epoch": 0.7972986393882213, + "grad_norm": 0.7528663077873841, + "learning_rate": 1.0389531742720976e-06, + "loss": 0.008, + "step": 4014 + }, + { + "epoch": 0.7974972688449697, + "grad_norm": 0.47927375494835317, + "learning_rate": 1.0369905010846054e-06, + "loss": 0.0083, + "step": 4015 + }, + { + "epoch": 0.7976958983017182, + "grad_norm": 0.4350659995552571, + "learning_rate": 1.0350294689753764e-06, + "loss": 0.0088, + "step": 4016 + }, + { + "epoch": 0.7978945277584666, + "grad_norm": 0.295174664809995, + "learning_rate": 1.0330700787564756e-06, + "loss": 0.0045, + "step": 4017 + }, + { + "epoch": 0.798093157215215, + "grad_norm": 0.31793807650251443, + "learning_rate": 1.0311123312392823e-06, + "loss": 0.0099, + "step": 4018 + }, + { + "epoch": 0.7982917866719634, + "grad_norm": 0.23482176137295935, + "learning_rate": 1.0291562272344968e-06, + "loss": 0.0062, + "step": 4019 + }, + { + "epoch": 0.7984904161287119, + "grad_norm": 0.5061544706704849, + "learning_rate": 1.0272017675521423e-06, + "loss": 0.0089, + "step": 4020 + }, + { + "epoch": 0.7986890455854603, + "grad_norm": 0.31649122670119934, + "learning_rate": 1.0252489530015564e-06, + "loss": 0.0064, + "step": 4021 + }, + { + "epoch": 0.7988876750422088, + "grad_norm": 0.7586656461806361, + "learning_rate": 1.0232977843913983e-06, + "loss": 0.0149, + "step": 4022 + }, + { + "epoch": 0.7990863044989572, + "grad_norm": 0.6009260380452704, + "learning_rate": 1.0213482625296468e-06, + "loss": 0.0108, + "step": 4023 + }, + { + "epoch": 0.7992849339557057, + "grad_norm": 0.9958776677403033, + "learning_rate": 1.0194003882235943e-06, + "loss": 0.0113, + "step": 4024 + }, + { + "epoch": 0.7994835634124541, + "grad_norm": 0.5106754309922006, + "learning_rate": 1.0174541622798556e-06, + "loss": 0.0092, + "step": 4025 + }, + { + "epoch": 0.7996821928692025, + "grad_norm": 0.5365858874211565, + "learning_rate": 1.015509585504359e-06, + "loss": 0.0112, + "step": 4026 + }, + { + "epoch": 0.7998808223259509, + "grad_norm": 0.5783336315909131, + "learning_rate": 1.0135666587023545e-06, + "loss": 0.012, + "step": 4027 + }, + { + "epoch": 0.8000794517826993, + "grad_norm": 0.7373444717399422, + "learning_rate": 1.0116253826784028e-06, + "loss": 0.0112, + "step": 4028 + }, + { + "epoch": 0.8002780812394478, + "grad_norm": 0.23832355380940837, + "learning_rate": 1.0096857582363862e-06, + "loss": 0.0037, + "step": 4029 + }, + { + "epoch": 0.8004767106961962, + "grad_norm": 0.4162968251704296, + "learning_rate": 1.0077477861795026e-06, + "loss": 0.0098, + "step": 4030 + }, + { + "epoch": 0.8006753401529447, + "grad_norm": 0.2707466456827671, + "learning_rate": 1.0058114673102616e-06, + "loss": 0.0055, + "step": 4031 + }, + { + "epoch": 0.8008739696096931, + "grad_norm": 0.6216931576856428, + "learning_rate": 1.0038768024304928e-06, + "loss": 0.0139, + "step": 4032 + }, + { + "epoch": 0.8010725990664416, + "grad_norm": 0.5832019943429834, + "learning_rate": 1.0019437923413373e-06, + "loss": 0.0149, + "step": 4033 + }, + { + "epoch": 0.80127122852319, + "grad_norm": 0.6152187266424713, + "learning_rate": 1.0000124378432553e-06, + "loss": 0.0086, + "step": 4034 + }, + { + "epoch": 0.8014698579799384, + "grad_norm": 0.3619901915947861, + "learning_rate": 9.980827397360155e-07, + "loss": 0.0082, + "step": 4035 + }, + { + "epoch": 0.8016684874366868, + "grad_norm": 0.3343601589064828, + "learning_rate": 9.961546988187055e-07, + "loss": 0.0083, + "step": 4036 + }, + { + "epoch": 0.8018671168934353, + "grad_norm": 0.5999940967835934, + "learning_rate": 9.942283158897264e-07, + "loss": 0.0149, + "step": 4037 + }, + { + "epoch": 0.8020657463501837, + "grad_norm": 0.7931795315952762, + "learning_rate": 9.923035917467887e-07, + "loss": 0.0166, + "step": 4038 + }, + { + "epoch": 0.8022643758069322, + "grad_norm": 0.3931590426635836, + "learning_rate": 9.90380527186922e-07, + "loss": 0.0076, + "step": 4039 + }, + { + "epoch": 0.8024630052636806, + "grad_norm": 0.4836490308516654, + "learning_rate": 9.884591230064622e-07, + "loss": 0.0098, + "step": 4040 + }, + { + "epoch": 0.8026616347204291, + "grad_norm": 0.473992042830245, + "learning_rate": 9.865393800010636e-07, + "loss": 0.011, + "step": 4041 + }, + { + "epoch": 0.8028602641771775, + "grad_norm": 0.29184782245234137, + "learning_rate": 9.84621298965689e-07, + "loss": 0.0083, + "step": 4042 + }, + { + "epoch": 0.8030588936339259, + "grad_norm": 0.642632073832344, + "learning_rate": 9.827048806946115e-07, + "loss": 0.0094, + "step": 4043 + }, + { + "epoch": 0.8032575230906743, + "grad_norm": 0.3687910450246707, + "learning_rate": 9.807901259814211e-07, + "loss": 0.0072, + "step": 4044 + }, + { + "epoch": 0.8034561525474228, + "grad_norm": 0.37471175625878944, + "learning_rate": 9.788770356190137e-07, + "loss": 0.0097, + "step": 4045 + }, + { + "epoch": 0.8036547820041712, + "grad_norm": 0.357576240436411, + "learning_rate": 9.76965610399599e-07, + "loss": 0.0065, + "step": 4046 + }, + { + "epoch": 0.8038534114609197, + "grad_norm": 0.6214890939496734, + "learning_rate": 9.750558511146974e-07, + "loss": 0.0069, + "step": 4047 + }, + { + "epoch": 0.8040520409176681, + "grad_norm": 1.1538127906781666, + "learning_rate": 9.731477585551357e-07, + "loss": 0.0152, + "step": 4048 + }, + { + "epoch": 0.8042506703744166, + "grad_norm": 0.45686551117881485, + "learning_rate": 9.71241333511056e-07, + "loss": 0.0082, + "step": 4049 + }, + { + "epoch": 0.804449299831165, + "grad_norm": 0.3975400079457581, + "learning_rate": 9.693365767719044e-07, + "loss": 0.0087, + "step": 4050 + }, + { + "epoch": 0.8046479292879134, + "grad_norm": 0.7165138344226095, + "learning_rate": 9.674334891264414e-07, + "loss": 0.0122, + "step": 4051 + }, + { + "epoch": 0.8048465587446618, + "grad_norm": 0.5277203363825931, + "learning_rate": 9.65532071362731e-07, + "loss": 0.009, + "step": 4052 + }, + { + "epoch": 0.8050451882014102, + "grad_norm": 0.6225019351917528, + "learning_rate": 9.63632324268149e-07, + "loss": 0.0198, + "step": 4053 + }, + { + "epoch": 0.8052438176581587, + "grad_norm": 0.4903031415168761, + "learning_rate": 9.617342486293812e-07, + "loss": 0.0069, + "step": 4054 + }, + { + "epoch": 0.8054424471149071, + "grad_norm": 0.5308344689616842, + "learning_rate": 9.59837845232416e-07, + "loss": 0.0107, + "step": 4055 + }, + { + "epoch": 0.8056410765716556, + "grad_norm": 0.7708014862830759, + "learning_rate": 9.57943114862554e-07, + "loss": 0.0099, + "step": 4056 + }, + { + "epoch": 0.805839706028404, + "grad_norm": 1.0172527611136573, + "learning_rate": 9.560500583043986e-07, + "loss": 0.0087, + "step": 4057 + }, + { + "epoch": 0.8060383354851525, + "grad_norm": 0.3705945382686297, + "learning_rate": 9.541586763418664e-07, + "loss": 0.0122, + "step": 4058 + }, + { + "epoch": 0.8062369649419009, + "grad_norm": 0.7158602396036523, + "learning_rate": 9.522689697581733e-07, + "loss": 0.0117, + "step": 4059 + }, + { + "epoch": 0.8064355943986493, + "grad_norm": 0.5429010252706425, + "learning_rate": 9.50380939335846e-07, + "loss": 0.014, + "step": 4060 + }, + { + "epoch": 0.8066342238553977, + "grad_norm": 0.6956019948957465, + "learning_rate": 9.48494585856718e-07, + "loss": 0.0223, + "step": 4061 + }, + { + "epoch": 0.8068328533121462, + "grad_norm": 0.8506079494222205, + "learning_rate": 9.466099101019233e-07, + "loss": 0.0063, + "step": 4062 + }, + { + "epoch": 0.8070314827688946, + "grad_norm": 0.42075834755040786, + "learning_rate": 9.447269128519065e-07, + "loss": 0.0076, + "step": 4063 + }, + { + "epoch": 0.8072301122256431, + "grad_norm": 0.595418453912562, + "learning_rate": 9.428455948864134e-07, + "loss": 0.0117, + "step": 4064 + }, + { + "epoch": 0.8074287416823915, + "grad_norm": 0.6130960282660696, + "learning_rate": 9.409659569844975e-07, + "loss": 0.0107, + "step": 4065 + }, + { + "epoch": 0.80762737113914, + "grad_norm": 1.5118482480538398, + "learning_rate": 9.390879999245139e-07, + "loss": 0.0109, + "step": 4066 + }, + { + "epoch": 0.8078260005958884, + "grad_norm": 0.40907353954507686, + "learning_rate": 9.372117244841216e-07, + "loss": 0.0077, + "step": 4067 + }, + { + "epoch": 0.8080246300526368, + "grad_norm": 0.6408271348047319, + "learning_rate": 9.353371314402871e-07, + "loss": 0.0112, + "step": 4068 + }, + { + "epoch": 0.8082232595093852, + "grad_norm": 0.4828591067512512, + "learning_rate": 9.334642215692746e-07, + "loss": 0.0095, + "step": 4069 + }, + { + "epoch": 0.8084218889661337, + "grad_norm": 0.5431520041134413, + "learning_rate": 9.315929956466568e-07, + "loss": 0.0139, + "step": 4070 + }, + { + "epoch": 0.8086205184228821, + "grad_norm": 0.6310284789435388, + "learning_rate": 9.297234544473044e-07, + "loss": 0.0113, + "step": 4071 + }, + { + "epoch": 0.8088191478796305, + "grad_norm": 0.5081618596537647, + "learning_rate": 9.27855598745393e-07, + "loss": 0.0099, + "step": 4072 + }, + { + "epoch": 0.809017777336379, + "grad_norm": 0.44491804248735184, + "learning_rate": 9.259894293144017e-07, + "loss": 0.0101, + "step": 4073 + }, + { + "epoch": 0.8092164067931275, + "grad_norm": 0.46738266736467293, + "learning_rate": 9.241249469271068e-07, + "loss": 0.0103, + "step": 4074 + }, + { + "epoch": 0.8094150362498759, + "grad_norm": 0.5209162826736811, + "learning_rate": 9.222621523555908e-07, + "loss": 0.0065, + "step": 4075 + }, + { + "epoch": 0.8096136657066243, + "grad_norm": 0.3940989383722829, + "learning_rate": 9.204010463712326e-07, + "loss": 0.0087, + "step": 4076 + }, + { + "epoch": 0.8098122951633727, + "grad_norm": 0.4123877332825086, + "learning_rate": 9.18541629744717e-07, + "loss": 0.0093, + "step": 4077 + }, + { + "epoch": 0.8100109246201211, + "grad_norm": 0.6265895051388984, + "learning_rate": 9.16683903246024e-07, + "loss": 0.0155, + "step": 4078 + }, + { + "epoch": 0.8102095540768696, + "grad_norm": 0.5490533621387226, + "learning_rate": 9.148278676444372e-07, + "loss": 0.0067, + "step": 4079 + }, + { + "epoch": 0.810408183533618, + "grad_norm": 0.3131848581841642, + "learning_rate": 9.129735237085408e-07, + "loss": 0.0049, + "step": 4080 + }, + { + "epoch": 0.8106068129903665, + "grad_norm": 0.7500483326690944, + "learning_rate": 9.111208722062143e-07, + "loss": 0.0113, + "step": 4081 + }, + { + "epoch": 0.8108054424471149, + "grad_norm": 0.5484117682655739, + "learning_rate": 9.092699139046413e-07, + "loss": 0.011, + "step": 4082 + }, + { + "epoch": 0.8110040719038634, + "grad_norm": 0.3098865334339739, + "learning_rate": 9.074206495702992e-07, + "loss": 0.0086, + "step": 4083 + }, + { + "epoch": 0.8112027013606118, + "grad_norm": 0.17468366708042246, + "learning_rate": 9.055730799689688e-07, + "loss": 0.0028, + "step": 4084 + }, + { + "epoch": 0.8114013308173602, + "grad_norm": 0.538338960986114, + "learning_rate": 9.037272058657242e-07, + "loss": 0.0105, + "step": 4085 + }, + { + "epoch": 0.8115999602741086, + "grad_norm": 0.4201409900907841, + "learning_rate": 9.018830280249419e-07, + "loss": 0.0063, + "step": 4086 + }, + { + "epoch": 0.8117985897308571, + "grad_norm": 0.5876635452613788, + "learning_rate": 9.000405472102946e-07, + "loss": 0.0126, + "step": 4087 + }, + { + "epoch": 0.8119972191876055, + "grad_norm": 0.5156736720046141, + "learning_rate": 8.981997641847501e-07, + "loss": 0.0128, + "step": 4088 + }, + { + "epoch": 0.812195848644354, + "grad_norm": 0.5052009076325246, + "learning_rate": 8.963606797105767e-07, + "loss": 0.015, + "step": 4089 + }, + { + "epoch": 0.8123944781011024, + "grad_norm": 0.5430853287081518, + "learning_rate": 8.94523294549336e-07, + "loss": 0.01, + "step": 4090 + }, + { + "epoch": 0.8125931075578509, + "grad_norm": 0.44810523621001547, + "learning_rate": 8.92687609461887e-07, + "loss": 0.0098, + "step": 4091 + }, + { + "epoch": 0.8127917370145993, + "grad_norm": 0.5134814399572584, + "learning_rate": 8.908536252083865e-07, + "loss": 0.0056, + "step": 4092 + }, + { + "epoch": 0.8129903664713477, + "grad_norm": 0.46375774276643456, + "learning_rate": 8.890213425482841e-07, + "loss": 0.0089, + "step": 4093 + }, + { + "epoch": 0.8131889959280961, + "grad_norm": 1.1162249657192767, + "learning_rate": 8.871907622403275e-07, + "loss": 0.0214, + "step": 4094 + }, + { + "epoch": 0.8133876253848445, + "grad_norm": 0.42917047757228455, + "learning_rate": 8.853618850425572e-07, + "loss": 0.006, + "step": 4095 + }, + { + "epoch": 0.813586254841593, + "grad_norm": 0.5208590114708673, + "learning_rate": 8.835347117123089e-07, + "loss": 0.0161, + "step": 4096 + }, + { + "epoch": 0.8137848842983414, + "grad_norm": 0.5308424443047222, + "learning_rate": 8.817092430062158e-07, + "loss": 0.0161, + "step": 4097 + }, + { + "epoch": 0.8139835137550899, + "grad_norm": 0.46028366709106316, + "learning_rate": 8.798854796801997e-07, + "loss": 0.0097, + "step": 4098 + }, + { + "epoch": 0.8141821432118383, + "grad_norm": 0.448761200240888, + "learning_rate": 8.780634224894818e-07, + "loss": 0.0128, + "step": 4099 + }, + { + "epoch": 0.8143807726685868, + "grad_norm": 0.3671637643734095, + "learning_rate": 8.762430721885717e-07, + "loss": 0.0073, + "step": 4100 + }, + { + "epoch": 0.8145794021253352, + "grad_norm": 0.7115867455847144, + "learning_rate": 8.74424429531277e-07, + "loss": 0.0087, + "step": 4101 + }, + { + "epoch": 0.8147780315820836, + "grad_norm": 0.42494740814520693, + "learning_rate": 8.726074952706931e-07, + "loss": 0.0092, + "step": 4102 + }, + { + "epoch": 0.814976661038832, + "grad_norm": 0.7513309884495961, + "learning_rate": 8.707922701592126e-07, + "loss": 0.008, + "step": 4103 + }, + { + "epoch": 0.8151752904955805, + "grad_norm": 1.1383897819360642, + "learning_rate": 8.689787549485185e-07, + "loss": 0.0138, + "step": 4104 + }, + { + "epoch": 0.8153739199523289, + "grad_norm": 0.6557470447168657, + "learning_rate": 8.671669503895841e-07, + "loss": 0.0198, + "step": 4105 + }, + { + "epoch": 0.8155725494090774, + "grad_norm": 0.6581283241574932, + "learning_rate": 8.653568572326781e-07, + "loss": 0.0148, + "step": 4106 + }, + { + "epoch": 0.8157711788658258, + "grad_norm": 0.417139009113113, + "learning_rate": 8.635484762273561e-07, + "loss": 0.0123, + "step": 4107 + }, + { + "epoch": 0.8159698083225743, + "grad_norm": 0.6867991303871931, + "learning_rate": 8.617418081224682e-07, + "loss": 0.0168, + "step": 4108 + }, + { + "epoch": 0.8161684377793227, + "grad_norm": 1.0724616229540893, + "learning_rate": 8.599368536661528e-07, + "loss": 0.0131, + "step": 4109 + }, + { + "epoch": 0.8163670672360711, + "grad_norm": 0.5087955548304802, + "learning_rate": 8.581336136058405e-07, + "loss": 0.0137, + "step": 4110 + }, + { + "epoch": 0.8165656966928195, + "grad_norm": 0.421017074957917, + "learning_rate": 8.563320886882514e-07, + "loss": 0.0154, + "step": 4111 + }, + { + "epoch": 0.816764326149568, + "grad_norm": 0.39496073798864595, + "learning_rate": 8.545322796593941e-07, + "loss": 0.0157, + "step": 4112 + }, + { + "epoch": 0.8169629556063164, + "grad_norm": 0.4997123669546495, + "learning_rate": 8.527341872645711e-07, + "loss": 0.0143, + "step": 4113 + }, + { + "epoch": 0.8171615850630648, + "grad_norm": 0.37415956728088906, + "learning_rate": 8.509378122483652e-07, + "loss": 0.0102, + "step": 4114 + }, + { + "epoch": 0.8173602145198133, + "grad_norm": 0.383596392690376, + "learning_rate": 8.491431553546564e-07, + "loss": 0.0054, + "step": 4115 + }, + { + "epoch": 0.8175588439765618, + "grad_norm": 0.4821601487830514, + "learning_rate": 8.473502173266113e-07, + "loss": 0.0087, + "step": 4116 + }, + { + "epoch": 0.8177574734333102, + "grad_norm": 0.47018873094016517, + "learning_rate": 8.455589989066815e-07, + "loss": 0.0102, + "step": 4117 + }, + { + "epoch": 0.8179561028900586, + "grad_norm": 0.2810129004401643, + "learning_rate": 8.437695008366115e-07, + "loss": 0.0077, + "step": 4118 + }, + { + "epoch": 0.818154732346807, + "grad_norm": 0.5366359137320363, + "learning_rate": 8.419817238574273e-07, + "loss": 0.0083, + "step": 4119 + }, + { + "epoch": 0.8183533618035554, + "grad_norm": 0.316839455464566, + "learning_rate": 8.401956687094487e-07, + "loss": 0.006, + "step": 4120 + }, + { + "epoch": 0.8185519912603039, + "grad_norm": 0.3111926776617216, + "learning_rate": 8.384113361322765e-07, + "loss": 0.0086, + "step": 4121 + }, + { + "epoch": 0.8187506207170523, + "grad_norm": 0.33471856971083414, + "learning_rate": 8.366287268648027e-07, + "loss": 0.0079, + "step": 4122 + }, + { + "epoch": 0.8189492501738008, + "grad_norm": 0.4351278758006974, + "learning_rate": 8.348478416452049e-07, + "loss": 0.0078, + "step": 4123 + }, + { + "epoch": 0.8191478796305492, + "grad_norm": 0.1570818636838516, + "learning_rate": 8.330686812109439e-07, + "loss": 0.0027, + "step": 4124 + }, + { + "epoch": 0.8193465090872977, + "grad_norm": 0.5523549117457559, + "learning_rate": 8.312912462987699e-07, + "loss": 0.013, + "step": 4125 + }, + { + "epoch": 0.8195451385440461, + "grad_norm": 0.3214475082205346, + "learning_rate": 8.295155376447151e-07, + "loss": 0.0087, + "step": 4126 + }, + { + "epoch": 0.8197437680007945, + "grad_norm": 0.29871131133299855, + "learning_rate": 8.277415559841012e-07, + "loss": 0.0064, + "step": 4127 + }, + { + "epoch": 0.8199423974575429, + "grad_norm": 0.9678390162363811, + "learning_rate": 8.259693020515292e-07, + "loss": 0.0096, + "step": 4128 + }, + { + "epoch": 0.8201410269142914, + "grad_norm": 0.460127250128567, + "learning_rate": 8.241987765808896e-07, + "loss": 0.0128, + "step": 4129 + }, + { + "epoch": 0.8203396563710398, + "grad_norm": 0.6057853897482152, + "learning_rate": 8.224299803053559e-07, + "loss": 0.0088, + "step": 4130 + }, + { + "epoch": 0.8205382858277883, + "grad_norm": 0.5206279732652137, + "learning_rate": 8.206629139573824e-07, + "loss": 0.008, + "step": 4131 + }, + { + "epoch": 0.8207369152845367, + "grad_norm": 0.22610796187469387, + "learning_rate": 8.188975782687125e-07, + "loss": 0.0035, + "step": 4132 + }, + { + "epoch": 0.8209355447412852, + "grad_norm": 0.5861117727426646, + "learning_rate": 8.171339739703671e-07, + "loss": 0.0154, + "step": 4133 + }, + { + "epoch": 0.8211341741980336, + "grad_norm": 0.5374366990301481, + "learning_rate": 8.153721017926552e-07, + "loss": 0.0129, + "step": 4134 + }, + { + "epoch": 0.821332803654782, + "grad_norm": 0.4143698403269354, + "learning_rate": 8.136119624651645e-07, + "loss": 0.0114, + "step": 4135 + }, + { + "epoch": 0.8215314331115304, + "grad_norm": 0.37663933807526523, + "learning_rate": 8.118535567167673e-07, + "loss": 0.0067, + "step": 4136 + }, + { + "epoch": 0.8217300625682789, + "grad_norm": 0.9518297277488379, + "learning_rate": 8.100968852756208e-07, + "loss": 0.0155, + "step": 4137 + }, + { + "epoch": 0.8219286920250273, + "grad_norm": 0.4973523714635241, + "learning_rate": 8.083419488691563e-07, + "loss": 0.0108, + "step": 4138 + }, + { + "epoch": 0.8221273214817757, + "grad_norm": 0.4128944945882541, + "learning_rate": 8.065887482240925e-07, + "loss": 0.0105, + "step": 4139 + }, + { + "epoch": 0.8223259509385242, + "grad_norm": 0.43279127906825904, + "learning_rate": 8.048372840664298e-07, + "loss": 0.0086, + "step": 4140 + }, + { + "epoch": 0.8225245803952727, + "grad_norm": 0.5404311137276034, + "learning_rate": 8.030875571214458e-07, + "loss": 0.0118, + "step": 4141 + }, + { + "epoch": 0.8227232098520211, + "grad_norm": 0.6347070223716965, + "learning_rate": 8.013395681137027e-07, + "loss": 0.0154, + "step": 4142 + }, + { + "epoch": 0.8229218393087695, + "grad_norm": 0.44237408691615476, + "learning_rate": 7.995933177670385e-07, + "loss": 0.0108, + "step": 4143 + }, + { + "epoch": 0.8231204687655179, + "grad_norm": 0.6010470786401767, + "learning_rate": 7.978488068045764e-07, + "loss": 0.015, + "step": 4144 + }, + { + "epoch": 0.8233190982222663, + "grad_norm": 0.7701565584935676, + "learning_rate": 7.961060359487138e-07, + "loss": 0.0119, + "step": 4145 + }, + { + "epoch": 0.8235177276790148, + "grad_norm": 0.4310605835446213, + "learning_rate": 7.943650059211322e-07, + "loss": 0.0081, + "step": 4146 + }, + { + "epoch": 0.8237163571357632, + "grad_norm": 0.590740099431193, + "learning_rate": 7.92625717442791e-07, + "loss": 0.0167, + "step": 4147 + }, + { + "epoch": 0.8239149865925117, + "grad_norm": 0.9726708073139096, + "learning_rate": 7.908881712339256e-07, + "loss": 0.0189, + "step": 4148 + }, + { + "epoch": 0.8241136160492601, + "grad_norm": 0.4053559892400188, + "learning_rate": 7.891523680140545e-07, + "loss": 0.007, + "step": 4149 + }, + { + "epoch": 0.8243122455060086, + "grad_norm": 0.2627943863024738, + "learning_rate": 7.874183085019698e-07, + "loss": 0.0075, + "step": 4150 + }, + { + "epoch": 0.824510874962757, + "grad_norm": 0.6733876425443319, + "learning_rate": 7.856859934157463e-07, + "loss": 0.0113, + "step": 4151 + }, + { + "epoch": 0.8247095044195054, + "grad_norm": 0.42436341922801024, + "learning_rate": 7.839554234727309e-07, + "loss": 0.0105, + "step": 4152 + }, + { + "epoch": 0.8249081338762538, + "grad_norm": 0.6937300129824238, + "learning_rate": 7.822265993895533e-07, + "loss": 0.01, + "step": 4153 + }, + { + "epoch": 0.8251067633330023, + "grad_norm": 0.3959765854010693, + "learning_rate": 7.804995218821182e-07, + "loss": 0.0069, + "step": 4154 + }, + { + "epoch": 0.8253053927897507, + "grad_norm": 0.5846032872762472, + "learning_rate": 7.787741916656038e-07, + "loss": 0.0091, + "step": 4155 + }, + { + "epoch": 0.8255040222464991, + "grad_norm": 0.6565011551242094, + "learning_rate": 7.770506094544711e-07, + "loss": 0.0107, + "step": 4156 + }, + { + "epoch": 0.8257026517032476, + "grad_norm": 0.612137058681032, + "learning_rate": 7.753287759624506e-07, + "loss": 0.0123, + "step": 4157 + }, + { + "epoch": 0.8259012811599961, + "grad_norm": 0.5434012901407049, + "learning_rate": 7.736086919025549e-07, + "loss": 0.0098, + "step": 4158 + }, + { + "epoch": 0.8260999106167445, + "grad_norm": 0.45931089115946994, + "learning_rate": 7.718903579870656e-07, + "loss": 0.0132, + "step": 4159 + }, + { + "epoch": 0.8262985400734929, + "grad_norm": 0.3361927264054978, + "learning_rate": 7.701737749275457e-07, + "loss": 0.0074, + "step": 4160 + }, + { + "epoch": 0.8264971695302413, + "grad_norm": 0.5681567318696761, + "learning_rate": 7.684589434348316e-07, + "loss": 0.0092, + "step": 4161 + }, + { + "epoch": 0.8266957989869897, + "grad_norm": 0.5567095462260514, + "learning_rate": 7.66745864219029e-07, + "loss": 0.0152, + "step": 4162 + }, + { + "epoch": 0.8268944284437382, + "grad_norm": 0.6126892428547912, + "learning_rate": 7.650345379895263e-07, + "loss": 0.0118, + "step": 4163 + }, + { + "epoch": 0.8270930579004866, + "grad_norm": 0.8912645228011347, + "learning_rate": 7.63324965454979e-07, + "loss": 0.0145, + "step": 4164 + }, + { + "epoch": 0.8272916873572351, + "grad_norm": 0.5553683620343327, + "learning_rate": 7.616171473233208e-07, + "loss": 0.0127, + "step": 4165 + }, + { + "epoch": 0.8274903168139836, + "grad_norm": 0.5464361540926801, + "learning_rate": 7.599110843017588e-07, + "loss": 0.0196, + "step": 4166 + }, + { + "epoch": 0.827688946270732, + "grad_norm": 0.7188318229017358, + "learning_rate": 7.582067770967694e-07, + "loss": 0.0091, + "step": 4167 + }, + { + "epoch": 0.8278875757274804, + "grad_norm": 0.6648555543168041, + "learning_rate": 7.565042264141071e-07, + "loss": 0.0161, + "step": 4168 + }, + { + "epoch": 0.8280862051842288, + "grad_norm": 0.5054813315229681, + "learning_rate": 7.548034329587934e-07, + "loss": 0.0143, + "step": 4169 + }, + { + "epoch": 0.8282848346409772, + "grad_norm": 0.3704689407540544, + "learning_rate": 7.531043974351282e-07, + "loss": 0.0096, + "step": 4170 + }, + { + "epoch": 0.8284834640977257, + "grad_norm": 0.5217578074725935, + "learning_rate": 7.514071205466783e-07, + "loss": 0.0092, + "step": 4171 + }, + { + "epoch": 0.8286820935544741, + "grad_norm": 0.4232794133302012, + "learning_rate": 7.497116029962848e-07, + "loss": 0.0113, + "step": 4172 + }, + { + "epoch": 0.8288807230112225, + "grad_norm": 0.5154993721621637, + "learning_rate": 7.480178454860615e-07, + "loss": 0.0137, + "step": 4173 + }, + { + "epoch": 0.829079352467971, + "grad_norm": 0.7002762553322276, + "learning_rate": 7.463258487173891e-07, + "loss": 0.0122, + "step": 4174 + }, + { + "epoch": 0.8292779819247195, + "grad_norm": 0.2849167502483706, + "learning_rate": 7.446356133909244e-07, + "loss": 0.0053, + "step": 4175 + }, + { + "epoch": 0.8294766113814679, + "grad_norm": 0.3481357192610811, + "learning_rate": 7.429471402065891e-07, + "loss": 0.0057, + "step": 4176 + }, + { + "epoch": 0.8296752408382163, + "grad_norm": 0.5261029513879698, + "learning_rate": 7.412604298635817e-07, + "loss": 0.0142, + "step": 4177 + }, + { + "epoch": 0.8298738702949647, + "grad_norm": 0.43848638143307833, + "learning_rate": 7.395754830603636e-07, + "loss": 0.0083, + "step": 4178 + }, + { + "epoch": 0.8300724997517132, + "grad_norm": 0.47296015769234956, + "learning_rate": 7.37892300494672e-07, + "loss": 0.0129, + "step": 4179 + }, + { + "epoch": 0.8302711292084616, + "grad_norm": 0.2953837424429043, + "learning_rate": 7.362108828635117e-07, + "loss": 0.0095, + "step": 4180 + }, + { + "epoch": 0.83046975866521, + "grad_norm": 0.3474888761012372, + "learning_rate": 7.345312308631536e-07, + "loss": 0.0084, + "step": 4181 + }, + { + "epoch": 0.8306683881219585, + "grad_norm": 0.4370889556934582, + "learning_rate": 7.328533451891423e-07, + "loss": 0.011, + "step": 4182 + }, + { + "epoch": 0.830867017578707, + "grad_norm": 0.4255737427042872, + "learning_rate": 7.311772265362866e-07, + "loss": 0.0133, + "step": 4183 + }, + { + "epoch": 0.8310656470354554, + "grad_norm": 0.3946025875468651, + "learning_rate": 7.295028755986678e-07, + "loss": 0.0093, + "step": 4184 + }, + { + "epoch": 0.8312642764922038, + "grad_norm": 0.40631146235195387, + "learning_rate": 7.278302930696312e-07, + "loss": 0.0105, + "step": 4185 + }, + { + "epoch": 0.8314629059489522, + "grad_norm": 0.37683409726127903, + "learning_rate": 7.261594796417915e-07, + "loss": 0.016, + "step": 4186 + }, + { + "epoch": 0.8316615354057006, + "grad_norm": 0.5759629439702126, + "learning_rate": 7.244904360070321e-07, + "loss": 0.0093, + "step": 4187 + }, + { + "epoch": 0.8318601648624491, + "grad_norm": 0.4095970190194349, + "learning_rate": 7.228231628565003e-07, + "loss": 0.0089, + "step": 4188 + }, + { + "epoch": 0.8320587943191975, + "grad_norm": 0.8007934056507748, + "learning_rate": 7.211576608806132e-07, + "loss": 0.0129, + "step": 4189 + }, + { + "epoch": 0.832257423775946, + "grad_norm": 0.4194690457518919, + "learning_rate": 7.194939307690557e-07, + "loss": 0.0088, + "step": 4190 + }, + { + "epoch": 0.8324560532326944, + "grad_norm": 0.4443434778514458, + "learning_rate": 7.17831973210773e-07, + "loss": 0.0093, + "step": 4191 + }, + { + "epoch": 0.8326546826894429, + "grad_norm": 0.3928978159389848, + "learning_rate": 7.161717888939834e-07, + "loss": 0.0095, + "step": 4192 + }, + { + "epoch": 0.8328533121461913, + "grad_norm": 0.431478336303014, + "learning_rate": 7.145133785061648e-07, + "loss": 0.0105, + "step": 4193 + }, + { + "epoch": 0.8330519416029397, + "grad_norm": 0.5225194392334359, + "learning_rate": 7.12856742734066e-07, + "loss": 0.0196, + "step": 4194 + }, + { + "epoch": 0.8332505710596881, + "grad_norm": 0.36971643133265614, + "learning_rate": 7.112018822636951e-07, + "loss": 0.0079, + "step": 4195 + }, + { + "epoch": 0.8334492005164366, + "grad_norm": 0.6234502273500208, + "learning_rate": 7.095487977803306e-07, + "loss": 0.0126, + "step": 4196 + }, + { + "epoch": 0.833647829973185, + "grad_norm": 0.4720027613289749, + "learning_rate": 7.078974899685132e-07, + "loss": 0.0145, + "step": 4197 + }, + { + "epoch": 0.8338464594299334, + "grad_norm": 0.42761253917678405, + "learning_rate": 7.062479595120458e-07, + "loss": 0.0095, + "step": 4198 + }, + { + "epoch": 0.8340450888866819, + "grad_norm": 0.2903106453517186, + "learning_rate": 7.046002070939995e-07, + "loss": 0.0076, + "step": 4199 + }, + { + "epoch": 0.8342437183434304, + "grad_norm": 0.5331366190524331, + "learning_rate": 7.029542333967049e-07, + "loss": 0.0138, + "step": 4200 + }, + { + "epoch": 0.8344423478001788, + "grad_norm": 0.3941782778497735, + "learning_rate": 7.013100391017602e-07, + "loss": 0.0053, + "step": 4201 + }, + { + "epoch": 0.8346409772569272, + "grad_norm": 0.5155012599849433, + "learning_rate": 6.996676248900219e-07, + "loss": 0.0119, + "step": 4202 + }, + { + "epoch": 0.8348396067136756, + "grad_norm": 0.6437821506076101, + "learning_rate": 6.980269914416144e-07, + "loss": 0.0117, + "step": 4203 + }, + { + "epoch": 0.835038236170424, + "grad_norm": 0.24442114698882744, + "learning_rate": 6.963881394359223e-07, + "loss": 0.004, + "step": 4204 + }, + { + "epoch": 0.8352368656271725, + "grad_norm": 0.9459746769962994, + "learning_rate": 6.947510695515913e-07, + "loss": 0.0263, + "step": 4205 + }, + { + "epoch": 0.8354354950839209, + "grad_norm": 0.2833211575993794, + "learning_rate": 6.931157824665319e-07, + "loss": 0.0066, + "step": 4206 + }, + { + "epoch": 0.8356341245406694, + "grad_norm": 0.4356603803658812, + "learning_rate": 6.914822788579123e-07, + "loss": 0.01, + "step": 4207 + }, + { + "epoch": 0.8358327539974179, + "grad_norm": 0.5338876349294479, + "learning_rate": 6.898505594021681e-07, + "loss": 0.0127, + "step": 4208 + }, + { + "epoch": 0.8360313834541663, + "grad_norm": 0.5494695551352939, + "learning_rate": 6.882206247749907e-07, + "loss": 0.012, + "step": 4209 + }, + { + "epoch": 0.8362300129109147, + "grad_norm": 0.6861938545700319, + "learning_rate": 6.865924756513336e-07, + "loss": 0.0191, + "step": 4210 + }, + { + "epoch": 0.8364286423676631, + "grad_norm": 0.4886731636680826, + "learning_rate": 6.849661127054141e-07, + "loss": 0.009, + "step": 4211 + }, + { + "epoch": 0.8366272718244115, + "grad_norm": 0.3818112246211818, + "learning_rate": 6.833415366107049e-07, + "loss": 0.0103, + "step": 4212 + }, + { + "epoch": 0.83682590128116, + "grad_norm": 0.21974960483824155, + "learning_rate": 6.817187480399434e-07, + "loss": 0.0045, + "step": 4213 + }, + { + "epoch": 0.8370245307379084, + "grad_norm": 0.46671217892639616, + "learning_rate": 6.800977476651232e-07, + "loss": 0.0128, + "step": 4214 + }, + { + "epoch": 0.8372231601946568, + "grad_norm": 0.6528384272178425, + "learning_rate": 6.784785361574997e-07, + "loss": 0.0114, + "step": 4215 + }, + { + "epoch": 0.8374217896514053, + "grad_norm": 0.4510583456447262, + "learning_rate": 6.768611141875875e-07, + "loss": 0.0088, + "step": 4216 + }, + { + "epoch": 0.8376204191081538, + "grad_norm": 0.40374865267390264, + "learning_rate": 6.752454824251575e-07, + "loss": 0.011, + "step": 4217 + }, + { + "epoch": 0.8378190485649022, + "grad_norm": 0.41459767009959575, + "learning_rate": 6.736316415392435e-07, + "loss": 0.0093, + "step": 4218 + }, + { + "epoch": 0.8380176780216506, + "grad_norm": 0.2681325117645612, + "learning_rate": 6.720195921981332e-07, + "loss": 0.0038, + "step": 4219 + }, + { + "epoch": 0.838216307478399, + "grad_norm": 0.42508478595734384, + "learning_rate": 6.704093350693763e-07, + "loss": 0.015, + "step": 4220 + }, + { + "epoch": 0.8384149369351475, + "grad_norm": 0.37595114785371714, + "learning_rate": 6.688008708197774e-07, + "loss": 0.0073, + "step": 4221 + }, + { + "epoch": 0.8386135663918959, + "grad_norm": 0.31877225252641367, + "learning_rate": 6.671942001154003e-07, + "loss": 0.0067, + "step": 4222 + }, + { + "epoch": 0.8388121958486443, + "grad_norm": 0.46674482672364, + "learning_rate": 6.655893236215671e-07, + "loss": 0.0135, + "step": 4223 + }, + { + "epoch": 0.8390108253053928, + "grad_norm": 0.41001893376938964, + "learning_rate": 6.639862420028531e-07, + "loss": 0.0105, + "step": 4224 + }, + { + "epoch": 0.8392094547621413, + "grad_norm": 0.7742237052495848, + "learning_rate": 6.623849559230955e-07, + "loss": 0.0171, + "step": 4225 + }, + { + "epoch": 0.8394080842188897, + "grad_norm": 1.0107664886273078, + "learning_rate": 6.607854660453827e-07, + "loss": 0.0148, + "step": 4226 + }, + { + "epoch": 0.8396067136756381, + "grad_norm": 0.7350900223607449, + "learning_rate": 6.591877730320639e-07, + "loss": 0.0105, + "step": 4227 + }, + { + "epoch": 0.8398053431323865, + "grad_norm": 0.6237840242821349, + "learning_rate": 6.575918775447404e-07, + "loss": 0.0104, + "step": 4228 + }, + { + "epoch": 0.840003972589135, + "grad_norm": 0.2205986767974159, + "learning_rate": 6.559977802442719e-07, + "loss": 0.0079, + "step": 4229 + }, + { + "epoch": 0.8402026020458834, + "grad_norm": 0.391922815119898, + "learning_rate": 6.54405481790773e-07, + "loss": 0.0071, + "step": 4230 + }, + { + "epoch": 0.8404012315026318, + "grad_norm": 0.6980785689595316, + "learning_rate": 6.52814982843612e-07, + "loss": 0.0181, + "step": 4231 + }, + { + "epoch": 0.8405998609593803, + "grad_norm": 0.8308250329575882, + "learning_rate": 6.512262840614137e-07, + "loss": 0.0107, + "step": 4232 + }, + { + "epoch": 0.8407984904161288, + "grad_norm": 0.5525708162928655, + "learning_rate": 6.496393861020562e-07, + "loss": 0.0099, + "step": 4233 + }, + { + "epoch": 0.8409971198728772, + "grad_norm": 0.4976875182817474, + "learning_rate": 6.480542896226716e-07, + "loss": 0.0083, + "step": 4234 + }, + { + "epoch": 0.8411957493296256, + "grad_norm": 0.34387989751239384, + "learning_rate": 6.464709952796482e-07, + "loss": 0.0082, + "step": 4235 + }, + { + "epoch": 0.841394378786374, + "grad_norm": 0.5003515193396486, + "learning_rate": 6.448895037286251e-07, + "loss": 0.0079, + "step": 4236 + }, + { + "epoch": 0.8415930082431224, + "grad_norm": 0.476787360201741, + "learning_rate": 6.433098156244982e-07, + "loss": 0.0098, + "step": 4237 + }, + { + "epoch": 0.8417916376998709, + "grad_norm": 0.5623162439430519, + "learning_rate": 6.417319316214126e-07, + "loss": 0.0124, + "step": 4238 + }, + { + "epoch": 0.8419902671566193, + "grad_norm": 0.368015208338776, + "learning_rate": 6.401558523727703e-07, + "loss": 0.0098, + "step": 4239 + }, + { + "epoch": 0.8421888966133677, + "grad_norm": 0.5533902338765675, + "learning_rate": 6.385815785312238e-07, + "loss": 0.012, + "step": 4240 + }, + { + "epoch": 0.8423875260701162, + "grad_norm": 0.4726896319372486, + "learning_rate": 6.37009110748677e-07, + "loss": 0.0139, + "step": 4241 + }, + { + "epoch": 0.8425861555268647, + "grad_norm": 0.4250515882151428, + "learning_rate": 6.354384496762894e-07, + "loss": 0.01, + "step": 4242 + }, + { + "epoch": 0.8427847849836131, + "grad_norm": 0.25361240255528045, + "learning_rate": 6.33869595964468e-07, + "loss": 0.0069, + "step": 4243 + }, + { + "epoch": 0.8429834144403615, + "grad_norm": 0.44441454060510527, + "learning_rate": 6.323025502628754e-07, + "loss": 0.0095, + "step": 4244 + }, + { + "epoch": 0.8431820438971099, + "grad_norm": 0.599605477417074, + "learning_rate": 6.30737313220422e-07, + "loss": 0.0091, + "step": 4245 + }, + { + "epoch": 0.8433806733538584, + "grad_norm": 0.4374505279129274, + "learning_rate": 6.291738854852719e-07, + "loss": 0.0079, + "step": 4246 + }, + { + "epoch": 0.8435793028106068, + "grad_norm": 0.7710884713360233, + "learning_rate": 6.276122677048396e-07, + "loss": 0.0135, + "step": 4247 + }, + { + "epoch": 0.8437779322673552, + "grad_norm": 0.5753325234304452, + "learning_rate": 6.260524605257873e-07, + "loss": 0.0148, + "step": 4248 + }, + { + "epoch": 0.8439765617241037, + "grad_norm": 0.5226453715532203, + "learning_rate": 6.244944645940326e-07, + "loss": 0.013, + "step": 4249 + }, + { + "epoch": 0.8441751911808522, + "grad_norm": 0.4733145956448767, + "learning_rate": 6.229382805547369e-07, + "loss": 0.0142, + "step": 4250 + }, + { + "epoch": 0.8443738206376006, + "grad_norm": 0.7137863419541726, + "learning_rate": 6.213839090523166e-07, + "loss": 0.0124, + "step": 4251 + }, + { + "epoch": 0.844572450094349, + "grad_norm": 0.5052172387349066, + "learning_rate": 6.198313507304343e-07, + "loss": 0.0214, + "step": 4252 + }, + { + "epoch": 0.8447710795510974, + "grad_norm": 0.36751253905346604, + "learning_rate": 6.182806062320029e-07, + "loss": 0.0086, + "step": 4253 + }, + { + "epoch": 0.8449697090078458, + "grad_norm": 0.6136397561793601, + "learning_rate": 6.167316761991854e-07, + "loss": 0.0144, + "step": 4254 + }, + { + "epoch": 0.8451683384645943, + "grad_norm": 1.413194361081298, + "learning_rate": 6.151845612733909e-07, + "loss": 0.0066, + "step": 4255 + }, + { + "epoch": 0.8453669679213427, + "grad_norm": 0.38786949863789816, + "learning_rate": 6.136392620952791e-07, + "loss": 0.0148, + "step": 4256 + }, + { + "epoch": 0.8455655973780911, + "grad_norm": 0.3696999820971262, + "learning_rate": 6.120957793047561e-07, + "loss": 0.0065, + "step": 4257 + }, + { + "epoch": 0.8457642268348396, + "grad_norm": 0.5675751446068034, + "learning_rate": 6.105541135409759e-07, + "loss": 0.0134, + "step": 4258 + }, + { + "epoch": 0.8459628562915881, + "grad_norm": 0.49713563656670917, + "learning_rate": 6.090142654423425e-07, + "loss": 0.0076, + "step": 4259 + }, + { + "epoch": 0.8461614857483365, + "grad_norm": 0.36177582839219674, + "learning_rate": 6.074762356465036e-07, + "loss": 0.0101, + "step": 4260 + }, + { + "epoch": 0.8463601152050849, + "grad_norm": 0.682481331309334, + "learning_rate": 6.05940024790358e-07, + "loss": 0.0127, + "step": 4261 + }, + { + "epoch": 0.8465587446618333, + "grad_norm": 0.3292789124528157, + "learning_rate": 6.044056335100462e-07, + "loss": 0.006, + "step": 4262 + }, + { + "epoch": 0.8467573741185818, + "grad_norm": 0.64966583910457, + "learning_rate": 6.028730624409612e-07, + "loss": 0.0116, + "step": 4263 + }, + { + "epoch": 0.8469560035753302, + "grad_norm": 0.8237702138641505, + "learning_rate": 6.013423122177364e-07, + "loss": 0.0103, + "step": 4264 + }, + { + "epoch": 0.8471546330320786, + "grad_norm": 0.39478232550170367, + "learning_rate": 5.998133834742553e-07, + "loss": 0.0058, + "step": 4265 + }, + { + "epoch": 0.8473532624888271, + "grad_norm": 0.6707468045773747, + "learning_rate": 5.982862768436464e-07, + "loss": 0.0093, + "step": 4266 + }, + { + "epoch": 0.8475518919455756, + "grad_norm": 0.7479799316317778, + "learning_rate": 5.967609929582818e-07, + "loss": 0.0226, + "step": 4267 + }, + { + "epoch": 0.847750521402324, + "grad_norm": 0.6860784103609919, + "learning_rate": 5.952375324497811e-07, + "loss": 0.0106, + "step": 4268 + }, + { + "epoch": 0.8479491508590724, + "grad_norm": 0.5184502971263298, + "learning_rate": 5.937158959490064e-07, + "loss": 0.0086, + "step": 4269 + }, + { + "epoch": 0.8481477803158208, + "grad_norm": 0.39623222920861734, + "learning_rate": 5.921960840860674e-07, + "loss": 0.0142, + "step": 4270 + }, + { + "epoch": 0.8483464097725693, + "grad_norm": 2.3172804734227426, + "learning_rate": 5.906780974903154e-07, + "loss": 0.0167, + "step": 4271 + }, + { + "epoch": 0.8485450392293177, + "grad_norm": 0.3739822933902876, + "learning_rate": 5.891619367903467e-07, + "loss": 0.0104, + "step": 4272 + }, + { + "epoch": 0.8487436686860661, + "grad_norm": 0.3235758648301256, + "learning_rate": 5.876476026140043e-07, + "loss": 0.0065, + "step": 4273 + }, + { + "epoch": 0.8489422981428146, + "grad_norm": 0.4318882083611534, + "learning_rate": 5.861350955883705e-07, + "loss": 0.011, + "step": 4274 + }, + { + "epoch": 0.8491409275995631, + "grad_norm": 1.0955982519259333, + "learning_rate": 5.846244163397741e-07, + "loss": 0.0139, + "step": 4275 + }, + { + "epoch": 0.8493395570563115, + "grad_norm": 0.5712222512112204, + "learning_rate": 5.831155654937837e-07, + "loss": 0.0149, + "step": 4276 + }, + { + "epoch": 0.8495381865130599, + "grad_norm": 0.3652696761055196, + "learning_rate": 5.816085436752162e-07, + "loss": 0.0064, + "step": 4277 + }, + { + "epoch": 0.8497368159698083, + "grad_norm": 0.7739356696768167, + "learning_rate": 5.80103351508125e-07, + "loss": 0.0139, + "step": 4278 + }, + { + "epoch": 0.8499354454265567, + "grad_norm": 0.42583485037192603, + "learning_rate": 5.785999896158096e-07, + "loss": 0.0092, + "step": 4279 + }, + { + "epoch": 0.8501340748833052, + "grad_norm": 0.6344045019567613, + "learning_rate": 5.770984586208112e-07, + "loss": 0.0082, + "step": 4280 + }, + { + "epoch": 0.8503327043400536, + "grad_norm": 0.6040578317404494, + "learning_rate": 5.755987591449125e-07, + "loss": 0.0127, + "step": 4281 + }, + { + "epoch": 0.850531333796802, + "grad_norm": 0.4502937353117837, + "learning_rate": 5.741008918091362e-07, + "loss": 0.0113, + "step": 4282 + }, + { + "epoch": 0.8507299632535505, + "grad_norm": 0.4734975385054619, + "learning_rate": 5.726048572337489e-07, + "loss": 0.0123, + "step": 4283 + }, + { + "epoch": 0.850928592710299, + "grad_norm": 0.335158237315028, + "learning_rate": 5.711106560382556e-07, + "loss": 0.0071, + "step": 4284 + }, + { + "epoch": 0.8511272221670474, + "grad_norm": 0.6315586556303954, + "learning_rate": 5.696182888414053e-07, + "loss": 0.0202, + "step": 4285 + }, + { + "epoch": 0.8513258516237958, + "grad_norm": 0.5356384799146305, + "learning_rate": 5.681277562611842e-07, + "loss": 0.013, + "step": 4286 + }, + { + "epoch": 0.8515244810805442, + "grad_norm": 0.5933153411083376, + "learning_rate": 5.666390589148219e-07, + "loss": 0.0101, + "step": 4287 + }, + { + "epoch": 0.8517231105372927, + "grad_norm": 0.29937152219167246, + "learning_rate": 5.651521974187846e-07, + "loss": 0.0083, + "step": 4288 + }, + { + "epoch": 0.8519217399940411, + "grad_norm": 0.7624990790569742, + "learning_rate": 5.636671723887816e-07, + "loss": 0.0132, + "step": 4289 + }, + { + "epoch": 0.8521203694507895, + "grad_norm": 0.18558021382686216, + "learning_rate": 5.621839844397603e-07, + "loss": 0.0047, + "step": 4290 + }, + { + "epoch": 0.852318998907538, + "grad_norm": 0.31218589825449133, + "learning_rate": 5.607026341859062e-07, + "loss": 0.0094, + "step": 4291 + }, + { + "epoch": 0.8525176283642865, + "grad_norm": 0.3659669083845685, + "learning_rate": 5.592231222406463e-07, + "loss": 0.0033, + "step": 4292 + }, + { + "epoch": 0.8527162578210349, + "grad_norm": 0.5248556606937119, + "learning_rate": 5.57745449216644e-07, + "loss": 0.0075, + "step": 4293 + }, + { + "epoch": 0.8529148872777833, + "grad_norm": 0.30725668244669896, + "learning_rate": 5.562696157258029e-07, + "loss": 0.0075, + "step": 4294 + }, + { + "epoch": 0.8531135167345317, + "grad_norm": 0.5613863285877204, + "learning_rate": 5.547956223792633e-07, + "loss": 0.0081, + "step": 4295 + }, + { + "epoch": 0.8533121461912802, + "grad_norm": 0.2473969543840592, + "learning_rate": 5.533234697874045e-07, + "loss": 0.0054, + "step": 4296 + }, + { + "epoch": 0.8535107756480286, + "grad_norm": 0.46726588405248737, + "learning_rate": 5.518531585598452e-07, + "loss": 0.0125, + "step": 4297 + }, + { + "epoch": 0.853709405104777, + "grad_norm": 0.5320500221062148, + "learning_rate": 5.503846893054376e-07, + "loss": 0.0147, + "step": 4298 + }, + { + "epoch": 0.8539080345615254, + "grad_norm": 0.5026086372784023, + "learning_rate": 5.489180626322749e-07, + "loss": 0.0147, + "step": 4299 + }, + { + "epoch": 0.854106664018274, + "grad_norm": 0.36955112376345584, + "learning_rate": 5.474532791476844e-07, + "loss": 0.0063, + "step": 4300 + }, + { + "epoch": 0.8543052934750224, + "grad_norm": 0.4981679024030091, + "learning_rate": 5.459903394582328e-07, + "loss": 0.0064, + "step": 4301 + }, + { + "epoch": 0.8545039229317708, + "grad_norm": 0.4244213243244062, + "learning_rate": 5.445292441697203e-07, + "loss": 0.0144, + "step": 4302 + }, + { + "epoch": 0.8547025523885192, + "grad_norm": 0.6977278332167148, + "learning_rate": 5.430699938871858e-07, + "loss": 0.0088, + "step": 4303 + }, + { + "epoch": 0.8549011818452676, + "grad_norm": 0.7642388896941835, + "learning_rate": 5.416125892149049e-07, + "loss": 0.0149, + "step": 4304 + }, + { + "epoch": 0.8550998113020161, + "grad_norm": 0.4029155837440189, + "learning_rate": 5.401570307563858e-07, + "loss": 0.0064, + "step": 4305 + }, + { + "epoch": 0.8552984407587645, + "grad_norm": 0.6354016421469559, + "learning_rate": 5.387033191143742e-07, + "loss": 0.0156, + "step": 4306 + }, + { + "epoch": 0.8554970702155129, + "grad_norm": 0.36260476179749174, + "learning_rate": 5.372514548908498e-07, + "loss": 0.0073, + "step": 4307 + }, + { + "epoch": 0.8556956996722614, + "grad_norm": 0.5767576719232735, + "learning_rate": 5.358014386870286e-07, + "loss": 0.0071, + "step": 4308 + }, + { + "epoch": 0.8558943291290099, + "grad_norm": 0.5407542575497629, + "learning_rate": 5.343532711033617e-07, + "loss": 0.0089, + "step": 4309 + }, + { + "epoch": 0.8560929585857583, + "grad_norm": 0.39473103537864135, + "learning_rate": 5.329069527395325e-07, + "loss": 0.0085, + "step": 4310 + }, + { + "epoch": 0.8562915880425067, + "grad_norm": 0.3010851871794071, + "learning_rate": 5.314624841944616e-07, + "loss": 0.0084, + "step": 4311 + }, + { + "epoch": 0.8564902174992551, + "grad_norm": 0.4049809213204187, + "learning_rate": 5.300198660663003e-07, + "loss": 0.0106, + "step": 4312 + }, + { + "epoch": 0.8566888469560036, + "grad_norm": 0.5330916206019888, + "learning_rate": 5.28579098952437e-07, + "loss": 0.016, + "step": 4313 + }, + { + "epoch": 0.856887476412752, + "grad_norm": 0.32358362989327244, + "learning_rate": 5.2714018344949e-07, + "loss": 0.013, + "step": 4314 + }, + { + "epoch": 0.8570861058695004, + "grad_norm": 0.39221129075451266, + "learning_rate": 5.257031201533141e-07, + "loss": 0.006, + "step": 4315 + }, + { + "epoch": 0.8572847353262489, + "grad_norm": 0.4547069807214053, + "learning_rate": 5.242679096589959e-07, + "loss": 0.0131, + "step": 4316 + }, + { + "epoch": 0.8574833647829974, + "grad_norm": 0.24851871977697923, + "learning_rate": 5.228345525608536e-07, + "loss": 0.007, + "step": 4317 + }, + { + "epoch": 0.8576819942397458, + "grad_norm": 0.3943911033364027, + "learning_rate": 5.214030494524408e-07, + "loss": 0.0136, + "step": 4318 + }, + { + "epoch": 0.8578806236964942, + "grad_norm": 0.40090137599123665, + "learning_rate": 5.199734009265389e-07, + "loss": 0.009, + "step": 4319 + }, + { + "epoch": 0.8580792531532426, + "grad_norm": 0.5959732058644688, + "learning_rate": 5.18545607575166e-07, + "loss": 0.0098, + "step": 4320 + }, + { + "epoch": 0.858277882609991, + "grad_norm": 0.6067428479491546, + "learning_rate": 5.171196699895687e-07, + "loss": 0.0081, + "step": 4321 + }, + { + "epoch": 0.8584765120667395, + "grad_norm": 0.46155290089707957, + "learning_rate": 5.15695588760226e-07, + "loss": 0.0116, + "step": 4322 + }, + { + "epoch": 0.8586751415234879, + "grad_norm": 0.4010737738551735, + "learning_rate": 5.142733644768511e-07, + "loss": 0.0154, + "step": 4323 + }, + { + "epoch": 0.8588737709802363, + "grad_norm": 0.2686236162523568, + "learning_rate": 5.128529977283824e-07, + "loss": 0.0055, + "step": 4324 + }, + { + "epoch": 0.8590724004369849, + "grad_norm": 0.618702187000194, + "learning_rate": 5.114344891029949e-07, + "loss": 0.0114, + "step": 4325 + }, + { + "epoch": 0.8592710298937333, + "grad_norm": 0.18789334381194944, + "learning_rate": 5.1001783918809e-07, + "loss": 0.0039, + "step": 4326 + }, + { + "epoch": 0.8594696593504817, + "grad_norm": 0.6465736250291123, + "learning_rate": 5.086030485703019e-07, + "loss": 0.0084, + "step": 4327 + }, + { + "epoch": 0.8596682888072301, + "grad_norm": 0.5719333455155227, + "learning_rate": 5.071901178354927e-07, + "loss": 0.0094, + "step": 4328 + }, + { + "epoch": 0.8598669182639785, + "grad_norm": 0.5000117385352728, + "learning_rate": 5.057790475687574e-07, + "loss": 0.0065, + "step": 4329 + }, + { + "epoch": 0.860065547720727, + "grad_norm": 0.6335598418218465, + "learning_rate": 5.043698383544182e-07, + "loss": 0.0147, + "step": 4330 + }, + { + "epoch": 0.8602641771774754, + "grad_norm": 0.7245941891795274, + "learning_rate": 5.029624907760255e-07, + "loss": 0.0075, + "step": 4331 + }, + { + "epoch": 0.8604628066342238, + "grad_norm": 0.5740508426452544, + "learning_rate": 5.015570054163621e-07, + "loss": 0.0092, + "step": 4332 + }, + { + "epoch": 0.8606614360909723, + "grad_norm": 0.47282095433648685, + "learning_rate": 5.001533828574389e-07, + "loss": 0.0105, + "step": 4333 + }, + { + "epoch": 0.8608600655477208, + "grad_norm": 0.5065183646911192, + "learning_rate": 4.987516236804929e-07, + "loss": 0.01, + "step": 4334 + }, + { + "epoch": 0.8610586950044692, + "grad_norm": 0.6252766191350665, + "learning_rate": 4.973517284659923e-07, + "loss": 0.0121, + "step": 4335 + }, + { + "epoch": 0.8612573244612176, + "grad_norm": 0.8191924217356521, + "learning_rate": 4.959536977936313e-07, + "loss": 0.0168, + "step": 4336 + }, + { + "epoch": 0.861455953917966, + "grad_norm": 0.7702791527849939, + "learning_rate": 4.945575322423346e-07, + "loss": 0.014, + "step": 4337 + }, + { + "epoch": 0.8616545833747145, + "grad_norm": 0.4220973789902732, + "learning_rate": 4.931632323902508e-07, + "loss": 0.0071, + "step": 4338 + }, + { + "epoch": 0.8618532128314629, + "grad_norm": 0.7487513974820313, + "learning_rate": 4.917707988147591e-07, + "loss": 0.0109, + "step": 4339 + }, + { + "epoch": 0.8620518422882113, + "grad_norm": 0.5040477610803438, + "learning_rate": 4.903802320924661e-07, + "loss": 0.0175, + "step": 4340 + }, + { + "epoch": 0.8622504717449597, + "grad_norm": 0.8484536149810432, + "learning_rate": 4.889915327992024e-07, + "loss": 0.0114, + "step": 4341 + }, + { + "epoch": 0.8624491012017083, + "grad_norm": 0.34276135719378503, + "learning_rate": 4.876047015100277e-07, + "loss": 0.0091, + "step": 4342 + }, + { + "epoch": 0.8626477306584567, + "grad_norm": 0.3138227924206721, + "learning_rate": 4.862197387992267e-07, + "loss": 0.0081, + "step": 4343 + }, + { + "epoch": 0.8628463601152051, + "grad_norm": 0.48755996193098844, + "learning_rate": 4.848366452403125e-07, + "loss": 0.0141, + "step": 4344 + }, + { + "epoch": 0.8630449895719535, + "grad_norm": 0.4243160855813772, + "learning_rate": 4.834554214060211e-07, + "loss": 0.0129, + "step": 4345 + }, + { + "epoch": 0.863243619028702, + "grad_norm": 0.28134303568802305, + "learning_rate": 4.820760678683168e-07, + "loss": 0.0092, + "step": 4346 + }, + { + "epoch": 0.8634422484854504, + "grad_norm": 0.9313255602494132, + "learning_rate": 4.806985851983892e-07, + "loss": 0.0148, + "step": 4347 + }, + { + "epoch": 0.8636408779421988, + "grad_norm": 0.33444347069896996, + "learning_rate": 4.793229739666505e-07, + "loss": 0.0057, + "step": 4348 + }, + { + "epoch": 0.8638395073989472, + "grad_norm": 0.4575240882152554, + "learning_rate": 4.779492347427422e-07, + "loss": 0.0069, + "step": 4349 + }, + { + "epoch": 0.8640381368556957, + "grad_norm": 0.25494908087050244, + "learning_rate": 4.7657736809552655e-07, + "loss": 0.007, + "step": 4350 + }, + { + "epoch": 0.8642367663124442, + "grad_norm": 0.674796943258971, + "learning_rate": 4.752073745930941e-07, + "loss": 0.0126, + "step": 4351 + }, + { + "epoch": 0.8644353957691926, + "grad_norm": 0.8848459103510878, + "learning_rate": 4.738392548027565e-07, + "loss": 0.0094, + "step": 4352 + }, + { + "epoch": 0.864634025225941, + "grad_norm": 0.48385172701891577, + "learning_rate": 4.724730092910496e-07, + "loss": 0.0138, + "step": 4353 + }, + { + "epoch": 0.8648326546826894, + "grad_norm": 0.31959578564614055, + "learning_rate": 4.7110863862373677e-07, + "loss": 0.0059, + "step": 4354 + }, + { + "epoch": 0.8650312841394379, + "grad_norm": 0.45707641134122046, + "learning_rate": 4.6974614336580014e-07, + "loss": 0.0126, + "step": 4355 + }, + { + "epoch": 0.8652299135961863, + "grad_norm": 0.24519345456981262, + "learning_rate": 4.6838552408145e-07, + "loss": 0.006, + "step": 4356 + }, + { + "epoch": 0.8654285430529347, + "grad_norm": 0.6553366699161421, + "learning_rate": 4.6702678133411505e-07, + "loss": 0.0137, + "step": 4357 + }, + { + "epoch": 0.8656271725096832, + "grad_norm": 0.6216476033899038, + "learning_rate": 4.656699156864508e-07, + "loss": 0.0076, + "step": 4358 + }, + { + "epoch": 0.8658258019664317, + "grad_norm": 0.9884101474028949, + "learning_rate": 4.643149277003345e-07, + "loss": 0.0187, + "step": 4359 + }, + { + "epoch": 0.8660244314231801, + "grad_norm": 0.3261124525807072, + "learning_rate": 4.6296181793686337e-07, + "loss": 0.0059, + "step": 4360 + }, + { + "epoch": 0.8662230608799285, + "grad_norm": 0.4910131929561135, + "learning_rate": 4.616105869563614e-07, + "loss": 0.0144, + "step": 4361 + }, + { + "epoch": 0.8664216903366769, + "grad_norm": 0.5104540545077465, + "learning_rate": 4.602612353183689e-07, + "loss": 0.0084, + "step": 4362 + }, + { + "epoch": 0.8666203197934254, + "grad_norm": 0.4238162272185062, + "learning_rate": 4.589137635816543e-07, + "loss": 0.008, + "step": 4363 + }, + { + "epoch": 0.8668189492501738, + "grad_norm": 0.5117371152238335, + "learning_rate": 4.575681723042014e-07, + "loss": 0.0121, + "step": 4364 + }, + { + "epoch": 0.8670175787069222, + "grad_norm": 0.4015325998809057, + "learning_rate": 4.5622446204321936e-07, + "loss": 0.0074, + "step": 4365 + }, + { + "epoch": 0.8672162081636706, + "grad_norm": 0.35993039383283, + "learning_rate": 4.548826333551382e-07, + "loss": 0.0067, + "step": 4366 + }, + { + "epoch": 0.8674148376204192, + "grad_norm": 0.503064253155119, + "learning_rate": 4.535426867956061e-07, + "loss": 0.0073, + "step": 4367 + }, + { + "epoch": 0.8676134670771676, + "grad_norm": 0.4651717976831416, + "learning_rate": 4.522046229194954e-07, + "loss": 0.0075, + "step": 4368 + }, + { + "epoch": 0.867812096533916, + "grad_norm": 0.9069952716545454, + "learning_rate": 4.508684422808951e-07, + "loss": 0.0136, + "step": 4369 + }, + { + "epoch": 0.8680107259906644, + "grad_norm": 0.4405186014248274, + "learning_rate": 4.4953414543311815e-07, + "loss": 0.013, + "step": 4370 + }, + { + "epoch": 0.8682093554474128, + "grad_norm": 0.5402377194016759, + "learning_rate": 4.4820173292869416e-07, + "loss": 0.0143, + "step": 4371 + }, + { + "epoch": 0.8684079849041613, + "grad_norm": 0.43013105755484454, + "learning_rate": 4.4687120531937357e-07, + "loss": 0.0107, + "step": 4372 + }, + { + "epoch": 0.8686066143609097, + "grad_norm": 0.4369419022550253, + "learning_rate": 4.4554256315612833e-07, + "loss": 0.0124, + "step": 4373 + }, + { + "epoch": 0.8688052438176581, + "grad_norm": 0.5827970943937599, + "learning_rate": 4.4421580698914615e-07, + "loss": 0.0091, + "step": 4374 + }, + { + "epoch": 0.8690038732744066, + "grad_norm": 0.8284652360245892, + "learning_rate": 4.4289093736783695e-07, + "loss": 0.0131, + "step": 4375 + }, + { + "epoch": 0.8692025027311551, + "grad_norm": 0.5145577977088364, + "learning_rate": 4.4156795484082694e-07, + "loss": 0.014, + "step": 4376 + }, + { + "epoch": 0.8694011321879035, + "grad_norm": 0.24551320324099252, + "learning_rate": 4.402468599559606e-07, + "loss": 0.0056, + "step": 4377 + }, + { + "epoch": 0.8695997616446519, + "grad_norm": 0.44050784980271485, + "learning_rate": 4.3892765326030427e-07, + "loss": 0.0092, + "step": 4378 + }, + { + "epoch": 0.8697983911014003, + "grad_norm": 0.5213423596831668, + "learning_rate": 4.376103353001387e-07, + "loss": 0.0078, + "step": 4379 + }, + { + "epoch": 0.8699970205581488, + "grad_norm": 0.30488209551484446, + "learning_rate": 4.3629490662096484e-07, + "loss": 0.0071, + "step": 4380 + }, + { + "epoch": 0.8701956500148972, + "grad_norm": 0.6465717561681386, + "learning_rate": 4.349813677674991e-07, + "loss": 0.0066, + "step": 4381 + }, + { + "epoch": 0.8703942794716456, + "grad_norm": 0.9699003433798918, + "learning_rate": 4.336697192836775e-07, + "loss": 0.0155, + "step": 4382 + }, + { + "epoch": 0.870592908928394, + "grad_norm": 0.5174827015842891, + "learning_rate": 4.323599617126534e-07, + "loss": 0.0075, + "step": 4383 + }, + { + "epoch": 0.8707915383851426, + "grad_norm": 0.3745007213523484, + "learning_rate": 4.3105209559679397e-07, + "loss": 0.0083, + "step": 4384 + }, + { + "epoch": 0.870990167841891, + "grad_norm": 0.37209009481585975, + "learning_rate": 4.2974612147768715e-07, + "loss": 0.0075, + "step": 4385 + }, + { + "epoch": 0.8711887972986394, + "grad_norm": 0.605148782197854, + "learning_rate": 4.284420398961342e-07, + "loss": 0.0082, + "step": 4386 + }, + { + "epoch": 0.8713874267553878, + "grad_norm": 0.7211710943486761, + "learning_rate": 4.2713985139215485e-07, + "loss": 0.0083, + "step": 4387 + }, + { + "epoch": 0.8715860562121363, + "grad_norm": 0.29150430656637677, + "learning_rate": 4.2583955650498276e-07, + "loss": 0.0079, + "step": 4388 + }, + { + "epoch": 0.8717846856688847, + "grad_norm": 0.6277540412956093, + "learning_rate": 4.2454115577307e-07, + "loss": 0.0104, + "step": 4389 + }, + { + "epoch": 0.8719833151256331, + "grad_norm": 0.8937024962263034, + "learning_rate": 4.2324464973408306e-07, + "loss": 0.016, + "step": 4390 + }, + { + "epoch": 0.8721819445823815, + "grad_norm": 0.22626655374898483, + "learning_rate": 4.219500389249026e-07, + "loss": 0.0039, + "step": 4391 + }, + { + "epoch": 0.87238057403913, + "grad_norm": 0.5483301349143612, + "learning_rate": 4.206573238816275e-07, + "loss": 0.0116, + "step": 4392 + }, + { + "epoch": 0.8725792034958785, + "grad_norm": 0.140657628470087, + "learning_rate": 4.193665051395679e-07, + "loss": 0.0021, + "step": 4393 + }, + { + "epoch": 0.8727778329526269, + "grad_norm": 0.41737224691478453, + "learning_rate": 4.180775832332523e-07, + "loss": 0.0113, + "step": 4394 + }, + { + "epoch": 0.8729764624093753, + "grad_norm": 0.32655825358404184, + "learning_rate": 4.1679055869641993e-07, + "loss": 0.0078, + "step": 4395 + }, + { + "epoch": 0.8731750918661237, + "grad_norm": 0.5392995614964906, + "learning_rate": 4.155054320620272e-07, + "loss": 0.011, + "step": 4396 + }, + { + "epoch": 0.8733737213228722, + "grad_norm": 0.2600770739951475, + "learning_rate": 4.1422220386224567e-07, + "loss": 0.0055, + "step": 4397 + }, + { + "epoch": 0.8735723507796206, + "grad_norm": 0.9550612436854214, + "learning_rate": 4.1294087462845576e-07, + "loss": 0.0125, + "step": 4398 + }, + { + "epoch": 0.873770980236369, + "grad_norm": 0.5668918705649273, + "learning_rate": 4.1166144489125703e-07, + "loss": 0.0092, + "step": 4399 + }, + { + "epoch": 0.8739696096931175, + "grad_norm": 0.3898610540948218, + "learning_rate": 4.1038391518045895e-07, + "loss": 0.0098, + "step": 4400 + }, + { + "epoch": 0.874168239149866, + "grad_norm": 0.542748206533429, + "learning_rate": 4.09108286025085e-07, + "loss": 0.009, + "step": 4401 + }, + { + "epoch": 0.8743668686066144, + "grad_norm": 0.4832034435765608, + "learning_rate": 4.0783455795337267e-07, + "loss": 0.0096, + "step": 4402 + }, + { + "epoch": 0.8745654980633628, + "grad_norm": 0.5202243446046367, + "learning_rate": 4.065627314927706e-07, + "loss": 0.0109, + "step": 4403 + }, + { + "epoch": 0.8747641275201112, + "grad_norm": 0.5784085936698135, + "learning_rate": 4.0529280716994246e-07, + "loss": 0.0125, + "step": 4404 + }, + { + "epoch": 0.8749627569768597, + "grad_norm": 1.1495995355735231, + "learning_rate": 4.0402478551076095e-07, + "loss": 0.0127, + "step": 4405 + }, + { + "epoch": 0.8751613864336081, + "grad_norm": 0.31144818566157473, + "learning_rate": 4.027586670403133e-07, + "loss": 0.0075, + "step": 4406 + }, + { + "epoch": 0.8753600158903565, + "grad_norm": 0.4268316810889361, + "learning_rate": 4.0149445228289787e-07, + "loss": 0.0082, + "step": 4407 + }, + { + "epoch": 0.8755586453471049, + "grad_norm": 0.6936100882597547, + "learning_rate": 4.002321417620242e-07, + "loss": 0.0143, + "step": 4408 + }, + { + "epoch": 0.8757572748038535, + "grad_norm": 0.3286470991413137, + "learning_rate": 3.989717360004153e-07, + "loss": 0.006, + "step": 4409 + }, + { + "epoch": 0.8759559042606019, + "grad_norm": 0.8501238578826438, + "learning_rate": 3.9771323552000196e-07, + "loss": 0.0141, + "step": 4410 + }, + { + "epoch": 0.8761545337173503, + "grad_norm": 0.6051259450991804, + "learning_rate": 3.964566408419296e-07, + "loss": 0.0109, + "step": 4411 + }, + { + "epoch": 0.8763531631740987, + "grad_norm": 0.5210373235659236, + "learning_rate": 3.952019524865519e-07, + "loss": 0.0086, + "step": 4412 + }, + { + "epoch": 0.8765517926308471, + "grad_norm": 1.012680225277243, + "learning_rate": 3.939491709734344e-07, + "loss": 0.0125, + "step": 4413 + }, + { + "epoch": 0.8767504220875956, + "grad_norm": 0.32859384106789974, + "learning_rate": 3.926982968213522e-07, + "loss": 0.0084, + "step": 4414 + }, + { + "epoch": 0.876949051544344, + "grad_norm": 0.2505465396417254, + "learning_rate": 3.914493305482914e-07, + "loss": 0.0059, + "step": 4415 + }, + { + "epoch": 0.8771476810010924, + "grad_norm": 0.542229522925549, + "learning_rate": 3.902022726714488e-07, + "loss": 0.0134, + "step": 4416 + }, + { + "epoch": 0.877346310457841, + "grad_norm": 0.4435250855553033, + "learning_rate": 3.889571237072276e-07, + "loss": 0.0109, + "step": 4417 + }, + { + "epoch": 0.8775449399145894, + "grad_norm": 0.7624348302946883, + "learning_rate": 3.877138841712447e-07, + "loss": 0.0191, + "step": 4418 + }, + { + "epoch": 0.8777435693713378, + "grad_norm": 0.39223360906206584, + "learning_rate": 3.8647255457832264e-07, + "loss": 0.0108, + "step": 4419 + }, + { + "epoch": 0.8779421988280862, + "grad_norm": 0.26266628381114066, + "learning_rate": 3.8523313544249653e-07, + "loss": 0.0055, + "step": 4420 + }, + { + "epoch": 0.8781408282848346, + "grad_norm": 0.6195454555078306, + "learning_rate": 3.839956272770068e-07, + "loss": 0.0088, + "step": 4421 + }, + { + "epoch": 0.8783394577415831, + "grad_norm": 0.6243351595233597, + "learning_rate": 3.8276003059430523e-07, + "loss": 0.0117, + "step": 4422 + }, + { + "epoch": 0.8785380871983315, + "grad_norm": 0.5652958236632496, + "learning_rate": 3.8152634590605196e-07, + "loss": 0.0199, + "step": 4423 + }, + { + "epoch": 0.8787367166550799, + "grad_norm": 0.9261366755651307, + "learning_rate": 3.80294573723114e-07, + "loss": 0.0146, + "step": 4424 + }, + { + "epoch": 0.8789353461118283, + "grad_norm": 0.4162303412607668, + "learning_rate": 3.7906471455556537e-07, + "loss": 0.0077, + "step": 4425 + }, + { + "epoch": 0.8791339755685769, + "grad_norm": 0.47965614259858963, + "learning_rate": 3.7783676891269216e-07, + "loss": 0.0107, + "step": 4426 + }, + { + "epoch": 0.8793326050253253, + "grad_norm": 0.758019511885331, + "learning_rate": 3.766107373029837e-07, + "loss": 0.0135, + "step": 4427 + }, + { + "epoch": 0.8795312344820737, + "grad_norm": 1.122056922868509, + "learning_rate": 3.753866202341394e-07, + "loss": 0.0115, + "step": 4428 + }, + { + "epoch": 0.8797298639388221, + "grad_norm": 0.26469452888733724, + "learning_rate": 3.7416441821306325e-07, + "loss": 0.0068, + "step": 4429 + }, + { + "epoch": 0.8799284933955706, + "grad_norm": 0.7299933289853252, + "learning_rate": 3.7294413174587043e-07, + "loss": 0.015, + "step": 4430 + }, + { + "epoch": 0.880127122852319, + "grad_norm": 0.7634736305853262, + "learning_rate": 3.717257613378783e-07, + "loss": 0.0093, + "step": 4431 + }, + { + "epoch": 0.8803257523090674, + "grad_norm": 0.7735715700253204, + "learning_rate": 3.7050930749361335e-07, + "loss": 0.0156, + "step": 4432 + }, + { + "epoch": 0.8805243817658158, + "grad_norm": 0.5172736405040491, + "learning_rate": 3.6929477071680876e-07, + "loss": 0.0071, + "step": 4433 + }, + { + "epoch": 0.8807230112225644, + "grad_norm": 0.744487659123002, + "learning_rate": 3.680821515104016e-07, + "loss": 0.0143, + "step": 4434 + }, + { + "epoch": 0.8809216406793128, + "grad_norm": 0.5354693790419052, + "learning_rate": 3.6687145037653804e-07, + "loss": 0.0101, + "step": 4435 + }, + { + "epoch": 0.8811202701360612, + "grad_norm": 0.3906157093858662, + "learning_rate": 3.656626678165659e-07, + "loss": 0.0057, + "step": 4436 + }, + { + "epoch": 0.8813188995928096, + "grad_norm": 0.37180422992439127, + "learning_rate": 3.6445580433104313e-07, + "loss": 0.0073, + "step": 4437 + }, + { + "epoch": 0.881517529049558, + "grad_norm": 0.3879523154614315, + "learning_rate": 3.632508604197288e-07, + "loss": 0.0124, + "step": 4438 + }, + { + "epoch": 0.8817161585063065, + "grad_norm": 0.5095632886042696, + "learning_rate": 3.6204783658158995e-07, + "loss": 0.0117, + "step": 4439 + }, + { + "epoch": 0.8819147879630549, + "grad_norm": 0.39433819698390393, + "learning_rate": 3.60846733314798e-07, + "loss": 0.0089, + "step": 4440 + }, + { + "epoch": 0.8821134174198033, + "grad_norm": 0.43438384905622585, + "learning_rate": 3.596475511167269e-07, + "loss": 0.0061, + "step": 4441 + }, + { + "epoch": 0.8823120468765518, + "grad_norm": 0.6305498537007402, + "learning_rate": 3.584502904839593e-07, + "loss": 0.0126, + "step": 4442 + }, + { + "epoch": 0.8825106763333003, + "grad_norm": 0.6097637763944662, + "learning_rate": 3.57254951912277e-07, + "loss": 0.0137, + "step": 4443 + }, + { + "epoch": 0.8827093057900487, + "grad_norm": 0.41692408428569017, + "learning_rate": 3.560615358966707e-07, + "loss": 0.013, + "step": 4444 + }, + { + "epoch": 0.8829079352467971, + "grad_norm": 0.4647122199079791, + "learning_rate": 3.548700429313312e-07, + "loss": 0.0075, + "step": 4445 + }, + { + "epoch": 0.8831065647035455, + "grad_norm": 0.748689177498006, + "learning_rate": 3.5368047350965496e-07, + "loss": 0.0114, + "step": 4446 + }, + { + "epoch": 0.883305194160294, + "grad_norm": 0.8174465249948543, + "learning_rate": 3.524928281242429e-07, + "loss": 0.0073, + "step": 4447 + }, + { + "epoch": 0.8835038236170424, + "grad_norm": 0.4656654252231502, + "learning_rate": 3.513071072668961e-07, + "loss": 0.0111, + "step": 4448 + }, + { + "epoch": 0.8837024530737908, + "grad_norm": 0.5205301107761227, + "learning_rate": 3.5012331142862065e-07, + "loss": 0.0099, + "step": 4449 + }, + { + "epoch": 0.8839010825305392, + "grad_norm": 0.42783686791908027, + "learning_rate": 3.4894144109962557e-07, + "loss": 0.0114, + "step": 4450 + }, + { + "epoch": 0.8840997119872878, + "grad_norm": 0.4787814662703929, + "learning_rate": 3.477614967693216e-07, + "loss": 0.0086, + "step": 4451 + }, + { + "epoch": 0.8842983414440362, + "grad_norm": 0.4314781867922066, + "learning_rate": 3.4658347892632337e-07, + "loss": 0.0103, + "step": 4452 + }, + { + "epoch": 0.8844969709007846, + "grad_norm": 0.35834664121669135, + "learning_rate": 3.454073880584463e-07, + "loss": 0.0095, + "step": 4453 + }, + { + "epoch": 0.884695600357533, + "grad_norm": 0.4202353721736912, + "learning_rate": 3.4423322465270914e-07, + "loss": 0.0112, + "step": 4454 + }, + { + "epoch": 0.8848942298142815, + "grad_norm": 1.639304177182807, + "learning_rate": 3.4306098919533013e-07, + "loss": 0.0088, + "step": 4455 + }, + { + "epoch": 0.8850928592710299, + "grad_norm": 0.5271434662250698, + "learning_rate": 3.4189068217173216e-07, + "loss": 0.0105, + "step": 4456 + }, + { + "epoch": 0.8852914887277783, + "grad_norm": 0.4513582350757716, + "learning_rate": 3.407223040665386e-07, + "loss": 0.0089, + "step": 4457 + }, + { + "epoch": 0.8854901181845267, + "grad_norm": 0.43566762732715747, + "learning_rate": 3.395558553635725e-07, + "loss": 0.0083, + "step": 4458 + }, + { + "epoch": 0.8856887476412753, + "grad_norm": 0.4675844643952693, + "learning_rate": 3.383913365458602e-07, + "loss": 0.0124, + "step": 4459 + }, + { + "epoch": 0.8858873770980237, + "grad_norm": 0.5086115524011803, + "learning_rate": 3.3722874809562655e-07, + "loss": 0.0085, + "step": 4460 + }, + { + "epoch": 0.8860860065547721, + "grad_norm": 0.4995927325240558, + "learning_rate": 3.3606809049429976e-07, + "loss": 0.0147, + "step": 4461 + }, + { + "epoch": 0.8862846360115205, + "grad_norm": 0.563092019434094, + "learning_rate": 3.3490936422250486e-07, + "loss": 0.0091, + "step": 4462 + }, + { + "epoch": 0.8864832654682689, + "grad_norm": 0.3664198660805334, + "learning_rate": 3.337525697600713e-07, + "loss": 0.0085, + "step": 4463 + }, + { + "epoch": 0.8866818949250174, + "grad_norm": 0.5626785718095834, + "learning_rate": 3.3259770758602593e-07, + "loss": 0.0078, + "step": 4464 + }, + { + "epoch": 0.8868805243817658, + "grad_norm": 0.6720037981827449, + "learning_rate": 3.314447781785951e-07, + "loss": 0.0166, + "step": 4465 + }, + { + "epoch": 0.8870791538385142, + "grad_norm": 0.6089564989189392, + "learning_rate": 3.302937820152069e-07, + "loss": 0.0109, + "step": 4466 + }, + { + "epoch": 0.8872777832952626, + "grad_norm": 0.8451481981228195, + "learning_rate": 3.291447195724867e-07, + "loss": 0.0129, + "step": 4467 + }, + { + "epoch": 0.8874764127520112, + "grad_norm": 0.44291299876872686, + "learning_rate": 3.2799759132626176e-07, + "loss": 0.0075, + "step": 4468 + }, + { + "epoch": 0.8876750422087596, + "grad_norm": 0.6004352791066179, + "learning_rate": 3.268523977515542e-07, + "loss": 0.0122, + "step": 4469 + }, + { + "epoch": 0.887873671665508, + "grad_norm": 0.4086550054310082, + "learning_rate": 3.257091393225892e-07, + "loss": 0.0087, + "step": 4470 + }, + { + "epoch": 0.8880723011222564, + "grad_norm": 0.46507471921866933, + "learning_rate": 3.245678165127891e-07, + "loss": 0.0073, + "step": 4471 + }, + { + "epoch": 0.8882709305790049, + "grad_norm": 0.39929592869202446, + "learning_rate": 3.234284297947748e-07, + "loss": 0.0076, + "step": 4472 + }, + { + "epoch": 0.8884695600357533, + "grad_norm": 0.44202911846396886, + "learning_rate": 3.222909796403639e-07, + "loss": 0.0103, + "step": 4473 + }, + { + "epoch": 0.8886681894925017, + "grad_norm": 0.4416813065644919, + "learning_rate": 3.211554665205735e-07, + "loss": 0.0085, + "step": 4474 + }, + { + "epoch": 0.8888668189492501, + "grad_norm": 0.451292002362962, + "learning_rate": 3.200218909056185e-07, + "loss": 0.0072, + "step": 4475 + }, + { + "epoch": 0.8890654484059987, + "grad_norm": 0.39357809600693094, + "learning_rate": 3.18890253264913e-07, + "loss": 0.0076, + "step": 4476 + }, + { + "epoch": 0.8892640778627471, + "grad_norm": 0.599306137463403, + "learning_rate": 3.1776055406706474e-07, + "loss": 0.0094, + "step": 4477 + }, + { + "epoch": 0.8894627073194955, + "grad_norm": 0.3551885948768195, + "learning_rate": 3.16632793779883e-07, + "loss": 0.0111, + "step": 4478 + }, + { + "epoch": 0.8896613367762439, + "grad_norm": 0.303987373083012, + "learning_rate": 3.155069728703708e-07, + "loss": 0.0064, + "step": 4479 + }, + { + "epoch": 0.8898599662329923, + "grad_norm": 0.7886640360675943, + "learning_rate": 3.1438309180473083e-07, + "loss": 0.0145, + "step": 4480 + }, + { + "epoch": 0.8900585956897408, + "grad_norm": 0.3640471787782126, + "learning_rate": 3.132611510483591e-07, + "loss": 0.0081, + "step": 4481 + }, + { + "epoch": 0.8902572251464892, + "grad_norm": 0.4516366521716688, + "learning_rate": 3.121411510658512e-07, + "loss": 0.0058, + "step": 4482 + }, + { + "epoch": 0.8904558546032376, + "grad_norm": 0.5960707874212607, + "learning_rate": 3.1102309232099895e-07, + "loss": 0.0079, + "step": 4483 + }, + { + "epoch": 0.8906544840599862, + "grad_norm": 0.6565899058793593, + "learning_rate": 3.09906975276788e-07, + "loss": 0.0106, + "step": 4484 + }, + { + "epoch": 0.8908531135167346, + "grad_norm": 0.43991379023445737, + "learning_rate": 3.087928003954027e-07, + "loss": 0.0079, + "step": 4485 + }, + { + "epoch": 0.891051742973483, + "grad_norm": 0.5593853069391717, + "learning_rate": 3.0768056813821943e-07, + "loss": 0.0135, + "step": 4486 + }, + { + "epoch": 0.8912503724302314, + "grad_norm": 0.3950431747898158, + "learning_rate": 3.0657027896581537e-07, + "loss": 0.0094, + "step": 4487 + }, + { + "epoch": 0.8914490018869798, + "grad_norm": 0.25409553815336716, + "learning_rate": 3.054619333379577e-07, + "loss": 0.0034, + "step": 4488 + }, + { + "epoch": 0.8916476313437283, + "grad_norm": 0.5330841755954928, + "learning_rate": 3.0435553171361207e-07, + "loss": 0.0072, + "step": 4489 + }, + { + "epoch": 0.8918462608004767, + "grad_norm": 0.6056435536748148, + "learning_rate": 3.032510745509393e-07, + "loss": 0.0125, + "step": 4490 + }, + { + "epoch": 0.8920448902572251, + "grad_norm": 0.40977754735724337, + "learning_rate": 3.021485623072923e-07, + "loss": 0.0091, + "step": 4491 + }, + { + "epoch": 0.8922435197139735, + "grad_norm": 0.5596491771836665, + "learning_rate": 3.0104799543922146e-07, + "loss": 0.0105, + "step": 4492 + }, + { + "epoch": 0.8924421491707221, + "grad_norm": 0.4786492106128223, + "learning_rate": 2.999493744024701e-07, + "loss": 0.0083, + "step": 4493 + }, + { + "epoch": 0.8926407786274705, + "grad_norm": 0.9836960055347553, + "learning_rate": 2.988526996519764e-07, + "loss": 0.018, + "step": 4494 + }, + { + "epoch": 0.8928394080842189, + "grad_norm": 0.6957875722193689, + "learning_rate": 2.97757971641871e-07, + "loss": 0.0072, + "step": 4495 + }, + { + "epoch": 0.8930380375409673, + "grad_norm": 0.7150594587760443, + "learning_rate": 2.966651908254814e-07, + "loss": 0.0151, + "step": 4496 + }, + { + "epoch": 0.8932366669977158, + "grad_norm": 0.9452942090466192, + "learning_rate": 2.955743576553266e-07, + "loss": 0.011, + "step": 4497 + }, + { + "epoch": 0.8934352964544642, + "grad_norm": 0.4147804118671288, + "learning_rate": 2.944854725831181e-07, + "loss": 0.0086, + "step": 4498 + }, + { + "epoch": 0.8936339259112126, + "grad_norm": 0.6444712673599818, + "learning_rate": 2.93398536059763e-07, + "loss": 0.0116, + "step": 4499 + }, + { + "epoch": 0.893832555367961, + "grad_norm": 0.4545175271742511, + "learning_rate": 2.923135485353618e-07, + "loss": 0.0091, + "step": 4500 + }, + { + "epoch": 0.8940311848247096, + "grad_norm": 0.435024614613689, + "learning_rate": 2.9123051045920503e-07, + "loss": 0.0099, + "step": 4501 + }, + { + "epoch": 0.894229814281458, + "grad_norm": 0.5137299220895429, + "learning_rate": 2.9014942227977907e-07, + "loss": 0.0114, + "step": 4502 + }, + { + "epoch": 0.8944284437382064, + "grad_norm": 0.5936552009342531, + "learning_rate": 2.8907028444476017e-07, + "loss": 0.0147, + "step": 4503 + }, + { + "epoch": 0.8946270731949548, + "grad_norm": 0.6272540958685057, + "learning_rate": 2.879930974010198e-07, + "loss": 0.0118, + "step": 4504 + }, + { + "epoch": 0.8948257026517032, + "grad_norm": 0.626455749063314, + "learning_rate": 2.8691786159461776e-07, + "loss": 0.0086, + "step": 4505 + }, + { + "epoch": 0.8950243321084517, + "grad_norm": 0.4727682477458918, + "learning_rate": 2.858445774708096e-07, + "loss": 0.0125, + "step": 4506 + }, + { + "epoch": 0.8952229615652001, + "grad_norm": 0.42330205421234, + "learning_rate": 2.8477324547404204e-07, + "loss": 0.0131, + "step": 4507 + }, + { + "epoch": 0.8954215910219485, + "grad_norm": 0.8016337669641905, + "learning_rate": 2.837038660479508e-07, + "loss": 0.0087, + "step": 4508 + }, + { + "epoch": 0.8956202204786969, + "grad_norm": 0.4415565035557052, + "learning_rate": 2.826364396353659e-07, + "loss": 0.0127, + "step": 4509 + }, + { + "epoch": 0.8958188499354455, + "grad_norm": 0.5118589639856184, + "learning_rate": 2.815709666783073e-07, + "loss": 0.0076, + "step": 4510 + }, + { + "epoch": 0.8960174793921939, + "grad_norm": 0.7602872016834586, + "learning_rate": 2.805074476179864e-07, + "loss": 0.0098, + "step": 4511 + }, + { + "epoch": 0.8962161088489423, + "grad_norm": 0.29573595687294996, + "learning_rate": 2.7944588289480436e-07, + "loss": 0.0071, + "step": 4512 + }, + { + "epoch": 0.8964147383056907, + "grad_norm": 0.6148007368180166, + "learning_rate": 2.7838627294835553e-07, + "loss": 0.0177, + "step": 4513 + }, + { + "epoch": 0.8966133677624392, + "grad_norm": 0.3086576744890267, + "learning_rate": 2.7732861821742285e-07, + "loss": 0.0037, + "step": 4514 + }, + { + "epoch": 0.8968119972191876, + "grad_norm": 0.33506300714770954, + "learning_rate": 2.762729191399799e-07, + "loss": 0.0078, + "step": 4515 + }, + { + "epoch": 0.897010626675936, + "grad_norm": 0.7280224457903677, + "learning_rate": 2.752191761531908e-07, + "loss": 0.0132, + "step": 4516 + }, + { + "epoch": 0.8972092561326844, + "grad_norm": 0.902157920426683, + "learning_rate": 2.7416738969340884e-07, + "loss": 0.0157, + "step": 4517 + }, + { + "epoch": 0.897407885589433, + "grad_norm": 1.5039398413544367, + "learning_rate": 2.7311756019617886e-07, + "loss": 0.0096, + "step": 4518 + }, + { + "epoch": 0.8976065150461814, + "grad_norm": 0.43159755127163674, + "learning_rate": 2.720696880962331e-07, + "loss": 0.0142, + "step": 4519 + }, + { + "epoch": 0.8978051445029298, + "grad_norm": 0.43080340115106225, + "learning_rate": 2.710237738274951e-07, + "loss": 0.0147, + "step": 4520 + }, + { + "epoch": 0.8980037739596782, + "grad_norm": 0.7326367128578075, + "learning_rate": 2.699798178230772e-07, + "loss": 0.0189, + "step": 4521 + }, + { + "epoch": 0.8982024034164267, + "grad_norm": 0.43562861431632444, + "learning_rate": 2.6893782051527873e-07, + "loss": 0.011, + "step": 4522 + }, + { + "epoch": 0.8984010328731751, + "grad_norm": 0.3484366560219996, + "learning_rate": 2.6789778233559214e-07, + "loss": 0.0075, + "step": 4523 + }, + { + "epoch": 0.8985996623299235, + "grad_norm": 0.45427660263747915, + "learning_rate": 2.6685970371469414e-07, + "loss": 0.0057, + "step": 4524 + }, + { + "epoch": 0.8987982917866719, + "grad_norm": 0.34514097610134814, + "learning_rate": 2.658235850824531e-07, + "loss": 0.009, + "step": 4525 + }, + { + "epoch": 0.8989969212434203, + "grad_norm": 0.9536404644314742, + "learning_rate": 2.647894268679252e-07, + "loss": 0.0058, + "step": 4526 + }, + { + "epoch": 0.8991955507001689, + "grad_norm": 0.679047679167587, + "learning_rate": 2.637572294993529e-07, + "loss": 0.0158, + "step": 4527 + }, + { + "epoch": 0.8993941801569173, + "grad_norm": 0.48771424479973163, + "learning_rate": 2.627269934041693e-07, + "loss": 0.0082, + "step": 4528 + }, + { + "epoch": 0.8995928096136657, + "grad_norm": 0.8876585963703878, + "learning_rate": 2.6169871900899367e-07, + "loss": 0.0129, + "step": 4529 + }, + { + "epoch": 0.8997914390704141, + "grad_norm": 0.42451999038659616, + "learning_rate": 2.606724067396338e-07, + "loss": 0.0106, + "step": 4530 + }, + { + "epoch": 0.8999900685271626, + "grad_norm": 0.5490668531272829, + "learning_rate": 2.59648057021083e-07, + "loss": 0.0152, + "step": 4531 + }, + { + "epoch": 0.900188697983911, + "grad_norm": 0.5470093186008119, + "learning_rate": 2.5862567027752526e-07, + "loss": 0.0094, + "step": 4532 + }, + { + "epoch": 0.9003873274406594, + "grad_norm": 0.6398893462509314, + "learning_rate": 2.576052469323298e-07, + "loss": 0.0116, + "step": 4533 + }, + { + "epoch": 0.9005859568974078, + "grad_norm": 0.4022552451748213, + "learning_rate": 2.5658678740805186e-07, + "loss": 0.0077, + "step": 4534 + }, + { + "epoch": 0.9007845863541564, + "grad_norm": 0.39557786288450486, + "learning_rate": 2.555702921264358e-07, + "loss": 0.0088, + "step": 4535 + }, + { + "epoch": 0.9009832158109048, + "grad_norm": 1.0099485838483995, + "learning_rate": 2.545557615084099e-07, + "loss": 0.0117, + "step": 4536 + }, + { + "epoch": 0.9011818452676532, + "grad_norm": 0.6052324571559026, + "learning_rate": 2.5354319597409194e-07, + "loss": 0.0095, + "step": 4537 + }, + { + "epoch": 0.9013804747244016, + "grad_norm": 0.4649008984948733, + "learning_rate": 2.525325959427821e-07, + "loss": 0.0056, + "step": 4538 + }, + { + "epoch": 0.9015791041811501, + "grad_norm": 0.4320126986367103, + "learning_rate": 2.515239618329701e-07, + "loss": 0.0102, + "step": 4539 + }, + { + "epoch": 0.9017777336378985, + "grad_norm": 0.3576353257805511, + "learning_rate": 2.505172940623313e-07, + "loss": 0.0062, + "step": 4540 + }, + { + "epoch": 0.9019763630946469, + "grad_norm": 0.3318197981973977, + "learning_rate": 2.495125930477238e-07, + "loss": 0.01, + "step": 4541 + }, + { + "epoch": 0.9021749925513953, + "grad_norm": 0.4512411192015429, + "learning_rate": 2.485098592051949e-07, + "loss": 0.0048, + "step": 4542 + }, + { + "epoch": 0.9023736220081439, + "grad_norm": 0.3890900313378956, + "learning_rate": 2.475090929499746e-07, + "loss": 0.0101, + "step": 4543 + }, + { + "epoch": 0.9025722514648923, + "grad_norm": 0.43260373969735505, + "learning_rate": 2.465102946964798e-07, + "loss": 0.0148, + "step": 4544 + }, + { + "epoch": 0.9027708809216407, + "grad_norm": 0.586975403052094, + "learning_rate": 2.455134648583124e-07, + "loss": 0.0123, + "step": 4545 + }, + { + "epoch": 0.9029695103783891, + "grad_norm": 0.4911950176274392, + "learning_rate": 2.4451860384825663e-07, + "loss": 0.011, + "step": 4546 + }, + { + "epoch": 0.9031681398351376, + "grad_norm": 0.389644189362611, + "learning_rate": 2.4352571207828577e-07, + "loss": 0.011, + "step": 4547 + }, + { + "epoch": 0.903366769291886, + "grad_norm": 0.43830143444725367, + "learning_rate": 2.425347899595537e-07, + "loss": 0.0056, + "step": 4548 + }, + { + "epoch": 0.9035653987486344, + "grad_norm": 0.6957717254966186, + "learning_rate": 2.415458379024005e-07, + "loss": 0.0163, + "step": 4549 + }, + { + "epoch": 0.9037640282053828, + "grad_norm": 0.4740762518443394, + "learning_rate": 2.405588563163519e-07, + "loss": 0.0146, + "step": 4550 + }, + { + "epoch": 0.9039626576621312, + "grad_norm": 0.6021255205184528, + "learning_rate": 2.395738456101132e-07, + "loss": 0.0124, + "step": 4551 + }, + { + "epoch": 0.9041612871188798, + "grad_norm": 0.741752144803565, + "learning_rate": 2.3859080619157925e-07, + "loss": 0.0125, + "step": 4552 + }, + { + "epoch": 0.9043599165756282, + "grad_norm": 0.42927765078239116, + "learning_rate": 2.376097384678233e-07, + "loss": 0.0156, + "step": 4553 + }, + { + "epoch": 0.9045585460323766, + "grad_norm": 0.6840710533669623, + "learning_rate": 2.3663064284510594e-07, + "loss": 0.0157, + "step": 4554 + }, + { + "epoch": 0.904757175489125, + "grad_norm": 0.62982881490239, + "learning_rate": 2.356535197288684e-07, + "loss": 0.0129, + "step": 4555 + }, + { + "epoch": 0.9049558049458735, + "grad_norm": 0.35044902975137593, + "learning_rate": 2.3467836952373756e-07, + "loss": 0.0073, + "step": 4556 + }, + { + "epoch": 0.9051544344026219, + "grad_norm": 0.5392210976679882, + "learning_rate": 2.3370519263352264e-07, + "loss": 0.0094, + "step": 4557 + }, + { + "epoch": 0.9053530638593703, + "grad_norm": 0.48828299525425184, + "learning_rate": 2.327339894612135e-07, + "loss": 0.0153, + "step": 4558 + }, + { + "epoch": 0.9055516933161187, + "grad_norm": 0.4056484456009865, + "learning_rate": 2.3176476040898566e-07, + "loss": 0.0135, + "step": 4559 + }, + { + "epoch": 0.9057503227728673, + "grad_norm": 0.6301698090976487, + "learning_rate": 2.3079750587819527e-07, + "loss": 0.0113, + "step": 4560 + }, + { + "epoch": 0.9059489522296157, + "grad_norm": 0.48414953276680756, + "learning_rate": 2.2983222626938196e-07, + "loss": 0.0165, + "step": 4561 + }, + { + "epoch": 0.9061475816863641, + "grad_norm": 0.44003626001855367, + "learning_rate": 2.288689219822665e-07, + "loss": 0.0124, + "step": 4562 + }, + { + "epoch": 0.9063462111431125, + "grad_norm": 0.6813734049579375, + "learning_rate": 2.2790759341575208e-07, + "loss": 0.0116, + "step": 4563 + }, + { + "epoch": 0.906544840599861, + "grad_norm": 0.5755670216305095, + "learning_rate": 2.2694824096792522e-07, + "loss": 0.0124, + "step": 4564 + }, + { + "epoch": 0.9067434700566094, + "grad_norm": 0.40263299299179744, + "learning_rate": 2.259908650360515e-07, + "loss": 0.0122, + "step": 4565 + }, + { + "epoch": 0.9069420995133578, + "grad_norm": 0.6401801441669921, + "learning_rate": 2.2503546601657988e-07, + "loss": 0.0105, + "step": 4566 + }, + { + "epoch": 0.9071407289701062, + "grad_norm": 0.785747890099103, + "learning_rate": 2.2408204430514003e-07, + "loss": 0.0169, + "step": 4567 + }, + { + "epoch": 0.9073393584268546, + "grad_norm": 0.4264636528520051, + "learning_rate": 2.2313060029654276e-07, + "loss": 0.0059, + "step": 4568 + }, + { + "epoch": 0.9075379878836032, + "grad_norm": 0.42739671235798343, + "learning_rate": 2.2218113438478074e-07, + "loss": 0.0083, + "step": 4569 + }, + { + "epoch": 0.9077366173403516, + "grad_norm": 0.40779650096228554, + "learning_rate": 2.2123364696302553e-07, + "loss": 0.0077, + "step": 4570 + }, + { + "epoch": 0.9079352467971, + "grad_norm": 0.39853271354961034, + "learning_rate": 2.2028813842363272e-07, + "loss": 0.0107, + "step": 4571 + }, + { + "epoch": 0.9081338762538484, + "grad_norm": 0.33762970470811593, + "learning_rate": 2.1934460915813416e-07, + "loss": 0.0075, + "step": 4572 + }, + { + "epoch": 0.9083325057105969, + "grad_norm": 0.5183895697114581, + "learning_rate": 2.1840305955724561e-07, + "loss": 0.0096, + "step": 4573 + }, + { + "epoch": 0.9085311351673453, + "grad_norm": 0.48886933649478814, + "learning_rate": 2.1746349001086187e-07, + "loss": 0.0061, + "step": 4574 + }, + { + "epoch": 0.9087297646240937, + "grad_norm": 0.638424883774554, + "learning_rate": 2.1652590090805725e-07, + "loss": 0.0085, + "step": 4575 + }, + { + "epoch": 0.9089283940808421, + "grad_norm": 0.47399133102593094, + "learning_rate": 2.155902926370873e-07, + "loss": 0.0102, + "step": 4576 + }, + { + "epoch": 0.9091270235375907, + "grad_norm": 0.8460449737384751, + "learning_rate": 2.1465666558538544e-07, + "loss": 0.0103, + "step": 4577 + }, + { + "epoch": 0.9093256529943391, + "grad_norm": 0.5377971540122163, + "learning_rate": 2.1372502013956687e-07, + "loss": 0.0114, + "step": 4578 + }, + { + "epoch": 0.9095242824510875, + "grad_norm": 0.4809164280538038, + "learning_rate": 2.127953566854235e-07, + "loss": 0.0105, + "step": 4579 + }, + { + "epoch": 0.9097229119078359, + "grad_norm": 0.2805309828097, + "learning_rate": 2.1186767560792964e-07, + "loss": 0.0079, + "step": 4580 + }, + { + "epoch": 0.9099215413645844, + "grad_norm": 0.43230047439300484, + "learning_rate": 2.1094197729123577e-07, + "loss": 0.012, + "step": 4581 + }, + { + "epoch": 0.9101201708213328, + "grad_norm": 0.5605835051911726, + "learning_rate": 2.100182621186736e-07, + "loss": 0.0159, + "step": 4582 + }, + { + "epoch": 0.9103188002780812, + "grad_norm": 0.5202163319138345, + "learning_rate": 2.090965304727527e-07, + "loss": 0.0096, + "step": 4583 + }, + { + "epoch": 0.9105174297348296, + "grad_norm": 0.4097715169980986, + "learning_rate": 2.0817678273515996e-07, + "loss": 0.0107, + "step": 4584 + }, + { + "epoch": 0.9107160591915782, + "grad_norm": 0.8758598360143836, + "learning_rate": 2.072590192867635e-07, + "loss": 0.0103, + "step": 4585 + }, + { + "epoch": 0.9109146886483266, + "grad_norm": 0.5918227290134394, + "learning_rate": 2.0634324050760658e-07, + "loss": 0.0122, + "step": 4586 + }, + { + "epoch": 0.911113318105075, + "grad_norm": 0.49732322398821266, + "learning_rate": 2.0542944677691467e-07, + "loss": 0.0112, + "step": 4587 + }, + { + "epoch": 0.9113119475618234, + "grad_norm": 0.7732911114954113, + "learning_rate": 2.0451763847308626e-07, + "loss": 0.0122, + "step": 4588 + }, + { + "epoch": 0.9115105770185719, + "grad_norm": 0.6007278204049679, + "learning_rate": 2.036078159737015e-07, + "loss": 0.0139, + "step": 4589 + }, + { + "epoch": 0.9117092064753203, + "grad_norm": 1.284947054234802, + "learning_rate": 2.026999796555179e-07, + "loss": 0.0237, + "step": 4590 + }, + { + "epoch": 0.9119078359320687, + "grad_norm": 0.567152964186292, + "learning_rate": 2.0179412989446756e-07, + "loss": 0.0088, + "step": 4591 + }, + { + "epoch": 0.9121064653888171, + "grad_norm": 0.38390704525282005, + "learning_rate": 2.0089026706566372e-07, + "loss": 0.0067, + "step": 4592 + }, + { + "epoch": 0.9123050948455655, + "grad_norm": 0.6975714046203888, + "learning_rate": 1.999883915433948e-07, + "loss": 0.017, + "step": 4593 + }, + { + "epoch": 0.9125037243023141, + "grad_norm": 0.5217159598877467, + "learning_rate": 1.9908850370112476e-07, + "loss": 0.0149, + "step": 4594 + }, + { + "epoch": 0.9127023537590625, + "grad_norm": 0.5062818955759204, + "learning_rate": 1.9819060391149837e-07, + "loss": 0.0045, + "step": 4595 + }, + { + "epoch": 0.9129009832158109, + "grad_norm": 0.30318092705281424, + "learning_rate": 1.9729469254633425e-07, + "loss": 0.0063, + "step": 4596 + }, + { + "epoch": 0.9130996126725593, + "grad_norm": 0.2722804564584692, + "learning_rate": 1.9640076997662848e-07, + "loss": 0.0067, + "step": 4597 + }, + { + "epoch": 0.9132982421293078, + "grad_norm": 0.4823650756559266, + "learning_rate": 1.955088365725527e-07, + "loss": 0.0092, + "step": 4598 + }, + { + "epoch": 0.9134968715860562, + "grad_norm": 0.45532989870452495, + "learning_rate": 1.9461889270345645e-07, + "loss": 0.011, + "step": 4599 + }, + { + "epoch": 0.9136955010428046, + "grad_norm": 0.6373971983656296, + "learning_rate": 1.9373093873786497e-07, + "loss": 0.0084, + "step": 4600 + }, + { + "epoch": 0.913894130499553, + "grad_norm": 0.491770676105079, + "learning_rate": 1.9284497504347854e-07, + "loss": 0.006, + "step": 4601 + }, + { + "epoch": 0.9140927599563016, + "grad_norm": 0.6167707314340485, + "learning_rate": 1.9196100198717427e-07, + "loss": 0.0161, + "step": 4602 + }, + { + "epoch": 0.91429138941305, + "grad_norm": 0.462717621588757, + "learning_rate": 1.9107901993500322e-07, + "loss": 0.0157, + "step": 4603 + }, + { + "epoch": 0.9144900188697984, + "grad_norm": 0.17662504554317196, + "learning_rate": 1.9019902925219548e-07, + "loss": 0.004, + "step": 4604 + }, + { + "epoch": 0.9146886483265468, + "grad_norm": 0.49148437356429236, + "learning_rate": 1.893210303031523e-07, + "loss": 0.0121, + "step": 4605 + }, + { + "epoch": 0.9148872777832953, + "grad_norm": 0.7776087485666608, + "learning_rate": 1.8844502345145233e-07, + "loss": 0.0096, + "step": 4606 + }, + { + "epoch": 0.9150859072400437, + "grad_norm": 0.5332572925885097, + "learning_rate": 1.8757100905985094e-07, + "loss": 0.0055, + "step": 4607 + }, + { + "epoch": 0.9152845366967921, + "grad_norm": 0.32966084466808204, + "learning_rate": 1.8669898749027472e-07, + "loss": 0.0094, + "step": 4608 + }, + { + "epoch": 0.9154831661535405, + "grad_norm": 0.3990945821247028, + "learning_rate": 1.8582895910382813e-07, + "loss": 0.0125, + "step": 4609 + }, + { + "epoch": 0.915681795610289, + "grad_norm": 0.44485780478520864, + "learning_rate": 1.8496092426078805e-07, + "loss": 0.0074, + "step": 4610 + }, + { + "epoch": 0.9158804250670375, + "grad_norm": 0.558341054302273, + "learning_rate": 1.8409488332060799e-07, + "loss": 0.0115, + "step": 4611 + }, + { + "epoch": 0.9160790545237859, + "grad_norm": 0.37536643210218906, + "learning_rate": 1.8323083664191333e-07, + "loss": 0.0057, + "step": 4612 + }, + { + "epoch": 0.9162776839805343, + "grad_norm": 0.7931572433347439, + "learning_rate": 1.823687845825056e-07, + "loss": 0.0135, + "step": 4613 + }, + { + "epoch": 0.9164763134372828, + "grad_norm": 0.5130348406821796, + "learning_rate": 1.8150872749935989e-07, + "loss": 0.017, + "step": 4614 + }, + { + "epoch": 0.9166749428940312, + "grad_norm": 0.5767895711884279, + "learning_rate": 1.8065066574862455e-07, + "loss": 0.0193, + "step": 4615 + }, + { + "epoch": 0.9168735723507796, + "grad_norm": 0.5029366482371473, + "learning_rate": 1.7979459968562317e-07, + "loss": 0.0144, + "step": 4616 + }, + { + "epoch": 0.917072201807528, + "grad_norm": 0.49339222260197624, + "learning_rate": 1.7894052966485053e-07, + "loss": 0.0069, + "step": 4617 + }, + { + "epoch": 0.9172708312642764, + "grad_norm": 0.4827004588018792, + "learning_rate": 1.7808845603997594e-07, + "loss": 0.0089, + "step": 4618 + }, + { + "epoch": 0.917469460721025, + "grad_norm": 0.48201824114443204, + "learning_rate": 1.772383791638438e-07, + "loss": 0.0117, + "step": 4619 + }, + { + "epoch": 0.9176680901777734, + "grad_norm": 0.42119151357623763, + "learning_rate": 1.7639029938846808e-07, + "loss": 0.0152, + "step": 4620 + }, + { + "epoch": 0.9178667196345218, + "grad_norm": 0.42340226485526916, + "learning_rate": 1.755442170650401e-07, + "loss": 0.0062, + "step": 4621 + }, + { + "epoch": 0.9180653490912702, + "grad_norm": 0.42028816533360497, + "learning_rate": 1.747001325439196e-07, + "loss": 0.0109, + "step": 4622 + }, + { + "epoch": 0.9182639785480187, + "grad_norm": 0.37252045119966465, + "learning_rate": 1.7385804617464308e-07, + "loss": 0.0132, + "step": 4623 + }, + { + "epoch": 0.9184626080047671, + "grad_norm": 0.6415858988525973, + "learning_rate": 1.730179583059155e-07, + "loss": 0.014, + "step": 4624 + }, + { + "epoch": 0.9186612374615155, + "grad_norm": 0.6620697719569957, + "learning_rate": 1.7217986928561803e-07, + "loss": 0.0073, + "step": 4625 + }, + { + "epoch": 0.9188598669182639, + "grad_norm": 0.534573318706696, + "learning_rate": 1.7134377946080193e-07, + "loss": 0.0085, + "step": 4626 + }, + { + "epoch": 0.9190584963750125, + "grad_norm": 0.4937748956041207, + "learning_rate": 1.7050968917769139e-07, + "loss": 0.0108, + "step": 4627 + }, + { + "epoch": 0.9192571258317609, + "grad_norm": 0.7534757131719959, + "learning_rate": 1.6967759878168233e-07, + "loss": 0.0149, + "step": 4628 + }, + { + "epoch": 0.9194557552885093, + "grad_norm": 0.48683536600548843, + "learning_rate": 1.688475086173419e-07, + "loss": 0.0071, + "step": 4629 + }, + { + "epoch": 0.9196543847452577, + "grad_norm": 0.5205814438223413, + "learning_rate": 1.6801941902841068e-07, + "loss": 0.0133, + "step": 4630 + }, + { + "epoch": 0.9198530142020062, + "grad_norm": 0.37512216595028314, + "learning_rate": 1.6719333035779827e-07, + "loss": 0.0075, + "step": 4631 + }, + { + "epoch": 0.9200516436587546, + "grad_norm": 0.5986458365684142, + "learning_rate": 1.6636924294758828e-07, + "loss": 0.0095, + "step": 4632 + }, + { + "epoch": 0.920250273115503, + "grad_norm": 0.5772681496058722, + "learning_rate": 1.655471571390349e-07, + "loss": 0.0083, + "step": 4633 + }, + { + "epoch": 0.9204489025722514, + "grad_norm": 0.29855689621547776, + "learning_rate": 1.6472707327256198e-07, + "loss": 0.0048, + "step": 4634 + }, + { + "epoch": 0.9206475320289998, + "grad_norm": 0.5804263227677894, + "learning_rate": 1.639089916877662e-07, + "loss": 0.0156, + "step": 4635 + }, + { + "epoch": 0.9208461614857484, + "grad_norm": 0.47922034362262383, + "learning_rate": 1.6309291272341377e-07, + "loss": 0.0108, + "step": 4636 + }, + { + "epoch": 0.9210447909424968, + "grad_norm": 0.3263256249555732, + "learning_rate": 1.622788367174427e-07, + "loss": 0.0058, + "step": 4637 + }, + { + "epoch": 0.9212434203992452, + "grad_norm": 0.6192492406914758, + "learning_rate": 1.6146676400696003e-07, + "loss": 0.012, + "step": 4638 + }, + { + "epoch": 0.9214420498559937, + "grad_norm": 0.4766849044662014, + "learning_rate": 1.606566949282451e-07, + "loss": 0.0172, + "step": 4639 + }, + { + "epoch": 0.9216406793127421, + "grad_norm": 0.6710327474510234, + "learning_rate": 1.5984862981674786e-07, + "loss": 0.0129, + "step": 4640 + }, + { + "epoch": 0.9218393087694905, + "grad_norm": 0.42862810177240407, + "learning_rate": 1.5904256900708459e-07, + "loss": 0.0063, + "step": 4641 + }, + { + "epoch": 0.9220379382262389, + "grad_norm": 0.3757229399422139, + "learning_rate": 1.5823851283304546e-07, + "loss": 0.008, + "step": 4642 + }, + { + "epoch": 0.9222365676829873, + "grad_norm": 0.4649954354241579, + "learning_rate": 1.574364616275903e-07, + "loss": 0.009, + "step": 4643 + }, + { + "epoch": 0.9224351971397359, + "grad_norm": 0.24892525017019804, + "learning_rate": 1.5663641572284672e-07, + "loss": 0.0055, + "step": 4644 + }, + { + "epoch": 0.9226338265964843, + "grad_norm": 0.620937103802513, + "learning_rate": 1.5583837545011305e-07, + "loss": 0.0108, + "step": 4645 + }, + { + "epoch": 0.9228324560532327, + "grad_norm": 0.7464819124618941, + "learning_rate": 1.5504234113985661e-07, + "loss": 0.005, + "step": 4646 + }, + { + "epoch": 0.9230310855099811, + "grad_norm": 0.39950890377449805, + "learning_rate": 1.5424831312171595e-07, + "loss": 0.0091, + "step": 4647 + }, + { + "epoch": 0.9232297149667296, + "grad_norm": 0.7650531790637041, + "learning_rate": 1.5345629172449472e-07, + "loss": 0.0182, + "step": 4648 + }, + { + "epoch": 0.923428344423478, + "grad_norm": 0.4823999396347639, + "learning_rate": 1.5266627727617056e-07, + "loss": 0.0192, + "step": 4649 + }, + { + "epoch": 0.9236269738802264, + "grad_norm": 0.7401078670894918, + "learning_rate": 1.5187827010388677e-07, + "loss": 0.0127, + "step": 4650 + }, + { + "epoch": 0.9238256033369748, + "grad_norm": 0.3397450263276877, + "learning_rate": 1.510922705339557e-07, + "loss": 0.0108, + "step": 4651 + }, + { + "epoch": 0.9240242327937233, + "grad_norm": 0.3518773712824609, + "learning_rate": 1.503082788918603e-07, + "loss": 0.0057, + "step": 4652 + }, + { + "epoch": 0.9242228622504718, + "grad_norm": 0.2477018526451415, + "learning_rate": 1.4952629550224916e-07, + "loss": 0.0058, + "step": 4653 + }, + { + "epoch": 0.9244214917072202, + "grad_norm": 0.5414836310524654, + "learning_rate": 1.487463206889428e-07, + "loss": 0.0165, + "step": 4654 + }, + { + "epoch": 0.9246201211639686, + "grad_norm": 0.47446038228846893, + "learning_rate": 1.479683547749261e-07, + "loss": 0.0131, + "step": 4655 + }, + { + "epoch": 0.9248187506207171, + "grad_norm": 0.2636023459839493, + "learning_rate": 1.4719239808235418e-07, + "loss": 0.0041, + "step": 4656 + }, + { + "epoch": 0.9250173800774655, + "grad_norm": 0.40166636205399747, + "learning_rate": 1.464184509325517e-07, + "loss": 0.0055, + "step": 4657 + }, + { + "epoch": 0.9252160095342139, + "grad_norm": 0.44435793160055864, + "learning_rate": 1.4564651364600724e-07, + "loss": 0.0059, + "step": 4658 + }, + { + "epoch": 0.9254146389909623, + "grad_norm": 0.3994984612235425, + "learning_rate": 1.4487658654238123e-07, + "loss": 0.0071, + "step": 4659 + }, + { + "epoch": 0.9256132684477107, + "grad_norm": 0.47362741123170454, + "learning_rate": 1.4410866994049755e-07, + "loss": 0.011, + "step": 4660 + }, + { + "epoch": 0.9258118979044593, + "grad_norm": 0.7872100192173731, + "learning_rate": 1.433427641583518e-07, + "loss": 0.0169, + "step": 4661 + }, + { + "epoch": 0.9260105273612077, + "grad_norm": 0.8552459275140093, + "learning_rate": 1.4257886951310307e-07, + "loss": 0.0185, + "step": 4662 + }, + { + "epoch": 0.9262091568179561, + "grad_norm": 0.6608328624853623, + "learning_rate": 1.4181698632108055e-07, + "loss": 0.009, + "step": 4663 + }, + { + "epoch": 0.9264077862747045, + "grad_norm": 0.414776424248957, + "learning_rate": 1.4105711489777962e-07, + "loss": 0.0088, + "step": 4664 + }, + { + "epoch": 0.926606415731453, + "grad_norm": 0.6398136251284509, + "learning_rate": 1.4029925555786027e-07, + "loss": 0.0154, + "step": 4665 + }, + { + "epoch": 0.9268050451882014, + "grad_norm": 0.7474279649587607, + "learning_rate": 1.395434086151537e-07, + "loss": 0.0067, + "step": 4666 + }, + { + "epoch": 0.9270036746449498, + "grad_norm": 0.5064378151011119, + "learning_rate": 1.3878957438265338e-07, + "loss": 0.0116, + "step": 4667 + }, + { + "epoch": 0.9272023041016982, + "grad_norm": 0.45989881995150333, + "learning_rate": 1.3803775317252188e-07, + "loss": 0.0092, + "step": 4668 + }, + { + "epoch": 0.9274009335584468, + "grad_norm": 0.6599032509747758, + "learning_rate": 1.3728794529608846e-07, + "loss": 0.0093, + "step": 4669 + }, + { + "epoch": 0.9275995630151952, + "grad_norm": 0.5141005371853344, + "learning_rate": 1.3654015106384698e-07, + "loss": 0.0093, + "step": 4670 + }, + { + "epoch": 0.9277981924719436, + "grad_norm": 0.5941689240567093, + "learning_rate": 1.357943707854592e-07, + "loss": 0.0146, + "step": 4671 + }, + { + "epoch": 0.927996821928692, + "grad_norm": 0.20114286406354762, + "learning_rate": 1.3505060476975085e-07, + "loss": 0.003, + "step": 4672 + }, + { + "epoch": 0.9281954513854405, + "grad_norm": 0.45688487671874534, + "learning_rate": 1.3430885332471554e-07, + "loss": 0.0162, + "step": 4673 + }, + { + "epoch": 0.9283940808421889, + "grad_norm": 0.35614643694965575, + "learning_rate": 1.3356911675751093e-07, + "loss": 0.0051, + "step": 4674 + }, + { + "epoch": 0.9285927102989373, + "grad_norm": 0.5744415497903277, + "learning_rate": 1.3283139537446144e-07, + "loss": 0.0121, + "step": 4675 + }, + { + "epoch": 0.9287913397556857, + "grad_norm": 1.0850590038936332, + "learning_rate": 1.3209568948105768e-07, + "loss": 0.0145, + "step": 4676 + }, + { + "epoch": 0.9289899692124342, + "grad_norm": 0.45964179898690255, + "learning_rate": 1.313619993819537e-07, + "loss": 0.0131, + "step": 4677 + }, + { + "epoch": 0.9291885986691827, + "grad_norm": 0.5818030589778348, + "learning_rate": 1.3063032538097097e-07, + "loss": 0.0111, + "step": 4678 + }, + { + "epoch": 0.9293872281259311, + "grad_norm": 0.6481989151858105, + "learning_rate": 1.2990066778109323e-07, + "loss": 0.0136, + "step": 4679 + }, + { + "epoch": 0.9295858575826795, + "grad_norm": 0.37947609369661844, + "learning_rate": 1.2917302688447265e-07, + "loss": 0.0113, + "step": 4680 + }, + { + "epoch": 0.929784487039428, + "grad_norm": 0.5081483427859849, + "learning_rate": 1.284474029924232e-07, + "loss": 0.0103, + "step": 4681 + }, + { + "epoch": 0.9299831164961764, + "grad_norm": 0.21444824245931055, + "learning_rate": 1.2772379640542564e-07, + "loss": 0.005, + "step": 4682 + }, + { + "epoch": 0.9301817459529248, + "grad_norm": 0.26879234255685003, + "learning_rate": 1.270022074231253e-07, + "loss": 0.0099, + "step": 4683 + }, + { + "epoch": 0.9303803754096732, + "grad_norm": 0.5160956745621564, + "learning_rate": 1.2628263634433035e-07, + "loss": 0.0138, + "step": 4684 + }, + { + "epoch": 0.9305790048664216, + "grad_norm": 0.49211468181460044, + "learning_rate": 1.2556508346701578e-07, + "loss": 0.0064, + "step": 4685 + }, + { + "epoch": 0.9307776343231702, + "grad_norm": 0.8023564960097778, + "learning_rate": 1.2484954908831837e-07, + "loss": 0.0125, + "step": 4686 + }, + { + "epoch": 0.9309762637799186, + "grad_norm": 0.9862722885789399, + "learning_rate": 1.241360335045405e-07, + "loss": 0.0085, + "step": 4687 + }, + { + "epoch": 0.931174893236667, + "grad_norm": 0.7887851867631835, + "learning_rate": 1.2342453701114864e-07, + "loss": 0.0131, + "step": 4688 + }, + { + "epoch": 0.9313735226934154, + "grad_norm": 0.3479006313592782, + "learning_rate": 1.227150599027721e-07, + "loss": 0.0094, + "step": 4689 + }, + { + "epoch": 0.9315721521501639, + "grad_norm": 0.674927856372601, + "learning_rate": 1.2200760247320586e-07, + "loss": 0.0098, + "step": 4690 + }, + { + "epoch": 0.9317707816069123, + "grad_norm": 0.42722316421638, + "learning_rate": 1.213021650154056e-07, + "loss": 0.0095, + "step": 4691 + }, + { + "epoch": 0.9319694110636607, + "grad_norm": 0.301197655900226, + "learning_rate": 1.2059874782149317e-07, + "loss": 0.0061, + "step": 4692 + }, + { + "epoch": 0.9321680405204091, + "grad_norm": 0.5133950310604148, + "learning_rate": 1.1989735118275337e-07, + "loss": 0.0127, + "step": 4693 + }, + { + "epoch": 0.9323666699771576, + "grad_norm": 0.34014268074393605, + "learning_rate": 1.1919797538963274e-07, + "loss": 0.0059, + "step": 4694 + }, + { + "epoch": 0.9325652994339061, + "grad_norm": 0.5164175565820704, + "learning_rate": 1.1850062073174351e-07, + "loss": 0.0116, + "step": 4695 + }, + { + "epoch": 0.9327639288906545, + "grad_norm": 0.4070984340693311, + "learning_rate": 1.1780528749785802e-07, + "loss": 0.0077, + "step": 4696 + }, + { + "epoch": 0.9329625583474029, + "grad_norm": 0.38897032187757813, + "learning_rate": 1.1711197597591428e-07, + "loss": 0.0063, + "step": 4697 + }, + { + "epoch": 0.9331611878041514, + "grad_norm": 0.2930844011928402, + "learning_rate": 1.1642068645301152e-07, + "loss": 0.0053, + "step": 4698 + }, + { + "epoch": 0.9333598172608998, + "grad_norm": 0.3764839957960558, + "learning_rate": 1.1573141921541131e-07, + "loss": 0.0054, + "step": 4699 + }, + { + "epoch": 0.9335584467176482, + "grad_norm": 0.43523780379200877, + "learning_rate": 1.1504417454854033e-07, + "loss": 0.013, + "step": 4700 + }, + { + "epoch": 0.9337570761743966, + "grad_norm": 0.5750229920284355, + "learning_rate": 1.1435895273698372e-07, + "loss": 0.0084, + "step": 4701 + }, + { + "epoch": 0.933955705631145, + "grad_norm": 0.19886700519682282, + "learning_rate": 1.1367575406449282e-07, + "loss": 0.0046, + "step": 4702 + }, + { + "epoch": 0.9341543350878936, + "grad_norm": 0.790833202260621, + "learning_rate": 1.1299457881397858e-07, + "loss": 0.0185, + "step": 4703 + }, + { + "epoch": 0.934352964544642, + "grad_norm": 0.4219918573113129, + "learning_rate": 1.1231542726751532e-07, + "loss": 0.0092, + "step": 4704 + }, + { + "epoch": 0.9345515940013904, + "grad_norm": 0.7230909029282245, + "learning_rate": 1.1163829970633865e-07, + "loss": 0.0147, + "step": 4705 + }, + { + "epoch": 0.9347502234581389, + "grad_norm": 0.5093534424476391, + "learning_rate": 1.1096319641084708e-07, + "loss": 0.0116, + "step": 4706 + }, + { + "epoch": 0.9349488529148873, + "grad_norm": 0.27347946955908975, + "learning_rate": 1.1029011766059972e-07, + "loss": 0.0034, + "step": 4707 + }, + { + "epoch": 0.9351474823716357, + "grad_norm": 1.0701505055181986, + "learning_rate": 1.0961906373431808e-07, + "loss": 0.0117, + "step": 4708 + }, + { + "epoch": 0.9353461118283841, + "grad_norm": 0.6516188206878525, + "learning_rate": 1.0895003490988487e-07, + "loss": 0.0114, + "step": 4709 + }, + { + "epoch": 0.9355447412851325, + "grad_norm": 0.4902354426875574, + "learning_rate": 1.0828303146434404e-07, + "loss": 0.0088, + "step": 4710 + }, + { + "epoch": 0.9357433707418811, + "grad_norm": 0.5209392249997609, + "learning_rate": 1.0761805367390187e-07, + "loss": 0.0117, + "step": 4711 + }, + { + "epoch": 0.9359420001986295, + "grad_norm": 1.259910339066219, + "learning_rate": 1.0695510181392365e-07, + "loss": 0.0088, + "step": 4712 + }, + { + "epoch": 0.9361406296553779, + "grad_norm": 0.5151392556944235, + "learning_rate": 1.0629417615893756e-07, + "loss": 0.0139, + "step": 4713 + }, + { + "epoch": 0.9363392591121263, + "grad_norm": 0.9491713148601669, + "learning_rate": 1.0563527698263298e-07, + "loss": 0.018, + "step": 4714 + }, + { + "epoch": 0.9365378885688748, + "grad_norm": 1.2153903329483169, + "learning_rate": 1.0497840455785835e-07, + "loss": 0.0173, + "step": 4715 + }, + { + "epoch": 0.9367365180256232, + "grad_norm": 0.288079961745967, + "learning_rate": 1.0432355915662496e-07, + "loss": 0.0033, + "step": 4716 + }, + { + "epoch": 0.9369351474823716, + "grad_norm": 0.3536849277399397, + "learning_rate": 1.0367074105010256e-07, + "loss": 0.0075, + "step": 4717 + }, + { + "epoch": 0.93713377693912, + "grad_norm": 0.6473193451222942, + "learning_rate": 1.0301995050862323e-07, + "loss": 0.0126, + "step": 4718 + }, + { + "epoch": 0.9373324063958685, + "grad_norm": 0.6978153920712078, + "learning_rate": 1.0237118780167809e-07, + "loss": 0.0106, + "step": 4719 + }, + { + "epoch": 0.937531035852617, + "grad_norm": 0.41837417287713097, + "learning_rate": 1.0172445319792002e-07, + "loss": 0.009, + "step": 4720 + }, + { + "epoch": 0.9377296653093654, + "grad_norm": 0.3535301700831529, + "learning_rate": 1.0107974696516032e-07, + "loss": 0.0078, + "step": 4721 + }, + { + "epoch": 0.9379282947661138, + "grad_norm": 0.7905465076037768, + "learning_rate": 1.0043706937037156e-07, + "loss": 0.0134, + "step": 4722 + }, + { + "epoch": 0.9381269242228623, + "grad_norm": 0.36966492424476793, + "learning_rate": 9.979642067968587e-08, + "loss": 0.0116, + "step": 4723 + }, + { + "epoch": 0.9383255536796107, + "grad_norm": 0.6252230556557902, + "learning_rate": 9.915780115839491e-08, + "loss": 0.0116, + "step": 4724 + }, + { + "epoch": 0.9385241831363591, + "grad_norm": 0.8581563383955567, + "learning_rate": 9.852121107095047e-08, + "loss": 0.0266, + "step": 4725 + }, + { + "epoch": 0.9387228125931075, + "grad_norm": 0.5722638109056252, + "learning_rate": 9.788665068096504e-08, + "loss": 0.0162, + "step": 4726 + }, + { + "epoch": 0.938921442049856, + "grad_norm": 0.7084688510586287, + "learning_rate": 9.725412025120783e-08, + "loss": 0.0087, + "step": 4727 + }, + { + "epoch": 0.9391200715066045, + "grad_norm": 0.1863360147125236, + "learning_rate": 9.662362004360992e-08, + "loss": 0.0036, + "step": 4728 + }, + { + "epoch": 0.9393187009633529, + "grad_norm": 0.5073294558989344, + "learning_rate": 9.599515031926021e-08, + "loss": 0.0093, + "step": 4729 + }, + { + "epoch": 0.9395173304201013, + "grad_norm": 0.8287013770837947, + "learning_rate": 9.536871133840775e-08, + "loss": 0.0186, + "step": 4730 + }, + { + "epoch": 0.9397159598768497, + "grad_norm": 0.4145640260838776, + "learning_rate": 9.474430336046059e-08, + "loss": 0.011, + "step": 4731 + }, + { + "epoch": 0.9399145893335982, + "grad_norm": 0.3121093385078184, + "learning_rate": 9.412192664398467e-08, + "loss": 0.004, + "step": 4732 + }, + { + "epoch": 0.9401132187903466, + "grad_norm": 0.5686734537347415, + "learning_rate": 9.350158144670662e-08, + "loss": 0.0153, + "step": 4733 + }, + { + "epoch": 0.940311848247095, + "grad_norm": 1.060966428278419, + "learning_rate": 9.28832680255104e-08, + "loss": 0.019, + "step": 4734 + }, + { + "epoch": 0.9405104777038434, + "grad_norm": 0.4224352499347425, + "learning_rate": 9.22669866364384e-08, + "loss": 0.0067, + "step": 4735 + }, + { + "epoch": 0.9407091071605919, + "grad_norm": 0.3341915120102901, + "learning_rate": 9.165273753469261e-08, + "loss": 0.0069, + "step": 4736 + }, + { + "epoch": 0.9409077366173404, + "grad_norm": 0.6457098860974007, + "learning_rate": 9.10405209746329e-08, + "loss": 0.0137, + "step": 4737 + }, + { + "epoch": 0.9411063660740888, + "grad_norm": 0.24789794897733472, + "learning_rate": 9.043033720977756e-08, + "loss": 0.0048, + "step": 4738 + }, + { + "epoch": 0.9413049955308372, + "grad_norm": 0.3608454499887148, + "learning_rate": 8.982218649280284e-08, + "loss": 0.0122, + "step": 4739 + }, + { + "epoch": 0.9415036249875857, + "grad_norm": 0.7509200650856567, + "learning_rate": 8.921606907554337e-08, + "loss": 0.0116, + "step": 4740 + }, + { + "epoch": 0.9417022544443341, + "grad_norm": 0.3689801779957159, + "learning_rate": 8.861198520899172e-08, + "loss": 0.0076, + "step": 4741 + }, + { + "epoch": 0.9419008839010825, + "grad_norm": 0.5557873310711061, + "learning_rate": 8.800993514329892e-08, + "loss": 0.0153, + "step": 4742 + }, + { + "epoch": 0.9420995133578309, + "grad_norm": 0.37590619822900495, + "learning_rate": 8.74099191277733e-08, + "loss": 0.0091, + "step": 4743 + }, + { + "epoch": 0.9422981428145794, + "grad_norm": 0.8370582813494728, + "learning_rate": 8.681193741088e-08, + "loss": 0.0087, + "step": 4744 + }, + { + "epoch": 0.9424967722713279, + "grad_norm": 0.9398151319661212, + "learning_rate": 8.621599024024374e-08, + "loss": 0.0148, + "step": 4745 + }, + { + "epoch": 0.9426954017280763, + "grad_norm": 0.5122269675023894, + "learning_rate": 8.562207786264487e-08, + "loss": 0.0089, + "step": 4746 + }, + { + "epoch": 0.9428940311848247, + "grad_norm": 0.6367900598425142, + "learning_rate": 8.503020052402223e-08, + "loss": 0.0109, + "step": 4747 + }, + { + "epoch": 0.9430926606415732, + "grad_norm": 0.7304382048017897, + "learning_rate": 8.444035846947141e-08, + "loss": 0.0169, + "step": 4748 + }, + { + "epoch": 0.9432912900983216, + "grad_norm": 0.7902391038165741, + "learning_rate": 8.385255194324593e-08, + "loss": 0.0119, + "step": 4749 + }, + { + "epoch": 0.94348991955507, + "grad_norm": 0.403170445331138, + "learning_rate": 8.326678118875554e-08, + "loss": 0.0101, + "step": 4750 + }, + { + "epoch": 0.9436885490118184, + "grad_norm": 0.4907736576120689, + "learning_rate": 8.268304644856673e-08, + "loss": 0.0108, + "step": 4751 + }, + { + "epoch": 0.9438871784685668, + "grad_norm": 0.5505554874481177, + "learning_rate": 8.210134796440449e-08, + "loss": 0.0107, + "step": 4752 + }, + { + "epoch": 0.9440858079253154, + "grad_norm": 0.5428734384940673, + "learning_rate": 8.152168597714894e-08, + "loss": 0.0093, + "step": 4753 + }, + { + "epoch": 0.9442844373820638, + "grad_norm": 0.44060876168794505, + "learning_rate": 8.094406072683858e-08, + "loss": 0.0062, + "step": 4754 + }, + { + "epoch": 0.9444830668388122, + "grad_norm": 0.40725989340402463, + "learning_rate": 8.036847245266543e-08, + "loss": 0.0099, + "step": 4755 + }, + { + "epoch": 0.9446816962955606, + "grad_norm": 0.46261890604962713, + "learning_rate": 7.979492139298162e-08, + "loss": 0.011, + "step": 4756 + }, + { + "epoch": 0.9448803257523091, + "grad_norm": 0.4976138795208467, + "learning_rate": 7.92234077852938e-08, + "loss": 0.0136, + "step": 4757 + }, + { + "epoch": 0.9450789552090575, + "grad_norm": 0.5403839208497061, + "learning_rate": 7.865393186626491e-08, + "loss": 0.0068, + "step": 4758 + }, + { + "epoch": 0.9452775846658059, + "grad_norm": 0.34280255446442026, + "learning_rate": 7.808649387171519e-08, + "loss": 0.0063, + "step": 4759 + }, + { + "epoch": 0.9454762141225543, + "grad_norm": 0.9007767217887392, + "learning_rate": 7.752109403661834e-08, + "loss": 0.011, + "step": 4760 + }, + { + "epoch": 0.9456748435793028, + "grad_norm": 1.0223302663255138, + "learning_rate": 7.695773259510764e-08, + "loss": 0.0094, + "step": 4761 + }, + { + "epoch": 0.9458734730360513, + "grad_norm": 0.41230357058037975, + "learning_rate": 7.639640978046981e-08, + "loss": 0.0076, + "step": 4762 + }, + { + "epoch": 0.9460721024927997, + "grad_norm": 0.4318170038225908, + "learning_rate": 7.583712582514724e-08, + "loss": 0.0121, + "step": 4763 + }, + { + "epoch": 0.9462707319495481, + "grad_norm": 0.2930715164840165, + "learning_rate": 7.527988096074079e-08, + "loss": 0.004, + "step": 4764 + }, + { + "epoch": 0.9464693614062966, + "grad_norm": 1.1647632495336515, + "learning_rate": 7.47246754180031e-08, + "loss": 0.0114, + "step": 4765 + }, + { + "epoch": 0.946667990863045, + "grad_norm": 0.5002685742542026, + "learning_rate": 7.417150942684525e-08, + "loss": 0.0115, + "step": 4766 + }, + { + "epoch": 0.9468666203197934, + "grad_norm": 0.5858011863544192, + "learning_rate": 7.362038321633235e-08, + "loss": 0.0086, + "step": 4767 + }, + { + "epoch": 0.9470652497765418, + "grad_norm": 0.5066199298525029, + "learning_rate": 7.307129701468574e-08, + "loss": 0.0103, + "step": 4768 + }, + { + "epoch": 0.9472638792332903, + "grad_norm": 0.2689653914202334, + "learning_rate": 7.252425104928074e-08, + "loss": 0.0064, + "step": 4769 + }, + { + "epoch": 0.9474625086900388, + "grad_norm": 0.6049148054355731, + "learning_rate": 7.197924554664893e-08, + "loss": 0.0119, + "step": 4770 + }, + { + "epoch": 0.9476611381467872, + "grad_norm": 0.5517727484698056, + "learning_rate": 7.1436280732477e-08, + "loss": 0.0132, + "step": 4771 + }, + { + "epoch": 0.9478597676035356, + "grad_norm": 0.32666164342892834, + "learning_rate": 7.089535683160508e-08, + "loss": 0.0062, + "step": 4772 + }, + { + "epoch": 0.948058397060284, + "grad_norm": 1.102212142762428, + "learning_rate": 7.035647406803015e-08, + "loss": 0.0136, + "step": 4773 + }, + { + "epoch": 0.9482570265170325, + "grad_norm": 0.7658504815210466, + "learning_rate": 6.981963266490199e-08, + "loss": 0.0116, + "step": 4774 + }, + { + "epoch": 0.9484556559737809, + "grad_norm": 0.8130273192518539, + "learning_rate": 6.92848328445267e-08, + "loss": 0.0128, + "step": 4775 + }, + { + "epoch": 0.9486542854305293, + "grad_norm": 0.6824866401998253, + "learning_rate": 6.875207482836544e-08, + "loss": 0.0164, + "step": 4776 + }, + { + "epoch": 0.9488529148872777, + "grad_norm": 0.47569708326658633, + "learning_rate": 6.822135883703063e-08, + "loss": 0.0089, + "step": 4777 + }, + { + "epoch": 0.9490515443440262, + "grad_norm": 0.38871295881882434, + "learning_rate": 6.769268509029315e-08, + "loss": 0.0124, + "step": 4778 + }, + { + "epoch": 0.9492501738007747, + "grad_norm": 0.6378821616400225, + "learning_rate": 6.716605380707508e-08, + "loss": 0.0104, + "step": 4779 + }, + { + "epoch": 0.9494488032575231, + "grad_norm": 0.745002833768245, + "learning_rate": 6.664146520545422e-08, + "loss": 0.0115, + "step": 4780 + }, + { + "epoch": 0.9496474327142715, + "grad_norm": 0.3103526017311052, + "learning_rate": 6.611891950266235e-08, + "loss": 0.006, + "step": 4781 + }, + { + "epoch": 0.94984606217102, + "grad_norm": 0.44138783004158555, + "learning_rate": 6.559841691508473e-08, + "loss": 0.0109, + "step": 4782 + }, + { + "epoch": 0.9500446916277684, + "grad_norm": 0.6016533565370209, + "learning_rate": 6.507995765826169e-08, + "loss": 0.0135, + "step": 4783 + }, + { + "epoch": 0.9502433210845168, + "grad_norm": 0.9664137702511328, + "learning_rate": 6.456354194688597e-08, + "loss": 0.0103, + "step": 4784 + }, + { + "epoch": 0.9504419505412652, + "grad_norm": 0.2967531583933644, + "learning_rate": 6.404916999480482e-08, + "loss": 0.0061, + "step": 4785 + }, + { + "epoch": 0.9506405799980137, + "grad_norm": 0.4734645839133741, + "learning_rate": 6.353684201502008e-08, + "loss": 0.0094, + "step": 4786 + }, + { + "epoch": 0.9508392094547622, + "grad_norm": 0.7133914033624615, + "learning_rate": 6.302655821968485e-08, + "loss": 0.0146, + "step": 4787 + }, + { + "epoch": 0.9510378389115106, + "grad_norm": 0.4600218420642334, + "learning_rate": 6.25183188201084e-08, + "loss": 0.0097, + "step": 4788 + }, + { + "epoch": 0.951236468368259, + "grad_norm": 0.38688423685277884, + "learning_rate": 6.201212402675072e-08, + "loss": 0.0084, + "step": 4789 + }, + { + "epoch": 0.9514350978250075, + "grad_norm": 0.566156909391181, + "learning_rate": 6.15079740492286e-08, + "loss": 0.0127, + "step": 4790 + }, + { + "epoch": 0.9516337272817559, + "grad_norm": 0.5610189586192836, + "learning_rate": 6.100586909630779e-08, + "loss": 0.0126, + "step": 4791 + }, + { + "epoch": 0.9518323567385043, + "grad_norm": 0.6857347036976907, + "learning_rate": 6.050580937591144e-08, + "loss": 0.0117, + "step": 4792 + }, + { + "epoch": 0.9520309861952527, + "grad_norm": 0.5458484629152233, + "learning_rate": 6.000779509511279e-08, + "loss": 0.0115, + "step": 4793 + }, + { + "epoch": 0.9522296156520011, + "grad_norm": 0.6424256763252679, + "learning_rate": 5.951182646013853e-08, + "loss": 0.0161, + "step": 4794 + }, + { + "epoch": 0.9524282451087497, + "grad_norm": 0.544918430900207, + "learning_rate": 5.901790367636995e-08, + "loss": 0.0138, + "step": 4795 + }, + { + "epoch": 0.9526268745654981, + "grad_norm": 0.7253601543796748, + "learning_rate": 5.8526026948338974e-08, + "loss": 0.0188, + "step": 4796 + }, + { + "epoch": 0.9528255040222465, + "grad_norm": 0.6596213294085469, + "learning_rate": 5.803619647973213e-08, + "loss": 0.0108, + "step": 4797 + }, + { + "epoch": 0.953024133478995, + "grad_norm": 0.3094292663332667, + "learning_rate": 5.754841247338716e-08, + "loss": 0.0075, + "step": 4798 + }, + { + "epoch": 0.9532227629357434, + "grad_norm": 0.2653176462450698, + "learning_rate": 5.706267513129527e-08, + "loss": 0.006, + "step": 4799 + }, + { + "epoch": 0.9534213923924918, + "grad_norm": 0.2862839957357303, + "learning_rate": 5.657898465459943e-08, + "loss": 0.007, + "step": 4800 + }, + { + "epoch": 0.9536200218492402, + "grad_norm": 0.46267765672416544, + "learning_rate": 5.609734124359556e-08, + "loss": 0.0115, + "step": 4801 + }, + { + "epoch": 0.9538186513059886, + "grad_norm": 1.3552174604785634, + "learning_rate": 5.5617745097731876e-08, + "loss": 0.0066, + "step": 4802 + }, + { + "epoch": 0.9540172807627371, + "grad_norm": 0.6673917782595586, + "learning_rate": 5.5140196415608414e-08, + "loss": 0.0128, + "step": 4803 + }, + { + "epoch": 0.9542159102194856, + "grad_norm": 0.6949616822551641, + "learning_rate": 5.466469539497809e-08, + "loss": 0.0158, + "step": 4804 + }, + { + "epoch": 0.954414539676234, + "grad_norm": 0.7623499887827077, + "learning_rate": 5.419124223274452e-08, + "loss": 0.0139, + "step": 4805 + }, + { + "epoch": 0.9546131691329824, + "grad_norm": 0.548628908422014, + "learning_rate": 5.371983712496476e-08, + "loss": 0.0122, + "step": 4806 + }, + { + "epoch": 0.9548117985897309, + "grad_norm": 0.8368126791005601, + "learning_rate": 5.325048026684765e-08, + "loss": 0.0156, + "step": 4807 + }, + { + "epoch": 0.9550104280464793, + "grad_norm": 0.5712358763963159, + "learning_rate": 5.278317185275217e-08, + "loss": 0.0143, + "step": 4808 + }, + { + "epoch": 0.9552090575032277, + "grad_norm": 0.33369125553839335, + "learning_rate": 5.23179120761913e-08, + "loss": 0.0091, + "step": 4809 + }, + { + "epoch": 0.9554076869599761, + "grad_norm": 0.5426548631897875, + "learning_rate": 5.185470112982816e-08, + "loss": 0.01, + "step": 4810 + }, + { + "epoch": 0.9556063164167246, + "grad_norm": 0.7736330728886897, + "learning_rate": 5.139353920547818e-08, + "loss": 0.0084, + "step": 4811 + }, + { + "epoch": 0.9558049458734731, + "grad_norm": 0.7726944536488665, + "learning_rate": 5.093442649410807e-08, + "loss": 0.0122, + "step": 4812 + }, + { + "epoch": 0.9560035753302215, + "grad_norm": 0.41025603735219146, + "learning_rate": 5.0477363185835736e-08, + "loss": 0.0082, + "step": 4813 + }, + { + "epoch": 0.9562022047869699, + "grad_norm": 0.42810031719834896, + "learning_rate": 5.0022349469930344e-08, + "loss": 0.0076, + "step": 4814 + }, + { + "epoch": 0.9564008342437184, + "grad_norm": 0.5312298899611213, + "learning_rate": 4.9569385534813386e-08, + "loss": 0.0091, + "step": 4815 + }, + { + "epoch": 0.9565994637004668, + "grad_norm": 0.5843424770418346, + "learning_rate": 4.911847156805649e-08, + "loss": 0.0058, + "step": 4816 + }, + { + "epoch": 0.9567980931572152, + "grad_norm": 0.627548785238714, + "learning_rate": 4.866960775638252e-08, + "loss": 0.0131, + "step": 4817 + }, + { + "epoch": 0.9569967226139636, + "grad_norm": 0.7105539318515306, + "learning_rate": 4.8222794285665006e-08, + "loss": 0.0183, + "step": 4818 + }, + { + "epoch": 0.957195352070712, + "grad_norm": 0.6002583148423645, + "learning_rate": 4.7778031340930397e-08, + "loss": 0.0107, + "step": 4819 + }, + { + "epoch": 0.9573939815274605, + "grad_norm": 0.6818174820977825, + "learning_rate": 4.7335319106353026e-08, + "loss": 0.0168, + "step": 4820 + }, + { + "epoch": 0.957592610984209, + "grad_norm": 0.3965043691901418, + "learning_rate": 4.689465776526125e-08, + "loss": 0.0083, + "step": 4821 + }, + { + "epoch": 0.9577912404409574, + "grad_norm": 0.6228622351159547, + "learning_rate": 4.645604750013078e-08, + "loss": 0.009, + "step": 4822 + }, + { + "epoch": 0.9579898698977058, + "grad_norm": 0.47410239295040185, + "learning_rate": 4.601948849259019e-08, + "loss": 0.014, + "step": 4823 + }, + { + "epoch": 0.9581884993544543, + "grad_norm": 1.0847330066440664, + "learning_rate": 4.558498092341879e-08, + "loss": 0.016, + "step": 4824 + }, + { + "epoch": 0.9583871288112027, + "grad_norm": 1.061431136868962, + "learning_rate": 4.515252497254541e-08, + "loss": 0.0147, + "step": 4825 + }, + { + "epoch": 0.9585857582679511, + "grad_norm": 0.4114241358058321, + "learning_rate": 4.4722120819049586e-08, + "loss": 0.0111, + "step": 4826 + }, + { + "epoch": 0.9587843877246995, + "grad_norm": 0.6247463058011158, + "learning_rate": 4.4293768641160416e-08, + "loss": 0.0099, + "step": 4827 + }, + { + "epoch": 0.958983017181448, + "grad_norm": 0.8021242243802347, + "learning_rate": 4.38674686162599e-08, + "loss": 0.0136, + "step": 4828 + }, + { + "epoch": 0.9591816466381965, + "grad_norm": 0.5276059971971032, + "learning_rate": 4.344322092087683e-08, + "loss": 0.0139, + "step": 4829 + }, + { + "epoch": 0.9593802760949449, + "grad_norm": 0.4836014882831527, + "learning_rate": 4.302102573069289e-08, + "loss": 0.0115, + "step": 4830 + }, + { + "epoch": 0.9595789055516933, + "grad_norm": 0.34443701008701366, + "learning_rate": 4.260088322053768e-08, + "loss": 0.0123, + "step": 4831 + }, + { + "epoch": 0.9597775350084418, + "grad_norm": 0.3859441891012454, + "learning_rate": 4.2182793564392034e-08, + "loss": 0.0092, + "step": 4832 + }, + { + "epoch": 0.9599761644651902, + "grad_norm": 0.5084520994177352, + "learning_rate": 4.176675693538745e-08, + "loss": 0.013, + "step": 4833 + }, + { + "epoch": 0.9601747939219386, + "grad_norm": 0.3918431048956989, + "learning_rate": 4.13527735058028e-08, + "loss": 0.0069, + "step": 4834 + }, + { + "epoch": 0.960373423378687, + "grad_norm": 0.5475933146987769, + "learning_rate": 4.094084344706928e-08, + "loss": 0.0131, + "step": 4835 + }, + { + "epoch": 0.9605720528354355, + "grad_norm": 0.8255199408065383, + "learning_rate": 4.053096692976655e-08, + "loss": 0.0127, + "step": 4836 + }, + { + "epoch": 0.960770682292184, + "grad_norm": 0.9276148823282381, + "learning_rate": 4.012314412362328e-08, + "loss": 0.0119, + "step": 4837 + }, + { + "epoch": 0.9609693117489324, + "grad_norm": 0.5402494802195912, + "learning_rate": 3.971737519751939e-08, + "loss": 0.0083, + "step": 4838 + }, + { + "epoch": 0.9611679412056808, + "grad_norm": 0.35876195272358025, + "learning_rate": 3.9313660319483246e-08, + "loss": 0.0085, + "step": 4839 + }, + { + "epoch": 0.9613665706624293, + "grad_norm": 0.42863726316703926, + "learning_rate": 3.8911999656692787e-08, + "loss": 0.0053, + "step": 4840 + }, + { + "epoch": 0.9615652001191777, + "grad_norm": 0.43862831859121126, + "learning_rate": 3.851239337547441e-08, + "loss": 0.0099, + "step": 4841 + }, + { + "epoch": 0.9617638295759261, + "grad_norm": 0.5380419062452532, + "learning_rate": 3.8114841641305744e-08, + "loss": 0.0102, + "step": 4842 + }, + { + "epoch": 0.9619624590326745, + "grad_norm": 0.3862160575949929, + "learning_rate": 3.7719344618812326e-08, + "loss": 0.0121, + "step": 4843 + }, + { + "epoch": 0.9621610884894229, + "grad_norm": 0.4397239298419148, + "learning_rate": 3.7325902471768706e-08, + "loss": 0.0095, + "step": 4844 + }, + { + "epoch": 0.9623597179461714, + "grad_norm": 0.8870761334367195, + "learning_rate": 3.693451536309955e-08, + "loss": 0.018, + "step": 4845 + }, + { + "epoch": 0.9625583474029199, + "grad_norm": 0.7392267280358624, + "learning_rate": 3.65451834548769e-08, + "loss": 0.0132, + "step": 4846 + }, + { + "epoch": 0.9627569768596683, + "grad_norm": 0.9086319168545279, + "learning_rate": 3.6157906908323995e-08, + "loss": 0.0112, + "step": 4847 + }, + { + "epoch": 0.9629556063164167, + "grad_norm": 0.5583448293826135, + "learning_rate": 3.5772685883809775e-08, + "loss": 0.008, + "step": 4848 + }, + { + "epoch": 0.9631542357731652, + "grad_norm": 0.5863970387552007, + "learning_rate": 3.5389520540856094e-08, + "loss": 0.0062, + "step": 4849 + }, + { + "epoch": 0.9633528652299136, + "grad_norm": 0.25369702479905915, + "learning_rate": 3.500841103812991e-08, + "loss": 0.006, + "step": 4850 + }, + { + "epoch": 0.963551494686662, + "grad_norm": 0.7699812014005178, + "learning_rate": 3.462935753344832e-08, + "loss": 0.0124, + "step": 4851 + }, + { + "epoch": 0.9637501241434104, + "grad_norm": 0.39448570416002493, + "learning_rate": 3.4252360183777976e-08, + "loss": 0.0068, + "step": 4852 + }, + { + "epoch": 0.9639487536001589, + "grad_norm": 0.37419322323165516, + "learning_rate": 3.38774191452318e-08, + "loss": 0.0063, + "step": 4853 + }, + { + "epoch": 0.9641473830569074, + "grad_norm": 0.6355343515700734, + "learning_rate": 3.350453457307335e-08, + "loss": 0.0141, + "step": 4854 + }, + { + "epoch": 0.9643460125136558, + "grad_norm": 0.8356043327709957, + "learning_rate": 3.313370662171411e-08, + "loss": 0.0197, + "step": 4855 + }, + { + "epoch": 0.9645446419704042, + "grad_norm": 0.88373860293526, + "learning_rate": 3.276493544471237e-08, + "loss": 0.0137, + "step": 4856 + }, + { + "epoch": 0.9647432714271527, + "grad_norm": 0.43188108347987786, + "learning_rate": 3.239822119477709e-08, + "loss": 0.0099, + "step": 4857 + }, + { + "epoch": 0.9649419008839011, + "grad_norm": 0.2922777433644811, + "learning_rate": 3.2033564023762895e-08, + "loss": 0.0076, + "step": 4858 + }, + { + "epoch": 0.9651405303406495, + "grad_norm": 0.41982816990822347, + "learning_rate": 3.167096408267567e-08, + "loss": 0.0064, + "step": 4859 + }, + { + "epoch": 0.9653391597973979, + "grad_norm": 0.3912328421574355, + "learning_rate": 3.131042152166641e-08, + "loss": 0.0078, + "step": 4860 + }, + { + "epoch": 0.9655377892541464, + "grad_norm": 0.49125451640633777, + "learning_rate": 3.0951936490035696e-08, + "loss": 0.0071, + "step": 4861 + }, + { + "epoch": 0.9657364187108948, + "grad_norm": 0.38541352755974334, + "learning_rate": 3.059550913623199e-08, + "loss": 0.0106, + "step": 4862 + }, + { + "epoch": 0.9659350481676433, + "grad_norm": 0.34748274835246395, + "learning_rate": 3.024113960785169e-08, + "loss": 0.0068, + "step": 4863 + }, + { + "epoch": 0.9661336776243917, + "grad_norm": 0.33066145540841996, + "learning_rate": 2.9888828051638505e-08, + "loss": 0.0088, + "step": 4864 + }, + { + "epoch": 0.9663323070811402, + "grad_norm": 0.21097836166400444, + "learning_rate": 2.9538574613484084e-08, + "loss": 0.0058, + "step": 4865 + }, + { + "epoch": 0.9665309365378886, + "grad_norm": 0.36457461980716876, + "learning_rate": 2.919037943842906e-08, + "loss": 0.0062, + "step": 4866 + }, + { + "epoch": 0.966729565994637, + "grad_norm": 0.42668003224731427, + "learning_rate": 2.884424267065922e-08, + "loss": 0.0083, + "step": 4867 + }, + { + "epoch": 0.9669281954513854, + "grad_norm": 0.8275980493196213, + "learning_rate": 2.8500164453511002e-08, + "loss": 0.019, + "step": 4868 + }, + { + "epoch": 0.9671268249081338, + "grad_norm": 0.3290524233842703, + "learning_rate": 2.8158144929466e-08, + "loss": 0.0066, + "step": 4869 + }, + { + "epoch": 0.9673254543648823, + "grad_norm": 0.372833561243726, + "learning_rate": 2.78181842401537e-08, + "loss": 0.0045, + "step": 4870 + }, + { + "epoch": 0.9675240838216308, + "grad_norm": 0.4524320356338434, + "learning_rate": 2.74802825263526e-08, + "loss": 0.0061, + "step": 4871 + }, + { + "epoch": 0.9677227132783792, + "grad_norm": 0.5396272827787818, + "learning_rate": 2.714443992798632e-08, + "loss": 0.0114, + "step": 4872 + }, + { + "epoch": 0.9679213427351276, + "grad_norm": 0.71897085626895, + "learning_rate": 2.681065658412807e-08, + "loss": 0.0149, + "step": 4873 + }, + { + "epoch": 0.9681199721918761, + "grad_norm": 0.3973267868046733, + "learning_rate": 2.6478932632996724e-08, + "loss": 0.0071, + "step": 4874 + }, + { + "epoch": 0.9683186016486245, + "grad_norm": 0.7077697157290327, + "learning_rate": 2.6149268211957955e-08, + "loss": 0.0185, + "step": 4875 + }, + { + "epoch": 0.9685172311053729, + "grad_norm": 0.5366383773695785, + "learning_rate": 2.5821663457527013e-08, + "loss": 0.0115, + "step": 4876 + }, + { + "epoch": 0.9687158605621213, + "grad_norm": 0.587638684038049, + "learning_rate": 2.549611850536371e-08, + "loss": 0.0098, + "step": 4877 + }, + { + "epoch": 0.9689144900188698, + "grad_norm": 0.5064559833862212, + "learning_rate": 2.517263349027632e-08, + "loss": 0.0114, + "step": 4878 + }, + { + "epoch": 0.9691131194756182, + "grad_norm": 0.677575640563524, + "learning_rate": 2.485120854621992e-08, + "loss": 0.0112, + "step": 4879 + }, + { + "epoch": 0.9693117489323667, + "grad_norm": 0.5454976845516769, + "learning_rate": 2.4531843806294696e-08, + "loss": 0.0071, + "step": 4880 + }, + { + "epoch": 0.9695103783891151, + "grad_norm": 0.5457073272241642, + "learning_rate": 2.4214539402751534e-08, + "loss": 0.0078, + "step": 4881 + }, + { + "epoch": 0.9697090078458636, + "grad_norm": 0.4264160073509605, + "learning_rate": 2.3899295466983663e-08, + "loss": 0.0122, + "step": 4882 + }, + { + "epoch": 0.969907637302612, + "grad_norm": 0.4258970818620188, + "learning_rate": 2.3586112129534988e-08, + "loss": 0.0112, + "step": 4883 + }, + { + "epoch": 0.9701062667593604, + "grad_norm": 0.6087695074036127, + "learning_rate": 2.3274989520093994e-08, + "loss": 0.0109, + "step": 4884 + }, + { + "epoch": 0.9703048962161088, + "grad_norm": 0.857276501396098, + "learning_rate": 2.29659277674954e-08, + "loss": 0.0159, + "step": 4885 + }, + { + "epoch": 0.9705035256728572, + "grad_norm": 0.30379709830289564, + "learning_rate": 2.2658926999722386e-08, + "loss": 0.005, + "step": 4886 + }, + { + "epoch": 0.9707021551296057, + "grad_norm": 0.44475948970448625, + "learning_rate": 2.2353987343902704e-08, + "loss": 0.0083, + "step": 4887 + }, + { + "epoch": 0.9709007845863542, + "grad_norm": 0.5243677778649878, + "learning_rate": 2.2051108926313125e-08, + "loss": 0.0063, + "step": 4888 + }, + { + "epoch": 0.9710994140431026, + "grad_norm": 0.6869080614393989, + "learning_rate": 2.175029187237332e-08, + "loss": 0.0101, + "step": 4889 + }, + { + "epoch": 0.971298043499851, + "grad_norm": 0.41495749218023437, + "learning_rate": 2.1451536306653088e-08, + "loss": 0.0096, + "step": 4890 + }, + { + "epoch": 0.9714966729565995, + "grad_norm": 0.44501375126763126, + "learning_rate": 2.1154842352865134e-08, + "loss": 0.0076, + "step": 4891 + }, + { + "epoch": 0.9716953024133479, + "grad_norm": 0.48307549732523714, + "learning_rate": 2.0860210133871738e-08, + "loss": 0.0098, + "step": 4892 + }, + { + "epoch": 0.9718939318700963, + "grad_norm": 0.3302165574725224, + "learning_rate": 2.0567639771679192e-08, + "loss": 0.006, + "step": 4893 + }, + { + "epoch": 0.9720925613268447, + "grad_norm": 0.6477082027663088, + "learning_rate": 2.027713138744003e-08, + "loss": 0.0128, + "step": 4894 + }, + { + "epoch": 0.9722911907835932, + "grad_norm": 0.6335639722740967, + "learning_rate": 1.998868510145413e-08, + "loss": 0.0092, + "step": 4895 + }, + { + "epoch": 0.9724898202403417, + "grad_norm": 0.7004054321677288, + "learning_rate": 1.9702301033166505e-08, + "loss": 0.0134, + "step": 4896 + }, + { + "epoch": 0.9726884496970901, + "grad_norm": 0.46136836622282223, + "learning_rate": 1.9417979301168956e-08, + "loss": 0.0084, + "step": 4897 + }, + { + "epoch": 0.9728870791538385, + "grad_norm": 0.4510335143166035, + "learning_rate": 1.9135720023197857e-08, + "loss": 0.0092, + "step": 4898 + }, + { + "epoch": 0.973085708610587, + "grad_norm": 0.2875052305885156, + "learning_rate": 1.8855523316137492e-08, + "loss": 0.0056, + "step": 4899 + }, + { + "epoch": 0.9732843380673354, + "grad_norm": 0.4981975875701382, + "learning_rate": 1.8577389296016713e-08, + "loss": 0.0111, + "step": 4900 + }, + { + "epoch": 0.9734829675240838, + "grad_norm": 0.6442750654417164, + "learning_rate": 1.830131807801061e-08, + "loss": 0.0131, + "step": 4901 + }, + { + "epoch": 0.9736815969808322, + "grad_norm": 0.4873509567237911, + "learning_rate": 1.802730977643996e-08, + "loss": 0.0079, + "step": 4902 + }, + { + "epoch": 0.9738802264375807, + "grad_norm": 0.35940737168787174, + "learning_rate": 1.7755364504771222e-08, + "loss": 0.009, + "step": 4903 + }, + { + "epoch": 0.9740788558943291, + "grad_norm": 0.6049711416734347, + "learning_rate": 1.7485482375616534e-08, + "loss": 0.0115, + "step": 4904 + }, + { + "epoch": 0.9742774853510776, + "grad_norm": 0.540096719001103, + "learning_rate": 1.721766350073373e-08, + "loss": 0.0085, + "step": 4905 + }, + { + "epoch": 0.974476114807826, + "grad_norm": 0.5403139332223295, + "learning_rate": 1.6951907991026863e-08, + "loss": 0.0104, + "step": 4906 + }, + { + "epoch": 0.9746747442645745, + "grad_norm": 0.3412169774660418, + "learning_rate": 1.6688215956545128e-08, + "loss": 0.0081, + "step": 4907 + }, + { + "epoch": 0.9748733737213229, + "grad_norm": 0.30705865819778627, + "learning_rate": 1.6426587506482295e-08, + "loss": 0.0102, + "step": 4908 + }, + { + "epoch": 0.9750720031780713, + "grad_norm": 0.46701451128719806, + "learning_rate": 1.616702274917892e-08, + "loss": 0.0101, + "step": 4909 + }, + { + "epoch": 0.9752706326348197, + "grad_norm": 0.39809981599821737, + "learning_rate": 1.590952179212013e-08, + "loss": 0.0109, + "step": 4910 + }, + { + "epoch": 0.9754692620915681, + "grad_norm": 0.2555686289992959, + "learning_rate": 1.565408474193786e-08, + "loss": 0.0054, + "step": 4911 + }, + { + "epoch": 0.9756678915483166, + "grad_norm": 0.24199181404380177, + "learning_rate": 1.540071170440749e-08, + "loss": 0.0096, + "step": 4912 + }, + { + "epoch": 0.9758665210050651, + "grad_norm": 0.5796493821986631, + "learning_rate": 1.514940278445065e-08, + "loss": 0.0074, + "step": 4913 + }, + { + "epoch": 0.9760651504618135, + "grad_norm": 0.5748057498768764, + "learning_rate": 1.4900158086134097e-08, + "loss": 0.0154, + "step": 4914 + }, + { + "epoch": 0.976263779918562, + "grad_norm": 0.33859494804875856, + "learning_rate": 1.4652977712669714e-08, + "loss": 0.0057, + "step": 4915 + }, + { + "epoch": 0.9764624093753104, + "grad_norm": 0.4017728258831618, + "learning_rate": 1.4407861766415066e-08, + "loss": 0.0132, + "step": 4916 + }, + { + "epoch": 0.9766610388320588, + "grad_norm": 0.6787171557695976, + "learning_rate": 1.4164810348871739e-08, + "loss": 0.0118, + "step": 4917 + }, + { + "epoch": 0.9768596682888072, + "grad_norm": 0.6909520206503219, + "learning_rate": 1.392382356068811e-08, + "loss": 0.0164, + "step": 4918 + }, + { + "epoch": 0.9770582977455556, + "grad_norm": 0.5243266524129274, + "learning_rate": 1.3684901501655468e-08, + "loss": 0.0102, + "step": 4919 + }, + { + "epoch": 0.9772569272023041, + "grad_norm": 0.5013715407112671, + "learning_rate": 1.344804427071189e-08, + "loss": 0.0145, + "step": 4920 + }, + { + "epoch": 0.9774555566590525, + "grad_norm": 0.6104887396616863, + "learning_rate": 1.3213251965939478e-08, + "loss": 0.0094, + "step": 4921 + }, + { + "epoch": 0.977654186115801, + "grad_norm": 12.92537382124017, + "learning_rate": 1.2980524684565455e-08, + "loss": 0.0142, + "step": 4922 + }, + { + "epoch": 0.9778528155725494, + "grad_norm": 0.4169042250958797, + "learning_rate": 1.274986252296273e-08, + "loss": 0.0092, + "step": 4923 + }, + { + "epoch": 0.9780514450292979, + "grad_norm": 0.8118130101634882, + "learning_rate": 1.2521265576646569e-08, + "loss": 0.015, + "step": 4924 + }, + { + "epoch": 0.9782500744860463, + "grad_norm": 0.6249056291594971, + "learning_rate": 1.2294733940280135e-08, + "loss": 0.011, + "step": 4925 + }, + { + "epoch": 0.9784487039427947, + "grad_norm": 0.39444979264014, + "learning_rate": 1.2070267707670058e-08, + "loss": 0.0086, + "step": 4926 + }, + { + "epoch": 0.9786473333995431, + "grad_norm": 1.2105651014944392, + "learning_rate": 1.184786697176643e-08, + "loss": 0.0201, + "step": 4927 + }, + { + "epoch": 0.9788459628562916, + "grad_norm": 0.6828183839448554, + "learning_rate": 1.1627531824666138e-08, + "loss": 0.0137, + "step": 4928 + }, + { + "epoch": 0.97904459231304, + "grad_norm": 0.7000845033148919, + "learning_rate": 1.1409262357609529e-08, + "loss": 0.012, + "step": 4929 + }, + { + "epoch": 0.9792432217697885, + "grad_norm": 0.44350465099943603, + "learning_rate": 1.1193058660980971e-08, + "loss": 0.0115, + "step": 4930 + }, + { + "epoch": 0.9794418512265369, + "grad_norm": 0.7669306721956611, + "learning_rate": 1.0978920824311622e-08, + "loss": 0.0188, + "step": 4931 + }, + { + "epoch": 0.9796404806832854, + "grad_norm": 0.4653719983059989, + "learning_rate": 1.0766848936274998e-08, + "loss": 0.0079, + "step": 4932 + }, + { + "epoch": 0.9798391101400338, + "grad_norm": 0.3330343266105515, + "learning_rate": 1.0556843084689738e-08, + "loss": 0.0084, + "step": 4933 + }, + { + "epoch": 0.9800377395967822, + "grad_norm": 0.4031244671758501, + "learning_rate": 1.0348903356519057e-08, + "loss": 0.0088, + "step": 4934 + }, + { + "epoch": 0.9802363690535306, + "grad_norm": 0.47173933428617837, + "learning_rate": 1.0143029837870744e-08, + "loss": 0.0117, + "step": 4935 + }, + { + "epoch": 0.980434998510279, + "grad_norm": 0.5623629301817437, + "learning_rate": 9.939222613997157e-09, + "loss": 0.0088, + "step": 4936 + }, + { + "epoch": 0.9806336279670275, + "grad_norm": 0.6962143816147889, + "learning_rate": 9.737481769293566e-09, + "loss": 0.0147, + "step": 4937 + }, + { + "epoch": 0.980832257423776, + "grad_norm": 0.456040337121184, + "learning_rate": 9.537807387302034e-09, + "loss": 0.0093, + "step": 4938 + }, + { + "epoch": 0.9810308868805244, + "grad_norm": 0.4400816228735132, + "learning_rate": 9.340199550706974e-09, + "loss": 0.0062, + "step": 4939 + }, + { + "epoch": 0.9812295163372728, + "grad_norm": 0.7076270413554725, + "learning_rate": 9.144658341337375e-09, + "loss": 0.014, + "step": 4940 + }, + { + "epoch": 0.9814281457940213, + "grad_norm": 0.8596549261688607, + "learning_rate": 8.9511838401668e-09, + "loss": 0.0221, + "step": 4941 + }, + { + "epoch": 0.9816267752507697, + "grad_norm": 0.3642453441116622, + "learning_rate": 8.75977612731227e-09, + "loss": 0.0109, + "step": 4942 + }, + { + "epoch": 0.9818254047075181, + "grad_norm": 0.38678000727812256, + "learning_rate": 8.570435282037048e-09, + "loss": 0.0095, + "step": 4943 + }, + { + "epoch": 0.9820240341642665, + "grad_norm": 0.5698335800750048, + "learning_rate": 8.383161382745087e-09, + "loss": 0.0101, + "step": 4944 + }, + { + "epoch": 0.982222663621015, + "grad_norm": 0.46124466080444443, + "learning_rate": 8.197954506988237e-09, + "loss": 0.0057, + "step": 4945 + }, + { + "epoch": 0.9824212930777634, + "grad_norm": 0.3552672963335277, + "learning_rate": 8.014814731458487e-09, + "loss": 0.0059, + "step": 4946 + }, + { + "epoch": 0.9826199225345119, + "grad_norm": 0.42903428026043167, + "learning_rate": 7.833742131995725e-09, + "loss": 0.0077, + "step": 4947 + }, + { + "epoch": 0.9828185519912603, + "grad_norm": 0.5963106438933184, + "learning_rate": 7.65473678358053e-09, + "loss": 0.0132, + "step": 4948 + }, + { + "epoch": 0.9830171814480088, + "grad_norm": 0.6044799566434823, + "learning_rate": 7.477798760339717e-09, + "loss": 0.0083, + "step": 4949 + }, + { + "epoch": 0.9832158109047572, + "grad_norm": 0.25409019391898363, + "learning_rate": 7.302928135542453e-09, + "loss": 0.0096, + "step": 4950 + }, + { + "epoch": 0.9834144403615056, + "grad_norm": 0.4559853535199204, + "learning_rate": 7.130124981603037e-09, + "loss": 0.0154, + "step": 4951 + }, + { + "epoch": 0.983613069818254, + "grad_norm": 0.5446426719314116, + "learning_rate": 6.959389370079228e-09, + "loss": 0.0084, + "step": 4952 + }, + { + "epoch": 0.9838116992750024, + "grad_norm": 0.4002074352633331, + "learning_rate": 6.7907213716716936e-09, + "loss": 0.0134, + "step": 4953 + }, + { + "epoch": 0.9840103287317509, + "grad_norm": 0.5482538576918561, + "learning_rate": 6.624121056225674e-09, + "loss": 0.0101, + "step": 4954 + }, + { + "epoch": 0.9842089581884994, + "grad_norm": 0.2548625973854045, + "learning_rate": 6.459588492731539e-09, + "loss": 0.0071, + "step": 4955 + }, + { + "epoch": 0.9844075876452478, + "grad_norm": 0.7503639525229563, + "learning_rate": 6.297123749320344e-09, + "loss": 0.0075, + "step": 4956 + }, + { + "epoch": 0.9846062171019963, + "grad_norm": 0.4290624194383481, + "learning_rate": 6.13672689326994e-09, + "loss": 0.0142, + "step": 4957 + }, + { + "epoch": 0.9848048465587447, + "grad_norm": 0.47732768665483954, + "learning_rate": 5.978397990999973e-09, + "loss": 0.0084, + "step": 4958 + }, + { + "epoch": 0.9850034760154931, + "grad_norm": 0.6430141219587137, + "learning_rate": 5.822137108074111e-09, + "loss": 0.0122, + "step": 4959 + }, + { + "epoch": 0.9852021054722415, + "grad_norm": 0.4601266343624083, + "learning_rate": 5.6679443092000354e-09, + "loss": 0.0111, + "step": 4960 + }, + { + "epoch": 0.9854007349289899, + "grad_norm": 0.8545753873794333, + "learning_rate": 5.515819658228339e-09, + "loss": 0.0072, + "step": 4961 + }, + { + "epoch": 0.9855993643857384, + "grad_norm": 0.5894818362611455, + "learning_rate": 5.3657632181547405e-09, + "loss": 0.0107, + "step": 4962 + }, + { + "epoch": 0.9857979938424868, + "grad_norm": 0.6638553478242332, + "learning_rate": 5.217775051116203e-09, + "loss": 0.0135, + "step": 4963 + }, + { + "epoch": 0.9859966232992353, + "grad_norm": 0.28446784639161005, + "learning_rate": 5.071855218395927e-09, + "loss": 0.0057, + "step": 4964 + }, + { + "epoch": 0.9861952527559837, + "grad_norm": 0.5910524592127953, + "learning_rate": 4.9280037804178e-09, + "loss": 0.0144, + "step": 4965 + }, + { + "epoch": 0.9863938822127322, + "grad_norm": 0.6211297902663819, + "learning_rate": 4.78622079675084e-09, + "loss": 0.0109, + "step": 4966 + }, + { + "epoch": 0.9865925116694806, + "grad_norm": 0.5622570313549148, + "learning_rate": 4.64650632610808e-09, + "loss": 0.0115, + "step": 4967 + }, + { + "epoch": 0.986791141126229, + "grad_norm": 0.7851311388372668, + "learning_rate": 4.508860426344353e-09, + "loss": 0.0108, + "step": 4968 + }, + { + "epoch": 0.9869897705829774, + "grad_norm": 0.9651707733360702, + "learning_rate": 4.3732831544590625e-09, + "loss": 0.0158, + "step": 4969 + }, + { + "epoch": 0.9871884000397259, + "grad_norm": 1.3074665840528905, + "learning_rate": 4.239774566594523e-09, + "loss": 0.009, + "step": 4970 + }, + { + "epoch": 0.9873870294964743, + "grad_norm": 0.6168920501208438, + "learning_rate": 4.1083347180359555e-09, + "loss": 0.0158, + "step": 4971 + }, + { + "epoch": 0.9875856589532228, + "grad_norm": 0.5709471615250453, + "learning_rate": 3.9789636632131536e-09, + "loss": 0.0088, + "step": 4972 + }, + { + "epoch": 0.9877842884099712, + "grad_norm": 0.4726577740179389, + "learning_rate": 3.851661455698819e-09, + "loss": 0.0092, + "step": 4973 + }, + { + "epoch": 0.9879829178667197, + "grad_norm": 0.34416204276241585, + "learning_rate": 3.726428148208006e-09, + "loss": 0.005, + "step": 4974 + }, + { + "epoch": 0.9881815473234681, + "grad_norm": 0.5945886172045202, + "learning_rate": 3.6032637925997873e-09, + "loss": 0.0175, + "step": 4975 + }, + { + "epoch": 0.9883801767802165, + "grad_norm": 0.6385884578112276, + "learning_rate": 3.4821684398766987e-09, + "loss": 0.0105, + "step": 4976 + }, + { + "epoch": 0.9885788062369649, + "grad_norm": 0.4104801504398456, + "learning_rate": 3.3631421401836284e-09, + "loss": 0.0114, + "step": 4977 + }, + { + "epoch": 0.9887774356937133, + "grad_norm": 0.31068946253955954, + "learning_rate": 3.2461849428094827e-09, + "loss": 0.006, + "step": 4978 + }, + { + "epoch": 0.9889760651504618, + "grad_norm": 0.6688702368786089, + "learning_rate": 3.131296896187186e-09, + "loss": 0.0101, + "step": 4979 + }, + { + "epoch": 0.9891746946072103, + "grad_norm": 0.4449185151964188, + "learning_rate": 3.0184780478897947e-09, + "loss": 0.0123, + "step": 4980 + }, + { + "epoch": 0.9893733240639587, + "grad_norm": 0.5828177516464593, + "learning_rate": 2.907728444637159e-09, + "loss": 0.0127, + "step": 4981 + }, + { + "epoch": 0.9895719535207071, + "grad_norm": 0.4850738539372369, + "learning_rate": 2.7990481322898166e-09, + "loss": 0.0141, + "step": 4982 + }, + { + "epoch": 0.9897705829774556, + "grad_norm": 0.46338314092768906, + "learning_rate": 2.6924371558523233e-09, + "loss": 0.0139, + "step": 4983 + }, + { + "epoch": 0.989969212434204, + "grad_norm": 0.4030592860660505, + "learning_rate": 2.5878955594726974e-09, + "loss": 0.0117, + "step": 4984 + }, + { + "epoch": 0.9901678418909524, + "grad_norm": 1.0961667604547174, + "learning_rate": 2.4854233864402e-09, + "loss": 0.0202, + "step": 4985 + }, + { + "epoch": 0.9903664713477008, + "grad_norm": 0.4721411563769088, + "learning_rate": 2.3850206791897756e-09, + "loss": 0.0113, + "step": 4986 + }, + { + "epoch": 0.9905651008044493, + "grad_norm": 0.4833604924531587, + "learning_rate": 2.286687479297056e-09, + "loss": 0.0095, + "step": 4987 + }, + { + "epoch": 0.9907637302611977, + "grad_norm": 0.5752978520181773, + "learning_rate": 2.1904238274828016e-09, + "loss": 0.0097, + "step": 4988 + }, + { + "epoch": 0.9909623597179462, + "grad_norm": 0.39596085840021944, + "learning_rate": 2.0962297636084593e-09, + "loss": 0.0083, + "step": 4989 + }, + { + "epoch": 0.9911609891746946, + "grad_norm": 0.6351520216922857, + "learning_rate": 2.0041053266806054e-09, + "loss": 0.0117, + "step": 4990 + }, + { + "epoch": 0.9913596186314431, + "grad_norm": 0.537475588519044, + "learning_rate": 1.9140505548476128e-09, + "loss": 0.0079, + "step": 4991 + }, + { + "epoch": 0.9915582480881915, + "grad_norm": 0.4082058783031262, + "learning_rate": 1.8260654854013182e-09, + "loss": 0.0097, + "step": 4992 + }, + { + "epoch": 0.9917568775449399, + "grad_norm": 1.0157688286043665, + "learning_rate": 1.7401501547759104e-09, + "loss": 0.021, + "step": 4993 + }, + { + "epoch": 0.9919555070016883, + "grad_norm": 0.3634022299259286, + "learning_rate": 1.6563045985490412e-09, + "loss": 0.0107, + "step": 4994 + }, + { + "epoch": 0.9921541364584368, + "grad_norm": 0.6952556762857542, + "learning_rate": 1.5745288514407153e-09, + "loss": 0.0147, + "step": 4995 + }, + { + "epoch": 0.9923527659151852, + "grad_norm": 0.5786296634418221, + "learning_rate": 1.4948229473144005e-09, + "loss": 0.0117, + "step": 4996 + }, + { + "epoch": 0.9925513953719337, + "grad_norm": 0.6356775912475472, + "learning_rate": 1.417186919176472e-09, + "loss": 0.0151, + "step": 4997 + }, + { + "epoch": 0.9927500248286821, + "grad_norm": 0.5692096365130425, + "learning_rate": 1.341620799175658e-09, + "loss": 0.0076, + "step": 4998 + }, + { + "epoch": 0.9929486542854306, + "grad_norm": 0.8015252962881247, + "learning_rate": 1.2681246186035945e-09, + "loss": 0.0166, + "step": 4999 + }, + { + "epoch": 0.993147283742179, + "grad_norm": 0.5276135206707997, + "learning_rate": 1.1966984078959354e-09, + "loss": 0.0087, + "step": 5000 + }, + { + "epoch": 0.9933459131989274, + "grad_norm": 0.4908418867095688, + "learning_rate": 1.1273421966290221e-09, + "loss": 0.0094, + "step": 5001 + }, + { + "epoch": 0.9935445426556758, + "grad_norm": 0.9923607226693753, + "learning_rate": 1.0600560135237691e-09, + "loss": 0.0145, + "step": 5002 + }, + { + "epoch": 0.9937431721124242, + "grad_norm": 0.2731344484135086, + "learning_rate": 9.948398864434439e-10, + "loss": 0.0068, + "step": 5003 + }, + { + "epoch": 0.9939418015691727, + "grad_norm": 0.7287487195663518, + "learning_rate": 9.316938423936662e-10, + "loss": 0.0092, + "step": 5004 + }, + { + "epoch": 0.9941404310259211, + "grad_norm": 0.505240386716718, + "learning_rate": 8.706179075229637e-10, + "loss": 0.0117, + "step": 5005 + }, + { + "epoch": 0.9943390604826696, + "grad_norm": 0.5101453671929035, + "learning_rate": 8.116121071238825e-10, + "loss": 0.0136, + "step": 5006 + }, + { + "epoch": 0.994537689939418, + "grad_norm": 0.4807315646835647, + "learning_rate": 7.546764656291005e-10, + "loss": 0.0086, + "step": 5007 + }, + { + "epoch": 0.9947363193961665, + "grad_norm": 0.4595127871163573, + "learning_rate": 6.998110066169794e-10, + "loss": 0.0083, + "step": 5008 + }, + { + "epoch": 0.9949349488529149, + "grad_norm": 0.5817190396655636, + "learning_rate": 6.470157528065679e-10, + "loss": 0.0101, + "step": 5009 + }, + { + "epoch": 0.9951335783096633, + "grad_norm": 0.3676483776227927, + "learning_rate": 5.962907260603779e-10, + "loss": 0.0088, + "step": 5010 + }, + { + "epoch": 0.9953322077664117, + "grad_norm": 0.5527914548170548, + "learning_rate": 5.476359473838289e-10, + "loss": 0.0089, + "step": 5011 + }, + { + "epoch": 0.9955308372231602, + "grad_norm": 0.47874706302938747, + "learning_rate": 5.010514369246933e-10, + "loss": 0.0131, + "step": 5012 + }, + { + "epoch": 0.9957294666799086, + "grad_norm": 0.4328300045576964, + "learning_rate": 4.565372139730961e-10, + "loss": 0.0162, + "step": 5013 + }, + { + "epoch": 0.9959280961366571, + "grad_norm": 0.45665840240784356, + "learning_rate": 4.140932969631806e-10, + "loss": 0.0085, + "step": 5014 + }, + { + "epoch": 0.9961267255934055, + "grad_norm": 0.9001422570931706, + "learning_rate": 3.737197034703322e-10, + "loss": 0.0126, + "step": 5015 + }, + { + "epoch": 0.996325355050154, + "grad_norm": 0.45807153798940137, + "learning_rate": 3.3541645021339944e-10, + "loss": 0.0105, + "step": 5016 + }, + { + "epoch": 0.9965239845069024, + "grad_norm": 0.6282268630028757, + "learning_rate": 2.991835530535836e-10, + "loss": 0.0092, + "step": 5017 + }, + { + "epoch": 0.9967226139636508, + "grad_norm": 0.42023530746133536, + "learning_rate": 2.650210269955489e-10, + "loss": 0.01, + "step": 5018 + }, + { + "epoch": 0.9969212434203992, + "grad_norm": 0.40516998628725354, + "learning_rate": 2.3292888618520195e-10, + "loss": 0.006, + "step": 5019 + }, + { + "epoch": 0.9971198728771477, + "grad_norm": 0.5106870738378689, + "learning_rate": 2.0290714391191235e-10, + "loss": 0.0135, + "step": 5020 + }, + { + "epoch": 0.9973185023338961, + "grad_norm": 0.5702272196550405, + "learning_rate": 1.7495581260795758e-10, + "loss": 0.0084, + "step": 5021 + }, + { + "epoch": 0.9975171317906446, + "grad_norm": 0.3574001167310024, + "learning_rate": 1.4907490384796774e-10, + "loss": 0.0089, + "step": 5022 + }, + { + "epoch": 0.997715761247393, + "grad_norm": 0.6482126256169631, + "learning_rate": 1.252644283489257e-10, + "loss": 0.009, + "step": 5023 + }, + { + "epoch": 0.9979143907041415, + "grad_norm": 0.40764995367026596, + "learning_rate": 1.0352439597072217e-10, + "loss": 0.0123, + "step": 5024 + }, + { + "epoch": 0.9981130201608899, + "grad_norm": 0.464525483618827, + "learning_rate": 8.385481571615561e-11, + "loss": 0.0124, + "step": 5025 + }, + { + "epoch": 0.9983116496176383, + "grad_norm": 0.49823901728048225, + "learning_rate": 6.625569573037727e-11, + "loss": 0.0176, + "step": 5026 + }, + { + "epoch": 0.9985102790743867, + "grad_norm": 0.5896583089333627, + "learning_rate": 5.0727043301446175e-11, + "loss": 0.0129, + "step": 5027 + }, + { + "epoch": 0.9987089085311351, + "grad_norm": 0.37718748502681293, + "learning_rate": 3.726886485866388e-11, + "loss": 0.0133, + "step": 5028 + }, + { + "epoch": 0.9989075379878836, + "grad_norm": 0.5830988240086593, + "learning_rate": 2.5881165976460178e-11, + "loss": 0.0161, + "step": 5029 + }, + { + "epoch": 0.999106167444632, + "grad_norm": 0.6251502587104545, + "learning_rate": 1.6563951368842034e-11, + "loss": 0.0125, + "step": 5030 + }, + { + "epoch": 0.9993047969013805, + "grad_norm": 0.5850394153814858, + "learning_rate": 9.317224895499799e-12, + "loss": 0.013, + "step": 5031 + }, + { + "epoch": 0.9995034263581289, + "grad_norm": 0.5740375396508653, + "learning_rate": 4.1409895568111924e-12, + "loss": 0.0112, + "step": 5032 + }, + { + "epoch": 0.9997020558148774, + "grad_norm": 0.2949227457860251, + "learning_rate": 1.0352474966168758e-12, + "loss": 0.0055, + "step": 5033 + }, + { + "epoch": 0.9999006852716258, + "grad_norm": 0.8152259997227421, + "learning_rate": 0.0, + "loss": 0.0059, + "step": 5034 + }, + { + "epoch": 0.9999006852716258, + "step": 5034, + "total_flos": 393450150948864.0, + "train_loss": 0.013911830744355322, + "train_runtime": 43586.62, + "train_samples_per_second": 7.393, + "train_steps_per_second": 0.115 + } + ], + "logging_steps": 1.0, + "max_steps": 5034, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 393450150948864.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}