{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999006852716258, "eval_steps": 500, "global_step": 5034, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019862945674843578, "grad_norm": 33.76269020949264, "learning_rate": 6.578947368421053e-08, "loss": 0.2359, "step": 1 }, { "epoch": 0.00039725891349687157, "grad_norm": 34.06001813445965, "learning_rate": 1.3157894736842107e-07, "loss": 0.2755, "step": 2 }, { "epoch": 0.0005958883702453074, "grad_norm": 35.82083527820956, "learning_rate": 1.9736842105263157e-07, "loss": 0.285, "step": 3 }, { "epoch": 0.0007945178269937431, "grad_norm": 31.578743559628386, "learning_rate": 2.6315789473684213e-07, "loss": 0.2475, "step": 4 }, { "epoch": 0.000993147283742179, "grad_norm": 30.615127259734468, "learning_rate": 3.2894736842105264e-07, "loss": 0.2229, "step": 5 }, { "epoch": 0.0011917767404906149, "grad_norm": 44.64811115111308, "learning_rate": 3.9473684210526315e-07, "loss": 0.3314, "step": 6 }, { "epoch": 0.0013904061972390505, "grad_norm": 44.34169533744506, "learning_rate": 4.605263157894737e-07, "loss": 0.2423, "step": 7 }, { "epoch": 0.0015890356539874863, "grad_norm": 40.11109275785226, "learning_rate": 5.263157894736843e-07, "loss": 0.2239, "step": 8 }, { "epoch": 0.001787665110735922, "grad_norm": 35.00420920021999, "learning_rate": 5.921052631578947e-07, "loss": 0.2018, "step": 9 }, { "epoch": 0.001986294567484358, "grad_norm": 29.808999667191497, "learning_rate": 6.578947368421053e-07, "loss": 0.107, "step": 10 }, { "epoch": 0.0021849240242327937, "grad_norm": 17.62819439139421, "learning_rate": 7.236842105263158e-07, "loss": 0.0941, "step": 11 }, { "epoch": 0.0023835534809812297, "grad_norm": 12.95448401579392, "learning_rate": 7.894736842105263e-07, "loss": 0.0777, "step": 12 }, { "epoch": 0.0025821829377296653, "grad_norm": 6.615340288787496, "learning_rate": 8.55263157894737e-07, "loss": 0.0557, "step": 13 }, { "epoch": 0.002780812394478101, "grad_norm": 8.20493935951563, "learning_rate": 9.210526315789474e-07, "loss": 0.0618, "step": 14 }, { "epoch": 0.002979441851226537, "grad_norm": 6.4854536788019495, "learning_rate": 9.86842105263158e-07, "loss": 0.0597, "step": 15 }, { "epoch": 0.0031780713079749725, "grad_norm": 5.948598869884694, "learning_rate": 1.0526315789473685e-06, "loss": 0.057, "step": 16 }, { "epoch": 0.0033767007647234086, "grad_norm": 1.505342665823492, "learning_rate": 1.118421052631579e-06, "loss": 0.0389, "step": 17 }, { "epoch": 0.003575330221471844, "grad_norm": 4.357709266078771, "learning_rate": 1.1842105263157894e-06, "loss": 0.0477, "step": 18 }, { "epoch": 0.00377395967822028, "grad_norm": 4.173575780842314, "learning_rate": 1.25e-06, "loss": 0.0425, "step": 19 }, { "epoch": 0.003972589134968716, "grad_norm": 4.034253079966795, "learning_rate": 1.3157894736842106e-06, "loss": 0.0433, "step": 20 }, { "epoch": 0.004171218591717152, "grad_norm": 3.815661808462217, "learning_rate": 1.3815789473684212e-06, "loss": 0.0417, "step": 21 }, { "epoch": 0.004369848048465587, "grad_norm": 1.9849545321722113, "learning_rate": 1.4473684210526317e-06, "loss": 0.0349, "step": 22 }, { "epoch": 0.004568477505214023, "grad_norm": 0.704960225012627, "learning_rate": 1.5131578947368421e-06, "loss": 0.0355, "step": 23 }, { "epoch": 0.0047671069619624595, "grad_norm": 2.738263124201555, "learning_rate": 1.5789473684210526e-06, "loss": 0.0351, "step": 24 }, { "epoch": 0.004965736418710895, "grad_norm": 3.7531322321385994, "learning_rate": 1.6447368421052635e-06, "loss": 0.0337, "step": 25 }, { "epoch": 0.005164365875459331, "grad_norm": 3.3650206269293643, "learning_rate": 1.710526315789474e-06, "loss": 0.0403, "step": 26 }, { "epoch": 0.005362995332207766, "grad_norm": 1.1524716199784135, "learning_rate": 1.7763157894736844e-06, "loss": 0.0188, "step": 27 }, { "epoch": 0.005561624788956202, "grad_norm": 0.9219916829535164, "learning_rate": 1.8421052631578948e-06, "loss": 0.0292, "step": 28 }, { "epoch": 0.005760254245704638, "grad_norm": 1.5976055474296411, "learning_rate": 1.9078947368421057e-06, "loss": 0.0311, "step": 29 }, { "epoch": 0.005958883702453074, "grad_norm": 1.7121779218172795, "learning_rate": 1.973684210526316e-06, "loss": 0.0267, "step": 30 }, { "epoch": 0.0061575131592015095, "grad_norm": 1.4089365136243617, "learning_rate": 2.0394736842105266e-06, "loss": 0.0271, "step": 31 }, { "epoch": 0.006356142615949945, "grad_norm": 1.2326619516469113, "learning_rate": 2.105263157894737e-06, "loss": 0.0213, "step": 32 }, { "epoch": 0.0065547720726983815, "grad_norm": 2.5437119531443457, "learning_rate": 2.1710526315789475e-06, "loss": 0.0358, "step": 33 }, { "epoch": 0.006753401529446817, "grad_norm": 3.097227411879207, "learning_rate": 2.236842105263158e-06, "loss": 0.026, "step": 34 }, { "epoch": 0.006952030986195253, "grad_norm": 1.5225819387838984, "learning_rate": 2.3026315789473684e-06, "loss": 0.0352, "step": 35 }, { "epoch": 0.007150660442943688, "grad_norm": 1.9028774569921925, "learning_rate": 2.368421052631579e-06, "loss": 0.0249, "step": 36 }, { "epoch": 0.007349289899692124, "grad_norm": 1.3413799412407437, "learning_rate": 2.4342105263157898e-06, "loss": 0.023, "step": 37 }, { "epoch": 0.00754791935644056, "grad_norm": 0.7403867166496259, "learning_rate": 2.5e-06, "loss": 0.0179, "step": 38 }, { "epoch": 0.007746548813188996, "grad_norm": 1.5339334077472897, "learning_rate": 2.565789473684211e-06, "loss": 0.026, "step": 39 }, { "epoch": 0.007945178269937432, "grad_norm": 1.4657589777219318, "learning_rate": 2.631578947368421e-06, "loss": 0.0195, "step": 40 }, { "epoch": 0.008143807726685867, "grad_norm": 1.3319716700261164, "learning_rate": 2.697368421052632e-06, "loss": 0.0291, "step": 41 }, { "epoch": 0.008342437183434304, "grad_norm": 1.1874773600556614, "learning_rate": 2.7631578947368424e-06, "loss": 0.0259, "step": 42 }, { "epoch": 0.008541066640182738, "grad_norm": 0.6674176354529916, "learning_rate": 2.828947368421053e-06, "loss": 0.0203, "step": 43 }, { "epoch": 0.008739696096931175, "grad_norm": 1.1733771223638585, "learning_rate": 2.8947368421052634e-06, "loss": 0.026, "step": 44 }, { "epoch": 0.008938325553679611, "grad_norm": 1.7753106520406625, "learning_rate": 2.960526315789474e-06, "loss": 0.0247, "step": 45 }, { "epoch": 0.009136955010428046, "grad_norm": 0.9432853830345364, "learning_rate": 3.0263157894736843e-06, "loss": 0.0257, "step": 46 }, { "epoch": 0.009335584467176482, "grad_norm": 3.5238268219753857, "learning_rate": 3.092105263157895e-06, "loss": 0.024, "step": 47 }, { "epoch": 0.009534213923924919, "grad_norm": 4.0482869582706975, "learning_rate": 3.157894736842105e-06, "loss": 0.0279, "step": 48 }, { "epoch": 0.009732843380673354, "grad_norm": 1.9309599368634551, "learning_rate": 3.223684210526316e-06, "loss": 0.0307, "step": 49 }, { "epoch": 0.00993147283742179, "grad_norm": 2.2369624906638355, "learning_rate": 3.289473684210527e-06, "loss": 0.0192, "step": 50 }, { "epoch": 0.010130102294170225, "grad_norm": 1.9485240974125297, "learning_rate": 3.355263157894737e-06, "loss": 0.0286, "step": 51 }, { "epoch": 0.010328731750918661, "grad_norm": 2.7566490930720473, "learning_rate": 3.421052631578948e-06, "loss": 0.0327, "step": 52 }, { "epoch": 0.010527361207667098, "grad_norm": 0.9111515455287649, "learning_rate": 3.486842105263158e-06, "loss": 0.0266, "step": 53 }, { "epoch": 0.010725990664415532, "grad_norm": 1.0182592431220034, "learning_rate": 3.5526315789473687e-06, "loss": 0.0159, "step": 54 }, { "epoch": 0.010924620121163969, "grad_norm": 1.3689812247740223, "learning_rate": 3.618421052631579e-06, "loss": 0.0236, "step": 55 }, { "epoch": 0.011123249577912404, "grad_norm": 1.415669213506178, "learning_rate": 3.6842105263157896e-06, "loss": 0.0192, "step": 56 }, { "epoch": 0.01132187903466084, "grad_norm": 2.727013850743499, "learning_rate": 3.7500000000000005e-06, "loss": 0.0271, "step": 57 }, { "epoch": 0.011520508491409277, "grad_norm": 21.142839252245647, "learning_rate": 3.815789473684211e-06, "loss": 0.0233, "step": 58 }, { "epoch": 0.011719137948157711, "grad_norm": 1.1229291902544376, "learning_rate": 3.8815789473684214e-06, "loss": 0.0183, "step": 59 }, { "epoch": 0.011917767404906148, "grad_norm": 1.196900422199011, "learning_rate": 3.947368421052632e-06, "loss": 0.0245, "step": 60 }, { "epoch": 0.012116396861654583, "grad_norm": 1.356686303430086, "learning_rate": 4.013157894736842e-06, "loss": 0.0104, "step": 61 }, { "epoch": 0.012315026318403019, "grad_norm": 1.0363309431722993, "learning_rate": 4.078947368421053e-06, "loss": 0.0162, "step": 62 }, { "epoch": 0.012513655775151455, "grad_norm": 1.068841018945078, "learning_rate": 4.144736842105263e-06, "loss": 0.0215, "step": 63 }, { "epoch": 0.01271228523189989, "grad_norm": 1.3644470124019088, "learning_rate": 4.210526315789474e-06, "loss": 0.0308, "step": 64 }, { "epoch": 0.012910914688648327, "grad_norm": 1.4539722940904474, "learning_rate": 4.276315789473684e-06, "loss": 0.0234, "step": 65 }, { "epoch": 0.013109544145396763, "grad_norm": 1.8771694709283084, "learning_rate": 4.342105263157895e-06, "loss": 0.0279, "step": 66 }, { "epoch": 0.013308173602145198, "grad_norm": 1.56691962640963, "learning_rate": 4.407894736842105e-06, "loss": 0.0224, "step": 67 }, { "epoch": 0.013506803058893634, "grad_norm": 1.0323510479136595, "learning_rate": 4.473684210526316e-06, "loss": 0.0223, "step": 68 }, { "epoch": 0.013705432515642069, "grad_norm": 2.329638995795258, "learning_rate": 4.539473684210527e-06, "loss": 0.0278, "step": 69 }, { "epoch": 0.013904061972390505, "grad_norm": 1.1328939658071375, "learning_rate": 4.605263157894737e-06, "loss": 0.0262, "step": 70 }, { "epoch": 0.014102691429138942, "grad_norm": 1.5789619228369665, "learning_rate": 4.671052631578948e-06, "loss": 0.0226, "step": 71 }, { "epoch": 0.014301320885887377, "grad_norm": 0.590549283191568, "learning_rate": 4.736842105263158e-06, "loss": 0.0222, "step": 72 }, { "epoch": 0.014499950342635813, "grad_norm": 2.9484330972444797, "learning_rate": 4.802631578947369e-06, "loss": 0.0289, "step": 73 }, { "epoch": 0.014698579799384248, "grad_norm": 1.7556389255244587, "learning_rate": 4.8684210526315795e-06, "loss": 0.0253, "step": 74 }, { "epoch": 0.014897209256132684, "grad_norm": 0.8175815824911603, "learning_rate": 4.9342105263157895e-06, "loss": 0.023, "step": 75 }, { "epoch": 0.01509583871288112, "grad_norm": 1.9543147397030654, "learning_rate": 5e-06, "loss": 0.024, "step": 76 }, { "epoch": 0.015294468169629555, "grad_norm": 1.411220111931974, "learning_rate": 5.0657894736842104e-06, "loss": 0.0213, "step": 77 }, { "epoch": 0.015493097626377992, "grad_norm": 2.5787589856867474, "learning_rate": 5.131578947368422e-06, "loss": 0.0194, "step": 78 }, { "epoch": 0.01569172708312643, "grad_norm": 1.1433613974937111, "learning_rate": 5.197368421052632e-06, "loss": 0.0203, "step": 79 }, { "epoch": 0.015890356539874865, "grad_norm": 2.943413275721806, "learning_rate": 5.263157894736842e-06, "loss": 0.0202, "step": 80 }, { "epoch": 0.016088985996623298, "grad_norm": 1.2895517559346665, "learning_rate": 5.328947368421054e-06, "loss": 0.0175, "step": 81 }, { "epoch": 0.016287615453371734, "grad_norm": 1.2902481789966305, "learning_rate": 5.394736842105264e-06, "loss": 0.0273, "step": 82 }, { "epoch": 0.01648624491012017, "grad_norm": 1.0623933737227271, "learning_rate": 5.460526315789474e-06, "loss": 0.0226, "step": 83 }, { "epoch": 0.016684874366868607, "grad_norm": 1.551352914024002, "learning_rate": 5.526315789473685e-06, "loss": 0.0227, "step": 84 }, { "epoch": 0.016883503823617044, "grad_norm": 1.35068564721811, "learning_rate": 5.592105263157896e-06, "loss": 0.0176, "step": 85 }, { "epoch": 0.017082133280365477, "grad_norm": 0.7178211749767736, "learning_rate": 5.657894736842106e-06, "loss": 0.0161, "step": 86 }, { "epoch": 0.017280762737113913, "grad_norm": 1.3876820110580605, "learning_rate": 5.723684210526316e-06, "loss": 0.0211, "step": 87 }, { "epoch": 0.01747939219386235, "grad_norm": 0.9950100462947198, "learning_rate": 5.789473684210527e-06, "loss": 0.0236, "step": 88 }, { "epoch": 0.017678021650610786, "grad_norm": 2.4968422535588806, "learning_rate": 5.855263157894738e-06, "loss": 0.0288, "step": 89 }, { "epoch": 0.017876651107359223, "grad_norm": 2.6434743230683515, "learning_rate": 5.921052631578948e-06, "loss": 0.03, "step": 90 }, { "epoch": 0.018075280564107656, "grad_norm": 0.7671799725100394, "learning_rate": 5.9868421052631585e-06, "loss": 0.0303, "step": 91 }, { "epoch": 0.018273910020856092, "grad_norm": 1.4218642540045263, "learning_rate": 6.0526315789473685e-06, "loss": 0.0265, "step": 92 }, { "epoch": 0.01847253947760453, "grad_norm": 1.1607669862480212, "learning_rate": 6.118421052631579e-06, "loss": 0.0273, "step": 93 }, { "epoch": 0.018671168934352965, "grad_norm": 0.5272958683372639, "learning_rate": 6.18421052631579e-06, "loss": 0.0208, "step": 94 }, { "epoch": 0.0188697983911014, "grad_norm": 0.6384994881210974, "learning_rate": 6.25e-06, "loss": 0.0193, "step": 95 }, { "epoch": 0.019068427847849838, "grad_norm": 0.4305083797700706, "learning_rate": 6.31578947368421e-06, "loss": 0.0155, "step": 96 }, { "epoch": 0.01926705730459827, "grad_norm": 1.5613150520545582, "learning_rate": 6.381578947368422e-06, "loss": 0.0287, "step": 97 }, { "epoch": 0.019465686761346707, "grad_norm": 1.207629220791821, "learning_rate": 6.447368421052632e-06, "loss": 0.0126, "step": 98 }, { "epoch": 0.019664316218095144, "grad_norm": 2.0635349948166946, "learning_rate": 6.513157894736842e-06, "loss": 0.0267, "step": 99 }, { "epoch": 0.01986294567484358, "grad_norm": 1.725733944785907, "learning_rate": 6.578947368421054e-06, "loss": 0.0307, "step": 100 }, { "epoch": 0.020061575131592017, "grad_norm": 0.7480249394267019, "learning_rate": 6.644736842105264e-06, "loss": 0.0299, "step": 101 }, { "epoch": 0.02026020458834045, "grad_norm": 3.894533969012179, "learning_rate": 6.710526315789474e-06, "loss": 0.036, "step": 102 }, { "epoch": 0.020458834045088886, "grad_norm": 1.9782432860217083, "learning_rate": 6.776315789473686e-06, "loss": 0.019, "step": 103 }, { "epoch": 0.020657463501837323, "grad_norm": 2.1605252856710577, "learning_rate": 6.842105263157896e-06, "loss": 0.0147, "step": 104 }, { "epoch": 0.02085609295858576, "grad_norm": 1.4740136865902298, "learning_rate": 6.907894736842106e-06, "loss": 0.0225, "step": 105 }, { "epoch": 0.021054722415334196, "grad_norm": 0.6354162758488423, "learning_rate": 6.973684210526316e-06, "loss": 0.0248, "step": 106 }, { "epoch": 0.02125335187208263, "grad_norm": 2.7836091692257434, "learning_rate": 7.0394736842105274e-06, "loss": 0.0239, "step": 107 }, { "epoch": 0.021451981328831065, "grad_norm": 1.1359242167543848, "learning_rate": 7.1052631578947375e-06, "loss": 0.0202, "step": 108 }, { "epoch": 0.0216506107855795, "grad_norm": 0.9730451625121374, "learning_rate": 7.1710526315789475e-06, "loss": 0.0164, "step": 109 }, { "epoch": 0.021849240242327938, "grad_norm": 0.73973440231317, "learning_rate": 7.236842105263158e-06, "loss": 0.0208, "step": 110 }, { "epoch": 0.022047869699076374, "grad_norm": 2.699646318968425, "learning_rate": 7.302631578947369e-06, "loss": 0.0233, "step": 111 }, { "epoch": 0.022246499155824807, "grad_norm": 1.2218290054724747, "learning_rate": 7.368421052631579e-06, "loss": 0.0194, "step": 112 }, { "epoch": 0.022445128612573244, "grad_norm": 3.2868701187822915, "learning_rate": 7.43421052631579e-06, "loss": 0.0339, "step": 113 }, { "epoch": 0.02264375806932168, "grad_norm": 2.5641439239722814, "learning_rate": 7.500000000000001e-06, "loss": 0.0286, "step": 114 }, { "epoch": 0.022842387526070117, "grad_norm": 0.6952134707398119, "learning_rate": 7.565789473684211e-06, "loss": 0.0212, "step": 115 }, { "epoch": 0.023041016982818553, "grad_norm": 2.011954955233892, "learning_rate": 7.631578947368423e-06, "loss": 0.0259, "step": 116 }, { "epoch": 0.023239646439566986, "grad_norm": 0.7803532263465452, "learning_rate": 7.697368421052632e-06, "loss": 0.016, "step": 117 }, { "epoch": 0.023438275896315423, "grad_norm": 1.9847980365877933, "learning_rate": 7.763157894736843e-06, "loss": 0.0195, "step": 118 }, { "epoch": 0.02363690535306386, "grad_norm": 1.1441961651491355, "learning_rate": 7.828947368421054e-06, "loss": 0.0211, "step": 119 }, { "epoch": 0.023835534809812296, "grad_norm": 0.6232275208284637, "learning_rate": 7.894736842105265e-06, "loss": 0.0155, "step": 120 }, { "epoch": 0.024034164266560732, "grad_norm": 1.8988678704709767, "learning_rate": 7.960526315789474e-06, "loss": 0.0146, "step": 121 }, { "epoch": 0.024232793723309165, "grad_norm": 2.105837941919798, "learning_rate": 8.026315789473685e-06, "loss": 0.0295, "step": 122 }, { "epoch": 0.0244314231800576, "grad_norm": 1.3171076410792975, "learning_rate": 8.092105263157896e-06, "loss": 0.0264, "step": 123 }, { "epoch": 0.024630052636806038, "grad_norm": 1.5991904078839954, "learning_rate": 8.157894736842106e-06, "loss": 0.0244, "step": 124 }, { "epoch": 0.024828682093554474, "grad_norm": 1.1618318045886942, "learning_rate": 8.223684210526316e-06, "loss": 0.0201, "step": 125 }, { "epoch": 0.02502731155030291, "grad_norm": 0.800623192132822, "learning_rate": 8.289473684210526e-06, "loss": 0.0177, "step": 126 }, { "epoch": 0.025225941007051347, "grad_norm": 1.6211464121453392, "learning_rate": 8.355263157894737e-06, "loss": 0.0155, "step": 127 }, { "epoch": 0.02542457046379978, "grad_norm": 0.7412717741680597, "learning_rate": 8.421052631578948e-06, "loss": 0.0118, "step": 128 }, { "epoch": 0.025623199920548217, "grad_norm": 0.7011545084516744, "learning_rate": 8.486842105263159e-06, "loss": 0.0197, "step": 129 }, { "epoch": 0.025821829377296653, "grad_norm": 1.5277548264021221, "learning_rate": 8.552631578947368e-06, "loss": 0.0139, "step": 130 }, { "epoch": 0.02602045883404509, "grad_norm": 0.8807370373367641, "learning_rate": 8.61842105263158e-06, "loss": 0.0311, "step": 131 }, { "epoch": 0.026219088290793526, "grad_norm": 1.07819195884304, "learning_rate": 8.68421052631579e-06, "loss": 0.0318, "step": 132 }, { "epoch": 0.02641771774754196, "grad_norm": 0.696442786177907, "learning_rate": 8.750000000000001e-06, "loss": 0.0234, "step": 133 }, { "epoch": 0.026616347204290396, "grad_norm": 0.748931405611534, "learning_rate": 8.81578947368421e-06, "loss": 0.0216, "step": 134 }, { "epoch": 0.026814976661038832, "grad_norm": 0.8573350069478229, "learning_rate": 8.881578947368423e-06, "loss": 0.0291, "step": 135 }, { "epoch": 0.02701360611778727, "grad_norm": 0.9745147210519538, "learning_rate": 8.947368421052632e-06, "loss": 0.0239, "step": 136 }, { "epoch": 0.027212235574535705, "grad_norm": 0.9802008111815383, "learning_rate": 9.013157894736843e-06, "loss": 0.0209, "step": 137 }, { "epoch": 0.027410865031284138, "grad_norm": 2.330229398267877, "learning_rate": 9.078947368421054e-06, "loss": 0.0244, "step": 138 }, { "epoch": 0.027609494488032574, "grad_norm": 1.0656034579377593, "learning_rate": 9.144736842105264e-06, "loss": 0.0159, "step": 139 }, { "epoch": 0.02780812394478101, "grad_norm": 2.0077938210120796, "learning_rate": 9.210526315789474e-06, "loss": 0.0199, "step": 140 }, { "epoch": 0.028006753401529447, "grad_norm": 2.635982865291799, "learning_rate": 9.276315789473686e-06, "loss": 0.0276, "step": 141 }, { "epoch": 0.028205382858277884, "grad_norm": 1.3234128327058614, "learning_rate": 9.342105263157895e-06, "loss": 0.0197, "step": 142 }, { "epoch": 0.028404012315026317, "grad_norm": 2.8381965627330588, "learning_rate": 9.407894736842106e-06, "loss": 0.0256, "step": 143 }, { "epoch": 0.028602641771774753, "grad_norm": 2.2353630682958237, "learning_rate": 9.473684210526315e-06, "loss": 0.0263, "step": 144 }, { "epoch": 0.02880127122852319, "grad_norm": 2.609402334542464, "learning_rate": 9.539473684210528e-06, "loss": 0.0208, "step": 145 }, { "epoch": 0.028999900685271626, "grad_norm": 1.6433963174070783, "learning_rate": 9.605263157894737e-06, "loss": 0.0156, "step": 146 }, { "epoch": 0.029198530142020063, "grad_norm": 1.2120241439087565, "learning_rate": 9.671052631578948e-06, "loss": 0.0247, "step": 147 }, { "epoch": 0.029397159598768496, "grad_norm": 2.393880841544898, "learning_rate": 9.736842105263159e-06, "loss": 0.0199, "step": 148 }, { "epoch": 0.029595789055516932, "grad_norm": 1.147929304198294, "learning_rate": 9.80263157894737e-06, "loss": 0.0142, "step": 149 }, { "epoch": 0.02979441851226537, "grad_norm": 0.7301143117977119, "learning_rate": 9.868421052631579e-06, "loss": 0.0268, "step": 150 }, { "epoch": 0.029993047969013805, "grad_norm": 0.7620302081891749, "learning_rate": 9.93421052631579e-06, "loss": 0.0209, "step": 151 }, { "epoch": 0.03019167742576224, "grad_norm": 1.6325286682577373, "learning_rate": 1e-05, "loss": 0.0165, "step": 152 }, { "epoch": 0.030390306882510678, "grad_norm": 0.8520608847697756, "learning_rate": 9.999998964752504e-06, "loss": 0.023, "step": 153 }, { "epoch": 0.03058893633925911, "grad_norm": 0.7134823765486196, "learning_rate": 9.999995859010444e-06, "loss": 0.0213, "step": 154 }, { "epoch": 0.030787565796007547, "grad_norm": 1.4140312748946944, "learning_rate": 9.999990682775105e-06, "loss": 0.0273, "step": 155 }, { "epoch": 0.030986195252755984, "grad_norm": 0.9491683828558153, "learning_rate": 9.999983436048632e-06, "loss": 0.0222, "step": 156 }, { "epoch": 0.03118482470950442, "grad_norm": 1.3782834621848912, "learning_rate": 9.999974118834025e-06, "loss": 0.0162, "step": 157 }, { "epoch": 0.03138345416625286, "grad_norm": 0.9830166709134077, "learning_rate": 9.999962731135142e-06, "loss": 0.017, "step": 158 }, { "epoch": 0.03158208362300129, "grad_norm": 2.218321788640668, "learning_rate": 9.999949272956699e-06, "loss": 0.0184, "step": 159 }, { "epoch": 0.03178071307974973, "grad_norm": 3.6396168680949224, "learning_rate": 9.99993374430427e-06, "loss": 0.033, "step": 160 }, { "epoch": 0.03197934253649816, "grad_norm": 1.701075218546414, "learning_rate": 9.999916145184286e-06, "loss": 0.0191, "step": 161 }, { "epoch": 0.032177971993246596, "grad_norm": 1.0244474776048476, "learning_rate": 9.999896475604029e-06, "loss": 0.0164, "step": 162 }, { "epoch": 0.03237660144999503, "grad_norm": 1.7222754194926513, "learning_rate": 9.999874735571652e-06, "loss": 0.0385, "step": 163 }, { "epoch": 0.03257523090674347, "grad_norm": 2.5845547766369417, "learning_rate": 9.999850925096153e-06, "loss": 0.0331, "step": 164 }, { "epoch": 0.032773860363491905, "grad_norm": 1.878569691120879, "learning_rate": 9.999825044187392e-06, "loss": 0.0254, "step": 165 }, { "epoch": 0.03297248982024034, "grad_norm": 0.9378834395475585, "learning_rate": 9.999797092856089e-06, "loss": 0.0262, "step": 166 }, { "epoch": 0.03317111927698878, "grad_norm": 0.9618425366151521, "learning_rate": 9.999767071113815e-06, "loss": 0.017, "step": 167 }, { "epoch": 0.033369748733737215, "grad_norm": 1.7287253946197272, "learning_rate": 9.999734978973006e-06, "loss": 0.0218, "step": 168 }, { "epoch": 0.03356837819048565, "grad_norm": 0.25580217975967984, "learning_rate": 9.999700816446947e-06, "loss": 0.0104, "step": 169 }, { "epoch": 0.03376700764723409, "grad_norm": 0.7079216618175301, "learning_rate": 9.999664583549788e-06, "loss": 0.0198, "step": 170 }, { "epoch": 0.033965637103982524, "grad_norm": 0.5742676426294199, "learning_rate": 9.99962628029653e-06, "loss": 0.0261, "step": 171 }, { "epoch": 0.03416426656073095, "grad_norm": 0.4760990748227476, "learning_rate": 9.999585906703038e-06, "loss": 0.0223, "step": 172 }, { "epoch": 0.03436289601747939, "grad_norm": 0.6609769994751641, "learning_rate": 9.999543462786028e-06, "loss": 0.0194, "step": 173 }, { "epoch": 0.034561525474227826, "grad_norm": 1.0985318270522382, "learning_rate": 9.999498948563076e-06, "loss": 0.0153, "step": 174 }, { "epoch": 0.03476015493097626, "grad_norm": 0.8636922880587703, "learning_rate": 9.999452364052618e-06, "loss": 0.0243, "step": 175 }, { "epoch": 0.0349587843877247, "grad_norm": 0.5255774215285303, "learning_rate": 9.99940370927394e-06, "loss": 0.0253, "step": 176 }, { "epoch": 0.035157413844473136, "grad_norm": 0.9472944515341319, "learning_rate": 9.999352984247196e-06, "loss": 0.0172, "step": 177 }, { "epoch": 0.03535604330122157, "grad_norm": 1.7710708337619419, "learning_rate": 9.999300188993384e-06, "loss": 0.0209, "step": 178 }, { "epoch": 0.03555467275797001, "grad_norm": 1.62355032296528, "learning_rate": 9.999245323534372e-06, "loss": 0.0183, "step": 179 }, { "epoch": 0.035753302214718445, "grad_norm": 1.6780041194910134, "learning_rate": 9.999188387892878e-06, "loss": 0.0184, "step": 180 }, { "epoch": 0.03595193167146688, "grad_norm": 0.960332672150208, "learning_rate": 9.999129382092478e-06, "loss": 0.0121, "step": 181 }, { "epoch": 0.03615056112821531, "grad_norm": 0.9539342476315544, "learning_rate": 9.999068306157607e-06, "loss": 0.0134, "step": 182 }, { "epoch": 0.03634919058496375, "grad_norm": 1.6183931386612114, "learning_rate": 9.999005160113558e-06, "loss": 0.0168, "step": 183 }, { "epoch": 0.036547820041712184, "grad_norm": 0.9338458758519154, "learning_rate": 9.998939943986476e-06, "loss": 0.022, "step": 184 }, { "epoch": 0.03674644949846062, "grad_norm": 1.9345051819920607, "learning_rate": 9.998872657803371e-06, "loss": 0.0206, "step": 185 }, { "epoch": 0.03694507895520906, "grad_norm": 1.1603558183362468, "learning_rate": 9.998803301592105e-06, "loss": 0.014, "step": 186 }, { "epoch": 0.03714370841195749, "grad_norm": 0.9129479190883742, "learning_rate": 9.998731875381398e-06, "loss": 0.0288, "step": 187 }, { "epoch": 0.03734233786870593, "grad_norm": 0.9875394679980506, "learning_rate": 9.998658379200826e-06, "loss": 0.0302, "step": 188 }, { "epoch": 0.037540967325454366, "grad_norm": 1.0324491864294387, "learning_rate": 9.998582813080824e-06, "loss": 0.0157, "step": 189 }, { "epoch": 0.0377395967822028, "grad_norm": 2.5399121017352098, "learning_rate": 9.998505177052686e-06, "loss": 0.0183, "step": 190 }, { "epoch": 0.03793822623895124, "grad_norm": 1.0608993845566375, "learning_rate": 9.99842547114856e-06, "loss": 0.0142, "step": 191 }, { "epoch": 0.038136855695699676, "grad_norm": 0.3873255475411284, "learning_rate": 9.99834369540145e-06, "loss": 0.0179, "step": 192 }, { "epoch": 0.038335485152448105, "grad_norm": 0.580154672924063, "learning_rate": 9.998259849845224e-06, "loss": 0.0163, "step": 193 }, { "epoch": 0.03853411460919654, "grad_norm": 0.5232842757349848, "learning_rate": 9.9981739345146e-06, "loss": 0.021, "step": 194 }, { "epoch": 0.03873274406594498, "grad_norm": 0.5816313182605903, "learning_rate": 9.998085949445154e-06, "loss": 0.0122, "step": 195 }, { "epoch": 0.038931373522693415, "grad_norm": 2.219159268458382, "learning_rate": 9.99799589467332e-06, "loss": 0.0192, "step": 196 }, { "epoch": 0.03913000297944185, "grad_norm": 2.0126869985493956, "learning_rate": 9.997903770236393e-06, "loss": 0.0222, "step": 197 }, { "epoch": 0.03932863243619029, "grad_norm": 0.8427003722810853, "learning_rate": 9.99780957617252e-06, "loss": 0.0155, "step": 198 }, { "epoch": 0.039527261892938724, "grad_norm": 1.839988273195168, "learning_rate": 9.997713312520703e-06, "loss": 0.0177, "step": 199 }, { "epoch": 0.03972589134968716, "grad_norm": 2.200388293444912, "learning_rate": 9.99761497932081e-06, "loss": 0.0313, "step": 200 }, { "epoch": 0.0399245208064356, "grad_norm": 0.9877299401379769, "learning_rate": 9.997514576613561e-06, "loss": 0.0157, "step": 201 }, { "epoch": 0.04012315026318403, "grad_norm": 1.1088588253099985, "learning_rate": 9.99741210444053e-06, "loss": 0.0198, "step": 202 }, { "epoch": 0.04032177971993246, "grad_norm": 1.7028868040419975, "learning_rate": 9.997307562844148e-06, "loss": 0.0196, "step": 203 }, { "epoch": 0.0405204091766809, "grad_norm": 1.2881131253386358, "learning_rate": 9.997200951867711e-06, "loss": 0.022, "step": 204 }, { "epoch": 0.040719038633429336, "grad_norm": 1.1206602404111856, "learning_rate": 9.997092271555364e-06, "loss": 0.03, "step": 205 }, { "epoch": 0.04091766809017777, "grad_norm": 0.9055361093963828, "learning_rate": 9.996981521952111e-06, "loss": 0.0166, "step": 206 }, { "epoch": 0.04111629754692621, "grad_norm": 2.5005202676664315, "learning_rate": 9.996868703103815e-06, "loss": 0.0267, "step": 207 }, { "epoch": 0.041314927003674645, "grad_norm": 1.6337249581649744, "learning_rate": 9.996753815057191e-06, "loss": 0.0191, "step": 208 }, { "epoch": 0.04151355646042308, "grad_norm": 0.7101831965292786, "learning_rate": 9.996636857859818e-06, "loss": 0.0314, "step": 209 }, { "epoch": 0.04171218591717152, "grad_norm": 1.5966574187467166, "learning_rate": 9.996517831560123e-06, "loss": 0.0264, "step": 210 }, { "epoch": 0.041910815373919955, "grad_norm": 1.8007214636848605, "learning_rate": 9.9963967362074e-06, "loss": 0.0231, "step": 211 }, { "epoch": 0.04210944483066839, "grad_norm": 3.752484634788327, "learning_rate": 9.996273571851793e-06, "loss": 0.0257, "step": 212 }, { "epoch": 0.04230807428741682, "grad_norm": 2.618494261051629, "learning_rate": 9.996148338544302e-06, "loss": 0.0238, "step": 213 }, { "epoch": 0.04250670374416526, "grad_norm": 1.9116871575698093, "learning_rate": 9.996021036336786e-06, "loss": 0.0211, "step": 214 }, { "epoch": 0.042705333200913694, "grad_norm": 0.5453225145113627, "learning_rate": 9.995891665281965e-06, "loss": 0.02, "step": 215 }, { "epoch": 0.04290396265766213, "grad_norm": 1.6984059813637056, "learning_rate": 9.995760225433407e-06, "loss": 0.0235, "step": 216 }, { "epoch": 0.043102592114410566, "grad_norm": 1.4880871765884156, "learning_rate": 9.995626716845541e-06, "loss": 0.0168, "step": 217 }, { "epoch": 0.043301221571159, "grad_norm": 1.4047456117126256, "learning_rate": 9.995491139573657e-06, "loss": 0.0316, "step": 218 }, { "epoch": 0.04349985102790744, "grad_norm": 1.2909976897550328, "learning_rate": 9.995353493673892e-06, "loss": 0.0137, "step": 219 }, { "epoch": 0.043698480484655876, "grad_norm": 0.6501623439351141, "learning_rate": 9.99521377920325e-06, "loss": 0.0161, "step": 220 }, { "epoch": 0.04389710994140431, "grad_norm": 0.8580190311656244, "learning_rate": 9.995071996219584e-06, "loss": 0.018, "step": 221 }, { "epoch": 0.04409573939815275, "grad_norm": 1.2921080572385664, "learning_rate": 9.994928144781607e-06, "loss": 0.0129, "step": 222 }, { "epoch": 0.044294368854901185, "grad_norm": 1.8388869881713987, "learning_rate": 9.994782224948885e-06, "loss": 0.0244, "step": 223 }, { "epoch": 0.044492998311649615, "grad_norm": 0.8904350028720864, "learning_rate": 9.994634236781845e-06, "loss": 0.0285, "step": 224 }, { "epoch": 0.04469162776839805, "grad_norm": 1.0351479493656937, "learning_rate": 9.994484180341773e-06, "loss": 0.0129, "step": 225 }, { "epoch": 0.04489025722514649, "grad_norm": 1.6356872953946395, "learning_rate": 9.994332055690801e-06, "loss": 0.0172, "step": 226 }, { "epoch": 0.045088886681894924, "grad_norm": 2.4427456672158785, "learning_rate": 9.994177862891927e-06, "loss": 0.0262, "step": 227 }, { "epoch": 0.04528751613864336, "grad_norm": 1.1833855069108572, "learning_rate": 9.994021602009001e-06, "loss": 0.0212, "step": 228 }, { "epoch": 0.0454861455953918, "grad_norm": 1.3424482002706264, "learning_rate": 9.99386327310673e-06, "loss": 0.013, "step": 229 }, { "epoch": 0.045684775052140233, "grad_norm": 1.9959735911469776, "learning_rate": 9.99370287625068e-06, "loss": 0.0136, "step": 230 }, { "epoch": 0.04588340450888867, "grad_norm": 1.0872172455494484, "learning_rate": 9.99354041150727e-06, "loss": 0.0189, "step": 231 }, { "epoch": 0.046082033965637106, "grad_norm": 0.8886861060907549, "learning_rate": 9.993375878943775e-06, "loss": 0.0248, "step": 232 }, { "epoch": 0.04628066342238554, "grad_norm": 1.1111175252979908, "learning_rate": 9.99320927862833e-06, "loss": 0.0207, "step": 233 }, { "epoch": 0.04647929287913397, "grad_norm": 0.351030054160361, "learning_rate": 9.993040610629923e-06, "loss": 0.0169, "step": 234 }, { "epoch": 0.04667792233588241, "grad_norm": 1.3626687901401482, "learning_rate": 9.992869875018398e-06, "loss": 0.0176, "step": 235 }, { "epoch": 0.046876551792630845, "grad_norm": 1.3745225587842778, "learning_rate": 9.992697071864459e-06, "loss": 0.02, "step": 236 }, { "epoch": 0.04707518124937928, "grad_norm": 0.680922388085837, "learning_rate": 9.992522201239661e-06, "loss": 0.0167, "step": 237 }, { "epoch": 0.04727381070612772, "grad_norm": 0.6405373508660324, "learning_rate": 9.99234526321642e-06, "loss": 0.0134, "step": 238 }, { "epoch": 0.047472440162876155, "grad_norm": 1.5525613206844335, "learning_rate": 9.992166257868006e-06, "loss": 0.0244, "step": 239 }, { "epoch": 0.04767106961962459, "grad_norm": 0.5871522946780782, "learning_rate": 9.991985185268543e-06, "loss": 0.0165, "step": 240 }, { "epoch": 0.04786969907637303, "grad_norm": 0.8140407938901834, "learning_rate": 9.991802045493013e-06, "loss": 0.0108, "step": 241 }, { "epoch": 0.048068328533121464, "grad_norm": 1.7987643886031939, "learning_rate": 9.991616838617255e-06, "loss": 0.0247, "step": 242 }, { "epoch": 0.0482669579898699, "grad_norm": 1.4927221184896913, "learning_rate": 9.991429564717964e-06, "loss": 0.0243, "step": 243 }, { "epoch": 0.04846558744661833, "grad_norm": 1.1469800583144492, "learning_rate": 9.99124022387269e-06, "loss": 0.0265, "step": 244 }, { "epoch": 0.04866421690336677, "grad_norm": 0.6033563935180898, "learning_rate": 9.991048816159834e-06, "loss": 0.0211, "step": 245 }, { "epoch": 0.0488628463601152, "grad_norm": 0.9368709020872853, "learning_rate": 9.990855341658662e-06, "loss": 0.023, "step": 246 }, { "epoch": 0.04906147581686364, "grad_norm": 0.8399028095373245, "learning_rate": 9.990659800449293e-06, "loss": 0.0123, "step": 247 }, { "epoch": 0.049260105273612076, "grad_norm": 0.4780920805092138, "learning_rate": 9.990462192612698e-06, "loss": 0.0197, "step": 248 }, { "epoch": 0.04945873473036051, "grad_norm": 0.7218014814045688, "learning_rate": 9.990262518230706e-06, "loss": 0.0213, "step": 249 }, { "epoch": 0.04965736418710895, "grad_norm": 1.1716975326112173, "learning_rate": 9.990060777386004e-06, "loss": 0.0206, "step": 250 }, { "epoch": 0.049855993643857385, "grad_norm": 0.4751820690979132, "learning_rate": 9.98985697016213e-06, "loss": 0.026, "step": 251 }, { "epoch": 0.05005462310060582, "grad_norm": 0.9003900691985227, "learning_rate": 9.989651096643482e-06, "loss": 0.0128, "step": 252 }, { "epoch": 0.05025325255735426, "grad_norm": 0.9111928166207327, "learning_rate": 9.98944315691531e-06, "loss": 0.0232, "step": 253 }, { "epoch": 0.050451882014102695, "grad_norm": 0.5545254300785284, "learning_rate": 9.989233151063726e-06, "loss": 0.0172, "step": 254 }, { "epoch": 0.050650511470851124, "grad_norm": 0.40224669906766397, "learning_rate": 9.98902107917569e-06, "loss": 0.0233, "step": 255 }, { "epoch": 0.05084914092759956, "grad_norm": 0.5824400395204213, "learning_rate": 9.98880694133902e-06, "loss": 0.0217, "step": 256 }, { "epoch": 0.051047770384348, "grad_norm": 1.1709383260744466, "learning_rate": 9.988590737642392e-06, "loss": 0.0135, "step": 257 }, { "epoch": 0.051246399841096434, "grad_norm": 1.5439449052427077, "learning_rate": 9.988372468175335e-06, "loss": 0.0271, "step": 258 }, { "epoch": 0.05144502929784487, "grad_norm": 0.8457319931931899, "learning_rate": 9.988152133028234e-06, "loss": 0.024, "step": 259 }, { "epoch": 0.051643658754593307, "grad_norm": 1.3872763327296354, "learning_rate": 9.98792973229233e-06, "loss": 0.0215, "step": 260 }, { "epoch": 0.05184228821134174, "grad_norm": 2.1654406629105702, "learning_rate": 9.987705266059721e-06, "loss": 0.0227, "step": 261 }, { "epoch": 0.05204091766809018, "grad_norm": 1.5458480467902418, "learning_rate": 9.987478734423355e-06, "loss": 0.0218, "step": 262 }, { "epoch": 0.052239547124838616, "grad_norm": 0.37405484795190086, "learning_rate": 9.987250137477039e-06, "loss": 0.0173, "step": 263 }, { "epoch": 0.05243817658158705, "grad_norm": 0.7336264315402546, "learning_rate": 9.987019475315437e-06, "loss": 0.018, "step": 264 }, { "epoch": 0.05263680603833548, "grad_norm": 1.2158389596763395, "learning_rate": 9.986786748034061e-06, "loss": 0.0147, "step": 265 }, { "epoch": 0.05283543549508392, "grad_norm": 1.130772342954925, "learning_rate": 9.986551955729288e-06, "loss": 0.0263, "step": 266 }, { "epoch": 0.053034064951832355, "grad_norm": 0.6317791909189016, "learning_rate": 9.986315098498345e-06, "loss": 0.0202, "step": 267 }, { "epoch": 0.05323269440858079, "grad_norm": 0.44226726309173453, "learning_rate": 9.986076176439313e-06, "loss": 0.0184, "step": 268 }, { "epoch": 0.05343132386532923, "grad_norm": 0.5474445679262542, "learning_rate": 9.98583518965113e-06, "loss": 0.0095, "step": 269 }, { "epoch": 0.053629953322077664, "grad_norm": 1.7508003601141047, "learning_rate": 9.985592138233586e-06, "loss": 0.0247, "step": 270 }, { "epoch": 0.0538285827788261, "grad_norm": 1.5983209020045153, "learning_rate": 9.98534702228733e-06, "loss": 0.0259, "step": 271 }, { "epoch": 0.05402721223557454, "grad_norm": 1.7251758674302107, "learning_rate": 9.985099841913867e-06, "loss": 0.0107, "step": 272 }, { "epoch": 0.054225841692322974, "grad_norm": 0.4747190048835185, "learning_rate": 9.984850597215551e-06, "loss": 0.0135, "step": 273 }, { "epoch": 0.05442447114907141, "grad_norm": 0.9285709587358324, "learning_rate": 9.984599288295593e-06, "loss": 0.017, "step": 274 }, { "epoch": 0.054623100605819846, "grad_norm": 1.6254232235138268, "learning_rate": 9.984345915258063e-06, "loss": 0.0177, "step": 275 }, { "epoch": 0.054821730062568276, "grad_norm": 0.8282505450106532, "learning_rate": 9.98409047820788e-06, "loss": 0.0188, "step": 276 }, { "epoch": 0.05502035951931671, "grad_norm": 1.10441200036311, "learning_rate": 9.983832977250822e-06, "loss": 0.0334, "step": 277 }, { "epoch": 0.05521898897606515, "grad_norm": 0.4476905849331348, "learning_rate": 9.983573412493519e-06, "loss": 0.0123, "step": 278 }, { "epoch": 0.055417618432813585, "grad_norm": 0.41407971370056407, "learning_rate": 9.983311784043457e-06, "loss": 0.0163, "step": 279 }, { "epoch": 0.05561624788956202, "grad_norm": 1.733780241117457, "learning_rate": 9.983048092008973e-06, "loss": 0.0204, "step": 280 }, { "epoch": 0.05581487734631046, "grad_norm": 0.7561713460915898, "learning_rate": 9.982782336499267e-06, "loss": 0.0191, "step": 281 }, { "epoch": 0.056013506803058895, "grad_norm": 0.8952250678333094, "learning_rate": 9.982514517624385e-06, "loss": 0.0195, "step": 282 }, { "epoch": 0.05621213625980733, "grad_norm": 0.33837803654892235, "learning_rate": 9.982244635495232e-06, "loss": 0.0128, "step": 283 }, { "epoch": 0.05641076571655577, "grad_norm": 0.3491614108577355, "learning_rate": 9.981972690223561e-06, "loss": 0.0104, "step": 284 }, { "epoch": 0.056609395173304204, "grad_norm": 0.3885685718301126, "learning_rate": 9.98169868192199e-06, "loss": 0.014, "step": 285 }, { "epoch": 0.056808024630052634, "grad_norm": 1.907227579601552, "learning_rate": 9.981422610703983e-06, "loss": 0.0226, "step": 286 }, { "epoch": 0.05700665408680107, "grad_norm": 0.6383147141508525, "learning_rate": 9.981144476683863e-06, "loss": 0.0235, "step": 287 }, { "epoch": 0.05720528354354951, "grad_norm": 1.0623634554315362, "learning_rate": 9.980864279976803e-06, "loss": 0.0199, "step": 288 }, { "epoch": 0.05740391300029794, "grad_norm": 0.5121142099458341, "learning_rate": 9.980582020698832e-06, "loss": 0.0161, "step": 289 }, { "epoch": 0.05760254245704638, "grad_norm": 0.8603769100451026, "learning_rate": 9.980297698966835e-06, "loss": 0.0124, "step": 290 }, { "epoch": 0.057801171913794816, "grad_norm": 1.3969334526557426, "learning_rate": 9.980011314898546e-06, "loss": 0.0175, "step": 291 }, { "epoch": 0.05799980137054325, "grad_norm": 0.5708269650845633, "learning_rate": 9.97972286861256e-06, "loss": 0.0148, "step": 292 }, { "epoch": 0.05819843082729169, "grad_norm": 0.6229104672334838, "learning_rate": 9.979432360228322e-06, "loss": 0.0168, "step": 293 }, { "epoch": 0.058397060284040125, "grad_norm": 0.615731320985735, "learning_rate": 9.979139789866129e-06, "loss": 0.0134, "step": 294 }, { "epoch": 0.05859568974078856, "grad_norm": 0.625395130998787, "learning_rate": 9.978845157647136e-06, "loss": 0.0097, "step": 295 }, { "epoch": 0.05879431919753699, "grad_norm": 0.8790404848653711, "learning_rate": 9.978548463693348e-06, "loss": 0.0166, "step": 296 }, { "epoch": 0.05899294865428543, "grad_norm": 1.4772807736065832, "learning_rate": 9.978249708127627e-06, "loss": 0.0123, "step": 297 }, { "epoch": 0.059191578111033864, "grad_norm": 0.5627615829863155, "learning_rate": 9.977948891073688e-06, "loss": 0.0151, "step": 298 }, { "epoch": 0.0593902075677823, "grad_norm": 0.917332306356399, "learning_rate": 9.977646012656099e-06, "loss": 0.0151, "step": 299 }, { "epoch": 0.05958883702453074, "grad_norm": 0.8670957566639708, "learning_rate": 9.977341073000278e-06, "loss": 0.0167, "step": 300 }, { "epoch": 0.059787466481279174, "grad_norm": 1.1414866808620485, "learning_rate": 9.977034072232506e-06, "loss": 0.0203, "step": 301 }, { "epoch": 0.05998609593802761, "grad_norm": 1.1251288241040425, "learning_rate": 9.976725010479907e-06, "loss": 0.0194, "step": 302 }, { "epoch": 0.06018472539477605, "grad_norm": 0.683122296823212, "learning_rate": 9.976413887870466e-06, "loss": 0.016, "step": 303 }, { "epoch": 0.06038335485152448, "grad_norm": 0.8996616395705849, "learning_rate": 9.976100704533018e-06, "loss": 0.0223, "step": 304 }, { "epoch": 0.06058198430827292, "grad_norm": 0.6428548080123838, "learning_rate": 9.97578546059725e-06, "loss": 0.015, "step": 305 }, { "epoch": 0.060780613765021356, "grad_norm": 1.940990193850758, "learning_rate": 9.975468156193706e-06, "loss": 0.028, "step": 306 }, { "epoch": 0.060979243221769786, "grad_norm": 1.176442185876376, "learning_rate": 9.975148791453781e-06, "loss": 0.021, "step": 307 }, { "epoch": 0.06117787267851822, "grad_norm": 0.5811349856371228, "learning_rate": 9.974827366509725e-06, "loss": 0.0164, "step": 308 }, { "epoch": 0.06137650213526666, "grad_norm": 0.668480148637617, "learning_rate": 9.974503881494638e-06, "loss": 0.0144, "step": 309 }, { "epoch": 0.061575131592015095, "grad_norm": 1.1370989077273732, "learning_rate": 9.974178336542473e-06, "loss": 0.0163, "step": 310 }, { "epoch": 0.06177376104876353, "grad_norm": 1.2252608949279702, "learning_rate": 9.973850731788041e-06, "loss": 0.028, "step": 311 }, { "epoch": 0.06197239050551197, "grad_norm": 1.091045811315918, "learning_rate": 9.973521067367005e-06, "loss": 0.0194, "step": 312 }, { "epoch": 0.062171019962260404, "grad_norm": 0.5664431447755225, "learning_rate": 9.973189343415872e-06, "loss": 0.0207, "step": 313 }, { "epoch": 0.06236964941900884, "grad_norm": 0.7809890625347147, "learning_rate": 9.972855560072014e-06, "loss": 0.0186, "step": 314 }, { "epoch": 0.06256827887575728, "grad_norm": 0.4914092006339943, "learning_rate": 9.972519717473647e-06, "loss": 0.0208, "step": 315 }, { "epoch": 0.06276690833250571, "grad_norm": 1.352749398041195, "learning_rate": 9.972181815759848e-06, "loss": 0.0178, "step": 316 }, { "epoch": 0.06296553778925415, "grad_norm": 1.6842469979290473, "learning_rate": 9.971841855070535e-06, "loss": 0.0212, "step": 317 }, { "epoch": 0.06316416724600259, "grad_norm": 1.181704020131392, "learning_rate": 9.97149983554649e-06, "loss": 0.0249, "step": 318 }, { "epoch": 0.06336279670275102, "grad_norm": 0.47050532143708484, "learning_rate": 9.971155757329341e-06, "loss": 0.0171, "step": 319 }, { "epoch": 0.06356142615949946, "grad_norm": 0.8319540289077562, "learning_rate": 9.970809620561573e-06, "loss": 0.0197, "step": 320 }, { "epoch": 0.0637600556162479, "grad_norm": 2.00049297860715, "learning_rate": 9.970461425386518e-06, "loss": 0.0233, "step": 321 }, { "epoch": 0.06395868507299632, "grad_norm": 0.6876204488927999, "learning_rate": 9.970111171948362e-06, "loss": 0.0218, "step": 322 }, { "epoch": 0.06415731452974476, "grad_norm": 1.1508387565108984, "learning_rate": 9.969758860392148e-06, "loss": 0.0254, "step": 323 }, { "epoch": 0.06435594398649319, "grad_norm": 1.0175823754757551, "learning_rate": 9.96940449086377e-06, "loss": 0.0207, "step": 324 }, { "epoch": 0.06455457344324163, "grad_norm": 1.342558930748065, "learning_rate": 9.969048063509965e-06, "loss": 0.0166, "step": 325 }, { "epoch": 0.06475320289999006, "grad_norm": 1.618591294235281, "learning_rate": 9.968689578478334e-06, "loss": 0.017, "step": 326 }, { "epoch": 0.0649518323567385, "grad_norm": 0.8029753682738998, "learning_rate": 9.968329035917326e-06, "loss": 0.0134, "step": 327 }, { "epoch": 0.06515046181348694, "grad_norm": 1.4258924678587896, "learning_rate": 9.967966435976237e-06, "loss": 0.0209, "step": 328 }, { "epoch": 0.06534909127023537, "grad_norm": 0.6264481978575003, "learning_rate": 9.967601778805225e-06, "loss": 0.022, "step": 329 }, { "epoch": 0.06554772072698381, "grad_norm": 0.9417193977601217, "learning_rate": 9.967235064555289e-06, "loss": 0.0182, "step": 330 }, { "epoch": 0.06574635018373225, "grad_norm": 1.4600362140153005, "learning_rate": 9.966866293378287e-06, "loss": 0.0169, "step": 331 }, { "epoch": 0.06594497964048068, "grad_norm": 0.6608946589876135, "learning_rate": 9.966495465426927e-06, "loss": 0.0166, "step": 332 }, { "epoch": 0.06614360909722912, "grad_norm": 0.5334956998951328, "learning_rate": 9.96612258085477e-06, "loss": 0.0106, "step": 333 }, { "epoch": 0.06634223855397756, "grad_norm": 0.7085312138032311, "learning_rate": 9.965747639816224e-06, "loss": 0.0147, "step": 334 }, { "epoch": 0.06654086801072599, "grad_norm": 1.5381074566151718, "learning_rate": 9.965370642466551e-06, "loss": 0.0139, "step": 335 }, { "epoch": 0.06673949746747443, "grad_norm": 0.6811831504303671, "learning_rate": 9.96499158896187e-06, "loss": 0.0238, "step": 336 }, { "epoch": 0.06693812692422287, "grad_norm": 1.5626101417205556, "learning_rate": 9.964610479459144e-06, "loss": 0.0229, "step": 337 }, { "epoch": 0.0671367563809713, "grad_norm": 0.5600383522887019, "learning_rate": 9.964227314116191e-06, "loss": 0.0171, "step": 338 }, { "epoch": 0.06733538583771974, "grad_norm": 0.8665657897799597, "learning_rate": 9.963842093091678e-06, "loss": 0.0212, "step": 339 }, { "epoch": 0.06753401529446817, "grad_norm": 0.6444012965528136, "learning_rate": 9.963454816545124e-06, "loss": 0.0162, "step": 340 }, { "epoch": 0.06773264475121661, "grad_norm": 1.0179480233604894, "learning_rate": 9.963065484636901e-06, "loss": 0.0159, "step": 341 }, { "epoch": 0.06793127420796505, "grad_norm": 0.33885609792917437, "learning_rate": 9.962674097528232e-06, "loss": 0.0105, "step": 342 }, { "epoch": 0.06812990366471347, "grad_norm": 0.31682099229115473, "learning_rate": 9.962280655381189e-06, "loss": 0.0139, "step": 343 }, { "epoch": 0.0683285331214619, "grad_norm": 1.261532302914878, "learning_rate": 9.961885158358696e-06, "loss": 0.013, "step": 344 }, { "epoch": 0.06852716257821034, "grad_norm": 1.6126011541924115, "learning_rate": 9.961487606624526e-06, "loss": 0.021, "step": 345 }, { "epoch": 0.06872579203495878, "grad_norm": 0.5677592432568079, "learning_rate": 9.961088000343308e-06, "loss": 0.0182, "step": 346 }, { "epoch": 0.06892442149170722, "grad_norm": 0.6292493886004704, "learning_rate": 9.960686339680518e-06, "loss": 0.0177, "step": 347 }, { "epoch": 0.06912305094845565, "grad_norm": 1.9905966048832493, "learning_rate": 9.960282624802482e-06, "loss": 0.0206, "step": 348 }, { "epoch": 0.06932168040520409, "grad_norm": 0.6186635733129662, "learning_rate": 9.959876855876378e-06, "loss": 0.0162, "step": 349 }, { "epoch": 0.06952030986195253, "grad_norm": 1.9229044356070553, "learning_rate": 9.959469033070235e-06, "loss": 0.0248, "step": 350 }, { "epoch": 0.06971893931870096, "grad_norm": 1.3711744907425025, "learning_rate": 9.959059156552931e-06, "loss": 0.0235, "step": 351 }, { "epoch": 0.0699175687754494, "grad_norm": 0.39607802774515655, "learning_rate": 9.958647226494198e-06, "loss": 0.0132, "step": 352 }, { "epoch": 0.07011619823219783, "grad_norm": 1.7233078150248982, "learning_rate": 9.958233243064614e-06, "loss": 0.0168, "step": 353 }, { "epoch": 0.07031482768894627, "grad_norm": 1.6165097055887545, "learning_rate": 9.95781720643561e-06, "loss": 0.022, "step": 354 }, { "epoch": 0.07051345714569471, "grad_norm": 1.6300475923467623, "learning_rate": 9.957399116779463e-06, "loss": 0.0155, "step": 355 }, { "epoch": 0.07071208660244314, "grad_norm": 1.3293283054851412, "learning_rate": 9.95697897426931e-06, "loss": 0.0251, "step": 356 }, { "epoch": 0.07091071605919158, "grad_norm": 0.7311582517038507, "learning_rate": 9.956556779079124e-06, "loss": 0.0201, "step": 357 }, { "epoch": 0.07110934551594002, "grad_norm": 1.0352583458625484, "learning_rate": 9.956132531383741e-06, "loss": 0.0179, "step": 358 }, { "epoch": 0.07130797497268845, "grad_norm": 0.6749175905417537, "learning_rate": 9.95570623135884e-06, "loss": 0.0122, "step": 359 }, { "epoch": 0.07150660442943689, "grad_norm": 0.7936842430096778, "learning_rate": 9.955277879180951e-06, "loss": 0.0156, "step": 360 }, { "epoch": 0.07170523388618533, "grad_norm": 0.6066214646810274, "learning_rate": 9.954847475027456e-06, "loss": 0.0257, "step": 361 }, { "epoch": 0.07190386334293376, "grad_norm": 0.5483587007588743, "learning_rate": 9.954415019076581e-06, "loss": 0.0195, "step": 362 }, { "epoch": 0.0721024927996822, "grad_norm": 1.8751031257546584, "learning_rate": 9.95398051150741e-06, "loss": 0.0263, "step": 363 }, { "epoch": 0.07230112225643062, "grad_norm": 0.5121194780452556, "learning_rate": 9.95354395249987e-06, "loss": 0.014, "step": 364 }, { "epoch": 0.07249975171317906, "grad_norm": 0.4817927132351728, "learning_rate": 9.95310534223474e-06, "loss": 0.0122, "step": 365 }, { "epoch": 0.0726983811699275, "grad_norm": 0.9420406818271349, "learning_rate": 9.952664680893647e-06, "loss": 0.0265, "step": 366 }, { "epoch": 0.07289701062667593, "grad_norm": 0.700810731765772, "learning_rate": 9.95222196865907e-06, "loss": 0.0224, "step": 367 }, { "epoch": 0.07309564008342437, "grad_norm": 1.1831981421496827, "learning_rate": 9.951777205714335e-06, "loss": 0.0157, "step": 368 }, { "epoch": 0.0732942695401728, "grad_norm": 0.844174382418765, "learning_rate": 9.951330392243619e-06, "loss": 0.0208, "step": 369 }, { "epoch": 0.07349289899692124, "grad_norm": 0.6648604009959039, "learning_rate": 9.950881528431944e-06, "loss": 0.0201, "step": 370 }, { "epoch": 0.07369152845366968, "grad_norm": 0.9101607178957163, "learning_rate": 9.950430614465187e-06, "loss": 0.0238, "step": 371 }, { "epoch": 0.07389015791041811, "grad_norm": 1.1877944371561115, "learning_rate": 9.94997765053007e-06, "loss": 0.0229, "step": 372 }, { "epoch": 0.07408878736716655, "grad_norm": 1.1414413397286445, "learning_rate": 9.949522636814166e-06, "loss": 0.025, "step": 373 }, { "epoch": 0.07428741682391499, "grad_norm": 1.0250917508528674, "learning_rate": 9.949065573505894e-06, "loss": 0.0223, "step": 374 }, { "epoch": 0.07448604628066342, "grad_norm": 1.1184758483425372, "learning_rate": 9.948606460794524e-06, "loss": 0.0164, "step": 375 }, { "epoch": 0.07468467573741186, "grad_norm": 1.191964024912497, "learning_rate": 9.948145298870173e-06, "loss": 0.0218, "step": 376 }, { "epoch": 0.0748833051941603, "grad_norm": 1.0225207706121757, "learning_rate": 9.94768208792381e-06, "loss": 0.014, "step": 377 }, { "epoch": 0.07508193465090873, "grad_norm": 0.8774375657656637, "learning_rate": 9.947216828147249e-06, "loss": 0.0187, "step": 378 }, { "epoch": 0.07528056410765717, "grad_norm": 0.3736459392817161, "learning_rate": 9.946749519733155e-06, "loss": 0.0086, "step": 379 }, { "epoch": 0.0754791935644056, "grad_norm": 0.8238665843850358, "learning_rate": 9.946280162875036e-06, "loss": 0.0157, "step": 380 }, { "epoch": 0.07567782302115404, "grad_norm": 0.45936683210469975, "learning_rate": 9.945808757767256e-06, "loss": 0.0099, "step": 381 }, { "epoch": 0.07587645247790248, "grad_norm": 2.0051899829645343, "learning_rate": 9.945335304605023e-06, "loss": 0.023, "step": 382 }, { "epoch": 0.07607508193465091, "grad_norm": 0.6435406420149826, "learning_rate": 9.944859803584392e-06, "loss": 0.0114, "step": 383 }, { "epoch": 0.07627371139139935, "grad_norm": 0.9399241604433789, "learning_rate": 9.94438225490227e-06, "loss": 0.0127, "step": 384 }, { "epoch": 0.07647234084814777, "grad_norm": 0.6944344878094227, "learning_rate": 9.943902658756405e-06, "loss": 0.0197, "step": 385 }, { "epoch": 0.07667097030489621, "grad_norm": 0.551483947123314, "learning_rate": 9.9434210153454e-06, "loss": 0.009, "step": 386 }, { "epoch": 0.07686959976164465, "grad_norm": 1.5968174034926244, "learning_rate": 9.942937324868706e-06, "loss": 0.0233, "step": 387 }, { "epoch": 0.07706822921839308, "grad_norm": 0.5848570587556811, "learning_rate": 9.942451587526614e-06, "loss": 0.0145, "step": 388 }, { "epoch": 0.07726685867514152, "grad_norm": 0.9128747168133394, "learning_rate": 9.94196380352027e-06, "loss": 0.0227, "step": 389 }, { "epoch": 0.07746548813188996, "grad_norm": 1.7264670251438237, "learning_rate": 9.941473973051662e-06, "loss": 0.0295, "step": 390 }, { "epoch": 0.07766411758863839, "grad_norm": 0.4937037314512115, "learning_rate": 9.94098209632363e-06, "loss": 0.0097, "step": 391 }, { "epoch": 0.07786274704538683, "grad_norm": 0.775354918247013, "learning_rate": 9.940488173539863e-06, "loss": 0.0211, "step": 392 }, { "epoch": 0.07806137650213527, "grad_norm": 0.7131682892913337, "learning_rate": 9.93999220490489e-06, "loss": 0.0199, "step": 393 }, { "epoch": 0.0782600059588837, "grad_norm": 0.6154638513132034, "learning_rate": 9.93949419062409e-06, "loss": 0.0132, "step": 394 }, { "epoch": 0.07845863541563214, "grad_norm": 1.1169763475776913, "learning_rate": 9.938994130903693e-06, "loss": 0.0217, "step": 395 }, { "epoch": 0.07865726487238058, "grad_norm": 0.8043725514476527, "learning_rate": 9.938492025950772e-06, "loss": 0.0095, "step": 396 }, { "epoch": 0.07885589432912901, "grad_norm": 1.3173221094506762, "learning_rate": 9.937987875973249e-06, "loss": 0.0171, "step": 397 }, { "epoch": 0.07905452378587745, "grad_norm": 0.5812582606522358, "learning_rate": 9.937481681179892e-06, "loss": 0.0171, "step": 398 }, { "epoch": 0.07925315324262588, "grad_norm": 0.2793393345374467, "learning_rate": 9.936973441780316e-06, "loss": 0.0135, "step": 399 }, { "epoch": 0.07945178269937432, "grad_norm": 0.34243922533366083, "learning_rate": 9.936463157984981e-06, "loss": 0.014, "step": 400 }, { "epoch": 0.07965041215612276, "grad_norm": 0.3322421246762658, "learning_rate": 9.935950830005197e-06, "loss": 0.014, "step": 401 }, { "epoch": 0.0798490416128712, "grad_norm": 0.3204643392343696, "learning_rate": 9.935436458053115e-06, "loss": 0.0117, "step": 402 }, { "epoch": 0.08004767106961963, "grad_norm": 0.47957220575662524, "learning_rate": 9.934920042341739e-06, "loss": 0.0101, "step": 403 }, { "epoch": 0.08024630052636807, "grad_norm": 0.5433074802734997, "learning_rate": 9.934401583084916e-06, "loss": 0.0133, "step": 404 }, { "epoch": 0.08044492998311649, "grad_norm": 0.983630868002418, "learning_rate": 9.933881080497339e-06, "loss": 0.0128, "step": 405 }, { "epoch": 0.08064355943986493, "grad_norm": 0.8766875713340728, "learning_rate": 9.933358534794547e-06, "loss": 0.0188, "step": 406 }, { "epoch": 0.08084218889661336, "grad_norm": 0.7878723380271294, "learning_rate": 9.932833946192925e-06, "loss": 0.0144, "step": 407 }, { "epoch": 0.0810408183533618, "grad_norm": 0.9810561422094143, "learning_rate": 9.932307314909708e-06, "loss": 0.0207, "step": 408 }, { "epoch": 0.08123944781011024, "grad_norm": 0.7377525283562892, "learning_rate": 9.93177864116297e-06, "loss": 0.0121, "step": 409 }, { "epoch": 0.08143807726685867, "grad_norm": 0.46490373025726456, "learning_rate": 9.931247925171636e-06, "loss": 0.0069, "step": 410 }, { "epoch": 0.08163670672360711, "grad_norm": 0.44936367192605653, "learning_rate": 9.930715167155473e-06, "loss": 0.0107, "step": 411 }, { "epoch": 0.08183533618035554, "grad_norm": 1.0432452279852802, "learning_rate": 9.930180367335098e-06, "loss": 0.0079, "step": 412 }, { "epoch": 0.08203396563710398, "grad_norm": 1.8118427025754065, "learning_rate": 9.929643525931971e-06, "loss": 0.0209, "step": 413 }, { "epoch": 0.08223259509385242, "grad_norm": 2.1277356119975, "learning_rate": 9.929104643168396e-06, "loss": 0.0196, "step": 414 }, { "epoch": 0.08243122455060085, "grad_norm": 1.0672236447535246, "learning_rate": 9.928563719267525e-06, "loss": 0.0251, "step": 415 }, { "epoch": 0.08262985400734929, "grad_norm": 0.6372995337769211, "learning_rate": 9.92802075445335e-06, "loss": 0.011, "step": 416 }, { "epoch": 0.08282848346409773, "grad_norm": 2.6818339589471565, "learning_rate": 9.92747574895072e-06, "loss": 0.0234, "step": 417 }, { "epoch": 0.08302711292084616, "grad_norm": 1.8534077650526866, "learning_rate": 9.926928702985314e-06, "loss": 0.0191, "step": 418 }, { "epoch": 0.0832257423775946, "grad_norm": 0.6069645126286759, "learning_rate": 9.926379616783667e-06, "loss": 0.0108, "step": 419 }, { "epoch": 0.08342437183434304, "grad_norm": 1.3113122989349562, "learning_rate": 9.925828490573156e-06, "loss": 0.0207, "step": 420 }, { "epoch": 0.08362300129109147, "grad_norm": 0.5645438911441669, "learning_rate": 9.925275324581998e-06, "loss": 0.0099, "step": 421 }, { "epoch": 0.08382163074783991, "grad_norm": 0.5245065449924216, "learning_rate": 9.92472011903926e-06, "loss": 0.0117, "step": 422 }, { "epoch": 0.08402026020458835, "grad_norm": 0.851720051198573, "learning_rate": 9.924162874174854e-06, "loss": 0.0196, "step": 423 }, { "epoch": 0.08421888966133678, "grad_norm": 0.6220395812315587, "learning_rate": 9.923603590219531e-06, "loss": 0.0207, "step": 424 }, { "epoch": 0.08441751911808522, "grad_norm": 0.8302098723285082, "learning_rate": 9.923042267404893e-06, "loss": 0.0144, "step": 425 }, { "epoch": 0.08461614857483364, "grad_norm": 0.44409974197466384, "learning_rate": 9.922478905963383e-06, "loss": 0.0139, "step": 426 }, { "epoch": 0.08481477803158208, "grad_norm": 1.3240937379268272, "learning_rate": 9.921913506128286e-06, "loss": 0.0154, "step": 427 }, { "epoch": 0.08501340748833051, "grad_norm": 0.526570789542041, "learning_rate": 9.921346068133736e-06, "loss": 0.015, "step": 428 }, { "epoch": 0.08521203694507895, "grad_norm": 0.35386778919766676, "learning_rate": 9.920776592214707e-06, "loss": 0.0123, "step": 429 }, { "epoch": 0.08541066640182739, "grad_norm": 0.4338611448698274, "learning_rate": 9.92020507860702e-06, "loss": 0.0107, "step": 430 }, { "epoch": 0.08560929585857582, "grad_norm": 1.170842642025001, "learning_rate": 9.919631527547336e-06, "loss": 0.0151, "step": 431 }, { "epoch": 0.08580792531532426, "grad_norm": 0.7363695886885039, "learning_rate": 9.919055939273163e-06, "loss": 0.0152, "step": 432 }, { "epoch": 0.0860065547720727, "grad_norm": 1.019101053836205, "learning_rate": 9.918478314022852e-06, "loss": 0.0232, "step": 433 }, { "epoch": 0.08620518422882113, "grad_norm": 0.782125020982536, "learning_rate": 9.917898652035595e-06, "loss": 0.0244, "step": 434 }, { "epoch": 0.08640381368556957, "grad_norm": 0.6310871428581489, "learning_rate": 9.917316953551434e-06, "loss": 0.0225, "step": 435 }, { "epoch": 0.086602443142318, "grad_norm": 0.43083561752173805, "learning_rate": 9.916733218811246e-06, "loss": 0.0136, "step": 436 }, { "epoch": 0.08680107259906644, "grad_norm": 0.8104722545750283, "learning_rate": 9.916147448056755e-06, "loss": 0.0144, "step": 437 }, { "epoch": 0.08699970205581488, "grad_norm": 0.4778684037166286, "learning_rate": 9.91555964153053e-06, "loss": 0.0128, "step": 438 }, { "epoch": 0.08719833151256332, "grad_norm": 0.8825546091074031, "learning_rate": 9.914969799475978e-06, "loss": 0.0145, "step": 439 }, { "epoch": 0.08739696096931175, "grad_norm": 0.600585587690455, "learning_rate": 9.914377922137356e-06, "loss": 0.0114, "step": 440 }, { "epoch": 0.08759559042606019, "grad_norm": 0.9380929174319077, "learning_rate": 9.913784009759757e-06, "loss": 0.0184, "step": 441 }, { "epoch": 0.08779421988280862, "grad_norm": 0.7171398264697423, "learning_rate": 9.91318806258912e-06, "loss": 0.0187, "step": 442 }, { "epoch": 0.08799284933955706, "grad_norm": 0.7288666746333805, "learning_rate": 9.912590080872227e-06, "loss": 0.017, "step": 443 }, { "epoch": 0.0881914787963055, "grad_norm": 0.8521776900166558, "learning_rate": 9.911990064856703e-06, "loss": 0.0223, "step": 444 }, { "epoch": 0.08839010825305393, "grad_norm": 0.6528440880111782, "learning_rate": 9.91138801479101e-06, "loss": 0.0183, "step": 445 }, { "epoch": 0.08858873770980237, "grad_norm": 0.4836930626793002, "learning_rate": 9.910783930924458e-06, "loss": 0.0098, "step": 446 }, { "epoch": 0.08878736716655079, "grad_norm": 0.998927006114887, "learning_rate": 9.910177813507198e-06, "loss": 0.014, "step": 447 }, { "epoch": 0.08898599662329923, "grad_norm": 0.8181775259423208, "learning_rate": 9.909569662790224e-06, "loss": 0.0192, "step": 448 }, { "epoch": 0.08918462608004767, "grad_norm": 0.5583401583915387, "learning_rate": 9.908959479025369e-06, "loss": 0.0119, "step": 449 }, { "epoch": 0.0893832555367961, "grad_norm": 0.8838426600045396, "learning_rate": 9.908347262465308e-06, "loss": 0.0213, "step": 450 }, { "epoch": 0.08958188499354454, "grad_norm": 0.7498887472731395, "learning_rate": 9.907733013363563e-06, "loss": 0.0142, "step": 451 }, { "epoch": 0.08978051445029298, "grad_norm": 0.8966575656361611, "learning_rate": 9.907116731974491e-06, "loss": 0.0193, "step": 452 }, { "epoch": 0.08997914390704141, "grad_norm": 0.5073403960586594, "learning_rate": 9.906498418553295e-06, "loss": 0.0137, "step": 453 }, { "epoch": 0.09017777336378985, "grad_norm": 1.1137939580467742, "learning_rate": 9.905878073356015e-06, "loss": 0.0192, "step": 454 }, { "epoch": 0.09037640282053828, "grad_norm": 1.1344716602210057, "learning_rate": 9.90525569663954e-06, "loss": 0.0161, "step": 455 }, { "epoch": 0.09057503227728672, "grad_norm": 0.7220695206586469, "learning_rate": 9.904631288661593e-06, "loss": 0.0182, "step": 456 }, { "epoch": 0.09077366173403516, "grad_norm": 0.5923800565042013, "learning_rate": 9.904004849680741e-06, "loss": 0.0174, "step": 457 }, { "epoch": 0.0909722911907836, "grad_norm": 1.0827911084412167, "learning_rate": 9.903376379956392e-06, "loss": 0.0129, "step": 458 }, { "epoch": 0.09117092064753203, "grad_norm": 0.5158773883497091, "learning_rate": 9.902745879748793e-06, "loss": 0.0159, "step": 459 }, { "epoch": 0.09136955010428047, "grad_norm": 1.0459539619692417, "learning_rate": 9.902113349319035e-06, "loss": 0.0253, "step": 460 }, { "epoch": 0.0915681795610289, "grad_norm": 1.2859832032862175, "learning_rate": 9.90147878892905e-06, "loss": 0.0192, "step": 461 }, { "epoch": 0.09176680901777734, "grad_norm": 0.5360090137854244, "learning_rate": 9.900842198841606e-06, "loss": 0.0153, "step": 462 }, { "epoch": 0.09196543847452578, "grad_norm": 0.4013944962969106, "learning_rate": 9.900203579320316e-06, "loss": 0.0154, "step": 463 }, { "epoch": 0.09216406793127421, "grad_norm": 0.41838257115387206, "learning_rate": 9.89956293062963e-06, "loss": 0.0128, "step": 464 }, { "epoch": 0.09236269738802265, "grad_norm": 0.713293197602063, "learning_rate": 9.898920253034841e-06, "loss": 0.0148, "step": 465 }, { "epoch": 0.09256132684477109, "grad_norm": 0.5124949670835264, "learning_rate": 9.898275546802082e-06, "loss": 0.0182, "step": 466 }, { "epoch": 0.09275995630151952, "grad_norm": 0.7347929074207598, "learning_rate": 9.897628812198324e-06, "loss": 0.018, "step": 467 }, { "epoch": 0.09295858575826794, "grad_norm": 0.3702075450464756, "learning_rate": 9.896980049491378e-06, "loss": 0.0153, "step": 468 }, { "epoch": 0.09315721521501638, "grad_norm": 0.8668215363272558, "learning_rate": 9.896329258949898e-06, "loss": 0.0186, "step": 469 }, { "epoch": 0.09335584467176482, "grad_norm": 0.9267636519344175, "learning_rate": 9.895676440843376e-06, "loss": 0.02, "step": 470 }, { "epoch": 0.09355447412851325, "grad_norm": 1.975894079341777, "learning_rate": 9.895021595442143e-06, "loss": 0.0153, "step": 471 }, { "epoch": 0.09375310358526169, "grad_norm": 0.9664233797506984, "learning_rate": 9.894364723017369e-06, "loss": 0.0248, "step": 472 }, { "epoch": 0.09395173304201013, "grad_norm": 0.7298025816829822, "learning_rate": 9.893705823841063e-06, "loss": 0.0136, "step": 473 }, { "epoch": 0.09415036249875856, "grad_norm": 0.5028928969765695, "learning_rate": 9.893044898186077e-06, "loss": 0.0132, "step": 474 }, { "epoch": 0.094348991955507, "grad_norm": 0.5343396584732334, "learning_rate": 9.8923819463261e-06, "loss": 0.0114, "step": 475 }, { "epoch": 0.09454762141225544, "grad_norm": 1.043121510446111, "learning_rate": 9.891716968535655e-06, "loss": 0.0233, "step": 476 }, { "epoch": 0.09474625086900387, "grad_norm": 0.9607158804060207, "learning_rate": 9.891049965090116e-06, "loss": 0.0184, "step": 477 }, { "epoch": 0.09494488032575231, "grad_norm": 2.04441514855444, "learning_rate": 9.890380936265683e-06, "loss": 0.0261, "step": 478 }, { "epoch": 0.09514350978250075, "grad_norm": 0.44891259364731184, "learning_rate": 9.8897098823394e-06, "loss": 0.0161, "step": 479 }, { "epoch": 0.09534213923924918, "grad_norm": 0.20556262152171734, "learning_rate": 9.889036803589154e-06, "loss": 0.0066, "step": 480 }, { "epoch": 0.09554076869599762, "grad_norm": 0.62530012349153, "learning_rate": 9.888361700293663e-06, "loss": 0.013, "step": 481 }, { "epoch": 0.09573939815274606, "grad_norm": 0.5060323442897047, "learning_rate": 9.887684572732486e-06, "loss": 0.0176, "step": 482 }, { "epoch": 0.09593802760949449, "grad_norm": 0.6697767814686015, "learning_rate": 9.887005421186022e-06, "loss": 0.0128, "step": 483 }, { "epoch": 0.09613665706624293, "grad_norm": 0.49988533088265713, "learning_rate": 9.886324245935508e-06, "loss": 0.0123, "step": 484 }, { "epoch": 0.09633528652299136, "grad_norm": 0.5219089565515956, "learning_rate": 9.885641047263017e-06, "loss": 0.0245, "step": 485 }, { "epoch": 0.0965339159797398, "grad_norm": 0.566826196384926, "learning_rate": 9.88495582545146e-06, "loss": 0.0145, "step": 486 }, { "epoch": 0.09673254543648824, "grad_norm": 0.959759790672288, "learning_rate": 9.88426858078459e-06, "loss": 0.0194, "step": 487 }, { "epoch": 0.09693117489323666, "grad_norm": 0.9036100999985963, "learning_rate": 9.88357931354699e-06, "loss": 0.0129, "step": 488 }, { "epoch": 0.0971298043499851, "grad_norm": 0.5669073431377971, "learning_rate": 9.882888024024086e-06, "loss": 0.0093, "step": 489 }, { "epoch": 0.09732843380673353, "grad_norm": 1.0332312565203494, "learning_rate": 9.882194712502142e-06, "loss": 0.0263, "step": 490 }, { "epoch": 0.09752706326348197, "grad_norm": 1.4518945147996904, "learning_rate": 9.881499379268258e-06, "loss": 0.0165, "step": 491 }, { "epoch": 0.0977256927202304, "grad_norm": 0.7165588378220921, "learning_rate": 9.880802024610367e-06, "loss": 0.0261, "step": 492 }, { "epoch": 0.09792432217697884, "grad_norm": 0.33507821178982644, "learning_rate": 9.880102648817249e-06, "loss": 0.018, "step": 493 }, { "epoch": 0.09812295163372728, "grad_norm": 1.0953941169607475, "learning_rate": 9.879401252178508e-06, "loss": 0.0163, "step": 494 }, { "epoch": 0.09832158109047572, "grad_norm": 1.1884751994563965, "learning_rate": 9.878697834984596e-06, "loss": 0.017, "step": 495 }, { "epoch": 0.09852021054722415, "grad_norm": 1.3895562605165783, "learning_rate": 9.877992397526795e-06, "loss": 0.0178, "step": 496 }, { "epoch": 0.09871884000397259, "grad_norm": 1.0077625272176394, "learning_rate": 9.877284940097229e-06, "loss": 0.014, "step": 497 }, { "epoch": 0.09891746946072102, "grad_norm": 1.1176332665051876, "learning_rate": 9.876575462988852e-06, "loss": 0.0176, "step": 498 }, { "epoch": 0.09911609891746946, "grad_norm": 0.9197946583172847, "learning_rate": 9.87586396649546e-06, "loss": 0.0186, "step": 499 }, { "epoch": 0.0993147283742179, "grad_norm": 0.664357412818359, "learning_rate": 9.875150450911682e-06, "loss": 0.0078, "step": 500 }, { "epoch": 0.09951335783096633, "grad_norm": 0.9879849373646336, "learning_rate": 9.874434916532984e-06, "loss": 0.0174, "step": 501 }, { "epoch": 0.09971198728771477, "grad_norm": 0.8015238994270766, "learning_rate": 9.87371736365567e-06, "loss": 0.0182, "step": 502 }, { "epoch": 0.09991061674446321, "grad_norm": 2.5006213146117324, "learning_rate": 9.872997792576876e-06, "loss": 0.0192, "step": 503 }, { "epoch": 0.10010924620121164, "grad_norm": 2.0986832355357263, "learning_rate": 9.872276203594575e-06, "loss": 0.015, "step": 504 }, { "epoch": 0.10030787565796008, "grad_norm": 1.7712689606880594, "learning_rate": 9.871552597007577e-06, "loss": 0.0162, "step": 505 }, { "epoch": 0.10050650511470852, "grad_norm": 0.6358423775925522, "learning_rate": 9.870826973115528e-06, "loss": 0.0088, "step": 506 }, { "epoch": 0.10070513457145695, "grad_norm": 1.2361002395299776, "learning_rate": 9.870099332218908e-06, "loss": 0.0132, "step": 507 }, { "epoch": 0.10090376402820539, "grad_norm": 0.3894121263307642, "learning_rate": 9.869369674619031e-06, "loss": 0.009, "step": 508 }, { "epoch": 0.10110239348495381, "grad_norm": 0.6814889911507336, "learning_rate": 9.868638000618047e-06, "loss": 0.018, "step": 509 }, { "epoch": 0.10130102294170225, "grad_norm": 1.5433233594496414, "learning_rate": 9.867904310518943e-06, "loss": 0.0169, "step": 510 }, { "epoch": 0.10149965239845068, "grad_norm": 2.086256409582492, "learning_rate": 9.867168604625538e-06, "loss": 0.0272, "step": 511 }, { "epoch": 0.10169828185519912, "grad_norm": 1.1008955506156466, "learning_rate": 9.86643088324249e-06, "loss": 0.0197, "step": 512 }, { "epoch": 0.10189691131194756, "grad_norm": 0.6668853742795092, "learning_rate": 9.865691146675286e-06, "loss": 0.014, "step": 513 }, { "epoch": 0.102095540768696, "grad_norm": 0.6058575326394435, "learning_rate": 9.86494939523025e-06, "loss": 0.0107, "step": 514 }, { "epoch": 0.10229417022544443, "grad_norm": 0.9452200105677793, "learning_rate": 9.864205629214542e-06, "loss": 0.0224, "step": 515 }, { "epoch": 0.10249279968219287, "grad_norm": 1.0087103985746844, "learning_rate": 9.863459848936155e-06, "loss": 0.0175, "step": 516 }, { "epoch": 0.1026914291389413, "grad_norm": 1.4576341787329083, "learning_rate": 9.862712054703913e-06, "loss": 0.017, "step": 517 }, { "epoch": 0.10289005859568974, "grad_norm": 1.007654984553146, "learning_rate": 9.861962246827479e-06, "loss": 0.0197, "step": 518 }, { "epoch": 0.10308868805243818, "grad_norm": 1.1768748633973105, "learning_rate": 9.861210425617348e-06, "loss": 0.0146, "step": 519 }, { "epoch": 0.10328731750918661, "grad_norm": 0.5946983343781779, "learning_rate": 9.860456591384849e-06, "loss": 0.0161, "step": 520 }, { "epoch": 0.10348594696593505, "grad_norm": 0.8631523397738572, "learning_rate": 9.85970074444214e-06, "loss": 0.0169, "step": 521 }, { "epoch": 0.10368457642268349, "grad_norm": 0.6524946202638839, "learning_rate": 9.858942885102221e-06, "loss": 0.0231, "step": 522 }, { "epoch": 0.10388320587943192, "grad_norm": 0.5099019794008004, "learning_rate": 9.85818301367892e-06, "loss": 0.0181, "step": 523 }, { "epoch": 0.10408183533618036, "grad_norm": 0.503120599970783, "learning_rate": 9.857421130486897e-06, "loss": 0.0172, "step": 524 }, { "epoch": 0.1042804647929288, "grad_norm": 0.5964107917213637, "learning_rate": 9.85665723584165e-06, "loss": 0.0175, "step": 525 }, { "epoch": 0.10447909424967723, "grad_norm": 0.6031856884274558, "learning_rate": 9.855891330059502e-06, "loss": 0.0174, "step": 526 }, { "epoch": 0.10467772370642567, "grad_norm": 0.47774054717960784, "learning_rate": 9.85512341345762e-06, "loss": 0.02, "step": 527 }, { "epoch": 0.1048763531631741, "grad_norm": 0.47747803803414446, "learning_rate": 9.854353486353994e-06, "loss": 0.0176, "step": 528 }, { "epoch": 0.10507498261992254, "grad_norm": 0.4118060996615957, "learning_rate": 9.853581549067449e-06, "loss": 0.011, "step": 529 }, { "epoch": 0.10527361207667096, "grad_norm": 1.7006846993605238, "learning_rate": 9.852807601917647e-06, "loss": 0.0245, "step": 530 }, { "epoch": 0.1054722415334194, "grad_norm": 1.319886511097232, "learning_rate": 9.852031645225075e-06, "loss": 0.0133, "step": 531 }, { "epoch": 0.10567087099016784, "grad_norm": 0.5419706523360872, "learning_rate": 9.851253679311059e-06, "loss": 0.0115, "step": 532 }, { "epoch": 0.10586950044691627, "grad_norm": 0.4436521767303458, "learning_rate": 9.850473704497752e-06, "loss": 0.0173, "step": 533 }, { "epoch": 0.10606812990366471, "grad_norm": 0.9414894737976166, "learning_rate": 9.84969172110814e-06, "loss": 0.0125, "step": 534 }, { "epoch": 0.10626675936041315, "grad_norm": 0.5777200514403674, "learning_rate": 9.848907729466045e-06, "loss": 0.0157, "step": 535 }, { "epoch": 0.10646538881716158, "grad_norm": 0.5350127642077067, "learning_rate": 9.848121729896114e-06, "loss": 0.0109, "step": 536 }, { "epoch": 0.10666401827391002, "grad_norm": 0.6968674417806208, "learning_rate": 9.84733372272383e-06, "loss": 0.0238, "step": 537 }, { "epoch": 0.10686264773065846, "grad_norm": 0.6180948340768215, "learning_rate": 9.846543708275507e-06, "loss": 0.0226, "step": 538 }, { "epoch": 0.10706127718740689, "grad_norm": 0.7782144147736434, "learning_rate": 9.845751686878286e-06, "loss": 0.021, "step": 539 }, { "epoch": 0.10725990664415533, "grad_norm": 0.8480072268503545, "learning_rate": 9.844957658860143e-06, "loss": 0.0189, "step": 540 }, { "epoch": 0.10745853610090376, "grad_norm": 0.48593638963667746, "learning_rate": 9.844161624549889e-06, "loss": 0.0148, "step": 541 }, { "epoch": 0.1076571655576522, "grad_norm": 1.0293729094433886, "learning_rate": 9.843363584277154e-06, "loss": 0.0148, "step": 542 }, { "epoch": 0.10785579501440064, "grad_norm": 0.5138840542732326, "learning_rate": 9.84256353837241e-06, "loss": 0.0199, "step": 543 }, { "epoch": 0.10805442447114907, "grad_norm": 0.9923481664493533, "learning_rate": 9.841761487166955e-06, "loss": 0.0209, "step": 544 }, { "epoch": 0.10825305392789751, "grad_norm": 0.499458338835493, "learning_rate": 9.840957430992917e-06, "loss": 0.0195, "step": 545 }, { "epoch": 0.10845168338464595, "grad_norm": 0.40579211347456434, "learning_rate": 9.840151370183253e-06, "loss": 0.0153, "step": 546 }, { "epoch": 0.10865031284139438, "grad_norm": 0.5908120798309358, "learning_rate": 9.839343305071755e-06, "loss": 0.0121, "step": 547 }, { "epoch": 0.10884894229814282, "grad_norm": 0.8968327120573901, "learning_rate": 9.838533235993041e-06, "loss": 0.0238, "step": 548 }, { "epoch": 0.10904757175489126, "grad_norm": 1.01470974568391, "learning_rate": 9.837721163282558e-06, "loss": 0.0168, "step": 549 }, { "epoch": 0.10924620121163969, "grad_norm": 0.3781128929172408, "learning_rate": 9.836907087276587e-06, "loss": 0.0188, "step": 550 }, { "epoch": 0.10944483066838812, "grad_norm": 0.3520969782131752, "learning_rate": 9.836091008312233e-06, "loss": 0.0144, "step": 551 }, { "epoch": 0.10964346012513655, "grad_norm": 1.206009624884685, "learning_rate": 9.835272926727439e-06, "loss": 0.0169, "step": 552 }, { "epoch": 0.10984208958188499, "grad_norm": 0.5829489199164067, "learning_rate": 9.834452842860967e-06, "loss": 0.0141, "step": 553 }, { "epoch": 0.11004071903863342, "grad_norm": 0.7985221379687066, "learning_rate": 9.833630757052413e-06, "loss": 0.0176, "step": 554 }, { "epoch": 0.11023934849538186, "grad_norm": 0.7533308379120465, "learning_rate": 9.832806669642203e-06, "loss": 0.0151, "step": 555 }, { "epoch": 0.1104379779521303, "grad_norm": 1.08877862811918, "learning_rate": 9.831980580971591e-06, "loss": 0.0196, "step": 556 }, { "epoch": 0.11063660740887873, "grad_norm": 0.8723157647635501, "learning_rate": 9.831152491382658e-06, "loss": 0.0138, "step": 557 }, { "epoch": 0.11083523686562717, "grad_norm": 1.2697238695545063, "learning_rate": 9.83032240121832e-06, "loss": 0.0225, "step": 558 }, { "epoch": 0.11103386632237561, "grad_norm": 0.4616713641640445, "learning_rate": 9.82949031082231e-06, "loss": 0.0156, "step": 559 }, { "epoch": 0.11123249577912404, "grad_norm": 0.349449094735304, "learning_rate": 9.8286562205392e-06, "loss": 0.012, "step": 560 }, { "epoch": 0.11143112523587248, "grad_norm": 1.1595881223097364, "learning_rate": 9.827820130714383e-06, "loss": 0.0177, "step": 561 }, { "epoch": 0.11162975469262092, "grad_norm": 0.5231453160678909, "learning_rate": 9.826982041694086e-06, "loss": 0.0086, "step": 562 }, { "epoch": 0.11182838414936935, "grad_norm": 0.7972550695997788, "learning_rate": 9.826141953825358e-06, "loss": 0.0154, "step": 563 }, { "epoch": 0.11202701360611779, "grad_norm": 0.4601783531819776, "learning_rate": 9.825299867456082e-06, "loss": 0.0186, "step": 564 }, { "epoch": 0.11222564306286623, "grad_norm": 0.7214422323486367, "learning_rate": 9.824455782934961e-06, "loss": 0.011, "step": 565 }, { "epoch": 0.11242427251961466, "grad_norm": 1.680930925926308, "learning_rate": 9.823609700611534e-06, "loss": 0.0181, "step": 566 }, { "epoch": 0.1126229019763631, "grad_norm": 0.5298368232940289, "learning_rate": 9.822761620836158e-06, "loss": 0.0177, "step": 567 }, { "epoch": 0.11282153143311154, "grad_norm": 0.7938863386225405, "learning_rate": 9.821911543960025e-06, "loss": 0.0117, "step": 568 }, { "epoch": 0.11302016088985997, "grad_norm": 0.8596537266570184, "learning_rate": 9.82105947033515e-06, "loss": 0.0072, "step": 569 }, { "epoch": 0.11321879034660841, "grad_norm": 0.4173258192767521, "learning_rate": 9.820205400314378e-06, "loss": 0.0145, "step": 570 }, { "epoch": 0.11341741980335683, "grad_norm": 0.6115201580608163, "learning_rate": 9.819349334251376e-06, "loss": 0.0203, "step": 571 }, { "epoch": 0.11361604926010527, "grad_norm": 0.30050628933401596, "learning_rate": 9.818491272500641e-06, "loss": 0.0107, "step": 572 }, { "epoch": 0.1138146787168537, "grad_norm": 1.0979707173146247, "learning_rate": 9.817631215417496e-06, "loss": 0.0223, "step": 573 }, { "epoch": 0.11401330817360214, "grad_norm": 0.46276625355749734, "learning_rate": 9.816769163358087e-06, "loss": 0.0125, "step": 574 }, { "epoch": 0.11421193763035058, "grad_norm": 1.0534382953635288, "learning_rate": 9.815905116679394e-06, "loss": 0.0239, "step": 575 }, { "epoch": 0.11441056708709901, "grad_norm": 0.4199398914552984, "learning_rate": 9.815039075739213e-06, "loss": 0.0113, "step": 576 }, { "epoch": 0.11460919654384745, "grad_norm": 1.1636761751492717, "learning_rate": 9.814171040896173e-06, "loss": 0.0208, "step": 577 }, { "epoch": 0.11480782600059589, "grad_norm": 0.5065188833689531, "learning_rate": 9.813301012509725e-06, "loss": 0.0081, "step": 578 }, { "epoch": 0.11500645545734432, "grad_norm": 1.1440102400398982, "learning_rate": 9.812428990940151e-06, "loss": 0.0201, "step": 579 }, { "epoch": 0.11520508491409276, "grad_norm": 0.5140689221203485, "learning_rate": 9.811554976548547e-06, "loss": 0.0154, "step": 580 }, { "epoch": 0.1154037143708412, "grad_norm": 0.8604730664877108, "learning_rate": 9.810678969696849e-06, "loss": 0.0216, "step": 581 }, { "epoch": 0.11560234382758963, "grad_norm": 1.3627495098285676, "learning_rate": 9.809800970747805e-06, "loss": 0.0146, "step": 582 }, { "epoch": 0.11580097328433807, "grad_norm": 1.5215024670202477, "learning_rate": 9.808920980064998e-06, "loss": 0.0187, "step": 583 }, { "epoch": 0.1159996027410865, "grad_norm": 0.6734406716826478, "learning_rate": 9.808038998012828e-06, "loss": 0.0174, "step": 584 }, { "epoch": 0.11619823219783494, "grad_norm": 1.1041085950914342, "learning_rate": 9.807155024956523e-06, "loss": 0.0151, "step": 585 }, { "epoch": 0.11639686165458338, "grad_norm": 1.5387484405002336, "learning_rate": 9.806269061262135e-06, "loss": 0.0149, "step": 586 }, { "epoch": 0.11659549111133181, "grad_norm": 0.5875043750348269, "learning_rate": 9.805381107296544e-06, "loss": 0.0143, "step": 587 }, { "epoch": 0.11679412056808025, "grad_norm": 0.8389398799982417, "learning_rate": 9.80449116342745e-06, "loss": 0.0255, "step": 588 }, { "epoch": 0.11699275002482869, "grad_norm": 0.30889573926063935, "learning_rate": 9.803599230023373e-06, "loss": 0.0103, "step": 589 }, { "epoch": 0.11719137948157712, "grad_norm": 0.7219078584006366, "learning_rate": 9.802705307453667e-06, "loss": 0.0125, "step": 590 }, { "epoch": 0.11739000893832556, "grad_norm": 0.5466172579541949, "learning_rate": 9.801809396088501e-06, "loss": 0.0116, "step": 591 }, { "epoch": 0.11758863839507398, "grad_norm": 1.0372207668042581, "learning_rate": 9.800911496298875e-06, "loss": 0.0123, "step": 592 }, { "epoch": 0.11778726785182242, "grad_norm": 0.7559866086265034, "learning_rate": 9.800011608456607e-06, "loss": 0.0201, "step": 593 }, { "epoch": 0.11798589730857086, "grad_norm": 0.3188709050740313, "learning_rate": 9.799109732934338e-06, "loss": 0.0111, "step": 594 }, { "epoch": 0.11818452676531929, "grad_norm": 0.5314042057902478, "learning_rate": 9.798205870105533e-06, "loss": 0.0125, "step": 595 }, { "epoch": 0.11838315622206773, "grad_norm": 1.8474069548887428, "learning_rate": 9.797300020344483e-06, "loss": 0.0231, "step": 596 }, { "epoch": 0.11858178567881617, "grad_norm": 0.42452537891107434, "learning_rate": 9.796392184026298e-06, "loss": 0.0108, "step": 597 }, { "epoch": 0.1187804151355646, "grad_norm": 0.5783590129051792, "learning_rate": 9.795482361526915e-06, "loss": 0.0116, "step": 598 }, { "epoch": 0.11897904459231304, "grad_norm": 0.38163168855460666, "learning_rate": 9.794570553223087e-06, "loss": 0.009, "step": 599 }, { "epoch": 0.11917767404906147, "grad_norm": 1.4917576733893148, "learning_rate": 9.793656759492394e-06, "loss": 0.0155, "step": 600 }, { "epoch": 0.11937630350580991, "grad_norm": 1.0584955957981348, "learning_rate": 9.792740980713238e-06, "loss": 0.0207, "step": 601 }, { "epoch": 0.11957493296255835, "grad_norm": 0.694913429075098, "learning_rate": 9.791823217264842e-06, "loss": 0.0161, "step": 602 }, { "epoch": 0.11977356241930678, "grad_norm": 0.6695725185833449, "learning_rate": 9.790903469527249e-06, "loss": 0.0199, "step": 603 }, { "epoch": 0.11997219187605522, "grad_norm": 2.000789888070613, "learning_rate": 9.789981737881326e-06, "loss": 0.039, "step": 604 }, { "epoch": 0.12017082133280366, "grad_norm": 0.48955671291310315, "learning_rate": 9.789058022708765e-06, "loss": 0.0142, "step": 605 }, { "epoch": 0.1203694507895521, "grad_norm": 1.0363261312441616, "learning_rate": 9.788132324392072e-06, "loss": 0.0159, "step": 606 }, { "epoch": 0.12056808024630053, "grad_norm": 0.4884275959706906, "learning_rate": 9.787204643314577e-06, "loss": 0.0167, "step": 607 }, { "epoch": 0.12076670970304897, "grad_norm": 0.7350563044633582, "learning_rate": 9.786274979860434e-06, "loss": 0.0158, "step": 608 }, { "epoch": 0.1209653391597974, "grad_norm": 0.9892060951239566, "learning_rate": 9.785343334414615e-06, "loss": 0.0206, "step": 609 }, { "epoch": 0.12116396861654584, "grad_norm": 0.37127352309370415, "learning_rate": 9.784409707362913e-06, "loss": 0.0183, "step": 610 }, { "epoch": 0.12136259807329428, "grad_norm": 0.650383126477588, "learning_rate": 9.783474099091943e-06, "loss": 0.0176, "step": 611 }, { "epoch": 0.12156122753004271, "grad_norm": 0.4119293084929478, "learning_rate": 9.78253650998914e-06, "loss": 0.0117, "step": 612 }, { "epoch": 0.12175985698679113, "grad_norm": 0.829964935039823, "learning_rate": 9.781596940442755e-06, "loss": 0.0153, "step": 613 }, { "epoch": 0.12195848644353957, "grad_norm": 1.7129449944097686, "learning_rate": 9.780655390841867e-06, "loss": 0.0136, "step": 614 }, { "epoch": 0.12215711590028801, "grad_norm": 0.5565425099390852, "learning_rate": 9.779711861576368e-06, "loss": 0.0215, "step": 615 }, { "epoch": 0.12235574535703644, "grad_norm": 0.7788538582287662, "learning_rate": 9.778766353036975e-06, "loss": 0.0218, "step": 616 }, { "epoch": 0.12255437481378488, "grad_norm": 0.33444210785480794, "learning_rate": 9.777818865615221e-06, "loss": 0.0136, "step": 617 }, { "epoch": 0.12275300427053332, "grad_norm": 0.400994084696202, "learning_rate": 9.776869399703458e-06, "loss": 0.0154, "step": 618 }, { "epoch": 0.12295163372728175, "grad_norm": 0.7796516460912207, "learning_rate": 9.77591795569486e-06, "loss": 0.013, "step": 619 }, { "epoch": 0.12315026318403019, "grad_norm": 0.536772489299672, "learning_rate": 9.774964533983421e-06, "loss": 0.0188, "step": 620 }, { "epoch": 0.12334889264077863, "grad_norm": 1.3303202817826634, "learning_rate": 9.774009134963949e-06, "loss": 0.0193, "step": 621 }, { "epoch": 0.12354752209752706, "grad_norm": 0.7037843214385814, "learning_rate": 9.773051759032074e-06, "loss": 0.0175, "step": 622 }, { "epoch": 0.1237461515542755, "grad_norm": 0.4308203918294435, "learning_rate": 9.772092406584248e-06, "loss": 0.0166, "step": 623 }, { "epoch": 0.12394478101102394, "grad_norm": 1.1697494943206082, "learning_rate": 9.771131078017735e-06, "loss": 0.0193, "step": 624 }, { "epoch": 0.12414341046777237, "grad_norm": 0.5812375266471298, "learning_rate": 9.77016777373062e-06, "loss": 0.0146, "step": 625 }, { "epoch": 0.12434203992452081, "grad_norm": 0.524198461309644, "learning_rate": 9.769202494121806e-06, "loss": 0.0167, "step": 626 }, { "epoch": 0.12454066938126925, "grad_norm": 1.5285962472865722, "learning_rate": 9.768235239591016e-06, "loss": 0.0207, "step": 627 }, { "epoch": 0.12473929883801768, "grad_norm": 0.7396830974883195, "learning_rate": 9.767266010538788e-06, "loss": 0.0148, "step": 628 }, { "epoch": 0.12493792829476612, "grad_norm": 0.2274647843251192, "learning_rate": 9.76629480736648e-06, "loss": 0.0107, "step": 629 }, { "epoch": 0.12513655775151455, "grad_norm": 0.28859531186159604, "learning_rate": 9.765321630476264e-06, "loss": 0.0166, "step": 630 }, { "epoch": 0.12533518720826298, "grad_norm": 0.35440823267343124, "learning_rate": 9.764346480271132e-06, "loss": 0.0168, "step": 631 }, { "epoch": 0.12553381666501143, "grad_norm": 0.5712874545330476, "learning_rate": 9.763369357154895e-06, "loss": 0.0101, "step": 632 }, { "epoch": 0.12573244612175985, "grad_norm": 1.3923546120285113, "learning_rate": 9.762390261532177e-06, "loss": 0.0168, "step": 633 }, { "epoch": 0.1259310755785083, "grad_norm": 0.47850667989496215, "learning_rate": 9.761409193808422e-06, "loss": 0.009, "step": 634 }, { "epoch": 0.12612970503525672, "grad_norm": 0.4045494692733913, "learning_rate": 9.760426154389888e-06, "loss": 0.0127, "step": 635 }, { "epoch": 0.12632833449200517, "grad_norm": 0.949458135135656, "learning_rate": 9.75944114368365e-06, "loss": 0.0245, "step": 636 }, { "epoch": 0.1265269639487536, "grad_norm": 0.4744219220416367, "learning_rate": 9.7584541620976e-06, "loss": 0.01, "step": 637 }, { "epoch": 0.12672559340550205, "grad_norm": 0.6580474069905136, "learning_rate": 9.757465210040447e-06, "loss": 0.0116, "step": 638 }, { "epoch": 0.12692422286225047, "grad_norm": 1.2746603803218375, "learning_rate": 9.756474287921716e-06, "loss": 0.0173, "step": 639 }, { "epoch": 0.12712285231899892, "grad_norm": 1.9417494348440496, "learning_rate": 9.755481396151744e-06, "loss": 0.0324, "step": 640 }, { "epoch": 0.12732148177574734, "grad_norm": 0.618531205863786, "learning_rate": 9.75448653514169e-06, "loss": 0.0142, "step": 641 }, { "epoch": 0.1275201112324958, "grad_norm": 0.9971146664077777, "learning_rate": 9.75348970530352e-06, "loss": 0.018, "step": 642 }, { "epoch": 0.12771874068924421, "grad_norm": 0.8389497013907814, "learning_rate": 9.752490907050027e-06, "loss": 0.0151, "step": 643 }, { "epoch": 0.12791737014599264, "grad_norm": 1.0391641435545595, "learning_rate": 9.751490140794807e-06, "loss": 0.0234, "step": 644 }, { "epoch": 0.1281159996027411, "grad_norm": 0.7118837598081941, "learning_rate": 9.750487406952277e-06, "loss": 0.0165, "step": 645 }, { "epoch": 0.1283146290594895, "grad_norm": 0.3126685428380962, "learning_rate": 9.74948270593767e-06, "loss": 0.0104, "step": 646 }, { "epoch": 0.12851325851623796, "grad_norm": 0.6400960063705387, "learning_rate": 9.74847603816703e-06, "loss": 0.0146, "step": 647 }, { "epoch": 0.12871188797298638, "grad_norm": 0.6080027304759789, "learning_rate": 9.747467404057219e-06, "loss": 0.0132, "step": 648 }, { "epoch": 0.12891051742973483, "grad_norm": 0.5969440680204644, "learning_rate": 9.74645680402591e-06, "loss": 0.0165, "step": 649 }, { "epoch": 0.12910914688648326, "grad_norm": 0.4844450633614373, "learning_rate": 9.74544423849159e-06, "loss": 0.008, "step": 650 }, { "epoch": 0.1293077763432317, "grad_norm": 0.5731462801181816, "learning_rate": 9.744429707873564e-06, "loss": 0.014, "step": 651 }, { "epoch": 0.12950640579998013, "grad_norm": 0.9087700604835134, "learning_rate": 9.743413212591949e-06, "loss": 0.0213, "step": 652 }, { "epoch": 0.12970503525672858, "grad_norm": 0.7602480663063869, "learning_rate": 9.742394753067671e-06, "loss": 0.0158, "step": 653 }, { "epoch": 0.129903664713477, "grad_norm": 0.6559403776033135, "learning_rate": 9.741374329722474e-06, "loss": 0.0126, "step": 654 }, { "epoch": 0.13010229417022545, "grad_norm": 0.5644269287564567, "learning_rate": 9.740351942978919e-06, "loss": 0.0141, "step": 655 }, { "epoch": 0.13030092362697387, "grad_norm": 0.5606392709106318, "learning_rate": 9.739327593260367e-06, "loss": 0.0221, "step": 656 }, { "epoch": 0.13049955308372232, "grad_norm": 0.6807347976418896, "learning_rate": 9.738301280991007e-06, "loss": 0.0151, "step": 657 }, { "epoch": 0.13069818254047075, "grad_norm": 0.8462663632413303, "learning_rate": 9.737273006595832e-06, "loss": 0.0194, "step": 658 }, { "epoch": 0.1308968119972192, "grad_norm": 0.3894055488420919, "learning_rate": 9.736242770500647e-06, "loss": 0.0092, "step": 659 }, { "epoch": 0.13109544145396762, "grad_norm": 0.7904884629984615, "learning_rate": 9.735210573132078e-06, "loss": 0.0162, "step": 660 }, { "epoch": 0.13129407091071607, "grad_norm": 0.9674854160105776, "learning_rate": 9.734176414917548e-06, "loss": 0.0196, "step": 661 }, { "epoch": 0.1314927003674645, "grad_norm": 0.33200341815567036, "learning_rate": 9.733140296285307e-06, "loss": 0.0093, "step": 662 }, { "epoch": 0.13169132982421294, "grad_norm": 0.8720779831059587, "learning_rate": 9.73210221766441e-06, "loss": 0.0171, "step": 663 }, { "epoch": 0.13188995928096137, "grad_norm": 0.48679544383358947, "learning_rate": 9.731062179484723e-06, "loss": 0.0222, "step": 664 }, { "epoch": 0.1320885887377098, "grad_norm": 1.6442374114676053, "learning_rate": 9.730020182176925e-06, "loss": 0.0201, "step": 665 }, { "epoch": 0.13228721819445824, "grad_norm": 0.6211726152538897, "learning_rate": 9.728976226172507e-06, "loss": 0.0115, "step": 666 }, { "epoch": 0.13248584765120666, "grad_norm": 1.0316344031063414, "learning_rate": 9.727930311903768e-06, "loss": 0.0138, "step": 667 }, { "epoch": 0.1326844771079551, "grad_norm": 0.6585612814093009, "learning_rate": 9.726882439803822e-06, "loss": 0.0221, "step": 668 }, { "epoch": 0.13288310656470353, "grad_norm": 1.583096886947926, "learning_rate": 9.725832610306592e-06, "loss": 0.0236, "step": 669 }, { "epoch": 0.13308173602145199, "grad_norm": 0.58785647706293, "learning_rate": 9.72478082384681e-06, "loss": 0.0133, "step": 670 }, { "epoch": 0.1332803654782004, "grad_norm": 0.4215741160538269, "learning_rate": 9.723727080860022e-06, "loss": 0.0117, "step": 671 }, { "epoch": 0.13347899493494886, "grad_norm": 1.1714886610581292, "learning_rate": 9.722671381782577e-06, "loss": 0.0289, "step": 672 }, { "epoch": 0.13367762439169728, "grad_norm": 0.849483842098239, "learning_rate": 9.721613727051646e-06, "loss": 0.017, "step": 673 }, { "epoch": 0.13387625384844573, "grad_norm": 1.4736074773129628, "learning_rate": 9.720554117105197e-06, "loss": 0.0132, "step": 674 }, { "epoch": 0.13407488330519415, "grad_norm": 2.2121758689867344, "learning_rate": 9.719492552382015e-06, "loss": 0.0268, "step": 675 }, { "epoch": 0.1342735127619426, "grad_norm": 2.2633647738559546, "learning_rate": 9.718429033321693e-06, "loss": 0.0279, "step": 676 }, { "epoch": 0.13447214221869103, "grad_norm": 0.5748397779881033, "learning_rate": 9.717363560364634e-06, "loss": 0.0198, "step": 677 }, { "epoch": 0.13467077167543948, "grad_norm": 0.7319060757746011, "learning_rate": 9.71629613395205e-06, "loss": 0.015, "step": 678 }, { "epoch": 0.1348694011321879, "grad_norm": 0.8521409511987582, "learning_rate": 9.71522675452596e-06, "loss": 0.0141, "step": 679 }, { "epoch": 0.13506803058893635, "grad_norm": 0.910962721954014, "learning_rate": 9.714155422529192e-06, "loss": 0.016, "step": 680 }, { "epoch": 0.13526666004568477, "grad_norm": 0.3260897607925365, "learning_rate": 9.713082138405383e-06, "loss": 0.0105, "step": 681 }, { "epoch": 0.13546528950243322, "grad_norm": 0.2535946790266379, "learning_rate": 9.712006902598982e-06, "loss": 0.0112, "step": 682 }, { "epoch": 0.13566391895918165, "grad_norm": 0.4071046693159764, "learning_rate": 9.710929715555241e-06, "loss": 0.0171, "step": 683 }, { "epoch": 0.1358625484159301, "grad_norm": 0.6842960302160229, "learning_rate": 9.709850577720223e-06, "loss": 0.0123, "step": 684 }, { "epoch": 0.13606117787267852, "grad_norm": 0.35348541207601764, "learning_rate": 9.708769489540796e-06, "loss": 0.0127, "step": 685 }, { "epoch": 0.13625980732942694, "grad_norm": 0.5960992242072756, "learning_rate": 9.70768645146464e-06, "loss": 0.0152, "step": 686 }, { "epoch": 0.1364584367861754, "grad_norm": 0.706647286718095, "learning_rate": 9.706601463940237e-06, "loss": 0.0092, "step": 687 }, { "epoch": 0.1366570662429238, "grad_norm": 0.8393587614699919, "learning_rate": 9.705514527416885e-06, "loss": 0.0167, "step": 688 }, { "epoch": 0.13685569569967226, "grad_norm": 0.8895837213581118, "learning_rate": 9.704425642344674e-06, "loss": 0.017, "step": 689 }, { "epoch": 0.1370543251564207, "grad_norm": 0.6798384157696477, "learning_rate": 9.703334809174519e-06, "loss": 0.0117, "step": 690 }, { "epoch": 0.13725295461316914, "grad_norm": 0.7053098557106127, "learning_rate": 9.70224202835813e-06, "loss": 0.0131, "step": 691 }, { "epoch": 0.13745158406991756, "grad_norm": 1.1725400512260025, "learning_rate": 9.701147300348025e-06, "loss": 0.0133, "step": 692 }, { "epoch": 0.137650213526666, "grad_norm": 0.4166073254333175, "learning_rate": 9.70005062559753e-06, "loss": 0.0124, "step": 693 }, { "epoch": 0.13784884298341443, "grad_norm": 0.5431686497693445, "learning_rate": 9.69895200456078e-06, "loss": 0.0079, "step": 694 }, { "epoch": 0.13804747244016288, "grad_norm": 0.5724175575189289, "learning_rate": 9.697851437692708e-06, "loss": 0.0172, "step": 695 }, { "epoch": 0.1382461018969113, "grad_norm": 1.243225503850031, "learning_rate": 9.696748925449061e-06, "loss": 0.0225, "step": 696 }, { "epoch": 0.13844473135365976, "grad_norm": 0.5911282831149057, "learning_rate": 9.69564446828639e-06, "loss": 0.0167, "step": 697 }, { "epoch": 0.13864336081040818, "grad_norm": 0.7689274476457225, "learning_rate": 9.694538066662043e-06, "loss": 0.0165, "step": 698 }, { "epoch": 0.13884199026715663, "grad_norm": 0.9634756763078826, "learning_rate": 9.693429721034186e-06, "loss": 0.0205, "step": 699 }, { "epoch": 0.13904061972390505, "grad_norm": 0.9777879206227595, "learning_rate": 9.69231943186178e-06, "loss": 0.0138, "step": 700 }, { "epoch": 0.1392392491806535, "grad_norm": 0.5707455498722125, "learning_rate": 9.691207199604599e-06, "loss": 0.0191, "step": 701 }, { "epoch": 0.13943787863740192, "grad_norm": 1.3726637583023185, "learning_rate": 9.690093024723213e-06, "loss": 0.0165, "step": 702 }, { "epoch": 0.13963650809415037, "grad_norm": 1.1668665383346433, "learning_rate": 9.688976907679001e-06, "loss": 0.0174, "step": 703 }, { "epoch": 0.1398351375508988, "grad_norm": 1.0967041976393477, "learning_rate": 9.68785884893415e-06, "loss": 0.0161, "step": 704 }, { "epoch": 0.14003376700764725, "grad_norm": 0.3589397157463669, "learning_rate": 9.686738848951642e-06, "loss": 0.0226, "step": 705 }, { "epoch": 0.14023239646439567, "grad_norm": 0.37386431956811017, "learning_rate": 9.68561690819527e-06, "loss": 0.012, "step": 706 }, { "epoch": 0.1404310259211441, "grad_norm": 1.3981607309328512, "learning_rate": 9.68449302712963e-06, "loss": 0.0167, "step": 707 }, { "epoch": 0.14062965537789254, "grad_norm": 1.1763685062897045, "learning_rate": 9.683367206220118e-06, "loss": 0.0224, "step": 708 }, { "epoch": 0.14082828483464097, "grad_norm": 1.943944629520424, "learning_rate": 9.682239445932937e-06, "loss": 0.0272, "step": 709 }, { "epoch": 0.14102691429138942, "grad_norm": 0.6898744285476524, "learning_rate": 9.681109746735089e-06, "loss": 0.0142, "step": 710 }, { "epoch": 0.14122554374813784, "grad_norm": 0.5450517944671093, "learning_rate": 9.679978109094383e-06, "loss": 0.0169, "step": 711 }, { "epoch": 0.1414241732048863, "grad_norm": 0.44279377105851336, "learning_rate": 9.678844533479427e-06, "loss": 0.013, "step": 712 }, { "epoch": 0.1416228026616347, "grad_norm": 1.6342810458603632, "learning_rate": 9.677709020359638e-06, "loss": 0.0254, "step": 713 }, { "epoch": 0.14182143211838316, "grad_norm": 2.0291941877579065, "learning_rate": 9.676571570205227e-06, "loss": 0.0119, "step": 714 }, { "epoch": 0.14202006157513158, "grad_norm": 1.5157380559444824, "learning_rate": 9.675432183487211e-06, "loss": 0.0162, "step": 715 }, { "epoch": 0.14221869103188003, "grad_norm": 1.0513407088044642, "learning_rate": 9.67429086067741e-06, "loss": 0.0202, "step": 716 }, { "epoch": 0.14241732048862846, "grad_norm": 0.8011495579478646, "learning_rate": 9.673147602248448e-06, "loss": 0.0165, "step": 717 }, { "epoch": 0.1426159499453769, "grad_norm": 0.41050875738974985, "learning_rate": 9.67200240867374e-06, "loss": 0.0145, "step": 718 }, { "epoch": 0.14281457940212533, "grad_norm": 1.2640264053922001, "learning_rate": 9.670855280427514e-06, "loss": 0.0157, "step": 719 }, { "epoch": 0.14301320885887378, "grad_norm": 0.7857274636818972, "learning_rate": 9.669706217984793e-06, "loss": 0.0105, "step": 720 }, { "epoch": 0.1432118383156222, "grad_norm": 0.6636140885305981, "learning_rate": 9.668555221821404e-06, "loss": 0.0132, "step": 721 }, { "epoch": 0.14341046777237065, "grad_norm": 0.6949080113239285, "learning_rate": 9.667402292413975e-06, "loss": 0.0131, "step": 722 }, { "epoch": 0.14360909722911908, "grad_norm": 0.7348391840249473, "learning_rate": 9.66624743023993e-06, "loss": 0.0135, "step": 723 }, { "epoch": 0.14380772668586753, "grad_norm": 0.515623939890902, "learning_rate": 9.665090635777497e-06, "loss": 0.0171, "step": 724 }, { "epoch": 0.14400635614261595, "grad_norm": 0.8344063986031619, "learning_rate": 9.663931909505702e-06, "loss": 0.023, "step": 725 }, { "epoch": 0.1442049855993644, "grad_norm": 1.6382892707257288, "learning_rate": 9.662771251904375e-06, "loss": 0.0214, "step": 726 }, { "epoch": 0.14440361505611282, "grad_norm": 0.6588029713890065, "learning_rate": 9.66160866345414e-06, "loss": 0.0092, "step": 727 }, { "epoch": 0.14460224451286124, "grad_norm": 0.5348463144307005, "learning_rate": 9.660444144636429e-06, "loss": 0.0084, "step": 728 }, { "epoch": 0.1448008739696097, "grad_norm": 0.765885295893399, "learning_rate": 9.659277695933462e-06, "loss": 0.0175, "step": 729 }, { "epoch": 0.14499950342635812, "grad_norm": 0.8497299979434756, "learning_rate": 9.658109317828267e-06, "loss": 0.019, "step": 730 }, { "epoch": 0.14519813288310657, "grad_norm": 0.523328490867962, "learning_rate": 9.656939010804672e-06, "loss": 0.0216, "step": 731 }, { "epoch": 0.145396762339855, "grad_norm": 0.6009169743511245, "learning_rate": 9.655766775347292e-06, "loss": 0.0125, "step": 732 }, { "epoch": 0.14559539179660344, "grad_norm": 0.6220910948352019, "learning_rate": 9.654592611941555e-06, "loss": 0.014, "step": 733 }, { "epoch": 0.14579402125335186, "grad_norm": 0.7443834281999502, "learning_rate": 9.653416521073678e-06, "loss": 0.0118, "step": 734 }, { "epoch": 0.1459926507101003, "grad_norm": 0.3777328532342268, "learning_rate": 9.65223850323068e-06, "loss": 0.016, "step": 735 }, { "epoch": 0.14619128016684874, "grad_norm": 0.36374649077029947, "learning_rate": 9.651058558900375e-06, "loss": 0.0134, "step": 736 }, { "epoch": 0.1463899096235972, "grad_norm": 0.343215847483735, "learning_rate": 9.64987668857138e-06, "loss": 0.0175, "step": 737 }, { "epoch": 0.1465885390803456, "grad_norm": 0.38715395916519696, "learning_rate": 9.648692892733105e-06, "loss": 0.0085, "step": 738 }, { "epoch": 0.14678716853709406, "grad_norm": 0.27505352925401316, "learning_rate": 9.647507171875758e-06, "loss": 0.0064, "step": 739 }, { "epoch": 0.14698579799384248, "grad_norm": 0.7665979379551244, "learning_rate": 9.646319526490345e-06, "loss": 0.0185, "step": 740 }, { "epoch": 0.14718442745059093, "grad_norm": 0.4034771698141865, "learning_rate": 9.64512995706867e-06, "loss": 0.0114, "step": 741 }, { "epoch": 0.14738305690733935, "grad_norm": 0.6233753352700463, "learning_rate": 9.643938464103331e-06, "loss": 0.0205, "step": 742 }, { "epoch": 0.1475816863640878, "grad_norm": 0.6387978330419878, "learning_rate": 9.642745048087724e-06, "loss": 0.0122, "step": 743 }, { "epoch": 0.14778031582083623, "grad_norm": 0.6530117233777584, "learning_rate": 9.641549709516042e-06, "loss": 0.0242, "step": 744 }, { "epoch": 0.14797894527758468, "grad_norm": 0.6726282197782968, "learning_rate": 9.640352448883273e-06, "loss": 0.0207, "step": 745 }, { "epoch": 0.1481775747343331, "grad_norm": 0.3645759942166644, "learning_rate": 9.639153266685204e-06, "loss": 0.0145, "step": 746 }, { "epoch": 0.14837620419108155, "grad_norm": 0.4159658079352318, "learning_rate": 9.63795216341841e-06, "loss": 0.0167, "step": 747 }, { "epoch": 0.14857483364782997, "grad_norm": 0.4271126375806547, "learning_rate": 9.636749139580272e-06, "loss": 0.0138, "step": 748 }, { "epoch": 0.1487734631045784, "grad_norm": 0.659027659352402, "learning_rate": 9.635544195668958e-06, "loss": 0.0156, "step": 749 }, { "epoch": 0.14897209256132685, "grad_norm": 0.42458273885035736, "learning_rate": 9.634337332183435e-06, "loss": 0.0193, "step": 750 }, { "epoch": 0.14917072201807527, "grad_norm": 0.8691539370190741, "learning_rate": 9.633128549623463e-06, "loss": 0.0159, "step": 751 }, { "epoch": 0.14936935147482372, "grad_norm": 0.8763737396333379, "learning_rate": 9.6319178484896e-06, "loss": 0.0174, "step": 752 }, { "epoch": 0.14956798093157214, "grad_norm": 0.8359131444320231, "learning_rate": 9.630705229283192e-06, "loss": 0.0162, "step": 753 }, { "epoch": 0.1497666103883206, "grad_norm": 0.3581978948784169, "learning_rate": 9.629490692506386e-06, "loss": 0.0171, "step": 754 }, { "epoch": 0.14996523984506901, "grad_norm": 0.7735573422661927, "learning_rate": 9.628274238662124e-06, "loss": 0.0177, "step": 755 }, { "epoch": 0.15016386930181747, "grad_norm": 0.6569012717678822, "learning_rate": 9.627055868254131e-06, "loss": 0.0165, "step": 756 }, { "epoch": 0.1503624987585659, "grad_norm": 0.8802984939901024, "learning_rate": 9.625835581786937e-06, "loss": 0.0167, "step": 757 }, { "epoch": 0.15056112821531434, "grad_norm": 0.42465561361686077, "learning_rate": 9.624613379765863e-06, "loss": 0.0112, "step": 758 }, { "epoch": 0.15075975767206276, "grad_norm": 0.5587945392733407, "learning_rate": 9.623389262697018e-06, "loss": 0.0146, "step": 759 }, { "epoch": 0.1509583871288112, "grad_norm": 0.6613542353383493, "learning_rate": 9.62216323108731e-06, "loss": 0.0091, "step": 760 }, { "epoch": 0.15115701658555963, "grad_norm": 0.6745543946977648, "learning_rate": 9.620935285444435e-06, "loss": 0.0102, "step": 761 }, { "epoch": 0.15135564604230808, "grad_norm": 1.0899874100297828, "learning_rate": 9.619705426276887e-06, "loss": 0.0101, "step": 762 }, { "epoch": 0.1515542754990565, "grad_norm": 0.9734431138507607, "learning_rate": 9.61847365409395e-06, "loss": 0.0201, "step": 763 }, { "epoch": 0.15175290495580496, "grad_norm": 0.6620377606505773, "learning_rate": 9.617239969405696e-06, "loss": 0.0108, "step": 764 }, { "epoch": 0.15195153441255338, "grad_norm": 0.6543556889174846, "learning_rate": 9.616004372722993e-06, "loss": 0.0089, "step": 765 }, { "epoch": 0.15215016386930183, "grad_norm": 1.1176456576818978, "learning_rate": 9.614766864557505e-06, "loss": 0.0149, "step": 766 }, { "epoch": 0.15234879332605025, "grad_norm": 1.4536512494283313, "learning_rate": 9.613527445421678e-06, "loss": 0.0236, "step": 767 }, { "epoch": 0.1525474227827987, "grad_norm": 0.9953239414430083, "learning_rate": 9.612286115828757e-06, "loss": 0.017, "step": 768 }, { "epoch": 0.15274605223954713, "grad_norm": 0.9938930956416363, "learning_rate": 9.611042876292774e-06, "loss": 0.0167, "step": 769 }, { "epoch": 0.15294468169629555, "grad_norm": 1.1047738551365187, "learning_rate": 9.609797727328553e-06, "loss": 0.0218, "step": 770 }, { "epoch": 0.153143311153044, "grad_norm": 0.6023972990227837, "learning_rate": 9.608550669451709e-06, "loss": 0.006, "step": 771 }, { "epoch": 0.15334194060979242, "grad_norm": 0.5085628473524939, "learning_rate": 9.607301703178648e-06, "loss": 0.0237, "step": 772 }, { "epoch": 0.15354057006654087, "grad_norm": 0.7921029494776086, "learning_rate": 9.606050829026568e-06, "loss": 0.013, "step": 773 }, { "epoch": 0.1537391995232893, "grad_norm": 0.3846454240514661, "learning_rate": 9.604798047513449e-06, "loss": 0.01, "step": 774 }, { "epoch": 0.15393782898003774, "grad_norm": 0.8493324370898936, "learning_rate": 9.603543359158071e-06, "loss": 0.0206, "step": 775 }, { "epoch": 0.15413645843678617, "grad_norm": 1.3234666263493877, "learning_rate": 9.60228676448e-06, "loss": 0.0173, "step": 776 }, { "epoch": 0.15433508789353462, "grad_norm": 0.4104770870479379, "learning_rate": 9.601028263999585e-06, "loss": 0.019, "step": 777 }, { "epoch": 0.15453371735028304, "grad_norm": 1.0206867249184952, "learning_rate": 9.599767858237976e-06, "loss": 0.0162, "step": 778 }, { "epoch": 0.1547323468070315, "grad_norm": 0.6045402333325749, "learning_rate": 9.598505547717103e-06, "loss": 0.0171, "step": 779 }, { "epoch": 0.1549309762637799, "grad_norm": 0.2885932547248781, "learning_rate": 9.597241332959687e-06, "loss": 0.0093, "step": 780 }, { "epoch": 0.15512960572052836, "grad_norm": 1.0510497521650195, "learning_rate": 9.59597521448924e-06, "loss": 0.0136, "step": 781 }, { "epoch": 0.15532823517727679, "grad_norm": 0.4857875665691372, "learning_rate": 9.59470719283006e-06, "loss": 0.0151, "step": 782 }, { "epoch": 0.15552686463402524, "grad_norm": 0.21785943423231574, "learning_rate": 9.59343726850723e-06, "loss": 0.0094, "step": 783 }, { "epoch": 0.15572549409077366, "grad_norm": 0.365806832334753, "learning_rate": 9.592165442046628e-06, "loss": 0.0141, "step": 784 }, { "epoch": 0.1559241235475221, "grad_norm": 0.5929826587521397, "learning_rate": 9.590891713974917e-06, "loss": 0.0129, "step": 785 }, { "epoch": 0.15612275300427053, "grad_norm": 0.7523823133800354, "learning_rate": 9.589616084819542e-06, "loss": 0.0219, "step": 786 }, { "epoch": 0.15632138246101898, "grad_norm": 0.387882993262946, "learning_rate": 9.588338555108744e-06, "loss": 0.0209, "step": 787 }, { "epoch": 0.1565200119177674, "grad_norm": 0.45441023735943464, "learning_rate": 9.587059125371545e-06, "loss": 0.0137, "step": 788 }, { "epoch": 0.15671864137451583, "grad_norm": 0.7600639793039875, "learning_rate": 9.585777796137756e-06, "loss": 0.0114, "step": 789 }, { "epoch": 0.15691727083126428, "grad_norm": 0.6247443320410846, "learning_rate": 9.584494567937973e-06, "loss": 0.0196, "step": 790 }, { "epoch": 0.1571159002880127, "grad_norm": 0.2926748180028533, "learning_rate": 9.58320944130358e-06, "loss": 0.0067, "step": 791 }, { "epoch": 0.15731452974476115, "grad_norm": 0.9470618869437955, "learning_rate": 9.581922416766748e-06, "loss": 0.0207, "step": 792 }, { "epoch": 0.15751315920150957, "grad_norm": 0.7327000387245999, "learning_rate": 9.580633494860432e-06, "loss": 0.0185, "step": 793 }, { "epoch": 0.15771178865825802, "grad_norm": 0.5905549844276268, "learning_rate": 9.579342676118373e-06, "loss": 0.015, "step": 794 }, { "epoch": 0.15791041811500645, "grad_norm": 0.2330283466633709, "learning_rate": 9.578049961075098e-06, "loss": 0.0101, "step": 795 }, { "epoch": 0.1581090475717549, "grad_norm": 1.4809497082727017, "learning_rate": 9.576755350265918e-06, "loss": 0.0228, "step": 796 }, { "epoch": 0.15830767702850332, "grad_norm": 0.3008493915594875, "learning_rate": 9.57545884422693e-06, "loss": 0.0078, "step": 797 }, { "epoch": 0.15850630648525177, "grad_norm": 0.7392324826529579, "learning_rate": 9.574160443495017e-06, "loss": 0.0201, "step": 798 }, { "epoch": 0.1587049359420002, "grad_norm": 0.546883075423793, "learning_rate": 9.572860148607846e-06, "loss": 0.0173, "step": 799 }, { "epoch": 0.15890356539874864, "grad_norm": 0.8293724216580041, "learning_rate": 9.571557960103867e-06, "loss": 0.0139, "step": 800 }, { "epoch": 0.15910219485549706, "grad_norm": 0.9771544832714681, "learning_rate": 9.570253878522314e-06, "loss": 0.0123, "step": 801 }, { "epoch": 0.15930082431224551, "grad_norm": 0.5950320778242356, "learning_rate": 9.568947904403208e-06, "loss": 0.0177, "step": 802 }, { "epoch": 0.15949945376899394, "grad_norm": 1.2291334278846218, "learning_rate": 9.567640038287349e-06, "loss": 0.0215, "step": 803 }, { "epoch": 0.1596980832257424, "grad_norm": 1.1112781913894665, "learning_rate": 9.566330280716323e-06, "loss": 0.0171, "step": 804 }, { "epoch": 0.1598967126824908, "grad_norm": 0.8798985320697517, "learning_rate": 9.565018632232502e-06, "loss": 0.0245, "step": 805 }, { "epoch": 0.16009534213923926, "grad_norm": 0.5515295292743605, "learning_rate": 9.563705093379036e-06, "loss": 0.0245, "step": 806 }, { "epoch": 0.16029397159598768, "grad_norm": 0.6661943953386937, "learning_rate": 9.562389664699863e-06, "loss": 0.0159, "step": 807 }, { "epoch": 0.16049260105273613, "grad_norm": 0.9464937907087646, "learning_rate": 9.561072346739697e-06, "loss": 0.0148, "step": 808 }, { "epoch": 0.16069123050948456, "grad_norm": 0.39866831976662964, "learning_rate": 9.55975314004404e-06, "loss": 0.0177, "step": 809 }, { "epoch": 0.16088985996623298, "grad_norm": 0.6113943733065699, "learning_rate": 9.558432045159174e-06, "loss": 0.0147, "step": 810 }, { "epoch": 0.16108848942298143, "grad_norm": 0.6538607257959484, "learning_rate": 9.557109062632164e-06, "loss": 0.0226, "step": 811 }, { "epoch": 0.16128711887972985, "grad_norm": 0.2911939937872243, "learning_rate": 9.555784193010854e-06, "loss": 0.0093, "step": 812 }, { "epoch": 0.1614857483364783, "grad_norm": 0.43968491191440634, "learning_rate": 9.554457436843872e-06, "loss": 0.0148, "step": 813 }, { "epoch": 0.16168437779322672, "grad_norm": 1.1183514039831923, "learning_rate": 9.553128794680626e-06, "loss": 0.017, "step": 814 }, { "epoch": 0.16188300724997517, "grad_norm": 0.6035034096507715, "learning_rate": 9.551798267071308e-06, "loss": 0.0069, "step": 815 }, { "epoch": 0.1620816367067236, "grad_norm": 0.6846614549208714, "learning_rate": 9.550465854566884e-06, "loss": 0.013, "step": 816 }, { "epoch": 0.16228026616347205, "grad_norm": 0.5513653606442389, "learning_rate": 9.549131557719106e-06, "loss": 0.0137, "step": 817 }, { "epoch": 0.16247889562022047, "grad_norm": 0.7496807116130924, "learning_rate": 9.547795377080506e-06, "loss": 0.0062, "step": 818 }, { "epoch": 0.16267752507696892, "grad_norm": 1.1195510094425656, "learning_rate": 9.546457313204395e-06, "loss": 0.0186, "step": 819 }, { "epoch": 0.16287615453371734, "grad_norm": 0.4918213514406738, "learning_rate": 9.545117366644863e-06, "loss": 0.0082, "step": 820 }, { "epoch": 0.1630747839904658, "grad_norm": 0.6864874480930118, "learning_rate": 9.543775537956781e-06, "loss": 0.0165, "step": 821 }, { "epoch": 0.16327341344721422, "grad_norm": 0.9104337265905692, "learning_rate": 9.5424318276958e-06, "loss": 0.0202, "step": 822 }, { "epoch": 0.16347204290396267, "grad_norm": 1.3936891736313024, "learning_rate": 9.541086236418348e-06, "loss": 0.0139, "step": 823 }, { "epoch": 0.1636706723607111, "grad_norm": 1.0710993378946527, "learning_rate": 9.539738764681633e-06, "loss": 0.0207, "step": 824 }, { "epoch": 0.16386930181745954, "grad_norm": 0.7599724122468066, "learning_rate": 9.538389413043641e-06, "loss": 0.0194, "step": 825 }, { "epoch": 0.16406793127420796, "grad_norm": 0.4138470932916712, "learning_rate": 9.537038182063138e-06, "loss": 0.009, "step": 826 }, { "epoch": 0.1642665607309564, "grad_norm": 1.0081381826347446, "learning_rate": 9.535685072299668e-06, "loss": 0.0172, "step": 827 }, { "epoch": 0.16446519018770484, "grad_norm": 1.0432367672352847, "learning_rate": 9.53433008431355e-06, "loss": 0.0161, "step": 828 }, { "epoch": 0.16466381964445329, "grad_norm": 0.7812431690369702, "learning_rate": 9.532973218665887e-06, "loss": 0.0237, "step": 829 }, { "epoch": 0.1648624491012017, "grad_norm": 0.4534298510315967, "learning_rate": 9.531614475918552e-06, "loss": 0.0138, "step": 830 }, { "epoch": 0.16506107855795013, "grad_norm": 0.521000256028603, "learning_rate": 9.530253856634202e-06, "loss": 0.014, "step": 831 }, { "epoch": 0.16525970801469858, "grad_norm": 0.4111263927903728, "learning_rate": 9.528891361376265e-06, "loss": 0.0102, "step": 832 }, { "epoch": 0.165458337471447, "grad_norm": 0.6400161006473448, "learning_rate": 9.527526990708952e-06, "loss": 0.0135, "step": 833 }, { "epoch": 0.16565696692819545, "grad_norm": 1.0992489588141927, "learning_rate": 9.526160745197247e-06, "loss": 0.022, "step": 834 }, { "epoch": 0.16585559638494388, "grad_norm": 0.2568053298935382, "learning_rate": 9.524792625406908e-06, "loss": 0.0133, "step": 835 }, { "epoch": 0.16605422584169233, "grad_norm": 0.6766945867655767, "learning_rate": 9.523422631904473e-06, "loss": 0.0104, "step": 836 }, { "epoch": 0.16625285529844075, "grad_norm": 0.5619582062884672, "learning_rate": 9.522050765257257e-06, "loss": 0.0253, "step": 837 }, { "epoch": 0.1664514847551892, "grad_norm": 0.4221715846394782, "learning_rate": 9.52067702603335e-06, "loss": 0.0151, "step": 838 }, { "epoch": 0.16665011421193762, "grad_norm": 0.22009747488862705, "learning_rate": 9.519301414801612e-06, "loss": 0.0089, "step": 839 }, { "epoch": 0.16684874366868607, "grad_norm": 0.6287660735362872, "learning_rate": 9.517923932131685e-06, "loss": 0.0216, "step": 840 }, { "epoch": 0.1670473731254345, "grad_norm": 0.4170499590382144, "learning_rate": 9.516544578593981e-06, "loss": 0.0108, "step": 841 }, { "epoch": 0.16724600258218295, "grad_norm": 0.9421745387040598, "learning_rate": 9.51516335475969e-06, "loss": 0.0177, "step": 842 }, { "epoch": 0.16744463203893137, "grad_norm": 0.4757860318481506, "learning_rate": 9.513780261200774e-06, "loss": 0.011, "step": 843 }, { "epoch": 0.16764326149567982, "grad_norm": 0.7429722294449872, "learning_rate": 9.512395298489974e-06, "loss": 0.0148, "step": 844 }, { "epoch": 0.16784189095242824, "grad_norm": 0.5206174876646718, "learning_rate": 9.511008467200798e-06, "loss": 0.0156, "step": 845 }, { "epoch": 0.1680405204091767, "grad_norm": 1.3998688162229813, "learning_rate": 9.509619767907534e-06, "loss": 0.0239, "step": 846 }, { "epoch": 0.16823914986592511, "grad_norm": 0.8208188686925355, "learning_rate": 9.508229201185242e-06, "loss": 0.0135, "step": 847 }, { "epoch": 0.16843777932267356, "grad_norm": 0.3774258541529112, "learning_rate": 9.506836767609751e-06, "loss": 0.014, "step": 848 }, { "epoch": 0.168636408779422, "grad_norm": 0.668337123256141, "learning_rate": 9.505442467757666e-06, "loss": 0.013, "step": 849 }, { "epoch": 0.16883503823617044, "grad_norm": 0.4690654187501145, "learning_rate": 9.504046302206368e-06, "loss": 0.0229, "step": 850 }, { "epoch": 0.16903366769291886, "grad_norm": 0.9382417671995416, "learning_rate": 9.50264827153401e-06, "loss": 0.0201, "step": 851 }, { "epoch": 0.16923229714966728, "grad_norm": 0.5705256377710812, "learning_rate": 9.501248376319508e-06, "loss": 0.0128, "step": 852 }, { "epoch": 0.16943092660641573, "grad_norm": 0.48645671259102613, "learning_rate": 9.499846617142563e-06, "loss": 0.014, "step": 853 }, { "epoch": 0.16962955606316416, "grad_norm": 1.4022653145077122, "learning_rate": 9.498442994583639e-06, "loss": 0.0174, "step": 854 }, { "epoch": 0.1698281855199126, "grad_norm": 0.7638533431862135, "learning_rate": 9.497037509223977e-06, "loss": 0.0149, "step": 855 }, { "epoch": 0.17002681497666103, "grad_norm": 1.519768782139612, "learning_rate": 9.495630161645584e-06, "loss": 0.0255, "step": 856 }, { "epoch": 0.17022544443340948, "grad_norm": 0.44744869619282945, "learning_rate": 9.494220952431243e-06, "loss": 0.0132, "step": 857 }, { "epoch": 0.1704240738901579, "grad_norm": 0.5073442816162103, "learning_rate": 9.492809882164509e-06, "loss": 0.0219, "step": 858 }, { "epoch": 0.17062270334690635, "grad_norm": 0.6578886831791138, "learning_rate": 9.491396951429698e-06, "loss": 0.0118, "step": 859 }, { "epoch": 0.17082133280365477, "grad_norm": 0.440174527734574, "learning_rate": 9.48998216081191e-06, "loss": 0.023, "step": 860 }, { "epoch": 0.17101996226040322, "grad_norm": 1.2110068679815877, "learning_rate": 9.488565510897006e-06, "loss": 0.0213, "step": 861 }, { "epoch": 0.17121859171715165, "grad_norm": 0.34225622494013835, "learning_rate": 9.487147002271618e-06, "loss": 0.0218, "step": 862 }, { "epoch": 0.1714172211739001, "grad_norm": 0.9248969475348255, "learning_rate": 9.48572663552315e-06, "loss": 0.0172, "step": 863 }, { "epoch": 0.17161585063064852, "grad_norm": 0.3111389224565366, "learning_rate": 9.484304411239774e-06, "loss": 0.0194, "step": 864 }, { "epoch": 0.17181448008739697, "grad_norm": 0.5426022611346796, "learning_rate": 9.482880330010434e-06, "loss": 0.0177, "step": 865 }, { "epoch": 0.1720131095441454, "grad_norm": 0.9989174638535595, "learning_rate": 9.481454392424836e-06, "loss": 0.0179, "step": 866 }, { "epoch": 0.17221173900089384, "grad_norm": 1.0032440886756715, "learning_rate": 9.480026599073463e-06, "loss": 0.0214, "step": 867 }, { "epoch": 0.17241036845764227, "grad_norm": 0.29351054118887016, "learning_rate": 9.478596950547561e-06, "loss": 0.0084, "step": 868 }, { "epoch": 0.17260899791439072, "grad_norm": 0.5469059731359652, "learning_rate": 9.477165447439148e-06, "loss": 0.0228, "step": 869 }, { "epoch": 0.17280762737113914, "grad_norm": 0.5151774165372478, "learning_rate": 9.475732090341006e-06, "loss": 0.017, "step": 870 }, { "epoch": 0.1730062568278876, "grad_norm": 0.3849241059477399, "learning_rate": 9.474296879846688e-06, "loss": 0.0149, "step": 871 }, { "epoch": 0.173204886284636, "grad_norm": 0.2846207510615562, "learning_rate": 9.47285981655051e-06, "loss": 0.0134, "step": 872 }, { "epoch": 0.17340351574138443, "grad_norm": 0.3822669264577504, "learning_rate": 9.471420901047564e-06, "loss": 0.0126, "step": 873 }, { "epoch": 0.17360214519813288, "grad_norm": 0.5809048365250173, "learning_rate": 9.469980133933701e-06, "loss": 0.0186, "step": 874 }, { "epoch": 0.1738007746548813, "grad_norm": 0.8463872528550324, "learning_rate": 9.46853751580554e-06, "loss": 0.0195, "step": 875 }, { "epoch": 0.17399940411162976, "grad_norm": 0.4607531534725536, "learning_rate": 9.467093047260468e-06, "loss": 0.0151, "step": 876 }, { "epoch": 0.17419803356837818, "grad_norm": 1.1647174596925964, "learning_rate": 9.465646728896641e-06, "loss": 0.0138, "step": 877 }, { "epoch": 0.17439666302512663, "grad_norm": 0.6812902706877884, "learning_rate": 9.464198561312972e-06, "loss": 0.0146, "step": 878 }, { "epoch": 0.17459529248187505, "grad_norm": 1.118536537307821, "learning_rate": 9.462748545109152e-06, "loss": 0.0144, "step": 879 }, { "epoch": 0.1747939219386235, "grad_norm": 0.6063904311802616, "learning_rate": 9.461296680885628e-06, "loss": 0.0073, "step": 880 }, { "epoch": 0.17499255139537193, "grad_norm": 1.5335959139523947, "learning_rate": 9.459842969243615e-06, "loss": 0.0237, "step": 881 }, { "epoch": 0.17519118085212038, "grad_norm": 0.4453751489154979, "learning_rate": 9.458387410785096e-06, "loss": 0.0129, "step": 882 }, { "epoch": 0.1753898103088688, "grad_norm": 0.5312389194281991, "learning_rate": 9.456930006112814e-06, "loss": 0.0231, "step": 883 }, { "epoch": 0.17558843976561725, "grad_norm": 0.5246562462701451, "learning_rate": 9.45547075583028e-06, "loss": 0.0141, "step": 884 }, { "epoch": 0.17578706922236567, "grad_norm": 0.3711081470488199, "learning_rate": 9.454009660541769e-06, "loss": 0.0126, "step": 885 }, { "epoch": 0.17598569867911412, "grad_norm": 0.4214526210893554, "learning_rate": 9.452546720852317e-06, "loss": 0.0098, "step": 886 }, { "epoch": 0.17618432813586254, "grad_norm": 0.730638217900042, "learning_rate": 9.451081937367725e-06, "loss": 0.0155, "step": 887 }, { "epoch": 0.176382957592611, "grad_norm": 0.5304446182299737, "learning_rate": 9.449615310694563e-06, "loss": 0.0158, "step": 888 }, { "epoch": 0.17658158704935942, "grad_norm": 0.4787768591240991, "learning_rate": 9.448146841440156e-06, "loss": 0.011, "step": 889 }, { "epoch": 0.17678021650610787, "grad_norm": 0.6573768696874656, "learning_rate": 9.446676530212596e-06, "loss": 0.0121, "step": 890 }, { "epoch": 0.1769788459628563, "grad_norm": 0.8738613272343506, "learning_rate": 9.445204377620739e-06, "loss": 0.0218, "step": 891 }, { "epoch": 0.17717747541960474, "grad_norm": 0.6468029328396412, "learning_rate": 9.443730384274199e-06, "loss": 0.0126, "step": 892 }, { "epoch": 0.17737610487635316, "grad_norm": 0.6916082592292682, "learning_rate": 9.442254550783357e-06, "loss": 0.0165, "step": 893 }, { "epoch": 0.17757473433310159, "grad_norm": 0.4406671563052852, "learning_rate": 9.440776877759354e-06, "loss": 0.0115, "step": 894 }, { "epoch": 0.17777336378985004, "grad_norm": 1.0537465676266842, "learning_rate": 9.439297365814095e-06, "loss": 0.0174, "step": 895 }, { "epoch": 0.17797199324659846, "grad_norm": 1.2213613739765894, "learning_rate": 9.437816015560241e-06, "loss": 0.0183, "step": 896 }, { "epoch": 0.1781706227033469, "grad_norm": 0.4783500290161957, "learning_rate": 9.43633282761122e-06, "loss": 0.0092, "step": 897 }, { "epoch": 0.17836925216009533, "grad_norm": 0.43432963042155853, "learning_rate": 9.434847802581216e-06, "loss": 0.0203, "step": 898 }, { "epoch": 0.17856788161684378, "grad_norm": 0.7897317815340374, "learning_rate": 9.43336094108518e-06, "loss": 0.0108, "step": 899 }, { "epoch": 0.1787665110735922, "grad_norm": 0.3008681013432827, "learning_rate": 9.431872243738817e-06, "loss": 0.0137, "step": 900 }, { "epoch": 0.17896514053034066, "grad_norm": 0.6771184704989937, "learning_rate": 9.430381711158597e-06, "loss": 0.0262, "step": 901 }, { "epoch": 0.17916376998708908, "grad_norm": 0.4029134503869768, "learning_rate": 9.428889343961745e-06, "loss": 0.0082, "step": 902 }, { "epoch": 0.17936239944383753, "grad_norm": 0.6499915119874066, "learning_rate": 9.427395142766253e-06, "loss": 0.0104, "step": 903 }, { "epoch": 0.17956102890058595, "grad_norm": 1.2873816596565066, "learning_rate": 9.425899108190866e-06, "loss": 0.014, "step": 904 }, { "epoch": 0.1797596583573344, "grad_norm": 0.917742932464249, "learning_rate": 9.42440124085509e-06, "loss": 0.0098, "step": 905 }, { "epoch": 0.17995828781408282, "grad_norm": 0.6917074788463903, "learning_rate": 9.42290154137919e-06, "loss": 0.0111, "step": 906 }, { "epoch": 0.18015691727083127, "grad_norm": 0.4982836344620947, "learning_rate": 9.421400010384191e-06, "loss": 0.0145, "step": 907 }, { "epoch": 0.1803555467275797, "grad_norm": 1.086400992320905, "learning_rate": 9.419896648491875e-06, "loss": 0.0174, "step": 908 }, { "epoch": 0.18055417618432815, "grad_norm": 0.8193422731728996, "learning_rate": 9.418391456324785e-06, "loss": 0.0105, "step": 909 }, { "epoch": 0.18075280564107657, "grad_norm": 0.7103752650742982, "learning_rate": 9.416884434506217e-06, "loss": 0.0139, "step": 910 }, { "epoch": 0.18095143509782502, "grad_norm": 0.6962334695928468, "learning_rate": 9.415375583660227e-06, "loss": 0.0128, "step": 911 }, { "epoch": 0.18115006455457344, "grad_norm": 0.6806563393000292, "learning_rate": 9.41386490441163e-06, "loss": 0.0151, "step": 912 }, { "epoch": 0.1813486940113219, "grad_norm": 0.477963378101734, "learning_rate": 9.412352397385997e-06, "loss": 0.0156, "step": 913 }, { "epoch": 0.18154732346807032, "grad_norm": 1.377024847688744, "learning_rate": 9.410838063209653e-06, "loss": 0.0137, "step": 914 }, { "epoch": 0.18174595292481874, "grad_norm": 0.840278914999452, "learning_rate": 9.409321902509686e-06, "loss": 0.0179, "step": 915 }, { "epoch": 0.1819445823815672, "grad_norm": 1.2080657080079076, "learning_rate": 9.407803915913934e-06, "loss": 0.0142, "step": 916 }, { "epoch": 0.1821432118383156, "grad_norm": 0.6470778636335309, "learning_rate": 9.406284104050994e-06, "loss": 0.0153, "step": 917 }, { "epoch": 0.18234184129506406, "grad_norm": 0.6448817205204879, "learning_rate": 9.40476246755022e-06, "loss": 0.0147, "step": 918 }, { "epoch": 0.18254047075181248, "grad_norm": 0.8677307803730453, "learning_rate": 9.403239007041719e-06, "loss": 0.0224, "step": 919 }, { "epoch": 0.18273910020856093, "grad_norm": 1.0883235135924039, "learning_rate": 9.401713723156355e-06, "loss": 0.0156, "step": 920 }, { "epoch": 0.18293772966530936, "grad_norm": 1.378109947347798, "learning_rate": 9.400186616525747e-06, "loss": 0.0214, "step": 921 }, { "epoch": 0.1831363591220578, "grad_norm": 0.45828575967420676, "learning_rate": 9.398657687782264e-06, "loss": 0.0143, "step": 922 }, { "epoch": 0.18333498857880623, "grad_norm": 0.4067470046392445, "learning_rate": 9.397126937559041e-06, "loss": 0.0106, "step": 923 }, { "epoch": 0.18353361803555468, "grad_norm": 0.8655229021701957, "learning_rate": 9.395594366489956e-06, "loss": 0.0164, "step": 924 }, { "epoch": 0.1837322474923031, "grad_norm": 0.6544805313594291, "learning_rate": 9.394059975209644e-06, "loss": 0.0161, "step": 925 }, { "epoch": 0.18393087694905155, "grad_norm": 0.5260990122458291, "learning_rate": 9.392523764353497e-06, "loss": 0.0121, "step": 926 }, { "epoch": 0.18412950640579998, "grad_norm": 0.49273647924970304, "learning_rate": 9.390985734557659e-06, "loss": 0.0137, "step": 927 }, { "epoch": 0.18432813586254843, "grad_norm": 0.5039646852586156, "learning_rate": 9.389445886459026e-06, "loss": 0.0118, "step": 928 }, { "epoch": 0.18452676531929685, "grad_norm": 0.6159677479137697, "learning_rate": 9.387904220695245e-06, "loss": 0.0096, "step": 929 }, { "epoch": 0.1847253947760453, "grad_norm": 0.4398053369763513, "learning_rate": 9.386360737904722e-06, "loss": 0.0082, "step": 930 }, { "epoch": 0.18492402423279372, "grad_norm": 0.5382584798731853, "learning_rate": 9.384815438726608e-06, "loss": 0.0143, "step": 931 }, { "epoch": 0.18512265368954217, "grad_norm": 0.7022731175340937, "learning_rate": 9.383268323800815e-06, "loss": 0.0231, "step": 932 }, { "epoch": 0.1853212831462906, "grad_norm": 0.45625159282982203, "learning_rate": 9.381719393767998e-06, "loss": 0.018, "step": 933 }, { "epoch": 0.18551991260303904, "grad_norm": 0.5960828207873747, "learning_rate": 9.380168649269566e-06, "loss": 0.0147, "step": 934 }, { "epoch": 0.18571854205978747, "grad_norm": 0.8282241450571749, "learning_rate": 9.378616090947685e-06, "loss": 0.0215, "step": 935 }, { "epoch": 0.1859171715165359, "grad_norm": 0.4426456726841343, "learning_rate": 9.377061719445264e-06, "loss": 0.0229, "step": 936 }, { "epoch": 0.18611580097328434, "grad_norm": 0.5920397094573076, "learning_rate": 9.375505535405969e-06, "loss": 0.0173, "step": 937 }, { "epoch": 0.18631443043003276, "grad_norm": 0.2981282792836881, "learning_rate": 9.373947539474212e-06, "loss": 0.0112, "step": 938 }, { "epoch": 0.1865130598867812, "grad_norm": 0.7856812810221333, "learning_rate": 9.372387732295162e-06, "loss": 0.0119, "step": 939 }, { "epoch": 0.18671168934352964, "grad_norm": 0.7020478883034995, "learning_rate": 9.370826114514729e-06, "loss": 0.0138, "step": 940 }, { "epoch": 0.18691031880027809, "grad_norm": 0.3879630472447985, "learning_rate": 9.369262686779578e-06, "loss": 0.0098, "step": 941 }, { "epoch": 0.1871089482570265, "grad_norm": 0.26290487954249336, "learning_rate": 9.367697449737126e-06, "loss": 0.0086, "step": 942 }, { "epoch": 0.18730757771377496, "grad_norm": 0.8158600730845823, "learning_rate": 9.366130404035533e-06, "loss": 0.0127, "step": 943 }, { "epoch": 0.18750620717052338, "grad_norm": 0.43011489476490694, "learning_rate": 9.364561550323711e-06, "loss": 0.0138, "step": 944 }, { "epoch": 0.18770483662727183, "grad_norm": 0.6422387414139117, "learning_rate": 9.362990889251325e-06, "loss": 0.0183, "step": 945 }, { "epoch": 0.18790346608402025, "grad_norm": 0.8940322891219529, "learning_rate": 9.361418421468777e-06, "loss": 0.0187, "step": 946 }, { "epoch": 0.1881020955407687, "grad_norm": 1.4463527888545427, "learning_rate": 9.359844147627231e-06, "loss": 0.0214, "step": 947 }, { "epoch": 0.18830072499751713, "grad_norm": 0.4102746620553028, "learning_rate": 9.358268068378589e-06, "loss": 0.0085, "step": 948 }, { "epoch": 0.18849935445426558, "grad_norm": 0.6618029175469268, "learning_rate": 9.356690184375504e-06, "loss": 0.0168, "step": 949 }, { "epoch": 0.188697983911014, "grad_norm": 0.5539961291378328, "learning_rate": 9.355110496271376e-06, "loss": 0.0118, "step": 950 }, { "epoch": 0.18889661336776245, "grad_norm": 0.8598663158866856, "learning_rate": 9.353529004720354e-06, "loss": 0.0119, "step": 951 }, { "epoch": 0.18909524282451087, "grad_norm": 0.5890081936780608, "learning_rate": 9.35194571037733e-06, "loss": 0.0199, "step": 952 }, { "epoch": 0.18929387228125932, "grad_norm": 1.2607952878561444, "learning_rate": 9.350360613897945e-06, "loss": 0.021, "step": 953 }, { "epoch": 0.18949250173800775, "grad_norm": 0.5805466556115377, "learning_rate": 9.348773715938587e-06, "loss": 0.0103, "step": 954 }, { "epoch": 0.18969113119475617, "grad_norm": 0.6262277582379767, "learning_rate": 9.347185017156388e-06, "loss": 0.014, "step": 955 }, { "epoch": 0.18988976065150462, "grad_norm": 0.8635643634403196, "learning_rate": 9.345594518209227e-06, "loss": 0.02, "step": 956 }, { "epoch": 0.19008839010825304, "grad_norm": 0.4163572607700282, "learning_rate": 9.344002219755728e-06, "loss": 0.0099, "step": 957 }, { "epoch": 0.1902870195650015, "grad_norm": 0.5499649933024405, "learning_rate": 9.34240812245526e-06, "loss": 0.0123, "step": 958 }, { "epoch": 0.19048564902174991, "grad_norm": 0.4621935969511572, "learning_rate": 9.340812226967936e-06, "loss": 0.0141, "step": 959 }, { "epoch": 0.19068427847849836, "grad_norm": 0.6322621882728303, "learning_rate": 9.339214533954618e-06, "loss": 0.0138, "step": 960 }, { "epoch": 0.1908829079352468, "grad_norm": 0.5315385506545031, "learning_rate": 9.337615044076906e-06, "loss": 0.0156, "step": 961 }, { "epoch": 0.19108153739199524, "grad_norm": 0.8395315780253293, "learning_rate": 9.336013757997147e-06, "loss": 0.014, "step": 962 }, { "epoch": 0.19128016684874366, "grad_norm": 0.515182257587864, "learning_rate": 9.334410676378433e-06, "loss": 0.0151, "step": 963 }, { "epoch": 0.1914787963054921, "grad_norm": 0.7503607628649587, "learning_rate": 9.3328057998846e-06, "loss": 0.0181, "step": 964 }, { "epoch": 0.19167742576224053, "grad_norm": 0.6292960608616276, "learning_rate": 9.331199129180224e-06, "loss": 0.0256, "step": 965 }, { "epoch": 0.19187605521898898, "grad_norm": 0.657172738095144, "learning_rate": 9.329590664930625e-06, "loss": 0.0131, "step": 966 }, { "epoch": 0.1920746846757374, "grad_norm": 0.6758585625636307, "learning_rate": 9.32798040780187e-06, "loss": 0.0233, "step": 967 }, { "epoch": 0.19227331413248586, "grad_norm": 1.138944554532619, "learning_rate": 9.326368358460757e-06, "loss": 0.0166, "step": 968 }, { "epoch": 0.19247194358923428, "grad_norm": 0.9844802183689632, "learning_rate": 9.324754517574844e-06, "loss": 0.0147, "step": 969 }, { "epoch": 0.19267057304598273, "grad_norm": 0.36727739091125694, "learning_rate": 9.323138885812416e-06, "loss": 0.0157, "step": 970 }, { "epoch": 0.19286920250273115, "grad_norm": 0.39266585464073445, "learning_rate": 9.3215214638425e-06, "loss": 0.0147, "step": 971 }, { "epoch": 0.1930678319594796, "grad_norm": 0.694569390497426, "learning_rate": 9.319902252334878e-06, "loss": 0.0132, "step": 972 }, { "epoch": 0.19326646141622802, "grad_norm": 0.8122752981584541, "learning_rate": 9.318281251960059e-06, "loss": 0.0152, "step": 973 }, { "epoch": 0.19346509087297648, "grad_norm": 1.356751012776408, "learning_rate": 9.316658463389296e-06, "loss": 0.0159, "step": 974 }, { "epoch": 0.1936637203297249, "grad_norm": 0.4514146325071209, "learning_rate": 9.315033887294588e-06, "loss": 0.0085, "step": 975 }, { "epoch": 0.19386234978647332, "grad_norm": 0.4492629322519748, "learning_rate": 9.313407524348667e-06, "loss": 0.0121, "step": 976 }, { "epoch": 0.19406097924322177, "grad_norm": 0.42766136851441033, "learning_rate": 9.311779375225012e-06, "loss": 0.0113, "step": 977 }, { "epoch": 0.1942596086999702, "grad_norm": 0.3782072201329815, "learning_rate": 9.310149440597833e-06, "loss": 0.006, "step": 978 }, { "epoch": 0.19445823815671864, "grad_norm": 1.3010134952460966, "learning_rate": 9.308517721142088e-06, "loss": 0.0219, "step": 979 }, { "epoch": 0.19465686761346707, "grad_norm": 1.7188646382221526, "learning_rate": 9.30688421753347e-06, "loss": 0.021, "step": 980 }, { "epoch": 0.19485549707021552, "grad_norm": 1.113751111511706, "learning_rate": 9.30524893044841e-06, "loss": 0.0213, "step": 981 }, { "epoch": 0.19505412652696394, "grad_norm": 0.33969922760265947, "learning_rate": 9.303611860564079e-06, "loss": 0.0088, "step": 982 }, { "epoch": 0.1952527559837124, "grad_norm": 0.6797604296474096, "learning_rate": 9.301973008558387e-06, "loss": 0.0187, "step": 983 }, { "epoch": 0.1954513854404608, "grad_norm": 0.7594684777223187, "learning_rate": 9.30033237510998e-06, "loss": 0.0195, "step": 984 }, { "epoch": 0.19565001489720926, "grad_norm": 1.3637727775160045, "learning_rate": 9.298689960898242e-06, "loss": 0.0126, "step": 985 }, { "epoch": 0.19584864435395768, "grad_norm": 1.0630474301871424, "learning_rate": 9.297045766603297e-06, "loss": 0.0141, "step": 986 }, { "epoch": 0.19604727381070614, "grad_norm": 0.6542337994520813, "learning_rate": 9.295399792906002e-06, "loss": 0.021, "step": 987 }, { "epoch": 0.19624590326745456, "grad_norm": 0.5665306835890809, "learning_rate": 9.293752040487956e-06, "loss": 0.0142, "step": 988 }, { "epoch": 0.196444532724203, "grad_norm": 0.7553552246755394, "learning_rate": 9.292102510031488e-06, "loss": 0.0132, "step": 989 }, { "epoch": 0.19664316218095143, "grad_norm": 1.470905402981988, "learning_rate": 9.29045120221967e-06, "loss": 0.0256, "step": 990 }, { "epoch": 0.19684179163769988, "grad_norm": 0.5202547467174007, "learning_rate": 9.288798117736307e-06, "loss": 0.0128, "step": 991 }, { "epoch": 0.1970404210944483, "grad_norm": 0.48524424654365406, "learning_rate": 9.287143257265936e-06, "loss": 0.0135, "step": 992 }, { "epoch": 0.19723905055119675, "grad_norm": 0.6718955314932935, "learning_rate": 9.285486621493836e-06, "loss": 0.0125, "step": 993 }, { "epoch": 0.19743768000794518, "grad_norm": 0.8034984592157217, "learning_rate": 9.283828211106019e-06, "loss": 0.0199, "step": 994 }, { "epoch": 0.19763630946469363, "grad_norm": 0.5390976895123648, "learning_rate": 9.28216802678923e-06, "loss": 0.0131, "step": 995 }, { "epoch": 0.19783493892144205, "grad_norm": 0.6418655391021179, "learning_rate": 9.280506069230945e-06, "loss": 0.0139, "step": 996 }, { "epoch": 0.19803356837819047, "grad_norm": 0.6491724713923708, "learning_rate": 9.278842339119388e-06, "loss": 0.0159, "step": 997 }, { "epoch": 0.19823219783493892, "grad_norm": 0.40209317799866046, "learning_rate": 9.277176837143501e-06, "loss": 0.0074, "step": 998 }, { "epoch": 0.19843082729168735, "grad_norm": 0.5920855302131218, "learning_rate": 9.27550956399297e-06, "loss": 0.023, "step": 999 }, { "epoch": 0.1986294567484358, "grad_norm": 0.37861800255788186, "learning_rate": 9.27384052035821e-06, "loss": 0.0131, "step": 1000 }, { "epoch": 0.19882808620518422, "grad_norm": 0.4811422211153491, "learning_rate": 9.27216970693037e-06, "loss": 0.0152, "step": 1001 }, { "epoch": 0.19902671566193267, "grad_norm": 0.6166117246588404, "learning_rate": 9.270497124401332e-06, "loss": 0.0159, "step": 1002 }, { "epoch": 0.1992253451186811, "grad_norm": 0.6776544917793748, "learning_rate": 9.268822773463715e-06, "loss": 0.014, "step": 1003 }, { "epoch": 0.19942397457542954, "grad_norm": 0.3906605633332408, "learning_rate": 9.267146654810859e-06, "loss": 0.0134, "step": 1004 }, { "epoch": 0.19962260403217796, "grad_norm": 0.3221183282074231, "learning_rate": 9.265468769136847e-06, "loss": 0.0117, "step": 1005 }, { "epoch": 0.19982123348892641, "grad_norm": 0.3942134464598503, "learning_rate": 9.26378911713649e-06, "loss": 0.0114, "step": 1006 }, { "epoch": 0.20001986294567484, "grad_norm": 0.875008215072221, "learning_rate": 9.262107699505329e-06, "loss": 0.0226, "step": 1007 }, { "epoch": 0.2002184924024233, "grad_norm": 0.3797025965662609, "learning_rate": 9.260424516939636e-06, "loss": 0.0122, "step": 1008 }, { "epoch": 0.2004171218591717, "grad_norm": 0.7343719704793019, "learning_rate": 9.25873957013642e-06, "loss": 0.0145, "step": 1009 }, { "epoch": 0.20061575131592016, "grad_norm": 0.536506479455833, "learning_rate": 9.257052859793412e-06, "loss": 0.0143, "step": 1010 }, { "epoch": 0.20081438077266858, "grad_norm": 0.5622360593336543, "learning_rate": 9.255364386609077e-06, "loss": 0.0147, "step": 1011 }, { "epoch": 0.20101301022941703, "grad_norm": 0.4163313975119902, "learning_rate": 9.253674151282612e-06, "loss": 0.0127, "step": 1012 }, { "epoch": 0.20121163968616546, "grad_norm": 0.5537476076401138, "learning_rate": 9.25198215451394e-06, "loss": 0.0142, "step": 1013 }, { "epoch": 0.2014102691429139, "grad_norm": 0.6048080720095486, "learning_rate": 9.250288397003715e-06, "loss": 0.0128, "step": 1014 }, { "epoch": 0.20160889859966233, "grad_norm": 0.9243481861891079, "learning_rate": 9.248592879453323e-06, "loss": 0.024, "step": 1015 }, { "epoch": 0.20180752805641078, "grad_norm": 0.3405567732176348, "learning_rate": 9.246895602564874e-06, "loss": 0.0085, "step": 1016 }, { "epoch": 0.2020061575131592, "grad_norm": 0.8790274897891412, "learning_rate": 9.245196567041207e-06, "loss": 0.0128, "step": 1017 }, { "epoch": 0.20220478696990762, "grad_norm": 0.6058890979082735, "learning_rate": 9.243495773585896e-06, "loss": 0.0165, "step": 1018 }, { "epoch": 0.20240341642665607, "grad_norm": 0.3990747368828865, "learning_rate": 9.241793222903233e-06, "loss": 0.0131, "step": 1019 }, { "epoch": 0.2026020458834045, "grad_norm": 0.4972091523991518, "learning_rate": 9.240088915698243e-06, "loss": 0.0152, "step": 1020 }, { "epoch": 0.20280067534015295, "grad_norm": 0.9145107437455913, "learning_rate": 9.238382852676679e-06, "loss": 0.013, "step": 1021 }, { "epoch": 0.20299930479690137, "grad_norm": 0.7916389183212782, "learning_rate": 9.236675034545022e-06, "loss": 0.0158, "step": 1022 }, { "epoch": 0.20319793425364982, "grad_norm": 1.312567937111962, "learning_rate": 9.234965462010475e-06, "loss": 0.0235, "step": 1023 }, { "epoch": 0.20339656371039824, "grad_norm": 0.7879443556012908, "learning_rate": 9.233254135780973e-06, "loss": 0.0134, "step": 1024 }, { "epoch": 0.2035951931671467, "grad_norm": 0.7144955186930526, "learning_rate": 9.23154105656517e-06, "loss": 0.0185, "step": 1025 }, { "epoch": 0.20379382262389512, "grad_norm": 0.8772618199422003, "learning_rate": 9.229826225072455e-06, "loss": 0.0115, "step": 1026 }, { "epoch": 0.20399245208064357, "grad_norm": 0.3508519745307009, "learning_rate": 9.228109642012934e-06, "loss": 0.0161, "step": 1027 }, { "epoch": 0.204191081537392, "grad_norm": 1.142282664161455, "learning_rate": 9.226391308097446e-06, "loss": 0.0219, "step": 1028 }, { "epoch": 0.20438971099414044, "grad_norm": 0.6781354611906164, "learning_rate": 9.22467122403755e-06, "loss": 0.0236, "step": 1029 }, { "epoch": 0.20458834045088886, "grad_norm": 0.7331441338639288, "learning_rate": 9.22294939054553e-06, "loss": 0.0144, "step": 1030 }, { "epoch": 0.2047869699076373, "grad_norm": 0.5178653186934357, "learning_rate": 9.221225808334396e-06, "loss": 0.0142, "step": 1031 }, { "epoch": 0.20498559936438573, "grad_norm": 0.2608341992646028, "learning_rate": 9.219500478117883e-06, "loss": 0.0155, "step": 1032 }, { "epoch": 0.20518422882113418, "grad_norm": 1.2106616895089297, "learning_rate": 9.217773400610447e-06, "loss": 0.0175, "step": 1033 }, { "epoch": 0.2053828582778826, "grad_norm": 0.28032336350734227, "learning_rate": 9.21604457652727e-06, "loss": 0.0128, "step": 1034 }, { "epoch": 0.20558148773463106, "grad_norm": 0.6118204505736901, "learning_rate": 9.214314006584256e-06, "loss": 0.0181, "step": 1035 }, { "epoch": 0.20578011719137948, "grad_norm": 0.579822360863569, "learning_rate": 9.21258169149803e-06, "loss": 0.0183, "step": 1036 }, { "epoch": 0.20597874664812793, "grad_norm": 0.4047362585138213, "learning_rate": 9.210847631985946e-06, "loss": 0.0095, "step": 1037 }, { "epoch": 0.20617737610487635, "grad_norm": 0.9852053076273041, "learning_rate": 9.209111828766075e-06, "loss": 0.0219, "step": 1038 }, { "epoch": 0.20637600556162478, "grad_norm": 0.31297991161612315, "learning_rate": 9.207374282557211e-06, "loss": 0.0117, "step": 1039 }, { "epoch": 0.20657463501837323, "grad_norm": 0.4164929500440738, "learning_rate": 9.20563499407887e-06, "loss": 0.0077, "step": 1040 }, { "epoch": 0.20677326447512165, "grad_norm": 0.3180467674371223, "learning_rate": 9.203893964051287e-06, "loss": 0.011, "step": 1041 }, { "epoch": 0.2069718939318701, "grad_norm": 1.0524726224029894, "learning_rate": 9.202151193195426e-06, "loss": 0.0217, "step": 1042 }, { "epoch": 0.20717052338861852, "grad_norm": 0.8350304497147369, "learning_rate": 9.200406682232962e-06, "loss": 0.0206, "step": 1043 }, { "epoch": 0.20736915284536697, "grad_norm": 0.5099630567366636, "learning_rate": 9.198660431886299e-06, "loss": 0.0114, "step": 1044 }, { "epoch": 0.2075677823021154, "grad_norm": 0.45140417770852326, "learning_rate": 9.196912442878555e-06, "loss": 0.0187, "step": 1045 }, { "epoch": 0.20776641175886384, "grad_norm": 0.3877058148364568, "learning_rate": 9.195162715933573e-06, "loss": 0.0146, "step": 1046 }, { "epoch": 0.20796504121561227, "grad_norm": 0.5643704108621737, "learning_rate": 9.19341125177591e-06, "loss": 0.0204, "step": 1047 }, { "epoch": 0.20816367067236072, "grad_norm": 1.1260233639536537, "learning_rate": 9.191658051130845e-06, "loss": 0.0176, "step": 1048 }, { "epoch": 0.20836230012910914, "grad_norm": 0.8511847732487132, "learning_rate": 9.189903114724382e-06, "loss": 0.0185, "step": 1049 }, { "epoch": 0.2085609295858576, "grad_norm": 1.1694023155755944, "learning_rate": 9.188146443283233e-06, "loss": 0.02, "step": 1050 }, { "epoch": 0.208759559042606, "grad_norm": 0.5304190834875754, "learning_rate": 9.186388037534836e-06, "loss": 0.0124, "step": 1051 }, { "epoch": 0.20895818849935446, "grad_norm": 0.9189842414829346, "learning_rate": 9.184627898207346e-06, "loss": 0.0145, "step": 1052 }, { "epoch": 0.2091568179561029, "grad_norm": 0.6809255153940301, "learning_rate": 9.182866026029633e-06, "loss": 0.0188, "step": 1053 }, { "epoch": 0.20935544741285134, "grad_norm": 0.36494308828456623, "learning_rate": 9.181102421731289e-06, "loss": 0.0146, "step": 1054 }, { "epoch": 0.20955407686959976, "grad_norm": 0.734816285805914, "learning_rate": 9.179337086042618e-06, "loss": 0.018, "step": 1055 }, { "epoch": 0.2097527063263482, "grad_norm": 0.734510701009242, "learning_rate": 9.177570019694646e-06, "loss": 0.0149, "step": 1056 }, { "epoch": 0.20995133578309663, "grad_norm": 0.3903628569085709, "learning_rate": 9.175801223419111e-06, "loss": 0.0095, "step": 1057 }, { "epoch": 0.21014996523984508, "grad_norm": 0.18988569699286045, "learning_rate": 9.174030697948472e-06, "loss": 0.0053, "step": 1058 }, { "epoch": 0.2103485946965935, "grad_norm": 0.22752236325582623, "learning_rate": 9.1722584440159e-06, "loss": 0.0103, "step": 1059 }, { "epoch": 0.21054722415334193, "grad_norm": 0.842296179765435, "learning_rate": 9.170484462355287e-06, "loss": 0.0154, "step": 1060 }, { "epoch": 0.21074585361009038, "grad_norm": 0.4916092773194938, "learning_rate": 9.168708753701232e-06, "loss": 0.0156, "step": 1061 }, { "epoch": 0.2109444830668388, "grad_norm": 0.619936716278479, "learning_rate": 9.166931318789058e-06, "loss": 0.0122, "step": 1062 }, { "epoch": 0.21114311252358725, "grad_norm": 0.7419851034682392, "learning_rate": 9.165152158354797e-06, "loss": 0.0168, "step": 1063 }, { "epoch": 0.21134174198033567, "grad_norm": 0.5313205892992046, "learning_rate": 9.163371273135198e-06, "loss": 0.016, "step": 1064 }, { "epoch": 0.21154037143708412, "grad_norm": 1.029560156219283, "learning_rate": 9.161588663867725e-06, "loss": 0.0215, "step": 1065 }, { "epoch": 0.21173900089383255, "grad_norm": 0.9348963919399064, "learning_rate": 9.159804331290553e-06, "loss": 0.0188, "step": 1066 }, { "epoch": 0.211937630350581, "grad_norm": 0.3743440188348788, "learning_rate": 9.158018276142573e-06, "loss": 0.0098, "step": 1067 }, { "epoch": 0.21213625980732942, "grad_norm": 1.169169214182386, "learning_rate": 9.15623049916339e-06, "loss": 0.0166, "step": 1068 }, { "epoch": 0.21233488926407787, "grad_norm": 1.946374912712508, "learning_rate": 9.15444100109332e-06, "loss": 0.0123, "step": 1069 }, { "epoch": 0.2125335187208263, "grad_norm": 0.3756066632440553, "learning_rate": 9.15264978267339e-06, "loss": 0.017, "step": 1070 }, { "epoch": 0.21273214817757474, "grad_norm": 0.615631571698432, "learning_rate": 9.150856844645345e-06, "loss": 0.0148, "step": 1071 }, { "epoch": 0.21293077763432317, "grad_norm": 0.5711931753500064, "learning_rate": 9.149062187751635e-06, "loss": 0.0135, "step": 1072 }, { "epoch": 0.21312940709107162, "grad_norm": 0.3195420308665485, "learning_rate": 9.14726581273543e-06, "loss": 0.0106, "step": 1073 }, { "epoch": 0.21332803654782004, "grad_norm": 0.7675454724598806, "learning_rate": 9.145467720340607e-06, "loss": 0.0186, "step": 1074 }, { "epoch": 0.2135266660045685, "grad_norm": 0.7176331742224797, "learning_rate": 9.143667911311748e-06, "loss": 0.0155, "step": 1075 }, { "epoch": 0.2137252954613169, "grad_norm": 0.3307945238844646, "learning_rate": 9.14186638639416e-06, "loss": 0.0162, "step": 1076 }, { "epoch": 0.21392392491806536, "grad_norm": 0.8244909859270414, "learning_rate": 9.140063146333849e-06, "loss": 0.0124, "step": 1077 }, { "epoch": 0.21412255437481378, "grad_norm": 0.8379862052945656, "learning_rate": 9.138258191877534e-06, "loss": 0.0182, "step": 1078 }, { "epoch": 0.21432118383156223, "grad_norm": 0.31645715871594615, "learning_rate": 9.136451523772644e-06, "loss": 0.0155, "step": 1079 }, { "epoch": 0.21451981328831066, "grad_norm": 0.573196275828462, "learning_rate": 9.134643142767324e-06, "loss": 0.011, "step": 1080 }, { "epoch": 0.21471844274505908, "grad_norm": 0.7554029631929728, "learning_rate": 9.132833049610417e-06, "loss": 0.0152, "step": 1081 }, { "epoch": 0.21491707220180753, "grad_norm": 0.45963932693629606, "learning_rate": 9.131021245051482e-06, "loss": 0.0086, "step": 1082 }, { "epoch": 0.21511570165855595, "grad_norm": 0.3667521583026654, "learning_rate": 9.129207729840787e-06, "loss": 0.0151, "step": 1083 }, { "epoch": 0.2153143311153044, "grad_norm": 0.7758394373843591, "learning_rate": 9.127392504729308e-06, "loss": 0.0119, "step": 1084 }, { "epoch": 0.21551296057205283, "grad_norm": 0.8577031176131009, "learning_rate": 9.125575570468726e-06, "loss": 0.0189, "step": 1085 }, { "epoch": 0.21571159002880128, "grad_norm": 0.4101225829842196, "learning_rate": 9.123756927811429e-06, "loss": 0.0147, "step": 1086 }, { "epoch": 0.2159102194855497, "grad_norm": 0.20450257055095583, "learning_rate": 9.12193657751052e-06, "loss": 0.0097, "step": 1087 }, { "epoch": 0.21610884894229815, "grad_norm": 0.7261120497720621, "learning_rate": 9.120114520319801e-06, "loss": 0.0134, "step": 1088 }, { "epoch": 0.21630747839904657, "grad_norm": 1.0613311521030548, "learning_rate": 9.118290756993787e-06, "loss": 0.0204, "step": 1089 }, { "epoch": 0.21650610785579502, "grad_norm": 0.852239674073818, "learning_rate": 9.116465288287693e-06, "loss": 0.0327, "step": 1090 }, { "epoch": 0.21670473731254344, "grad_norm": 0.682837045981, "learning_rate": 9.114638114957444e-06, "loss": 0.0151, "step": 1091 }, { "epoch": 0.2169033667692919, "grad_norm": 0.5746264192915811, "learning_rate": 9.112809237759675e-06, "loss": 0.0167, "step": 1092 }, { "epoch": 0.21710199622604032, "grad_norm": 0.5489395595406013, "learning_rate": 9.110978657451716e-06, "loss": 0.0104, "step": 1093 }, { "epoch": 0.21730062568278877, "grad_norm": 1.0914265746523228, "learning_rate": 9.109146374791615e-06, "loss": 0.0195, "step": 1094 }, { "epoch": 0.2174992551395372, "grad_norm": 1.041328850309126, "learning_rate": 9.107312390538114e-06, "loss": 0.0177, "step": 1095 }, { "epoch": 0.21769788459628564, "grad_norm": 1.047327530765551, "learning_rate": 9.105476705450666e-06, "loss": 0.0151, "step": 1096 }, { "epoch": 0.21789651405303406, "grad_norm": 0.7382313477764061, "learning_rate": 9.103639320289424e-06, "loss": 0.0116, "step": 1097 }, { "epoch": 0.2180951435097825, "grad_norm": 0.28713859066703235, "learning_rate": 9.10180023581525e-06, "loss": 0.0136, "step": 1098 }, { "epoch": 0.21829377296653094, "grad_norm": 1.132089412775031, "learning_rate": 9.099959452789706e-06, "loss": 0.0177, "step": 1099 }, { "epoch": 0.21849240242327939, "grad_norm": 0.708856856834622, "learning_rate": 9.098116971975058e-06, "loss": 0.0123, "step": 1100 }, { "epoch": 0.2186910318800278, "grad_norm": 0.6004021170139507, "learning_rate": 9.096272794134276e-06, "loss": 0.0214, "step": 1101 }, { "epoch": 0.21888966133677623, "grad_norm": 1.5930865385924153, "learning_rate": 9.094426920031033e-06, "loss": 0.0292, "step": 1102 }, { "epoch": 0.21908829079352468, "grad_norm": 0.7947683989906318, "learning_rate": 9.092579350429703e-06, "loss": 0.014, "step": 1103 }, { "epoch": 0.2192869202502731, "grad_norm": 0.9955767036580443, "learning_rate": 9.090730086095359e-06, "loss": 0.0161, "step": 1104 }, { "epoch": 0.21948554970702155, "grad_norm": 0.3082258407552644, "learning_rate": 9.088879127793786e-06, "loss": 0.0102, "step": 1105 }, { "epoch": 0.21968417916376998, "grad_norm": 0.7479627577909793, "learning_rate": 9.08702647629146e-06, "loss": 0.01, "step": 1106 }, { "epoch": 0.21988280862051843, "grad_norm": 1.2508502872987448, "learning_rate": 9.085172132355563e-06, "loss": 0.0181, "step": 1107 }, { "epoch": 0.22008143807726685, "grad_norm": 1.3082638036499745, "learning_rate": 9.083316096753979e-06, "loss": 0.0182, "step": 1108 }, { "epoch": 0.2202800675340153, "grad_norm": 0.48875456172981985, "learning_rate": 9.081458370255285e-06, "loss": 0.0155, "step": 1109 }, { "epoch": 0.22047869699076372, "grad_norm": 0.43843962541786946, "learning_rate": 9.079598953628769e-06, "loss": 0.0118, "step": 1110 }, { "epoch": 0.22067732644751217, "grad_norm": 0.692522124440947, "learning_rate": 9.077737847644411e-06, "loss": 0.0168, "step": 1111 }, { "epoch": 0.2208759559042606, "grad_norm": 0.5909955076987183, "learning_rate": 9.075875053072895e-06, "loss": 0.0146, "step": 1112 }, { "epoch": 0.22107458536100905, "grad_norm": 0.8872938011038043, "learning_rate": 9.0740105706856e-06, "loss": 0.0168, "step": 1113 }, { "epoch": 0.22127321481775747, "grad_norm": 0.44692462034582914, "learning_rate": 9.072144401254607e-06, "loss": 0.0096, "step": 1114 }, { "epoch": 0.22147184427450592, "grad_norm": 0.8271646623948226, "learning_rate": 9.070276545552696e-06, "loss": 0.0113, "step": 1115 }, { "epoch": 0.22167047373125434, "grad_norm": 0.46981013701744095, "learning_rate": 9.068407004353346e-06, "loss": 0.0091, "step": 1116 }, { "epoch": 0.2218691031880028, "grad_norm": 0.6096189754072621, "learning_rate": 9.066535778430727e-06, "loss": 0.0106, "step": 1117 }, { "epoch": 0.22206773264475121, "grad_norm": 1.2355353818301986, "learning_rate": 9.064662868559714e-06, "loss": 0.0177, "step": 1118 }, { "epoch": 0.22226636210149966, "grad_norm": 0.7921361207312948, "learning_rate": 9.062788275515878e-06, "loss": 0.0117, "step": 1119 }, { "epoch": 0.2224649915582481, "grad_norm": 0.621118395225743, "learning_rate": 9.060912000075489e-06, "loss": 0.0184, "step": 1120 }, { "epoch": 0.22266362101499654, "grad_norm": 1.2199946768983834, "learning_rate": 9.059034043015505e-06, "loss": 0.022, "step": 1121 }, { "epoch": 0.22286225047174496, "grad_norm": 1.200785379496156, "learning_rate": 9.057154405113588e-06, "loss": 0.0146, "step": 1122 }, { "epoch": 0.22306087992849338, "grad_norm": 0.7879812928114914, "learning_rate": 9.055273087148095e-06, "loss": 0.0154, "step": 1123 }, { "epoch": 0.22325950938524183, "grad_norm": 0.865383817784091, "learning_rate": 9.053390089898078e-06, "loss": 0.0151, "step": 1124 }, { "epoch": 0.22345813884199026, "grad_norm": 0.6021432751244368, "learning_rate": 9.051505414143283e-06, "loss": 0.0213, "step": 1125 }, { "epoch": 0.2236567682987387, "grad_norm": 0.7551185452295719, "learning_rate": 9.049619060664155e-06, "loss": 0.0234, "step": 1126 }, { "epoch": 0.22385539775548713, "grad_norm": 0.9095260202243808, "learning_rate": 9.047731030241827e-06, "loss": 0.0119, "step": 1127 }, { "epoch": 0.22405402721223558, "grad_norm": 0.7060999713583611, "learning_rate": 9.045841323658136e-06, "loss": 0.0139, "step": 1128 }, { "epoch": 0.224252656668984, "grad_norm": 0.611769741942482, "learning_rate": 9.043949941695602e-06, "loss": 0.0135, "step": 1129 }, { "epoch": 0.22445128612573245, "grad_norm": 0.6041679576728929, "learning_rate": 9.042056885137447e-06, "loss": 0.0208, "step": 1130 }, { "epoch": 0.22464991558248087, "grad_norm": 0.4868808950406833, "learning_rate": 9.040162154767585e-06, "loss": 0.0139, "step": 1131 }, { "epoch": 0.22484854503922933, "grad_norm": 0.37898547751832734, "learning_rate": 9.03826575137062e-06, "loss": 0.0134, "step": 1132 }, { "epoch": 0.22504717449597775, "grad_norm": 0.3871203326790516, "learning_rate": 9.036367675731852e-06, "loss": 0.014, "step": 1133 }, { "epoch": 0.2252458039527262, "grad_norm": 0.4158985639649831, "learning_rate": 9.03446792863727e-06, "loss": 0.0155, "step": 1134 }, { "epoch": 0.22544443340947462, "grad_norm": 0.8248065463728428, "learning_rate": 9.03256651087356e-06, "loss": 0.0191, "step": 1135 }, { "epoch": 0.22564306286622307, "grad_norm": 0.6997112013650323, "learning_rate": 9.030663423228096e-06, "loss": 0.0119, "step": 1136 }, { "epoch": 0.2258416923229715, "grad_norm": 0.40813675543898414, "learning_rate": 9.028758666488946e-06, "loss": 0.0106, "step": 1137 }, { "epoch": 0.22604032177971994, "grad_norm": 0.27264796677090325, "learning_rate": 9.026852241444865e-06, "loss": 0.0117, "step": 1138 }, { "epoch": 0.22623895123646837, "grad_norm": 0.7634799874549393, "learning_rate": 9.024944148885305e-06, "loss": 0.0137, "step": 1139 }, { "epoch": 0.22643758069321682, "grad_norm": 0.37311413583560454, "learning_rate": 9.023034389600403e-06, "loss": 0.014, "step": 1140 }, { "epoch": 0.22663621014996524, "grad_norm": 1.1650537636675808, "learning_rate": 9.021122964380988e-06, "loss": 0.0149, "step": 1141 }, { "epoch": 0.22683483960671366, "grad_norm": 0.6052730792928078, "learning_rate": 9.019209874018581e-06, "loss": 0.0153, "step": 1142 }, { "epoch": 0.2270334690634621, "grad_norm": 0.6627925620828036, "learning_rate": 9.01729511930539e-06, "loss": 0.0132, "step": 1143 }, { "epoch": 0.22723209852021053, "grad_norm": 1.831246147791384, "learning_rate": 9.015378701034315e-06, "loss": 0.0222, "step": 1144 }, { "epoch": 0.22743072797695899, "grad_norm": 0.6830687198229002, "learning_rate": 9.013460619998937e-06, "loss": 0.0095, "step": 1145 }, { "epoch": 0.2276293574337074, "grad_norm": 0.6897632451876908, "learning_rate": 9.011540876993539e-06, "loss": 0.0145, "step": 1146 }, { "epoch": 0.22782798689045586, "grad_norm": 0.45414406716937405, "learning_rate": 9.00961947281308e-06, "loss": 0.0135, "step": 1147 }, { "epoch": 0.22802661634720428, "grad_norm": 0.6724858979266666, "learning_rate": 9.007696408253212e-06, "loss": 0.0176, "step": 1148 }, { "epoch": 0.22822524580395273, "grad_norm": 0.5585341592638718, "learning_rate": 9.005771684110275e-06, "loss": 0.0142, "step": 1149 }, { "epoch": 0.22842387526070115, "grad_norm": 0.33631448656179974, "learning_rate": 9.003845301181296e-06, "loss": 0.0104, "step": 1150 }, { "epoch": 0.2286225047174496, "grad_norm": 0.936608731900455, "learning_rate": 9.001917260263986e-06, "loss": 0.0176, "step": 1151 }, { "epoch": 0.22882113417419803, "grad_norm": 0.450082775257085, "learning_rate": 8.999987562156747e-06, "loss": 0.0108, "step": 1152 }, { "epoch": 0.22901976363094648, "grad_norm": 0.6502188418132533, "learning_rate": 8.998056207658662e-06, "loss": 0.0197, "step": 1153 }, { "epoch": 0.2292183930876949, "grad_norm": 0.7819061233264434, "learning_rate": 8.996123197569508e-06, "loss": 0.0116, "step": 1154 }, { "epoch": 0.22941702254444335, "grad_norm": 0.8372844196296657, "learning_rate": 8.994188532689739e-06, "loss": 0.0124, "step": 1155 }, { "epoch": 0.22961565200119177, "grad_norm": 0.4402518430330382, "learning_rate": 8.992252213820498e-06, "loss": 0.0117, "step": 1156 }, { "epoch": 0.22981428145794022, "grad_norm": 0.3109619518398801, "learning_rate": 8.990314241763614e-06, "loss": 0.0105, "step": 1157 }, { "epoch": 0.23001291091468865, "grad_norm": 0.940067780887533, "learning_rate": 8.988374617321597e-06, "loss": 0.0186, "step": 1158 }, { "epoch": 0.2302115403714371, "grad_norm": 1.0503338550839172, "learning_rate": 8.986433341297646e-06, "loss": 0.0221, "step": 1159 }, { "epoch": 0.23041016982818552, "grad_norm": 0.6966779242628477, "learning_rate": 8.984490414495642e-06, "loss": 0.0104, "step": 1160 }, { "epoch": 0.23060879928493397, "grad_norm": 0.29127162723599453, "learning_rate": 8.982545837720148e-06, "loss": 0.008, "step": 1161 }, { "epoch": 0.2308074287416824, "grad_norm": 0.7864175276944835, "learning_rate": 8.980599611776408e-06, "loss": 0.0203, "step": 1162 }, { "epoch": 0.2310060581984308, "grad_norm": 0.6641316865126137, "learning_rate": 8.978651737470354e-06, "loss": 0.016, "step": 1163 }, { "epoch": 0.23120468765517926, "grad_norm": 0.9355779041765688, "learning_rate": 8.976702215608603e-06, "loss": 0.0196, "step": 1164 }, { "epoch": 0.2314033171119277, "grad_norm": 0.5906387589001846, "learning_rate": 8.974751046998445e-06, "loss": 0.0188, "step": 1165 }, { "epoch": 0.23160194656867614, "grad_norm": 1.0575477626994387, "learning_rate": 8.97279823244786e-06, "loss": 0.0118, "step": 1166 }, { "epoch": 0.23180057602542456, "grad_norm": 0.2202881228331673, "learning_rate": 8.970843772765505e-06, "loss": 0.0091, "step": 1167 }, { "epoch": 0.231999205482173, "grad_norm": 0.7075656613422315, "learning_rate": 8.968887668760719e-06, "loss": 0.0158, "step": 1168 }, { "epoch": 0.23219783493892143, "grad_norm": 0.2683433024299603, "learning_rate": 8.966929921243526e-06, "loss": 0.012, "step": 1169 }, { "epoch": 0.23239646439566988, "grad_norm": 0.42237177538366844, "learning_rate": 8.964970531024624e-06, "loss": 0.0131, "step": 1170 }, { "epoch": 0.2325950938524183, "grad_norm": 0.594094475622576, "learning_rate": 8.963009498915396e-06, "loss": 0.0119, "step": 1171 }, { "epoch": 0.23279372330916676, "grad_norm": 0.406385112438069, "learning_rate": 8.961046825727904e-06, "loss": 0.0113, "step": 1172 }, { "epoch": 0.23299235276591518, "grad_norm": 0.8033200609983158, "learning_rate": 8.959082512274885e-06, "loss": 0.0123, "step": 1173 }, { "epoch": 0.23319098222266363, "grad_norm": 0.47018932211309783, "learning_rate": 8.957116559369767e-06, "loss": 0.0144, "step": 1174 }, { "epoch": 0.23338961167941205, "grad_norm": 0.6077335509097652, "learning_rate": 8.955148967826642e-06, "loss": 0.0191, "step": 1175 }, { "epoch": 0.2335882411361605, "grad_norm": 0.8460343255845043, "learning_rate": 8.95317973846029e-06, "loss": 0.0231, "step": 1176 }, { "epoch": 0.23378687059290892, "grad_norm": 0.568144005560869, "learning_rate": 8.951208872086166e-06, "loss": 0.0148, "step": 1177 }, { "epoch": 0.23398550004965737, "grad_norm": 0.6107991897583083, "learning_rate": 8.949236369520406e-06, "loss": 0.0184, "step": 1178 }, { "epoch": 0.2341841295064058, "grad_norm": 0.42161119081345094, "learning_rate": 8.947262231579822e-06, "loss": 0.0119, "step": 1179 }, { "epoch": 0.23438275896315425, "grad_norm": 0.40846557188713406, "learning_rate": 8.945286459081899e-06, "loss": 0.0137, "step": 1180 }, { "epoch": 0.23458138841990267, "grad_norm": 0.763309241318152, "learning_rate": 8.943309052844806e-06, "loss": 0.0134, "step": 1181 }, { "epoch": 0.23478001787665112, "grad_norm": 0.5803819518194214, "learning_rate": 8.941330013687382e-06, "loss": 0.0179, "step": 1182 }, { "epoch": 0.23497864733339954, "grad_norm": 1.039753700867902, "learning_rate": 8.939349342429144e-06, "loss": 0.0152, "step": 1183 }, { "epoch": 0.23517727679014797, "grad_norm": 0.5593224702420636, "learning_rate": 8.937367039890291e-06, "loss": 0.0138, "step": 1184 }, { "epoch": 0.23537590624689642, "grad_norm": 0.6151456252694801, "learning_rate": 8.93538310689169e-06, "loss": 0.0202, "step": 1185 }, { "epoch": 0.23557453570364484, "grad_norm": 0.2979372338609462, "learning_rate": 8.933397544254884e-06, "loss": 0.0121, "step": 1186 }, { "epoch": 0.2357731651603933, "grad_norm": 1.520553652264108, "learning_rate": 8.931410352802095e-06, "loss": 0.0222, "step": 1187 }, { "epoch": 0.2359717946171417, "grad_norm": 0.504693883508692, "learning_rate": 8.929421533356215e-06, "loss": 0.0155, "step": 1188 }, { "epoch": 0.23617042407389016, "grad_norm": 0.7873301547010652, "learning_rate": 8.927431086740814e-06, "loss": 0.0253, "step": 1189 }, { "epoch": 0.23636905353063858, "grad_norm": 0.3650087454226798, "learning_rate": 8.925439013780131e-06, "loss": 0.0129, "step": 1190 }, { "epoch": 0.23656768298738703, "grad_norm": 0.8064537953076155, "learning_rate": 8.923445315299085e-06, "loss": 0.0157, "step": 1191 }, { "epoch": 0.23676631244413546, "grad_norm": 0.37122234928734354, "learning_rate": 8.921449992123264e-06, "loss": 0.0116, "step": 1192 }, { "epoch": 0.2369649419008839, "grad_norm": 1.2902752115216132, "learning_rate": 8.919453045078927e-06, "loss": 0.0227, "step": 1193 }, { "epoch": 0.23716357135763233, "grad_norm": 1.7179509273444584, "learning_rate": 8.917454474993008e-06, "loss": 0.0245, "step": 1194 }, { "epoch": 0.23736220081438078, "grad_norm": 0.38492446902743044, "learning_rate": 8.915454282693116e-06, "loss": 0.0145, "step": 1195 }, { "epoch": 0.2375608302711292, "grad_norm": 0.4447455285152336, "learning_rate": 8.913452469007526e-06, "loss": 0.0125, "step": 1196 }, { "epoch": 0.23775945972787765, "grad_norm": 0.5145154810241487, "learning_rate": 8.911449034765186e-06, "loss": 0.0117, "step": 1197 }, { "epoch": 0.23795808918462608, "grad_norm": 0.3081471801818194, "learning_rate": 8.90944398079572e-06, "loss": 0.0184, "step": 1198 }, { "epoch": 0.23815671864137453, "grad_norm": 0.4384280059834026, "learning_rate": 8.907437307929416e-06, "loss": 0.0098, "step": 1199 }, { "epoch": 0.23835534809812295, "grad_norm": 0.3173624393698063, "learning_rate": 8.905429016997236e-06, "loss": 0.0154, "step": 1200 }, { "epoch": 0.2385539775548714, "grad_norm": 0.4241866381288965, "learning_rate": 8.903419108830808e-06, "loss": 0.0121, "step": 1201 }, { "epoch": 0.23875260701161982, "grad_norm": 1.6418273480323602, "learning_rate": 8.901407584262441e-06, "loss": 0.0217, "step": 1202 }, { "epoch": 0.23895123646836827, "grad_norm": 0.6482978467087832, "learning_rate": 8.899394444125097e-06, "loss": 0.0119, "step": 1203 }, { "epoch": 0.2391498659251167, "grad_norm": 0.49425790948254467, "learning_rate": 8.897379689252418e-06, "loss": 0.0105, "step": 1204 }, { "epoch": 0.23934849538186512, "grad_norm": 0.7361793243822061, "learning_rate": 8.895363320478715e-06, "loss": 0.0167, "step": 1205 }, { "epoch": 0.23954712483861357, "grad_norm": 0.5247879216412412, "learning_rate": 8.893345338638961e-06, "loss": 0.0128, "step": 1206 }, { "epoch": 0.239745754295362, "grad_norm": 0.7004974169321737, "learning_rate": 8.891325744568802e-06, "loss": 0.0124, "step": 1207 }, { "epoch": 0.23994438375211044, "grad_norm": 0.5994164703367848, "learning_rate": 8.889304539104549e-06, "loss": 0.0082, "step": 1208 }, { "epoch": 0.24014301320885886, "grad_norm": 0.6187312745317394, "learning_rate": 8.887281723083179e-06, "loss": 0.0143, "step": 1209 }, { "epoch": 0.2403416426656073, "grad_norm": 0.3401870907046762, "learning_rate": 8.885257297342343e-06, "loss": 0.0092, "step": 1210 }, { "epoch": 0.24054027212235574, "grad_norm": 0.702714768582494, "learning_rate": 8.883231262720348e-06, "loss": 0.0149, "step": 1211 }, { "epoch": 0.2407389015791042, "grad_norm": 0.31545699277173606, "learning_rate": 8.881203620056178e-06, "loss": 0.0071, "step": 1212 }, { "epoch": 0.2409375310358526, "grad_norm": 0.9671126025588995, "learning_rate": 8.879174370189475e-06, "loss": 0.0139, "step": 1213 }, { "epoch": 0.24113616049260106, "grad_norm": 0.9187540597662088, "learning_rate": 8.87714351396055e-06, "loss": 0.0264, "step": 1214 }, { "epoch": 0.24133478994934948, "grad_norm": 0.9928211950549544, "learning_rate": 8.875111052210378e-06, "loss": 0.0151, "step": 1215 }, { "epoch": 0.24153341940609793, "grad_norm": 2.2372121346405853, "learning_rate": 8.873076985780602e-06, "loss": 0.0327, "step": 1216 }, { "epoch": 0.24173204886284635, "grad_norm": 0.6013185780237169, "learning_rate": 8.871041315513523e-06, "loss": 0.0203, "step": 1217 }, { "epoch": 0.2419306783195948, "grad_norm": 0.4762488349863747, "learning_rate": 8.869004042252111e-06, "loss": 0.0185, "step": 1218 }, { "epoch": 0.24212930777634323, "grad_norm": 0.5238008487757791, "learning_rate": 8.866965166840003e-06, "loss": 0.0121, "step": 1219 }, { "epoch": 0.24232793723309168, "grad_norm": 0.8737365406833077, "learning_rate": 8.864924690121489e-06, "loss": 0.0154, "step": 1220 }, { "epoch": 0.2425265666898401, "grad_norm": 0.8165887277343837, "learning_rate": 8.862882612941532e-06, "loss": 0.0136, "step": 1221 }, { "epoch": 0.24272519614658855, "grad_norm": 0.6144569559576514, "learning_rate": 8.860838936145754e-06, "loss": 0.0134, "step": 1222 }, { "epoch": 0.24292382560333697, "grad_norm": 0.7154975367837693, "learning_rate": 8.858793660580438e-06, "loss": 0.0164, "step": 1223 }, { "epoch": 0.24312245506008542, "grad_norm": 0.5286069304960067, "learning_rate": 8.856746787092532e-06, "loss": 0.0172, "step": 1224 }, { "epoch": 0.24332108451683385, "grad_norm": 0.6339057196599259, "learning_rate": 8.854698316529642e-06, "loss": 0.0202, "step": 1225 }, { "epoch": 0.24351971397358227, "grad_norm": 0.671220701801568, "learning_rate": 8.852648249740041e-06, "loss": 0.0151, "step": 1226 }, { "epoch": 0.24371834343033072, "grad_norm": 1.1322941892865441, "learning_rate": 8.850596587572658e-06, "loss": 0.0153, "step": 1227 }, { "epoch": 0.24391697288707914, "grad_norm": 0.5378549330329825, "learning_rate": 8.848543330877084e-06, "loss": 0.0186, "step": 1228 }, { "epoch": 0.2441156023438276, "grad_norm": 0.5673634305039723, "learning_rate": 8.84648848050357e-06, "loss": 0.0197, "step": 1229 }, { "epoch": 0.24431423180057601, "grad_norm": 0.3534387414527403, "learning_rate": 8.84443203730303e-06, "loss": 0.0165, "step": 1230 }, { "epoch": 0.24451286125732447, "grad_norm": 0.5672990685019907, "learning_rate": 8.842374002127033e-06, "loss": 0.0136, "step": 1231 }, { "epoch": 0.2447114907140729, "grad_norm": 0.8327923542087138, "learning_rate": 8.840314375827808e-06, "loss": 0.0145, "step": 1232 }, { "epoch": 0.24491012017082134, "grad_norm": 0.3379005386095217, "learning_rate": 8.838253159258245e-06, "loss": 0.0112, "step": 1233 }, { "epoch": 0.24510874962756976, "grad_norm": 0.8689521331181729, "learning_rate": 8.836190353271894e-06, "loss": 0.024, "step": 1234 }, { "epoch": 0.2453073790843182, "grad_norm": 0.42828866756204587, "learning_rate": 8.834125958722958e-06, "loss": 0.0115, "step": 1235 }, { "epoch": 0.24550600854106663, "grad_norm": 0.5156919400962039, "learning_rate": 8.832059976466305e-06, "loss": 0.0175, "step": 1236 }, { "epoch": 0.24570463799781508, "grad_norm": 0.6054495643078346, "learning_rate": 8.82999240735745e-06, "loss": 0.0098, "step": 1237 }, { "epoch": 0.2459032674545635, "grad_norm": 0.7588058992250312, "learning_rate": 8.827923252252577e-06, "loss": 0.0143, "step": 1238 }, { "epoch": 0.24610189691131196, "grad_norm": 0.38121018989369687, "learning_rate": 8.825852512008518e-06, "loss": 0.0122, "step": 1239 }, { "epoch": 0.24630052636806038, "grad_norm": 0.6129760460462393, "learning_rate": 8.823780187482764e-06, "loss": 0.0113, "step": 1240 }, { "epoch": 0.24649915582480883, "grad_norm": 1.2945184977220947, "learning_rate": 8.821706279533465e-06, "loss": 0.0135, "step": 1241 }, { "epoch": 0.24669778528155725, "grad_norm": 0.5977689562617747, "learning_rate": 8.819630789019422e-06, "loss": 0.0093, "step": 1242 }, { "epoch": 0.2468964147383057, "grad_norm": 0.2571681909339781, "learning_rate": 8.817553716800095e-06, "loss": 0.0046, "step": 1243 }, { "epoch": 0.24709504419505413, "grad_norm": 0.8248908002719997, "learning_rate": 8.815475063735596e-06, "loss": 0.0113, "step": 1244 }, { "epoch": 0.24729367365180258, "grad_norm": 0.584105982723087, "learning_rate": 8.813394830686695e-06, "loss": 0.012, "step": 1245 }, { "epoch": 0.247492303108551, "grad_norm": 0.6285446237309048, "learning_rate": 8.811313018514812e-06, "loss": 0.0189, "step": 1246 }, { "epoch": 0.24769093256529942, "grad_norm": 0.7020367426867679, "learning_rate": 8.809229628082025e-06, "loss": 0.0223, "step": 1247 }, { "epoch": 0.24788956202204787, "grad_norm": 0.7721912844272995, "learning_rate": 8.807144660251065e-06, "loss": 0.0201, "step": 1248 }, { "epoch": 0.2480881914787963, "grad_norm": 0.6697889970539281, "learning_rate": 8.805058115885313e-06, "loss": 0.0188, "step": 1249 }, { "epoch": 0.24828682093554474, "grad_norm": 0.5381683696890166, "learning_rate": 8.802969995848807e-06, "loss": 0.0061, "step": 1250 }, { "epoch": 0.24848545039229317, "grad_norm": 0.20343883973471363, "learning_rate": 8.800880301006232e-06, "loss": 0.0043, "step": 1251 }, { "epoch": 0.24868407984904162, "grad_norm": 1.0199510305812602, "learning_rate": 8.798789032222932e-06, "loss": 0.0165, "step": 1252 }, { "epoch": 0.24888270930579004, "grad_norm": 0.6275045781700873, "learning_rate": 8.796696190364897e-06, "loss": 0.0221, "step": 1253 }, { "epoch": 0.2490813387625385, "grad_norm": 1.0129544827919168, "learning_rate": 8.794601776298772e-06, "loss": 0.0155, "step": 1254 }, { "epoch": 0.2492799682192869, "grad_norm": 1.1428308322663727, "learning_rate": 8.792505790891852e-06, "loss": 0.0135, "step": 1255 }, { "epoch": 0.24947859767603536, "grad_norm": 0.39150197901895456, "learning_rate": 8.790408235012081e-06, "loss": 0.0156, "step": 1256 }, { "epoch": 0.24967722713278379, "grad_norm": 0.5483610810377085, "learning_rate": 8.788309109528057e-06, "loss": 0.0113, "step": 1257 }, { "epoch": 0.24987585658953224, "grad_norm": 0.27854942135066457, "learning_rate": 8.786208415309023e-06, "loss": 0.0132, "step": 1258 }, { "epoch": 0.2500744860462807, "grad_norm": 0.36486249941255533, "learning_rate": 8.784106153224876e-06, "loss": 0.0111, "step": 1259 }, { "epoch": 0.2502731155030291, "grad_norm": 0.5102349401683092, "learning_rate": 8.782002324146162e-06, "loss": 0.0131, "step": 1260 }, { "epoch": 0.25047174495977753, "grad_norm": 0.590643281645968, "learning_rate": 8.779896928944072e-06, "loss": 0.0192, "step": 1261 }, { "epoch": 0.25067037441652595, "grad_norm": 0.6155294741908522, "learning_rate": 8.777789968490449e-06, "loss": 0.0108, "step": 1262 }, { "epoch": 0.25086900387327443, "grad_norm": 0.8591597492277506, "learning_rate": 8.775681443657781e-06, "loss": 0.0134, "step": 1263 }, { "epoch": 0.25106763333002285, "grad_norm": 0.36267619775257987, "learning_rate": 8.773571355319213e-06, "loss": 0.0091, "step": 1264 }, { "epoch": 0.2512662627867713, "grad_norm": 0.545487632579775, "learning_rate": 8.771459704348521e-06, "loss": 0.0152, "step": 1265 }, { "epoch": 0.2514648922435197, "grad_norm": 0.4679522780645009, "learning_rate": 8.769346491620145e-06, "loss": 0.0125, "step": 1266 }, { "epoch": 0.2516635217002681, "grad_norm": 0.5309851493131132, "learning_rate": 8.767231718009161e-06, "loss": 0.0105, "step": 1267 }, { "epoch": 0.2518621511570166, "grad_norm": 1.162362377224985, "learning_rate": 8.765115384391296e-06, "loss": 0.015, "step": 1268 }, { "epoch": 0.252060780613765, "grad_norm": 0.45539747042204515, "learning_rate": 8.76299749164292e-06, "loss": 0.0141, "step": 1269 }, { "epoch": 0.25225941007051345, "grad_norm": 0.5999868464347882, "learning_rate": 8.76087804064105e-06, "loss": 0.0171, "step": 1270 }, { "epoch": 0.25245803952726187, "grad_norm": 0.5339189795562727, "learning_rate": 8.75875703226335e-06, "loss": 0.0162, "step": 1271 }, { "epoch": 0.25265666898401035, "grad_norm": 0.859130519663411, "learning_rate": 8.756634467388128e-06, "loss": 0.0165, "step": 1272 }, { "epoch": 0.25285529844075877, "grad_norm": 0.21076209185559133, "learning_rate": 8.754510346894334e-06, "loss": 0.0044, "step": 1273 }, { "epoch": 0.2530539278975072, "grad_norm": 0.7939050635268597, "learning_rate": 8.752384671661566e-06, "loss": 0.0182, "step": 1274 }, { "epoch": 0.2532525573542556, "grad_norm": 0.5766317392642453, "learning_rate": 8.750257442570064e-06, "loss": 0.0095, "step": 1275 }, { "epoch": 0.2534511868110041, "grad_norm": 0.7303525194562475, "learning_rate": 8.74812866050071e-06, "loss": 0.0117, "step": 1276 }, { "epoch": 0.2536498162677525, "grad_norm": 0.40272585071340505, "learning_rate": 8.74599832633503e-06, "loss": 0.0107, "step": 1277 }, { "epoch": 0.25384844572450094, "grad_norm": 0.9286068222372685, "learning_rate": 8.743866440955196e-06, "loss": 0.0207, "step": 1278 }, { "epoch": 0.25404707518124936, "grad_norm": 0.6122165638925774, "learning_rate": 8.741733005244016e-06, "loss": 0.0162, "step": 1279 }, { "epoch": 0.25424570463799784, "grad_norm": 0.590624454464984, "learning_rate": 8.739598020084947e-06, "loss": 0.0087, "step": 1280 }, { "epoch": 0.25444433409474626, "grad_norm": 1.016687681475864, "learning_rate": 8.737461486362082e-06, "loss": 0.0161, "step": 1281 }, { "epoch": 0.2546429635514947, "grad_norm": 0.8291330430883908, "learning_rate": 8.735323404960159e-06, "loss": 0.0209, "step": 1282 }, { "epoch": 0.2548415930082431, "grad_norm": 1.2088406629766484, "learning_rate": 8.733183776764556e-06, "loss": 0.016, "step": 1283 }, { "epoch": 0.2550402224649916, "grad_norm": 0.903068606712378, "learning_rate": 8.731042602661289e-06, "loss": 0.0201, "step": 1284 }, { "epoch": 0.25523885192174, "grad_norm": 0.6420259885722851, "learning_rate": 8.728899883537014e-06, "loss": 0.0146, "step": 1285 }, { "epoch": 0.25543748137848843, "grad_norm": 0.4337787015933656, "learning_rate": 8.726755620279033e-06, "loss": 0.0121, "step": 1286 }, { "epoch": 0.25563611083523685, "grad_norm": 0.9125000801904051, "learning_rate": 8.724609813775282e-06, "loss": 0.0174, "step": 1287 }, { "epoch": 0.2558347402919853, "grad_norm": 0.4601365566071554, "learning_rate": 8.722462464914337e-06, "loss": 0.0091, "step": 1288 }, { "epoch": 0.25603336974873375, "grad_norm": 0.6681352622131371, "learning_rate": 8.720313574585412e-06, "loss": 0.0168, "step": 1289 }, { "epoch": 0.2562319992054822, "grad_norm": 0.3959505420693962, "learning_rate": 8.718163143678365e-06, "loss": 0.0128, "step": 1290 }, { "epoch": 0.2564306286622306, "grad_norm": 0.23256713254644923, "learning_rate": 8.716011173083679e-06, "loss": 0.0117, "step": 1291 }, { "epoch": 0.256629258118979, "grad_norm": 0.2541092240532925, "learning_rate": 8.713857663692492e-06, "loss": 0.0086, "step": 1292 }, { "epoch": 0.2568278875757275, "grad_norm": 0.6179349251330739, "learning_rate": 8.711702616396562e-06, "loss": 0.014, "step": 1293 }, { "epoch": 0.2570265170324759, "grad_norm": 0.3453897821620428, "learning_rate": 8.709546032088296e-06, "loss": 0.0128, "step": 1294 }, { "epoch": 0.25722514648922434, "grad_norm": 0.5847778245454376, "learning_rate": 8.707387911660735e-06, "loss": 0.0119, "step": 1295 }, { "epoch": 0.25742377594597277, "grad_norm": 0.2704171725983602, "learning_rate": 8.705228256007549e-06, "loss": 0.0115, "step": 1296 }, { "epoch": 0.25762240540272124, "grad_norm": 0.6897068421679288, "learning_rate": 8.703067066023055e-06, "loss": 0.0112, "step": 1297 }, { "epoch": 0.25782103485946967, "grad_norm": 0.40321138772863513, "learning_rate": 8.700904342602197e-06, "loss": 0.0056, "step": 1298 }, { "epoch": 0.2580196643162181, "grad_norm": 0.7069935344664505, "learning_rate": 8.698740086640559e-06, "loss": 0.0194, "step": 1299 }, { "epoch": 0.2582182937729665, "grad_norm": 0.9773159570445462, "learning_rate": 8.696574299034351e-06, "loss": 0.0236, "step": 1300 }, { "epoch": 0.258416923229715, "grad_norm": 1.029306053307389, "learning_rate": 8.69440698068043e-06, "loss": 0.0202, "step": 1301 }, { "epoch": 0.2586155526864634, "grad_norm": 0.59016206926503, "learning_rate": 8.692238132476278e-06, "loss": 0.0099, "step": 1302 }, { "epoch": 0.25881418214321184, "grad_norm": 0.516731967140984, "learning_rate": 8.690067755320012e-06, "loss": 0.0094, "step": 1303 }, { "epoch": 0.25901281159996026, "grad_norm": 0.3353964879934747, "learning_rate": 8.687895850110386e-06, "loss": 0.0167, "step": 1304 }, { "epoch": 0.25921144105670874, "grad_norm": 0.9274750807980529, "learning_rate": 8.68572241774678e-06, "loss": 0.0192, "step": 1305 }, { "epoch": 0.25941007051345716, "grad_norm": 0.6824972255867494, "learning_rate": 8.683547459129211e-06, "loss": 0.0168, "step": 1306 }, { "epoch": 0.2596086999702056, "grad_norm": 0.6136495451069736, "learning_rate": 8.681370975158328e-06, "loss": 0.0108, "step": 1307 }, { "epoch": 0.259807329426954, "grad_norm": 0.37108024187396627, "learning_rate": 8.679192966735413e-06, "loss": 0.0109, "step": 1308 }, { "epoch": 0.2600059588837024, "grad_norm": 0.725413502338458, "learning_rate": 8.677013434762373e-06, "loss": 0.0255, "step": 1309 }, { "epoch": 0.2602045883404509, "grad_norm": 0.48143907389589863, "learning_rate": 8.674832380141754e-06, "loss": 0.0171, "step": 1310 }, { "epoch": 0.2604032177971993, "grad_norm": 0.34879379609000266, "learning_rate": 8.672649803776724e-06, "loss": 0.0106, "step": 1311 }, { "epoch": 0.26060184725394775, "grad_norm": 0.5825209707500266, "learning_rate": 8.670465706571088e-06, "loss": 0.0177, "step": 1312 }, { "epoch": 0.26080047671069617, "grad_norm": 0.46030714480671636, "learning_rate": 8.66828008942928e-06, "loss": 0.0098, "step": 1313 }, { "epoch": 0.26099910616744465, "grad_norm": 0.7703888628916056, "learning_rate": 8.66609295325636e-06, "loss": 0.0171, "step": 1314 }, { "epoch": 0.2611977356241931, "grad_norm": 0.738989269533668, "learning_rate": 8.663904298958018e-06, "loss": 0.0157, "step": 1315 }, { "epoch": 0.2613963650809415, "grad_norm": 0.6362768126248891, "learning_rate": 8.661714127440578e-06, "loss": 0.0106, "step": 1316 }, { "epoch": 0.2615949945376899, "grad_norm": 0.5455888714935196, "learning_rate": 8.659522439610983e-06, "loss": 0.0142, "step": 1317 }, { "epoch": 0.2617936239944384, "grad_norm": 0.23611279379951763, "learning_rate": 8.657329236376811e-06, "loss": 0.0067, "step": 1318 }, { "epoch": 0.2619922534511868, "grad_norm": 0.31209832142135896, "learning_rate": 8.655134518646264e-06, "loss": 0.0133, "step": 1319 }, { "epoch": 0.26219088290793524, "grad_norm": 0.2910883987846584, "learning_rate": 8.652938287328174e-06, "loss": 0.0084, "step": 1320 }, { "epoch": 0.26238951236468366, "grad_norm": 0.46180981742302674, "learning_rate": 8.650740543331997e-06, "loss": 0.0094, "step": 1321 }, { "epoch": 0.26258814182143214, "grad_norm": 1.0186730371322448, "learning_rate": 8.648541287567817e-06, "loss": 0.0176, "step": 1322 }, { "epoch": 0.26278677127818056, "grad_norm": 0.7996672389811389, "learning_rate": 8.646340520946343e-06, "loss": 0.0241, "step": 1323 }, { "epoch": 0.262985400734929, "grad_norm": 0.40791342137010506, "learning_rate": 8.644138244378912e-06, "loss": 0.0109, "step": 1324 }, { "epoch": 0.2631840301916774, "grad_norm": 0.5936153143977514, "learning_rate": 8.641934458777482e-06, "loss": 0.0118, "step": 1325 }, { "epoch": 0.2633826596484259, "grad_norm": 0.4056311995150495, "learning_rate": 8.63972916505464e-06, "loss": 0.0081, "step": 1326 }, { "epoch": 0.2635812891051743, "grad_norm": 0.7376187474937147, "learning_rate": 8.637522364123596e-06, "loss": 0.0114, "step": 1327 }, { "epoch": 0.26377991856192273, "grad_norm": 0.3672522299105733, "learning_rate": 8.635314056898185e-06, "loss": 0.0089, "step": 1328 }, { "epoch": 0.26397854801867116, "grad_norm": 0.7185888449952436, "learning_rate": 8.633104244292862e-06, "loss": 0.0204, "step": 1329 }, { "epoch": 0.2641771774754196, "grad_norm": 0.6555374705786724, "learning_rate": 8.630892927222709e-06, "loss": 0.0153, "step": 1330 }, { "epoch": 0.26437580693216806, "grad_norm": 0.4167555548956689, "learning_rate": 8.628680106603433e-06, "loss": 0.0149, "step": 1331 }, { "epoch": 0.2645744363889165, "grad_norm": 0.6242106937112943, "learning_rate": 8.626465783351357e-06, "loss": 0.0184, "step": 1332 }, { "epoch": 0.2647730658456649, "grad_norm": 0.42946885487707775, "learning_rate": 8.624249958383433e-06, "loss": 0.0126, "step": 1333 }, { "epoch": 0.2649716953024133, "grad_norm": 0.9955126087048953, "learning_rate": 8.62203263261723e-06, "loss": 0.0123, "step": 1334 }, { "epoch": 0.2651703247591618, "grad_norm": 0.3025009366483527, "learning_rate": 8.61981380697094e-06, "loss": 0.0106, "step": 1335 }, { "epoch": 0.2653689542159102, "grad_norm": 0.3621342251640229, "learning_rate": 8.617593482363379e-06, "loss": 0.0089, "step": 1336 }, { "epoch": 0.26556758367265865, "grad_norm": 0.9828115025424791, "learning_rate": 8.615371659713979e-06, "loss": 0.0202, "step": 1337 }, { "epoch": 0.26576621312940707, "grad_norm": 1.4452351020519076, "learning_rate": 8.613148339942796e-06, "loss": 0.0197, "step": 1338 }, { "epoch": 0.26596484258615555, "grad_norm": 0.6375350442126254, "learning_rate": 8.610923523970502e-06, "loss": 0.0165, "step": 1339 }, { "epoch": 0.26616347204290397, "grad_norm": 0.4494514278658364, "learning_rate": 8.608697212718396e-06, "loss": 0.0095, "step": 1340 }, { "epoch": 0.2663621014996524, "grad_norm": 0.8983564965166115, "learning_rate": 8.606469407108385e-06, "loss": 0.0165, "step": 1341 }, { "epoch": 0.2665607309564008, "grad_norm": 0.6516352105484908, "learning_rate": 8.604240108063004e-06, "loss": 0.0148, "step": 1342 }, { "epoch": 0.2667593604131493, "grad_norm": 0.5267184317342489, "learning_rate": 8.602009316505407e-06, "loss": 0.0124, "step": 1343 }, { "epoch": 0.2669579898698977, "grad_norm": 0.47381761284604557, "learning_rate": 8.599777033359355e-06, "loss": 0.0101, "step": 1344 }, { "epoch": 0.26715661932664614, "grad_norm": 1.6669999572861662, "learning_rate": 8.597543259549241e-06, "loss": 0.0245, "step": 1345 }, { "epoch": 0.26735524878339456, "grad_norm": 0.4832507507850066, "learning_rate": 8.595307996000066e-06, "loss": 0.0134, "step": 1346 }, { "epoch": 0.26755387824014304, "grad_norm": 0.5156754052203035, "learning_rate": 8.59307124363745e-06, "loss": 0.0258, "step": 1347 }, { "epoch": 0.26775250769689146, "grad_norm": 0.8536512969498504, "learning_rate": 8.590833003387628e-06, "loss": 0.0258, "step": 1348 }, { "epoch": 0.2679511371536399, "grad_norm": 0.9428969265848054, "learning_rate": 8.588593276177458e-06, "loss": 0.019, "step": 1349 }, { "epoch": 0.2681497666103883, "grad_norm": 0.7301869629392804, "learning_rate": 8.586352062934404e-06, "loss": 0.0145, "step": 1350 }, { "epoch": 0.26834839606713673, "grad_norm": 0.4718543613104909, "learning_rate": 8.584109364586554e-06, "loss": 0.0091, "step": 1351 }, { "epoch": 0.2685470255238852, "grad_norm": 0.271372733060467, "learning_rate": 8.581865182062606e-06, "loss": 0.0094, "step": 1352 }, { "epoch": 0.26874565498063363, "grad_norm": 0.42823082027908455, "learning_rate": 8.57961951629187e-06, "loss": 0.0117, "step": 1353 }, { "epoch": 0.26894428443738205, "grad_norm": 0.572008987695907, "learning_rate": 8.57737236820428e-06, "loss": 0.019, "step": 1354 }, { "epoch": 0.2691429138941305, "grad_norm": 1.2931428416128332, "learning_rate": 8.575123738730373e-06, "loss": 0.0184, "step": 1355 }, { "epoch": 0.26934154335087895, "grad_norm": 0.4474034296024569, "learning_rate": 8.572873628801305e-06, "loss": 0.0101, "step": 1356 }, { "epoch": 0.2695401728076274, "grad_norm": 0.8648867742502259, "learning_rate": 8.570622039348849e-06, "loss": 0.0132, "step": 1357 }, { "epoch": 0.2697388022643758, "grad_norm": 0.771431838314264, "learning_rate": 8.56836897130538e-06, "loss": 0.0176, "step": 1358 }, { "epoch": 0.2699374317211242, "grad_norm": 0.44094886621911206, "learning_rate": 8.566114425603892e-06, "loss": 0.0126, "step": 1359 }, { "epoch": 0.2701360611778727, "grad_norm": 0.6577711334649768, "learning_rate": 8.563858403177994e-06, "loss": 0.0185, "step": 1360 }, { "epoch": 0.2703346906346211, "grad_norm": 0.39262415058239325, "learning_rate": 8.5616009049619e-06, "loss": 0.0155, "step": 1361 }, { "epoch": 0.27053332009136954, "grad_norm": 1.140595423731314, "learning_rate": 8.559341931890436e-06, "loss": 0.0251, "step": 1362 }, { "epoch": 0.27073194954811797, "grad_norm": 0.4989543570417254, "learning_rate": 8.557081484899043e-06, "loss": 0.016, "step": 1363 }, { "epoch": 0.27093057900486645, "grad_norm": 0.9607995698578123, "learning_rate": 8.55481956492377e-06, "loss": 0.0186, "step": 1364 }, { "epoch": 0.27112920846161487, "grad_norm": 0.5416529886893404, "learning_rate": 8.552556172901276e-06, "loss": 0.0172, "step": 1365 }, { "epoch": 0.2713278379183633, "grad_norm": 0.6712403945203463, "learning_rate": 8.550291309768826e-06, "loss": 0.0112, "step": 1366 }, { "epoch": 0.2715264673751117, "grad_norm": 0.34387714192881513, "learning_rate": 8.548024976464302e-06, "loss": 0.0094, "step": 1367 }, { "epoch": 0.2717250968318602, "grad_norm": 0.4152557312415485, "learning_rate": 8.545757173926187e-06, "loss": 0.0148, "step": 1368 }, { "epoch": 0.2719237262886086, "grad_norm": 0.5580718606691937, "learning_rate": 8.543487903093577e-06, "loss": 0.0112, "step": 1369 }, { "epoch": 0.27212235574535704, "grad_norm": 0.6980069434101902, "learning_rate": 8.541217164906177e-06, "loss": 0.0197, "step": 1370 }, { "epoch": 0.27232098520210546, "grad_norm": 0.6713064539961933, "learning_rate": 8.538944960304292e-06, "loss": 0.0237, "step": 1371 }, { "epoch": 0.2725196146588539, "grad_norm": 0.2791585912778628, "learning_rate": 8.536671290228846e-06, "loss": 0.0096, "step": 1372 }, { "epoch": 0.27271824411560236, "grad_norm": 1.7948572479774836, "learning_rate": 8.534396155621358e-06, "loss": 0.0206, "step": 1373 }, { "epoch": 0.2729168735723508, "grad_norm": 0.5146160931306909, "learning_rate": 8.532119557423964e-06, "loss": 0.0078, "step": 1374 }, { "epoch": 0.2731155030290992, "grad_norm": 0.6336391927705046, "learning_rate": 8.529841496579396e-06, "loss": 0.0154, "step": 1375 }, { "epoch": 0.2733141324858476, "grad_norm": 1.0228381650063942, "learning_rate": 8.527561974031e-06, "loss": 0.0125, "step": 1376 }, { "epoch": 0.2735127619425961, "grad_norm": 0.5621590918716233, "learning_rate": 8.525280990722723e-06, "loss": 0.0152, "step": 1377 }, { "epoch": 0.27371139139934453, "grad_norm": 0.5992082774576349, "learning_rate": 8.52299854759912e-06, "loss": 0.015, "step": 1378 }, { "epoch": 0.27391002085609295, "grad_norm": 0.5427316635640782, "learning_rate": 8.520714645605344e-06, "loss": 0.0217, "step": 1379 }, { "epoch": 0.2741086503128414, "grad_norm": 0.7839896301325817, "learning_rate": 8.51842928568716e-06, "loss": 0.0117, "step": 1380 }, { "epoch": 0.27430727976958985, "grad_norm": 0.4131385983667191, "learning_rate": 8.516142468790931e-06, "loss": 0.0159, "step": 1381 }, { "epoch": 0.2745059092263383, "grad_norm": 0.4081999784122884, "learning_rate": 8.513854195863629e-06, "loss": 0.0099, "step": 1382 }, { "epoch": 0.2747045386830867, "grad_norm": 0.4673684154571544, "learning_rate": 8.511564467852822e-06, "loss": 0.0077, "step": 1383 }, { "epoch": 0.2749031681398351, "grad_norm": 0.5713154043757909, "learning_rate": 8.509273285706686e-06, "loss": 0.0195, "step": 1384 }, { "epoch": 0.2751017975965836, "grad_norm": 0.46380108544256105, "learning_rate": 8.506980650373995e-06, "loss": 0.0142, "step": 1385 }, { "epoch": 0.275300427053332, "grad_norm": 1.0792577996847905, "learning_rate": 8.50468656280413e-06, "loss": 0.0222, "step": 1386 }, { "epoch": 0.27549905651008044, "grad_norm": 0.42746699373310537, "learning_rate": 8.50239102394707e-06, "loss": 0.0146, "step": 1387 }, { "epoch": 0.27569768596682886, "grad_norm": 1.7457011754660738, "learning_rate": 8.500094034753393e-06, "loss": 0.0197, "step": 1388 }, { "epoch": 0.27589631542357734, "grad_norm": 0.6820989464273655, "learning_rate": 8.49779559617428e-06, "loss": 0.0134, "step": 1389 }, { "epoch": 0.27609494488032577, "grad_norm": 0.5308996170906365, "learning_rate": 8.495495709161516e-06, "loss": 0.0196, "step": 1390 }, { "epoch": 0.2762935743370742, "grad_norm": 0.6992766539498697, "learning_rate": 8.49319437466748e-06, "loss": 0.0097, "step": 1391 }, { "epoch": 0.2764922037938226, "grad_norm": 0.2293267653263177, "learning_rate": 8.49089159364515e-06, "loss": 0.006, "step": 1392 }, { "epoch": 0.27669083325057103, "grad_norm": 0.32951425806657547, "learning_rate": 8.488587367048105e-06, "loss": 0.0073, "step": 1393 }, { "epoch": 0.2768894627073195, "grad_norm": 1.6153168594424325, "learning_rate": 8.486281695830527e-06, "loss": 0.0223, "step": 1394 }, { "epoch": 0.27708809216406793, "grad_norm": 1.864778170921737, "learning_rate": 8.483974580947189e-06, "loss": 0.0329, "step": 1395 }, { "epoch": 0.27728672162081636, "grad_norm": 1.7739863390597264, "learning_rate": 8.481666023353468e-06, "loss": 0.0263, "step": 1396 }, { "epoch": 0.2774853510775648, "grad_norm": 0.5128932906013113, "learning_rate": 8.479356024005332e-06, "loss": 0.0113, "step": 1397 }, { "epoch": 0.27768398053431326, "grad_norm": 0.820357228467481, "learning_rate": 8.47704458385935e-06, "loss": 0.014, "step": 1398 }, { "epoch": 0.2778826099910617, "grad_norm": 0.6925209483529865, "learning_rate": 8.47473170387269e-06, "loss": 0.0148, "step": 1399 }, { "epoch": 0.2780812394478101, "grad_norm": 0.5888881159884609, "learning_rate": 8.472417385003109e-06, "loss": 0.019, "step": 1400 }, { "epoch": 0.2782798689045585, "grad_norm": 0.8199869161478691, "learning_rate": 8.470101628208966e-06, "loss": 0.0135, "step": 1401 }, { "epoch": 0.278478498361307, "grad_norm": 0.7210167222946081, "learning_rate": 8.467784434449216e-06, "loss": 0.0172, "step": 1402 }, { "epoch": 0.2786771278180554, "grad_norm": 0.4196353511380168, "learning_rate": 8.465465804683404e-06, "loss": 0.0155, "step": 1403 }, { "epoch": 0.27887575727480385, "grad_norm": 0.6034473650484623, "learning_rate": 8.463145739871672e-06, "loss": 0.0096, "step": 1404 }, { "epoch": 0.27907438673155227, "grad_norm": 0.36016231959989076, "learning_rate": 8.460824240974757e-06, "loss": 0.0103, "step": 1405 }, { "epoch": 0.27927301618830075, "grad_norm": 0.5475792609192718, "learning_rate": 8.458501308953988e-06, "loss": 0.0154, "step": 1406 }, { "epoch": 0.27947164564504917, "grad_norm": 0.5922387773813316, "learning_rate": 8.456176944771293e-06, "loss": 0.0152, "step": 1407 }, { "epoch": 0.2796702751017976, "grad_norm": 0.9948119748458698, "learning_rate": 8.453851149389185e-06, "loss": 0.0245, "step": 1408 }, { "epoch": 0.279868904558546, "grad_norm": 0.8243450247239436, "learning_rate": 8.451523923770776e-06, "loss": 0.013, "step": 1409 }, { "epoch": 0.2800675340152945, "grad_norm": 0.7318369517598357, "learning_rate": 8.449195268879767e-06, "loss": 0.0133, "step": 1410 }, { "epoch": 0.2802661634720429, "grad_norm": 0.5419561316890904, "learning_rate": 8.446865185680448e-06, "loss": 0.0093, "step": 1411 }, { "epoch": 0.28046479292879134, "grad_norm": 0.5265876588995337, "learning_rate": 8.44453367513771e-06, "loss": 0.0171, "step": 1412 }, { "epoch": 0.28066342238553976, "grad_norm": 0.6859813997920388, "learning_rate": 8.442200738217025e-06, "loss": 0.0095, "step": 1413 }, { "epoch": 0.2808620518422882, "grad_norm": 0.5649693302253302, "learning_rate": 8.439866375884464e-06, "loss": 0.0212, "step": 1414 }, { "epoch": 0.28106068129903666, "grad_norm": 0.5862281313195347, "learning_rate": 8.437530589106679e-06, "loss": 0.0221, "step": 1415 }, { "epoch": 0.2812593107557851, "grad_norm": 0.33080120075065145, "learning_rate": 8.435193378850921e-06, "loss": 0.0117, "step": 1416 }, { "epoch": 0.2814579402125335, "grad_norm": 0.9234190022508963, "learning_rate": 8.432854746085024e-06, "loss": 0.0173, "step": 1417 }, { "epoch": 0.28165656966928193, "grad_norm": 0.8430616129766179, "learning_rate": 8.430514691777415e-06, "loss": 0.0145, "step": 1418 }, { "epoch": 0.2818551991260304, "grad_norm": 0.8475901106471433, "learning_rate": 8.428173216897107e-06, "loss": 0.0159, "step": 1419 }, { "epoch": 0.28205382858277883, "grad_norm": 0.5735491390921624, "learning_rate": 8.425830322413703e-06, "loss": 0.0097, "step": 1420 }, { "epoch": 0.28225245803952725, "grad_norm": 0.5266507760548224, "learning_rate": 8.423486009297394e-06, "loss": 0.0109, "step": 1421 }, { "epoch": 0.2824510874962757, "grad_norm": 0.36869636591820587, "learning_rate": 8.421140278518955e-06, "loss": 0.0085, "step": 1422 }, { "epoch": 0.28264971695302415, "grad_norm": 1.2202844836952085, "learning_rate": 8.418793131049757e-06, "loss": 0.0266, "step": 1423 }, { "epoch": 0.2828483464097726, "grad_norm": 0.6504661966699112, "learning_rate": 8.416444567861742e-06, "loss": 0.0106, "step": 1424 }, { "epoch": 0.283046975866521, "grad_norm": 1.5018651106882912, "learning_rate": 8.414094589927455e-06, "loss": 0.018, "step": 1425 }, { "epoch": 0.2832456053232694, "grad_norm": 0.48052848756832556, "learning_rate": 8.411743198220016e-06, "loss": 0.0106, "step": 1426 }, { "epoch": 0.2834442347800179, "grad_norm": 0.7465489964730609, "learning_rate": 8.409390393713139e-06, "loss": 0.0118, "step": 1427 }, { "epoch": 0.2836428642367663, "grad_norm": 0.49935340474238005, "learning_rate": 8.407036177381111e-06, "loss": 0.0068, "step": 1428 }, { "epoch": 0.28384149369351475, "grad_norm": 0.6438837817206333, "learning_rate": 8.404680550198814e-06, "loss": 0.0112, "step": 1429 }, { "epoch": 0.28404012315026317, "grad_norm": 0.46587307611736145, "learning_rate": 8.40232351314171e-06, "loss": 0.0083, "step": 1430 }, { "epoch": 0.28423875260701165, "grad_norm": 1.2474325824762231, "learning_rate": 8.399965067185849e-06, "loss": 0.0155, "step": 1431 }, { "epoch": 0.28443738206376007, "grad_norm": 1.0245682468407986, "learning_rate": 8.397605213307858e-06, "loss": 0.0185, "step": 1432 }, { "epoch": 0.2846360115205085, "grad_norm": 0.7066781397822776, "learning_rate": 8.395243952484949e-06, "loss": 0.0107, "step": 1433 }, { "epoch": 0.2848346409772569, "grad_norm": 0.7550032030601233, "learning_rate": 8.392881285694918e-06, "loss": 0.015, "step": 1434 }, { "epoch": 0.28503327043400534, "grad_norm": 0.6874113816132006, "learning_rate": 8.390517213916147e-06, "loss": 0.0142, "step": 1435 }, { "epoch": 0.2852318998907538, "grad_norm": 1.2976316129854208, "learning_rate": 8.388151738127592e-06, "loss": 0.0181, "step": 1436 }, { "epoch": 0.28543052934750224, "grad_norm": 1.326879456283606, "learning_rate": 8.385784859308796e-06, "loss": 0.0166, "step": 1437 }, { "epoch": 0.28562915880425066, "grad_norm": 1.086178549836567, "learning_rate": 8.383416578439881e-06, "loss": 0.0232, "step": 1438 }, { "epoch": 0.2858277882609991, "grad_norm": 0.6673231654729204, "learning_rate": 8.381046896501547e-06, "loss": 0.0112, "step": 1439 }, { "epoch": 0.28602641771774756, "grad_norm": 0.530064954612848, "learning_rate": 8.378675814475081e-06, "loss": 0.0062, "step": 1440 }, { "epoch": 0.286225047174496, "grad_norm": 1.1957721069947402, "learning_rate": 8.376303333342342e-06, "loss": 0.0206, "step": 1441 }, { "epoch": 0.2864236766312444, "grad_norm": 0.45312372809065454, "learning_rate": 8.373929454085775e-06, "loss": 0.0085, "step": 1442 }, { "epoch": 0.28662230608799283, "grad_norm": 0.332625950245216, "learning_rate": 8.371554177688399e-06, "loss": 0.0087, "step": 1443 }, { "epoch": 0.2868209355447413, "grad_norm": 0.6970930696458172, "learning_rate": 8.369177505133814e-06, "loss": 0.0131, "step": 1444 }, { "epoch": 0.28701956500148973, "grad_norm": 1.1105719027213687, "learning_rate": 8.3667994374062e-06, "loss": 0.0239, "step": 1445 }, { "epoch": 0.28721819445823815, "grad_norm": 1.0006129734400397, "learning_rate": 8.36441997549031e-06, "loss": 0.0174, "step": 1446 }, { "epoch": 0.2874168239149866, "grad_norm": 1.4747499745629573, "learning_rate": 8.362039120371475e-06, "loss": 0.0159, "step": 1447 }, { "epoch": 0.28761545337173505, "grad_norm": 1.0411053353229378, "learning_rate": 8.35965687303561e-06, "loss": 0.0188, "step": 1448 }, { "epoch": 0.2878140828284835, "grad_norm": 0.9172298750699612, "learning_rate": 8.357273234469196e-06, "loss": 0.0153, "step": 1449 }, { "epoch": 0.2880127122852319, "grad_norm": 0.3913985500318929, "learning_rate": 8.354888205659299e-06, "loss": 0.0079, "step": 1450 }, { "epoch": 0.2882113417419803, "grad_norm": 1.374573151737498, "learning_rate": 8.352501787593557e-06, "loss": 0.0281, "step": 1451 }, { "epoch": 0.2884099711987288, "grad_norm": 0.5566702813433198, "learning_rate": 8.35011398126018e-06, "loss": 0.0173, "step": 1452 }, { "epoch": 0.2886086006554772, "grad_norm": 1.199039497209081, "learning_rate": 8.347724787647959e-06, "loss": 0.0248, "step": 1453 }, { "epoch": 0.28880723011222564, "grad_norm": 0.9629869712993756, "learning_rate": 8.345334207746256e-06, "loss": 0.0203, "step": 1454 }, { "epoch": 0.28900585956897407, "grad_norm": 0.7850010176416995, "learning_rate": 8.342942242545007e-06, "loss": 0.0141, "step": 1455 }, { "epoch": 0.2892044890257225, "grad_norm": 0.8834250404360167, "learning_rate": 8.340548893034723e-06, "loss": 0.0182, "step": 1456 }, { "epoch": 0.28940311848247097, "grad_norm": 0.7477391277298652, "learning_rate": 8.338154160206489e-06, "loss": 0.0126, "step": 1457 }, { "epoch": 0.2896017479392194, "grad_norm": 0.991856579976285, "learning_rate": 8.335758045051959e-06, "loss": 0.0191, "step": 1458 }, { "epoch": 0.2898003773959678, "grad_norm": 1.5855642011770337, "learning_rate": 8.333360548563363e-06, "loss": 0.0247, "step": 1459 }, { "epoch": 0.28999900685271623, "grad_norm": 1.3241126782557957, "learning_rate": 8.330961671733503e-06, "loss": 0.0165, "step": 1460 }, { "epoch": 0.2901976363094647, "grad_norm": 1.4795489122284058, "learning_rate": 8.32856141555575e-06, "loss": 0.024, "step": 1461 }, { "epoch": 0.29039626576621314, "grad_norm": 0.3260206114711344, "learning_rate": 8.326159781024049e-06, "loss": 0.0108, "step": 1462 }, { "epoch": 0.29059489522296156, "grad_norm": 0.3190347690618339, "learning_rate": 8.32375676913291e-06, "loss": 0.016, "step": 1463 }, { "epoch": 0.29079352467971, "grad_norm": 0.5725996431341029, "learning_rate": 8.321352380877426e-06, "loss": 0.017, "step": 1464 }, { "epoch": 0.29099215413645846, "grad_norm": 0.767026255292494, "learning_rate": 8.318946617253244e-06, "loss": 0.017, "step": 1465 }, { "epoch": 0.2911907835932069, "grad_norm": 0.5629140122364568, "learning_rate": 8.316539479256594e-06, "loss": 0.015, "step": 1466 }, { "epoch": 0.2913894130499553, "grad_norm": 0.7358139874860172, "learning_rate": 8.314130967884263e-06, "loss": 0.0129, "step": 1467 }, { "epoch": 0.2915880425067037, "grad_norm": 0.8209343112526951, "learning_rate": 8.311721084133622e-06, "loss": 0.0246, "step": 1468 }, { "epoch": 0.2917866719634522, "grad_norm": 0.7719668586694085, "learning_rate": 8.309309829002594e-06, "loss": 0.016, "step": 1469 }, { "epoch": 0.2919853014202006, "grad_norm": 0.41468903664501805, "learning_rate": 8.30689720348968e-06, "loss": 0.0096, "step": 1470 }, { "epoch": 0.29218393087694905, "grad_norm": 0.3738228357367781, "learning_rate": 8.304483208593944e-06, "loss": 0.0095, "step": 1471 }, { "epoch": 0.29238256033369747, "grad_norm": 0.24029576983374756, "learning_rate": 8.302067845315023e-06, "loss": 0.0097, "step": 1472 }, { "epoch": 0.29258118979044595, "grad_norm": 0.5705732197317682, "learning_rate": 8.299651114653113e-06, "loss": 0.0214, "step": 1473 }, { "epoch": 0.2927798192471944, "grad_norm": 0.922298441563043, "learning_rate": 8.29723301760898e-06, "loss": 0.0185, "step": 1474 }, { "epoch": 0.2929784487039428, "grad_norm": 0.8041508059426072, "learning_rate": 8.294813555183959e-06, "loss": 0.0157, "step": 1475 }, { "epoch": 0.2931770781606912, "grad_norm": 0.8308220892287086, "learning_rate": 8.29239272837994e-06, "loss": 0.0208, "step": 1476 }, { "epoch": 0.29337570761743964, "grad_norm": 0.41812007821452235, "learning_rate": 8.289970538199391e-06, "loss": 0.016, "step": 1477 }, { "epoch": 0.2935743370741881, "grad_norm": 0.9237991915282489, "learning_rate": 8.28754698564534e-06, "loss": 0.0138, "step": 1478 }, { "epoch": 0.29377296653093654, "grad_norm": 0.28751992841811247, "learning_rate": 8.285122071721373e-06, "loss": 0.0107, "step": 1479 }, { "epoch": 0.29397159598768496, "grad_norm": 0.8535192526289043, "learning_rate": 8.282695797431644e-06, "loss": 0.0254, "step": 1480 }, { "epoch": 0.2941702254444334, "grad_norm": 0.5712038029199457, "learning_rate": 8.280268163780873e-06, "loss": 0.0075, "step": 1481 }, { "epoch": 0.29436885490118186, "grad_norm": 0.4432551734371519, "learning_rate": 8.27783917177434e-06, "loss": 0.0095, "step": 1482 }, { "epoch": 0.2945674843579303, "grad_norm": 0.26503631565417796, "learning_rate": 8.27540882241789e-06, "loss": 0.005, "step": 1483 }, { "epoch": 0.2947661138146787, "grad_norm": 0.864720411309366, "learning_rate": 8.272977116717925e-06, "loss": 0.0164, "step": 1484 }, { "epoch": 0.29496474327142713, "grad_norm": 0.6284726332727625, "learning_rate": 8.270544055681415e-06, "loss": 0.0089, "step": 1485 }, { "epoch": 0.2951633727281756, "grad_norm": 0.508270650687714, "learning_rate": 8.268109640315887e-06, "loss": 0.0172, "step": 1486 }, { "epoch": 0.29536200218492403, "grad_norm": 0.6269790340357052, "learning_rate": 8.26567387162943e-06, "loss": 0.0179, "step": 1487 }, { "epoch": 0.29556063164167246, "grad_norm": 0.7297545166439348, "learning_rate": 8.263236750630692e-06, "loss": 0.0219, "step": 1488 }, { "epoch": 0.2957592610984209, "grad_norm": 0.6898505446448555, "learning_rate": 8.260798278328884e-06, "loss": 0.0108, "step": 1489 }, { "epoch": 0.29595789055516936, "grad_norm": 0.6708782293191564, "learning_rate": 8.258358455733774e-06, "loss": 0.0084, "step": 1490 }, { "epoch": 0.2961565200119178, "grad_norm": 0.6386660284055108, "learning_rate": 8.25591728385569e-06, "loss": 0.0113, "step": 1491 }, { "epoch": 0.2963551494686662, "grad_norm": 1.0471163969424355, "learning_rate": 8.25347476370552e-06, "loss": 0.0187, "step": 1492 }, { "epoch": 0.2965537789254146, "grad_norm": 0.8613429954652995, "learning_rate": 8.251030896294708e-06, "loss": 0.0146, "step": 1493 }, { "epoch": 0.2967524083821631, "grad_norm": 0.4725238292843531, "learning_rate": 8.248585682635258e-06, "loss": 0.0162, "step": 1494 }, { "epoch": 0.2969510378389115, "grad_norm": 0.7080357972606163, "learning_rate": 8.246139123739729e-06, "loss": 0.0277, "step": 1495 }, { "epoch": 0.29714966729565995, "grad_norm": 0.353145384888274, "learning_rate": 8.243691220621241e-06, "loss": 0.0082, "step": 1496 }, { "epoch": 0.29734829675240837, "grad_norm": 0.5979605491522597, "learning_rate": 8.241241974293466e-06, "loss": 0.0116, "step": 1497 }, { "epoch": 0.2975469262091568, "grad_norm": 0.7513238451070279, "learning_rate": 8.238791385770638e-06, "loss": 0.0246, "step": 1498 }, { "epoch": 0.29774555566590527, "grad_norm": 0.33294751800259487, "learning_rate": 8.236339456067538e-06, "loss": 0.0164, "step": 1499 }, { "epoch": 0.2979441851226537, "grad_norm": 0.7051678355474877, "learning_rate": 8.233886186199508e-06, "loss": 0.0137, "step": 1500 }, { "epoch": 0.2981428145794021, "grad_norm": 0.5236577910644429, "learning_rate": 8.231431577182452e-06, "loss": 0.0132, "step": 1501 }, { "epoch": 0.29834144403615054, "grad_norm": 0.47643786878372224, "learning_rate": 8.228975630032812e-06, "loss": 0.0072, "step": 1502 }, { "epoch": 0.298540073492899, "grad_norm": 0.5218883992156759, "learning_rate": 8.226518345767598e-06, "loss": 0.0172, "step": 1503 }, { "epoch": 0.29873870294964744, "grad_norm": 0.4673001168694523, "learning_rate": 8.224059725404369e-06, "loss": 0.0125, "step": 1504 }, { "epoch": 0.29893733240639586, "grad_norm": 0.34307612038889873, "learning_rate": 8.221599769961235e-06, "loss": 0.0156, "step": 1505 }, { "epoch": 0.2991359618631443, "grad_norm": 1.3513076887859734, "learning_rate": 8.219138480456864e-06, "loss": 0.0209, "step": 1506 }, { "epoch": 0.29933459131989276, "grad_norm": 0.532303893440917, "learning_rate": 8.21667585791047e-06, "loss": 0.0181, "step": 1507 }, { "epoch": 0.2995332207766412, "grad_norm": 0.4536212237431732, "learning_rate": 8.214211903341826e-06, "loss": 0.0139, "step": 1508 }, { "epoch": 0.2997318502333896, "grad_norm": 0.4663375260088001, "learning_rate": 8.211746617771253e-06, "loss": 0.0106, "step": 1509 }, { "epoch": 0.29993047969013803, "grad_norm": 0.38114356426993984, "learning_rate": 8.209280002219619e-06, "loss": 0.0157, "step": 1510 }, { "epoch": 0.3001291091468865, "grad_norm": 0.5218807737983572, "learning_rate": 8.206812057708352e-06, "loss": 0.017, "step": 1511 }, { "epoch": 0.30032773860363493, "grad_norm": 0.8868790496523004, "learning_rate": 8.204342785259423e-06, "loss": 0.0159, "step": 1512 }, { "epoch": 0.30052636806038335, "grad_norm": 0.3421070765069182, "learning_rate": 8.201872185895355e-06, "loss": 0.0116, "step": 1513 }, { "epoch": 0.3007249975171318, "grad_norm": 0.9202280991148633, "learning_rate": 8.19940026063922e-06, "loss": 0.0175, "step": 1514 }, { "epoch": 0.30092362697388025, "grad_norm": 0.3751885600682715, "learning_rate": 8.196927010514642e-06, "loss": 0.0101, "step": 1515 }, { "epoch": 0.3011222564306287, "grad_norm": 0.5044522661178389, "learning_rate": 8.194452436545792e-06, "loss": 0.0099, "step": 1516 }, { "epoch": 0.3013208858873771, "grad_norm": 0.9821726841889677, "learning_rate": 8.191976539757385e-06, "loss": 0.0204, "step": 1517 }, { "epoch": 0.3015195153441255, "grad_norm": 0.33670074524178356, "learning_rate": 8.18949932117469e-06, "loss": 0.0103, "step": 1518 }, { "epoch": 0.30171814480087394, "grad_norm": 0.42279805484652505, "learning_rate": 8.18702078182352e-06, "loss": 0.0241, "step": 1519 }, { "epoch": 0.3019167742576224, "grad_norm": 0.4322955622464677, "learning_rate": 8.184540922730237e-06, "loss": 0.0149, "step": 1520 }, { "epoch": 0.30211540371437084, "grad_norm": 1.2923583198099244, "learning_rate": 8.182059744921745e-06, "loss": 0.0179, "step": 1521 }, { "epoch": 0.30231403317111927, "grad_norm": 1.0983654911483387, "learning_rate": 8.179577249425501e-06, "loss": 0.0132, "step": 1522 }, { "epoch": 0.3025126626278677, "grad_norm": 0.6994601865320401, "learning_rate": 8.177093437269503e-06, "loss": 0.0159, "step": 1523 }, { "epoch": 0.30271129208461617, "grad_norm": 0.9946803548598048, "learning_rate": 8.174608309482293e-06, "loss": 0.0112, "step": 1524 }, { "epoch": 0.3029099215413646, "grad_norm": 0.5054848367639058, "learning_rate": 8.17212186709296e-06, "loss": 0.0079, "step": 1525 }, { "epoch": 0.303108550998113, "grad_norm": 1.1552485716095362, "learning_rate": 8.16963411113114e-06, "loss": 0.0139, "step": 1526 }, { "epoch": 0.30330718045486144, "grad_norm": 0.6284110896879137, "learning_rate": 8.167145042627007e-06, "loss": 0.011, "step": 1527 }, { "epoch": 0.3035058099116099, "grad_norm": 0.381511628810998, "learning_rate": 8.164654662611285e-06, "loss": 0.0112, "step": 1528 }, { "epoch": 0.30370443936835834, "grad_norm": 0.9578113568076267, "learning_rate": 8.162162972115235e-06, "loss": 0.0165, "step": 1529 }, { "epoch": 0.30390306882510676, "grad_norm": 0.3108981457607181, "learning_rate": 8.159669972170667e-06, "loss": 0.0041, "step": 1530 }, { "epoch": 0.3041016982818552, "grad_norm": 0.387381343425436, "learning_rate": 8.157175663809926e-06, "loss": 0.0119, "step": 1531 }, { "epoch": 0.30430032773860366, "grad_norm": 0.7862216295997828, "learning_rate": 8.154680048065905e-06, "loss": 0.0224, "step": 1532 }, { "epoch": 0.3044989571953521, "grad_norm": 0.9430871632430338, "learning_rate": 8.152183125972036e-06, "loss": 0.0301, "step": 1533 }, { "epoch": 0.3046975866521005, "grad_norm": 0.532176799345402, "learning_rate": 8.149684898562289e-06, "loss": 0.0102, "step": 1534 }, { "epoch": 0.3048962161088489, "grad_norm": 1.2585493832335066, "learning_rate": 8.147185366871182e-06, "loss": 0.0168, "step": 1535 }, { "epoch": 0.3050948455655974, "grad_norm": 0.5332469990576837, "learning_rate": 8.144684531933765e-06, "loss": 0.0187, "step": 1536 }, { "epoch": 0.30529347502234583, "grad_norm": 0.6578476556556282, "learning_rate": 8.142182394785633e-06, "loss": 0.0098, "step": 1537 }, { "epoch": 0.30549210447909425, "grad_norm": 0.9710766601800311, "learning_rate": 8.139678956462917e-06, "loss": 0.0147, "step": 1538 }, { "epoch": 0.3056907339358427, "grad_norm": 0.3280996551077205, "learning_rate": 8.13717421800229e-06, "loss": 0.0114, "step": 1539 }, { "epoch": 0.3058893633925911, "grad_norm": 0.4189662797300033, "learning_rate": 8.134668180440962e-06, "loss": 0.0121, "step": 1540 }, { "epoch": 0.3060879928493396, "grad_norm": 0.9921003354040996, "learning_rate": 8.13216084481668e-06, "loss": 0.0136, "step": 1541 }, { "epoch": 0.306286622306088, "grad_norm": 1.00118711821387, "learning_rate": 8.129652212167725e-06, "loss": 0.0126, "step": 1542 }, { "epoch": 0.3064852517628364, "grad_norm": 1.1824434262355368, "learning_rate": 8.127142283532926e-06, "loss": 0.0206, "step": 1543 }, { "epoch": 0.30668388121958484, "grad_norm": 0.3324696515714837, "learning_rate": 8.124631059951638e-06, "loss": 0.0117, "step": 1544 }, { "epoch": 0.3068825106763333, "grad_norm": 0.2722477636883906, "learning_rate": 8.122118542463758e-06, "loss": 0.0069, "step": 1545 }, { "epoch": 0.30708114013308174, "grad_norm": 0.38654260321116607, "learning_rate": 8.119604732109716e-06, "loss": 0.0128, "step": 1546 }, { "epoch": 0.30727976958983017, "grad_norm": 0.4093726871049893, "learning_rate": 8.117089629930479e-06, "loss": 0.0094, "step": 1547 }, { "epoch": 0.3074783990465786, "grad_norm": 1.471363664471753, "learning_rate": 8.114573236967546e-06, "loss": 0.0154, "step": 1548 }, { "epoch": 0.30767702850332707, "grad_norm": 0.8945599914230751, "learning_rate": 8.112055554262956e-06, "loss": 0.0225, "step": 1549 }, { "epoch": 0.3078756579600755, "grad_norm": 0.5070483754063576, "learning_rate": 8.109536582859276e-06, "loss": 0.0139, "step": 1550 }, { "epoch": 0.3080742874168239, "grad_norm": 0.4124395849167642, "learning_rate": 8.107016323799612e-06, "loss": 0.0157, "step": 1551 }, { "epoch": 0.30827291687357233, "grad_norm": 0.7163339426933371, "learning_rate": 8.1044947781276e-06, "loss": 0.0188, "step": 1552 }, { "epoch": 0.3084715463303208, "grad_norm": 0.663951545919708, "learning_rate": 8.10197194688741e-06, "loss": 0.0095, "step": 1553 }, { "epoch": 0.30867017578706923, "grad_norm": 0.4033057526388188, "learning_rate": 8.099447831123742e-06, "loss": 0.0115, "step": 1554 }, { "epoch": 0.30886880524381766, "grad_norm": 0.5134664101495625, "learning_rate": 8.09692243188183e-06, "loss": 0.0146, "step": 1555 }, { "epoch": 0.3090674347005661, "grad_norm": 0.3272561207723999, "learning_rate": 8.094395750207443e-06, "loss": 0.0066, "step": 1556 }, { "epoch": 0.30926606415731456, "grad_norm": 0.47522492760295715, "learning_rate": 8.091867787146874e-06, "loss": 0.0175, "step": 1557 }, { "epoch": 0.309464693614063, "grad_norm": 0.28831801397227647, "learning_rate": 8.08933854374695e-06, "loss": 0.0087, "step": 1558 }, { "epoch": 0.3096633230708114, "grad_norm": 0.728136763673717, "learning_rate": 8.086808021055029e-06, "loss": 0.0146, "step": 1559 }, { "epoch": 0.3098619525275598, "grad_norm": 0.5645255218117098, "learning_rate": 8.084276220118997e-06, "loss": 0.0131, "step": 1560 }, { "epoch": 0.31006058198430825, "grad_norm": 0.9114867700044899, "learning_rate": 8.081743141987271e-06, "loss": 0.0172, "step": 1561 }, { "epoch": 0.3102592114410567, "grad_norm": 0.5128080694665796, "learning_rate": 8.079208787708797e-06, "loss": 0.0181, "step": 1562 }, { "epoch": 0.31045784089780515, "grad_norm": 0.7452682016764512, "learning_rate": 8.076673158333046e-06, "loss": 0.0131, "step": 1563 }, { "epoch": 0.31065647035455357, "grad_norm": 0.5224597185446708, "learning_rate": 8.074136254910022e-06, "loss": 0.0172, "step": 1564 }, { "epoch": 0.310855099811302, "grad_norm": 0.2271594138378722, "learning_rate": 8.071598078490254e-06, "loss": 0.0061, "step": 1565 }, { "epoch": 0.31105372926805047, "grad_norm": 0.3208897701348875, "learning_rate": 8.069058630124798e-06, "loss": 0.0087, "step": 1566 }, { "epoch": 0.3112523587247989, "grad_norm": 0.4311676394198718, "learning_rate": 8.066517910865235e-06, "loss": 0.0127, "step": 1567 }, { "epoch": 0.3114509881815473, "grad_norm": 0.3605691884352593, "learning_rate": 8.063975921763675e-06, "loss": 0.0109, "step": 1568 }, { "epoch": 0.31164961763829574, "grad_norm": 0.5511650816451228, "learning_rate": 8.061432663872757e-06, "loss": 0.014, "step": 1569 }, { "epoch": 0.3118482470950442, "grad_norm": 0.4705883402855219, "learning_rate": 8.058888138245639e-06, "loss": 0.018, "step": 1570 }, { "epoch": 0.31204687655179264, "grad_norm": 0.427333107085936, "learning_rate": 8.056342345936005e-06, "loss": 0.013, "step": 1571 }, { "epoch": 0.31224550600854106, "grad_norm": 0.9899499461011341, "learning_rate": 8.053795287998065e-06, "loss": 0.0104, "step": 1572 }, { "epoch": 0.3124441354652895, "grad_norm": 0.4747912372785505, "learning_rate": 8.051246965486557e-06, "loss": 0.0073, "step": 1573 }, { "epoch": 0.31264276492203796, "grad_norm": 0.441160461346295, "learning_rate": 8.048697379456733e-06, "loss": 0.0159, "step": 1574 }, { "epoch": 0.3128413943787864, "grad_norm": 0.34014297933534443, "learning_rate": 8.04614653096438e-06, "loss": 0.0097, "step": 1575 }, { "epoch": 0.3130400238355348, "grad_norm": 1.2792886913012314, "learning_rate": 8.043594421065796e-06, "loss": 0.017, "step": 1576 }, { "epoch": 0.31323865329228323, "grad_norm": 0.6529038511786388, "learning_rate": 8.041041050817813e-06, "loss": 0.0155, "step": 1577 }, { "epoch": 0.31343728274903165, "grad_norm": 0.4600809221001575, "learning_rate": 8.038486421277775e-06, "loss": 0.0093, "step": 1578 }, { "epoch": 0.31363591220578013, "grad_norm": 0.5257994692013639, "learning_rate": 8.035930533503554e-06, "loss": 0.012, "step": 1579 }, { "epoch": 0.31383454166252855, "grad_norm": 1.482595268996986, "learning_rate": 8.033373388553538e-06, "loss": 0.0126, "step": 1580 }, { "epoch": 0.314033171119277, "grad_norm": 0.697904773183433, "learning_rate": 8.030814987486639e-06, "loss": 0.0129, "step": 1581 }, { "epoch": 0.3142318005760254, "grad_norm": 1.1437565550456443, "learning_rate": 8.028255331362292e-06, "loss": 0.0153, "step": 1582 }, { "epoch": 0.3144304300327739, "grad_norm": 0.7568186293758398, "learning_rate": 8.025694421240442e-06, "loss": 0.0098, "step": 1583 }, { "epoch": 0.3146290594895223, "grad_norm": 1.2713687932469087, "learning_rate": 8.023132258181563e-06, "loss": 0.0233, "step": 1584 }, { "epoch": 0.3148276889462707, "grad_norm": 0.6183628181054079, "learning_rate": 8.020568843246642e-06, "loss": 0.0174, "step": 1585 }, { "epoch": 0.31502631840301915, "grad_norm": 0.38716038710078116, "learning_rate": 8.01800417749719e-06, "loss": 0.0072, "step": 1586 }, { "epoch": 0.3152249478597676, "grad_norm": 0.43109273141996973, "learning_rate": 8.015438261995229e-06, "loss": 0.0085, "step": 1587 }, { "epoch": 0.31542357731651605, "grad_norm": 0.72428449579244, "learning_rate": 8.012871097803303e-06, "loss": 0.0134, "step": 1588 }, { "epoch": 0.31562220677326447, "grad_norm": 0.4444888127096748, "learning_rate": 8.010302685984473e-06, "loss": 0.0118, "step": 1589 }, { "epoch": 0.3158208362300129, "grad_norm": 0.5112452513036903, "learning_rate": 8.007733027602315e-06, "loss": 0.0085, "step": 1590 }, { "epoch": 0.31601946568676137, "grad_norm": 0.5630116305420747, "learning_rate": 8.005162123720924e-06, "loss": 0.0096, "step": 1591 }, { "epoch": 0.3162180951435098, "grad_norm": 0.7437776670920336, "learning_rate": 8.002589975404907e-06, "loss": 0.0144, "step": 1592 }, { "epoch": 0.3164167246002582, "grad_norm": 0.970854724356747, "learning_rate": 8.000016583719386e-06, "loss": 0.0215, "step": 1593 }, { "epoch": 0.31661535405700664, "grad_norm": 0.8828896563831873, "learning_rate": 7.997441949730003e-06, "loss": 0.0133, "step": 1594 }, { "epoch": 0.3168139835137551, "grad_norm": 0.6160271865646878, "learning_rate": 7.994866074502911e-06, "loss": 0.0135, "step": 1595 }, { "epoch": 0.31701261297050354, "grad_norm": 0.5364322361987068, "learning_rate": 7.992288959104776e-06, "loss": 0.0169, "step": 1596 }, { "epoch": 0.31721124242725196, "grad_norm": 0.46568042207568855, "learning_rate": 7.98971060460278e-06, "loss": 0.0122, "step": 1597 }, { "epoch": 0.3174098718840004, "grad_norm": 0.5264630544016791, "learning_rate": 7.987131012064615e-06, "loss": 0.0129, "step": 1598 }, { "epoch": 0.3176085013407488, "grad_norm": 0.6009801290738634, "learning_rate": 7.984550182558492e-06, "loss": 0.0112, "step": 1599 }, { "epoch": 0.3178071307974973, "grad_norm": 0.6884006171260445, "learning_rate": 7.981968117153125e-06, "loss": 0.0191, "step": 1600 }, { "epoch": 0.3180057602542457, "grad_norm": 0.3982485689190538, "learning_rate": 7.979384816917748e-06, "loss": 0.0111, "step": 1601 }, { "epoch": 0.31820438971099413, "grad_norm": 0.30136377831303135, "learning_rate": 7.9768002829221e-06, "loss": 0.0113, "step": 1602 }, { "epoch": 0.31840301916774255, "grad_norm": 0.8655679733419154, "learning_rate": 7.974214516236438e-06, "loss": 0.0196, "step": 1603 }, { "epoch": 0.31860164862449103, "grad_norm": 0.8439790204628401, "learning_rate": 7.971627517931523e-06, "loss": 0.0159, "step": 1604 }, { "epoch": 0.31880027808123945, "grad_norm": 0.6162008342884321, "learning_rate": 7.96903928907863e-06, "loss": 0.0172, "step": 1605 }, { "epoch": 0.3189989075379879, "grad_norm": 0.2737773167439073, "learning_rate": 7.966449830749538e-06, "loss": 0.0172, "step": 1606 }, { "epoch": 0.3191975369947363, "grad_norm": 0.505600845809124, "learning_rate": 7.963859144016544e-06, "loss": 0.0164, "step": 1607 }, { "epoch": 0.3193961664514848, "grad_norm": 0.796834862743995, "learning_rate": 7.961267229952444e-06, "loss": 0.011, "step": 1608 }, { "epoch": 0.3195947959082332, "grad_norm": 1.2150625750472108, "learning_rate": 7.958674089630551e-06, "loss": 0.0147, "step": 1609 }, { "epoch": 0.3197934253649816, "grad_norm": 0.3751880775430107, "learning_rate": 7.956079724124681e-06, "loss": 0.0131, "step": 1610 }, { "epoch": 0.31999205482173004, "grad_norm": 0.2392680116727292, "learning_rate": 7.953484134509158e-06, "loss": 0.0075, "step": 1611 }, { "epoch": 0.3201906842784785, "grad_norm": 0.3297290154412873, "learning_rate": 7.950887321858811e-06, "loss": 0.0084, "step": 1612 }, { "epoch": 0.32038931373522694, "grad_norm": 0.6362428644715254, "learning_rate": 7.948289287248979e-06, "loss": 0.0113, "step": 1613 }, { "epoch": 0.32058794319197537, "grad_norm": 0.703329270973474, "learning_rate": 7.945690031755506e-06, "loss": 0.0121, "step": 1614 }, { "epoch": 0.3207865726487238, "grad_norm": 1.1210492893128057, "learning_rate": 7.943089556454742e-06, "loss": 0.0161, "step": 1615 }, { "epoch": 0.32098520210547227, "grad_norm": 0.7398560335941972, "learning_rate": 7.940487862423538e-06, "loss": 0.0153, "step": 1616 }, { "epoch": 0.3211838315622207, "grad_norm": 0.592690898155172, "learning_rate": 7.937884950739255e-06, "loss": 0.0126, "step": 1617 }, { "epoch": 0.3213824610189691, "grad_norm": 0.3885094064525805, "learning_rate": 7.935280822479758e-06, "loss": 0.0126, "step": 1618 }, { "epoch": 0.32158109047571753, "grad_norm": 0.37641651576137636, "learning_rate": 7.93267547872341e-06, "loss": 0.0158, "step": 1619 }, { "epoch": 0.32177971993246596, "grad_norm": 0.5131102552628134, "learning_rate": 7.930068920549084e-06, "loss": 0.0077, "step": 1620 }, { "epoch": 0.32197834938921444, "grad_norm": 0.6672345424407569, "learning_rate": 7.927461149036153e-06, "loss": 0.0154, "step": 1621 }, { "epoch": 0.32217697884596286, "grad_norm": 0.7516794337115568, "learning_rate": 7.924852165264491e-06, "loss": 0.0119, "step": 1622 }, { "epoch": 0.3223756083027113, "grad_norm": 0.3990098639952792, "learning_rate": 7.922241970314475e-06, "loss": 0.0089, "step": 1623 }, { "epoch": 0.3225742377594597, "grad_norm": 0.48114802980288773, "learning_rate": 7.919630565266987e-06, "loss": 0.0123, "step": 1624 }, { "epoch": 0.3227728672162082, "grad_norm": 0.5316138344004034, "learning_rate": 7.917017951203406e-06, "loss": 0.0064, "step": 1625 }, { "epoch": 0.3229714966729566, "grad_norm": 0.30950798522402956, "learning_rate": 7.91440412920561e-06, "loss": 0.004, "step": 1626 }, { "epoch": 0.323170126129705, "grad_norm": 0.8119698454248738, "learning_rate": 7.911789100355985e-06, "loss": 0.017, "step": 1627 }, { "epoch": 0.32336875558645345, "grad_norm": 0.3266822113853905, "learning_rate": 7.90917286573741e-06, "loss": 0.0112, "step": 1628 }, { "epoch": 0.3235673850432019, "grad_norm": 0.5213109216327177, "learning_rate": 7.906555426433264e-06, "loss": 0.0084, "step": 1629 }, { "epoch": 0.32376601449995035, "grad_norm": 1.0671826630070904, "learning_rate": 7.903936783527425e-06, "loss": 0.0198, "step": 1630 }, { "epoch": 0.3239646439566988, "grad_norm": 1.0574028610546085, "learning_rate": 7.901316938104275e-06, "loss": 0.0148, "step": 1631 }, { "epoch": 0.3241632734134472, "grad_norm": 0.46764434451459613, "learning_rate": 7.898695891248685e-06, "loss": 0.0064, "step": 1632 }, { "epoch": 0.3243619028701957, "grad_norm": 0.8539693335630968, "learning_rate": 7.896073644046028e-06, "loss": 0.0135, "step": 1633 }, { "epoch": 0.3245605323269441, "grad_norm": 0.8064389257572152, "learning_rate": 7.893450197582178e-06, "loss": 0.0117, "step": 1634 }, { "epoch": 0.3247591617836925, "grad_norm": 0.34763349103097907, "learning_rate": 7.890825552943495e-06, "loss": 0.0075, "step": 1635 }, { "epoch": 0.32495779124044094, "grad_norm": 0.48244504223164625, "learning_rate": 7.888199711216848e-06, "loss": 0.0138, "step": 1636 }, { "epoch": 0.3251564206971894, "grad_norm": 0.5896875038752724, "learning_rate": 7.885572673489592e-06, "loss": 0.0126, "step": 1637 }, { "epoch": 0.32535505015393784, "grad_norm": 0.6571748595454835, "learning_rate": 7.882944440849582e-06, "loss": 0.0117, "step": 1638 }, { "epoch": 0.32555367961068626, "grad_norm": 0.31453304815725347, "learning_rate": 7.880315014385166e-06, "loss": 0.0048, "step": 1639 }, { "epoch": 0.3257523090674347, "grad_norm": 1.1519220230010478, "learning_rate": 7.877684395185187e-06, "loss": 0.0127, "step": 1640 }, { "epoch": 0.3259509385241831, "grad_norm": 0.770459673164255, "learning_rate": 7.875052584338983e-06, "loss": 0.0206, "step": 1641 }, { "epoch": 0.3261495679809316, "grad_norm": 0.7015223884365756, "learning_rate": 7.872419582936382e-06, "loss": 0.0175, "step": 1642 }, { "epoch": 0.32634819743768, "grad_norm": 0.6126064018057952, "learning_rate": 7.869785392067707e-06, "loss": 0.0171, "step": 1643 }, { "epoch": 0.32654682689442843, "grad_norm": 0.43187448867276834, "learning_rate": 7.867150012823777e-06, "loss": 0.0134, "step": 1644 }, { "epoch": 0.32674545635117686, "grad_norm": 0.9447876200418266, "learning_rate": 7.864513446295896e-06, "loss": 0.0143, "step": 1645 }, { "epoch": 0.32694408580792533, "grad_norm": 0.5123267111128992, "learning_rate": 7.861875693575866e-06, "loss": 0.0092, "step": 1646 }, { "epoch": 0.32714271526467376, "grad_norm": 0.8471560674383536, "learning_rate": 7.859236755755978e-06, "loss": 0.0166, "step": 1647 }, { "epoch": 0.3273413447214222, "grad_norm": 1.1733673039140131, "learning_rate": 7.856596633929012e-06, "loss": 0.0225, "step": 1648 }, { "epoch": 0.3275399741781706, "grad_norm": 0.5145085603141826, "learning_rate": 7.85395532918824e-06, "loss": 0.0169, "step": 1649 }, { "epoch": 0.3277386036349191, "grad_norm": 0.4349990032596344, "learning_rate": 7.851312842627426e-06, "loss": 0.0179, "step": 1650 }, { "epoch": 0.3279372330916675, "grad_norm": 0.6127954085037567, "learning_rate": 7.848669175340818e-06, "loss": 0.0163, "step": 1651 }, { "epoch": 0.3281358625484159, "grad_norm": 0.6906096363587637, "learning_rate": 7.846024328423157e-06, "loss": 0.0136, "step": 1652 }, { "epoch": 0.32833449200516435, "grad_norm": 0.3831242188980579, "learning_rate": 7.843378302969674e-06, "loss": 0.0115, "step": 1653 }, { "epoch": 0.3285331214619128, "grad_norm": 0.31854883023090935, "learning_rate": 7.840731100076081e-06, "loss": 0.0119, "step": 1654 }, { "epoch": 0.32873175091866125, "grad_norm": 0.6135826551661053, "learning_rate": 7.838082720838585e-06, "loss": 0.0116, "step": 1655 }, { "epoch": 0.32893038037540967, "grad_norm": 0.6788478557117993, "learning_rate": 7.835433166353876e-06, "loss": 0.0196, "step": 1656 }, { "epoch": 0.3291290098321581, "grad_norm": 0.5204341679322476, "learning_rate": 7.832782437719132e-06, "loss": 0.0176, "step": 1657 }, { "epoch": 0.32932763928890657, "grad_norm": 0.7020429742008044, "learning_rate": 7.830130536032017e-06, "loss": 0.0101, "step": 1658 }, { "epoch": 0.329526268745655, "grad_norm": 0.782017865564206, "learning_rate": 7.827477462390683e-06, "loss": 0.0156, "step": 1659 }, { "epoch": 0.3297248982024034, "grad_norm": 0.7872235943247256, "learning_rate": 7.824823217893762e-06, "loss": 0.0183, "step": 1660 }, { "epoch": 0.32992352765915184, "grad_norm": 0.9577077977386234, "learning_rate": 7.822167803640375e-06, "loss": 0.017, "step": 1661 }, { "epoch": 0.33012215711590026, "grad_norm": 0.3392933900875892, "learning_rate": 7.819511220730127e-06, "loss": 0.0069, "step": 1662 }, { "epoch": 0.33032078657264874, "grad_norm": 0.769515283319616, "learning_rate": 7.816853470263107e-06, "loss": 0.0132, "step": 1663 }, { "epoch": 0.33051941602939716, "grad_norm": 0.41436036382418884, "learning_rate": 7.814194553339884e-06, "loss": 0.0093, "step": 1664 }, { "epoch": 0.3307180454861456, "grad_norm": 0.7489421767929256, "learning_rate": 7.811534471061516e-06, "loss": 0.015, "step": 1665 }, { "epoch": 0.330916674942894, "grad_norm": 0.5296856540549193, "learning_rate": 7.80887322452954e-06, "loss": 0.0173, "step": 1666 }, { "epoch": 0.3311153043996425, "grad_norm": 0.5649510594604672, "learning_rate": 7.806210814845974e-06, "loss": 0.0102, "step": 1667 }, { "epoch": 0.3313139338563909, "grad_norm": 0.4243877853275429, "learning_rate": 7.803547243113319e-06, "loss": 0.0108, "step": 1668 }, { "epoch": 0.33151256331313933, "grad_norm": 0.6981372474644114, "learning_rate": 7.800882510434559e-06, "loss": 0.013, "step": 1669 }, { "epoch": 0.33171119276988775, "grad_norm": 0.5550131373261892, "learning_rate": 7.798216617913155e-06, "loss": 0.0213, "step": 1670 }, { "epoch": 0.33190982222663623, "grad_norm": 0.867353442467047, "learning_rate": 7.795549566653054e-06, "loss": 0.0116, "step": 1671 }, { "epoch": 0.33210845168338465, "grad_norm": 0.587120077251956, "learning_rate": 7.792881357758674e-06, "loss": 0.0177, "step": 1672 }, { "epoch": 0.3323070811401331, "grad_norm": 0.5107619089303768, "learning_rate": 7.790211992334923e-06, "loss": 0.0184, "step": 1673 }, { "epoch": 0.3325057105968815, "grad_norm": 0.9529228340714601, "learning_rate": 7.787541471487178e-06, "loss": 0.0116, "step": 1674 }, { "epoch": 0.33270434005363, "grad_norm": 0.32762475045703154, "learning_rate": 7.784869796321302e-06, "loss": 0.0111, "step": 1675 }, { "epoch": 0.3329029695103784, "grad_norm": 1.0160627610759643, "learning_rate": 7.782196967943633e-06, "loss": 0.0174, "step": 1676 }, { "epoch": 0.3331015989671268, "grad_norm": 0.8755877388840554, "learning_rate": 7.779522987460985e-06, "loss": 0.0241, "step": 1677 }, { "epoch": 0.33330022842387524, "grad_norm": 0.4517222646647724, "learning_rate": 7.776847855980653e-06, "loss": 0.0105, "step": 1678 }, { "epoch": 0.3334988578806237, "grad_norm": 0.4723207594634246, "learning_rate": 7.774171574610404e-06, "loss": 0.0156, "step": 1679 }, { "epoch": 0.33369748733737215, "grad_norm": 0.5220346418239198, "learning_rate": 7.771494144458483e-06, "loss": 0.0116, "step": 1680 }, { "epoch": 0.33389611679412057, "grad_norm": 0.2821678482563408, "learning_rate": 7.768815566633612e-06, "loss": 0.0082, "step": 1681 }, { "epoch": 0.334094746250869, "grad_norm": 1.42187836812918, "learning_rate": 7.766135842244988e-06, "loss": 0.0241, "step": 1682 }, { "epoch": 0.3342933757076174, "grad_norm": 0.6174532778541035, "learning_rate": 7.763454972402282e-06, "loss": 0.0128, "step": 1683 }, { "epoch": 0.3344920051643659, "grad_norm": 0.5883317240537213, "learning_rate": 7.76077295821564e-06, "loss": 0.0144, "step": 1684 }, { "epoch": 0.3346906346211143, "grad_norm": 0.3588975768808438, "learning_rate": 7.75808980079568e-06, "loss": 0.0097, "step": 1685 }, { "epoch": 0.33488926407786274, "grad_norm": 0.5875630793918583, "learning_rate": 7.755405501253496e-06, "loss": 0.0157, "step": 1686 }, { "epoch": 0.33508789353461116, "grad_norm": 0.7281884043493897, "learning_rate": 7.752720060700652e-06, "loss": 0.02, "step": 1687 }, { "epoch": 0.33528652299135964, "grad_norm": 0.6001706983891172, "learning_rate": 7.750033480249188e-06, "loss": 0.0155, "step": 1688 }, { "epoch": 0.33548515244810806, "grad_norm": 0.2997842597354729, "learning_rate": 7.747345761011616e-06, "loss": 0.0135, "step": 1689 }, { "epoch": 0.3356837819048565, "grad_norm": 0.5174410489177036, "learning_rate": 7.744656904100913e-06, "loss": 0.0093, "step": 1690 }, { "epoch": 0.3358824113616049, "grad_norm": 0.5480320708273444, "learning_rate": 7.741966910630536e-06, "loss": 0.0172, "step": 1691 }, { "epoch": 0.3360810408183534, "grad_norm": 0.4344915109534724, "learning_rate": 7.739275781714405e-06, "loss": 0.0129, "step": 1692 }, { "epoch": 0.3362796702751018, "grad_norm": 0.8908712492790886, "learning_rate": 7.736583518466919e-06, "loss": 0.0127, "step": 1693 }, { "epoch": 0.33647829973185023, "grad_norm": 0.849787263342614, "learning_rate": 7.733890122002936e-06, "loss": 0.0119, "step": 1694 }, { "epoch": 0.33667692918859865, "grad_norm": 0.5855213279534514, "learning_rate": 7.731195593437793e-06, "loss": 0.012, "step": 1695 }, { "epoch": 0.33687555864534713, "grad_norm": 0.6288057804890751, "learning_rate": 7.728499933887288e-06, "loss": 0.0142, "step": 1696 }, { "epoch": 0.33707418810209555, "grad_norm": 0.43026765663973393, "learning_rate": 7.725803144467695e-06, "loss": 0.0116, "step": 1697 }, { "epoch": 0.337272817558844, "grad_norm": 0.36782345928729954, "learning_rate": 7.723105226295749e-06, "loss": 0.0086, "step": 1698 }, { "epoch": 0.3374714470155924, "grad_norm": 0.9472170162174446, "learning_rate": 7.720406180488655e-06, "loss": 0.0262, "step": 1699 }, { "epoch": 0.3376700764723409, "grad_norm": 0.3625055211307673, "learning_rate": 7.717706008164085e-06, "loss": 0.0079, "step": 1700 }, { "epoch": 0.3378687059290893, "grad_norm": 0.2153089494894364, "learning_rate": 7.715004710440181e-06, "loss": 0.0045, "step": 1701 }, { "epoch": 0.3380673353858377, "grad_norm": 1.055365098303852, "learning_rate": 7.712302288435545e-06, "loss": 0.0146, "step": 1702 }, { "epoch": 0.33826596484258614, "grad_norm": 1.593570003099904, "learning_rate": 7.709598743269246e-06, "loss": 0.0264, "step": 1703 }, { "epoch": 0.33846459429933456, "grad_norm": 0.8049164644185485, "learning_rate": 7.70689407606082e-06, "loss": 0.0127, "step": 1704 }, { "epoch": 0.33866322375608304, "grad_norm": 1.02619924035175, "learning_rate": 7.70418828793027e-06, "loss": 0.0222, "step": 1705 }, { "epoch": 0.33886185321283147, "grad_norm": 0.3728588121113957, "learning_rate": 7.701481379998057e-06, "loss": 0.0108, "step": 1706 }, { "epoch": 0.3390604826695799, "grad_norm": 0.6574219495167848, "learning_rate": 7.698773353385111e-06, "loss": 0.0121, "step": 1707 }, { "epoch": 0.3392591121263283, "grad_norm": 0.38503544600597717, "learning_rate": 7.696064209212822e-06, "loss": 0.0112, "step": 1708 }, { "epoch": 0.3394577415830768, "grad_norm": 0.558349825474669, "learning_rate": 7.693353948603041e-06, "loss": 0.008, "step": 1709 }, { "epoch": 0.3396563710398252, "grad_norm": 2.1539383186369774, "learning_rate": 7.69064257267809e-06, "loss": 0.0121, "step": 1710 }, { "epoch": 0.33985500049657363, "grad_norm": 0.41496259994783097, "learning_rate": 7.687930082560744e-06, "loss": 0.0127, "step": 1711 }, { "epoch": 0.34005362995332206, "grad_norm": 0.48924590860014766, "learning_rate": 7.685216479374242e-06, "loss": 0.0137, "step": 1712 }, { "epoch": 0.34025225941007053, "grad_norm": 0.45250983111826154, "learning_rate": 7.682501764242284e-06, "loss": 0.0089, "step": 1713 }, { "epoch": 0.34045088886681896, "grad_norm": 0.45711497918986704, "learning_rate": 7.679785938289032e-06, "loss": 0.0132, "step": 1714 }, { "epoch": 0.3406495183235674, "grad_norm": 0.44128917687358954, "learning_rate": 7.677069002639109e-06, "loss": 0.014, "step": 1715 }, { "epoch": 0.3408481477803158, "grad_norm": 0.7869148221018819, "learning_rate": 7.674350958417589e-06, "loss": 0.018, "step": 1716 }, { "epoch": 0.3410467772370643, "grad_norm": 0.4703664064866459, "learning_rate": 7.671631806750018e-06, "loss": 0.0144, "step": 1717 }, { "epoch": 0.3412454066938127, "grad_norm": 0.42705552055783985, "learning_rate": 7.66891154876239e-06, "loss": 0.0067, "step": 1718 }, { "epoch": 0.3414440361505611, "grad_norm": 1.4239786275656603, "learning_rate": 7.666190185581164e-06, "loss": 0.0215, "step": 1719 }, { "epoch": 0.34164266560730955, "grad_norm": 0.8964200804878224, "learning_rate": 7.66346771833325e-06, "loss": 0.0105, "step": 1720 }, { "epoch": 0.341841295064058, "grad_norm": 0.3234278375891176, "learning_rate": 7.660744148146022e-06, "loss": 0.0204, "step": 1721 }, { "epoch": 0.34203992452080645, "grad_norm": 0.6931183761335391, "learning_rate": 7.658019476147307e-06, "loss": 0.0201, "step": 1722 }, { "epoch": 0.34223855397755487, "grad_norm": 0.7400781197903014, "learning_rate": 7.65529370346539e-06, "loss": 0.0164, "step": 1723 }, { "epoch": 0.3424371834343033, "grad_norm": 0.4335482304440916, "learning_rate": 7.652566831229007e-06, "loss": 0.0091, "step": 1724 }, { "epoch": 0.3426358128910517, "grad_norm": 0.1969898697189572, "learning_rate": 7.649838860567356e-06, "loss": 0.0038, "step": 1725 }, { "epoch": 0.3428344423478002, "grad_norm": 0.6032444248728773, "learning_rate": 7.647109792610087e-06, "loss": 0.0094, "step": 1726 }, { "epoch": 0.3430330718045486, "grad_norm": 0.4008378028364625, "learning_rate": 7.644379628487305e-06, "loss": 0.0116, "step": 1727 }, { "epoch": 0.34323170126129704, "grad_norm": 0.45794703272131426, "learning_rate": 7.641648369329566e-06, "loss": 0.0081, "step": 1728 }, { "epoch": 0.34343033071804546, "grad_norm": 0.8915628644941473, "learning_rate": 7.638916016267884e-06, "loss": 0.0136, "step": 1729 }, { "epoch": 0.34362896017479394, "grad_norm": 0.4072293871212264, "learning_rate": 7.63618257043372e-06, "loss": 0.0135, "step": 1730 }, { "epoch": 0.34382758963154236, "grad_norm": 0.48094267145550845, "learning_rate": 7.633448032958994e-06, "loss": 0.0169, "step": 1731 }, { "epoch": 0.3440262190882908, "grad_norm": 0.546937299007096, "learning_rate": 7.630712404976075e-06, "loss": 0.0113, "step": 1732 }, { "epoch": 0.3442248485450392, "grad_norm": 0.595885300176229, "learning_rate": 7.6279756876177835e-06, "loss": 0.0156, "step": 1733 }, { "epoch": 0.3444234780017877, "grad_norm": 0.3227233399899523, "learning_rate": 7.6252378820173915e-06, "loss": 0.0064, "step": 1734 }, { "epoch": 0.3446221074585361, "grad_norm": 0.6990835307410963, "learning_rate": 7.622498989308622e-06, "loss": 0.0197, "step": 1735 }, { "epoch": 0.34482073691528453, "grad_norm": 0.67866449998373, "learning_rate": 7.619759010625647e-06, "loss": 0.0113, "step": 1736 }, { "epoch": 0.34501936637203295, "grad_norm": 0.7839634602784449, "learning_rate": 7.617017947103089e-06, "loss": 0.0128, "step": 1737 }, { "epoch": 0.34521799582878143, "grad_norm": 0.2870079380203802, "learning_rate": 7.614275799876021e-06, "loss": 0.0045, "step": 1738 }, { "epoch": 0.34541662528552985, "grad_norm": 0.4689823234820678, "learning_rate": 7.61153257007996e-06, "loss": 0.0104, "step": 1739 }, { "epoch": 0.3456152547422783, "grad_norm": 0.4687644833338894, "learning_rate": 7.608788258850879e-06, "loss": 0.0116, "step": 1740 }, { "epoch": 0.3458138841990267, "grad_norm": 0.5581341559370254, "learning_rate": 7.6060428673251915e-06, "loss": 0.0102, "step": 1741 }, { "epoch": 0.3460125136557752, "grad_norm": 0.5745297067207734, "learning_rate": 7.603296396639763e-06, "loss": 0.0129, "step": 1742 }, { "epoch": 0.3462111431125236, "grad_norm": 0.5785097862293327, "learning_rate": 7.600548847931903e-06, "loss": 0.0101, "step": 1743 }, { "epoch": 0.346409772569272, "grad_norm": 0.6057076832337547, "learning_rate": 7.597800222339371e-06, "loss": 0.0113, "step": 1744 }, { "epoch": 0.34660840202602045, "grad_norm": 0.7114365255059848, "learning_rate": 7.595050521000367e-06, "loss": 0.0147, "step": 1745 }, { "epoch": 0.34680703148276887, "grad_norm": 0.9549334149084857, "learning_rate": 7.5922997450535405e-06, "loss": 0.0079, "step": 1746 }, { "epoch": 0.34700566093951735, "grad_norm": 0.9194225699846286, "learning_rate": 7.589547895637987e-06, "loss": 0.0176, "step": 1747 }, { "epoch": 0.34720429039626577, "grad_norm": 0.317751182542115, "learning_rate": 7.586794973893241e-06, "loss": 0.0062, "step": 1748 }, { "epoch": 0.3474029198530142, "grad_norm": 0.7272366693065906, "learning_rate": 7.584040980959288e-06, "loss": 0.0114, "step": 1749 }, { "epoch": 0.3476015493097626, "grad_norm": 0.5400337381754199, "learning_rate": 7.5812859179765555e-06, "loss": 0.0084, "step": 1750 }, { "epoch": 0.3478001787665111, "grad_norm": 1.0261218838104105, "learning_rate": 7.578529786085904e-06, "loss": 0.0144, "step": 1751 }, { "epoch": 0.3479988082232595, "grad_norm": 0.5694769536171905, "learning_rate": 7.5757725864286536e-06, "loss": 0.0104, "step": 1752 }, { "epoch": 0.34819743768000794, "grad_norm": 0.521723957173526, "learning_rate": 7.573014320146554e-06, "loss": 0.0119, "step": 1753 }, { "epoch": 0.34839606713675636, "grad_norm": 0.6919236767327321, "learning_rate": 7.570254988381801e-06, "loss": 0.0134, "step": 1754 }, { "epoch": 0.34859469659350484, "grad_norm": 0.5814649266815375, "learning_rate": 7.567494592277031e-06, "loss": 0.017, "step": 1755 }, { "epoch": 0.34879332605025326, "grad_norm": 0.5249271812496219, "learning_rate": 7.564733132975321e-06, "loss": 0.011, "step": 1756 }, { "epoch": 0.3489919555070017, "grad_norm": 0.3963080625064441, "learning_rate": 7.561970611620191e-06, "loss": 0.0111, "step": 1757 }, { "epoch": 0.3491905849637501, "grad_norm": 0.7223775018258682, "learning_rate": 7.559207029355593e-06, "loss": 0.0173, "step": 1758 }, { "epoch": 0.3493892144204986, "grad_norm": 0.50134557033894, "learning_rate": 7.5564423873259306e-06, "loss": 0.0104, "step": 1759 }, { "epoch": 0.349587843877247, "grad_norm": 0.6488452488216098, "learning_rate": 7.553676686676034e-06, "loss": 0.0175, "step": 1760 }, { "epoch": 0.34978647333399543, "grad_norm": 0.49384847104699575, "learning_rate": 7.5509099285511775e-06, "loss": 0.0141, "step": 1761 }, { "epoch": 0.34998510279074385, "grad_norm": 0.3953416148883208, "learning_rate": 7.548142114097077e-06, "loss": 0.012, "step": 1762 }, { "epoch": 0.35018373224749233, "grad_norm": 0.666969527433996, "learning_rate": 7.545373244459877e-06, "loss": 0.0129, "step": 1763 }, { "epoch": 0.35038236170424075, "grad_norm": 0.561354252191077, "learning_rate": 7.542603320786166e-06, "loss": 0.0208, "step": 1764 }, { "epoch": 0.3505809911609892, "grad_norm": 0.6255119965679132, "learning_rate": 7.539832344222966e-06, "loss": 0.0152, "step": 1765 }, { "epoch": 0.3507796206177376, "grad_norm": 0.4306557341184174, "learning_rate": 7.537060315917734e-06, "loss": 0.0211, "step": 1766 }, { "epoch": 0.350978250074486, "grad_norm": 0.5528048136181183, "learning_rate": 7.53428723701837e-06, "loss": 0.0238, "step": 1767 }, { "epoch": 0.3511768795312345, "grad_norm": 0.5116386343446967, "learning_rate": 7.531513108673196e-06, "loss": 0.0172, "step": 1768 }, { "epoch": 0.3513755089879829, "grad_norm": 0.4279479662462199, "learning_rate": 7.528737932030978e-06, "loss": 0.0134, "step": 1769 }, { "epoch": 0.35157413844473134, "grad_norm": 0.3728795654509069, "learning_rate": 7.5259617082409165e-06, "loss": 0.016, "step": 1770 }, { "epoch": 0.35177276790147977, "grad_norm": 0.3025444908515605, "learning_rate": 7.52318443845264e-06, "loss": 0.0122, "step": 1771 }, { "epoch": 0.35197139735822824, "grad_norm": 0.5225986004045365, "learning_rate": 7.520406123816215e-06, "loss": 0.0126, "step": 1772 }, { "epoch": 0.35217002681497667, "grad_norm": 0.22475056903545595, "learning_rate": 7.517626765482139e-06, "loss": 0.0075, "step": 1773 }, { "epoch": 0.3523686562717251, "grad_norm": 0.6644702771927852, "learning_rate": 7.5148463646013405e-06, "loss": 0.0201, "step": 1774 }, { "epoch": 0.3525672857284735, "grad_norm": 0.21757342206133162, "learning_rate": 7.512064922325179e-06, "loss": 0.007, "step": 1775 }, { "epoch": 0.352765915185222, "grad_norm": 0.7067021760903283, "learning_rate": 7.50928243980545e-06, "loss": 0.014, "step": 1776 }, { "epoch": 0.3529645446419704, "grad_norm": 0.6595482172027973, "learning_rate": 7.506498918194376e-06, "loss": 0.0117, "step": 1777 }, { "epoch": 0.35316317409871884, "grad_norm": 0.6788932987088132, "learning_rate": 7.5037143586446095e-06, "loss": 0.0179, "step": 1778 }, { "epoch": 0.35336180355546726, "grad_norm": 0.28064297644399, "learning_rate": 7.500928762309234e-06, "loss": 0.0086, "step": 1779 }, { "epoch": 0.35356043301221574, "grad_norm": 0.5596372651009542, "learning_rate": 7.498142130341764e-06, "loss": 0.0166, "step": 1780 }, { "epoch": 0.35375906246896416, "grad_norm": 0.7073177849186396, "learning_rate": 7.495354463896137e-06, "loss": 0.0095, "step": 1781 }, { "epoch": 0.3539576919257126, "grad_norm": 0.5066305988153633, "learning_rate": 7.492565764126728e-06, "loss": 0.0104, "step": 1782 }, { "epoch": 0.354156321382461, "grad_norm": 0.44054068142958164, "learning_rate": 7.4897760321883295e-06, "loss": 0.0085, "step": 1783 }, { "epoch": 0.3543549508392095, "grad_norm": 0.4553697052742875, "learning_rate": 7.486985269236171e-06, "loss": 0.0113, "step": 1784 }, { "epoch": 0.3545535802959579, "grad_norm": 0.5212306828856347, "learning_rate": 7.4841934764259025e-06, "loss": 0.0182, "step": 1785 }, { "epoch": 0.3547522097527063, "grad_norm": 0.31537340873334424, "learning_rate": 7.481400654913606e-06, "loss": 0.0095, "step": 1786 }, { "epoch": 0.35495083920945475, "grad_norm": 0.7414978066014976, "learning_rate": 7.47860680585578e-06, "loss": 0.0207, "step": 1787 }, { "epoch": 0.35514946866620317, "grad_norm": 0.6019848486242421, "learning_rate": 7.475811930409359e-06, "loss": 0.0157, "step": 1788 }, { "epoch": 0.35534809812295165, "grad_norm": 0.3030605279151099, "learning_rate": 7.473016029731696e-06, "loss": 0.0067, "step": 1789 }, { "epoch": 0.3555467275797001, "grad_norm": 1.415851763954458, "learning_rate": 7.470219104980572e-06, "loss": 0.0157, "step": 1790 }, { "epoch": 0.3557453570364485, "grad_norm": 0.3333734372525097, "learning_rate": 7.467421157314191e-06, "loss": 0.0102, "step": 1791 }, { "epoch": 0.3559439864931969, "grad_norm": 0.3295733399959379, "learning_rate": 7.464622187891179e-06, "loss": 0.0093, "step": 1792 }, { "epoch": 0.3561426159499454, "grad_norm": 0.5348317682346659, "learning_rate": 7.4618221978705875e-06, "loss": 0.0225, "step": 1793 }, { "epoch": 0.3563412454066938, "grad_norm": 0.4680465985095257, "learning_rate": 7.45902118841189e-06, "loss": 0.0098, "step": 1794 }, { "epoch": 0.35653987486344224, "grad_norm": 0.5680929119198581, "learning_rate": 7.45621916067498e-06, "loss": 0.0154, "step": 1795 }, { "epoch": 0.35673850432019066, "grad_norm": 0.7557185174597132, "learning_rate": 7.453416115820173e-06, "loss": 0.0157, "step": 1796 }, { "epoch": 0.35693713377693914, "grad_norm": 0.6641326101462152, "learning_rate": 7.4506120550082125e-06, "loss": 0.0158, "step": 1797 }, { "epoch": 0.35713576323368756, "grad_norm": 0.7831104663272818, "learning_rate": 7.447806979400255e-06, "loss": 0.0126, "step": 1798 }, { "epoch": 0.357334392690436, "grad_norm": 0.4053205882012716, "learning_rate": 7.445000890157876e-06, "loss": 0.0114, "step": 1799 }, { "epoch": 0.3575330221471844, "grad_norm": 0.5298271130207239, "learning_rate": 7.442193788443078e-06, "loss": 0.0094, "step": 1800 }, { "epoch": 0.3577316516039329, "grad_norm": 0.9078634591414197, "learning_rate": 7.439385675418278e-06, "loss": 0.0128, "step": 1801 }, { "epoch": 0.3579302810606813, "grad_norm": 0.4697586290419777, "learning_rate": 7.436576552246312e-06, "loss": 0.0103, "step": 1802 }, { "epoch": 0.35812891051742973, "grad_norm": 0.9172712379027379, "learning_rate": 7.433766420090436e-06, "loss": 0.0198, "step": 1803 }, { "epoch": 0.35832753997417816, "grad_norm": 0.45567523468910553, "learning_rate": 7.430955280114322e-06, "loss": 0.0119, "step": 1804 }, { "epoch": 0.35852616943092663, "grad_norm": 0.7802133005429396, "learning_rate": 7.428143133482063e-06, "loss": 0.009, "step": 1805 }, { "epoch": 0.35872479888767506, "grad_norm": 0.6423209564757857, "learning_rate": 7.425329981358163e-06, "loss": 0.0214, "step": 1806 }, { "epoch": 0.3589234283444235, "grad_norm": 0.49872407430644544, "learning_rate": 7.422515824907546e-06, "loss": 0.0155, "step": 1807 }, { "epoch": 0.3591220578011719, "grad_norm": 0.40918220122528187, "learning_rate": 7.419700665295551e-06, "loss": 0.0113, "step": 1808 }, { "epoch": 0.3593206872579203, "grad_norm": 0.4717139361458346, "learning_rate": 7.416884503687936e-06, "loss": 0.0089, "step": 1809 }, { "epoch": 0.3595193167146688, "grad_norm": 0.7744599282851102, "learning_rate": 7.414067341250868e-06, "loss": 0.0175, "step": 1810 }, { "epoch": 0.3597179461714172, "grad_norm": 0.7426285798479693, "learning_rate": 7.41124917915093e-06, "loss": 0.018, "step": 1811 }, { "epoch": 0.35991657562816565, "grad_norm": 0.3054174866414779, "learning_rate": 7.408430018555122e-06, "loss": 0.0108, "step": 1812 }, { "epoch": 0.36011520508491407, "grad_norm": 0.44806274127264745, "learning_rate": 7.405609860630855e-06, "loss": 0.0159, "step": 1813 }, { "epoch": 0.36031383454166255, "grad_norm": 0.45456734941331584, "learning_rate": 7.402788706545953e-06, "loss": 0.0114, "step": 1814 }, { "epoch": 0.36051246399841097, "grad_norm": 0.41653981798920153, "learning_rate": 7.3999665574686566e-06, "loss": 0.0167, "step": 1815 }, { "epoch": 0.3607110934551594, "grad_norm": 0.42591262940535, "learning_rate": 7.39714341456761e-06, "loss": 0.012, "step": 1816 }, { "epoch": 0.3609097229119078, "grad_norm": 0.5559851848169303, "learning_rate": 7.394319279011877e-06, "loss": 0.016, "step": 1817 }, { "epoch": 0.3611083523686563, "grad_norm": 0.3959904663844897, "learning_rate": 7.391494151970928e-06, "loss": 0.0123, "step": 1818 }, { "epoch": 0.3613069818254047, "grad_norm": 0.6179966278625685, "learning_rate": 7.388668034614645e-06, "loss": 0.0135, "step": 1819 }, { "epoch": 0.36150561128215314, "grad_norm": 0.37571678679572235, "learning_rate": 7.385840928113321e-06, "loss": 0.01, "step": 1820 }, { "epoch": 0.36170424073890156, "grad_norm": 0.4874416376117431, "learning_rate": 7.383012833637657e-06, "loss": 0.0111, "step": 1821 }, { "epoch": 0.36190287019565004, "grad_norm": 0.4693254096635751, "learning_rate": 7.380183752358768e-06, "loss": 0.0154, "step": 1822 }, { "epoch": 0.36210149965239846, "grad_norm": 0.6119621192596436, "learning_rate": 7.37735368544817e-06, "loss": 0.0187, "step": 1823 }, { "epoch": 0.3623001291091469, "grad_norm": 0.35374113186809264, "learning_rate": 7.37452263407779e-06, "loss": 0.0102, "step": 1824 }, { "epoch": 0.3624987585658953, "grad_norm": 0.7498478793443032, "learning_rate": 7.371690599419965e-06, "loss": 0.0147, "step": 1825 }, { "epoch": 0.3626973880226438, "grad_norm": 0.9295665798522207, "learning_rate": 7.3688575826474385e-06, "loss": 0.026, "step": 1826 }, { "epoch": 0.3628960174793922, "grad_norm": 0.523390784697274, "learning_rate": 7.3660235849333594e-06, "loss": 0.0169, "step": 1827 }, { "epoch": 0.36309464693614063, "grad_norm": 0.4162831159506049, "learning_rate": 7.363188607451283e-06, "loss": 0.0132, "step": 1828 }, { "epoch": 0.36329327639288905, "grad_norm": 0.6182223304228157, "learning_rate": 7.360352651375171e-06, "loss": 0.0138, "step": 1829 }, { "epoch": 0.3634919058496375, "grad_norm": 0.43001898999202587, "learning_rate": 7.35751571787939e-06, "loss": 0.0137, "step": 1830 }, { "epoch": 0.36369053530638595, "grad_norm": 0.5552465012209282, "learning_rate": 7.35467780813871e-06, "loss": 0.0103, "step": 1831 }, { "epoch": 0.3638891647631344, "grad_norm": 0.5016437757951848, "learning_rate": 7.3518389233283095e-06, "loss": 0.0094, "step": 1832 }, { "epoch": 0.3640877942198828, "grad_norm": 0.18076930934715546, "learning_rate": 7.348999064623763e-06, "loss": 0.0059, "step": 1833 }, { "epoch": 0.3642864236766312, "grad_norm": 0.817939106262419, "learning_rate": 7.34615823320106e-06, "loss": 0.021, "step": 1834 }, { "epoch": 0.3644850531333797, "grad_norm": 0.517907031178373, "learning_rate": 7.34331643023658e-06, "loss": 0.0129, "step": 1835 }, { "epoch": 0.3646836825901281, "grad_norm": 0.4513320397295181, "learning_rate": 7.340473656907113e-06, "loss": 0.0127, "step": 1836 }, { "epoch": 0.36488231204687654, "grad_norm": 0.7014290051670106, "learning_rate": 7.33762991438985e-06, "loss": 0.0126, "step": 1837 }, { "epoch": 0.36508094150362497, "grad_norm": 0.34486163648433743, "learning_rate": 7.334785203862378e-06, "loss": 0.0131, "step": 1838 }, { "epoch": 0.36527957096037345, "grad_norm": 0.42658500377007785, "learning_rate": 7.331939526502692e-06, "loss": 0.0148, "step": 1839 }, { "epoch": 0.36547820041712187, "grad_norm": 0.4012284928050507, "learning_rate": 7.329092883489184e-06, "loss": 0.0103, "step": 1840 }, { "epoch": 0.3656768298738703, "grad_norm": 0.6923841780226727, "learning_rate": 7.326245276000645e-06, "loss": 0.0173, "step": 1841 }, { "epoch": 0.3658754593306187, "grad_norm": 0.6204656956557247, "learning_rate": 7.323396705216267e-06, "loss": 0.0149, "step": 1842 }, { "epoch": 0.3660740887873672, "grad_norm": 0.43915467134030417, "learning_rate": 7.320547172315639e-06, "loss": 0.0115, "step": 1843 }, { "epoch": 0.3662727182441156, "grad_norm": 0.9453561263857528, "learning_rate": 7.317696678478752e-06, "loss": 0.0147, "step": 1844 }, { "epoch": 0.36647134770086404, "grad_norm": 0.4116346164383005, "learning_rate": 7.314845224885992e-06, "loss": 0.0132, "step": 1845 }, { "epoch": 0.36666997715761246, "grad_norm": 0.6010586200072675, "learning_rate": 7.31199281271814e-06, "loss": 0.016, "step": 1846 }, { "epoch": 0.36686860661436094, "grad_norm": 0.379756980977871, "learning_rate": 7.309139443156382e-06, "loss": 0.0084, "step": 1847 }, { "epoch": 0.36706723607110936, "grad_norm": 0.32996713284661944, "learning_rate": 7.306285117382292e-06, "loss": 0.0078, "step": 1848 }, { "epoch": 0.3672658655278578, "grad_norm": 0.3528111052658405, "learning_rate": 7.3034298365778455e-06, "loss": 0.0105, "step": 1849 }, { "epoch": 0.3674644949846062, "grad_norm": 0.32743299157906436, "learning_rate": 7.300573601925409e-06, "loss": 0.0129, "step": 1850 }, { "epoch": 0.3676631244413546, "grad_norm": 0.40089814984231775, "learning_rate": 7.297716414607747e-06, "loss": 0.0074, "step": 1851 }, { "epoch": 0.3678617538981031, "grad_norm": 0.8879963171017308, "learning_rate": 7.294858275808021e-06, "loss": 0.0158, "step": 1852 }, { "epoch": 0.36806038335485153, "grad_norm": 1.1591210573488786, "learning_rate": 7.29199918670978e-06, "loss": 0.0179, "step": 1853 }, { "epoch": 0.36825901281159995, "grad_norm": 0.5766334292979398, "learning_rate": 7.289139148496971e-06, "loss": 0.0134, "step": 1854 }, { "epoch": 0.3684576422683484, "grad_norm": 0.5074699980363011, "learning_rate": 7.286278162353934e-06, "loss": 0.0087, "step": 1855 }, { "epoch": 0.36865627172509685, "grad_norm": 0.5200342371854682, "learning_rate": 7.283416229465399e-06, "loss": 0.0154, "step": 1856 }, { "epoch": 0.3688549011818453, "grad_norm": 0.4688805862662405, "learning_rate": 7.280553351016489e-06, "loss": 0.0083, "step": 1857 }, { "epoch": 0.3690535306385937, "grad_norm": 0.5049930399031888, "learning_rate": 7.277689528192722e-06, "loss": 0.0113, "step": 1858 }, { "epoch": 0.3692521600953421, "grad_norm": 0.8576327838457353, "learning_rate": 7.2748247621800005e-06, "loss": 0.0151, "step": 1859 }, { "epoch": 0.3694507895520906, "grad_norm": 0.5565380252033648, "learning_rate": 7.271959054164623e-06, "loss": 0.0117, "step": 1860 }, { "epoch": 0.369649419008839, "grad_norm": 1.193299984883163, "learning_rate": 7.269092405333278e-06, "loss": 0.0153, "step": 1861 }, { "epoch": 0.36984804846558744, "grad_norm": 0.8109235962868375, "learning_rate": 7.26622481687304e-06, "loss": 0.0166, "step": 1862 }, { "epoch": 0.37004667792233586, "grad_norm": 0.7632333804661434, "learning_rate": 7.263356289971374e-06, "loss": 0.0169, "step": 1863 }, { "epoch": 0.37024530737908434, "grad_norm": 0.8232197885172541, "learning_rate": 7.260486825816134e-06, "loss": 0.0165, "step": 1864 }, { "epoch": 0.37044393683583277, "grad_norm": 0.4215445953865816, "learning_rate": 7.257616425595564e-06, "loss": 0.0146, "step": 1865 }, { "epoch": 0.3706425662925812, "grad_norm": 0.6077339267039444, "learning_rate": 7.254745090498294e-06, "loss": 0.0179, "step": 1866 }, { "epoch": 0.3708411957493296, "grad_norm": 0.8156039287786392, "learning_rate": 7.251872821713339e-06, "loss": 0.015, "step": 1867 }, { "epoch": 0.3710398252060781, "grad_norm": 0.4884625901216402, "learning_rate": 7.248999620430104e-06, "loss": 0.0076, "step": 1868 }, { "epoch": 0.3712384546628265, "grad_norm": 0.47130775104617345, "learning_rate": 7.246125487838378e-06, "loss": 0.012, "step": 1869 }, { "epoch": 0.37143708411957493, "grad_norm": 0.3343798674016324, "learning_rate": 7.243250425128337e-06, "loss": 0.0082, "step": 1870 }, { "epoch": 0.37163571357632336, "grad_norm": 0.2962776732176791, "learning_rate": 7.240374433490542e-06, "loss": 0.0092, "step": 1871 }, { "epoch": 0.3718343430330718, "grad_norm": 0.6443601303540771, "learning_rate": 7.237497514115937e-06, "loss": 0.0149, "step": 1872 }, { "epoch": 0.37203297248982026, "grad_norm": 0.6090312171198041, "learning_rate": 7.234619668195853e-06, "loss": 0.022, "step": 1873 }, { "epoch": 0.3722316019465687, "grad_norm": 1.008498309205369, "learning_rate": 7.231740896922e-06, "loss": 0.0181, "step": 1874 }, { "epoch": 0.3724302314033171, "grad_norm": 0.2263113111319505, "learning_rate": 7.228861201486479e-06, "loss": 0.0107, "step": 1875 }, { "epoch": 0.3726288608600655, "grad_norm": 1.0824047330601112, "learning_rate": 7.225980583081764e-06, "loss": 0.015, "step": 1876 }, { "epoch": 0.372827490316814, "grad_norm": 0.4519637815818529, "learning_rate": 7.2230990429007205e-06, "loss": 0.0167, "step": 1877 }, { "epoch": 0.3730261197735624, "grad_norm": 0.3996641096482069, "learning_rate": 7.2202165821365884e-06, "loss": 0.0099, "step": 1878 }, { "epoch": 0.37322474923031085, "grad_norm": 0.5247153139606248, "learning_rate": 7.217333201982994e-06, "loss": 0.0112, "step": 1879 }, { "epoch": 0.37342337868705927, "grad_norm": 0.6322765470012125, "learning_rate": 7.2144489036339414e-06, "loss": 0.01, "step": 1880 }, { "epoch": 0.37362200814380775, "grad_norm": 0.7784772761848201, "learning_rate": 7.211563688283815e-06, "loss": 0.0093, "step": 1881 }, { "epoch": 0.37382063760055617, "grad_norm": 0.489780522096192, "learning_rate": 7.20867755712738e-06, "loss": 0.0107, "step": 1882 }, { "epoch": 0.3740192670573046, "grad_norm": 0.4222831983087626, "learning_rate": 7.20579051135978e-06, "loss": 0.0133, "step": 1883 }, { "epoch": 0.374217896514053, "grad_norm": 0.39180257048360806, "learning_rate": 7.2029025521765395e-06, "loss": 0.0084, "step": 1884 }, { "epoch": 0.3744165259708015, "grad_norm": 0.328589453430018, "learning_rate": 7.200013680773556e-06, "loss": 0.0086, "step": 1885 }, { "epoch": 0.3746151554275499, "grad_norm": 0.8840476580346277, "learning_rate": 7.197123898347113e-06, "loss": 0.0197, "step": 1886 }, { "epoch": 0.37481378488429834, "grad_norm": 0.7445076848470832, "learning_rate": 7.194233206093862e-06, "loss": 0.0155, "step": 1887 }, { "epoch": 0.37501241434104676, "grad_norm": 0.823712336571771, "learning_rate": 7.1913416052108385e-06, "loss": 0.0138, "step": 1888 }, { "epoch": 0.37521104379779524, "grad_norm": 0.5026615807876057, "learning_rate": 7.18844909689545e-06, "loss": 0.0127, "step": 1889 }, { "epoch": 0.37540967325454366, "grad_norm": 0.5235102842153762, "learning_rate": 7.185555682345483e-06, "loss": 0.0167, "step": 1890 }, { "epoch": 0.3756083027112921, "grad_norm": 0.38122765851314344, "learning_rate": 7.182661362759096e-06, "loss": 0.016, "step": 1891 }, { "epoch": 0.3758069321680405, "grad_norm": 0.659952705620418, "learning_rate": 7.179766139334825e-06, "loss": 0.0217, "step": 1892 }, { "epoch": 0.37600556162478893, "grad_norm": 0.3460455608781431, "learning_rate": 7.1768700132715785e-06, "loss": 0.0122, "step": 1893 }, { "epoch": 0.3762041910815374, "grad_norm": 0.5470309221958665, "learning_rate": 7.173972985768639e-06, "loss": 0.0159, "step": 1894 }, { "epoch": 0.37640282053828583, "grad_norm": 0.609575691356759, "learning_rate": 7.171075058025664e-06, "loss": 0.015, "step": 1895 }, { "epoch": 0.37660144999503425, "grad_norm": 0.6256172821754186, "learning_rate": 7.168176231242681e-06, "loss": 0.0091, "step": 1896 }, { "epoch": 0.3768000794517827, "grad_norm": 0.43669928645159256, "learning_rate": 7.165276506620092e-06, "loss": 0.0108, "step": 1897 }, { "epoch": 0.37699870890853115, "grad_norm": 0.452629904550346, "learning_rate": 7.16237588535867e-06, "loss": 0.0131, "step": 1898 }, { "epoch": 0.3771973383652796, "grad_norm": 0.5306623947422398, "learning_rate": 7.159474368659559e-06, "loss": 0.0101, "step": 1899 }, { "epoch": 0.377395967822028, "grad_norm": 0.877007063554911, "learning_rate": 7.156571957724275e-06, "loss": 0.0221, "step": 1900 }, { "epoch": 0.3775945972787764, "grad_norm": 0.5158241756842965, "learning_rate": 7.153668653754702e-06, "loss": 0.0129, "step": 1901 }, { "epoch": 0.3777932267355249, "grad_norm": 0.3126658544481895, "learning_rate": 7.150764457953096e-06, "loss": 0.0131, "step": 1902 }, { "epoch": 0.3779918561922733, "grad_norm": 0.8119239874875152, "learning_rate": 7.147859371522083e-06, "loss": 0.0149, "step": 1903 }, { "epoch": 0.37819048564902175, "grad_norm": 0.37864224886279996, "learning_rate": 7.1449533956646555e-06, "loss": 0.0166, "step": 1904 }, { "epoch": 0.37838911510577017, "grad_norm": 0.31041396376686636, "learning_rate": 7.142046531584176e-06, "loss": 0.0077, "step": 1905 }, { "epoch": 0.37858774456251865, "grad_norm": 0.44678399616930514, "learning_rate": 7.139138780484371e-06, "loss": 0.0147, "step": 1906 }, { "epoch": 0.37878637401926707, "grad_norm": 0.6814421963342644, "learning_rate": 7.136230143569338e-06, "loss": 0.0092, "step": 1907 }, { "epoch": 0.3789850034760155, "grad_norm": 0.5699394737446986, "learning_rate": 7.133320622043544e-06, "loss": 0.0095, "step": 1908 }, { "epoch": 0.3791836329327639, "grad_norm": 0.5118628899874634, "learning_rate": 7.1304102171118165e-06, "loss": 0.018, "step": 1909 }, { "epoch": 0.37938226238951234, "grad_norm": 0.8450993817197867, "learning_rate": 7.12749892997935e-06, "loss": 0.0152, "step": 1910 }, { "epoch": 0.3795808918462608, "grad_norm": 0.5936810791362285, "learning_rate": 7.124586761851709e-06, "loss": 0.0137, "step": 1911 }, { "epoch": 0.37977952130300924, "grad_norm": 0.7147959398627135, "learning_rate": 7.121673713934816e-06, "loss": 0.0075, "step": 1912 }, { "epoch": 0.37997815075975766, "grad_norm": 0.3268090768205218, "learning_rate": 7.1187597874349635e-06, "loss": 0.0086, "step": 1913 }, { "epoch": 0.3801767802165061, "grad_norm": 0.8429071661543824, "learning_rate": 7.115844983558804e-06, "loss": 0.0164, "step": 1914 }, { "epoch": 0.38037540967325456, "grad_norm": 0.5508174982198967, "learning_rate": 7.112929303513356e-06, "loss": 0.0185, "step": 1915 }, { "epoch": 0.380574039130003, "grad_norm": 0.8107547717459483, "learning_rate": 7.110012748506e-06, "loss": 0.0191, "step": 1916 }, { "epoch": 0.3807726685867514, "grad_norm": 0.4591386080076129, "learning_rate": 7.107095319744479e-06, "loss": 0.0108, "step": 1917 }, { "epoch": 0.38097129804349983, "grad_norm": 0.7765034782212996, "learning_rate": 7.1041770184368945e-06, "loss": 0.0119, "step": 1918 }, { "epoch": 0.3811699275002483, "grad_norm": 0.5274182540525633, "learning_rate": 7.101257845791714e-06, "loss": 0.0141, "step": 1919 }, { "epoch": 0.38136855695699673, "grad_norm": 0.3603680268183808, "learning_rate": 7.098337803017763e-06, "loss": 0.0087, "step": 1920 }, { "epoch": 0.38156718641374515, "grad_norm": 0.47632384862279487, "learning_rate": 7.095416891324231e-06, "loss": 0.0083, "step": 1921 }, { "epoch": 0.3817658158704936, "grad_norm": 0.6379591206386355, "learning_rate": 7.0924951119206605e-06, "loss": 0.0113, "step": 1922 }, { "epoch": 0.38196444532724205, "grad_norm": 0.5608777505358337, "learning_rate": 7.0895724660169615e-06, "loss": 0.0159, "step": 1923 }, { "epoch": 0.3821630747839905, "grad_norm": 0.45075216436612026, "learning_rate": 7.086648954823396e-06, "loss": 0.0103, "step": 1924 }, { "epoch": 0.3823617042407389, "grad_norm": 0.8387316921379768, "learning_rate": 7.083724579550588e-06, "loss": 0.0248, "step": 1925 }, { "epoch": 0.3825603336974873, "grad_norm": 0.504954461339415, "learning_rate": 7.080799341409518e-06, "loss": 0.0162, "step": 1926 }, { "epoch": 0.3827589631542358, "grad_norm": 0.633175612122431, "learning_rate": 7.077873241611525e-06, "loss": 0.0134, "step": 1927 }, { "epoch": 0.3829575926109842, "grad_norm": 0.5726947868855516, "learning_rate": 7.074946281368304e-06, "loss": 0.0154, "step": 1928 }, { "epoch": 0.38315622206773264, "grad_norm": 0.27366180430338843, "learning_rate": 7.072018461891906e-06, "loss": 0.0122, "step": 1929 }, { "epoch": 0.38335485152448107, "grad_norm": 0.3847532093947302, "learning_rate": 7.069089784394737e-06, "loss": 0.01, "step": 1930 }, { "epoch": 0.3835534809812295, "grad_norm": 0.40845059746079004, "learning_rate": 7.066160250089561e-06, "loss": 0.0101, "step": 1931 }, { "epoch": 0.38375211043797797, "grad_norm": 0.4485746408020155, "learning_rate": 7.063229860189493e-06, "loss": 0.0105, "step": 1932 }, { "epoch": 0.3839507398947264, "grad_norm": 0.9400839412508927, "learning_rate": 7.060298615908006e-06, "loss": 0.0153, "step": 1933 }, { "epoch": 0.3841493693514748, "grad_norm": 0.3182534498151112, "learning_rate": 7.057366518458928e-06, "loss": 0.0111, "step": 1934 }, { "epoch": 0.38434799880822323, "grad_norm": 0.4800246818941992, "learning_rate": 7.0544335690564334e-06, "loss": 0.0209, "step": 1935 }, { "epoch": 0.3845466282649717, "grad_norm": 0.46973294531377413, "learning_rate": 7.051499768915056e-06, "loss": 0.0114, "step": 1936 }, { "epoch": 0.38474525772172014, "grad_norm": 0.7859855280724575, "learning_rate": 7.048565119249677e-06, "loss": 0.0141, "step": 1937 }, { "epoch": 0.38494388717846856, "grad_norm": 0.6876110737606786, "learning_rate": 7.0456296212755344e-06, "loss": 0.0128, "step": 1938 }, { "epoch": 0.385142516635217, "grad_norm": 0.46899691256126286, "learning_rate": 7.0426932762082135e-06, "loss": 0.0093, "step": 1939 }, { "epoch": 0.38534114609196546, "grad_norm": 0.3513359380168546, "learning_rate": 7.039756085263654e-06, "loss": 0.0072, "step": 1940 }, { "epoch": 0.3855397755487139, "grad_norm": 0.6075133338918507, "learning_rate": 7.036818049658143e-06, "loss": 0.0143, "step": 1941 }, { "epoch": 0.3857384050054623, "grad_norm": 0.7662029488077239, "learning_rate": 7.0338791706083155e-06, "loss": 0.0132, "step": 1942 }, { "epoch": 0.3859370344622107, "grad_norm": 0.628109956535636, "learning_rate": 7.03093944933116e-06, "loss": 0.016, "step": 1943 }, { "epoch": 0.3861356639189592, "grad_norm": 0.19803867516817825, "learning_rate": 7.027998887044013e-06, "loss": 0.0052, "step": 1944 }, { "epoch": 0.3863342933757076, "grad_norm": 0.662751247765471, "learning_rate": 7.025057484964558e-06, "loss": 0.0104, "step": 1945 }, { "epoch": 0.38653292283245605, "grad_norm": 0.6819086619122386, "learning_rate": 7.022115244310826e-06, "loss": 0.0113, "step": 1946 }, { "epoch": 0.38673155228920447, "grad_norm": 1.5798621153907724, "learning_rate": 7.019172166301197e-06, "loss": 0.0223, "step": 1947 }, { "epoch": 0.38693018174595295, "grad_norm": 0.5987945163370364, "learning_rate": 7.016228252154396e-06, "loss": 0.0121, "step": 1948 }, { "epoch": 0.3871288112027014, "grad_norm": 0.9981941212502284, "learning_rate": 7.013283503089494e-06, "loss": 0.0115, "step": 1949 }, { "epoch": 0.3873274406594498, "grad_norm": 1.0406382038091888, "learning_rate": 7.01033792032591e-06, "loss": 0.0248, "step": 1950 }, { "epoch": 0.3875260701161982, "grad_norm": 0.41775245079601303, "learning_rate": 7.007391505083405e-06, "loss": 0.0104, "step": 1951 }, { "epoch": 0.38772469957294664, "grad_norm": 0.5898166538524512, "learning_rate": 7.004444258582091e-06, "loss": 0.0129, "step": 1952 }, { "epoch": 0.3879233290296951, "grad_norm": 0.7611476458360803, "learning_rate": 7.001496182042416e-06, "loss": 0.0157, "step": 1953 }, { "epoch": 0.38812195848644354, "grad_norm": 0.33055910901797114, "learning_rate": 6.998547276685175e-06, "loss": 0.0107, "step": 1954 }, { "epoch": 0.38832058794319196, "grad_norm": 0.48354068516638127, "learning_rate": 6.995597543731509e-06, "loss": 0.0085, "step": 1955 }, { "epoch": 0.3885192173999404, "grad_norm": 1.3859038603039011, "learning_rate": 6.992646984402898e-06, "loss": 0.022, "step": 1956 }, { "epoch": 0.38871784685668886, "grad_norm": 0.9177731485118693, "learning_rate": 6.989695599921166e-06, "loss": 0.0192, "step": 1957 }, { "epoch": 0.3889164763134373, "grad_norm": 0.6987553756434028, "learning_rate": 6.986743391508479e-06, "loss": 0.0182, "step": 1958 }, { "epoch": 0.3891151057701857, "grad_norm": 0.40997833208509904, "learning_rate": 6.983790360387344e-06, "loss": 0.0088, "step": 1959 }, { "epoch": 0.38931373522693413, "grad_norm": 0.34916159941815383, "learning_rate": 6.980836507780606e-06, "loss": 0.0128, "step": 1960 }, { "epoch": 0.3895123646836826, "grad_norm": 0.7820977108647957, "learning_rate": 6.977881834911455e-06, "loss": 0.0122, "step": 1961 }, { "epoch": 0.38971099414043103, "grad_norm": 0.9305707403285578, "learning_rate": 6.974926343003416e-06, "loss": 0.0128, "step": 1962 }, { "epoch": 0.38990962359717946, "grad_norm": 0.682174849117662, "learning_rate": 6.971970033280356e-06, "loss": 0.0108, "step": 1963 }, { "epoch": 0.3901082530539279, "grad_norm": 0.5350787998659173, "learning_rate": 6.96901290696648e-06, "loss": 0.0154, "step": 1964 }, { "epoch": 0.39030688251067636, "grad_norm": 0.3819759718576897, "learning_rate": 6.966054965286333e-06, "loss": 0.0098, "step": 1965 }, { "epoch": 0.3905055119674248, "grad_norm": 0.5388430417097986, "learning_rate": 6.963096209464793e-06, "loss": 0.0144, "step": 1966 }, { "epoch": 0.3907041414241732, "grad_norm": 0.5604706168748357, "learning_rate": 6.9601366407270766e-06, "loss": 0.0157, "step": 1967 }, { "epoch": 0.3909027708809216, "grad_norm": 0.5074402312211344, "learning_rate": 6.957176260298742e-06, "loss": 0.008, "step": 1968 }, { "epoch": 0.3911014003376701, "grad_norm": 0.37696620230600386, "learning_rate": 6.954215069405677e-06, "loss": 0.0164, "step": 1969 }, { "epoch": 0.3913000297944185, "grad_norm": 0.7845133541601853, "learning_rate": 6.951253069274109e-06, "loss": 0.013, "step": 1970 }, { "epoch": 0.39149865925116695, "grad_norm": 0.6377358194064576, "learning_rate": 6.948290261130598e-06, "loss": 0.0108, "step": 1971 }, { "epoch": 0.39169728870791537, "grad_norm": 0.37305516874735534, "learning_rate": 6.945326646202042e-06, "loss": 0.0118, "step": 1972 }, { "epoch": 0.3918959181646638, "grad_norm": 0.6101758865406128, "learning_rate": 6.942362225715671e-06, "loss": 0.0072, "step": 1973 }, { "epoch": 0.39209454762141227, "grad_norm": 0.6669153608848816, "learning_rate": 6.939397000899046e-06, "loss": 0.0164, "step": 1974 }, { "epoch": 0.3922931770781607, "grad_norm": 0.4434397317662605, "learning_rate": 6.936430972980066e-06, "loss": 0.0142, "step": 1975 }, { "epoch": 0.3924918065349091, "grad_norm": 1.1486290060634483, "learning_rate": 6.933464143186958e-06, "loss": 0.0246, "step": 1976 }, { "epoch": 0.39269043599165754, "grad_norm": 0.4402208692427919, "learning_rate": 6.930496512748286e-06, "loss": 0.0142, "step": 1977 }, { "epoch": 0.392889065448406, "grad_norm": 0.8605918213090531, "learning_rate": 6.927528082892941e-06, "loss": 0.0162, "step": 1978 }, { "epoch": 0.39308769490515444, "grad_norm": 0.602808108046716, "learning_rate": 6.924558854850146e-06, "loss": 0.0202, "step": 1979 }, { "epoch": 0.39328632436190286, "grad_norm": 1.044860369928853, "learning_rate": 6.921588829849458e-06, "loss": 0.0239, "step": 1980 }, { "epoch": 0.3934849538186513, "grad_norm": 1.0031230137699616, "learning_rate": 6.918618009120759e-06, "loss": 0.021, "step": 1981 }, { "epoch": 0.39368358327539976, "grad_norm": 0.4596310481079393, "learning_rate": 6.915646393894263e-06, "loss": 0.0126, "step": 1982 }, { "epoch": 0.3938822127321482, "grad_norm": 0.48189428114732397, "learning_rate": 6.912673985400515e-06, "loss": 0.0119, "step": 1983 }, { "epoch": 0.3940808421888966, "grad_norm": 0.4738322562132466, "learning_rate": 6.909700784870384e-06, "loss": 0.021, "step": 1984 }, { "epoch": 0.39427947164564503, "grad_norm": 0.41216128439817645, "learning_rate": 6.906726793535072e-06, "loss": 0.0141, "step": 1985 }, { "epoch": 0.3944781011023935, "grad_norm": 0.33966794763975494, "learning_rate": 6.903752012626104e-06, "loss": 0.012, "step": 1986 }, { "epoch": 0.39467673055914193, "grad_norm": 0.2801511238879822, "learning_rate": 6.9007764433753324e-06, "loss": 0.0155, "step": 1987 }, { "epoch": 0.39487536001589035, "grad_norm": 0.16933671446526258, "learning_rate": 6.897800087014939e-06, "loss": 0.0064, "step": 1988 }, { "epoch": 0.3950739894726388, "grad_norm": 0.8536198892078616, "learning_rate": 6.894822944777433e-06, "loss": 0.0156, "step": 1989 }, { "epoch": 0.39527261892938725, "grad_norm": 0.2747101542433025, "learning_rate": 6.891845017895641e-06, "loss": 0.0069, "step": 1990 }, { "epoch": 0.3954712483861357, "grad_norm": 0.5826546759975959, "learning_rate": 6.888866307602722e-06, "loss": 0.0206, "step": 1991 }, { "epoch": 0.3956698778428841, "grad_norm": 0.5732473635409778, "learning_rate": 6.885886815132156e-06, "loss": 0.0144, "step": 1992 }, { "epoch": 0.3958685072996325, "grad_norm": 0.25716654176152753, "learning_rate": 6.882906541717749e-06, "loss": 0.0091, "step": 1993 }, { "epoch": 0.39606713675638094, "grad_norm": 0.3022114556941632, "learning_rate": 6.879925488593629e-06, "loss": 0.009, "step": 1994 }, { "epoch": 0.3962657662131294, "grad_norm": 0.4902950717878552, "learning_rate": 6.876943656994246e-06, "loss": 0.0113, "step": 1995 }, { "epoch": 0.39646439566987784, "grad_norm": 0.46038753699336915, "learning_rate": 6.873961048154374e-06, "loss": 0.0146, "step": 1996 }, { "epoch": 0.39666302512662627, "grad_norm": 0.6323728442617919, "learning_rate": 6.8709776633091085e-06, "loss": 0.011, "step": 1997 }, { "epoch": 0.3968616545833747, "grad_norm": 0.4427297026630537, "learning_rate": 6.867993503693868e-06, "loss": 0.0168, "step": 1998 }, { "epoch": 0.39706028404012317, "grad_norm": 0.4675745693620594, "learning_rate": 6.865008570544387e-06, "loss": 0.0155, "step": 1999 }, { "epoch": 0.3972589134968716, "grad_norm": 1.0376746039699174, "learning_rate": 6.862022865096724e-06, "loss": 0.0198, "step": 2000 }, { "epoch": 0.39745754295362, "grad_norm": 0.8009967163692941, "learning_rate": 6.859036388587259e-06, "loss": 0.0097, "step": 2001 }, { "epoch": 0.39765617241036844, "grad_norm": 0.6394354987730926, "learning_rate": 6.856049142252687e-06, "loss": 0.011, "step": 2002 }, { "epoch": 0.3978548018671169, "grad_norm": 1.2279342276594911, "learning_rate": 6.853061127330023e-06, "loss": 0.0207, "step": 2003 }, { "epoch": 0.39805343132386534, "grad_norm": 0.5253473078917413, "learning_rate": 6.850072345056602e-06, "loss": 0.0119, "step": 2004 }, { "epoch": 0.39825206078061376, "grad_norm": 0.4416957674480141, "learning_rate": 6.847082796670075e-06, "loss": 0.0116, "step": 2005 }, { "epoch": 0.3984506902373622, "grad_norm": 0.36052345349554943, "learning_rate": 6.844092483408411e-06, "loss": 0.0119, "step": 2006 }, { "epoch": 0.39864931969411066, "grad_norm": 0.5852938989301314, "learning_rate": 6.8411014065098976e-06, "loss": 0.0155, "step": 2007 }, { "epoch": 0.3988479491508591, "grad_norm": 0.7611535304666339, "learning_rate": 6.838109567213136e-06, "loss": 0.011, "step": 2008 }, { "epoch": 0.3990465786076075, "grad_norm": 0.6832215844317394, "learning_rate": 6.835116966757042e-06, "loss": 0.0128, "step": 2009 }, { "epoch": 0.3992452080643559, "grad_norm": 0.35682225109643834, "learning_rate": 6.832123606380852e-06, "loss": 0.0058, "step": 2010 }, { "epoch": 0.3994438375211044, "grad_norm": 0.4423053994022838, "learning_rate": 6.82912948732411e-06, "loss": 0.0129, "step": 2011 }, { "epoch": 0.39964246697785283, "grad_norm": 0.8397841328233963, "learning_rate": 6.826134610826678e-06, "loss": 0.0142, "step": 2012 }, { "epoch": 0.39984109643460125, "grad_norm": 0.7620187002515791, "learning_rate": 6.8231389781287334e-06, "loss": 0.0191, "step": 2013 }, { "epoch": 0.4000397258913497, "grad_norm": 0.41386810405085306, "learning_rate": 6.820142590470764e-06, "loss": 0.0129, "step": 2014 }, { "epoch": 0.4002383553480981, "grad_norm": 0.7141109172857413, "learning_rate": 6.81714544909357e-06, "loss": 0.0183, "step": 2015 }, { "epoch": 0.4004369848048466, "grad_norm": 0.3552446795117014, "learning_rate": 6.8141475552382665e-06, "loss": 0.0094, "step": 2016 }, { "epoch": 0.400635614261595, "grad_norm": 0.44775694205320965, "learning_rate": 6.8111489101462755e-06, "loss": 0.0126, "step": 2017 }, { "epoch": 0.4008342437183434, "grad_norm": 0.5103022028781918, "learning_rate": 6.808149515059336e-06, "loss": 0.015, "step": 2018 }, { "epoch": 0.40103287317509184, "grad_norm": 1.1270462570715771, "learning_rate": 6.805149371219491e-06, "loss": 0.0214, "step": 2019 }, { "epoch": 0.4012315026318403, "grad_norm": 0.49338047514807803, "learning_rate": 6.8021484798691005e-06, "loss": 0.0094, "step": 2020 }, { "epoch": 0.40143013208858874, "grad_norm": 0.4300331782209231, "learning_rate": 6.799146842250829e-06, "loss": 0.0163, "step": 2021 }, { "epoch": 0.40162876154533717, "grad_norm": 0.33283287964762853, "learning_rate": 6.796144459607652e-06, "loss": 0.0094, "step": 2022 }, { "epoch": 0.4018273910020856, "grad_norm": 0.3659723339747389, "learning_rate": 6.793141333182853e-06, "loss": 0.0111, "step": 2023 }, { "epoch": 0.40202602045883407, "grad_norm": 0.49565704935423, "learning_rate": 6.7901374642200235e-06, "loss": 0.0164, "step": 2024 }, { "epoch": 0.4022246499155825, "grad_norm": 0.6254189475535632, "learning_rate": 6.787132853963063e-06, "loss": 0.0145, "step": 2025 }, { "epoch": 0.4024232793723309, "grad_norm": 0.39002228377365256, "learning_rate": 6.784127503656177e-06, "loss": 0.0087, "step": 2026 }, { "epoch": 0.40262190882907933, "grad_norm": 0.2860968118458668, "learning_rate": 6.781121414543878e-06, "loss": 0.0076, "step": 2027 }, { "epoch": 0.4028205382858278, "grad_norm": 0.37170294728652653, "learning_rate": 6.778114587870985e-06, "loss": 0.0104, "step": 2028 }, { "epoch": 0.40301916774257623, "grad_norm": 0.4036659979664484, "learning_rate": 6.775107024882623e-06, "loss": 0.007, "step": 2029 }, { "epoch": 0.40321779719932466, "grad_norm": 0.5549451713316448, "learning_rate": 6.772098726824219e-06, "loss": 0.0119, "step": 2030 }, { "epoch": 0.4034164266560731, "grad_norm": 0.9605322431392109, "learning_rate": 6.769089694941506e-06, "loss": 0.0165, "step": 2031 }, { "epoch": 0.40361505611282156, "grad_norm": 0.21818067243379816, "learning_rate": 6.766079930480523e-06, "loss": 0.0055, "step": 2032 }, { "epoch": 0.40381368556957, "grad_norm": 0.394154840294968, "learning_rate": 6.763069434687608e-06, "loss": 0.0177, "step": 2033 }, { "epoch": 0.4040123150263184, "grad_norm": 0.4819653179358817, "learning_rate": 6.760058208809407e-06, "loss": 0.0123, "step": 2034 }, { "epoch": 0.4042109444830668, "grad_norm": 1.280679469593761, "learning_rate": 6.757046254092865e-06, "loss": 0.016, "step": 2035 }, { "epoch": 0.40440957393981525, "grad_norm": 0.5941803654709766, "learning_rate": 6.754033571785227e-06, "loss": 0.0104, "step": 2036 }, { "epoch": 0.4046082033965637, "grad_norm": 0.5054580109994721, "learning_rate": 6.7510201631340445e-06, "loss": 0.0165, "step": 2037 }, { "epoch": 0.40480683285331215, "grad_norm": 0.5427400354352391, "learning_rate": 6.748006029387165e-06, "loss": 0.0102, "step": 2038 }, { "epoch": 0.40500546231006057, "grad_norm": 0.8181768765515004, "learning_rate": 6.744991171792741e-06, "loss": 0.0104, "step": 2039 }, { "epoch": 0.405204091766809, "grad_norm": 0.6029402222723133, "learning_rate": 6.741975591599219e-06, "loss": 0.0115, "step": 2040 }, { "epoch": 0.40540272122355747, "grad_norm": 0.5908122684007071, "learning_rate": 6.738959290055349e-06, "loss": 0.014, "step": 2041 }, { "epoch": 0.4056013506803059, "grad_norm": 0.2594953161338611, "learning_rate": 6.735942268410178e-06, "loss": 0.009, "step": 2042 }, { "epoch": 0.4057999801370543, "grad_norm": 0.9374727512612783, "learning_rate": 6.7329245279130525e-06, "loss": 0.0129, "step": 2043 }, { "epoch": 0.40599860959380274, "grad_norm": 0.978985312338324, "learning_rate": 6.7299060698136146e-06, "loss": 0.0153, "step": 2044 }, { "epoch": 0.4061972390505512, "grad_norm": 0.4019327020061037, "learning_rate": 6.726886895361807e-06, "loss": 0.014, "step": 2045 }, { "epoch": 0.40639586850729964, "grad_norm": 0.7831143997306873, "learning_rate": 6.723867005807865e-06, "loss": 0.0185, "step": 2046 }, { "epoch": 0.40659449796404806, "grad_norm": 0.5998403552907489, "learning_rate": 6.720846402402321e-06, "loss": 0.0133, "step": 2047 }, { "epoch": 0.4067931274207965, "grad_norm": 0.6325931920573962, "learning_rate": 6.717825086396007e-06, "loss": 0.0091, "step": 2048 }, { "epoch": 0.40699175687754496, "grad_norm": 1.0061561033266881, "learning_rate": 6.714803059040043e-06, "loss": 0.0142, "step": 2049 }, { "epoch": 0.4071903863342934, "grad_norm": 0.6985348942673328, "learning_rate": 6.711780321585851e-06, "loss": 0.0118, "step": 2050 }, { "epoch": 0.4073890157910418, "grad_norm": 1.2385741562172954, "learning_rate": 6.708756875285143e-06, "loss": 0.0112, "step": 2051 }, { "epoch": 0.40758764524779023, "grad_norm": 0.3580125091627547, "learning_rate": 6.705732721389922e-06, "loss": 0.0089, "step": 2052 }, { "epoch": 0.4077862747045387, "grad_norm": 0.5168739359216307, "learning_rate": 6.7027078611524895e-06, "loss": 0.012, "step": 2053 }, { "epoch": 0.40798490416128713, "grad_norm": 1.176391662569905, "learning_rate": 6.699682295825438e-06, "loss": 0.0159, "step": 2054 }, { "epoch": 0.40818353361803555, "grad_norm": 0.32971423902812974, "learning_rate": 6.69665602666165e-06, "loss": 0.0095, "step": 2055 }, { "epoch": 0.408382163074784, "grad_norm": 0.7928255269070492, "learning_rate": 6.693629054914298e-06, "loss": 0.0061, "step": 2056 }, { "epoch": 0.4085807925315324, "grad_norm": 0.35670943004316913, "learning_rate": 6.690601381836852e-06, "loss": 0.0096, "step": 2057 }, { "epoch": 0.4087794219882809, "grad_norm": 0.7975764075408337, "learning_rate": 6.687573008683067e-06, "loss": 0.015, "step": 2058 }, { "epoch": 0.4089780514450293, "grad_norm": 0.8319504575889517, "learning_rate": 6.684543936706989e-06, "loss": 0.0094, "step": 2059 }, { "epoch": 0.4091766809017777, "grad_norm": 0.7205610429728381, "learning_rate": 6.681514167162954e-06, "loss": 0.0217, "step": 2060 }, { "epoch": 0.40937531035852615, "grad_norm": 0.5035140786062322, "learning_rate": 6.678483701305587e-06, "loss": 0.0143, "step": 2061 }, { "epoch": 0.4095739398152746, "grad_norm": 0.9123728525009268, "learning_rate": 6.675452540389799e-06, "loss": 0.0146, "step": 2062 }, { "epoch": 0.40977256927202305, "grad_norm": 1.2095776877970332, "learning_rate": 6.672420685670791e-06, "loss": 0.0232, "step": 2063 }, { "epoch": 0.40997119872877147, "grad_norm": 0.44951274178922074, "learning_rate": 6.669388138404053e-06, "loss": 0.011, "step": 2064 }, { "epoch": 0.4101698281855199, "grad_norm": 1.2023015543359805, "learning_rate": 6.666354899845359e-06, "loss": 0.0202, "step": 2065 }, { "epoch": 0.41036845764226837, "grad_norm": 1.221622773848238, "learning_rate": 6.6633209712507685e-06, "loss": 0.0185, "step": 2066 }, { "epoch": 0.4105670870990168, "grad_norm": 1.245700627421625, "learning_rate": 6.66028635387663e-06, "loss": 0.0187, "step": 2067 }, { "epoch": 0.4107657165557652, "grad_norm": 1.3483216195815648, "learning_rate": 6.657251048979576e-06, "loss": 0.0159, "step": 2068 }, { "epoch": 0.41096434601251364, "grad_norm": 0.782416570047913, "learning_rate": 6.654215057816521e-06, "loss": 0.0172, "step": 2069 }, { "epoch": 0.4111629754692621, "grad_norm": 0.3826135913453981, "learning_rate": 6.651178381644668e-06, "loss": 0.0147, "step": 2070 }, { "epoch": 0.41136160492601054, "grad_norm": 0.4239048220757515, "learning_rate": 6.648141021721499e-06, "loss": 0.0131, "step": 2071 }, { "epoch": 0.41156023438275896, "grad_norm": 0.20186798617854174, "learning_rate": 6.645102979304785e-06, "loss": 0.0067, "step": 2072 }, { "epoch": 0.4117588638395074, "grad_norm": 0.44996535792808406, "learning_rate": 6.642064255652576e-06, "loss": 0.0134, "step": 2073 }, { "epoch": 0.41195749329625586, "grad_norm": 0.7266208044448531, "learning_rate": 6.6390248520232005e-06, "loss": 0.0104, "step": 2074 }, { "epoch": 0.4121561227530043, "grad_norm": 0.8966072482079172, "learning_rate": 6.6359847696752765e-06, "loss": 0.019, "step": 2075 }, { "epoch": 0.4123547522097527, "grad_norm": 0.39163729067805625, "learning_rate": 6.6329440098676975e-06, "loss": 0.0125, "step": 2076 }, { "epoch": 0.41255338166650113, "grad_norm": 0.5443082189044185, "learning_rate": 6.629902573859639e-06, "loss": 0.0127, "step": 2077 }, { "epoch": 0.41275201112324955, "grad_norm": 0.8156259321153082, "learning_rate": 6.626860462910557e-06, "loss": 0.0175, "step": 2078 }, { "epoch": 0.41295064057999803, "grad_norm": 0.7619649973471931, "learning_rate": 6.623817678280187e-06, "loss": 0.0136, "step": 2079 }, { "epoch": 0.41314927003674645, "grad_norm": 0.779374490528315, "learning_rate": 6.620774221228542e-06, "loss": 0.0155, "step": 2080 }, { "epoch": 0.4133478994934949, "grad_norm": 0.3046402817702944, "learning_rate": 6.617730093015915e-06, "loss": 0.0108, "step": 2081 }, { "epoch": 0.4135465289502433, "grad_norm": 0.442045865809681, "learning_rate": 6.614685294902876e-06, "loss": 0.0121, "step": 2082 }, { "epoch": 0.4137451584069918, "grad_norm": 0.5778707285503455, "learning_rate": 6.611639828150273e-06, "loss": 0.0133, "step": 2083 }, { "epoch": 0.4139437878637402, "grad_norm": 0.8172818474796, "learning_rate": 6.608593694019233e-06, "loss": 0.0118, "step": 2084 }, { "epoch": 0.4141424173204886, "grad_norm": 1.0008201942397041, "learning_rate": 6.605546893771155e-06, "loss": 0.0099, "step": 2085 }, { "epoch": 0.41434104677723704, "grad_norm": 0.32404676420516487, "learning_rate": 6.602499428667717e-06, "loss": 0.008, "step": 2086 }, { "epoch": 0.4145396762339855, "grad_norm": 0.9189946664738823, "learning_rate": 6.599451299970867e-06, "loss": 0.0102, "step": 2087 }, { "epoch": 0.41473830569073394, "grad_norm": 0.5740526296204994, "learning_rate": 6.596402508942838e-06, "loss": 0.0186, "step": 2088 }, { "epoch": 0.41493693514748237, "grad_norm": 0.4517267054867236, "learning_rate": 6.593353056846128e-06, "loss": 0.017, "step": 2089 }, { "epoch": 0.4151355646042308, "grad_norm": 0.6912662883562258, "learning_rate": 6.590302944943513e-06, "loss": 0.0149, "step": 2090 }, { "epoch": 0.41533419406097927, "grad_norm": 0.7887240811333215, "learning_rate": 6.58725217449804e-06, "loss": 0.0118, "step": 2091 }, { "epoch": 0.4155328235177277, "grad_norm": 1.0223004745638395, "learning_rate": 6.584200746773033e-06, "loss": 0.0222, "step": 2092 }, { "epoch": 0.4157314529744761, "grad_norm": 0.4963546873514082, "learning_rate": 6.581148663032082e-06, "loss": 0.0154, "step": 2093 }, { "epoch": 0.41593008243122453, "grad_norm": 0.9897376857036329, "learning_rate": 6.5780959245390516e-06, "loss": 0.0132, "step": 2094 }, { "epoch": 0.416128711887973, "grad_norm": 0.5755955971080043, "learning_rate": 6.5750425325580815e-06, "loss": 0.0164, "step": 2095 }, { "epoch": 0.41632734134472144, "grad_norm": 0.4895034942702204, "learning_rate": 6.571988488353574e-06, "loss": 0.0096, "step": 2096 }, { "epoch": 0.41652597080146986, "grad_norm": 0.8734463704411604, "learning_rate": 6.5689337931902106e-06, "loss": 0.0139, "step": 2097 }, { "epoch": 0.4167246002582183, "grad_norm": 0.3329657332391327, "learning_rate": 6.565878448332933e-06, "loss": 0.0093, "step": 2098 }, { "epoch": 0.4169232297149667, "grad_norm": 0.40090511983879706, "learning_rate": 6.562822455046958e-06, "loss": 0.0076, "step": 2099 }, { "epoch": 0.4171218591717152, "grad_norm": 1.05951571631075, "learning_rate": 6.559765814597768e-06, "loss": 0.0173, "step": 2100 }, { "epoch": 0.4173204886284636, "grad_norm": 0.7267645054655884, "learning_rate": 6.556708528251119e-06, "loss": 0.0166, "step": 2101 }, { "epoch": 0.417519118085212, "grad_norm": 0.4476817973691744, "learning_rate": 6.553650597273025e-06, "loss": 0.0098, "step": 2102 }, { "epoch": 0.41771774754196045, "grad_norm": 0.8482543101543035, "learning_rate": 6.550592022929776e-06, "loss": 0.0206, "step": 2103 }, { "epoch": 0.4179163769987089, "grad_norm": 0.3610267762233538, "learning_rate": 6.5475328064879225e-06, "loss": 0.0103, "step": 2104 }, { "epoch": 0.41811500645545735, "grad_norm": 0.8566164925435632, "learning_rate": 6.544472949214283e-06, "loss": 0.0127, "step": 2105 }, { "epoch": 0.4183136359122058, "grad_norm": 0.33243771966184194, "learning_rate": 6.5414124523759425e-06, "loss": 0.0098, "step": 2106 }, { "epoch": 0.4185122653689542, "grad_norm": 0.380868671779114, "learning_rate": 6.538351317240249e-06, "loss": 0.0131, "step": 2107 }, { "epoch": 0.4187108948257027, "grad_norm": 0.5135335899779155, "learning_rate": 6.5352895450748155e-06, "loss": 0.0067, "step": 2108 }, { "epoch": 0.4189095242824511, "grad_norm": 0.38253084605058807, "learning_rate": 6.53222713714752e-06, "loss": 0.0096, "step": 2109 }, { "epoch": 0.4191081537391995, "grad_norm": 0.6100179434270661, "learning_rate": 6.5291640947265015e-06, "loss": 0.0116, "step": 2110 }, { "epoch": 0.41930678319594794, "grad_norm": 1.412550135096153, "learning_rate": 6.5261004190801615e-06, "loss": 0.0201, "step": 2111 }, { "epoch": 0.4195054126526964, "grad_norm": 0.26253036733279284, "learning_rate": 6.523036111477165e-06, "loss": 0.0057, "step": 2112 }, { "epoch": 0.41970404210944484, "grad_norm": 0.7640094825754855, "learning_rate": 6.519971173186441e-06, "loss": 0.0129, "step": 2113 }, { "epoch": 0.41990267156619326, "grad_norm": 0.41106174134380463, "learning_rate": 6.516905605477177e-06, "loss": 0.0081, "step": 2114 }, { "epoch": 0.4201013010229417, "grad_norm": 0.45933590667057456, "learning_rate": 6.513839409618821e-06, "loss": 0.0078, "step": 2115 }, { "epoch": 0.42029993047969016, "grad_norm": 0.7965887185750786, "learning_rate": 6.510772586881081e-06, "loss": 0.0179, "step": 2116 }, { "epoch": 0.4204985599364386, "grad_norm": 0.59035117918782, "learning_rate": 6.5077051385339266e-06, "loss": 0.0096, "step": 2117 }, { "epoch": 0.420697189393187, "grad_norm": 0.2143236615237929, "learning_rate": 6.504637065847583e-06, "loss": 0.0045, "step": 2118 }, { "epoch": 0.42089581884993543, "grad_norm": 0.7334121261956231, "learning_rate": 6.501568370092536e-06, "loss": 0.0147, "step": 2119 }, { "epoch": 0.42109444830668386, "grad_norm": 0.7597269250272318, "learning_rate": 6.498499052539533e-06, "loss": 0.0092, "step": 2120 }, { "epoch": 0.42129307776343233, "grad_norm": 0.47333533855501414, "learning_rate": 6.495429114459573e-06, "loss": 0.0065, "step": 2121 }, { "epoch": 0.42149170722018076, "grad_norm": 1.330156731921058, "learning_rate": 6.4923585571239135e-06, "loss": 0.0182, "step": 2122 }, { "epoch": 0.4216903366769292, "grad_norm": 0.6948668654706006, "learning_rate": 6.48928738180407e-06, "loss": 0.0085, "step": 2123 }, { "epoch": 0.4218889661336776, "grad_norm": 0.7890280489148934, "learning_rate": 6.486215589771812e-06, "loss": 0.015, "step": 2124 }, { "epoch": 0.4220875955904261, "grad_norm": 0.452242433827678, "learning_rate": 6.483143182299168e-06, "loss": 0.0051, "step": 2125 }, { "epoch": 0.4222862250471745, "grad_norm": 0.5748220685978489, "learning_rate": 6.480070160658416e-06, "loss": 0.0185, "step": 2126 }, { "epoch": 0.4224848545039229, "grad_norm": 1.0878583524025376, "learning_rate": 6.476996526122095e-06, "loss": 0.0176, "step": 2127 }, { "epoch": 0.42268348396067135, "grad_norm": 1.0214588977314998, "learning_rate": 6.47392227996299e-06, "loss": 0.0229, "step": 2128 }, { "epoch": 0.4228821134174198, "grad_norm": 0.8368364297826585, "learning_rate": 6.470847423454145e-06, "loss": 0.0119, "step": 2129 }, { "epoch": 0.42308074287416825, "grad_norm": 1.5630364581414389, "learning_rate": 6.467771957868856e-06, "loss": 0.0279, "step": 2130 }, { "epoch": 0.42327937233091667, "grad_norm": 0.5265919309050093, "learning_rate": 6.464695884480666e-06, "loss": 0.0066, "step": 2131 }, { "epoch": 0.4234780017876651, "grad_norm": 0.5458391814360123, "learning_rate": 6.461619204563379e-06, "loss": 0.0078, "step": 2132 }, { "epoch": 0.42367663124441357, "grad_norm": 0.5629575812962587, "learning_rate": 6.458541919391046e-06, "loss": 0.0164, "step": 2133 }, { "epoch": 0.423875260701162, "grad_norm": 1.0217593729153478, "learning_rate": 6.455464030237961e-06, "loss": 0.0132, "step": 2134 }, { "epoch": 0.4240738901579104, "grad_norm": 0.3745957899282288, "learning_rate": 6.452385538378681e-06, "loss": 0.0078, "step": 2135 }, { "epoch": 0.42427251961465884, "grad_norm": 0.4083516790878554, "learning_rate": 6.4493064450880014e-06, "loss": 0.0102, "step": 2136 }, { "epoch": 0.4244711490714073, "grad_norm": 1.0042440949633358, "learning_rate": 6.446226751640976e-06, "loss": 0.0146, "step": 2137 }, { "epoch": 0.42466977852815574, "grad_norm": 0.6514777945792959, "learning_rate": 6.4431464593129e-06, "loss": 0.0108, "step": 2138 }, { "epoch": 0.42486840798490416, "grad_norm": 0.8367257022237136, "learning_rate": 6.440065569379321e-06, "loss": 0.0221, "step": 2139 }, { "epoch": 0.4250670374416526, "grad_norm": 1.226693427969879, "learning_rate": 6.436984083116032e-06, "loss": 0.0229, "step": 2140 }, { "epoch": 0.425265666898401, "grad_norm": 0.5627979518455061, "learning_rate": 6.433902001799074e-06, "loss": 0.012, "step": 2141 }, { "epoch": 0.4254642963551495, "grad_norm": 0.7446434572287394, "learning_rate": 6.430819326704732e-06, "loss": 0.0133, "step": 2142 }, { "epoch": 0.4256629258118979, "grad_norm": 0.5119208241159949, "learning_rate": 6.427736059109539e-06, "loss": 0.0134, "step": 2143 }, { "epoch": 0.42586155526864633, "grad_norm": 0.43798331629976245, "learning_rate": 6.424652200290275e-06, "loss": 0.0116, "step": 2144 }, { "epoch": 0.42606018472539475, "grad_norm": 0.8345104818237911, "learning_rate": 6.421567751523962e-06, "loss": 0.0146, "step": 2145 }, { "epoch": 0.42625881418214323, "grad_norm": 0.5168528918687458, "learning_rate": 6.4184827140878655e-06, "loss": 0.0164, "step": 2146 }, { "epoch": 0.42645744363889165, "grad_norm": 0.769263430644991, "learning_rate": 6.415397089259497e-06, "loss": 0.011, "step": 2147 }, { "epoch": 0.4266560730956401, "grad_norm": 1.2006056414732815, "learning_rate": 6.412310878316612e-06, "loss": 0.0199, "step": 2148 }, { "epoch": 0.4268547025523885, "grad_norm": 0.36140757435107507, "learning_rate": 6.409224082537206e-06, "loss": 0.0095, "step": 2149 }, { "epoch": 0.427053332009137, "grad_norm": 0.4887806589406039, "learning_rate": 6.4061367031995205e-06, "loss": 0.0137, "step": 2150 }, { "epoch": 0.4272519614658854, "grad_norm": 0.4372668846370759, "learning_rate": 6.403048741582033e-06, "loss": 0.0143, "step": 2151 }, { "epoch": 0.4274505909226338, "grad_norm": 0.4264517517122641, "learning_rate": 6.3999601989634665e-06, "loss": 0.011, "step": 2152 }, { "epoch": 0.42764922037938224, "grad_norm": 0.5939697458462438, "learning_rate": 6.396871076622784e-06, "loss": 0.0153, "step": 2153 }, { "epoch": 0.4278478498361307, "grad_norm": 0.8720261216914841, "learning_rate": 6.393781375839187e-06, "loss": 0.0133, "step": 2154 }, { "epoch": 0.42804647929287915, "grad_norm": 0.39205208543034786, "learning_rate": 6.390691097892119e-06, "loss": 0.0122, "step": 2155 }, { "epoch": 0.42824510874962757, "grad_norm": 0.3140641676904937, "learning_rate": 6.387600244061258e-06, "loss": 0.0074, "step": 2156 }, { "epoch": 0.428443738206376, "grad_norm": 0.5950724392389523, "learning_rate": 6.384508815626529e-06, "loss": 0.0119, "step": 2157 }, { "epoch": 0.42864236766312447, "grad_norm": 1.1164418050217393, "learning_rate": 6.381416813868085e-06, "loss": 0.0168, "step": 2158 }, { "epoch": 0.4288409971198729, "grad_norm": 1.0956077716877128, "learning_rate": 6.37832424006632e-06, "loss": 0.0246, "step": 2159 }, { "epoch": 0.4290396265766213, "grad_norm": 0.5788002270708077, "learning_rate": 6.375231095501868e-06, "loss": 0.016, "step": 2160 }, { "epoch": 0.42923825603336974, "grad_norm": 0.634313578842633, "learning_rate": 6.3721373814555965e-06, "loss": 0.0083, "step": 2161 }, { "epoch": 0.42943688549011816, "grad_norm": 0.3256946524279106, "learning_rate": 6.369043099208609e-06, "loss": 0.0151, "step": 2162 }, { "epoch": 0.42963551494686664, "grad_norm": 1.4280037849128235, "learning_rate": 6.365948250042246e-06, "loss": 0.0127, "step": 2163 }, { "epoch": 0.42983414440361506, "grad_norm": 0.34022719740651086, "learning_rate": 6.3628528352380804e-06, "loss": 0.0073, "step": 2164 }, { "epoch": 0.4300327738603635, "grad_norm": 0.5438789476793532, "learning_rate": 6.359756856077921e-06, "loss": 0.0109, "step": 2165 }, { "epoch": 0.4302314033171119, "grad_norm": 0.9150416889469685, "learning_rate": 6.3566603138438074e-06, "loss": 0.0176, "step": 2166 }, { "epoch": 0.4304300327738604, "grad_norm": 0.37232770382572944, "learning_rate": 6.353563209818019e-06, "loss": 0.0108, "step": 2167 }, { "epoch": 0.4306286622306088, "grad_norm": 0.5476731401751236, "learning_rate": 6.35046554528306e-06, "loss": 0.0097, "step": 2168 }, { "epoch": 0.43082729168735723, "grad_norm": 0.996056551510471, "learning_rate": 6.347367321521671e-06, "loss": 0.0164, "step": 2169 }, { "epoch": 0.43102592114410565, "grad_norm": 0.7735149654496623, "learning_rate": 6.3442685398168226e-06, "loss": 0.0174, "step": 2170 }, { "epoch": 0.43122455060085413, "grad_norm": 0.5452867098400445, "learning_rate": 6.3411692014517175e-06, "loss": 0.0129, "step": 2171 }, { "epoch": 0.43142318005760255, "grad_norm": 0.4948507173958259, "learning_rate": 6.338069307709791e-06, "loss": 0.0154, "step": 2172 }, { "epoch": 0.431621809514351, "grad_norm": 0.612880978569296, "learning_rate": 6.334968859874704e-06, "loss": 0.0177, "step": 2173 }, { "epoch": 0.4318204389710994, "grad_norm": 0.6349394426674493, "learning_rate": 6.331867859230347e-06, "loss": 0.0123, "step": 2174 }, { "epoch": 0.4320190684278479, "grad_norm": 0.44612232259436474, "learning_rate": 6.328766307060845e-06, "loss": 0.015, "step": 2175 }, { "epoch": 0.4322176978845963, "grad_norm": 0.8671464943443922, "learning_rate": 6.3256642046505455e-06, "loss": 0.0155, "step": 2176 }, { "epoch": 0.4324163273413447, "grad_norm": 0.35014656407021144, "learning_rate": 6.322561553284027e-06, "loss": 0.0049, "step": 2177 }, { "epoch": 0.43261495679809314, "grad_norm": 1.0746580549448301, "learning_rate": 6.319458354246093e-06, "loss": 0.0128, "step": 2178 }, { "epoch": 0.4328135862548416, "grad_norm": 0.7495218834739403, "learning_rate": 6.316354608821775e-06, "loss": 0.0167, "step": 2179 }, { "epoch": 0.43301221571159004, "grad_norm": 0.6790113545596657, "learning_rate": 6.313250318296333e-06, "loss": 0.0071, "step": 2180 }, { "epoch": 0.43321084516833847, "grad_norm": 0.5279299100486009, "learning_rate": 6.31014548395525e-06, "loss": 0.0125, "step": 2181 }, { "epoch": 0.4334094746250869, "grad_norm": 0.3722980884712896, "learning_rate": 6.307040107084232e-06, "loss": 0.0067, "step": 2182 }, { "epoch": 0.4336081040818353, "grad_norm": 0.5681920025290187, "learning_rate": 6.3039341889692164e-06, "loss": 0.0168, "step": 2183 }, { "epoch": 0.4338067335385838, "grad_norm": 0.7370282183909415, "learning_rate": 6.300827730896359e-06, "loss": 0.013, "step": 2184 }, { "epoch": 0.4340053629953322, "grad_norm": 1.0952586557587631, "learning_rate": 6.29772073415204e-06, "loss": 0.0246, "step": 2185 }, { "epoch": 0.43420399245208063, "grad_norm": 0.36852862876653647, "learning_rate": 6.294613200022865e-06, "loss": 0.0073, "step": 2186 }, { "epoch": 0.43440262190882906, "grad_norm": 0.5506267036624556, "learning_rate": 6.291505129795659e-06, "loss": 0.0078, "step": 2187 }, { "epoch": 0.43460125136557753, "grad_norm": 0.642929767010988, "learning_rate": 6.288396524757473e-06, "loss": 0.02, "step": 2188 }, { "epoch": 0.43479988082232596, "grad_norm": 0.4663773137832936, "learning_rate": 6.285287386195577e-06, "loss": 0.0096, "step": 2189 }, { "epoch": 0.4349985102790744, "grad_norm": 0.899128172317911, "learning_rate": 6.28217771539746e-06, "loss": 0.0165, "step": 2190 }, { "epoch": 0.4351971397358228, "grad_norm": 0.5828032708204945, "learning_rate": 6.279067513650837e-06, "loss": 0.0138, "step": 2191 }, { "epoch": 0.4353957691925713, "grad_norm": 0.5191364681793799, "learning_rate": 6.275956782243637e-06, "loss": 0.0117, "step": 2192 }, { "epoch": 0.4355943986493197, "grad_norm": 0.5129148486056785, "learning_rate": 6.272845522464011e-06, "loss": 0.0125, "step": 2193 }, { "epoch": 0.4357930281060681, "grad_norm": 0.5310527010612295, "learning_rate": 6.269733735600328e-06, "loss": 0.0106, "step": 2194 }, { "epoch": 0.43599165756281655, "grad_norm": 0.4310848203884204, "learning_rate": 6.266621422941177e-06, "loss": 0.0124, "step": 2195 }, { "epoch": 0.436190287019565, "grad_norm": 0.9514944377133658, "learning_rate": 6.263508585775364e-06, "loss": 0.0127, "step": 2196 }, { "epoch": 0.43638891647631345, "grad_norm": 0.7287942888398268, "learning_rate": 6.260395225391911e-06, "loss": 0.0107, "step": 2197 }, { "epoch": 0.43658754593306187, "grad_norm": 0.3824710094767856, "learning_rate": 6.2572813430800565e-06, "loss": 0.0131, "step": 2198 }, { "epoch": 0.4367861753898103, "grad_norm": 0.4588648637376047, "learning_rate": 6.254166940129256e-06, "loss": 0.0152, "step": 2199 }, { "epoch": 0.43698480484655877, "grad_norm": 0.7375069280141057, "learning_rate": 6.251052017829183e-06, "loss": 0.0132, "step": 2200 }, { "epoch": 0.4371834343033072, "grad_norm": 0.6624079769881998, "learning_rate": 6.247936577469722e-06, "loss": 0.0131, "step": 2201 }, { "epoch": 0.4373820637600556, "grad_norm": 0.2923891307524442, "learning_rate": 6.244820620340974e-06, "loss": 0.0098, "step": 2202 }, { "epoch": 0.43758069321680404, "grad_norm": 1.2709855509660637, "learning_rate": 6.241704147733254e-06, "loss": 0.0213, "step": 2203 }, { "epoch": 0.43777932267355246, "grad_norm": 0.593801147450338, "learning_rate": 6.2385871609370885e-06, "loss": 0.0172, "step": 2204 }, { "epoch": 0.43797795213030094, "grad_norm": 0.9452658486066292, "learning_rate": 6.235469661243222e-06, "loss": 0.0191, "step": 2205 }, { "epoch": 0.43817658158704936, "grad_norm": 0.46873430033477437, "learning_rate": 6.2323516499426055e-06, "loss": 0.017, "step": 2206 }, { "epoch": 0.4383752110437978, "grad_norm": 0.934149350494909, "learning_rate": 6.229233128326404e-06, "loss": 0.0168, "step": 2207 }, { "epoch": 0.4385738405005462, "grad_norm": 0.7694849628244852, "learning_rate": 6.226114097685996e-06, "loss": 0.0106, "step": 2208 }, { "epoch": 0.4387724699572947, "grad_norm": 0.36801759673916484, "learning_rate": 6.222994559312967e-06, "loss": 0.0058, "step": 2209 }, { "epoch": 0.4389710994140431, "grad_norm": 0.6695445746957661, "learning_rate": 6.219874514499116e-06, "loss": 0.0267, "step": 2210 }, { "epoch": 0.43916972887079153, "grad_norm": 0.38996735486229944, "learning_rate": 6.21675396453645e-06, "loss": 0.009, "step": 2211 }, { "epoch": 0.43936835832753995, "grad_norm": 0.25082029907152686, "learning_rate": 6.213632910717186e-06, "loss": 0.0089, "step": 2212 }, { "epoch": 0.43956698778428843, "grad_norm": 0.30323741324880354, "learning_rate": 6.210511354333749e-06, "loss": 0.0114, "step": 2213 }, { "epoch": 0.43976561724103685, "grad_norm": 0.611598681425317, "learning_rate": 6.207389296678773e-06, "loss": 0.0237, "step": 2214 }, { "epoch": 0.4399642466977853, "grad_norm": 0.5733866943683243, "learning_rate": 6.2042667390450975e-06, "loss": 0.0097, "step": 2215 }, { "epoch": 0.4401628761545337, "grad_norm": 0.79670133428446, "learning_rate": 6.201143682725772e-06, "loss": 0.0176, "step": 2216 }, { "epoch": 0.4403615056112822, "grad_norm": 0.9551481409610042, "learning_rate": 6.19802012901405e-06, "loss": 0.0143, "step": 2217 }, { "epoch": 0.4405601350680306, "grad_norm": 0.5431923190522518, "learning_rate": 6.1948960792033926e-06, "loss": 0.0231, "step": 2218 }, { "epoch": 0.440758764524779, "grad_norm": 1.0130427924002454, "learning_rate": 6.191771534587465e-06, "loss": 0.0125, "step": 2219 }, { "epoch": 0.44095739398152745, "grad_norm": 0.5857570298238707, "learning_rate": 6.188646496460139e-06, "loss": 0.0135, "step": 2220 }, { "epoch": 0.4411560234382759, "grad_norm": 0.9795520817741151, "learning_rate": 6.185520966115489e-06, "loss": 0.0154, "step": 2221 }, { "epoch": 0.44135465289502435, "grad_norm": 0.7885367572135659, "learning_rate": 6.182394944847794e-06, "loss": 0.0173, "step": 2222 }, { "epoch": 0.44155328235177277, "grad_norm": 0.21893643286317785, "learning_rate": 6.179268433951536e-06, "loss": 0.0058, "step": 2223 }, { "epoch": 0.4417519118085212, "grad_norm": 0.7459506265711627, "learning_rate": 6.1761414347213995e-06, "loss": 0.0089, "step": 2224 }, { "epoch": 0.4419505412652696, "grad_norm": 0.27792088533372517, "learning_rate": 6.173013948452274e-06, "loss": 0.0098, "step": 2225 }, { "epoch": 0.4421491707220181, "grad_norm": 1.2345740081065522, "learning_rate": 6.1698859764392475e-06, "loss": 0.0143, "step": 2226 }, { "epoch": 0.4423478001787665, "grad_norm": 0.3732015361677137, "learning_rate": 6.1667575199776096e-06, "loss": 0.0131, "step": 2227 }, { "epoch": 0.44254642963551494, "grad_norm": 0.5343008204160175, "learning_rate": 6.163628580362851e-06, "loss": 0.0089, "step": 2228 }, { "epoch": 0.44274505909226336, "grad_norm": 0.5522418996318227, "learning_rate": 6.160499158890664e-06, "loss": 0.0106, "step": 2229 }, { "epoch": 0.44294368854901184, "grad_norm": 0.6254354935039006, "learning_rate": 6.157369256856934e-06, "loss": 0.0167, "step": 2230 }, { "epoch": 0.44314231800576026, "grad_norm": 0.42379511270760073, "learning_rate": 6.154238875557755e-06, "loss": 0.0131, "step": 2231 }, { "epoch": 0.4433409474625087, "grad_norm": 0.6058120408697395, "learning_rate": 6.151108016289416e-06, "loss": 0.0096, "step": 2232 }, { "epoch": 0.4435395769192571, "grad_norm": 0.25542634551541554, "learning_rate": 6.147976680348398e-06, "loss": 0.0072, "step": 2233 }, { "epoch": 0.4437382063760056, "grad_norm": 0.2899452724247626, "learning_rate": 6.144844869031385e-06, "loss": 0.006, "step": 2234 }, { "epoch": 0.443936835832754, "grad_norm": 0.7491935474647948, "learning_rate": 6.141712583635261e-06, "loss": 0.0145, "step": 2235 }, { "epoch": 0.44413546528950243, "grad_norm": 0.4422482327524406, "learning_rate": 6.138579825457098e-06, "loss": 0.0092, "step": 2236 }, { "epoch": 0.44433409474625085, "grad_norm": 0.21771092100081543, "learning_rate": 6.135446595794169e-06, "loss": 0.0085, "step": 2237 }, { "epoch": 0.44453272420299933, "grad_norm": 1.4002752559333222, "learning_rate": 6.132312895943942e-06, "loss": 0.0207, "step": 2238 }, { "epoch": 0.44473135365974775, "grad_norm": 0.7190190028304118, "learning_rate": 6.129178727204079e-06, "loss": 0.0161, "step": 2239 }, { "epoch": 0.4449299831164962, "grad_norm": 0.42822061766771496, "learning_rate": 6.126044090872437e-06, "loss": 0.0124, "step": 2240 }, { "epoch": 0.4451286125732446, "grad_norm": 0.677711220443176, "learning_rate": 6.1229089882470645e-06, "loss": 0.0101, "step": 2241 }, { "epoch": 0.4453272420299931, "grad_norm": 1.059203479885282, "learning_rate": 6.119773420626203e-06, "loss": 0.0325, "step": 2242 }, { "epoch": 0.4455258714867415, "grad_norm": 0.5706231879648851, "learning_rate": 6.1166373893082895e-06, "loss": 0.0106, "step": 2243 }, { "epoch": 0.4457245009434899, "grad_norm": 0.995491363379659, "learning_rate": 6.113500895591953e-06, "loss": 0.0167, "step": 2244 }, { "epoch": 0.44592313040023834, "grad_norm": 0.29779451432903564, "learning_rate": 6.110363940776008e-06, "loss": 0.0092, "step": 2245 }, { "epoch": 0.44612175985698677, "grad_norm": 0.5762002893389644, "learning_rate": 6.10722652615947e-06, "loss": 0.0143, "step": 2246 }, { "epoch": 0.44632038931373524, "grad_norm": 0.7215045066061389, "learning_rate": 6.104088653041534e-06, "loss": 0.0174, "step": 2247 }, { "epoch": 0.44651901877048367, "grad_norm": 0.32513640970058094, "learning_rate": 6.100950322721594e-06, "loss": 0.0041, "step": 2248 }, { "epoch": 0.4467176482272321, "grad_norm": 0.5763190265316815, "learning_rate": 6.097811536499227e-06, "loss": 0.018, "step": 2249 }, { "epoch": 0.4469162776839805, "grad_norm": 0.8055544487235922, "learning_rate": 6.094672295674202e-06, "loss": 0.0155, "step": 2250 }, { "epoch": 0.447114907140729, "grad_norm": 0.992649917079707, "learning_rate": 6.091532601546476e-06, "loss": 0.0161, "step": 2251 }, { "epoch": 0.4473135365974774, "grad_norm": 0.4457080027649029, "learning_rate": 6.088392455416194e-06, "loss": 0.0073, "step": 2252 }, { "epoch": 0.44751216605422584, "grad_norm": 0.2552247040191699, "learning_rate": 6.085251858583685e-06, "loss": 0.0092, "step": 2253 }, { "epoch": 0.44771079551097426, "grad_norm": 0.590972028978692, "learning_rate": 6.082110812349468e-06, "loss": 0.0109, "step": 2254 }, { "epoch": 0.44790942496772274, "grad_norm": 0.4091054658923765, "learning_rate": 6.078969318014246e-06, "loss": 0.0124, "step": 2255 }, { "epoch": 0.44810805442447116, "grad_norm": 0.7196903790599142, "learning_rate": 6.075827376878911e-06, "loss": 0.0148, "step": 2256 }, { "epoch": 0.4483066838812196, "grad_norm": 0.3261047677288729, "learning_rate": 6.072684990244537e-06, "loss": 0.0098, "step": 2257 }, { "epoch": 0.448505313337968, "grad_norm": 0.29300124021584395, "learning_rate": 6.069542159412384e-06, "loss": 0.0122, "step": 2258 }, { "epoch": 0.4487039427947165, "grad_norm": 0.978139352013834, "learning_rate": 6.066398885683892e-06, "loss": 0.0144, "step": 2259 }, { "epoch": 0.4489025722514649, "grad_norm": 0.7871382091207846, "learning_rate": 6.06325517036069e-06, "loss": 0.0155, "step": 2260 }, { "epoch": 0.4491012017082133, "grad_norm": 1.1988153830297887, "learning_rate": 6.060111014744586e-06, "loss": 0.0114, "step": 2261 }, { "epoch": 0.44929983116496175, "grad_norm": 0.5272884595442661, "learning_rate": 6.056966420137572e-06, "loss": 0.013, "step": 2262 }, { "epoch": 0.44949846062171017, "grad_norm": 0.5853642484110945, "learning_rate": 6.053821387841823e-06, "loss": 0.0107, "step": 2263 }, { "epoch": 0.44969709007845865, "grad_norm": 0.3163330948551929, "learning_rate": 6.0506759191596944e-06, "loss": 0.0122, "step": 2264 }, { "epoch": 0.4498957195352071, "grad_norm": 0.43256342417833826, "learning_rate": 6.047530015393718e-06, "loss": 0.0081, "step": 2265 }, { "epoch": 0.4500943489919555, "grad_norm": 0.8015470911760131, "learning_rate": 6.04438367784661e-06, "loss": 0.012, "step": 2266 }, { "epoch": 0.4502929784487039, "grad_norm": 0.40936392070753425, "learning_rate": 6.041236907821267e-06, "loss": 0.0098, "step": 2267 }, { "epoch": 0.4504916079054524, "grad_norm": 1.3184403290579898, "learning_rate": 6.0380897066207654e-06, "loss": 0.0159, "step": 2268 }, { "epoch": 0.4506902373622008, "grad_norm": 0.36040823408450195, "learning_rate": 6.034942075548355e-06, "loss": 0.0063, "step": 2269 }, { "epoch": 0.45088886681894924, "grad_norm": 0.5708270902664976, "learning_rate": 6.0317940159074675e-06, "loss": 0.0203, "step": 2270 }, { "epoch": 0.45108749627569766, "grad_norm": 0.4185899585374011, "learning_rate": 6.028645529001711e-06, "loss": 0.009, "step": 2271 }, { "epoch": 0.45128612573244614, "grad_norm": 0.5374794859076718, "learning_rate": 6.02549661613487e-06, "loss": 0.0184, "step": 2272 }, { "epoch": 0.45148475518919456, "grad_norm": 0.3634143098642771, "learning_rate": 6.0223472786109095e-06, "loss": 0.0101, "step": 2273 }, { "epoch": 0.451683384645943, "grad_norm": 0.42164752734379324, "learning_rate": 6.019197517733963e-06, "loss": 0.0124, "step": 2274 }, { "epoch": 0.4518820141026914, "grad_norm": 0.2932256480659242, "learning_rate": 6.016047334808345e-06, "loss": 0.0056, "step": 2275 }, { "epoch": 0.4520806435594399, "grad_norm": 0.950748673633275, "learning_rate": 6.012896731138545e-06, "loss": 0.0131, "step": 2276 }, { "epoch": 0.4522792730161883, "grad_norm": 0.5735740533696118, "learning_rate": 6.0097457080292206e-06, "loss": 0.0158, "step": 2277 }, { "epoch": 0.45247790247293673, "grad_norm": 0.6300357309402919, "learning_rate": 6.006594266785212e-06, "loss": 0.0198, "step": 2278 }, { "epoch": 0.45267653192968516, "grad_norm": 0.45140542258277755, "learning_rate": 6.003442408711524e-06, "loss": 0.0103, "step": 2279 }, { "epoch": 0.45287516138643363, "grad_norm": 0.31203226057353545, "learning_rate": 6.000290135113338e-06, "loss": 0.0072, "step": 2280 }, { "epoch": 0.45307379084318206, "grad_norm": 0.5315768399241099, "learning_rate": 5.997137447296011e-06, "loss": 0.0189, "step": 2281 }, { "epoch": 0.4532724202999305, "grad_norm": 0.7502568266180624, "learning_rate": 5.993984346565065e-06, "loss": 0.0168, "step": 2282 }, { "epoch": 0.4534710497566789, "grad_norm": 0.7922773049664108, "learning_rate": 5.990830834226197e-06, "loss": 0.0197, "step": 2283 }, { "epoch": 0.4536696792134273, "grad_norm": 0.5256093593559897, "learning_rate": 5.987676911585273e-06, "loss": 0.0118, "step": 2284 }, { "epoch": 0.4538683086701758, "grad_norm": 0.4936069122490565, "learning_rate": 5.984522579948329e-06, "loss": 0.0119, "step": 2285 }, { "epoch": 0.4540669381269242, "grad_norm": 0.6240742984791606, "learning_rate": 5.981367840621569e-06, "loss": 0.0103, "step": 2286 }, { "epoch": 0.45426556758367265, "grad_norm": 0.4580728528848644, "learning_rate": 5.97821269491137e-06, "loss": 0.0089, "step": 2287 }, { "epoch": 0.45446419704042107, "grad_norm": 1.0311027752364688, "learning_rate": 5.975057144124274e-06, "loss": 0.0109, "step": 2288 }, { "epoch": 0.45466282649716955, "grad_norm": 0.47911929621431937, "learning_rate": 5.971901189566991e-06, "loss": 0.0127, "step": 2289 }, { "epoch": 0.45486145595391797, "grad_norm": 0.31717175346501003, "learning_rate": 5.968744832546398e-06, "loss": 0.0131, "step": 2290 }, { "epoch": 0.4550600854106664, "grad_norm": 0.5084576453823957, "learning_rate": 5.965588074369541e-06, "loss": 0.0108, "step": 2291 }, { "epoch": 0.4552587148674148, "grad_norm": 0.4321949217258047, "learning_rate": 5.962430916343627e-06, "loss": 0.0073, "step": 2292 }, { "epoch": 0.4554573443241633, "grad_norm": 0.3587917028460577, "learning_rate": 5.959273359776037e-06, "loss": 0.0095, "step": 2293 }, { "epoch": 0.4556559737809117, "grad_norm": 0.39093795226899214, "learning_rate": 5.956115405974308e-06, "loss": 0.0067, "step": 2294 }, { "epoch": 0.45585460323766014, "grad_norm": 1.022914745931775, "learning_rate": 5.952957056246147e-06, "loss": 0.0163, "step": 2295 }, { "epoch": 0.45605323269440856, "grad_norm": 0.35063776749240805, "learning_rate": 5.949798311899424e-06, "loss": 0.0096, "step": 2296 }, { "epoch": 0.45625186215115704, "grad_norm": 0.34523519354493915, "learning_rate": 5.94663917424217e-06, "loss": 0.0114, "step": 2297 }, { "epoch": 0.45645049160790546, "grad_norm": 0.4356127664422994, "learning_rate": 5.9434796445825835e-06, "loss": 0.0114, "step": 2298 }, { "epoch": 0.4566491210646539, "grad_norm": 0.34697933488413446, "learning_rate": 5.940319724229019e-06, "loss": 0.007, "step": 2299 }, { "epoch": 0.4568477505214023, "grad_norm": 0.45136277819236226, "learning_rate": 5.937159414490001e-06, "loss": 0.0141, "step": 2300 }, { "epoch": 0.4570463799781508, "grad_norm": 0.9498700129253315, "learning_rate": 5.933998716674206e-06, "loss": 0.0113, "step": 2301 }, { "epoch": 0.4572450094348992, "grad_norm": 1.0293691155517941, "learning_rate": 5.930837632090479e-06, "loss": 0.0221, "step": 2302 }, { "epoch": 0.45744363889164763, "grad_norm": 0.3086672015952359, "learning_rate": 5.92767616204782e-06, "loss": 0.0081, "step": 2303 }, { "epoch": 0.45764226834839605, "grad_norm": 0.3479833097855302, "learning_rate": 5.92451430785539e-06, "loss": 0.0062, "step": 2304 }, { "epoch": 0.4578408978051445, "grad_norm": 0.5660458407661002, "learning_rate": 5.921352070822513e-06, "loss": 0.0141, "step": 2305 }, { "epoch": 0.45803952726189295, "grad_norm": 1.281389383626456, "learning_rate": 5.918189452258665e-06, "loss": 0.0192, "step": 2306 }, { "epoch": 0.4582381567186414, "grad_norm": 0.6209042441203885, "learning_rate": 5.915026453473485e-06, "loss": 0.0237, "step": 2307 }, { "epoch": 0.4584367861753898, "grad_norm": 0.372320922451502, "learning_rate": 5.9118630757767666e-06, "loss": 0.0078, "step": 2308 }, { "epoch": 0.4586354156321382, "grad_norm": 0.5819030047417552, "learning_rate": 5.908699320478462e-06, "loss": 0.0073, "step": 2309 }, { "epoch": 0.4588340450888867, "grad_norm": 0.7061784530868636, "learning_rate": 5.905535188888678e-06, "loss": 0.0129, "step": 2310 }, { "epoch": 0.4590326745456351, "grad_norm": 0.6778701936405351, "learning_rate": 5.9023706823176795e-06, "loss": 0.0114, "step": 2311 }, { "epoch": 0.45923130400238354, "grad_norm": 0.7264468288248946, "learning_rate": 5.8992058020758855e-06, "loss": 0.0231, "step": 2312 }, { "epoch": 0.45942993345913197, "grad_norm": 0.6212393297209905, "learning_rate": 5.89604054947387e-06, "loss": 0.0143, "step": 2313 }, { "epoch": 0.45962856291588045, "grad_norm": 0.35909749045150213, "learning_rate": 5.892874925822359e-06, "loss": 0.0122, "step": 2314 }, { "epoch": 0.45982719237262887, "grad_norm": 0.3641939953347697, "learning_rate": 5.889708932432235e-06, "loss": 0.0105, "step": 2315 }, { "epoch": 0.4600258218293773, "grad_norm": 0.48230559364020364, "learning_rate": 5.8865425706145355e-06, "loss": 0.0127, "step": 2316 }, { "epoch": 0.4602244512861257, "grad_norm": 0.46675212979529435, "learning_rate": 5.883375841680442e-06, "loss": 0.0124, "step": 2317 }, { "epoch": 0.4604230807428742, "grad_norm": 0.3866132495591225, "learning_rate": 5.880208746941299e-06, "loss": 0.0073, "step": 2318 }, { "epoch": 0.4606217101996226, "grad_norm": 0.439249606081369, "learning_rate": 5.877041287708595e-06, "loss": 0.0137, "step": 2319 }, { "epoch": 0.46082033965637104, "grad_norm": 0.9131678817021807, "learning_rate": 5.8738734652939725e-06, "loss": 0.0144, "step": 2320 }, { "epoch": 0.46101896911311946, "grad_norm": 0.4209332036129976, "learning_rate": 5.870705281009222e-06, "loss": 0.011, "step": 2321 }, { "epoch": 0.46121759856986794, "grad_norm": 0.5289405995879706, "learning_rate": 5.867536736166287e-06, "loss": 0.0141, "step": 2322 }, { "epoch": 0.46141622802661636, "grad_norm": 0.4232956656008544, "learning_rate": 5.864367832077259e-06, "loss": 0.0118, "step": 2323 }, { "epoch": 0.4616148574833648, "grad_norm": 0.38126143858947364, "learning_rate": 5.861198570054377e-06, "loss": 0.0094, "step": 2324 }, { "epoch": 0.4618134869401132, "grad_norm": 0.5109437768251036, "learning_rate": 5.858028951410029e-06, "loss": 0.019, "step": 2325 }, { "epoch": 0.4620121163968616, "grad_norm": 0.9866712175295075, "learning_rate": 5.854858977456753e-06, "loss": 0.011, "step": 2326 }, { "epoch": 0.4622107458536101, "grad_norm": 0.5509612624995485, "learning_rate": 5.85168864950723e-06, "loss": 0.0146, "step": 2327 }, { "epoch": 0.46240937531035853, "grad_norm": 0.6697882679503572, "learning_rate": 5.8485179688742896e-06, "loss": 0.0156, "step": 2328 }, { "epoch": 0.46260800476710695, "grad_norm": 0.25998329362845135, "learning_rate": 5.845346936870907e-06, "loss": 0.009, "step": 2329 }, { "epoch": 0.4628066342238554, "grad_norm": 0.445245200827973, "learning_rate": 5.8421755548102054e-06, "loss": 0.0107, "step": 2330 }, { "epoch": 0.46300526368060385, "grad_norm": 0.30406825765433, "learning_rate": 5.8390038240054505e-06, "loss": 0.0087, "step": 2331 }, { "epoch": 0.4632038931373523, "grad_norm": 0.5121143693890272, "learning_rate": 5.835831745770052e-06, "loss": 0.0121, "step": 2332 }, { "epoch": 0.4634025225941007, "grad_norm": 0.6199846252981046, "learning_rate": 5.832659321417565e-06, "loss": 0.0112, "step": 2333 }, { "epoch": 0.4636011520508491, "grad_norm": 0.48346062372856696, "learning_rate": 5.829486552261685e-06, "loss": 0.0131, "step": 2334 }, { "epoch": 0.4637997815075976, "grad_norm": 0.8757641479679567, "learning_rate": 5.826313439616256e-06, "loss": 0.0156, "step": 2335 }, { "epoch": 0.463998410964346, "grad_norm": 0.43652114504307693, "learning_rate": 5.8231399847952585e-06, "loss": 0.0114, "step": 2336 }, { "epoch": 0.46419704042109444, "grad_norm": 0.47839962377266626, "learning_rate": 5.819966189112818e-06, "loss": 0.0102, "step": 2337 }, { "epoch": 0.46439566987784286, "grad_norm": 1.2127218820606134, "learning_rate": 5.8167920538832e-06, "loss": 0.014, "step": 2338 }, { "epoch": 0.46459429933459134, "grad_norm": 0.769446434011527, "learning_rate": 5.8136175804208106e-06, "loss": 0.0114, "step": 2339 }, { "epoch": 0.46479292879133977, "grad_norm": 0.34763955904579885, "learning_rate": 5.8104427700401945e-06, "loss": 0.015, "step": 2340 }, { "epoch": 0.4649915582480882, "grad_norm": 0.5597168255035329, "learning_rate": 5.80726762405604e-06, "loss": 0.014, "step": 2341 }, { "epoch": 0.4651901877048366, "grad_norm": 0.9135852247092504, "learning_rate": 5.80409214378317e-06, "loss": 0.0199, "step": 2342 }, { "epoch": 0.4653888171615851, "grad_norm": 0.6722348948820552, "learning_rate": 5.800916330536549e-06, "loss": 0.0111, "step": 2343 }, { "epoch": 0.4655874466183335, "grad_norm": 0.2794802612129201, "learning_rate": 5.7977401856312775e-06, "loss": 0.0081, "step": 2344 }, { "epoch": 0.46578607607508193, "grad_norm": 0.4778286582110137, "learning_rate": 5.794563710382595e-06, "loss": 0.0113, "step": 2345 }, { "epoch": 0.46598470553183036, "grad_norm": 0.3799982420436121, "learning_rate": 5.791386906105875e-06, "loss": 0.0112, "step": 2346 }, { "epoch": 0.4661833349885788, "grad_norm": 0.5794990884883824, "learning_rate": 5.788209774116629e-06, "loss": 0.0167, "step": 2347 }, { "epoch": 0.46638196444532726, "grad_norm": 0.9348715395945014, "learning_rate": 5.785032315730506e-06, "loss": 0.0248, "step": 2348 }, { "epoch": 0.4665805939020757, "grad_norm": 0.5055416806585602, "learning_rate": 5.781854532263287e-06, "loss": 0.0158, "step": 2349 }, { "epoch": 0.4667792233588241, "grad_norm": 0.6028354260098756, "learning_rate": 5.778676425030888e-06, "loss": 0.0103, "step": 2350 }, { "epoch": 0.4669778528155725, "grad_norm": 0.42355897766306966, "learning_rate": 5.775497995349363e-06, "loss": 0.0058, "step": 2351 }, { "epoch": 0.467176482272321, "grad_norm": 0.30607023801120187, "learning_rate": 5.772319244534893e-06, "loss": 0.0116, "step": 2352 }, { "epoch": 0.4673751117290694, "grad_norm": 0.8992242787812346, "learning_rate": 5.769140173903799e-06, "loss": 0.0221, "step": 2353 }, { "epoch": 0.46757374118581785, "grad_norm": 0.8268467494450803, "learning_rate": 5.765960784772527e-06, "loss": 0.01, "step": 2354 }, { "epoch": 0.46777237064256627, "grad_norm": 0.31652768813051085, "learning_rate": 5.762781078457662e-06, "loss": 0.0071, "step": 2355 }, { "epoch": 0.46797100009931475, "grad_norm": 0.5660464775694583, "learning_rate": 5.759601056275916e-06, "loss": 0.0099, "step": 2356 }, { "epoch": 0.46816962955606317, "grad_norm": 0.6382863746802383, "learning_rate": 5.756420719544135e-06, "loss": 0.0161, "step": 2357 }, { "epoch": 0.4683682590128116, "grad_norm": 0.6570237568395549, "learning_rate": 5.75324006957929e-06, "loss": 0.0151, "step": 2358 }, { "epoch": 0.46856688846956, "grad_norm": 0.455513564680463, "learning_rate": 5.7500591076984865e-06, "loss": 0.0137, "step": 2359 }, { "epoch": 0.4687655179263085, "grad_norm": 0.3782815896106589, "learning_rate": 5.746877835218959e-06, "loss": 0.0082, "step": 2360 }, { "epoch": 0.4689641473830569, "grad_norm": 0.25011529623111944, "learning_rate": 5.7436962534580665e-06, "loss": 0.0099, "step": 2361 }, { "epoch": 0.46916277683980534, "grad_norm": 0.2755162028039524, "learning_rate": 5.7405143637333e-06, "loss": 0.0074, "step": 2362 }, { "epoch": 0.46936140629655376, "grad_norm": 0.4887019159723483, "learning_rate": 5.737332167362278e-06, "loss": 0.0123, "step": 2363 }, { "epoch": 0.46956003575330224, "grad_norm": 0.6804047453581535, "learning_rate": 5.734149665662744e-06, "loss": 0.0099, "step": 2364 }, { "epoch": 0.46975866521005066, "grad_norm": 0.8398249561968816, "learning_rate": 5.730966859952568e-06, "loss": 0.0193, "step": 2365 }, { "epoch": 0.4699572946667991, "grad_norm": 0.30808476371188004, "learning_rate": 5.727783751549748e-06, "loss": 0.0101, "step": 2366 }, { "epoch": 0.4701559241235475, "grad_norm": 0.40427106496463305, "learning_rate": 5.724600341772404e-06, "loss": 0.0072, "step": 2367 }, { "epoch": 0.47035455358029593, "grad_norm": 0.5655774899898666, "learning_rate": 5.721416631938785e-06, "loss": 0.0161, "step": 2368 }, { "epoch": 0.4705531830370444, "grad_norm": 0.6325786828061096, "learning_rate": 5.718232623367262e-06, "loss": 0.0183, "step": 2369 }, { "epoch": 0.47075181249379283, "grad_norm": 0.5808533874064912, "learning_rate": 5.715048317376327e-06, "loss": 0.0074, "step": 2370 }, { "epoch": 0.47095044195054125, "grad_norm": 0.8445148294332612, "learning_rate": 5.7118637152846015e-06, "loss": 0.0161, "step": 2371 }, { "epoch": 0.4711490714072897, "grad_norm": 0.4846773932797683, "learning_rate": 5.708678818410823e-06, "loss": 0.0057, "step": 2372 }, { "epoch": 0.47134770086403815, "grad_norm": 0.4942960720168355, "learning_rate": 5.705493628073856e-06, "loss": 0.0163, "step": 2373 }, { "epoch": 0.4715463303207866, "grad_norm": 0.2926160774657436, "learning_rate": 5.702308145592684e-06, "loss": 0.0084, "step": 2374 }, { "epoch": 0.471744959777535, "grad_norm": 0.36092426480142087, "learning_rate": 5.699122372286413e-06, "loss": 0.0156, "step": 2375 }, { "epoch": 0.4719435892342834, "grad_norm": 0.971391873556755, "learning_rate": 5.6959363094742684e-06, "loss": 0.0209, "step": 2376 }, { "epoch": 0.4721422186910319, "grad_norm": 0.5192492237273335, "learning_rate": 5.692749958475593e-06, "loss": 0.0158, "step": 2377 }, { "epoch": 0.4723408481477803, "grad_norm": 0.37516394436059797, "learning_rate": 5.689563320609854e-06, "loss": 0.012, "step": 2378 }, { "epoch": 0.47253947760452875, "grad_norm": 0.7321340081454777, "learning_rate": 5.686376397196635e-06, "loss": 0.0095, "step": 2379 }, { "epoch": 0.47273810706127717, "grad_norm": 0.6380956047969817, "learning_rate": 5.683189189555637e-06, "loss": 0.0094, "step": 2380 }, { "epoch": 0.47293673651802565, "grad_norm": 0.5714109992216438, "learning_rate": 5.68000169900668e-06, "loss": 0.008, "step": 2381 }, { "epoch": 0.47313536597477407, "grad_norm": 0.773380440894765, "learning_rate": 5.6768139268697e-06, "loss": 0.0152, "step": 2382 }, { "epoch": 0.4733339954315225, "grad_norm": 0.6960015446023513, "learning_rate": 5.673625874464751e-06, "loss": 0.0217, "step": 2383 }, { "epoch": 0.4735326248882709, "grad_norm": 0.6037591137291943, "learning_rate": 5.670437543112003e-06, "loss": 0.0136, "step": 2384 }, { "epoch": 0.4737312543450194, "grad_norm": 0.541539265791468, "learning_rate": 5.667248934131739e-06, "loss": 0.0087, "step": 2385 }, { "epoch": 0.4739298838017678, "grad_norm": 0.4801580538978032, "learning_rate": 5.6640600488443585e-06, "loss": 0.0106, "step": 2386 }, { "epoch": 0.47412851325851624, "grad_norm": 0.4106221925173567, "learning_rate": 5.660870888570378e-06, "loss": 0.0121, "step": 2387 }, { "epoch": 0.47432714271526466, "grad_norm": 1.2008332110233264, "learning_rate": 5.657681454630424e-06, "loss": 0.0164, "step": 2388 }, { "epoch": 0.4745257721720131, "grad_norm": 0.46847613164327123, "learning_rate": 5.654491748345238e-06, "loss": 0.0112, "step": 2389 }, { "epoch": 0.47472440162876156, "grad_norm": 0.4866905116366235, "learning_rate": 5.651301771035675e-06, "loss": 0.0123, "step": 2390 }, { "epoch": 0.47492303108551, "grad_norm": 0.8229938698433211, "learning_rate": 5.6481115240227005e-06, "loss": 0.0144, "step": 2391 }, { "epoch": 0.4751216605422584, "grad_norm": 0.9137037442295208, "learning_rate": 5.644921008627391e-06, "loss": 0.0151, "step": 2392 }, { "epoch": 0.47532028999900683, "grad_norm": 0.4876172441311117, "learning_rate": 5.6417302261709404e-06, "loss": 0.0095, "step": 2393 }, { "epoch": 0.4755189194557553, "grad_norm": 0.37524536677463444, "learning_rate": 5.638539177974645e-06, "loss": 0.0121, "step": 2394 }, { "epoch": 0.47571754891250373, "grad_norm": 0.7581160723528595, "learning_rate": 5.635347865359915e-06, "loss": 0.0107, "step": 2395 }, { "epoch": 0.47591617836925215, "grad_norm": 0.4697124876851263, "learning_rate": 5.632156289648272e-06, "loss": 0.0141, "step": 2396 }, { "epoch": 0.4761148078260006, "grad_norm": 1.2004601762194573, "learning_rate": 5.62896445216134e-06, "loss": 0.0178, "step": 2397 }, { "epoch": 0.47631343728274905, "grad_norm": 0.44792347789247144, "learning_rate": 5.6257723542208595e-06, "loss": 0.0094, "step": 2398 }, { "epoch": 0.4765120667394975, "grad_norm": 0.6253105011573149, "learning_rate": 5.622579997148674e-06, "loss": 0.0104, "step": 2399 }, { "epoch": 0.4767106961962459, "grad_norm": 0.38725763342264224, "learning_rate": 5.619387382266734e-06, "loss": 0.0181, "step": 2400 }, { "epoch": 0.4769093256529943, "grad_norm": 0.5602962469904611, "learning_rate": 5.6161945108971005e-06, "loss": 0.0195, "step": 2401 }, { "epoch": 0.4771079551097428, "grad_norm": 0.48104235373399207, "learning_rate": 5.6130013843619366e-06, "loss": 0.0197, "step": 2402 }, { "epoch": 0.4773065845664912, "grad_norm": 0.3039832053300856, "learning_rate": 5.609808003983513e-06, "loss": 0.0067, "step": 2403 }, { "epoch": 0.47750521402323964, "grad_norm": 0.7235246350240075, "learning_rate": 5.606614371084206e-06, "loss": 0.0193, "step": 2404 }, { "epoch": 0.47770384347998807, "grad_norm": 0.547167273803981, "learning_rate": 5.603420486986495e-06, "loss": 0.0059, "step": 2405 }, { "epoch": 0.47790247293673654, "grad_norm": 0.5020367397204322, "learning_rate": 5.600226353012965e-06, "loss": 0.0124, "step": 2406 }, { "epoch": 0.47810110239348497, "grad_norm": 0.29441250473742775, "learning_rate": 5.597031970486304e-06, "loss": 0.0089, "step": 2407 }, { "epoch": 0.4782997318502334, "grad_norm": 0.3051620947194149, "learning_rate": 5.593837340729302e-06, "loss": 0.0102, "step": 2408 }, { "epoch": 0.4784983613069818, "grad_norm": 0.46639149386889667, "learning_rate": 5.590642465064852e-06, "loss": 0.0154, "step": 2409 }, { "epoch": 0.47869699076373023, "grad_norm": 0.6356240089361235, "learning_rate": 5.587447344815946e-06, "loss": 0.0158, "step": 2410 }, { "epoch": 0.4788956202204787, "grad_norm": 0.7929409757443711, "learning_rate": 5.584251981305685e-06, "loss": 0.016, "step": 2411 }, { "epoch": 0.47909424967722714, "grad_norm": 0.7886868857089763, "learning_rate": 5.581056375857263e-06, "loss": 0.0158, "step": 2412 }, { "epoch": 0.47929287913397556, "grad_norm": 0.6545632498798621, "learning_rate": 5.577860529793978e-06, "loss": 0.0142, "step": 2413 }, { "epoch": 0.479491508590724, "grad_norm": 0.6445078069693473, "learning_rate": 5.574664444439226e-06, "loss": 0.0128, "step": 2414 }, { "epoch": 0.47969013804747246, "grad_norm": 0.4114117395918578, "learning_rate": 5.571468121116504e-06, "loss": 0.0118, "step": 2415 }, { "epoch": 0.4798887675042209, "grad_norm": 0.49308068014298745, "learning_rate": 5.568271561149404e-06, "loss": 0.0199, "step": 2416 }, { "epoch": 0.4800873969609693, "grad_norm": 0.8300192404106222, "learning_rate": 5.56507476586162e-06, "loss": 0.0163, "step": 2417 }, { "epoch": 0.4802860264177177, "grad_norm": 0.7203031117955251, "learning_rate": 5.561877736576942e-06, "loss": 0.0192, "step": 2418 }, { "epoch": 0.4804846558744662, "grad_norm": 0.63747435264011, "learning_rate": 5.558680474619255e-06, "loss": 0.0116, "step": 2419 }, { "epoch": 0.4806832853312146, "grad_norm": 0.6686966563642719, "learning_rate": 5.5554829813125446e-06, "loss": 0.0137, "step": 2420 }, { "epoch": 0.48088191478796305, "grad_norm": 0.7236264839110779, "learning_rate": 5.552285257980887e-06, "loss": 0.0118, "step": 2421 }, { "epoch": 0.48108054424471147, "grad_norm": 0.3489337234437465, "learning_rate": 5.549087305948455e-06, "loss": 0.0125, "step": 2422 }, { "epoch": 0.48127917370145995, "grad_norm": 0.6402623274118869, "learning_rate": 5.545889126539522e-06, "loss": 0.0158, "step": 2423 }, { "epoch": 0.4814778031582084, "grad_norm": 0.6755621454969479, "learning_rate": 5.542690721078447e-06, "loss": 0.018, "step": 2424 }, { "epoch": 0.4816764326149568, "grad_norm": 0.3813390848817102, "learning_rate": 5.5394920908896895e-06, "loss": 0.0082, "step": 2425 }, { "epoch": 0.4818750620717052, "grad_norm": 0.5944323310591813, "learning_rate": 5.536293237297796e-06, "loss": 0.0086, "step": 2426 }, { "epoch": 0.4820736915284537, "grad_norm": 0.5823022738844985, "learning_rate": 5.533094161627412e-06, "loss": 0.0129, "step": 2427 }, { "epoch": 0.4822723209852021, "grad_norm": 0.24109484982366644, "learning_rate": 5.529894865203267e-06, "loss": 0.0059, "step": 2428 }, { "epoch": 0.48247095044195054, "grad_norm": 0.6279027842239673, "learning_rate": 5.52669534935019e-06, "loss": 0.0151, "step": 2429 }, { "epoch": 0.48266957989869896, "grad_norm": 0.8846786775845136, "learning_rate": 5.523495615393095e-06, "loss": 0.0208, "step": 2430 }, { "epoch": 0.4828682093554474, "grad_norm": 0.38091540228676024, "learning_rate": 5.520295664656992e-06, "loss": 0.0069, "step": 2431 }, { "epoch": 0.48306683881219586, "grad_norm": 0.30556096556441176, "learning_rate": 5.517095498466976e-06, "loss": 0.0061, "step": 2432 }, { "epoch": 0.4832654682689443, "grad_norm": 0.8509282833418818, "learning_rate": 5.51389511814823e-06, "loss": 0.0135, "step": 2433 }, { "epoch": 0.4834640977256927, "grad_norm": 0.37578690343483734, "learning_rate": 5.51069452502603e-06, "loss": 0.0091, "step": 2434 }, { "epoch": 0.48366272718244113, "grad_norm": 0.46966875266445063, "learning_rate": 5.5074937204257385e-06, "loss": 0.0147, "step": 2435 }, { "epoch": 0.4838613566391896, "grad_norm": 0.509057592751693, "learning_rate": 5.504292705672807e-06, "loss": 0.0142, "step": 2436 }, { "epoch": 0.48405998609593803, "grad_norm": 0.472350938168894, "learning_rate": 5.5010914820927695e-06, "loss": 0.01, "step": 2437 }, { "epoch": 0.48425861555268646, "grad_norm": 0.4293231269108412, "learning_rate": 5.497890051011252e-06, "loss": 0.009, "step": 2438 }, { "epoch": 0.4844572450094349, "grad_norm": 0.6277901027183694, "learning_rate": 5.494688413753964e-06, "loss": 0.0085, "step": 2439 }, { "epoch": 0.48465587446618336, "grad_norm": 0.4248433246133367, "learning_rate": 5.491486571646698e-06, "loss": 0.0103, "step": 2440 }, { "epoch": 0.4848545039229318, "grad_norm": 0.7745270788422335, "learning_rate": 5.488284526015335e-06, "loss": 0.0169, "step": 2441 }, { "epoch": 0.4850531333796802, "grad_norm": 0.9595839872373445, "learning_rate": 5.485082278185838e-06, "loss": 0.0123, "step": 2442 }, { "epoch": 0.4852517628364286, "grad_norm": 0.24937919516680546, "learning_rate": 5.481879829484256e-06, "loss": 0.0072, "step": 2443 }, { "epoch": 0.4854503922931771, "grad_norm": 0.5052250608119354, "learning_rate": 5.47867718123672e-06, "loss": 0.0126, "step": 2444 }, { "epoch": 0.4856490217499255, "grad_norm": 0.6095834648470921, "learning_rate": 5.475474334769443e-06, "loss": 0.0127, "step": 2445 }, { "epoch": 0.48584765120667395, "grad_norm": 0.5239132674664736, "learning_rate": 5.47227129140872e-06, "loss": 0.0109, "step": 2446 }, { "epoch": 0.48604628066342237, "grad_norm": 0.40049557705857963, "learning_rate": 5.4690680524809246e-06, "loss": 0.0072, "step": 2447 }, { "epoch": 0.48624491012017085, "grad_norm": 0.6583755900248895, "learning_rate": 5.465864619312522e-06, "loss": 0.0112, "step": 2448 }, { "epoch": 0.48644353957691927, "grad_norm": 0.511543136523891, "learning_rate": 5.462660993230045e-06, "loss": 0.0111, "step": 2449 }, { "epoch": 0.4866421690336677, "grad_norm": 0.6333349669994685, "learning_rate": 5.459457175560117e-06, "loss": 0.0142, "step": 2450 }, { "epoch": 0.4868407984904161, "grad_norm": 0.4081955737456362, "learning_rate": 5.456253167629431e-06, "loss": 0.0106, "step": 2451 }, { "epoch": 0.48703942794716454, "grad_norm": 0.45569800327311816, "learning_rate": 5.4530489707647646e-06, "loss": 0.0122, "step": 2452 }, { "epoch": 0.487238057403913, "grad_norm": 0.49468712119644775, "learning_rate": 5.449844586292974e-06, "loss": 0.0163, "step": 2453 }, { "epoch": 0.48743668686066144, "grad_norm": 1.3504172026817023, "learning_rate": 5.446640015540989e-06, "loss": 0.0165, "step": 2454 }, { "epoch": 0.48763531631740986, "grad_norm": 0.48452241295226994, "learning_rate": 5.443435259835822e-06, "loss": 0.0116, "step": 2455 }, { "epoch": 0.4878339457741583, "grad_norm": 0.5427977276769387, "learning_rate": 5.44023032050456e-06, "loss": 0.0129, "step": 2456 }, { "epoch": 0.48803257523090676, "grad_norm": 0.3796715316602044, "learning_rate": 5.437025198874361e-06, "loss": 0.0107, "step": 2457 }, { "epoch": 0.4882312046876552, "grad_norm": 0.9111058024326351, "learning_rate": 5.433819896272464e-06, "loss": 0.0157, "step": 2458 }, { "epoch": 0.4884298341444036, "grad_norm": 0.490889205825391, "learning_rate": 5.4306144140261845e-06, "loss": 0.009, "step": 2459 }, { "epoch": 0.48862846360115203, "grad_norm": 0.6591819950885145, "learning_rate": 5.427408753462905e-06, "loss": 0.0151, "step": 2460 }, { "epoch": 0.4888270930579005, "grad_norm": 0.3574688903762408, "learning_rate": 5.42420291591009e-06, "loss": 0.0145, "step": 2461 }, { "epoch": 0.48902572251464893, "grad_norm": 0.7539287197857844, "learning_rate": 5.420996902695273e-06, "loss": 0.0134, "step": 2462 }, { "epoch": 0.48922435197139735, "grad_norm": 0.6981359145878261, "learning_rate": 5.4177907151460585e-06, "loss": 0.0119, "step": 2463 }, { "epoch": 0.4894229814281458, "grad_norm": 0.606244215044152, "learning_rate": 5.414584354590129e-06, "loss": 0.0111, "step": 2464 }, { "epoch": 0.48962161088489425, "grad_norm": 0.5459437232022195, "learning_rate": 5.411377822355233e-06, "loss": 0.0091, "step": 2465 }, { "epoch": 0.4898202403416427, "grad_norm": 0.5279168001785495, "learning_rate": 5.408171119769192e-06, "loss": 0.0112, "step": 2466 }, { "epoch": 0.4900188697983911, "grad_norm": 0.4695727703267886, "learning_rate": 5.4049642481598985e-06, "loss": 0.0117, "step": 2467 }, { "epoch": 0.4902174992551395, "grad_norm": 0.43550533403024305, "learning_rate": 5.401757208855317e-06, "loss": 0.0048, "step": 2468 }, { "epoch": 0.490416128711888, "grad_norm": 0.42677326587153497, "learning_rate": 5.398550003183478e-06, "loss": 0.0081, "step": 2469 }, { "epoch": 0.4906147581686364, "grad_norm": 0.6348135613049047, "learning_rate": 5.39534263247248e-06, "loss": 0.0209, "step": 2470 }, { "epoch": 0.49081338762538484, "grad_norm": 0.5524103691775981, "learning_rate": 5.392135098050495e-06, "loss": 0.0119, "step": 2471 }, { "epoch": 0.49101201708213327, "grad_norm": 0.7108743625403123, "learning_rate": 5.388927401245757e-06, "loss": 0.0176, "step": 2472 }, { "epoch": 0.4912106465388817, "grad_norm": 0.4353791961565994, "learning_rate": 5.385719543386574e-06, "loss": 0.0106, "step": 2473 }, { "epoch": 0.49140927599563017, "grad_norm": 0.6306640021625024, "learning_rate": 5.3825115258013126e-06, "loss": 0.0103, "step": 2474 }, { "epoch": 0.4916079054523786, "grad_norm": 0.5210772415889535, "learning_rate": 5.379303349818412e-06, "loss": 0.009, "step": 2475 }, { "epoch": 0.491806534909127, "grad_norm": 0.414410686373316, "learning_rate": 5.376095016766374e-06, "loss": 0.0089, "step": 2476 }, { "epoch": 0.49200516436587544, "grad_norm": 0.4551862364482468, "learning_rate": 5.372886527973767e-06, "loss": 0.0082, "step": 2477 }, { "epoch": 0.4922037938226239, "grad_norm": 0.7775904217757333, "learning_rate": 5.369677884769221e-06, "loss": 0.0177, "step": 2478 }, { "epoch": 0.49240242327937234, "grad_norm": 1.0123752948840063, "learning_rate": 5.366469088481433e-06, "loss": 0.0137, "step": 2479 }, { "epoch": 0.49260105273612076, "grad_norm": 0.4494991849679043, "learning_rate": 5.3632601404391635e-06, "loss": 0.0094, "step": 2480 }, { "epoch": 0.4927996821928692, "grad_norm": 0.7550282831087276, "learning_rate": 5.360051041971234e-06, "loss": 0.014, "step": 2481 }, { "epoch": 0.49299831164961766, "grad_norm": 0.46682384084711037, "learning_rate": 5.356841794406527e-06, "loss": 0.0151, "step": 2482 }, { "epoch": 0.4931969411063661, "grad_norm": 0.4115591498845582, "learning_rate": 5.353632399073991e-06, "loss": 0.0113, "step": 2483 }, { "epoch": 0.4933955705631145, "grad_norm": 0.7247787357221698, "learning_rate": 5.350422857302633e-06, "loss": 0.0134, "step": 2484 }, { "epoch": 0.4935942000198629, "grad_norm": 0.8578245479997127, "learning_rate": 5.347213170421519e-06, "loss": 0.0148, "step": 2485 }, { "epoch": 0.4937928294766114, "grad_norm": 0.4699237127768561, "learning_rate": 5.344003339759781e-06, "loss": 0.0097, "step": 2486 }, { "epoch": 0.49399145893335983, "grad_norm": 0.4744010503049874, "learning_rate": 5.340793366646604e-06, "loss": 0.0115, "step": 2487 }, { "epoch": 0.49419008839010825, "grad_norm": 0.636008252077129, "learning_rate": 5.337583252411235e-06, "loss": 0.0102, "step": 2488 }, { "epoch": 0.4943887178468567, "grad_norm": 0.47474845684362593, "learning_rate": 5.33437299838298e-06, "loss": 0.0124, "step": 2489 }, { "epoch": 0.49458734730360515, "grad_norm": 1.0566024501358895, "learning_rate": 5.3311626058911994e-06, "loss": 0.0151, "step": 2490 }, { "epoch": 0.4947859767603536, "grad_norm": 0.36678370138916344, "learning_rate": 5.327952076265317e-06, "loss": 0.0147, "step": 2491 }, { "epoch": 0.494984606217102, "grad_norm": 0.25272585797585356, "learning_rate": 5.324741410834807e-06, "loss": 0.0067, "step": 2492 }, { "epoch": 0.4951832356738504, "grad_norm": 0.6048164575103134, "learning_rate": 5.321530610929204e-06, "loss": 0.0113, "step": 2493 }, { "epoch": 0.49538186513059884, "grad_norm": 0.7695263702174417, "learning_rate": 5.318319677878098e-06, "loss": 0.0142, "step": 2494 }, { "epoch": 0.4955804945873473, "grad_norm": 0.33525412117269704, "learning_rate": 5.315108613011132e-06, "loss": 0.0124, "step": 2495 }, { "epoch": 0.49577912404409574, "grad_norm": 0.7878389738793319, "learning_rate": 5.311897417658005e-06, "loss": 0.0151, "step": 2496 }, { "epoch": 0.49597775350084417, "grad_norm": 0.4578947485727716, "learning_rate": 5.308686093148467e-06, "loss": 0.0115, "step": 2497 }, { "epoch": 0.4961763829575926, "grad_norm": 1.030479841136134, "learning_rate": 5.305474640812331e-06, "loss": 0.0139, "step": 2498 }, { "epoch": 0.49637501241434107, "grad_norm": 0.796961987633012, "learning_rate": 5.30226306197945e-06, "loss": 0.0109, "step": 2499 }, { "epoch": 0.4965736418710895, "grad_norm": 0.782382140972235, "learning_rate": 5.299051357979738e-06, "loss": 0.0187, "step": 2500 }, { "epoch": 0.4967722713278379, "grad_norm": 1.121114404342789, "learning_rate": 5.295839530143159e-06, "loss": 0.0178, "step": 2501 }, { "epoch": 0.49697090078458633, "grad_norm": 0.7261947822033853, "learning_rate": 5.2926275797997255e-06, "loss": 0.0188, "step": 2502 }, { "epoch": 0.4971695302413348, "grad_norm": 0.46923022153419347, "learning_rate": 5.289415508279505e-06, "loss": 0.0073, "step": 2503 }, { "epoch": 0.49736815969808323, "grad_norm": 0.7182836113385063, "learning_rate": 5.286203316912613e-06, "loss": 0.015, "step": 2504 }, { "epoch": 0.49756678915483166, "grad_norm": 0.603411625751702, "learning_rate": 5.282991007029213e-06, "loss": 0.0106, "step": 2505 }, { "epoch": 0.4977654186115801, "grad_norm": 0.8174796617956737, "learning_rate": 5.27977857995952e-06, "loss": 0.0149, "step": 2506 }, { "epoch": 0.49796404806832856, "grad_norm": 1.0146932905608117, "learning_rate": 5.276566037033798e-06, "loss": 0.0141, "step": 2507 }, { "epoch": 0.498162677525077, "grad_norm": 0.4163900253441471, "learning_rate": 5.273353379582357e-06, "loss": 0.0108, "step": 2508 }, { "epoch": 0.4983613069818254, "grad_norm": 0.763281658551113, "learning_rate": 5.270140608935555e-06, "loss": 0.015, "step": 2509 }, { "epoch": 0.4985599364385738, "grad_norm": 0.22259719035648037, "learning_rate": 5.266927726423797e-06, "loss": 0.0087, "step": 2510 }, { "epoch": 0.4987585658953223, "grad_norm": 0.42395383851091467, "learning_rate": 5.263714733377535e-06, "loss": 0.0133, "step": 2511 }, { "epoch": 0.4989571953520707, "grad_norm": 0.5328797312516291, "learning_rate": 5.260501631127266e-06, "loss": 0.012, "step": 2512 }, { "epoch": 0.49915582480881915, "grad_norm": 0.4743705135796904, "learning_rate": 5.257288421003534e-06, "loss": 0.0196, "step": 2513 }, { "epoch": 0.49935445426556757, "grad_norm": 0.5093401512061038, "learning_rate": 5.254075104336922e-06, "loss": 0.0082, "step": 2514 }, { "epoch": 0.499553083722316, "grad_norm": 0.9029692999709029, "learning_rate": 5.250861682458066e-06, "loss": 0.0249, "step": 2515 }, { "epoch": 0.49975171317906447, "grad_norm": 1.777870716049857, "learning_rate": 5.247648156697637e-06, "loss": 0.0258, "step": 2516 }, { "epoch": 0.4999503426358129, "grad_norm": 0.5892960441699706, "learning_rate": 5.2444345283863555e-06, "loss": 0.008, "step": 2517 }, { "epoch": 0.5001489720925614, "grad_norm": 0.44368783450479404, "learning_rate": 5.241220798854979e-06, "loss": 0.0097, "step": 2518 }, { "epoch": 0.5003476015493098, "grad_norm": 0.4921667565769311, "learning_rate": 5.238006969434313e-06, "loss": 0.0111, "step": 2519 }, { "epoch": 0.5005462310060582, "grad_norm": 0.5508976317775016, "learning_rate": 5.234793041455199e-06, "loss": 0.0143, "step": 2520 }, { "epoch": 0.5007448604628066, "grad_norm": 0.7142397409065645, "learning_rate": 5.2315790162485204e-06, "loss": 0.0141, "step": 2521 }, { "epoch": 0.5009434899195551, "grad_norm": 0.47566862647539554, "learning_rate": 5.228364895145203e-06, "loss": 0.0162, "step": 2522 }, { "epoch": 0.5011421193763035, "grad_norm": 0.9705471048491111, "learning_rate": 5.225150679476212e-06, "loss": 0.0165, "step": 2523 }, { "epoch": 0.5013407488330519, "grad_norm": 0.5751231880679607, "learning_rate": 5.22193637057255e-06, "loss": 0.0138, "step": 2524 }, { "epoch": 0.5015393782898003, "grad_norm": 0.40697823166729785, "learning_rate": 5.218721969765259e-06, "loss": 0.0055, "step": 2525 }, { "epoch": 0.5017380077465489, "grad_norm": 0.4094426383199233, "learning_rate": 5.21550747838542e-06, "loss": 0.0122, "step": 2526 }, { "epoch": 0.5019366372032973, "grad_norm": 0.4031192231606706, "learning_rate": 5.21229289776415e-06, "loss": 0.0092, "step": 2527 }, { "epoch": 0.5021352666600457, "grad_norm": 0.35911635894078986, "learning_rate": 5.209078229232603e-06, "loss": 0.0068, "step": 2528 }, { "epoch": 0.5023338961167941, "grad_norm": 0.7098839446706783, "learning_rate": 5.205863474121972e-06, "loss": 0.0152, "step": 2529 }, { "epoch": 0.5025325255735426, "grad_norm": 0.3375041096332709, "learning_rate": 5.2026486337634804e-06, "loss": 0.0089, "step": 2530 }, { "epoch": 0.502731155030291, "grad_norm": 0.41527798701142316, "learning_rate": 5.199433709488394e-06, "loss": 0.0141, "step": 2531 }, { "epoch": 0.5029297844870394, "grad_norm": 0.39038459676259, "learning_rate": 5.196218702628008e-06, "loss": 0.0115, "step": 2532 }, { "epoch": 0.5031284139437878, "grad_norm": 0.6528062096390664, "learning_rate": 5.193003614513653e-06, "loss": 0.0231, "step": 2533 }, { "epoch": 0.5033270434005362, "grad_norm": 1.7917958287890257, "learning_rate": 5.189788446476695e-06, "loss": 0.0243, "step": 2534 }, { "epoch": 0.5035256728572848, "grad_norm": 0.44912076622692515, "learning_rate": 5.186573199848532e-06, "loss": 0.0106, "step": 2535 }, { "epoch": 0.5037243023140332, "grad_norm": 0.40776135095164723, "learning_rate": 5.183357875960592e-06, "loss": 0.0122, "step": 2536 }, { "epoch": 0.5039229317707816, "grad_norm": 0.5810563884951965, "learning_rate": 5.1801424761443404e-06, "loss": 0.0164, "step": 2537 }, { "epoch": 0.50412156122753, "grad_norm": 0.7568118365890487, "learning_rate": 5.17692700173127e-06, "loss": 0.0133, "step": 2538 }, { "epoch": 0.5043201906842785, "grad_norm": 0.42687995689269276, "learning_rate": 5.173711454052905e-06, "loss": 0.0095, "step": 2539 }, { "epoch": 0.5045188201410269, "grad_norm": 0.4374548051838743, "learning_rate": 5.170495834440802e-06, "loss": 0.0121, "step": 2540 }, { "epoch": 0.5047174495977753, "grad_norm": 0.4622644074029037, "learning_rate": 5.167280144226543e-06, "loss": 0.0118, "step": 2541 }, { "epoch": 0.5049160790545237, "grad_norm": 0.7577713982849161, "learning_rate": 5.164064384741745e-06, "loss": 0.0126, "step": 2542 }, { "epoch": 0.5051147085112723, "grad_norm": 0.3726201408129837, "learning_rate": 5.160848557318049e-06, "loss": 0.0127, "step": 2543 }, { "epoch": 0.5053133379680207, "grad_norm": 0.797286065504275, "learning_rate": 5.157632663287126e-06, "loss": 0.0126, "step": 2544 }, { "epoch": 0.5055119674247691, "grad_norm": 0.4918743375069235, "learning_rate": 5.1544167039806755e-06, "loss": 0.0082, "step": 2545 }, { "epoch": 0.5057105968815175, "grad_norm": 1.8250694853729397, "learning_rate": 5.1512006807304235e-06, "loss": 0.0155, "step": 2546 }, { "epoch": 0.505909226338266, "grad_norm": 0.6131857955146129, "learning_rate": 5.1479845948681185e-06, "loss": 0.0169, "step": 2547 }, { "epoch": 0.5061078557950144, "grad_norm": 0.48035054079034195, "learning_rate": 5.144768447725544e-06, "loss": 0.009, "step": 2548 }, { "epoch": 0.5063064852517628, "grad_norm": 0.3214285229524216, "learning_rate": 5.1415522406344976e-06, "loss": 0.0101, "step": 2549 }, { "epoch": 0.5065051147085112, "grad_norm": 0.7075802214664444, "learning_rate": 5.1383359749268114e-06, "loss": 0.0066, "step": 2550 }, { "epoch": 0.5067037441652598, "grad_norm": 0.5521771461028792, "learning_rate": 5.135119651934337e-06, "loss": 0.0089, "step": 2551 }, { "epoch": 0.5069023736220082, "grad_norm": 0.25160655768692114, "learning_rate": 5.131903272988951e-06, "loss": 0.0114, "step": 2552 }, { "epoch": 0.5071010030787566, "grad_norm": 0.49214589306265416, "learning_rate": 5.128686839422548e-06, "loss": 0.0139, "step": 2553 }, { "epoch": 0.507299632535505, "grad_norm": 0.3311557204020224, "learning_rate": 5.125470352567057e-06, "loss": 0.0085, "step": 2554 }, { "epoch": 0.5074982619922535, "grad_norm": 0.5102290741270068, "learning_rate": 5.122253813754418e-06, "loss": 0.0135, "step": 2555 }, { "epoch": 0.5076968914490019, "grad_norm": 0.47249684319774726, "learning_rate": 5.119037224316596e-06, "loss": 0.0142, "step": 2556 }, { "epoch": 0.5078955209057503, "grad_norm": 0.605698552614918, "learning_rate": 5.115820585585579e-06, "loss": 0.0113, "step": 2557 }, { "epoch": 0.5080941503624987, "grad_norm": 0.6910282577333509, "learning_rate": 5.1126038988933745e-06, "loss": 0.0098, "step": 2558 }, { "epoch": 0.5082927798192471, "grad_norm": 0.3860296102305923, "learning_rate": 5.109387165572007e-06, "loss": 0.0149, "step": 2559 }, { "epoch": 0.5084914092759957, "grad_norm": 0.47180358188811444, "learning_rate": 5.106170386953524e-06, "loss": 0.0099, "step": 2560 }, { "epoch": 0.5086900387327441, "grad_norm": 0.30576774774595783, "learning_rate": 5.10295356436999e-06, "loss": 0.0077, "step": 2561 }, { "epoch": 0.5088886681894925, "grad_norm": 0.30760305826870893, "learning_rate": 5.099736699153489e-06, "loss": 0.0076, "step": 2562 }, { "epoch": 0.5090872976462409, "grad_norm": 0.3838863165764435, "learning_rate": 5.09651979263612e-06, "loss": 0.0117, "step": 2563 }, { "epoch": 0.5092859271029894, "grad_norm": 0.7031651252049763, "learning_rate": 5.093302846150003e-06, "loss": 0.0124, "step": 2564 }, { "epoch": 0.5094845565597378, "grad_norm": 0.2976625692750354, "learning_rate": 5.0900858610272686e-06, "loss": 0.0081, "step": 2565 }, { "epoch": 0.5096831860164862, "grad_norm": 0.7678894575982894, "learning_rate": 5.0868688386000705e-06, "loss": 0.0082, "step": 2566 }, { "epoch": 0.5098818154732346, "grad_norm": 1.202459097306564, "learning_rate": 5.083651780200573e-06, "loss": 0.0143, "step": 2567 }, { "epoch": 0.5100804449299832, "grad_norm": 0.7918825373152566, "learning_rate": 5.080434687160958e-06, "loss": 0.0194, "step": 2568 }, { "epoch": 0.5102790743867316, "grad_norm": 0.35999767545572886, "learning_rate": 5.077217560813419e-06, "loss": 0.0131, "step": 2569 }, { "epoch": 0.51047770384348, "grad_norm": 0.6835356858570736, "learning_rate": 5.074000402490166e-06, "loss": 0.0125, "step": 2570 }, { "epoch": 0.5106763333002284, "grad_norm": 0.8376402761684941, "learning_rate": 5.0707832135234196e-06, "loss": 0.0111, "step": 2571 }, { "epoch": 0.5108749627569769, "grad_norm": 0.4974372233299198, "learning_rate": 5.067565995245413e-06, "loss": 0.0094, "step": 2572 }, { "epoch": 0.5110735922137253, "grad_norm": 0.661553749690577, "learning_rate": 5.0643487489883995e-06, "loss": 0.0107, "step": 2573 }, { "epoch": 0.5112722216704737, "grad_norm": 0.6776284432083003, "learning_rate": 5.061131476084632e-06, "loss": 0.0138, "step": 2574 }, { "epoch": 0.5114708511272221, "grad_norm": 0.6037483195136601, "learning_rate": 5.057914177866381e-06, "loss": 0.0122, "step": 2575 }, { "epoch": 0.5116694805839705, "grad_norm": 0.5131467239719532, "learning_rate": 5.054696855665928e-06, "loss": 0.0103, "step": 2576 }, { "epoch": 0.5118681100407191, "grad_norm": 0.43566722341066894, "learning_rate": 5.051479510815561e-06, "loss": 0.0098, "step": 2577 }, { "epoch": 0.5120667394974675, "grad_norm": 0.45003515001134503, "learning_rate": 5.04826214464758e-06, "loss": 0.0122, "step": 2578 }, { "epoch": 0.5122653689542159, "grad_norm": 0.750722907248036, "learning_rate": 5.0450447584942945e-06, "loss": 0.0112, "step": 2579 }, { "epoch": 0.5124639984109643, "grad_norm": 0.6004993496158042, "learning_rate": 5.041827353688018e-06, "loss": 0.0185, "step": 2580 }, { "epoch": 0.5126626278677128, "grad_norm": 3.1872299490121687, "learning_rate": 5.038609931561077e-06, "loss": 0.0182, "step": 2581 }, { "epoch": 0.5128612573244612, "grad_norm": 0.57628867563776, "learning_rate": 5.035392493445802e-06, "loss": 0.0133, "step": 2582 }, { "epoch": 0.5130598867812096, "grad_norm": 0.4439739183814535, "learning_rate": 5.03217504067453e-06, "loss": 0.0083, "step": 2583 }, { "epoch": 0.513258516237958, "grad_norm": 0.29526566381496966, "learning_rate": 5.028957574579607e-06, "loss": 0.0122, "step": 2584 }, { "epoch": 0.5134571456947066, "grad_norm": 0.5866932164892713, "learning_rate": 5.02574009649338e-06, "loss": 0.0173, "step": 2585 }, { "epoch": 0.513655775151455, "grad_norm": 0.6315522926249754, "learning_rate": 5.0225226077482055e-06, "loss": 0.0196, "step": 2586 }, { "epoch": 0.5138544046082034, "grad_norm": 0.430607086857732, "learning_rate": 5.019305109676443e-06, "loss": 0.0118, "step": 2587 }, { "epoch": 0.5140530340649518, "grad_norm": 0.6939527777771256, "learning_rate": 5.016087603610454e-06, "loss": 0.013, "step": 2588 }, { "epoch": 0.5142516635217003, "grad_norm": 0.43910241298197344, "learning_rate": 5.012870090882604e-06, "loss": 0.0118, "step": 2589 }, { "epoch": 0.5144502929784487, "grad_norm": 0.49333214750546195, "learning_rate": 5.00965257282526e-06, "loss": 0.0104, "step": 2590 }, { "epoch": 0.5146489224351971, "grad_norm": 0.800628768602885, "learning_rate": 5.006435050770797e-06, "loss": 0.0112, "step": 2591 }, { "epoch": 0.5148475518919455, "grad_norm": 0.5851511459639136, "learning_rate": 5.003217526051586e-06, "loss": 0.012, "step": 2592 }, { "epoch": 0.5150461813486941, "grad_norm": 0.47041917943176503, "learning_rate": 5e-06, "loss": 0.0168, "step": 2593 }, { "epoch": 0.5152448108054425, "grad_norm": 0.47959696982675354, "learning_rate": 4.996782473948416e-06, "loss": 0.0132, "step": 2594 }, { "epoch": 0.5154434402621909, "grad_norm": 0.47813958473253837, "learning_rate": 4.993564949229204e-06, "loss": 0.0108, "step": 2595 }, { "epoch": 0.5156420697189393, "grad_norm": 0.3852993385433787, "learning_rate": 4.990347427174742e-06, "loss": 0.0089, "step": 2596 }, { "epoch": 0.5158406991756878, "grad_norm": 0.6016579451534518, "learning_rate": 4.987129909117398e-06, "loss": 0.0175, "step": 2597 }, { "epoch": 0.5160393286324362, "grad_norm": 0.45074106282002463, "learning_rate": 4.983912396389547e-06, "loss": 0.0094, "step": 2598 }, { "epoch": 0.5162379580891846, "grad_norm": 0.7101086980066497, "learning_rate": 4.980694890323558e-06, "loss": 0.0171, "step": 2599 }, { "epoch": 0.516436587545933, "grad_norm": 0.4153502818151871, "learning_rate": 4.977477392251794e-06, "loss": 0.0176, "step": 2600 }, { "epoch": 0.5166352170026814, "grad_norm": 0.9128192784068005, "learning_rate": 4.97425990350662e-06, "loss": 0.0119, "step": 2601 }, { "epoch": 0.51683384645943, "grad_norm": 0.532406897997197, "learning_rate": 4.971042425420394e-06, "loss": 0.0098, "step": 2602 }, { "epoch": 0.5170324759161784, "grad_norm": 0.6432723125325949, "learning_rate": 4.96782495932547e-06, "loss": 0.0125, "step": 2603 }, { "epoch": 0.5172311053729268, "grad_norm": 0.4441861132175126, "learning_rate": 4.964607506554199e-06, "loss": 0.0088, "step": 2604 }, { "epoch": 0.5174297348296752, "grad_norm": 0.6071744574706942, "learning_rate": 4.961390068438926e-06, "loss": 0.0125, "step": 2605 }, { "epoch": 0.5176283642864237, "grad_norm": 0.5144396277013178, "learning_rate": 4.958172646311985e-06, "loss": 0.0108, "step": 2606 }, { "epoch": 0.5178269937431721, "grad_norm": 0.27271582525812527, "learning_rate": 4.954955241505709e-06, "loss": 0.0057, "step": 2607 }, { "epoch": 0.5180256231999205, "grad_norm": 0.41602838022157057, "learning_rate": 4.951737855352422e-06, "loss": 0.014, "step": 2608 }, { "epoch": 0.5182242526566689, "grad_norm": 0.8548531908521733, "learning_rate": 4.948520489184441e-06, "loss": 0.0192, "step": 2609 }, { "epoch": 0.5184228821134175, "grad_norm": 0.7707419054286566, "learning_rate": 4.945303144334074e-06, "loss": 0.0156, "step": 2610 }, { "epoch": 0.5186215115701659, "grad_norm": 0.9002762306327531, "learning_rate": 4.94208582213362e-06, "loss": 0.0109, "step": 2611 }, { "epoch": 0.5188201410269143, "grad_norm": 0.5969255787289416, "learning_rate": 4.9388685239153696e-06, "loss": 0.013, "step": 2612 }, { "epoch": 0.5190187704836627, "grad_norm": 1.321061946411639, "learning_rate": 4.935651251011602e-06, "loss": 0.0265, "step": 2613 }, { "epoch": 0.5192173999404112, "grad_norm": 0.9544926651703668, "learning_rate": 4.9324340047545874e-06, "loss": 0.0232, "step": 2614 }, { "epoch": 0.5194160293971596, "grad_norm": 0.8724356174787311, "learning_rate": 4.929216786476582e-06, "loss": 0.0172, "step": 2615 }, { "epoch": 0.519614658853908, "grad_norm": 0.5481496452758235, "learning_rate": 4.925999597509836e-06, "loss": 0.0105, "step": 2616 }, { "epoch": 0.5198132883106564, "grad_norm": 0.5924502586289119, "learning_rate": 4.922782439186583e-06, "loss": 0.0126, "step": 2617 }, { "epoch": 0.5200119177674049, "grad_norm": 1.097058049346646, "learning_rate": 4.9195653128390436e-06, "loss": 0.0194, "step": 2618 }, { "epoch": 0.5202105472241534, "grad_norm": 0.34572029206305155, "learning_rate": 4.9163482197994275e-06, "loss": 0.0114, "step": 2619 }, { "epoch": 0.5204091766809018, "grad_norm": 0.6478854101066456, "learning_rate": 4.913131161399931e-06, "loss": 0.0124, "step": 2620 }, { "epoch": 0.5206078061376502, "grad_norm": 0.8509089389165168, "learning_rate": 4.909914138972733e-06, "loss": 0.0152, "step": 2621 }, { "epoch": 0.5208064355943987, "grad_norm": 0.43045550772154256, "learning_rate": 4.9066971538499985e-06, "loss": 0.0058, "step": 2622 }, { "epoch": 0.5210050650511471, "grad_norm": 0.2811848823106481, "learning_rate": 4.903480207363881e-06, "loss": 0.0078, "step": 2623 }, { "epoch": 0.5212036945078955, "grad_norm": 0.17886282113428878, "learning_rate": 4.900263300846512e-06, "loss": 0.0045, "step": 2624 }, { "epoch": 0.5214023239646439, "grad_norm": 1.0905060510323827, "learning_rate": 4.89704643563001e-06, "loss": 0.021, "step": 2625 }, { "epoch": 0.5216009534213923, "grad_norm": 0.52942449810658, "learning_rate": 4.893829613046476e-06, "loss": 0.0105, "step": 2626 }, { "epoch": 0.5217995828781409, "grad_norm": 0.38872032429287773, "learning_rate": 4.890612834427994e-06, "loss": 0.0119, "step": 2627 }, { "epoch": 0.5219982123348893, "grad_norm": 0.4144058112551697, "learning_rate": 4.887396101106627e-06, "loss": 0.0122, "step": 2628 }, { "epoch": 0.5221968417916377, "grad_norm": 0.7047127790035546, "learning_rate": 4.884179414414423e-06, "loss": 0.013, "step": 2629 }, { "epoch": 0.5223954712483861, "grad_norm": 0.41046615550881144, "learning_rate": 4.880962775683406e-06, "loss": 0.0143, "step": 2630 }, { "epoch": 0.5225941007051346, "grad_norm": 0.2756154625309793, "learning_rate": 4.877746186245586e-06, "loss": 0.0114, "step": 2631 }, { "epoch": 0.522792730161883, "grad_norm": 0.8783246622943536, "learning_rate": 4.874529647432946e-06, "loss": 0.0111, "step": 2632 }, { "epoch": 0.5229913596186314, "grad_norm": 0.35351879117875357, "learning_rate": 4.871313160577453e-06, "loss": 0.0114, "step": 2633 }, { "epoch": 0.5231899890753798, "grad_norm": 0.3497304664011684, "learning_rate": 4.868096727011052e-06, "loss": 0.0117, "step": 2634 }, { "epoch": 0.5233886185321284, "grad_norm": 0.38460261605226964, "learning_rate": 4.864880348065665e-06, "loss": 0.0099, "step": 2635 }, { "epoch": 0.5235872479888768, "grad_norm": 0.34385257857225565, "learning_rate": 4.86166402507319e-06, "loss": 0.0131, "step": 2636 }, { "epoch": 0.5237858774456252, "grad_norm": 0.7154889328263488, "learning_rate": 4.858447759365504e-06, "loss": 0.0185, "step": 2637 }, { "epoch": 0.5239845069023736, "grad_norm": 0.45589891363247864, "learning_rate": 4.855231552274459e-06, "loss": 0.0089, "step": 2638 }, { "epoch": 0.5241831363591221, "grad_norm": 0.3846830282655784, "learning_rate": 4.852015405131882e-06, "loss": 0.018, "step": 2639 }, { "epoch": 0.5243817658158705, "grad_norm": 0.2940469069583859, "learning_rate": 4.848799319269578e-06, "loss": 0.0064, "step": 2640 }, { "epoch": 0.5245803952726189, "grad_norm": 0.2149661207674454, "learning_rate": 4.845583296019325e-06, "loss": 0.0061, "step": 2641 }, { "epoch": 0.5247790247293673, "grad_norm": 0.3446058201626362, "learning_rate": 4.8423673367128745e-06, "loss": 0.0072, "step": 2642 }, { "epoch": 0.5249776541861157, "grad_norm": 0.9384941741122217, "learning_rate": 4.839151442681952e-06, "loss": 0.014, "step": 2643 }, { "epoch": 0.5251762836428643, "grad_norm": 0.7648356533107046, "learning_rate": 4.835935615258257e-06, "loss": 0.0112, "step": 2644 }, { "epoch": 0.5253749130996127, "grad_norm": 0.5947850726160407, "learning_rate": 4.832719855773458e-06, "loss": 0.0237, "step": 2645 }, { "epoch": 0.5255735425563611, "grad_norm": 0.5069947415968155, "learning_rate": 4.8295041655592e-06, "loss": 0.019, "step": 2646 }, { "epoch": 0.5257721720131096, "grad_norm": 0.530771872211901, "learning_rate": 4.826288545947095e-06, "loss": 0.0167, "step": 2647 }, { "epoch": 0.525970801469858, "grad_norm": 0.7343719370370166, "learning_rate": 4.82307299826873e-06, "loss": 0.0153, "step": 2648 }, { "epoch": 0.5261694309266064, "grad_norm": 0.43053313989705466, "learning_rate": 4.81985752385566e-06, "loss": 0.0113, "step": 2649 }, { "epoch": 0.5263680603833548, "grad_norm": 1.0486479094087207, "learning_rate": 4.816642124039408e-06, "loss": 0.0225, "step": 2650 }, { "epoch": 0.5265666898401032, "grad_norm": 1.204001697819146, "learning_rate": 4.813426800151469e-06, "loss": 0.0119, "step": 2651 }, { "epoch": 0.5267653192968518, "grad_norm": 0.38383932212493976, "learning_rate": 4.8102115535233054e-06, "loss": 0.0108, "step": 2652 }, { "epoch": 0.5269639487536002, "grad_norm": 0.7521314041717428, "learning_rate": 4.806996385486349e-06, "loss": 0.0115, "step": 2653 }, { "epoch": 0.5271625782103486, "grad_norm": 0.35629755731123935, "learning_rate": 4.803781297371995e-06, "loss": 0.0073, "step": 2654 }, { "epoch": 0.527361207667097, "grad_norm": 0.4526855571654173, "learning_rate": 4.8005662905116085e-06, "loss": 0.0092, "step": 2655 }, { "epoch": 0.5275598371238455, "grad_norm": 0.3182080909457293, "learning_rate": 4.797351366236522e-06, "loss": 0.0132, "step": 2656 }, { "epoch": 0.5277584665805939, "grad_norm": 0.6049331198936719, "learning_rate": 4.794136525878032e-06, "loss": 0.016, "step": 2657 }, { "epoch": 0.5279570960373423, "grad_norm": 0.4754537156663623, "learning_rate": 4.7909217707673984e-06, "loss": 0.0168, "step": 2658 }, { "epoch": 0.5281557254940907, "grad_norm": 0.998499211825424, "learning_rate": 4.787707102235852e-06, "loss": 0.0176, "step": 2659 }, { "epoch": 0.5283543549508392, "grad_norm": 0.46739699878852936, "learning_rate": 4.784492521614582e-06, "loss": 0.01, "step": 2660 }, { "epoch": 0.5285529844075877, "grad_norm": 0.4320350647544851, "learning_rate": 4.781278030234742e-06, "loss": 0.0078, "step": 2661 }, { "epoch": 0.5287516138643361, "grad_norm": 0.6016783731374997, "learning_rate": 4.778063629427451e-06, "loss": 0.0127, "step": 2662 }, { "epoch": 0.5289502433210845, "grad_norm": 0.49313103783452733, "learning_rate": 4.77484932052379e-06, "loss": 0.0146, "step": 2663 }, { "epoch": 0.529148872777833, "grad_norm": 0.40985189124421423, "learning_rate": 4.771635104854799e-06, "loss": 0.0138, "step": 2664 }, { "epoch": 0.5293475022345814, "grad_norm": 0.3135781617489447, "learning_rate": 4.768420983751481e-06, "loss": 0.0105, "step": 2665 }, { "epoch": 0.5295461316913298, "grad_norm": 0.41689021986146035, "learning_rate": 4.765206958544803e-06, "loss": 0.0154, "step": 2666 }, { "epoch": 0.5297447611480782, "grad_norm": 0.2399269526721442, "learning_rate": 4.761993030565688e-06, "loss": 0.0076, "step": 2667 }, { "epoch": 0.5299433906048266, "grad_norm": 0.49800855497998864, "learning_rate": 4.758779201145022e-06, "loss": 0.02, "step": 2668 }, { "epoch": 0.5301420200615752, "grad_norm": 0.3540794930683414, "learning_rate": 4.755565471613646e-06, "loss": 0.0074, "step": 2669 }, { "epoch": 0.5303406495183236, "grad_norm": 0.37327188769326325, "learning_rate": 4.752351843302364e-06, "loss": 0.0093, "step": 2670 }, { "epoch": 0.530539278975072, "grad_norm": 0.6218524242302055, "learning_rate": 4.749138317541936e-06, "loss": 0.0114, "step": 2671 }, { "epoch": 0.5307379084318204, "grad_norm": 0.47629545880771196, "learning_rate": 4.745924895663078e-06, "loss": 0.0095, "step": 2672 }, { "epoch": 0.5309365378885689, "grad_norm": 0.41948688036972176, "learning_rate": 4.742711578996467e-06, "loss": 0.0128, "step": 2673 }, { "epoch": 0.5311351673453173, "grad_norm": 0.26707306675736925, "learning_rate": 4.739498368872734e-06, "loss": 0.0114, "step": 2674 }, { "epoch": 0.5313337968020657, "grad_norm": 0.4462577977563272, "learning_rate": 4.7362852666224654e-06, "loss": 0.022, "step": 2675 }, { "epoch": 0.5315324262588141, "grad_norm": 0.42752531113748565, "learning_rate": 4.733072273576204e-06, "loss": 0.0128, "step": 2676 }, { "epoch": 0.5317310557155627, "grad_norm": 0.1988579553958363, "learning_rate": 4.729859391064447e-06, "loss": 0.0066, "step": 2677 }, { "epoch": 0.5319296851723111, "grad_norm": 0.3623897142655911, "learning_rate": 4.726646620417646e-06, "loss": 0.0082, "step": 2678 }, { "epoch": 0.5321283146290595, "grad_norm": 0.5471633235503327, "learning_rate": 4.723433962966204e-06, "loss": 0.0099, "step": 2679 }, { "epoch": 0.5323269440858079, "grad_norm": 0.30009321328868405, "learning_rate": 4.720221420040483e-06, "loss": 0.0084, "step": 2680 }, { "epoch": 0.5325255735425564, "grad_norm": 0.36399418358138586, "learning_rate": 4.71700899297079e-06, "loss": 0.0081, "step": 2681 }, { "epoch": 0.5327242029993048, "grad_norm": 0.4970147699892617, "learning_rate": 4.7137966830873905e-06, "loss": 0.0113, "step": 2682 }, { "epoch": 0.5329228324560532, "grad_norm": 0.5953544717285123, "learning_rate": 4.710584491720496e-06, "loss": 0.0182, "step": 2683 }, { "epoch": 0.5331214619128016, "grad_norm": 0.4802266373130576, "learning_rate": 4.707372420200275e-06, "loss": 0.0124, "step": 2684 }, { "epoch": 0.53332009136955, "grad_norm": 0.6689446433586343, "learning_rate": 4.7041604698568436e-06, "loss": 0.0196, "step": 2685 }, { "epoch": 0.5335187208262986, "grad_norm": 0.5247125265711512, "learning_rate": 4.700948642020263e-06, "loss": 0.0108, "step": 2686 }, { "epoch": 0.533717350283047, "grad_norm": 0.4728274310812671, "learning_rate": 4.6977369380205514e-06, "loss": 0.0123, "step": 2687 }, { "epoch": 0.5339159797397954, "grad_norm": 0.60501199620917, "learning_rate": 4.694525359187671e-06, "loss": 0.0071, "step": 2688 }, { "epoch": 0.5341146091965439, "grad_norm": 0.36671802535764086, "learning_rate": 4.691313906851534e-06, "loss": 0.0119, "step": 2689 }, { "epoch": 0.5343132386532923, "grad_norm": 1.0242303491155098, "learning_rate": 4.688102582341997e-06, "loss": 0.0172, "step": 2690 }, { "epoch": 0.5345118681100407, "grad_norm": 0.7058136287291272, "learning_rate": 4.684891386988869e-06, "loss": 0.014, "step": 2691 }, { "epoch": 0.5347104975667891, "grad_norm": 1.1395881555357235, "learning_rate": 4.681680322121903e-06, "loss": 0.0197, "step": 2692 }, { "epoch": 0.5349091270235375, "grad_norm": 0.8146146171242227, "learning_rate": 4.678469389070797e-06, "loss": 0.0135, "step": 2693 }, { "epoch": 0.5351077564802861, "grad_norm": 0.4681663478402444, "learning_rate": 4.675258589165194e-06, "loss": 0.0061, "step": 2694 }, { "epoch": 0.5353063859370345, "grad_norm": 0.7888374699099346, "learning_rate": 4.672047923734685e-06, "loss": 0.0126, "step": 2695 }, { "epoch": 0.5355050153937829, "grad_norm": 0.26698990733974676, "learning_rate": 4.668837394108801e-06, "loss": 0.0081, "step": 2696 }, { "epoch": 0.5357036448505313, "grad_norm": 0.6481355354112126, "learning_rate": 4.665627001617021e-06, "loss": 0.0106, "step": 2697 }, { "epoch": 0.5359022743072798, "grad_norm": 0.9141985443318152, "learning_rate": 4.662416747588765e-06, "loss": 0.019, "step": 2698 }, { "epoch": 0.5361009037640282, "grad_norm": 0.345436456845145, "learning_rate": 4.6592066333533966e-06, "loss": 0.0101, "step": 2699 }, { "epoch": 0.5362995332207766, "grad_norm": 0.5340280696377245, "learning_rate": 4.6559966602402195e-06, "loss": 0.0132, "step": 2700 }, { "epoch": 0.536498162677525, "grad_norm": 0.495489992727139, "learning_rate": 4.652786829578482e-06, "loss": 0.0108, "step": 2701 }, { "epoch": 0.5366967921342735, "grad_norm": 0.48163074793680033, "learning_rate": 4.649577142697369e-06, "loss": 0.0089, "step": 2702 }, { "epoch": 0.536895421591022, "grad_norm": 0.6202192336463302, "learning_rate": 4.6463676009260115e-06, "loss": 0.0128, "step": 2703 }, { "epoch": 0.5370940510477704, "grad_norm": 0.8438747669064174, "learning_rate": 4.643158205593475e-06, "loss": 0.0207, "step": 2704 }, { "epoch": 0.5372926805045188, "grad_norm": 0.5444604148537907, "learning_rate": 4.639948958028769e-06, "loss": 0.011, "step": 2705 }, { "epoch": 0.5374913099612673, "grad_norm": 0.40458432983482345, "learning_rate": 4.636739859560839e-06, "loss": 0.0137, "step": 2706 }, { "epoch": 0.5376899394180157, "grad_norm": 0.38258143370225456, "learning_rate": 4.633530911518569e-06, "loss": 0.0121, "step": 2707 }, { "epoch": 0.5378885688747641, "grad_norm": 0.3338515448391374, "learning_rate": 4.63032211523078e-06, "loss": 0.0081, "step": 2708 }, { "epoch": 0.5380871983315125, "grad_norm": 0.2721756551155944, "learning_rate": 4.627113472026235e-06, "loss": 0.0114, "step": 2709 }, { "epoch": 0.538285827788261, "grad_norm": 0.8199742148753297, "learning_rate": 4.623904983233628e-06, "loss": 0.0194, "step": 2710 }, { "epoch": 0.5384844572450095, "grad_norm": 0.6501960596074884, "learning_rate": 4.6206966501815895e-06, "loss": 0.0105, "step": 2711 }, { "epoch": 0.5386830867017579, "grad_norm": 0.3113077005159778, "learning_rate": 4.617488474198689e-06, "loss": 0.0066, "step": 2712 }, { "epoch": 0.5388817161585063, "grad_norm": 0.46285719527025915, "learning_rate": 4.614280456613428e-06, "loss": 0.015, "step": 2713 }, { "epoch": 0.5390803456152548, "grad_norm": 0.9043967381501636, "learning_rate": 4.6110725987542436e-06, "loss": 0.0161, "step": 2714 }, { "epoch": 0.5392789750720032, "grad_norm": 0.6074770026179739, "learning_rate": 4.607864901949506e-06, "loss": 0.0085, "step": 2715 }, { "epoch": 0.5394776045287516, "grad_norm": 0.5580886715434685, "learning_rate": 4.6046573675275204e-06, "loss": 0.0124, "step": 2716 }, { "epoch": 0.5396762339855, "grad_norm": 0.4824079804144595, "learning_rate": 4.601449996816524e-06, "loss": 0.0099, "step": 2717 }, { "epoch": 0.5398748634422484, "grad_norm": 0.48189458946683134, "learning_rate": 4.598242791144684e-06, "loss": 0.0126, "step": 2718 }, { "epoch": 0.540073492898997, "grad_norm": 1.2183320489020404, "learning_rate": 4.5950357518401015e-06, "loss": 0.0142, "step": 2719 }, { "epoch": 0.5402721223557454, "grad_norm": 0.30529970276531804, "learning_rate": 4.591828880230809e-06, "loss": 0.0101, "step": 2720 }, { "epoch": 0.5404707518124938, "grad_norm": 0.23725652114441975, "learning_rate": 4.588622177644769e-06, "loss": 0.0052, "step": 2721 }, { "epoch": 0.5406693812692422, "grad_norm": 0.49526465559783595, "learning_rate": 4.585415645409872e-06, "loss": 0.0101, "step": 2722 }, { "epoch": 0.5408680107259907, "grad_norm": 0.629120686573954, "learning_rate": 4.5822092848539415e-06, "loss": 0.0178, "step": 2723 }, { "epoch": 0.5410666401827391, "grad_norm": 0.6034085054382206, "learning_rate": 4.579003097304728e-06, "loss": 0.0152, "step": 2724 }, { "epoch": 0.5412652696394875, "grad_norm": 0.32942180161724255, "learning_rate": 4.575797084089912e-06, "loss": 0.0093, "step": 2725 }, { "epoch": 0.5414638990962359, "grad_norm": 0.42192709837470616, "learning_rate": 4.572591246537097e-06, "loss": 0.0112, "step": 2726 }, { "epoch": 0.5416625285529844, "grad_norm": 0.6217230182883882, "learning_rate": 4.569385585973818e-06, "loss": 0.01, "step": 2727 }, { "epoch": 0.5418611580097329, "grad_norm": 0.6470622146574528, "learning_rate": 4.566180103727538e-06, "loss": 0.0172, "step": 2728 }, { "epoch": 0.5420597874664813, "grad_norm": 0.15106597972191477, "learning_rate": 4.562974801125642e-06, "loss": 0.0053, "step": 2729 }, { "epoch": 0.5422584169232297, "grad_norm": 0.6869662187626556, "learning_rate": 4.559769679495443e-06, "loss": 0.0138, "step": 2730 }, { "epoch": 0.5424570463799782, "grad_norm": 0.34343413551532936, "learning_rate": 4.55656474016418e-06, "loss": 0.0077, "step": 2731 }, { "epoch": 0.5426556758367266, "grad_norm": 0.6162031016897686, "learning_rate": 4.553359984459012e-06, "loss": 0.0146, "step": 2732 }, { "epoch": 0.542854305293475, "grad_norm": 0.33775023474661575, "learning_rate": 4.550155413707028e-06, "loss": 0.0112, "step": 2733 }, { "epoch": 0.5430529347502234, "grad_norm": 0.5286312992072117, "learning_rate": 4.546951029235237e-06, "loss": 0.0089, "step": 2734 }, { "epoch": 0.5432515642069718, "grad_norm": 0.44205639734611607, "learning_rate": 4.543746832370572e-06, "loss": 0.0101, "step": 2735 }, { "epoch": 0.5434501936637204, "grad_norm": 0.7046118783607311, "learning_rate": 4.540542824439885e-06, "loss": 0.0133, "step": 2736 }, { "epoch": 0.5436488231204688, "grad_norm": 0.2920099315721433, "learning_rate": 4.5373390067699555e-06, "loss": 0.0075, "step": 2737 }, { "epoch": 0.5438474525772172, "grad_norm": 0.40661921340738155, "learning_rate": 4.53413538068748e-06, "loss": 0.0118, "step": 2738 }, { "epoch": 0.5440460820339657, "grad_norm": 0.7267749331431869, "learning_rate": 4.530931947519076e-06, "loss": 0.0148, "step": 2739 }, { "epoch": 0.5442447114907141, "grad_norm": 0.3581174389427978, "learning_rate": 4.527728708591283e-06, "loss": 0.0107, "step": 2740 }, { "epoch": 0.5444433409474625, "grad_norm": 0.41211789962071976, "learning_rate": 4.524525665230559e-06, "loss": 0.0129, "step": 2741 }, { "epoch": 0.5446419704042109, "grad_norm": 0.9758329205888799, "learning_rate": 4.521322818763281e-06, "loss": 0.0179, "step": 2742 }, { "epoch": 0.5448405998609593, "grad_norm": 0.8121007154836791, "learning_rate": 4.518120170515744e-06, "loss": 0.018, "step": 2743 }, { "epoch": 0.5450392293177078, "grad_norm": 0.2556833343570392, "learning_rate": 4.514917721814163e-06, "loss": 0.0062, "step": 2744 }, { "epoch": 0.5452378587744563, "grad_norm": 0.49149727060895204, "learning_rate": 4.5117154739846665e-06, "loss": 0.0176, "step": 2745 }, { "epoch": 0.5454364882312047, "grad_norm": 1.0477647176321618, "learning_rate": 4.5085134283533035e-06, "loss": 0.0168, "step": 2746 }, { "epoch": 0.5456351176879531, "grad_norm": 0.3199467470120506, "learning_rate": 4.505311586246037e-06, "loss": 0.0087, "step": 2747 }, { "epoch": 0.5458337471447016, "grad_norm": 0.4189748938840971, "learning_rate": 4.502109948988748e-06, "loss": 0.0122, "step": 2748 }, { "epoch": 0.54603237660145, "grad_norm": 0.43642675146362775, "learning_rate": 4.498908517907232e-06, "loss": 0.0065, "step": 2749 }, { "epoch": 0.5462310060581984, "grad_norm": 1.5799130468680098, "learning_rate": 4.4957072943271965e-06, "loss": 0.0129, "step": 2750 }, { "epoch": 0.5464296355149468, "grad_norm": 0.3761584834352775, "learning_rate": 4.492506279574262e-06, "loss": 0.0124, "step": 2751 }, { "epoch": 0.5466282649716953, "grad_norm": 0.7976727590944505, "learning_rate": 4.4893054749739715e-06, "loss": 0.0131, "step": 2752 }, { "epoch": 0.5468268944284438, "grad_norm": 1.038660703724904, "learning_rate": 4.4861048818517725e-06, "loss": 0.0164, "step": 2753 }, { "epoch": 0.5470255238851922, "grad_norm": 0.8037713236955946, "learning_rate": 4.482904501533027e-06, "loss": 0.0148, "step": 2754 }, { "epoch": 0.5472241533419406, "grad_norm": 0.3649529151254279, "learning_rate": 4.47970433534301e-06, "loss": 0.0109, "step": 2755 }, { "epoch": 0.5474227827986891, "grad_norm": 0.539372194994621, "learning_rate": 4.476504384606906e-06, "loss": 0.0128, "step": 2756 }, { "epoch": 0.5476214122554375, "grad_norm": 0.7258264788596848, "learning_rate": 4.473304650649812e-06, "loss": 0.0146, "step": 2757 }, { "epoch": 0.5478200417121859, "grad_norm": 0.28150530699276105, "learning_rate": 4.470105134796734e-06, "loss": 0.0065, "step": 2758 }, { "epoch": 0.5480186711689343, "grad_norm": 0.7482265228868586, "learning_rate": 4.466905838372591e-06, "loss": 0.0179, "step": 2759 }, { "epoch": 0.5482173006256827, "grad_norm": 0.6119275773400306, "learning_rate": 4.463706762702205e-06, "loss": 0.0176, "step": 2760 }, { "epoch": 0.5484159300824313, "grad_norm": 0.4938248337453184, "learning_rate": 4.460507909110312e-06, "loss": 0.0106, "step": 2761 }, { "epoch": 0.5486145595391797, "grad_norm": 0.27287982790646315, "learning_rate": 4.457309278921554e-06, "loss": 0.0054, "step": 2762 }, { "epoch": 0.5488131889959281, "grad_norm": 0.7541141348539235, "learning_rate": 4.4541108734604795e-06, "loss": 0.0108, "step": 2763 }, { "epoch": 0.5490118184526765, "grad_norm": 0.7194720755343721, "learning_rate": 4.450912694051546e-06, "loss": 0.0131, "step": 2764 }, { "epoch": 0.549210447909425, "grad_norm": 0.36732971751956595, "learning_rate": 4.447714742019115e-06, "loss": 0.0114, "step": 2765 }, { "epoch": 0.5494090773661734, "grad_norm": 0.6386170728830878, "learning_rate": 4.444517018687457e-06, "loss": 0.009, "step": 2766 }, { "epoch": 0.5496077068229218, "grad_norm": 0.39169752604182223, "learning_rate": 4.441319525380745e-06, "loss": 0.0133, "step": 2767 }, { "epoch": 0.5498063362796702, "grad_norm": 0.569101509077439, "learning_rate": 4.438122263423059e-06, "loss": 0.0131, "step": 2768 }, { "epoch": 0.5500049657364187, "grad_norm": 0.7374889233619399, "learning_rate": 4.434925234138381e-06, "loss": 0.0154, "step": 2769 }, { "epoch": 0.5502035951931672, "grad_norm": 0.27469994570416645, "learning_rate": 4.431728438850597e-06, "loss": 0.01, "step": 2770 }, { "epoch": 0.5504022246499156, "grad_norm": 0.5480817776491356, "learning_rate": 4.4285318788834976e-06, "loss": 0.0112, "step": 2771 }, { "epoch": 0.550600854106664, "grad_norm": 0.4678729381775268, "learning_rate": 4.425335555560773e-06, "loss": 0.0112, "step": 2772 }, { "epoch": 0.5507994835634125, "grad_norm": 0.6929526581318076, "learning_rate": 4.422139470206024e-06, "loss": 0.0147, "step": 2773 }, { "epoch": 0.5509981130201609, "grad_norm": 0.5590965545680023, "learning_rate": 4.4189436241427395e-06, "loss": 0.0128, "step": 2774 }, { "epoch": 0.5511967424769093, "grad_norm": 0.4299925224605084, "learning_rate": 4.415748018694317e-06, "loss": 0.0078, "step": 2775 }, { "epoch": 0.5513953719336577, "grad_norm": 0.30426919330239416, "learning_rate": 4.412552655184055e-06, "loss": 0.0071, "step": 2776 }, { "epoch": 0.5515940013904062, "grad_norm": 1.7960268104440869, "learning_rate": 4.409357534935151e-06, "loss": 0.0276, "step": 2777 }, { "epoch": 0.5517926308471547, "grad_norm": 2.6449701275835187, "learning_rate": 4.4061626592707e-06, "loss": 0.0182, "step": 2778 }, { "epoch": 0.5519912603039031, "grad_norm": 0.22507808021610604, "learning_rate": 4.4029680295136975e-06, "loss": 0.0056, "step": 2779 }, { "epoch": 0.5521898897606515, "grad_norm": 0.7832386144210031, "learning_rate": 4.399773646987036e-06, "loss": 0.0183, "step": 2780 }, { "epoch": 0.5523885192174, "grad_norm": 0.632793978123542, "learning_rate": 4.396579513013506e-06, "loss": 0.0148, "step": 2781 }, { "epoch": 0.5525871486741484, "grad_norm": 0.5274197472069998, "learning_rate": 4.393385628915795e-06, "loss": 0.0132, "step": 2782 }, { "epoch": 0.5527857781308968, "grad_norm": 0.6239929821382354, "learning_rate": 4.390191996016488e-06, "loss": 0.0139, "step": 2783 }, { "epoch": 0.5529844075876452, "grad_norm": 0.5616827331201757, "learning_rate": 4.386998615638064e-06, "loss": 0.0099, "step": 2784 }, { "epoch": 0.5531830370443936, "grad_norm": 0.5048749306077575, "learning_rate": 4.383805489102901e-06, "loss": 0.0098, "step": 2785 }, { "epoch": 0.5533816665011421, "grad_norm": 0.3937045403697834, "learning_rate": 4.380612617733267e-06, "loss": 0.0106, "step": 2786 }, { "epoch": 0.5535802959578906, "grad_norm": 0.4297000120260985, "learning_rate": 4.3774200028513275e-06, "loss": 0.0085, "step": 2787 }, { "epoch": 0.553778925414639, "grad_norm": 0.7721765597312535, "learning_rate": 4.374227645779142e-06, "loss": 0.0146, "step": 2788 }, { "epoch": 0.5539775548713874, "grad_norm": 0.6219951031088091, "learning_rate": 4.371035547838661e-06, "loss": 0.0123, "step": 2789 }, { "epoch": 0.5541761843281359, "grad_norm": 0.6784123591321396, "learning_rate": 4.36784371035173e-06, "loss": 0.0119, "step": 2790 }, { "epoch": 0.5543748137848843, "grad_norm": 0.4407590870450644, "learning_rate": 4.364652134640085e-06, "loss": 0.0105, "step": 2791 }, { "epoch": 0.5545734432416327, "grad_norm": 0.5200832710265008, "learning_rate": 4.361460822025356e-06, "loss": 0.01, "step": 2792 }, { "epoch": 0.5547720726983811, "grad_norm": 0.5895406136146861, "learning_rate": 4.35826977382906e-06, "loss": 0.0082, "step": 2793 }, { "epoch": 0.5549707021551296, "grad_norm": 0.3137853635557372, "learning_rate": 4.355078991372609e-06, "loss": 0.0062, "step": 2794 }, { "epoch": 0.5551693316118781, "grad_norm": 0.3276947209644395, "learning_rate": 4.351888475977302e-06, "loss": 0.0062, "step": 2795 }, { "epoch": 0.5553679610686265, "grad_norm": 0.49943742040860356, "learning_rate": 4.348698228964327e-06, "loss": 0.0077, "step": 2796 }, { "epoch": 0.5555665905253749, "grad_norm": 0.46359295428980324, "learning_rate": 4.345508251654765e-06, "loss": 0.0068, "step": 2797 }, { "epoch": 0.5557652199821234, "grad_norm": 1.1215529581908548, "learning_rate": 4.34231854536958e-06, "loss": 0.0217, "step": 2798 }, { "epoch": 0.5559638494388718, "grad_norm": 0.6207216506521158, "learning_rate": 4.339129111429625e-06, "loss": 0.0096, "step": 2799 }, { "epoch": 0.5561624788956202, "grad_norm": 0.3028484585994268, "learning_rate": 4.335939951155644e-06, "loss": 0.0053, "step": 2800 }, { "epoch": 0.5563611083523686, "grad_norm": 1.030312295449838, "learning_rate": 4.332751065868264e-06, "loss": 0.0145, "step": 2801 }, { "epoch": 0.556559737809117, "grad_norm": 0.4829944470180176, "learning_rate": 4.329562456888e-06, "loss": 0.0117, "step": 2802 }, { "epoch": 0.5567583672658655, "grad_norm": 0.3015442911784513, "learning_rate": 4.32637412553525e-06, "loss": 0.0066, "step": 2803 }, { "epoch": 0.556956996722614, "grad_norm": 0.49410995263115004, "learning_rate": 4.323186073130302e-06, "loss": 0.0134, "step": 2804 }, { "epoch": 0.5571556261793624, "grad_norm": 1.1059775844917823, "learning_rate": 4.319998300993322e-06, "loss": 0.0167, "step": 2805 }, { "epoch": 0.5573542556361109, "grad_norm": 0.5759009436326822, "learning_rate": 4.316810810444365e-06, "loss": 0.0121, "step": 2806 }, { "epoch": 0.5575528850928593, "grad_norm": 0.41776379254655316, "learning_rate": 4.313623602803367e-06, "loss": 0.0048, "step": 2807 }, { "epoch": 0.5577515145496077, "grad_norm": 0.5034867323356977, "learning_rate": 4.310436679390147e-06, "loss": 0.0121, "step": 2808 }, { "epoch": 0.5579501440063561, "grad_norm": 0.41318019364973024, "learning_rate": 4.307250041524408e-06, "loss": 0.0064, "step": 2809 }, { "epoch": 0.5581487734631045, "grad_norm": 1.0091161435064295, "learning_rate": 4.304063690525734e-06, "loss": 0.0137, "step": 2810 }, { "epoch": 0.558347402919853, "grad_norm": 0.6276179334811938, "learning_rate": 4.300877627713588e-06, "loss": 0.0113, "step": 2811 }, { "epoch": 0.5585460323766015, "grad_norm": 0.8074924839212643, "learning_rate": 4.297691854407317e-06, "loss": 0.0175, "step": 2812 }, { "epoch": 0.5587446618333499, "grad_norm": 1.2308453281661025, "learning_rate": 4.294506371926145e-06, "loss": 0.0256, "step": 2813 }, { "epoch": 0.5589432912900983, "grad_norm": 0.8890857016385656, "learning_rate": 4.291321181589179e-06, "loss": 0.0138, "step": 2814 }, { "epoch": 0.5591419207468468, "grad_norm": 0.42316267167957217, "learning_rate": 4.288136284715399e-06, "loss": 0.0061, "step": 2815 }, { "epoch": 0.5593405502035952, "grad_norm": 0.9297151487475857, "learning_rate": 4.284951682623674e-06, "loss": 0.0149, "step": 2816 }, { "epoch": 0.5595391796603436, "grad_norm": 0.4544147000009162, "learning_rate": 4.281767376632739e-06, "loss": 0.0071, "step": 2817 }, { "epoch": 0.559737809117092, "grad_norm": 0.6237208093686428, "learning_rate": 4.278583368061216e-06, "loss": 0.0225, "step": 2818 }, { "epoch": 0.5599364385738405, "grad_norm": 0.39156008750074794, "learning_rate": 4.275399658227596e-06, "loss": 0.0123, "step": 2819 }, { "epoch": 0.560135068030589, "grad_norm": 0.41234897144867805, "learning_rate": 4.272216248450253e-06, "loss": 0.0072, "step": 2820 }, { "epoch": 0.5603336974873374, "grad_norm": 0.4726168304536307, "learning_rate": 4.2690331400474335e-06, "loss": 0.0095, "step": 2821 }, { "epoch": 0.5605323269440858, "grad_norm": 0.36137404100034154, "learning_rate": 4.265850334337258e-06, "loss": 0.0106, "step": 2822 }, { "epoch": 0.5607309564008343, "grad_norm": 0.41668059273849295, "learning_rate": 4.262667832637724e-06, "loss": 0.0085, "step": 2823 }, { "epoch": 0.5609295858575827, "grad_norm": 0.547862940561988, "learning_rate": 4.2594856362667015e-06, "loss": 0.0129, "step": 2824 }, { "epoch": 0.5611282153143311, "grad_norm": 0.40441643120996035, "learning_rate": 4.256303746541936e-06, "loss": 0.0102, "step": 2825 }, { "epoch": 0.5613268447710795, "grad_norm": 0.5954924239361523, "learning_rate": 4.253122164781043e-06, "loss": 0.0143, "step": 2826 }, { "epoch": 0.561525474227828, "grad_norm": 0.7813919479034077, "learning_rate": 4.249940892301514e-06, "loss": 0.009, "step": 2827 }, { "epoch": 0.5617241036845764, "grad_norm": 0.4565410720159291, "learning_rate": 4.246759930420711e-06, "loss": 0.0124, "step": 2828 }, { "epoch": 0.5619227331413249, "grad_norm": 0.5694645470193745, "learning_rate": 4.243579280455867e-06, "loss": 0.0108, "step": 2829 }, { "epoch": 0.5621213625980733, "grad_norm": 0.8304296127324543, "learning_rate": 4.240398943724085e-06, "loss": 0.0101, "step": 2830 }, { "epoch": 0.5623199920548217, "grad_norm": 0.7021520729713497, "learning_rate": 4.237218921542339e-06, "loss": 0.0177, "step": 2831 }, { "epoch": 0.5625186215115702, "grad_norm": 0.6305791789892291, "learning_rate": 4.234039215227474e-06, "loss": 0.009, "step": 2832 }, { "epoch": 0.5627172509683186, "grad_norm": 0.9230478228209408, "learning_rate": 4.230859826096203e-06, "loss": 0.0147, "step": 2833 }, { "epoch": 0.562915880425067, "grad_norm": 0.4300837811713559, "learning_rate": 4.2276807554651074e-06, "loss": 0.0099, "step": 2834 }, { "epoch": 0.5631145098818154, "grad_norm": 0.5975455251314918, "learning_rate": 4.224502004650639e-06, "loss": 0.0138, "step": 2835 }, { "epoch": 0.5633131393385639, "grad_norm": 0.5208038837005452, "learning_rate": 4.221323574969113e-06, "loss": 0.0129, "step": 2836 }, { "epoch": 0.5635117687953124, "grad_norm": 0.4113017440579882, "learning_rate": 4.218145467736715e-06, "loss": 0.013, "step": 2837 }, { "epoch": 0.5637103982520608, "grad_norm": 0.8453183610999317, "learning_rate": 4.214967684269495e-06, "loss": 0.0069, "step": 2838 }, { "epoch": 0.5639090277088092, "grad_norm": 0.3829389981415355, "learning_rate": 4.211790225883372e-06, "loss": 0.007, "step": 2839 }, { "epoch": 0.5641076571655577, "grad_norm": 0.45669163404263635, "learning_rate": 4.208613093894126e-06, "loss": 0.0141, "step": 2840 }, { "epoch": 0.5643062866223061, "grad_norm": 0.833349864065106, "learning_rate": 4.205436289617406e-06, "loss": 0.0169, "step": 2841 }, { "epoch": 0.5645049160790545, "grad_norm": 0.6077279896370484, "learning_rate": 4.2022598143687224e-06, "loss": 0.0123, "step": 2842 }, { "epoch": 0.5647035455358029, "grad_norm": 0.38738975198937353, "learning_rate": 4.199083669463452e-06, "loss": 0.0072, "step": 2843 }, { "epoch": 0.5649021749925514, "grad_norm": 0.38114824416375975, "learning_rate": 4.195907856216831e-06, "loss": 0.0092, "step": 2844 }, { "epoch": 0.5651008044492998, "grad_norm": 0.8831215464756551, "learning_rate": 4.192732375943962e-06, "loss": 0.0167, "step": 2845 }, { "epoch": 0.5652994339060483, "grad_norm": 0.3030833462369814, "learning_rate": 4.189557229959807e-06, "loss": 0.0077, "step": 2846 }, { "epoch": 0.5654980633627967, "grad_norm": 1.0605500779156578, "learning_rate": 4.186382419579193e-06, "loss": 0.0132, "step": 2847 }, { "epoch": 0.5656966928195452, "grad_norm": 0.6755075790503954, "learning_rate": 4.183207946116802e-06, "loss": 0.0182, "step": 2848 }, { "epoch": 0.5658953222762936, "grad_norm": 0.9423178403605319, "learning_rate": 4.180033810887184e-06, "loss": 0.0147, "step": 2849 }, { "epoch": 0.566093951733042, "grad_norm": 0.6355943716774898, "learning_rate": 4.176860015204743e-06, "loss": 0.0122, "step": 2850 }, { "epoch": 0.5662925811897904, "grad_norm": 0.46477508165506526, "learning_rate": 4.173686560383745e-06, "loss": 0.0143, "step": 2851 }, { "epoch": 0.5664912106465388, "grad_norm": 0.489449386116713, "learning_rate": 4.170513447738316e-06, "loss": 0.015, "step": 2852 }, { "epoch": 0.5666898401032873, "grad_norm": 0.475425921121026, "learning_rate": 4.167340678582437e-06, "loss": 0.0132, "step": 2853 }, { "epoch": 0.5668884695600358, "grad_norm": 0.5947470913713117, "learning_rate": 4.16416825422995e-06, "loss": 0.0144, "step": 2854 }, { "epoch": 0.5670870990167842, "grad_norm": 0.8415609819118625, "learning_rate": 4.160996175994551e-06, "loss": 0.0167, "step": 2855 }, { "epoch": 0.5672857284735326, "grad_norm": 0.9514310803640764, "learning_rate": 4.157824445189796e-06, "loss": 0.0103, "step": 2856 }, { "epoch": 0.5674843579302811, "grad_norm": 0.6091553295909007, "learning_rate": 4.1546530631290945e-06, "loss": 0.0125, "step": 2857 }, { "epoch": 0.5676829873870295, "grad_norm": 0.4674705036869269, "learning_rate": 4.151482031125712e-06, "loss": 0.0158, "step": 2858 }, { "epoch": 0.5678816168437779, "grad_norm": 0.7383391584281234, "learning_rate": 4.148311350492772e-06, "loss": 0.0128, "step": 2859 }, { "epoch": 0.5680802463005263, "grad_norm": 1.9683596728322885, "learning_rate": 4.145141022543248e-06, "loss": 0.0126, "step": 2860 }, { "epoch": 0.5682788757572748, "grad_norm": 0.717958085427442, "learning_rate": 4.1419710485899715e-06, "loss": 0.0106, "step": 2861 }, { "epoch": 0.5684775052140233, "grad_norm": 0.7306809802401357, "learning_rate": 4.138801429945624e-06, "loss": 0.0088, "step": 2862 }, { "epoch": 0.5686761346707717, "grad_norm": 0.3406143943303101, "learning_rate": 4.135632167922742e-06, "loss": 0.0104, "step": 2863 }, { "epoch": 0.5688747641275201, "grad_norm": 0.35075777130300395, "learning_rate": 4.1324632638337134e-06, "loss": 0.0071, "step": 2864 }, { "epoch": 0.5690733935842686, "grad_norm": 3.6206297212120857, "learning_rate": 4.129294718990779e-06, "loss": 0.0089, "step": 2865 }, { "epoch": 0.569272023041017, "grad_norm": 0.6092937120736366, "learning_rate": 4.126126534706028e-06, "loss": 0.0129, "step": 2866 }, { "epoch": 0.5694706524977654, "grad_norm": 0.6599744938071138, "learning_rate": 4.122958712291406e-06, "loss": 0.0132, "step": 2867 }, { "epoch": 0.5696692819545138, "grad_norm": 0.40817723392650745, "learning_rate": 4.119791253058701e-06, "loss": 0.0155, "step": 2868 }, { "epoch": 0.5698679114112623, "grad_norm": 0.7207327641411628, "learning_rate": 4.1166241583195596e-06, "loss": 0.0149, "step": 2869 }, { "epoch": 0.5700665408680107, "grad_norm": 0.5234949232683128, "learning_rate": 4.113457429385468e-06, "loss": 0.0089, "step": 2870 }, { "epoch": 0.5702651703247592, "grad_norm": 0.5297434498649813, "learning_rate": 4.110291067567766e-06, "loss": 0.0107, "step": 2871 }, { "epoch": 0.5704637997815076, "grad_norm": 0.5678519577896356, "learning_rate": 4.107125074177643e-06, "loss": 0.0092, "step": 2872 }, { "epoch": 0.570662429238256, "grad_norm": 0.35857068739907116, "learning_rate": 4.103959450526133e-06, "loss": 0.0089, "step": 2873 }, { "epoch": 0.5708610586950045, "grad_norm": 0.7083827991263594, "learning_rate": 4.100794197924117e-06, "loss": 0.0118, "step": 2874 }, { "epoch": 0.5710596881517529, "grad_norm": 0.5189059502478018, "learning_rate": 4.097629317682322e-06, "loss": 0.008, "step": 2875 }, { "epoch": 0.5712583176085013, "grad_norm": 1.0428771302955275, "learning_rate": 4.094464811111323e-06, "loss": 0.0139, "step": 2876 }, { "epoch": 0.5714569470652497, "grad_norm": 0.711904256313604, "learning_rate": 4.091300679521539e-06, "loss": 0.0162, "step": 2877 }, { "epoch": 0.5716555765219982, "grad_norm": 0.32525294522509485, "learning_rate": 4.088136924223235e-06, "loss": 0.0106, "step": 2878 }, { "epoch": 0.5718542059787467, "grad_norm": 0.6302841695249592, "learning_rate": 4.084973546526517e-06, "loss": 0.0115, "step": 2879 }, { "epoch": 0.5720528354354951, "grad_norm": 0.828179414805218, "learning_rate": 4.081810547741336e-06, "loss": 0.0161, "step": 2880 }, { "epoch": 0.5722514648922435, "grad_norm": 0.4625165253053684, "learning_rate": 4.078647929177489e-06, "loss": 0.0064, "step": 2881 }, { "epoch": 0.572450094348992, "grad_norm": 0.6812644619662183, "learning_rate": 4.075485692144611e-06, "loss": 0.0173, "step": 2882 }, { "epoch": 0.5726487238057404, "grad_norm": 0.8313655313026633, "learning_rate": 4.072323837952181e-06, "loss": 0.0144, "step": 2883 }, { "epoch": 0.5728473532624888, "grad_norm": 0.45743710231818924, "learning_rate": 4.069162367909522e-06, "loss": 0.0113, "step": 2884 }, { "epoch": 0.5730459827192372, "grad_norm": 0.9030179886207947, "learning_rate": 4.0660012833257945e-06, "loss": 0.0153, "step": 2885 }, { "epoch": 0.5732446121759857, "grad_norm": 0.7782720108694118, "learning_rate": 4.062840585510001e-06, "loss": 0.01, "step": 2886 }, { "epoch": 0.5734432416327341, "grad_norm": 0.40236050134983353, "learning_rate": 4.05968027577098e-06, "loss": 0.0071, "step": 2887 }, { "epoch": 0.5736418710894826, "grad_norm": 0.6486277794089854, "learning_rate": 4.056520355417418e-06, "loss": 0.01, "step": 2888 }, { "epoch": 0.573840500546231, "grad_norm": 0.5243182245221714, "learning_rate": 4.053360825757831e-06, "loss": 0.0125, "step": 2889 }, { "epoch": 0.5740391300029795, "grad_norm": 0.6340711859635116, "learning_rate": 4.050201688100577e-06, "loss": 0.0114, "step": 2890 }, { "epoch": 0.5742377594597279, "grad_norm": 0.44387658891634135, "learning_rate": 4.047042943753853e-06, "loss": 0.0091, "step": 2891 }, { "epoch": 0.5744363889164763, "grad_norm": 0.2538412173316712, "learning_rate": 4.043884594025692e-06, "loss": 0.0054, "step": 2892 }, { "epoch": 0.5746350183732247, "grad_norm": 0.6434174001298162, "learning_rate": 4.040726640223967e-06, "loss": 0.0163, "step": 2893 }, { "epoch": 0.5748336478299731, "grad_norm": 0.4516311123319802, "learning_rate": 4.037569083656374e-06, "loss": 0.0088, "step": 2894 }, { "epoch": 0.5750322772867216, "grad_norm": 0.6032522782543573, "learning_rate": 4.034411925630462e-06, "loss": 0.0184, "step": 2895 }, { "epoch": 0.5752309067434701, "grad_norm": 0.5036736272759907, "learning_rate": 4.031255167453604e-06, "loss": 0.006, "step": 2896 }, { "epoch": 0.5754295362002185, "grad_norm": 0.5616796264869481, "learning_rate": 4.028098810433012e-06, "loss": 0.02, "step": 2897 }, { "epoch": 0.575628165656967, "grad_norm": 0.7195746914339065, "learning_rate": 4.024942855875728e-06, "loss": 0.0204, "step": 2898 }, { "epoch": 0.5758267951137154, "grad_norm": 0.681543673069867, "learning_rate": 4.021787305088633e-06, "loss": 0.0143, "step": 2899 }, { "epoch": 0.5760254245704638, "grad_norm": 0.4669150650713162, "learning_rate": 4.0186321593784325e-06, "loss": 0.0089, "step": 2900 }, { "epoch": 0.5762240540272122, "grad_norm": 0.33038421962080927, "learning_rate": 4.015477420051673e-06, "loss": 0.0096, "step": 2901 }, { "epoch": 0.5764226834839606, "grad_norm": 0.3634840974532087, "learning_rate": 4.012323088414729e-06, "loss": 0.0104, "step": 2902 }, { "epoch": 0.5766213129407091, "grad_norm": 0.8343922468702314, "learning_rate": 4.009169165773804e-06, "loss": 0.0117, "step": 2903 }, { "epoch": 0.5768199423974576, "grad_norm": 0.6495727500722914, "learning_rate": 4.0060156534349355e-06, "loss": 0.0125, "step": 2904 }, { "epoch": 0.577018571854206, "grad_norm": 0.6135698010073438, "learning_rate": 4.00286255270399e-06, "loss": 0.0146, "step": 2905 }, { "epoch": 0.5772172013109544, "grad_norm": 0.5573911672484038, "learning_rate": 3.9997098648866624e-06, "loss": 0.0124, "step": 2906 }, { "epoch": 0.5774158307677029, "grad_norm": 0.40620283454662026, "learning_rate": 3.996557591288477e-06, "loss": 0.0106, "step": 2907 }, { "epoch": 0.5776144602244513, "grad_norm": 0.7192076124521594, "learning_rate": 3.99340573321479e-06, "loss": 0.0102, "step": 2908 }, { "epoch": 0.5778130896811997, "grad_norm": 0.4360510467100798, "learning_rate": 3.99025429197078e-06, "loss": 0.0141, "step": 2909 }, { "epoch": 0.5780117191379481, "grad_norm": 0.463212787813459, "learning_rate": 3.987103268861457e-06, "loss": 0.0111, "step": 2910 }, { "epoch": 0.5782103485946966, "grad_norm": 0.8251451590325685, "learning_rate": 3.983952665191656e-06, "loss": 0.0142, "step": 2911 }, { "epoch": 0.578408978051445, "grad_norm": 0.761966750523954, "learning_rate": 3.980802482266038e-06, "loss": 0.0163, "step": 2912 }, { "epoch": 0.5786076075081935, "grad_norm": 0.6924212159837746, "learning_rate": 3.977652721389092e-06, "loss": 0.0099, "step": 2913 }, { "epoch": 0.5788062369649419, "grad_norm": 0.3979725594269708, "learning_rate": 3.97450338386513e-06, "loss": 0.0085, "step": 2914 }, { "epoch": 0.5790048664216904, "grad_norm": 0.5618585012904748, "learning_rate": 3.97135447099829e-06, "loss": 0.0127, "step": 2915 }, { "epoch": 0.5792034958784388, "grad_norm": 0.7497853838343431, "learning_rate": 3.968205984092533e-06, "loss": 0.0108, "step": 2916 }, { "epoch": 0.5794021253351872, "grad_norm": 0.5120214709576271, "learning_rate": 3.965057924451648e-06, "loss": 0.0131, "step": 2917 }, { "epoch": 0.5796007547919356, "grad_norm": 0.6871961008819738, "learning_rate": 3.961910293379236e-06, "loss": 0.0144, "step": 2918 }, { "epoch": 0.579799384248684, "grad_norm": 0.41734721830693366, "learning_rate": 3.958763092178734e-06, "loss": 0.01, "step": 2919 }, { "epoch": 0.5799980137054325, "grad_norm": 0.4036222914847742, "learning_rate": 3.955616322153391e-06, "loss": 0.0105, "step": 2920 }, { "epoch": 0.580196643162181, "grad_norm": 0.6130853508919999, "learning_rate": 3.952469984606285e-06, "loss": 0.0134, "step": 2921 }, { "epoch": 0.5803952726189294, "grad_norm": 0.3509331583951365, "learning_rate": 3.949324080840309e-06, "loss": 0.0098, "step": 2922 }, { "epoch": 0.5805939020756778, "grad_norm": 0.37833538757507784, "learning_rate": 3.946178612158178e-06, "loss": 0.0134, "step": 2923 }, { "epoch": 0.5807925315324263, "grad_norm": 0.46980032869709154, "learning_rate": 3.94303357986243e-06, "loss": 0.0093, "step": 2924 }, { "epoch": 0.5809911609891747, "grad_norm": 0.47678450473727596, "learning_rate": 3.939888985255415e-06, "loss": 0.0156, "step": 2925 }, { "epoch": 0.5811897904459231, "grad_norm": 0.48897265097095066, "learning_rate": 3.9367448296393115e-06, "loss": 0.0082, "step": 2926 }, { "epoch": 0.5813884199026715, "grad_norm": 0.39022191785178684, "learning_rate": 3.93360111431611e-06, "loss": 0.0097, "step": 2927 }, { "epoch": 0.58158704935942, "grad_norm": 0.9510163155213661, "learning_rate": 3.930457840587618e-06, "loss": 0.0189, "step": 2928 }, { "epoch": 0.5817856788161684, "grad_norm": 0.4727876728395284, "learning_rate": 3.927315009755464e-06, "loss": 0.0142, "step": 2929 }, { "epoch": 0.5819843082729169, "grad_norm": 0.7381075193685331, "learning_rate": 3.92417262312109e-06, "loss": 0.0111, "step": 2930 }, { "epoch": 0.5821829377296653, "grad_norm": 0.6627709815131074, "learning_rate": 3.921030681985755e-06, "loss": 0.0108, "step": 2931 }, { "epoch": 0.5823815671864138, "grad_norm": 0.49816760107685104, "learning_rate": 3.917889187650533e-06, "loss": 0.012, "step": 2932 }, { "epoch": 0.5825801966431622, "grad_norm": 0.7768642446479597, "learning_rate": 3.914748141416317e-06, "loss": 0.0104, "step": 2933 }, { "epoch": 0.5827788260999106, "grad_norm": 0.2884247629212007, "learning_rate": 3.9116075445838075e-06, "loss": 0.0094, "step": 2934 }, { "epoch": 0.582977455556659, "grad_norm": 0.47011413952328, "learning_rate": 3.908467398453524e-06, "loss": 0.0066, "step": 2935 }, { "epoch": 0.5831760850134075, "grad_norm": 0.4443819609591886, "learning_rate": 3.905327704325799e-06, "loss": 0.0104, "step": 2936 }, { "epoch": 0.5833747144701559, "grad_norm": 0.3839458518134734, "learning_rate": 3.902188463500774e-06, "loss": 0.0119, "step": 2937 }, { "epoch": 0.5835733439269044, "grad_norm": 0.6111807494928548, "learning_rate": 3.899049677278407e-06, "loss": 0.013, "step": 2938 }, { "epoch": 0.5837719733836528, "grad_norm": 0.60033600270167, "learning_rate": 3.895911346958466e-06, "loss": 0.013, "step": 2939 }, { "epoch": 0.5839706028404013, "grad_norm": 0.2970921298615577, "learning_rate": 3.892773473840531e-06, "loss": 0.0074, "step": 2940 }, { "epoch": 0.5841692322971497, "grad_norm": 0.610226222580878, "learning_rate": 3.889636059223993e-06, "loss": 0.0167, "step": 2941 }, { "epoch": 0.5843678617538981, "grad_norm": 0.6862312895558023, "learning_rate": 3.886499104408051e-06, "loss": 0.0121, "step": 2942 }, { "epoch": 0.5845664912106465, "grad_norm": 0.4404684118718429, "learning_rate": 3.883362610691711e-06, "loss": 0.0142, "step": 2943 }, { "epoch": 0.5847651206673949, "grad_norm": 0.9493244733608, "learning_rate": 3.880226579373799e-06, "loss": 0.0111, "step": 2944 }, { "epoch": 0.5849637501241434, "grad_norm": 0.7593496884130583, "learning_rate": 3.877091011752938e-06, "loss": 0.015, "step": 2945 }, { "epoch": 0.5851623795808919, "grad_norm": 0.6721422032614358, "learning_rate": 3.8739559091275646e-06, "loss": 0.0139, "step": 2946 }, { "epoch": 0.5853610090376403, "grad_norm": 0.48578862136014994, "learning_rate": 3.870821272795922e-06, "loss": 0.0087, "step": 2947 }, { "epoch": 0.5855596384943887, "grad_norm": 0.3783706141485433, "learning_rate": 3.867687104056059e-06, "loss": 0.0068, "step": 2948 }, { "epoch": 0.5857582679511372, "grad_norm": 0.7342026067918515, "learning_rate": 3.864553404205833e-06, "loss": 0.0154, "step": 2949 }, { "epoch": 0.5859568974078856, "grad_norm": 0.45238663502264936, "learning_rate": 3.861420174542903e-06, "loss": 0.0132, "step": 2950 }, { "epoch": 0.586155526864634, "grad_norm": 0.5760123588598605, "learning_rate": 3.85828741636474e-06, "loss": 0.0088, "step": 2951 }, { "epoch": 0.5863541563213824, "grad_norm": 0.37486015440914744, "learning_rate": 3.855155130968616e-06, "loss": 0.0137, "step": 2952 }, { "epoch": 0.5865527857781309, "grad_norm": 0.4390439990790977, "learning_rate": 3.852023319651605e-06, "loss": 0.011, "step": 2953 }, { "epoch": 0.5867514152348793, "grad_norm": 0.9686179462089476, "learning_rate": 3.848891983710587e-06, "loss": 0.0108, "step": 2954 }, { "epoch": 0.5869500446916278, "grad_norm": 0.7875155365033674, "learning_rate": 3.845761124442246e-06, "loss": 0.0238, "step": 2955 }, { "epoch": 0.5871486741483762, "grad_norm": 0.38882678317489044, "learning_rate": 3.842630743143068e-06, "loss": 0.0119, "step": 2956 }, { "epoch": 0.5873473036051247, "grad_norm": 0.6336228129818564, "learning_rate": 3.839500841109338e-06, "loss": 0.0104, "step": 2957 }, { "epoch": 0.5875459330618731, "grad_norm": 0.5805224655559802, "learning_rate": 3.836371419637149e-06, "loss": 0.014, "step": 2958 }, { "epoch": 0.5877445625186215, "grad_norm": 0.27676008745737823, "learning_rate": 3.833242480022391e-06, "loss": 0.0075, "step": 2959 }, { "epoch": 0.5879431919753699, "grad_norm": 0.30567765332023794, "learning_rate": 3.8301140235607525e-06, "loss": 0.0074, "step": 2960 }, { "epoch": 0.5881418214321184, "grad_norm": 0.3651536661757168, "learning_rate": 3.826986051547726e-06, "loss": 0.008, "step": 2961 }, { "epoch": 0.5883404508888668, "grad_norm": 0.5291250242062171, "learning_rate": 3.8238585652786004e-06, "loss": 0.0109, "step": 2962 }, { "epoch": 0.5885390803456153, "grad_norm": 0.3137663400127696, "learning_rate": 3.820731566048466e-06, "loss": 0.0087, "step": 2963 }, { "epoch": 0.5887377098023637, "grad_norm": 2.235523320234743, "learning_rate": 3.817605055152208e-06, "loss": 0.012, "step": 2964 }, { "epoch": 0.5889363392591122, "grad_norm": 0.3358171761925821, "learning_rate": 3.814479033884514e-06, "loss": 0.008, "step": 2965 }, { "epoch": 0.5891349687158606, "grad_norm": 0.987905521258979, "learning_rate": 3.8113535035398637e-06, "loss": 0.0168, "step": 2966 }, { "epoch": 0.589333598172609, "grad_norm": 0.6892134567235605, "learning_rate": 3.8082284654125373e-06, "loss": 0.0164, "step": 2967 }, { "epoch": 0.5895322276293574, "grad_norm": 0.987943395384034, "learning_rate": 3.805103920796609e-06, "loss": 0.0133, "step": 2968 }, { "epoch": 0.5897308570861058, "grad_norm": 0.8509307481721003, "learning_rate": 3.8019798709859512e-06, "loss": 0.0116, "step": 2969 }, { "epoch": 0.5899294865428543, "grad_norm": 0.759142605103621, "learning_rate": 3.79885631727423e-06, "loss": 0.0115, "step": 2970 }, { "epoch": 0.5901281159996027, "grad_norm": 0.24316763299724442, "learning_rate": 3.7957332609549037e-06, "loss": 0.0069, "step": 2971 }, { "epoch": 0.5903267454563512, "grad_norm": 0.4485372299375667, "learning_rate": 3.792610703321229e-06, "loss": 0.0086, "step": 2972 }, { "epoch": 0.5905253749130996, "grad_norm": 0.588418063546346, "learning_rate": 3.789488645666253e-06, "loss": 0.0127, "step": 2973 }, { "epoch": 0.5907240043698481, "grad_norm": 0.36059543710830055, "learning_rate": 3.7863670892828156e-06, "loss": 0.012, "step": 2974 }, { "epoch": 0.5909226338265965, "grad_norm": 0.902669268812175, "learning_rate": 3.783246035463551e-06, "loss": 0.014, "step": 2975 }, { "epoch": 0.5911212632833449, "grad_norm": 0.31958820455962267, "learning_rate": 3.780125485500885e-06, "loss": 0.0087, "step": 2976 }, { "epoch": 0.5913198927400933, "grad_norm": 0.4986550681651737, "learning_rate": 3.777005440687035e-06, "loss": 0.0098, "step": 2977 }, { "epoch": 0.5915185221968418, "grad_norm": 0.7580605603680164, "learning_rate": 3.773885902314006e-06, "loss": 0.0241, "step": 2978 }, { "epoch": 0.5917171516535902, "grad_norm": 0.2330131586895299, "learning_rate": 3.770766871673598e-06, "loss": 0.0069, "step": 2979 }, { "epoch": 0.5919157811103387, "grad_norm": 0.2919054898745666, "learning_rate": 3.7676483500573966e-06, "loss": 0.0046, "step": 2980 }, { "epoch": 0.5921144105670871, "grad_norm": 0.38396343904172237, "learning_rate": 3.76453033875678e-06, "loss": 0.0065, "step": 2981 }, { "epoch": 0.5923130400238356, "grad_norm": 0.8594455212714647, "learning_rate": 3.761412839062911e-06, "loss": 0.0133, "step": 2982 }, { "epoch": 0.592511669480584, "grad_norm": 0.20526215304091466, "learning_rate": 3.7582958522667466e-06, "loss": 0.0043, "step": 2983 }, { "epoch": 0.5927102989373324, "grad_norm": 0.3004063766309997, "learning_rate": 3.7551793796590263e-06, "loss": 0.0062, "step": 2984 }, { "epoch": 0.5929089283940808, "grad_norm": 0.41297527826978053, "learning_rate": 3.7520634225302788e-06, "loss": 0.0105, "step": 2985 }, { "epoch": 0.5931075578508292, "grad_norm": 0.8190693978061193, "learning_rate": 3.7489479821708173e-06, "loss": 0.0171, "step": 2986 }, { "epoch": 0.5933061873075777, "grad_norm": 0.4697013617601209, "learning_rate": 3.7458330598707443e-06, "loss": 0.0065, "step": 2987 }, { "epoch": 0.5935048167643262, "grad_norm": 0.5384383900255248, "learning_rate": 3.7427186569199456e-06, "loss": 0.0128, "step": 2988 }, { "epoch": 0.5937034462210746, "grad_norm": 0.753459781552131, "learning_rate": 3.739604774608092e-06, "loss": 0.02, "step": 2989 }, { "epoch": 0.593902075677823, "grad_norm": 0.49028554186530215, "learning_rate": 3.7364914142246383e-06, "loss": 0.0101, "step": 2990 }, { "epoch": 0.5941007051345715, "grad_norm": 0.7809148931549772, "learning_rate": 3.733378577058825e-06, "loss": 0.0146, "step": 2991 }, { "epoch": 0.5942993345913199, "grad_norm": 0.7722259717302931, "learning_rate": 3.7302662643996747e-06, "loss": 0.0141, "step": 2992 }, { "epoch": 0.5944979640480683, "grad_norm": 0.6140405841322572, "learning_rate": 3.7271544775359906e-06, "loss": 0.0162, "step": 2993 }, { "epoch": 0.5946965935048167, "grad_norm": 0.7586513477389154, "learning_rate": 3.7240432177563646e-06, "loss": 0.0132, "step": 2994 }, { "epoch": 0.5948952229615652, "grad_norm": 0.6630658842640412, "learning_rate": 3.720932486349165e-06, "loss": 0.0153, "step": 2995 }, { "epoch": 0.5950938524183136, "grad_norm": 0.6232367511499796, "learning_rate": 3.7178222846025404e-06, "loss": 0.0097, "step": 2996 }, { "epoch": 0.5952924818750621, "grad_norm": 0.38137155843187615, "learning_rate": 3.7147126138044243e-06, "loss": 0.0072, "step": 2997 }, { "epoch": 0.5954911113318105, "grad_norm": 0.6383755710136322, "learning_rate": 3.7116034752425277e-06, "loss": 0.0124, "step": 2998 }, { "epoch": 0.595689740788559, "grad_norm": 0.5596917796188703, "learning_rate": 3.708494870204342e-06, "loss": 0.0158, "step": 2999 }, { "epoch": 0.5958883702453074, "grad_norm": 0.42835170745607165, "learning_rate": 3.7053867999771366e-06, "loss": 0.0125, "step": 3000 }, { "epoch": 0.5960869997020558, "grad_norm": 0.39430591078922234, "learning_rate": 3.702279265847961e-06, "loss": 0.0099, "step": 3001 }, { "epoch": 0.5962856291588042, "grad_norm": 0.509207291742738, "learning_rate": 3.6991722691036423e-06, "loss": 0.0104, "step": 3002 }, { "epoch": 0.5964842586155527, "grad_norm": 0.8424408408284407, "learning_rate": 3.6960658110307844e-06, "loss": 0.0109, "step": 3003 }, { "epoch": 0.5966828880723011, "grad_norm": 0.7528576265240631, "learning_rate": 3.6929598929157682e-06, "loss": 0.0151, "step": 3004 }, { "epoch": 0.5968815175290496, "grad_norm": 0.45476504766802717, "learning_rate": 3.689854516044752e-06, "loss": 0.0143, "step": 3005 }, { "epoch": 0.597080146985798, "grad_norm": 0.44031126237277396, "learning_rate": 3.6867496817036674e-06, "loss": 0.0138, "step": 3006 }, { "epoch": 0.5972787764425465, "grad_norm": 0.5965406726879514, "learning_rate": 3.6836453911782244e-06, "loss": 0.0097, "step": 3007 }, { "epoch": 0.5974774058992949, "grad_norm": 0.32200058430663714, "learning_rate": 3.680541645753908e-06, "loss": 0.0065, "step": 3008 }, { "epoch": 0.5976760353560433, "grad_norm": 0.35025081668798586, "learning_rate": 3.677438446715974e-06, "loss": 0.0148, "step": 3009 }, { "epoch": 0.5978746648127917, "grad_norm": 0.732734140663941, "learning_rate": 3.6743357953494554e-06, "loss": 0.0232, "step": 3010 }, { "epoch": 0.5980732942695401, "grad_norm": 0.9147359044098355, "learning_rate": 3.6712336929391558e-06, "loss": 0.0171, "step": 3011 }, { "epoch": 0.5982719237262886, "grad_norm": 0.47195978016132334, "learning_rate": 3.6681321407696546e-06, "loss": 0.0108, "step": 3012 }, { "epoch": 0.598470553183037, "grad_norm": 0.3255368283156676, "learning_rate": 3.665031140125299e-06, "loss": 0.0082, "step": 3013 }, { "epoch": 0.5986691826397855, "grad_norm": 0.4043095691857067, "learning_rate": 3.661930692290211e-06, "loss": 0.0111, "step": 3014 }, { "epoch": 0.598867812096534, "grad_norm": 0.8606549728317674, "learning_rate": 3.658830798548284e-06, "loss": 0.0208, "step": 3015 }, { "epoch": 0.5990664415532824, "grad_norm": 0.31791862321187564, "learning_rate": 3.6557314601831804e-06, "loss": 0.0075, "step": 3016 }, { "epoch": 0.5992650710100308, "grad_norm": 0.61615355943961, "learning_rate": 3.6526326784783328e-06, "loss": 0.0188, "step": 3017 }, { "epoch": 0.5994637004667792, "grad_norm": 0.3392910567038716, "learning_rate": 3.649534454716942e-06, "loss": 0.0071, "step": 3018 }, { "epoch": 0.5996623299235276, "grad_norm": 0.3407654018111544, "learning_rate": 3.646436790181983e-06, "loss": 0.0091, "step": 3019 }, { "epoch": 0.5998609593802761, "grad_norm": 0.43481756324822807, "learning_rate": 3.643339686156193e-06, "loss": 0.007, "step": 3020 }, { "epoch": 0.6000595888370245, "grad_norm": 0.42659281115467734, "learning_rate": 3.6402431439220807e-06, "loss": 0.0081, "step": 3021 }, { "epoch": 0.600258218293773, "grad_norm": 0.7240516093008215, "learning_rate": 3.6371471647619212e-06, "loss": 0.0172, "step": 3022 }, { "epoch": 0.6004568477505214, "grad_norm": 0.2238508808138508, "learning_rate": 3.6340517499577552e-06, "loss": 0.0068, "step": 3023 }, { "epoch": 0.6006554772072699, "grad_norm": 0.641644301510662, "learning_rate": 3.6309569007913926e-06, "loss": 0.0192, "step": 3024 }, { "epoch": 0.6008541066640183, "grad_norm": 0.6086067117270952, "learning_rate": 3.6278626185444043e-06, "loss": 0.0146, "step": 3025 }, { "epoch": 0.6010527361207667, "grad_norm": 0.562268484312575, "learning_rate": 3.624768904498133e-06, "loss": 0.0127, "step": 3026 }, { "epoch": 0.6012513655775151, "grad_norm": 0.5503772462099394, "learning_rate": 3.6216757599336817e-06, "loss": 0.0122, "step": 3027 }, { "epoch": 0.6014499950342636, "grad_norm": 0.3503335156258801, "learning_rate": 3.6185831861319175e-06, "loss": 0.0096, "step": 3028 }, { "epoch": 0.601648624491012, "grad_norm": 0.4135501081556177, "learning_rate": 3.6154911843734726e-06, "loss": 0.0098, "step": 3029 }, { "epoch": 0.6018472539477605, "grad_norm": 0.3319077445993011, "learning_rate": 3.612399755938741e-06, "loss": 0.0095, "step": 3030 }, { "epoch": 0.6020458834045089, "grad_norm": 0.4357565471543654, "learning_rate": 3.609308902107882e-06, "loss": 0.0093, "step": 3031 }, { "epoch": 0.6022445128612574, "grad_norm": 0.3754875478839089, "learning_rate": 3.6062186241608127e-06, "loss": 0.0067, "step": 3032 }, { "epoch": 0.6024431423180058, "grad_norm": 0.4655009047495604, "learning_rate": 3.603128923377216e-06, "loss": 0.0104, "step": 3033 }, { "epoch": 0.6026417717747542, "grad_norm": 0.37526412933260067, "learning_rate": 3.6000398010365335e-06, "loss": 0.0102, "step": 3034 }, { "epoch": 0.6028404012315026, "grad_norm": 0.37466649162476384, "learning_rate": 3.5969512584179676e-06, "loss": 0.0062, "step": 3035 }, { "epoch": 0.603039030688251, "grad_norm": 0.6376237051267966, "learning_rate": 3.5938632968004816e-06, "loss": 0.007, "step": 3036 }, { "epoch": 0.6032376601449995, "grad_norm": 0.5472784553576067, "learning_rate": 3.590775917462795e-06, "loss": 0.0087, "step": 3037 }, { "epoch": 0.6034362896017479, "grad_norm": 0.35009623082706764, "learning_rate": 3.5876891216833898e-06, "loss": 0.0111, "step": 3038 }, { "epoch": 0.6036349190584964, "grad_norm": 0.46974213511879326, "learning_rate": 3.5846029107405043e-06, "loss": 0.0095, "step": 3039 }, { "epoch": 0.6038335485152448, "grad_norm": 0.7485818959883949, "learning_rate": 3.581517285912137e-06, "loss": 0.0173, "step": 3040 }, { "epoch": 0.6040321779719933, "grad_norm": 0.38815346960654723, "learning_rate": 3.578432248476041e-06, "loss": 0.0086, "step": 3041 }, { "epoch": 0.6042308074287417, "grad_norm": 0.5263522252501535, "learning_rate": 3.5753477997097274e-06, "loss": 0.0089, "step": 3042 }, { "epoch": 0.6044294368854901, "grad_norm": 0.5860186189203216, "learning_rate": 3.5722639408904613e-06, "loss": 0.0121, "step": 3043 }, { "epoch": 0.6046280663422385, "grad_norm": 0.5281102457455624, "learning_rate": 3.5691806732952695e-06, "loss": 0.0121, "step": 3044 }, { "epoch": 0.604826695798987, "grad_norm": 0.6500090193718753, "learning_rate": 3.5660979982009283e-06, "loss": 0.0136, "step": 3045 }, { "epoch": 0.6050253252557354, "grad_norm": 0.39939760825863435, "learning_rate": 3.563015916883969e-06, "loss": 0.0085, "step": 3046 }, { "epoch": 0.6052239547124839, "grad_norm": 0.347841392980447, "learning_rate": 3.5599344306206797e-06, "loss": 0.0106, "step": 3047 }, { "epoch": 0.6054225841692323, "grad_norm": 0.9314772406942418, "learning_rate": 3.5568535406871006e-06, "loss": 0.0122, "step": 3048 }, { "epoch": 0.6056212136259808, "grad_norm": 0.4754175168943589, "learning_rate": 3.553773248359026e-06, "loss": 0.0114, "step": 3049 }, { "epoch": 0.6058198430827292, "grad_norm": 0.3708403797175746, "learning_rate": 3.5506935549119994e-06, "loss": 0.0091, "step": 3050 }, { "epoch": 0.6060184725394776, "grad_norm": 1.7346629552296347, "learning_rate": 3.547614461621321e-06, "loss": 0.013, "step": 3051 }, { "epoch": 0.606217101996226, "grad_norm": 0.40695318664902086, "learning_rate": 3.5445359697620396e-06, "loss": 0.0092, "step": 3052 }, { "epoch": 0.6064157314529744, "grad_norm": 0.7125792193716718, "learning_rate": 3.541458080608956e-06, "loss": 0.0119, "step": 3053 }, { "epoch": 0.6066143609097229, "grad_norm": 0.6915939424187886, "learning_rate": 3.5383807954366207e-06, "loss": 0.0168, "step": 3054 }, { "epoch": 0.6068129903664713, "grad_norm": 1.730306247282683, "learning_rate": 3.5353041155193333e-06, "loss": 0.0073, "step": 3055 }, { "epoch": 0.6070116198232198, "grad_norm": 0.438350129352838, "learning_rate": 3.5322280421311462e-06, "loss": 0.0095, "step": 3056 }, { "epoch": 0.6072102492799683, "grad_norm": 1.0041964816668, "learning_rate": 3.5291525765458555e-06, "loss": 0.0138, "step": 3057 }, { "epoch": 0.6074088787367167, "grad_norm": 0.48577362615681124, "learning_rate": 3.5260777200370108e-06, "loss": 0.0109, "step": 3058 }, { "epoch": 0.6076075081934651, "grad_norm": 0.5217130396983579, "learning_rate": 3.5230034738779062e-06, "loss": 0.0166, "step": 3059 }, { "epoch": 0.6078061376502135, "grad_norm": 0.519867329149172, "learning_rate": 3.519929839341586e-06, "loss": 0.0135, "step": 3060 }, { "epoch": 0.6080047671069619, "grad_norm": 0.5343544411448541, "learning_rate": 3.5168568177008343e-06, "loss": 0.0121, "step": 3061 }, { "epoch": 0.6082033965637104, "grad_norm": 0.5921739219433378, "learning_rate": 3.51378441022819e-06, "loss": 0.0217, "step": 3062 }, { "epoch": 0.6084020260204588, "grad_norm": 0.4319692092553288, "learning_rate": 3.5107126181959326e-06, "loss": 0.0087, "step": 3063 }, { "epoch": 0.6086006554772073, "grad_norm": 0.7628603176542459, "learning_rate": 3.507641442876089e-06, "loss": 0.0079, "step": 3064 }, { "epoch": 0.6087992849339557, "grad_norm": 0.5049220741692332, "learning_rate": 3.50457088554043e-06, "loss": 0.0047, "step": 3065 }, { "epoch": 0.6089979143907042, "grad_norm": 1.412081883923703, "learning_rate": 3.5015009474604687e-06, "loss": 0.0155, "step": 3066 }, { "epoch": 0.6091965438474526, "grad_norm": 1.8218808527341421, "learning_rate": 3.498431629907465e-06, "loss": 0.0156, "step": 3067 }, { "epoch": 0.609395173304201, "grad_norm": 0.3862970491125843, "learning_rate": 3.4953629341524185e-06, "loss": 0.0083, "step": 3068 }, { "epoch": 0.6095938027609494, "grad_norm": 0.5034340958370394, "learning_rate": 3.4922948614660755e-06, "loss": 0.0106, "step": 3069 }, { "epoch": 0.6097924322176979, "grad_norm": 0.6806056800334587, "learning_rate": 3.4892274131189203e-06, "loss": 0.0091, "step": 3070 }, { "epoch": 0.6099910616744463, "grad_norm": 1.6108884326279738, "learning_rate": 3.4861605903811802e-06, "loss": 0.0169, "step": 3071 }, { "epoch": 0.6101896911311948, "grad_norm": 0.4012224150673856, "learning_rate": 3.4830943945228243e-06, "loss": 0.0086, "step": 3072 }, { "epoch": 0.6103883205879432, "grad_norm": 0.5349335803458497, "learning_rate": 3.4800288268135598e-06, "loss": 0.0174, "step": 3073 }, { "epoch": 0.6105869500446917, "grad_norm": 0.4841940483295753, "learning_rate": 3.4769638885228364e-06, "loss": 0.0168, "step": 3074 }, { "epoch": 0.6107855795014401, "grad_norm": 0.6441740428833413, "learning_rate": 3.47389958091984e-06, "loss": 0.015, "step": 3075 }, { "epoch": 0.6109842089581885, "grad_norm": 0.357406903865928, "learning_rate": 3.4708359052735006e-06, "loss": 0.0082, "step": 3076 }, { "epoch": 0.6111828384149369, "grad_norm": 0.5834140528899641, "learning_rate": 3.4677728628524807e-06, "loss": 0.0115, "step": 3077 }, { "epoch": 0.6113814678716853, "grad_norm": 0.4928712484829136, "learning_rate": 3.464710454925184e-06, "loss": 0.0132, "step": 3078 }, { "epoch": 0.6115800973284338, "grad_norm": 0.5979492434923857, "learning_rate": 3.461648682759752e-06, "loss": 0.0095, "step": 3079 }, { "epoch": 0.6117787267851822, "grad_norm": 0.5154017529353565, "learning_rate": 3.4585875476240584e-06, "loss": 0.0131, "step": 3080 }, { "epoch": 0.6119773562419307, "grad_norm": 1.054941339463001, "learning_rate": 3.4555270507857174e-06, "loss": 0.0082, "step": 3081 }, { "epoch": 0.6121759856986791, "grad_norm": 0.7383146639599456, "learning_rate": 3.452467193512078e-06, "loss": 0.0086, "step": 3082 }, { "epoch": 0.6123746151554276, "grad_norm": 0.46223660532581257, "learning_rate": 3.449407977070225e-06, "loss": 0.008, "step": 3083 }, { "epoch": 0.612573244612176, "grad_norm": 0.6168524172398883, "learning_rate": 3.4463494027269772e-06, "loss": 0.0111, "step": 3084 }, { "epoch": 0.6127718740689244, "grad_norm": 0.443610681926009, "learning_rate": 3.443291471748884e-06, "loss": 0.0088, "step": 3085 }, { "epoch": 0.6129705035256728, "grad_norm": 0.42211292893459496, "learning_rate": 3.4402341854022326e-06, "loss": 0.0115, "step": 3086 }, { "epoch": 0.6131691329824213, "grad_norm": 0.38959656912953755, "learning_rate": 3.4371775449530444e-06, "loss": 0.0118, "step": 3087 }, { "epoch": 0.6133677624391697, "grad_norm": 0.482420771604837, "learning_rate": 3.434121551667069e-06, "loss": 0.0124, "step": 3088 }, { "epoch": 0.6135663918959182, "grad_norm": 0.9713114677647458, "learning_rate": 3.4310662068097915e-06, "loss": 0.0177, "step": 3089 }, { "epoch": 0.6137650213526666, "grad_norm": 0.41367064378744073, "learning_rate": 3.4280115116464263e-06, "loss": 0.0137, "step": 3090 }, { "epoch": 0.6139636508094151, "grad_norm": 0.3784820028429733, "learning_rate": 3.4249574674419206e-06, "loss": 0.0163, "step": 3091 }, { "epoch": 0.6141622802661635, "grad_norm": 0.4813484208222651, "learning_rate": 3.4219040754609497e-06, "loss": 0.0141, "step": 3092 }, { "epoch": 0.6143609097229119, "grad_norm": 0.28494432612150494, "learning_rate": 3.41885133696792e-06, "loss": 0.0054, "step": 3093 }, { "epoch": 0.6145595391796603, "grad_norm": 0.5830609541475335, "learning_rate": 3.415799253226969e-06, "loss": 0.0095, "step": 3094 }, { "epoch": 0.6147581686364088, "grad_norm": 0.32540607111091596, "learning_rate": 3.4127478255019607e-06, "loss": 0.0107, "step": 3095 }, { "epoch": 0.6149567980931572, "grad_norm": 0.35280809057789947, "learning_rate": 3.409697055056489e-06, "loss": 0.0105, "step": 3096 }, { "epoch": 0.6151554275499056, "grad_norm": 0.7711115094509354, "learning_rate": 3.406646943153874e-06, "loss": 0.0131, "step": 3097 }, { "epoch": 0.6153540570066541, "grad_norm": 0.6585705185875225, "learning_rate": 3.4035974910571635e-06, "loss": 0.0089, "step": 3098 }, { "epoch": 0.6155526864634026, "grad_norm": 0.8014653930270288, "learning_rate": 3.4005487000291336e-06, "loss": 0.0094, "step": 3099 }, { "epoch": 0.615751315920151, "grad_norm": 0.6004322261525369, "learning_rate": 3.3975005713322852e-06, "loss": 0.0183, "step": 3100 }, { "epoch": 0.6159499453768994, "grad_norm": 0.5039813081737613, "learning_rate": 3.3944531062288456e-06, "loss": 0.0117, "step": 3101 }, { "epoch": 0.6161485748336478, "grad_norm": 0.32998773181787877, "learning_rate": 3.391406305980767e-06, "loss": 0.0072, "step": 3102 }, { "epoch": 0.6163472042903962, "grad_norm": 0.39531334946742824, "learning_rate": 3.388360171849726e-06, "loss": 0.0131, "step": 3103 }, { "epoch": 0.6165458337471447, "grad_norm": 1.4423036226921424, "learning_rate": 3.3853147050971245e-06, "loss": 0.028, "step": 3104 }, { "epoch": 0.6167444632038931, "grad_norm": 0.6108617381540014, "learning_rate": 3.382269906984086e-06, "loss": 0.0083, "step": 3105 }, { "epoch": 0.6169430926606416, "grad_norm": 0.5450037541557885, "learning_rate": 3.3792257787714593e-06, "loss": 0.0192, "step": 3106 }, { "epoch": 0.61714172211739, "grad_norm": 1.7949375217276946, "learning_rate": 3.376182321719813e-06, "loss": 0.0241, "step": 3107 }, { "epoch": 0.6173403515741385, "grad_norm": 0.47593830021557054, "learning_rate": 3.3731395370894447e-06, "loss": 0.0137, "step": 3108 }, { "epoch": 0.6175389810308869, "grad_norm": 0.5799733370382245, "learning_rate": 3.370097426140363e-06, "loss": 0.0119, "step": 3109 }, { "epoch": 0.6177376104876353, "grad_norm": 0.4307995841914232, "learning_rate": 3.3670559901323054e-06, "loss": 0.0081, "step": 3110 }, { "epoch": 0.6179362399443837, "grad_norm": 1.0801970697450218, "learning_rate": 3.364015230324725e-06, "loss": 0.0237, "step": 3111 }, { "epoch": 0.6181348694011322, "grad_norm": 0.4367611467960593, "learning_rate": 3.3609751479768003e-06, "loss": 0.0098, "step": 3112 }, { "epoch": 0.6183334988578806, "grad_norm": 0.3381363192748383, "learning_rate": 3.3579357443474264e-06, "loss": 0.0083, "step": 3113 }, { "epoch": 0.6185321283146291, "grad_norm": 0.31029202975354897, "learning_rate": 3.354897020695216e-06, "loss": 0.0069, "step": 3114 }, { "epoch": 0.6187307577713775, "grad_norm": 0.36141553067284127, "learning_rate": 3.3518589782785016e-06, "loss": 0.0053, "step": 3115 }, { "epoch": 0.618929387228126, "grad_norm": 0.7205196066259474, "learning_rate": 3.348821618355334e-06, "loss": 0.0174, "step": 3116 }, { "epoch": 0.6191280166848744, "grad_norm": 0.392173406386398, "learning_rate": 3.345784942183481e-06, "loss": 0.0118, "step": 3117 }, { "epoch": 0.6193266461416228, "grad_norm": 0.3923813550404335, "learning_rate": 3.342748951020425e-06, "loss": 0.0072, "step": 3118 }, { "epoch": 0.6195252755983712, "grad_norm": 0.9318581326956561, "learning_rate": 3.3397136461233705e-06, "loss": 0.0158, "step": 3119 }, { "epoch": 0.6197239050551197, "grad_norm": 0.4305556661534459, "learning_rate": 3.3366790287492323e-06, "loss": 0.01, "step": 3120 }, { "epoch": 0.6199225345118681, "grad_norm": 0.494409947487882, "learning_rate": 3.3336451001546422e-06, "loss": 0.0114, "step": 3121 }, { "epoch": 0.6201211639686165, "grad_norm": 0.41455988706859453, "learning_rate": 3.3306118615959483e-06, "loss": 0.006, "step": 3122 }, { "epoch": 0.620319793425365, "grad_norm": 0.7803252545572082, "learning_rate": 3.32757931432921e-06, "loss": 0.0181, "step": 3123 }, { "epoch": 0.6205184228821135, "grad_norm": 0.5185588948320111, "learning_rate": 3.3245474596102034e-06, "loss": 0.0094, "step": 3124 }, { "epoch": 0.6207170523388619, "grad_norm": 0.35619054220848395, "learning_rate": 3.3215162986944145e-06, "loss": 0.0169, "step": 3125 }, { "epoch": 0.6209156817956103, "grad_norm": 0.6897006764622771, "learning_rate": 3.3184858328370464e-06, "loss": 0.0112, "step": 3126 }, { "epoch": 0.6211143112523587, "grad_norm": 0.579124162879879, "learning_rate": 3.315456063293011e-06, "loss": 0.0161, "step": 3127 }, { "epoch": 0.6213129407091071, "grad_norm": 0.45691926633221563, "learning_rate": 3.312426991316933e-06, "loss": 0.0079, "step": 3128 }, { "epoch": 0.6215115701658556, "grad_norm": 0.3550439857141177, "learning_rate": 3.309398618163148e-06, "loss": 0.01, "step": 3129 }, { "epoch": 0.621710199622604, "grad_norm": 0.4261114797732587, "learning_rate": 3.306370945085702e-06, "loss": 0.0093, "step": 3130 }, { "epoch": 0.6219088290793525, "grad_norm": 0.509512632829503, "learning_rate": 3.303343973338352e-06, "loss": 0.0149, "step": 3131 }, { "epoch": 0.6221074585361009, "grad_norm": 0.33665150006855243, "learning_rate": 3.3003177041745644e-06, "loss": 0.0044, "step": 3132 }, { "epoch": 0.6223060879928494, "grad_norm": 0.6093443769153506, "learning_rate": 3.297292138847512e-06, "loss": 0.0139, "step": 3133 }, { "epoch": 0.6225047174495978, "grad_norm": 0.6356168599910234, "learning_rate": 3.2942672786100806e-06, "loss": 0.0119, "step": 3134 }, { "epoch": 0.6227033469063462, "grad_norm": 0.4716855426406557, "learning_rate": 3.2912431247148603e-06, "loss": 0.0123, "step": 3135 }, { "epoch": 0.6229019763630946, "grad_norm": 0.26872765852592456, "learning_rate": 3.2882196784141497e-06, "loss": 0.0087, "step": 3136 }, { "epoch": 0.6231006058198431, "grad_norm": 0.9289996201648486, "learning_rate": 3.285196940959957e-06, "loss": 0.0128, "step": 3137 }, { "epoch": 0.6232992352765915, "grad_norm": 0.5778170278567689, "learning_rate": 3.2821749136039947e-06, "loss": 0.015, "step": 3138 }, { "epoch": 0.6234978647333399, "grad_norm": 0.44484550235368014, "learning_rate": 3.2791535975976796e-06, "loss": 0.0112, "step": 3139 }, { "epoch": 0.6236964941900884, "grad_norm": 0.5650552894350845, "learning_rate": 3.276132994192137e-06, "loss": 0.0168, "step": 3140 }, { "epoch": 0.6238951236468369, "grad_norm": 0.45586647670638364, "learning_rate": 3.2731131046381946e-06, "loss": 0.0165, "step": 3141 }, { "epoch": 0.6240937531035853, "grad_norm": 0.3991031664463805, "learning_rate": 3.2700939301863867e-06, "loss": 0.0088, "step": 3142 }, { "epoch": 0.6242923825603337, "grad_norm": 0.5441221777716391, "learning_rate": 3.2670754720869483e-06, "loss": 0.0091, "step": 3143 }, { "epoch": 0.6244910120170821, "grad_norm": 0.4508262833394596, "learning_rate": 3.2640577315898232e-06, "loss": 0.0101, "step": 3144 }, { "epoch": 0.6246896414738305, "grad_norm": 0.5522463595883214, "learning_rate": 3.261040709944653e-06, "loss": 0.0199, "step": 3145 }, { "epoch": 0.624888270930579, "grad_norm": 0.5241449671995597, "learning_rate": 3.258024408400783e-06, "loss": 0.0155, "step": 3146 }, { "epoch": 0.6250869003873274, "grad_norm": 0.5548789843023203, "learning_rate": 3.2550088282072614e-06, "loss": 0.0109, "step": 3147 }, { "epoch": 0.6252855298440759, "grad_norm": 1.0504625440702693, "learning_rate": 3.2519939706128354e-06, "loss": 0.019, "step": 3148 }, { "epoch": 0.6254841593008243, "grad_norm": 0.4102947730181052, "learning_rate": 3.2489798368659568e-06, "loss": 0.0106, "step": 3149 }, { "epoch": 0.6256827887575728, "grad_norm": 0.6501870965913413, "learning_rate": 3.245966428214773e-06, "loss": 0.0102, "step": 3150 }, { "epoch": 0.6258814182143212, "grad_norm": 0.542258565431279, "learning_rate": 3.242953745907136e-06, "loss": 0.0164, "step": 3151 }, { "epoch": 0.6260800476710696, "grad_norm": 0.43709875345405597, "learning_rate": 3.2399417911905928e-06, "loss": 0.0099, "step": 3152 }, { "epoch": 0.626278677127818, "grad_norm": 0.4169502805419377, "learning_rate": 3.2369305653123918e-06, "loss": 0.0161, "step": 3153 }, { "epoch": 0.6264773065845665, "grad_norm": 0.7203211824671325, "learning_rate": 3.2339200695194776e-06, "loss": 0.0132, "step": 3154 }, { "epoch": 0.6266759360413149, "grad_norm": 0.6337861054133493, "learning_rate": 3.2309103050584943e-06, "loss": 0.0155, "step": 3155 }, { "epoch": 0.6268745654980633, "grad_norm": 0.5461213073528943, "learning_rate": 3.227901273175783e-06, "loss": 0.018, "step": 3156 }, { "epoch": 0.6270731949548118, "grad_norm": 0.4288994784155615, "learning_rate": 3.2248929751173796e-06, "loss": 0.013, "step": 3157 }, { "epoch": 0.6272718244115603, "grad_norm": 0.5336781946873922, "learning_rate": 3.2218854121290167e-06, "loss": 0.0125, "step": 3158 }, { "epoch": 0.6274704538683087, "grad_norm": 0.39411430837415473, "learning_rate": 3.218878585456124e-06, "loss": 0.0155, "step": 3159 }, { "epoch": 0.6276690833250571, "grad_norm": 0.7372205200276526, "learning_rate": 3.215872496343826e-06, "loss": 0.0109, "step": 3160 }, { "epoch": 0.6278677127818055, "grad_norm": 0.31051639973025613, "learning_rate": 3.212867146036939e-06, "loss": 0.0112, "step": 3161 }, { "epoch": 0.628066342238554, "grad_norm": 0.7517003368889213, "learning_rate": 3.2098625357799777e-06, "loss": 0.0152, "step": 3162 }, { "epoch": 0.6282649716953024, "grad_norm": 0.4493855979835352, "learning_rate": 3.2068586668171487e-06, "loss": 0.0128, "step": 3163 }, { "epoch": 0.6284636011520508, "grad_norm": 0.43259693880401673, "learning_rate": 3.2038555403923495e-06, "loss": 0.0105, "step": 3164 }, { "epoch": 0.6286622306087993, "grad_norm": 0.6757306391799492, "learning_rate": 3.2008531577491726e-06, "loss": 0.015, "step": 3165 }, { "epoch": 0.6288608600655478, "grad_norm": 0.7200622315806112, "learning_rate": 3.197851520130901e-06, "loss": 0.0127, "step": 3166 }, { "epoch": 0.6290594895222962, "grad_norm": 0.30945546413393143, "learning_rate": 3.1948506287805105e-06, "loss": 0.0071, "step": 3167 }, { "epoch": 0.6292581189790446, "grad_norm": 0.5184927164257134, "learning_rate": 3.1918504849406655e-06, "loss": 0.013, "step": 3168 }, { "epoch": 0.629456748435793, "grad_norm": 0.37944118777802155, "learning_rate": 3.188851089853725e-06, "loss": 0.0121, "step": 3169 }, { "epoch": 0.6296553778925414, "grad_norm": 0.6108105036736555, "learning_rate": 3.185852444761735e-06, "loss": 0.0175, "step": 3170 }, { "epoch": 0.6298540073492899, "grad_norm": 0.45049388301001664, "learning_rate": 3.182854550906431e-06, "loss": 0.0119, "step": 3171 }, { "epoch": 0.6300526368060383, "grad_norm": 0.5933853421541697, "learning_rate": 3.1798574095292366e-06, "loss": 0.0153, "step": 3172 }, { "epoch": 0.6302512662627868, "grad_norm": 0.3057531950463691, "learning_rate": 3.176861021871267e-06, "loss": 0.0109, "step": 3173 }, { "epoch": 0.6304498957195352, "grad_norm": 0.3058524461492556, "learning_rate": 3.1738653891733228e-06, "loss": 0.0102, "step": 3174 }, { "epoch": 0.6306485251762837, "grad_norm": 0.297644692305566, "learning_rate": 3.170870512675891e-06, "loss": 0.0083, "step": 3175 }, { "epoch": 0.6308471546330321, "grad_norm": 0.41503730094845476, "learning_rate": 3.1678763936191493e-06, "loss": 0.0071, "step": 3176 }, { "epoch": 0.6310457840897805, "grad_norm": 0.43732569514354086, "learning_rate": 3.1648830332429576e-06, "loss": 0.0149, "step": 3177 }, { "epoch": 0.6312444135465289, "grad_norm": 0.4053359605257468, "learning_rate": 3.161890432786864e-06, "loss": 0.0119, "step": 3178 }, { "epoch": 0.6314430430032774, "grad_norm": 0.4018745491695557, "learning_rate": 3.1588985934901024e-06, "loss": 0.0102, "step": 3179 }, { "epoch": 0.6316416724600258, "grad_norm": 0.7769830143664423, "learning_rate": 3.1559075165915897e-06, "loss": 0.0141, "step": 3180 }, { "epoch": 0.6318403019167742, "grad_norm": 0.4333501607393468, "learning_rate": 3.152917203329927e-06, "loss": 0.0073, "step": 3181 }, { "epoch": 0.6320389313735227, "grad_norm": 0.3780998689723311, "learning_rate": 3.149927654943401e-06, "loss": 0.0101, "step": 3182 }, { "epoch": 0.6322375608302712, "grad_norm": 0.5924510687322293, "learning_rate": 3.14693887266998e-06, "loss": 0.015, "step": 3183 }, { "epoch": 0.6324361902870196, "grad_norm": 0.7703336171257334, "learning_rate": 3.143950857747316e-06, "loss": 0.009, "step": 3184 }, { "epoch": 0.632634819743768, "grad_norm": 0.47579749826164863, "learning_rate": 3.1409636114127434e-06, "loss": 0.0103, "step": 3185 }, { "epoch": 0.6328334492005164, "grad_norm": 0.5329585694417279, "learning_rate": 3.137977134903276e-06, "loss": 0.015, "step": 3186 }, { "epoch": 0.6330320786572649, "grad_norm": 0.3926580971839605, "learning_rate": 3.1349914294556146e-06, "loss": 0.0122, "step": 3187 }, { "epoch": 0.6332307081140133, "grad_norm": 0.7331321468927112, "learning_rate": 3.1320064963061335e-06, "loss": 0.0179, "step": 3188 }, { "epoch": 0.6334293375707617, "grad_norm": 0.7309606038405821, "learning_rate": 3.1290223366908923e-06, "loss": 0.0097, "step": 3189 }, { "epoch": 0.6336279670275102, "grad_norm": 0.40442647112520747, "learning_rate": 3.1260389518456275e-06, "loss": 0.0081, "step": 3190 }, { "epoch": 0.6338265964842587, "grad_norm": 0.23396485229178887, "learning_rate": 3.123056343005756e-06, "loss": 0.0042, "step": 3191 }, { "epoch": 0.6340252259410071, "grad_norm": 0.31067689461138565, "learning_rate": 3.1200745114063733e-06, "loss": 0.0068, "step": 3192 }, { "epoch": 0.6342238553977555, "grad_norm": 0.4863770217084106, "learning_rate": 3.117093458282252e-06, "loss": 0.0082, "step": 3193 }, { "epoch": 0.6344224848545039, "grad_norm": 0.6185800085213491, "learning_rate": 3.1141131848678453e-06, "loss": 0.0113, "step": 3194 }, { "epoch": 0.6346211143112523, "grad_norm": 0.4166395787409532, "learning_rate": 3.111133692397279e-06, "loss": 0.0072, "step": 3195 }, { "epoch": 0.6348197437680008, "grad_norm": 0.6441347162448549, "learning_rate": 3.10815498210436e-06, "loss": 0.0191, "step": 3196 }, { "epoch": 0.6350183732247492, "grad_norm": 0.5648143884299345, "learning_rate": 3.105177055222569e-06, "loss": 0.0188, "step": 3197 }, { "epoch": 0.6352170026814976, "grad_norm": 0.2983740379639698, "learning_rate": 3.102199912985061e-06, "loss": 0.0074, "step": 3198 }, { "epoch": 0.6354156321382461, "grad_norm": 0.4602927312462115, "learning_rate": 3.099223556624669e-06, "loss": 0.0139, "step": 3199 }, { "epoch": 0.6356142615949946, "grad_norm": 0.697211094443911, "learning_rate": 3.096247987373897e-06, "loss": 0.0155, "step": 3200 }, { "epoch": 0.635812891051743, "grad_norm": 0.7485575048841167, "learning_rate": 3.0932732064649284e-06, "loss": 0.016, "step": 3201 }, { "epoch": 0.6360115205084914, "grad_norm": 0.78529100896383, "learning_rate": 3.0902992151296156e-06, "loss": 0.0226, "step": 3202 }, { "epoch": 0.6362101499652398, "grad_norm": 0.41085026543693437, "learning_rate": 3.0873260145994857e-06, "loss": 0.0112, "step": 3203 }, { "epoch": 0.6364087794219883, "grad_norm": 0.6494964040575566, "learning_rate": 3.0843536061057378e-06, "loss": 0.0121, "step": 3204 }, { "epoch": 0.6366074088787367, "grad_norm": 0.6702929678649525, "learning_rate": 3.081381990879243e-06, "loss": 0.0129, "step": 3205 }, { "epoch": 0.6368060383354851, "grad_norm": 0.6836348848424111, "learning_rate": 3.078411170150545e-06, "loss": 0.0116, "step": 3206 }, { "epoch": 0.6370046677922336, "grad_norm": 0.393253021137455, "learning_rate": 3.0754411451498557e-06, "loss": 0.0145, "step": 3207 }, { "epoch": 0.6372032972489821, "grad_norm": 1.4739247811366072, "learning_rate": 3.0724719171070615e-06, "loss": 0.0185, "step": 3208 }, { "epoch": 0.6374019267057305, "grad_norm": 0.4801500029884138, "learning_rate": 3.0695034872517166e-06, "loss": 0.0075, "step": 3209 }, { "epoch": 0.6376005561624789, "grad_norm": 0.7278188114507573, "learning_rate": 3.066535856813044e-06, "loss": 0.0134, "step": 3210 }, { "epoch": 0.6377991856192273, "grad_norm": 1.2241754299310377, "learning_rate": 3.063569027019936e-06, "loss": 0.0163, "step": 3211 }, { "epoch": 0.6379978150759757, "grad_norm": 0.2911942180277458, "learning_rate": 3.0606029991009557e-06, "loss": 0.0116, "step": 3212 }, { "epoch": 0.6381964445327242, "grad_norm": 0.19274138226813456, "learning_rate": 3.057637774284331e-06, "loss": 0.0055, "step": 3213 }, { "epoch": 0.6383950739894726, "grad_norm": 0.37853841414052497, "learning_rate": 3.0546733537979588e-06, "loss": 0.0141, "step": 3214 }, { "epoch": 0.6385937034462211, "grad_norm": 0.3776259956184919, "learning_rate": 3.051709738869403e-06, "loss": 0.0083, "step": 3215 }, { "epoch": 0.6387923329029696, "grad_norm": 0.560635040612453, "learning_rate": 3.048746930725893e-06, "loss": 0.011, "step": 3216 }, { "epoch": 0.638990962359718, "grad_norm": 0.48130658527669296, "learning_rate": 3.0457849305943256e-06, "loss": 0.0118, "step": 3217 }, { "epoch": 0.6391895918164664, "grad_norm": 0.4771547132379289, "learning_rate": 3.04282373970126e-06, "loss": 0.0104, "step": 3218 }, { "epoch": 0.6393882212732148, "grad_norm": 0.3460116166064898, "learning_rate": 3.0398633592729243e-06, "loss": 0.0071, "step": 3219 }, { "epoch": 0.6395868507299632, "grad_norm": 0.5824392990720808, "learning_rate": 3.0369037905352093e-06, "loss": 0.0088, "step": 3220 }, { "epoch": 0.6397854801867117, "grad_norm": 0.20323244555014405, "learning_rate": 3.033945034713669e-06, "loss": 0.0064, "step": 3221 }, { "epoch": 0.6399841096434601, "grad_norm": 0.638442739435459, "learning_rate": 3.0309870930335204e-06, "loss": 0.0137, "step": 3222 }, { "epoch": 0.6401827391002085, "grad_norm": 0.77301437107863, "learning_rate": 3.0280299667196444e-06, "loss": 0.0087, "step": 3223 }, { "epoch": 0.640381368556957, "grad_norm": 0.7822928728867742, "learning_rate": 3.0250736569965857e-06, "loss": 0.0181, "step": 3224 }, { "epoch": 0.6405799980137055, "grad_norm": 0.33540923186909954, "learning_rate": 3.0221181650885454e-06, "loss": 0.0094, "step": 3225 }, { "epoch": 0.6407786274704539, "grad_norm": 0.3931991439665449, "learning_rate": 3.0191634922193946e-06, "loss": 0.014, "step": 3226 }, { "epoch": 0.6409772569272023, "grad_norm": 0.6751895750801706, "learning_rate": 3.016209639612657e-06, "loss": 0.0127, "step": 3227 }, { "epoch": 0.6411758863839507, "grad_norm": 0.5026416514430642, "learning_rate": 3.0132566084915236e-06, "loss": 0.0133, "step": 3228 }, { "epoch": 0.6413745158406992, "grad_norm": 0.8976233584442671, "learning_rate": 3.0103044000788356e-06, "loss": 0.0139, "step": 3229 }, { "epoch": 0.6415731452974476, "grad_norm": 0.7798871995666934, "learning_rate": 3.007353015597104e-06, "loss": 0.0134, "step": 3230 }, { "epoch": 0.641771774754196, "grad_norm": 0.5380549093775145, "learning_rate": 3.0044024562684938e-06, "loss": 0.0139, "step": 3231 }, { "epoch": 0.6419704042109445, "grad_norm": 0.3154555145345817, "learning_rate": 3.001452723314827e-06, "loss": 0.0102, "step": 3232 }, { "epoch": 0.642169033667693, "grad_norm": 0.3340853456189174, "learning_rate": 2.998503817957587e-06, "loss": 0.0058, "step": 3233 }, { "epoch": 0.6423676631244414, "grad_norm": 0.2936801248452502, "learning_rate": 2.9955557414179117e-06, "loss": 0.0043, "step": 3234 }, { "epoch": 0.6425662925811898, "grad_norm": 0.7864359116686, "learning_rate": 2.9926084949165956e-06, "loss": 0.0189, "step": 3235 }, { "epoch": 0.6427649220379382, "grad_norm": 0.7167406061383401, "learning_rate": 2.989662079674092e-06, "loss": 0.0102, "step": 3236 }, { "epoch": 0.6429635514946866, "grad_norm": 0.3351720288278042, "learning_rate": 2.9867164969105073e-06, "loss": 0.0062, "step": 3237 }, { "epoch": 0.6431621809514351, "grad_norm": 0.40908005659181024, "learning_rate": 2.983771747845606e-06, "loss": 0.0111, "step": 3238 }, { "epoch": 0.6433608104081835, "grad_norm": 0.3277290055237293, "learning_rate": 2.9808278336988043e-06, "loss": 0.0073, "step": 3239 }, { "epoch": 0.6435594398649319, "grad_norm": 0.5048253802977973, "learning_rate": 2.9778847556891754e-06, "loss": 0.0077, "step": 3240 }, { "epoch": 0.6437580693216804, "grad_norm": 0.3405168121598885, "learning_rate": 2.974942515035444e-06, "loss": 0.0135, "step": 3241 }, { "epoch": 0.6439566987784289, "grad_norm": 0.6104144042070537, "learning_rate": 2.972001112955989e-06, "loss": 0.0141, "step": 3242 }, { "epoch": 0.6441553282351773, "grad_norm": 0.3216837585137305, "learning_rate": 2.969060550668841e-06, "loss": 0.0045, "step": 3243 }, { "epoch": 0.6443539576919257, "grad_norm": 0.4861048452267066, "learning_rate": 2.966120829391686e-06, "loss": 0.0073, "step": 3244 }, { "epoch": 0.6445525871486741, "grad_norm": 0.4499553189439919, "learning_rate": 2.963181950341859e-06, "loss": 0.0089, "step": 3245 }, { "epoch": 0.6447512166054226, "grad_norm": 0.435117209252951, "learning_rate": 2.9602439147363472e-06, "loss": 0.0101, "step": 3246 }, { "epoch": 0.644949846062171, "grad_norm": 0.5927891171510099, "learning_rate": 2.957306723791787e-06, "loss": 0.0067, "step": 3247 }, { "epoch": 0.6451484755189194, "grad_norm": 0.37326850014800456, "learning_rate": 2.9543703787244672e-06, "loss": 0.0064, "step": 3248 }, { "epoch": 0.6453471049756679, "grad_norm": 0.5716478987296174, "learning_rate": 2.9514348807503248e-06, "loss": 0.0076, "step": 3249 }, { "epoch": 0.6455457344324164, "grad_norm": 0.5615102052694282, "learning_rate": 2.9485002310849454e-06, "loss": 0.0106, "step": 3250 }, { "epoch": 0.6457443638891648, "grad_norm": 0.47977193647612537, "learning_rate": 2.9455664309435674e-06, "loss": 0.0085, "step": 3251 }, { "epoch": 0.6459429933459132, "grad_norm": 0.6327506375262189, "learning_rate": 2.942633481541075e-06, "loss": 0.0138, "step": 3252 }, { "epoch": 0.6461416228026616, "grad_norm": 0.558340339517197, "learning_rate": 2.9397013840919953e-06, "loss": 0.0085, "step": 3253 }, { "epoch": 0.64634025225941, "grad_norm": 0.6265421103163499, "learning_rate": 2.9367701398105087e-06, "loss": 0.0088, "step": 3254 }, { "epoch": 0.6465388817161585, "grad_norm": 0.6651718817702527, "learning_rate": 2.933839749910442e-06, "loss": 0.011, "step": 3255 }, { "epoch": 0.6467375111729069, "grad_norm": 0.7677766935582281, "learning_rate": 2.930910215605265e-06, "loss": 0.0126, "step": 3256 }, { "epoch": 0.6469361406296554, "grad_norm": 0.6946854991421672, "learning_rate": 2.9279815381080966e-06, "loss": 0.018, "step": 3257 }, { "epoch": 0.6471347700864039, "grad_norm": 0.9714783676460519, "learning_rate": 2.9250537186316975e-06, "loss": 0.0204, "step": 3258 }, { "epoch": 0.6473333995431523, "grad_norm": 0.5991115101907825, "learning_rate": 2.9221267583884762e-06, "loss": 0.0127, "step": 3259 }, { "epoch": 0.6475320289999007, "grad_norm": 0.5265824803297029, "learning_rate": 2.919200658590483e-06, "loss": 0.0098, "step": 3260 }, { "epoch": 0.6477306584566491, "grad_norm": 0.4157857124333679, "learning_rate": 2.9162754204494125e-06, "loss": 0.0074, "step": 3261 }, { "epoch": 0.6479292879133975, "grad_norm": 0.5129766838144425, "learning_rate": 2.913351045176606e-06, "loss": 0.0135, "step": 3262 }, { "epoch": 0.648127917370146, "grad_norm": 0.614863644307128, "learning_rate": 2.91042753398304e-06, "loss": 0.0082, "step": 3263 }, { "epoch": 0.6483265468268944, "grad_norm": 0.7205082964279731, "learning_rate": 2.9075048880793395e-06, "loss": 0.012, "step": 3264 }, { "epoch": 0.6485251762836428, "grad_norm": 0.6148645605362121, "learning_rate": 2.9045831086757716e-06, "loss": 0.0157, "step": 3265 }, { "epoch": 0.6487238057403913, "grad_norm": 1.7983560604746254, "learning_rate": 2.9016621969822374e-06, "loss": 0.0122, "step": 3266 }, { "epoch": 0.6489224351971398, "grad_norm": 0.219136851702214, "learning_rate": 2.8987421542082885e-06, "loss": 0.0034, "step": 3267 }, { "epoch": 0.6491210646538882, "grad_norm": 0.6386998265055096, "learning_rate": 2.8958229815631068e-06, "loss": 0.0098, "step": 3268 }, { "epoch": 0.6493196941106366, "grad_norm": 0.5134293758082206, "learning_rate": 2.892904680255524e-06, "loss": 0.012, "step": 3269 }, { "epoch": 0.649518323567385, "grad_norm": 0.5345560851450427, "learning_rate": 2.889987251494e-06, "loss": 0.0116, "step": 3270 }, { "epoch": 0.6497169530241335, "grad_norm": 0.7400191306883878, "learning_rate": 2.8870706964866436e-06, "loss": 0.0143, "step": 3271 }, { "epoch": 0.6499155824808819, "grad_norm": 0.4635388451227108, "learning_rate": 2.8841550164411967e-06, "loss": 0.0131, "step": 3272 }, { "epoch": 0.6501142119376303, "grad_norm": 0.5109709585210221, "learning_rate": 2.881240212565037e-06, "loss": 0.0051, "step": 3273 }, { "epoch": 0.6503128413943788, "grad_norm": 0.7108773032847435, "learning_rate": 2.878326286065185e-06, "loss": 0.0139, "step": 3274 }, { "epoch": 0.6505114708511273, "grad_norm": 0.6626555461496284, "learning_rate": 2.8754132381482926e-06, "loss": 0.0119, "step": 3275 }, { "epoch": 0.6507101003078757, "grad_norm": 0.5269547367333896, "learning_rate": 2.8725010700206514e-06, "loss": 0.0145, "step": 3276 }, { "epoch": 0.6509087297646241, "grad_norm": 0.48445706983464565, "learning_rate": 2.869589782888187e-06, "loss": 0.0069, "step": 3277 }, { "epoch": 0.6511073592213725, "grad_norm": 0.39715086701308067, "learning_rate": 2.866679377956458e-06, "loss": 0.0107, "step": 3278 }, { "epoch": 0.651305988678121, "grad_norm": 0.6713147395317994, "learning_rate": 2.8637698564306637e-06, "loss": 0.0145, "step": 3279 }, { "epoch": 0.6515046181348694, "grad_norm": 0.6460254311609365, "learning_rate": 2.8608612195156318e-06, "loss": 0.022, "step": 3280 }, { "epoch": 0.6517032475916178, "grad_norm": 0.689742162700817, "learning_rate": 2.8579534684158277e-06, "loss": 0.0121, "step": 3281 }, { "epoch": 0.6519018770483662, "grad_norm": 0.6959754949888912, "learning_rate": 2.8550466043353453e-06, "loss": 0.0175, "step": 3282 }, { "epoch": 0.6521005065051148, "grad_norm": 0.47915768218904303, "learning_rate": 2.852140628477916e-06, "loss": 0.0153, "step": 3283 }, { "epoch": 0.6522991359618632, "grad_norm": 0.6904735505096031, "learning_rate": 2.849235542046904e-06, "loss": 0.0146, "step": 3284 }, { "epoch": 0.6524977654186116, "grad_norm": 0.8025885992496349, "learning_rate": 2.846331346245298e-06, "loss": 0.0239, "step": 3285 }, { "epoch": 0.65269639487536, "grad_norm": 0.619059088683545, "learning_rate": 2.843428042275727e-06, "loss": 0.0113, "step": 3286 }, { "epoch": 0.6528950243321084, "grad_norm": 0.8459065891949454, "learning_rate": 2.8405256313404417e-06, "loss": 0.0184, "step": 3287 }, { "epoch": 0.6530936537888569, "grad_norm": 0.5453957141251398, "learning_rate": 2.8376241146413324e-06, "loss": 0.0132, "step": 3288 }, { "epoch": 0.6532922832456053, "grad_norm": 0.377901914882823, "learning_rate": 2.8347234933799097e-06, "loss": 0.0104, "step": 3289 }, { "epoch": 0.6534909127023537, "grad_norm": 0.38612479544338135, "learning_rate": 2.831823768757319e-06, "loss": 0.0108, "step": 3290 }, { "epoch": 0.6536895421591022, "grad_norm": 0.7541376084135651, "learning_rate": 2.8289249419743376e-06, "loss": 0.0134, "step": 3291 }, { "epoch": 0.6538881716158507, "grad_norm": 0.22674881778038922, "learning_rate": 2.826027014231361e-06, "loss": 0.0112, "step": 3292 }, { "epoch": 0.6540868010725991, "grad_norm": 0.5466667057789234, "learning_rate": 2.8231299867284228e-06, "loss": 0.0126, "step": 3293 }, { "epoch": 0.6542854305293475, "grad_norm": 0.5346884562381211, "learning_rate": 2.820233860665175e-06, "loss": 0.0085, "step": 3294 }, { "epoch": 0.6544840599860959, "grad_norm": 0.3347131133980114, "learning_rate": 2.817338637240905e-06, "loss": 0.0091, "step": 3295 }, { "epoch": 0.6546826894428444, "grad_norm": 0.6898205612146502, "learning_rate": 2.814444317654518e-06, "loss": 0.01, "step": 3296 }, { "epoch": 0.6548813188995928, "grad_norm": 0.6113056037292642, "learning_rate": 2.811550903104549e-06, "loss": 0.0135, "step": 3297 }, { "epoch": 0.6550799483563412, "grad_norm": 0.24086210100341407, "learning_rate": 2.8086583947891623e-06, "loss": 0.0098, "step": 3298 }, { "epoch": 0.6552785778130897, "grad_norm": 0.29528694256953764, "learning_rate": 2.8057667939061394e-06, "loss": 0.0091, "step": 3299 }, { "epoch": 0.6554772072698382, "grad_norm": 0.32915813162257146, "learning_rate": 2.8028761016528882e-06, "loss": 0.0112, "step": 3300 }, { "epoch": 0.6556758367265866, "grad_norm": 0.5845599531100432, "learning_rate": 2.7999863192264453e-06, "loss": 0.0084, "step": 3301 }, { "epoch": 0.655874466183335, "grad_norm": 0.5157143599088475, "learning_rate": 2.7970974478234626e-06, "loss": 0.0081, "step": 3302 }, { "epoch": 0.6560730956400834, "grad_norm": 0.21072408406022408, "learning_rate": 2.7942094886402214e-06, "loss": 0.0057, "step": 3303 }, { "epoch": 0.6562717250968318, "grad_norm": 0.4446472530319208, "learning_rate": 2.7913224428726215e-06, "loss": 0.0109, "step": 3304 }, { "epoch": 0.6564703545535803, "grad_norm": 0.7754716523493385, "learning_rate": 2.788436311716187e-06, "loss": 0.0217, "step": 3305 }, { "epoch": 0.6566689840103287, "grad_norm": 0.3370976421618834, "learning_rate": 2.78555109636606e-06, "loss": 0.01, "step": 3306 }, { "epoch": 0.6568676134670771, "grad_norm": 0.5079691302186063, "learning_rate": 2.7826667980170064e-06, "loss": 0.0109, "step": 3307 }, { "epoch": 0.6570662429238256, "grad_norm": 0.49334190860082194, "learning_rate": 2.7797834178634124e-06, "loss": 0.0094, "step": 3308 }, { "epoch": 0.6572648723805741, "grad_norm": 0.5546559533772156, "learning_rate": 2.77690095709928e-06, "loss": 0.0143, "step": 3309 }, { "epoch": 0.6574635018373225, "grad_norm": 0.4708980828292352, "learning_rate": 2.7740194169182377e-06, "loss": 0.0122, "step": 3310 }, { "epoch": 0.6576621312940709, "grad_norm": 0.7391696704500784, "learning_rate": 2.771138798513523e-06, "loss": 0.0113, "step": 3311 }, { "epoch": 0.6578607607508193, "grad_norm": 0.7253819426997582, "learning_rate": 2.7682591030780014e-06, "loss": 0.0114, "step": 3312 }, { "epoch": 0.6580593902075678, "grad_norm": 0.6177374318507232, "learning_rate": 2.7653803318041495e-06, "loss": 0.0143, "step": 3313 }, { "epoch": 0.6582580196643162, "grad_norm": 0.43170689684000746, "learning_rate": 2.7625024858840634e-06, "loss": 0.0082, "step": 3314 }, { "epoch": 0.6584566491210646, "grad_norm": 0.4260173502271494, "learning_rate": 2.7596255665094594e-06, "loss": 0.0125, "step": 3315 }, { "epoch": 0.6586552785778131, "grad_norm": 0.40502002471903215, "learning_rate": 2.7567495748716632e-06, "loss": 0.0127, "step": 3316 }, { "epoch": 0.6588539080345616, "grad_norm": 0.523810770366039, "learning_rate": 2.7538745121616235e-06, "loss": 0.0111, "step": 3317 }, { "epoch": 0.65905253749131, "grad_norm": 1.0090297301535212, "learning_rate": 2.751000379569897e-06, "loss": 0.0136, "step": 3318 }, { "epoch": 0.6592511669480584, "grad_norm": 0.45840798674019173, "learning_rate": 2.7481271782866623e-06, "loss": 0.0045, "step": 3319 }, { "epoch": 0.6594497964048068, "grad_norm": 0.7665200456822501, "learning_rate": 2.7452549095017065e-06, "loss": 0.0171, "step": 3320 }, { "epoch": 0.6596484258615553, "grad_norm": 0.4338808266126319, "learning_rate": 2.7423835744044346e-06, "loss": 0.0074, "step": 3321 }, { "epoch": 0.6598470553183037, "grad_norm": 0.36880213167051384, "learning_rate": 2.7395131741838666e-06, "loss": 0.006, "step": 3322 }, { "epoch": 0.6600456847750521, "grad_norm": 0.3898475854075473, "learning_rate": 2.7366437100286286e-06, "loss": 0.0088, "step": 3323 }, { "epoch": 0.6602443142318005, "grad_norm": 0.5107579237424404, "learning_rate": 2.7337751831269637e-06, "loss": 0.0132, "step": 3324 }, { "epoch": 0.6604429436885491, "grad_norm": 0.40112286710842526, "learning_rate": 2.730907594666724e-06, "loss": 0.0077, "step": 3325 }, { "epoch": 0.6606415731452975, "grad_norm": 0.5870623533419401, "learning_rate": 2.7280409458353775e-06, "loss": 0.0133, "step": 3326 }, { "epoch": 0.6608402026020459, "grad_norm": 0.32740499713756716, "learning_rate": 2.725175237820002e-06, "loss": 0.0103, "step": 3327 }, { "epoch": 0.6610388320587943, "grad_norm": 0.4982828855799693, "learning_rate": 2.72231047180728e-06, "loss": 0.0075, "step": 3328 }, { "epoch": 0.6612374615155427, "grad_norm": 0.42218049160104415, "learning_rate": 2.7194466489835132e-06, "loss": 0.0128, "step": 3329 }, { "epoch": 0.6614360909722912, "grad_norm": 0.6155184301745691, "learning_rate": 2.7165837705346033e-06, "loss": 0.0155, "step": 3330 }, { "epoch": 0.6616347204290396, "grad_norm": 0.6852751161684462, "learning_rate": 2.7137218376460683e-06, "loss": 0.0093, "step": 3331 }, { "epoch": 0.661833349885788, "grad_norm": 0.3782884028083007, "learning_rate": 2.7108608515030297e-06, "loss": 0.0086, "step": 3332 }, { "epoch": 0.6620319793425365, "grad_norm": 0.4146005267325709, "learning_rate": 2.70800081329022e-06, "loss": 0.0101, "step": 3333 }, { "epoch": 0.662230608799285, "grad_norm": 0.36972782718978237, "learning_rate": 2.7051417241919808e-06, "loss": 0.0096, "step": 3334 }, { "epoch": 0.6624292382560334, "grad_norm": 0.6516852569514466, "learning_rate": 2.7022835853922525e-06, "loss": 0.0114, "step": 3335 }, { "epoch": 0.6626278677127818, "grad_norm": 0.29398406434321445, "learning_rate": 2.699426398074593e-06, "loss": 0.005, "step": 3336 }, { "epoch": 0.6628264971695302, "grad_norm": 0.3956847791060924, "learning_rate": 2.6965701634221566e-06, "loss": 0.0093, "step": 3337 }, { "epoch": 0.6630251266262787, "grad_norm": 0.688077347911631, "learning_rate": 2.6937148826177095e-06, "loss": 0.0154, "step": 3338 }, { "epoch": 0.6632237560830271, "grad_norm": 0.5079820743351281, "learning_rate": 2.690860556843619e-06, "loss": 0.0105, "step": 3339 }, { "epoch": 0.6634223855397755, "grad_norm": 0.4440992131204025, "learning_rate": 2.688007187281859e-06, "loss": 0.0102, "step": 3340 }, { "epoch": 0.663621014996524, "grad_norm": 0.49214330396861955, "learning_rate": 2.68515477511401e-06, "loss": 0.0118, "step": 3341 }, { "epoch": 0.6638196444532725, "grad_norm": 1.0900456782512908, "learning_rate": 2.6823033215212478e-06, "loss": 0.022, "step": 3342 }, { "epoch": 0.6640182739100209, "grad_norm": 0.5381851320515335, "learning_rate": 2.679452827684362e-06, "loss": 0.0101, "step": 3343 }, { "epoch": 0.6642169033667693, "grad_norm": 0.4016766272309056, "learning_rate": 2.676603294783734e-06, "loss": 0.0087, "step": 3344 }, { "epoch": 0.6644155328235177, "grad_norm": 0.6652421012666038, "learning_rate": 2.6737547239993565e-06, "loss": 0.0186, "step": 3345 }, { "epoch": 0.6646141622802662, "grad_norm": 0.6608874683072442, "learning_rate": 2.670907116510817e-06, "loss": 0.0132, "step": 3346 }, { "epoch": 0.6648127917370146, "grad_norm": 0.47308686328108424, "learning_rate": 2.668060473497309e-06, "loss": 0.0075, "step": 3347 }, { "epoch": 0.665011421193763, "grad_norm": 0.5810128092238643, "learning_rate": 2.6652147961376253e-06, "loss": 0.0138, "step": 3348 }, { "epoch": 0.6652100506505114, "grad_norm": 0.535033001330009, "learning_rate": 2.662370085610153e-06, "loss": 0.0094, "step": 3349 }, { "epoch": 0.66540868010726, "grad_norm": 0.3123641456923419, "learning_rate": 2.6595263430928874e-06, "loss": 0.0086, "step": 3350 }, { "epoch": 0.6656073095640084, "grad_norm": 0.8214580684647604, "learning_rate": 2.656683569763422e-06, "loss": 0.0191, "step": 3351 }, { "epoch": 0.6658059390207568, "grad_norm": 0.8066452608717822, "learning_rate": 2.6538417667989414e-06, "loss": 0.0159, "step": 3352 }, { "epoch": 0.6660045684775052, "grad_norm": 0.41875862549545895, "learning_rate": 2.651000935376238e-06, "loss": 0.0066, "step": 3353 }, { "epoch": 0.6662031979342536, "grad_norm": 0.6547577856421022, "learning_rate": 2.6481610766716926e-06, "loss": 0.0115, "step": 3354 }, { "epoch": 0.6664018273910021, "grad_norm": 0.6959330954668181, "learning_rate": 2.6453221918612915e-06, "loss": 0.014, "step": 3355 }, { "epoch": 0.6666004568477505, "grad_norm": 0.5030885709048069, "learning_rate": 2.642484282120612e-06, "loss": 0.0095, "step": 3356 }, { "epoch": 0.6667990863044989, "grad_norm": 0.4719568465097182, "learning_rate": 2.63964734862483e-06, "loss": 0.014, "step": 3357 }, { "epoch": 0.6669977157612474, "grad_norm": 0.6796444534634053, "learning_rate": 2.6368113925487182e-06, "loss": 0.0091, "step": 3358 }, { "epoch": 0.6671963452179959, "grad_norm": 0.6346635781019123, "learning_rate": 2.6339764150666414e-06, "loss": 0.0174, "step": 3359 }, { "epoch": 0.6673949746747443, "grad_norm": 0.5601756578875082, "learning_rate": 2.6311424173525636e-06, "loss": 0.0081, "step": 3360 }, { "epoch": 0.6675936041314927, "grad_norm": 0.4608080766660191, "learning_rate": 2.628309400580036e-06, "loss": 0.0105, "step": 3361 }, { "epoch": 0.6677922335882411, "grad_norm": 0.8768255616150576, "learning_rate": 2.6254773659222123e-06, "loss": 0.0146, "step": 3362 }, { "epoch": 0.6679908630449896, "grad_norm": 0.5125903666857362, "learning_rate": 2.622646314551832e-06, "loss": 0.017, "step": 3363 }, { "epoch": 0.668189492501738, "grad_norm": 0.4250565635417071, "learning_rate": 2.6198162476412324e-06, "loss": 0.0168, "step": 3364 }, { "epoch": 0.6683881219584864, "grad_norm": 0.5993238921833851, "learning_rate": 2.6169871663623424e-06, "loss": 0.0079, "step": 3365 }, { "epoch": 0.6685867514152348, "grad_norm": 0.4691003256708194, "learning_rate": 2.614159071886679e-06, "loss": 0.0194, "step": 3366 }, { "epoch": 0.6687853808719834, "grad_norm": 0.34795435504024236, "learning_rate": 2.6113319653853565e-06, "loss": 0.01, "step": 3367 }, { "epoch": 0.6689840103287318, "grad_norm": 0.7334349474214593, "learning_rate": 2.6085058480290724e-06, "loss": 0.0168, "step": 3368 }, { "epoch": 0.6691826397854802, "grad_norm": 0.3782004612022668, "learning_rate": 2.6056807209881247e-06, "loss": 0.0097, "step": 3369 }, { "epoch": 0.6693812692422286, "grad_norm": 0.6207564025203681, "learning_rate": 2.6028565854323905e-06, "loss": 0.0145, "step": 3370 }, { "epoch": 0.669579898698977, "grad_norm": 0.3664793557426668, "learning_rate": 2.6000334425313455e-06, "loss": 0.0094, "step": 3371 }, { "epoch": 0.6697785281557255, "grad_norm": 0.7947514445887053, "learning_rate": 2.5972112934540483e-06, "loss": 0.0137, "step": 3372 }, { "epoch": 0.6699771576124739, "grad_norm": 0.5244578943642776, "learning_rate": 2.5943901393691463e-06, "loss": 0.0146, "step": 3373 }, { "epoch": 0.6701757870692223, "grad_norm": 0.31429941997048494, "learning_rate": 2.5915699814448813e-06, "loss": 0.0084, "step": 3374 }, { "epoch": 0.6703744165259709, "grad_norm": 0.36243016062186306, "learning_rate": 2.5887508208490726e-06, "loss": 0.0089, "step": 3375 }, { "epoch": 0.6705730459827193, "grad_norm": 0.5468920277671807, "learning_rate": 2.585932658749134e-06, "loss": 0.0128, "step": 3376 }, { "epoch": 0.6707716754394677, "grad_norm": 0.4530603705172164, "learning_rate": 2.583115496312066e-06, "loss": 0.0095, "step": 3377 }, { "epoch": 0.6709703048962161, "grad_norm": 0.6766180998941852, "learning_rate": 2.5802993347044494e-06, "loss": 0.0111, "step": 3378 }, { "epoch": 0.6711689343529645, "grad_norm": 0.34837311844564506, "learning_rate": 2.5774841750924564e-06, "loss": 0.0118, "step": 3379 }, { "epoch": 0.671367563809713, "grad_norm": 0.4237238100703997, "learning_rate": 2.5746700186418388e-06, "loss": 0.0094, "step": 3380 }, { "epoch": 0.6715661932664614, "grad_norm": 0.2705398980966825, "learning_rate": 2.5718568665179394e-06, "loss": 0.0094, "step": 3381 }, { "epoch": 0.6717648227232098, "grad_norm": 0.7486608005177356, "learning_rate": 2.5690447198856784e-06, "loss": 0.0155, "step": 3382 }, { "epoch": 0.6719634521799583, "grad_norm": 0.38166376147207953, "learning_rate": 2.566233579909564e-06, "loss": 0.0082, "step": 3383 }, { "epoch": 0.6721620816367068, "grad_norm": 0.7618562250325333, "learning_rate": 2.5634234477536894e-06, "loss": 0.0117, "step": 3384 }, { "epoch": 0.6723607110934552, "grad_norm": 0.4237062506104705, "learning_rate": 2.5606143245817227e-06, "loss": 0.0159, "step": 3385 }, { "epoch": 0.6725593405502036, "grad_norm": 0.4786823768082389, "learning_rate": 2.5578062115569234e-06, "loss": 0.0133, "step": 3386 }, { "epoch": 0.672757970006952, "grad_norm": 0.9433799903017728, "learning_rate": 2.5549991098421245e-06, "loss": 0.0195, "step": 3387 }, { "epoch": 0.6729565994637005, "grad_norm": 0.3981282726321889, "learning_rate": 2.5521930205997476e-06, "loss": 0.0109, "step": 3388 }, { "epoch": 0.6731552289204489, "grad_norm": 0.5626283367420652, "learning_rate": 2.5493879449917875e-06, "loss": 0.014, "step": 3389 }, { "epoch": 0.6733538583771973, "grad_norm": 1.4117855646706072, "learning_rate": 2.546583884179825e-06, "loss": 0.0172, "step": 3390 }, { "epoch": 0.6735524878339457, "grad_norm": 0.6383343978861289, "learning_rate": 2.543780839325022e-06, "loss": 0.0106, "step": 3391 }, { "epoch": 0.6737511172906943, "grad_norm": 0.7138217958976034, "learning_rate": 2.540978811588111e-06, "loss": 0.0106, "step": 3392 }, { "epoch": 0.6739497467474427, "grad_norm": 0.34623408436167746, "learning_rate": 2.5381778021294133e-06, "loss": 0.0091, "step": 3393 }, { "epoch": 0.6741483762041911, "grad_norm": 0.8557016580449002, "learning_rate": 2.535377812108821e-06, "loss": 0.0151, "step": 3394 }, { "epoch": 0.6743470056609395, "grad_norm": 0.24818536799482643, "learning_rate": 2.5325788426858106e-06, "loss": 0.0073, "step": 3395 }, { "epoch": 0.674545635117688, "grad_norm": 0.31460883387664695, "learning_rate": 2.5297808950194303e-06, "loss": 0.0092, "step": 3396 }, { "epoch": 0.6747442645744364, "grad_norm": 0.9971267660879034, "learning_rate": 2.526983970268305e-06, "loss": 0.0162, "step": 3397 }, { "epoch": 0.6749428940311848, "grad_norm": 0.9119079346987756, "learning_rate": 2.524188069590644e-06, "loss": 0.0153, "step": 3398 }, { "epoch": 0.6751415234879332, "grad_norm": 0.7270396119139402, "learning_rate": 2.521393194144222e-06, "loss": 0.0131, "step": 3399 }, { "epoch": 0.6753401529446817, "grad_norm": 0.8376328773880279, "learning_rate": 2.518599345086396e-06, "loss": 0.0145, "step": 3400 }, { "epoch": 0.6755387824014302, "grad_norm": 0.5909829299184999, "learning_rate": 2.515806523574098e-06, "loss": 0.0141, "step": 3401 }, { "epoch": 0.6757374118581786, "grad_norm": 0.7347277470642827, "learning_rate": 2.513014730763829e-06, "loss": 0.0136, "step": 3402 }, { "epoch": 0.675936041314927, "grad_norm": 0.5930317321813116, "learning_rate": 2.5102239678116714e-06, "loss": 0.0177, "step": 3403 }, { "epoch": 0.6761346707716754, "grad_norm": 0.3929167896627322, "learning_rate": 2.507434235873274e-06, "loss": 0.0087, "step": 3404 }, { "epoch": 0.6763333002284239, "grad_norm": 0.33640967171465747, "learning_rate": 2.5046455361038646e-06, "loss": 0.0066, "step": 3405 }, { "epoch": 0.6765319296851723, "grad_norm": 0.32477500635650375, "learning_rate": 2.5018578696582387e-06, "loss": 0.0108, "step": 3406 }, { "epoch": 0.6767305591419207, "grad_norm": 0.6222872814600147, "learning_rate": 2.499071237690766e-06, "loss": 0.0153, "step": 3407 }, { "epoch": 0.6769291885986691, "grad_norm": 0.3552835031167747, "learning_rate": 2.496285641355392e-06, "loss": 0.0054, "step": 3408 }, { "epoch": 0.6771278180554177, "grad_norm": 0.4119535077595573, "learning_rate": 2.493501081805625e-06, "loss": 0.0088, "step": 3409 }, { "epoch": 0.6773264475121661, "grad_norm": 0.6664231184464069, "learning_rate": 2.490717560194551e-06, "loss": 0.0107, "step": 3410 }, { "epoch": 0.6775250769689145, "grad_norm": 0.6170792412387821, "learning_rate": 2.4879350776748214e-06, "loss": 0.013, "step": 3411 }, { "epoch": 0.6777237064256629, "grad_norm": 0.44050710440567764, "learning_rate": 2.485153635398662e-06, "loss": 0.009, "step": 3412 }, { "epoch": 0.6779223358824114, "grad_norm": 1.1119697405780344, "learning_rate": 2.482373234517862e-06, "loss": 0.0136, "step": 3413 }, { "epoch": 0.6781209653391598, "grad_norm": 1.0238357497490542, "learning_rate": 2.479593876183784e-06, "loss": 0.0126, "step": 3414 }, { "epoch": 0.6783195947959082, "grad_norm": 0.46374993699075306, "learning_rate": 2.47681556154736e-06, "loss": 0.014, "step": 3415 }, { "epoch": 0.6785182242526566, "grad_norm": 0.7825050148026104, "learning_rate": 2.4740382917590834e-06, "loss": 0.0149, "step": 3416 }, { "epoch": 0.6787168537094052, "grad_norm": 0.45223615016890584, "learning_rate": 2.4712620679690223e-06, "loss": 0.0082, "step": 3417 }, { "epoch": 0.6789154831661536, "grad_norm": 0.5359050703153959, "learning_rate": 2.468486891326805e-06, "loss": 0.0103, "step": 3418 }, { "epoch": 0.679114112622902, "grad_norm": 0.6388123076656673, "learning_rate": 2.4657127629816323e-06, "loss": 0.0128, "step": 3419 }, { "epoch": 0.6793127420796504, "grad_norm": 0.624704597446212, "learning_rate": 2.4629396840822665e-06, "loss": 0.0112, "step": 3420 }, { "epoch": 0.6795113715363988, "grad_norm": 0.8921444577470375, "learning_rate": 2.4601676557770353e-06, "loss": 0.0108, "step": 3421 }, { "epoch": 0.6797100009931473, "grad_norm": 0.4586739844639518, "learning_rate": 2.4573966792138363e-06, "loss": 0.0054, "step": 3422 }, { "epoch": 0.6799086304498957, "grad_norm": 0.3515731328172263, "learning_rate": 2.4546267555401245e-06, "loss": 0.0108, "step": 3423 }, { "epoch": 0.6801072599066441, "grad_norm": 0.5037818639650333, "learning_rate": 2.451857885902926e-06, "loss": 0.0144, "step": 3424 }, { "epoch": 0.6803058893633926, "grad_norm": 0.7695438647313355, "learning_rate": 2.449090071448823e-06, "loss": 0.0078, "step": 3425 }, { "epoch": 0.6805045188201411, "grad_norm": 1.0299643704038546, "learning_rate": 2.4463233133239668e-06, "loss": 0.0113, "step": 3426 }, { "epoch": 0.6807031482768895, "grad_norm": 0.43828329932828497, "learning_rate": 2.443557612674071e-06, "loss": 0.0075, "step": 3427 }, { "epoch": 0.6809017777336379, "grad_norm": 0.35986529838291326, "learning_rate": 2.4407929706444067e-06, "loss": 0.0091, "step": 3428 }, { "epoch": 0.6811004071903863, "grad_norm": 0.5825951519865635, "learning_rate": 2.4380293883798118e-06, "loss": 0.0132, "step": 3429 }, { "epoch": 0.6812990366471348, "grad_norm": 0.7054553585094656, "learning_rate": 2.435266867024679e-06, "loss": 0.0132, "step": 3430 }, { "epoch": 0.6814976661038832, "grad_norm": 0.5757263663466883, "learning_rate": 2.432505407722971e-06, "loss": 0.0135, "step": 3431 }, { "epoch": 0.6816962955606316, "grad_norm": 0.5381046623539257, "learning_rate": 2.4297450116182003e-06, "loss": 0.0116, "step": 3432 }, { "epoch": 0.68189492501738, "grad_norm": 0.20663755135436665, "learning_rate": 2.4269856798534463e-06, "loss": 0.0044, "step": 3433 }, { "epoch": 0.6820935544741286, "grad_norm": 0.414472597291746, "learning_rate": 2.4242274135713477e-06, "loss": 0.0122, "step": 3434 }, { "epoch": 0.682292183930877, "grad_norm": 0.4289247107743778, "learning_rate": 2.4214702139140954e-06, "loss": 0.0103, "step": 3435 }, { "epoch": 0.6824908133876254, "grad_norm": 0.3110902803437075, "learning_rate": 2.418714082023448e-06, "loss": 0.0078, "step": 3436 }, { "epoch": 0.6826894428443738, "grad_norm": 0.44667625859781807, "learning_rate": 2.4159590190407113e-06, "loss": 0.0091, "step": 3437 }, { "epoch": 0.6828880723011223, "grad_norm": 0.6388858453233094, "learning_rate": 2.4132050261067595e-06, "loss": 0.0113, "step": 3438 }, { "epoch": 0.6830867017578707, "grad_norm": 0.6488255956688261, "learning_rate": 2.410452104362014e-06, "loss": 0.0139, "step": 3439 }, { "epoch": 0.6832853312146191, "grad_norm": 0.399390098306922, "learning_rate": 2.407700254946459e-06, "loss": 0.0106, "step": 3440 }, { "epoch": 0.6834839606713675, "grad_norm": 0.42623686761820223, "learning_rate": 2.404949478999634e-06, "loss": 0.0114, "step": 3441 }, { "epoch": 0.683682590128116, "grad_norm": 0.6585189499135641, "learning_rate": 2.4021997776606294e-06, "loss": 0.0164, "step": 3442 }, { "epoch": 0.6838812195848645, "grad_norm": 0.5067746225527527, "learning_rate": 2.3994511520680975e-06, "loss": 0.0229, "step": 3443 }, { "epoch": 0.6840798490416129, "grad_norm": 0.6378096505999058, "learning_rate": 2.3967036033602388e-06, "loss": 0.0119, "step": 3444 }, { "epoch": 0.6842784784983613, "grad_norm": 0.5093697789433972, "learning_rate": 2.393957132674809e-06, "loss": 0.0111, "step": 3445 }, { "epoch": 0.6844771079551097, "grad_norm": 0.8605508533457091, "learning_rate": 2.3912117411491233e-06, "loss": 0.0123, "step": 3446 }, { "epoch": 0.6846757374118582, "grad_norm": 0.3992536191232021, "learning_rate": 2.3884674299200404e-06, "loss": 0.0073, "step": 3447 }, { "epoch": 0.6848743668686066, "grad_norm": 0.3301132162498039, "learning_rate": 2.3857242001239815e-06, "loss": 0.0111, "step": 3448 }, { "epoch": 0.685072996325355, "grad_norm": 0.47428147751600336, "learning_rate": 2.3829820528969118e-06, "loss": 0.0149, "step": 3449 }, { "epoch": 0.6852716257821034, "grad_norm": 0.5910659829970917, "learning_rate": 2.3802409893743534e-06, "loss": 0.0128, "step": 3450 }, { "epoch": 0.685470255238852, "grad_norm": 0.3809691879291683, "learning_rate": 2.3775010106913794e-06, "loss": 0.0128, "step": 3451 }, { "epoch": 0.6856688846956004, "grad_norm": 0.5463503817648073, "learning_rate": 2.3747621179826084e-06, "loss": 0.0109, "step": 3452 }, { "epoch": 0.6858675141523488, "grad_norm": 0.5779042575602321, "learning_rate": 2.3720243123822177e-06, "loss": 0.0154, "step": 3453 }, { "epoch": 0.6860661436090972, "grad_norm": 0.5292906640923668, "learning_rate": 2.3692875950239256e-06, "loss": 0.012, "step": 3454 }, { "epoch": 0.6862647730658457, "grad_norm": 0.7193268858083541, "learning_rate": 2.366551967041008e-06, "loss": 0.015, "step": 3455 }, { "epoch": 0.6864634025225941, "grad_norm": 0.7981922366337875, "learning_rate": 2.3638174295662815e-06, "loss": 0.0149, "step": 3456 }, { "epoch": 0.6866620319793425, "grad_norm": 1.8969690617361947, "learning_rate": 2.3610839837321174e-06, "loss": 0.009, "step": 3457 }, { "epoch": 0.6868606614360909, "grad_norm": 1.1223983859989493, "learning_rate": 2.3583516306704353e-06, "loss": 0.0138, "step": 3458 }, { "epoch": 0.6870592908928395, "grad_norm": 0.7263132364715444, "learning_rate": 2.3556203715126953e-06, "loss": 0.0209, "step": 3459 }, { "epoch": 0.6872579203495879, "grad_norm": 0.4466445789942108, "learning_rate": 2.352890207389913e-06, "loss": 0.0097, "step": 3460 }, { "epoch": 0.6874565498063363, "grad_norm": 0.9150117897195746, "learning_rate": 2.3501611394326434e-06, "loss": 0.0133, "step": 3461 }, { "epoch": 0.6876551792630847, "grad_norm": 0.5168870032711956, "learning_rate": 2.3474331687709937e-06, "loss": 0.0136, "step": 3462 }, { "epoch": 0.6878538087198331, "grad_norm": 0.31968699277456936, "learning_rate": 2.3447062965346113e-06, "loss": 0.0073, "step": 3463 }, { "epoch": 0.6880524381765816, "grad_norm": 0.6109378846286353, "learning_rate": 2.3419805238526923e-06, "loss": 0.0179, "step": 3464 }, { "epoch": 0.68825106763333, "grad_norm": 0.35760154395629234, "learning_rate": 2.339255851853978e-06, "loss": 0.0115, "step": 3465 }, { "epoch": 0.6884496970900784, "grad_norm": 0.5708486697315434, "learning_rate": 2.3365322816667497e-06, "loss": 0.0132, "step": 3466 }, { "epoch": 0.688648326546827, "grad_norm": 0.6178889637099267, "learning_rate": 2.3338098144188394e-06, "loss": 0.0093, "step": 3467 }, { "epoch": 0.6888469560035754, "grad_norm": 0.16097938067312248, "learning_rate": 2.3310884512376115e-06, "loss": 0.0034, "step": 3468 }, { "epoch": 0.6890455854603238, "grad_norm": 0.3961429560117535, "learning_rate": 2.328368193249983e-06, "loss": 0.0067, "step": 3469 }, { "epoch": 0.6892442149170722, "grad_norm": 0.38510183274734977, "learning_rate": 2.3256490415824124e-06, "loss": 0.0118, "step": 3470 }, { "epoch": 0.6894428443738206, "grad_norm": 0.9257497192314168, "learning_rate": 2.322930997360894e-06, "loss": 0.014, "step": 3471 }, { "epoch": 0.6896414738305691, "grad_norm": 0.6459018691049223, "learning_rate": 2.3202140617109697e-06, "loss": 0.0098, "step": 3472 }, { "epoch": 0.6898401032873175, "grad_norm": 1.043612774948338, "learning_rate": 2.3174982357577174e-06, "loss": 0.0099, "step": 3473 }, { "epoch": 0.6900387327440659, "grad_norm": 0.2929171606322306, "learning_rate": 2.3147835206257615e-06, "loss": 0.0066, "step": 3474 }, { "epoch": 0.6902373622008143, "grad_norm": 0.4205660357282004, "learning_rate": 2.312069917439258e-06, "loss": 0.0106, "step": 3475 }, { "epoch": 0.6904359916575629, "grad_norm": 0.28539942824667136, "learning_rate": 2.3093574273219103e-06, "loss": 0.0107, "step": 3476 }, { "epoch": 0.6906346211143113, "grad_norm": 0.38833687964976527, "learning_rate": 2.30664605139696e-06, "loss": 0.0107, "step": 3477 }, { "epoch": 0.6908332505710597, "grad_norm": 0.40899264773370975, "learning_rate": 2.30393579078718e-06, "loss": 0.0044, "step": 3478 }, { "epoch": 0.6910318800278081, "grad_norm": 0.5323829934882054, "learning_rate": 2.3012266466148913e-06, "loss": 0.0164, "step": 3479 }, { "epoch": 0.6912305094845566, "grad_norm": 0.5241906848713265, "learning_rate": 2.2985186200019434e-06, "loss": 0.0091, "step": 3480 }, { "epoch": 0.691429138941305, "grad_norm": 0.47844187118354103, "learning_rate": 2.2958117120697322e-06, "loss": 0.0088, "step": 3481 }, { "epoch": 0.6916277683980534, "grad_norm": 0.5162909926785495, "learning_rate": 2.29310592393918e-06, "loss": 0.0079, "step": 3482 }, { "epoch": 0.6918263978548018, "grad_norm": 0.4668928613690837, "learning_rate": 2.290401256730755e-06, "loss": 0.0087, "step": 3483 }, { "epoch": 0.6920250273115504, "grad_norm": 0.47892066762124963, "learning_rate": 2.2876977115644577e-06, "loss": 0.0098, "step": 3484 }, { "epoch": 0.6922236567682988, "grad_norm": 0.4015035321499451, "learning_rate": 2.28499528955982e-06, "loss": 0.0082, "step": 3485 }, { "epoch": 0.6924222862250472, "grad_norm": 0.4774526229757127, "learning_rate": 2.2822939918359157e-06, "loss": 0.0131, "step": 3486 }, { "epoch": 0.6926209156817956, "grad_norm": 0.3237800455442647, "learning_rate": 2.279593819511346e-06, "loss": 0.0094, "step": 3487 }, { "epoch": 0.692819545138544, "grad_norm": 0.8338717940603173, "learning_rate": 2.276894773704253e-06, "loss": 0.0164, "step": 3488 }, { "epoch": 0.6930181745952925, "grad_norm": 0.3897033888651404, "learning_rate": 2.274196855532306e-06, "loss": 0.0061, "step": 3489 }, { "epoch": 0.6932168040520409, "grad_norm": 0.6207140161695185, "learning_rate": 2.2715000661127107e-06, "loss": 0.0168, "step": 3490 }, { "epoch": 0.6934154335087893, "grad_norm": 0.36947534908833934, "learning_rate": 2.26880440656221e-06, "loss": 0.0092, "step": 3491 }, { "epoch": 0.6936140629655377, "grad_norm": 0.5230707821198057, "learning_rate": 2.2661098779970653e-06, "loss": 0.0116, "step": 3492 }, { "epoch": 0.6938126924222863, "grad_norm": 0.34175221266600403, "learning_rate": 2.263416481533082e-06, "loss": 0.0057, "step": 3493 }, { "epoch": 0.6940113218790347, "grad_norm": 0.5044214335668877, "learning_rate": 2.260724218285596e-06, "loss": 0.0121, "step": 3494 }, { "epoch": 0.6942099513357831, "grad_norm": 0.7343017142175041, "learning_rate": 2.258033089369466e-06, "loss": 0.0109, "step": 3495 }, { "epoch": 0.6944085807925315, "grad_norm": 0.7027496441081319, "learning_rate": 2.255343095899089e-06, "loss": 0.0133, "step": 3496 }, { "epoch": 0.69460721024928, "grad_norm": 0.515249317298665, "learning_rate": 2.252654238988386e-06, "loss": 0.0117, "step": 3497 }, { "epoch": 0.6948058397060284, "grad_norm": 0.3406590742309431, "learning_rate": 2.2499665197508137e-06, "loss": 0.0073, "step": 3498 }, { "epoch": 0.6950044691627768, "grad_norm": 1.054212903506879, "learning_rate": 2.247279939299349e-06, "loss": 0.0198, "step": 3499 }, { "epoch": 0.6952030986195252, "grad_norm": 1.254784991849465, "learning_rate": 2.244594498746505e-06, "loss": 0.0131, "step": 3500 }, { "epoch": 0.6954017280762738, "grad_norm": 0.5023270239108601, "learning_rate": 2.241910199204321e-06, "loss": 0.0068, "step": 3501 }, { "epoch": 0.6956003575330222, "grad_norm": 0.33395472573545676, "learning_rate": 2.239227041784361e-06, "loss": 0.0054, "step": 3502 }, { "epoch": 0.6957989869897706, "grad_norm": 1.1377173779793461, "learning_rate": 2.236545027597719e-06, "loss": 0.0275, "step": 3503 }, { "epoch": 0.695997616446519, "grad_norm": 0.46319236840052935, "learning_rate": 2.2338641577550124e-06, "loss": 0.0162, "step": 3504 }, { "epoch": 0.6961962459032675, "grad_norm": 0.3934534179163216, "learning_rate": 2.2311844333663897e-06, "loss": 0.0071, "step": 3505 }, { "epoch": 0.6963948753600159, "grad_norm": 0.7869903742607097, "learning_rate": 2.2285058555415184e-06, "loss": 0.0174, "step": 3506 }, { "epoch": 0.6965935048167643, "grad_norm": 0.5402812555281423, "learning_rate": 2.2258284253895975e-06, "loss": 0.0067, "step": 3507 }, { "epoch": 0.6967921342735127, "grad_norm": 0.5141673771075058, "learning_rate": 2.2231521440193486e-06, "loss": 0.0141, "step": 3508 }, { "epoch": 0.6969907637302611, "grad_norm": 0.7019261554336447, "learning_rate": 2.2204770125390147e-06, "loss": 0.0112, "step": 3509 }, { "epoch": 0.6971893931870097, "grad_norm": 0.6811176170277796, "learning_rate": 2.2178030320563677e-06, "loss": 0.0106, "step": 3510 }, { "epoch": 0.6973880226437581, "grad_norm": 0.39398429666162565, "learning_rate": 2.2151302036786976e-06, "loss": 0.0087, "step": 3511 }, { "epoch": 0.6975866521005065, "grad_norm": 0.7331468400657002, "learning_rate": 2.212458528512823e-06, "loss": 0.0166, "step": 3512 }, { "epoch": 0.6977852815572549, "grad_norm": 0.5535187539164983, "learning_rate": 2.2097880076650786e-06, "loss": 0.0096, "step": 3513 }, { "epoch": 0.6979839110140034, "grad_norm": 0.3967575431172973, "learning_rate": 2.2071186422413253e-06, "loss": 0.0116, "step": 3514 }, { "epoch": 0.6981825404707518, "grad_norm": 0.7538890683797367, "learning_rate": 2.2044504333469497e-06, "loss": 0.0134, "step": 3515 }, { "epoch": 0.6983811699275002, "grad_norm": 0.3956918733926533, "learning_rate": 2.2017833820868466e-06, "loss": 0.0119, "step": 3516 }, { "epoch": 0.6985797993842486, "grad_norm": 0.6929067669197901, "learning_rate": 2.1991174895654445e-06, "loss": 0.0131, "step": 3517 }, { "epoch": 0.6987784288409972, "grad_norm": 3.3273773145583596, "learning_rate": 2.1964527568866828e-06, "loss": 0.013, "step": 3518 }, { "epoch": 0.6989770582977456, "grad_norm": 0.9093484745445498, "learning_rate": 2.193789185154027e-06, "loss": 0.0223, "step": 3519 }, { "epoch": 0.699175687754494, "grad_norm": 0.6123078414872011, "learning_rate": 2.191126775470462e-06, "loss": 0.0098, "step": 3520 }, { "epoch": 0.6993743172112424, "grad_norm": 0.5104306223926839, "learning_rate": 2.188465528938484e-06, "loss": 0.0169, "step": 3521 }, { "epoch": 0.6995729466679909, "grad_norm": 0.6464141115145141, "learning_rate": 2.185805446660117e-06, "loss": 0.0158, "step": 3522 }, { "epoch": 0.6997715761247393, "grad_norm": 0.5539549495565598, "learning_rate": 2.183146529736894e-06, "loss": 0.0142, "step": 3523 }, { "epoch": 0.6999702055814877, "grad_norm": 1.1776865246831185, "learning_rate": 2.180488779269874e-06, "loss": 0.0158, "step": 3524 }, { "epoch": 0.7001688350382361, "grad_norm": 0.5566688947877497, "learning_rate": 2.177832196359626e-06, "loss": 0.0093, "step": 3525 }, { "epoch": 0.7003674644949847, "grad_norm": 0.2616688972825115, "learning_rate": 2.1751767821062385e-06, "loss": 0.0077, "step": 3526 }, { "epoch": 0.7005660939517331, "grad_norm": 0.8922237616752913, "learning_rate": 2.172522537609319e-06, "loss": 0.0136, "step": 3527 }, { "epoch": 0.7007647234084815, "grad_norm": 0.42603522315846554, "learning_rate": 2.169869463967983e-06, "loss": 0.0178, "step": 3528 }, { "epoch": 0.7009633528652299, "grad_norm": 0.4466000675946425, "learning_rate": 2.16721756228087e-06, "loss": 0.0137, "step": 3529 }, { "epoch": 0.7011619823219783, "grad_norm": 0.6276370519416008, "learning_rate": 2.164566833646125e-06, "loss": 0.0192, "step": 3530 }, { "epoch": 0.7013606117787268, "grad_norm": 0.5258723027999391, "learning_rate": 2.1619172791614175e-06, "loss": 0.0151, "step": 3531 }, { "epoch": 0.7015592412354752, "grad_norm": 0.6959177505640589, "learning_rate": 2.1592688999239204e-06, "loss": 0.0166, "step": 3532 }, { "epoch": 0.7017578706922236, "grad_norm": 0.6402652235456184, "learning_rate": 2.156621697030327e-06, "loss": 0.0114, "step": 3533 }, { "epoch": 0.701956500148972, "grad_norm": 0.46310938800434814, "learning_rate": 2.1539756715768434e-06, "loss": 0.0097, "step": 3534 }, { "epoch": 0.7021551296057206, "grad_norm": 0.44032965187225664, "learning_rate": 2.151330824659182e-06, "loss": 0.0097, "step": 3535 }, { "epoch": 0.702353759062469, "grad_norm": 0.8920415210582037, "learning_rate": 2.1486871573725752e-06, "loss": 0.0143, "step": 3536 }, { "epoch": 0.7025523885192174, "grad_norm": 0.37431884202742494, "learning_rate": 2.1460446708117594e-06, "loss": 0.0098, "step": 3537 }, { "epoch": 0.7027510179759658, "grad_norm": 0.4891117814991983, "learning_rate": 2.1434033660709896e-06, "loss": 0.0131, "step": 3538 }, { "epoch": 0.7029496474327143, "grad_norm": 1.047496707565232, "learning_rate": 2.1407632442440247e-06, "loss": 0.0132, "step": 3539 }, { "epoch": 0.7031482768894627, "grad_norm": 0.4074516687437718, "learning_rate": 2.138124306424136e-06, "loss": 0.0096, "step": 3540 }, { "epoch": 0.7033469063462111, "grad_norm": 0.6921084878895216, "learning_rate": 2.135486553704107e-06, "loss": 0.012, "step": 3541 }, { "epoch": 0.7035455358029595, "grad_norm": 0.352632241603671, "learning_rate": 2.132849987176226e-06, "loss": 0.0085, "step": 3542 }, { "epoch": 0.7037441652597081, "grad_norm": 0.5517973185214629, "learning_rate": 2.1302146079322945e-06, "loss": 0.0078, "step": 3543 }, { "epoch": 0.7039427947164565, "grad_norm": 0.3723571720752435, "learning_rate": 2.1275804170636213e-06, "loss": 0.0087, "step": 3544 }, { "epoch": 0.7041414241732049, "grad_norm": 0.57601818247446, "learning_rate": 2.124947415661019e-06, "loss": 0.0152, "step": 3545 }, { "epoch": 0.7043400536299533, "grad_norm": 0.21023435616038813, "learning_rate": 2.1223156048148146e-06, "loss": 0.0068, "step": 3546 }, { "epoch": 0.7045386830867018, "grad_norm": 0.33170514161187187, "learning_rate": 2.119684985614835e-06, "loss": 0.0091, "step": 3547 }, { "epoch": 0.7047373125434502, "grad_norm": 0.6974394026163584, "learning_rate": 2.1170555591504198e-06, "loss": 0.0195, "step": 3548 }, { "epoch": 0.7049359420001986, "grad_norm": 0.6571182045495615, "learning_rate": 2.1144273265104088e-06, "loss": 0.0166, "step": 3549 }, { "epoch": 0.705134571456947, "grad_norm": 0.5118877638635031, "learning_rate": 2.1118002887831523e-06, "loss": 0.017, "step": 3550 }, { "epoch": 0.7053332009136954, "grad_norm": 0.607122918396555, "learning_rate": 2.1091744470565062e-06, "loss": 0.0123, "step": 3551 }, { "epoch": 0.705531830370444, "grad_norm": 0.46800730184021394, "learning_rate": 2.1065498024178237e-06, "loss": 0.0117, "step": 3552 }, { "epoch": 0.7057304598271924, "grad_norm": 0.48441945139013526, "learning_rate": 2.1039263559539737e-06, "loss": 0.0137, "step": 3553 }, { "epoch": 0.7059290892839408, "grad_norm": 0.6602368453331714, "learning_rate": 2.1013041087513163e-06, "loss": 0.0202, "step": 3554 }, { "epoch": 0.7061277187406892, "grad_norm": 0.4140800156964033, "learning_rate": 2.098683061895727e-06, "loss": 0.013, "step": 3555 }, { "epoch": 0.7063263481974377, "grad_norm": 0.39987172496263784, "learning_rate": 2.0960632164725746e-06, "loss": 0.009, "step": 3556 }, { "epoch": 0.7065249776541861, "grad_norm": 0.42338870067574086, "learning_rate": 2.0934445735667366e-06, "loss": 0.012, "step": 3557 }, { "epoch": 0.7067236071109345, "grad_norm": 0.5296392321338635, "learning_rate": 2.0908271342625907e-06, "loss": 0.0116, "step": 3558 }, { "epoch": 0.7069222365676829, "grad_norm": 0.6562700338635507, "learning_rate": 2.0882108996440144e-06, "loss": 0.0162, "step": 3559 }, { "epoch": 0.7071208660244315, "grad_norm": 0.45596472145739425, "learning_rate": 2.0855958707943903e-06, "loss": 0.0132, "step": 3560 }, { "epoch": 0.7073194954811799, "grad_norm": 0.5851977975386901, "learning_rate": 2.082982048796595e-06, "loss": 0.0115, "step": 3561 }, { "epoch": 0.7075181249379283, "grad_norm": 0.6246161154127471, "learning_rate": 2.0803694347330146e-06, "loss": 0.025, "step": 3562 }, { "epoch": 0.7077167543946767, "grad_norm": 0.70364745633366, "learning_rate": 2.077758029685527e-06, "loss": 0.0059, "step": 3563 }, { "epoch": 0.7079153838514252, "grad_norm": 1.2354575585632148, "learning_rate": 2.0751478347355112e-06, "loss": 0.0226, "step": 3564 }, { "epoch": 0.7081140133081736, "grad_norm": 0.40712325234995933, "learning_rate": 2.0725388509638504e-06, "loss": 0.0098, "step": 3565 }, { "epoch": 0.708312642764922, "grad_norm": 0.318136163341281, "learning_rate": 2.0699310794509176e-06, "loss": 0.0089, "step": 3566 }, { "epoch": 0.7085112722216704, "grad_norm": 0.22133628429597416, "learning_rate": 2.0673245212765923e-06, "loss": 0.0054, "step": 3567 }, { "epoch": 0.708709901678419, "grad_norm": 1.0519287362857404, "learning_rate": 2.064719177520244e-06, "loss": 0.0146, "step": 3568 }, { "epoch": 0.7089085311351674, "grad_norm": 0.38952311086071517, "learning_rate": 2.062115049260745e-06, "loss": 0.0088, "step": 3569 }, { "epoch": 0.7091071605919158, "grad_norm": 0.5339933404088719, "learning_rate": 2.0595121375764638e-06, "loss": 0.0148, "step": 3570 }, { "epoch": 0.7093057900486642, "grad_norm": 0.35237702102373786, "learning_rate": 2.0569104435452597e-06, "loss": 0.0071, "step": 3571 }, { "epoch": 0.7095044195054127, "grad_norm": 0.5657247740451654, "learning_rate": 2.0543099682444958e-06, "loss": 0.0107, "step": 3572 }, { "epoch": 0.7097030489621611, "grad_norm": 0.5548852852278983, "learning_rate": 2.0517107127510223e-06, "loss": 0.0107, "step": 3573 }, { "epoch": 0.7099016784189095, "grad_norm": 0.34677930544310487, "learning_rate": 2.049112678141192e-06, "loss": 0.0053, "step": 3574 }, { "epoch": 0.7101003078756579, "grad_norm": 0.3763747072298554, "learning_rate": 2.0465158654908447e-06, "loss": 0.0092, "step": 3575 }, { "epoch": 0.7102989373324063, "grad_norm": 0.4447085423083739, "learning_rate": 2.0439202758753196e-06, "loss": 0.0177, "step": 3576 }, { "epoch": 0.7104975667891549, "grad_norm": 0.43408562131660483, "learning_rate": 2.04132591036945e-06, "loss": 0.011, "step": 3577 }, { "epoch": 0.7106961962459033, "grad_norm": 0.7457101724605041, "learning_rate": 2.0387327700475564e-06, "loss": 0.0103, "step": 3578 }, { "epoch": 0.7108948257026517, "grad_norm": 0.7667181305870763, "learning_rate": 2.036140855983458e-06, "loss": 0.0115, "step": 3579 }, { "epoch": 0.7110934551594001, "grad_norm": 0.4608200956966256, "learning_rate": 2.0335501692504624e-06, "loss": 0.0138, "step": 3580 }, { "epoch": 0.7112920846161486, "grad_norm": 0.43968224621643054, "learning_rate": 2.0309607109213725e-06, "loss": 0.0119, "step": 3581 }, { "epoch": 0.711490714072897, "grad_norm": 0.38784857298098396, "learning_rate": 2.028372482068477e-06, "loss": 0.0086, "step": 3582 }, { "epoch": 0.7116893435296454, "grad_norm": 0.4987232153713858, "learning_rate": 2.025785483763561e-06, "loss": 0.0195, "step": 3583 }, { "epoch": 0.7118879729863938, "grad_norm": 0.7964544674594572, "learning_rate": 2.0231997170778996e-06, "loss": 0.0187, "step": 3584 }, { "epoch": 0.7120866024431424, "grad_norm": 0.9392661611065467, "learning_rate": 2.0206151830822523e-06, "loss": 0.013, "step": 3585 }, { "epoch": 0.7122852318998908, "grad_norm": 0.5308227215692699, "learning_rate": 2.018031882846876e-06, "loss": 0.0074, "step": 3586 }, { "epoch": 0.7124838613566392, "grad_norm": 0.44551851990638935, "learning_rate": 2.0154498174415104e-06, "loss": 0.0069, "step": 3587 }, { "epoch": 0.7126824908133876, "grad_norm": 0.4912170131917664, "learning_rate": 2.0128689879353853e-06, "loss": 0.0249, "step": 3588 }, { "epoch": 0.7128811202701361, "grad_norm": 0.9819312671738274, "learning_rate": 2.0102893953972223e-06, "loss": 0.0155, "step": 3589 }, { "epoch": 0.7130797497268845, "grad_norm": 0.6813312541973012, "learning_rate": 2.0077110408952254e-06, "loss": 0.0128, "step": 3590 }, { "epoch": 0.7132783791836329, "grad_norm": 0.5331664640007728, "learning_rate": 2.0051339254970912e-06, "loss": 0.0146, "step": 3591 }, { "epoch": 0.7134770086403813, "grad_norm": 0.2943537320564372, "learning_rate": 2.0025580502699983e-06, "loss": 0.0083, "step": 3592 }, { "epoch": 0.7136756380971297, "grad_norm": 0.5213864974119058, "learning_rate": 1.9999834162806143e-06, "loss": 0.0119, "step": 3593 }, { "epoch": 0.7138742675538783, "grad_norm": 0.46666853895153365, "learning_rate": 1.997410024595095e-06, "loss": 0.0091, "step": 3594 }, { "epoch": 0.7140728970106267, "grad_norm": 0.5085939748759494, "learning_rate": 1.9948378762790767e-06, "loss": 0.0138, "step": 3595 }, { "epoch": 0.7142715264673751, "grad_norm": 0.2990427145994346, "learning_rate": 1.992266972397685e-06, "loss": 0.0062, "step": 3596 }, { "epoch": 0.7144701559241236, "grad_norm": 0.7704563352878734, "learning_rate": 1.9896973140155274e-06, "loss": 0.0136, "step": 3597 }, { "epoch": 0.714668785380872, "grad_norm": 0.4896227179477158, "learning_rate": 1.9871289021966984e-06, "loss": 0.0091, "step": 3598 }, { "epoch": 0.7148674148376204, "grad_norm": 0.5270626574804558, "learning_rate": 1.9845617380047725e-06, "loss": 0.0119, "step": 3599 }, { "epoch": 0.7150660442943688, "grad_norm": 0.38674602009064785, "learning_rate": 1.981995822502811e-06, "loss": 0.008, "step": 3600 }, { "epoch": 0.7152646737511172, "grad_norm": 0.291567327736593, "learning_rate": 1.979431156753359e-06, "loss": 0.0089, "step": 3601 }, { "epoch": 0.7154633032078658, "grad_norm": 0.7286287787040973, "learning_rate": 1.9768677418184382e-06, "loss": 0.0097, "step": 3602 }, { "epoch": 0.7156619326646142, "grad_norm": 0.3147595174206081, "learning_rate": 1.97430557875956e-06, "loss": 0.0074, "step": 3603 }, { "epoch": 0.7158605621213626, "grad_norm": 0.557758074899719, "learning_rate": 1.97174466863771e-06, "loss": 0.014, "step": 3604 }, { "epoch": 0.716059191578111, "grad_norm": 0.312647733026765, "learning_rate": 1.9691850125133617e-06, "loss": 0.0106, "step": 3605 }, { "epoch": 0.7162578210348595, "grad_norm": 0.5740235026460454, "learning_rate": 1.9666266114464626e-06, "loss": 0.0105, "step": 3606 }, { "epoch": 0.7164564504916079, "grad_norm": 0.42789475100988844, "learning_rate": 1.964069466496446e-06, "loss": 0.008, "step": 3607 }, { "epoch": 0.7166550799483563, "grad_norm": 0.3997543712505119, "learning_rate": 1.961513578722225e-06, "loss": 0.0067, "step": 3608 }, { "epoch": 0.7168537094051047, "grad_norm": 0.5950672430416363, "learning_rate": 1.958958949182187e-06, "loss": 0.012, "step": 3609 }, { "epoch": 0.7170523388618533, "grad_norm": 0.6623144173230123, "learning_rate": 1.956405578934204e-06, "loss": 0.0094, "step": 3610 }, { "epoch": 0.7172509683186017, "grad_norm": 0.6793926564221918, "learning_rate": 1.9538534690356225e-06, "loss": 0.014, "step": 3611 }, { "epoch": 0.7174495977753501, "grad_norm": 0.5600807419635767, "learning_rate": 1.951302620543268e-06, "loss": 0.0142, "step": 3612 }, { "epoch": 0.7176482272320985, "grad_norm": 0.4854484281401081, "learning_rate": 1.9487530345134464e-06, "loss": 0.01, "step": 3613 }, { "epoch": 0.717846856688847, "grad_norm": 0.9611540312341723, "learning_rate": 1.946204712001936e-06, "loss": 0.0098, "step": 3614 }, { "epoch": 0.7180454861455954, "grad_norm": 0.5517679891572496, "learning_rate": 1.9436576540639985e-06, "loss": 0.0096, "step": 3615 }, { "epoch": 0.7182441156023438, "grad_norm": 0.41701208366022036, "learning_rate": 1.9411118617543634e-06, "loss": 0.0077, "step": 3616 }, { "epoch": 0.7184427450590922, "grad_norm": 0.36515434166067073, "learning_rate": 1.938567336127245e-06, "loss": 0.0145, "step": 3617 }, { "epoch": 0.7186413745158406, "grad_norm": 0.42121979670363874, "learning_rate": 1.936024078236325e-06, "loss": 0.0089, "step": 3618 }, { "epoch": 0.7188400039725892, "grad_norm": 0.43493645872145054, "learning_rate": 1.9334820891347663e-06, "loss": 0.0106, "step": 3619 }, { "epoch": 0.7190386334293376, "grad_norm": 0.39201007950116096, "learning_rate": 1.930941369875205e-06, "loss": 0.0066, "step": 3620 }, { "epoch": 0.719237262886086, "grad_norm": 0.44616612890490304, "learning_rate": 1.928401921509747e-06, "loss": 0.0089, "step": 3621 }, { "epoch": 0.7194358923428344, "grad_norm": 0.5005235650718418, "learning_rate": 1.9258637450899796e-06, "loss": 0.0082, "step": 3622 }, { "epoch": 0.7196345217995829, "grad_norm": 0.9873137170116204, "learning_rate": 1.9233268416669547e-06, "loss": 0.0143, "step": 3623 }, { "epoch": 0.7198331512563313, "grad_norm": 0.9359450207876253, "learning_rate": 1.920791212291206e-06, "loss": 0.0142, "step": 3624 }, { "epoch": 0.7200317807130797, "grad_norm": 0.8723213929265855, "learning_rate": 1.9182568580127304e-06, "loss": 0.0147, "step": 3625 }, { "epoch": 0.7202304101698281, "grad_norm": 0.7238675157448663, "learning_rate": 1.9157237798810037e-06, "loss": 0.0116, "step": 3626 }, { "epoch": 0.7204290396265767, "grad_norm": 0.8076205888225351, "learning_rate": 1.9131919789449733e-06, "loss": 0.0177, "step": 3627 }, { "epoch": 0.7206276690833251, "grad_norm": 1.0779343741041851, "learning_rate": 1.910661456253051e-06, "loss": 0.021, "step": 3628 }, { "epoch": 0.7208262985400735, "grad_norm": 0.5096320669800932, "learning_rate": 1.9081322128531277e-06, "loss": 0.0114, "step": 3629 }, { "epoch": 0.7210249279968219, "grad_norm": 0.6623908050965808, "learning_rate": 1.905604249792557e-06, "loss": 0.0052, "step": 3630 }, { "epoch": 0.7212235574535704, "grad_norm": 0.47990832396620436, "learning_rate": 1.9030775681181696e-06, "loss": 0.0091, "step": 3631 }, { "epoch": 0.7214221869103188, "grad_norm": 0.7334184988391341, "learning_rate": 1.9005521688762585e-06, "loss": 0.0159, "step": 3632 }, { "epoch": 0.7216208163670672, "grad_norm": 0.4980418429190839, "learning_rate": 1.89802805311259e-06, "loss": 0.0128, "step": 3633 }, { "epoch": 0.7218194458238156, "grad_norm": 0.47426037363700535, "learning_rate": 1.8955052218724002e-06, "loss": 0.0085, "step": 3634 }, { "epoch": 0.722018075280564, "grad_norm": 0.5843280660499018, "learning_rate": 1.892983676200389e-06, "loss": 0.0122, "step": 3635 }, { "epoch": 0.7222167047373126, "grad_norm": 0.8246276543714378, "learning_rate": 1.8904634171407238e-06, "loss": 0.0095, "step": 3636 }, { "epoch": 0.722415334194061, "grad_norm": 1.1929203320268906, "learning_rate": 1.887944445737046e-06, "loss": 0.0182, "step": 3637 }, { "epoch": 0.7226139636508094, "grad_norm": 0.46024686499789297, "learning_rate": 1.8854267630324547e-06, "loss": 0.0061, "step": 3638 }, { "epoch": 0.7228125931075579, "grad_norm": 0.3041353357374024, "learning_rate": 1.882910370069524e-06, "loss": 0.006, "step": 3639 }, { "epoch": 0.7230112225643063, "grad_norm": 0.566018431463613, "learning_rate": 1.8803952678902853e-06, "loss": 0.0125, "step": 3640 }, { "epoch": 0.7232098520210547, "grad_norm": 0.3629859525601858, "learning_rate": 1.877881457536244e-06, "loss": 0.0135, "step": 3641 }, { "epoch": 0.7234084814778031, "grad_norm": 0.49111992542952826, "learning_rate": 1.8753689400483627e-06, "loss": 0.0075, "step": 3642 }, { "epoch": 0.7236071109345515, "grad_norm": 0.3138065124597806, "learning_rate": 1.8728577164670747e-06, "loss": 0.0043, "step": 3643 }, { "epoch": 0.7238057403913001, "grad_norm": 0.9114034734955218, "learning_rate": 1.8703477878322763e-06, "loss": 0.0131, "step": 3644 }, { "epoch": 0.7240043698480485, "grad_norm": 0.4633138637914843, "learning_rate": 1.8678391551833225e-06, "loss": 0.0076, "step": 3645 }, { "epoch": 0.7242029993047969, "grad_norm": 0.5559865923945981, "learning_rate": 1.8653318195590403e-06, "loss": 0.0102, "step": 3646 }, { "epoch": 0.7244016287615453, "grad_norm": 0.45909590775240494, "learning_rate": 1.8628257819977102e-06, "loss": 0.0114, "step": 3647 }, { "epoch": 0.7246002582182938, "grad_norm": 0.5720442731394239, "learning_rate": 1.8603210435370845e-06, "loss": 0.0139, "step": 3648 }, { "epoch": 0.7247988876750422, "grad_norm": 1.0088288086073944, "learning_rate": 1.8578176052143682e-06, "loss": 0.0105, "step": 3649 }, { "epoch": 0.7249975171317906, "grad_norm": 0.341268216037555, "learning_rate": 1.8553154680662355e-06, "loss": 0.0102, "step": 3650 }, { "epoch": 0.725196146588539, "grad_norm": 0.5430532731720251, "learning_rate": 1.8528146331288198e-06, "loss": 0.0056, "step": 3651 }, { "epoch": 0.7253947760452876, "grad_norm": 0.6547103394450872, "learning_rate": 1.8503151014377108e-06, "loss": 0.0141, "step": 3652 }, { "epoch": 0.725593405502036, "grad_norm": 0.7543034404187979, "learning_rate": 1.8478168740279662e-06, "loss": 0.0146, "step": 3653 }, { "epoch": 0.7257920349587844, "grad_norm": 0.8527254403878096, "learning_rate": 1.8453199519340959e-06, "loss": 0.0108, "step": 3654 }, { "epoch": 0.7259906644155328, "grad_norm": 0.8087062079395743, "learning_rate": 1.8428243361900754e-06, "loss": 0.0127, "step": 3655 }, { "epoch": 0.7261892938722813, "grad_norm": 0.906253089444188, "learning_rate": 1.840330027829334e-06, "loss": 0.0245, "step": 3656 }, { "epoch": 0.7263879233290297, "grad_norm": 0.6648053391954097, "learning_rate": 1.8378370278847646e-06, "loss": 0.012, "step": 3657 }, { "epoch": 0.7265865527857781, "grad_norm": 0.8741980778803626, "learning_rate": 1.835345337388718e-06, "loss": 0.0136, "step": 3658 }, { "epoch": 0.7267851822425265, "grad_norm": 0.637411767418505, "learning_rate": 1.8328549573729948e-06, "loss": 0.0148, "step": 3659 }, { "epoch": 0.726983811699275, "grad_norm": 0.6026345033209373, "learning_rate": 1.8303658888688642e-06, "loss": 0.011, "step": 3660 }, { "epoch": 0.7271824411560235, "grad_norm": 0.7594705123411722, "learning_rate": 1.8278781329070422e-06, "loss": 0.017, "step": 3661 }, { "epoch": 0.7273810706127719, "grad_norm": 0.41929886968876007, "learning_rate": 1.8253916905177093e-06, "loss": 0.0134, "step": 3662 }, { "epoch": 0.7275797000695203, "grad_norm": 1.0949746883385296, "learning_rate": 1.8229065627305003e-06, "loss": 0.0237, "step": 3663 }, { "epoch": 0.7277783295262688, "grad_norm": 0.3118311043232577, "learning_rate": 1.8204227505744998e-06, "loss": 0.0068, "step": 3664 }, { "epoch": 0.7279769589830172, "grad_norm": 0.659409007129231, "learning_rate": 1.817940255078256e-06, "loss": 0.01, "step": 3665 }, { "epoch": 0.7281755884397656, "grad_norm": 0.5934643848129301, "learning_rate": 1.8154590772697644e-06, "loss": 0.0096, "step": 3666 }, { "epoch": 0.728374217896514, "grad_norm": 0.4976392266560354, "learning_rate": 1.8129792181764817e-06, "loss": 0.013, "step": 3667 }, { "epoch": 0.7285728473532624, "grad_norm": 0.47726296885447955, "learning_rate": 1.810500678825311e-06, "loss": 0.0144, "step": 3668 }, { "epoch": 0.728771476810011, "grad_norm": 0.22945888004342208, "learning_rate": 1.8080234602426155e-06, "loss": 0.0047, "step": 3669 }, { "epoch": 0.7289701062667594, "grad_norm": 0.5110342276655542, "learning_rate": 1.8055475634542102e-06, "loss": 0.0078, "step": 3670 }, { "epoch": 0.7291687357235078, "grad_norm": 0.31793929875854604, "learning_rate": 1.8030729894853582e-06, "loss": 0.0134, "step": 3671 }, { "epoch": 0.7293673651802562, "grad_norm": 0.7487761059663722, "learning_rate": 1.8005997393607816e-06, "loss": 0.0206, "step": 3672 }, { "epoch": 0.7295659946370047, "grad_norm": 0.2538735229220153, "learning_rate": 1.7981278141046472e-06, "loss": 0.0088, "step": 3673 }, { "epoch": 0.7297646240937531, "grad_norm": 0.4930426210508408, "learning_rate": 1.7956572147405798e-06, "loss": 0.0116, "step": 3674 }, { "epoch": 0.7299632535505015, "grad_norm": 0.46080199402940236, "learning_rate": 1.7931879422916499e-06, "loss": 0.0054, "step": 3675 }, { "epoch": 0.7301618830072499, "grad_norm": 0.44044922022937444, "learning_rate": 1.7907199977803809e-06, "loss": 0.0099, "step": 3676 }, { "epoch": 0.7303605124639984, "grad_norm": 0.42505852730210714, "learning_rate": 1.788253382228749e-06, "loss": 0.0196, "step": 3677 }, { "epoch": 0.7305591419207469, "grad_norm": 0.3898875170650358, "learning_rate": 1.7857880966581738e-06, "loss": 0.0051, "step": 3678 }, { "epoch": 0.7307577713774953, "grad_norm": 0.49491000937059526, "learning_rate": 1.7833241420895304e-06, "loss": 0.011, "step": 3679 }, { "epoch": 0.7309564008342437, "grad_norm": 0.8767113299077823, "learning_rate": 1.7808615195431366e-06, "loss": 0.0131, "step": 3680 }, { "epoch": 0.7311550302909922, "grad_norm": 0.6506677708061251, "learning_rate": 1.778400230038766e-06, "loss": 0.0141, "step": 3681 }, { "epoch": 0.7313536597477406, "grad_norm": 0.31762522470848703, "learning_rate": 1.775940274595634e-06, "loss": 0.0072, "step": 3682 }, { "epoch": 0.731552289204489, "grad_norm": 0.3400734480718243, "learning_rate": 1.7734816542324034e-06, "loss": 0.0069, "step": 3683 }, { "epoch": 0.7317509186612374, "grad_norm": 0.3523223550612423, "learning_rate": 1.7710243699671908e-06, "loss": 0.0089, "step": 3684 }, { "epoch": 0.7319495481179858, "grad_norm": 0.26978468960487634, "learning_rate": 1.768568422817551e-06, "loss": 0.0061, "step": 3685 }, { "epoch": 0.7321481775747344, "grad_norm": 0.5668983473598859, "learning_rate": 1.7661138138004918e-06, "loss": 0.0153, "step": 3686 }, { "epoch": 0.7323468070314828, "grad_norm": 1.5335058937602353, "learning_rate": 1.763660543932465e-06, "loss": 0.0177, "step": 3687 }, { "epoch": 0.7325454364882312, "grad_norm": 0.5330209491574611, "learning_rate": 1.7612086142293643e-06, "loss": 0.0097, "step": 3688 }, { "epoch": 0.7327440659449797, "grad_norm": 0.5463971064029184, "learning_rate": 1.758758025706535e-06, "loss": 0.0125, "step": 3689 }, { "epoch": 0.7329426954017281, "grad_norm": 0.4545424691631491, "learning_rate": 1.7563087793787597e-06, "loss": 0.0095, "step": 3690 }, { "epoch": 0.7331413248584765, "grad_norm": 0.4004144836163272, "learning_rate": 1.753860876260272e-06, "loss": 0.0109, "step": 3691 }, { "epoch": 0.7333399543152249, "grad_norm": 0.4843018902925179, "learning_rate": 1.751414317364743e-06, "loss": 0.0128, "step": 3692 }, { "epoch": 0.7335385837719733, "grad_norm": 0.27511747492039373, "learning_rate": 1.7489691037052925e-06, "loss": 0.0068, "step": 3693 }, { "epoch": 0.7337372132287219, "grad_norm": 0.36643270106143355, "learning_rate": 1.7465252362944818e-06, "loss": 0.0093, "step": 3694 }, { "epoch": 0.7339358426854703, "grad_norm": 0.320728805385628, "learning_rate": 1.7440827161443108e-06, "loss": 0.0115, "step": 3695 }, { "epoch": 0.7341344721422187, "grad_norm": 0.42125602343146334, "learning_rate": 1.7416415442662283e-06, "loss": 0.0135, "step": 3696 }, { "epoch": 0.7343331015989671, "grad_norm": 0.418662760132342, "learning_rate": 1.7392017216711178e-06, "loss": 0.0113, "step": 3697 }, { "epoch": 0.7345317310557156, "grad_norm": 0.29967942764047406, "learning_rate": 1.7367632493693098e-06, "loss": 0.0108, "step": 3698 }, { "epoch": 0.734730360512464, "grad_norm": 0.683085498593672, "learning_rate": 1.7343261283705714e-06, "loss": 0.0108, "step": 3699 }, { "epoch": 0.7349289899692124, "grad_norm": 0.3669500417485653, "learning_rate": 1.731890359684113e-06, "loss": 0.0086, "step": 3700 }, { "epoch": 0.7351276194259608, "grad_norm": 0.7989281415130531, "learning_rate": 1.7294559443185854e-06, "loss": 0.0114, "step": 3701 }, { "epoch": 0.7353262488827093, "grad_norm": 0.3908467521711616, "learning_rate": 1.727022883282074e-06, "loss": 0.0079, "step": 3702 }, { "epoch": 0.7355248783394578, "grad_norm": 0.4811819714857938, "learning_rate": 1.7245911775821112e-06, "loss": 0.005, "step": 3703 }, { "epoch": 0.7357235077962062, "grad_norm": 0.5482194796559414, "learning_rate": 1.7221608282256597e-06, "loss": 0.0088, "step": 3704 }, { "epoch": 0.7359221372529546, "grad_norm": 0.3769237988240613, "learning_rate": 1.7197318362191284e-06, "loss": 0.0138, "step": 3705 }, { "epoch": 0.7361207667097031, "grad_norm": 0.6344995974195727, "learning_rate": 1.717304202568359e-06, "loss": 0.0169, "step": 3706 }, { "epoch": 0.7363193961664515, "grad_norm": 0.5850068173437899, "learning_rate": 1.7148779282786305e-06, "loss": 0.0138, "step": 3707 }, { "epoch": 0.7365180256231999, "grad_norm": 0.7795863924661017, "learning_rate": 1.712453014354663e-06, "loss": 0.0122, "step": 3708 }, { "epoch": 0.7367166550799483, "grad_norm": 0.35280976439404965, "learning_rate": 1.7100294618006092e-06, "loss": 0.0084, "step": 3709 }, { "epoch": 0.7369152845366967, "grad_norm": 0.5700366010660607, "learning_rate": 1.7076072716200616e-06, "loss": 0.0155, "step": 3710 }, { "epoch": 0.7371139139934453, "grad_norm": 0.40895343575304466, "learning_rate": 1.7051864448160444e-06, "loss": 0.0084, "step": 3711 }, { "epoch": 0.7373125434501937, "grad_norm": 0.4305656202174194, "learning_rate": 1.7027669823910208e-06, "loss": 0.0111, "step": 3712 }, { "epoch": 0.7375111729069421, "grad_norm": 0.984538631213734, "learning_rate": 1.7003488853468897e-06, "loss": 0.0131, "step": 3713 }, { "epoch": 0.7377098023636905, "grad_norm": 0.5010493377039495, "learning_rate": 1.6979321546849786e-06, "loss": 0.0063, "step": 3714 }, { "epoch": 0.737908431820439, "grad_norm": 0.45048026983636796, "learning_rate": 1.6955167914060578e-06, "loss": 0.012, "step": 3715 }, { "epoch": 0.7381070612771874, "grad_norm": 0.2741879991054503, "learning_rate": 1.6931027965103224e-06, "loss": 0.0057, "step": 3716 }, { "epoch": 0.7383056907339358, "grad_norm": 0.4130916332257633, "learning_rate": 1.6906901709974093e-06, "loss": 0.0115, "step": 3717 }, { "epoch": 0.7385043201906842, "grad_norm": 0.561420411364561, "learning_rate": 1.6882789158663803e-06, "loss": 0.0128, "step": 3718 }, { "epoch": 0.7387029496474327, "grad_norm": 0.22467074037808235, "learning_rate": 1.6858690321157362e-06, "loss": 0.0036, "step": 3719 }, { "epoch": 0.7389015791041812, "grad_norm": 0.7303477211037094, "learning_rate": 1.6834605207434084e-06, "loss": 0.0167, "step": 3720 }, { "epoch": 0.7391002085609296, "grad_norm": 0.5588024743579839, "learning_rate": 1.6810533827467563e-06, "loss": 0.0104, "step": 3721 }, { "epoch": 0.739298838017678, "grad_norm": 0.2374237543068285, "learning_rate": 1.6786476191225764e-06, "loss": 0.0045, "step": 3722 }, { "epoch": 0.7394974674744265, "grad_norm": 0.4732242064561535, "learning_rate": 1.6762432308670895e-06, "loss": 0.0095, "step": 3723 }, { "epoch": 0.7396960969311749, "grad_norm": 1.3353741885909276, "learning_rate": 1.6738402189759539e-06, "loss": 0.0084, "step": 3724 }, { "epoch": 0.7398947263879233, "grad_norm": 0.7402268023476685, "learning_rate": 1.671438584444251e-06, "loss": 0.0161, "step": 3725 }, { "epoch": 0.7400933558446717, "grad_norm": 0.48374235673574467, "learning_rate": 1.6690383282664975e-06, "loss": 0.0117, "step": 3726 }, { "epoch": 0.7402919853014202, "grad_norm": 0.3143118173710068, "learning_rate": 1.6666394514366374e-06, "loss": 0.0106, "step": 3727 }, { "epoch": 0.7404906147581687, "grad_norm": 0.622043363621592, "learning_rate": 1.6642419549480414e-06, "loss": 0.0069, "step": 3728 }, { "epoch": 0.7406892442149171, "grad_norm": 0.5599094194736531, "learning_rate": 1.6618458397935128e-06, "loss": 0.015, "step": 3729 }, { "epoch": 0.7408878736716655, "grad_norm": 0.48331856066211576, "learning_rate": 1.6594511069652786e-06, "loss": 0.0066, "step": 3730 }, { "epoch": 0.741086503128414, "grad_norm": 0.34166598414346067, "learning_rate": 1.6570577574549945e-06, "loss": 0.0066, "step": 3731 }, { "epoch": 0.7412851325851624, "grad_norm": 0.427571456888473, "learning_rate": 1.6546657922537467e-06, "loss": 0.0069, "step": 3732 }, { "epoch": 0.7414837620419108, "grad_norm": 0.3951042263387438, "learning_rate": 1.6522752123520431e-06, "loss": 0.0057, "step": 3733 }, { "epoch": 0.7416823914986592, "grad_norm": 0.5078153456049683, "learning_rate": 1.6498860187398225e-06, "loss": 0.0113, "step": 3734 }, { "epoch": 0.7418810209554076, "grad_norm": 0.8021479414729211, "learning_rate": 1.6474982124064453e-06, "loss": 0.0165, "step": 3735 }, { "epoch": 0.7420796504121562, "grad_norm": 0.2070717809242952, "learning_rate": 1.6451117943407014e-06, "loss": 0.0068, "step": 3736 }, { "epoch": 0.7422782798689046, "grad_norm": 1.1209489750991877, "learning_rate": 1.6427267655308048e-06, "loss": 0.0134, "step": 3737 }, { "epoch": 0.742476909325653, "grad_norm": 0.9481691520290123, "learning_rate": 1.6403431269643916e-06, "loss": 0.0169, "step": 3738 }, { "epoch": 0.7426755387824014, "grad_norm": 0.6585340668755794, "learning_rate": 1.6379608796285263e-06, "loss": 0.01, "step": 3739 }, { "epoch": 0.7428741682391499, "grad_norm": 0.7454877441367229, "learning_rate": 1.635580024509692e-06, "loss": 0.0084, "step": 3740 }, { "epoch": 0.7430727976958983, "grad_norm": 0.6830037620298528, "learning_rate": 1.6332005625938025e-06, "loss": 0.0139, "step": 3741 }, { "epoch": 0.7432714271526467, "grad_norm": 0.40026913724686264, "learning_rate": 1.6308224948661867e-06, "loss": 0.0102, "step": 3742 }, { "epoch": 0.7434700566093951, "grad_norm": 1.211034939945853, "learning_rate": 1.6284458223116011e-06, "loss": 0.0208, "step": 3743 }, { "epoch": 0.7436686860661436, "grad_norm": 0.4950643350049753, "learning_rate": 1.6260705459142268e-06, "loss": 0.0099, "step": 3744 }, { "epoch": 0.7438673155228921, "grad_norm": 0.7561528087411485, "learning_rate": 1.6236966666576586e-06, "loss": 0.015, "step": 3745 }, { "epoch": 0.7440659449796405, "grad_norm": 0.4587296244272926, "learning_rate": 1.6213241855249211e-06, "loss": 0.0105, "step": 3746 }, { "epoch": 0.7442645744363889, "grad_norm": 0.35825062637065686, "learning_rate": 1.6189531034984534e-06, "loss": 0.0071, "step": 3747 }, { "epoch": 0.7444632038931374, "grad_norm": 0.1667528612749447, "learning_rate": 1.616583421560121e-06, "loss": 0.0034, "step": 3748 }, { "epoch": 0.7446618333498858, "grad_norm": 1.4414518890486638, "learning_rate": 1.6142151406912043e-06, "loss": 0.0252, "step": 3749 }, { "epoch": 0.7448604628066342, "grad_norm": 0.8832252217740378, "learning_rate": 1.6118482618724073e-06, "loss": 0.0266, "step": 3750 }, { "epoch": 0.7450590922633826, "grad_norm": 0.76220401745124, "learning_rate": 1.6094827860838535e-06, "loss": 0.0106, "step": 3751 }, { "epoch": 0.745257721720131, "grad_norm": 0.9771556397073955, "learning_rate": 1.6071187143050809e-06, "loss": 0.0122, "step": 3752 }, { "epoch": 0.7454563511768796, "grad_norm": 0.3409011178631712, "learning_rate": 1.6047560475150532e-06, "loss": 0.0044, "step": 3753 }, { "epoch": 0.745654980633628, "grad_norm": 1.2118712960840325, "learning_rate": 1.6023947866921452e-06, "loss": 0.0163, "step": 3754 }, { "epoch": 0.7458536100903764, "grad_norm": 0.7180041710450296, "learning_rate": 1.6000349328141528e-06, "loss": 0.0171, "step": 3755 }, { "epoch": 0.7460522395471249, "grad_norm": 0.8759814111147064, "learning_rate": 1.597676486858291e-06, "loss": 0.0129, "step": 3756 }, { "epoch": 0.7462508690038733, "grad_norm": 0.3860414331142917, "learning_rate": 1.5953194498011876e-06, "loss": 0.0081, "step": 3757 }, { "epoch": 0.7464494984606217, "grad_norm": 0.6321819223523495, "learning_rate": 1.5929638226188915e-06, "loss": 0.0144, "step": 3758 }, { "epoch": 0.7466481279173701, "grad_norm": 0.21545187521317896, "learning_rate": 1.5906096062868638e-06, "loss": 0.0035, "step": 3759 }, { "epoch": 0.7468467573741185, "grad_norm": 0.47253677184045745, "learning_rate": 1.5882568017799848e-06, "loss": 0.0099, "step": 3760 }, { "epoch": 0.747045386830867, "grad_norm": 0.678212792886403, "learning_rate": 1.5859054100725463e-06, "loss": 0.0164, "step": 3761 }, { "epoch": 0.7472440162876155, "grad_norm": 0.5852719573661888, "learning_rate": 1.5835554321382585e-06, "loss": 0.0111, "step": 3762 }, { "epoch": 0.7474426457443639, "grad_norm": 0.46344554315419095, "learning_rate": 1.5812068689502464e-06, "loss": 0.0174, "step": 3763 }, { "epoch": 0.7476412752011123, "grad_norm": 0.6136914065626226, "learning_rate": 1.5788597214810448e-06, "loss": 0.0084, "step": 3764 }, { "epoch": 0.7478399046578608, "grad_norm": 0.6281579179223541, "learning_rate": 1.576513990702608e-06, "loss": 0.0083, "step": 3765 }, { "epoch": 0.7480385341146092, "grad_norm": 0.543301875004366, "learning_rate": 1.5741696775862975e-06, "loss": 0.009, "step": 3766 }, { "epoch": 0.7482371635713576, "grad_norm": 0.36572588305171494, "learning_rate": 1.5718267831028944e-06, "loss": 0.0119, "step": 3767 }, { "epoch": 0.748435793028106, "grad_norm": 0.24068019330565446, "learning_rate": 1.5694853082225859e-06, "loss": 0.0055, "step": 3768 }, { "epoch": 0.7486344224848545, "grad_norm": 0.41680849368459344, "learning_rate": 1.5671452539149761e-06, "loss": 0.0062, "step": 3769 }, { "epoch": 0.748833051941603, "grad_norm": 0.8860868458069969, "learning_rate": 1.5648066211490804e-06, "loss": 0.0158, "step": 3770 }, { "epoch": 0.7490316813983514, "grad_norm": 0.36239580091258583, "learning_rate": 1.5624694108933208e-06, "loss": 0.0079, "step": 3771 }, { "epoch": 0.7492303108550998, "grad_norm": 0.855233140426005, "learning_rate": 1.560133624115538e-06, "loss": 0.0133, "step": 3772 }, { "epoch": 0.7494289403118483, "grad_norm": 0.5163747324216429, "learning_rate": 1.5577992617829745e-06, "loss": 0.0119, "step": 3773 }, { "epoch": 0.7496275697685967, "grad_norm": 4.943484511480794, "learning_rate": 1.5554663248622914e-06, "loss": 0.0085, "step": 3774 }, { "epoch": 0.7498261992253451, "grad_norm": 0.6847981517474558, "learning_rate": 1.5531348143195524e-06, "loss": 0.0158, "step": 3775 }, { "epoch": 0.7500248286820935, "grad_norm": 0.24726926855367248, "learning_rate": 1.5508047311202346e-06, "loss": 0.0063, "step": 3776 }, { "epoch": 0.750223458138842, "grad_norm": 1.0653880290235649, "learning_rate": 1.548476076229225e-06, "loss": 0.0152, "step": 3777 }, { "epoch": 0.7504220875955905, "grad_norm": 0.41225668837673585, "learning_rate": 1.5461488506108163e-06, "loss": 0.0079, "step": 3778 }, { "epoch": 0.7506207170523389, "grad_norm": 0.39688034716378284, "learning_rate": 1.5438230552287076e-06, "loss": 0.0133, "step": 3779 }, { "epoch": 0.7508193465090873, "grad_norm": 0.7160254723600693, "learning_rate": 1.5414986910460127e-06, "loss": 0.0119, "step": 3780 }, { "epoch": 0.7510179759658357, "grad_norm": 0.5240990626907865, "learning_rate": 1.5391757590252448e-06, "loss": 0.0115, "step": 3781 }, { "epoch": 0.7512166054225842, "grad_norm": 1.2549979086012635, "learning_rate": 1.536854260128331e-06, "loss": 0.0133, "step": 3782 }, { "epoch": 0.7514152348793326, "grad_norm": 0.9872465010490661, "learning_rate": 1.5345341953165982e-06, "loss": 0.0137, "step": 3783 }, { "epoch": 0.751613864336081, "grad_norm": 0.34007165388302485, "learning_rate": 1.5322155655507859e-06, "loss": 0.0084, "step": 3784 }, { "epoch": 0.7518124937928294, "grad_norm": 0.494044558950559, "learning_rate": 1.5298983717910342e-06, "loss": 0.0127, "step": 3785 }, { "epoch": 0.7520111232495779, "grad_norm": 0.8210240909130295, "learning_rate": 1.5275826149968913e-06, "loss": 0.0089, "step": 3786 }, { "epoch": 0.7522097527063264, "grad_norm": 0.47775536674059704, "learning_rate": 1.5252682961273125e-06, "loss": 0.0107, "step": 3787 }, { "epoch": 0.7524083821630748, "grad_norm": 0.42651256912812496, "learning_rate": 1.5229554161406502e-06, "loss": 0.0104, "step": 3788 }, { "epoch": 0.7526070116198232, "grad_norm": 0.5814023196245535, "learning_rate": 1.52064397599467e-06, "loss": 0.0115, "step": 3789 }, { "epoch": 0.7528056410765717, "grad_norm": 0.39860990655481554, "learning_rate": 1.5183339766465332e-06, "loss": 0.0108, "step": 3790 }, { "epoch": 0.7530042705333201, "grad_norm": 0.3816121403616372, "learning_rate": 1.5160254190528118e-06, "loss": 0.0073, "step": 3791 }, { "epoch": 0.7532028999900685, "grad_norm": 0.5562129339192128, "learning_rate": 1.5137183041694736e-06, "loss": 0.0145, "step": 3792 }, { "epoch": 0.7534015294468169, "grad_norm": 0.35565917270513564, "learning_rate": 1.5114126329518947e-06, "loss": 0.0088, "step": 3793 }, { "epoch": 0.7536001589035654, "grad_norm": 0.47105502946267286, "learning_rate": 1.5091084063548527e-06, "loss": 0.0126, "step": 3794 }, { "epoch": 0.7537987883603139, "grad_norm": 0.6185552661380914, "learning_rate": 1.506805625332522e-06, "loss": 0.0123, "step": 3795 }, { "epoch": 0.7539974178170623, "grad_norm": 0.40215289956017086, "learning_rate": 1.504504290838485e-06, "loss": 0.0075, "step": 3796 }, { "epoch": 0.7541960472738107, "grad_norm": 0.4782485953056544, "learning_rate": 1.5022044038257195e-06, "loss": 0.0162, "step": 3797 }, { "epoch": 0.7543946767305592, "grad_norm": 0.7324829878695893, "learning_rate": 1.4999059652466085e-06, "loss": 0.0105, "step": 3798 }, { "epoch": 0.7545933061873076, "grad_norm": 0.47495420804446264, "learning_rate": 1.4976089760529311e-06, "loss": 0.0139, "step": 3799 }, { "epoch": 0.754791935644056, "grad_norm": 0.30470882134001215, "learning_rate": 1.4953134371958694e-06, "loss": 0.0044, "step": 3800 }, { "epoch": 0.7549905651008044, "grad_norm": 0.5558913333675957, "learning_rate": 1.4930193496260053e-06, "loss": 0.0136, "step": 3801 }, { "epoch": 0.7551891945575528, "grad_norm": 0.5677602935945104, "learning_rate": 1.4907267142933162e-06, "loss": 0.007, "step": 3802 }, { "epoch": 0.7553878240143013, "grad_norm": 0.44674486843299116, "learning_rate": 1.488435532147181e-06, "loss": 0.0097, "step": 3803 }, { "epoch": 0.7555864534710498, "grad_norm": 0.24979032118916042, "learning_rate": 1.4861458041363736e-06, "loss": 0.0067, "step": 3804 }, { "epoch": 0.7557850829277982, "grad_norm": 0.644903890844482, "learning_rate": 1.4838575312090692e-06, "loss": 0.0176, "step": 3805 }, { "epoch": 0.7559837123845466, "grad_norm": 0.4250844938968267, "learning_rate": 1.481570714312842e-06, "loss": 0.0095, "step": 3806 }, { "epoch": 0.7561823418412951, "grad_norm": 0.5450830869397256, "learning_rate": 1.4792853543946572e-06, "loss": 0.014, "step": 3807 }, { "epoch": 0.7563809712980435, "grad_norm": 0.2125242305530338, "learning_rate": 1.477001452400883e-06, "loss": 0.0067, "step": 3808 }, { "epoch": 0.7565796007547919, "grad_norm": 0.6807746018493928, "learning_rate": 1.4747190092772774e-06, "loss": 0.0151, "step": 3809 }, { "epoch": 0.7567782302115403, "grad_norm": 0.42307268538948456, "learning_rate": 1.4724380259690013e-06, "loss": 0.0119, "step": 3810 }, { "epoch": 0.7569768596682888, "grad_norm": 0.6930123507211039, "learning_rate": 1.470158503420605e-06, "loss": 0.0104, "step": 3811 }, { "epoch": 0.7571754891250373, "grad_norm": 0.3363804661976727, "learning_rate": 1.4678804425760374e-06, "loss": 0.0049, "step": 3812 }, { "epoch": 0.7573741185817857, "grad_norm": 0.5791696042890436, "learning_rate": 1.4656038443786425e-06, "loss": 0.0119, "step": 3813 }, { "epoch": 0.7575727480385341, "grad_norm": 0.30695431877058926, "learning_rate": 1.4633287097711552e-06, "loss": 0.0062, "step": 3814 }, { "epoch": 0.7577713774952826, "grad_norm": 0.5269032914372845, "learning_rate": 1.4610550396957085e-06, "loss": 0.0175, "step": 3815 }, { "epoch": 0.757970006952031, "grad_norm": 0.38075958392152365, "learning_rate": 1.458782835093825e-06, "loss": 0.008, "step": 3816 }, { "epoch": 0.7581686364087794, "grad_norm": 0.3840340193367981, "learning_rate": 1.4565120969064222e-06, "loss": 0.0062, "step": 3817 }, { "epoch": 0.7583672658655278, "grad_norm": 0.44995171484252383, "learning_rate": 1.454242826073814e-06, "loss": 0.0105, "step": 3818 }, { "epoch": 0.7585658953222763, "grad_norm": 0.755410175506028, "learning_rate": 1.451975023535699e-06, "loss": 0.0151, "step": 3819 }, { "epoch": 0.7587645247790247, "grad_norm": 0.4925822601592495, "learning_rate": 1.4497086902311746e-06, "loss": 0.0118, "step": 3820 }, { "epoch": 0.7589631542357732, "grad_norm": 0.33772272051963276, "learning_rate": 1.4474438270987257e-06, "loss": 0.0062, "step": 3821 }, { "epoch": 0.7591617836925216, "grad_norm": 0.8290174834097047, "learning_rate": 1.445180435076231e-06, "loss": 0.0136, "step": 3822 }, { "epoch": 0.75936041314927, "grad_norm": 0.43343542171958976, "learning_rate": 1.4429185151009573e-06, "loss": 0.0092, "step": 3823 }, { "epoch": 0.7595590426060185, "grad_norm": 0.775111328446171, "learning_rate": 1.4406580681095639e-06, "loss": 0.0181, "step": 3824 }, { "epoch": 0.7597576720627669, "grad_norm": 0.39166179536426354, "learning_rate": 1.4383990950381022e-06, "loss": 0.0088, "step": 3825 }, { "epoch": 0.7599563015195153, "grad_norm": 0.4925837143073198, "learning_rate": 1.4361415968220082e-06, "loss": 0.0101, "step": 3826 }, { "epoch": 0.7601549309762637, "grad_norm": 0.32696898445858813, "learning_rate": 1.43388557439611e-06, "loss": 0.0086, "step": 3827 }, { "epoch": 0.7603535604330122, "grad_norm": 0.4958561019808758, "learning_rate": 1.4316310286946228e-06, "loss": 0.0137, "step": 3828 }, { "epoch": 0.7605521898897607, "grad_norm": 0.4975604655384996, "learning_rate": 1.4293779606511527e-06, "loss": 0.0136, "step": 3829 }, { "epoch": 0.7607508193465091, "grad_norm": 0.49887393821198955, "learning_rate": 1.4271263711986954e-06, "loss": 0.0103, "step": 3830 }, { "epoch": 0.7609494488032575, "grad_norm": 0.3683363611647419, "learning_rate": 1.4248762612696282e-06, "loss": 0.0063, "step": 3831 }, { "epoch": 0.761148078260006, "grad_norm": 0.3079985582051433, "learning_rate": 1.4226276317957228e-06, "loss": 0.0125, "step": 3832 }, { "epoch": 0.7613467077167544, "grad_norm": 0.4074575929705946, "learning_rate": 1.4203804837081308e-06, "loss": 0.0075, "step": 3833 }, { "epoch": 0.7615453371735028, "grad_norm": 0.49220265205108876, "learning_rate": 1.4181348179373972e-06, "loss": 0.0127, "step": 3834 }, { "epoch": 0.7617439666302512, "grad_norm": 0.2819678338268758, "learning_rate": 1.4158906354134472e-06, "loss": 0.0082, "step": 3835 }, { "epoch": 0.7619425960869997, "grad_norm": 0.5745246590432623, "learning_rate": 1.413647937065596e-06, "loss": 0.0149, "step": 3836 }, { "epoch": 0.7621412255437482, "grad_norm": 0.6022591411514704, "learning_rate": 1.4114067238225438e-06, "loss": 0.017, "step": 3837 }, { "epoch": 0.7623398550004966, "grad_norm": 0.5962439448569538, "learning_rate": 1.4091669966123717e-06, "loss": 0.0079, "step": 3838 }, { "epoch": 0.762538484457245, "grad_norm": 0.48725897414015545, "learning_rate": 1.4069287563625523e-06, "loss": 0.0122, "step": 3839 }, { "epoch": 0.7627371139139935, "grad_norm": 0.6291651873134785, "learning_rate": 1.404692003999935e-06, "loss": 0.0142, "step": 3840 }, { "epoch": 0.7629357433707419, "grad_norm": 0.5770793807297394, "learning_rate": 1.4024567404507606e-06, "loss": 0.0129, "step": 3841 }, { "epoch": 0.7631343728274903, "grad_norm": 0.4801313035665573, "learning_rate": 1.4002229666406454e-06, "loss": 0.0127, "step": 3842 }, { "epoch": 0.7633330022842387, "grad_norm": 0.5099357827368747, "learning_rate": 1.3979906834945944e-06, "loss": 0.0139, "step": 3843 }, { "epoch": 0.7635316317409871, "grad_norm": 0.3833860818815375, "learning_rate": 1.3957598919369958e-06, "loss": 0.0099, "step": 3844 }, { "epoch": 0.7637302611977356, "grad_norm": 0.41162753016444315, "learning_rate": 1.3935305928916154e-06, "loss": 0.0108, "step": 3845 }, { "epoch": 0.7639288906544841, "grad_norm": 0.3078560285036505, "learning_rate": 1.3913027872816064e-06, "loss": 0.0095, "step": 3846 }, { "epoch": 0.7641275201112325, "grad_norm": 0.37973724509719714, "learning_rate": 1.3890764760294979e-06, "loss": 0.0074, "step": 3847 }, { "epoch": 0.764326149567981, "grad_norm": 0.4498229089715063, "learning_rate": 1.3868516600572056e-06, "loss": 0.0091, "step": 3848 }, { "epoch": 0.7645247790247294, "grad_norm": 0.37444629015372083, "learning_rate": 1.3846283402860216e-06, "loss": 0.0099, "step": 3849 }, { "epoch": 0.7647234084814778, "grad_norm": 0.5009045045676753, "learning_rate": 1.3824065176366225e-06, "loss": 0.0105, "step": 3850 }, { "epoch": 0.7649220379382262, "grad_norm": 0.4475657849726896, "learning_rate": 1.3801861930290623e-06, "loss": 0.0064, "step": 3851 }, { "epoch": 0.7651206673949746, "grad_norm": 0.774374062344075, "learning_rate": 1.377967367382772e-06, "loss": 0.0182, "step": 3852 }, { "epoch": 0.7653192968517231, "grad_norm": 0.4792793910315004, "learning_rate": 1.3757500416165686e-06, "loss": 0.012, "step": 3853 }, { "epoch": 0.7655179263084716, "grad_norm": 0.6802435312134761, "learning_rate": 1.3735342166486448e-06, "loss": 0.0146, "step": 3854 }, { "epoch": 0.76571655576522, "grad_norm": 0.5822617738126644, "learning_rate": 1.3713198933965687e-06, "loss": 0.0102, "step": 3855 }, { "epoch": 0.7659151852219684, "grad_norm": 0.3228251141085814, "learning_rate": 1.3691070727772927e-06, "loss": 0.0058, "step": 3856 }, { "epoch": 0.7661138146787169, "grad_norm": 0.5602496268212196, "learning_rate": 1.36689575570714e-06, "loss": 0.0138, "step": 3857 }, { "epoch": 0.7663124441354653, "grad_norm": 0.8410546969132571, "learning_rate": 1.3646859431018178e-06, "loss": 0.0168, "step": 3858 }, { "epoch": 0.7665110735922137, "grad_norm": 0.49153277899098013, "learning_rate": 1.3624776358764046e-06, "loss": 0.0101, "step": 3859 }, { "epoch": 0.7667097030489621, "grad_norm": 0.4718983704981794, "learning_rate": 1.3602708349453603e-06, "loss": 0.0146, "step": 3860 }, { "epoch": 0.7669083325057106, "grad_norm": 0.46323476790072704, "learning_rate": 1.3580655412225192e-06, "loss": 0.0141, "step": 3861 }, { "epoch": 0.767106961962459, "grad_norm": 0.72794578293661, "learning_rate": 1.3558617556210891e-06, "loss": 0.0146, "step": 3862 }, { "epoch": 0.7673055914192075, "grad_norm": 0.36007802301261976, "learning_rate": 1.3536594790536584e-06, "loss": 0.0073, "step": 3863 }, { "epoch": 0.7675042208759559, "grad_norm": 0.6871366174736734, "learning_rate": 1.3514587124321842e-06, "loss": 0.0154, "step": 3864 }, { "epoch": 0.7677028503327044, "grad_norm": 0.77934118848991, "learning_rate": 1.3492594566680052e-06, "loss": 0.0133, "step": 3865 }, { "epoch": 0.7679014797894528, "grad_norm": 0.31816704466209683, "learning_rate": 1.3470617126718272e-06, "loss": 0.0114, "step": 3866 }, { "epoch": 0.7681001092462012, "grad_norm": 0.39457341298241466, "learning_rate": 1.344865481353736e-06, "loss": 0.0088, "step": 3867 }, { "epoch": 0.7682987387029496, "grad_norm": 0.3878354688307015, "learning_rate": 1.34267076362319e-06, "loss": 0.0051, "step": 3868 }, { "epoch": 0.768497368159698, "grad_norm": 0.5176469767858003, "learning_rate": 1.3404775603890175e-06, "loss": 0.0102, "step": 3869 }, { "epoch": 0.7686959976164465, "grad_norm": 0.6546341810751645, "learning_rate": 1.3382858725594233e-06, "loss": 0.016, "step": 3870 }, { "epoch": 0.768894627073195, "grad_norm": 0.4077083813837435, "learning_rate": 1.3360957010419813e-06, "loss": 0.0127, "step": 3871 }, { "epoch": 0.7690932565299434, "grad_norm": 1.0786672783122584, "learning_rate": 1.333907046743641e-06, "loss": 0.0139, "step": 3872 }, { "epoch": 0.7692918859866918, "grad_norm": 0.620538079771535, "learning_rate": 1.3317199105707207e-06, "loss": 0.0119, "step": 3873 }, { "epoch": 0.7694905154434403, "grad_norm": 0.7487056379611181, "learning_rate": 1.3295342934289128e-06, "loss": 0.0135, "step": 3874 }, { "epoch": 0.7696891449001887, "grad_norm": 0.5658770968139971, "learning_rate": 1.3273501962232787e-06, "loss": 0.0136, "step": 3875 }, { "epoch": 0.7698877743569371, "grad_norm": 0.5544123537115362, "learning_rate": 1.3251676198582491e-06, "loss": 0.0076, "step": 3876 }, { "epoch": 0.7700864038136855, "grad_norm": 0.42604725459840853, "learning_rate": 1.3229865652376295e-06, "loss": 0.0092, "step": 3877 }, { "epoch": 0.770285033270434, "grad_norm": 0.7356682784487021, "learning_rate": 1.3208070332645889e-06, "loss": 0.0091, "step": 3878 }, { "epoch": 0.7704836627271825, "grad_norm": 0.2945392911865687, "learning_rate": 1.3186290248416723e-06, "loss": 0.0094, "step": 3879 }, { "epoch": 0.7706822921839309, "grad_norm": 0.4400140082230475, "learning_rate": 1.3164525408707908e-06, "loss": 0.012, "step": 3880 }, { "epoch": 0.7708809216406793, "grad_norm": 0.3677167541782022, "learning_rate": 1.3142775822532216e-06, "loss": 0.0068, "step": 3881 }, { "epoch": 0.7710795510974278, "grad_norm": 0.393778558813364, "learning_rate": 1.3121041498896165e-06, "loss": 0.0093, "step": 3882 }, { "epoch": 0.7712781805541762, "grad_norm": 0.41937295389208945, "learning_rate": 1.3099322446799883e-06, "loss": 0.0109, "step": 3883 }, { "epoch": 0.7714768100109246, "grad_norm": 0.45418818468109395, "learning_rate": 1.3077618675237235e-06, "loss": 0.009, "step": 3884 }, { "epoch": 0.771675439467673, "grad_norm": 0.5022438379809215, "learning_rate": 1.305593019319571e-06, "loss": 0.0133, "step": 3885 }, { "epoch": 0.7718740689244215, "grad_norm": 0.7531062350453083, "learning_rate": 1.3034257009656486e-06, "loss": 0.0098, "step": 3886 }, { "epoch": 0.7720726983811699, "grad_norm": 0.32685807203315603, "learning_rate": 1.3012599133594438e-06, "loss": 0.0122, "step": 3887 }, { "epoch": 0.7722713278379184, "grad_norm": 0.6199595695269277, "learning_rate": 1.299095657397803e-06, "loss": 0.0137, "step": 3888 }, { "epoch": 0.7724699572946668, "grad_norm": 0.918750119255603, "learning_rate": 1.296932933976946e-06, "loss": 0.0183, "step": 3889 }, { "epoch": 0.7726685867514153, "grad_norm": 0.5007197845896081, "learning_rate": 1.294771743992451e-06, "loss": 0.01, "step": 3890 }, { "epoch": 0.7728672162081637, "grad_norm": 0.4774983945538157, "learning_rate": 1.292612088339268e-06, "loss": 0.0112, "step": 3891 }, { "epoch": 0.7730658456649121, "grad_norm": 0.40168632160434964, "learning_rate": 1.2904539679117051e-06, "loss": 0.0045, "step": 3892 }, { "epoch": 0.7732644751216605, "grad_norm": 0.6613299836977145, "learning_rate": 1.2882973836034391e-06, "loss": 0.0173, "step": 3893 }, { "epoch": 0.7734631045784089, "grad_norm": 2.4503594172896195, "learning_rate": 1.286142336307511e-06, "loss": 0.016, "step": 3894 }, { "epoch": 0.7736617340351574, "grad_norm": 0.5175369687816846, "learning_rate": 1.283988826916321e-06, "loss": 0.0129, "step": 3895 }, { "epoch": 0.7738603634919059, "grad_norm": 0.32059427002808744, "learning_rate": 1.2818368563216377e-06, "loss": 0.0094, "step": 3896 }, { "epoch": 0.7740589929486543, "grad_norm": 0.45258914822238383, "learning_rate": 1.2796864254145875e-06, "loss": 0.0123, "step": 3897 }, { "epoch": 0.7742576224054027, "grad_norm": 0.31073316498638376, "learning_rate": 1.277537535085664e-06, "loss": 0.0105, "step": 3898 }, { "epoch": 0.7744562518621512, "grad_norm": 0.27654739609890827, "learning_rate": 1.2753901862247198e-06, "loss": 0.0068, "step": 3899 }, { "epoch": 0.7746548813188996, "grad_norm": 0.7702240957803342, "learning_rate": 1.2732443797209676e-06, "loss": 0.014, "step": 3900 }, { "epoch": 0.774853510775648, "grad_norm": 0.7888401795775241, "learning_rate": 1.2711001164629878e-06, "loss": 0.0107, "step": 3901 }, { "epoch": 0.7750521402323964, "grad_norm": 0.5102152605501087, "learning_rate": 1.2689573973387136e-06, "loss": 0.0109, "step": 3902 }, { "epoch": 0.7752507696891449, "grad_norm": 0.42949075002493076, "learning_rate": 1.2668162232354453e-06, "loss": 0.0102, "step": 3903 }, { "epoch": 0.7754493991458933, "grad_norm": 0.51913928021783, "learning_rate": 1.2646765950398415e-06, "loss": 0.0152, "step": 3904 }, { "epoch": 0.7756480286026418, "grad_norm": 0.5147647620976568, "learning_rate": 1.2625385136379181e-06, "loss": 0.0109, "step": 3905 }, { "epoch": 0.7758466580593902, "grad_norm": 0.3620633765074665, "learning_rate": 1.2604019799150547e-06, "loss": 0.0099, "step": 3906 }, { "epoch": 0.7760452875161387, "grad_norm": 0.623127129178747, "learning_rate": 1.2582669947559845e-06, "loss": 0.0142, "step": 3907 }, { "epoch": 0.7762439169728871, "grad_norm": 0.8733934868624431, "learning_rate": 1.2561335590448066e-06, "loss": 0.0162, "step": 3908 }, { "epoch": 0.7764425464296355, "grad_norm": 0.8813493665439702, "learning_rate": 1.2540016736649713e-06, "loss": 0.0126, "step": 3909 }, { "epoch": 0.7766411758863839, "grad_norm": 0.27843671726361163, "learning_rate": 1.2518713394992916e-06, "loss": 0.0077, "step": 3910 }, { "epoch": 0.7768398053431324, "grad_norm": 0.5249639405585835, "learning_rate": 1.2497425574299376e-06, "loss": 0.0139, "step": 3911 }, { "epoch": 0.7770384347998808, "grad_norm": 0.4152029494822117, "learning_rate": 1.247615328338434e-06, "loss": 0.0069, "step": 3912 }, { "epoch": 0.7772370642566293, "grad_norm": 0.9012103889077504, "learning_rate": 1.2454896531056665e-06, "loss": 0.012, "step": 3913 }, { "epoch": 0.7774356937133777, "grad_norm": 0.42111435803255814, "learning_rate": 1.2433655326118726e-06, "loss": 0.0102, "step": 3914 }, { "epoch": 0.7776343231701262, "grad_norm": 0.7641685303436699, "learning_rate": 1.2412429677366512e-06, "loss": 0.0118, "step": 3915 }, { "epoch": 0.7778329526268746, "grad_norm": 0.5032971538363441, "learning_rate": 1.239121959358951e-06, "loss": 0.0073, "step": 3916 }, { "epoch": 0.778031582083623, "grad_norm": 0.774857905002289, "learning_rate": 1.2370025083570813e-06, "loss": 0.0082, "step": 3917 }, { "epoch": 0.7782302115403714, "grad_norm": 0.5279560951716294, "learning_rate": 1.2348846156087058e-06, "loss": 0.0139, "step": 3918 }, { "epoch": 0.7784288409971198, "grad_norm": 0.4746514702620925, "learning_rate": 1.2327682819908393e-06, "loss": 0.0111, "step": 3919 }, { "epoch": 0.7786274704538683, "grad_norm": 0.5401006357078507, "learning_rate": 1.230653508379856e-06, "loss": 0.0128, "step": 3920 }, { "epoch": 0.7788260999106168, "grad_norm": 0.5347132085056566, "learning_rate": 1.2285402956514786e-06, "loss": 0.0094, "step": 3921 }, { "epoch": 0.7790247293673652, "grad_norm": 0.8006086363779595, "learning_rate": 1.2264286446807893e-06, "loss": 0.0211, "step": 3922 }, { "epoch": 0.7792233588241136, "grad_norm": 0.729596534109269, "learning_rate": 1.2243185563422194e-06, "loss": 0.0102, "step": 3923 }, { "epoch": 0.7794219882808621, "grad_norm": 0.8162366400705077, "learning_rate": 1.222210031509553e-06, "loss": 0.0154, "step": 3924 }, { "epoch": 0.7796206177376105, "grad_norm": 0.8298607900398938, "learning_rate": 1.2201030710559309e-06, "loss": 0.0154, "step": 3925 }, { "epoch": 0.7798192471943589, "grad_norm": 0.9295004548979635, "learning_rate": 1.2179976758538397e-06, "loss": 0.0202, "step": 3926 }, { "epoch": 0.7800178766511073, "grad_norm": 0.6284727316819763, "learning_rate": 1.2158938467751258e-06, "loss": 0.0144, "step": 3927 }, { "epoch": 0.7802165061078558, "grad_norm": 0.31174985864230775, "learning_rate": 1.213791584690978e-06, "loss": 0.008, "step": 3928 }, { "epoch": 0.7804151355646042, "grad_norm": 0.3904178709197262, "learning_rate": 1.2116908904719443e-06, "loss": 0.008, "step": 3929 }, { "epoch": 0.7806137650213527, "grad_norm": 0.28385884511776377, "learning_rate": 1.2095917649879202e-06, "loss": 0.0072, "step": 3930 }, { "epoch": 0.7808123944781011, "grad_norm": 0.8225239608519143, "learning_rate": 1.2074942091081493e-06, "loss": 0.0077, "step": 3931 }, { "epoch": 0.7810110239348496, "grad_norm": 0.25070849880393475, "learning_rate": 1.2053982237012295e-06, "loss": 0.0067, "step": 3932 }, { "epoch": 0.781209653391598, "grad_norm": 1.009173744589044, "learning_rate": 1.2033038096351042e-06, "loss": 0.0153, "step": 3933 }, { "epoch": 0.7814082828483464, "grad_norm": 0.3001079077055634, "learning_rate": 1.20121096777707e-06, "loss": 0.0073, "step": 3934 }, { "epoch": 0.7816069123050948, "grad_norm": 0.2675585992071396, "learning_rate": 1.1991196989937693e-06, "loss": 0.0077, "step": 3935 }, { "epoch": 0.7818055417618432, "grad_norm": 0.36211169776646895, "learning_rate": 1.1970300041511945e-06, "loss": 0.0065, "step": 3936 }, { "epoch": 0.7820041712185917, "grad_norm": 0.556461940995312, "learning_rate": 1.1949418841146875e-06, "loss": 0.0149, "step": 3937 }, { "epoch": 0.7822028006753402, "grad_norm": 0.41740062523359517, "learning_rate": 1.192855339748935e-06, "loss": 0.0151, "step": 3938 }, { "epoch": 0.7824014301320886, "grad_norm": 0.31475234587927653, "learning_rate": 1.1907703719179752e-06, "loss": 0.0064, "step": 3939 }, { "epoch": 0.782600059588837, "grad_norm": 0.6800297738810858, "learning_rate": 1.1886869814851881e-06, "loss": 0.0138, "step": 3940 }, { "epoch": 0.7827986890455855, "grad_norm": 0.5648602421057914, "learning_rate": 1.186605169313307e-06, "loss": 0.0143, "step": 3941 }, { "epoch": 0.7829973185023339, "grad_norm": 0.32364785454126055, "learning_rate": 1.1845249362644046e-06, "loss": 0.0111, "step": 3942 }, { "epoch": 0.7831959479590823, "grad_norm": 7.128471218729785, "learning_rate": 1.1824462831999057e-06, "loss": 0.018, "step": 3943 }, { "epoch": 0.7833945774158307, "grad_norm": 0.46537352198306225, "learning_rate": 1.1803692109805786e-06, "loss": 0.0096, "step": 3944 }, { "epoch": 0.7835932068725792, "grad_norm": 0.40237938194467165, "learning_rate": 1.178293720466535e-06, "loss": 0.006, "step": 3945 }, { "epoch": 0.7837918363293276, "grad_norm": 0.47649481459898346, "learning_rate": 1.1762198125172364e-06, "loss": 0.0135, "step": 3946 }, { "epoch": 0.7839904657860761, "grad_norm": 0.3889132630569039, "learning_rate": 1.1741474879914837e-06, "loss": 0.0091, "step": 3947 }, { "epoch": 0.7841890952428245, "grad_norm": 0.3589855854840536, "learning_rate": 1.1720767477474238e-06, "loss": 0.0071, "step": 3948 }, { "epoch": 0.784387724699573, "grad_norm": 0.6600060042525093, "learning_rate": 1.1700075926425508e-06, "loss": 0.0073, "step": 3949 }, { "epoch": 0.7845863541563214, "grad_norm": 0.5605141682819981, "learning_rate": 1.167940023533697e-06, "loss": 0.0107, "step": 3950 }, { "epoch": 0.7847849836130698, "grad_norm": 0.4276017759614575, "learning_rate": 1.1658740412770426e-06, "loss": 0.0108, "step": 3951 }, { "epoch": 0.7849836130698182, "grad_norm": 0.5245144378224614, "learning_rate": 1.1638096467281074e-06, "loss": 0.0085, "step": 3952 }, { "epoch": 0.7851822425265667, "grad_norm": 0.504334640539037, "learning_rate": 1.1617468407417553e-06, "loss": 0.0101, "step": 3953 }, { "epoch": 0.7853808719833151, "grad_norm": 0.48205839823898566, "learning_rate": 1.1596856241721944e-06, "loss": 0.0116, "step": 3954 }, { "epoch": 0.7855795014400636, "grad_norm": 0.6530170104490158, "learning_rate": 1.1576259978729692e-06, "loss": 0.0178, "step": 3955 }, { "epoch": 0.785778130896812, "grad_norm": 0.710759087565475, "learning_rate": 1.1555679626969724e-06, "loss": 0.0144, "step": 3956 }, { "epoch": 0.7859767603535605, "grad_norm": 0.5424274465145213, "learning_rate": 1.1535115194964304e-06, "loss": 0.0132, "step": 3957 }, { "epoch": 0.7861753898103089, "grad_norm": 0.6661319005283131, "learning_rate": 1.1514566691229178e-06, "loss": 0.013, "step": 3958 }, { "epoch": 0.7863740192670573, "grad_norm": 0.5573412948277443, "learning_rate": 1.1494034124273428e-06, "loss": 0.0133, "step": 3959 }, { "epoch": 0.7865726487238057, "grad_norm": 0.4236751414096416, "learning_rate": 1.147351750259959e-06, "loss": 0.0079, "step": 3960 }, { "epoch": 0.7867712781805541, "grad_norm": 0.6700103105511918, "learning_rate": 1.1453016834703584e-06, "loss": 0.0142, "step": 3961 }, { "epoch": 0.7869699076373026, "grad_norm": 0.6722312403375201, "learning_rate": 1.1432532129074692e-06, "loss": 0.0175, "step": 3962 }, { "epoch": 0.7871685370940511, "grad_norm": 0.29540238303248256, "learning_rate": 1.1412063394195634e-06, "loss": 0.0082, "step": 3963 }, { "epoch": 0.7873671665507995, "grad_norm": 0.453040253508881, "learning_rate": 1.1391610638542473e-06, "loss": 0.0149, "step": 3964 }, { "epoch": 0.787565796007548, "grad_norm": 0.3063401650275241, "learning_rate": 1.1371173870584696e-06, "loss": 0.0087, "step": 3965 }, { "epoch": 0.7877644254642964, "grad_norm": 0.7536587487092659, "learning_rate": 1.1350753098785117e-06, "loss": 0.0112, "step": 3966 }, { "epoch": 0.7879630549210448, "grad_norm": 0.31791285239524447, "learning_rate": 1.1330348331599978e-06, "loss": 0.0101, "step": 3967 }, { "epoch": 0.7881616843777932, "grad_norm": 0.5345985537470007, "learning_rate": 1.1309959577478885e-06, "loss": 0.0138, "step": 3968 }, { "epoch": 0.7883603138345416, "grad_norm": 0.39066543567981465, "learning_rate": 1.128958684486477e-06, "loss": 0.0105, "step": 3969 }, { "epoch": 0.7885589432912901, "grad_norm": 0.6971256838903706, "learning_rate": 1.1269230142194004e-06, "loss": 0.0219, "step": 3970 }, { "epoch": 0.7887575727480385, "grad_norm": 0.3025242348367761, "learning_rate": 1.1248889477896224e-06, "loss": 0.0064, "step": 3971 }, { "epoch": 0.788956202204787, "grad_norm": 0.4101701372618817, "learning_rate": 1.1228564860394508e-06, "loss": 0.0085, "step": 3972 }, { "epoch": 0.7891548316615354, "grad_norm": 1.342044070773984, "learning_rate": 1.120825629810527e-06, "loss": 0.0192, "step": 3973 }, { "epoch": 0.7893534611182839, "grad_norm": 0.43993593424734334, "learning_rate": 1.1187963799438235e-06, "loss": 0.0124, "step": 3974 }, { "epoch": 0.7895520905750323, "grad_norm": 0.8408300332951749, "learning_rate": 1.1167687372796537e-06, "loss": 0.0113, "step": 3975 }, { "epoch": 0.7897507200317807, "grad_norm": 0.9328690964868325, "learning_rate": 1.1147427026576597e-06, "loss": 0.0184, "step": 3976 }, { "epoch": 0.7899493494885291, "grad_norm": 0.41486356516047107, "learning_rate": 1.1127182769168231e-06, "loss": 0.0048, "step": 3977 }, { "epoch": 0.7901479789452776, "grad_norm": 0.6855136367785546, "learning_rate": 1.110695460895454e-06, "loss": 0.0129, "step": 3978 }, { "epoch": 0.790346608402026, "grad_norm": 0.28701281574074733, "learning_rate": 1.108674255431199e-06, "loss": 0.0041, "step": 3979 }, { "epoch": 0.7905452378587745, "grad_norm": 0.4540954582400495, "learning_rate": 1.10665466136104e-06, "loss": 0.0152, "step": 3980 }, { "epoch": 0.7907438673155229, "grad_norm": 0.7368225481887282, "learning_rate": 1.1046366795212854e-06, "loss": 0.0125, "step": 3981 }, { "epoch": 0.7909424967722714, "grad_norm": 0.9802282075745697, "learning_rate": 1.1026203107475824e-06, "loss": 0.0145, "step": 3982 }, { "epoch": 0.7911411262290198, "grad_norm": 1.0658263094138696, "learning_rate": 1.100605555874904e-06, "loss": 0.0151, "step": 3983 }, { "epoch": 0.7913397556857682, "grad_norm": 0.35002483862277856, "learning_rate": 1.0985924157375616e-06, "loss": 0.0088, "step": 3984 }, { "epoch": 0.7915383851425166, "grad_norm": 0.3244457738853698, "learning_rate": 1.0965808911691917e-06, "loss": 0.0066, "step": 3985 }, { "epoch": 0.791737014599265, "grad_norm": 0.6796365282260834, "learning_rate": 1.0945709830027657e-06, "loss": 0.0094, "step": 3986 }, { "epoch": 0.7919356440560135, "grad_norm": 0.45854564583815816, "learning_rate": 1.0925626920705857e-06, "loss": 0.0098, "step": 3987 }, { "epoch": 0.7921342735127619, "grad_norm": 0.5264577666006467, "learning_rate": 1.0905560192042808e-06, "loss": 0.0071, "step": 3988 }, { "epoch": 0.7923329029695104, "grad_norm": 0.3097781743722543, "learning_rate": 1.0885509652348142e-06, "loss": 0.008, "step": 3989 }, { "epoch": 0.7925315324262588, "grad_norm": 0.4200986146469633, "learning_rate": 1.086547530992475e-06, "loss": 0.0089, "step": 3990 }, { "epoch": 0.7927301618830073, "grad_norm": 0.6691123935864665, "learning_rate": 1.0845457173068858e-06, "loss": 0.0151, "step": 3991 }, { "epoch": 0.7929287913397557, "grad_norm": 0.3305478813634996, "learning_rate": 1.0825455250069921e-06, "loss": 0.005, "step": 3992 }, { "epoch": 0.7931274207965041, "grad_norm": 0.33748170186551923, "learning_rate": 1.080546954921075e-06, "loss": 0.0062, "step": 3993 }, { "epoch": 0.7933260502532525, "grad_norm": 0.3245694682203549, "learning_rate": 1.0785500078767392e-06, "loss": 0.0057, "step": 3994 }, { "epoch": 0.793524679710001, "grad_norm": 0.4644644236708028, "learning_rate": 1.076554684700916e-06, "loss": 0.0089, "step": 3995 }, { "epoch": 0.7937233091667494, "grad_norm": 0.5038171184311694, "learning_rate": 1.0745609862198692e-06, "loss": 0.0116, "step": 3996 }, { "epoch": 0.7939219386234979, "grad_norm": 0.6885486523828257, "learning_rate": 1.0725689132591888e-06, "loss": 0.0163, "step": 3997 }, { "epoch": 0.7941205680802463, "grad_norm": 0.3900499706842763, "learning_rate": 1.0705784666437863e-06, "loss": 0.011, "step": 3998 }, { "epoch": 0.7943191975369948, "grad_norm": 0.39376689041252166, "learning_rate": 1.0685896471979074e-06, "loss": 0.0072, "step": 3999 }, { "epoch": 0.7945178269937432, "grad_norm": 0.472464666055705, "learning_rate": 1.066602455745117e-06, "loss": 0.0113, "step": 4000 }, { "epoch": 0.7947164564504916, "grad_norm": 0.7011028914294587, "learning_rate": 1.0646168931083123e-06, "loss": 0.0159, "step": 4001 }, { "epoch": 0.79491508590724, "grad_norm": 0.5949751520146886, "learning_rate": 1.0626329601097102e-06, "loss": 0.0188, "step": 4002 }, { "epoch": 0.7951137153639884, "grad_norm": 0.5403625268434289, "learning_rate": 1.0606506575708559e-06, "loss": 0.0153, "step": 4003 }, { "epoch": 0.7953123448207369, "grad_norm": 0.32460874730935, "learning_rate": 1.0586699863126205e-06, "loss": 0.0074, "step": 4004 }, { "epoch": 0.7955109742774854, "grad_norm": 0.5225527629713734, "learning_rate": 1.0566909471551956e-06, "loss": 0.011, "step": 4005 }, { "epoch": 0.7957096037342338, "grad_norm": 0.23452137300486403, "learning_rate": 1.054713540918102e-06, "loss": 0.006, "step": 4006 }, { "epoch": 0.7959082331909823, "grad_norm": 0.9970244401372391, "learning_rate": 1.0527377684201788e-06, "loss": 0.0139, "step": 4007 }, { "epoch": 0.7961068626477307, "grad_norm": 0.5496516753977624, "learning_rate": 1.0507636304795942e-06, "loss": 0.0113, "step": 4008 }, { "epoch": 0.7963054921044791, "grad_norm": 0.40426703473069714, "learning_rate": 1.0487911279138341e-06, "loss": 0.0115, "step": 4009 }, { "epoch": 0.7965041215612275, "grad_norm": 0.4432391943593047, "learning_rate": 1.046820261539711e-06, "loss": 0.0087, "step": 4010 }, { "epoch": 0.7967027510179759, "grad_norm": 0.5434341498818477, "learning_rate": 1.04485103217336e-06, "loss": 0.0133, "step": 4011 }, { "epoch": 0.7969013804747244, "grad_norm": 0.6869762130705815, "learning_rate": 1.0428834406302345e-06, "loss": 0.0172, "step": 4012 }, { "epoch": 0.7971000099314728, "grad_norm": 0.3868532857982542, "learning_rate": 1.040917487725115e-06, "loss": 0.0107, "step": 4013 }, { "epoch": 0.7972986393882213, "grad_norm": 0.7528663077873841, "learning_rate": 1.0389531742720976e-06, "loss": 0.008, "step": 4014 }, { "epoch": 0.7974972688449697, "grad_norm": 0.47927375494835317, "learning_rate": 1.0369905010846054e-06, "loss": 0.0083, "step": 4015 }, { "epoch": 0.7976958983017182, "grad_norm": 0.4350659995552571, "learning_rate": 1.0350294689753764e-06, "loss": 0.0088, "step": 4016 }, { "epoch": 0.7978945277584666, "grad_norm": 0.295174664809995, "learning_rate": 1.0330700787564756e-06, "loss": 0.0045, "step": 4017 }, { "epoch": 0.798093157215215, "grad_norm": 0.31793807650251443, "learning_rate": 1.0311123312392823e-06, "loss": 0.0099, "step": 4018 }, { "epoch": 0.7982917866719634, "grad_norm": 0.23482176137295935, "learning_rate": 1.0291562272344968e-06, "loss": 0.0062, "step": 4019 }, { "epoch": 0.7984904161287119, "grad_norm": 0.5061544706704849, "learning_rate": 1.0272017675521423e-06, "loss": 0.0089, "step": 4020 }, { "epoch": 0.7986890455854603, "grad_norm": 0.31649122670119934, "learning_rate": 1.0252489530015564e-06, "loss": 0.0064, "step": 4021 }, { "epoch": 0.7988876750422088, "grad_norm": 0.7586656461806361, "learning_rate": 1.0232977843913983e-06, "loss": 0.0149, "step": 4022 }, { "epoch": 0.7990863044989572, "grad_norm": 0.6009260380452704, "learning_rate": 1.0213482625296468e-06, "loss": 0.0108, "step": 4023 }, { "epoch": 0.7992849339557057, "grad_norm": 0.9958776677403033, "learning_rate": 1.0194003882235943e-06, "loss": 0.0113, "step": 4024 }, { "epoch": 0.7994835634124541, "grad_norm": 0.5106754309922006, "learning_rate": 1.0174541622798556e-06, "loss": 0.0092, "step": 4025 }, { "epoch": 0.7996821928692025, "grad_norm": 0.5365858874211565, "learning_rate": 1.015509585504359e-06, "loss": 0.0112, "step": 4026 }, { "epoch": 0.7998808223259509, "grad_norm": 0.5783336315909131, "learning_rate": 1.0135666587023545e-06, "loss": 0.012, "step": 4027 }, { "epoch": 0.8000794517826993, "grad_norm": 0.7373444717399422, "learning_rate": 1.0116253826784028e-06, "loss": 0.0112, "step": 4028 }, { "epoch": 0.8002780812394478, "grad_norm": 0.23832355380940837, "learning_rate": 1.0096857582363862e-06, "loss": 0.0037, "step": 4029 }, { "epoch": 0.8004767106961962, "grad_norm": 0.4162968251704296, "learning_rate": 1.0077477861795026e-06, "loss": 0.0098, "step": 4030 }, { "epoch": 0.8006753401529447, "grad_norm": 0.2707466456827671, "learning_rate": 1.0058114673102616e-06, "loss": 0.0055, "step": 4031 }, { "epoch": 0.8008739696096931, "grad_norm": 0.6216931576856428, "learning_rate": 1.0038768024304928e-06, "loss": 0.0139, "step": 4032 }, { "epoch": 0.8010725990664416, "grad_norm": 0.5832019943429834, "learning_rate": 1.0019437923413373e-06, "loss": 0.0149, "step": 4033 }, { "epoch": 0.80127122852319, "grad_norm": 0.6152187266424713, "learning_rate": 1.0000124378432553e-06, "loss": 0.0086, "step": 4034 }, { "epoch": 0.8014698579799384, "grad_norm": 0.3619901915947861, "learning_rate": 9.980827397360155e-07, "loss": 0.0082, "step": 4035 }, { "epoch": 0.8016684874366868, "grad_norm": 0.3343601589064828, "learning_rate": 9.961546988187055e-07, "loss": 0.0083, "step": 4036 }, { "epoch": 0.8018671168934353, "grad_norm": 0.5999940967835934, "learning_rate": 9.942283158897264e-07, "loss": 0.0149, "step": 4037 }, { "epoch": 0.8020657463501837, "grad_norm": 0.7931795315952762, "learning_rate": 9.923035917467887e-07, "loss": 0.0166, "step": 4038 }, { "epoch": 0.8022643758069322, "grad_norm": 0.3931590426635836, "learning_rate": 9.90380527186922e-07, "loss": 0.0076, "step": 4039 }, { "epoch": 0.8024630052636806, "grad_norm": 0.4836490308516654, "learning_rate": 9.884591230064622e-07, "loss": 0.0098, "step": 4040 }, { "epoch": 0.8026616347204291, "grad_norm": 0.473992042830245, "learning_rate": 9.865393800010636e-07, "loss": 0.011, "step": 4041 }, { "epoch": 0.8028602641771775, "grad_norm": 0.29184782245234137, "learning_rate": 9.84621298965689e-07, "loss": 0.0083, "step": 4042 }, { "epoch": 0.8030588936339259, "grad_norm": 0.642632073832344, "learning_rate": 9.827048806946115e-07, "loss": 0.0094, "step": 4043 }, { "epoch": 0.8032575230906743, "grad_norm": 0.3687910450246707, "learning_rate": 9.807901259814211e-07, "loss": 0.0072, "step": 4044 }, { "epoch": 0.8034561525474228, "grad_norm": 0.37471175625878944, "learning_rate": 9.788770356190137e-07, "loss": 0.0097, "step": 4045 }, { "epoch": 0.8036547820041712, "grad_norm": 0.357576240436411, "learning_rate": 9.76965610399599e-07, "loss": 0.0065, "step": 4046 }, { "epoch": 0.8038534114609197, "grad_norm": 0.6214890939496734, "learning_rate": 9.750558511146974e-07, "loss": 0.0069, "step": 4047 }, { "epoch": 0.8040520409176681, "grad_norm": 1.1538127906781666, "learning_rate": 9.731477585551357e-07, "loss": 0.0152, "step": 4048 }, { "epoch": 0.8042506703744166, "grad_norm": 0.45686551117881485, "learning_rate": 9.71241333511056e-07, "loss": 0.0082, "step": 4049 }, { "epoch": 0.804449299831165, "grad_norm": 0.3975400079457581, "learning_rate": 9.693365767719044e-07, "loss": 0.0087, "step": 4050 }, { "epoch": 0.8046479292879134, "grad_norm": 0.7165138344226095, "learning_rate": 9.674334891264414e-07, "loss": 0.0122, "step": 4051 }, { "epoch": 0.8048465587446618, "grad_norm": 0.5277203363825931, "learning_rate": 9.65532071362731e-07, "loss": 0.009, "step": 4052 }, { "epoch": 0.8050451882014102, "grad_norm": 0.6225019351917528, "learning_rate": 9.63632324268149e-07, "loss": 0.0198, "step": 4053 }, { "epoch": 0.8052438176581587, "grad_norm": 0.4903031415168761, "learning_rate": 9.617342486293812e-07, "loss": 0.0069, "step": 4054 }, { "epoch": 0.8054424471149071, "grad_norm": 0.5308344689616842, "learning_rate": 9.59837845232416e-07, "loss": 0.0107, "step": 4055 }, { "epoch": 0.8056410765716556, "grad_norm": 0.7708014862830759, "learning_rate": 9.57943114862554e-07, "loss": 0.0099, "step": 4056 }, { "epoch": 0.805839706028404, "grad_norm": 1.0172527611136573, "learning_rate": 9.560500583043986e-07, "loss": 0.0087, "step": 4057 }, { "epoch": 0.8060383354851525, "grad_norm": 0.3705945382686297, "learning_rate": 9.541586763418664e-07, "loss": 0.0122, "step": 4058 }, { "epoch": 0.8062369649419009, "grad_norm": 0.7158602396036523, "learning_rate": 9.522689697581733e-07, "loss": 0.0117, "step": 4059 }, { "epoch": 0.8064355943986493, "grad_norm": 0.5429010252706425, "learning_rate": 9.50380939335846e-07, "loss": 0.014, "step": 4060 }, { "epoch": 0.8066342238553977, "grad_norm": 0.6956019948957465, "learning_rate": 9.48494585856718e-07, "loss": 0.0223, "step": 4061 }, { "epoch": 0.8068328533121462, "grad_norm": 0.8506079494222205, "learning_rate": 9.466099101019233e-07, "loss": 0.0063, "step": 4062 }, { "epoch": 0.8070314827688946, "grad_norm": 0.42075834755040786, "learning_rate": 9.447269128519065e-07, "loss": 0.0076, "step": 4063 }, { "epoch": 0.8072301122256431, "grad_norm": 0.595418453912562, "learning_rate": 9.428455948864134e-07, "loss": 0.0117, "step": 4064 }, { "epoch": 0.8074287416823915, "grad_norm": 0.6130960282660696, "learning_rate": 9.409659569844975e-07, "loss": 0.0107, "step": 4065 }, { "epoch": 0.80762737113914, "grad_norm": 1.5118482480538398, "learning_rate": 9.390879999245139e-07, "loss": 0.0109, "step": 4066 }, { "epoch": 0.8078260005958884, "grad_norm": 0.40907353954507686, "learning_rate": 9.372117244841216e-07, "loss": 0.0077, "step": 4067 }, { "epoch": 0.8080246300526368, "grad_norm": 0.6408271348047319, "learning_rate": 9.353371314402871e-07, "loss": 0.0112, "step": 4068 }, { "epoch": 0.8082232595093852, "grad_norm": 0.4828591067512512, "learning_rate": 9.334642215692746e-07, "loss": 0.0095, "step": 4069 }, { "epoch": 0.8084218889661337, "grad_norm": 0.5431520041134413, "learning_rate": 9.315929956466568e-07, "loss": 0.0139, "step": 4070 }, { "epoch": 0.8086205184228821, "grad_norm": 0.6310284789435388, "learning_rate": 9.297234544473044e-07, "loss": 0.0113, "step": 4071 }, { "epoch": 0.8088191478796305, "grad_norm": 0.5081618596537647, "learning_rate": 9.27855598745393e-07, "loss": 0.0099, "step": 4072 }, { "epoch": 0.809017777336379, "grad_norm": 0.44491804248735184, "learning_rate": 9.259894293144017e-07, "loss": 0.0101, "step": 4073 }, { "epoch": 0.8092164067931275, "grad_norm": 0.46738266736467293, "learning_rate": 9.241249469271068e-07, "loss": 0.0103, "step": 4074 }, { "epoch": 0.8094150362498759, "grad_norm": 0.5209162826736811, "learning_rate": 9.222621523555908e-07, "loss": 0.0065, "step": 4075 }, { "epoch": 0.8096136657066243, "grad_norm": 0.3940989383722829, "learning_rate": 9.204010463712326e-07, "loss": 0.0087, "step": 4076 }, { "epoch": 0.8098122951633727, "grad_norm": 0.4123877332825086, "learning_rate": 9.18541629744717e-07, "loss": 0.0093, "step": 4077 }, { "epoch": 0.8100109246201211, "grad_norm": 0.6265895051388984, "learning_rate": 9.16683903246024e-07, "loss": 0.0155, "step": 4078 }, { "epoch": 0.8102095540768696, "grad_norm": 0.5490533621387226, "learning_rate": 9.148278676444372e-07, "loss": 0.0067, "step": 4079 }, { "epoch": 0.810408183533618, "grad_norm": 0.3131848581841642, "learning_rate": 9.129735237085408e-07, "loss": 0.0049, "step": 4080 }, { "epoch": 0.8106068129903665, "grad_norm": 0.7500483326690944, "learning_rate": 9.111208722062143e-07, "loss": 0.0113, "step": 4081 }, { "epoch": 0.8108054424471149, "grad_norm": 0.5484117682655739, "learning_rate": 9.092699139046413e-07, "loss": 0.011, "step": 4082 }, { "epoch": 0.8110040719038634, "grad_norm": 0.3098865334339739, "learning_rate": 9.074206495702992e-07, "loss": 0.0086, "step": 4083 }, { "epoch": 0.8112027013606118, "grad_norm": 0.17468366708042246, "learning_rate": 9.055730799689688e-07, "loss": 0.0028, "step": 4084 }, { "epoch": 0.8114013308173602, "grad_norm": 0.538338960986114, "learning_rate": 9.037272058657242e-07, "loss": 0.0105, "step": 4085 }, { "epoch": 0.8115999602741086, "grad_norm": 0.4201409900907841, "learning_rate": 9.018830280249419e-07, "loss": 0.0063, "step": 4086 }, { "epoch": 0.8117985897308571, "grad_norm": 0.5876635452613788, "learning_rate": 9.000405472102946e-07, "loss": 0.0126, "step": 4087 }, { "epoch": 0.8119972191876055, "grad_norm": 0.5156736720046141, "learning_rate": 8.981997641847501e-07, "loss": 0.0128, "step": 4088 }, { "epoch": 0.812195848644354, "grad_norm": 0.5052009076325246, "learning_rate": 8.963606797105767e-07, "loss": 0.015, "step": 4089 }, { "epoch": 0.8123944781011024, "grad_norm": 0.5430853287081518, "learning_rate": 8.94523294549336e-07, "loss": 0.01, "step": 4090 }, { "epoch": 0.8125931075578509, "grad_norm": 0.44810523621001547, "learning_rate": 8.92687609461887e-07, "loss": 0.0098, "step": 4091 }, { "epoch": 0.8127917370145993, "grad_norm": 0.5134814399572584, "learning_rate": 8.908536252083865e-07, "loss": 0.0056, "step": 4092 }, { "epoch": 0.8129903664713477, "grad_norm": 0.46375774276643456, "learning_rate": 8.890213425482841e-07, "loss": 0.0089, "step": 4093 }, { "epoch": 0.8131889959280961, "grad_norm": 1.1162249657192767, "learning_rate": 8.871907622403275e-07, "loss": 0.0214, "step": 4094 }, { "epoch": 0.8133876253848445, "grad_norm": 0.42917047757228455, "learning_rate": 8.853618850425572e-07, "loss": 0.006, "step": 4095 }, { "epoch": 0.813586254841593, "grad_norm": 0.5208590114708673, "learning_rate": 8.835347117123089e-07, "loss": 0.0161, "step": 4096 }, { "epoch": 0.8137848842983414, "grad_norm": 0.5308424443047222, "learning_rate": 8.817092430062158e-07, "loss": 0.0161, "step": 4097 }, { "epoch": 0.8139835137550899, "grad_norm": 0.46028366709106316, "learning_rate": 8.798854796801997e-07, "loss": 0.0097, "step": 4098 }, { "epoch": 0.8141821432118383, "grad_norm": 0.448761200240888, "learning_rate": 8.780634224894818e-07, "loss": 0.0128, "step": 4099 }, { "epoch": 0.8143807726685868, "grad_norm": 0.3671637643734095, "learning_rate": 8.762430721885717e-07, "loss": 0.0073, "step": 4100 }, { "epoch": 0.8145794021253352, "grad_norm": 0.7115867455847144, "learning_rate": 8.74424429531277e-07, "loss": 0.0087, "step": 4101 }, { "epoch": 0.8147780315820836, "grad_norm": 0.42494740814520693, "learning_rate": 8.726074952706931e-07, "loss": 0.0092, "step": 4102 }, { "epoch": 0.814976661038832, "grad_norm": 0.7513309884495961, "learning_rate": 8.707922701592126e-07, "loss": 0.008, "step": 4103 }, { "epoch": 0.8151752904955805, "grad_norm": 1.1383897819360642, "learning_rate": 8.689787549485185e-07, "loss": 0.0138, "step": 4104 }, { "epoch": 0.8153739199523289, "grad_norm": 0.6557470447168657, "learning_rate": 8.671669503895841e-07, "loss": 0.0198, "step": 4105 }, { "epoch": 0.8155725494090774, "grad_norm": 0.6581283241574932, "learning_rate": 8.653568572326781e-07, "loss": 0.0148, "step": 4106 }, { "epoch": 0.8157711788658258, "grad_norm": 0.417139009113113, "learning_rate": 8.635484762273561e-07, "loss": 0.0123, "step": 4107 }, { "epoch": 0.8159698083225743, "grad_norm": 0.6867991303871931, "learning_rate": 8.617418081224682e-07, "loss": 0.0168, "step": 4108 }, { "epoch": 0.8161684377793227, "grad_norm": 1.0724616229540893, "learning_rate": 8.599368536661528e-07, "loss": 0.0131, "step": 4109 }, { "epoch": 0.8163670672360711, "grad_norm": 0.5087955548304802, "learning_rate": 8.581336136058405e-07, "loss": 0.0137, "step": 4110 }, { "epoch": 0.8165656966928195, "grad_norm": 0.421017074957917, "learning_rate": 8.563320886882514e-07, "loss": 0.0154, "step": 4111 }, { "epoch": 0.816764326149568, "grad_norm": 0.39496073798864595, "learning_rate": 8.545322796593941e-07, "loss": 0.0157, "step": 4112 }, { "epoch": 0.8169629556063164, "grad_norm": 0.4997123669546495, "learning_rate": 8.527341872645711e-07, "loss": 0.0143, "step": 4113 }, { "epoch": 0.8171615850630648, "grad_norm": 0.37415956728088906, "learning_rate": 8.509378122483652e-07, "loss": 0.0102, "step": 4114 }, { "epoch": 0.8173602145198133, "grad_norm": 0.383596392690376, "learning_rate": 8.491431553546564e-07, "loss": 0.0054, "step": 4115 }, { "epoch": 0.8175588439765618, "grad_norm": 0.4821601487830514, "learning_rate": 8.473502173266113e-07, "loss": 0.0087, "step": 4116 }, { "epoch": 0.8177574734333102, "grad_norm": 0.47018873094016517, "learning_rate": 8.455589989066815e-07, "loss": 0.0102, "step": 4117 }, { "epoch": 0.8179561028900586, "grad_norm": 0.2810129004401643, "learning_rate": 8.437695008366115e-07, "loss": 0.0077, "step": 4118 }, { "epoch": 0.818154732346807, "grad_norm": 0.5366359137320363, "learning_rate": 8.419817238574273e-07, "loss": 0.0083, "step": 4119 }, { "epoch": 0.8183533618035554, "grad_norm": 0.316839455464566, "learning_rate": 8.401956687094487e-07, "loss": 0.006, "step": 4120 }, { "epoch": 0.8185519912603039, "grad_norm": 0.3111926776617216, "learning_rate": 8.384113361322765e-07, "loss": 0.0086, "step": 4121 }, { "epoch": 0.8187506207170523, "grad_norm": 0.33471856971083414, "learning_rate": 8.366287268648027e-07, "loss": 0.0079, "step": 4122 }, { "epoch": 0.8189492501738008, "grad_norm": 0.4351278758006974, "learning_rate": 8.348478416452049e-07, "loss": 0.0078, "step": 4123 }, { "epoch": 0.8191478796305492, "grad_norm": 0.1570818636838516, "learning_rate": 8.330686812109439e-07, "loss": 0.0027, "step": 4124 }, { "epoch": 0.8193465090872977, "grad_norm": 0.5523549117457559, "learning_rate": 8.312912462987699e-07, "loss": 0.013, "step": 4125 }, { "epoch": 0.8195451385440461, "grad_norm": 0.3214475082205346, "learning_rate": 8.295155376447151e-07, "loss": 0.0087, "step": 4126 }, { "epoch": 0.8197437680007945, "grad_norm": 0.29871131133299855, "learning_rate": 8.277415559841012e-07, "loss": 0.0064, "step": 4127 }, { "epoch": 0.8199423974575429, "grad_norm": 0.9678390162363811, "learning_rate": 8.259693020515292e-07, "loss": 0.0096, "step": 4128 }, { "epoch": 0.8201410269142914, "grad_norm": 0.460127250128567, "learning_rate": 8.241987765808896e-07, "loss": 0.0128, "step": 4129 }, { "epoch": 0.8203396563710398, "grad_norm": 0.6057853897482152, "learning_rate": 8.224299803053559e-07, "loss": 0.0088, "step": 4130 }, { "epoch": 0.8205382858277883, "grad_norm": 0.5206279732652137, "learning_rate": 8.206629139573824e-07, "loss": 0.008, "step": 4131 }, { "epoch": 0.8207369152845367, "grad_norm": 0.22610796187469387, "learning_rate": 8.188975782687125e-07, "loss": 0.0035, "step": 4132 }, { "epoch": 0.8209355447412852, "grad_norm": 0.5861117727426646, "learning_rate": 8.171339739703671e-07, "loss": 0.0154, "step": 4133 }, { "epoch": 0.8211341741980336, "grad_norm": 0.5374366990301481, "learning_rate": 8.153721017926552e-07, "loss": 0.0129, "step": 4134 }, { "epoch": 0.821332803654782, "grad_norm": 0.4143698403269354, "learning_rate": 8.136119624651645e-07, "loss": 0.0114, "step": 4135 }, { "epoch": 0.8215314331115304, "grad_norm": 0.37663933807526523, "learning_rate": 8.118535567167673e-07, "loss": 0.0067, "step": 4136 }, { "epoch": 0.8217300625682789, "grad_norm": 0.9518297277488379, "learning_rate": 8.100968852756208e-07, "loss": 0.0155, "step": 4137 }, { "epoch": 0.8219286920250273, "grad_norm": 0.4973523714635241, "learning_rate": 8.083419488691563e-07, "loss": 0.0108, "step": 4138 }, { "epoch": 0.8221273214817757, "grad_norm": 0.4128944945882541, "learning_rate": 8.065887482240925e-07, "loss": 0.0105, "step": 4139 }, { "epoch": 0.8223259509385242, "grad_norm": 0.43279127906825904, "learning_rate": 8.048372840664298e-07, "loss": 0.0086, "step": 4140 }, { "epoch": 0.8225245803952727, "grad_norm": 0.5404311137276034, "learning_rate": 8.030875571214458e-07, "loss": 0.0118, "step": 4141 }, { "epoch": 0.8227232098520211, "grad_norm": 0.6347070223716965, "learning_rate": 8.013395681137027e-07, "loss": 0.0154, "step": 4142 }, { "epoch": 0.8229218393087695, "grad_norm": 0.44237408691615476, "learning_rate": 7.995933177670385e-07, "loss": 0.0108, "step": 4143 }, { "epoch": 0.8231204687655179, "grad_norm": 0.6010470786401767, "learning_rate": 7.978488068045764e-07, "loss": 0.015, "step": 4144 }, { "epoch": 0.8233190982222663, "grad_norm": 0.7701565584935676, "learning_rate": 7.961060359487138e-07, "loss": 0.0119, "step": 4145 }, { "epoch": 0.8235177276790148, "grad_norm": 0.4310605835446213, "learning_rate": 7.943650059211322e-07, "loss": 0.0081, "step": 4146 }, { "epoch": 0.8237163571357632, "grad_norm": 0.590740099431193, "learning_rate": 7.92625717442791e-07, "loss": 0.0167, "step": 4147 }, { "epoch": 0.8239149865925117, "grad_norm": 0.9726708073139096, "learning_rate": 7.908881712339256e-07, "loss": 0.0189, "step": 4148 }, { "epoch": 0.8241136160492601, "grad_norm": 0.4053559892400188, "learning_rate": 7.891523680140545e-07, "loss": 0.007, "step": 4149 }, { "epoch": 0.8243122455060086, "grad_norm": 0.2627943863024738, "learning_rate": 7.874183085019698e-07, "loss": 0.0075, "step": 4150 }, { "epoch": 0.824510874962757, "grad_norm": 0.6733876425443319, "learning_rate": 7.856859934157463e-07, "loss": 0.0113, "step": 4151 }, { "epoch": 0.8247095044195054, "grad_norm": 0.42436341922801024, "learning_rate": 7.839554234727309e-07, "loss": 0.0105, "step": 4152 }, { "epoch": 0.8249081338762538, "grad_norm": 0.6937300129824238, "learning_rate": 7.822265993895533e-07, "loss": 0.01, "step": 4153 }, { "epoch": 0.8251067633330023, "grad_norm": 0.3959765854010693, "learning_rate": 7.804995218821182e-07, "loss": 0.0069, "step": 4154 }, { "epoch": 0.8253053927897507, "grad_norm": 0.5846032872762472, "learning_rate": 7.787741916656038e-07, "loss": 0.0091, "step": 4155 }, { "epoch": 0.8255040222464991, "grad_norm": 0.6565011551242094, "learning_rate": 7.770506094544711e-07, "loss": 0.0107, "step": 4156 }, { "epoch": 0.8257026517032476, "grad_norm": 0.612137058681032, "learning_rate": 7.753287759624506e-07, "loss": 0.0123, "step": 4157 }, { "epoch": 0.8259012811599961, "grad_norm": 0.5434012901407049, "learning_rate": 7.736086919025549e-07, "loss": 0.0098, "step": 4158 }, { "epoch": 0.8260999106167445, "grad_norm": 0.45931089115946994, "learning_rate": 7.718903579870656e-07, "loss": 0.0132, "step": 4159 }, { "epoch": 0.8262985400734929, "grad_norm": 0.3361927264054978, "learning_rate": 7.701737749275457e-07, "loss": 0.0074, "step": 4160 }, { "epoch": 0.8264971695302413, "grad_norm": 0.5681567318696761, "learning_rate": 7.684589434348316e-07, "loss": 0.0092, "step": 4161 }, { "epoch": 0.8266957989869897, "grad_norm": 0.5567095462260514, "learning_rate": 7.66745864219029e-07, "loss": 0.0152, "step": 4162 }, { "epoch": 0.8268944284437382, "grad_norm": 0.6126892428547912, "learning_rate": 7.650345379895263e-07, "loss": 0.0118, "step": 4163 }, { "epoch": 0.8270930579004866, "grad_norm": 0.8912645228011347, "learning_rate": 7.63324965454979e-07, "loss": 0.0145, "step": 4164 }, { "epoch": 0.8272916873572351, "grad_norm": 0.5553683620343327, "learning_rate": 7.616171473233208e-07, "loss": 0.0127, "step": 4165 }, { "epoch": 0.8274903168139836, "grad_norm": 0.5464361540926801, "learning_rate": 7.599110843017588e-07, "loss": 0.0196, "step": 4166 }, { "epoch": 0.827688946270732, "grad_norm": 0.7188318229017358, "learning_rate": 7.582067770967694e-07, "loss": 0.0091, "step": 4167 }, { "epoch": 0.8278875757274804, "grad_norm": 0.6648555543168041, "learning_rate": 7.565042264141071e-07, "loss": 0.0161, "step": 4168 }, { "epoch": 0.8280862051842288, "grad_norm": 0.5054813315229681, "learning_rate": 7.548034329587934e-07, "loss": 0.0143, "step": 4169 }, { "epoch": 0.8282848346409772, "grad_norm": 0.3704689407540544, "learning_rate": 7.531043974351282e-07, "loss": 0.0096, "step": 4170 }, { "epoch": 0.8284834640977257, "grad_norm": 0.5217578074725935, "learning_rate": 7.514071205466783e-07, "loss": 0.0092, "step": 4171 }, { "epoch": 0.8286820935544741, "grad_norm": 0.4232794133302012, "learning_rate": 7.497116029962848e-07, "loss": 0.0113, "step": 4172 }, { "epoch": 0.8288807230112225, "grad_norm": 0.5154993721621637, "learning_rate": 7.480178454860615e-07, "loss": 0.0137, "step": 4173 }, { "epoch": 0.829079352467971, "grad_norm": 0.7002762553322276, "learning_rate": 7.463258487173891e-07, "loss": 0.0122, "step": 4174 }, { "epoch": 0.8292779819247195, "grad_norm": 0.2849167502483706, "learning_rate": 7.446356133909244e-07, "loss": 0.0053, "step": 4175 }, { "epoch": 0.8294766113814679, "grad_norm": 0.3481357192610811, "learning_rate": 7.429471402065891e-07, "loss": 0.0057, "step": 4176 }, { "epoch": 0.8296752408382163, "grad_norm": 0.5261029513879698, "learning_rate": 7.412604298635817e-07, "loss": 0.0142, "step": 4177 }, { "epoch": 0.8298738702949647, "grad_norm": 0.43848638143307833, "learning_rate": 7.395754830603636e-07, "loss": 0.0083, "step": 4178 }, { "epoch": 0.8300724997517132, "grad_norm": 0.47296015769234956, "learning_rate": 7.37892300494672e-07, "loss": 0.0129, "step": 4179 }, { "epoch": 0.8302711292084616, "grad_norm": 0.2953837424429043, "learning_rate": 7.362108828635117e-07, "loss": 0.0095, "step": 4180 }, { "epoch": 0.83046975866521, "grad_norm": 0.3474888761012372, "learning_rate": 7.345312308631536e-07, "loss": 0.0084, "step": 4181 }, { "epoch": 0.8306683881219585, "grad_norm": 0.4370889556934582, "learning_rate": 7.328533451891423e-07, "loss": 0.011, "step": 4182 }, { "epoch": 0.830867017578707, "grad_norm": 0.4255737427042872, "learning_rate": 7.311772265362866e-07, "loss": 0.0133, "step": 4183 }, { "epoch": 0.8310656470354554, "grad_norm": 0.3946025875468651, "learning_rate": 7.295028755986678e-07, "loss": 0.0093, "step": 4184 }, { "epoch": 0.8312642764922038, "grad_norm": 0.40631146235195387, "learning_rate": 7.278302930696312e-07, "loss": 0.0105, "step": 4185 }, { "epoch": 0.8314629059489522, "grad_norm": 0.37683409726127903, "learning_rate": 7.261594796417915e-07, "loss": 0.016, "step": 4186 }, { "epoch": 0.8316615354057006, "grad_norm": 0.5759629439702126, "learning_rate": 7.244904360070321e-07, "loss": 0.0093, "step": 4187 }, { "epoch": 0.8318601648624491, "grad_norm": 0.4095970190194349, "learning_rate": 7.228231628565003e-07, "loss": 0.0089, "step": 4188 }, { "epoch": 0.8320587943191975, "grad_norm": 0.8007934056507748, "learning_rate": 7.211576608806132e-07, "loss": 0.0129, "step": 4189 }, { "epoch": 0.832257423775946, "grad_norm": 0.4194690457518919, "learning_rate": 7.194939307690557e-07, "loss": 0.0088, "step": 4190 }, { "epoch": 0.8324560532326944, "grad_norm": 0.4443434778514458, "learning_rate": 7.17831973210773e-07, "loss": 0.0093, "step": 4191 }, { "epoch": 0.8326546826894429, "grad_norm": 0.3928978159389848, "learning_rate": 7.161717888939834e-07, "loss": 0.0095, "step": 4192 }, { "epoch": 0.8328533121461913, "grad_norm": 0.431478336303014, "learning_rate": 7.145133785061648e-07, "loss": 0.0105, "step": 4193 }, { "epoch": 0.8330519416029397, "grad_norm": 0.5225194392334359, "learning_rate": 7.12856742734066e-07, "loss": 0.0196, "step": 4194 }, { "epoch": 0.8332505710596881, "grad_norm": 0.36971643133265614, "learning_rate": 7.112018822636951e-07, "loss": 0.0079, "step": 4195 }, { "epoch": 0.8334492005164366, "grad_norm": 0.6234502273500208, "learning_rate": 7.095487977803306e-07, "loss": 0.0126, "step": 4196 }, { "epoch": 0.833647829973185, "grad_norm": 0.4720027613289749, "learning_rate": 7.078974899685132e-07, "loss": 0.0145, "step": 4197 }, { "epoch": 0.8338464594299334, "grad_norm": 0.42761253917678405, "learning_rate": 7.062479595120458e-07, "loss": 0.0095, "step": 4198 }, { "epoch": 0.8340450888866819, "grad_norm": 0.2903106453517186, "learning_rate": 7.046002070939995e-07, "loss": 0.0076, "step": 4199 }, { "epoch": 0.8342437183434304, "grad_norm": 0.5331366190524331, "learning_rate": 7.029542333967049e-07, "loss": 0.0138, "step": 4200 }, { "epoch": 0.8344423478001788, "grad_norm": 0.3941782778497735, "learning_rate": 7.013100391017602e-07, "loss": 0.0053, "step": 4201 }, { "epoch": 0.8346409772569272, "grad_norm": 0.5155012599849433, "learning_rate": 6.996676248900219e-07, "loss": 0.0119, "step": 4202 }, { "epoch": 0.8348396067136756, "grad_norm": 0.6437821506076101, "learning_rate": 6.980269914416144e-07, "loss": 0.0117, "step": 4203 }, { "epoch": 0.835038236170424, "grad_norm": 0.24442114698882744, "learning_rate": 6.963881394359223e-07, "loss": 0.004, "step": 4204 }, { "epoch": 0.8352368656271725, "grad_norm": 0.9459746769962994, "learning_rate": 6.947510695515913e-07, "loss": 0.0263, "step": 4205 }, { "epoch": 0.8354354950839209, "grad_norm": 0.2833211575993794, "learning_rate": 6.931157824665319e-07, "loss": 0.0066, "step": 4206 }, { "epoch": 0.8356341245406694, "grad_norm": 0.4356603803658812, "learning_rate": 6.914822788579123e-07, "loss": 0.01, "step": 4207 }, { "epoch": 0.8358327539974179, "grad_norm": 0.5338876349294479, "learning_rate": 6.898505594021681e-07, "loss": 0.0127, "step": 4208 }, { "epoch": 0.8360313834541663, "grad_norm": 0.5494695551352939, "learning_rate": 6.882206247749907e-07, "loss": 0.012, "step": 4209 }, { "epoch": 0.8362300129109147, "grad_norm": 0.6861938545700319, "learning_rate": 6.865924756513336e-07, "loss": 0.0191, "step": 4210 }, { "epoch": 0.8364286423676631, "grad_norm": 0.4886731636680826, "learning_rate": 6.849661127054141e-07, "loss": 0.009, "step": 4211 }, { "epoch": 0.8366272718244115, "grad_norm": 0.3818112246211818, "learning_rate": 6.833415366107049e-07, "loss": 0.0103, "step": 4212 }, { "epoch": 0.83682590128116, "grad_norm": 0.21974960483824155, "learning_rate": 6.817187480399434e-07, "loss": 0.0045, "step": 4213 }, { "epoch": 0.8370245307379084, "grad_norm": 0.46671217892639616, "learning_rate": 6.800977476651232e-07, "loss": 0.0128, "step": 4214 }, { "epoch": 0.8372231601946568, "grad_norm": 0.6528384272178425, "learning_rate": 6.784785361574997e-07, "loss": 0.0114, "step": 4215 }, { "epoch": 0.8374217896514053, "grad_norm": 0.4510583456447262, "learning_rate": 6.768611141875875e-07, "loss": 0.0088, "step": 4216 }, { "epoch": 0.8376204191081538, "grad_norm": 0.40374865267390264, "learning_rate": 6.752454824251575e-07, "loss": 0.011, "step": 4217 }, { "epoch": 0.8378190485649022, "grad_norm": 0.41459767009959575, "learning_rate": 6.736316415392435e-07, "loss": 0.0093, "step": 4218 }, { "epoch": 0.8380176780216506, "grad_norm": 0.2681325117645612, "learning_rate": 6.720195921981332e-07, "loss": 0.0038, "step": 4219 }, { "epoch": 0.838216307478399, "grad_norm": 0.42508478595734384, "learning_rate": 6.704093350693763e-07, "loss": 0.015, "step": 4220 }, { "epoch": 0.8384149369351475, "grad_norm": 0.37595114785371714, "learning_rate": 6.688008708197774e-07, "loss": 0.0073, "step": 4221 }, { "epoch": 0.8386135663918959, "grad_norm": 0.31877225252641367, "learning_rate": 6.671942001154003e-07, "loss": 0.0067, "step": 4222 }, { "epoch": 0.8388121958486443, "grad_norm": 0.46674482672364, "learning_rate": 6.655893236215671e-07, "loss": 0.0135, "step": 4223 }, { "epoch": 0.8390108253053928, "grad_norm": 0.41001893376938964, "learning_rate": 6.639862420028531e-07, "loss": 0.0105, "step": 4224 }, { "epoch": 0.8392094547621413, "grad_norm": 0.7742237052495848, "learning_rate": 6.623849559230955e-07, "loss": 0.0171, "step": 4225 }, { "epoch": 0.8394080842188897, "grad_norm": 1.0107664886273078, "learning_rate": 6.607854660453827e-07, "loss": 0.0148, "step": 4226 }, { "epoch": 0.8396067136756381, "grad_norm": 0.7350900223607449, "learning_rate": 6.591877730320639e-07, "loss": 0.0105, "step": 4227 }, { "epoch": 0.8398053431323865, "grad_norm": 0.6237840242821349, "learning_rate": 6.575918775447404e-07, "loss": 0.0104, "step": 4228 }, { "epoch": 0.840003972589135, "grad_norm": 0.2205986767974159, "learning_rate": 6.559977802442719e-07, "loss": 0.0079, "step": 4229 }, { "epoch": 0.8402026020458834, "grad_norm": 0.391922815119898, "learning_rate": 6.54405481790773e-07, "loss": 0.0071, "step": 4230 }, { "epoch": 0.8404012315026318, "grad_norm": 0.6980785689595316, "learning_rate": 6.52814982843612e-07, "loss": 0.0181, "step": 4231 }, { "epoch": 0.8405998609593803, "grad_norm": 0.8308250329575882, "learning_rate": 6.512262840614137e-07, "loss": 0.0107, "step": 4232 }, { "epoch": 0.8407984904161288, "grad_norm": 0.5525708162928655, "learning_rate": 6.496393861020562e-07, "loss": 0.0099, "step": 4233 }, { "epoch": 0.8409971198728772, "grad_norm": 0.4976875182817474, "learning_rate": 6.480542896226716e-07, "loss": 0.0083, "step": 4234 }, { "epoch": 0.8411957493296256, "grad_norm": 0.34387989751239384, "learning_rate": 6.464709952796482e-07, "loss": 0.0082, "step": 4235 }, { "epoch": 0.841394378786374, "grad_norm": 0.5003515193396486, "learning_rate": 6.448895037286251e-07, "loss": 0.0079, "step": 4236 }, { "epoch": 0.8415930082431224, "grad_norm": 0.476787360201741, "learning_rate": 6.433098156244982e-07, "loss": 0.0098, "step": 4237 }, { "epoch": 0.8417916376998709, "grad_norm": 0.5623162439430519, "learning_rate": 6.417319316214126e-07, "loss": 0.0124, "step": 4238 }, { "epoch": 0.8419902671566193, "grad_norm": 0.368015208338776, "learning_rate": 6.401558523727703e-07, "loss": 0.0098, "step": 4239 }, { "epoch": 0.8421888966133677, "grad_norm": 0.5533902338765675, "learning_rate": 6.385815785312238e-07, "loss": 0.012, "step": 4240 }, { "epoch": 0.8423875260701162, "grad_norm": 0.4726896319372486, "learning_rate": 6.37009110748677e-07, "loss": 0.0139, "step": 4241 }, { "epoch": 0.8425861555268647, "grad_norm": 0.4250515882151428, "learning_rate": 6.354384496762894e-07, "loss": 0.01, "step": 4242 }, { "epoch": 0.8427847849836131, "grad_norm": 0.25361240255528045, "learning_rate": 6.33869595964468e-07, "loss": 0.0069, "step": 4243 }, { "epoch": 0.8429834144403615, "grad_norm": 0.44441454060510527, "learning_rate": 6.323025502628754e-07, "loss": 0.0095, "step": 4244 }, { "epoch": 0.8431820438971099, "grad_norm": 0.599605477417074, "learning_rate": 6.30737313220422e-07, "loss": 0.0091, "step": 4245 }, { "epoch": 0.8433806733538584, "grad_norm": 0.4374505279129274, "learning_rate": 6.291738854852719e-07, "loss": 0.0079, "step": 4246 }, { "epoch": 0.8435793028106068, "grad_norm": 0.7710884713360233, "learning_rate": 6.276122677048396e-07, "loss": 0.0135, "step": 4247 }, { "epoch": 0.8437779322673552, "grad_norm": 0.5753325234304452, "learning_rate": 6.260524605257873e-07, "loss": 0.0148, "step": 4248 }, { "epoch": 0.8439765617241037, "grad_norm": 0.5226453715532203, "learning_rate": 6.244944645940326e-07, "loss": 0.013, "step": 4249 }, { "epoch": 0.8441751911808522, "grad_norm": 0.4733145956448767, "learning_rate": 6.229382805547369e-07, "loss": 0.0142, "step": 4250 }, { "epoch": 0.8443738206376006, "grad_norm": 0.7137863419541726, "learning_rate": 6.213839090523166e-07, "loss": 0.0124, "step": 4251 }, { "epoch": 0.844572450094349, "grad_norm": 0.5052172387349066, "learning_rate": 6.198313507304343e-07, "loss": 0.0214, "step": 4252 }, { "epoch": 0.8447710795510974, "grad_norm": 0.36751253905346604, "learning_rate": 6.182806062320029e-07, "loss": 0.0086, "step": 4253 }, { "epoch": 0.8449697090078458, "grad_norm": 0.6136397561793601, "learning_rate": 6.167316761991854e-07, "loss": 0.0144, "step": 4254 }, { "epoch": 0.8451683384645943, "grad_norm": 1.413194361081298, "learning_rate": 6.151845612733909e-07, "loss": 0.0066, "step": 4255 }, { "epoch": 0.8453669679213427, "grad_norm": 0.38786949863789816, "learning_rate": 6.136392620952791e-07, "loss": 0.0148, "step": 4256 }, { "epoch": 0.8455655973780911, "grad_norm": 0.3696999820971262, "learning_rate": 6.120957793047561e-07, "loss": 0.0065, "step": 4257 }, { "epoch": 0.8457642268348396, "grad_norm": 0.5675751446068034, "learning_rate": 6.105541135409759e-07, "loss": 0.0134, "step": 4258 }, { "epoch": 0.8459628562915881, "grad_norm": 0.49713563656670917, "learning_rate": 6.090142654423425e-07, "loss": 0.0076, "step": 4259 }, { "epoch": 0.8461614857483365, "grad_norm": 0.36177582839219674, "learning_rate": 6.074762356465036e-07, "loss": 0.0101, "step": 4260 }, { "epoch": 0.8463601152050849, "grad_norm": 0.682481331309334, "learning_rate": 6.05940024790358e-07, "loss": 0.0127, "step": 4261 }, { "epoch": 0.8465587446618333, "grad_norm": 0.3292789124528157, "learning_rate": 6.044056335100462e-07, "loss": 0.006, "step": 4262 }, { "epoch": 0.8467573741185818, "grad_norm": 0.64966583910457, "learning_rate": 6.028730624409612e-07, "loss": 0.0116, "step": 4263 }, { "epoch": 0.8469560035753302, "grad_norm": 0.8237702138641505, "learning_rate": 6.013423122177364e-07, "loss": 0.0103, "step": 4264 }, { "epoch": 0.8471546330320786, "grad_norm": 0.39478232550170367, "learning_rate": 5.998133834742553e-07, "loss": 0.0058, "step": 4265 }, { "epoch": 0.8473532624888271, "grad_norm": 0.6707468045773747, "learning_rate": 5.982862768436464e-07, "loss": 0.0093, "step": 4266 }, { "epoch": 0.8475518919455756, "grad_norm": 0.7479799316317778, "learning_rate": 5.967609929582818e-07, "loss": 0.0226, "step": 4267 }, { "epoch": 0.847750521402324, "grad_norm": 0.6860784103609919, "learning_rate": 5.952375324497811e-07, "loss": 0.0106, "step": 4268 }, { "epoch": 0.8479491508590724, "grad_norm": 0.5184502971263298, "learning_rate": 5.937158959490064e-07, "loss": 0.0086, "step": 4269 }, { "epoch": 0.8481477803158208, "grad_norm": 0.39623222920861734, "learning_rate": 5.921960840860674e-07, "loss": 0.0142, "step": 4270 }, { "epoch": 0.8483464097725693, "grad_norm": 2.3172804734227426, "learning_rate": 5.906780974903154e-07, "loss": 0.0167, "step": 4271 }, { "epoch": 0.8485450392293177, "grad_norm": 0.3739822933902876, "learning_rate": 5.891619367903467e-07, "loss": 0.0104, "step": 4272 }, { "epoch": 0.8487436686860661, "grad_norm": 0.3235758648301256, "learning_rate": 5.876476026140043e-07, "loss": 0.0065, "step": 4273 }, { "epoch": 0.8489422981428146, "grad_norm": 0.4318882083611534, "learning_rate": 5.861350955883705e-07, "loss": 0.011, "step": 4274 }, { "epoch": 0.8491409275995631, "grad_norm": 1.0955982519259333, "learning_rate": 5.846244163397741e-07, "loss": 0.0139, "step": 4275 }, { "epoch": 0.8493395570563115, "grad_norm": 0.5712222512112204, "learning_rate": 5.831155654937837e-07, "loss": 0.0149, "step": 4276 }, { "epoch": 0.8495381865130599, "grad_norm": 0.3652696761055196, "learning_rate": 5.816085436752162e-07, "loss": 0.0064, "step": 4277 }, { "epoch": 0.8497368159698083, "grad_norm": 0.7739356696768167, "learning_rate": 5.80103351508125e-07, "loss": 0.0139, "step": 4278 }, { "epoch": 0.8499354454265567, "grad_norm": 0.42583485037192603, "learning_rate": 5.785999896158096e-07, "loss": 0.0092, "step": 4279 }, { "epoch": 0.8501340748833052, "grad_norm": 0.6344045019567613, "learning_rate": 5.770984586208112e-07, "loss": 0.0082, "step": 4280 }, { "epoch": 0.8503327043400536, "grad_norm": 0.6040578317404494, "learning_rate": 5.755987591449125e-07, "loss": 0.0127, "step": 4281 }, { "epoch": 0.850531333796802, "grad_norm": 0.4502937353117837, "learning_rate": 5.741008918091362e-07, "loss": 0.0113, "step": 4282 }, { "epoch": 0.8507299632535505, "grad_norm": 0.4734975385054619, "learning_rate": 5.726048572337489e-07, "loss": 0.0123, "step": 4283 }, { "epoch": 0.850928592710299, "grad_norm": 0.335158237315028, "learning_rate": 5.711106560382556e-07, "loss": 0.0071, "step": 4284 }, { "epoch": 0.8511272221670474, "grad_norm": 0.6315586556303954, "learning_rate": 5.696182888414053e-07, "loss": 0.0202, "step": 4285 }, { "epoch": 0.8513258516237958, "grad_norm": 0.5356384799146305, "learning_rate": 5.681277562611842e-07, "loss": 0.013, "step": 4286 }, { "epoch": 0.8515244810805442, "grad_norm": 0.5933153411083376, "learning_rate": 5.666390589148219e-07, "loss": 0.0101, "step": 4287 }, { "epoch": 0.8517231105372927, "grad_norm": 0.29937152219167246, "learning_rate": 5.651521974187846e-07, "loss": 0.0083, "step": 4288 }, { "epoch": 0.8519217399940411, "grad_norm": 0.7624990790569742, "learning_rate": 5.636671723887816e-07, "loss": 0.0132, "step": 4289 }, { "epoch": 0.8521203694507895, "grad_norm": 0.18558021382686216, "learning_rate": 5.621839844397603e-07, "loss": 0.0047, "step": 4290 }, { "epoch": 0.852318998907538, "grad_norm": 0.31218589825449133, "learning_rate": 5.607026341859062e-07, "loss": 0.0094, "step": 4291 }, { "epoch": 0.8525176283642865, "grad_norm": 0.3659669083845685, "learning_rate": 5.592231222406463e-07, "loss": 0.0033, "step": 4292 }, { "epoch": 0.8527162578210349, "grad_norm": 0.5248556606937119, "learning_rate": 5.57745449216644e-07, "loss": 0.0075, "step": 4293 }, { "epoch": 0.8529148872777833, "grad_norm": 0.30725668244669896, "learning_rate": 5.562696157258029e-07, "loss": 0.0075, "step": 4294 }, { "epoch": 0.8531135167345317, "grad_norm": 0.5613863285877204, "learning_rate": 5.547956223792633e-07, "loss": 0.0081, "step": 4295 }, { "epoch": 0.8533121461912802, "grad_norm": 0.2473969543840592, "learning_rate": 5.533234697874045e-07, "loss": 0.0054, "step": 4296 }, { "epoch": 0.8535107756480286, "grad_norm": 0.46726588405248737, "learning_rate": 5.518531585598452e-07, "loss": 0.0125, "step": 4297 }, { "epoch": 0.853709405104777, "grad_norm": 0.5320500221062148, "learning_rate": 5.503846893054376e-07, "loss": 0.0147, "step": 4298 }, { "epoch": 0.8539080345615254, "grad_norm": 0.5026086372784023, "learning_rate": 5.489180626322749e-07, "loss": 0.0147, "step": 4299 }, { "epoch": 0.854106664018274, "grad_norm": 0.36955112376345584, "learning_rate": 5.474532791476844e-07, "loss": 0.0063, "step": 4300 }, { "epoch": 0.8543052934750224, "grad_norm": 0.4981679024030091, "learning_rate": 5.459903394582328e-07, "loss": 0.0064, "step": 4301 }, { "epoch": 0.8545039229317708, "grad_norm": 0.4244213243244062, "learning_rate": 5.445292441697203e-07, "loss": 0.0144, "step": 4302 }, { "epoch": 0.8547025523885192, "grad_norm": 0.6977278332167148, "learning_rate": 5.430699938871858e-07, "loss": 0.0088, "step": 4303 }, { "epoch": 0.8549011818452676, "grad_norm": 0.7642388896941835, "learning_rate": 5.416125892149049e-07, "loss": 0.0149, "step": 4304 }, { "epoch": 0.8550998113020161, "grad_norm": 0.4029155837440189, "learning_rate": 5.401570307563858e-07, "loss": 0.0064, "step": 4305 }, { "epoch": 0.8552984407587645, "grad_norm": 0.6354016421469559, "learning_rate": 5.387033191143742e-07, "loss": 0.0156, "step": 4306 }, { "epoch": 0.8554970702155129, "grad_norm": 0.36260476179749174, "learning_rate": 5.372514548908498e-07, "loss": 0.0073, "step": 4307 }, { "epoch": 0.8556956996722614, "grad_norm": 0.5767576719232735, "learning_rate": 5.358014386870286e-07, "loss": 0.0071, "step": 4308 }, { "epoch": 0.8558943291290099, "grad_norm": 0.5407542575497629, "learning_rate": 5.343532711033617e-07, "loss": 0.0089, "step": 4309 }, { "epoch": 0.8560929585857583, "grad_norm": 0.39473103537864135, "learning_rate": 5.329069527395325e-07, "loss": 0.0085, "step": 4310 }, { "epoch": 0.8562915880425067, "grad_norm": 0.3010851871794071, "learning_rate": 5.314624841944616e-07, "loss": 0.0084, "step": 4311 }, { "epoch": 0.8564902174992551, "grad_norm": 0.4049809213204187, "learning_rate": 5.300198660663003e-07, "loss": 0.0106, "step": 4312 }, { "epoch": 0.8566888469560036, "grad_norm": 0.5330916206019888, "learning_rate": 5.28579098952437e-07, "loss": 0.016, "step": 4313 }, { "epoch": 0.856887476412752, "grad_norm": 0.32358362989327244, "learning_rate": 5.2714018344949e-07, "loss": 0.013, "step": 4314 }, { "epoch": 0.8570861058695004, "grad_norm": 0.39221129075451266, "learning_rate": 5.257031201533141e-07, "loss": 0.006, "step": 4315 }, { "epoch": 0.8572847353262489, "grad_norm": 0.4547069807214053, "learning_rate": 5.242679096589959e-07, "loss": 0.0131, "step": 4316 }, { "epoch": 0.8574833647829974, "grad_norm": 0.24851871977697923, "learning_rate": 5.228345525608536e-07, "loss": 0.007, "step": 4317 }, { "epoch": 0.8576819942397458, "grad_norm": 0.3943911033364027, "learning_rate": 5.214030494524408e-07, "loss": 0.0136, "step": 4318 }, { "epoch": 0.8578806236964942, "grad_norm": 0.40090137599123665, "learning_rate": 5.199734009265389e-07, "loss": 0.009, "step": 4319 }, { "epoch": 0.8580792531532426, "grad_norm": 0.5959732058644688, "learning_rate": 5.18545607575166e-07, "loss": 0.0098, "step": 4320 }, { "epoch": 0.858277882609991, "grad_norm": 0.6067428479491546, "learning_rate": 5.171196699895687e-07, "loss": 0.0081, "step": 4321 }, { "epoch": 0.8584765120667395, "grad_norm": 0.46155290089707957, "learning_rate": 5.15695588760226e-07, "loss": 0.0116, "step": 4322 }, { "epoch": 0.8586751415234879, "grad_norm": 0.4010737738551735, "learning_rate": 5.142733644768511e-07, "loss": 0.0154, "step": 4323 }, { "epoch": 0.8588737709802363, "grad_norm": 0.2686236162523568, "learning_rate": 5.128529977283824e-07, "loss": 0.0055, "step": 4324 }, { "epoch": 0.8590724004369849, "grad_norm": 0.618702187000194, "learning_rate": 5.114344891029949e-07, "loss": 0.0114, "step": 4325 }, { "epoch": 0.8592710298937333, "grad_norm": 0.18789334381194944, "learning_rate": 5.1001783918809e-07, "loss": 0.0039, "step": 4326 }, { "epoch": 0.8594696593504817, "grad_norm": 0.6465736250291123, "learning_rate": 5.086030485703019e-07, "loss": 0.0084, "step": 4327 }, { "epoch": 0.8596682888072301, "grad_norm": 0.5719333455155227, "learning_rate": 5.071901178354927e-07, "loss": 0.0094, "step": 4328 }, { "epoch": 0.8598669182639785, "grad_norm": 0.5000117385352728, "learning_rate": 5.057790475687574e-07, "loss": 0.0065, "step": 4329 }, { "epoch": 0.860065547720727, "grad_norm": 0.6335598418218465, "learning_rate": 5.043698383544182e-07, "loss": 0.0147, "step": 4330 }, { "epoch": 0.8602641771774754, "grad_norm": 0.7245941891795274, "learning_rate": 5.029624907760255e-07, "loss": 0.0075, "step": 4331 }, { "epoch": 0.8604628066342238, "grad_norm": 0.5740508426452544, "learning_rate": 5.015570054163621e-07, "loss": 0.0092, "step": 4332 }, { "epoch": 0.8606614360909723, "grad_norm": 0.47282095433648685, "learning_rate": 5.001533828574389e-07, "loss": 0.0105, "step": 4333 }, { "epoch": 0.8608600655477208, "grad_norm": 0.5065183646911192, "learning_rate": 4.987516236804929e-07, "loss": 0.01, "step": 4334 }, { "epoch": 0.8610586950044692, "grad_norm": 0.6252766191350665, "learning_rate": 4.973517284659923e-07, "loss": 0.0121, "step": 4335 }, { "epoch": 0.8612573244612176, "grad_norm": 0.8191924217356521, "learning_rate": 4.959536977936313e-07, "loss": 0.0168, "step": 4336 }, { "epoch": 0.861455953917966, "grad_norm": 0.7702791527849939, "learning_rate": 4.945575322423346e-07, "loss": 0.014, "step": 4337 }, { "epoch": 0.8616545833747145, "grad_norm": 0.4220973789902732, "learning_rate": 4.931632323902508e-07, "loss": 0.0071, "step": 4338 }, { "epoch": 0.8618532128314629, "grad_norm": 0.7487513974820313, "learning_rate": 4.917707988147591e-07, "loss": 0.0109, "step": 4339 }, { "epoch": 0.8620518422882113, "grad_norm": 0.5040477610803438, "learning_rate": 4.903802320924661e-07, "loss": 0.0175, "step": 4340 }, { "epoch": 0.8622504717449597, "grad_norm": 0.8484536149810432, "learning_rate": 4.889915327992024e-07, "loss": 0.0114, "step": 4341 }, { "epoch": 0.8624491012017083, "grad_norm": 0.34276135719378503, "learning_rate": 4.876047015100277e-07, "loss": 0.0091, "step": 4342 }, { "epoch": 0.8626477306584567, "grad_norm": 0.3138227924206721, "learning_rate": 4.862197387992267e-07, "loss": 0.0081, "step": 4343 }, { "epoch": 0.8628463601152051, "grad_norm": 0.48755996193098844, "learning_rate": 4.848366452403125e-07, "loss": 0.0141, "step": 4344 }, { "epoch": 0.8630449895719535, "grad_norm": 0.4243160855813772, "learning_rate": 4.834554214060211e-07, "loss": 0.0129, "step": 4345 }, { "epoch": 0.863243619028702, "grad_norm": 0.28134303568802305, "learning_rate": 4.820760678683168e-07, "loss": 0.0092, "step": 4346 }, { "epoch": 0.8634422484854504, "grad_norm": 0.9313255602494132, "learning_rate": 4.806985851983892e-07, "loss": 0.0148, "step": 4347 }, { "epoch": 0.8636408779421988, "grad_norm": 0.33444347069896996, "learning_rate": 4.793229739666505e-07, "loss": 0.0057, "step": 4348 }, { "epoch": 0.8638395073989472, "grad_norm": 0.4575240882152554, "learning_rate": 4.779492347427422e-07, "loss": 0.0069, "step": 4349 }, { "epoch": 0.8640381368556957, "grad_norm": 0.25494908087050244, "learning_rate": 4.7657736809552655e-07, "loss": 0.007, "step": 4350 }, { "epoch": 0.8642367663124442, "grad_norm": 0.674796943258971, "learning_rate": 4.752073745930941e-07, "loss": 0.0126, "step": 4351 }, { "epoch": 0.8644353957691926, "grad_norm": 0.8848459103510878, "learning_rate": 4.738392548027565e-07, "loss": 0.0094, "step": 4352 }, { "epoch": 0.864634025225941, "grad_norm": 0.48385172701891577, "learning_rate": 4.724730092910496e-07, "loss": 0.0138, "step": 4353 }, { "epoch": 0.8648326546826894, "grad_norm": 0.31959578564614055, "learning_rate": 4.7110863862373677e-07, "loss": 0.0059, "step": 4354 }, { "epoch": 0.8650312841394379, "grad_norm": 0.45707641134122046, "learning_rate": 4.6974614336580014e-07, "loss": 0.0126, "step": 4355 }, { "epoch": 0.8652299135961863, "grad_norm": 0.24519345456981262, "learning_rate": 4.6838552408145e-07, "loss": 0.006, "step": 4356 }, { "epoch": 0.8654285430529347, "grad_norm": 0.6553366699161421, "learning_rate": 4.6702678133411505e-07, "loss": 0.0137, "step": 4357 }, { "epoch": 0.8656271725096832, "grad_norm": 0.6216476033899038, "learning_rate": 4.656699156864508e-07, "loss": 0.0076, "step": 4358 }, { "epoch": 0.8658258019664317, "grad_norm": 0.9884101474028949, "learning_rate": 4.643149277003345e-07, "loss": 0.0187, "step": 4359 }, { "epoch": 0.8660244314231801, "grad_norm": 0.3261124525807072, "learning_rate": 4.6296181793686337e-07, "loss": 0.0059, "step": 4360 }, { "epoch": 0.8662230608799285, "grad_norm": 0.4910131929561135, "learning_rate": 4.616105869563614e-07, "loss": 0.0144, "step": 4361 }, { "epoch": 0.8664216903366769, "grad_norm": 0.5104540545077465, "learning_rate": 4.602612353183689e-07, "loss": 0.0084, "step": 4362 }, { "epoch": 0.8666203197934254, "grad_norm": 0.4238162272185062, "learning_rate": 4.589137635816543e-07, "loss": 0.008, "step": 4363 }, { "epoch": 0.8668189492501738, "grad_norm": 0.5117371152238335, "learning_rate": 4.575681723042014e-07, "loss": 0.0121, "step": 4364 }, { "epoch": 0.8670175787069222, "grad_norm": 0.4015325998809057, "learning_rate": 4.5622446204321936e-07, "loss": 0.0074, "step": 4365 }, { "epoch": 0.8672162081636706, "grad_norm": 0.35993039383283, "learning_rate": 4.548826333551382e-07, "loss": 0.0067, "step": 4366 }, { "epoch": 0.8674148376204192, "grad_norm": 0.503064253155119, "learning_rate": 4.535426867956061e-07, "loss": 0.0073, "step": 4367 }, { "epoch": 0.8676134670771676, "grad_norm": 0.4651717976831416, "learning_rate": 4.522046229194954e-07, "loss": 0.0075, "step": 4368 }, { "epoch": 0.867812096533916, "grad_norm": 0.9069952716545454, "learning_rate": 4.508684422808951e-07, "loss": 0.0136, "step": 4369 }, { "epoch": 0.8680107259906644, "grad_norm": 0.4405186014248274, "learning_rate": 4.4953414543311815e-07, "loss": 0.013, "step": 4370 }, { "epoch": 0.8682093554474128, "grad_norm": 0.5402377194016759, "learning_rate": 4.4820173292869416e-07, "loss": 0.0143, "step": 4371 }, { "epoch": 0.8684079849041613, "grad_norm": 0.43013105755484454, "learning_rate": 4.4687120531937357e-07, "loss": 0.0107, "step": 4372 }, { "epoch": 0.8686066143609097, "grad_norm": 0.4369419022550253, "learning_rate": 4.4554256315612833e-07, "loss": 0.0124, "step": 4373 }, { "epoch": 0.8688052438176581, "grad_norm": 0.5827970943937599, "learning_rate": 4.4421580698914615e-07, "loss": 0.0091, "step": 4374 }, { "epoch": 0.8690038732744066, "grad_norm": 0.8284652360245892, "learning_rate": 4.4289093736783695e-07, "loss": 0.0131, "step": 4375 }, { "epoch": 0.8692025027311551, "grad_norm": 0.5145577977088364, "learning_rate": 4.4156795484082694e-07, "loss": 0.014, "step": 4376 }, { "epoch": 0.8694011321879035, "grad_norm": 0.24551320324099252, "learning_rate": 4.402468599559606e-07, "loss": 0.0056, "step": 4377 }, { "epoch": 0.8695997616446519, "grad_norm": 0.44050784980271485, "learning_rate": 4.3892765326030427e-07, "loss": 0.0092, "step": 4378 }, { "epoch": 0.8697983911014003, "grad_norm": 0.5213423596831668, "learning_rate": 4.376103353001387e-07, "loss": 0.0078, "step": 4379 }, { "epoch": 0.8699970205581488, "grad_norm": 0.30488209551484446, "learning_rate": 4.3629490662096484e-07, "loss": 0.0071, "step": 4380 }, { "epoch": 0.8701956500148972, "grad_norm": 0.6465717561681386, "learning_rate": 4.349813677674991e-07, "loss": 0.0066, "step": 4381 }, { "epoch": 0.8703942794716456, "grad_norm": 0.9699003433798918, "learning_rate": 4.336697192836775e-07, "loss": 0.0155, "step": 4382 }, { "epoch": 0.870592908928394, "grad_norm": 0.5174827015842891, "learning_rate": 4.323599617126534e-07, "loss": 0.0075, "step": 4383 }, { "epoch": 0.8707915383851426, "grad_norm": 0.3745007213523484, "learning_rate": 4.3105209559679397e-07, "loss": 0.0083, "step": 4384 }, { "epoch": 0.870990167841891, "grad_norm": 0.37209009481585975, "learning_rate": 4.2974612147768715e-07, "loss": 0.0075, "step": 4385 }, { "epoch": 0.8711887972986394, "grad_norm": 0.605148782197854, "learning_rate": 4.284420398961342e-07, "loss": 0.0082, "step": 4386 }, { "epoch": 0.8713874267553878, "grad_norm": 0.7211710943486761, "learning_rate": 4.2713985139215485e-07, "loss": 0.0083, "step": 4387 }, { "epoch": 0.8715860562121363, "grad_norm": 0.29150430656637677, "learning_rate": 4.2583955650498276e-07, "loss": 0.0079, "step": 4388 }, { "epoch": 0.8717846856688847, "grad_norm": 0.6277540412956093, "learning_rate": 4.2454115577307e-07, "loss": 0.0104, "step": 4389 }, { "epoch": 0.8719833151256331, "grad_norm": 0.8937024962263034, "learning_rate": 4.2324464973408306e-07, "loss": 0.016, "step": 4390 }, { "epoch": 0.8721819445823815, "grad_norm": 0.22626655374898483, "learning_rate": 4.219500389249026e-07, "loss": 0.0039, "step": 4391 }, { "epoch": 0.87238057403913, "grad_norm": 0.5483301349143612, "learning_rate": 4.206573238816275e-07, "loss": 0.0116, "step": 4392 }, { "epoch": 0.8725792034958785, "grad_norm": 0.140657628470087, "learning_rate": 4.193665051395679e-07, "loss": 0.0021, "step": 4393 }, { "epoch": 0.8727778329526269, "grad_norm": 0.41737224691478453, "learning_rate": 4.180775832332523e-07, "loss": 0.0113, "step": 4394 }, { "epoch": 0.8729764624093753, "grad_norm": 0.32655825358404184, "learning_rate": 4.1679055869641993e-07, "loss": 0.0078, "step": 4395 }, { "epoch": 0.8731750918661237, "grad_norm": 0.5392995614964906, "learning_rate": 4.155054320620272e-07, "loss": 0.011, "step": 4396 }, { "epoch": 0.8733737213228722, "grad_norm": 0.2600770739951475, "learning_rate": 4.1422220386224567e-07, "loss": 0.0055, "step": 4397 }, { "epoch": 0.8735723507796206, "grad_norm": 0.9550612436854214, "learning_rate": 4.1294087462845576e-07, "loss": 0.0125, "step": 4398 }, { "epoch": 0.873770980236369, "grad_norm": 0.5668918705649273, "learning_rate": 4.1166144489125703e-07, "loss": 0.0092, "step": 4399 }, { "epoch": 0.8739696096931175, "grad_norm": 0.3898610540948218, "learning_rate": 4.1038391518045895e-07, "loss": 0.0098, "step": 4400 }, { "epoch": 0.874168239149866, "grad_norm": 0.542748206533429, "learning_rate": 4.09108286025085e-07, "loss": 0.009, "step": 4401 }, { "epoch": 0.8743668686066144, "grad_norm": 0.4832034435765608, "learning_rate": 4.0783455795337267e-07, "loss": 0.0096, "step": 4402 }, { "epoch": 0.8745654980633628, "grad_norm": 0.5202243446046367, "learning_rate": 4.065627314927706e-07, "loss": 0.0109, "step": 4403 }, { "epoch": 0.8747641275201112, "grad_norm": 0.5784085936698135, "learning_rate": 4.0529280716994246e-07, "loss": 0.0125, "step": 4404 }, { "epoch": 0.8749627569768597, "grad_norm": 1.1495995355735231, "learning_rate": 4.0402478551076095e-07, "loss": 0.0127, "step": 4405 }, { "epoch": 0.8751613864336081, "grad_norm": 0.31144818566157473, "learning_rate": 4.027586670403133e-07, "loss": 0.0075, "step": 4406 }, { "epoch": 0.8753600158903565, "grad_norm": 0.4268316810889361, "learning_rate": 4.0149445228289787e-07, "loss": 0.0082, "step": 4407 }, { "epoch": 0.8755586453471049, "grad_norm": 0.6936100882597547, "learning_rate": 4.002321417620242e-07, "loss": 0.0143, "step": 4408 }, { "epoch": 0.8757572748038535, "grad_norm": 0.3286470991413137, "learning_rate": 3.989717360004153e-07, "loss": 0.006, "step": 4409 }, { "epoch": 0.8759559042606019, "grad_norm": 0.8501238578826438, "learning_rate": 3.9771323552000196e-07, "loss": 0.0141, "step": 4410 }, { "epoch": 0.8761545337173503, "grad_norm": 0.6051259450991804, "learning_rate": 3.964566408419296e-07, "loss": 0.0109, "step": 4411 }, { "epoch": 0.8763531631740987, "grad_norm": 0.5210373235659236, "learning_rate": 3.952019524865519e-07, "loss": 0.0086, "step": 4412 }, { "epoch": 0.8765517926308471, "grad_norm": 1.012680225277243, "learning_rate": 3.939491709734344e-07, "loss": 0.0125, "step": 4413 }, { "epoch": 0.8767504220875956, "grad_norm": 0.32859384106789974, "learning_rate": 3.926982968213522e-07, "loss": 0.0084, "step": 4414 }, { "epoch": 0.876949051544344, "grad_norm": 0.2505465396417254, "learning_rate": 3.914493305482914e-07, "loss": 0.0059, "step": 4415 }, { "epoch": 0.8771476810010924, "grad_norm": 0.542229522925549, "learning_rate": 3.902022726714488e-07, "loss": 0.0134, "step": 4416 }, { "epoch": 0.877346310457841, "grad_norm": 0.4435250855553033, "learning_rate": 3.889571237072276e-07, "loss": 0.0109, "step": 4417 }, { "epoch": 0.8775449399145894, "grad_norm": 0.7624348302946883, "learning_rate": 3.877138841712447e-07, "loss": 0.0191, "step": 4418 }, { "epoch": 0.8777435693713378, "grad_norm": 0.39223360906206584, "learning_rate": 3.8647255457832264e-07, "loss": 0.0108, "step": 4419 }, { "epoch": 0.8779421988280862, "grad_norm": 0.26266628381114066, "learning_rate": 3.8523313544249653e-07, "loss": 0.0055, "step": 4420 }, { "epoch": 0.8781408282848346, "grad_norm": 0.6195454555078306, "learning_rate": 3.839956272770068e-07, "loss": 0.0088, "step": 4421 }, { "epoch": 0.8783394577415831, "grad_norm": 0.6243351595233597, "learning_rate": 3.8276003059430523e-07, "loss": 0.0117, "step": 4422 }, { "epoch": 0.8785380871983315, "grad_norm": 0.5652958236632496, "learning_rate": 3.8152634590605196e-07, "loss": 0.0199, "step": 4423 }, { "epoch": 0.8787367166550799, "grad_norm": 0.9261366755651307, "learning_rate": 3.80294573723114e-07, "loss": 0.0146, "step": 4424 }, { "epoch": 0.8789353461118283, "grad_norm": 0.4162303412607668, "learning_rate": 3.7906471455556537e-07, "loss": 0.0077, "step": 4425 }, { "epoch": 0.8791339755685769, "grad_norm": 0.47965614259858963, "learning_rate": 3.7783676891269216e-07, "loss": 0.0107, "step": 4426 }, { "epoch": 0.8793326050253253, "grad_norm": 0.758019511885331, "learning_rate": 3.766107373029837e-07, "loss": 0.0135, "step": 4427 }, { "epoch": 0.8795312344820737, "grad_norm": 1.122056922868509, "learning_rate": 3.753866202341394e-07, "loss": 0.0115, "step": 4428 }, { "epoch": 0.8797298639388221, "grad_norm": 0.26469452888733724, "learning_rate": 3.7416441821306325e-07, "loss": 0.0068, "step": 4429 }, { "epoch": 0.8799284933955706, "grad_norm": 0.7299933289853252, "learning_rate": 3.7294413174587043e-07, "loss": 0.015, "step": 4430 }, { "epoch": 0.880127122852319, "grad_norm": 0.7634736305853262, "learning_rate": 3.717257613378783e-07, "loss": 0.0093, "step": 4431 }, { "epoch": 0.8803257523090674, "grad_norm": 0.7735715700253204, "learning_rate": 3.7050930749361335e-07, "loss": 0.0156, "step": 4432 }, { "epoch": 0.8805243817658158, "grad_norm": 0.5172736405040491, "learning_rate": 3.6929477071680876e-07, "loss": 0.0071, "step": 4433 }, { "epoch": 0.8807230112225644, "grad_norm": 0.744487659123002, "learning_rate": 3.680821515104016e-07, "loss": 0.0143, "step": 4434 }, { "epoch": 0.8809216406793128, "grad_norm": 0.5354693790419052, "learning_rate": 3.6687145037653804e-07, "loss": 0.0101, "step": 4435 }, { "epoch": 0.8811202701360612, "grad_norm": 0.3906157093858662, "learning_rate": 3.656626678165659e-07, "loss": 0.0057, "step": 4436 }, { "epoch": 0.8813188995928096, "grad_norm": 0.37180422992439127, "learning_rate": 3.6445580433104313e-07, "loss": 0.0073, "step": 4437 }, { "epoch": 0.881517529049558, "grad_norm": 0.3879523154614315, "learning_rate": 3.632508604197288e-07, "loss": 0.0124, "step": 4438 }, { "epoch": 0.8817161585063065, "grad_norm": 0.5095632886042696, "learning_rate": 3.6204783658158995e-07, "loss": 0.0117, "step": 4439 }, { "epoch": 0.8819147879630549, "grad_norm": 0.39433819698390393, "learning_rate": 3.60846733314798e-07, "loss": 0.0089, "step": 4440 }, { "epoch": 0.8821134174198033, "grad_norm": 0.43438384905622585, "learning_rate": 3.596475511167269e-07, "loss": 0.0061, "step": 4441 }, { "epoch": 0.8823120468765518, "grad_norm": 0.6305498537007402, "learning_rate": 3.584502904839593e-07, "loss": 0.0126, "step": 4442 }, { "epoch": 0.8825106763333003, "grad_norm": 0.6097637763944662, "learning_rate": 3.57254951912277e-07, "loss": 0.0137, "step": 4443 }, { "epoch": 0.8827093057900487, "grad_norm": 0.41692408428569017, "learning_rate": 3.560615358966707e-07, "loss": 0.013, "step": 4444 }, { "epoch": 0.8829079352467971, "grad_norm": 0.4647122199079791, "learning_rate": 3.548700429313312e-07, "loss": 0.0075, "step": 4445 }, { "epoch": 0.8831065647035455, "grad_norm": 0.748689177498006, "learning_rate": 3.5368047350965496e-07, "loss": 0.0114, "step": 4446 }, { "epoch": 0.883305194160294, "grad_norm": 0.8174465249948543, "learning_rate": 3.524928281242429e-07, "loss": 0.0073, "step": 4447 }, { "epoch": 0.8835038236170424, "grad_norm": 0.4656654252231502, "learning_rate": 3.513071072668961e-07, "loss": 0.0111, "step": 4448 }, { "epoch": 0.8837024530737908, "grad_norm": 0.5205301107761227, "learning_rate": 3.5012331142862065e-07, "loss": 0.0099, "step": 4449 }, { "epoch": 0.8839010825305392, "grad_norm": 0.42783686791908027, "learning_rate": 3.4894144109962557e-07, "loss": 0.0114, "step": 4450 }, { "epoch": 0.8840997119872878, "grad_norm": 0.4787814662703929, "learning_rate": 3.477614967693216e-07, "loss": 0.0086, "step": 4451 }, { "epoch": 0.8842983414440362, "grad_norm": 0.4314781867922066, "learning_rate": 3.4658347892632337e-07, "loss": 0.0103, "step": 4452 }, { "epoch": 0.8844969709007846, "grad_norm": 0.35834664121669135, "learning_rate": 3.454073880584463e-07, "loss": 0.0095, "step": 4453 }, { "epoch": 0.884695600357533, "grad_norm": 0.4202353721736912, "learning_rate": 3.4423322465270914e-07, "loss": 0.0112, "step": 4454 }, { "epoch": 0.8848942298142815, "grad_norm": 1.639304177182807, "learning_rate": 3.4306098919533013e-07, "loss": 0.0088, "step": 4455 }, { "epoch": 0.8850928592710299, "grad_norm": 0.5271434662250698, "learning_rate": 3.4189068217173216e-07, "loss": 0.0105, "step": 4456 }, { "epoch": 0.8852914887277783, "grad_norm": 0.4513582350757716, "learning_rate": 3.407223040665386e-07, "loss": 0.0089, "step": 4457 }, { "epoch": 0.8854901181845267, "grad_norm": 0.43566762732715747, "learning_rate": 3.395558553635725e-07, "loss": 0.0083, "step": 4458 }, { "epoch": 0.8856887476412753, "grad_norm": 0.4675844643952693, "learning_rate": 3.383913365458602e-07, "loss": 0.0124, "step": 4459 }, { "epoch": 0.8858873770980237, "grad_norm": 0.5086115524011803, "learning_rate": 3.3722874809562655e-07, "loss": 0.0085, "step": 4460 }, { "epoch": 0.8860860065547721, "grad_norm": 0.4995927325240558, "learning_rate": 3.3606809049429976e-07, "loss": 0.0147, "step": 4461 }, { "epoch": 0.8862846360115205, "grad_norm": 0.563092019434094, "learning_rate": 3.3490936422250486e-07, "loss": 0.0091, "step": 4462 }, { "epoch": 0.8864832654682689, "grad_norm": 0.3664198660805334, "learning_rate": 3.337525697600713e-07, "loss": 0.0085, "step": 4463 }, { "epoch": 0.8866818949250174, "grad_norm": 0.5626785718095834, "learning_rate": 3.3259770758602593e-07, "loss": 0.0078, "step": 4464 }, { "epoch": 0.8868805243817658, "grad_norm": 0.6720037981827449, "learning_rate": 3.314447781785951e-07, "loss": 0.0166, "step": 4465 }, { "epoch": 0.8870791538385142, "grad_norm": 0.6089564989189392, "learning_rate": 3.302937820152069e-07, "loss": 0.0109, "step": 4466 }, { "epoch": 0.8872777832952626, "grad_norm": 0.8451481981228195, "learning_rate": 3.291447195724867e-07, "loss": 0.0129, "step": 4467 }, { "epoch": 0.8874764127520112, "grad_norm": 0.44291299876872686, "learning_rate": 3.2799759132626176e-07, "loss": 0.0075, "step": 4468 }, { "epoch": 0.8876750422087596, "grad_norm": 0.6004352791066179, "learning_rate": 3.268523977515542e-07, "loss": 0.0122, "step": 4469 }, { "epoch": 0.887873671665508, "grad_norm": 0.4086550054310082, "learning_rate": 3.257091393225892e-07, "loss": 0.0087, "step": 4470 }, { "epoch": 0.8880723011222564, "grad_norm": 0.46507471921866933, "learning_rate": 3.245678165127891e-07, "loss": 0.0073, "step": 4471 }, { "epoch": 0.8882709305790049, "grad_norm": 0.39929592869202446, "learning_rate": 3.234284297947748e-07, "loss": 0.0076, "step": 4472 }, { "epoch": 0.8884695600357533, "grad_norm": 0.44202911846396886, "learning_rate": 3.222909796403639e-07, "loss": 0.0103, "step": 4473 }, { "epoch": 0.8886681894925017, "grad_norm": 0.4416813065644919, "learning_rate": 3.211554665205735e-07, "loss": 0.0085, "step": 4474 }, { "epoch": 0.8888668189492501, "grad_norm": 0.451292002362962, "learning_rate": 3.200218909056185e-07, "loss": 0.0072, "step": 4475 }, { "epoch": 0.8890654484059987, "grad_norm": 0.39357809600693094, "learning_rate": 3.18890253264913e-07, "loss": 0.0076, "step": 4476 }, { "epoch": 0.8892640778627471, "grad_norm": 0.599306137463403, "learning_rate": 3.1776055406706474e-07, "loss": 0.0094, "step": 4477 }, { "epoch": 0.8894627073194955, "grad_norm": 0.3551885948768195, "learning_rate": 3.16632793779883e-07, "loss": 0.0111, "step": 4478 }, { "epoch": 0.8896613367762439, "grad_norm": 0.303987373083012, "learning_rate": 3.155069728703708e-07, "loss": 0.0064, "step": 4479 }, { "epoch": 0.8898599662329923, "grad_norm": 0.7886640360675943, "learning_rate": 3.1438309180473083e-07, "loss": 0.0145, "step": 4480 }, { "epoch": 0.8900585956897408, "grad_norm": 0.3640471787782126, "learning_rate": 3.132611510483591e-07, "loss": 0.0081, "step": 4481 }, { "epoch": 0.8902572251464892, "grad_norm": 0.4516366521716688, "learning_rate": 3.121411510658512e-07, "loss": 0.0058, "step": 4482 }, { "epoch": 0.8904558546032376, "grad_norm": 0.5960707874212607, "learning_rate": 3.1102309232099895e-07, "loss": 0.0079, "step": 4483 }, { "epoch": 0.8906544840599862, "grad_norm": 0.6565899058793593, "learning_rate": 3.09906975276788e-07, "loss": 0.0106, "step": 4484 }, { "epoch": 0.8908531135167346, "grad_norm": 0.43991379023445737, "learning_rate": 3.087928003954027e-07, "loss": 0.0079, "step": 4485 }, { "epoch": 0.891051742973483, "grad_norm": 0.5593853069391717, "learning_rate": 3.0768056813821943e-07, "loss": 0.0135, "step": 4486 }, { "epoch": 0.8912503724302314, "grad_norm": 0.3950431747898158, "learning_rate": 3.0657027896581537e-07, "loss": 0.0094, "step": 4487 }, { "epoch": 0.8914490018869798, "grad_norm": 0.25409553815336716, "learning_rate": 3.054619333379577e-07, "loss": 0.0034, "step": 4488 }, { "epoch": 0.8916476313437283, "grad_norm": 0.5330841755954928, "learning_rate": 3.0435553171361207e-07, "loss": 0.0072, "step": 4489 }, { "epoch": 0.8918462608004767, "grad_norm": 0.6056435536748148, "learning_rate": 3.032510745509393e-07, "loss": 0.0125, "step": 4490 }, { "epoch": 0.8920448902572251, "grad_norm": 0.40977754735724337, "learning_rate": 3.021485623072923e-07, "loss": 0.0091, "step": 4491 }, { "epoch": 0.8922435197139735, "grad_norm": 0.5596491771836665, "learning_rate": 3.0104799543922146e-07, "loss": 0.0105, "step": 4492 }, { "epoch": 0.8924421491707221, "grad_norm": 0.4786492106128223, "learning_rate": 2.999493744024701e-07, "loss": 0.0083, "step": 4493 }, { "epoch": 0.8926407786274705, "grad_norm": 0.9836960055347553, "learning_rate": 2.988526996519764e-07, "loss": 0.018, "step": 4494 }, { "epoch": 0.8928394080842189, "grad_norm": 0.6957875722193689, "learning_rate": 2.97757971641871e-07, "loss": 0.0072, "step": 4495 }, { "epoch": 0.8930380375409673, "grad_norm": 0.7150594587760443, "learning_rate": 2.966651908254814e-07, "loss": 0.0151, "step": 4496 }, { "epoch": 0.8932366669977158, "grad_norm": 0.9452942090466192, "learning_rate": 2.955743576553266e-07, "loss": 0.011, "step": 4497 }, { "epoch": 0.8934352964544642, "grad_norm": 0.4147804118671288, "learning_rate": 2.944854725831181e-07, "loss": 0.0086, "step": 4498 }, { "epoch": 0.8936339259112126, "grad_norm": 0.6444712673599818, "learning_rate": 2.93398536059763e-07, "loss": 0.0116, "step": 4499 }, { "epoch": 0.893832555367961, "grad_norm": 0.4545175271742511, "learning_rate": 2.923135485353618e-07, "loss": 0.0091, "step": 4500 }, { "epoch": 0.8940311848247096, "grad_norm": 0.435024614613689, "learning_rate": 2.9123051045920503e-07, "loss": 0.0099, "step": 4501 }, { "epoch": 0.894229814281458, "grad_norm": 0.5137299220895429, "learning_rate": 2.9014942227977907e-07, "loss": 0.0114, "step": 4502 }, { "epoch": 0.8944284437382064, "grad_norm": 0.5936552009342531, "learning_rate": 2.8907028444476017e-07, "loss": 0.0147, "step": 4503 }, { "epoch": 0.8946270731949548, "grad_norm": 0.6272540958685057, "learning_rate": 2.879930974010198e-07, "loss": 0.0118, "step": 4504 }, { "epoch": 0.8948257026517032, "grad_norm": 0.626455749063314, "learning_rate": 2.8691786159461776e-07, "loss": 0.0086, "step": 4505 }, { "epoch": 0.8950243321084517, "grad_norm": 0.4727682477458918, "learning_rate": 2.858445774708096e-07, "loss": 0.0125, "step": 4506 }, { "epoch": 0.8952229615652001, "grad_norm": 0.42330205421234, "learning_rate": 2.8477324547404204e-07, "loss": 0.0131, "step": 4507 }, { "epoch": 0.8954215910219485, "grad_norm": 0.8016337669641905, "learning_rate": 2.837038660479508e-07, "loss": 0.0087, "step": 4508 }, { "epoch": 0.8956202204786969, "grad_norm": 0.4415565035557052, "learning_rate": 2.826364396353659e-07, "loss": 0.0127, "step": 4509 }, { "epoch": 0.8958188499354455, "grad_norm": 0.5118589639856184, "learning_rate": 2.815709666783073e-07, "loss": 0.0076, "step": 4510 }, { "epoch": 0.8960174793921939, "grad_norm": 0.7602872016834586, "learning_rate": 2.805074476179864e-07, "loss": 0.0098, "step": 4511 }, { "epoch": 0.8962161088489423, "grad_norm": 0.29573595687294996, "learning_rate": 2.7944588289480436e-07, "loss": 0.0071, "step": 4512 }, { "epoch": 0.8964147383056907, "grad_norm": 0.6148007368180166, "learning_rate": 2.7838627294835553e-07, "loss": 0.0177, "step": 4513 }, { "epoch": 0.8966133677624392, "grad_norm": 0.3086576744890267, "learning_rate": 2.7732861821742285e-07, "loss": 0.0037, "step": 4514 }, { "epoch": 0.8968119972191876, "grad_norm": 0.33506300714770954, "learning_rate": 2.762729191399799e-07, "loss": 0.0078, "step": 4515 }, { "epoch": 0.897010626675936, "grad_norm": 0.7280224457903677, "learning_rate": 2.752191761531908e-07, "loss": 0.0132, "step": 4516 }, { "epoch": 0.8972092561326844, "grad_norm": 0.902157920426683, "learning_rate": 2.7416738969340884e-07, "loss": 0.0157, "step": 4517 }, { "epoch": 0.897407885589433, "grad_norm": 1.5039398413544367, "learning_rate": 2.7311756019617886e-07, "loss": 0.0096, "step": 4518 }, { "epoch": 0.8976065150461814, "grad_norm": 0.43159755127163674, "learning_rate": 2.720696880962331e-07, "loss": 0.0142, "step": 4519 }, { "epoch": 0.8978051445029298, "grad_norm": 0.43080340115106225, "learning_rate": 2.710237738274951e-07, "loss": 0.0147, "step": 4520 }, { "epoch": 0.8980037739596782, "grad_norm": 0.7326367128578075, "learning_rate": 2.699798178230772e-07, "loss": 0.0189, "step": 4521 }, { "epoch": 0.8982024034164267, "grad_norm": 0.43562861431632444, "learning_rate": 2.6893782051527873e-07, "loss": 0.011, "step": 4522 }, { "epoch": 0.8984010328731751, "grad_norm": 0.3484366560219996, "learning_rate": 2.6789778233559214e-07, "loss": 0.0075, "step": 4523 }, { "epoch": 0.8985996623299235, "grad_norm": 0.45427660263747915, "learning_rate": 2.6685970371469414e-07, "loss": 0.0057, "step": 4524 }, { "epoch": 0.8987982917866719, "grad_norm": 0.34514097610134814, "learning_rate": 2.658235850824531e-07, "loss": 0.009, "step": 4525 }, { "epoch": 0.8989969212434203, "grad_norm": 0.9536404644314742, "learning_rate": 2.647894268679252e-07, "loss": 0.0058, "step": 4526 }, { "epoch": 0.8991955507001689, "grad_norm": 0.679047679167587, "learning_rate": 2.637572294993529e-07, "loss": 0.0158, "step": 4527 }, { "epoch": 0.8993941801569173, "grad_norm": 0.48771424479973163, "learning_rate": 2.627269934041693e-07, "loss": 0.0082, "step": 4528 }, { "epoch": 0.8995928096136657, "grad_norm": 0.8876585963703878, "learning_rate": 2.6169871900899367e-07, "loss": 0.0129, "step": 4529 }, { "epoch": 0.8997914390704141, "grad_norm": 0.42451999038659616, "learning_rate": 2.606724067396338e-07, "loss": 0.0106, "step": 4530 }, { "epoch": 0.8999900685271626, "grad_norm": 0.5490668531272829, "learning_rate": 2.59648057021083e-07, "loss": 0.0152, "step": 4531 }, { "epoch": 0.900188697983911, "grad_norm": 0.5470093186008119, "learning_rate": 2.5862567027752526e-07, "loss": 0.0094, "step": 4532 }, { "epoch": 0.9003873274406594, "grad_norm": 0.6398893462509314, "learning_rate": 2.576052469323298e-07, "loss": 0.0116, "step": 4533 }, { "epoch": 0.9005859568974078, "grad_norm": 0.4022552451748213, "learning_rate": 2.5658678740805186e-07, "loss": 0.0077, "step": 4534 }, { "epoch": 0.9007845863541564, "grad_norm": 0.39557786288450486, "learning_rate": 2.555702921264358e-07, "loss": 0.0088, "step": 4535 }, { "epoch": 0.9009832158109048, "grad_norm": 1.0099485838483995, "learning_rate": 2.545557615084099e-07, "loss": 0.0117, "step": 4536 }, { "epoch": 0.9011818452676532, "grad_norm": 0.6052324571559026, "learning_rate": 2.5354319597409194e-07, "loss": 0.0095, "step": 4537 }, { "epoch": 0.9013804747244016, "grad_norm": 0.4649008984948733, "learning_rate": 2.525325959427821e-07, "loss": 0.0056, "step": 4538 }, { "epoch": 0.9015791041811501, "grad_norm": 0.4320126986367103, "learning_rate": 2.515239618329701e-07, "loss": 0.0102, "step": 4539 }, { "epoch": 0.9017777336378985, "grad_norm": 0.3576353257805511, "learning_rate": 2.505172940623313e-07, "loss": 0.0062, "step": 4540 }, { "epoch": 0.9019763630946469, "grad_norm": 0.3318197981973977, "learning_rate": 2.495125930477238e-07, "loss": 0.01, "step": 4541 }, { "epoch": 0.9021749925513953, "grad_norm": 0.4512411192015429, "learning_rate": 2.485098592051949e-07, "loss": 0.0048, "step": 4542 }, { "epoch": 0.9023736220081439, "grad_norm": 0.3890900313378956, "learning_rate": 2.475090929499746e-07, "loss": 0.0101, "step": 4543 }, { "epoch": 0.9025722514648923, "grad_norm": 0.43260373969735505, "learning_rate": 2.465102946964798e-07, "loss": 0.0148, "step": 4544 }, { "epoch": 0.9027708809216407, "grad_norm": 0.586975403052094, "learning_rate": 2.455134648583124e-07, "loss": 0.0123, "step": 4545 }, { "epoch": 0.9029695103783891, "grad_norm": 0.4911950176274392, "learning_rate": 2.4451860384825663e-07, "loss": 0.011, "step": 4546 }, { "epoch": 0.9031681398351376, "grad_norm": 0.389644189362611, "learning_rate": 2.4352571207828577e-07, "loss": 0.011, "step": 4547 }, { "epoch": 0.903366769291886, "grad_norm": 0.43830143444725367, "learning_rate": 2.425347899595537e-07, "loss": 0.0056, "step": 4548 }, { "epoch": 0.9035653987486344, "grad_norm": 0.6957717254966186, "learning_rate": 2.415458379024005e-07, "loss": 0.0163, "step": 4549 }, { "epoch": 0.9037640282053828, "grad_norm": 0.4740762518443394, "learning_rate": 2.405588563163519e-07, "loss": 0.0146, "step": 4550 }, { "epoch": 0.9039626576621312, "grad_norm": 0.6021255205184528, "learning_rate": 2.395738456101132e-07, "loss": 0.0124, "step": 4551 }, { "epoch": 0.9041612871188798, "grad_norm": 0.741752144803565, "learning_rate": 2.3859080619157925e-07, "loss": 0.0125, "step": 4552 }, { "epoch": 0.9043599165756282, "grad_norm": 0.42927765078239116, "learning_rate": 2.376097384678233e-07, "loss": 0.0156, "step": 4553 }, { "epoch": 0.9045585460323766, "grad_norm": 0.6840710533669623, "learning_rate": 2.3663064284510594e-07, "loss": 0.0157, "step": 4554 }, { "epoch": 0.904757175489125, "grad_norm": 0.62982881490239, "learning_rate": 2.356535197288684e-07, "loss": 0.0129, "step": 4555 }, { "epoch": 0.9049558049458735, "grad_norm": 0.35044902975137593, "learning_rate": 2.3467836952373756e-07, "loss": 0.0073, "step": 4556 }, { "epoch": 0.9051544344026219, "grad_norm": 0.5392210976679882, "learning_rate": 2.3370519263352264e-07, "loss": 0.0094, "step": 4557 }, { "epoch": 0.9053530638593703, "grad_norm": 0.48828299525425184, "learning_rate": 2.327339894612135e-07, "loss": 0.0153, "step": 4558 }, { "epoch": 0.9055516933161187, "grad_norm": 0.4056484456009865, "learning_rate": 2.3176476040898566e-07, "loss": 0.0135, "step": 4559 }, { "epoch": 0.9057503227728673, "grad_norm": 0.6301698090976487, "learning_rate": 2.3079750587819527e-07, "loss": 0.0113, "step": 4560 }, { "epoch": 0.9059489522296157, "grad_norm": 0.48414953276680756, "learning_rate": 2.2983222626938196e-07, "loss": 0.0165, "step": 4561 }, { "epoch": 0.9061475816863641, "grad_norm": 0.44003626001855367, "learning_rate": 2.288689219822665e-07, "loss": 0.0124, "step": 4562 }, { "epoch": 0.9063462111431125, "grad_norm": 0.6813734049579375, "learning_rate": 2.2790759341575208e-07, "loss": 0.0116, "step": 4563 }, { "epoch": 0.906544840599861, "grad_norm": 0.5755670216305095, "learning_rate": 2.2694824096792522e-07, "loss": 0.0124, "step": 4564 }, { "epoch": 0.9067434700566094, "grad_norm": 0.40263299299179744, "learning_rate": 2.259908650360515e-07, "loss": 0.0122, "step": 4565 }, { "epoch": 0.9069420995133578, "grad_norm": 0.6401801441669921, "learning_rate": 2.2503546601657988e-07, "loss": 0.0105, "step": 4566 }, { "epoch": 0.9071407289701062, "grad_norm": 0.785747890099103, "learning_rate": 2.2408204430514003e-07, "loss": 0.0169, "step": 4567 }, { "epoch": 0.9073393584268546, "grad_norm": 0.4264636528520051, "learning_rate": 2.2313060029654276e-07, "loss": 0.0059, "step": 4568 }, { "epoch": 0.9075379878836032, "grad_norm": 0.42739671235798343, "learning_rate": 2.2218113438478074e-07, "loss": 0.0083, "step": 4569 }, { "epoch": 0.9077366173403516, "grad_norm": 0.40779650096228554, "learning_rate": 2.2123364696302553e-07, "loss": 0.0077, "step": 4570 }, { "epoch": 0.9079352467971, "grad_norm": 0.39853271354961034, "learning_rate": 2.2028813842363272e-07, "loss": 0.0107, "step": 4571 }, { "epoch": 0.9081338762538484, "grad_norm": 0.33762970470811593, "learning_rate": 2.1934460915813416e-07, "loss": 0.0075, "step": 4572 }, { "epoch": 0.9083325057105969, "grad_norm": 0.5183895697114581, "learning_rate": 2.1840305955724561e-07, "loss": 0.0096, "step": 4573 }, { "epoch": 0.9085311351673453, "grad_norm": 0.48886933649478814, "learning_rate": 2.1746349001086187e-07, "loss": 0.0061, "step": 4574 }, { "epoch": 0.9087297646240937, "grad_norm": 0.638424883774554, "learning_rate": 2.1652590090805725e-07, "loss": 0.0085, "step": 4575 }, { "epoch": 0.9089283940808421, "grad_norm": 0.47399133102593094, "learning_rate": 2.155902926370873e-07, "loss": 0.0102, "step": 4576 }, { "epoch": 0.9091270235375907, "grad_norm": 0.8460449737384751, "learning_rate": 2.1465666558538544e-07, "loss": 0.0103, "step": 4577 }, { "epoch": 0.9093256529943391, "grad_norm": 0.5377971540122163, "learning_rate": 2.1372502013956687e-07, "loss": 0.0114, "step": 4578 }, { "epoch": 0.9095242824510875, "grad_norm": 0.4809164280538038, "learning_rate": 2.127953566854235e-07, "loss": 0.0105, "step": 4579 }, { "epoch": 0.9097229119078359, "grad_norm": 0.2805309828097, "learning_rate": 2.1186767560792964e-07, "loss": 0.0079, "step": 4580 }, { "epoch": 0.9099215413645844, "grad_norm": 0.43230047439300484, "learning_rate": 2.1094197729123577e-07, "loss": 0.012, "step": 4581 }, { "epoch": 0.9101201708213328, "grad_norm": 0.5605835051911726, "learning_rate": 2.100182621186736e-07, "loss": 0.0159, "step": 4582 }, { "epoch": 0.9103188002780812, "grad_norm": 0.5202163319138345, "learning_rate": 2.090965304727527e-07, "loss": 0.0096, "step": 4583 }, { "epoch": 0.9105174297348296, "grad_norm": 0.4097715169980986, "learning_rate": 2.0817678273515996e-07, "loss": 0.0107, "step": 4584 }, { "epoch": 0.9107160591915782, "grad_norm": 0.8758598360143836, "learning_rate": 2.072590192867635e-07, "loss": 0.0103, "step": 4585 }, { "epoch": 0.9109146886483266, "grad_norm": 0.5918227290134394, "learning_rate": 2.0634324050760658e-07, "loss": 0.0122, "step": 4586 }, { "epoch": 0.911113318105075, "grad_norm": 0.49732322398821266, "learning_rate": 2.0542944677691467e-07, "loss": 0.0112, "step": 4587 }, { "epoch": 0.9113119475618234, "grad_norm": 0.7732911114954113, "learning_rate": 2.0451763847308626e-07, "loss": 0.0122, "step": 4588 }, { "epoch": 0.9115105770185719, "grad_norm": 0.6007278204049679, "learning_rate": 2.036078159737015e-07, "loss": 0.0139, "step": 4589 }, { "epoch": 0.9117092064753203, "grad_norm": 1.284947054234802, "learning_rate": 2.026999796555179e-07, "loss": 0.0237, "step": 4590 }, { "epoch": 0.9119078359320687, "grad_norm": 0.567152964186292, "learning_rate": 2.0179412989446756e-07, "loss": 0.0088, "step": 4591 }, { "epoch": 0.9121064653888171, "grad_norm": 0.38390704525282005, "learning_rate": 2.0089026706566372e-07, "loss": 0.0067, "step": 4592 }, { "epoch": 0.9123050948455655, "grad_norm": 0.6975714046203888, "learning_rate": 1.999883915433948e-07, "loss": 0.017, "step": 4593 }, { "epoch": 0.9125037243023141, "grad_norm": 0.5217159598877467, "learning_rate": 1.9908850370112476e-07, "loss": 0.0149, "step": 4594 }, { "epoch": 0.9127023537590625, "grad_norm": 0.5062818955759204, "learning_rate": 1.9819060391149837e-07, "loss": 0.0045, "step": 4595 }, { "epoch": 0.9129009832158109, "grad_norm": 0.30318092705281424, "learning_rate": 1.9729469254633425e-07, "loss": 0.0063, "step": 4596 }, { "epoch": 0.9130996126725593, "grad_norm": 0.2722804564584692, "learning_rate": 1.9640076997662848e-07, "loss": 0.0067, "step": 4597 }, { "epoch": 0.9132982421293078, "grad_norm": 0.4823650756559266, "learning_rate": 1.955088365725527e-07, "loss": 0.0092, "step": 4598 }, { "epoch": 0.9134968715860562, "grad_norm": 0.45532989870452495, "learning_rate": 1.9461889270345645e-07, "loss": 0.011, "step": 4599 }, { "epoch": 0.9136955010428046, "grad_norm": 0.6373971983656296, "learning_rate": 1.9373093873786497e-07, "loss": 0.0084, "step": 4600 }, { "epoch": 0.913894130499553, "grad_norm": 0.491770676105079, "learning_rate": 1.9284497504347854e-07, "loss": 0.006, "step": 4601 }, { "epoch": 0.9140927599563016, "grad_norm": 0.6167707314340485, "learning_rate": 1.9196100198717427e-07, "loss": 0.0161, "step": 4602 }, { "epoch": 0.91429138941305, "grad_norm": 0.462717621588757, "learning_rate": 1.9107901993500322e-07, "loss": 0.0157, "step": 4603 }, { "epoch": 0.9144900188697984, "grad_norm": 0.17662504554317196, "learning_rate": 1.9019902925219548e-07, "loss": 0.004, "step": 4604 }, { "epoch": 0.9146886483265468, "grad_norm": 0.49148437356429236, "learning_rate": 1.893210303031523e-07, "loss": 0.0121, "step": 4605 }, { "epoch": 0.9148872777832953, "grad_norm": 0.7776087485666608, "learning_rate": 1.8844502345145233e-07, "loss": 0.0096, "step": 4606 }, { "epoch": 0.9150859072400437, "grad_norm": 0.5332572925885097, "learning_rate": 1.8757100905985094e-07, "loss": 0.0055, "step": 4607 }, { "epoch": 0.9152845366967921, "grad_norm": 0.32966084466808204, "learning_rate": 1.8669898749027472e-07, "loss": 0.0094, "step": 4608 }, { "epoch": 0.9154831661535405, "grad_norm": 0.3990945821247028, "learning_rate": 1.8582895910382813e-07, "loss": 0.0125, "step": 4609 }, { "epoch": 0.915681795610289, "grad_norm": 0.44485780478520864, "learning_rate": 1.8496092426078805e-07, "loss": 0.0074, "step": 4610 }, { "epoch": 0.9158804250670375, "grad_norm": 0.558341054302273, "learning_rate": 1.8409488332060799e-07, "loss": 0.0115, "step": 4611 }, { "epoch": 0.9160790545237859, "grad_norm": 0.37536643210218906, "learning_rate": 1.8323083664191333e-07, "loss": 0.0057, "step": 4612 }, { "epoch": 0.9162776839805343, "grad_norm": 0.7931572433347439, "learning_rate": 1.823687845825056e-07, "loss": 0.0135, "step": 4613 }, { "epoch": 0.9164763134372828, "grad_norm": 0.5130348406821796, "learning_rate": 1.8150872749935989e-07, "loss": 0.017, "step": 4614 }, { "epoch": 0.9166749428940312, "grad_norm": 0.5767895711884279, "learning_rate": 1.8065066574862455e-07, "loss": 0.0193, "step": 4615 }, { "epoch": 0.9168735723507796, "grad_norm": 0.5029366482371473, "learning_rate": 1.7979459968562317e-07, "loss": 0.0144, "step": 4616 }, { "epoch": 0.917072201807528, "grad_norm": 0.49339222260197624, "learning_rate": 1.7894052966485053e-07, "loss": 0.0069, "step": 4617 }, { "epoch": 0.9172708312642764, "grad_norm": 0.4827004588018792, "learning_rate": 1.7808845603997594e-07, "loss": 0.0089, "step": 4618 }, { "epoch": 0.917469460721025, "grad_norm": 0.48201824114443204, "learning_rate": 1.772383791638438e-07, "loss": 0.0117, "step": 4619 }, { "epoch": 0.9176680901777734, "grad_norm": 0.42119151357623763, "learning_rate": 1.7639029938846808e-07, "loss": 0.0152, "step": 4620 }, { "epoch": 0.9178667196345218, "grad_norm": 0.42340226485526916, "learning_rate": 1.755442170650401e-07, "loss": 0.0062, "step": 4621 }, { "epoch": 0.9180653490912702, "grad_norm": 0.42028816533360497, "learning_rate": 1.747001325439196e-07, "loss": 0.0109, "step": 4622 }, { "epoch": 0.9182639785480187, "grad_norm": 0.37252045119966465, "learning_rate": 1.7385804617464308e-07, "loss": 0.0132, "step": 4623 }, { "epoch": 0.9184626080047671, "grad_norm": 0.6415858988525973, "learning_rate": 1.730179583059155e-07, "loss": 0.014, "step": 4624 }, { "epoch": 0.9186612374615155, "grad_norm": 0.6620697719569957, "learning_rate": 1.7217986928561803e-07, "loss": 0.0073, "step": 4625 }, { "epoch": 0.9188598669182639, "grad_norm": 0.534573318706696, "learning_rate": 1.7134377946080193e-07, "loss": 0.0085, "step": 4626 }, { "epoch": 0.9190584963750125, "grad_norm": 0.4937748956041207, "learning_rate": 1.7050968917769139e-07, "loss": 0.0108, "step": 4627 }, { "epoch": 0.9192571258317609, "grad_norm": 0.7534757131719959, "learning_rate": 1.6967759878168233e-07, "loss": 0.0149, "step": 4628 }, { "epoch": 0.9194557552885093, "grad_norm": 0.48683536600548843, "learning_rate": 1.688475086173419e-07, "loss": 0.0071, "step": 4629 }, { "epoch": 0.9196543847452577, "grad_norm": 0.5205814438223413, "learning_rate": 1.6801941902841068e-07, "loss": 0.0133, "step": 4630 }, { "epoch": 0.9198530142020062, "grad_norm": 0.37512216595028314, "learning_rate": 1.6719333035779827e-07, "loss": 0.0075, "step": 4631 }, { "epoch": 0.9200516436587546, "grad_norm": 0.5986458365684142, "learning_rate": 1.6636924294758828e-07, "loss": 0.0095, "step": 4632 }, { "epoch": 0.920250273115503, "grad_norm": 0.5772681496058722, "learning_rate": 1.655471571390349e-07, "loss": 0.0083, "step": 4633 }, { "epoch": 0.9204489025722514, "grad_norm": 0.29855689621547776, "learning_rate": 1.6472707327256198e-07, "loss": 0.0048, "step": 4634 }, { "epoch": 0.9206475320289998, "grad_norm": 0.5804263227677894, "learning_rate": 1.639089916877662e-07, "loss": 0.0156, "step": 4635 }, { "epoch": 0.9208461614857484, "grad_norm": 0.47922034362262383, "learning_rate": 1.6309291272341377e-07, "loss": 0.0108, "step": 4636 }, { "epoch": 0.9210447909424968, "grad_norm": 0.3263256249555732, "learning_rate": 1.622788367174427e-07, "loss": 0.0058, "step": 4637 }, { "epoch": 0.9212434203992452, "grad_norm": 0.6192492406914758, "learning_rate": 1.6146676400696003e-07, "loss": 0.012, "step": 4638 }, { "epoch": 0.9214420498559937, "grad_norm": 0.4766849044662014, "learning_rate": 1.606566949282451e-07, "loss": 0.0172, "step": 4639 }, { "epoch": 0.9216406793127421, "grad_norm": 0.6710327474510234, "learning_rate": 1.5984862981674786e-07, "loss": 0.0129, "step": 4640 }, { "epoch": 0.9218393087694905, "grad_norm": 0.42862810177240407, "learning_rate": 1.5904256900708459e-07, "loss": 0.0063, "step": 4641 }, { "epoch": 0.9220379382262389, "grad_norm": 0.3757229399422139, "learning_rate": 1.5823851283304546e-07, "loss": 0.008, "step": 4642 }, { "epoch": 0.9222365676829873, "grad_norm": 0.4649954354241579, "learning_rate": 1.574364616275903e-07, "loss": 0.009, "step": 4643 }, { "epoch": 0.9224351971397359, "grad_norm": 0.24892525017019804, "learning_rate": 1.5663641572284672e-07, "loss": 0.0055, "step": 4644 }, { "epoch": 0.9226338265964843, "grad_norm": 0.620937103802513, "learning_rate": 1.5583837545011305e-07, "loss": 0.0108, "step": 4645 }, { "epoch": 0.9228324560532327, "grad_norm": 0.7464819124618941, "learning_rate": 1.5504234113985661e-07, "loss": 0.005, "step": 4646 }, { "epoch": 0.9230310855099811, "grad_norm": 0.39950890377449805, "learning_rate": 1.5424831312171595e-07, "loss": 0.0091, "step": 4647 }, { "epoch": 0.9232297149667296, "grad_norm": 0.7650531790637041, "learning_rate": 1.5345629172449472e-07, "loss": 0.0182, "step": 4648 }, { "epoch": 0.923428344423478, "grad_norm": 0.4823999396347639, "learning_rate": 1.5266627727617056e-07, "loss": 0.0192, "step": 4649 }, { "epoch": 0.9236269738802264, "grad_norm": 0.7401078670894918, "learning_rate": 1.5187827010388677e-07, "loss": 0.0127, "step": 4650 }, { "epoch": 0.9238256033369748, "grad_norm": 0.3397450263276877, "learning_rate": 1.510922705339557e-07, "loss": 0.0108, "step": 4651 }, { "epoch": 0.9240242327937233, "grad_norm": 0.3518773712824609, "learning_rate": 1.503082788918603e-07, "loss": 0.0057, "step": 4652 }, { "epoch": 0.9242228622504718, "grad_norm": 0.2477018526451415, "learning_rate": 1.4952629550224916e-07, "loss": 0.0058, "step": 4653 }, { "epoch": 0.9244214917072202, "grad_norm": 0.5414836310524654, "learning_rate": 1.487463206889428e-07, "loss": 0.0165, "step": 4654 }, { "epoch": 0.9246201211639686, "grad_norm": 0.47446038228846893, "learning_rate": 1.479683547749261e-07, "loss": 0.0131, "step": 4655 }, { "epoch": 0.9248187506207171, "grad_norm": 0.2636023459839493, "learning_rate": 1.4719239808235418e-07, "loss": 0.0041, "step": 4656 }, { "epoch": 0.9250173800774655, "grad_norm": 0.40166636205399747, "learning_rate": 1.464184509325517e-07, "loss": 0.0055, "step": 4657 }, { "epoch": 0.9252160095342139, "grad_norm": 0.44435793160055864, "learning_rate": 1.4564651364600724e-07, "loss": 0.0059, "step": 4658 }, { "epoch": 0.9254146389909623, "grad_norm": 0.3994984612235425, "learning_rate": 1.4487658654238123e-07, "loss": 0.0071, "step": 4659 }, { "epoch": 0.9256132684477107, "grad_norm": 0.47362741123170454, "learning_rate": 1.4410866994049755e-07, "loss": 0.011, "step": 4660 }, { "epoch": 0.9258118979044593, "grad_norm": 0.7872100192173731, "learning_rate": 1.433427641583518e-07, "loss": 0.0169, "step": 4661 }, { "epoch": 0.9260105273612077, "grad_norm": 0.8552459275140093, "learning_rate": 1.4257886951310307e-07, "loss": 0.0185, "step": 4662 }, { "epoch": 0.9262091568179561, "grad_norm": 0.6608328624853623, "learning_rate": 1.4181698632108055e-07, "loss": 0.009, "step": 4663 }, { "epoch": 0.9264077862747045, "grad_norm": 0.414776424248957, "learning_rate": 1.4105711489777962e-07, "loss": 0.0088, "step": 4664 }, { "epoch": 0.926606415731453, "grad_norm": 0.6398136251284509, "learning_rate": 1.4029925555786027e-07, "loss": 0.0154, "step": 4665 }, { "epoch": 0.9268050451882014, "grad_norm": 0.7474279649587607, "learning_rate": 1.395434086151537e-07, "loss": 0.0067, "step": 4666 }, { "epoch": 0.9270036746449498, "grad_norm": 0.5064378151011119, "learning_rate": 1.3878957438265338e-07, "loss": 0.0116, "step": 4667 }, { "epoch": 0.9272023041016982, "grad_norm": 0.45989881995150333, "learning_rate": 1.3803775317252188e-07, "loss": 0.0092, "step": 4668 }, { "epoch": 0.9274009335584468, "grad_norm": 0.6599032509747758, "learning_rate": 1.3728794529608846e-07, "loss": 0.0093, "step": 4669 }, { "epoch": 0.9275995630151952, "grad_norm": 0.5141005371853344, "learning_rate": 1.3654015106384698e-07, "loss": 0.0093, "step": 4670 }, { "epoch": 0.9277981924719436, "grad_norm": 0.5941689240567093, "learning_rate": 1.357943707854592e-07, "loss": 0.0146, "step": 4671 }, { "epoch": 0.927996821928692, "grad_norm": 0.20114286406354762, "learning_rate": 1.3505060476975085e-07, "loss": 0.003, "step": 4672 }, { "epoch": 0.9281954513854405, "grad_norm": 0.45688487671874534, "learning_rate": 1.3430885332471554e-07, "loss": 0.0162, "step": 4673 }, { "epoch": 0.9283940808421889, "grad_norm": 0.35614643694965575, "learning_rate": 1.3356911675751093e-07, "loss": 0.0051, "step": 4674 }, { "epoch": 0.9285927102989373, "grad_norm": 0.5744415497903277, "learning_rate": 1.3283139537446144e-07, "loss": 0.0121, "step": 4675 }, { "epoch": 0.9287913397556857, "grad_norm": 1.0850590038936332, "learning_rate": 1.3209568948105768e-07, "loss": 0.0145, "step": 4676 }, { "epoch": 0.9289899692124342, "grad_norm": 0.45964179898690255, "learning_rate": 1.313619993819537e-07, "loss": 0.0131, "step": 4677 }, { "epoch": 0.9291885986691827, "grad_norm": 0.5818030589778348, "learning_rate": 1.3063032538097097e-07, "loss": 0.0111, "step": 4678 }, { "epoch": 0.9293872281259311, "grad_norm": 0.6481989151858105, "learning_rate": 1.2990066778109323e-07, "loss": 0.0136, "step": 4679 }, { "epoch": 0.9295858575826795, "grad_norm": 0.37947609369661844, "learning_rate": 1.2917302688447265e-07, "loss": 0.0113, "step": 4680 }, { "epoch": 0.929784487039428, "grad_norm": 0.5081483427859849, "learning_rate": 1.284474029924232e-07, "loss": 0.0103, "step": 4681 }, { "epoch": 0.9299831164961764, "grad_norm": 0.21444824245931055, "learning_rate": 1.2772379640542564e-07, "loss": 0.005, "step": 4682 }, { "epoch": 0.9301817459529248, "grad_norm": 0.26879234255685003, "learning_rate": 1.270022074231253e-07, "loss": 0.0099, "step": 4683 }, { "epoch": 0.9303803754096732, "grad_norm": 0.5160956745621564, "learning_rate": 1.2628263634433035e-07, "loss": 0.0138, "step": 4684 }, { "epoch": 0.9305790048664216, "grad_norm": 0.49211468181460044, "learning_rate": 1.2556508346701578e-07, "loss": 0.0064, "step": 4685 }, { "epoch": 0.9307776343231702, "grad_norm": 0.8023564960097778, "learning_rate": 1.2484954908831837e-07, "loss": 0.0125, "step": 4686 }, { "epoch": 0.9309762637799186, "grad_norm": 0.9862722885789399, "learning_rate": 1.241360335045405e-07, "loss": 0.0085, "step": 4687 }, { "epoch": 0.931174893236667, "grad_norm": 0.7887851867631835, "learning_rate": 1.2342453701114864e-07, "loss": 0.0131, "step": 4688 }, { "epoch": 0.9313735226934154, "grad_norm": 0.3479006313592782, "learning_rate": 1.227150599027721e-07, "loss": 0.0094, "step": 4689 }, { "epoch": 0.9315721521501639, "grad_norm": 0.674927856372601, "learning_rate": 1.2200760247320586e-07, "loss": 0.0098, "step": 4690 }, { "epoch": 0.9317707816069123, "grad_norm": 0.42722316421638, "learning_rate": 1.213021650154056e-07, "loss": 0.0095, "step": 4691 }, { "epoch": 0.9319694110636607, "grad_norm": 0.301197655900226, "learning_rate": 1.2059874782149317e-07, "loss": 0.0061, "step": 4692 }, { "epoch": 0.9321680405204091, "grad_norm": 0.5133950310604148, "learning_rate": 1.1989735118275337e-07, "loss": 0.0127, "step": 4693 }, { "epoch": 0.9323666699771576, "grad_norm": 0.34014268074393605, "learning_rate": 1.1919797538963274e-07, "loss": 0.0059, "step": 4694 }, { "epoch": 0.9325652994339061, "grad_norm": 0.5164175565820704, "learning_rate": 1.1850062073174351e-07, "loss": 0.0116, "step": 4695 }, { "epoch": 0.9327639288906545, "grad_norm": 0.4070984340693311, "learning_rate": 1.1780528749785802e-07, "loss": 0.0077, "step": 4696 }, { "epoch": 0.9329625583474029, "grad_norm": 0.38897032187757813, "learning_rate": 1.1711197597591428e-07, "loss": 0.0063, "step": 4697 }, { "epoch": 0.9331611878041514, "grad_norm": 0.2930844011928402, "learning_rate": 1.1642068645301152e-07, "loss": 0.0053, "step": 4698 }, { "epoch": 0.9333598172608998, "grad_norm": 0.3764839957960558, "learning_rate": 1.1573141921541131e-07, "loss": 0.0054, "step": 4699 }, { "epoch": 0.9335584467176482, "grad_norm": 0.43523780379200877, "learning_rate": 1.1504417454854033e-07, "loss": 0.013, "step": 4700 }, { "epoch": 0.9337570761743966, "grad_norm": 0.5750229920284355, "learning_rate": 1.1435895273698372e-07, "loss": 0.0084, "step": 4701 }, { "epoch": 0.933955705631145, "grad_norm": 0.19886700519682282, "learning_rate": 1.1367575406449282e-07, "loss": 0.0046, "step": 4702 }, { "epoch": 0.9341543350878936, "grad_norm": 0.790833202260621, "learning_rate": 1.1299457881397858e-07, "loss": 0.0185, "step": 4703 }, { "epoch": 0.934352964544642, "grad_norm": 0.4219918573113129, "learning_rate": 1.1231542726751532e-07, "loss": 0.0092, "step": 4704 }, { "epoch": 0.9345515940013904, "grad_norm": 0.7230909029282245, "learning_rate": 1.1163829970633865e-07, "loss": 0.0147, "step": 4705 }, { "epoch": 0.9347502234581389, "grad_norm": 0.5093534424476391, "learning_rate": 1.1096319641084708e-07, "loss": 0.0116, "step": 4706 }, { "epoch": 0.9349488529148873, "grad_norm": 0.27347946955908975, "learning_rate": 1.1029011766059972e-07, "loss": 0.0034, "step": 4707 }, { "epoch": 0.9351474823716357, "grad_norm": 1.0701505055181986, "learning_rate": 1.0961906373431808e-07, "loss": 0.0117, "step": 4708 }, { "epoch": 0.9353461118283841, "grad_norm": 0.6516188206878525, "learning_rate": 1.0895003490988487e-07, "loss": 0.0114, "step": 4709 }, { "epoch": 0.9355447412851325, "grad_norm": 0.4902354426875574, "learning_rate": 1.0828303146434404e-07, "loss": 0.0088, "step": 4710 }, { "epoch": 0.9357433707418811, "grad_norm": 0.5209392249997609, "learning_rate": 1.0761805367390187e-07, "loss": 0.0117, "step": 4711 }, { "epoch": 0.9359420001986295, "grad_norm": 1.259910339066219, "learning_rate": 1.0695510181392365e-07, "loss": 0.0088, "step": 4712 }, { "epoch": 0.9361406296553779, "grad_norm": 0.5151392556944235, "learning_rate": 1.0629417615893756e-07, "loss": 0.0139, "step": 4713 }, { "epoch": 0.9363392591121263, "grad_norm": 0.9491713148601669, "learning_rate": 1.0563527698263298e-07, "loss": 0.018, "step": 4714 }, { "epoch": 0.9365378885688748, "grad_norm": 1.2153903329483169, "learning_rate": 1.0497840455785835e-07, "loss": 0.0173, "step": 4715 }, { "epoch": 0.9367365180256232, "grad_norm": 0.288079961745967, "learning_rate": 1.0432355915662496e-07, "loss": 0.0033, "step": 4716 }, { "epoch": 0.9369351474823716, "grad_norm": 0.3536849277399397, "learning_rate": 1.0367074105010256e-07, "loss": 0.0075, "step": 4717 }, { "epoch": 0.93713377693912, "grad_norm": 0.6473193451222942, "learning_rate": 1.0301995050862323e-07, "loss": 0.0126, "step": 4718 }, { "epoch": 0.9373324063958685, "grad_norm": 0.6978153920712078, "learning_rate": 1.0237118780167809e-07, "loss": 0.0106, "step": 4719 }, { "epoch": 0.937531035852617, "grad_norm": 0.41837417287713097, "learning_rate": 1.0172445319792002e-07, "loss": 0.009, "step": 4720 }, { "epoch": 0.9377296653093654, "grad_norm": 0.3535301700831529, "learning_rate": 1.0107974696516032e-07, "loss": 0.0078, "step": 4721 }, { "epoch": 0.9379282947661138, "grad_norm": 0.7905465076037768, "learning_rate": 1.0043706937037156e-07, "loss": 0.0134, "step": 4722 }, { "epoch": 0.9381269242228623, "grad_norm": 0.36966492424476793, "learning_rate": 9.979642067968587e-08, "loss": 0.0116, "step": 4723 }, { "epoch": 0.9383255536796107, "grad_norm": 0.6252230556557902, "learning_rate": 9.915780115839491e-08, "loss": 0.0116, "step": 4724 }, { "epoch": 0.9385241831363591, "grad_norm": 0.8581563383955567, "learning_rate": 9.852121107095047e-08, "loss": 0.0266, "step": 4725 }, { "epoch": 0.9387228125931075, "grad_norm": 0.5722638109056252, "learning_rate": 9.788665068096504e-08, "loss": 0.0162, "step": 4726 }, { "epoch": 0.938921442049856, "grad_norm": 0.7084688510586287, "learning_rate": 9.725412025120783e-08, "loss": 0.0087, "step": 4727 }, { "epoch": 0.9391200715066045, "grad_norm": 0.1863360147125236, "learning_rate": 9.662362004360992e-08, "loss": 0.0036, "step": 4728 }, { "epoch": 0.9393187009633529, "grad_norm": 0.5073294558989344, "learning_rate": 9.599515031926021e-08, "loss": 0.0093, "step": 4729 }, { "epoch": 0.9395173304201013, "grad_norm": 0.8287013770837947, "learning_rate": 9.536871133840775e-08, "loss": 0.0186, "step": 4730 }, { "epoch": 0.9397159598768497, "grad_norm": 0.4145640260838776, "learning_rate": 9.474430336046059e-08, "loss": 0.011, "step": 4731 }, { "epoch": 0.9399145893335982, "grad_norm": 0.3121093385078184, "learning_rate": 9.412192664398467e-08, "loss": 0.004, "step": 4732 }, { "epoch": 0.9401132187903466, "grad_norm": 0.5686734537347415, "learning_rate": 9.350158144670662e-08, "loss": 0.0153, "step": 4733 }, { "epoch": 0.940311848247095, "grad_norm": 1.060966428278419, "learning_rate": 9.28832680255104e-08, "loss": 0.019, "step": 4734 }, { "epoch": 0.9405104777038434, "grad_norm": 0.4224352499347425, "learning_rate": 9.22669866364384e-08, "loss": 0.0067, "step": 4735 }, { "epoch": 0.9407091071605919, "grad_norm": 0.3341915120102901, "learning_rate": 9.165273753469261e-08, "loss": 0.0069, "step": 4736 }, { "epoch": 0.9409077366173404, "grad_norm": 0.6457098860974007, "learning_rate": 9.10405209746329e-08, "loss": 0.0137, "step": 4737 }, { "epoch": 0.9411063660740888, "grad_norm": 0.24789794897733472, "learning_rate": 9.043033720977756e-08, "loss": 0.0048, "step": 4738 }, { "epoch": 0.9413049955308372, "grad_norm": 0.3608454499887148, "learning_rate": 8.982218649280284e-08, "loss": 0.0122, "step": 4739 }, { "epoch": 0.9415036249875857, "grad_norm": 0.7509200650856567, "learning_rate": 8.921606907554337e-08, "loss": 0.0116, "step": 4740 }, { "epoch": 0.9417022544443341, "grad_norm": 0.3689801779957159, "learning_rate": 8.861198520899172e-08, "loss": 0.0076, "step": 4741 }, { "epoch": 0.9419008839010825, "grad_norm": 0.5557873310711061, "learning_rate": 8.800993514329892e-08, "loss": 0.0153, "step": 4742 }, { "epoch": 0.9420995133578309, "grad_norm": 0.37590619822900495, "learning_rate": 8.74099191277733e-08, "loss": 0.0091, "step": 4743 }, { "epoch": 0.9422981428145794, "grad_norm": 0.8370582813494728, "learning_rate": 8.681193741088e-08, "loss": 0.0087, "step": 4744 }, { "epoch": 0.9424967722713279, "grad_norm": 0.9398151319661212, "learning_rate": 8.621599024024374e-08, "loss": 0.0148, "step": 4745 }, { "epoch": 0.9426954017280763, "grad_norm": 0.5122269675023894, "learning_rate": 8.562207786264487e-08, "loss": 0.0089, "step": 4746 }, { "epoch": 0.9428940311848247, "grad_norm": 0.6367900598425142, "learning_rate": 8.503020052402223e-08, "loss": 0.0109, "step": 4747 }, { "epoch": 0.9430926606415732, "grad_norm": 0.7304382048017897, "learning_rate": 8.444035846947141e-08, "loss": 0.0169, "step": 4748 }, { "epoch": 0.9432912900983216, "grad_norm": 0.7902391038165741, "learning_rate": 8.385255194324593e-08, "loss": 0.0119, "step": 4749 }, { "epoch": 0.94348991955507, "grad_norm": 0.403170445331138, "learning_rate": 8.326678118875554e-08, "loss": 0.0101, "step": 4750 }, { "epoch": 0.9436885490118184, "grad_norm": 0.4907736576120689, "learning_rate": 8.268304644856673e-08, "loss": 0.0108, "step": 4751 }, { "epoch": 0.9438871784685668, "grad_norm": 0.5505554874481177, "learning_rate": 8.210134796440449e-08, "loss": 0.0107, "step": 4752 }, { "epoch": 0.9440858079253154, "grad_norm": 0.5428734384940673, "learning_rate": 8.152168597714894e-08, "loss": 0.0093, "step": 4753 }, { "epoch": 0.9442844373820638, "grad_norm": 0.44060876168794505, "learning_rate": 8.094406072683858e-08, "loss": 0.0062, "step": 4754 }, { "epoch": 0.9444830668388122, "grad_norm": 0.40725989340402463, "learning_rate": 8.036847245266543e-08, "loss": 0.0099, "step": 4755 }, { "epoch": 0.9446816962955606, "grad_norm": 0.46261890604962713, "learning_rate": 7.979492139298162e-08, "loss": 0.011, "step": 4756 }, { "epoch": 0.9448803257523091, "grad_norm": 0.4976138795208467, "learning_rate": 7.92234077852938e-08, "loss": 0.0136, "step": 4757 }, { "epoch": 0.9450789552090575, "grad_norm": 0.5403839208497061, "learning_rate": 7.865393186626491e-08, "loss": 0.0068, "step": 4758 }, { "epoch": 0.9452775846658059, "grad_norm": 0.34280255446442026, "learning_rate": 7.808649387171519e-08, "loss": 0.0063, "step": 4759 }, { "epoch": 0.9454762141225543, "grad_norm": 0.9007767217887392, "learning_rate": 7.752109403661834e-08, "loss": 0.011, "step": 4760 }, { "epoch": 0.9456748435793028, "grad_norm": 1.0223302663255138, "learning_rate": 7.695773259510764e-08, "loss": 0.0094, "step": 4761 }, { "epoch": 0.9458734730360513, "grad_norm": 0.41230357058037975, "learning_rate": 7.639640978046981e-08, "loss": 0.0076, "step": 4762 }, { "epoch": 0.9460721024927997, "grad_norm": 0.4318170038225908, "learning_rate": 7.583712582514724e-08, "loss": 0.0121, "step": 4763 }, { "epoch": 0.9462707319495481, "grad_norm": 0.2930715164840165, "learning_rate": 7.527988096074079e-08, "loss": 0.004, "step": 4764 }, { "epoch": 0.9464693614062966, "grad_norm": 1.1647632495336515, "learning_rate": 7.47246754180031e-08, "loss": 0.0114, "step": 4765 }, { "epoch": 0.946667990863045, "grad_norm": 0.5002685742542026, "learning_rate": 7.417150942684525e-08, "loss": 0.0115, "step": 4766 }, { "epoch": 0.9468666203197934, "grad_norm": 0.5858011863544192, "learning_rate": 7.362038321633235e-08, "loss": 0.0086, "step": 4767 }, { "epoch": 0.9470652497765418, "grad_norm": 0.5066199298525029, "learning_rate": 7.307129701468574e-08, "loss": 0.0103, "step": 4768 }, { "epoch": 0.9472638792332903, "grad_norm": 0.2689653914202334, "learning_rate": 7.252425104928074e-08, "loss": 0.0064, "step": 4769 }, { "epoch": 0.9474625086900388, "grad_norm": 0.6049148054355731, "learning_rate": 7.197924554664893e-08, "loss": 0.0119, "step": 4770 }, { "epoch": 0.9476611381467872, "grad_norm": 0.5517727484698056, "learning_rate": 7.1436280732477e-08, "loss": 0.0132, "step": 4771 }, { "epoch": 0.9478597676035356, "grad_norm": 0.32666164342892834, "learning_rate": 7.089535683160508e-08, "loss": 0.0062, "step": 4772 }, { "epoch": 0.948058397060284, "grad_norm": 1.102212142762428, "learning_rate": 7.035647406803015e-08, "loss": 0.0136, "step": 4773 }, { "epoch": 0.9482570265170325, "grad_norm": 0.7658504815210466, "learning_rate": 6.981963266490199e-08, "loss": 0.0116, "step": 4774 }, { "epoch": 0.9484556559737809, "grad_norm": 0.8130273192518539, "learning_rate": 6.92848328445267e-08, "loss": 0.0128, "step": 4775 }, { "epoch": 0.9486542854305293, "grad_norm": 0.6824866401998253, "learning_rate": 6.875207482836544e-08, "loss": 0.0164, "step": 4776 }, { "epoch": 0.9488529148872777, "grad_norm": 0.47569708326658633, "learning_rate": 6.822135883703063e-08, "loss": 0.0089, "step": 4777 }, { "epoch": 0.9490515443440262, "grad_norm": 0.38871295881882434, "learning_rate": 6.769268509029315e-08, "loss": 0.0124, "step": 4778 }, { "epoch": 0.9492501738007747, "grad_norm": 0.6378821616400225, "learning_rate": 6.716605380707508e-08, "loss": 0.0104, "step": 4779 }, { "epoch": 0.9494488032575231, "grad_norm": 0.745002833768245, "learning_rate": 6.664146520545422e-08, "loss": 0.0115, "step": 4780 }, { "epoch": 0.9496474327142715, "grad_norm": 0.3103526017311052, "learning_rate": 6.611891950266235e-08, "loss": 0.006, "step": 4781 }, { "epoch": 0.94984606217102, "grad_norm": 0.44138783004158555, "learning_rate": 6.559841691508473e-08, "loss": 0.0109, "step": 4782 }, { "epoch": 0.9500446916277684, "grad_norm": 0.6016533565370209, "learning_rate": 6.507995765826169e-08, "loss": 0.0135, "step": 4783 }, { "epoch": 0.9502433210845168, "grad_norm": 0.9664137702511328, "learning_rate": 6.456354194688597e-08, "loss": 0.0103, "step": 4784 }, { "epoch": 0.9504419505412652, "grad_norm": 0.2967531583933644, "learning_rate": 6.404916999480482e-08, "loss": 0.0061, "step": 4785 }, { "epoch": 0.9506405799980137, "grad_norm": 0.4734645839133741, "learning_rate": 6.353684201502008e-08, "loss": 0.0094, "step": 4786 }, { "epoch": 0.9508392094547622, "grad_norm": 0.7133914033624615, "learning_rate": 6.302655821968485e-08, "loss": 0.0146, "step": 4787 }, { "epoch": 0.9510378389115106, "grad_norm": 0.4600218420642334, "learning_rate": 6.25183188201084e-08, "loss": 0.0097, "step": 4788 }, { "epoch": 0.951236468368259, "grad_norm": 0.38688423685277884, "learning_rate": 6.201212402675072e-08, "loss": 0.0084, "step": 4789 }, { "epoch": 0.9514350978250075, "grad_norm": 0.566156909391181, "learning_rate": 6.15079740492286e-08, "loss": 0.0127, "step": 4790 }, { "epoch": 0.9516337272817559, "grad_norm": 0.5610189586192836, "learning_rate": 6.100586909630779e-08, "loss": 0.0126, "step": 4791 }, { "epoch": 0.9518323567385043, "grad_norm": 0.6857347036976907, "learning_rate": 6.050580937591144e-08, "loss": 0.0117, "step": 4792 }, { "epoch": 0.9520309861952527, "grad_norm": 0.5458484629152233, "learning_rate": 6.000779509511279e-08, "loss": 0.0115, "step": 4793 }, { "epoch": 0.9522296156520011, "grad_norm": 0.6424256763252679, "learning_rate": 5.951182646013853e-08, "loss": 0.0161, "step": 4794 }, { "epoch": 0.9524282451087497, "grad_norm": 0.544918430900207, "learning_rate": 5.901790367636995e-08, "loss": 0.0138, "step": 4795 }, { "epoch": 0.9526268745654981, "grad_norm": 0.7253601543796748, "learning_rate": 5.8526026948338974e-08, "loss": 0.0188, "step": 4796 }, { "epoch": 0.9528255040222465, "grad_norm": 0.6596213294085469, "learning_rate": 5.803619647973213e-08, "loss": 0.0108, "step": 4797 }, { "epoch": 0.953024133478995, "grad_norm": 0.3094292663332667, "learning_rate": 5.754841247338716e-08, "loss": 0.0075, "step": 4798 }, { "epoch": 0.9532227629357434, "grad_norm": 0.2653176462450698, "learning_rate": 5.706267513129527e-08, "loss": 0.006, "step": 4799 }, { "epoch": 0.9534213923924918, "grad_norm": 0.2862839957357303, "learning_rate": 5.657898465459943e-08, "loss": 0.007, "step": 4800 }, { "epoch": 0.9536200218492402, "grad_norm": 0.46267765672416544, "learning_rate": 5.609734124359556e-08, "loss": 0.0115, "step": 4801 }, { "epoch": 0.9538186513059886, "grad_norm": 1.3552174604785634, "learning_rate": 5.5617745097731876e-08, "loss": 0.0066, "step": 4802 }, { "epoch": 0.9540172807627371, "grad_norm": 0.6673917782595586, "learning_rate": 5.5140196415608414e-08, "loss": 0.0128, "step": 4803 }, { "epoch": 0.9542159102194856, "grad_norm": 0.6949616822551641, "learning_rate": 5.466469539497809e-08, "loss": 0.0158, "step": 4804 }, { "epoch": 0.954414539676234, "grad_norm": 0.7623499887827077, "learning_rate": 5.419124223274452e-08, "loss": 0.0139, "step": 4805 }, { "epoch": 0.9546131691329824, "grad_norm": 0.548628908422014, "learning_rate": 5.371983712496476e-08, "loss": 0.0122, "step": 4806 }, { "epoch": 0.9548117985897309, "grad_norm": 0.8368126791005601, "learning_rate": 5.325048026684765e-08, "loss": 0.0156, "step": 4807 }, { "epoch": 0.9550104280464793, "grad_norm": 0.5712358763963159, "learning_rate": 5.278317185275217e-08, "loss": 0.0143, "step": 4808 }, { "epoch": 0.9552090575032277, "grad_norm": 0.33369125553839335, "learning_rate": 5.23179120761913e-08, "loss": 0.0091, "step": 4809 }, { "epoch": 0.9554076869599761, "grad_norm": 0.5426548631897875, "learning_rate": 5.185470112982816e-08, "loss": 0.01, "step": 4810 }, { "epoch": 0.9556063164167246, "grad_norm": 0.7736330728886897, "learning_rate": 5.139353920547818e-08, "loss": 0.0084, "step": 4811 }, { "epoch": 0.9558049458734731, "grad_norm": 0.7726944536488665, "learning_rate": 5.093442649410807e-08, "loss": 0.0122, "step": 4812 }, { "epoch": 0.9560035753302215, "grad_norm": 0.41025603735219146, "learning_rate": 5.0477363185835736e-08, "loss": 0.0082, "step": 4813 }, { "epoch": 0.9562022047869699, "grad_norm": 0.42810031719834896, "learning_rate": 5.0022349469930344e-08, "loss": 0.0076, "step": 4814 }, { "epoch": 0.9564008342437184, "grad_norm": 0.5312298899611213, "learning_rate": 4.9569385534813386e-08, "loss": 0.0091, "step": 4815 }, { "epoch": 0.9565994637004668, "grad_norm": 0.5843424770418346, "learning_rate": 4.911847156805649e-08, "loss": 0.0058, "step": 4816 }, { "epoch": 0.9567980931572152, "grad_norm": 0.627548785238714, "learning_rate": 4.866960775638252e-08, "loss": 0.0131, "step": 4817 }, { "epoch": 0.9569967226139636, "grad_norm": 0.7105539318515306, "learning_rate": 4.8222794285665006e-08, "loss": 0.0183, "step": 4818 }, { "epoch": 0.957195352070712, "grad_norm": 0.6002583148423645, "learning_rate": 4.7778031340930397e-08, "loss": 0.0107, "step": 4819 }, { "epoch": 0.9573939815274605, "grad_norm": 0.6818174820977825, "learning_rate": 4.7335319106353026e-08, "loss": 0.0168, "step": 4820 }, { "epoch": 0.957592610984209, "grad_norm": 0.3965043691901418, "learning_rate": 4.689465776526125e-08, "loss": 0.0083, "step": 4821 }, { "epoch": 0.9577912404409574, "grad_norm": 0.6228622351159547, "learning_rate": 4.645604750013078e-08, "loss": 0.009, "step": 4822 }, { "epoch": 0.9579898698977058, "grad_norm": 0.47410239295040185, "learning_rate": 4.601948849259019e-08, "loss": 0.014, "step": 4823 }, { "epoch": 0.9581884993544543, "grad_norm": 1.0847330066440664, "learning_rate": 4.558498092341879e-08, "loss": 0.016, "step": 4824 }, { "epoch": 0.9583871288112027, "grad_norm": 1.061431136868962, "learning_rate": 4.515252497254541e-08, "loss": 0.0147, "step": 4825 }, { "epoch": 0.9585857582679511, "grad_norm": 0.4114241358058321, "learning_rate": 4.4722120819049586e-08, "loss": 0.0111, "step": 4826 }, { "epoch": 0.9587843877246995, "grad_norm": 0.6247463058011158, "learning_rate": 4.4293768641160416e-08, "loss": 0.0099, "step": 4827 }, { "epoch": 0.958983017181448, "grad_norm": 0.8021242243802347, "learning_rate": 4.38674686162599e-08, "loss": 0.0136, "step": 4828 }, { "epoch": 0.9591816466381965, "grad_norm": 0.5276059971971032, "learning_rate": 4.344322092087683e-08, "loss": 0.0139, "step": 4829 }, { "epoch": 0.9593802760949449, "grad_norm": 0.4836014882831527, "learning_rate": 4.302102573069289e-08, "loss": 0.0115, "step": 4830 }, { "epoch": 0.9595789055516933, "grad_norm": 0.34443701008701366, "learning_rate": 4.260088322053768e-08, "loss": 0.0123, "step": 4831 }, { "epoch": 0.9597775350084418, "grad_norm": 0.3859441891012454, "learning_rate": 4.2182793564392034e-08, "loss": 0.0092, "step": 4832 }, { "epoch": 0.9599761644651902, "grad_norm": 0.5084520994177352, "learning_rate": 4.176675693538745e-08, "loss": 0.013, "step": 4833 }, { "epoch": 0.9601747939219386, "grad_norm": 0.3918431048956989, "learning_rate": 4.13527735058028e-08, "loss": 0.0069, "step": 4834 }, { "epoch": 0.960373423378687, "grad_norm": 0.5475933146987769, "learning_rate": 4.094084344706928e-08, "loss": 0.0131, "step": 4835 }, { "epoch": 0.9605720528354355, "grad_norm": 0.8255199408065383, "learning_rate": 4.053096692976655e-08, "loss": 0.0127, "step": 4836 }, { "epoch": 0.960770682292184, "grad_norm": 0.9276148823282381, "learning_rate": 4.012314412362328e-08, "loss": 0.0119, "step": 4837 }, { "epoch": 0.9609693117489324, "grad_norm": 0.5402494802195912, "learning_rate": 3.971737519751939e-08, "loss": 0.0083, "step": 4838 }, { "epoch": 0.9611679412056808, "grad_norm": 0.35876195272358025, "learning_rate": 3.9313660319483246e-08, "loss": 0.0085, "step": 4839 }, { "epoch": 0.9613665706624293, "grad_norm": 0.42863726316703926, "learning_rate": 3.8911999656692787e-08, "loss": 0.0053, "step": 4840 }, { "epoch": 0.9615652001191777, "grad_norm": 0.43862831859121126, "learning_rate": 3.851239337547441e-08, "loss": 0.0099, "step": 4841 }, { "epoch": 0.9617638295759261, "grad_norm": 0.5380419062452532, "learning_rate": 3.8114841641305744e-08, "loss": 0.0102, "step": 4842 }, { "epoch": 0.9619624590326745, "grad_norm": 0.3862160575949929, "learning_rate": 3.7719344618812326e-08, "loss": 0.0121, "step": 4843 }, { "epoch": 0.9621610884894229, "grad_norm": 0.4397239298419148, "learning_rate": 3.7325902471768706e-08, "loss": 0.0095, "step": 4844 }, { "epoch": 0.9623597179461714, "grad_norm": 0.8870761334367195, "learning_rate": 3.693451536309955e-08, "loss": 0.018, "step": 4845 }, { "epoch": 0.9625583474029199, "grad_norm": 0.7392267280358624, "learning_rate": 3.65451834548769e-08, "loss": 0.0132, "step": 4846 }, { "epoch": 0.9627569768596683, "grad_norm": 0.9086319168545279, "learning_rate": 3.6157906908323995e-08, "loss": 0.0112, "step": 4847 }, { "epoch": 0.9629556063164167, "grad_norm": 0.5583448293826135, "learning_rate": 3.5772685883809775e-08, "loss": 0.008, "step": 4848 }, { "epoch": 0.9631542357731652, "grad_norm": 0.5863970387552007, "learning_rate": 3.5389520540856094e-08, "loss": 0.0062, "step": 4849 }, { "epoch": 0.9633528652299136, "grad_norm": 0.25369702479905915, "learning_rate": 3.500841103812991e-08, "loss": 0.006, "step": 4850 }, { "epoch": 0.963551494686662, "grad_norm": 0.7699812014005178, "learning_rate": 3.462935753344832e-08, "loss": 0.0124, "step": 4851 }, { "epoch": 0.9637501241434104, "grad_norm": 0.39448570416002493, "learning_rate": 3.4252360183777976e-08, "loss": 0.0068, "step": 4852 }, { "epoch": 0.9639487536001589, "grad_norm": 0.37419322323165516, "learning_rate": 3.38774191452318e-08, "loss": 0.0063, "step": 4853 }, { "epoch": 0.9641473830569074, "grad_norm": 0.6355343515700734, "learning_rate": 3.350453457307335e-08, "loss": 0.0141, "step": 4854 }, { "epoch": 0.9643460125136558, "grad_norm": 0.8356043327709957, "learning_rate": 3.313370662171411e-08, "loss": 0.0197, "step": 4855 }, { "epoch": 0.9645446419704042, "grad_norm": 0.88373860293526, "learning_rate": 3.276493544471237e-08, "loss": 0.0137, "step": 4856 }, { "epoch": 0.9647432714271527, "grad_norm": 0.43188108347987786, "learning_rate": 3.239822119477709e-08, "loss": 0.0099, "step": 4857 }, { "epoch": 0.9649419008839011, "grad_norm": 0.2922777433644811, "learning_rate": 3.2033564023762895e-08, "loss": 0.0076, "step": 4858 }, { "epoch": 0.9651405303406495, "grad_norm": 0.41982816990822347, "learning_rate": 3.167096408267567e-08, "loss": 0.0064, "step": 4859 }, { "epoch": 0.9653391597973979, "grad_norm": 0.3912328421574355, "learning_rate": 3.131042152166641e-08, "loss": 0.0078, "step": 4860 }, { "epoch": 0.9655377892541464, "grad_norm": 0.49125451640633777, "learning_rate": 3.0951936490035696e-08, "loss": 0.0071, "step": 4861 }, { "epoch": 0.9657364187108948, "grad_norm": 0.38541352755974334, "learning_rate": 3.059550913623199e-08, "loss": 0.0106, "step": 4862 }, { "epoch": 0.9659350481676433, "grad_norm": 0.34748274835246395, "learning_rate": 3.024113960785169e-08, "loss": 0.0068, "step": 4863 }, { "epoch": 0.9661336776243917, "grad_norm": 0.33066145540841996, "learning_rate": 2.9888828051638505e-08, "loss": 0.0088, "step": 4864 }, { "epoch": 0.9663323070811402, "grad_norm": 0.21097836166400444, "learning_rate": 2.9538574613484084e-08, "loss": 0.0058, "step": 4865 }, { "epoch": 0.9665309365378886, "grad_norm": 0.36457461980716876, "learning_rate": 2.919037943842906e-08, "loss": 0.0062, "step": 4866 }, { "epoch": 0.966729565994637, "grad_norm": 0.42668003224731427, "learning_rate": 2.884424267065922e-08, "loss": 0.0083, "step": 4867 }, { "epoch": 0.9669281954513854, "grad_norm": 0.8275980493196213, "learning_rate": 2.8500164453511002e-08, "loss": 0.019, "step": 4868 }, { "epoch": 0.9671268249081338, "grad_norm": 0.3290524233842703, "learning_rate": 2.8158144929466e-08, "loss": 0.0066, "step": 4869 }, { "epoch": 0.9673254543648823, "grad_norm": 0.372833561243726, "learning_rate": 2.78181842401537e-08, "loss": 0.0045, "step": 4870 }, { "epoch": 0.9675240838216308, "grad_norm": 0.4524320356338434, "learning_rate": 2.74802825263526e-08, "loss": 0.0061, "step": 4871 }, { "epoch": 0.9677227132783792, "grad_norm": 0.5396272827787818, "learning_rate": 2.714443992798632e-08, "loss": 0.0114, "step": 4872 }, { "epoch": 0.9679213427351276, "grad_norm": 0.71897085626895, "learning_rate": 2.681065658412807e-08, "loss": 0.0149, "step": 4873 }, { "epoch": 0.9681199721918761, "grad_norm": 0.3973267868046733, "learning_rate": 2.6478932632996724e-08, "loss": 0.0071, "step": 4874 }, { "epoch": 0.9683186016486245, "grad_norm": 0.7077697157290327, "learning_rate": 2.6149268211957955e-08, "loss": 0.0185, "step": 4875 }, { "epoch": 0.9685172311053729, "grad_norm": 0.5366383773695785, "learning_rate": 2.5821663457527013e-08, "loss": 0.0115, "step": 4876 }, { "epoch": 0.9687158605621213, "grad_norm": 0.587638684038049, "learning_rate": 2.549611850536371e-08, "loss": 0.0098, "step": 4877 }, { "epoch": 0.9689144900188698, "grad_norm": 0.5064559833862212, "learning_rate": 2.517263349027632e-08, "loss": 0.0114, "step": 4878 }, { "epoch": 0.9691131194756182, "grad_norm": 0.677575640563524, "learning_rate": 2.485120854621992e-08, "loss": 0.0112, "step": 4879 }, { "epoch": 0.9693117489323667, "grad_norm": 0.5454976845516769, "learning_rate": 2.4531843806294696e-08, "loss": 0.0071, "step": 4880 }, { "epoch": 0.9695103783891151, "grad_norm": 0.5457073272241642, "learning_rate": 2.4214539402751534e-08, "loss": 0.0078, "step": 4881 }, { "epoch": 0.9697090078458636, "grad_norm": 0.4264160073509605, "learning_rate": 2.3899295466983663e-08, "loss": 0.0122, "step": 4882 }, { "epoch": 0.969907637302612, "grad_norm": 0.4258970818620188, "learning_rate": 2.3586112129534988e-08, "loss": 0.0112, "step": 4883 }, { "epoch": 0.9701062667593604, "grad_norm": 0.6087695074036127, "learning_rate": 2.3274989520093994e-08, "loss": 0.0109, "step": 4884 }, { "epoch": 0.9703048962161088, "grad_norm": 0.857276501396098, "learning_rate": 2.29659277674954e-08, "loss": 0.0159, "step": 4885 }, { "epoch": 0.9705035256728572, "grad_norm": 0.30379709830289564, "learning_rate": 2.2658926999722386e-08, "loss": 0.005, "step": 4886 }, { "epoch": 0.9707021551296057, "grad_norm": 0.44475948970448625, "learning_rate": 2.2353987343902704e-08, "loss": 0.0083, "step": 4887 }, { "epoch": 0.9709007845863542, "grad_norm": 0.5243677778649878, "learning_rate": 2.2051108926313125e-08, "loss": 0.0063, "step": 4888 }, { "epoch": 0.9710994140431026, "grad_norm": 0.6869080614393989, "learning_rate": 2.175029187237332e-08, "loss": 0.0101, "step": 4889 }, { "epoch": 0.971298043499851, "grad_norm": 0.41495749218023437, "learning_rate": 2.1451536306653088e-08, "loss": 0.0096, "step": 4890 }, { "epoch": 0.9714966729565995, "grad_norm": 0.44501375126763126, "learning_rate": 2.1154842352865134e-08, "loss": 0.0076, "step": 4891 }, { "epoch": 0.9716953024133479, "grad_norm": 0.48307549732523714, "learning_rate": 2.0860210133871738e-08, "loss": 0.0098, "step": 4892 }, { "epoch": 0.9718939318700963, "grad_norm": 0.3302165574725224, "learning_rate": 2.0567639771679192e-08, "loss": 0.006, "step": 4893 }, { "epoch": 0.9720925613268447, "grad_norm": 0.6477082027663088, "learning_rate": 2.027713138744003e-08, "loss": 0.0128, "step": 4894 }, { "epoch": 0.9722911907835932, "grad_norm": 0.6335639722740967, "learning_rate": 1.998868510145413e-08, "loss": 0.0092, "step": 4895 }, { "epoch": 0.9724898202403417, "grad_norm": 0.7004054321677288, "learning_rate": 1.9702301033166505e-08, "loss": 0.0134, "step": 4896 }, { "epoch": 0.9726884496970901, "grad_norm": 0.46136836622282223, "learning_rate": 1.9417979301168956e-08, "loss": 0.0084, "step": 4897 }, { "epoch": 0.9728870791538385, "grad_norm": 0.4510335143166035, "learning_rate": 1.9135720023197857e-08, "loss": 0.0092, "step": 4898 }, { "epoch": 0.973085708610587, "grad_norm": 0.2875052305885156, "learning_rate": 1.8855523316137492e-08, "loss": 0.0056, "step": 4899 }, { "epoch": 0.9732843380673354, "grad_norm": 0.4981975875701382, "learning_rate": 1.8577389296016713e-08, "loss": 0.0111, "step": 4900 }, { "epoch": 0.9734829675240838, "grad_norm": 0.6442750654417164, "learning_rate": 1.830131807801061e-08, "loss": 0.0131, "step": 4901 }, { "epoch": 0.9736815969808322, "grad_norm": 0.4873509567237911, "learning_rate": 1.802730977643996e-08, "loss": 0.0079, "step": 4902 }, { "epoch": 0.9738802264375807, "grad_norm": 0.35940737168787174, "learning_rate": 1.7755364504771222e-08, "loss": 0.009, "step": 4903 }, { "epoch": 0.9740788558943291, "grad_norm": 0.6049711416734347, "learning_rate": 1.7485482375616534e-08, "loss": 0.0115, "step": 4904 }, { "epoch": 0.9742774853510776, "grad_norm": 0.540096719001103, "learning_rate": 1.721766350073373e-08, "loss": 0.0085, "step": 4905 }, { "epoch": 0.974476114807826, "grad_norm": 0.5403139332223295, "learning_rate": 1.6951907991026863e-08, "loss": 0.0104, "step": 4906 }, { "epoch": 0.9746747442645745, "grad_norm": 0.3412169774660418, "learning_rate": 1.6688215956545128e-08, "loss": 0.0081, "step": 4907 }, { "epoch": 0.9748733737213229, "grad_norm": 0.30705865819778627, "learning_rate": 1.6426587506482295e-08, "loss": 0.0102, "step": 4908 }, { "epoch": 0.9750720031780713, "grad_norm": 0.46701451128719806, "learning_rate": 1.616702274917892e-08, "loss": 0.0101, "step": 4909 }, { "epoch": 0.9752706326348197, "grad_norm": 0.39809981599821737, "learning_rate": 1.590952179212013e-08, "loss": 0.0109, "step": 4910 }, { "epoch": 0.9754692620915681, "grad_norm": 0.2555686289992959, "learning_rate": 1.565408474193786e-08, "loss": 0.0054, "step": 4911 }, { "epoch": 0.9756678915483166, "grad_norm": 0.24199181404380177, "learning_rate": 1.540071170440749e-08, "loss": 0.0096, "step": 4912 }, { "epoch": 0.9758665210050651, "grad_norm": 0.5796493821986631, "learning_rate": 1.514940278445065e-08, "loss": 0.0074, "step": 4913 }, { "epoch": 0.9760651504618135, "grad_norm": 0.5748057498768764, "learning_rate": 1.4900158086134097e-08, "loss": 0.0154, "step": 4914 }, { "epoch": 0.976263779918562, "grad_norm": 0.33859494804875856, "learning_rate": 1.4652977712669714e-08, "loss": 0.0057, "step": 4915 }, { "epoch": 0.9764624093753104, "grad_norm": 0.4017728258831618, "learning_rate": 1.4407861766415066e-08, "loss": 0.0132, "step": 4916 }, { "epoch": 0.9766610388320588, "grad_norm": 0.6787171557695976, "learning_rate": 1.4164810348871739e-08, "loss": 0.0118, "step": 4917 }, { "epoch": 0.9768596682888072, "grad_norm": 0.6909520206503219, "learning_rate": 1.392382356068811e-08, "loss": 0.0164, "step": 4918 }, { "epoch": 0.9770582977455556, "grad_norm": 0.5243266524129274, "learning_rate": 1.3684901501655468e-08, "loss": 0.0102, "step": 4919 }, { "epoch": 0.9772569272023041, "grad_norm": 0.5013715407112671, "learning_rate": 1.344804427071189e-08, "loss": 0.0145, "step": 4920 }, { "epoch": 0.9774555566590525, "grad_norm": 0.6104887396616863, "learning_rate": 1.3213251965939478e-08, "loss": 0.0094, "step": 4921 }, { "epoch": 0.977654186115801, "grad_norm": 12.92537382124017, "learning_rate": 1.2980524684565455e-08, "loss": 0.0142, "step": 4922 }, { "epoch": 0.9778528155725494, "grad_norm": 0.4169042250958797, "learning_rate": 1.274986252296273e-08, "loss": 0.0092, "step": 4923 }, { "epoch": 0.9780514450292979, "grad_norm": 0.8118130101634882, "learning_rate": 1.2521265576646569e-08, "loss": 0.015, "step": 4924 }, { "epoch": 0.9782500744860463, "grad_norm": 0.6249056291594971, "learning_rate": 1.2294733940280135e-08, "loss": 0.011, "step": 4925 }, { "epoch": 0.9784487039427947, "grad_norm": 0.39444979264014, "learning_rate": 1.2070267707670058e-08, "loss": 0.0086, "step": 4926 }, { "epoch": 0.9786473333995431, "grad_norm": 1.2105651014944392, "learning_rate": 1.184786697176643e-08, "loss": 0.0201, "step": 4927 }, { "epoch": 0.9788459628562916, "grad_norm": 0.6828183839448554, "learning_rate": 1.1627531824666138e-08, "loss": 0.0137, "step": 4928 }, { "epoch": 0.97904459231304, "grad_norm": 0.7000845033148919, "learning_rate": 1.1409262357609529e-08, "loss": 0.012, "step": 4929 }, { "epoch": 0.9792432217697885, "grad_norm": 0.44350465099943603, "learning_rate": 1.1193058660980971e-08, "loss": 0.0115, "step": 4930 }, { "epoch": 0.9794418512265369, "grad_norm": 0.7669306721956611, "learning_rate": 1.0978920824311622e-08, "loss": 0.0188, "step": 4931 }, { "epoch": 0.9796404806832854, "grad_norm": 0.4653719983059989, "learning_rate": 1.0766848936274998e-08, "loss": 0.0079, "step": 4932 }, { "epoch": 0.9798391101400338, "grad_norm": 0.3330343266105515, "learning_rate": 1.0556843084689738e-08, "loss": 0.0084, "step": 4933 }, { "epoch": 0.9800377395967822, "grad_norm": 0.4031244671758501, "learning_rate": 1.0348903356519057e-08, "loss": 0.0088, "step": 4934 }, { "epoch": 0.9802363690535306, "grad_norm": 0.47173933428617837, "learning_rate": 1.0143029837870744e-08, "loss": 0.0117, "step": 4935 }, { "epoch": 0.980434998510279, "grad_norm": 0.5623629301817437, "learning_rate": 9.939222613997157e-09, "loss": 0.0088, "step": 4936 }, { "epoch": 0.9806336279670275, "grad_norm": 0.6962143816147889, "learning_rate": 9.737481769293566e-09, "loss": 0.0147, "step": 4937 }, { "epoch": 0.980832257423776, "grad_norm": 0.456040337121184, "learning_rate": 9.537807387302034e-09, "loss": 0.0093, "step": 4938 }, { "epoch": 0.9810308868805244, "grad_norm": 0.4400816228735132, "learning_rate": 9.340199550706974e-09, "loss": 0.0062, "step": 4939 }, { "epoch": 0.9812295163372728, "grad_norm": 0.7076270413554725, "learning_rate": 9.144658341337375e-09, "loss": 0.014, "step": 4940 }, { "epoch": 0.9814281457940213, "grad_norm": 0.8596549261688607, "learning_rate": 8.9511838401668e-09, "loss": 0.0221, "step": 4941 }, { "epoch": 0.9816267752507697, "grad_norm": 0.3642453441116622, "learning_rate": 8.75977612731227e-09, "loss": 0.0109, "step": 4942 }, { "epoch": 0.9818254047075181, "grad_norm": 0.38678000727812256, "learning_rate": 8.570435282037048e-09, "loss": 0.0095, "step": 4943 }, { "epoch": 0.9820240341642665, "grad_norm": 0.5698335800750048, "learning_rate": 8.383161382745087e-09, "loss": 0.0101, "step": 4944 }, { "epoch": 0.982222663621015, "grad_norm": 0.46124466080444443, "learning_rate": 8.197954506988237e-09, "loss": 0.0057, "step": 4945 }, { "epoch": 0.9824212930777634, "grad_norm": 0.3552672963335277, "learning_rate": 8.014814731458487e-09, "loss": 0.0059, "step": 4946 }, { "epoch": 0.9826199225345119, "grad_norm": 0.42903428026043167, "learning_rate": 7.833742131995725e-09, "loss": 0.0077, "step": 4947 }, { "epoch": 0.9828185519912603, "grad_norm": 0.5963106438933184, "learning_rate": 7.65473678358053e-09, "loss": 0.0132, "step": 4948 }, { "epoch": 0.9830171814480088, "grad_norm": 0.6044799566434823, "learning_rate": 7.477798760339717e-09, "loss": 0.0083, "step": 4949 }, { "epoch": 0.9832158109047572, "grad_norm": 0.25409019391898363, "learning_rate": 7.302928135542453e-09, "loss": 0.0096, "step": 4950 }, { "epoch": 0.9834144403615056, "grad_norm": 0.4559853535199204, "learning_rate": 7.130124981603037e-09, "loss": 0.0154, "step": 4951 }, { "epoch": 0.983613069818254, "grad_norm": 0.5446426719314116, "learning_rate": 6.959389370079228e-09, "loss": 0.0084, "step": 4952 }, { "epoch": 0.9838116992750024, "grad_norm": 0.4002074352633331, "learning_rate": 6.7907213716716936e-09, "loss": 0.0134, "step": 4953 }, { "epoch": 0.9840103287317509, "grad_norm": 0.5482538576918561, "learning_rate": 6.624121056225674e-09, "loss": 0.0101, "step": 4954 }, { "epoch": 0.9842089581884994, "grad_norm": 0.2548625973854045, "learning_rate": 6.459588492731539e-09, "loss": 0.0071, "step": 4955 }, { "epoch": 0.9844075876452478, "grad_norm": 0.7503639525229563, "learning_rate": 6.297123749320344e-09, "loss": 0.0075, "step": 4956 }, { "epoch": 0.9846062171019963, "grad_norm": 0.4290624194383481, "learning_rate": 6.13672689326994e-09, "loss": 0.0142, "step": 4957 }, { "epoch": 0.9848048465587447, "grad_norm": 0.47732768665483954, "learning_rate": 5.978397990999973e-09, "loss": 0.0084, "step": 4958 }, { "epoch": 0.9850034760154931, "grad_norm": 0.6430141219587137, "learning_rate": 5.822137108074111e-09, "loss": 0.0122, "step": 4959 }, { "epoch": 0.9852021054722415, "grad_norm": 0.4601266343624083, "learning_rate": 5.6679443092000354e-09, "loss": 0.0111, "step": 4960 }, { "epoch": 0.9854007349289899, "grad_norm": 0.8545753873794333, "learning_rate": 5.515819658228339e-09, "loss": 0.0072, "step": 4961 }, { "epoch": 0.9855993643857384, "grad_norm": 0.5894818362611455, "learning_rate": 5.3657632181547405e-09, "loss": 0.0107, "step": 4962 }, { "epoch": 0.9857979938424868, "grad_norm": 0.6638553478242332, "learning_rate": 5.217775051116203e-09, "loss": 0.0135, "step": 4963 }, { "epoch": 0.9859966232992353, "grad_norm": 0.28446784639161005, "learning_rate": 5.071855218395927e-09, "loss": 0.0057, "step": 4964 }, { "epoch": 0.9861952527559837, "grad_norm": 0.5910524592127953, "learning_rate": 4.9280037804178e-09, "loss": 0.0144, "step": 4965 }, { "epoch": 0.9863938822127322, "grad_norm": 0.6211297902663819, "learning_rate": 4.78622079675084e-09, "loss": 0.0109, "step": 4966 }, { "epoch": 0.9865925116694806, "grad_norm": 0.5622570313549148, "learning_rate": 4.64650632610808e-09, "loss": 0.0115, "step": 4967 }, { "epoch": 0.986791141126229, "grad_norm": 0.7851311388372668, "learning_rate": 4.508860426344353e-09, "loss": 0.0108, "step": 4968 }, { "epoch": 0.9869897705829774, "grad_norm": 0.9651707733360702, "learning_rate": 4.3732831544590625e-09, "loss": 0.0158, "step": 4969 }, { "epoch": 0.9871884000397259, "grad_norm": 1.3074665840528905, "learning_rate": 4.239774566594523e-09, "loss": 0.009, "step": 4970 }, { "epoch": 0.9873870294964743, "grad_norm": 0.6168920501208438, "learning_rate": 4.1083347180359555e-09, "loss": 0.0158, "step": 4971 }, { "epoch": 0.9875856589532228, "grad_norm": 0.5709471615250453, "learning_rate": 3.9789636632131536e-09, "loss": 0.0088, "step": 4972 }, { "epoch": 0.9877842884099712, "grad_norm": 0.4726577740179389, "learning_rate": 3.851661455698819e-09, "loss": 0.0092, "step": 4973 }, { "epoch": 0.9879829178667197, "grad_norm": 0.34416204276241585, "learning_rate": 3.726428148208006e-09, "loss": 0.005, "step": 4974 }, { "epoch": 0.9881815473234681, "grad_norm": 0.5945886172045202, "learning_rate": 3.6032637925997873e-09, "loss": 0.0175, "step": 4975 }, { "epoch": 0.9883801767802165, "grad_norm": 0.6385884578112276, "learning_rate": 3.4821684398766987e-09, "loss": 0.0105, "step": 4976 }, { "epoch": 0.9885788062369649, "grad_norm": 0.4104801504398456, "learning_rate": 3.3631421401836284e-09, "loss": 0.0114, "step": 4977 }, { "epoch": 0.9887774356937133, "grad_norm": 0.31068946253955954, "learning_rate": 3.2461849428094827e-09, "loss": 0.006, "step": 4978 }, { "epoch": 0.9889760651504618, "grad_norm": 0.6688702368786089, "learning_rate": 3.131296896187186e-09, "loss": 0.0101, "step": 4979 }, { "epoch": 0.9891746946072103, "grad_norm": 0.4449185151964188, "learning_rate": 3.0184780478897947e-09, "loss": 0.0123, "step": 4980 }, { "epoch": 0.9893733240639587, "grad_norm": 0.5828177516464593, "learning_rate": 2.907728444637159e-09, "loss": 0.0127, "step": 4981 }, { "epoch": 0.9895719535207071, "grad_norm": 0.4850738539372369, "learning_rate": 2.7990481322898166e-09, "loss": 0.0141, "step": 4982 }, { "epoch": 0.9897705829774556, "grad_norm": 0.46338314092768906, "learning_rate": 2.6924371558523233e-09, "loss": 0.0139, "step": 4983 }, { "epoch": 0.989969212434204, "grad_norm": 0.4030592860660505, "learning_rate": 2.5878955594726974e-09, "loss": 0.0117, "step": 4984 }, { "epoch": 0.9901678418909524, "grad_norm": 1.0961667604547174, "learning_rate": 2.4854233864402e-09, "loss": 0.0202, "step": 4985 }, { "epoch": 0.9903664713477008, "grad_norm": 0.4721411563769088, "learning_rate": 2.3850206791897756e-09, "loss": 0.0113, "step": 4986 }, { "epoch": 0.9905651008044493, "grad_norm": 0.4833604924531587, "learning_rate": 2.286687479297056e-09, "loss": 0.0095, "step": 4987 }, { "epoch": 0.9907637302611977, "grad_norm": 0.5752978520181773, "learning_rate": 2.1904238274828016e-09, "loss": 0.0097, "step": 4988 }, { "epoch": 0.9909623597179462, "grad_norm": 0.39596085840021944, "learning_rate": 2.0962297636084593e-09, "loss": 0.0083, "step": 4989 }, { "epoch": 0.9911609891746946, "grad_norm": 0.6351520216922857, "learning_rate": 2.0041053266806054e-09, "loss": 0.0117, "step": 4990 }, { "epoch": 0.9913596186314431, "grad_norm": 0.537475588519044, "learning_rate": 1.9140505548476128e-09, "loss": 0.0079, "step": 4991 }, { "epoch": 0.9915582480881915, "grad_norm": 0.4082058783031262, "learning_rate": 1.8260654854013182e-09, "loss": 0.0097, "step": 4992 }, { "epoch": 0.9917568775449399, "grad_norm": 1.0157688286043665, "learning_rate": 1.7401501547759104e-09, "loss": 0.021, "step": 4993 }, { "epoch": 0.9919555070016883, "grad_norm": 0.3634022299259286, "learning_rate": 1.6563045985490412e-09, "loss": 0.0107, "step": 4994 }, { "epoch": 0.9921541364584368, "grad_norm": 0.6952556762857542, "learning_rate": 1.5745288514407153e-09, "loss": 0.0147, "step": 4995 }, { "epoch": 0.9923527659151852, "grad_norm": 0.5786296634418221, "learning_rate": 1.4948229473144005e-09, "loss": 0.0117, "step": 4996 }, { "epoch": 0.9925513953719337, "grad_norm": 0.6356775912475472, "learning_rate": 1.417186919176472e-09, "loss": 0.0151, "step": 4997 }, { "epoch": 0.9927500248286821, "grad_norm": 0.5692096365130425, "learning_rate": 1.341620799175658e-09, "loss": 0.0076, "step": 4998 }, { "epoch": 0.9929486542854306, "grad_norm": 0.8015252962881247, "learning_rate": 1.2681246186035945e-09, "loss": 0.0166, "step": 4999 }, { "epoch": 0.993147283742179, "grad_norm": 0.5276135206707997, "learning_rate": 1.1966984078959354e-09, "loss": 0.0087, "step": 5000 }, { "epoch": 0.9933459131989274, "grad_norm": 0.4908418867095688, "learning_rate": 1.1273421966290221e-09, "loss": 0.0094, "step": 5001 }, { "epoch": 0.9935445426556758, "grad_norm": 0.9923607226693753, "learning_rate": 1.0600560135237691e-09, "loss": 0.0145, "step": 5002 }, { "epoch": 0.9937431721124242, "grad_norm": 0.2731344484135086, "learning_rate": 9.948398864434439e-10, "loss": 0.0068, "step": 5003 }, { "epoch": 0.9939418015691727, "grad_norm": 0.7287487195663518, "learning_rate": 9.316938423936662e-10, "loss": 0.0092, "step": 5004 }, { "epoch": 0.9941404310259211, "grad_norm": 0.505240386716718, "learning_rate": 8.706179075229637e-10, "loss": 0.0117, "step": 5005 }, { "epoch": 0.9943390604826696, "grad_norm": 0.5101453671929035, "learning_rate": 8.116121071238825e-10, "loss": 0.0136, "step": 5006 }, { "epoch": 0.994537689939418, "grad_norm": 0.4807315646835647, "learning_rate": 7.546764656291005e-10, "loss": 0.0086, "step": 5007 }, { "epoch": 0.9947363193961665, "grad_norm": 0.4595127871163573, "learning_rate": 6.998110066169794e-10, "loss": 0.0083, "step": 5008 }, { "epoch": 0.9949349488529149, "grad_norm": 0.5817190396655636, "learning_rate": 6.470157528065679e-10, "loss": 0.0101, "step": 5009 }, { "epoch": 0.9951335783096633, "grad_norm": 0.3676483776227927, "learning_rate": 5.962907260603779e-10, "loss": 0.0088, "step": 5010 }, { "epoch": 0.9953322077664117, "grad_norm": 0.5527914548170548, "learning_rate": 5.476359473838289e-10, "loss": 0.0089, "step": 5011 }, { "epoch": 0.9955308372231602, "grad_norm": 0.47874706302938747, "learning_rate": 5.010514369246933e-10, "loss": 0.0131, "step": 5012 }, { "epoch": 0.9957294666799086, "grad_norm": 0.4328300045576964, "learning_rate": 4.565372139730961e-10, "loss": 0.0162, "step": 5013 }, { "epoch": 0.9959280961366571, "grad_norm": 0.45665840240784356, "learning_rate": 4.140932969631806e-10, "loss": 0.0085, "step": 5014 }, { "epoch": 0.9961267255934055, "grad_norm": 0.9001422570931706, "learning_rate": 3.737197034703322e-10, "loss": 0.0126, "step": 5015 }, { "epoch": 0.996325355050154, "grad_norm": 0.45807153798940137, "learning_rate": 3.3541645021339944e-10, "loss": 0.0105, "step": 5016 }, { "epoch": 0.9965239845069024, "grad_norm": 0.6282268630028757, "learning_rate": 2.991835530535836e-10, "loss": 0.0092, "step": 5017 }, { "epoch": 0.9967226139636508, "grad_norm": 0.42023530746133536, "learning_rate": 2.650210269955489e-10, "loss": 0.01, "step": 5018 }, { "epoch": 0.9969212434203992, "grad_norm": 0.40516998628725354, "learning_rate": 2.3292888618520195e-10, "loss": 0.006, "step": 5019 }, { "epoch": 0.9971198728771477, "grad_norm": 0.5106870738378689, "learning_rate": 2.0290714391191235e-10, "loss": 0.0135, "step": 5020 }, { "epoch": 0.9973185023338961, "grad_norm": 0.5702272196550405, "learning_rate": 1.7495581260795758e-10, "loss": 0.0084, "step": 5021 }, { "epoch": 0.9975171317906446, "grad_norm": 0.3574001167310024, "learning_rate": 1.4907490384796774e-10, "loss": 0.0089, "step": 5022 }, { "epoch": 0.997715761247393, "grad_norm": 0.6482126256169631, "learning_rate": 1.252644283489257e-10, "loss": 0.009, "step": 5023 }, { "epoch": 0.9979143907041415, "grad_norm": 0.40764995367026596, "learning_rate": 1.0352439597072217e-10, "loss": 0.0123, "step": 5024 }, { "epoch": 0.9981130201608899, "grad_norm": 0.464525483618827, "learning_rate": 8.385481571615561e-11, "loss": 0.0124, "step": 5025 }, { "epoch": 0.9983116496176383, "grad_norm": 0.49823901728048225, "learning_rate": 6.625569573037727e-11, "loss": 0.0176, "step": 5026 }, { "epoch": 0.9985102790743867, "grad_norm": 0.5896583089333627, "learning_rate": 5.0727043301446175e-11, "loss": 0.0129, "step": 5027 }, { "epoch": 0.9987089085311351, "grad_norm": 0.37718748502681293, "learning_rate": 3.726886485866388e-11, "loss": 0.0133, "step": 5028 }, { "epoch": 0.9989075379878836, "grad_norm": 0.5830988240086593, "learning_rate": 2.5881165976460178e-11, "loss": 0.0161, "step": 5029 }, { "epoch": 0.999106167444632, "grad_norm": 0.6251502587104545, "learning_rate": 1.6563951368842034e-11, "loss": 0.0125, "step": 5030 }, { "epoch": 0.9993047969013805, "grad_norm": 0.5850394153814858, "learning_rate": 9.317224895499799e-12, "loss": 0.013, "step": 5031 }, { "epoch": 0.9995034263581289, "grad_norm": 0.5740375396508653, "learning_rate": 4.1409895568111924e-12, "loss": 0.0112, "step": 5032 }, { "epoch": 0.9997020558148774, "grad_norm": 0.2949227457860251, "learning_rate": 1.0352474966168758e-12, "loss": 0.0055, "step": 5033 }, { "epoch": 0.9999006852716258, "grad_norm": 0.8152259997227421, "learning_rate": 0.0, "loss": 0.0059, "step": 5034 }, { "epoch": 0.9999006852716258, "step": 5034, "total_flos": 393450150948864.0, "train_loss": 0.013911830744355322, "train_runtime": 43586.62, "train_samples_per_second": 7.393, "train_steps_per_second": 0.115 } ], "logging_steps": 1.0, "max_steps": 5034, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 393450150948864.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }