{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992505620784412, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007994004496627529, "grad_norm": 29.08027928947176, "learning_rate": 0.0, "loss": 1.7209, "step": 1 }, { "epoch": 0.0015988008993255058, "grad_norm": 9.836200747540412, "learning_rate": 2.7023815442731975e-06, "loss": 1.2157, "step": 2 }, { "epoch": 0.002398201348988259, "grad_norm": 8.732062138142359, "learning_rate": 4.2831734103139475e-06, "loss": 1.2213, "step": 3 }, { "epoch": 0.0031976017986510116, "grad_norm": 8.98196608627301, "learning_rate": 5.404763088546395e-06, "loss": 1.3207, "step": 4 }, { "epoch": 0.003997002248313765, "grad_norm": 3.104558237084713, "learning_rate": 6.274735630753034e-06, "loss": 1.2009, "step": 5 }, { "epoch": 0.004796402697976518, "grad_norm": 2.9678718492236587, "learning_rate": 6.985554954587145e-06, "loss": 1.1976, "step": 6 }, { "epoch": 0.0055958031476392705, "grad_norm": 2.324032539210556, "learning_rate": 7.586544129592991e-06, "loss": 1.1668, "step": 7 }, { "epoch": 0.006395203597302023, "grad_norm": 2.422145845478249, "learning_rate": 8.107144632819592e-06, "loss": 1.1056, "step": 8 }, { "epoch": 0.007194604046964776, "grad_norm": 2.7795213648793236, "learning_rate": 8.566346820627895e-06, "loss": 1.1439, "step": 9 }, { "epoch": 0.00799400449662753, "grad_norm": 2.304173813168448, "learning_rate": 8.977117175026234e-06, "loss": 1.0859, "step": 10 }, { "epoch": 0.008793404946290282, "grad_norm": 2.531444418518243, "learning_rate": 9.348704159880588e-06, "loss": 1.1012, "step": 11 }, { "epoch": 0.009592805395953035, "grad_norm": 2.623744403178605, "learning_rate": 9.687936498860343e-06, "loss": 1.1248, "step": 12 }, { "epoch": 0.010392205845615787, "grad_norm": 2.174204408077499, "learning_rate": 1e-05, "loss": 1.0862, "step": 13 }, { "epoch": 0.011191606295278541, "grad_norm": 2.1375382895043553, "learning_rate": 1e-05, "loss": 1.0843, "step": 14 }, { "epoch": 0.011991006744941295, "grad_norm": 2.3409573740941245, "learning_rate": 1e-05, "loss": 1.1007, "step": 15 }, { "epoch": 0.012790407194604047, "grad_norm": 2.2321265748114443, "learning_rate": 1e-05, "loss": 1.0199, "step": 16 }, { "epoch": 0.0135898076442668, "grad_norm": 2.2607491323391997, "learning_rate": 1e-05, "loss": 1.1098, "step": 17 }, { "epoch": 0.014389208093929552, "grad_norm": 2.1345387966971328, "learning_rate": 1e-05, "loss": 1.0852, "step": 18 }, { "epoch": 0.015188608543592306, "grad_norm": 2.0836111411515224, "learning_rate": 1e-05, "loss": 1.0227, "step": 19 }, { "epoch": 0.01598800899325506, "grad_norm": 2.1200221376043826, "learning_rate": 1e-05, "loss": 1.0764, "step": 20 }, { "epoch": 0.016787409442917813, "grad_norm": 2.3277973958562947, "learning_rate": 1e-05, "loss": 1.0425, "step": 21 }, { "epoch": 0.017586809892580563, "grad_norm": 2.4310258538885523, "learning_rate": 1e-05, "loss": 1.0437, "step": 22 }, { "epoch": 0.018386210342243317, "grad_norm": 2.317560454038046, "learning_rate": 1e-05, "loss": 1.0027, "step": 23 }, { "epoch": 0.01918561079190607, "grad_norm": 2.1153613214468923, "learning_rate": 1e-05, "loss": 1.0878, "step": 24 }, { "epoch": 0.019985011241568824, "grad_norm": 2.1138684148369884, "learning_rate": 1e-05, "loss": 1.0797, "step": 25 }, { "epoch": 0.020784411691231575, "grad_norm": 2.3869844261967765, "learning_rate": 1e-05, "loss": 1.1126, "step": 26 }, { "epoch": 0.02158381214089433, "grad_norm": 1.9441687206265474, "learning_rate": 1e-05, "loss": 1.0356, "step": 27 }, { "epoch": 0.022383212590557082, "grad_norm": 1.8858684427680283, "learning_rate": 1e-05, "loss": 1.0112, "step": 28 }, { "epoch": 0.023182613040219836, "grad_norm": 2.0111908392780924, "learning_rate": 1e-05, "loss": 1.025, "step": 29 }, { "epoch": 0.02398201348988259, "grad_norm": 2.3223850597645885, "learning_rate": 1e-05, "loss": 1.0608, "step": 30 }, { "epoch": 0.02478141393954534, "grad_norm": 2.282704095464692, "learning_rate": 1e-05, "loss": 0.9884, "step": 31 }, { "epoch": 0.025580814389208093, "grad_norm": 2.2485551406767392, "learning_rate": 1e-05, "loss": 1.1609, "step": 32 }, { "epoch": 0.026380214838870847, "grad_norm": 1.9632420284716974, "learning_rate": 1e-05, "loss": 1.0541, "step": 33 }, { "epoch": 0.0271796152885336, "grad_norm": 2.7873694225738963, "learning_rate": 1e-05, "loss": 0.9917, "step": 34 }, { "epoch": 0.027979015738196354, "grad_norm": 2.048096411620949, "learning_rate": 1e-05, "loss": 1.012, "step": 35 }, { "epoch": 0.028778416187859104, "grad_norm": 2.0309944076384494, "learning_rate": 1e-05, "loss": 1.0212, "step": 36 }, { "epoch": 0.029577816637521858, "grad_norm": 2.0949849865314643, "learning_rate": 1e-05, "loss": 1.0659, "step": 37 }, { "epoch": 0.03037721708718461, "grad_norm": 2.059202087957289, "learning_rate": 1e-05, "loss": 1.0168, "step": 38 }, { "epoch": 0.031176617536847365, "grad_norm": 2.0975700429920923, "learning_rate": 1e-05, "loss": 1.0216, "step": 39 }, { "epoch": 0.03197601798651012, "grad_norm": 2.0062452254349714, "learning_rate": 1e-05, "loss": 1.0274, "step": 40 }, { "epoch": 0.03277541843617287, "grad_norm": 2.222854538118324, "learning_rate": 1e-05, "loss": 1.0656, "step": 41 }, { "epoch": 0.033574818885835626, "grad_norm": 1.943599598185592, "learning_rate": 1e-05, "loss": 1.0782, "step": 42 }, { "epoch": 0.03437421933549838, "grad_norm": 1.9956218218997503, "learning_rate": 1e-05, "loss": 1.0625, "step": 43 }, { "epoch": 0.03517361978516113, "grad_norm": 2.1611198939392096, "learning_rate": 1e-05, "loss": 1.041, "step": 44 }, { "epoch": 0.035973020234823884, "grad_norm": 1.9975085093102276, "learning_rate": 1e-05, "loss": 1.0046, "step": 45 }, { "epoch": 0.036772420684486634, "grad_norm": 1.8691307201375191, "learning_rate": 1e-05, "loss": 1.0243, "step": 46 }, { "epoch": 0.03757182113414939, "grad_norm": 2.1275630339366667, "learning_rate": 1e-05, "loss": 1.0565, "step": 47 }, { "epoch": 0.03837122158381214, "grad_norm": 1.998529171481795, "learning_rate": 1e-05, "loss": 0.972, "step": 48 }, { "epoch": 0.03917062203347489, "grad_norm": 2.039027660741352, "learning_rate": 1e-05, "loss": 0.9604, "step": 49 }, { "epoch": 0.03997002248313765, "grad_norm": 1.8761207165317535, "learning_rate": 1e-05, "loss": 0.9985, "step": 50 }, { "epoch": 0.0407694229328004, "grad_norm": 2.089454409239614, "learning_rate": 1e-05, "loss": 0.9963, "step": 51 }, { "epoch": 0.04156882338246315, "grad_norm": 2.0445251187040134, "learning_rate": 1e-05, "loss": 1.0192, "step": 52 }, { "epoch": 0.042368223832125906, "grad_norm": 2.205588684592072, "learning_rate": 1e-05, "loss": 0.9684, "step": 53 }, { "epoch": 0.04316762428178866, "grad_norm": 2.0208537418585957, "learning_rate": 1e-05, "loss": 1.0063, "step": 54 }, { "epoch": 0.043967024731451414, "grad_norm": 1.7869034029258606, "learning_rate": 1e-05, "loss": 1.0368, "step": 55 }, { "epoch": 0.044766425181114164, "grad_norm": 1.8924926601293262, "learning_rate": 1e-05, "loss": 1.011, "step": 56 }, { "epoch": 0.045565825630776914, "grad_norm": 2.151723728750191, "learning_rate": 1e-05, "loss": 1.0275, "step": 57 }, { "epoch": 0.04636522608043967, "grad_norm": 2.388300807396013, "learning_rate": 1e-05, "loss": 0.996, "step": 58 }, { "epoch": 0.04716462653010242, "grad_norm": 1.9793946104980729, "learning_rate": 1e-05, "loss": 1.028, "step": 59 }, { "epoch": 0.04796402697976518, "grad_norm": 2.050014939910642, "learning_rate": 1e-05, "loss": 1.0109, "step": 60 }, { "epoch": 0.04876342742942793, "grad_norm": 1.8842986029616882, "learning_rate": 1e-05, "loss": 0.9752, "step": 61 }, { "epoch": 0.04956282787909068, "grad_norm": 1.7444876770795246, "learning_rate": 1e-05, "loss": 1.0228, "step": 62 }, { "epoch": 0.050362228328753436, "grad_norm": 1.8304676501403103, "learning_rate": 1e-05, "loss": 0.9747, "step": 63 }, { "epoch": 0.051161628778416186, "grad_norm": 2.1540039062270164, "learning_rate": 1e-05, "loss": 0.9955, "step": 64 }, { "epoch": 0.051961029228078943, "grad_norm": 1.6953401550549316, "learning_rate": 1e-05, "loss": 0.9811, "step": 65 }, { "epoch": 0.052760429677741694, "grad_norm": 2.1460856566454987, "learning_rate": 1e-05, "loss": 1.0365, "step": 66 }, { "epoch": 0.053559830127404444, "grad_norm": 1.7390283863943892, "learning_rate": 1e-05, "loss": 1.0277, "step": 67 }, { "epoch": 0.0543592305770672, "grad_norm": 2.0836221978397442, "learning_rate": 1e-05, "loss": 0.9953, "step": 68 }, { "epoch": 0.05515863102672995, "grad_norm": 1.7905448109320714, "learning_rate": 1e-05, "loss": 0.9944, "step": 69 }, { "epoch": 0.05595803147639271, "grad_norm": 1.9504348528444273, "learning_rate": 1e-05, "loss": 0.9808, "step": 70 }, { "epoch": 0.05675743192605546, "grad_norm": 1.834972840275589, "learning_rate": 1e-05, "loss": 0.9992, "step": 71 }, { "epoch": 0.05755683237571821, "grad_norm": 1.845072042104488, "learning_rate": 1e-05, "loss": 0.9811, "step": 72 }, { "epoch": 0.058356232825380966, "grad_norm": 1.85534014854077, "learning_rate": 1e-05, "loss": 0.9864, "step": 73 }, { "epoch": 0.059155633275043716, "grad_norm": 1.8650405189842276, "learning_rate": 1e-05, "loss": 0.9925, "step": 74 }, { "epoch": 0.05995503372470647, "grad_norm": 1.862410414010068, "learning_rate": 1e-05, "loss": 1.0991, "step": 75 }, { "epoch": 0.06075443417436922, "grad_norm": 2.1389193269284625, "learning_rate": 1e-05, "loss": 1.0228, "step": 76 }, { "epoch": 0.061553834624031974, "grad_norm": 1.7408061970131428, "learning_rate": 1e-05, "loss": 1.0034, "step": 77 }, { "epoch": 0.06235323507369473, "grad_norm": 2.0783333855212653, "learning_rate": 1e-05, "loss": 1.0015, "step": 78 }, { "epoch": 0.06315263552335748, "grad_norm": 2.1794919181439507, "learning_rate": 1e-05, "loss": 1.0184, "step": 79 }, { "epoch": 0.06395203597302024, "grad_norm": 1.8799556566280435, "learning_rate": 1e-05, "loss": 0.9807, "step": 80 }, { "epoch": 0.06475143642268298, "grad_norm": 1.6068132265611528, "learning_rate": 1e-05, "loss": 1.0318, "step": 81 }, { "epoch": 0.06555083687234574, "grad_norm": 1.8404529509039422, "learning_rate": 1e-05, "loss": 0.9634, "step": 82 }, { "epoch": 0.0663502373220085, "grad_norm": 1.8490571137069702, "learning_rate": 1e-05, "loss": 0.9362, "step": 83 }, { "epoch": 0.06714963777167125, "grad_norm": 2.1048586741337485, "learning_rate": 1e-05, "loss": 1.051, "step": 84 }, { "epoch": 0.067949038221334, "grad_norm": 1.9361395487099815, "learning_rate": 1e-05, "loss": 0.9884, "step": 85 }, { "epoch": 0.06874843867099675, "grad_norm": 1.882438664110377, "learning_rate": 1e-05, "loss": 1.0338, "step": 86 }, { "epoch": 0.06954783912065951, "grad_norm": 1.9328301399003285, "learning_rate": 1e-05, "loss": 1.0123, "step": 87 }, { "epoch": 0.07034723957032225, "grad_norm": 1.9592492051372121, "learning_rate": 1e-05, "loss": 1.015, "step": 88 }, { "epoch": 0.07114664001998501, "grad_norm": 2.0637394818205035, "learning_rate": 1e-05, "loss": 1.0074, "step": 89 }, { "epoch": 0.07194604046964777, "grad_norm": 1.875788422779308, "learning_rate": 1e-05, "loss": 0.966, "step": 90 }, { "epoch": 0.07274544091931051, "grad_norm": 1.8409070357840667, "learning_rate": 1e-05, "loss": 1.0463, "step": 91 }, { "epoch": 0.07354484136897327, "grad_norm": 1.9103779504623786, "learning_rate": 1e-05, "loss": 0.9633, "step": 92 }, { "epoch": 0.07434424181863603, "grad_norm": 2.0590523934839307, "learning_rate": 1e-05, "loss": 1.0215, "step": 93 }, { "epoch": 0.07514364226829878, "grad_norm": 2.104785750263468, "learning_rate": 1e-05, "loss": 1.0025, "step": 94 }, { "epoch": 0.07594304271796153, "grad_norm": 2.1695447340449663, "learning_rate": 1e-05, "loss": 0.941, "step": 95 }, { "epoch": 0.07674244316762428, "grad_norm": 2.0465650220094203, "learning_rate": 1e-05, "loss": 1.0093, "step": 96 }, { "epoch": 0.07754184361728704, "grad_norm": 1.8941011997406154, "learning_rate": 1e-05, "loss": 1.064, "step": 97 }, { "epoch": 0.07834124406694978, "grad_norm": 1.9987845140787637, "learning_rate": 1e-05, "loss": 0.9793, "step": 98 }, { "epoch": 0.07914064451661254, "grad_norm": 1.8233385113626337, "learning_rate": 1e-05, "loss": 1.0176, "step": 99 }, { "epoch": 0.0799400449662753, "grad_norm": 1.8162210777833079, "learning_rate": 1e-05, "loss": 0.9699, "step": 100 }, { "epoch": 0.08073944541593804, "grad_norm": 1.8711808189743682, "learning_rate": 1e-05, "loss": 0.9865, "step": 101 }, { "epoch": 0.0815388458656008, "grad_norm": 1.974561488916405, "learning_rate": 1e-05, "loss": 0.9806, "step": 102 }, { "epoch": 0.08233824631526356, "grad_norm": 1.7095584582820083, "learning_rate": 1e-05, "loss": 0.9955, "step": 103 }, { "epoch": 0.0831376467649263, "grad_norm": 1.8952139824297942, "learning_rate": 1e-05, "loss": 0.9338, "step": 104 }, { "epoch": 0.08393704721458906, "grad_norm": 1.8058804845050307, "learning_rate": 1e-05, "loss": 1.0062, "step": 105 }, { "epoch": 0.08473644766425181, "grad_norm": 1.8103680215448428, "learning_rate": 1e-05, "loss": 0.9872, "step": 106 }, { "epoch": 0.08553584811391457, "grad_norm": 1.694736368233996, "learning_rate": 1e-05, "loss": 0.9359, "step": 107 }, { "epoch": 0.08633524856357731, "grad_norm": 1.9235533583641018, "learning_rate": 1e-05, "loss": 1.0611, "step": 108 }, { "epoch": 0.08713464901324007, "grad_norm": 1.619066977691127, "learning_rate": 1e-05, "loss": 0.9654, "step": 109 }, { "epoch": 0.08793404946290283, "grad_norm": 1.8050888311534128, "learning_rate": 1e-05, "loss": 1.004, "step": 110 }, { "epoch": 0.08873344991256557, "grad_norm": 1.9960924269335547, "learning_rate": 1e-05, "loss": 1.0118, "step": 111 }, { "epoch": 0.08953285036222833, "grad_norm": 1.9286201089638149, "learning_rate": 1e-05, "loss": 1.0025, "step": 112 }, { "epoch": 0.09033225081189108, "grad_norm": 2.1725480586787396, "learning_rate": 1e-05, "loss": 0.9558, "step": 113 }, { "epoch": 0.09113165126155383, "grad_norm": 1.857962422635593, "learning_rate": 1e-05, "loss": 0.9772, "step": 114 }, { "epoch": 0.09193105171121659, "grad_norm": 1.9166723424153935, "learning_rate": 1e-05, "loss": 0.9749, "step": 115 }, { "epoch": 0.09273045216087934, "grad_norm": 2.0124769392114854, "learning_rate": 1e-05, "loss": 0.9548, "step": 116 }, { "epoch": 0.0935298526105421, "grad_norm": 1.847426445728428, "learning_rate": 1e-05, "loss": 0.941, "step": 117 }, { "epoch": 0.09432925306020484, "grad_norm": 2.163992947673654, "learning_rate": 1e-05, "loss": 0.9617, "step": 118 }, { "epoch": 0.0951286535098676, "grad_norm": 1.8889979598709639, "learning_rate": 1e-05, "loss": 1.0272, "step": 119 }, { "epoch": 0.09592805395953036, "grad_norm": 1.844634955046446, "learning_rate": 1e-05, "loss": 0.9669, "step": 120 }, { "epoch": 0.0967274544091931, "grad_norm": 1.9301903181704618, "learning_rate": 1e-05, "loss": 0.9717, "step": 121 }, { "epoch": 0.09752685485885586, "grad_norm": 1.9564195723979845, "learning_rate": 1e-05, "loss": 0.9527, "step": 122 }, { "epoch": 0.09832625530851861, "grad_norm": 1.834090339470851, "learning_rate": 1e-05, "loss": 0.9794, "step": 123 }, { "epoch": 0.09912565575818136, "grad_norm": 1.7936104151665677, "learning_rate": 1e-05, "loss": 0.9042, "step": 124 }, { "epoch": 0.09992505620784412, "grad_norm": 1.7969263674080669, "learning_rate": 1e-05, "loss": 1.0397, "step": 125 }, { "epoch": 0.10072445665750687, "grad_norm": 1.7901986458192694, "learning_rate": 1e-05, "loss": 1.0043, "step": 126 }, { "epoch": 0.10152385710716963, "grad_norm": 1.8947234640723079, "learning_rate": 1e-05, "loss": 0.9761, "step": 127 }, { "epoch": 0.10232325755683237, "grad_norm": 1.8487696622255145, "learning_rate": 1e-05, "loss": 0.9899, "step": 128 }, { "epoch": 0.10312265800649513, "grad_norm": 1.8207862729527453, "learning_rate": 1e-05, "loss": 1.0272, "step": 129 }, { "epoch": 0.10392205845615789, "grad_norm": 1.9816716753688939, "learning_rate": 1e-05, "loss": 0.9202, "step": 130 }, { "epoch": 0.10472145890582063, "grad_norm": 1.8916365109275264, "learning_rate": 1e-05, "loss": 0.9629, "step": 131 }, { "epoch": 0.10552085935548339, "grad_norm": 1.9863329832931071, "learning_rate": 1e-05, "loss": 0.9976, "step": 132 }, { "epoch": 0.10632025980514614, "grad_norm": 1.9194816317308832, "learning_rate": 1e-05, "loss": 1.0043, "step": 133 }, { "epoch": 0.10711966025480889, "grad_norm": 1.9537595846189237, "learning_rate": 1e-05, "loss": 0.9453, "step": 134 }, { "epoch": 0.10791906070447164, "grad_norm": 2.0669579990783253, "learning_rate": 1e-05, "loss": 0.9865, "step": 135 }, { "epoch": 0.1087184611541344, "grad_norm": 1.9760934706997628, "learning_rate": 1e-05, "loss": 1.017, "step": 136 }, { "epoch": 0.10951786160379715, "grad_norm": 1.7260389446366302, "learning_rate": 1e-05, "loss": 0.963, "step": 137 }, { "epoch": 0.1103172620534599, "grad_norm": 1.9203242105800193, "learning_rate": 1e-05, "loss": 1.0157, "step": 138 }, { "epoch": 0.11111666250312266, "grad_norm": 1.9850822013474325, "learning_rate": 1e-05, "loss": 0.9438, "step": 139 }, { "epoch": 0.11191606295278542, "grad_norm": 1.9572946605976695, "learning_rate": 1e-05, "loss": 1.0029, "step": 140 }, { "epoch": 0.11271546340244816, "grad_norm": 1.5451741731912971, "learning_rate": 1e-05, "loss": 0.9225, "step": 141 }, { "epoch": 0.11351486385211092, "grad_norm": 2.0070450938810707, "learning_rate": 1e-05, "loss": 0.922, "step": 142 }, { "epoch": 0.11431426430177367, "grad_norm": 1.89832125508894, "learning_rate": 1e-05, "loss": 1.0401, "step": 143 }, { "epoch": 0.11511366475143642, "grad_norm": 1.950327724703524, "learning_rate": 1e-05, "loss": 0.9279, "step": 144 }, { "epoch": 0.11591306520109917, "grad_norm": 1.9700609199158468, "learning_rate": 1e-05, "loss": 0.9864, "step": 145 }, { "epoch": 0.11671246565076193, "grad_norm": 1.6727783834574599, "learning_rate": 1e-05, "loss": 0.9659, "step": 146 }, { "epoch": 0.11751186610042468, "grad_norm": 1.8484918243414765, "learning_rate": 1e-05, "loss": 0.9761, "step": 147 }, { "epoch": 0.11831126655008743, "grad_norm": 2.045306713844051, "learning_rate": 1e-05, "loss": 0.9788, "step": 148 }, { "epoch": 0.11911066699975019, "grad_norm": 1.8558407244018518, "learning_rate": 1e-05, "loss": 0.963, "step": 149 }, { "epoch": 0.11991006744941295, "grad_norm": 1.777504348074839, "learning_rate": 1e-05, "loss": 0.9898, "step": 150 }, { "epoch": 0.12070946789907569, "grad_norm": 1.7945306209083864, "learning_rate": 1e-05, "loss": 0.9475, "step": 151 }, { "epoch": 0.12150886834873845, "grad_norm": 1.612635014991482, "learning_rate": 1e-05, "loss": 0.981, "step": 152 }, { "epoch": 0.1223082687984012, "grad_norm": 1.5365653630331435, "learning_rate": 1e-05, "loss": 0.9336, "step": 153 }, { "epoch": 0.12310766924806395, "grad_norm": 1.7728163669560009, "learning_rate": 1e-05, "loss": 0.9786, "step": 154 }, { "epoch": 0.1239070696977267, "grad_norm": 1.6363907272750682, "learning_rate": 1e-05, "loss": 0.9499, "step": 155 }, { "epoch": 0.12470647014738946, "grad_norm": 1.8927548789352038, "learning_rate": 1e-05, "loss": 0.9537, "step": 156 }, { "epoch": 0.1255058705970522, "grad_norm": 1.576438438411652, "learning_rate": 1e-05, "loss": 0.9273, "step": 157 }, { "epoch": 0.12630527104671496, "grad_norm": 1.8750460465870347, "learning_rate": 1e-05, "loss": 0.9687, "step": 158 }, { "epoch": 0.12710467149637772, "grad_norm": 1.712737472716492, "learning_rate": 1e-05, "loss": 0.9981, "step": 159 }, { "epoch": 0.12790407194604048, "grad_norm": 1.8944147808763965, "learning_rate": 1e-05, "loss": 1.0316, "step": 160 }, { "epoch": 0.12870347239570323, "grad_norm": 1.6975154876149214, "learning_rate": 1e-05, "loss": 0.9921, "step": 161 }, { "epoch": 0.12950287284536596, "grad_norm": 1.7330196261933866, "learning_rate": 1e-05, "loss": 0.9567, "step": 162 }, { "epoch": 0.13030227329502872, "grad_norm": 2.004904627709956, "learning_rate": 1e-05, "loss": 0.9788, "step": 163 }, { "epoch": 0.13110167374469148, "grad_norm": 1.7565329263507932, "learning_rate": 1e-05, "loss": 0.9461, "step": 164 }, { "epoch": 0.13190107419435423, "grad_norm": 1.6976314021380359, "learning_rate": 1e-05, "loss": 0.9926, "step": 165 }, { "epoch": 0.132700474644017, "grad_norm": 1.573182719519626, "learning_rate": 1e-05, "loss": 0.982, "step": 166 }, { "epoch": 0.13349987509367975, "grad_norm": 1.5753994405016738, "learning_rate": 1e-05, "loss": 0.9745, "step": 167 }, { "epoch": 0.1342992755433425, "grad_norm": 1.9199549441489088, "learning_rate": 1e-05, "loss": 0.9916, "step": 168 }, { "epoch": 0.13509867599300523, "grad_norm": 1.7662832212098252, "learning_rate": 1e-05, "loss": 0.9717, "step": 169 }, { "epoch": 0.135898076442668, "grad_norm": 2.1972236756007506, "learning_rate": 1e-05, "loss": 0.9923, "step": 170 }, { "epoch": 0.13669747689233075, "grad_norm": 1.5845907178152914, "learning_rate": 1e-05, "loss": 1.041, "step": 171 }, { "epoch": 0.1374968773419935, "grad_norm": 1.9027156433363486, "learning_rate": 1e-05, "loss": 0.9986, "step": 172 }, { "epoch": 0.13829627779165626, "grad_norm": 1.938028025396952, "learning_rate": 1e-05, "loss": 0.9856, "step": 173 }, { "epoch": 0.13909567824131902, "grad_norm": 1.7615271251517497, "learning_rate": 1e-05, "loss": 0.9879, "step": 174 }, { "epoch": 0.13989507869098175, "grad_norm": 1.5753792433296703, "learning_rate": 1e-05, "loss": 0.9952, "step": 175 }, { "epoch": 0.1406944791406445, "grad_norm": 1.8071610796834736, "learning_rate": 1e-05, "loss": 0.9403, "step": 176 }, { "epoch": 0.14149387959030726, "grad_norm": 1.8188146399425127, "learning_rate": 1e-05, "loss": 0.9166, "step": 177 }, { "epoch": 0.14229328003997002, "grad_norm": 1.8998134327288991, "learning_rate": 1e-05, "loss": 0.9307, "step": 178 }, { "epoch": 0.14309268048963278, "grad_norm": 1.8148916923977343, "learning_rate": 1e-05, "loss": 0.964, "step": 179 }, { "epoch": 0.14389208093929554, "grad_norm": 1.8025702262604992, "learning_rate": 1e-05, "loss": 0.9636, "step": 180 }, { "epoch": 0.1446914813889583, "grad_norm": 1.8970561152549208, "learning_rate": 1e-05, "loss": 0.9446, "step": 181 }, { "epoch": 0.14549088183862102, "grad_norm": 1.774281514717804, "learning_rate": 1e-05, "loss": 0.9011, "step": 182 }, { "epoch": 0.14629028228828378, "grad_norm": 1.6697484592667877, "learning_rate": 1e-05, "loss": 0.9732, "step": 183 }, { "epoch": 0.14708968273794654, "grad_norm": 1.748314198924899, "learning_rate": 1e-05, "loss": 0.9294, "step": 184 }, { "epoch": 0.1478890831876093, "grad_norm": 1.5552333328333348, "learning_rate": 1e-05, "loss": 0.9207, "step": 185 }, { "epoch": 0.14868848363727205, "grad_norm": 1.819375156478493, "learning_rate": 1e-05, "loss": 0.9667, "step": 186 }, { "epoch": 0.1494878840869348, "grad_norm": 1.5853289567427034, "learning_rate": 1e-05, "loss": 0.9863, "step": 187 }, { "epoch": 0.15028728453659757, "grad_norm": 1.7338233390104778, "learning_rate": 1e-05, "loss": 0.9088, "step": 188 }, { "epoch": 0.1510866849862603, "grad_norm": 1.8735214816693204, "learning_rate": 1e-05, "loss": 0.9931, "step": 189 }, { "epoch": 0.15188608543592305, "grad_norm": 1.70836070926444, "learning_rate": 1e-05, "loss": 0.9774, "step": 190 }, { "epoch": 0.1526854858855858, "grad_norm": 1.68457840558557, "learning_rate": 1e-05, "loss": 0.9971, "step": 191 }, { "epoch": 0.15348488633524857, "grad_norm": 1.9974046657795066, "learning_rate": 1e-05, "loss": 1.0525, "step": 192 }, { "epoch": 0.15428428678491132, "grad_norm": 1.8637088407144724, "learning_rate": 1e-05, "loss": 0.9458, "step": 193 }, { "epoch": 0.15508368723457408, "grad_norm": 1.5472617342282928, "learning_rate": 1e-05, "loss": 0.9321, "step": 194 }, { "epoch": 0.1558830876842368, "grad_norm": 2.0278392859284224, "learning_rate": 1e-05, "loss": 0.9376, "step": 195 }, { "epoch": 0.15668248813389957, "grad_norm": 1.8610095483452973, "learning_rate": 1e-05, "loss": 0.9921, "step": 196 }, { "epoch": 0.15748188858356232, "grad_norm": 2.0375178580916016, "learning_rate": 1e-05, "loss": 0.9985, "step": 197 }, { "epoch": 0.15828128903322508, "grad_norm": 1.8219362402276909, "learning_rate": 1e-05, "loss": 0.924, "step": 198 }, { "epoch": 0.15908068948288784, "grad_norm": 1.4629250708658383, "learning_rate": 1e-05, "loss": 1.0201, "step": 199 }, { "epoch": 0.1598800899325506, "grad_norm": 1.5628287370754461, "learning_rate": 1e-05, "loss": 1.0002, "step": 200 }, { "epoch": 0.16067949038221335, "grad_norm": 1.8442311252983388, "learning_rate": 1e-05, "loss": 0.937, "step": 201 }, { "epoch": 0.16147889083187608, "grad_norm": 7.441197607810174, "learning_rate": 1e-05, "loss": 0.8768, "step": 202 }, { "epoch": 0.16227829128153884, "grad_norm": 1.7947899683379576, "learning_rate": 1e-05, "loss": 0.9524, "step": 203 }, { "epoch": 0.1630776917312016, "grad_norm": 1.656507654529954, "learning_rate": 1e-05, "loss": 0.8953, "step": 204 }, { "epoch": 0.16387709218086435, "grad_norm": 1.7462816982128921, "learning_rate": 1e-05, "loss": 0.9435, "step": 205 }, { "epoch": 0.1646764926305271, "grad_norm": 1.7013940298273953, "learning_rate": 1e-05, "loss": 0.9124, "step": 206 }, { "epoch": 0.16547589308018987, "grad_norm": 1.6379746843984113, "learning_rate": 1e-05, "loss": 0.9508, "step": 207 }, { "epoch": 0.1662752935298526, "grad_norm": 1.9314822402660798, "learning_rate": 1e-05, "loss": 1.0272, "step": 208 }, { "epoch": 0.16707469397951535, "grad_norm": 1.9961308842740637, "learning_rate": 1e-05, "loss": 0.9841, "step": 209 }, { "epoch": 0.1678740944291781, "grad_norm": 2.0382234178726537, "learning_rate": 1e-05, "loss": 0.9785, "step": 210 }, { "epoch": 0.16867349487884087, "grad_norm": 1.6901064034464468, "learning_rate": 1e-05, "loss": 0.9127, "step": 211 }, { "epoch": 0.16947289532850363, "grad_norm": 1.7273747898471865, "learning_rate": 1e-05, "loss": 0.9583, "step": 212 }, { "epoch": 0.17027229577816638, "grad_norm": 1.7457470216603739, "learning_rate": 1e-05, "loss": 0.9799, "step": 213 }, { "epoch": 0.17107169622782914, "grad_norm": 1.7313522722535573, "learning_rate": 1e-05, "loss": 0.9489, "step": 214 }, { "epoch": 0.17187109667749187, "grad_norm": 1.7762615948567715, "learning_rate": 1e-05, "loss": 0.9328, "step": 215 }, { "epoch": 0.17267049712715463, "grad_norm": 1.6331422537410691, "learning_rate": 1e-05, "loss": 0.9446, "step": 216 }, { "epoch": 0.17346989757681738, "grad_norm": 1.6778510604121997, "learning_rate": 1e-05, "loss": 0.9547, "step": 217 }, { "epoch": 0.17426929802648014, "grad_norm": 1.9041470899144908, "learning_rate": 1e-05, "loss": 0.9014, "step": 218 }, { "epoch": 0.1750686984761429, "grad_norm": 1.8662662755793453, "learning_rate": 1e-05, "loss": 0.9709, "step": 219 }, { "epoch": 0.17586809892580565, "grad_norm": 1.7045357754568997, "learning_rate": 1e-05, "loss": 0.9433, "step": 220 }, { "epoch": 0.1766674993754684, "grad_norm": 1.74409106945116, "learning_rate": 1e-05, "loss": 0.9153, "step": 221 }, { "epoch": 0.17746689982513114, "grad_norm": 1.8132234884702887, "learning_rate": 1e-05, "loss": 0.8909, "step": 222 }, { "epoch": 0.1782663002747939, "grad_norm": 1.6971296927642, "learning_rate": 1e-05, "loss": 0.9622, "step": 223 }, { "epoch": 0.17906570072445666, "grad_norm": 1.781912471031092, "learning_rate": 1e-05, "loss": 0.954, "step": 224 }, { "epoch": 0.1798651011741194, "grad_norm": 1.6629867774088771, "learning_rate": 1e-05, "loss": 0.96, "step": 225 }, { "epoch": 0.18066450162378217, "grad_norm": 2.0699033115205614, "learning_rate": 1e-05, "loss": 0.9284, "step": 226 }, { "epoch": 0.18146390207344493, "grad_norm": 1.7235146329911442, "learning_rate": 1e-05, "loss": 0.9456, "step": 227 }, { "epoch": 0.18226330252310766, "grad_norm": 1.7961113577108625, "learning_rate": 1e-05, "loss": 0.9454, "step": 228 }, { "epoch": 0.1830627029727704, "grad_norm": 1.6808904917909453, "learning_rate": 1e-05, "loss": 0.9524, "step": 229 }, { "epoch": 0.18386210342243317, "grad_norm": 1.5865303307652885, "learning_rate": 1e-05, "loss": 0.9863, "step": 230 }, { "epoch": 0.18466150387209593, "grad_norm": 1.6521878212504149, "learning_rate": 1e-05, "loss": 0.946, "step": 231 }, { "epoch": 0.18546090432175869, "grad_norm": 1.5619375597824243, "learning_rate": 1e-05, "loss": 1.0141, "step": 232 }, { "epoch": 0.18626030477142144, "grad_norm": 1.9668596679027701, "learning_rate": 1e-05, "loss": 0.9783, "step": 233 }, { "epoch": 0.1870597052210842, "grad_norm": 1.7004515677555856, "learning_rate": 1e-05, "loss": 0.939, "step": 234 }, { "epoch": 0.18785910567074693, "grad_norm": 1.8505586367786393, "learning_rate": 1e-05, "loss": 1.0186, "step": 235 }, { "epoch": 0.18865850612040969, "grad_norm": 1.8794093279833084, "learning_rate": 1e-05, "loss": 0.9748, "step": 236 }, { "epoch": 0.18945790657007244, "grad_norm": 1.970577363084186, "learning_rate": 1e-05, "loss": 0.9734, "step": 237 }, { "epoch": 0.1902573070197352, "grad_norm": 1.9827162568725265, "learning_rate": 1e-05, "loss": 0.9526, "step": 238 }, { "epoch": 0.19105670746939796, "grad_norm": 1.6777105787009272, "learning_rate": 1e-05, "loss": 1.0038, "step": 239 }, { "epoch": 0.19185610791906071, "grad_norm": 1.8547665670552458, "learning_rate": 1e-05, "loss": 0.9425, "step": 240 }, { "epoch": 0.19265550836872344, "grad_norm": 1.5739853104069792, "learning_rate": 1e-05, "loss": 0.9898, "step": 241 }, { "epoch": 0.1934549088183862, "grad_norm": 1.7991544252885405, "learning_rate": 1e-05, "loss": 0.9068, "step": 242 }, { "epoch": 0.19425430926804896, "grad_norm": 1.7278046505750493, "learning_rate": 1e-05, "loss": 0.9961, "step": 243 }, { "epoch": 0.19505370971771172, "grad_norm": 1.6738018924260079, "learning_rate": 1e-05, "loss": 0.9269, "step": 244 }, { "epoch": 0.19585311016737447, "grad_norm": 1.704113739011135, "learning_rate": 1e-05, "loss": 0.9384, "step": 245 }, { "epoch": 0.19665251061703723, "grad_norm": 1.953642878567139, "learning_rate": 1e-05, "loss": 0.9003, "step": 246 }, { "epoch": 0.1974519110667, "grad_norm": 1.8994714525376621, "learning_rate": 1e-05, "loss": 0.9384, "step": 247 }, { "epoch": 0.19825131151636272, "grad_norm": 1.7335277476681896, "learning_rate": 1e-05, "loss": 0.9164, "step": 248 }, { "epoch": 0.19905071196602547, "grad_norm": 1.8114996960442162, "learning_rate": 1e-05, "loss": 0.909, "step": 249 }, { "epoch": 0.19985011241568823, "grad_norm": 1.8399064962789757, "learning_rate": 1e-05, "loss": 0.9672, "step": 250 }, { "epoch": 0.200649512865351, "grad_norm": 1.8027482426913095, "learning_rate": 1e-05, "loss": 0.9294, "step": 251 }, { "epoch": 0.20144891331501374, "grad_norm": 1.7914653808525045, "learning_rate": 1e-05, "loss": 0.9709, "step": 252 }, { "epoch": 0.2022483137646765, "grad_norm": 1.8562700822437381, "learning_rate": 1e-05, "loss": 0.918, "step": 253 }, { "epoch": 0.20304771421433926, "grad_norm": 1.592298158180451, "learning_rate": 1e-05, "loss": 0.9874, "step": 254 }, { "epoch": 0.203847114664002, "grad_norm": 1.7885472103550304, "learning_rate": 1e-05, "loss": 0.9579, "step": 255 }, { "epoch": 0.20464651511366475, "grad_norm": 1.8835318053165766, "learning_rate": 1e-05, "loss": 0.97, "step": 256 }, { "epoch": 0.2054459155633275, "grad_norm": 2.2973670794805865, "learning_rate": 1e-05, "loss": 1.0196, "step": 257 }, { "epoch": 0.20624531601299026, "grad_norm": 2.059759101560068, "learning_rate": 1e-05, "loss": 0.9051, "step": 258 }, { "epoch": 0.20704471646265302, "grad_norm": 1.6379487643230517, "learning_rate": 1e-05, "loss": 0.9853, "step": 259 }, { "epoch": 0.20784411691231577, "grad_norm": 1.7739932086505867, "learning_rate": 1e-05, "loss": 0.9365, "step": 260 }, { "epoch": 0.2086435173619785, "grad_norm": 1.9378628413327441, "learning_rate": 1e-05, "loss": 0.9248, "step": 261 }, { "epoch": 0.20944291781164126, "grad_norm": 1.8631208677480777, "learning_rate": 1e-05, "loss": 0.9417, "step": 262 }, { "epoch": 0.21024231826130402, "grad_norm": 1.73049947808822, "learning_rate": 1e-05, "loss": 0.9039, "step": 263 }, { "epoch": 0.21104171871096677, "grad_norm": 1.6873959381280914, "learning_rate": 1e-05, "loss": 0.945, "step": 264 }, { "epoch": 0.21184111916062953, "grad_norm": 1.5105067176725349, "learning_rate": 1e-05, "loss": 0.9446, "step": 265 }, { "epoch": 0.2126405196102923, "grad_norm": 1.8337058320691813, "learning_rate": 1e-05, "loss": 0.9582, "step": 266 }, { "epoch": 0.21343992005995505, "grad_norm": 1.644955596385126, "learning_rate": 1e-05, "loss": 0.9055, "step": 267 }, { "epoch": 0.21423932050961778, "grad_norm": 2.0248942495461435, "learning_rate": 1e-05, "loss": 1.0207, "step": 268 }, { "epoch": 0.21503872095928053, "grad_norm": 1.746437687084402, "learning_rate": 1e-05, "loss": 1.0093, "step": 269 }, { "epoch": 0.2158381214089433, "grad_norm": 1.719648906171914, "learning_rate": 1e-05, "loss": 0.9533, "step": 270 }, { "epoch": 0.21663752185860605, "grad_norm": 1.8380592688711606, "learning_rate": 1e-05, "loss": 0.9275, "step": 271 }, { "epoch": 0.2174369223082688, "grad_norm": 1.8205169561312367, "learning_rate": 1e-05, "loss": 0.9745, "step": 272 }, { "epoch": 0.21823632275793156, "grad_norm": 2.326139141853857, "learning_rate": 1e-05, "loss": 0.9953, "step": 273 }, { "epoch": 0.2190357232075943, "grad_norm": 1.6381092977636662, "learning_rate": 1e-05, "loss": 0.9203, "step": 274 }, { "epoch": 0.21983512365725705, "grad_norm": 1.606867524589781, "learning_rate": 1e-05, "loss": 0.9007, "step": 275 }, { "epoch": 0.2206345241069198, "grad_norm": 1.7195338383934604, "learning_rate": 1e-05, "loss": 0.9611, "step": 276 }, { "epoch": 0.22143392455658256, "grad_norm": 1.3840546682546424, "learning_rate": 1e-05, "loss": 0.9614, "step": 277 }, { "epoch": 0.22223332500624532, "grad_norm": 1.6306949714534276, "learning_rate": 1e-05, "loss": 0.9271, "step": 278 }, { "epoch": 0.22303272545590808, "grad_norm": 1.5110189180438256, "learning_rate": 1e-05, "loss": 0.9528, "step": 279 }, { "epoch": 0.22383212590557083, "grad_norm": 1.8612974867734187, "learning_rate": 1e-05, "loss": 0.9587, "step": 280 }, { "epoch": 0.22463152635523356, "grad_norm": 1.664680974165204, "learning_rate": 1e-05, "loss": 0.9129, "step": 281 }, { "epoch": 0.22543092680489632, "grad_norm": 1.7746255109018692, "learning_rate": 1e-05, "loss": 0.939, "step": 282 }, { "epoch": 0.22623032725455908, "grad_norm": 1.575200440251585, "learning_rate": 1e-05, "loss": 0.9204, "step": 283 }, { "epoch": 0.22702972770422183, "grad_norm": 1.7516406660858301, "learning_rate": 1e-05, "loss": 0.9537, "step": 284 }, { "epoch": 0.2278291281538846, "grad_norm": 1.91803098110819, "learning_rate": 1e-05, "loss": 0.9363, "step": 285 }, { "epoch": 0.22862852860354735, "grad_norm": 1.6613035583173086, "learning_rate": 1e-05, "loss": 0.9634, "step": 286 }, { "epoch": 0.2294279290532101, "grad_norm": 1.5842290188976889, "learning_rate": 1e-05, "loss": 0.9551, "step": 287 }, { "epoch": 0.23022732950287284, "grad_norm": 1.9140569815192874, "learning_rate": 1e-05, "loss": 0.9512, "step": 288 }, { "epoch": 0.2310267299525356, "grad_norm": 1.5261307902201178, "learning_rate": 1e-05, "loss": 0.96, "step": 289 }, { "epoch": 0.23182613040219835, "grad_norm": 1.682573363812062, "learning_rate": 1e-05, "loss": 0.8925, "step": 290 }, { "epoch": 0.2326255308518611, "grad_norm": 1.6358092225364382, "learning_rate": 1e-05, "loss": 0.8815, "step": 291 }, { "epoch": 0.23342493130152386, "grad_norm": 1.5670506043722536, "learning_rate": 1e-05, "loss": 0.9876, "step": 292 }, { "epoch": 0.23422433175118662, "grad_norm": 1.6299839564753011, "learning_rate": 1e-05, "loss": 0.8892, "step": 293 }, { "epoch": 0.23502373220084935, "grad_norm": 1.6554910310702649, "learning_rate": 1e-05, "loss": 0.9216, "step": 294 }, { "epoch": 0.2358231326505121, "grad_norm": 1.8037159660461701, "learning_rate": 1e-05, "loss": 0.9575, "step": 295 }, { "epoch": 0.23662253310017486, "grad_norm": 1.629165333497563, "learning_rate": 1e-05, "loss": 0.947, "step": 296 }, { "epoch": 0.23742193354983762, "grad_norm": 1.8459614666127684, "learning_rate": 1e-05, "loss": 0.9263, "step": 297 }, { "epoch": 0.23822133399950038, "grad_norm": 1.5508274722576894, "learning_rate": 1e-05, "loss": 0.9002, "step": 298 }, { "epoch": 0.23902073444916314, "grad_norm": 1.6777079971899138, "learning_rate": 1e-05, "loss": 0.9508, "step": 299 }, { "epoch": 0.2398201348988259, "grad_norm": 1.7100079727592197, "learning_rate": 1e-05, "loss": 0.935, "step": 300 }, { "epoch": 0.24061953534848862, "grad_norm": 2.1307932039198425, "learning_rate": 1e-05, "loss": 0.9233, "step": 301 }, { "epoch": 0.24141893579815138, "grad_norm": 1.883290916019245, "learning_rate": 1e-05, "loss": 0.943, "step": 302 }, { "epoch": 0.24221833624781414, "grad_norm": 1.5909650854809918, "learning_rate": 1e-05, "loss": 0.9467, "step": 303 }, { "epoch": 0.2430177366974769, "grad_norm": 1.7792900727864842, "learning_rate": 1e-05, "loss": 0.9342, "step": 304 }, { "epoch": 0.24381713714713965, "grad_norm": 1.7111474699259361, "learning_rate": 1e-05, "loss": 0.9345, "step": 305 }, { "epoch": 0.2446165375968024, "grad_norm": 1.7771845797925385, "learning_rate": 1e-05, "loss": 0.9341, "step": 306 }, { "epoch": 0.24541593804646514, "grad_norm": 1.6148130323193988, "learning_rate": 1e-05, "loss": 0.8944, "step": 307 }, { "epoch": 0.2462153384961279, "grad_norm": 1.9162065213210437, "learning_rate": 1e-05, "loss": 0.9519, "step": 308 }, { "epoch": 0.24701473894579065, "grad_norm": 1.6110529009706316, "learning_rate": 1e-05, "loss": 0.8987, "step": 309 }, { "epoch": 0.2478141393954534, "grad_norm": 1.7475182646170053, "learning_rate": 1e-05, "loss": 0.885, "step": 310 }, { "epoch": 0.24861353984511617, "grad_norm": 1.8647125722982512, "learning_rate": 1e-05, "loss": 0.9214, "step": 311 }, { "epoch": 0.24941294029477892, "grad_norm": 1.6670715424606828, "learning_rate": 1e-05, "loss": 0.9462, "step": 312 }, { "epoch": 0.25021234074444165, "grad_norm": 1.5198974766775857, "learning_rate": 1e-05, "loss": 0.9632, "step": 313 }, { "epoch": 0.2510117411941044, "grad_norm": 1.5581495649662924, "learning_rate": 1e-05, "loss": 0.9602, "step": 314 }, { "epoch": 0.25181114164376717, "grad_norm": 1.5776975494668843, "learning_rate": 1e-05, "loss": 0.9794, "step": 315 }, { "epoch": 0.2526105420934299, "grad_norm": 1.6005787401081062, "learning_rate": 1e-05, "loss": 0.8655, "step": 316 }, { "epoch": 0.2534099425430927, "grad_norm": 1.7530297645251576, "learning_rate": 1e-05, "loss": 0.915, "step": 317 }, { "epoch": 0.25420934299275544, "grad_norm": 1.8516146569735892, "learning_rate": 1e-05, "loss": 0.8734, "step": 318 }, { "epoch": 0.2550087434424182, "grad_norm": 1.5925556861862051, "learning_rate": 1e-05, "loss": 0.9356, "step": 319 }, { "epoch": 0.25580814389208095, "grad_norm": 1.7942857409055468, "learning_rate": 1e-05, "loss": 0.925, "step": 320 }, { "epoch": 0.2566075443417437, "grad_norm": 1.7301914879145586, "learning_rate": 1e-05, "loss": 0.896, "step": 321 }, { "epoch": 0.25740694479140647, "grad_norm": 1.5868880054016326, "learning_rate": 1e-05, "loss": 0.9021, "step": 322 }, { "epoch": 0.2582063452410692, "grad_norm": 1.7680256022363232, "learning_rate": 1e-05, "loss": 0.9309, "step": 323 }, { "epoch": 0.2590057456907319, "grad_norm": 1.586312615898128, "learning_rate": 1e-05, "loss": 1.0129, "step": 324 }, { "epoch": 0.2598051461403947, "grad_norm": 1.8702172203637788, "learning_rate": 1e-05, "loss": 0.9423, "step": 325 }, { "epoch": 0.26060454659005744, "grad_norm": 1.6231753647103917, "learning_rate": 1e-05, "loss": 0.9192, "step": 326 }, { "epoch": 0.2614039470397202, "grad_norm": 1.6717011992423259, "learning_rate": 1e-05, "loss": 0.9214, "step": 327 }, { "epoch": 0.26220334748938295, "grad_norm": 1.6440233759725276, "learning_rate": 1e-05, "loss": 0.9525, "step": 328 }, { "epoch": 0.2630027479390457, "grad_norm": 1.6336229619568068, "learning_rate": 1e-05, "loss": 0.9072, "step": 329 }, { "epoch": 0.26380214838870847, "grad_norm": 1.794138937818925, "learning_rate": 1e-05, "loss": 0.9081, "step": 330 }, { "epoch": 0.2646015488383712, "grad_norm": 1.7000293714077805, "learning_rate": 1e-05, "loss": 0.9311, "step": 331 }, { "epoch": 0.265400949288034, "grad_norm": 1.7629207816569556, "learning_rate": 1e-05, "loss": 0.8942, "step": 332 }, { "epoch": 0.26620034973769674, "grad_norm": 1.7243708406916276, "learning_rate": 1e-05, "loss": 0.9009, "step": 333 }, { "epoch": 0.2669997501873595, "grad_norm": 1.5153725886830214, "learning_rate": 1e-05, "loss": 0.946, "step": 334 }, { "epoch": 0.26779915063702225, "grad_norm": 1.5897189873039888, "learning_rate": 1e-05, "loss": 0.8988, "step": 335 }, { "epoch": 0.268598551086685, "grad_norm": 1.7792011474569303, "learning_rate": 1e-05, "loss": 0.9075, "step": 336 }, { "epoch": 0.2693979515363477, "grad_norm": 1.715871716234354, "learning_rate": 1e-05, "loss": 0.9488, "step": 337 }, { "epoch": 0.27019735198601047, "grad_norm": 1.7421673985618036, "learning_rate": 1e-05, "loss": 0.9265, "step": 338 }, { "epoch": 0.2709967524356732, "grad_norm": 1.701591645181251, "learning_rate": 1e-05, "loss": 0.9134, "step": 339 }, { "epoch": 0.271796152885336, "grad_norm": 1.5763851776425317, "learning_rate": 1e-05, "loss": 0.9059, "step": 340 }, { "epoch": 0.27259555333499874, "grad_norm": 1.8860488547053122, "learning_rate": 1e-05, "loss": 0.9379, "step": 341 }, { "epoch": 0.2733949537846615, "grad_norm": 1.6278214908005035, "learning_rate": 1e-05, "loss": 0.9041, "step": 342 }, { "epoch": 0.27419435423432426, "grad_norm": 1.8591339922582193, "learning_rate": 1e-05, "loss": 0.9159, "step": 343 }, { "epoch": 0.274993754683987, "grad_norm": 1.6416932855404107, "learning_rate": 1e-05, "loss": 0.9334, "step": 344 }, { "epoch": 0.27579315513364977, "grad_norm": 1.5841499089670428, "learning_rate": 1e-05, "loss": 0.8758, "step": 345 }, { "epoch": 0.2765925555833125, "grad_norm": 1.4885385714768005, "learning_rate": 1e-05, "loss": 0.9482, "step": 346 }, { "epoch": 0.2773919560329753, "grad_norm": 1.652595269550327, "learning_rate": 1e-05, "loss": 0.9341, "step": 347 }, { "epoch": 0.27819135648263804, "grad_norm": 1.569292511449757, "learning_rate": 1e-05, "loss": 0.9395, "step": 348 }, { "epoch": 0.2789907569323008, "grad_norm": 1.8816669651120839, "learning_rate": 1e-05, "loss": 0.879, "step": 349 }, { "epoch": 0.2797901573819635, "grad_norm": 1.8044366358437511, "learning_rate": 1e-05, "loss": 0.9476, "step": 350 }, { "epoch": 0.28058955783162626, "grad_norm": 1.581864578938978, "learning_rate": 1e-05, "loss": 0.9443, "step": 351 }, { "epoch": 0.281388958281289, "grad_norm": 1.719778574188113, "learning_rate": 1e-05, "loss": 0.9682, "step": 352 }, { "epoch": 0.28218835873095177, "grad_norm": 1.7544745777196906, "learning_rate": 1e-05, "loss": 0.935, "step": 353 }, { "epoch": 0.28298775918061453, "grad_norm": 1.529692690903228, "learning_rate": 1e-05, "loss": 0.899, "step": 354 }, { "epoch": 0.2837871596302773, "grad_norm": 1.7002824332518707, "learning_rate": 1e-05, "loss": 0.9089, "step": 355 }, { "epoch": 0.28458656007994004, "grad_norm": 1.6960676218935922, "learning_rate": 1e-05, "loss": 0.9131, "step": 356 }, { "epoch": 0.2853859605296028, "grad_norm": 1.5467919520374653, "learning_rate": 1e-05, "loss": 0.9234, "step": 357 }, { "epoch": 0.28618536097926556, "grad_norm": 1.5401712398267708, "learning_rate": 1e-05, "loss": 0.8821, "step": 358 }, { "epoch": 0.2869847614289283, "grad_norm": 1.713197431966504, "learning_rate": 1e-05, "loss": 0.9755, "step": 359 }, { "epoch": 0.28778416187859107, "grad_norm": 1.5846038726149987, "learning_rate": 1e-05, "loss": 0.9637, "step": 360 }, { "epoch": 0.28858356232825383, "grad_norm": 1.9337936027301381, "learning_rate": 1e-05, "loss": 0.9208, "step": 361 }, { "epoch": 0.2893829627779166, "grad_norm": 1.6240977396645668, "learning_rate": 1e-05, "loss": 0.9235, "step": 362 }, { "epoch": 0.29018236322757934, "grad_norm": 1.7452206300395003, "learning_rate": 1e-05, "loss": 0.9553, "step": 363 }, { "epoch": 0.29098176367724204, "grad_norm": 1.7207282088148232, "learning_rate": 1e-05, "loss": 0.9996, "step": 364 }, { "epoch": 0.2917811641269048, "grad_norm": 1.6238156006165856, "learning_rate": 1e-05, "loss": 0.9492, "step": 365 }, { "epoch": 0.29258056457656756, "grad_norm": 1.7217996073600954, "learning_rate": 1e-05, "loss": 0.9659, "step": 366 }, { "epoch": 0.2933799650262303, "grad_norm": 1.7599545299893906, "learning_rate": 1e-05, "loss": 0.8954, "step": 367 }, { "epoch": 0.2941793654758931, "grad_norm": 1.8392526222961474, "learning_rate": 1e-05, "loss": 0.9028, "step": 368 }, { "epoch": 0.29497876592555583, "grad_norm": 1.4791987859922466, "learning_rate": 1e-05, "loss": 0.9207, "step": 369 }, { "epoch": 0.2957781663752186, "grad_norm": 1.4806074723615978, "learning_rate": 1e-05, "loss": 0.9419, "step": 370 }, { "epoch": 0.29657756682488134, "grad_norm": 1.7004917267851303, "learning_rate": 1e-05, "loss": 0.9354, "step": 371 }, { "epoch": 0.2973769672745441, "grad_norm": 1.6234361909723023, "learning_rate": 1e-05, "loss": 0.8969, "step": 372 }, { "epoch": 0.29817636772420686, "grad_norm": 1.5271331279708817, "learning_rate": 1e-05, "loss": 0.9455, "step": 373 }, { "epoch": 0.2989757681738696, "grad_norm": 1.622230251696962, "learning_rate": 1e-05, "loss": 0.9504, "step": 374 }, { "epoch": 0.2997751686235324, "grad_norm": 1.807073970989606, "learning_rate": 1e-05, "loss": 0.9, "step": 375 }, { "epoch": 0.30057456907319513, "grad_norm": 1.4951410146162138, "learning_rate": 1e-05, "loss": 0.9664, "step": 376 }, { "epoch": 0.30137396952285783, "grad_norm": 1.813020482613949, "learning_rate": 1e-05, "loss": 0.9441, "step": 377 }, { "epoch": 0.3021733699725206, "grad_norm": 2.0285660578298046, "learning_rate": 1e-05, "loss": 0.8861, "step": 378 }, { "epoch": 0.30297277042218335, "grad_norm": 1.6967916115297645, "learning_rate": 1e-05, "loss": 0.9321, "step": 379 }, { "epoch": 0.3037721708718461, "grad_norm": 1.7022095887528572, "learning_rate": 1e-05, "loss": 0.9613, "step": 380 }, { "epoch": 0.30457157132150886, "grad_norm": 1.719645739549248, "learning_rate": 1e-05, "loss": 0.9219, "step": 381 }, { "epoch": 0.3053709717711716, "grad_norm": 1.6526243786903378, "learning_rate": 1e-05, "loss": 0.9069, "step": 382 }, { "epoch": 0.3061703722208344, "grad_norm": 1.554593608182918, "learning_rate": 1e-05, "loss": 0.8863, "step": 383 }, { "epoch": 0.30696977267049713, "grad_norm": 1.7296741561953324, "learning_rate": 1e-05, "loss": 0.9965, "step": 384 }, { "epoch": 0.3077691731201599, "grad_norm": 1.7765959484743603, "learning_rate": 1e-05, "loss": 0.9024, "step": 385 }, { "epoch": 0.30856857356982265, "grad_norm": 1.7444591927862072, "learning_rate": 1e-05, "loss": 0.9491, "step": 386 }, { "epoch": 0.3093679740194854, "grad_norm": 1.6979650733135505, "learning_rate": 1e-05, "loss": 0.9633, "step": 387 }, { "epoch": 0.31016737446914816, "grad_norm": 1.6518215838203623, "learning_rate": 1e-05, "loss": 0.8594, "step": 388 }, { "epoch": 0.3109667749188109, "grad_norm": 1.597669753265097, "learning_rate": 1e-05, "loss": 0.8807, "step": 389 }, { "epoch": 0.3117661753684736, "grad_norm": 1.7005856529533696, "learning_rate": 1e-05, "loss": 0.9136, "step": 390 }, { "epoch": 0.3125655758181364, "grad_norm": 1.714793495031338, "learning_rate": 1e-05, "loss": 0.8969, "step": 391 }, { "epoch": 0.31336497626779913, "grad_norm": 1.5558141368768388, "learning_rate": 1e-05, "loss": 0.9257, "step": 392 }, { "epoch": 0.3141643767174619, "grad_norm": 1.5404155153049455, "learning_rate": 1e-05, "loss": 0.8779, "step": 393 }, { "epoch": 0.31496377716712465, "grad_norm": 1.5383972642859716, "learning_rate": 1e-05, "loss": 0.9707, "step": 394 }, { "epoch": 0.3157631776167874, "grad_norm": 1.7191998432330473, "learning_rate": 1e-05, "loss": 0.9126, "step": 395 }, { "epoch": 0.31656257806645016, "grad_norm": 1.6051194326495044, "learning_rate": 1e-05, "loss": 0.8822, "step": 396 }, { "epoch": 0.3173619785161129, "grad_norm": 1.6869656351879205, "learning_rate": 1e-05, "loss": 0.9343, "step": 397 }, { "epoch": 0.3181613789657757, "grad_norm": 1.6256734963382786, "learning_rate": 1e-05, "loss": 0.9156, "step": 398 }, { "epoch": 0.31896077941543843, "grad_norm": 1.5756449476038674, "learning_rate": 1e-05, "loss": 0.9807, "step": 399 }, { "epoch": 0.3197601798651012, "grad_norm": 1.6188490159724278, "learning_rate": 1e-05, "loss": 0.9644, "step": 400 }, { "epoch": 0.32055958031476395, "grad_norm": 1.94007311994945, "learning_rate": 1e-05, "loss": 0.9614, "step": 401 }, { "epoch": 0.3213589807644267, "grad_norm": 1.659086295612128, "learning_rate": 1e-05, "loss": 0.944, "step": 402 }, { "epoch": 0.3221583812140894, "grad_norm": 1.9235409755089947, "learning_rate": 1e-05, "loss": 0.9259, "step": 403 }, { "epoch": 0.32295778166375216, "grad_norm": 1.5880918105995026, "learning_rate": 1e-05, "loss": 0.9255, "step": 404 }, { "epoch": 0.3237571821134149, "grad_norm": 1.4948152435643522, "learning_rate": 1e-05, "loss": 0.926, "step": 405 }, { "epoch": 0.3245565825630777, "grad_norm": 1.5350941186461544, "learning_rate": 1e-05, "loss": 0.898, "step": 406 }, { "epoch": 0.32535598301274044, "grad_norm": 1.3466986686471294, "learning_rate": 1e-05, "loss": 0.8904, "step": 407 }, { "epoch": 0.3261553834624032, "grad_norm": 1.459891336046445, "learning_rate": 1e-05, "loss": 0.8817, "step": 408 }, { "epoch": 0.32695478391206595, "grad_norm": 1.4836727854431802, "learning_rate": 1e-05, "loss": 0.9216, "step": 409 }, { "epoch": 0.3277541843617287, "grad_norm": 1.6446226209440065, "learning_rate": 1e-05, "loss": 0.9249, "step": 410 }, { "epoch": 0.32855358481139146, "grad_norm": 1.5635297277867413, "learning_rate": 1e-05, "loss": 0.9122, "step": 411 }, { "epoch": 0.3293529852610542, "grad_norm": 1.6358281167528332, "learning_rate": 1e-05, "loss": 0.8843, "step": 412 }, { "epoch": 0.330152385710717, "grad_norm": 1.5404191221381782, "learning_rate": 1e-05, "loss": 0.9106, "step": 413 }, { "epoch": 0.33095178616037974, "grad_norm": 1.5879004668639547, "learning_rate": 1e-05, "loss": 0.9211, "step": 414 }, { "epoch": 0.3317511866100425, "grad_norm": 1.790797443056402, "learning_rate": 1e-05, "loss": 0.9211, "step": 415 }, { "epoch": 0.3325505870597052, "grad_norm": 1.8179861905661685, "learning_rate": 1e-05, "loss": 0.9391, "step": 416 }, { "epoch": 0.33334998750936795, "grad_norm": 1.4379165089707215, "learning_rate": 1e-05, "loss": 0.9021, "step": 417 }, { "epoch": 0.3341493879590307, "grad_norm": 1.7134688617321956, "learning_rate": 1e-05, "loss": 0.9833, "step": 418 }, { "epoch": 0.33494878840869347, "grad_norm": 1.4039431214440103, "learning_rate": 1e-05, "loss": 0.888, "step": 419 }, { "epoch": 0.3357481888583562, "grad_norm": 1.6586329038004721, "learning_rate": 1e-05, "loss": 0.9088, "step": 420 }, { "epoch": 0.336547589308019, "grad_norm": 1.900128933012227, "learning_rate": 1e-05, "loss": 0.9, "step": 421 }, { "epoch": 0.33734698975768174, "grad_norm": 1.5890662573554606, "learning_rate": 1e-05, "loss": 0.9029, "step": 422 }, { "epoch": 0.3381463902073445, "grad_norm": 1.495628306935103, "learning_rate": 1e-05, "loss": 0.919, "step": 423 }, { "epoch": 0.33894579065700725, "grad_norm": 1.4495521814015604, "learning_rate": 1e-05, "loss": 0.9967, "step": 424 }, { "epoch": 0.33974519110667, "grad_norm": 1.7055256640065686, "learning_rate": 1e-05, "loss": 0.9769, "step": 425 }, { "epoch": 0.34054459155633277, "grad_norm": 1.4909741619159311, "learning_rate": 1e-05, "loss": 0.9152, "step": 426 }, { "epoch": 0.3413439920059955, "grad_norm": 1.628227110908977, "learning_rate": 1e-05, "loss": 0.955, "step": 427 }, { "epoch": 0.3421433924556583, "grad_norm": 1.8220036868892047, "learning_rate": 1e-05, "loss": 0.8972, "step": 428 }, { "epoch": 0.34294279290532104, "grad_norm": 1.693415237669836, "learning_rate": 1e-05, "loss": 0.9063, "step": 429 }, { "epoch": 0.34374219335498374, "grad_norm": 1.5346322329118909, "learning_rate": 1e-05, "loss": 0.8737, "step": 430 }, { "epoch": 0.3445415938046465, "grad_norm": 1.523134303904886, "learning_rate": 1e-05, "loss": 0.9522, "step": 431 }, { "epoch": 0.34534099425430925, "grad_norm": 1.8163891768400675, "learning_rate": 1e-05, "loss": 0.9254, "step": 432 }, { "epoch": 0.346140394703972, "grad_norm": 1.6001042968512986, "learning_rate": 1e-05, "loss": 0.9393, "step": 433 }, { "epoch": 0.34693979515363477, "grad_norm": 1.4962110538157338, "learning_rate": 1e-05, "loss": 0.9016, "step": 434 }, { "epoch": 0.3477391956032975, "grad_norm": 1.7041821659704226, "learning_rate": 1e-05, "loss": 0.8586, "step": 435 }, { "epoch": 0.3485385960529603, "grad_norm": 1.6883017856053422, "learning_rate": 1e-05, "loss": 0.9729, "step": 436 }, { "epoch": 0.34933799650262304, "grad_norm": 1.6846925338485461, "learning_rate": 1e-05, "loss": 0.9379, "step": 437 }, { "epoch": 0.3501373969522858, "grad_norm": 1.8235246867955863, "learning_rate": 1e-05, "loss": 0.9248, "step": 438 }, { "epoch": 0.35093679740194855, "grad_norm": 1.935505500625835, "learning_rate": 1e-05, "loss": 0.9371, "step": 439 }, { "epoch": 0.3517361978516113, "grad_norm": 1.67613124761384, "learning_rate": 1e-05, "loss": 0.979, "step": 440 }, { "epoch": 0.35253559830127407, "grad_norm": 1.4449954490901646, "learning_rate": 1e-05, "loss": 0.968, "step": 441 }, { "epoch": 0.3533349987509368, "grad_norm": 1.5913830352404914, "learning_rate": 1e-05, "loss": 0.964, "step": 442 }, { "epoch": 0.3541343992005995, "grad_norm": 1.7168730495466147, "learning_rate": 1e-05, "loss": 0.9138, "step": 443 }, { "epoch": 0.3549337996502623, "grad_norm": 1.6307072180820321, "learning_rate": 1e-05, "loss": 0.9472, "step": 444 }, { "epoch": 0.35573320009992504, "grad_norm": 1.6118353409303823, "learning_rate": 1e-05, "loss": 0.9645, "step": 445 }, { "epoch": 0.3565326005495878, "grad_norm": 1.6940859087140694, "learning_rate": 1e-05, "loss": 0.9815, "step": 446 }, { "epoch": 0.35733200099925055, "grad_norm": 1.8606216696352482, "learning_rate": 1e-05, "loss": 0.9805, "step": 447 }, { "epoch": 0.3581314014489133, "grad_norm": 1.532089096889218, "learning_rate": 1e-05, "loss": 0.9393, "step": 448 }, { "epoch": 0.35893080189857607, "grad_norm": 1.6384290071957173, "learning_rate": 1e-05, "loss": 0.9459, "step": 449 }, { "epoch": 0.3597302023482388, "grad_norm": 1.5244481340256106, "learning_rate": 1e-05, "loss": 0.977, "step": 450 }, { "epoch": 0.3605296027979016, "grad_norm": 1.6337567843902518, "learning_rate": 1e-05, "loss": 0.8991, "step": 451 }, { "epoch": 0.36132900324756434, "grad_norm": 1.7963360988533934, "learning_rate": 1e-05, "loss": 0.9087, "step": 452 }, { "epoch": 0.3621284036972271, "grad_norm": 1.696315268595366, "learning_rate": 1e-05, "loss": 0.9242, "step": 453 }, { "epoch": 0.36292780414688985, "grad_norm": 1.710036005807286, "learning_rate": 1e-05, "loss": 0.8429, "step": 454 }, { "epoch": 0.3637272045965526, "grad_norm": 1.749452843653296, "learning_rate": 1e-05, "loss": 0.9133, "step": 455 }, { "epoch": 0.3645266050462153, "grad_norm": 1.4397928987828232, "learning_rate": 1e-05, "loss": 0.8427, "step": 456 }, { "epoch": 0.36532600549587807, "grad_norm": 1.6825466790780408, "learning_rate": 1e-05, "loss": 0.89, "step": 457 }, { "epoch": 0.3661254059455408, "grad_norm": 1.6056927709310882, "learning_rate": 1e-05, "loss": 0.9421, "step": 458 }, { "epoch": 0.3669248063952036, "grad_norm": 1.5861038676425987, "learning_rate": 1e-05, "loss": 0.9496, "step": 459 }, { "epoch": 0.36772420684486634, "grad_norm": 1.6684621776248278, "learning_rate": 1e-05, "loss": 0.884, "step": 460 }, { "epoch": 0.3685236072945291, "grad_norm": 1.758026110496432, "learning_rate": 1e-05, "loss": 0.9441, "step": 461 }, { "epoch": 0.36932300774419186, "grad_norm": 1.4763506224586516, "learning_rate": 1e-05, "loss": 0.9393, "step": 462 }, { "epoch": 0.3701224081938546, "grad_norm": 1.5509318071640712, "learning_rate": 1e-05, "loss": 0.8643, "step": 463 }, { "epoch": 0.37092180864351737, "grad_norm": 1.5607192206519345, "learning_rate": 1e-05, "loss": 0.9165, "step": 464 }, { "epoch": 0.3717212090931801, "grad_norm": 1.6511236719507991, "learning_rate": 1e-05, "loss": 0.9421, "step": 465 }, { "epoch": 0.3725206095428429, "grad_norm": 1.6501362966399429, "learning_rate": 1e-05, "loss": 0.915, "step": 466 }, { "epoch": 0.37332000999250564, "grad_norm": 1.5207720771291409, "learning_rate": 1e-05, "loss": 0.9454, "step": 467 }, { "epoch": 0.3741194104421684, "grad_norm": 1.5392735956515966, "learning_rate": 1e-05, "loss": 0.956, "step": 468 }, { "epoch": 0.3749188108918311, "grad_norm": 1.5940306759004237, "learning_rate": 1e-05, "loss": 0.8719, "step": 469 }, { "epoch": 0.37571821134149386, "grad_norm": 1.6908424326030602, "learning_rate": 1e-05, "loss": 0.9255, "step": 470 }, { "epoch": 0.3765176117911566, "grad_norm": 1.4928846149782238, "learning_rate": 1e-05, "loss": 0.9378, "step": 471 }, { "epoch": 0.37731701224081937, "grad_norm": 1.7041500499453686, "learning_rate": 1e-05, "loss": 0.8734, "step": 472 }, { "epoch": 0.37811641269048213, "grad_norm": 1.619189516937598, "learning_rate": 1e-05, "loss": 0.9063, "step": 473 }, { "epoch": 0.3789158131401449, "grad_norm": 1.6588364324248581, "learning_rate": 1e-05, "loss": 0.8701, "step": 474 }, { "epoch": 0.37971521358980764, "grad_norm": 1.5762727848791807, "learning_rate": 1e-05, "loss": 0.9497, "step": 475 }, { "epoch": 0.3805146140394704, "grad_norm": 1.5363970090025982, "learning_rate": 1e-05, "loss": 0.9918, "step": 476 }, { "epoch": 0.38131401448913316, "grad_norm": 1.6404231232106667, "learning_rate": 1e-05, "loss": 0.9056, "step": 477 }, { "epoch": 0.3821134149387959, "grad_norm": 1.6314596845516385, "learning_rate": 1e-05, "loss": 0.928, "step": 478 }, { "epoch": 0.38291281538845867, "grad_norm": 1.6126677835331522, "learning_rate": 1e-05, "loss": 0.9978, "step": 479 }, { "epoch": 0.38371221583812143, "grad_norm": 1.3173664389567725, "learning_rate": 1e-05, "loss": 0.9158, "step": 480 }, { "epoch": 0.3845116162877842, "grad_norm": 1.496540187325337, "learning_rate": 1e-05, "loss": 0.9378, "step": 481 }, { "epoch": 0.3853110167374469, "grad_norm": 1.5062068173629883, "learning_rate": 1e-05, "loss": 0.9159, "step": 482 }, { "epoch": 0.38611041718710964, "grad_norm": 1.529187603034289, "learning_rate": 1e-05, "loss": 0.951, "step": 483 }, { "epoch": 0.3869098176367724, "grad_norm": 1.5635118437005366, "learning_rate": 1e-05, "loss": 0.9291, "step": 484 }, { "epoch": 0.38770921808643516, "grad_norm": 1.6646247338291131, "learning_rate": 1e-05, "loss": 0.874, "step": 485 }, { "epoch": 0.3885086185360979, "grad_norm": 1.6470189371191908, "learning_rate": 1e-05, "loss": 0.9118, "step": 486 }, { "epoch": 0.3893080189857607, "grad_norm": 1.4041767343860398, "learning_rate": 1e-05, "loss": 0.9193, "step": 487 }, { "epoch": 0.39010741943542343, "grad_norm": 1.637354519439742, "learning_rate": 1e-05, "loss": 0.9622, "step": 488 }, { "epoch": 0.3909068198850862, "grad_norm": 1.6793914337693705, "learning_rate": 1e-05, "loss": 0.8591, "step": 489 }, { "epoch": 0.39170622033474894, "grad_norm": 1.631823843080509, "learning_rate": 1e-05, "loss": 0.9061, "step": 490 }, { "epoch": 0.3925056207844117, "grad_norm": 1.4551068376984746, "learning_rate": 1e-05, "loss": 0.886, "step": 491 }, { "epoch": 0.39330502123407446, "grad_norm": 1.843148583217912, "learning_rate": 1e-05, "loss": 0.8748, "step": 492 }, { "epoch": 0.3941044216837372, "grad_norm": 1.503885142875128, "learning_rate": 1e-05, "loss": 0.9208, "step": 493 }, { "epoch": 0.3949038221334, "grad_norm": 1.7406094685573732, "learning_rate": 1e-05, "loss": 0.8786, "step": 494 }, { "epoch": 0.39570322258306273, "grad_norm": 1.730586930891903, "learning_rate": 1e-05, "loss": 0.9157, "step": 495 }, { "epoch": 0.39650262303272543, "grad_norm": 1.5528810488930866, "learning_rate": 1e-05, "loss": 0.9142, "step": 496 }, { "epoch": 0.3973020234823882, "grad_norm": 1.5307301129466364, "learning_rate": 1e-05, "loss": 0.9415, "step": 497 }, { "epoch": 0.39810142393205095, "grad_norm": 1.5706393811203467, "learning_rate": 1e-05, "loss": 0.8912, "step": 498 }, { "epoch": 0.3989008243817137, "grad_norm": 1.6199448054984131, "learning_rate": 1e-05, "loss": 0.9175, "step": 499 }, { "epoch": 0.39970022483137646, "grad_norm": 1.4945708663613873, "learning_rate": 1e-05, "loss": 0.8961, "step": 500 }, { "epoch": 0.4004996252810392, "grad_norm": 1.5533154327294227, "learning_rate": 1e-05, "loss": 0.9356, "step": 501 }, { "epoch": 0.401299025730702, "grad_norm": 1.5325963522620767, "learning_rate": 1e-05, "loss": 0.965, "step": 502 }, { "epoch": 0.40209842618036473, "grad_norm": 1.671999510186726, "learning_rate": 1e-05, "loss": 0.8271, "step": 503 }, { "epoch": 0.4028978266300275, "grad_norm": 1.5355767548245969, "learning_rate": 1e-05, "loss": 0.9497, "step": 504 }, { "epoch": 0.40369722707969025, "grad_norm": 1.6030539969868434, "learning_rate": 1e-05, "loss": 0.9291, "step": 505 }, { "epoch": 0.404496627529353, "grad_norm": 1.6407538986876247, "learning_rate": 1e-05, "loss": 0.9878, "step": 506 }, { "epoch": 0.40529602797901576, "grad_norm": 1.7688356573735502, "learning_rate": 1e-05, "loss": 0.9665, "step": 507 }, { "epoch": 0.4060954284286785, "grad_norm": 1.5607970481443443, "learning_rate": 1e-05, "loss": 0.9488, "step": 508 }, { "epoch": 0.4068948288783412, "grad_norm": 1.6161754040719796, "learning_rate": 1e-05, "loss": 0.858, "step": 509 }, { "epoch": 0.407694229328004, "grad_norm": 1.5793085315204543, "learning_rate": 1e-05, "loss": 0.8956, "step": 510 }, { "epoch": 0.40849362977766673, "grad_norm": 1.5936599885814402, "learning_rate": 1e-05, "loss": 0.9348, "step": 511 }, { "epoch": 0.4092930302273295, "grad_norm": 1.5658605524297327, "learning_rate": 1e-05, "loss": 0.9389, "step": 512 }, { "epoch": 0.41009243067699225, "grad_norm": 1.5921115812648192, "learning_rate": 1e-05, "loss": 0.8438, "step": 513 }, { "epoch": 0.410891831126655, "grad_norm": 1.8163231036582868, "learning_rate": 1e-05, "loss": 0.9064, "step": 514 }, { "epoch": 0.41169123157631776, "grad_norm": 1.5950813731389535, "learning_rate": 1e-05, "loss": 0.9172, "step": 515 }, { "epoch": 0.4124906320259805, "grad_norm": 1.689588168520015, "learning_rate": 1e-05, "loss": 0.9265, "step": 516 }, { "epoch": 0.4132900324756433, "grad_norm": 1.540041600561803, "learning_rate": 1e-05, "loss": 0.9189, "step": 517 }, { "epoch": 0.41408943292530603, "grad_norm": 1.6662920193878612, "learning_rate": 1e-05, "loss": 0.9155, "step": 518 }, { "epoch": 0.4148888333749688, "grad_norm": 1.6860065883672692, "learning_rate": 1e-05, "loss": 0.9608, "step": 519 }, { "epoch": 0.41568823382463155, "grad_norm": 1.7503429857603447, "learning_rate": 1e-05, "loss": 0.8936, "step": 520 }, { "epoch": 0.4164876342742943, "grad_norm": 1.4349809774745903, "learning_rate": 1e-05, "loss": 0.9076, "step": 521 }, { "epoch": 0.417287034723957, "grad_norm": 1.6525870899508948, "learning_rate": 1e-05, "loss": 0.951, "step": 522 }, { "epoch": 0.41808643517361976, "grad_norm": 1.305941403451334, "learning_rate": 1e-05, "loss": 0.9039, "step": 523 }, { "epoch": 0.4188858356232825, "grad_norm": 1.5535189677415364, "learning_rate": 1e-05, "loss": 0.9101, "step": 524 }, { "epoch": 0.4196852360729453, "grad_norm": 1.390869042188358, "learning_rate": 1e-05, "loss": 0.8821, "step": 525 }, { "epoch": 0.42048463652260804, "grad_norm": 1.6086764868308612, "learning_rate": 1e-05, "loss": 0.9494, "step": 526 }, { "epoch": 0.4212840369722708, "grad_norm": 1.5277453444137763, "learning_rate": 1e-05, "loss": 0.9042, "step": 527 }, { "epoch": 0.42208343742193355, "grad_norm": 1.5037652064794895, "learning_rate": 1e-05, "loss": 0.8973, "step": 528 }, { "epoch": 0.4228828378715963, "grad_norm": 1.630788946234423, "learning_rate": 1e-05, "loss": 0.8868, "step": 529 }, { "epoch": 0.42368223832125906, "grad_norm": 1.476128500837339, "learning_rate": 1e-05, "loss": 0.9264, "step": 530 }, { "epoch": 0.4244816387709218, "grad_norm": 1.4082525457129158, "learning_rate": 1e-05, "loss": 0.9194, "step": 531 }, { "epoch": 0.4252810392205846, "grad_norm": 1.60560804137754, "learning_rate": 1e-05, "loss": 0.8596, "step": 532 }, { "epoch": 0.42608043967024734, "grad_norm": 1.5292853895222724, "learning_rate": 1e-05, "loss": 0.8933, "step": 533 }, { "epoch": 0.4268798401199101, "grad_norm": 1.6276199503503024, "learning_rate": 1e-05, "loss": 0.8905, "step": 534 }, { "epoch": 0.4276792405695728, "grad_norm": 1.6143026040200776, "learning_rate": 1e-05, "loss": 0.9017, "step": 535 }, { "epoch": 0.42847864101923555, "grad_norm": 1.492638575870208, "learning_rate": 1e-05, "loss": 0.8445, "step": 536 }, { "epoch": 0.4292780414688983, "grad_norm": 1.5992856689061312, "learning_rate": 1e-05, "loss": 0.8747, "step": 537 }, { "epoch": 0.43007744191856107, "grad_norm": 1.8376302395541704, "learning_rate": 1e-05, "loss": 0.8744, "step": 538 }, { "epoch": 0.4308768423682238, "grad_norm": 1.5083175238496622, "learning_rate": 1e-05, "loss": 0.8831, "step": 539 }, { "epoch": 0.4316762428178866, "grad_norm": 1.6391308804501599, "learning_rate": 1e-05, "loss": 0.9093, "step": 540 }, { "epoch": 0.43247564326754934, "grad_norm": 1.587896265231209, "learning_rate": 1e-05, "loss": 0.931, "step": 541 }, { "epoch": 0.4332750437172121, "grad_norm": 1.5174662595552115, "learning_rate": 1e-05, "loss": 0.9176, "step": 542 }, { "epoch": 0.43407444416687485, "grad_norm": 1.6000443436491891, "learning_rate": 1e-05, "loss": 0.8983, "step": 543 }, { "epoch": 0.4348738446165376, "grad_norm": 1.6311375389076388, "learning_rate": 1e-05, "loss": 0.9358, "step": 544 }, { "epoch": 0.43567324506620037, "grad_norm": 1.5311673613481407, "learning_rate": 1e-05, "loss": 0.9248, "step": 545 }, { "epoch": 0.4364726455158631, "grad_norm": 1.527296520797819, "learning_rate": 1e-05, "loss": 0.8941, "step": 546 }, { "epoch": 0.4372720459655259, "grad_norm": 1.3849530231908453, "learning_rate": 1e-05, "loss": 0.9206, "step": 547 }, { "epoch": 0.4380714464151886, "grad_norm": 1.6041978636707703, "learning_rate": 1e-05, "loss": 0.8933, "step": 548 }, { "epoch": 0.43887084686485134, "grad_norm": 1.5449273405092985, "learning_rate": 1e-05, "loss": 0.9157, "step": 549 }, { "epoch": 0.4396702473145141, "grad_norm": 1.5864452967308555, "learning_rate": 1e-05, "loss": 0.8443, "step": 550 }, { "epoch": 0.44046964776417685, "grad_norm": 1.4728488192211566, "learning_rate": 1e-05, "loss": 0.9108, "step": 551 }, { "epoch": 0.4412690482138396, "grad_norm": 1.4823924024202317, "learning_rate": 1e-05, "loss": 0.9336, "step": 552 }, { "epoch": 0.44206844866350237, "grad_norm": 1.4382359303688308, "learning_rate": 1e-05, "loss": 0.9271, "step": 553 }, { "epoch": 0.4428678491131651, "grad_norm": 1.5676768234957863, "learning_rate": 1e-05, "loss": 0.9705, "step": 554 }, { "epoch": 0.4436672495628279, "grad_norm": 1.5423184321680976, "learning_rate": 1e-05, "loss": 0.8464, "step": 555 }, { "epoch": 0.44446665001249064, "grad_norm": 1.6045659880625645, "learning_rate": 1e-05, "loss": 0.9303, "step": 556 }, { "epoch": 0.4452660504621534, "grad_norm": 1.9872755202696784, "learning_rate": 1e-05, "loss": 0.8687, "step": 557 }, { "epoch": 0.44606545091181615, "grad_norm": 1.4834070914943105, "learning_rate": 1e-05, "loss": 0.951, "step": 558 }, { "epoch": 0.4468648513614789, "grad_norm": 1.5310211273825027, "learning_rate": 1e-05, "loss": 0.9233, "step": 559 }, { "epoch": 0.44766425181114167, "grad_norm": 1.5815996536549406, "learning_rate": 1e-05, "loss": 0.9767, "step": 560 }, { "epoch": 0.4484636522608044, "grad_norm": 1.7688239075887118, "learning_rate": 1e-05, "loss": 0.8879, "step": 561 }, { "epoch": 0.4492630527104671, "grad_norm": 1.6482560554808632, "learning_rate": 1e-05, "loss": 0.9124, "step": 562 }, { "epoch": 0.4500624531601299, "grad_norm": 1.5404021166963555, "learning_rate": 1e-05, "loss": 0.9027, "step": 563 }, { "epoch": 0.45086185360979264, "grad_norm": 1.5195520813189534, "learning_rate": 1e-05, "loss": 0.9112, "step": 564 }, { "epoch": 0.4516612540594554, "grad_norm": 1.5192783031055126, "learning_rate": 1e-05, "loss": 0.8971, "step": 565 }, { "epoch": 0.45246065450911815, "grad_norm": 1.5618653033074856, "learning_rate": 1e-05, "loss": 0.9054, "step": 566 }, { "epoch": 0.4532600549587809, "grad_norm": 1.6064016663059253, "learning_rate": 1e-05, "loss": 0.9391, "step": 567 }, { "epoch": 0.45405945540844367, "grad_norm": 1.7240615287273162, "learning_rate": 1e-05, "loss": 0.8906, "step": 568 }, { "epoch": 0.4548588558581064, "grad_norm": 1.7149945179624295, "learning_rate": 1e-05, "loss": 0.8621, "step": 569 }, { "epoch": 0.4556582563077692, "grad_norm": 1.4856328376378898, "learning_rate": 1e-05, "loss": 0.8694, "step": 570 }, { "epoch": 0.45645765675743194, "grad_norm": 1.4702642174922036, "learning_rate": 1e-05, "loss": 0.9025, "step": 571 }, { "epoch": 0.4572570572070947, "grad_norm": 1.6088556169851551, "learning_rate": 1e-05, "loss": 0.868, "step": 572 }, { "epoch": 0.45805645765675745, "grad_norm": 1.5509844332733922, "learning_rate": 1e-05, "loss": 0.9513, "step": 573 }, { "epoch": 0.4588558581064202, "grad_norm": 1.5292949122902217, "learning_rate": 1e-05, "loss": 0.8845, "step": 574 }, { "epoch": 0.4596552585560829, "grad_norm": 1.6381076297979584, "learning_rate": 1e-05, "loss": 0.9386, "step": 575 }, { "epoch": 0.46045465900574567, "grad_norm": 1.6267004497668505, "learning_rate": 1e-05, "loss": 0.8987, "step": 576 }, { "epoch": 0.4612540594554084, "grad_norm": 1.5456142322307922, "learning_rate": 1e-05, "loss": 0.9121, "step": 577 }, { "epoch": 0.4620534599050712, "grad_norm": 1.5522043742149023, "learning_rate": 1e-05, "loss": 0.8914, "step": 578 }, { "epoch": 0.46285286035473394, "grad_norm": 1.633867715589152, "learning_rate": 1e-05, "loss": 0.8741, "step": 579 }, { "epoch": 0.4636522608043967, "grad_norm": 1.614894631262607, "learning_rate": 1e-05, "loss": 0.9171, "step": 580 }, { "epoch": 0.46445166125405946, "grad_norm": 1.386145144430922, "learning_rate": 1e-05, "loss": 0.8693, "step": 581 }, { "epoch": 0.4652510617037222, "grad_norm": 1.484841140261494, "learning_rate": 1e-05, "loss": 0.8966, "step": 582 }, { "epoch": 0.46605046215338497, "grad_norm": 1.6068617064880517, "learning_rate": 1e-05, "loss": 0.8818, "step": 583 }, { "epoch": 0.4668498626030477, "grad_norm": 1.6096786496184112, "learning_rate": 1e-05, "loss": 0.9123, "step": 584 }, { "epoch": 0.4676492630527105, "grad_norm": 1.4602535645871833, "learning_rate": 1e-05, "loss": 0.9143, "step": 585 }, { "epoch": 0.46844866350237324, "grad_norm": 1.7447912274361523, "learning_rate": 1e-05, "loss": 0.8966, "step": 586 }, { "epoch": 0.469248063952036, "grad_norm": 1.5775439912332734, "learning_rate": 1e-05, "loss": 0.8994, "step": 587 }, { "epoch": 0.4700474644016987, "grad_norm": 1.4031832068470533, "learning_rate": 1e-05, "loss": 0.9055, "step": 588 }, { "epoch": 0.47084686485136146, "grad_norm": 1.5789430313417314, "learning_rate": 1e-05, "loss": 0.9393, "step": 589 }, { "epoch": 0.4716462653010242, "grad_norm": 1.4655734741114497, "learning_rate": 1e-05, "loss": 0.8889, "step": 590 }, { "epoch": 0.47244566575068697, "grad_norm": 1.752804541715281, "learning_rate": 1e-05, "loss": 0.9166, "step": 591 }, { "epoch": 0.47324506620034973, "grad_norm": 1.6906678527664594, "learning_rate": 1e-05, "loss": 0.8673, "step": 592 }, { "epoch": 0.4740444666500125, "grad_norm": 1.5985802845452706, "learning_rate": 1e-05, "loss": 0.9435, "step": 593 }, { "epoch": 0.47484386709967524, "grad_norm": 1.6997316043068198, "learning_rate": 1e-05, "loss": 0.9112, "step": 594 }, { "epoch": 0.475643267549338, "grad_norm": 1.3896008701013607, "learning_rate": 1e-05, "loss": 0.8884, "step": 595 }, { "epoch": 0.47644266799900076, "grad_norm": 1.4232134469996818, "learning_rate": 1e-05, "loss": 0.8537, "step": 596 }, { "epoch": 0.4772420684486635, "grad_norm": 1.4962294604199373, "learning_rate": 1e-05, "loss": 0.8599, "step": 597 }, { "epoch": 0.47804146889832627, "grad_norm": 1.3445821960864492, "learning_rate": 1e-05, "loss": 0.8719, "step": 598 }, { "epoch": 0.47884086934798903, "grad_norm": 1.5426225615913305, "learning_rate": 1e-05, "loss": 0.9097, "step": 599 }, { "epoch": 0.4796402697976518, "grad_norm": 1.4650349809263883, "learning_rate": 1e-05, "loss": 0.8933, "step": 600 }, { "epoch": 0.4804396702473145, "grad_norm": 1.5753170073693514, "learning_rate": 1e-05, "loss": 0.9461, "step": 601 }, { "epoch": 0.48123907069697724, "grad_norm": 1.6207854665284498, "learning_rate": 1e-05, "loss": 0.8332, "step": 602 }, { "epoch": 0.48203847114664, "grad_norm": 1.6847020603077485, "learning_rate": 1e-05, "loss": 0.8902, "step": 603 }, { "epoch": 0.48283787159630276, "grad_norm": 1.746631687170473, "learning_rate": 1e-05, "loss": 0.852, "step": 604 }, { "epoch": 0.4836372720459655, "grad_norm": 1.5812097478750036, "learning_rate": 1e-05, "loss": 0.8875, "step": 605 }, { "epoch": 0.4844366724956283, "grad_norm": 1.664501332749721, "learning_rate": 1e-05, "loss": 0.9623, "step": 606 }, { "epoch": 0.48523607294529103, "grad_norm": 1.3887624769518734, "learning_rate": 1e-05, "loss": 0.8815, "step": 607 }, { "epoch": 0.4860354733949538, "grad_norm": 1.4487321005360188, "learning_rate": 1e-05, "loss": 0.8985, "step": 608 }, { "epoch": 0.48683487384461654, "grad_norm": 1.4789561071530237, "learning_rate": 1e-05, "loss": 0.9061, "step": 609 }, { "epoch": 0.4876342742942793, "grad_norm": 1.5069409156312008, "learning_rate": 1e-05, "loss": 0.9286, "step": 610 }, { "epoch": 0.48843367474394206, "grad_norm": 1.4663884880855809, "learning_rate": 1e-05, "loss": 0.9382, "step": 611 }, { "epoch": 0.4892330751936048, "grad_norm": 1.4795946008795262, "learning_rate": 1e-05, "loss": 0.9013, "step": 612 }, { "epoch": 0.4900324756432676, "grad_norm": 1.6550390075160482, "learning_rate": 1e-05, "loss": 0.8981, "step": 613 }, { "epoch": 0.4908318760929303, "grad_norm": 1.5252370570410794, "learning_rate": 1e-05, "loss": 0.9399, "step": 614 }, { "epoch": 0.49163127654259303, "grad_norm": 1.55342646595899, "learning_rate": 1e-05, "loss": 0.9369, "step": 615 }, { "epoch": 0.4924306769922558, "grad_norm": 1.3945867465343513, "learning_rate": 1e-05, "loss": 0.9739, "step": 616 }, { "epoch": 0.49323007744191855, "grad_norm": 1.8084042523739312, "learning_rate": 1e-05, "loss": 0.8568, "step": 617 }, { "epoch": 0.4940294778915813, "grad_norm": 1.3957730664102426, "learning_rate": 1e-05, "loss": 0.9212, "step": 618 }, { "epoch": 0.49482887834124406, "grad_norm": 1.576073681260172, "learning_rate": 1e-05, "loss": 0.9323, "step": 619 }, { "epoch": 0.4956282787909068, "grad_norm": 1.478562229589502, "learning_rate": 1e-05, "loss": 0.8784, "step": 620 }, { "epoch": 0.4964276792405696, "grad_norm": 1.5742856570618204, "learning_rate": 1e-05, "loss": 0.8949, "step": 621 }, { "epoch": 0.49722707969023233, "grad_norm": 1.7717496405831807, "learning_rate": 1e-05, "loss": 0.8882, "step": 622 }, { "epoch": 0.4980264801398951, "grad_norm": 1.512802542889935, "learning_rate": 1e-05, "loss": 0.9555, "step": 623 }, { "epoch": 0.49882588058955785, "grad_norm": 1.44305014112251, "learning_rate": 1e-05, "loss": 0.9058, "step": 624 }, { "epoch": 0.4996252810392206, "grad_norm": 1.6065628841661808, "learning_rate": 1e-05, "loss": 0.8697, "step": 625 }, { "epoch": 0.5004246814888833, "grad_norm": 1.5176075034291314, "learning_rate": 1e-05, "loss": 0.8774, "step": 626 }, { "epoch": 0.5012240819385461, "grad_norm": 1.4385224047152578, "learning_rate": 1e-05, "loss": 0.9092, "step": 627 }, { "epoch": 0.5020234823882088, "grad_norm": 1.5846911793271963, "learning_rate": 1e-05, "loss": 0.9333, "step": 628 }, { "epoch": 0.5028228828378716, "grad_norm": 1.6455364602527989, "learning_rate": 1e-05, "loss": 0.9412, "step": 629 }, { "epoch": 0.5036222832875343, "grad_norm": 1.6062811152199334, "learning_rate": 1e-05, "loss": 0.9091, "step": 630 }, { "epoch": 0.5044216837371971, "grad_norm": 1.4354611082735989, "learning_rate": 1e-05, "loss": 0.907, "step": 631 }, { "epoch": 0.5052210841868598, "grad_norm": 1.52829754540632, "learning_rate": 1e-05, "loss": 0.9195, "step": 632 }, { "epoch": 0.5060204846365226, "grad_norm": 1.6184765917993094, "learning_rate": 1e-05, "loss": 0.9452, "step": 633 }, { "epoch": 0.5068198850861854, "grad_norm": 1.5257888577090237, "learning_rate": 1e-05, "loss": 0.8772, "step": 634 }, { "epoch": 0.5076192855358481, "grad_norm": 1.4539514346389641, "learning_rate": 1e-05, "loss": 0.8898, "step": 635 }, { "epoch": 0.5084186859855109, "grad_norm": 1.6554813398137607, "learning_rate": 1e-05, "loss": 0.8987, "step": 636 }, { "epoch": 0.5092180864351736, "grad_norm": 1.4575833122082418, "learning_rate": 1e-05, "loss": 0.9455, "step": 637 }, { "epoch": 0.5100174868848364, "grad_norm": 1.651253682354515, "learning_rate": 1e-05, "loss": 0.8731, "step": 638 }, { "epoch": 0.5108168873344991, "grad_norm": 1.608007000762813, "learning_rate": 1e-05, "loss": 0.9068, "step": 639 }, { "epoch": 0.5116162877841619, "grad_norm": 1.4451823786722864, "learning_rate": 1e-05, "loss": 0.9363, "step": 640 }, { "epoch": 0.5124156882338247, "grad_norm": 1.585132398185237, "learning_rate": 1e-05, "loss": 0.8394, "step": 641 }, { "epoch": 0.5132150886834874, "grad_norm": 1.5460763577114784, "learning_rate": 1e-05, "loss": 0.8782, "step": 642 }, { "epoch": 0.5140144891331502, "grad_norm": 1.5336894539869739, "learning_rate": 1e-05, "loss": 0.8913, "step": 643 }, { "epoch": 0.5148138895828129, "grad_norm": 1.5563638706418883, "learning_rate": 1e-05, "loss": 0.891, "step": 644 }, { "epoch": 0.5156132900324757, "grad_norm": 1.6781793712825763, "learning_rate": 1e-05, "loss": 0.9243, "step": 645 }, { "epoch": 0.5164126904821384, "grad_norm": 1.555015563156278, "learning_rate": 1e-05, "loss": 0.8733, "step": 646 }, { "epoch": 0.5172120909318011, "grad_norm": 1.4725706386221917, "learning_rate": 1e-05, "loss": 0.8681, "step": 647 }, { "epoch": 0.5180114913814639, "grad_norm": 1.3268747875477092, "learning_rate": 1e-05, "loss": 0.8807, "step": 648 }, { "epoch": 0.5188108918311266, "grad_norm": 1.5451710380595707, "learning_rate": 1e-05, "loss": 0.9226, "step": 649 }, { "epoch": 0.5196102922807894, "grad_norm": 1.5573419054386046, "learning_rate": 1e-05, "loss": 0.9044, "step": 650 }, { "epoch": 0.5204096927304521, "grad_norm": 1.2989474126701601, "learning_rate": 1e-05, "loss": 0.8532, "step": 651 }, { "epoch": 0.5212090931801149, "grad_norm": 1.696305481260023, "learning_rate": 1e-05, "loss": 0.8595, "step": 652 }, { "epoch": 0.5220084936297776, "grad_norm": 1.4451028681658686, "learning_rate": 1e-05, "loss": 0.933, "step": 653 }, { "epoch": 0.5228078940794404, "grad_norm": 1.5925002414772222, "learning_rate": 1e-05, "loss": 0.9286, "step": 654 }, { "epoch": 0.5236072945291032, "grad_norm": 1.4716559853454252, "learning_rate": 1e-05, "loss": 0.8943, "step": 655 }, { "epoch": 0.5244066949787659, "grad_norm": 1.7475122640309384, "learning_rate": 1e-05, "loss": 0.9523, "step": 656 }, { "epoch": 0.5252060954284287, "grad_norm": 1.645603067269987, "learning_rate": 1e-05, "loss": 0.9075, "step": 657 }, { "epoch": 0.5260054958780914, "grad_norm": 1.6726736025945501, "learning_rate": 1e-05, "loss": 0.9039, "step": 658 }, { "epoch": 0.5268048963277542, "grad_norm": 1.518938137250405, "learning_rate": 1e-05, "loss": 0.8761, "step": 659 }, { "epoch": 0.5276042967774169, "grad_norm": 1.7083121838298914, "learning_rate": 1e-05, "loss": 0.8697, "step": 660 }, { "epoch": 0.5284036972270797, "grad_norm": 1.5499463775414077, "learning_rate": 1e-05, "loss": 0.8976, "step": 661 }, { "epoch": 0.5292030976767425, "grad_norm": 1.4098782407183605, "learning_rate": 1e-05, "loss": 0.8753, "step": 662 }, { "epoch": 0.5300024981264052, "grad_norm": 1.7641918962063994, "learning_rate": 1e-05, "loss": 0.9194, "step": 663 }, { "epoch": 0.530801898576068, "grad_norm": 1.5469637110527181, "learning_rate": 1e-05, "loss": 0.9059, "step": 664 }, { "epoch": 0.5316012990257307, "grad_norm": 1.6487062365426841, "learning_rate": 1e-05, "loss": 0.8897, "step": 665 }, { "epoch": 0.5324006994753935, "grad_norm": 1.5205006643304535, "learning_rate": 1e-05, "loss": 0.9216, "step": 666 }, { "epoch": 0.5332000999250562, "grad_norm": 1.5325976583230465, "learning_rate": 1e-05, "loss": 0.8957, "step": 667 }, { "epoch": 0.533999500374719, "grad_norm": 1.720042040656152, "learning_rate": 1e-05, "loss": 0.8832, "step": 668 }, { "epoch": 0.5347989008243818, "grad_norm": 1.4435271985771057, "learning_rate": 1e-05, "loss": 0.8587, "step": 669 }, { "epoch": 0.5355983012740445, "grad_norm": 1.7309862738667545, "learning_rate": 1e-05, "loss": 0.8801, "step": 670 }, { "epoch": 0.5363977017237073, "grad_norm": 1.51553026472629, "learning_rate": 1e-05, "loss": 0.8948, "step": 671 }, { "epoch": 0.53719710217337, "grad_norm": 1.5034966185821361, "learning_rate": 1e-05, "loss": 0.8825, "step": 672 }, { "epoch": 0.5379965026230328, "grad_norm": 1.5085135625486585, "learning_rate": 1e-05, "loss": 0.8758, "step": 673 }, { "epoch": 0.5387959030726954, "grad_norm": 1.5419185376449267, "learning_rate": 1e-05, "loss": 0.9952, "step": 674 }, { "epoch": 0.5395953035223582, "grad_norm": 1.516360272741118, "learning_rate": 1e-05, "loss": 0.8744, "step": 675 }, { "epoch": 0.5403947039720209, "grad_norm": 1.6057277324687185, "learning_rate": 1e-05, "loss": 0.9379, "step": 676 }, { "epoch": 0.5411941044216837, "grad_norm": 1.5074825505125475, "learning_rate": 1e-05, "loss": 0.8687, "step": 677 }, { "epoch": 0.5419935048713465, "grad_norm": 1.564461484690962, "learning_rate": 1e-05, "loss": 0.928, "step": 678 }, { "epoch": 0.5427929053210092, "grad_norm": 1.5052766213063988, "learning_rate": 1e-05, "loss": 0.909, "step": 679 }, { "epoch": 0.543592305770672, "grad_norm": 1.3946507047858405, "learning_rate": 1e-05, "loss": 0.8984, "step": 680 }, { "epoch": 0.5443917062203347, "grad_norm": 1.524550146914044, "learning_rate": 1e-05, "loss": 0.9103, "step": 681 }, { "epoch": 0.5451911066699975, "grad_norm": 1.743015450167898, "learning_rate": 1e-05, "loss": 0.8817, "step": 682 }, { "epoch": 0.5459905071196602, "grad_norm": 1.2727179347293005, "learning_rate": 1e-05, "loss": 0.9565, "step": 683 }, { "epoch": 0.546789907569323, "grad_norm": 1.4218645212985512, "learning_rate": 1e-05, "loss": 0.9361, "step": 684 }, { "epoch": 0.5475893080189858, "grad_norm": 1.5827671331667068, "learning_rate": 1e-05, "loss": 0.9195, "step": 685 }, { "epoch": 0.5483887084686485, "grad_norm": 1.5111024964279403, "learning_rate": 1e-05, "loss": 0.8975, "step": 686 }, { "epoch": 0.5491881089183113, "grad_norm": 1.698526384803921, "learning_rate": 1e-05, "loss": 0.8693, "step": 687 }, { "epoch": 0.549987509367974, "grad_norm": 1.4499111433077698, "learning_rate": 1e-05, "loss": 0.9304, "step": 688 }, { "epoch": 0.5507869098176368, "grad_norm": 1.6198855701994876, "learning_rate": 1e-05, "loss": 0.9071, "step": 689 }, { "epoch": 0.5515863102672995, "grad_norm": 1.447799249815993, "learning_rate": 1e-05, "loss": 0.8577, "step": 690 }, { "epoch": 0.5523857107169623, "grad_norm": 1.4643912062350883, "learning_rate": 1e-05, "loss": 0.8841, "step": 691 }, { "epoch": 0.553185111166625, "grad_norm": 1.2539155232355081, "learning_rate": 1e-05, "loss": 0.902, "step": 692 }, { "epoch": 0.5539845116162878, "grad_norm": 1.547551258731981, "learning_rate": 1e-05, "loss": 0.9678, "step": 693 }, { "epoch": 0.5547839120659506, "grad_norm": 1.4727625062306167, "learning_rate": 1e-05, "loss": 0.8882, "step": 694 }, { "epoch": 0.5555833125156133, "grad_norm": 1.4776645587359942, "learning_rate": 1e-05, "loss": 0.8921, "step": 695 }, { "epoch": 0.5563827129652761, "grad_norm": 1.4858467571616956, "learning_rate": 1e-05, "loss": 0.9032, "step": 696 }, { "epoch": 0.5571821134149388, "grad_norm": 1.6272094570109954, "learning_rate": 1e-05, "loss": 0.8754, "step": 697 }, { "epoch": 0.5579815138646016, "grad_norm": 1.5209165879169078, "learning_rate": 1e-05, "loss": 0.8892, "step": 698 }, { "epoch": 0.5587809143142644, "grad_norm": 1.5534555948764655, "learning_rate": 1e-05, "loss": 0.8518, "step": 699 }, { "epoch": 0.559580314763927, "grad_norm": 1.7199439771989053, "learning_rate": 1e-05, "loss": 0.9232, "step": 700 }, { "epoch": 0.5603797152135898, "grad_norm": 1.3598124896967667, "learning_rate": 1e-05, "loss": 0.9486, "step": 701 }, { "epoch": 0.5611791156632525, "grad_norm": 1.612574738886904, "learning_rate": 1e-05, "loss": 0.8794, "step": 702 }, { "epoch": 0.5619785161129153, "grad_norm": 1.3832262396852995, "learning_rate": 1e-05, "loss": 0.9321, "step": 703 }, { "epoch": 0.562777916562578, "grad_norm": 1.5923071651772416, "learning_rate": 1e-05, "loss": 0.8793, "step": 704 }, { "epoch": 0.5635773170122408, "grad_norm": 1.5489614624229269, "learning_rate": 1e-05, "loss": 0.8701, "step": 705 }, { "epoch": 0.5643767174619035, "grad_norm": 1.4485859965980266, "learning_rate": 1e-05, "loss": 0.8911, "step": 706 }, { "epoch": 0.5651761179115663, "grad_norm": 1.3786799015631879, "learning_rate": 1e-05, "loss": 0.9122, "step": 707 }, { "epoch": 0.5659755183612291, "grad_norm": 1.5342699233246582, "learning_rate": 1e-05, "loss": 0.8847, "step": 708 }, { "epoch": 0.5667749188108918, "grad_norm": 1.5871051701796994, "learning_rate": 1e-05, "loss": 0.884, "step": 709 }, { "epoch": 0.5675743192605546, "grad_norm": 1.5660069308536273, "learning_rate": 1e-05, "loss": 0.8551, "step": 710 }, { "epoch": 0.5683737197102173, "grad_norm": 1.429596069400543, "learning_rate": 1e-05, "loss": 0.8957, "step": 711 }, { "epoch": 0.5691731201598801, "grad_norm": 1.4491964477267238, "learning_rate": 1e-05, "loss": 0.8531, "step": 712 }, { "epoch": 0.5699725206095428, "grad_norm": 1.4365928694753973, "learning_rate": 1e-05, "loss": 0.8761, "step": 713 }, { "epoch": 0.5707719210592056, "grad_norm": 1.5578122539795014, "learning_rate": 1e-05, "loss": 0.8804, "step": 714 }, { "epoch": 0.5715713215088684, "grad_norm": 1.5327160301768794, "learning_rate": 1e-05, "loss": 0.8559, "step": 715 }, { "epoch": 0.5723707219585311, "grad_norm": 1.530442187113109, "learning_rate": 1e-05, "loss": 0.8689, "step": 716 }, { "epoch": 0.5731701224081939, "grad_norm": 1.5680317221543405, "learning_rate": 1e-05, "loss": 0.8969, "step": 717 }, { "epoch": 0.5739695228578566, "grad_norm": 1.5241288570093494, "learning_rate": 1e-05, "loss": 0.9161, "step": 718 }, { "epoch": 0.5747689233075194, "grad_norm": 1.3731985273369733, "learning_rate": 1e-05, "loss": 0.8568, "step": 719 }, { "epoch": 0.5755683237571821, "grad_norm": 1.4645302822523454, "learning_rate": 1e-05, "loss": 0.899, "step": 720 }, { "epoch": 0.5763677242068449, "grad_norm": 1.429554718936312, "learning_rate": 1e-05, "loss": 0.9161, "step": 721 }, { "epoch": 0.5771671246565077, "grad_norm": 1.3621850244930958, "learning_rate": 1e-05, "loss": 0.9169, "step": 722 }, { "epoch": 0.5779665251061704, "grad_norm": 1.485846183303666, "learning_rate": 1e-05, "loss": 0.9811, "step": 723 }, { "epoch": 0.5787659255558332, "grad_norm": 1.4036480667947844, "learning_rate": 1e-05, "loss": 0.8841, "step": 724 }, { "epoch": 0.5795653260054959, "grad_norm": 1.3680437907081195, "learning_rate": 1e-05, "loss": 0.861, "step": 725 }, { "epoch": 0.5803647264551587, "grad_norm": 1.4902900528640177, "learning_rate": 1e-05, "loss": 0.9022, "step": 726 }, { "epoch": 0.5811641269048213, "grad_norm": 1.367169701352056, "learning_rate": 1e-05, "loss": 0.9091, "step": 727 }, { "epoch": 0.5819635273544841, "grad_norm": 1.6487586565871948, "learning_rate": 1e-05, "loss": 0.9328, "step": 728 }, { "epoch": 0.5827629278041468, "grad_norm": 1.6567920316755664, "learning_rate": 1e-05, "loss": 0.8662, "step": 729 }, { "epoch": 0.5835623282538096, "grad_norm": 1.3391698664356693, "learning_rate": 1e-05, "loss": 0.8993, "step": 730 }, { "epoch": 0.5843617287034724, "grad_norm": 1.3695456445124472, "learning_rate": 1e-05, "loss": 0.8371, "step": 731 }, { "epoch": 0.5851611291531351, "grad_norm": 1.418306336363921, "learning_rate": 1e-05, "loss": 0.9092, "step": 732 }, { "epoch": 0.5859605296027979, "grad_norm": 1.7580509988769806, "learning_rate": 1e-05, "loss": 0.9117, "step": 733 }, { "epoch": 0.5867599300524606, "grad_norm": 1.4969730064494027, "learning_rate": 1e-05, "loss": 0.8777, "step": 734 }, { "epoch": 0.5875593305021234, "grad_norm": 1.5311661672699555, "learning_rate": 1e-05, "loss": 0.8633, "step": 735 }, { "epoch": 0.5883587309517861, "grad_norm": 1.4485040495772017, "learning_rate": 1e-05, "loss": 0.9358, "step": 736 }, { "epoch": 0.5891581314014489, "grad_norm": 1.4826514641684152, "learning_rate": 1e-05, "loss": 0.852, "step": 737 }, { "epoch": 0.5899575318511117, "grad_norm": 1.48791832285035, "learning_rate": 1e-05, "loss": 0.8782, "step": 738 }, { "epoch": 0.5907569323007744, "grad_norm": 1.6057041771896603, "learning_rate": 1e-05, "loss": 0.8316, "step": 739 }, { "epoch": 0.5915563327504372, "grad_norm": 2.1038688962784593, "learning_rate": 1e-05, "loss": 0.8521, "step": 740 }, { "epoch": 0.5923557332000999, "grad_norm": 1.621521996919619, "learning_rate": 1e-05, "loss": 0.8843, "step": 741 }, { "epoch": 0.5931551336497627, "grad_norm": 1.471365198038119, "learning_rate": 1e-05, "loss": 0.8784, "step": 742 }, { "epoch": 0.5939545340994254, "grad_norm": 1.4957177407162774, "learning_rate": 1e-05, "loss": 0.9049, "step": 743 }, { "epoch": 0.5947539345490882, "grad_norm": 1.4767129920631528, "learning_rate": 1e-05, "loss": 0.8556, "step": 744 }, { "epoch": 0.595553334998751, "grad_norm": 1.4751091109435195, "learning_rate": 1e-05, "loss": 0.8525, "step": 745 }, { "epoch": 0.5963527354484137, "grad_norm": 1.5180861867428592, "learning_rate": 1e-05, "loss": 0.8986, "step": 746 }, { "epoch": 0.5971521358980765, "grad_norm": 1.716833225193397, "learning_rate": 1e-05, "loss": 0.9396, "step": 747 }, { "epoch": 0.5979515363477392, "grad_norm": 1.662846684061582, "learning_rate": 1e-05, "loss": 0.8806, "step": 748 }, { "epoch": 0.598750936797402, "grad_norm": 1.5397292686479351, "learning_rate": 1e-05, "loss": 0.9085, "step": 749 }, { "epoch": 0.5995503372470647, "grad_norm": 1.3571032049534457, "learning_rate": 1e-05, "loss": 0.9406, "step": 750 }, { "epoch": 0.6003497376967275, "grad_norm": 1.384922018598161, "learning_rate": 1e-05, "loss": 0.8956, "step": 751 }, { "epoch": 0.6011491381463903, "grad_norm": 1.496498809863047, "learning_rate": 1e-05, "loss": 0.8918, "step": 752 }, { "epoch": 0.6019485385960529, "grad_norm": 1.4830953787172334, "learning_rate": 1e-05, "loss": 0.8431, "step": 753 }, { "epoch": 0.6027479390457157, "grad_norm": 1.6829833333195696, "learning_rate": 1e-05, "loss": 0.8561, "step": 754 }, { "epoch": 0.6035473394953784, "grad_norm": 1.3654050365320536, "learning_rate": 1e-05, "loss": 0.9101, "step": 755 }, { "epoch": 0.6043467399450412, "grad_norm": 1.5240419337473992, "learning_rate": 1e-05, "loss": 0.8338, "step": 756 }, { "epoch": 0.6051461403947039, "grad_norm": 1.5491861960420192, "learning_rate": 1e-05, "loss": 0.8921, "step": 757 }, { "epoch": 0.6059455408443667, "grad_norm": 1.4148317529647148, "learning_rate": 1e-05, "loss": 0.8677, "step": 758 }, { "epoch": 0.6067449412940294, "grad_norm": 1.469343003903587, "learning_rate": 1e-05, "loss": 0.8734, "step": 759 }, { "epoch": 0.6075443417436922, "grad_norm": 1.319737814833517, "learning_rate": 1e-05, "loss": 0.8688, "step": 760 }, { "epoch": 0.608343742193355, "grad_norm": 1.4581636035714403, "learning_rate": 1e-05, "loss": 0.8753, "step": 761 }, { "epoch": 0.6091431426430177, "grad_norm": 1.8427778000120836, "learning_rate": 1e-05, "loss": 0.9185, "step": 762 }, { "epoch": 0.6099425430926805, "grad_norm": 1.4013027241862714, "learning_rate": 1e-05, "loss": 0.9376, "step": 763 }, { "epoch": 0.6107419435423432, "grad_norm": 1.5267045554235308, "learning_rate": 1e-05, "loss": 0.8835, "step": 764 }, { "epoch": 0.611541343992006, "grad_norm": 1.4715893506156257, "learning_rate": 1e-05, "loss": 0.8676, "step": 765 }, { "epoch": 0.6123407444416687, "grad_norm": 1.4577005776877618, "learning_rate": 1e-05, "loss": 0.8796, "step": 766 }, { "epoch": 0.6131401448913315, "grad_norm": 1.4934814897272444, "learning_rate": 1e-05, "loss": 0.8458, "step": 767 }, { "epoch": 0.6139395453409943, "grad_norm": 1.5364809951583207, "learning_rate": 1e-05, "loss": 0.8316, "step": 768 }, { "epoch": 0.614738945790657, "grad_norm": 1.4992439555873935, "learning_rate": 1e-05, "loss": 0.9177, "step": 769 }, { "epoch": 0.6155383462403198, "grad_norm": 1.4324130065382474, "learning_rate": 1e-05, "loss": 0.9105, "step": 770 }, { "epoch": 0.6163377466899825, "grad_norm": 1.372488633970353, "learning_rate": 1e-05, "loss": 0.9365, "step": 771 }, { "epoch": 0.6171371471396453, "grad_norm": 1.3430055625087858, "learning_rate": 1e-05, "loss": 0.887, "step": 772 }, { "epoch": 0.617936547589308, "grad_norm": 1.4070687341497352, "learning_rate": 1e-05, "loss": 0.918, "step": 773 }, { "epoch": 0.6187359480389708, "grad_norm": 1.4126858378429896, "learning_rate": 1e-05, "loss": 0.8249, "step": 774 }, { "epoch": 0.6195353484886336, "grad_norm": 1.5659156867498283, "learning_rate": 1e-05, "loss": 0.8313, "step": 775 }, { "epoch": 0.6203347489382963, "grad_norm": 1.4546097055174756, "learning_rate": 1e-05, "loss": 0.8701, "step": 776 }, { "epoch": 0.6211341493879591, "grad_norm": 1.4487557061202467, "learning_rate": 1e-05, "loss": 0.9272, "step": 777 }, { "epoch": 0.6219335498376218, "grad_norm": 1.6276489271011279, "learning_rate": 1e-05, "loss": 0.9506, "step": 778 }, { "epoch": 0.6227329502872845, "grad_norm": 1.6078488944139557, "learning_rate": 1e-05, "loss": 0.9327, "step": 779 }, { "epoch": 0.6235323507369472, "grad_norm": 1.4508829251993478, "learning_rate": 1e-05, "loss": 0.9369, "step": 780 }, { "epoch": 0.62433175118661, "grad_norm": 1.612776765629144, "learning_rate": 1e-05, "loss": 0.9124, "step": 781 }, { "epoch": 0.6251311516362728, "grad_norm": 1.5608370989668476, "learning_rate": 1e-05, "loss": 0.8758, "step": 782 }, { "epoch": 0.6259305520859355, "grad_norm": 1.5129857913859477, "learning_rate": 1e-05, "loss": 0.891, "step": 783 }, { "epoch": 0.6267299525355983, "grad_norm": 1.4321443280452155, "learning_rate": 1e-05, "loss": 0.865, "step": 784 }, { "epoch": 0.627529352985261, "grad_norm": 1.5058564295604038, "learning_rate": 1e-05, "loss": 0.8721, "step": 785 }, { "epoch": 0.6283287534349238, "grad_norm": 1.3807849349968864, "learning_rate": 1e-05, "loss": 0.886, "step": 786 }, { "epoch": 0.6291281538845865, "grad_norm": 1.544883025432354, "learning_rate": 1e-05, "loss": 0.9102, "step": 787 }, { "epoch": 0.6299275543342493, "grad_norm": 1.4150356335689325, "learning_rate": 1e-05, "loss": 0.9361, "step": 788 }, { "epoch": 0.630726954783912, "grad_norm": 1.5188112447723208, "learning_rate": 1e-05, "loss": 0.8874, "step": 789 }, { "epoch": 0.6315263552335748, "grad_norm": 1.5441059644669919, "learning_rate": 1e-05, "loss": 0.9105, "step": 790 }, { "epoch": 0.6323257556832376, "grad_norm": 1.7469333936594207, "learning_rate": 1e-05, "loss": 0.8572, "step": 791 }, { "epoch": 0.6331251561329003, "grad_norm": 1.6602103078622925, "learning_rate": 1e-05, "loss": 0.9294, "step": 792 }, { "epoch": 0.6339245565825631, "grad_norm": 1.5925807734316682, "learning_rate": 1e-05, "loss": 0.9744, "step": 793 }, { "epoch": 0.6347239570322258, "grad_norm": 1.5394065631369533, "learning_rate": 1e-05, "loss": 0.9164, "step": 794 }, { "epoch": 0.6355233574818886, "grad_norm": 1.5935047510060332, "learning_rate": 1e-05, "loss": 0.8769, "step": 795 }, { "epoch": 0.6363227579315514, "grad_norm": 1.344142047079821, "learning_rate": 1e-05, "loss": 0.9317, "step": 796 }, { "epoch": 0.6371221583812141, "grad_norm": 1.6200454224138392, "learning_rate": 1e-05, "loss": 0.8334, "step": 797 }, { "epoch": 0.6379215588308769, "grad_norm": 1.5204016202631034, "learning_rate": 1e-05, "loss": 0.9006, "step": 798 }, { "epoch": 0.6387209592805396, "grad_norm": 1.4920314496701772, "learning_rate": 1e-05, "loss": 0.8501, "step": 799 }, { "epoch": 0.6395203597302024, "grad_norm": 1.3209265560951622, "learning_rate": 1e-05, "loss": 0.9025, "step": 800 }, { "epoch": 0.6403197601798651, "grad_norm": 1.5701927388007535, "learning_rate": 1e-05, "loss": 0.8747, "step": 801 }, { "epoch": 0.6411191606295279, "grad_norm": 1.3344795038412969, "learning_rate": 1e-05, "loss": 0.9104, "step": 802 }, { "epoch": 0.6419185610791907, "grad_norm": 1.3938320762656133, "learning_rate": 1e-05, "loss": 0.8409, "step": 803 }, { "epoch": 0.6427179615288534, "grad_norm": 1.4249626741383923, "learning_rate": 1e-05, "loss": 0.8727, "step": 804 }, { "epoch": 0.6435173619785162, "grad_norm": 1.6691646244578324, "learning_rate": 1e-05, "loss": 0.8903, "step": 805 }, { "epoch": 0.6443167624281788, "grad_norm": 1.665931296408499, "learning_rate": 1e-05, "loss": 0.8787, "step": 806 }, { "epoch": 0.6451161628778416, "grad_norm": 1.693200235102736, "learning_rate": 1e-05, "loss": 0.8462, "step": 807 }, { "epoch": 0.6459155633275043, "grad_norm": 1.4005335152598601, "learning_rate": 1e-05, "loss": 0.8637, "step": 808 }, { "epoch": 0.6467149637771671, "grad_norm": 1.5270196926285917, "learning_rate": 1e-05, "loss": 0.86, "step": 809 }, { "epoch": 0.6475143642268298, "grad_norm": 1.4150346179433293, "learning_rate": 1e-05, "loss": 0.8734, "step": 810 }, { "epoch": 0.6483137646764926, "grad_norm": 1.53091696763508, "learning_rate": 1e-05, "loss": 0.8754, "step": 811 }, { "epoch": 0.6491131651261554, "grad_norm": 1.474027558315905, "learning_rate": 1e-05, "loss": 0.9586, "step": 812 }, { "epoch": 0.6499125655758181, "grad_norm": 1.485859581480546, "learning_rate": 1e-05, "loss": 0.9106, "step": 813 }, { "epoch": 0.6507119660254809, "grad_norm": 1.568460720361032, "learning_rate": 1e-05, "loss": 0.8803, "step": 814 }, { "epoch": 0.6515113664751436, "grad_norm": 1.5563031313131295, "learning_rate": 1e-05, "loss": 0.9097, "step": 815 }, { "epoch": 0.6523107669248064, "grad_norm": 1.5440917854626373, "learning_rate": 1e-05, "loss": 0.9062, "step": 816 }, { "epoch": 0.6531101673744691, "grad_norm": 1.5083755089979098, "learning_rate": 1e-05, "loss": 0.8674, "step": 817 }, { "epoch": 0.6539095678241319, "grad_norm": 1.508645000565019, "learning_rate": 1e-05, "loss": 0.8815, "step": 818 }, { "epoch": 0.6547089682737947, "grad_norm": 1.6098529049906811, "learning_rate": 1e-05, "loss": 0.8344, "step": 819 }, { "epoch": 0.6555083687234574, "grad_norm": 1.711843405154856, "learning_rate": 1e-05, "loss": 0.9035, "step": 820 }, { "epoch": 0.6563077691731202, "grad_norm": 1.4578793644862615, "learning_rate": 1e-05, "loss": 0.8953, "step": 821 }, { "epoch": 0.6571071696227829, "grad_norm": 1.5916969602134543, "learning_rate": 1e-05, "loss": 0.8868, "step": 822 }, { "epoch": 0.6579065700724457, "grad_norm": 1.7747741238079355, "learning_rate": 1e-05, "loss": 0.8762, "step": 823 }, { "epoch": 0.6587059705221084, "grad_norm": 1.610938375922778, "learning_rate": 1e-05, "loss": 0.9062, "step": 824 }, { "epoch": 0.6595053709717712, "grad_norm": 1.6873519485834756, "learning_rate": 1e-05, "loss": 0.8631, "step": 825 }, { "epoch": 0.660304771421434, "grad_norm": 1.430821156429654, "learning_rate": 1e-05, "loss": 0.9604, "step": 826 }, { "epoch": 0.6611041718710967, "grad_norm": 1.457720171628577, "learning_rate": 1e-05, "loss": 0.8823, "step": 827 }, { "epoch": 0.6619035723207595, "grad_norm": 1.3817461766649617, "learning_rate": 1e-05, "loss": 0.9294, "step": 828 }, { "epoch": 0.6627029727704222, "grad_norm": 1.4095998527286095, "learning_rate": 1e-05, "loss": 0.8562, "step": 829 }, { "epoch": 0.663502373220085, "grad_norm": 1.4396424977428872, "learning_rate": 1e-05, "loss": 0.8256, "step": 830 }, { "epoch": 0.6643017736697477, "grad_norm": 1.38822130860778, "learning_rate": 1e-05, "loss": 0.8717, "step": 831 }, { "epoch": 0.6651011741194104, "grad_norm": 1.4057148558281964, "learning_rate": 1e-05, "loss": 0.9041, "step": 832 }, { "epoch": 0.6659005745690731, "grad_norm": 1.4772530181187606, "learning_rate": 1e-05, "loss": 0.9316, "step": 833 }, { "epoch": 0.6666999750187359, "grad_norm": 1.5248374759511425, "learning_rate": 1e-05, "loss": 0.8771, "step": 834 }, { "epoch": 0.6674993754683987, "grad_norm": 1.5352948925732954, "learning_rate": 1e-05, "loss": 0.9223, "step": 835 }, { "epoch": 0.6682987759180614, "grad_norm": 1.7695375410960146, "learning_rate": 1e-05, "loss": 0.8801, "step": 836 }, { "epoch": 0.6690981763677242, "grad_norm": 1.3579372966834742, "learning_rate": 1e-05, "loss": 0.8714, "step": 837 }, { "epoch": 0.6698975768173869, "grad_norm": 1.5174930728786662, "learning_rate": 1e-05, "loss": 0.8513, "step": 838 }, { "epoch": 0.6706969772670497, "grad_norm": 1.5225177134174273, "learning_rate": 1e-05, "loss": 0.8947, "step": 839 }, { "epoch": 0.6714963777167124, "grad_norm": 1.773009293174373, "learning_rate": 1e-05, "loss": 0.8279, "step": 840 }, { "epoch": 0.6722957781663752, "grad_norm": 1.5784630095216696, "learning_rate": 1e-05, "loss": 0.8764, "step": 841 }, { "epoch": 0.673095178616038, "grad_norm": 1.4708285523723468, "learning_rate": 1e-05, "loss": 0.8816, "step": 842 }, { "epoch": 0.6738945790657007, "grad_norm": 1.43983591742943, "learning_rate": 1e-05, "loss": 0.9482, "step": 843 }, { "epoch": 0.6746939795153635, "grad_norm": 1.4485915743374498, "learning_rate": 1e-05, "loss": 0.9028, "step": 844 }, { "epoch": 0.6754933799650262, "grad_norm": 1.5016530521995441, "learning_rate": 1e-05, "loss": 0.8731, "step": 845 }, { "epoch": 0.676292780414689, "grad_norm": 1.3809441111375442, "learning_rate": 1e-05, "loss": 0.9214, "step": 846 }, { "epoch": 0.6770921808643517, "grad_norm": 1.7240170055604878, "learning_rate": 1e-05, "loss": 0.8947, "step": 847 }, { "epoch": 0.6778915813140145, "grad_norm": 1.3301304611766438, "learning_rate": 1e-05, "loss": 0.9231, "step": 848 }, { "epoch": 0.6786909817636773, "grad_norm": 1.4218727212100182, "learning_rate": 1e-05, "loss": 0.8962, "step": 849 }, { "epoch": 0.67949038221334, "grad_norm": 1.63010423786957, "learning_rate": 1e-05, "loss": 0.8939, "step": 850 }, { "epoch": 0.6802897826630028, "grad_norm": 1.4495140324549352, "learning_rate": 1e-05, "loss": 0.8875, "step": 851 }, { "epoch": 0.6810891831126655, "grad_norm": 1.5626000543974294, "learning_rate": 1e-05, "loss": 0.8814, "step": 852 }, { "epoch": 0.6818885835623283, "grad_norm": 1.5909709047210767, "learning_rate": 1e-05, "loss": 0.883, "step": 853 }, { "epoch": 0.682687984011991, "grad_norm": 1.388722303171786, "learning_rate": 1e-05, "loss": 0.8748, "step": 854 }, { "epoch": 0.6834873844616538, "grad_norm": 1.385369830792288, "learning_rate": 1e-05, "loss": 0.8989, "step": 855 }, { "epoch": 0.6842867849113166, "grad_norm": 1.4882389241813443, "learning_rate": 1e-05, "loss": 0.8844, "step": 856 }, { "epoch": 0.6850861853609793, "grad_norm": 1.5186240399620652, "learning_rate": 1e-05, "loss": 0.8171, "step": 857 }, { "epoch": 0.6858855858106421, "grad_norm": 1.6078033804533332, "learning_rate": 1e-05, "loss": 0.8521, "step": 858 }, { "epoch": 0.6866849862603047, "grad_norm": 1.5272879309131646, "learning_rate": 1e-05, "loss": 0.8721, "step": 859 }, { "epoch": 0.6874843867099675, "grad_norm": 1.3931816328350173, "learning_rate": 1e-05, "loss": 0.8932, "step": 860 }, { "epoch": 0.6882837871596302, "grad_norm": 1.907029791689304, "learning_rate": 1e-05, "loss": 0.8905, "step": 861 }, { "epoch": 0.689083187609293, "grad_norm": 1.556416461497499, "learning_rate": 1e-05, "loss": 0.8986, "step": 862 }, { "epoch": 0.6898825880589557, "grad_norm": 1.4869019645563188, "learning_rate": 1e-05, "loss": 0.877, "step": 863 }, { "epoch": 0.6906819885086185, "grad_norm": 1.3740940835208075, "learning_rate": 1e-05, "loss": 0.9277, "step": 864 }, { "epoch": 0.6914813889582813, "grad_norm": 1.4834340760108946, "learning_rate": 1e-05, "loss": 0.9176, "step": 865 }, { "epoch": 0.692280789407944, "grad_norm": 1.4499951936894326, "learning_rate": 1e-05, "loss": 0.8522, "step": 866 }, { "epoch": 0.6930801898576068, "grad_norm": 1.3889909352429337, "learning_rate": 1e-05, "loss": 0.8675, "step": 867 }, { "epoch": 0.6938795903072695, "grad_norm": 1.3995933987812776, "learning_rate": 1e-05, "loss": 0.8729, "step": 868 }, { "epoch": 0.6946789907569323, "grad_norm": 1.4764512256041193, "learning_rate": 1e-05, "loss": 0.8209, "step": 869 }, { "epoch": 0.695478391206595, "grad_norm": 1.465237648051072, "learning_rate": 1e-05, "loss": 0.8008, "step": 870 }, { "epoch": 0.6962777916562578, "grad_norm": 1.3466097106594175, "learning_rate": 1e-05, "loss": 0.8931, "step": 871 }, { "epoch": 0.6970771921059206, "grad_norm": 1.5104958792040775, "learning_rate": 1e-05, "loss": 0.8828, "step": 872 }, { "epoch": 0.6978765925555833, "grad_norm": 1.5720653267427949, "learning_rate": 1e-05, "loss": 0.9531, "step": 873 }, { "epoch": 0.6986759930052461, "grad_norm": 1.5492614550562422, "learning_rate": 1e-05, "loss": 0.9313, "step": 874 }, { "epoch": 0.6994753934549088, "grad_norm": 1.303038024217404, "learning_rate": 1e-05, "loss": 0.9034, "step": 875 }, { "epoch": 0.7002747939045716, "grad_norm": 1.4497112842693025, "learning_rate": 1e-05, "loss": 0.8917, "step": 876 }, { "epoch": 0.7010741943542343, "grad_norm": 1.530596911055762, "learning_rate": 1e-05, "loss": 0.8814, "step": 877 }, { "epoch": 0.7018735948038971, "grad_norm": 1.5261791959543383, "learning_rate": 1e-05, "loss": 0.8853, "step": 878 }, { "epoch": 0.7026729952535599, "grad_norm": 1.527060521262994, "learning_rate": 1e-05, "loss": 0.8882, "step": 879 }, { "epoch": 0.7034723957032226, "grad_norm": 1.4906207672568565, "learning_rate": 1e-05, "loss": 0.8723, "step": 880 }, { "epoch": 0.7042717961528854, "grad_norm": 1.480851718176504, "learning_rate": 1e-05, "loss": 0.8692, "step": 881 }, { "epoch": 0.7050711966025481, "grad_norm": 1.47139179353177, "learning_rate": 1e-05, "loss": 0.888, "step": 882 }, { "epoch": 0.7058705970522109, "grad_norm": 1.4278237515234393, "learning_rate": 1e-05, "loss": 0.9221, "step": 883 }, { "epoch": 0.7066699975018736, "grad_norm": 1.573532967010904, "learning_rate": 1e-05, "loss": 0.9087, "step": 884 }, { "epoch": 0.7074693979515363, "grad_norm": 1.590669913446065, "learning_rate": 1e-05, "loss": 0.8771, "step": 885 }, { "epoch": 0.708268798401199, "grad_norm": 1.5285176052901992, "learning_rate": 1e-05, "loss": 0.8884, "step": 886 }, { "epoch": 0.7090681988508618, "grad_norm": 1.5768609209939375, "learning_rate": 1e-05, "loss": 0.8715, "step": 887 }, { "epoch": 0.7098675993005246, "grad_norm": 1.7625584577995699, "learning_rate": 1e-05, "loss": 0.936, "step": 888 }, { "epoch": 0.7106669997501873, "grad_norm": 1.6615792785808772, "learning_rate": 1e-05, "loss": 0.8865, "step": 889 }, { "epoch": 0.7114664001998501, "grad_norm": 1.3836071347408263, "learning_rate": 1e-05, "loss": 0.861, "step": 890 }, { "epoch": 0.7122658006495128, "grad_norm": 1.5374171878390779, "learning_rate": 1e-05, "loss": 0.8533, "step": 891 }, { "epoch": 0.7130652010991756, "grad_norm": 1.4960191138124015, "learning_rate": 1e-05, "loss": 0.8971, "step": 892 }, { "epoch": 0.7138646015488384, "grad_norm": 1.3462286304870854, "learning_rate": 1e-05, "loss": 0.9002, "step": 893 }, { "epoch": 0.7146640019985011, "grad_norm": 1.516533149153394, "learning_rate": 1e-05, "loss": 0.8495, "step": 894 }, { "epoch": 0.7154634024481639, "grad_norm": 1.4741671333939332, "learning_rate": 1e-05, "loss": 0.8702, "step": 895 }, { "epoch": 0.7162628028978266, "grad_norm": 1.412230967356979, "learning_rate": 1e-05, "loss": 0.8839, "step": 896 }, { "epoch": 0.7170622033474894, "grad_norm": 1.508657424433702, "learning_rate": 1e-05, "loss": 0.9207, "step": 897 }, { "epoch": 0.7178616037971521, "grad_norm": 1.5335780024625871, "learning_rate": 1e-05, "loss": 0.9414, "step": 898 }, { "epoch": 0.7186610042468149, "grad_norm": 1.522192545285303, "learning_rate": 1e-05, "loss": 0.85, "step": 899 }, { "epoch": 0.7194604046964777, "grad_norm": 1.433190511112366, "learning_rate": 1e-05, "loss": 0.8603, "step": 900 }, { "epoch": 0.7202598051461404, "grad_norm": 1.505735858560805, "learning_rate": 1e-05, "loss": 0.9305, "step": 901 }, { "epoch": 0.7210592055958032, "grad_norm": 1.3709122596783658, "learning_rate": 1e-05, "loss": 0.9035, "step": 902 }, { "epoch": 0.7218586060454659, "grad_norm": 1.4784407355636868, "learning_rate": 1e-05, "loss": 0.8133, "step": 903 }, { "epoch": 0.7226580064951287, "grad_norm": 1.4139431509162406, "learning_rate": 1e-05, "loss": 0.8757, "step": 904 }, { "epoch": 0.7234574069447914, "grad_norm": 1.483920166289949, "learning_rate": 1e-05, "loss": 0.8908, "step": 905 }, { "epoch": 0.7242568073944542, "grad_norm": 1.355169839026166, "learning_rate": 1e-05, "loss": 0.8778, "step": 906 }, { "epoch": 0.725056207844117, "grad_norm": 1.5849754730542471, "learning_rate": 1e-05, "loss": 0.8126, "step": 907 }, { "epoch": 0.7258556082937797, "grad_norm": 1.4415392226295947, "learning_rate": 1e-05, "loss": 0.9533, "step": 908 }, { "epoch": 0.7266550087434425, "grad_norm": 1.423271400925077, "learning_rate": 1e-05, "loss": 0.8991, "step": 909 }, { "epoch": 0.7274544091931052, "grad_norm": 1.2581118411370464, "learning_rate": 1e-05, "loss": 0.8691, "step": 910 }, { "epoch": 0.7282538096427679, "grad_norm": 1.6042455117982117, "learning_rate": 1e-05, "loss": 0.9323, "step": 911 }, { "epoch": 0.7290532100924306, "grad_norm": 1.7219536250131735, "learning_rate": 1e-05, "loss": 0.9108, "step": 912 }, { "epoch": 0.7298526105420934, "grad_norm": 1.39448532764431, "learning_rate": 1e-05, "loss": 0.8465, "step": 913 }, { "epoch": 0.7306520109917561, "grad_norm": 1.3967526960492356, "learning_rate": 1e-05, "loss": 0.8673, "step": 914 }, { "epoch": 0.7314514114414189, "grad_norm": 1.6077994734490668, "learning_rate": 1e-05, "loss": 0.8955, "step": 915 }, { "epoch": 0.7322508118910817, "grad_norm": 1.3203640300504973, "learning_rate": 1e-05, "loss": 0.7997, "step": 916 }, { "epoch": 0.7330502123407444, "grad_norm": 1.4566518226470033, "learning_rate": 1e-05, "loss": 0.8296, "step": 917 }, { "epoch": 0.7338496127904072, "grad_norm": 1.7293187013351636, "learning_rate": 1e-05, "loss": 0.9021, "step": 918 }, { "epoch": 0.7346490132400699, "grad_norm": 1.5383747305896551, "learning_rate": 1e-05, "loss": 0.8973, "step": 919 }, { "epoch": 0.7354484136897327, "grad_norm": 1.4275975245981607, "learning_rate": 1e-05, "loss": 0.8612, "step": 920 }, { "epoch": 0.7362478141393954, "grad_norm": 1.472214485322947, "learning_rate": 1e-05, "loss": 0.9005, "step": 921 }, { "epoch": 0.7370472145890582, "grad_norm": 1.4170406969180516, "learning_rate": 1e-05, "loss": 0.8952, "step": 922 }, { "epoch": 0.737846615038721, "grad_norm": 1.4134994732170305, "learning_rate": 1e-05, "loss": 0.8427, "step": 923 }, { "epoch": 0.7386460154883837, "grad_norm": 1.5810245176397593, "learning_rate": 1e-05, "loss": 0.8873, "step": 924 }, { "epoch": 0.7394454159380465, "grad_norm": 1.572493026866151, "learning_rate": 1e-05, "loss": 0.8999, "step": 925 }, { "epoch": 0.7402448163877092, "grad_norm": 1.4558846312035074, "learning_rate": 1e-05, "loss": 0.9221, "step": 926 }, { "epoch": 0.741044216837372, "grad_norm": 1.41669477168302, "learning_rate": 1e-05, "loss": 0.8994, "step": 927 }, { "epoch": 0.7418436172870347, "grad_norm": 1.433461160216514, "learning_rate": 1e-05, "loss": 0.8749, "step": 928 }, { "epoch": 0.7426430177366975, "grad_norm": 1.5673163590141157, "learning_rate": 1e-05, "loss": 0.8586, "step": 929 }, { "epoch": 0.7434424181863603, "grad_norm": 1.4736635147050137, "learning_rate": 1e-05, "loss": 0.9211, "step": 930 }, { "epoch": 0.744241818636023, "grad_norm": 1.4647228645746486, "learning_rate": 1e-05, "loss": 0.8332, "step": 931 }, { "epoch": 0.7450412190856858, "grad_norm": 1.3876657153509906, "learning_rate": 1e-05, "loss": 0.8481, "step": 932 }, { "epoch": 0.7458406195353485, "grad_norm": 1.469313389155329, "learning_rate": 1e-05, "loss": 0.9234, "step": 933 }, { "epoch": 0.7466400199850113, "grad_norm": 1.415959193503077, "learning_rate": 1e-05, "loss": 0.8794, "step": 934 }, { "epoch": 0.747439420434674, "grad_norm": 1.4597571617980725, "learning_rate": 1e-05, "loss": 0.8565, "step": 935 }, { "epoch": 0.7482388208843368, "grad_norm": 1.5271437643331571, "learning_rate": 1e-05, "loss": 0.8826, "step": 936 }, { "epoch": 0.7490382213339996, "grad_norm": 1.4956114964893394, "learning_rate": 1e-05, "loss": 0.9085, "step": 937 }, { "epoch": 0.7498376217836622, "grad_norm": 1.4732612528806723, "learning_rate": 1e-05, "loss": 0.8247, "step": 938 }, { "epoch": 0.750637022233325, "grad_norm": 1.4787986640658028, "learning_rate": 1e-05, "loss": 0.9317, "step": 939 }, { "epoch": 0.7514364226829877, "grad_norm": 1.7252017457319206, "learning_rate": 1e-05, "loss": 0.8741, "step": 940 }, { "epoch": 0.7522358231326505, "grad_norm": 1.4487217007150137, "learning_rate": 1e-05, "loss": 0.8629, "step": 941 }, { "epoch": 0.7530352235823132, "grad_norm": 1.5157039585564798, "learning_rate": 1e-05, "loss": 0.897, "step": 942 }, { "epoch": 0.753834624031976, "grad_norm": 1.611412160953887, "learning_rate": 1e-05, "loss": 0.9021, "step": 943 }, { "epoch": 0.7546340244816387, "grad_norm": 1.4394146060850934, "learning_rate": 1e-05, "loss": 0.8281, "step": 944 }, { "epoch": 0.7554334249313015, "grad_norm": 1.453348907195491, "learning_rate": 1e-05, "loss": 0.8928, "step": 945 }, { "epoch": 0.7562328253809643, "grad_norm": 1.4907250315835585, "learning_rate": 1e-05, "loss": 0.7856, "step": 946 }, { "epoch": 0.757032225830627, "grad_norm": 1.49481328462233, "learning_rate": 1e-05, "loss": 0.9155, "step": 947 }, { "epoch": 0.7578316262802898, "grad_norm": 1.3751698030196142, "learning_rate": 1e-05, "loss": 0.9301, "step": 948 }, { "epoch": 0.7586310267299525, "grad_norm": 1.4444725328440537, "learning_rate": 1e-05, "loss": 0.8655, "step": 949 }, { "epoch": 0.7594304271796153, "grad_norm": 1.5456042887758088, "learning_rate": 1e-05, "loss": 0.8901, "step": 950 }, { "epoch": 0.760229827629278, "grad_norm": 1.4949432017846453, "learning_rate": 1e-05, "loss": 0.916, "step": 951 }, { "epoch": 0.7610292280789408, "grad_norm": 1.6011034169877894, "learning_rate": 1e-05, "loss": 0.891, "step": 952 }, { "epoch": 0.7618286285286036, "grad_norm": 1.3042950526088992, "learning_rate": 1e-05, "loss": 0.9446, "step": 953 }, { "epoch": 0.7626280289782663, "grad_norm": 1.5837586059992244, "learning_rate": 1e-05, "loss": 0.91, "step": 954 }, { "epoch": 0.7634274294279291, "grad_norm": 1.4339719484551816, "learning_rate": 1e-05, "loss": 0.9016, "step": 955 }, { "epoch": 0.7642268298775918, "grad_norm": 1.5025550156701537, "learning_rate": 1e-05, "loss": 0.879, "step": 956 }, { "epoch": 0.7650262303272546, "grad_norm": 1.5378372958159126, "learning_rate": 1e-05, "loss": 0.9063, "step": 957 }, { "epoch": 0.7658256307769173, "grad_norm": 1.5230827569900542, "learning_rate": 1e-05, "loss": 0.8989, "step": 958 }, { "epoch": 0.7666250312265801, "grad_norm": 1.5291223084053325, "learning_rate": 1e-05, "loss": 0.8616, "step": 959 }, { "epoch": 0.7674244316762429, "grad_norm": 1.4773136700451888, "learning_rate": 1e-05, "loss": 0.8424, "step": 960 }, { "epoch": 0.7682238321259056, "grad_norm": 1.2093245102672463, "learning_rate": 1e-05, "loss": 0.8848, "step": 961 }, { "epoch": 0.7690232325755684, "grad_norm": 1.8571716921307402, "learning_rate": 1e-05, "loss": 0.8495, "step": 962 }, { "epoch": 0.7698226330252311, "grad_norm": 1.4472646694433717, "learning_rate": 1e-05, "loss": 0.867, "step": 963 }, { "epoch": 0.7706220334748938, "grad_norm": 1.4580828263402077, "learning_rate": 1e-05, "loss": 0.9002, "step": 964 }, { "epoch": 0.7714214339245565, "grad_norm": 1.499450946544706, "learning_rate": 1e-05, "loss": 0.9073, "step": 965 }, { "epoch": 0.7722208343742193, "grad_norm": 1.4461364940439836, "learning_rate": 1e-05, "loss": 0.8778, "step": 966 }, { "epoch": 0.773020234823882, "grad_norm": 1.506316728494387, "learning_rate": 1e-05, "loss": 0.845, "step": 967 }, { "epoch": 0.7738196352735448, "grad_norm": 1.3561052135711964, "learning_rate": 1e-05, "loss": 0.8722, "step": 968 }, { "epoch": 0.7746190357232076, "grad_norm": 1.4017997594585556, "learning_rate": 1e-05, "loss": 0.8602, "step": 969 }, { "epoch": 0.7754184361728703, "grad_norm": 1.4673374430145514, "learning_rate": 1e-05, "loss": 0.8503, "step": 970 }, { "epoch": 0.7762178366225331, "grad_norm": 1.6150949805416606, "learning_rate": 1e-05, "loss": 0.8194, "step": 971 }, { "epoch": 0.7770172370721958, "grad_norm": 1.4293495610183653, "learning_rate": 1e-05, "loss": 0.8719, "step": 972 }, { "epoch": 0.7778166375218586, "grad_norm": 1.34711255646197, "learning_rate": 1e-05, "loss": 0.9134, "step": 973 }, { "epoch": 0.7786160379715213, "grad_norm": 1.486875510521667, "learning_rate": 1e-05, "loss": 0.8282, "step": 974 }, { "epoch": 0.7794154384211841, "grad_norm": 1.3975406817023381, "learning_rate": 1e-05, "loss": 0.8696, "step": 975 }, { "epoch": 0.7802148388708469, "grad_norm": 1.590401419774706, "learning_rate": 1e-05, "loss": 0.8639, "step": 976 }, { "epoch": 0.7810142393205096, "grad_norm": 1.6348383990486186, "learning_rate": 1e-05, "loss": 0.8747, "step": 977 }, { "epoch": 0.7818136397701724, "grad_norm": 1.316266061656018, "learning_rate": 1e-05, "loss": 0.8912, "step": 978 }, { "epoch": 0.7826130402198351, "grad_norm": 1.4071917948886756, "learning_rate": 1e-05, "loss": 0.8258, "step": 979 }, { "epoch": 0.7834124406694979, "grad_norm": 1.4469880221919649, "learning_rate": 1e-05, "loss": 0.8354, "step": 980 }, { "epoch": 0.7842118411191606, "grad_norm": 1.2870843290387057, "learning_rate": 1e-05, "loss": 0.895, "step": 981 }, { "epoch": 0.7850112415688234, "grad_norm": 1.3149196281524491, "learning_rate": 1e-05, "loss": 0.8512, "step": 982 }, { "epoch": 0.7858106420184862, "grad_norm": 1.4704086234102491, "learning_rate": 1e-05, "loss": 0.9012, "step": 983 }, { "epoch": 0.7866100424681489, "grad_norm": 1.3828361638550721, "learning_rate": 1e-05, "loss": 0.8971, "step": 984 }, { "epoch": 0.7874094429178117, "grad_norm": 1.6052079287723495, "learning_rate": 1e-05, "loss": 0.8577, "step": 985 }, { "epoch": 0.7882088433674744, "grad_norm": 1.6793204061607632, "learning_rate": 1e-05, "loss": 0.876, "step": 986 }, { "epoch": 0.7890082438171372, "grad_norm": 1.4036184553448683, "learning_rate": 1e-05, "loss": 0.8983, "step": 987 }, { "epoch": 0.7898076442668, "grad_norm": 1.3858819411819097, "learning_rate": 1e-05, "loss": 0.8535, "step": 988 }, { "epoch": 0.7906070447164627, "grad_norm": 1.549784899074943, "learning_rate": 1e-05, "loss": 0.8849, "step": 989 }, { "epoch": 0.7914064451661255, "grad_norm": 1.4777960666446712, "learning_rate": 1e-05, "loss": 0.8666, "step": 990 }, { "epoch": 0.7922058456157881, "grad_norm": 1.9201257825258455, "learning_rate": 1e-05, "loss": 0.8409, "step": 991 }, { "epoch": 0.7930052460654509, "grad_norm": 1.5498417440527896, "learning_rate": 1e-05, "loss": 0.8805, "step": 992 }, { "epoch": 0.7938046465151136, "grad_norm": 1.4317539743714072, "learning_rate": 1e-05, "loss": 0.8635, "step": 993 }, { "epoch": 0.7946040469647764, "grad_norm": 1.5358225135776136, "learning_rate": 1e-05, "loss": 0.8708, "step": 994 }, { "epoch": 0.7954034474144391, "grad_norm": 1.4138952398073754, "learning_rate": 1e-05, "loss": 0.9169, "step": 995 }, { "epoch": 0.7962028478641019, "grad_norm": 1.306882525453356, "learning_rate": 1e-05, "loss": 0.8839, "step": 996 }, { "epoch": 0.7970022483137646, "grad_norm": 1.4151969180638062, "learning_rate": 1e-05, "loss": 0.9003, "step": 997 }, { "epoch": 0.7978016487634274, "grad_norm": 1.275880598076204, "learning_rate": 1e-05, "loss": 0.8549, "step": 998 }, { "epoch": 0.7986010492130902, "grad_norm": 1.4001477420212065, "learning_rate": 1e-05, "loss": 0.8505, "step": 999 }, { "epoch": 0.7994004496627529, "grad_norm": 1.4186294121350504, "learning_rate": 1e-05, "loss": 0.9215, "step": 1000 }, { "epoch": 0.8001998501124157, "grad_norm": 1.2933825079861516, "learning_rate": 1e-05, "loss": 0.851, "step": 1001 }, { "epoch": 0.8009992505620784, "grad_norm": 1.3844838762102727, "learning_rate": 1e-05, "loss": 0.8491, "step": 1002 }, { "epoch": 0.8017986510117412, "grad_norm": 1.424315745643642, "learning_rate": 1e-05, "loss": 0.8568, "step": 1003 }, { "epoch": 0.802598051461404, "grad_norm": 1.536779666402137, "learning_rate": 1e-05, "loss": 0.8616, "step": 1004 }, { "epoch": 0.8033974519110667, "grad_norm": 1.5821296355398455, "learning_rate": 1e-05, "loss": 0.8527, "step": 1005 }, { "epoch": 0.8041968523607295, "grad_norm": 1.4455856617071001, "learning_rate": 1e-05, "loss": 0.8713, "step": 1006 }, { "epoch": 0.8049962528103922, "grad_norm": 1.2354619055674243, "learning_rate": 1e-05, "loss": 0.8993, "step": 1007 }, { "epoch": 0.805795653260055, "grad_norm": 1.6070375123923897, "learning_rate": 1e-05, "loss": 0.8743, "step": 1008 }, { "epoch": 0.8065950537097177, "grad_norm": 1.4541848794736738, "learning_rate": 1e-05, "loss": 0.8581, "step": 1009 }, { "epoch": 0.8073944541593805, "grad_norm": 1.4226597509638712, "learning_rate": 1e-05, "loss": 0.8627, "step": 1010 }, { "epoch": 0.8081938546090432, "grad_norm": 1.4541863541400335, "learning_rate": 1e-05, "loss": 0.895, "step": 1011 }, { "epoch": 0.808993255058706, "grad_norm": 1.5062531375485146, "learning_rate": 1e-05, "loss": 0.8703, "step": 1012 }, { "epoch": 0.8097926555083688, "grad_norm": 1.4276851237794737, "learning_rate": 1e-05, "loss": 0.9105, "step": 1013 }, { "epoch": 0.8105920559580315, "grad_norm": 1.4788273876522071, "learning_rate": 1e-05, "loss": 0.8367, "step": 1014 }, { "epoch": 0.8113914564076943, "grad_norm": 1.407145240499365, "learning_rate": 1e-05, "loss": 0.8145, "step": 1015 }, { "epoch": 0.812190856857357, "grad_norm": 1.8014438432061057, "learning_rate": 1e-05, "loss": 0.8819, "step": 1016 }, { "epoch": 0.8129902573070197, "grad_norm": 1.372209038359735, "learning_rate": 1e-05, "loss": 0.868, "step": 1017 }, { "epoch": 0.8137896577566824, "grad_norm": 1.5495856867435909, "learning_rate": 1e-05, "loss": 0.8626, "step": 1018 }, { "epoch": 0.8145890582063452, "grad_norm": 1.617836176498916, "learning_rate": 1e-05, "loss": 0.802, "step": 1019 }, { "epoch": 0.815388458656008, "grad_norm": 1.4721686410259016, "learning_rate": 1e-05, "loss": 0.8835, "step": 1020 }, { "epoch": 0.8161878591056707, "grad_norm": 1.5756457858641464, "learning_rate": 1e-05, "loss": 0.903, "step": 1021 }, { "epoch": 0.8169872595553335, "grad_norm": 1.5229101192600658, "learning_rate": 1e-05, "loss": 0.9111, "step": 1022 }, { "epoch": 0.8177866600049962, "grad_norm": 1.3993305196243857, "learning_rate": 1e-05, "loss": 0.8218, "step": 1023 }, { "epoch": 0.818586060454659, "grad_norm": 1.760678360532871, "learning_rate": 1e-05, "loss": 0.8942, "step": 1024 }, { "epoch": 0.8193854609043217, "grad_norm": 1.5000785334067135, "learning_rate": 1e-05, "loss": 0.8791, "step": 1025 }, { "epoch": 0.8201848613539845, "grad_norm": 1.6674955223352013, "learning_rate": 1e-05, "loss": 0.8626, "step": 1026 }, { "epoch": 0.8209842618036473, "grad_norm": 1.6318940473847319, "learning_rate": 1e-05, "loss": 0.9127, "step": 1027 }, { "epoch": 0.82178366225331, "grad_norm": 1.3538540553221186, "learning_rate": 1e-05, "loss": 0.8845, "step": 1028 }, { "epoch": 0.8225830627029728, "grad_norm": 1.6241141376741397, "learning_rate": 1e-05, "loss": 0.9048, "step": 1029 }, { "epoch": 0.8233824631526355, "grad_norm": 1.6076201928103848, "learning_rate": 1e-05, "loss": 0.911, "step": 1030 }, { "epoch": 0.8241818636022983, "grad_norm": 1.6514943246242055, "learning_rate": 1e-05, "loss": 0.9161, "step": 1031 }, { "epoch": 0.824981264051961, "grad_norm": 1.4757470594102153, "learning_rate": 1e-05, "loss": 0.8522, "step": 1032 }, { "epoch": 0.8257806645016238, "grad_norm": 1.6550497620549556, "learning_rate": 1e-05, "loss": 0.8821, "step": 1033 }, { "epoch": 0.8265800649512866, "grad_norm": 1.416433208923937, "learning_rate": 1e-05, "loss": 0.8757, "step": 1034 }, { "epoch": 0.8273794654009493, "grad_norm": 1.2778570567318692, "learning_rate": 1e-05, "loss": 0.9259, "step": 1035 }, { "epoch": 0.8281788658506121, "grad_norm": 1.5852330201994453, "learning_rate": 1e-05, "loss": 0.833, "step": 1036 }, { "epoch": 0.8289782663002748, "grad_norm": 1.5240272977432132, "learning_rate": 1e-05, "loss": 0.8842, "step": 1037 }, { "epoch": 0.8297776667499376, "grad_norm": 1.2798050681953308, "learning_rate": 1e-05, "loss": 0.8338, "step": 1038 }, { "epoch": 0.8305770671996003, "grad_norm": 1.5489648653630288, "learning_rate": 1e-05, "loss": 0.8501, "step": 1039 }, { "epoch": 0.8313764676492631, "grad_norm": 1.435906811134758, "learning_rate": 1e-05, "loss": 0.867, "step": 1040 }, { "epoch": 0.8321758680989259, "grad_norm": 1.4702097962874583, "learning_rate": 1e-05, "loss": 0.8378, "step": 1041 }, { "epoch": 0.8329752685485886, "grad_norm": 1.594108179583473, "learning_rate": 1e-05, "loss": 0.8765, "step": 1042 }, { "epoch": 0.8337746689982513, "grad_norm": 1.5200902751808993, "learning_rate": 1e-05, "loss": 0.877, "step": 1043 }, { "epoch": 0.834574069447914, "grad_norm": 1.438298229451603, "learning_rate": 1e-05, "loss": 0.8663, "step": 1044 }, { "epoch": 0.8353734698975768, "grad_norm": 1.454018494694832, "learning_rate": 1e-05, "loss": 0.8918, "step": 1045 }, { "epoch": 0.8361728703472395, "grad_norm": 1.5005502708015002, "learning_rate": 1e-05, "loss": 0.8373, "step": 1046 }, { "epoch": 0.8369722707969023, "grad_norm": 1.291500552799284, "learning_rate": 1e-05, "loss": 0.8418, "step": 1047 }, { "epoch": 0.837771671246565, "grad_norm": 1.464971545003121, "learning_rate": 1e-05, "loss": 0.8986, "step": 1048 }, { "epoch": 0.8385710716962278, "grad_norm": 1.3970450556504503, "learning_rate": 1e-05, "loss": 0.8829, "step": 1049 }, { "epoch": 0.8393704721458906, "grad_norm": 1.4873155145975965, "learning_rate": 1e-05, "loss": 0.9039, "step": 1050 }, { "epoch": 0.8401698725955533, "grad_norm": 1.4116749220099283, "learning_rate": 1e-05, "loss": 0.8392, "step": 1051 }, { "epoch": 0.8409692730452161, "grad_norm": 1.375469878009426, "learning_rate": 1e-05, "loss": 0.8463, "step": 1052 }, { "epoch": 0.8417686734948788, "grad_norm": 1.2984722488631455, "learning_rate": 1e-05, "loss": 0.8578, "step": 1053 }, { "epoch": 0.8425680739445416, "grad_norm": 1.377137242087716, "learning_rate": 1e-05, "loss": 0.8753, "step": 1054 }, { "epoch": 0.8433674743942043, "grad_norm": 1.513617649555769, "learning_rate": 1e-05, "loss": 0.9161, "step": 1055 }, { "epoch": 0.8441668748438671, "grad_norm": 1.3943343336302483, "learning_rate": 1e-05, "loss": 0.8984, "step": 1056 }, { "epoch": 0.8449662752935299, "grad_norm": 1.4829273034743662, "learning_rate": 1e-05, "loss": 0.9456, "step": 1057 }, { "epoch": 0.8457656757431926, "grad_norm": 1.300813156773205, "learning_rate": 1e-05, "loss": 0.8848, "step": 1058 }, { "epoch": 0.8465650761928554, "grad_norm": 1.6171681830366456, "learning_rate": 1e-05, "loss": 0.8825, "step": 1059 }, { "epoch": 0.8473644766425181, "grad_norm": 1.4746785824924948, "learning_rate": 1e-05, "loss": 0.8782, "step": 1060 }, { "epoch": 0.8481638770921809, "grad_norm": 1.4773421322909972, "learning_rate": 1e-05, "loss": 0.8511, "step": 1061 }, { "epoch": 0.8489632775418436, "grad_norm": 1.2712734121324603, "learning_rate": 1e-05, "loss": 0.8512, "step": 1062 }, { "epoch": 0.8497626779915064, "grad_norm": 1.4673347511942942, "learning_rate": 1e-05, "loss": 0.9082, "step": 1063 }, { "epoch": 0.8505620784411692, "grad_norm": 1.3648492502704213, "learning_rate": 1e-05, "loss": 0.8857, "step": 1064 }, { "epoch": 0.8513614788908319, "grad_norm": 1.4139141501508512, "learning_rate": 1e-05, "loss": 0.862, "step": 1065 }, { "epoch": 0.8521608793404947, "grad_norm": 1.477130924813934, "learning_rate": 1e-05, "loss": 0.9706, "step": 1066 }, { "epoch": 0.8529602797901574, "grad_norm": 1.4482250453004122, "learning_rate": 1e-05, "loss": 0.906, "step": 1067 }, { "epoch": 0.8537596802398202, "grad_norm": 1.522992168116749, "learning_rate": 1e-05, "loss": 0.9545, "step": 1068 }, { "epoch": 0.8545590806894829, "grad_norm": 1.5261726249592624, "learning_rate": 1e-05, "loss": 0.9238, "step": 1069 }, { "epoch": 0.8553584811391456, "grad_norm": 1.4267309300725217, "learning_rate": 1e-05, "loss": 0.8925, "step": 1070 }, { "epoch": 0.8561578815888083, "grad_norm": 1.4527959506992734, "learning_rate": 1e-05, "loss": 0.8994, "step": 1071 }, { "epoch": 0.8569572820384711, "grad_norm": 1.5030190804190187, "learning_rate": 1e-05, "loss": 0.9301, "step": 1072 }, { "epoch": 0.8577566824881339, "grad_norm": 1.2235265071589685, "learning_rate": 1e-05, "loss": 0.8509, "step": 1073 }, { "epoch": 0.8585560829377966, "grad_norm": 1.4074654499219896, "learning_rate": 1e-05, "loss": 0.889, "step": 1074 }, { "epoch": 0.8593554833874594, "grad_norm": 1.2944008535061877, "learning_rate": 1e-05, "loss": 0.8966, "step": 1075 }, { "epoch": 0.8601548838371221, "grad_norm": 1.3310895995556136, "learning_rate": 1e-05, "loss": 0.8138, "step": 1076 }, { "epoch": 0.8609542842867849, "grad_norm": 1.427408631037752, "learning_rate": 1e-05, "loss": 0.8852, "step": 1077 }, { "epoch": 0.8617536847364476, "grad_norm": 1.4692371711172514, "learning_rate": 1e-05, "loss": 0.8797, "step": 1078 }, { "epoch": 0.8625530851861104, "grad_norm": 1.541692282374257, "learning_rate": 1e-05, "loss": 0.8589, "step": 1079 }, { "epoch": 0.8633524856357732, "grad_norm": 1.5815910403079887, "learning_rate": 1e-05, "loss": 0.8625, "step": 1080 }, { "epoch": 0.8641518860854359, "grad_norm": 1.4959179824929254, "learning_rate": 1e-05, "loss": 0.9189, "step": 1081 }, { "epoch": 0.8649512865350987, "grad_norm": 1.5319580336293697, "learning_rate": 1e-05, "loss": 0.8267, "step": 1082 }, { "epoch": 0.8657506869847614, "grad_norm": 1.486040602833083, "learning_rate": 1e-05, "loss": 0.8625, "step": 1083 }, { "epoch": 0.8665500874344242, "grad_norm": 1.424254094608181, "learning_rate": 1e-05, "loss": 0.8943, "step": 1084 }, { "epoch": 0.867349487884087, "grad_norm": 1.643630623556634, "learning_rate": 1e-05, "loss": 0.9188, "step": 1085 }, { "epoch": 0.8681488883337497, "grad_norm": 1.452534027382345, "learning_rate": 1e-05, "loss": 0.88, "step": 1086 }, { "epoch": 0.8689482887834125, "grad_norm": 1.5479604306192913, "learning_rate": 1e-05, "loss": 0.7943, "step": 1087 }, { "epoch": 0.8697476892330752, "grad_norm": 1.363070121645927, "learning_rate": 1e-05, "loss": 0.8416, "step": 1088 }, { "epoch": 0.870547089682738, "grad_norm": 1.4551924760921788, "learning_rate": 1e-05, "loss": 0.8748, "step": 1089 }, { "epoch": 0.8713464901324007, "grad_norm": 1.8072081219985316, "learning_rate": 1e-05, "loss": 0.8955, "step": 1090 }, { "epoch": 0.8721458905820635, "grad_norm": 1.480345916799882, "learning_rate": 1e-05, "loss": 0.8984, "step": 1091 }, { "epoch": 0.8729452910317262, "grad_norm": 1.513951237735827, "learning_rate": 1e-05, "loss": 0.8573, "step": 1092 }, { "epoch": 0.873744691481389, "grad_norm": 1.4012284095364107, "learning_rate": 1e-05, "loss": 0.8448, "step": 1093 }, { "epoch": 0.8745440919310518, "grad_norm": 1.4316182193855909, "learning_rate": 1e-05, "loss": 0.8778, "step": 1094 }, { "epoch": 0.8753434923807145, "grad_norm": 1.4767564148326937, "learning_rate": 1e-05, "loss": 0.8769, "step": 1095 }, { "epoch": 0.8761428928303772, "grad_norm": 1.4996522495810245, "learning_rate": 1e-05, "loss": 0.8928, "step": 1096 }, { "epoch": 0.8769422932800399, "grad_norm": 1.4836247078704627, "learning_rate": 1e-05, "loss": 0.8657, "step": 1097 }, { "epoch": 0.8777416937297027, "grad_norm": 1.4148846725052078, "learning_rate": 1e-05, "loss": 0.8763, "step": 1098 }, { "epoch": 0.8785410941793654, "grad_norm": 1.8564930284795111, "learning_rate": 1e-05, "loss": 0.8362, "step": 1099 }, { "epoch": 0.8793404946290282, "grad_norm": 1.3887316999375894, "learning_rate": 1e-05, "loss": 0.8692, "step": 1100 }, { "epoch": 0.880139895078691, "grad_norm": 4.721320996449426, "learning_rate": 1e-05, "loss": 0.8951, "step": 1101 }, { "epoch": 0.8809392955283537, "grad_norm": 1.801959602001512, "learning_rate": 1e-05, "loss": 0.9127, "step": 1102 }, { "epoch": 0.8817386959780165, "grad_norm": 1.4999059990761596, "learning_rate": 1e-05, "loss": 0.8277, "step": 1103 }, { "epoch": 0.8825380964276792, "grad_norm": 1.580749043430391, "learning_rate": 1e-05, "loss": 0.8532, "step": 1104 }, { "epoch": 0.883337496877342, "grad_norm": 4.723585804015321, "learning_rate": 1e-05, "loss": 0.8846, "step": 1105 }, { "epoch": 0.8841368973270047, "grad_norm": 10.533804682370834, "learning_rate": 1e-05, "loss": 0.915, "step": 1106 }, { "epoch": 0.8849362977766675, "grad_norm": 2.7074849652786948, "learning_rate": 1e-05, "loss": 0.8588, "step": 1107 }, { "epoch": 0.8857356982263302, "grad_norm": 1.8291875456761892, "learning_rate": 1e-05, "loss": 0.8274, "step": 1108 }, { "epoch": 0.886535098675993, "grad_norm": 4.390203546717027, "learning_rate": 1e-05, "loss": 0.908, "step": 1109 }, { "epoch": 0.8873344991256558, "grad_norm": 3.8103014314112156, "learning_rate": 1e-05, "loss": 0.9094, "step": 1110 }, { "epoch": 0.8881338995753185, "grad_norm": 455.28682152314866, "learning_rate": 1e-05, "loss": 0.8881, "step": 1111 }, { "epoch": 0.8889333000249813, "grad_norm": 4.4477324217626295, "learning_rate": 1e-05, "loss": 0.891, "step": 1112 }, { "epoch": 0.889732700474644, "grad_norm": 1.3884130302591122, "learning_rate": 1e-05, "loss": 0.8485, "step": 1113 }, { "epoch": 0.8905321009243068, "grad_norm": 1.4938176798235159, "learning_rate": 1e-05, "loss": 0.8438, "step": 1114 }, { "epoch": 0.8913315013739695, "grad_norm": 1.5434085929606869, "learning_rate": 1e-05, "loss": 0.8977, "step": 1115 }, { "epoch": 0.8921309018236323, "grad_norm": 1.3286197641197046, "learning_rate": 1e-05, "loss": 0.8355, "step": 1116 }, { "epoch": 0.8929303022732951, "grad_norm": 1.4646146883912168, "learning_rate": 1e-05, "loss": 0.8735, "step": 1117 }, { "epoch": 0.8937297027229578, "grad_norm": 8.122892577298567, "learning_rate": 1e-05, "loss": 0.9016, "step": 1118 }, { "epoch": 0.8945291031726206, "grad_norm": 1.4376773752975496, "learning_rate": 1e-05, "loss": 0.913, "step": 1119 }, { "epoch": 0.8953285036222833, "grad_norm": 2.146749128485352, "learning_rate": 1e-05, "loss": 0.8643, "step": 1120 }, { "epoch": 0.8961279040719461, "grad_norm": 1.7549423766927372, "learning_rate": 1e-05, "loss": 0.8559, "step": 1121 }, { "epoch": 0.8969273045216088, "grad_norm": 1.6784215753386844, "learning_rate": 1e-05, "loss": 0.8467, "step": 1122 }, { "epoch": 0.8977267049712715, "grad_norm": 1.5205155953208587, "learning_rate": 1e-05, "loss": 0.8918, "step": 1123 }, { "epoch": 0.8985261054209343, "grad_norm": 1.439844948580554, "learning_rate": 1e-05, "loss": 0.846, "step": 1124 }, { "epoch": 0.899325505870597, "grad_norm": 1.3494413427515104, "learning_rate": 1e-05, "loss": 0.8701, "step": 1125 }, { "epoch": 0.9001249063202598, "grad_norm": 1.4785482334232822, "learning_rate": 1e-05, "loss": 0.8762, "step": 1126 }, { "epoch": 0.9009243067699225, "grad_norm": 1.6204723133056338, "learning_rate": 1e-05, "loss": 0.8618, "step": 1127 }, { "epoch": 0.9017237072195853, "grad_norm": 1.5410838002577578, "learning_rate": 1e-05, "loss": 0.8352, "step": 1128 }, { "epoch": 0.902523107669248, "grad_norm": 1.408368948793772, "learning_rate": 1e-05, "loss": 0.8143, "step": 1129 }, { "epoch": 0.9033225081189108, "grad_norm": 1.3840905876298821, "learning_rate": 1e-05, "loss": 0.8497, "step": 1130 }, { "epoch": 0.9041219085685736, "grad_norm": 1.7756397607717793, "learning_rate": 1e-05, "loss": 0.9065, "step": 1131 }, { "epoch": 0.9049213090182363, "grad_norm": 1.3699433150113711, "learning_rate": 1e-05, "loss": 0.8455, "step": 1132 }, { "epoch": 0.9057207094678991, "grad_norm": 1.4303100795006611, "learning_rate": 1e-05, "loss": 0.8574, "step": 1133 }, { "epoch": 0.9065201099175618, "grad_norm": 1.3913259705586178, "learning_rate": 1e-05, "loss": 0.8615, "step": 1134 }, { "epoch": 0.9073195103672246, "grad_norm": 1.4143231716945688, "learning_rate": 1e-05, "loss": 0.9084, "step": 1135 }, { "epoch": 0.9081189108168873, "grad_norm": 1.3947073651825206, "learning_rate": 1e-05, "loss": 0.8926, "step": 1136 }, { "epoch": 0.9089183112665501, "grad_norm": 1.415175153929991, "learning_rate": 1e-05, "loss": 0.8153, "step": 1137 }, { "epoch": 0.9097177117162129, "grad_norm": 1.3554176947555092, "learning_rate": 1e-05, "loss": 0.9018, "step": 1138 }, { "epoch": 0.9105171121658756, "grad_norm": 1.3676437829569341, "learning_rate": 1e-05, "loss": 0.8339, "step": 1139 }, { "epoch": 0.9113165126155384, "grad_norm": 1.2248182189476722, "learning_rate": 1e-05, "loss": 0.8865, "step": 1140 }, { "epoch": 0.9121159130652011, "grad_norm": 1.5759658066895652, "learning_rate": 1e-05, "loss": 0.8736, "step": 1141 }, { "epoch": 0.9129153135148639, "grad_norm": 1.2978962092251058, "learning_rate": 1e-05, "loss": 0.9114, "step": 1142 }, { "epoch": 0.9137147139645266, "grad_norm": 1.3829867839638308, "learning_rate": 1e-05, "loss": 0.8636, "step": 1143 }, { "epoch": 0.9145141144141894, "grad_norm": 1.4786554185436886, "learning_rate": 1e-05, "loss": 0.9103, "step": 1144 }, { "epoch": 0.9153135148638522, "grad_norm": 1.5247139561552725, "learning_rate": 1e-05, "loss": 0.8403, "step": 1145 }, { "epoch": 0.9161129153135149, "grad_norm": 1.3888872413761024, "learning_rate": 1e-05, "loss": 0.8457, "step": 1146 }, { "epoch": 0.9169123157631777, "grad_norm": 1.426097741347822, "learning_rate": 1e-05, "loss": 0.8769, "step": 1147 }, { "epoch": 0.9177117162128404, "grad_norm": 1.5560059286195493, "learning_rate": 1e-05, "loss": 0.864, "step": 1148 }, { "epoch": 0.9185111166625031, "grad_norm": 1.4406955993681905, "learning_rate": 1e-05, "loss": 0.8668, "step": 1149 }, { "epoch": 0.9193105171121658, "grad_norm": 1.489597707567999, "learning_rate": 1e-05, "loss": 0.855, "step": 1150 }, { "epoch": 0.9201099175618286, "grad_norm": 1.379157014673917, "learning_rate": 1e-05, "loss": 0.935, "step": 1151 }, { "epoch": 0.9209093180114913, "grad_norm": 1.4949181541382415, "learning_rate": 1e-05, "loss": 0.8664, "step": 1152 }, { "epoch": 0.9217087184611541, "grad_norm": 1.4980233869730157, "learning_rate": 1e-05, "loss": 0.8224, "step": 1153 }, { "epoch": 0.9225081189108169, "grad_norm": 1.4050924624234455, "learning_rate": 1e-05, "loss": 0.891, "step": 1154 }, { "epoch": 0.9233075193604796, "grad_norm": 1.6467240441672264, "learning_rate": 1e-05, "loss": 0.834, "step": 1155 }, { "epoch": 0.9241069198101424, "grad_norm": 1.3421364569781595, "learning_rate": 1e-05, "loss": 0.8436, "step": 1156 }, { "epoch": 0.9249063202598051, "grad_norm": 1.246062501997166, "learning_rate": 1e-05, "loss": 0.869, "step": 1157 }, { "epoch": 0.9257057207094679, "grad_norm": 1.4858806518325938, "learning_rate": 1e-05, "loss": 0.9168, "step": 1158 }, { "epoch": 0.9265051211591306, "grad_norm": 1.4777896246461322, "learning_rate": 1e-05, "loss": 0.8593, "step": 1159 }, { "epoch": 0.9273045216087934, "grad_norm": 1.4266934813336434, "learning_rate": 1e-05, "loss": 0.8943, "step": 1160 }, { "epoch": 0.9281039220584562, "grad_norm": 1.4729608886697982, "learning_rate": 1e-05, "loss": 0.8981, "step": 1161 }, { "epoch": 0.9289033225081189, "grad_norm": 1.4856931952636183, "learning_rate": 1e-05, "loss": 0.8623, "step": 1162 }, { "epoch": 0.9297027229577817, "grad_norm": 1.4313714774475765, "learning_rate": 1e-05, "loss": 0.8441, "step": 1163 }, { "epoch": 0.9305021234074444, "grad_norm": 1.3975576369260547, "learning_rate": 1e-05, "loss": 0.8337, "step": 1164 }, { "epoch": 0.9313015238571072, "grad_norm": 1.4706185165998424, "learning_rate": 1e-05, "loss": 0.8336, "step": 1165 }, { "epoch": 0.9321009243067699, "grad_norm": 1.3837948320627937, "learning_rate": 1e-05, "loss": 0.8741, "step": 1166 }, { "epoch": 0.9329003247564327, "grad_norm": 1.3855675072168605, "learning_rate": 1e-05, "loss": 0.9235, "step": 1167 }, { "epoch": 0.9336997252060955, "grad_norm": 1.5034589343394933, "learning_rate": 1e-05, "loss": 0.8267, "step": 1168 }, { "epoch": 0.9344991256557582, "grad_norm": 1.5081619715031618, "learning_rate": 1e-05, "loss": 0.8912, "step": 1169 }, { "epoch": 0.935298526105421, "grad_norm": 1.570365541340616, "learning_rate": 1e-05, "loss": 0.8589, "step": 1170 }, { "epoch": 0.9360979265550837, "grad_norm": 1.368058151600139, "learning_rate": 1e-05, "loss": 0.8669, "step": 1171 }, { "epoch": 0.9368973270047465, "grad_norm": 1.4254090126900538, "learning_rate": 1e-05, "loss": 0.8997, "step": 1172 }, { "epoch": 0.9376967274544092, "grad_norm": 1.4563379308659208, "learning_rate": 1e-05, "loss": 0.8378, "step": 1173 }, { "epoch": 0.938496127904072, "grad_norm": 1.480841767300247, "learning_rate": 1e-05, "loss": 0.8428, "step": 1174 }, { "epoch": 0.9392955283537348, "grad_norm": 1.4441769797776909, "learning_rate": 1e-05, "loss": 0.8308, "step": 1175 }, { "epoch": 0.9400949288033974, "grad_norm": 1.5331892724720704, "learning_rate": 1e-05, "loss": 0.8733, "step": 1176 }, { "epoch": 0.9408943292530602, "grad_norm": 1.3897737412131999, "learning_rate": 1e-05, "loss": 0.8255, "step": 1177 }, { "epoch": 0.9416937297027229, "grad_norm": 1.4328437965242162, "learning_rate": 1e-05, "loss": 0.842, "step": 1178 }, { "epoch": 0.9424931301523857, "grad_norm": 1.1601633219334695, "learning_rate": 1e-05, "loss": 0.8528, "step": 1179 }, { "epoch": 0.9432925306020484, "grad_norm": 1.3663250801686486, "learning_rate": 1e-05, "loss": 0.8325, "step": 1180 }, { "epoch": 0.9440919310517112, "grad_norm": 1.4998097326159285, "learning_rate": 1e-05, "loss": 0.9421, "step": 1181 }, { "epoch": 0.9448913315013739, "grad_norm": 1.4608832729340682, "learning_rate": 1e-05, "loss": 0.8508, "step": 1182 }, { "epoch": 0.9456907319510367, "grad_norm": 1.562661791032361, "learning_rate": 1e-05, "loss": 0.9003, "step": 1183 }, { "epoch": 0.9464901324006995, "grad_norm": 1.4455704359698196, "learning_rate": 1e-05, "loss": 0.964, "step": 1184 }, { "epoch": 0.9472895328503622, "grad_norm": 1.5776281729460202, "learning_rate": 1e-05, "loss": 0.8575, "step": 1185 }, { "epoch": 0.948088933300025, "grad_norm": 1.549921877625713, "learning_rate": 1e-05, "loss": 0.8504, "step": 1186 }, { "epoch": 0.9488883337496877, "grad_norm": 1.3389592770549843, "learning_rate": 1e-05, "loss": 0.8488, "step": 1187 }, { "epoch": 0.9496877341993505, "grad_norm": 1.4429492254870946, "learning_rate": 1e-05, "loss": 0.8246, "step": 1188 }, { "epoch": 0.9504871346490132, "grad_norm": 1.4238306426926814, "learning_rate": 1e-05, "loss": 0.8696, "step": 1189 }, { "epoch": 0.951286535098676, "grad_norm": 1.4646305779731619, "learning_rate": 1e-05, "loss": 0.8502, "step": 1190 }, { "epoch": 0.9520859355483388, "grad_norm": 1.494347031973423, "learning_rate": 1e-05, "loss": 0.8642, "step": 1191 }, { "epoch": 0.9528853359980015, "grad_norm": 1.963685019515452, "learning_rate": 1e-05, "loss": 0.8507, "step": 1192 }, { "epoch": 0.9536847364476643, "grad_norm": 1.3925026944755527, "learning_rate": 1e-05, "loss": 0.833, "step": 1193 }, { "epoch": 0.954484136897327, "grad_norm": 1.4062902940189372, "learning_rate": 1e-05, "loss": 0.9028, "step": 1194 }, { "epoch": 0.9552835373469898, "grad_norm": 1.2343971080574194, "learning_rate": 1e-05, "loss": 0.8522, "step": 1195 }, { "epoch": 0.9560829377966525, "grad_norm": 1.4221098313944995, "learning_rate": 1e-05, "loss": 0.8577, "step": 1196 }, { "epoch": 0.9568823382463153, "grad_norm": 1.5290533732550755, "learning_rate": 1e-05, "loss": 0.8093, "step": 1197 }, { "epoch": 0.9576817386959781, "grad_norm": 1.3961174339920084, "learning_rate": 1e-05, "loss": 0.8647, "step": 1198 }, { "epoch": 0.9584811391456408, "grad_norm": 1.4151475464959773, "learning_rate": 1e-05, "loss": 0.8868, "step": 1199 }, { "epoch": 0.9592805395953036, "grad_norm": 1.513441275615894, "learning_rate": 1e-05, "loss": 0.8647, "step": 1200 }, { "epoch": 0.9600799400449663, "grad_norm": 1.3820417006090109, "learning_rate": 1e-05, "loss": 0.8477, "step": 1201 }, { "epoch": 0.960879340494629, "grad_norm": 1.4387974434664792, "learning_rate": 1e-05, "loss": 0.8536, "step": 1202 }, { "epoch": 0.9616787409442917, "grad_norm": 1.5784176967006853, "learning_rate": 1e-05, "loss": 0.8778, "step": 1203 }, { "epoch": 0.9624781413939545, "grad_norm": 1.4269915386314171, "learning_rate": 1e-05, "loss": 0.8572, "step": 1204 }, { "epoch": 0.9632775418436172, "grad_norm": 1.3866388696845584, "learning_rate": 1e-05, "loss": 0.8086, "step": 1205 }, { "epoch": 0.96407694229328, "grad_norm": 1.432076302146608, "learning_rate": 1e-05, "loss": 0.8454, "step": 1206 }, { "epoch": 0.9648763427429428, "grad_norm": 1.4992577974774581, "learning_rate": 1e-05, "loss": 0.7908, "step": 1207 }, { "epoch": 0.9656757431926055, "grad_norm": 1.497039314194387, "learning_rate": 1e-05, "loss": 0.8544, "step": 1208 }, { "epoch": 0.9664751436422683, "grad_norm": 1.3007974080201803, "learning_rate": 1e-05, "loss": 0.8477, "step": 1209 }, { "epoch": 0.967274544091931, "grad_norm": 1.5618516258742383, "learning_rate": 1e-05, "loss": 0.835, "step": 1210 }, { "epoch": 0.9680739445415938, "grad_norm": 1.4210670398569833, "learning_rate": 1e-05, "loss": 0.832, "step": 1211 }, { "epoch": 0.9688733449912565, "grad_norm": 1.5510313623384935, "learning_rate": 1e-05, "loss": 0.8602, "step": 1212 }, { "epoch": 0.9696727454409193, "grad_norm": 1.521288522133268, "learning_rate": 1e-05, "loss": 0.8861, "step": 1213 }, { "epoch": 0.9704721458905821, "grad_norm": 1.5884079297863427, "learning_rate": 1e-05, "loss": 0.8258, "step": 1214 }, { "epoch": 0.9712715463402448, "grad_norm": 1.3385008591661527, "learning_rate": 1e-05, "loss": 0.8272, "step": 1215 }, { "epoch": 0.9720709467899076, "grad_norm": 1.3382297608246647, "learning_rate": 1e-05, "loss": 0.8984, "step": 1216 }, { "epoch": 0.9728703472395703, "grad_norm": 1.548407496139496, "learning_rate": 1e-05, "loss": 0.8649, "step": 1217 }, { "epoch": 0.9736697476892331, "grad_norm": 1.336053175129197, "learning_rate": 1e-05, "loss": 0.8958, "step": 1218 }, { "epoch": 0.9744691481388958, "grad_norm": 1.3748017255834115, "learning_rate": 1e-05, "loss": 0.8486, "step": 1219 }, { "epoch": 0.9752685485885586, "grad_norm": 1.5234383744628233, "learning_rate": 1e-05, "loss": 0.8617, "step": 1220 }, { "epoch": 0.9760679490382214, "grad_norm": 1.4764432977833921, "learning_rate": 1e-05, "loss": 0.9367, "step": 1221 }, { "epoch": 0.9768673494878841, "grad_norm": 1.3631292544649363, "learning_rate": 1e-05, "loss": 0.8714, "step": 1222 }, { "epoch": 0.9776667499375469, "grad_norm": 1.3171008529103865, "learning_rate": 1e-05, "loss": 0.8285, "step": 1223 }, { "epoch": 0.9784661503872096, "grad_norm": 1.4354745441705121, "learning_rate": 1e-05, "loss": 0.9037, "step": 1224 }, { "epoch": 0.9792655508368724, "grad_norm": 1.3919378193960412, "learning_rate": 1e-05, "loss": 0.9309, "step": 1225 }, { "epoch": 0.9800649512865351, "grad_norm": 1.4461454394492737, "learning_rate": 1e-05, "loss": 0.8928, "step": 1226 }, { "epoch": 0.9808643517361979, "grad_norm": 1.3724038374747247, "learning_rate": 1e-05, "loss": 0.9014, "step": 1227 }, { "epoch": 0.9816637521858605, "grad_norm": 1.351928124821094, "learning_rate": 1e-05, "loss": 0.8343, "step": 1228 }, { "epoch": 0.9824631526355233, "grad_norm": 1.3143104444611924, "learning_rate": 1e-05, "loss": 0.8804, "step": 1229 }, { "epoch": 0.9832625530851861, "grad_norm": 1.5074208283788533, "learning_rate": 1e-05, "loss": 0.8708, "step": 1230 }, { "epoch": 0.9840619535348488, "grad_norm": 1.4675362219576862, "learning_rate": 1e-05, "loss": 0.8545, "step": 1231 }, { "epoch": 0.9848613539845116, "grad_norm": 1.4044134991072301, "learning_rate": 1e-05, "loss": 0.852, "step": 1232 }, { "epoch": 0.9856607544341743, "grad_norm": 1.4731748400546958, "learning_rate": 1e-05, "loss": 0.9222, "step": 1233 }, { "epoch": 0.9864601548838371, "grad_norm": 1.4128661942086913, "learning_rate": 1e-05, "loss": 0.8997, "step": 1234 }, { "epoch": 0.9872595553334998, "grad_norm": 1.4368853581391632, "learning_rate": 1e-05, "loss": 0.8672, "step": 1235 }, { "epoch": 0.9880589557831626, "grad_norm": 1.453673257213547, "learning_rate": 1e-05, "loss": 0.8779, "step": 1236 }, { "epoch": 0.9888583562328254, "grad_norm": 1.7470099861196207, "learning_rate": 1e-05, "loss": 0.9028, "step": 1237 }, { "epoch": 0.9896577566824881, "grad_norm": 1.2697243063535835, "learning_rate": 1e-05, "loss": 0.8677, "step": 1238 }, { "epoch": 0.9904571571321509, "grad_norm": 1.5282634647109214, "learning_rate": 1e-05, "loss": 0.8824, "step": 1239 }, { "epoch": 0.9912565575818136, "grad_norm": 1.5236456464951182, "learning_rate": 1e-05, "loss": 0.875, "step": 1240 }, { "epoch": 0.9920559580314764, "grad_norm": 1.2831857679108445, "learning_rate": 1e-05, "loss": 0.9118, "step": 1241 }, { "epoch": 0.9928553584811391, "grad_norm": 1.4427270743757334, "learning_rate": 1e-05, "loss": 0.8488, "step": 1242 }, { "epoch": 0.9936547589308019, "grad_norm": 1.6145144060086711, "learning_rate": 1e-05, "loss": 0.8594, "step": 1243 }, { "epoch": 0.9944541593804647, "grad_norm": 1.5536788191330388, "learning_rate": 1e-05, "loss": 0.8736, "step": 1244 }, { "epoch": 0.9952535598301274, "grad_norm": 1.488891430752203, "learning_rate": 1e-05, "loss": 0.8824, "step": 1245 }, { "epoch": 0.9960529602797902, "grad_norm": 1.7670913427025423, "learning_rate": 1e-05, "loss": 0.8481, "step": 1246 }, { "epoch": 0.9968523607294529, "grad_norm": 1.4017507511502658, "learning_rate": 1e-05, "loss": 0.8422, "step": 1247 }, { "epoch": 0.9976517611791157, "grad_norm": 1.3372936110607956, "learning_rate": 1e-05, "loss": 0.842, "step": 1248 }, { "epoch": 0.9984511616287784, "grad_norm": 1.3328353321262152, "learning_rate": 1e-05, "loss": 0.8982, "step": 1249 }, { "epoch": 0.9992505620784412, "grad_norm": 1.4055115515472896, "learning_rate": 1e-05, "loss": 0.8433, "step": 1250 }, { "epoch": 0.9992505620784412, "step": 1250, "total_flos": 826404337876992.0, "train_loss": 0.9163284823417663, "train_runtime": 166824.9366, "train_samples_per_second": 0.48, "train_steps_per_second": 0.007 } ], "logging_steps": 1.0, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 826404337876992.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }