diff --git "a/checkpoint-100000/trainer_state.json" "b/checkpoint-100000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-100000/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9771827820393805, + "eval_steps": 500, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009771827820393806, + "grad_norm": 0.5417118072509766, + "learning_rate": 4.995602247740044e-05, + "loss": 1.378, + "step": 100 + }, + { + "epoch": 0.001954365564078761, + "grad_norm": 0.6493918895721436, + "learning_rate": 4.990715856340093e-05, + "loss": 1.3304, + "step": 200 + }, + { + "epoch": 0.0029315483461181415, + "grad_norm": 0.9062462449073792, + "learning_rate": 4.9858294649401425e-05, + "loss": 1.3284, + "step": 300 + }, + { + "epoch": 0.003908731128157522, + "grad_norm": 0.750052273273468, + "learning_rate": 4.9809430735401906e-05, + "loss": 1.3166, + "step": 400 + }, + { + "epoch": 0.004885913910196903, + "grad_norm": 0.6602022051811218, + "learning_rate": 4.97605668214024e-05, + "loss": 1.3166, + "step": 500 + }, + { + "epoch": 0.005863096692236283, + "grad_norm": 0.4193927049636841, + "learning_rate": 4.971170290740288e-05, + "loss": 1.3098, + "step": 600 + }, + { + "epoch": 0.006840279474275663, + "grad_norm": 0.6095415949821472, + "learning_rate": 4.966283899340338e-05, + "loss": 1.3103, + "step": 700 + }, + { + "epoch": 0.007817462256315045, + "grad_norm": 0.9943467378616333, + "learning_rate": 4.9613975079403865e-05, + "loss": 1.3096, + "step": 800 + }, + { + "epoch": 0.008794645038354424, + "grad_norm": 1.2263585329055786, + "learning_rate": 4.9565111165404346e-05, + "loss": 1.3067, + "step": 900 + }, + { + "epoch": 0.009771827820393805, + "grad_norm": 0.7198677659034729, + "learning_rate": 4.951624725140484e-05, + "loss": 1.3041, + "step": 1000 + }, + { + "epoch": 0.010749010602433185, + "grad_norm": 0.7370775938034058, + "learning_rate": 4.946738333740533e-05, + "loss": 1.302, + "step": 1100 + }, + { + "epoch": 0.011726193384472566, + "grad_norm": 0.5109437704086304, + "learning_rate": 4.941851942340582e-05, + "loss": 1.3089, + "step": 1200 + }, + { + "epoch": 0.012703376166511945, + "grad_norm": 0.1879555583000183, + "learning_rate": 4.9369655509406305e-05, + "loss": 1.3043, + "step": 1300 + }, + { + "epoch": 0.013680558948551327, + "grad_norm": 0.951046884059906, + "learning_rate": 4.932079159540679e-05, + "loss": 1.3098, + "step": 1400 + }, + { + "epoch": 0.014657741730590706, + "grad_norm": 0.2478829026222229, + "learning_rate": 4.927192768140728e-05, + "loss": 1.3026, + "step": 1500 + }, + { + "epoch": 0.01563492451263009, + "grad_norm": 0.5585843324661255, + "learning_rate": 4.9223063767407776e-05, + "loss": 1.3014, + "step": 1600 + }, + { + "epoch": 0.016612107294669467, + "grad_norm": 0.48532453179359436, + "learning_rate": 4.917419985340826e-05, + "loss": 1.2981, + "step": 1700 + }, + { + "epoch": 0.017589290076708848, + "grad_norm": 0.4233573079109192, + "learning_rate": 4.912533593940875e-05, + "loss": 1.2992, + "step": 1800 + }, + { + "epoch": 0.01856647285874823, + "grad_norm": 0.3272475600242615, + "learning_rate": 4.9076472025409234e-05, + "loss": 1.292, + "step": 1900 + }, + { + "epoch": 0.01954365564078761, + "grad_norm": 0.5299385786056519, + "learning_rate": 4.902760811140973e-05, + "loss": 1.2963, + "step": 2000 + }, + { + "epoch": 0.02052083842282699, + "grad_norm": 0.1614024043083191, + 
"learning_rate": 4.8978744197410216e-05, + "loss": 1.2945, + "step": 2100 + }, + { + "epoch": 0.02149802120486637, + "grad_norm": 0.6039963960647583, + "learning_rate": 4.8929880283410705e-05, + "loss": 1.2913, + "step": 2200 + }, + { + "epoch": 0.02247520398690575, + "grad_norm": 0.5772804021835327, + "learning_rate": 4.888101636941119e-05, + "loss": 1.2895, + "step": 2300 + }, + { + "epoch": 0.023452386768945132, + "grad_norm": 0.7489622235298157, + "learning_rate": 4.883215245541168e-05, + "loss": 1.2847, + "step": 2400 + }, + { + "epoch": 0.024429569550984513, + "grad_norm": 0.30208253860473633, + "learning_rate": 4.878328854141217e-05, + "loss": 1.2924, + "step": 2500 + }, + { + "epoch": 0.02540675233302389, + "grad_norm": 0.36944472789764404, + "learning_rate": 4.873442462741266e-05, + "loss": 1.2916, + "step": 2600 + }, + { + "epoch": 0.026383935115063272, + "grad_norm": 0.3268676698207855, + "learning_rate": 4.8685560713413145e-05, + "loss": 1.2893, + "step": 2700 + }, + { + "epoch": 0.027361117897102653, + "grad_norm": 0.2795974910259247, + "learning_rate": 4.863669679941363e-05, + "loss": 1.282, + "step": 2800 + }, + { + "epoch": 0.028338300679142035, + "grad_norm": 0.36298853158950806, + "learning_rate": 4.858783288541413e-05, + "loss": 1.2832, + "step": 2900 + }, + { + "epoch": 0.029315483461181412, + "grad_norm": 0.5242423415184021, + "learning_rate": 4.853896897141461e-05, + "loss": 1.2819, + "step": 3000 + }, + { + "epoch": 0.030292666243220794, + "grad_norm": 0.25340864062309265, + "learning_rate": 4.8490105057415104e-05, + "loss": 1.2809, + "step": 3100 + }, + { + "epoch": 0.03126984902526018, + "grad_norm": 0.7241976261138916, + "learning_rate": 4.844124114341559e-05, + "loss": 1.2802, + "step": 3200 + }, + { + "epoch": 0.032247031807299556, + "grad_norm": 0.5154001712799072, + "learning_rate": 4.839237722941608e-05, + "loss": 1.2748, + "step": 3300 + }, + { + "epoch": 0.033224214589338934, + "grad_norm": 0.5323473811149597, + "learning_rate": 4.834351331541657e-05, + "loss": 1.284, + "step": 3400 + }, + { + "epoch": 0.03420139737137832, + "grad_norm": 0.3947168290615082, + "learning_rate": 4.8294649401417056e-05, + "loss": 1.276, + "step": 3500 + }, + { + "epoch": 0.035178580153417696, + "grad_norm": 0.4776057302951813, + "learning_rate": 4.8245785487417544e-05, + "loss": 1.2783, + "step": 3600 + }, + { + "epoch": 0.036155762935457074, + "grad_norm": 0.4884164035320282, + "learning_rate": 4.819692157341804e-05, + "loss": 1.2745, + "step": 3700 + }, + { + "epoch": 0.03713294571749646, + "grad_norm": 0.5210428833961487, + "learning_rate": 4.814805765941852e-05, + "loss": 1.2707, + "step": 3800 + }, + { + "epoch": 0.038110128499535836, + "grad_norm": 0.46214359998703003, + "learning_rate": 4.809919374541901e-05, + "loss": 1.2727, + "step": 3900 + }, + { + "epoch": 0.03908731128157522, + "grad_norm": 0.2656782865524292, + "learning_rate": 4.8050329831419496e-05, + "loss": 1.2694, + "step": 4000 + }, + { + "epoch": 0.0400644940636146, + "grad_norm": 0.4923059940338135, + "learning_rate": 4.8001465917419985e-05, + "loss": 1.2665, + "step": 4100 + }, + { + "epoch": 0.04104167684565398, + "grad_norm": 0.92928147315979, + "learning_rate": 4.795260200342048e-05, + "loss": 1.2627, + "step": 4200 + }, + { + "epoch": 0.04201885962769336, + "grad_norm": 1.0651229619979858, + "learning_rate": 4.790373808942096e-05, + "loss": 1.2623, + "step": 4300 + }, + { + "epoch": 0.04299604240973274, + "grad_norm": 0.9612557888031006, + "learning_rate": 4.7854874175421456e-05, + "loss": 1.2482, + 
"step": 4400 + }, + { + "epoch": 0.043973225191772124, + "grad_norm": 1.0120874643325806, + "learning_rate": 4.7806010261421944e-05, + "loss": 1.2589, + "step": 4500 + }, + { + "epoch": 0.0449504079738115, + "grad_norm": 0.6250020861625671, + "learning_rate": 4.775714634742243e-05, + "loss": 1.2499, + "step": 4600 + }, + { + "epoch": 0.04592759075585088, + "grad_norm": 0.2850038707256317, + "learning_rate": 4.770828243342292e-05, + "loss": 1.2446, + "step": 4700 + }, + { + "epoch": 0.046904773537890264, + "grad_norm": 1.2032625675201416, + "learning_rate": 4.765941851942341e-05, + "loss": 1.2238, + "step": 4800 + }, + { + "epoch": 0.04788195631992964, + "grad_norm": 0.42024949193000793, + "learning_rate": 4.7610554605423896e-05, + "loss": 1.2255, + "step": 4900 + }, + { + "epoch": 0.048859139101969026, + "grad_norm": 0.7451406121253967, + "learning_rate": 4.756169069142439e-05, + "loss": 1.2071, + "step": 5000 + }, + { + "epoch": 0.049836321884008404, + "grad_norm": 0.8735096454620361, + "learning_rate": 4.751282677742487e-05, + "loss": 1.2126, + "step": 5100 + }, + { + "epoch": 0.05081350466604778, + "grad_norm": 0.73675137758255, + "learning_rate": 4.746396286342537e-05, + "loss": 1.2036, + "step": 5200 + }, + { + "epoch": 0.051790687448087167, + "grad_norm": 0.6540606617927551, + "learning_rate": 4.741509894942585e-05, + "loss": 1.1825, + "step": 5300 + }, + { + "epoch": 0.052767870230126544, + "grad_norm": 0.825066864490509, + "learning_rate": 4.7366235035426336e-05, + "loss": 1.1655, + "step": 5400 + }, + { + "epoch": 0.05374505301216593, + "grad_norm": 1.6421219110488892, + "learning_rate": 4.731737112142683e-05, + "loss": 1.1716, + "step": 5500 + }, + { + "epoch": 0.05472223579420531, + "grad_norm": 1.0644057989120483, + "learning_rate": 4.726850720742731e-05, + "loss": 1.1384, + "step": 5600 + }, + { + "epoch": 0.055699418576244684, + "grad_norm": 1.1611616611480713, + "learning_rate": 4.721964329342781e-05, + "loss": 1.1499, + "step": 5700 + }, + { + "epoch": 0.05667660135828407, + "grad_norm": 2.0900723934173584, + "learning_rate": 4.7170779379428295e-05, + "loss": 1.1323, + "step": 5800 + }, + { + "epoch": 0.05765378414032345, + "grad_norm": 1.0580404996871948, + "learning_rate": 4.712191546542878e-05, + "loss": 1.112, + "step": 5900 + }, + { + "epoch": 0.058630966922362825, + "grad_norm": 0.6299407482147217, + "learning_rate": 4.707305155142927e-05, + "loss": 1.104, + "step": 6000 + }, + { + "epoch": 0.05960814970440221, + "grad_norm": 0.6816271543502808, + "learning_rate": 4.702418763742976e-05, + "loss": 1.1128, + "step": 6100 + }, + { + "epoch": 0.06058533248644159, + "grad_norm": 0.654796302318573, + "learning_rate": 4.697532372343025e-05, + "loss": 1.0942, + "step": 6200 + }, + { + "epoch": 0.06156251526848097, + "grad_norm": 1.0433884859085083, + "learning_rate": 4.692645980943074e-05, + "loss": 1.0862, + "step": 6300 + }, + { + "epoch": 0.06253969805052036, + "grad_norm": 0.6256537437438965, + "learning_rate": 4.6877595895431224e-05, + "loss": 1.081, + "step": 6400 + }, + { + "epoch": 0.06351688083255973, + "grad_norm": 0.8173975348472595, + "learning_rate": 4.682873198143172e-05, + "loss": 1.0767, + "step": 6500 + }, + { + "epoch": 0.06449406361459911, + "grad_norm": 0.7856473922729492, + "learning_rate": 4.6779868067432206e-05, + "loss": 1.0767, + "step": 6600 + }, + { + "epoch": 0.0654712463966385, + "grad_norm": 0.6337741017341614, + "learning_rate": 4.6731004153432695e-05, + "loss": 1.0829, + "step": 6700 + }, + { + "epoch": 0.06644842917867787, + "grad_norm": 
0.5813809037208557, + "learning_rate": 4.668214023943318e-05, + "loss": 1.0571, + "step": 6800 + }, + { + "epoch": 0.06742561196071725, + "grad_norm": 0.4155445992946625, + "learning_rate": 4.6633276325433664e-05, + "loss": 1.0707, + "step": 6900 + }, + { + "epoch": 0.06840279474275664, + "grad_norm": 0.6730567812919617, + "learning_rate": 4.658441241143416e-05, + "loss": 1.0477, + "step": 7000 + }, + { + "epoch": 0.06937997752479601, + "grad_norm": 0.8348300457000732, + "learning_rate": 4.653554849743465e-05, + "loss": 1.0644, + "step": 7100 + }, + { + "epoch": 0.07035716030683539, + "grad_norm": 2.2414326667785645, + "learning_rate": 4.6486684583435135e-05, + "loss": 1.0577, + "step": 7200 + }, + { + "epoch": 0.07133434308887478, + "grad_norm": 1.6573911905288696, + "learning_rate": 4.643782066943562e-05, + "loss": 1.0836, + "step": 7300 + }, + { + "epoch": 0.07231152587091415, + "grad_norm": 0.5690039396286011, + "learning_rate": 4.638895675543611e-05, + "loss": 1.0541, + "step": 7400 + }, + { + "epoch": 0.07328870865295353, + "grad_norm": 0.527215301990509, + "learning_rate": 4.63400928414366e-05, + "loss": 1.0164, + "step": 7500 + }, + { + "epoch": 0.07426589143499292, + "grad_norm": 0.7997362613677979, + "learning_rate": 4.6291228927437094e-05, + "loss": 1.0447, + "step": 7600 + }, + { + "epoch": 0.0752430742170323, + "grad_norm": 2.257143259048462, + "learning_rate": 4.6242365013437575e-05, + "loss": 1.0365, + "step": 7700 + }, + { + "epoch": 0.07622025699907167, + "grad_norm": 0.9132490158081055, + "learning_rate": 4.619350109943807e-05, + "loss": 1.0498, + "step": 7800 + }, + { + "epoch": 0.07719743978111106, + "grad_norm": 0.5229859948158264, + "learning_rate": 4.614463718543856e-05, + "loss": 1.0342, + "step": 7900 + }, + { + "epoch": 0.07817462256315044, + "grad_norm": 0.6948792338371277, + "learning_rate": 4.6095773271439046e-05, + "loss": 1.0325, + "step": 8000 + }, + { + "epoch": 0.07915180534518981, + "grad_norm": 0.8526360988616943, + "learning_rate": 4.6046909357439534e-05, + "loss": 1.0183, + "step": 8100 + }, + { + "epoch": 0.0801289881272292, + "grad_norm": 1.1457374095916748, + "learning_rate": 4.599804544344002e-05, + "loss": 1.0243, + "step": 8200 + }, + { + "epoch": 0.08110617090926858, + "grad_norm": 0.9335997700691223, + "learning_rate": 4.594918152944051e-05, + "loss": 1.046, + "step": 8300 + }, + { + "epoch": 0.08208335369130795, + "grad_norm": 0.8367229700088501, + "learning_rate": 4.5900317615441e-05, + "loss": 1.0176, + "step": 8400 + }, + { + "epoch": 0.08306053647334734, + "grad_norm": 3.7648801803588867, + "learning_rate": 4.5851453701441486e-05, + "loss": 1.0047, + "step": 8500 + }, + { + "epoch": 0.08403771925538672, + "grad_norm": 0.5877612829208374, + "learning_rate": 4.5802589787441975e-05, + "loss": 1.0346, + "step": 8600 + }, + { + "epoch": 0.08501490203742611, + "grad_norm": 0.5145990252494812, + "learning_rate": 4.575372587344246e-05, + "loss": 1.0268, + "step": 8700 + }, + { + "epoch": 0.08599208481946548, + "grad_norm": 0.9310688376426697, + "learning_rate": 4.570486195944295e-05, + "loss": 1.0109, + "step": 8800 + }, + { + "epoch": 0.08696926760150486, + "grad_norm": 0.5182886719703674, + "learning_rate": 4.5655998045443445e-05, + "loss": 1.0117, + "step": 8900 + }, + { + "epoch": 0.08794645038354425, + "grad_norm": 0.4319695234298706, + "learning_rate": 4.560713413144393e-05, + "loss": 1.0053, + "step": 9000 + }, + { + "epoch": 0.08892363316558362, + "grad_norm": 4.307732582092285, + "learning_rate": 4.555827021744442e-05, + "loss": 1.0151, + 
"step": 9100 + }, + { + "epoch": 0.089900815947623, + "grad_norm": 0.46516236662864685, + "learning_rate": 4.550940630344491e-05, + "loss": 0.9945, + "step": 9200 + }, + { + "epoch": 0.09087799872966239, + "grad_norm": 1.2372952699661255, + "learning_rate": 4.54605423894454e-05, + "loss": 0.9865, + "step": 9300 + }, + { + "epoch": 0.09185518151170176, + "grad_norm": 0.7494595646858215, + "learning_rate": 4.5411678475445886e-05, + "loss": 0.9824, + "step": 9400 + }, + { + "epoch": 0.09283236429374114, + "grad_norm": 0.5540333390235901, + "learning_rate": 4.5362814561446374e-05, + "loss": 1.0132, + "step": 9500 + }, + { + "epoch": 0.09380954707578053, + "grad_norm": 0.48533427715301514, + "learning_rate": 4.531395064744686e-05, + "loss": 1.0173, + "step": 9600 + }, + { + "epoch": 0.0947867298578199, + "grad_norm": 0.4972572922706604, + "learning_rate": 4.526508673344736e-05, + "loss": 1.0078, + "step": 9700 + }, + { + "epoch": 0.09576391263985928, + "grad_norm": 0.6748878955841064, + "learning_rate": 4.521622281944784e-05, + "loss": 1.0172, + "step": 9800 + }, + { + "epoch": 0.09674109542189867, + "grad_norm": 0.5261876583099365, + "learning_rate": 4.5167358905448326e-05, + "loss": 1.0189, + "step": 9900 + }, + { + "epoch": 0.09771827820393805, + "grad_norm": 0.4164600670337677, + "learning_rate": 4.5118494991448814e-05, + "loss": 0.9978, + "step": 10000 + }, + { + "epoch": 0.09869546098597742, + "grad_norm": 0.40417763590812683, + "learning_rate": 4.50696310774493e-05, + "loss": 1.0103, + "step": 10100 + }, + { + "epoch": 0.09967264376801681, + "grad_norm": 0.8591890931129456, + "learning_rate": 4.50207671634498e-05, + "loss": 1.0065, + "step": 10200 + }, + { + "epoch": 0.10064982655005619, + "grad_norm": 0.5676371455192566, + "learning_rate": 4.497190324945028e-05, + "loss": 1.0089, + "step": 10300 + }, + { + "epoch": 0.10162700933209556, + "grad_norm": 0.616646945476532, + "learning_rate": 4.492303933545077e-05, + "loss": 0.9897, + "step": 10400 + }, + { + "epoch": 0.10260419211413495, + "grad_norm": 0.37536484003067017, + "learning_rate": 4.487417542145126e-05, + "loss": 0.9989, + "step": 10500 + }, + { + "epoch": 0.10358137489617433, + "grad_norm": 0.6801789402961731, + "learning_rate": 4.482531150745175e-05, + "loss": 0.9923, + "step": 10600 + }, + { + "epoch": 0.1045585576782137, + "grad_norm": 0.5848776698112488, + "learning_rate": 4.477644759345224e-05, + "loss": 0.9919, + "step": 10700 + }, + { + "epoch": 0.10553574046025309, + "grad_norm": 0.7715157866477966, + "learning_rate": 4.4727583679452725e-05, + "loss": 0.9814, + "step": 10800 + }, + { + "epoch": 0.10651292324229247, + "grad_norm": 0.8080986142158508, + "learning_rate": 4.4678719765453214e-05, + "loss": 0.9935, + "step": 10900 + }, + { + "epoch": 0.10749010602433186, + "grad_norm": 0.4375016391277313, + "learning_rate": 4.462985585145371e-05, + "loss": 0.988, + "step": 11000 + }, + { + "epoch": 0.10846728880637123, + "grad_norm": 0.8055805563926697, + "learning_rate": 4.458099193745419e-05, + "loss": 0.9861, + "step": 11100 + }, + { + "epoch": 0.10944447158841061, + "grad_norm": 1.1914618015289307, + "learning_rate": 4.4532128023454685e-05, + "loss": 0.9622, + "step": 11200 + }, + { + "epoch": 0.11042165437045, + "grad_norm": 0.4247540533542633, + "learning_rate": 4.448326410945517e-05, + "loss": 0.9602, + "step": 11300 + }, + { + "epoch": 0.11139883715248937, + "grad_norm": 0.5454650521278381, + "learning_rate": 4.4434400195455654e-05, + "loss": 0.9696, + "step": 11400 + }, + { + "epoch": 0.11237601993452875, + 
"grad_norm": 0.5259748697280884, + "learning_rate": 4.438553628145615e-05, + "loss": 1.0021, + "step": 11500 + }, + { + "epoch": 0.11335320271656814, + "grad_norm": 0.5165246725082397, + "learning_rate": 4.433667236745663e-05, + "loss": 0.982, + "step": 11600 + }, + { + "epoch": 0.11433038549860751, + "grad_norm": 0.6768147945404053, + "learning_rate": 4.4287808453457125e-05, + "loss": 0.9398, + "step": 11700 + }, + { + "epoch": 0.1153075682806469, + "grad_norm": 1.0245041847229004, + "learning_rate": 4.423894453945761e-05, + "loss": 0.9934, + "step": 11800 + }, + { + "epoch": 0.11628475106268628, + "grad_norm": 0.6241583228111267, + "learning_rate": 4.41900806254581e-05, + "loss": 0.9697, + "step": 11900 + }, + { + "epoch": 0.11726193384472565, + "grad_norm": 0.4234873652458191, + "learning_rate": 4.414121671145859e-05, + "loss": 0.9723, + "step": 12000 + }, + { + "epoch": 0.11823911662676503, + "grad_norm": 0.3932545781135559, + "learning_rate": 4.409235279745908e-05, + "loss": 0.9826, + "step": 12100 + }, + { + "epoch": 0.11921629940880442, + "grad_norm": 1.5067880153656006, + "learning_rate": 4.4043488883459565e-05, + "loss": 0.9581, + "step": 12200 + }, + { + "epoch": 0.1201934821908438, + "grad_norm": 0.41707366704940796, + "learning_rate": 4.399462496946006e-05, + "loss": 0.9666, + "step": 12300 + }, + { + "epoch": 0.12117066497288317, + "grad_norm": 1.1278653144836426, + "learning_rate": 4.394576105546054e-05, + "loss": 0.9553, + "step": 12400 + }, + { + "epoch": 0.12214784775492256, + "grad_norm": 0.350543737411499, + "learning_rate": 4.3896897141461036e-05, + "loss": 0.9422, + "step": 12500 + }, + { + "epoch": 0.12312503053696194, + "grad_norm": 0.3775838315486908, + "learning_rate": 4.3848033227461524e-05, + "loss": 0.9626, + "step": 12600 + }, + { + "epoch": 0.12410221331900131, + "grad_norm": 0.8341017365455627, + "learning_rate": 4.379916931346201e-05, + "loss": 0.9289, + "step": 12700 + }, + { + "epoch": 0.1250793961010407, + "grad_norm": 0.805614173412323, + "learning_rate": 4.37503053994625e-05, + "loss": 0.9474, + "step": 12800 + }, + { + "epoch": 0.12605657888308008, + "grad_norm": 0.8439397215843201, + "learning_rate": 4.370144148546299e-05, + "loss": 0.9661, + "step": 12900 + }, + { + "epoch": 0.12703376166511945, + "grad_norm": 1.1272892951965332, + "learning_rate": 4.3652577571463476e-05, + "loss": 0.9514, + "step": 13000 + }, + { + "epoch": 0.12801094444715885, + "grad_norm": 0.6426375508308411, + "learning_rate": 4.3603713657463965e-05, + "loss": 0.9448, + "step": 13100 + }, + { + "epoch": 0.12898812722919822, + "grad_norm": 1.3205431699752808, + "learning_rate": 4.355484974346445e-05, + "loss": 0.9511, + "step": 13200 + }, + { + "epoch": 0.1299653100112376, + "grad_norm": 0.3671954870223999, + "learning_rate": 4.350598582946494e-05, + "loss": 0.9506, + "step": 13300 + }, + { + "epoch": 0.130942492793277, + "grad_norm": 0.7566332817077637, + "learning_rate": 4.345712191546543e-05, + "loss": 0.9363, + "step": 13400 + }, + { + "epoch": 0.13191967557531636, + "grad_norm": 0.8800159692764282, + "learning_rate": 4.340825800146592e-05, + "loss": 0.9388, + "step": 13500 + }, + { + "epoch": 0.13289685835735573, + "grad_norm": 0.7134628891944885, + "learning_rate": 4.335939408746641e-05, + "loss": 0.9162, + "step": 13600 + }, + { + "epoch": 0.13387404113939513, + "grad_norm": 0.5555543899536133, + "learning_rate": 4.331053017346689e-05, + "loss": 0.9366, + "step": 13700 + }, + { + "epoch": 0.1348512239214345, + "grad_norm": 0.4485512375831604, + "learning_rate": 
4.326166625946739e-05, + "loss": 0.9286, + "step": 13800 + }, + { + "epoch": 0.13582840670347388, + "grad_norm": 0.8888948559761047, + "learning_rate": 4.3212802345467876e-05, + "loss": 0.943, + "step": 13900 + }, + { + "epoch": 0.13680558948551327, + "grad_norm": 0.6719749569892883, + "learning_rate": 4.3163938431468364e-05, + "loss": 0.9217, + "step": 14000 + }, + { + "epoch": 0.13778277226755264, + "grad_norm": 0.695377767086029, + "learning_rate": 4.311507451746885e-05, + "loss": 0.9093, + "step": 14100 + }, + { + "epoch": 0.13875995504959202, + "grad_norm": 0.5966312885284424, + "learning_rate": 4.306621060346934e-05, + "loss": 0.9195, + "step": 14200 + }, + { + "epoch": 0.13973713783163141, + "grad_norm": 0.8073310256004333, + "learning_rate": 4.301734668946983e-05, + "loss": 0.9309, + "step": 14300 + }, + { + "epoch": 0.14071432061367078, + "grad_norm": 0.6303800940513611, + "learning_rate": 4.2968482775470316e-05, + "loss": 0.9458, + "step": 14400 + }, + { + "epoch": 0.14169150339571016, + "grad_norm": 0.7043970823287964, + "learning_rate": 4.2919618861470804e-05, + "loss": 0.9132, + "step": 14500 + }, + { + "epoch": 0.14266868617774955, + "grad_norm": 0.9100736379623413, + "learning_rate": 4.287075494747129e-05, + "loss": 0.9296, + "step": 14600 + }, + { + "epoch": 0.14364586895978892, + "grad_norm": 0.787862241268158, + "learning_rate": 4.282189103347179e-05, + "loss": 0.9643, + "step": 14700 + }, + { + "epoch": 0.1446230517418283, + "grad_norm": 0.8169028162956238, + "learning_rate": 4.277302711947227e-05, + "loss": 0.9244, + "step": 14800 + }, + { + "epoch": 0.1456002345238677, + "grad_norm": 0.9544184803962708, + "learning_rate": 4.272416320547276e-05, + "loss": 0.918, + "step": 14900 + }, + { + "epoch": 0.14657741730590707, + "grad_norm": 0.5325574278831482, + "learning_rate": 4.2675299291473245e-05, + "loss": 0.9273, + "step": 15000 + }, + { + "epoch": 0.14755460008794646, + "grad_norm": 1.1403323411941528, + "learning_rate": 4.262643537747374e-05, + "loss": 0.9095, + "step": 15100 + }, + { + "epoch": 0.14853178286998583, + "grad_norm": 1.0411937236785889, + "learning_rate": 4.257757146347423e-05, + "loss": 0.8967, + "step": 15200 + }, + { + "epoch": 0.1495089656520252, + "grad_norm": 0.630393922328949, + "learning_rate": 4.2528707549474715e-05, + "loss": 0.8883, + "step": 15300 + }, + { + "epoch": 0.1504861484340646, + "grad_norm": 0.9445775747299194, + "learning_rate": 4.2479843635475204e-05, + "loss": 0.9253, + "step": 15400 + }, + { + "epoch": 0.15146333121610397, + "grad_norm": 0.5689444541931152, + "learning_rate": 4.243097972147569e-05, + "loss": 0.8983, + "step": 15500 + }, + { + "epoch": 0.15244051399814335, + "grad_norm": 0.7726677656173706, + "learning_rate": 4.238211580747618e-05, + "loss": 0.9228, + "step": 15600 + }, + { + "epoch": 0.15341769678018274, + "grad_norm": 0.8260165452957153, + "learning_rate": 4.2333251893476675e-05, + "loss": 0.9202, + "step": 15700 + }, + { + "epoch": 0.15439487956222211, + "grad_norm": 0.4869302809238434, + "learning_rate": 4.2284387979477156e-05, + "loss": 0.9283, + "step": 15800 + }, + { + "epoch": 0.15537206234426149, + "grad_norm": 0.5768991708755493, + "learning_rate": 4.2235524065477644e-05, + "loss": 0.9233, + "step": 15900 + }, + { + "epoch": 0.15634924512630088, + "grad_norm": 0.8856435418128967, + "learning_rate": 4.218666015147814e-05, + "loss": 0.8825, + "step": 16000 + }, + { + "epoch": 0.15732642790834026, + "grad_norm": 0.5258185267448425, + "learning_rate": 4.213779623747862e-05, + "loss": 0.8834, + "step": 16100 
+ }, + { + "epoch": 0.15830361069037963, + "grad_norm": 0.8340526223182678, + "learning_rate": 4.2088932323479115e-05, + "loss": 0.8856, + "step": 16200 + }, + { + "epoch": 0.15928079347241902, + "grad_norm": 0.4123723804950714, + "learning_rate": 4.2040068409479596e-05, + "loss": 0.8957, + "step": 16300 + }, + { + "epoch": 0.1602579762544584, + "grad_norm": 0.8336274027824402, + "learning_rate": 4.199120449548009e-05, + "loss": 0.9053, + "step": 16400 + }, + { + "epoch": 0.16123515903649777, + "grad_norm": 0.7977516055107117, + "learning_rate": 4.194234058148058e-05, + "loss": 0.8698, + "step": 16500 + }, + { + "epoch": 0.16221234181853716, + "grad_norm": 0.5064985156059265, + "learning_rate": 4.189347666748107e-05, + "loss": 0.8945, + "step": 16600 + }, + { + "epoch": 0.16318952460057654, + "grad_norm": 0.8241267204284668, + "learning_rate": 4.1844612753481555e-05, + "loss": 0.8875, + "step": 16700 + }, + { + "epoch": 0.1641667073826159, + "grad_norm": 0.7517113089561462, + "learning_rate": 4.179574883948204e-05, + "loss": 0.8845, + "step": 16800 + }, + { + "epoch": 0.1651438901646553, + "grad_norm": 0.6297169923782349, + "learning_rate": 4.174688492548253e-05, + "loss": 0.9303, + "step": 16900 + }, + { + "epoch": 0.16612107294669468, + "grad_norm": 0.5828490257263184, + "learning_rate": 4.1698021011483026e-05, + "loss": 0.8654, + "step": 17000 + }, + { + "epoch": 0.16709825572873405, + "grad_norm": 0.3038561940193176, + "learning_rate": 4.164915709748351e-05, + "loss": 0.8933, + "step": 17100 + }, + { + "epoch": 0.16807543851077344, + "grad_norm": 0.8928827047348022, + "learning_rate": 4.1600293183484e-05, + "loss": 0.8509, + "step": 17200 + }, + { + "epoch": 0.16905262129281282, + "grad_norm": 0.7055086493492126, + "learning_rate": 4.155142926948449e-05, + "loss": 0.8814, + "step": 17300 + }, + { + "epoch": 0.17002980407485221, + "grad_norm": 0.5377823710441589, + "learning_rate": 4.150256535548497e-05, + "loss": 0.888, + "step": 17400 + }, + { + "epoch": 0.17100698685689159, + "grad_norm": 0.6319778561592102, + "learning_rate": 4.1453701441485466e-05, + "loss": 0.8575, + "step": 17500 + }, + { + "epoch": 0.17198416963893096, + "grad_norm": 0.8756042122840881, + "learning_rate": 4.1404837527485954e-05, + "loss": 0.8805, + "step": 17600 + }, + { + "epoch": 0.17296135242097035, + "grad_norm": 0.5293178558349609, + "learning_rate": 4.135597361348644e-05, + "loss": 0.8471, + "step": 17700 + }, + { + "epoch": 0.17393853520300973, + "grad_norm": 0.9118284583091736, + "learning_rate": 4.130710969948693e-05, + "loss": 0.8426, + "step": 17800 + }, + { + "epoch": 0.1749157179850491, + "grad_norm": 1.0211195945739746, + "learning_rate": 4.125824578548742e-05, + "loss": 0.8877, + "step": 17900 + }, + { + "epoch": 0.1758929007670885, + "grad_norm": 1.4174985885620117, + "learning_rate": 4.120938187148791e-05, + "loss": 0.8731, + "step": 18000 + }, + { + "epoch": 0.17687008354912787, + "grad_norm": 0.8243415951728821, + "learning_rate": 4.1160517957488395e-05, + "loss": 0.8852, + "step": 18100 + }, + { + "epoch": 0.17784726633116724, + "grad_norm": 0.8385602235794067, + "learning_rate": 4.111165404348888e-05, + "loss": 0.8361, + "step": 18200 + }, + { + "epoch": 0.17882444911320663, + "grad_norm": 1.003968358039856, + "learning_rate": 4.106279012948938e-05, + "loss": 0.8738, + "step": 18300 + }, + { + "epoch": 0.179801631895246, + "grad_norm": 0.7428449988365173, + "learning_rate": 4.101392621548986e-05, + "loss": 0.8563, + "step": 18400 + }, + { + "epoch": 0.18077881467728538, + "grad_norm": 
1.8963735103607178, + "learning_rate": 4.0965062301490354e-05, + "loss": 0.8428, + "step": 18500 + }, + { + "epoch": 0.18175599745932478, + "grad_norm": 0.6868895888328552, + "learning_rate": 4.091619838749084e-05, + "loss": 0.8727, + "step": 18600 + }, + { + "epoch": 0.18273318024136415, + "grad_norm": 1.8936256170272827, + "learning_rate": 4.086733447349133e-05, + "loss": 0.9211, + "step": 18700 + }, + { + "epoch": 0.18371036302340352, + "grad_norm": 1.004941463470459, + "learning_rate": 4.081847055949182e-05, + "loss": 0.8404, + "step": 18800 + }, + { + "epoch": 0.18468754580544292, + "grad_norm": 1.4084818363189697, + "learning_rate": 4.0769606645492306e-05, + "loss": 0.868, + "step": 18900 + }, + { + "epoch": 0.1856647285874823, + "grad_norm": 0.6459541320800781, + "learning_rate": 4.0720742731492794e-05, + "loss": 0.8583, + "step": 19000 + }, + { + "epoch": 0.18664191136952166, + "grad_norm": 0.7335548996925354, + "learning_rate": 4.067187881749328e-05, + "loss": 0.8622, + "step": 19100 + }, + { + "epoch": 0.18761909415156106, + "grad_norm": 0.6783348321914673, + "learning_rate": 4.062301490349377e-05, + "loss": 0.8572, + "step": 19200 + }, + { + "epoch": 0.18859627693360043, + "grad_norm": 0.6323419809341431, + "learning_rate": 4.057415098949426e-05, + "loss": 0.8763, + "step": 19300 + }, + { + "epoch": 0.1895734597156398, + "grad_norm": 0.963927686214447, + "learning_rate": 4.052528707549475e-05, + "loss": 0.8543, + "step": 19400 + }, + { + "epoch": 0.1905506424976792, + "grad_norm": 0.4785550832748413, + "learning_rate": 4.0476423161495234e-05, + "loss": 0.863, + "step": 19500 + }, + { + "epoch": 0.19152782527971857, + "grad_norm": 0.6358627080917358, + "learning_rate": 4.042755924749573e-05, + "loss": 0.8842, + "step": 19600 + }, + { + "epoch": 0.19250500806175797, + "grad_norm": 0.7857956886291504, + "learning_rate": 4.037869533349621e-05, + "loss": 0.8698, + "step": 19700 + }, + { + "epoch": 0.19348219084379734, + "grad_norm": 0.5225537419319153, + "learning_rate": 4.0329831419496705e-05, + "loss": 0.8842, + "step": 19800 + }, + { + "epoch": 0.1944593736258367, + "grad_norm": 0.582313597202301, + "learning_rate": 4.0280967505497194e-05, + "loss": 0.8506, + "step": 19900 + }, + { + "epoch": 0.1954365564078761, + "grad_norm": 0.7206740379333496, + "learning_rate": 4.023210359149768e-05, + "loss": 0.8529, + "step": 20000 + }, + { + "epoch": 0.19641373918991548, + "grad_norm": 0.45054760575294495, + "learning_rate": 4.018323967749817e-05, + "loss": 0.8564, + "step": 20100 + }, + { + "epoch": 0.19739092197195485, + "grad_norm": 0.9214595556259155, + "learning_rate": 4.013437576349866e-05, + "loss": 0.8443, + "step": 20200 + }, + { + "epoch": 0.19836810475399425, + "grad_norm": 0.9843263626098633, + "learning_rate": 4.0085511849499146e-05, + "loss": 0.856, + "step": 20300 + }, + { + "epoch": 0.19934528753603362, + "grad_norm": 0.6508098840713501, + "learning_rate": 4.0036647935499634e-05, + "loss": 0.8532, + "step": 20400 + }, + { + "epoch": 0.200322470318073, + "grad_norm": 0.8091655969619751, + "learning_rate": 3.998778402150012e-05, + "loss": 0.8691, + "step": 20500 + }, + { + "epoch": 0.20129965310011239, + "grad_norm": 0.8139657378196716, + "learning_rate": 3.993892010750061e-05, + "loss": 0.8608, + "step": 20600 + }, + { + "epoch": 0.20227683588215176, + "grad_norm": 0.628423273563385, + "learning_rate": 3.9890056193501105e-05, + "loss": 0.8369, + "step": 20700 + }, + { + "epoch": 0.20325401866419113, + "grad_norm": 1.737331748008728, + "learning_rate": 3.9841192279501586e-05, 
+ "loss": 0.8363, + "step": 20800 + }, + { + "epoch": 0.20423120144623053, + "grad_norm": 1.036280870437622, + "learning_rate": 3.979232836550208e-05, + "loss": 0.8387, + "step": 20900 + }, + { + "epoch": 0.2052083842282699, + "grad_norm": 0.35834863781929016, + "learning_rate": 3.974346445150256e-05, + "loss": 0.8565, + "step": 21000 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 0.7657331824302673, + "learning_rate": 3.969460053750306e-05, + "loss": 0.8654, + "step": 21100 + }, + { + "epoch": 0.20716274979234867, + "grad_norm": 1.077300786972046, + "learning_rate": 3.9645736623503545e-05, + "loss": 0.8218, + "step": 21200 + }, + { + "epoch": 0.20813993257438804, + "grad_norm": 0.5806353688240051, + "learning_rate": 3.959687270950403e-05, + "loss": 0.8375, + "step": 21300 + }, + { + "epoch": 0.2091171153564274, + "grad_norm": 0.3875705599784851, + "learning_rate": 3.954800879550452e-05, + "loss": 0.8342, + "step": 21400 + }, + { + "epoch": 0.2100942981384668, + "grad_norm": 0.7829961180686951, + "learning_rate": 3.949914488150501e-05, + "loss": 0.832, + "step": 21500 + }, + { + "epoch": 0.21107148092050618, + "grad_norm": 1.9466382265090942, + "learning_rate": 3.94502809675055e-05, + "loss": 0.8118, + "step": 21600 + }, + { + "epoch": 0.21204866370254555, + "grad_norm": 0.6271357536315918, + "learning_rate": 3.940141705350599e-05, + "loss": 0.8436, + "step": 21700 + }, + { + "epoch": 0.21302584648458495, + "grad_norm": 1.320719838142395, + "learning_rate": 3.9352553139506474e-05, + "loss": 0.8586, + "step": 21800 + }, + { + "epoch": 0.21400302926662432, + "grad_norm": 0.6017069220542908, + "learning_rate": 3.930368922550697e-05, + "loss": 0.8242, + "step": 21900 + }, + { + "epoch": 0.21498021204866372, + "grad_norm": 0.8584203124046326, + "learning_rate": 3.9254825311507456e-05, + "loss": 0.815, + "step": 22000 + }, + { + "epoch": 0.2159573948307031, + "grad_norm": 0.623652458190918, + "learning_rate": 3.920596139750794e-05, + "loss": 0.812, + "step": 22100 + }, + { + "epoch": 0.21693457761274246, + "grad_norm": 0.6867117881774902, + "learning_rate": 3.915709748350843e-05, + "loss": 0.8141, + "step": 22200 + }, + { + "epoch": 0.21791176039478186, + "grad_norm": 0.6963294744491577, + "learning_rate": 3.910823356950892e-05, + "loss": 0.8227, + "step": 22300 + }, + { + "epoch": 0.21888894317682123, + "grad_norm": 0.6727440357208252, + "learning_rate": 3.905936965550941e-05, + "loss": 0.8285, + "step": 22400 + }, + { + "epoch": 0.2198661259588606, + "grad_norm": 1.261771559715271, + "learning_rate": 3.90105057415099e-05, + "loss": 0.8396, + "step": 22500 + }, + { + "epoch": 0.2208433087409, + "grad_norm": 0.9146804809570312, + "learning_rate": 3.8961641827510385e-05, + "loss": 0.8194, + "step": 22600 + }, + { + "epoch": 0.22182049152293937, + "grad_norm": 0.9350225329399109, + "learning_rate": 3.891277791351087e-05, + "loss": 0.8376, + "step": 22700 + }, + { + "epoch": 0.22279767430497874, + "grad_norm": 0.6317518353462219, + "learning_rate": 3.886391399951137e-05, + "loss": 0.8313, + "step": 22800 + }, + { + "epoch": 0.22377485708701814, + "grad_norm": 0.6716780662536621, + "learning_rate": 3.881505008551185e-05, + "loss": 0.8033, + "step": 22900 + }, + { + "epoch": 0.2247520398690575, + "grad_norm": 0.4494755268096924, + "learning_rate": 3.8766186171512344e-05, + "loss": 0.8047, + "step": 23000 + }, + { + "epoch": 0.22572922265109688, + "grad_norm": 0.5505642890930176, + "learning_rate": 3.8717322257512825e-05, + "loss": 0.8456, + "step": 23100 + }, + { + "epoch": 
0.22670640543313628, + "grad_norm": 0.8866478800773621, + "learning_rate": 3.866845834351332e-05, + "loss": 0.8105, + "step": 23200 + }, + { + "epoch": 0.22768358821517565, + "grad_norm": 0.7525384426116943, + "learning_rate": 3.861959442951381e-05, + "loss": 0.8292, + "step": 23300 + }, + { + "epoch": 0.22866077099721502, + "grad_norm": 0.8182941675186157, + "learning_rate": 3.8570730515514296e-05, + "loss": 0.8392, + "step": 23400 + }, + { + "epoch": 0.22963795377925442, + "grad_norm": 0.6246720552444458, + "learning_rate": 3.8521866601514784e-05, + "loss": 0.8292, + "step": 23500 + }, + { + "epoch": 0.2306151365612938, + "grad_norm": 0.7931325435638428, + "learning_rate": 3.847300268751527e-05, + "loss": 0.83, + "step": 23600 + }, + { + "epoch": 0.23159231934333316, + "grad_norm": 0.4839908480644226, + "learning_rate": 3.842413877351576e-05, + "loss": 0.8544, + "step": 23700 + }, + { + "epoch": 0.23256950212537256, + "grad_norm": 0.694095253944397, + "learning_rate": 3.837527485951625e-05, + "loss": 0.8168, + "step": 23800 + }, + { + "epoch": 0.23354668490741193, + "grad_norm": 0.6341009140014648, + "learning_rate": 3.8326410945516736e-05, + "loss": 0.8007, + "step": 23900 + }, + { + "epoch": 0.2345238676894513, + "grad_norm": 0.6198739409446716, + "learning_rate": 3.8277547031517224e-05, + "loss": 0.8222, + "step": 24000 + }, + { + "epoch": 0.2355010504714907, + "grad_norm": 0.7246755361557007, + "learning_rate": 3.822868311751772e-05, + "loss": 0.8239, + "step": 24100 + }, + { + "epoch": 0.23647823325353007, + "grad_norm": 1.1782780885696411, + "learning_rate": 3.81798192035182e-05, + "loss": 0.8069, + "step": 24200 + }, + { + "epoch": 0.23745541603556947, + "grad_norm": 0.7902185320854187, + "learning_rate": 3.8130955289518695e-05, + "loss": 0.8283, + "step": 24300 + }, + { + "epoch": 0.23843259881760884, + "grad_norm": 1.605393648147583, + "learning_rate": 3.808209137551918e-05, + "loss": 0.7758, + "step": 24400 + }, + { + "epoch": 0.2394097815996482, + "grad_norm": 0.5076558589935303, + "learning_rate": 3.803322746151967e-05, + "loss": 0.8178, + "step": 24500 + }, + { + "epoch": 0.2403869643816876, + "grad_norm": 0.777646005153656, + "learning_rate": 3.798436354752016e-05, + "loss": 0.8074, + "step": 24600 + }, + { + "epoch": 0.24136414716372698, + "grad_norm": 1.3850637674331665, + "learning_rate": 3.793549963352065e-05, + "loss": 0.8058, + "step": 24700 + }, + { + "epoch": 0.24234132994576635, + "grad_norm": 0.6476046442985535, + "learning_rate": 3.7886635719521136e-05, + "loss": 0.7967, + "step": 24800 + }, + { + "epoch": 0.24331851272780575, + "grad_norm": 0.5768633484840393, + "learning_rate": 3.7837771805521624e-05, + "loss": 0.8269, + "step": 24900 + }, + { + "epoch": 0.24429569550984512, + "grad_norm": 0.7800481915473938, + "learning_rate": 3.778890789152211e-05, + "loss": 0.8237, + "step": 25000 + }, + { + "epoch": 0.2452728782918845, + "grad_norm": 0.591273844242096, + "learning_rate": 3.77400439775226e-05, + "loss": 0.8045, + "step": 25100 + }, + { + "epoch": 0.2462500610739239, + "grad_norm": 0.5170730352401733, + "learning_rate": 3.769118006352309e-05, + "loss": 0.818, + "step": 25200 + }, + { + "epoch": 0.24722724385596326, + "grad_norm": 0.7280113101005554, + "learning_rate": 3.7642316149523576e-05, + "loss": 0.806, + "step": 25300 + }, + { + "epoch": 0.24820442663800263, + "grad_norm": 0.48092082142829895, + "learning_rate": 3.759345223552407e-05, + "loss": 0.804, + "step": 25400 + }, + { + "epoch": 0.24918160942004203, + "grad_norm": 0.8031238913536072, + 
"learning_rate": 3.754458832152455e-05, + "loss": 0.8031, + "step": 25500 + }, + { + "epoch": 0.2501587922020814, + "grad_norm": 0.5290892720222473, + "learning_rate": 3.749572440752505e-05, + "loss": 0.816, + "step": 25600 + }, + { + "epoch": 0.25113597498412077, + "grad_norm": 1.850685477256775, + "learning_rate": 3.7446860493525535e-05, + "loss": 0.8241, + "step": 25700 + }, + { + "epoch": 0.25211315776616017, + "grad_norm": 0.9196923971176147, + "learning_rate": 3.739799657952602e-05, + "loss": 0.8115, + "step": 25800 + }, + { + "epoch": 0.25309034054819957, + "grad_norm": 0.8779144883155823, + "learning_rate": 3.734913266552651e-05, + "loss": 0.8065, + "step": 25900 + }, + { + "epoch": 0.2540675233302389, + "grad_norm": 0.6696827411651611, + "learning_rate": 3.7300268751527e-05, + "loss": 0.7827, + "step": 26000 + }, + { + "epoch": 0.2550447061122783, + "grad_norm": 0.5037100315093994, + "learning_rate": 3.725140483752749e-05, + "loss": 0.7955, + "step": 26100 + }, + { + "epoch": 0.2560218888943177, + "grad_norm": 1.4716683626174927, + "learning_rate": 3.7202540923527975e-05, + "loss": 0.8076, + "step": 26200 + }, + { + "epoch": 0.25699907167635705, + "grad_norm": 0.7515909671783447, + "learning_rate": 3.7153677009528463e-05, + "loss": 0.7645, + "step": 26300 + }, + { + "epoch": 0.25797625445839645, + "grad_norm": 0.8641912341117859, + "learning_rate": 3.710481309552896e-05, + "loss": 0.7794, + "step": 26400 + }, + { + "epoch": 0.25895343724043585, + "grad_norm": 0.7385029792785645, + "learning_rate": 3.705594918152944e-05, + "loss": 0.8047, + "step": 26500 + }, + { + "epoch": 0.2599306200224752, + "grad_norm": 1.194313645362854, + "learning_rate": 3.700708526752993e-05, + "loss": 0.7973, + "step": 26600 + }, + { + "epoch": 0.2609078028045146, + "grad_norm": 0.8573377728462219, + "learning_rate": 3.695822135353042e-05, + "loss": 0.8054, + "step": 26700 + }, + { + "epoch": 0.261884985586554, + "grad_norm": 0.7428358793258667, + "learning_rate": 3.6909357439530904e-05, + "loss": 0.8194, + "step": 26800 + }, + { + "epoch": 0.26286216836859333, + "grad_norm": 1.1976490020751953, + "learning_rate": 3.68604935255314e-05, + "loss": 0.7745, + "step": 26900 + }, + { + "epoch": 0.26383935115063273, + "grad_norm": 0.8391226530075073, + "learning_rate": 3.681162961153189e-05, + "loss": 0.7981, + "step": 27000 + }, + { + "epoch": 0.2648165339326721, + "grad_norm": 1.0753370523452759, + "learning_rate": 3.6762765697532375e-05, + "loss": 0.8018, + "step": 27100 + }, + { + "epoch": 0.26579371671471147, + "grad_norm": 0.8495202660560608, + "learning_rate": 3.671390178353286e-05, + "loss": 0.7894, + "step": 27200 + }, + { + "epoch": 0.26677089949675087, + "grad_norm": 2.3333170413970947, + "learning_rate": 3.666503786953335e-05, + "loss": 0.7892, + "step": 27300 + }, + { + "epoch": 0.26774808227879027, + "grad_norm": 0.7213625311851501, + "learning_rate": 3.661617395553384e-05, + "loss": 0.7902, + "step": 27400 + }, + { + "epoch": 0.2687252650608296, + "grad_norm": 1.045614242553711, + "learning_rate": 3.6567310041534334e-05, + "loss": 0.7719, + "step": 27500 + }, + { + "epoch": 0.269702447842869, + "grad_norm": 0.42100274562835693, + "learning_rate": 3.6518446127534815e-05, + "loss": 0.7705, + "step": 27600 + }, + { + "epoch": 0.2706796306249084, + "grad_norm": 0.5944122076034546, + "learning_rate": 3.646958221353531e-05, + "loss": 0.7717, + "step": 27700 + }, + { + "epoch": 0.27165681340694775, + "grad_norm": 0.7398585677146912, + "learning_rate": 3.642071829953579e-05, + "loss": 0.7896, + "step": 
27800 + }, + { + "epoch": 0.27263399618898715, + "grad_norm": 0.8064782023429871, + "learning_rate": 3.6371854385536286e-05, + "loss": 0.7917, + "step": 27900 + }, + { + "epoch": 0.27361117897102655, + "grad_norm": 0.6715266108512878, + "learning_rate": 3.6322990471536774e-05, + "loss": 0.7771, + "step": 28000 + }, + { + "epoch": 0.2745883617530659, + "grad_norm": 1.1130329370498657, + "learning_rate": 3.6274126557537255e-05, + "loss": 0.7476, + "step": 28100 + }, + { + "epoch": 0.2755655445351053, + "grad_norm": 0.7601907253265381, + "learning_rate": 3.622526264353775e-05, + "loss": 0.7745, + "step": 28200 + }, + { + "epoch": 0.2765427273171447, + "grad_norm": 0.8511783480644226, + "learning_rate": 3.617639872953824e-05, + "loss": 0.7737, + "step": 28300 + }, + { + "epoch": 0.27751991009918403, + "grad_norm": 0.8136917948722839, + "learning_rate": 3.6127534815538726e-05, + "loss": 0.7905, + "step": 28400 + }, + { + "epoch": 0.27849709288122343, + "grad_norm": 0.5580685138702393, + "learning_rate": 3.6078670901539214e-05, + "loss": 0.7957, + "step": 28500 + }, + { + "epoch": 0.27947427566326283, + "grad_norm": 0.750845730304718, + "learning_rate": 3.60298069875397e-05, + "loss": 0.7396, + "step": 28600 + }, + { + "epoch": 0.28045145844530217, + "grad_norm": 0.9611383080482483, + "learning_rate": 3.598094307354019e-05, + "loss": 0.774, + "step": 28700 + }, + { + "epoch": 0.28142864122734157, + "grad_norm": 0.6622794270515442, + "learning_rate": 3.5932079159540685e-05, + "loss": 0.7993, + "step": 28800 + }, + { + "epoch": 0.28240582400938097, + "grad_norm": 0.4816977381706238, + "learning_rate": 3.588321524554117e-05, + "loss": 0.7868, + "step": 28900 + }, + { + "epoch": 0.2833830067914203, + "grad_norm": 0.6779691576957703, + "learning_rate": 3.583435133154166e-05, + "loss": 0.7838, + "step": 29000 + }, + { + "epoch": 0.2843601895734597, + "grad_norm": 0.9714117646217346, + "learning_rate": 3.578548741754214e-05, + "loss": 0.7686, + "step": 29100 + }, + { + "epoch": 0.2853373723554991, + "grad_norm": 0.7163410186767578, + "learning_rate": 3.573662350354264e-05, + "loss": 0.7747, + "step": 29200 + }, + { + "epoch": 0.28631455513753845, + "grad_norm": 0.7338354587554932, + "learning_rate": 3.5687759589543126e-05, + "loss": 0.7703, + "step": 29300 + }, + { + "epoch": 0.28729173791957785, + "grad_norm": 0.765074610710144, + "learning_rate": 3.5638895675543614e-05, + "loss": 0.7811, + "step": 29400 + }, + { + "epoch": 0.28826892070161725, + "grad_norm": 0.6714346408843994, + "learning_rate": 3.55900317615441e-05, + "loss": 0.7971, + "step": 29500 + }, + { + "epoch": 0.2892461034836566, + "grad_norm": 0.6784923672676086, + "learning_rate": 3.554116784754459e-05, + "loss": 0.7704, + "step": 29600 + }, + { + "epoch": 0.290223286265696, + "grad_norm": 0.6446245312690735, + "learning_rate": 3.549230393354508e-05, + "loss": 0.7843, + "step": 29700 + }, + { + "epoch": 0.2912004690477354, + "grad_norm": 0.9739934206008911, + "learning_rate": 3.5443440019545566e-05, + "loss": 0.7423, + "step": 29800 + }, + { + "epoch": 0.2921776518297748, + "grad_norm": 0.2898177206516266, + "learning_rate": 3.5394576105546054e-05, + "loss": 0.7322, + "step": 29900 + }, + { + "epoch": 0.29315483461181413, + "grad_norm": 0.720974862575531, + "learning_rate": 3.534571219154654e-05, + "loss": 0.7593, + "step": 30000 + }, + { + "epoch": 0.29413201739385353, + "grad_norm": 0.4672446548938751, + "learning_rate": 3.529684827754704e-05, + "loss": 0.7422, + "step": 30100 + }, + { + "epoch": 0.2951092001758929, + "grad_norm": 
0.7546716332435608, + "learning_rate": 3.524798436354752e-05, + "loss": 0.7788, + "step": 30200 + }, + { + "epoch": 0.29608638295793227, + "grad_norm": 0.6265705823898315, + "learning_rate": 3.519912044954801e-05, + "loss": 0.745, + "step": 30300 + }, + { + "epoch": 0.29706356573997167, + "grad_norm": 1.092965841293335, + "learning_rate": 3.51502565355485e-05, + "loss": 0.789, + "step": 30400 + }, + { + "epoch": 0.29804074852201107, + "grad_norm": 0.7648272514343262, + "learning_rate": 3.510139262154899e-05, + "loss": 0.758, + "step": 30500 + }, + { + "epoch": 0.2990179313040504, + "grad_norm": 0.785746157169342, + "learning_rate": 3.505252870754948e-05, + "loss": 0.7744, + "step": 30600 + }, + { + "epoch": 0.2999951140860898, + "grad_norm": 0.8007264733314514, + "learning_rate": 3.5003664793549965e-05, + "loss": 0.7696, + "step": 30700 + }, + { + "epoch": 0.3009722968681292, + "grad_norm": 1.1369248628616333, + "learning_rate": 3.4954800879550453e-05, + "loss": 0.7667, + "step": 30800 + }, + { + "epoch": 0.30194947965016855, + "grad_norm": 0.6251523494720459, + "learning_rate": 3.490593696555095e-05, + "loss": 0.7686, + "step": 30900 + }, + { + "epoch": 0.30292666243220795, + "grad_norm": 1.1552335023880005, + "learning_rate": 3.485707305155143e-05, + "loss": 0.7693, + "step": 31000 + }, + { + "epoch": 0.30390384521424735, + "grad_norm": 0.9136368036270142, + "learning_rate": 3.480820913755192e-05, + "loss": 0.7898, + "step": 31100 + }, + { + "epoch": 0.3048810279962867, + "grad_norm": 0.4203650951385498, + "learning_rate": 3.4759345223552406e-05, + "loss": 0.7541, + "step": 31200 + }, + { + "epoch": 0.3058582107783261, + "grad_norm": 0.671546995639801, + "learning_rate": 3.4710481309552894e-05, + "loss": 0.735, + "step": 31300 + }, + { + "epoch": 0.3068353935603655, + "grad_norm": 0.6711509227752686, + "learning_rate": 3.466161739555339e-05, + "loss": 0.7481, + "step": 31400 + }, + { + "epoch": 0.30781257634240483, + "grad_norm": 0.7787076234817505, + "learning_rate": 3.461275348155387e-05, + "loss": 0.7701, + "step": 31500 + }, + { + "epoch": 0.30878975912444423, + "grad_norm": 0.5270808935165405, + "learning_rate": 3.4563889567554365e-05, + "loss": 0.7166, + "step": 31600 + }, + { + "epoch": 0.30976694190648363, + "grad_norm": 0.7732633352279663, + "learning_rate": 3.451502565355485e-05, + "loss": 0.7857, + "step": 31700 + }, + { + "epoch": 0.31074412468852297, + "grad_norm": 0.6347182989120483, + "learning_rate": 3.446616173955534e-05, + "loss": 0.7384, + "step": 31800 + }, + { + "epoch": 0.31172130747056237, + "grad_norm": 0.9557164311408997, + "learning_rate": 3.441729782555583e-05, + "loss": 0.755, + "step": 31900 + }, + { + "epoch": 0.31269849025260177, + "grad_norm": 0.8120887279510498, + "learning_rate": 3.436843391155632e-05, + "loss": 0.7356, + "step": 32000 + }, + { + "epoch": 0.3136756730346411, + "grad_norm": 0.6804450750350952, + "learning_rate": 3.4319569997556805e-05, + "loss": 0.785, + "step": 32100 + }, + { + "epoch": 0.3146528558166805, + "grad_norm": 0.7511081695556641, + "learning_rate": 3.42707060835573e-05, + "loss": 0.7427, + "step": 32200 + }, + { + "epoch": 0.3156300385987199, + "grad_norm": 0.8396822214126587, + "learning_rate": 3.422184216955778e-05, + "loss": 0.7801, + "step": 32300 + }, + { + "epoch": 0.31660722138075925, + "grad_norm": 1.0063520669937134, + "learning_rate": 3.4172978255558276e-05, + "loss": 0.7638, + "step": 32400 + }, + { + "epoch": 0.31758440416279865, + "grad_norm": 1.349414587020874, + "learning_rate": 3.412411434155876e-05, + "loss": 
0.7522, + "step": 32500 + }, + { + "epoch": 0.31856158694483805, + "grad_norm": 0.8259103298187256, + "learning_rate": 3.4075250427559245e-05, + "loss": 0.7351, + "step": 32600 + }, + { + "epoch": 0.3195387697268774, + "grad_norm": 0.4894813597202301, + "learning_rate": 3.402638651355974e-05, + "loss": 0.7593, + "step": 32700 + }, + { + "epoch": 0.3205159525089168, + "grad_norm": 0.6558930277824402, + "learning_rate": 3.397752259956022e-05, + "loss": 0.7496, + "step": 32800 + }, + { + "epoch": 0.3214931352909562, + "grad_norm": 1.2009482383728027, + "learning_rate": 3.3928658685560716e-05, + "loss": 0.7379, + "step": 32900 + }, + { + "epoch": 0.32247031807299553, + "grad_norm": 0.8621765375137329, + "learning_rate": 3.3879794771561204e-05, + "loss": 0.7381, + "step": 33000 + }, + { + "epoch": 0.32344750085503493, + "grad_norm": 0.5097255706787109, + "learning_rate": 3.383093085756169e-05, + "loss": 0.7567, + "step": 33100 + }, + { + "epoch": 0.32442468363707433, + "grad_norm": 0.48458051681518555, + "learning_rate": 3.378206694356218e-05, + "loss": 0.7649, + "step": 33200 + }, + { + "epoch": 0.3254018664191137, + "grad_norm": 0.7467001676559448, + "learning_rate": 3.373320302956267e-05, + "loss": 0.7612, + "step": 33300 + }, + { + "epoch": 0.32637904920115307, + "grad_norm": 1.1591566801071167, + "learning_rate": 3.368433911556316e-05, + "loss": 0.7394, + "step": 33400 + }, + { + "epoch": 0.32735623198319247, + "grad_norm": 0.9665714502334595, + "learning_rate": 3.363547520156365e-05, + "loss": 0.7472, + "step": 33500 + }, + { + "epoch": 0.3283334147652318, + "grad_norm": 0.5714060664176941, + "learning_rate": 3.358661128756413e-05, + "loss": 0.7385, + "step": 33600 + }, + { + "epoch": 0.3293105975472712, + "grad_norm": 0.8278976082801819, + "learning_rate": 3.353774737356463e-05, + "loss": 0.724, + "step": 33700 + }, + { + "epoch": 0.3302877803293106, + "grad_norm": 0.9210988283157349, + "learning_rate": 3.3488883459565116e-05, + "loss": 0.7542, + "step": 33800 + }, + { + "epoch": 0.33126496311134995, + "grad_norm": 1.0610690116882324, + "learning_rate": 3.3440019545565604e-05, + "loss": 0.7284, + "step": 33900 + }, + { + "epoch": 0.33224214589338935, + "grad_norm": 0.6521257162094116, + "learning_rate": 3.339115563156609e-05, + "loss": 0.755, + "step": 34000 + }, + { + "epoch": 0.33321932867542875, + "grad_norm": 1.0515367984771729, + "learning_rate": 3.334229171756657e-05, + "loss": 0.7423, + "step": 34100 + }, + { + "epoch": 0.3341965114574681, + "grad_norm": 0.8415219783782959, + "learning_rate": 3.329342780356707e-05, + "loss": 0.716, + "step": 34200 + }, + { + "epoch": 0.3351736942395075, + "grad_norm": 0.5018264651298523, + "learning_rate": 3.3244563889567556e-05, + "loss": 0.7556, + "step": 34300 + }, + { + "epoch": 0.3361508770215469, + "grad_norm": 0.6532925963401794, + "learning_rate": 3.3195699975568044e-05, + "loss": 0.7335, + "step": 34400 + }, + { + "epoch": 0.3371280598035863, + "grad_norm": 0.6794486045837402, + "learning_rate": 3.314683606156853e-05, + "loss": 0.7466, + "step": 34500 + }, + { + "epoch": 0.33810524258562563, + "grad_norm": 0.7372865080833435, + "learning_rate": 3.309797214756902e-05, + "loss": 0.727, + "step": 34600 + }, + { + "epoch": 0.33908242536766503, + "grad_norm": 0.6354756355285645, + "learning_rate": 3.304910823356951e-05, + "loss": 0.725, + "step": 34700 + }, + { + "epoch": 0.34005960814970443, + "grad_norm": 0.7180996537208557, + "learning_rate": 3.300024431957e-05, + "loss": 0.7049, + "step": 34800 + }, + { + "epoch": 0.34103679093174377, + 
"grad_norm": 1.3991978168487549, + "learning_rate": 3.2951380405570484e-05, + "loss": 0.7251, + "step": 34900 + }, + { + "epoch": 0.34201397371378317, + "grad_norm": 0.5680633783340454, + "learning_rate": 3.290251649157098e-05, + "loss": 0.744, + "step": 35000 + }, + { + "epoch": 0.34299115649582257, + "grad_norm": 0.5309197306632996, + "learning_rate": 3.285365257757147e-05, + "loss": 0.7277, + "step": 35100 + }, + { + "epoch": 0.3439683392778619, + "grad_norm": 1.449625849723816, + "learning_rate": 3.2804788663571955e-05, + "loss": 0.7127, + "step": 35200 + }, + { + "epoch": 0.3449455220599013, + "grad_norm": 0.6244996190071106, + "learning_rate": 3.2755924749572443e-05, + "loss": 0.6992, + "step": 35300 + }, + { + "epoch": 0.3459227048419407, + "grad_norm": 1.037988305091858, + "learning_rate": 3.270706083557293e-05, + "loss": 0.7095, + "step": 35400 + }, + { + "epoch": 0.34689988762398005, + "grad_norm": 1.2503726482391357, + "learning_rate": 3.265819692157342e-05, + "loss": 0.7264, + "step": 35500 + }, + { + "epoch": 0.34787707040601945, + "grad_norm": 1.2136774063110352, + "learning_rate": 3.260933300757391e-05, + "loss": 0.7418, + "step": 35600 + }, + { + "epoch": 0.34885425318805885, + "grad_norm": 0.9328750371932983, + "learning_rate": 3.2560469093574396e-05, + "loss": 0.7509, + "step": 35700 + }, + { + "epoch": 0.3498314359700982, + "grad_norm": 0.5122935771942139, + "learning_rate": 3.2511605179574884e-05, + "loss": 0.7114, + "step": 35800 + }, + { + "epoch": 0.3508086187521376, + "grad_norm": 1.153583288192749, + "learning_rate": 3.246274126557537e-05, + "loss": 0.7316, + "step": 35900 + }, + { + "epoch": 0.351785801534177, + "grad_norm": 0.7405250668525696, + "learning_rate": 3.241387735157586e-05, + "loss": 0.7404, + "step": 36000 + }, + { + "epoch": 0.35276298431621633, + "grad_norm": 0.607565701007843, + "learning_rate": 3.2365013437576355e-05, + "loss": 0.7196, + "step": 36100 + }, + { + "epoch": 0.35374016709825573, + "grad_norm": 1.4975577592849731, + "learning_rate": 3.2316149523576836e-05, + "loss": 0.703, + "step": 36200 + }, + { + "epoch": 0.35471734988029513, + "grad_norm": 0.9088447093963623, + "learning_rate": 3.226728560957733e-05, + "loss": 0.7203, + "step": 36300 + }, + { + "epoch": 0.3556945326623345, + "grad_norm": 0.9132680892944336, + "learning_rate": 3.221842169557782e-05, + "loss": 0.7248, + "step": 36400 + }, + { + "epoch": 0.35667171544437387, + "grad_norm": 0.7861882448196411, + "learning_rate": 3.216955778157831e-05, + "loss": 0.7118, + "step": 36500 + }, + { + "epoch": 0.35764889822641327, + "grad_norm": 1.2251768112182617, + "learning_rate": 3.2120693867578795e-05, + "loss": 0.7304, + "step": 36600 + }, + { + "epoch": 0.3586260810084526, + "grad_norm": 1.1924370527267456, + "learning_rate": 3.207182995357928e-05, + "loss": 0.7394, + "step": 36700 + }, + { + "epoch": 0.359603263790492, + "grad_norm": 0.7275030016899109, + "learning_rate": 3.202296603957977e-05, + "loss": 0.7399, + "step": 36800 + }, + { + "epoch": 0.3605804465725314, + "grad_norm": 0.7406324148178101, + "learning_rate": 3.1974102125580266e-05, + "loss": 0.7432, + "step": 36900 + }, + { + "epoch": 0.36155762935457075, + "grad_norm": 1.0701793432235718, + "learning_rate": 3.192523821158075e-05, + "loss": 0.7099, + "step": 37000 + }, + { + "epoch": 0.36253481213661015, + "grad_norm": 0.7077426314353943, + "learning_rate": 3.1876374297581235e-05, + "loss": 0.7127, + "step": 37100 + }, + { + "epoch": 0.36351199491864955, + "grad_norm": 0.5806621313095093, + "learning_rate": 
3.1827510383581723e-05, + "loss": 0.7002, + "step": 37200 + }, + { + "epoch": 0.3644891777006889, + "grad_norm": 1.1311944723129272, + "learning_rate": 3.177864646958221e-05, + "loss": 0.6876, + "step": 37300 + }, + { + "epoch": 0.3654663604827283, + "grad_norm": 0.9112023711204529, + "learning_rate": 3.1729782555582706e-05, + "loss": 0.7169, + "step": 37400 + }, + { + "epoch": 0.3664435432647677, + "grad_norm": 0.5986848473548889, + "learning_rate": 3.168091864158319e-05, + "loss": 0.7294, + "step": 37500 + }, + { + "epoch": 0.36742072604680703, + "grad_norm": 1.297155737876892, + "learning_rate": 3.163205472758368e-05, + "loss": 0.7061, + "step": 37600 + }, + { + "epoch": 0.36839790882884643, + "grad_norm": 0.6597927808761597, + "learning_rate": 3.158319081358417e-05, + "loss": 0.7166, + "step": 37700 + }, + { + "epoch": 0.36937509161088583, + "grad_norm": 0.36105087399482727, + "learning_rate": 3.153432689958466e-05, + "loss": 0.7017, + "step": 37800 + }, + { + "epoch": 0.3703522743929252, + "grad_norm": 0.5487505197525024, + "learning_rate": 3.148546298558515e-05, + "loss": 0.7081, + "step": 37900 + }, + { + "epoch": 0.3713294571749646, + "grad_norm": 1.5384310483932495, + "learning_rate": 3.1436599071585635e-05, + "loss": 0.7064, + "step": 38000 + }, + { + "epoch": 0.37230663995700397, + "grad_norm": 1.0113205909729004, + "learning_rate": 3.138773515758612e-05, + "loss": 0.7197, + "step": 38100 + }, + { + "epoch": 0.3732838227390433, + "grad_norm": 1.4755492210388184, + "learning_rate": 3.133887124358662e-05, + "loss": 0.755, + "step": 38200 + }, + { + "epoch": 0.3742610055210827, + "grad_norm": 0.7554188370704651, + "learning_rate": 3.12900073295871e-05, + "loss": 0.7083, + "step": 38300 + }, + { + "epoch": 0.3752381883031221, + "grad_norm": 0.7589747905731201, + "learning_rate": 3.1241143415587594e-05, + "loss": 0.6917, + "step": 38400 + }, + { + "epoch": 0.37621537108516145, + "grad_norm": 0.485612690448761, + "learning_rate": 3.119227950158808e-05, + "loss": 0.7429, + "step": 38500 + }, + { + "epoch": 0.37719255386720085, + "grad_norm": 0.5043421983718872, + "learning_rate": 3.114341558758856e-05, + "loss": 0.7217, + "step": 38600 + }, + { + "epoch": 0.37816973664924025, + "grad_norm": 1.6078003644943237, + "learning_rate": 3.109455167358906e-05, + "loss": 0.7019, + "step": 38700 + }, + { + "epoch": 0.3791469194312796, + "grad_norm": 0.3607342839241028, + "learning_rate": 3.104568775958954e-05, + "loss": 0.772, + "step": 38800 + }, + { + "epoch": 0.380124102213319, + "grad_norm": 1.002525806427002, + "learning_rate": 3.0996823845590034e-05, + "loss": 0.7213, + "step": 38900 + }, + { + "epoch": 0.3811012849953584, + "grad_norm": 0.7605811357498169, + "learning_rate": 3.094795993159052e-05, + "loss": 0.7, + "step": 39000 + }, + { + "epoch": 0.3820784677773978, + "grad_norm": 2.388939619064331, + "learning_rate": 3.089909601759101e-05, + "loss": 0.7307, + "step": 39100 + }, + { + "epoch": 0.38305565055943713, + "grad_norm": 0.824883222579956, + "learning_rate": 3.08502321035915e-05, + "loss": 0.7255, + "step": 39200 + }, + { + "epoch": 0.38403283334147653, + "grad_norm": 0.6755787134170532, + "learning_rate": 3.0801368189591986e-05, + "loss": 0.7013, + "step": 39300 + }, + { + "epoch": 0.38501001612351593, + "grad_norm": 0.580859899520874, + "learning_rate": 3.0752504275592474e-05, + "loss": 0.7357, + "step": 39400 + }, + { + "epoch": 0.3859871989055553, + "grad_norm": 0.6988548636436462, + "learning_rate": 3.070364036159297e-05, + "loss": 0.6902, + "step": 39500 + }, + { + "epoch": 
0.38696438168759467, + "grad_norm": 0.5997043251991272, + "learning_rate": 3.065477644759345e-05, + "loss": 0.7093, + "step": 39600 + }, + { + "epoch": 0.38794156446963407, + "grad_norm": 0.7906262874603271, + "learning_rate": 3.0605912533593945e-05, + "loss": 0.7376, + "step": 39700 + }, + { + "epoch": 0.3889187472516734, + "grad_norm": 0.7436035871505737, + "learning_rate": 3.0557048619594433e-05, + "loss": 0.7159, + "step": 39800 + }, + { + "epoch": 0.3898959300337128, + "grad_norm": 0.6913009285926819, + "learning_rate": 3.050818470559492e-05, + "loss": 0.7267, + "step": 39900 + }, + { + "epoch": 0.3908731128157522, + "grad_norm": 1.0030348300933838, + "learning_rate": 3.045932079159541e-05, + "loss": 0.7186, + "step": 40000 + }, + { + "epoch": 0.39185029559779155, + "grad_norm": 0.7223851084709167, + "learning_rate": 3.0410456877595894e-05, + "loss": 0.7113, + "step": 40100 + }, + { + "epoch": 0.39282747837983095, + "grad_norm": 1.0449798107147217, + "learning_rate": 3.0361592963596386e-05, + "loss": 0.6985, + "step": 40200 + }, + { + "epoch": 0.39380466116187035, + "grad_norm": 0.7078452110290527, + "learning_rate": 3.031272904959687e-05, + "loss": 0.714, + "step": 40300 + }, + { + "epoch": 0.3947818439439097, + "grad_norm": 0.5977550148963928, + "learning_rate": 3.0263865135597362e-05, + "loss": 0.7126, + "step": 40400 + }, + { + "epoch": 0.3957590267259491, + "grad_norm": 0.6963929533958435, + "learning_rate": 3.021500122159785e-05, + "loss": 0.6922, + "step": 40500 + }, + { + "epoch": 0.3967362095079885, + "grad_norm": 0.49735382199287415, + "learning_rate": 3.016613730759834e-05, + "loss": 0.6914, + "step": 40600 + }, + { + "epoch": 0.39771339229002783, + "grad_norm": 0.8894415497779846, + "learning_rate": 3.0117273393598826e-05, + "loss": 0.6988, + "step": 40700 + }, + { + "epoch": 0.39869057507206723, + "grad_norm": 0.5845156311988831, + "learning_rate": 3.0068409479599317e-05, + "loss": 0.705, + "step": 40800 + }, + { + "epoch": 0.39966775785410663, + "grad_norm": 0.7496864199638367, + "learning_rate": 3.0019545565599806e-05, + "loss": 0.669, + "step": 40900 + }, + { + "epoch": 0.400644940636146, + "grad_norm": 1.2446004152297974, + "learning_rate": 2.9970681651600297e-05, + "loss": 0.7063, + "step": 41000 + }, + { + "epoch": 0.4016221234181854, + "grad_norm": 0.37521255016326904, + "learning_rate": 2.992181773760078e-05, + "loss": 0.6966, + "step": 41100 + }, + { + "epoch": 0.40259930620022477, + "grad_norm": 0.7953245639801025, + "learning_rate": 2.9872953823601273e-05, + "loss": 0.6934, + "step": 41200 + }, + { + "epoch": 0.4035764889822641, + "grad_norm": 0.844543993473053, + "learning_rate": 2.982408990960176e-05, + "loss": 0.6926, + "step": 41300 + }, + { + "epoch": 0.4045536717643035, + "grad_norm": 0.5298857688903809, + "learning_rate": 2.9775225995602253e-05, + "loss": 0.6926, + "step": 41400 + }, + { + "epoch": 0.4055308545463429, + "grad_norm": 0.6932188272476196, + "learning_rate": 2.9726362081602737e-05, + "loss": 0.6868, + "step": 41500 + }, + { + "epoch": 0.40650803732838225, + "grad_norm": 0.7204051613807678, + "learning_rate": 2.9677498167603225e-05, + "loss": 0.7064, + "step": 41600 + }, + { + "epoch": 0.40748522011042165, + "grad_norm": 1.0420963764190674, + "learning_rate": 2.9628634253603717e-05, + "loss": 0.7072, + "step": 41700 + }, + { + "epoch": 0.40846240289246105, + "grad_norm": 0.4677026867866516, + "learning_rate": 2.95797703396042e-05, + "loss": 0.691, + "step": 41800 + }, + { + "epoch": 0.4094395856745004, + "grad_norm": 0.6934903860092163, + 
"learning_rate": 2.9530906425604693e-05, + "loss": 0.6962, + "step": 41900 + }, + { + "epoch": 0.4104167684565398, + "grad_norm": 0.7500805854797363, + "learning_rate": 2.9482042511605178e-05, + "loss": 0.708, + "step": 42000 + }, + { + "epoch": 0.4113939512385792, + "grad_norm": 0.8887515664100647, + "learning_rate": 2.943317859760567e-05, + "loss": 0.702, + "step": 42100 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 0.39899566769599915, + "learning_rate": 2.9384314683606157e-05, + "loss": 0.709, + "step": 42200 + }, + { + "epoch": 0.41334831680265793, + "grad_norm": 0.8467943668365479, + "learning_rate": 2.933545076960665e-05, + "loss": 0.6928, + "step": 42300 + }, + { + "epoch": 0.41432549958469733, + "grad_norm": 0.6024282574653625, + "learning_rate": 2.9286586855607133e-05, + "loss": 0.6928, + "step": 42400 + }, + { + "epoch": 0.4153026823667367, + "grad_norm": 0.7921658158302307, + "learning_rate": 2.9237722941607625e-05, + "loss": 0.6865, + "step": 42500 + }, + { + "epoch": 0.4162798651487761, + "grad_norm": 0.9025784730911255, + "learning_rate": 2.9188859027608113e-05, + "loss": 0.6863, + "step": 42600 + }, + { + "epoch": 0.4172570479308155, + "grad_norm": 0.9453756809234619, + "learning_rate": 2.9139995113608604e-05, + "loss": 0.6924, + "step": 42700 + }, + { + "epoch": 0.4182342307128548, + "grad_norm": 0.8638947010040283, + "learning_rate": 2.909113119960909e-05, + "loss": 0.7011, + "step": 42800 + }, + { + "epoch": 0.4192114134948942, + "grad_norm": 0.6639747619628906, + "learning_rate": 2.904226728560958e-05, + "loss": 0.6766, + "step": 42900 + }, + { + "epoch": 0.4201885962769336, + "grad_norm": 0.7019941210746765, + "learning_rate": 2.899340337161007e-05, + "loss": 0.7025, + "step": 43000 + }, + { + "epoch": 0.42116577905897296, + "grad_norm": 0.6988587379455566, + "learning_rate": 2.8944539457610553e-05, + "loss": 0.6768, + "step": 43100 + }, + { + "epoch": 0.42214296184101235, + "grad_norm": 0.5817476511001587, + "learning_rate": 2.8895675543611045e-05, + "loss": 0.702, + "step": 43200 + }, + { + "epoch": 0.42312014462305175, + "grad_norm": 0.4533466398715973, + "learning_rate": 2.8846811629611533e-05, + "loss": 0.6949, + "step": 43300 + }, + { + "epoch": 0.4240973274050911, + "grad_norm": 0.6197069883346558, + "learning_rate": 2.8797947715612024e-05, + "loss": 0.684, + "step": 43400 + }, + { + "epoch": 0.4250745101871305, + "grad_norm": 1.693144679069519, + "learning_rate": 2.874908380161251e-05, + "loss": 0.7201, + "step": 43500 + }, + { + "epoch": 0.4260516929691699, + "grad_norm": 1.1772024631500244, + "learning_rate": 2.8700219887613e-05, + "loss": 0.6936, + "step": 43600 + }, + { + "epoch": 0.4270288757512093, + "grad_norm": 0.5265709161758423, + "learning_rate": 2.8651355973613485e-05, + "loss": 0.6994, + "step": 43700 + }, + { + "epoch": 0.42800605853324863, + "grad_norm": 0.8301248550415039, + "learning_rate": 2.8602492059613976e-05, + "loss": 0.6968, + "step": 43800 + }, + { + "epoch": 0.42898324131528803, + "grad_norm": 1.2123380899429321, + "learning_rate": 2.8553628145614464e-05, + "loss": 0.7013, + "step": 43900 + }, + { + "epoch": 0.42996042409732743, + "grad_norm": 1.3780418634414673, + "learning_rate": 2.8504764231614956e-05, + "loss": 0.6826, + "step": 44000 + }, + { + "epoch": 0.4309376068793668, + "grad_norm": 0.6333886981010437, + "learning_rate": 2.845590031761544e-05, + "loss": 0.6842, + "step": 44100 + }, + { + "epoch": 0.4319147896614062, + "grad_norm": 0.5353469252586365, + "learning_rate": 2.8407036403615932e-05, + "loss": 0.6751, + 
"step": 44200 + }, + { + "epoch": 0.43289197244344557, + "grad_norm": 0.9482343792915344, + "learning_rate": 2.835817248961642e-05, + "loss": 0.6961, + "step": 44300 + }, + { + "epoch": 0.4338691552254849, + "grad_norm": 0.7306164503097534, + "learning_rate": 2.830930857561691e-05, + "loss": 0.6829, + "step": 44400 + }, + { + "epoch": 0.4348463380075243, + "grad_norm": 0.9290406107902527, + "learning_rate": 2.8260444661617396e-05, + "loss": 0.7109, + "step": 44500 + }, + { + "epoch": 0.4358235207895637, + "grad_norm": 0.5903436541557312, + "learning_rate": 2.8211580747617884e-05, + "loss": 0.7144, + "step": 44600 + }, + { + "epoch": 0.43680070357160306, + "grad_norm": 0.7370823621749878, + "learning_rate": 2.8162716833618376e-05, + "loss": 0.6858, + "step": 44700 + }, + { + "epoch": 0.43777788635364245, + "grad_norm": 0.5477197766304016, + "learning_rate": 2.811385291961886e-05, + "loss": 0.6951, + "step": 44800 + }, + { + "epoch": 0.43875506913568185, + "grad_norm": 0.8994666934013367, + "learning_rate": 2.8064989005619352e-05, + "loss": 0.705, + "step": 44900 + }, + { + "epoch": 0.4397322519177212, + "grad_norm": 1.171186089515686, + "learning_rate": 2.8016125091619836e-05, + "loss": 0.6812, + "step": 45000 + }, + { + "epoch": 0.4407094346997606, + "grad_norm": 0.6986414194107056, + "learning_rate": 2.796726117762033e-05, + "loss": 0.6729, + "step": 45100 + }, + { + "epoch": 0.4416866174818, + "grad_norm": 0.8245409727096558, + "learning_rate": 2.7918397263620816e-05, + "loss": 0.6679, + "step": 45200 + }, + { + "epoch": 0.44266380026383934, + "grad_norm": 0.8805913925170898, + "learning_rate": 2.7869533349621307e-05, + "loss": 0.7042, + "step": 45300 + }, + { + "epoch": 0.44364098304587873, + "grad_norm": 0.7037094831466675, + "learning_rate": 2.7820669435621792e-05, + "loss": 0.6988, + "step": 45400 + }, + { + "epoch": 0.44461816582791813, + "grad_norm": 1.118363380432129, + "learning_rate": 2.7771805521622284e-05, + "loss": 0.6866, + "step": 45500 + }, + { + "epoch": 0.4455953486099575, + "grad_norm": 1.0665768384933472, + "learning_rate": 2.772294160762277e-05, + "loss": 0.6732, + "step": 45600 + }, + { + "epoch": 0.4465725313919969, + "grad_norm": 0.7593882083892822, + "learning_rate": 2.7674077693623263e-05, + "loss": 0.6951, + "step": 45700 + }, + { + "epoch": 0.4475497141740363, + "grad_norm": 2.3182179927825928, + "learning_rate": 2.7625213779623748e-05, + "loss": 0.6695, + "step": 45800 + }, + { + "epoch": 0.4485268969560756, + "grad_norm": 1.2548315525054932, + "learning_rate": 2.757634986562424e-05, + "loss": 0.7128, + "step": 45900 + }, + { + "epoch": 0.449504079738115, + "grad_norm": 0.8613176941871643, + "learning_rate": 2.7527485951624727e-05, + "loss": 0.6956, + "step": 46000 + }, + { + "epoch": 0.4504812625201544, + "grad_norm": 0.946165919303894, + "learning_rate": 2.7478622037625212e-05, + "loss": 0.7177, + "step": 46100 + }, + { + "epoch": 0.45145844530219376, + "grad_norm": 0.9122072458267212, + "learning_rate": 2.7429758123625703e-05, + "loss": 0.7094, + "step": 46200 + }, + { + "epoch": 0.45243562808423315, + "grad_norm": 0.8797391057014465, + "learning_rate": 2.738089420962619e-05, + "loss": 0.7118, + "step": 46300 + }, + { + "epoch": 0.45341281086627255, + "grad_norm": 0.5321417450904846, + "learning_rate": 2.7332030295626683e-05, + "loss": 0.6923, + "step": 46400 + }, + { + "epoch": 0.4543899936483119, + "grad_norm": 1.0878016948699951, + "learning_rate": 2.7283166381627168e-05, + "loss": 0.72, + "step": 46500 + }, + { + "epoch": 0.4553671764303513, + 
"grad_norm": 0.8534865975379944, + "learning_rate": 2.723430246762766e-05, + "loss": 0.6945, + "step": 46600 + }, + { + "epoch": 0.4563443592123907, + "grad_norm": 0.8475703597068787, + "learning_rate": 2.7185438553628144e-05, + "loss": 0.6891, + "step": 46700 + }, + { + "epoch": 0.45732154199443004, + "grad_norm": 0.7100959420204163, + "learning_rate": 2.713657463962864e-05, + "loss": 0.6605, + "step": 46800 + }, + { + "epoch": 0.45829872477646943, + "grad_norm": 0.6616931557655334, + "learning_rate": 2.7087710725629123e-05, + "loss": 0.6678, + "step": 46900 + }, + { + "epoch": 0.45927590755850883, + "grad_norm": 1.2114359140396118, + "learning_rate": 2.7038846811629615e-05, + "loss": 0.6525, + "step": 47000 + }, + { + "epoch": 0.4602530903405482, + "grad_norm": 0.4216634929180145, + "learning_rate": 2.69899828976301e-05, + "loss": 0.6881, + "step": 47100 + }, + { + "epoch": 0.4612302731225876, + "grad_norm": 0.7598534822463989, + "learning_rate": 2.694111898363059e-05, + "loss": 0.6555, + "step": 47200 + }, + { + "epoch": 0.462207455904627, + "grad_norm": 0.9792212843894958, + "learning_rate": 2.689225506963108e-05, + "loss": 0.6866, + "step": 47300 + }, + { + "epoch": 0.4631846386866663, + "grad_norm": 0.5867584943771362, + "learning_rate": 2.684339115563157e-05, + "loss": 0.6541, + "step": 47400 + }, + { + "epoch": 0.4641618214687057, + "grad_norm": 0.8288137912750244, + "learning_rate": 2.6794527241632055e-05, + "loss": 0.7057, + "step": 47500 + }, + { + "epoch": 0.4651390042507451, + "grad_norm": 1.5305638313293457, + "learning_rate": 2.6745663327632543e-05, + "loss": 0.6752, + "step": 47600 + }, + { + "epoch": 0.46611618703278446, + "grad_norm": 1.0784820318222046, + "learning_rate": 2.6696799413633035e-05, + "loss": 0.7041, + "step": 47700 + }, + { + "epoch": 0.46709336981482386, + "grad_norm": 0.7708161473274231, + "learning_rate": 2.664793549963352e-05, + "loss": 0.6766, + "step": 47800 + }, + { + "epoch": 0.46807055259686325, + "grad_norm": 0.7639223337173462, + "learning_rate": 2.659907158563401e-05, + "loss": 0.6553, + "step": 47900 + }, + { + "epoch": 0.4690477353789026, + "grad_norm": 0.4256194233894348, + "learning_rate": 2.65502076716345e-05, + "loss": 0.6921, + "step": 48000 + }, + { + "epoch": 0.470024918160942, + "grad_norm": 1.2620900869369507, + "learning_rate": 2.650134375763499e-05, + "loss": 0.6945, + "step": 48100 + }, + { + "epoch": 0.4710021009429814, + "grad_norm": 0.7683165073394775, + "learning_rate": 2.6452479843635475e-05, + "loss": 0.6594, + "step": 48200 + }, + { + "epoch": 0.4719792837250208, + "grad_norm": 0.784582257270813, + "learning_rate": 2.6403615929635966e-05, + "loss": 0.6877, + "step": 48300 + }, + { + "epoch": 0.47295646650706014, + "grad_norm": 0.7894740104675293, + "learning_rate": 2.635475201563645e-05, + "loss": 0.6944, + "step": 48400 + }, + { + "epoch": 0.47393364928909953, + "grad_norm": 0.6949831247329712, + "learning_rate": 2.6305888101636942e-05, + "loss": 0.6625, + "step": 48500 + }, + { + "epoch": 0.47491083207113893, + "grad_norm": 0.5648496747016907, + "learning_rate": 2.625702418763743e-05, + "loss": 0.6489, + "step": 48600 + }, + { + "epoch": 0.4758880148531783, + "grad_norm": 0.8879817128181458, + "learning_rate": 2.6208160273637922e-05, + "loss": 0.6465, + "step": 48700 + }, + { + "epoch": 0.4768651976352177, + "grad_norm": 0.5845817923545837, + "learning_rate": 2.6159296359638407e-05, + "loss": 0.7044, + "step": 48800 + }, + { + "epoch": 0.4778423804172571, + "grad_norm": 0.8040775060653687, + "learning_rate": 
2.6110432445638898e-05, + "loss": 0.6745, + "step": 48900 + }, + { + "epoch": 0.4788195631992964, + "grad_norm": 0.5439351201057434, + "learning_rate": 2.6061568531639386e-05, + "loss": 0.6924, + "step": 49000 + }, + { + "epoch": 0.4797967459813358, + "grad_norm": 1.1411272287368774, + "learning_rate": 2.601270461763987e-05, + "loss": 0.6834, + "step": 49100 + }, + { + "epoch": 0.4807739287633752, + "grad_norm": 0.7273046374320984, + "learning_rate": 2.5963840703640362e-05, + "loss": 0.6547, + "step": 49200 + }, + { + "epoch": 0.48175111154541456, + "grad_norm": 0.9065064787864685, + "learning_rate": 2.591497678964085e-05, + "loss": 0.6792, + "step": 49300 + }, + { + "epoch": 0.48272829432745396, + "grad_norm": 0.6722708344459534, + "learning_rate": 2.5866112875641342e-05, + "loss": 0.6913, + "step": 49400 + }, + { + "epoch": 0.48370547710949335, + "grad_norm": 0.6576828360557556, + "learning_rate": 2.5817248961641826e-05, + "loss": 0.6741, + "step": 49500 + }, + { + "epoch": 0.4846826598915327, + "grad_norm": 0.46869999170303345, + "learning_rate": 2.5768385047642318e-05, + "loss": 0.6729, + "step": 49600 + }, + { + "epoch": 0.4856598426735721, + "grad_norm": 0.735565185546875, + "learning_rate": 2.5719521133642806e-05, + "loss": 0.6781, + "step": 49700 + }, + { + "epoch": 0.4866370254556115, + "grad_norm": 0.6392993927001953, + "learning_rate": 2.5670657219643297e-05, + "loss": 0.6824, + "step": 49800 + }, + { + "epoch": 0.48761420823765084, + "grad_norm": 3.2004761695861816, + "learning_rate": 2.5621793305643782e-05, + "loss": 0.6862, + "step": 49900 + }, + { + "epoch": 0.48859139101969024, + "grad_norm": 0.6201328635215759, + "learning_rate": 2.5572929391644274e-05, + "loss": 0.664, + "step": 50000 + }, + { + "epoch": 0.48956857380172963, + "grad_norm": 1.179991364479065, + "learning_rate": 2.5524065477644758e-05, + "loss": 0.6841, + "step": 50100 + }, + { + "epoch": 0.490545756583769, + "grad_norm": 0.942451000213623, + "learning_rate": 2.547520156364525e-05, + "loss": 0.6555, + "step": 50200 + }, + { + "epoch": 0.4915229393658084, + "grad_norm": 1.1190769672393799, + "learning_rate": 2.5426337649645738e-05, + "loss": 0.673, + "step": 50300 + }, + { + "epoch": 0.4925001221478478, + "grad_norm": 0.712053656578064, + "learning_rate": 2.537747373564623e-05, + "loss": 0.6849, + "step": 50400 + }, + { + "epoch": 0.4934773049298871, + "grad_norm": 1.3936710357666016, + "learning_rate": 2.5328609821646714e-05, + "loss": 0.6751, + "step": 50500 + }, + { + "epoch": 0.4944544877119265, + "grad_norm": 0.5909391045570374, + "learning_rate": 2.5279745907647205e-05, + "loss": 0.683, + "step": 50600 + }, + { + "epoch": 0.4954316704939659, + "grad_norm": 0.8883010149002075, + "learning_rate": 2.5230881993647693e-05, + "loss": 0.6806, + "step": 50700 + }, + { + "epoch": 0.49640885327600526, + "grad_norm": 0.7069185376167297, + "learning_rate": 2.5182018079648178e-05, + "loss": 0.6779, + "step": 50800 + }, + { + "epoch": 0.49738603605804466, + "grad_norm": 0.7906535267829895, + "learning_rate": 2.513315416564867e-05, + "loss": 0.663, + "step": 50900 + }, + { + "epoch": 0.49836321884008405, + "grad_norm": 1.8775051832199097, + "learning_rate": 2.5084290251649158e-05, + "loss": 0.6924, + "step": 51000 + }, + { + "epoch": 0.4993404016221234, + "grad_norm": 0.4028649628162384, + "learning_rate": 2.503542633764965e-05, + "loss": 0.6611, + "step": 51100 + }, + { + "epoch": 0.5003175844041629, + "grad_norm": 0.8514829277992249, + "learning_rate": 2.4986562423650137e-05, + "loss": 0.6423, + "step": 51200 + }, 
+ { + "epoch": 0.5012947671862021, + "grad_norm": 0.5659759044647217, + "learning_rate": 2.4937698509650625e-05, + "loss": 0.6978, + "step": 51300 + }, + { + "epoch": 0.5022719499682415, + "grad_norm": 0.8396779298782349, + "learning_rate": 2.488883459565111e-05, + "loss": 0.6593, + "step": 51400 + }, + { + "epoch": 0.5032491327502809, + "grad_norm": 0.6824951767921448, + "learning_rate": 2.48399706816516e-05, + "loss": 0.6839, + "step": 51500 + }, + { + "epoch": 0.5042263155323203, + "grad_norm": 0.6299941539764404, + "learning_rate": 2.479110676765209e-05, + "loss": 0.6743, + "step": 51600 + }, + { + "epoch": 0.5052034983143597, + "grad_norm": 1.2409921884536743, + "learning_rate": 2.4742242853652577e-05, + "loss": 0.6477, + "step": 51700 + }, + { + "epoch": 0.5061806810963991, + "grad_norm": 0.668393075466156, + "learning_rate": 2.4693378939653065e-05, + "loss": 0.6568, + "step": 51800 + }, + { + "epoch": 0.5071578638784384, + "grad_norm": 0.5376803278923035, + "learning_rate": 2.4644515025653557e-05, + "loss": 0.6476, + "step": 51900 + }, + { + "epoch": 0.5081350466604778, + "grad_norm": 1.710288166999817, + "learning_rate": 2.4595651111654045e-05, + "loss": 0.6404, + "step": 52000 + }, + { + "epoch": 0.5091122294425172, + "grad_norm": 0.6142415404319763, + "learning_rate": 2.4546787197654533e-05, + "loss": 0.7026, + "step": 52100 + }, + { + "epoch": 0.5100894122245566, + "grad_norm": 0.4976397454738617, + "learning_rate": 2.449792328365502e-05, + "loss": 0.6659, + "step": 52200 + }, + { + "epoch": 0.511066595006596, + "grad_norm": 0.8558853268623352, + "learning_rate": 2.4449059369655513e-05, + "loss": 0.67, + "step": 52300 + }, + { + "epoch": 0.5120437777886354, + "grad_norm": 0.620583713054657, + "learning_rate": 2.4400195455656e-05, + "loss": 0.6596, + "step": 52400 + }, + { + "epoch": 0.5130209605706747, + "grad_norm": 0.8520305752754211, + "learning_rate": 2.435133154165649e-05, + "loss": 0.653, + "step": 52500 + }, + { + "epoch": 0.5139981433527141, + "grad_norm": 0.43671169877052307, + "learning_rate": 2.4302467627656977e-05, + "loss": 0.6554, + "step": 52600 + }, + { + "epoch": 0.5149753261347535, + "grad_norm": 0.5502797961235046, + "learning_rate": 2.4253603713657465e-05, + "loss": 0.6432, + "step": 52700 + }, + { + "epoch": 0.5159525089167929, + "grad_norm": 0.918704628944397, + "learning_rate": 2.4204739799657956e-05, + "loss": 0.6604, + "step": 52800 + }, + { + "epoch": 0.5169296916988323, + "grad_norm": 0.44583848118782043, + "learning_rate": 2.415587588565844e-05, + "loss": 0.6736, + "step": 52900 + }, + { + "epoch": 0.5179068744808717, + "grad_norm": 0.8312250971794128, + "learning_rate": 2.410701197165893e-05, + "loss": 0.6645, + "step": 53000 + }, + { + "epoch": 0.518884057262911, + "grad_norm": 0.39499637484550476, + "learning_rate": 2.4058148057659417e-05, + "loss": 0.6876, + "step": 53100 + }, + { + "epoch": 0.5198612400449504, + "grad_norm": 0.5650041699409485, + "learning_rate": 2.400928414365991e-05, + "loss": 0.691, + "step": 53200 + }, + { + "epoch": 0.5208384228269898, + "grad_norm": 0.7247036099433899, + "learning_rate": 2.3960420229660397e-05, + "loss": 0.6367, + "step": 53300 + }, + { + "epoch": 0.5218156056090292, + "grad_norm": 0.8500406742095947, + "learning_rate": 2.3911556315660885e-05, + "loss": 0.6678, + "step": 53400 + }, + { + "epoch": 0.5227927883910686, + "grad_norm": 1.2467963695526123, + "learning_rate": 2.3862692401661373e-05, + "loss": 0.6439, + "step": 53500 + }, + { + "epoch": 0.523769971173108, + "grad_norm": 1.0069133043289185, + 
"learning_rate": 2.3813828487661864e-05, + "loss": 0.6555, + "step": 53600 + }, + { + "epoch": 0.5247471539551473, + "grad_norm": 0.9213836193084717, + "learning_rate": 2.3764964573662352e-05, + "loss": 0.6374, + "step": 53700 + }, + { + "epoch": 0.5257243367371867, + "grad_norm": 0.7063928246498108, + "learning_rate": 2.371610065966284e-05, + "loss": 0.6473, + "step": 53800 + }, + { + "epoch": 0.5267015195192261, + "grad_norm": 0.7876357436180115, + "learning_rate": 2.366723674566333e-05, + "loss": 0.6599, + "step": 53900 + }, + { + "epoch": 0.5276787023012655, + "grad_norm": 0.5371726751327515, + "learning_rate": 2.3618372831663816e-05, + "loss": 0.6569, + "step": 54000 + }, + { + "epoch": 0.5286558850833049, + "grad_norm": 0.6501371264457703, + "learning_rate": 2.3569508917664308e-05, + "loss": 0.6502, + "step": 54100 + }, + { + "epoch": 0.5296330678653443, + "grad_norm": 1.9818251132965088, + "learning_rate": 2.3520645003664796e-05, + "loss": 0.6628, + "step": 54200 + }, + { + "epoch": 0.5306102506473835, + "grad_norm": 0.6198662519454956, + "learning_rate": 2.3471781089665284e-05, + "loss": 0.6771, + "step": 54300 + }, + { + "epoch": 0.5315874334294229, + "grad_norm": 0.70624840259552, + "learning_rate": 2.3422917175665772e-05, + "loss": 0.6685, + "step": 54400 + }, + { + "epoch": 0.5325646162114623, + "grad_norm": 0.5182805061340332, + "learning_rate": 2.337405326166626e-05, + "loss": 0.6651, + "step": 54500 + }, + { + "epoch": 0.5335417989935017, + "grad_norm": 1.0862709283828735, + "learning_rate": 2.3325189347666748e-05, + "loss": 0.668, + "step": 54600 + }, + { + "epoch": 0.5345189817755411, + "grad_norm": 0.5830691456794739, + "learning_rate": 2.3276325433667236e-05, + "loss": 0.67, + "step": 54700 + }, + { + "epoch": 0.5354961645575805, + "grad_norm": 0.5614120960235596, + "learning_rate": 2.3227461519667724e-05, + "loss": 0.6466, + "step": 54800 + }, + { + "epoch": 0.5364733473396198, + "grad_norm": 0.6346180438995361, + "learning_rate": 2.3178597605668216e-05, + "loss": 0.6784, + "step": 54900 + }, + { + "epoch": 0.5374505301216592, + "grad_norm": 0.5453216433525085, + "learning_rate": 2.3129733691668704e-05, + "loss": 0.6507, + "step": 55000 + }, + { + "epoch": 0.5384277129036986, + "grad_norm": 0.8145617246627808, + "learning_rate": 2.3080869777669192e-05, + "loss": 0.6874, + "step": 55100 + }, + { + "epoch": 0.539404895685738, + "grad_norm": 0.8334397673606873, + "learning_rate": 2.303200586366968e-05, + "loss": 0.6772, + "step": 55200 + }, + { + "epoch": 0.5403820784677774, + "grad_norm": 0.5468283295631409, + "learning_rate": 2.298314194967017e-05, + "loss": 0.6448, + "step": 55300 + }, + { + "epoch": 0.5413592612498168, + "grad_norm": 0.8369360566139221, + "learning_rate": 2.293427803567066e-05, + "loss": 0.6593, + "step": 55400 + }, + { + "epoch": 0.5423364440318562, + "grad_norm": 0.498793363571167, + "learning_rate": 2.2885414121671148e-05, + "loss": 0.6236, + "step": 55500 + }, + { + "epoch": 0.5433136268138955, + "grad_norm": 0.6096756458282471, + "learning_rate": 2.2836550207671636e-05, + "loss": 0.6766, + "step": 55600 + }, + { + "epoch": 0.5442908095959349, + "grad_norm": 0.8249727487564087, + "learning_rate": 2.2787686293672124e-05, + "loss": 0.654, + "step": 55700 + }, + { + "epoch": 0.5452679923779743, + "grad_norm": 0.9821385145187378, + "learning_rate": 2.2738822379672615e-05, + "loss": 0.6633, + "step": 55800 + }, + { + "epoch": 0.5462451751600137, + "grad_norm": 1.025420069694519, + "learning_rate": 2.26899584656731e-05, + "loss": 0.6691, + "step": 55900 
+ }, + { + "epoch": 0.5472223579420531, + "grad_norm": 1.1872769594192505, + "learning_rate": 2.2641094551673588e-05, + "loss": 0.6811, + "step": 56000 + }, + { + "epoch": 0.5481995407240925, + "grad_norm": 0.6862273812294006, + "learning_rate": 2.259223063767408e-05, + "loss": 0.6503, + "step": 56100 + }, + { + "epoch": 0.5491767235061318, + "grad_norm": 1.9515796899795532, + "learning_rate": 2.2543366723674567e-05, + "loss": 0.6672, + "step": 56200 + }, + { + "epoch": 0.5501539062881712, + "grad_norm": 1.5116077661514282, + "learning_rate": 2.2494502809675055e-05, + "loss": 0.6714, + "step": 56300 + }, + { + "epoch": 0.5511310890702106, + "grad_norm": 0.710858166217804, + "learning_rate": 2.2445638895675544e-05, + "loss": 0.6577, + "step": 56400 + }, + { + "epoch": 0.55210827185225, + "grad_norm": 0.6870605945587158, + "learning_rate": 2.239677498167603e-05, + "loss": 0.6655, + "step": 56500 + }, + { + "epoch": 0.5530854546342894, + "grad_norm": 0.802883505821228, + "learning_rate": 2.2347911067676523e-05, + "loss": 0.6812, + "step": 56600 + }, + { + "epoch": 0.5540626374163288, + "grad_norm": 1.244555115699768, + "learning_rate": 2.229904715367701e-05, + "loss": 0.655, + "step": 56700 + }, + { + "epoch": 0.5550398201983681, + "grad_norm": 0.7662067413330078, + "learning_rate": 2.22501832396775e-05, + "loss": 0.6867, + "step": 56800 + }, + { + "epoch": 0.5560170029804075, + "grad_norm": 0.9172037839889526, + "learning_rate": 2.2201319325677987e-05, + "loss": 0.6427, + "step": 56900 + }, + { + "epoch": 0.5569941857624469, + "grad_norm": 0.8700697422027588, + "learning_rate": 2.215245541167848e-05, + "loss": 0.6959, + "step": 57000 + }, + { + "epoch": 0.5579713685444863, + "grad_norm": 1.1184202432632446, + "learning_rate": 2.2103591497678967e-05, + "loss": 0.6601, + "step": 57100 + }, + { + "epoch": 0.5589485513265257, + "grad_norm": 1.1001787185668945, + "learning_rate": 2.2054727583679455e-05, + "loss": 0.6753, + "step": 57200 + }, + { + "epoch": 0.559925734108565, + "grad_norm": 0.29295894503593445, + "learning_rate": 2.2005863669679943e-05, + "loss": 0.625, + "step": 57300 + }, + { + "epoch": 0.5609029168906043, + "grad_norm": 0.5778409242630005, + "learning_rate": 2.195699975568043e-05, + "loss": 0.6554, + "step": 57400 + }, + { + "epoch": 0.5618800996726437, + "grad_norm": 0.8341584801673889, + "learning_rate": 2.190813584168092e-05, + "loss": 0.6324, + "step": 57500 + }, + { + "epoch": 0.5628572824546831, + "grad_norm": 1.329548716545105, + "learning_rate": 2.1859271927681407e-05, + "loss": 0.6657, + "step": 57600 + }, + { + "epoch": 0.5638344652367225, + "grad_norm": 0.6559785604476929, + "learning_rate": 2.1810408013681895e-05, + "loss": 0.6411, + "step": 57700 + }, + { + "epoch": 0.5648116480187619, + "grad_norm": 1.1021350622177124, + "learning_rate": 2.1761544099682387e-05, + "loss": 0.6363, + "step": 57800 + }, + { + "epoch": 0.5657888308008013, + "grad_norm": 1.0015547275543213, + "learning_rate": 2.1712680185682875e-05, + "loss": 0.632, + "step": 57900 + }, + { + "epoch": 0.5667660135828406, + "grad_norm": 0.7394452691078186, + "learning_rate": 2.1663816271683363e-05, + "loss": 0.6882, + "step": 58000 + }, + { + "epoch": 0.56774319636488, + "grad_norm": 1.0177232027053833, + "learning_rate": 2.161495235768385e-05, + "loss": 0.659, + "step": 58100 + }, + { + "epoch": 0.5687203791469194, + "grad_norm": 1.182385802268982, + "learning_rate": 2.156608844368434e-05, + "loss": 0.6304, + "step": 58200 + }, + { + "epoch": 0.5696975619289588, + "grad_norm": 0.6992839574813843, + 
"learning_rate": 2.151722452968483e-05, + "loss": 0.6419, + "step": 58300 + }, + { + "epoch": 0.5706747447109982, + "grad_norm": 1.127772331237793, + "learning_rate": 2.146836061568532e-05, + "loss": 0.6762, + "step": 58400 + }, + { + "epoch": 0.5716519274930376, + "grad_norm": 1.0480372905731201, + "learning_rate": 2.1419496701685806e-05, + "loss": 0.649, + "step": 58500 + }, + { + "epoch": 0.5726291102750769, + "grad_norm": 0.62301105260849, + "learning_rate": 2.1370632787686295e-05, + "loss": 0.6423, + "step": 58600 + }, + { + "epoch": 0.5736062930571163, + "grad_norm": 0.7996447086334229, + "learning_rate": 2.1321768873686786e-05, + "loss": 0.6675, + "step": 58700 + }, + { + "epoch": 0.5745834758391557, + "grad_norm": 0.8735845685005188, + "learning_rate": 2.1272904959687274e-05, + "loss": 0.6251, + "step": 58800 + }, + { + "epoch": 0.5755606586211951, + "grad_norm": 1.0168455839157104, + "learning_rate": 2.1224041045687762e-05, + "loss": 0.6623, + "step": 58900 + }, + { + "epoch": 0.5765378414032345, + "grad_norm": 0.7308356165885925, + "learning_rate": 2.1175177131688247e-05, + "loss": 0.6613, + "step": 59000 + }, + { + "epoch": 0.5775150241852739, + "grad_norm": 1.2486464977264404, + "learning_rate": 2.1126313217688738e-05, + "loss": 0.6424, + "step": 59100 + }, + { + "epoch": 0.5784922069673132, + "grad_norm": 0.8921827077865601, + "learning_rate": 2.1077449303689226e-05, + "loss": 0.6403, + "step": 59200 + }, + { + "epoch": 0.5794693897493526, + "grad_norm": 0.5246706604957581, + "learning_rate": 2.1028585389689714e-05, + "loss": 0.6494, + "step": 59300 + }, + { + "epoch": 0.580446572531392, + "grad_norm": 0.8651568293571472, + "learning_rate": 2.0979721475690202e-05, + "loss": 0.6352, + "step": 59400 + }, + { + "epoch": 0.5814237553134314, + "grad_norm": 0.9502151608467102, + "learning_rate": 2.093085756169069e-05, + "loss": 0.6661, + "step": 59500 + }, + { + "epoch": 0.5824009380954708, + "grad_norm": 0.6827490925788879, + "learning_rate": 2.0881993647691182e-05, + "loss": 0.625, + "step": 59600 + }, + { + "epoch": 0.5833781208775102, + "grad_norm": 0.8105266690254211, + "learning_rate": 2.083312973369167e-05, + "loss": 0.6699, + "step": 59700 + }, + { + "epoch": 0.5843553036595496, + "grad_norm": 1.005845308303833, + "learning_rate": 2.0784265819692158e-05, + "loss": 0.6528, + "step": 59800 + }, + { + "epoch": 0.5853324864415889, + "grad_norm": 0.8736119270324707, + "learning_rate": 2.0735401905692646e-05, + "loss": 0.6691, + "step": 59900 + }, + { + "epoch": 0.5863096692236283, + "grad_norm": 0.8782946467399597, + "learning_rate": 2.0686537991693138e-05, + "loss": 0.6677, + "step": 60000 + }, + { + "epoch": 0.5872868520056677, + "grad_norm": 0.7457369565963745, + "learning_rate": 2.0637674077693626e-05, + "loss": 0.6323, + "step": 60100 + }, + { + "epoch": 0.5882640347877071, + "grad_norm": 1.0230743885040283, + "learning_rate": 2.0588810163694114e-05, + "loss": 0.6521, + "step": 60200 + }, + { + "epoch": 0.5892412175697465, + "grad_norm": 0.8328123688697815, + "learning_rate": 2.0539946249694602e-05, + "loss": 0.6356, + "step": 60300 + }, + { + "epoch": 0.5902184003517859, + "grad_norm": 0.7374850511550903, + "learning_rate": 2.049108233569509e-05, + "loss": 0.6669, + "step": 60400 + }, + { + "epoch": 0.5911955831338251, + "grad_norm": 0.505228579044342, + "learning_rate": 2.0442218421695578e-05, + "loss": 0.6734, + "step": 60500 + }, + { + "epoch": 0.5921727659158645, + "grad_norm": 0.8307722210884094, + "learning_rate": 2.0393354507696066e-05, + "loss": 0.657, + "step": 
60600 + }, + { + "epoch": 0.5931499486979039, + "grad_norm": 0.8867704272270203, + "learning_rate": 2.0344490593696554e-05, + "loss": 0.6407, + "step": 60700 + }, + { + "epoch": 0.5941271314799433, + "grad_norm": 0.716373085975647, + "learning_rate": 2.0295626679697045e-05, + "loss": 0.6428, + "step": 60800 + }, + { + "epoch": 0.5951043142619827, + "grad_norm": 0.5812042355537415, + "learning_rate": 2.0246762765697534e-05, + "loss": 0.63, + "step": 60900 + }, + { + "epoch": 0.5960814970440221, + "grad_norm": 1.0057129859924316, + "learning_rate": 2.019789885169802e-05, + "loss": 0.6161, + "step": 61000 + }, + { + "epoch": 0.5970586798260614, + "grad_norm": 0.6143211126327515, + "learning_rate": 2.014903493769851e-05, + "loss": 0.6454, + "step": 61100 + }, + { + "epoch": 0.5980358626081008, + "grad_norm": 1.038710594177246, + "learning_rate": 2.0100171023698998e-05, + "loss": 0.6701, + "step": 61200 + }, + { + "epoch": 0.5990130453901402, + "grad_norm": 0.6891298294067383, + "learning_rate": 2.005130710969949e-05, + "loss": 0.6666, + "step": 61300 + }, + { + "epoch": 0.5999902281721796, + "grad_norm": 0.7872188091278076, + "learning_rate": 2.0002443195699977e-05, + "loss": 0.6357, + "step": 61400 + }, + { + "epoch": 0.600967410954219, + "grad_norm": 1.2167768478393555, + "learning_rate": 1.9953579281700465e-05, + "loss": 0.6686, + "step": 61500 + }, + { + "epoch": 0.6019445937362584, + "grad_norm": 1.0418341159820557, + "learning_rate": 1.9904715367700953e-05, + "loss": 0.6356, + "step": 61600 + }, + { + "epoch": 0.6029217765182977, + "grad_norm": 0.6209270358085632, + "learning_rate": 1.9855851453701445e-05, + "loss": 0.657, + "step": 61700 + }, + { + "epoch": 0.6038989593003371, + "grad_norm": 0.8585149645805359, + "learning_rate": 1.9806987539701933e-05, + "loss": 0.6157, + "step": 61800 + }, + { + "epoch": 0.6048761420823765, + "grad_norm": 0.5286767482757568, + "learning_rate": 1.975812362570242e-05, + "loss": 0.6734, + "step": 61900 + }, + { + "epoch": 0.6058533248644159, + "grad_norm": 0.6499518156051636, + "learning_rate": 1.9709259711702906e-05, + "loss": 0.6545, + "step": 62000 + }, + { + "epoch": 0.6068305076464553, + "grad_norm": 1.4340311288833618, + "learning_rate": 1.9660395797703397e-05, + "loss": 0.6402, + "step": 62100 + }, + { + "epoch": 0.6078076904284947, + "grad_norm": 0.4783228039741516, + "learning_rate": 1.9611531883703885e-05, + "loss": 0.6495, + "step": 62200 + }, + { + "epoch": 0.608784873210534, + "grad_norm": 0.6510328054428101, + "learning_rate": 1.9562667969704373e-05, + "loss": 0.6398, + "step": 62300 + }, + { + "epoch": 0.6097620559925734, + "grad_norm": 0.7298358082771301, + "learning_rate": 1.951380405570486e-05, + "loss": 0.6406, + "step": 62400 + }, + { + "epoch": 0.6107392387746128, + "grad_norm": 0.7467713952064514, + "learning_rate": 1.9464940141705353e-05, + "loss": 0.6618, + "step": 62500 + }, + { + "epoch": 0.6117164215566522, + "grad_norm": 1.1706078052520752, + "learning_rate": 1.941607622770584e-05, + "loss": 0.6603, + "step": 62600 + }, + { + "epoch": 0.6126936043386916, + "grad_norm": 1.9863495826721191, + "learning_rate": 1.936721231370633e-05, + "loss": 0.628, + "step": 62700 + }, + { + "epoch": 0.613670787120731, + "grad_norm": 1.1297212839126587, + "learning_rate": 1.9318348399706817e-05, + "loss": 0.6198, + "step": 62800 + }, + { + "epoch": 0.6146479699027703, + "grad_norm": 0.6895560026168823, + "learning_rate": 1.9269484485707305e-05, + "loss": 0.654, + "step": 62900 + }, + { + "epoch": 0.6156251526848097, + "grad_norm": 
0.5572859644889832, + "learning_rate": 1.9220620571707796e-05, + "loss": 0.6237, + "step": 63000 + }, + { + "epoch": 0.6166023354668491, + "grad_norm": 1.7625269889831543, + "learning_rate": 1.9171756657708284e-05, + "loss": 0.6615, + "step": 63100 + }, + { + "epoch": 0.6175795182488885, + "grad_norm": 0.9473828673362732, + "learning_rate": 1.9122892743708773e-05, + "loss": 0.624, + "step": 63200 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 1.6622077226638794, + "learning_rate": 1.907402882970926e-05, + "loss": 0.648, + "step": 63300 + }, + { + "epoch": 0.6195338838129673, + "grad_norm": 0.889667809009552, + "learning_rate": 1.9025164915709752e-05, + "loss": 0.6321, + "step": 63400 + }, + { + "epoch": 0.6205110665950065, + "grad_norm": 0.7613341212272644, + "learning_rate": 1.8976301001710237e-05, + "loss": 0.637, + "step": 63500 + }, + { + "epoch": 0.6214882493770459, + "grad_norm": 0.9912586212158203, + "learning_rate": 1.8927437087710725e-05, + "loss": 0.6422, + "step": 63600 + }, + { + "epoch": 0.6224654321590853, + "grad_norm": 0.7905563712120056, + "learning_rate": 1.8878573173711213e-05, + "loss": 0.6362, + "step": 63700 + }, + { + "epoch": 0.6234426149411247, + "grad_norm": 0.4368293881416321, + "learning_rate": 1.8829709259711704e-05, + "loss": 0.6472, + "step": 63800 + }, + { + "epoch": 0.6244197977231641, + "grad_norm": 0.8466482758522034, + "learning_rate": 1.8780845345712192e-05, + "loss": 0.673, + "step": 63900 + }, + { + "epoch": 0.6253969805052035, + "grad_norm": 1.4137593507766724, + "learning_rate": 1.873198143171268e-05, + "loss": 0.6382, + "step": 64000 + }, + { + "epoch": 0.6263741632872428, + "grad_norm": 1.7590171098709106, + "learning_rate": 1.868311751771317e-05, + "loss": 0.6421, + "step": 64100 + }, + { + "epoch": 0.6273513460692822, + "grad_norm": 0.7667103409767151, + "learning_rate": 1.863425360371366e-05, + "loss": 0.6448, + "step": 64200 + }, + { + "epoch": 0.6283285288513216, + "grad_norm": 1.0524508953094482, + "learning_rate": 1.8585389689714148e-05, + "loss": 0.6491, + "step": 64300 + }, + { + "epoch": 0.629305711633361, + "grad_norm": 0.6090672612190247, + "learning_rate": 1.8536525775714636e-05, + "loss": 0.6416, + "step": 64400 + }, + { + "epoch": 0.6302828944154004, + "grad_norm": 0.5970349311828613, + "learning_rate": 1.8487661861715124e-05, + "loss": 0.6393, + "step": 64500 + }, + { + "epoch": 0.6312600771974398, + "grad_norm": 0.9564999341964722, + "learning_rate": 1.8438797947715612e-05, + "loss": 0.6656, + "step": 64600 + }, + { + "epoch": 0.6322372599794792, + "grad_norm": 1.319643259048462, + "learning_rate": 1.8389934033716104e-05, + "loss": 0.6372, + "step": 64700 + }, + { + "epoch": 0.6332144427615185, + "grad_norm": 1.0311692953109741, + "learning_rate": 1.8341070119716592e-05, + "loss": 0.6377, + "step": 64800 + }, + { + "epoch": 0.6341916255435579, + "grad_norm": 0.5185050964355469, + "learning_rate": 1.829220620571708e-05, + "loss": 0.6489, + "step": 64900 + }, + { + "epoch": 0.6351688083255973, + "grad_norm": 1.0611315965652466, + "learning_rate": 1.8243342291717564e-05, + "loss": 0.6262, + "step": 65000 + }, + { + "epoch": 0.6361459911076367, + "grad_norm": 0.5177842974662781, + "learning_rate": 1.8194478377718056e-05, + "loss": 0.6424, + "step": 65100 + }, + { + "epoch": 0.6371231738896761, + "grad_norm": 0.6148577928543091, + "learning_rate": 1.8145614463718544e-05, + "loss": 0.6402, + "step": 65200 + }, + { + "epoch": 0.6381003566717155, + "grad_norm": 0.686576247215271, + "learning_rate": 1.8096750549719032e-05, + 
"loss": 0.6361, + "step": 65300 + }, + { + "epoch": 0.6390775394537548, + "grad_norm": 1.5292381048202515, + "learning_rate": 1.804788663571952e-05, + "loss": 0.6263, + "step": 65400 + }, + { + "epoch": 0.6400547222357942, + "grad_norm": 0.7201911807060242, + "learning_rate": 1.799902272172001e-05, + "loss": 0.6402, + "step": 65500 + }, + { + "epoch": 0.6410319050178336, + "grad_norm": 0.7407404184341431, + "learning_rate": 1.79501588077205e-05, + "loss": 0.6149, + "step": 65600 + }, + { + "epoch": 0.642009087799873, + "grad_norm": 0.7911986708641052, + "learning_rate": 1.7901294893720988e-05, + "loss": 0.6273, + "step": 65700 + }, + { + "epoch": 0.6429862705819124, + "grad_norm": 0.467869371175766, + "learning_rate": 1.7852430979721476e-05, + "loss": 0.6344, + "step": 65800 + }, + { + "epoch": 0.6439634533639518, + "grad_norm": 1.0182818174362183, + "learning_rate": 1.7803567065721967e-05, + "loss": 0.612, + "step": 65900 + }, + { + "epoch": 0.6449406361459911, + "grad_norm": 0.5325811505317688, + "learning_rate": 1.7754703151722455e-05, + "loss": 0.6427, + "step": 66000 + }, + { + "epoch": 0.6459178189280305, + "grad_norm": 1.1324542760849, + "learning_rate": 1.7705839237722943e-05, + "loss": 0.6161, + "step": 66100 + }, + { + "epoch": 0.6468950017100699, + "grad_norm": 0.7836804389953613, + "learning_rate": 1.765697532372343e-05, + "loss": 0.632, + "step": 66200 + }, + { + "epoch": 0.6478721844921093, + "grad_norm": 0.6157903075218201, + "learning_rate": 1.760811140972392e-05, + "loss": 0.6497, + "step": 66300 + }, + { + "epoch": 0.6488493672741487, + "grad_norm": 0.776150643825531, + "learning_rate": 1.755924749572441e-05, + "loss": 0.5929, + "step": 66400 + }, + { + "epoch": 0.6498265500561881, + "grad_norm": 0.6307646036148071, + "learning_rate": 1.7510383581724896e-05, + "loss": 0.66, + "step": 66500 + }, + { + "epoch": 0.6508037328382273, + "grad_norm": 0.5305992364883423, + "learning_rate": 1.7461519667725384e-05, + "loss": 0.5985, + "step": 66600 + }, + { + "epoch": 0.6517809156202667, + "grad_norm": 0.6581500172615051, + "learning_rate": 1.7412655753725872e-05, + "loss": 0.6393, + "step": 66700 + }, + { + "epoch": 0.6527580984023061, + "grad_norm": 1.0988273620605469, + "learning_rate": 1.7363791839726363e-05, + "loss": 0.6453, + "step": 66800 + }, + { + "epoch": 0.6537352811843455, + "grad_norm": 0.6662785410881042, + "learning_rate": 1.731492792572685e-05, + "loss": 0.6831, + "step": 66900 + }, + { + "epoch": 0.6547124639663849, + "grad_norm": 0.5156288743019104, + "learning_rate": 1.726606401172734e-05, + "loss": 0.647, + "step": 67000 + }, + { + "epoch": 0.6556896467484243, + "grad_norm": 0.8832482695579529, + "learning_rate": 1.7217200097727827e-05, + "loss": 0.6263, + "step": 67100 + }, + { + "epoch": 0.6566668295304636, + "grad_norm": 0.8194277882575989, + "learning_rate": 1.716833618372832e-05, + "loss": 0.6293, + "step": 67200 + }, + { + "epoch": 0.657644012312503, + "grad_norm": 0.5544142127037048, + "learning_rate": 1.7119472269728807e-05, + "loss": 0.6207, + "step": 67300 + }, + { + "epoch": 0.6586211950945424, + "grad_norm": 1.0161030292510986, + "learning_rate": 1.7070608355729295e-05, + "loss": 0.6166, + "step": 67400 + }, + { + "epoch": 0.6595983778765818, + "grad_norm": 1.1273646354675293, + "learning_rate": 1.7021744441729783e-05, + "loss": 0.6326, + "step": 67500 + }, + { + "epoch": 0.6605755606586212, + "grad_norm": 0.5743687748908997, + "learning_rate": 1.697288052773027e-05, + "loss": 0.5943, + "step": 67600 + }, + { + "epoch": 0.6615527434406606, + 
"grad_norm": 0.5743625164031982, + "learning_rate": 1.6924016613730763e-05, + "loss": 0.6337, + "step": 67700 + }, + { + "epoch": 0.6625299262226999, + "grad_norm": 0.47358232736587524, + "learning_rate": 1.687515269973125e-05, + "loss": 0.6272, + "step": 67800 + }, + { + "epoch": 0.6635071090047393, + "grad_norm": 0.7825568318367004, + "learning_rate": 1.682628878573174e-05, + "loss": 0.6407, + "step": 67900 + }, + { + "epoch": 0.6644842917867787, + "grad_norm": 1.0739299058914185, + "learning_rate": 1.6777424871732227e-05, + "loss": 0.6213, + "step": 68000 + }, + { + "epoch": 0.6654614745688181, + "grad_norm": 0.6242460608482361, + "learning_rate": 1.6728560957732715e-05, + "loss": 0.6247, + "step": 68100 + }, + { + "epoch": 0.6664386573508575, + "grad_norm": 0.674392580986023, + "learning_rate": 1.6679697043733203e-05, + "loss": 0.6405, + "step": 68200 + }, + { + "epoch": 0.6674158401328969, + "grad_norm": 0.4114531874656677, + "learning_rate": 1.663083312973369e-05, + "loss": 0.6235, + "step": 68300 + }, + { + "epoch": 0.6683930229149362, + "grad_norm": 0.5812088847160339, + "learning_rate": 1.658196921573418e-05, + "loss": 0.6175, + "step": 68400 + }, + { + "epoch": 0.6693702056969756, + "grad_norm": 0.48696669936180115, + "learning_rate": 1.653310530173467e-05, + "loss": 0.6264, + "step": 68500 + }, + { + "epoch": 0.670347388479015, + "grad_norm": 0.5733768939971924, + "learning_rate": 1.648424138773516e-05, + "loss": 0.6371, + "step": 68600 + }, + { + "epoch": 0.6713245712610544, + "grad_norm": 0.9609115123748779, + "learning_rate": 1.6435377473735647e-05, + "loss": 0.618, + "step": 68700 + }, + { + "epoch": 0.6723017540430938, + "grad_norm": 1.226388692855835, + "learning_rate": 1.6386513559736135e-05, + "loss": 0.6499, + "step": 68800 + }, + { + "epoch": 0.6732789368251332, + "grad_norm": 0.6776556372642517, + "learning_rate": 1.6337649645736626e-05, + "loss": 0.6356, + "step": 68900 + }, + { + "epoch": 0.6742561196071726, + "grad_norm": 0.6129021644592285, + "learning_rate": 1.6288785731737114e-05, + "loss": 0.6133, + "step": 69000 + }, + { + "epoch": 0.6752333023892119, + "grad_norm": 1.4161570072174072, + "learning_rate": 1.6239921817737602e-05, + "loss": 0.6419, + "step": 69100 + }, + { + "epoch": 0.6762104851712513, + "grad_norm": 0.5857706665992737, + "learning_rate": 1.619105790373809e-05, + "loss": 0.6227, + "step": 69200 + }, + { + "epoch": 0.6771876679532907, + "grad_norm": 0.933807909488678, + "learning_rate": 1.614219398973858e-05, + "loss": 0.6392, + "step": 69300 + }, + { + "epoch": 0.6781648507353301, + "grad_norm": 0.9411168098449707, + "learning_rate": 1.609333007573907e-05, + "loss": 0.649, + "step": 69400 + }, + { + "epoch": 0.6791420335173695, + "grad_norm": 0.5923060178756714, + "learning_rate": 1.6044466161739554e-05, + "loss": 0.6286, + "step": 69500 + }, + { + "epoch": 0.6801192162994089, + "grad_norm": 0.744339108467102, + "learning_rate": 1.5995602247740043e-05, + "loss": 0.6178, + "step": 69600 + }, + { + "epoch": 0.6810963990814481, + "grad_norm": 1.0202040672302246, + "learning_rate": 1.5946738333740534e-05, + "loss": 0.6254, + "step": 69700 + }, + { + "epoch": 0.6820735818634875, + "grad_norm": 0.8653994798660278, + "learning_rate": 1.5897874419741022e-05, + "loss": 0.6214, + "step": 69800 + }, + { + "epoch": 0.6830507646455269, + "grad_norm": 0.4566790461540222, + "learning_rate": 1.584901050574151e-05, + "loss": 0.6517, + "step": 69900 + }, + { + "epoch": 0.6840279474275663, + "grad_norm": 0.9629371166229248, + "learning_rate": 
1.5800146591741998e-05, + "loss": 0.6359, + "step": 70000 + }, + { + "epoch": 0.6850051302096057, + "grad_norm": 0.7253994941711426, + "learning_rate": 1.5751282677742486e-05, + "loss": 0.6405, + "step": 70100 + }, + { + "epoch": 0.6859823129916451, + "grad_norm": 0.8287329077720642, + "learning_rate": 1.5702418763742978e-05, + "loss": 0.6085, + "step": 70200 + }, + { + "epoch": 0.6869594957736844, + "grad_norm": 0.5002869367599487, + "learning_rate": 1.5653554849743466e-05, + "loss": 0.6255, + "step": 70300 + }, + { + "epoch": 0.6879366785557238, + "grad_norm": 0.4376012682914734, + "learning_rate": 1.5604690935743954e-05, + "loss": 0.5933, + "step": 70400 + }, + { + "epoch": 0.6889138613377632, + "grad_norm": 0.756737232208252, + "learning_rate": 1.5555827021744442e-05, + "loss": 0.609, + "step": 70500 + }, + { + "epoch": 0.6898910441198026, + "grad_norm": 1.1462029218673706, + "learning_rate": 1.5506963107744933e-05, + "loss": 0.6349, + "step": 70600 + }, + { + "epoch": 0.690868226901842, + "grad_norm": 0.5806009769439697, + "learning_rate": 1.545809919374542e-05, + "loss": 0.6242, + "step": 70700 + }, + { + "epoch": 0.6918454096838814, + "grad_norm": 0.41798803210258484, + "learning_rate": 1.540923527974591e-05, + "loss": 0.6688, + "step": 70800 + }, + { + "epoch": 0.6928225924659207, + "grad_norm": 0.5598849058151245, + "learning_rate": 1.5360371365746398e-05, + "loss": 0.6371, + "step": 70900 + }, + { + "epoch": 0.6937997752479601, + "grad_norm": 1.0417990684509277, + "learning_rate": 1.5311507451746886e-05, + "loss": 0.5966, + "step": 71000 + }, + { + "epoch": 0.6947769580299995, + "grad_norm": 0.5547340512275696, + "learning_rate": 1.5262643537747374e-05, + "loss": 0.6221, + "step": 71100 + }, + { + "epoch": 0.6957541408120389, + "grad_norm": 0.4499816298484802, + "learning_rate": 1.5213779623747862e-05, + "loss": 0.6194, + "step": 71200 + }, + { + "epoch": 0.6967313235940783, + "grad_norm": 2.521627902984619, + "learning_rate": 1.5164915709748351e-05, + "loss": 0.6279, + "step": 71300 + }, + { + "epoch": 0.6977085063761177, + "grad_norm": 1.0940284729003906, + "learning_rate": 1.511605179574884e-05, + "loss": 0.6376, + "step": 71400 + }, + { + "epoch": 0.698685689158157, + "grad_norm": 0.515785276889801, + "learning_rate": 1.5067187881749328e-05, + "loss": 0.6046, + "step": 71500 + }, + { + "epoch": 0.6996628719401964, + "grad_norm": 0.5034206509590149, + "learning_rate": 1.5018323967749817e-05, + "loss": 0.6036, + "step": 71600 + }, + { + "epoch": 0.7006400547222358, + "grad_norm": 0.6637565493583679, + "learning_rate": 1.4969460053750305e-05, + "loss": 0.6288, + "step": 71700 + }, + { + "epoch": 0.7016172375042752, + "grad_norm": 0.7677326202392578, + "learning_rate": 1.4920596139750795e-05, + "loss": 0.655, + "step": 71800 + }, + { + "epoch": 0.7025944202863146, + "grad_norm": 0.6796774864196777, + "learning_rate": 1.4871732225751283e-05, + "loss": 0.5955, + "step": 71900 + }, + { + "epoch": 0.703571603068354, + "grad_norm": 0.9217430353164673, + "learning_rate": 1.4822868311751773e-05, + "loss": 0.6268, + "step": 72000 + }, + { + "epoch": 0.7045487858503933, + "grad_norm": 0.846118688583374, + "learning_rate": 1.4774004397752261e-05, + "loss": 0.6345, + "step": 72100 + }, + { + "epoch": 0.7055259686324327, + "grad_norm": 0.7406280040740967, + "learning_rate": 1.472514048375275e-05, + "loss": 0.631, + "step": 72200 + }, + { + "epoch": 0.7065031514144721, + "grad_norm": 0.8265899419784546, + "learning_rate": 1.4676276569753239e-05, + "loss": 0.6135, + "step": 72300 + }, + { + 
"epoch": 0.7074803341965115, + "grad_norm": 0.7813581228256226, + "learning_rate": 1.4627412655753727e-05, + "loss": 0.6448, + "step": 72400 + }, + { + "epoch": 0.7084575169785509, + "grad_norm": 0.4718623757362366, + "learning_rate": 1.4578548741754217e-05, + "loss": 0.5952, + "step": 72500 + }, + { + "epoch": 0.7094346997605903, + "grad_norm": 2.193324565887451, + "learning_rate": 1.4529684827754703e-05, + "loss": 0.6199, + "step": 72600 + }, + { + "epoch": 0.7104118825426295, + "grad_norm": 1.0357561111450195, + "learning_rate": 1.4480820913755191e-05, + "loss": 0.6342, + "step": 72700 + }, + { + "epoch": 0.711389065324669, + "grad_norm": 1.0319572687149048, + "learning_rate": 1.4431956999755681e-05, + "loss": 0.5836, + "step": 72800 + }, + { + "epoch": 0.7123662481067083, + "grad_norm": 1.0852116346359253, + "learning_rate": 1.4383093085756169e-05, + "loss": 0.6246, + "step": 72900 + }, + { + "epoch": 0.7133434308887477, + "grad_norm": 0.5591370463371277, + "learning_rate": 1.4334229171756659e-05, + "loss": 0.6022, + "step": 73000 + }, + { + "epoch": 0.7143206136707871, + "grad_norm": 1.129408836364746, + "learning_rate": 1.4285365257757147e-05, + "loss": 0.6414, + "step": 73100 + }, + { + "epoch": 0.7152977964528265, + "grad_norm": 0.9241653680801392, + "learning_rate": 1.4236501343757635e-05, + "loss": 0.5954, + "step": 73200 + }, + { + "epoch": 0.7162749792348658, + "grad_norm": 0.5140904188156128, + "learning_rate": 1.4187637429758125e-05, + "loss": 0.6499, + "step": 73300 + }, + { + "epoch": 0.7172521620169052, + "grad_norm": 0.8134740591049194, + "learning_rate": 1.4138773515758613e-05, + "loss": 0.6199, + "step": 73400 + }, + { + "epoch": 0.7182293447989446, + "grad_norm": 0.8259909749031067, + "learning_rate": 1.4089909601759102e-05, + "loss": 0.6181, + "step": 73500 + }, + { + "epoch": 0.719206527580984, + "grad_norm": 0.7081485390663147, + "learning_rate": 1.404104568775959e-05, + "loss": 0.6056, + "step": 73600 + }, + { + "epoch": 0.7201837103630234, + "grad_norm": 0.7906745076179504, + "learning_rate": 1.399218177376008e-05, + "loss": 0.6341, + "step": 73700 + }, + { + "epoch": 0.7211608931450628, + "grad_norm": 0.5661380290985107, + "learning_rate": 1.3943317859760568e-05, + "loss": 0.621, + "step": 73800 + }, + { + "epoch": 0.7221380759271022, + "grad_norm": 1.0971596240997314, + "learning_rate": 1.3894453945761058e-05, + "loss": 0.6261, + "step": 73900 + }, + { + "epoch": 0.7231152587091415, + "grad_norm": 1.6842643022537231, + "learning_rate": 1.3845590031761546e-05, + "loss": 0.6065, + "step": 74000 + }, + { + "epoch": 0.7240924414911809, + "grad_norm": 1.0033600330352783, + "learning_rate": 1.3796726117762033e-05, + "loss": 0.6364, + "step": 74100 + }, + { + "epoch": 0.7250696242732203, + "grad_norm": 0.8704243898391724, + "learning_rate": 1.374786220376252e-05, + "loss": 0.6259, + "step": 74200 + }, + { + "epoch": 0.7260468070552597, + "grad_norm": 0.855398416519165, + "learning_rate": 1.369899828976301e-05, + "loss": 0.653, + "step": 74300 + }, + { + "epoch": 0.7270239898372991, + "grad_norm": 1.733904480934143, + "learning_rate": 1.3650134375763498e-05, + "loss": 0.6284, + "step": 74400 + }, + { + "epoch": 0.7280011726193385, + "grad_norm": 0.49585819244384766, + "learning_rate": 1.3601270461763988e-05, + "loss": 0.6165, + "step": 74500 + }, + { + "epoch": 0.7289783554013778, + "grad_norm": 0.5818326473236084, + "learning_rate": 1.3552406547764476e-05, + "loss": 0.6403, + "step": 74600 + }, + { + "epoch": 0.7299555381834172, + "grad_norm": 0.8778244853019714, + 
"learning_rate": 1.3503542633764964e-05, + "loss": 0.5963, + "step": 74700 + }, + { + "epoch": 0.7309327209654566, + "grad_norm": 0.6378918290138245, + "learning_rate": 1.3454678719765454e-05, + "loss": 0.6242, + "step": 74800 + }, + { + "epoch": 0.731909903747496, + "grad_norm": 0.792775571346283, + "learning_rate": 1.3405814805765942e-05, + "loss": 0.6348, + "step": 74900 + }, + { + "epoch": 0.7328870865295354, + "grad_norm": 0.8906835317611694, + "learning_rate": 1.3356950891766432e-05, + "loss": 0.6074, + "step": 75000 + }, + { + "epoch": 0.7338642693115748, + "grad_norm": 0.7266893982887268, + "learning_rate": 1.330808697776692e-05, + "loss": 0.6253, + "step": 75100 + }, + { + "epoch": 0.7348414520936141, + "grad_norm": 0.6896129250526428, + "learning_rate": 1.325922306376741e-05, + "loss": 0.6273, + "step": 75200 + }, + { + "epoch": 0.7358186348756535, + "grad_norm": 1.0812867879867554, + "learning_rate": 1.3210359149767898e-05, + "loss": 0.6474, + "step": 75300 + }, + { + "epoch": 0.7367958176576929, + "grad_norm": 0.6664975881576538, + "learning_rate": 1.3161495235768388e-05, + "loss": 0.6114, + "step": 75400 + }, + { + "epoch": 0.7377730004397323, + "grad_norm": 0.6565041542053223, + "learning_rate": 1.3112631321768876e-05, + "loss": 0.6059, + "step": 75500 + }, + { + "epoch": 0.7387501832217717, + "grad_norm": 0.5191747546195984, + "learning_rate": 1.3063767407769362e-05, + "loss": 0.6, + "step": 75600 + }, + { + "epoch": 0.7397273660038111, + "grad_norm": 0.9525347948074341, + "learning_rate": 1.301490349376985e-05, + "loss": 0.6032, + "step": 75700 + }, + { + "epoch": 0.7407045487858503, + "grad_norm": 1.1167237758636475, + "learning_rate": 1.296603957977034e-05, + "loss": 0.6095, + "step": 75800 + }, + { + "epoch": 0.7416817315678897, + "grad_norm": 0.8300033807754517, + "learning_rate": 1.2917175665770828e-05, + "loss": 0.6246, + "step": 75900 + }, + { + "epoch": 0.7426589143499291, + "grad_norm": 0.7098196148872375, + "learning_rate": 1.2868311751771318e-05, + "loss": 0.6188, + "step": 76000 + }, + { + "epoch": 0.7436360971319685, + "grad_norm": 0.42002958059310913, + "learning_rate": 1.2819447837771806e-05, + "loss": 0.5943, + "step": 76100 + }, + { + "epoch": 0.7446132799140079, + "grad_norm": 0.7477664947509766, + "learning_rate": 1.2770583923772295e-05, + "loss": 0.6368, + "step": 76200 + }, + { + "epoch": 0.7455904626960473, + "grad_norm": 1.2381956577301025, + "learning_rate": 1.2721720009772783e-05, + "loss": 0.6528, + "step": 76300 + }, + { + "epoch": 0.7465676454780866, + "grad_norm": 0.46650367975234985, + "learning_rate": 1.2672856095773272e-05, + "loss": 0.6062, + "step": 76400 + }, + { + "epoch": 0.747544828260126, + "grad_norm": 0.9223760366439819, + "learning_rate": 1.2623992181773761e-05, + "loss": 0.6386, + "step": 76500 + }, + { + "epoch": 0.7485220110421654, + "grad_norm": 0.6782642602920532, + "learning_rate": 1.257512826777425e-05, + "loss": 0.5926, + "step": 76600 + }, + { + "epoch": 0.7494991938242048, + "grad_norm": 0.8533148765563965, + "learning_rate": 1.2526264353774739e-05, + "loss": 0.6076, + "step": 76700 + }, + { + "epoch": 0.7504763766062442, + "grad_norm": 0.6998764276504517, + "learning_rate": 1.2477400439775225e-05, + "loss": 0.6136, + "step": 76800 + }, + { + "epoch": 0.7514535593882836, + "grad_norm": 0.4632514715194702, + "learning_rate": 1.2428536525775715e-05, + "loss": 0.6174, + "step": 76900 + }, + { + "epoch": 0.7524307421703229, + "grad_norm": 0.6624991297721863, + "learning_rate": 1.2379672611776203e-05, + "loss": 0.6053, + 
"step": 77000 + }, + { + "epoch": 0.7534079249523623, + "grad_norm": 0.8521330952644348, + "learning_rate": 1.2330808697776693e-05, + "loss": 0.6261, + "step": 77100 + }, + { + "epoch": 0.7543851077344017, + "grad_norm": 0.6917625665664673, + "learning_rate": 1.2281944783777181e-05, + "loss": 0.6049, + "step": 77200 + }, + { + "epoch": 0.7553622905164411, + "grad_norm": 0.4985372722148895, + "learning_rate": 1.2233080869777671e-05, + "loss": 0.6057, + "step": 77300 + }, + { + "epoch": 0.7563394732984805, + "grad_norm": 0.6484245657920837, + "learning_rate": 1.2184216955778159e-05, + "loss": 0.602, + "step": 77400 + }, + { + "epoch": 0.7573166560805199, + "grad_norm": 0.7993507981300354, + "learning_rate": 1.2135353041778647e-05, + "loss": 0.5809, + "step": 77500 + }, + { + "epoch": 0.7582938388625592, + "grad_norm": 0.6944275498390198, + "learning_rate": 1.2086489127779135e-05, + "loss": 0.5959, + "step": 77600 + }, + { + "epoch": 0.7592710216445986, + "grad_norm": 0.6688080430030823, + "learning_rate": 1.2037625213779625e-05, + "loss": 0.6038, + "step": 77700 + }, + { + "epoch": 0.760248204426638, + "grad_norm": 0.8234009742736816, + "learning_rate": 1.1988761299780113e-05, + "loss": 0.6287, + "step": 77800 + }, + { + "epoch": 0.7612253872086774, + "grad_norm": 1.0987696647644043, + "learning_rate": 1.1939897385780601e-05, + "loss": 0.631, + "step": 77900 + }, + { + "epoch": 0.7622025699907168, + "grad_norm": 0.7760794758796692, + "learning_rate": 1.189103347178109e-05, + "loss": 0.6356, + "step": 78000 + }, + { + "epoch": 0.7631797527727562, + "grad_norm": 1.422297716140747, + "learning_rate": 1.1842169557781579e-05, + "loss": 0.5983, + "step": 78100 + }, + { + "epoch": 0.7641569355547956, + "grad_norm": 0.7743082046508789, + "learning_rate": 1.1793305643782067e-05, + "loss": 0.6132, + "step": 78200 + }, + { + "epoch": 0.7651341183368349, + "grad_norm": 1.0263071060180664, + "learning_rate": 1.1744441729782555e-05, + "loss": 0.6364, + "step": 78300 + }, + { + "epoch": 0.7661113011188743, + "grad_norm": 0.49797773361206055, + "learning_rate": 1.1695577815783045e-05, + "loss": 0.6384, + "step": 78400 + }, + { + "epoch": 0.7670884839009137, + "grad_norm": 0.58949214220047, + "learning_rate": 1.1646713901783533e-05, + "loss": 0.6176, + "step": 78500 + }, + { + "epoch": 0.7680656666829531, + "grad_norm": 0.8523328304290771, + "learning_rate": 1.1597849987784022e-05, + "loss": 0.6238, + "step": 78600 + }, + { + "epoch": 0.7690428494649925, + "grad_norm": 2.231853723526001, + "learning_rate": 1.154898607378451e-05, + "loss": 0.6553, + "step": 78700 + }, + { + "epoch": 0.7700200322470319, + "grad_norm": 0.7179421782493591, + "learning_rate": 1.1500122159785e-05, + "loss": 0.6222, + "step": 78800 + }, + { + "epoch": 0.7709972150290711, + "grad_norm": 0.7334624528884888, + "learning_rate": 1.1451258245785488e-05, + "loss": 0.6513, + "step": 78900 + }, + { + "epoch": 0.7719743978111105, + "grad_norm": 0.8650888204574585, + "learning_rate": 1.1402394331785976e-05, + "loss": 0.6382, + "step": 79000 + }, + { + "epoch": 0.77295158059315, + "grad_norm": 1.277421474456787, + "learning_rate": 1.1353530417786465e-05, + "loss": 0.6032, + "step": 79100 + }, + { + "epoch": 0.7739287633751893, + "grad_norm": 0.4764556288719177, + "learning_rate": 1.1304666503786954e-05, + "loss": 0.5852, + "step": 79200 + }, + { + "epoch": 0.7749059461572287, + "grad_norm": 0.7180933952331543, + "learning_rate": 1.1255802589787442e-05, + "loss": 0.6271, + "step": 79300 + }, + { + "epoch": 0.7758831289392681, + "grad_norm": 
0.6978940367698669, + "learning_rate": 1.1206938675787932e-05, + "loss": 0.6252, + "step": 79400 + }, + { + "epoch": 0.7768603117213074, + "grad_norm": 0.9205247759819031, + "learning_rate": 1.115807476178842e-05, + "loss": 0.6227, + "step": 79500 + }, + { + "epoch": 0.7778374945033468, + "grad_norm": 0.6126120686531067, + "learning_rate": 1.1109210847788908e-05, + "loss": 0.6164, + "step": 79600 + }, + { + "epoch": 0.7788146772853862, + "grad_norm": 0.660234808921814, + "learning_rate": 1.1060346933789396e-05, + "loss": 0.6336, + "step": 79700 + }, + { + "epoch": 0.7797918600674256, + "grad_norm": 0.5239884257316589, + "learning_rate": 1.1011483019789886e-05, + "loss": 0.6324, + "step": 79800 + }, + { + "epoch": 0.780769042849465, + "grad_norm": 0.6763221621513367, + "learning_rate": 1.0962619105790374e-05, + "loss": 0.6063, + "step": 79900 + }, + { + "epoch": 0.7817462256315044, + "grad_norm": 0.6201728582382202, + "learning_rate": 1.0913755191790862e-05, + "loss": 0.6168, + "step": 80000 + }, + { + "epoch": 0.7827234084135437, + "grad_norm": 0.8859091997146606, + "learning_rate": 1.0864891277791352e-05, + "loss": 0.593, + "step": 80100 + }, + { + "epoch": 0.7837005911955831, + "grad_norm": 0.7334877848625183, + "learning_rate": 1.081602736379184e-05, + "loss": 0.6225, + "step": 80200 + }, + { + "epoch": 0.7846777739776225, + "grad_norm": 0.49573615193367004, + "learning_rate": 1.076716344979233e-05, + "loss": 0.6007, + "step": 80300 + }, + { + "epoch": 0.7856549567596619, + "grad_norm": 1.1509833335876465, + "learning_rate": 1.0718299535792818e-05, + "loss": 0.587, + "step": 80400 + }, + { + "epoch": 0.7866321395417013, + "grad_norm": 0.6591099500656128, + "learning_rate": 1.0669435621793306e-05, + "loss": 0.6462, + "step": 80500 + }, + { + "epoch": 0.7876093223237407, + "grad_norm": 0.7265052199363708, + "learning_rate": 1.0620571707793794e-05, + "loss": 0.6183, + "step": 80600 + }, + { + "epoch": 0.78858650510578, + "grad_norm": 1.2156593799591064, + "learning_rate": 1.0571707793794284e-05, + "loss": 0.5811, + "step": 80700 + }, + { + "epoch": 0.7895636878878194, + "grad_norm": 0.960753858089447, + "learning_rate": 1.0522843879794772e-05, + "loss": 0.6054, + "step": 80800 + }, + { + "epoch": 0.7905408706698588, + "grad_norm": 1.5062034130096436, + "learning_rate": 1.0473979965795262e-05, + "loss": 0.599, + "step": 80900 + }, + { + "epoch": 0.7915180534518982, + "grad_norm": 0.7047529816627502, + "learning_rate": 1.042511605179575e-05, + "loss": 0.6149, + "step": 81000 + }, + { + "epoch": 0.7924952362339376, + "grad_norm": 0.4432947337627411, + "learning_rate": 1.037625213779624e-05, + "loss": 0.6182, + "step": 81100 + }, + { + "epoch": 0.793472419015977, + "grad_norm": 0.6442515850067139, + "learning_rate": 1.0327388223796726e-05, + "loss": 0.5864, + "step": 81200 + }, + { + "epoch": 0.7944496017980163, + "grad_norm": 1.2354743480682373, + "learning_rate": 1.0278524309797215e-05, + "loss": 0.6068, + "step": 81300 + }, + { + "epoch": 0.7954267845800557, + "grad_norm": 0.7862667441368103, + "learning_rate": 1.0229660395797704e-05, + "loss": 0.6072, + "step": 81400 + }, + { + "epoch": 0.7964039673620951, + "grad_norm": 0.5142656564712524, + "learning_rate": 1.0180796481798192e-05, + "loss": 0.6009, + "step": 81500 + }, + { + "epoch": 0.7973811501441345, + "grad_norm": 0.8478522300720215, + "learning_rate": 1.0131932567798681e-05, + "loss": 0.5979, + "step": 81600 + }, + { + "epoch": 0.7983583329261739, + "grad_norm": 0.5929884910583496, + "learning_rate": 1.008306865379917e-05, + "loss": 
0.6076, + "step": 81700 + }, + { + "epoch": 0.7993355157082133, + "grad_norm": 0.8067489862442017, + "learning_rate": 1.003420473979966e-05, + "loss": 0.6123, + "step": 81800 + }, + { + "epoch": 0.8003126984902525, + "grad_norm": 1.3287664651870728, + "learning_rate": 9.985340825800147e-06, + "loss": 0.6151, + "step": 81900 + }, + { + "epoch": 0.801289881272292, + "grad_norm": 0.7158493995666504, + "learning_rate": 9.936476911800635e-06, + "loss": 0.5906, + "step": 82000 + }, + { + "epoch": 0.8022670640543313, + "grad_norm": 0.7307409644126892, + "learning_rate": 9.887612997801123e-06, + "loss": 0.6165, + "step": 82100 + }, + { + "epoch": 0.8032442468363707, + "grad_norm": 0.6903741359710693, + "learning_rate": 9.838749083801613e-06, + "loss": 0.6175, + "step": 82200 + }, + { + "epoch": 0.8042214296184101, + "grad_norm": 0.7754660248756409, + "learning_rate": 9.789885169802101e-06, + "loss": 0.6349, + "step": 82300 + }, + { + "epoch": 0.8051986124004495, + "grad_norm": 0.7808040976524353, + "learning_rate": 9.741021255802591e-06, + "loss": 0.5909, + "step": 82400 + }, + { + "epoch": 0.8061757951824888, + "grad_norm": 0.8575007915496826, + "learning_rate": 9.692157341803079e-06, + "loss": 0.5861, + "step": 82500 + }, + { + "epoch": 0.8071529779645282, + "grad_norm": 1.18577241897583, + "learning_rate": 9.643293427803569e-06, + "loss": 0.6137, + "step": 82600 + }, + { + "epoch": 0.8081301607465676, + "grad_norm": 0.7913909554481506, + "learning_rate": 9.594429513804057e-06, + "loss": 0.6077, + "step": 82700 + }, + { + "epoch": 0.809107343528607, + "grad_norm": 0.8221011161804199, + "learning_rate": 9.545565599804545e-06, + "loss": 0.5946, + "step": 82800 + }, + { + "epoch": 0.8100845263106464, + "grad_norm": 0.7047521471977234, + "learning_rate": 9.496701685805033e-06, + "loss": 0.5973, + "step": 82900 + }, + { + "epoch": 0.8110617090926858, + "grad_norm": 0.5717597007751465, + "learning_rate": 9.447837771805523e-06, + "loss": 0.6236, + "step": 83000 + }, + { + "epoch": 0.8120388918747252, + "grad_norm": 0.93315190076828, + "learning_rate": 9.39897385780601e-06, + "loss": 0.6335, + "step": 83100 + }, + { + "epoch": 0.8130160746567645, + "grad_norm": 0.7691722512245178, + "learning_rate": 9.350109943806499e-06, + "loss": 0.5986, + "step": 83200 + }, + { + "epoch": 0.8139932574388039, + "grad_norm": 0.8947746157646179, + "learning_rate": 9.301246029806989e-06, + "loss": 0.5995, + "step": 83300 + }, + { + "epoch": 0.8149704402208433, + "grad_norm": 0.8654600381851196, + "learning_rate": 9.252382115807477e-06, + "loss": 0.5844, + "step": 83400 + }, + { + "epoch": 0.8159476230028827, + "grad_norm": 0.6563751697540283, + "learning_rate": 9.203518201807965e-06, + "loss": 0.588, + "step": 83500 + }, + { + "epoch": 0.8169248057849221, + "grad_norm": 0.756237804889679, + "learning_rate": 9.154654287808453e-06, + "loss": 0.5814, + "step": 83600 + }, + { + "epoch": 0.8179019885669615, + "grad_norm": 1.106650948524475, + "learning_rate": 9.105790373808943e-06, + "loss": 0.5924, + "step": 83700 + }, + { + "epoch": 0.8188791713490008, + "grad_norm": 0.39193272590637207, + "learning_rate": 9.05692645980943e-06, + "loss": 0.6048, + "step": 83800 + }, + { + "epoch": 0.8198563541310402, + "grad_norm": 0.7022530436515808, + "learning_rate": 9.00806254580992e-06, + "loss": 0.624, + "step": 83900 + }, + { + "epoch": 0.8208335369130796, + "grad_norm": 0.7286639213562012, + "learning_rate": 8.959198631810408e-06, + "loss": 0.5825, + "step": 84000 + }, + { + "epoch": 0.821810719695119, + "grad_norm": 
0.9062661528587341, + "learning_rate": 8.910334717810898e-06, + "loss": 0.6024, + "step": 84100 + }, + { + "epoch": 0.8227879024771584, + "grad_norm": 1.0051745176315308, + "learning_rate": 8.861470803811386e-06, + "loss": 0.5881, + "step": 84200 + }, + { + "epoch": 0.8237650852591978, + "grad_norm": 0.5622514486312866, + "learning_rate": 8.812606889811874e-06, + "loss": 0.625, + "step": 84300 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 0.80225670337677, + "learning_rate": 8.763742975812362e-06, + "loss": 0.6142, + "step": 84400 + }, + { + "epoch": 0.8257194508232765, + "grad_norm": 0.7154406905174255, + "learning_rate": 8.714879061812852e-06, + "loss": 0.6009, + "step": 84500 + }, + { + "epoch": 0.8266966336053159, + "grad_norm": 0.8191014528274536, + "learning_rate": 8.66601514781334e-06, + "loss": 0.6054, + "step": 84600 + }, + { + "epoch": 0.8276738163873553, + "grad_norm": 1.4982640743255615, + "learning_rate": 8.617151233813828e-06, + "loss": 0.5917, + "step": 84700 + }, + { + "epoch": 0.8286509991693947, + "grad_norm": 0.6662930250167847, + "learning_rate": 8.568287319814318e-06, + "loss": 0.6047, + "step": 84800 + }, + { + "epoch": 0.8296281819514341, + "grad_norm": 0.8533642888069153, + "learning_rate": 8.519423405814806e-06, + "loss": 0.6275, + "step": 84900 + }, + { + "epoch": 0.8306053647334734, + "grad_norm": 1.0405080318450928, + "learning_rate": 8.470559491815294e-06, + "loss": 0.6325, + "step": 85000 + }, + { + "epoch": 0.8315825475155127, + "grad_norm": 0.3838236629962921, + "learning_rate": 8.421695577815782e-06, + "loss": 0.617, + "step": 85100 + }, + { + "epoch": 0.8325597302975521, + "grad_norm": 0.7229349613189697, + "learning_rate": 8.372831663816272e-06, + "loss": 0.6095, + "step": 85200 + }, + { + "epoch": 0.8335369130795915, + "grad_norm": 0.538932204246521, + "learning_rate": 8.32396774981676e-06, + "loss": 0.597, + "step": 85300 + }, + { + "epoch": 0.834514095861631, + "grad_norm": 0.9081258177757263, + "learning_rate": 8.27510383581725e-06, + "loss": 0.576, + "step": 85400 + }, + { + "epoch": 0.8354912786436703, + "grad_norm": 1.1647875308990479, + "learning_rate": 8.226239921817738e-06, + "loss": 0.6177, + "step": 85500 + }, + { + "epoch": 0.8364684614257096, + "grad_norm": 0.5544024705886841, + "learning_rate": 8.177376007818228e-06, + "loss": 0.5944, + "step": 85600 + }, + { + "epoch": 0.837445644207749, + "grad_norm": 0.49571287631988525, + "learning_rate": 8.128512093818716e-06, + "loss": 0.6417, + "step": 85700 + }, + { + "epoch": 0.8384228269897884, + "grad_norm": 0.8068299293518066, + "learning_rate": 8.079648179819204e-06, + "loss": 0.6224, + "step": 85800 + }, + { + "epoch": 0.8394000097718278, + "grad_norm": 0.9682297706604004, + "learning_rate": 8.030784265819692e-06, + "loss": 0.6111, + "step": 85900 + }, + { + "epoch": 0.8403771925538672, + "grad_norm": 1.051151990890503, + "learning_rate": 7.981920351820182e-06, + "loss": 0.6, + "step": 86000 + }, + { + "epoch": 0.8413543753359066, + "grad_norm": 0.568880558013916, + "learning_rate": 7.93305643782067e-06, + "loss": 0.6129, + "step": 86100 + }, + { + "epoch": 0.8423315581179459, + "grad_norm": 0.7681874632835388, + "learning_rate": 7.88419252382116e-06, + "loss": 0.6291, + "step": 86200 + }, + { + "epoch": 0.8433087408999853, + "grad_norm": 0.7521129250526428, + "learning_rate": 7.835328609821647e-06, + "loss": 0.5983, + "step": 86300 + }, + { + "epoch": 0.8442859236820247, + "grad_norm": 0.6910899877548218, + "learning_rate": 7.786464695822136e-06, + "loss": 0.6065, + "step": 86400 + 
}, + { + "epoch": 0.8452631064640641, + "grad_norm": 1.0774552822113037, + "learning_rate": 7.737600781822624e-06, + "loss": 0.6481, + "step": 86500 + }, + { + "epoch": 0.8462402892461035, + "grad_norm": 0.5744395852088928, + "learning_rate": 7.688736867823113e-06, + "loss": 0.5881, + "step": 86600 + }, + { + "epoch": 0.8472174720281429, + "grad_norm": 0.9754884839057922, + "learning_rate": 7.639872953823601e-06, + "loss": 0.6028, + "step": 86700 + }, + { + "epoch": 0.8481946548101822, + "grad_norm": 0.5664985775947571, + "learning_rate": 7.59100903982409e-06, + "loss": 0.5759, + "step": 86800 + }, + { + "epoch": 0.8491718375922216, + "grad_norm": 0.7173051238059998, + "learning_rate": 7.542145125824579e-06, + "loss": 0.6038, + "step": 86900 + }, + { + "epoch": 0.850149020374261, + "grad_norm": 0.5157271027565002, + "learning_rate": 7.493281211825068e-06, + "loss": 0.5872, + "step": 87000 + }, + { + "epoch": 0.8511262031563004, + "grad_norm": 2.847447156906128, + "learning_rate": 7.444417297825557e-06, + "loss": 0.6008, + "step": 87100 + }, + { + "epoch": 0.8521033859383398, + "grad_norm": 1.259730577468872, + "learning_rate": 7.395553383826045e-06, + "loss": 0.6047, + "step": 87200 + }, + { + "epoch": 0.8530805687203792, + "grad_norm": 0.5175238847732544, + "learning_rate": 7.346689469826533e-06, + "loss": 0.6294, + "step": 87300 + }, + { + "epoch": 0.8540577515024186, + "grad_norm": 0.5168502926826477, + "learning_rate": 7.297825555827022e-06, + "loss": 0.5987, + "step": 87400 + }, + { + "epoch": 0.8550349342844579, + "grad_norm": 0.7485826015472412, + "learning_rate": 7.24896164182751e-06, + "loss": 0.604, + "step": 87500 + }, + { + "epoch": 0.8560121170664973, + "grad_norm": 1.2643144130706787, + "learning_rate": 7.200097727827999e-06, + "loss": 0.6271, + "step": 87600 + }, + { + "epoch": 0.8569892998485367, + "grad_norm": 0.598031222820282, + "learning_rate": 7.151233813828488e-06, + "loss": 0.6201, + "step": 87700 + }, + { + "epoch": 0.8579664826305761, + "grad_norm": 0.7994399666786194, + "learning_rate": 7.102369899828977e-06, + "loss": 0.6028, + "step": 87800 + }, + { + "epoch": 0.8589436654126155, + "grad_norm": 0.47928521037101746, + "learning_rate": 7.053505985829466e-06, + "loss": 0.6042, + "step": 87900 + }, + { + "epoch": 0.8599208481946549, + "grad_norm": 0.6901227831840515, + "learning_rate": 7.004642071829953e-06, + "loss": 0.6289, + "step": 88000 + }, + { + "epoch": 0.8608980309766942, + "grad_norm": 0.9630447030067444, + "learning_rate": 6.955778157830442e-06, + "loss": 0.6097, + "step": 88100 + }, + { + "epoch": 0.8618752137587335, + "grad_norm": 0.42696672677993774, + "learning_rate": 6.906914243830931e-06, + "loss": 0.6314, + "step": 88200 + }, + { + "epoch": 0.862852396540773, + "grad_norm": 0.5964066982269287, + "learning_rate": 6.85805032983142e-06, + "loss": 0.5934, + "step": 88300 + }, + { + "epoch": 0.8638295793228123, + "grad_norm": 0.5652678608894348, + "learning_rate": 6.809186415831909e-06, + "loss": 0.6032, + "step": 88400 + }, + { + "epoch": 0.8648067621048517, + "grad_norm": 0.6129952669143677, + "learning_rate": 6.7603225018323976e-06, + "loss": 0.6116, + "step": 88500 + }, + { + "epoch": 0.8657839448868911, + "grad_norm": 0.5786252021789551, + "learning_rate": 6.7114585878328865e-06, + "loss": 0.6042, + "step": 88600 + }, + { + "epoch": 0.8667611276689304, + "grad_norm": 0.9830735325813293, + "learning_rate": 6.662594673833375e-06, + "loss": 0.5763, + "step": 88700 + }, + { + "epoch": 0.8677383104509698, + "grad_norm": 0.7167491316795349, + 
"learning_rate": 6.613730759833863e-06, + "loss": 0.5774, + "step": 88800 + }, + { + "epoch": 0.8687154932330092, + "grad_norm": 0.5763813257217407, + "learning_rate": 6.5648668458343515e-06, + "loss": 0.6219, + "step": 88900 + }, + { + "epoch": 0.8696926760150486, + "grad_norm": 0.552343487739563, + "learning_rate": 6.5160029318348404e-06, + "loss": 0.5983, + "step": 89000 + }, + { + "epoch": 0.870669858797088, + "grad_norm": 0.6471940279006958, + "learning_rate": 6.4671390178353285e-06, + "loss": 0.616, + "step": 89100 + }, + { + "epoch": 0.8716470415791274, + "grad_norm": 0.2821710407733917, + "learning_rate": 6.418275103835817e-06, + "loss": 0.6093, + "step": 89200 + }, + { + "epoch": 0.8726242243611667, + "grad_norm": 0.8784298896789551, + "learning_rate": 6.369411189836306e-06, + "loss": 0.6004, + "step": 89300 + }, + { + "epoch": 0.8736014071432061, + "grad_norm": 0.5774518847465515, + "learning_rate": 6.320547275836795e-06, + "loss": 0.6177, + "step": 89400 + }, + { + "epoch": 0.8745785899252455, + "grad_norm": 2.489976406097412, + "learning_rate": 6.2716833618372825e-06, + "loss": 0.6294, + "step": 89500 + }, + { + "epoch": 0.8755557727072849, + "grad_norm": 0.8063492774963379, + "learning_rate": 6.222819447837772e-06, + "loss": 0.5815, + "step": 89600 + }, + { + "epoch": 0.8765329554893243, + "grad_norm": 0.9328792095184326, + "learning_rate": 6.17395553383826e-06, + "loss": 0.5709, + "step": 89700 + }, + { + "epoch": 0.8775101382713637, + "grad_norm": 1.1980705261230469, + "learning_rate": 6.125091619838749e-06, + "loss": 0.5916, + "step": 89800 + }, + { + "epoch": 0.878487321053403, + "grad_norm": 0.9140294194221497, + "learning_rate": 6.076227705839238e-06, + "loss": 0.5975, + "step": 89900 + }, + { + "epoch": 0.8794645038354424, + "grad_norm": 0.42323464155197144, + "learning_rate": 6.027363791839727e-06, + "loss": 0.5908, + "step": 90000 + }, + { + "epoch": 0.8804416866174818, + "grad_norm": 0.8265115022659302, + "learning_rate": 5.978499877840215e-06, + "loss": 0.6236, + "step": 90100 + }, + { + "epoch": 0.8814188693995212, + "grad_norm": 0.6848395466804504, + "learning_rate": 5.929635963840704e-06, + "loss": 0.6081, + "step": 90200 + }, + { + "epoch": 0.8823960521815606, + "grad_norm": 0.8593265414237976, + "learning_rate": 5.880772049841193e-06, + "loss": 0.5926, + "step": 90300 + }, + { + "epoch": 0.8833732349636, + "grad_norm": 0.9084621667861938, + "learning_rate": 5.831908135841682e-06, + "loss": 0.5795, + "step": 90400 + }, + { + "epoch": 0.8843504177456393, + "grad_norm": 0.5158432126045227, + "learning_rate": 5.78304422184217e-06, + "loss": 0.5887, + "step": 90500 + }, + { + "epoch": 0.8853276005276787, + "grad_norm": 0.9710085988044739, + "learning_rate": 5.734180307842659e-06, + "loss": 0.5888, + "step": 90600 + }, + { + "epoch": 0.8863047833097181, + "grad_norm": 0.4963410794734955, + "learning_rate": 5.685316393843147e-06, + "loss": 0.5981, + "step": 90700 + }, + { + "epoch": 0.8872819660917575, + "grad_norm": 0.39078134298324585, + "learning_rate": 5.636452479843636e-06, + "loss": 0.5991, + "step": 90800 + }, + { + "epoch": 0.8882591488737969, + "grad_norm": 0.5350062847137451, + "learning_rate": 5.587588565844124e-06, + "loss": 0.5887, + "step": 90900 + }, + { + "epoch": 0.8892363316558363, + "grad_norm": 0.6059613823890686, + "learning_rate": 5.538724651844613e-06, + "loss": 0.6072, + "step": 91000 + }, + { + "epoch": 0.8902135144378756, + "grad_norm": 0.4223475158214569, + "learning_rate": 5.489860737845102e-06, + "loss": 0.5866, + "step": 91100 + }, + { + 
"epoch": 0.891190697219915, + "grad_norm": 0.8053774237632751, + "learning_rate": 5.44099682384559e-06, + "loss": 0.6031, + "step": 91200 + }, + { + "epoch": 0.8921678800019543, + "grad_norm": 0.8851518034934998, + "learning_rate": 5.392132909846079e-06, + "loss": 0.5766, + "step": 91300 + }, + { + "epoch": 0.8931450627839937, + "grad_norm": 0.6842949986457825, + "learning_rate": 5.3432689958465675e-06, + "loss": 0.5593, + "step": 91400 + }, + { + "epoch": 0.8941222455660331, + "grad_norm": 0.8229865431785583, + "learning_rate": 5.2944050818470564e-06, + "loss": 0.5802, + "step": 91500 + }, + { + "epoch": 0.8950994283480725, + "grad_norm": 0.7434598207473755, + "learning_rate": 5.2455411678475445e-06, + "loss": 0.6004, + "step": 91600 + }, + { + "epoch": 0.8960766111301118, + "grad_norm": 0.47747936844825745, + "learning_rate": 5.196677253848033e-06, + "loss": 0.5937, + "step": 91700 + }, + { + "epoch": 0.8970537939121512, + "grad_norm": 0.7917630076408386, + "learning_rate": 5.147813339848522e-06, + "loss": 0.6119, + "step": 91800 + }, + { + "epoch": 0.8980309766941906, + "grad_norm": 0.8409056663513184, + "learning_rate": 5.098949425849011e-06, + "loss": 0.6004, + "step": 91900 + }, + { + "epoch": 0.89900815947623, + "grad_norm": 0.5597165822982788, + "learning_rate": 5.050085511849499e-06, + "loss": 0.6076, + "step": 92000 + }, + { + "epoch": 0.8999853422582694, + "grad_norm": 0.5740428566932678, + "learning_rate": 5.001221597849988e-06, + "loss": 0.5925, + "step": 92100 + }, + { + "epoch": 0.9009625250403088, + "grad_norm": 0.739456832408905, + "learning_rate": 4.952357683850477e-06, + "loss": 0.5945, + "step": 92200 + }, + { + "epoch": 0.9019397078223482, + "grad_norm": 0.5648947954177856, + "learning_rate": 4.903493769850965e-06, + "loss": 0.5712, + "step": 92300 + }, + { + "epoch": 0.9029168906043875, + "grad_norm": 0.5736894607543945, + "learning_rate": 4.854629855851454e-06, + "loss": 0.6111, + "step": 92400 + }, + { + "epoch": 0.9038940733864269, + "grad_norm": 0.7701774835586548, + "learning_rate": 4.805765941851942e-06, + "loss": 0.599, + "step": 92500 + }, + { + "epoch": 0.9048712561684663, + "grad_norm": 0.7485201358795166, + "learning_rate": 4.756902027852431e-06, + "loss": 0.5842, + "step": 92600 + }, + { + "epoch": 0.9058484389505057, + "grad_norm": 0.6121499538421631, + "learning_rate": 4.70803811385292e-06, + "loss": 0.6198, + "step": 92700 + }, + { + "epoch": 0.9068256217325451, + "grad_norm": 0.7362948656082153, + "learning_rate": 4.659174199853408e-06, + "loss": 0.6123, + "step": 92800 + }, + { + "epoch": 0.9078028045145845, + "grad_norm": 0.606191098690033, + "learning_rate": 4.610310285853897e-06, + "loss": 0.6028, + "step": 92900 + }, + { + "epoch": 0.9087799872966238, + "grad_norm": 0.6618565917015076, + "learning_rate": 4.561446371854386e-06, + "loss": 0.5963, + "step": 93000 + }, + { + "epoch": 0.9097571700786632, + "grad_norm": 1.5052400827407837, + "learning_rate": 4.512582457854874e-06, + "loss": 0.603, + "step": 93100 + }, + { + "epoch": 0.9107343528607026, + "grad_norm": 0.8985777497291565, + "learning_rate": 4.463718543855363e-06, + "loss": 0.6156, + "step": 93200 + }, + { + "epoch": 0.911711535642742, + "grad_norm": 0.8037851452827454, + "learning_rate": 4.414854629855852e-06, + "loss": 0.6406, + "step": 93300 + }, + { + "epoch": 0.9126887184247814, + "grad_norm": 0.49996376037597656, + "learning_rate": 4.365990715856341e-06, + "loss": 0.6139, + "step": 93400 + }, + { + "epoch": 0.9136659012068208, + "grad_norm": 0.8254772424697876, + "learning_rate": 
4.317126801856829e-06, + "loss": 0.6149, + "step": 93500 + }, + { + "epoch": 0.9146430839888601, + "grad_norm": 0.7700937390327454, + "learning_rate": 4.268262887857318e-06, + "loss": 0.5993, + "step": 93600 + }, + { + "epoch": 0.9156202667708995, + "grad_norm": 0.38511478900909424, + "learning_rate": 4.2193989738578065e-06, + "loss": 0.6232, + "step": 93700 + }, + { + "epoch": 0.9165974495529389, + "grad_norm": 0.6567879319190979, + "learning_rate": 4.1705350598582955e-06, + "loss": 0.5813, + "step": 93800 + }, + { + "epoch": 0.9175746323349783, + "grad_norm": 0.8876736760139465, + "learning_rate": 4.1216711458587835e-06, + "loss": 0.5938, + "step": 93900 + }, + { + "epoch": 0.9185518151170177, + "grad_norm": 0.41622501611709595, + "learning_rate": 4.0728072318592724e-06, + "loss": 0.579, + "step": 94000 + }, + { + "epoch": 0.9195289978990571, + "grad_norm": 0.7455472946166992, + "learning_rate": 4.0239433178597605e-06, + "loss": 0.6011, + "step": 94100 + }, + { + "epoch": 0.9205061806810964, + "grad_norm": 0.5976389646530151, + "learning_rate": 3.975079403860249e-06, + "loss": 0.6143, + "step": 94200 + }, + { + "epoch": 0.9214833634631358, + "grad_norm": 0.7773202657699585, + "learning_rate": 3.9262154898607375e-06, + "loss": 0.5796, + "step": 94300 + }, + { + "epoch": 0.9224605462451752, + "grad_norm": 0.5033147931098938, + "learning_rate": 3.877351575861226e-06, + "loss": 0.5994, + "step": 94400 + }, + { + "epoch": 0.9234377290272145, + "grad_norm": 0.7234833240509033, + "learning_rate": 3.828487661861715e-06, + "loss": 0.6102, + "step": 94500 + }, + { + "epoch": 0.924414911809254, + "grad_norm": 0.4259088635444641, + "learning_rate": 3.7796237478622038e-06, + "loss": 0.5787, + "step": 94600 + }, + { + "epoch": 0.9253920945912933, + "grad_norm": 0.43989598751068115, + "learning_rate": 3.7307598338626923e-06, + "loss": 0.5841, + "step": 94700 + }, + { + "epoch": 0.9263692773733326, + "grad_norm": 0.4430140256881714, + "learning_rate": 3.681895919863181e-06, + "loss": 0.5933, + "step": 94800 + }, + { + "epoch": 0.927346460155372, + "grad_norm": 0.7848074436187744, + "learning_rate": 3.63303200586367e-06, + "loss": 0.6138, + "step": 94900 + }, + { + "epoch": 0.9283236429374114, + "grad_norm": 0.8117037415504456, + "learning_rate": 3.584168091864158e-06, + "loss": 0.5917, + "step": 95000 + }, + { + "epoch": 0.9293008257194508, + "grad_norm": 0.6667145490646362, + "learning_rate": 3.535304177864647e-06, + "loss": 0.5542, + "step": 95100 + }, + { + "epoch": 0.9302780085014902, + "grad_norm": 0.7902615070343018, + "learning_rate": 3.486440263865136e-06, + "loss": 0.5741, + "step": 95200 + }, + { + "epoch": 0.9312551912835296, + "grad_norm": 0.7067260146141052, + "learning_rate": 3.4375763498656245e-06, + "loss": 0.5961, + "step": 95300 + }, + { + "epoch": 0.9322323740655689, + "grad_norm": 2.328338861465454, + "learning_rate": 3.388712435866113e-06, + "loss": 0.5716, + "step": 95400 + }, + { + "epoch": 0.9332095568476083, + "grad_norm": 1.1518771648406982, + "learning_rate": 3.3398485218666014e-06, + "loss": 0.6306, + "step": 95500 + }, + { + "epoch": 0.9341867396296477, + "grad_norm": 0.5183611512184143, + "learning_rate": 3.2909846078670904e-06, + "loss": 0.5998, + "step": 95600 + }, + { + "epoch": 0.9351639224116871, + "grad_norm": 0.6827223300933838, + "learning_rate": 3.2421206938675793e-06, + "loss": 0.5948, + "step": 95700 + }, + { + "epoch": 0.9361411051937265, + "grad_norm": 0.6556549668312073, + "learning_rate": 3.1932567798680673e-06, + "loss": 0.6014, + "step": 95800 + }, + { + 
"epoch": 0.9371182879757659, + "grad_norm": 0.5259923934936523, + "learning_rate": 3.1443928658685562e-06, + "loss": 0.6192, + "step": 95900 + }, + { + "epoch": 0.9380954707578052, + "grad_norm": 0.6890705823898315, + "learning_rate": 3.095528951869045e-06, + "loss": 0.5922, + "step": 96000 + }, + { + "epoch": 0.9390726535398446, + "grad_norm": 0.5739189386367798, + "learning_rate": 3.0466650378695336e-06, + "loss": 0.572, + "step": 96100 + }, + { + "epoch": 0.940049836321884, + "grad_norm": 0.4784778356552124, + "learning_rate": 2.997801123870022e-06, + "loss": 0.5924, + "step": 96200 + }, + { + "epoch": 0.9410270191039234, + "grad_norm": 0.4622921049594879, + "learning_rate": 2.9489372098705106e-06, + "loss": 0.6223, + "step": 96300 + }, + { + "epoch": 0.9420042018859628, + "grad_norm": 0.7146719098091125, + "learning_rate": 2.900073295870999e-06, + "loss": 0.589, + "step": 96400 + }, + { + "epoch": 0.9429813846680022, + "grad_norm": 0.5467257499694824, + "learning_rate": 2.851209381871488e-06, + "loss": 0.6197, + "step": 96500 + }, + { + "epoch": 0.9439585674500416, + "grad_norm": 0.6875296831130981, + "learning_rate": 2.8023454678719765e-06, + "loss": 0.588, + "step": 96600 + }, + { + "epoch": 0.9449357502320809, + "grad_norm": 0.8921650052070618, + "learning_rate": 2.7534815538724654e-06, + "loss": 0.6008, + "step": 96700 + }, + { + "epoch": 0.9459129330141203, + "grad_norm": 0.6401572823524475, + "learning_rate": 2.704617639872954e-06, + "loss": 0.5858, + "step": 96800 + }, + { + "epoch": 0.9468901157961597, + "grad_norm": 0.7191618084907532, + "learning_rate": 2.655753725873443e-06, + "loss": 0.5763, + "step": 96900 + }, + { + "epoch": 0.9478672985781991, + "grad_norm": 0.6186959147453308, + "learning_rate": 2.6068898118739313e-06, + "loss": 0.5695, + "step": 97000 + }, + { + "epoch": 0.9488444813602385, + "grad_norm": 0.36472517251968384, + "learning_rate": 2.5580258978744198e-06, + "loss": 0.5819, + "step": 97100 + }, + { + "epoch": 0.9498216641422779, + "grad_norm": 1.0958882570266724, + "learning_rate": 2.5091619838749083e-06, + "loss": 0.6167, + "step": 97200 + }, + { + "epoch": 0.9507988469243172, + "grad_norm": 0.7372691631317139, + "learning_rate": 2.460298069875397e-06, + "loss": 0.5936, + "step": 97300 + }, + { + "epoch": 0.9517760297063566, + "grad_norm": 0.4143502116203308, + "learning_rate": 2.4114341558758857e-06, + "loss": 0.5873, + "step": 97400 + }, + { + "epoch": 0.952753212488396, + "grad_norm": 1.134059190750122, + "learning_rate": 2.3625702418763746e-06, + "loss": 0.6143, + "step": 97500 + }, + { + "epoch": 0.9537303952704353, + "grad_norm": 0.40213558077812195, + "learning_rate": 2.313706327876863e-06, + "loss": 0.5725, + "step": 97600 + }, + { + "epoch": 0.9547075780524747, + "grad_norm": 0.5387831926345825, + "learning_rate": 2.264842413877352e-06, + "loss": 0.5959, + "step": 97700 + }, + { + "epoch": 0.9556847608345141, + "grad_norm": 0.8288729786872864, + "learning_rate": 2.2159784998778405e-06, + "loss": 0.5881, + "step": 97800 + }, + { + "epoch": 0.9566619436165534, + "grad_norm": 0.7433648109436035, + "learning_rate": 2.167114585878329e-06, + "loss": 0.5881, + "step": 97900 + }, + { + "epoch": 0.9576391263985928, + "grad_norm": 0.7633154392242432, + "learning_rate": 2.1182506718788174e-06, + "loss": 0.6218, + "step": 98000 + }, + { + "epoch": 0.9586163091806322, + "grad_norm": 0.5039961338043213, + "learning_rate": 2.069386757879306e-06, + "loss": 0.5973, + "step": 98100 + }, + { + "epoch": 0.9595934919626716, + "grad_norm": 0.9047883152961731, + 
"learning_rate": 2.020522843879795e-06, + "loss": 0.5741, + "step": 98200 + }, + { + "epoch": 0.960570674744711, + "grad_norm": 0.6591965556144714, + "learning_rate": 1.9716589298802833e-06, + "loss": 0.5914, + "step": 98300 + }, + { + "epoch": 0.9615478575267504, + "grad_norm": 0.6809371113777161, + "learning_rate": 1.9227950158807722e-06, + "loss": 0.5876, + "step": 98400 + }, + { + "epoch": 0.9625250403087897, + "grad_norm": 0.5399168133735657, + "learning_rate": 1.8739311018812607e-06, + "loss": 0.5921, + "step": 98500 + }, + { + "epoch": 0.9635022230908291, + "grad_norm": 0.6308420896530151, + "learning_rate": 1.8250671878817494e-06, + "loss": 0.5805, + "step": 98600 + }, + { + "epoch": 0.9644794058728685, + "grad_norm": 0.8909119963645935, + "learning_rate": 1.776203273882238e-06, + "loss": 0.6062, + "step": 98700 + }, + { + "epoch": 0.9654565886549079, + "grad_norm": 0.5217241048812866, + "learning_rate": 1.7273393598827268e-06, + "loss": 0.5866, + "step": 98800 + }, + { + "epoch": 0.9664337714369473, + "grad_norm": 0.5441256165504456, + "learning_rate": 1.6784754458832153e-06, + "loss": 0.5889, + "step": 98900 + }, + { + "epoch": 0.9674109542189867, + "grad_norm": 0.6473023891448975, + "learning_rate": 1.629611531883704e-06, + "loss": 0.6066, + "step": 99000 + }, + { + "epoch": 0.968388137001026, + "grad_norm": 0.7462273836135864, + "learning_rate": 1.5807476178841925e-06, + "loss": 0.5926, + "step": 99100 + }, + { + "epoch": 0.9693653197830654, + "grad_norm": 0.4794386029243469, + "learning_rate": 1.5318837038846812e-06, + "loss": 0.5856, + "step": 99200 + }, + { + "epoch": 0.9703425025651048, + "grad_norm": 0.5676984190940857, + "learning_rate": 1.48301978988517e-06, + "loss": 0.5797, + "step": 99300 + }, + { + "epoch": 0.9713196853471442, + "grad_norm": 0.7232435941696167, + "learning_rate": 1.4341558758856586e-06, + "loss": 0.6122, + "step": 99400 + }, + { + "epoch": 0.9722968681291836, + "grad_norm": 0.6773326396942139, + "learning_rate": 1.385291961886147e-06, + "loss": 0.5877, + "step": 99500 + }, + { + "epoch": 0.973274050911223, + "grad_norm": 0.522219717502594, + "learning_rate": 1.3364280478866358e-06, + "loss": 0.5819, + "step": 99600 + }, + { + "epoch": 0.9742512336932623, + "grad_norm": 0.7057138681411743, + "learning_rate": 1.2875641338871245e-06, + "loss": 0.6047, + "step": 99700 + }, + { + "epoch": 0.9752284164753017, + "grad_norm": 0.8740668296813965, + "learning_rate": 1.2387002198876132e-06, + "loss": 0.5909, + "step": 99800 + }, + { + "epoch": 0.9762055992573411, + "grad_norm": 0.6199445128440857, + "learning_rate": 1.1898363058881017e-06, + "loss": 0.5972, + "step": 99900 + }, + { + "epoch": 0.9771827820393805, + "grad_norm": 0.8061028122901917, + "learning_rate": 1.1409723918885904e-06, + "loss": 0.5958, + "step": 100000 + } + ], + "logging_steps": 100, + "max_steps": 102335, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.254292317011968e+18, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}