diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,31943 +1,3992 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9999069680900549, "eval_steps": 100, - "global_step": 21570, + "global_step": 2687, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0002318034306907742, - "grad_norm": 6.929919242858887, - "learning_rate": 4.636068613815485e-08, - "loss": 1.3729, + "epoch": 0.0018606381989022234, + "grad_norm": 0.025198739022016525, + "learning_rate": 3.7174721189591085e-07, + "loss": 0.7637, "step": 5 }, { - "epoch": 0.0004636068613815484, - "grad_norm": 7.42155647277832, - "learning_rate": 9.27213722763097e-08, - "loss": 1.33, + "epoch": 0.003721276397804447, + "grad_norm": 0.017377199605107307, + "learning_rate": 7.434944237918217e-07, + "loss": 0.7786, "step": 10 }, { - "epoch": 0.0006954102920723226, - "grad_norm": 7.66618537902832, - "learning_rate": 1.3908205841446455e-07, - "loss": 1.1782, + "epoch": 0.0055819145967066705, + "grad_norm": 0.019743537530303, + "learning_rate": 1.1152416356877324e-06, + "loss": 0.7174, "step": 15 }, { - "epoch": 0.0009272137227630969, - "grad_norm": 7.257485389709473, - "learning_rate": 1.854427445526194e-07, - "loss": 1.2896, + "epoch": 0.007442552795608894, + "grad_norm": 0.017805561423301697, + "learning_rate": 1.4869888475836434e-06, + "loss": 0.724, "step": 20 }, { - "epoch": 0.0011590171534538712, - "grad_norm": 9.955284118652344, - "learning_rate": 2.3180343069077424e-07, - "loss": 1.3364, + "epoch": 0.009303190994511117, + "grad_norm": 0.017246991395950317, + "learning_rate": 1.858736059479554e-06, + "loss": 0.7668, "step": 25 }, { - "epoch": 0.0013908205841446453, - "grad_norm": 10.206951141357422, - "learning_rate": 2.781641168289291e-07, - "loss": 1.4175, + "epoch": 0.011163829193413341, + "grad_norm": 0.023152414709329605, + "learning_rate": 2.2304832713754648e-06, + "loss": 0.781, "step": 30 }, { - "epoch": 0.0016226240148354196, - "grad_norm": 7.223724365234375, - "learning_rate": 3.2452480296708393e-07, - "loss": 1.2606, + "epoch": 0.013024467392315565, + "grad_norm": 0.019485710188746452, + "learning_rate": 2.6022304832713758e-06, + "loss": 0.7593, "step": 35 }, { - "epoch": 0.0018544274455261937, - "grad_norm": 7.419017791748047, - "learning_rate": 3.708854891052388e-07, - "loss": 1.2551, + "epoch": 0.014885105591217787, + "grad_norm": 0.01784881390631199, + "learning_rate": 2.973977695167287e-06, + "loss": 0.762, "step": 40 }, { - "epoch": 0.002086230876216968, - "grad_norm": 7.923377990722656, - "learning_rate": 4.1724617524339365e-07, - "loss": 1.3281, + "epoch": 0.01674574379012001, + "grad_norm": 0.01972021535038948, + "learning_rate": 3.3457249070631974e-06, + "loss": 0.7489, "step": 45 }, { - "epoch": 0.0023180343069077423, - "grad_norm": 8.805521011352539, - "learning_rate": 4.636068613815485e-07, - "loss": 1.2294, + "epoch": 0.018606381989022234, + "grad_norm": 0.019333072006702423, + "learning_rate": 3.717472118959108e-06, + "loss": 0.7461, "step": 50 }, { - "epoch": 0.0025498377375985167, - "grad_norm": 7.392923355102539, - "learning_rate": 5.099675475197033e-07, - "loss": 1.1609, + "epoch": 0.02046702018792446, + "grad_norm": 0.015675414353609085, + "learning_rate": 4.089219330855019e-06, + "loss": 0.7621, "step": 55 }, { - "epoch": 0.0027816411682892906, - "grad_norm": 7.491943836212158, - "learning_rate": 5.563282336578582e-07, - "loss": 1.2333, + "epoch": 0.022327658386826682, + "grad_norm": 0.021361010149121284, + "learning_rate": 4.4609665427509296e-06, + "loss": 0.7677, "step": 60 }, { - "epoch": 0.003013444598980065, - "grad_norm": 7.105735778808594, - "learning_rate": 6.026889197960131e-07, - "loss": 1.4028, + "epoch": 0.024188296585728904, + "grad_norm": 0.021466689184308052, + "learning_rate": 4.83271375464684e-06, + "loss": 0.7857, "step": 65 }, { - "epoch": 0.003245248029670839, - "grad_norm": 5.720520973205566, - "learning_rate": 6.490496059341679e-07, - "loss": 1.2407, + "epoch": 0.02604893478463113, + "grad_norm": 0.019599348306655884, + "learning_rate": 5.2044609665427516e-06, + "loss": 0.7308, "step": 70 }, { - "epoch": 0.0034770514603616135, - "grad_norm": 6.424951076507568, - "learning_rate": 6.954102920723226e-07, - "loss": 1.1028, + "epoch": 0.027909572983533353, + "grad_norm": 0.015855278819799423, + "learning_rate": 5.576208178438662e-06, + "loss": 0.7423, "step": 75 }, { - "epoch": 0.0037088548910523874, - "grad_norm": 7.229697227478027, - "learning_rate": 7.417709782104776e-07, - "loss": 1.2738, + "epoch": 0.029770211182435575, + "grad_norm": 0.019443219527602196, + "learning_rate": 5.947955390334574e-06, + "loss": 0.7782, "step": 80 }, { - "epoch": 0.003940658321743162, - "grad_norm": 6.052042484283447, - "learning_rate": 7.881316643486324e-07, - "loss": 1.2327, + "epoch": 0.0316308493813378, + "grad_norm": 0.01575319655239582, + "learning_rate": 6.319702602230484e-06, + "loss": 0.7494, "step": 85 }, { - "epoch": 0.004172461752433936, - "grad_norm": 6.341604709625244, - "learning_rate": 8.344923504867873e-07, - "loss": 1.3533, + "epoch": 0.03349148758024002, + "grad_norm": 0.0159482192248106, + "learning_rate": 6.691449814126395e-06, + "loss": 0.6999, "step": 90 }, { - "epoch": 0.00440426518312471, - "grad_norm": 6.311711311340332, - "learning_rate": 8.808530366249421e-07, - "loss": 1.2228, + "epoch": 0.03535212577914225, + "grad_norm": 0.016392122954130173, + "learning_rate": 7.063197026022306e-06, + "loss": 0.6966, "step": 95 }, { - "epoch": 0.004636068613815485, - "grad_norm": 6.693397045135498, - "learning_rate": 9.27213722763097e-07, - "loss": 1.2429, + "epoch": 0.03721276397804447, + "grad_norm": 0.017619600519537926, + "learning_rate": 7.434944237918216e-06, + "loss": 0.7659, "step": 100 }, { - "epoch": 0.004636068613815485, - "eval_loss": 1.2549220323562622, - "eval_runtime": 11.2838, - "eval_samples_per_second": 11.255, - "eval_steps_per_second": 11.255, + "epoch": 0.03721276397804447, + "eval_loss": 0.760661780834198, + "eval_runtime": 26.8914, + "eval_samples_per_second": 4.723, + "eval_steps_per_second": 4.723, "step": 100 }, { - "epoch": 0.004867872044506259, - "grad_norm": 7.291402339935303, - "learning_rate": 9.735744089012517e-07, - "loss": 1.3052, + "epoch": 0.039073402176946694, + "grad_norm": 0.02020377479493618, + "learning_rate": 7.806691449814127e-06, + "loss": 0.7475, "step": 105 }, { - "epoch": 0.005099675475197033, - "grad_norm": 6.058638095855713, - "learning_rate": 1.0199350950394066e-06, - "loss": 1.1783, + "epoch": 0.04093404037584892, + "grad_norm": 0.018723690882325172, + "learning_rate": 8.178438661710038e-06, + "loss": 0.7233, "step": 110 }, { - "epoch": 0.005331478905887807, - "grad_norm": 6.480159282684326, - "learning_rate": 1.0662957811775615e-06, - "loss": 1.2306, + "epoch": 0.04279467857475114, + "grad_norm": 0.017194446176290512, + "learning_rate": 8.550185873605949e-06, + "loss": 0.7655, "step": 115 }, { - "epoch": 0.005563282336578581, - "grad_norm": 6.4310221672058105, - "learning_rate": 1.1126564673157164e-06, - "loss": 1.0381, + "epoch": 0.044655316773653364, + "grad_norm": 0.013910962268710136, + "learning_rate": 8.921933085501859e-06, + "loss": 0.7493, "step": 120 }, { - "epoch": 0.005795085767269356, - "grad_norm": 5.661340236663818, - "learning_rate": 1.1590171534538713e-06, - "loss": 1.1524, + "epoch": 0.04651595497255559, + "grad_norm": 0.01600288413465023, + "learning_rate": 9.29368029739777e-06, + "loss": 0.7528, "step": 125 }, { - "epoch": 0.00602688919796013, - "grad_norm": 6.941490173339844, - "learning_rate": 1.2053778395920262e-06, - "loss": 1.2525, + "epoch": 0.04837659317145781, + "grad_norm": 0.01554879080504179, + "learning_rate": 9.66542750929368e-06, + "loss": 0.6947, "step": 130 }, { - "epoch": 0.006258692628650904, - "grad_norm": 8.481853485107422, - "learning_rate": 1.2517385257301808e-06, - "loss": 1.2434, + "epoch": 0.050237231370360035, + "grad_norm": 0.015109645202755928, + "learning_rate": 1.0037174721189591e-05, + "loss": 0.6647, "step": 135 }, { - "epoch": 0.006490496059341678, - "grad_norm": 6.217038154602051, - "learning_rate": 1.2980992118683357e-06, - "loss": 1.3233, + "epoch": 0.05209786956926226, + "grad_norm": 0.015620779246091843, + "learning_rate": 1.0408921933085503e-05, + "loss": 0.7298, "step": 140 }, { - "epoch": 0.006722299490032452, - "grad_norm": 5.551589012145996, - "learning_rate": 1.3444598980064908e-06, - "loss": 1.1833, + "epoch": 0.05395850776816448, + "grad_norm": 0.013970567844808102, + "learning_rate": 1.0780669144981412e-05, + "loss": 0.7485, "step": 145 }, { - "epoch": 0.006954102920723227, - "grad_norm": 5.535802364349365, - "learning_rate": 1.3908205841446453e-06, - "loss": 1.2354, + "epoch": 0.055819145967066705, + "grad_norm": 0.014776401221752167, + "learning_rate": 1.1152416356877324e-05, + "loss": 0.6767, "step": 150 }, { - "epoch": 0.007185906351414001, - "grad_norm": 6.194558143615723, - "learning_rate": 1.4371812702828004e-06, - "loss": 1.2508, + "epoch": 0.057679784165968924, + "grad_norm": 0.014368101954460144, + "learning_rate": 1.1524163568773235e-05, + "loss": 0.7037, "step": 155 }, { - "epoch": 0.007417709782104775, - "grad_norm": 6.735708236694336, - "learning_rate": 1.4835419564209553e-06, - "loss": 1.2612, + "epoch": 0.05954042236487115, + "grad_norm": 0.01449244562536478, + "learning_rate": 1.1895910780669147e-05, + "loss": 0.7305, "step": 160 }, { - "epoch": 0.0076495132127955496, - "grad_norm": 6.2094855308532715, - "learning_rate": 1.52990264255911e-06, - "loss": 1.2216, + "epoch": 0.061401060563773376, + "grad_norm": 0.015350298024713993, + "learning_rate": 1.2267657992565056e-05, + "loss": 0.6804, "step": 165 }, { - "epoch": 0.007881316643486324, - "grad_norm": 5.640394687652588, - "learning_rate": 1.5762633286972648e-06, - "loss": 0.9614, + "epoch": 0.0632616987626756, + "grad_norm": 0.014395203441381454, + "learning_rate": 1.2639405204460968e-05, + "loss": 0.6832, "step": 170 }, { - "epoch": 0.008113120074177098, - "grad_norm": 6.062097549438477, - "learning_rate": 1.6226240148354197e-06, - "loss": 1.0738, + "epoch": 0.06512233696157782, + "grad_norm": 0.015541068278253078, + "learning_rate": 1.3011152416356879e-05, + "loss": 0.7168, "step": 175 }, { - "epoch": 0.008344923504867872, - "grad_norm": 7.769379615783691, - "learning_rate": 1.6689847009735746e-06, - "loss": 1.1195, + "epoch": 0.06698297516048005, + "grad_norm": 0.012579885311424732, + "learning_rate": 1.338289962825279e-05, + "loss": 0.693, "step": 180 }, { - "epoch": 0.008576726935558646, - "grad_norm": 6.438906192779541, - "learning_rate": 1.7153453871117293e-06, - "loss": 1.3437, + "epoch": 0.06884361335938227, + "grad_norm": 0.014168789610266685, + "learning_rate": 1.37546468401487e-05, + "loss": 0.7056, "step": 185 }, { - "epoch": 0.00880853036624942, - "grad_norm": 5.559536933898926, - "learning_rate": 1.7617060732498842e-06, - "loss": 1.2253, + "epoch": 0.0707042515582845, + "grad_norm": 0.012087655253708363, + "learning_rate": 1.4126394052044612e-05, + "loss": 0.7006, "step": 190 }, { - "epoch": 0.009040333796940195, - "grad_norm": 6.249836444854736, - "learning_rate": 1.808066759388039e-06, - "loss": 1.1692, + "epoch": 0.07256488975718671, + "grad_norm": 0.014433121308684349, + "learning_rate": 1.4498141263940521e-05, + "loss": 0.7016, "step": 195 }, { - "epoch": 0.00927213722763097, - "grad_norm": 5.227753162384033, - "learning_rate": 1.854427445526194e-06, - "loss": 1.0196, + "epoch": 0.07442552795608894, + "grad_norm": 0.015074139460921288, + "learning_rate": 1.4869888475836432e-05, + "loss": 0.6912, "step": 200 }, { - "epoch": 0.00927213722763097, - "eval_loss": 1.1532349586486816, - "eval_runtime": 11.1843, - "eval_samples_per_second": 11.355, - "eval_steps_per_second": 11.355, + "epoch": 0.07442552795608894, + "eval_loss": 0.7027233242988586, + "eval_runtime": 26.5611, + "eval_samples_per_second": 4.781, + "eval_steps_per_second": 4.781, "step": 200 }, { - "epoch": 0.009503940658321743, - "grad_norm": 4.921353340148926, - "learning_rate": 1.9007881316643486e-06, - "loss": 0.9627, + "epoch": 0.07628616615499116, + "grad_norm": 0.013617471791803837, + "learning_rate": 1.5241635687732344e-05, + "loss": 0.7139, "step": 205 }, { - "epoch": 0.009735744089012517, - "grad_norm": 8.843475341796875, - "learning_rate": 1.9471488178025035e-06, - "loss": 1.1093, + "epoch": 0.07814680435389339, + "grad_norm": 0.013274065218865871, + "learning_rate": 1.5613382899628255e-05, + "loss": 0.6955, "step": 210 }, { - "epoch": 0.009967547519703291, - "grad_norm": 5.638999938964844, - "learning_rate": 1.9935095039406584e-06, - "loss": 1.1378, + "epoch": 0.08000744255279561, + "grad_norm": 0.013781987130641937, + "learning_rate": 1.5985130111524165e-05, + "loss": 0.65, "step": 215 }, { - "epoch": 0.010199350950394067, - "grad_norm": 6.697763919830322, - "learning_rate": 2.0398701900788133e-06, - "loss": 1.1164, + "epoch": 0.08186808075169784, + "grad_norm": 0.01373015996068716, + "learning_rate": 1.6356877323420076e-05, + "loss": 0.6681, "step": 220 }, { - "epoch": 0.01043115438108484, - "grad_norm": 5.590152263641357, - "learning_rate": 2.086230876216968e-06, - "loss": 1.1577, + "epoch": 0.08372871895060005, + "grad_norm": 0.01403126772493124, + "learning_rate": 1.6728624535315986e-05, + "loss": 0.6981, "step": 225 }, { - "epoch": 0.010662957811775614, - "grad_norm": 8.595314979553223, - "learning_rate": 2.132591562355123e-06, - "loss": 0.9921, + "epoch": 0.08558935714950228, + "grad_norm": 0.01332685723900795, + "learning_rate": 1.7100371747211897e-05, + "loss": 0.6517, "step": 230 }, { - "epoch": 0.010894761242466388, - "grad_norm": 6.406239986419678, - "learning_rate": 2.178952248493278e-06, - "loss": 1.0457, + "epoch": 0.0874499953484045, + "grad_norm": 0.013317782431840897, + "learning_rate": 1.7472118959107808e-05, + "loss": 0.6835, "step": 235 }, { - "epoch": 0.011126564673157162, - "grad_norm": 6.516043663024902, - "learning_rate": 2.225312934631433e-06, - "loss": 1.3394, + "epoch": 0.08931063354730673, + "grad_norm": 0.015096917748451233, + "learning_rate": 1.7843866171003718e-05, + "loss": 0.7138, "step": 240 }, { - "epoch": 0.011358368103847936, - "grad_norm": 8.184934616088867, - "learning_rate": 2.2716736207695877e-06, - "loss": 1.1182, + "epoch": 0.09117127174620895, + "grad_norm": 0.013502717949450016, + "learning_rate": 1.8215613382899632e-05, + "loss": 0.6722, "step": 245 }, { - "epoch": 0.011590171534538712, - "grad_norm": 6.349449157714844, - "learning_rate": 2.3180343069077426e-06, - "loss": 1.0817, + "epoch": 0.09303190994511118, + "grad_norm": 0.01279054582118988, + "learning_rate": 1.858736059479554e-05, + "loss": 0.6323, "step": 250 }, { - "epoch": 0.011821974965229486, - "grad_norm": 8.29964828491211, - "learning_rate": 2.3643949930458975e-06, - "loss": 0.9925, + "epoch": 0.09489254814401339, + "grad_norm": 0.011777005158364773, + "learning_rate": 1.8959107806691453e-05, + "loss": 0.6332, "step": 255 }, { - "epoch": 0.01205377839592026, - "grad_norm": 4.632545471191406, - "learning_rate": 2.4107556791840523e-06, - "loss": 0.9848, + "epoch": 0.09675318634291562, + "grad_norm": 0.01320689544081688, + "learning_rate": 1.933085501858736e-05, + "loss": 0.7034, "step": 260 }, { - "epoch": 0.012285581826611033, - "grad_norm": 9.396188735961914, - "learning_rate": 2.457116365322207e-06, - "loss": 1.1873, + "epoch": 0.09861382454181784, + "grad_norm": 0.012140162289142609, + "learning_rate": 1.970260223048327e-05, + "loss": 0.6154, "step": 265 }, { - "epoch": 0.012517385257301807, - "grad_norm": 9.827582359313965, - "learning_rate": 2.5034770514603617e-06, - "loss": 1.219, + "epoch": 0.10047446274072007, + "grad_norm": 0.01772845722734928, + "learning_rate": 1.9999991559715313e-05, + "loss": 0.7119, "step": 270 }, { - "epoch": 0.012749188687992583, - "grad_norm": 6.420851230621338, - "learning_rate": 2.5498377375985166e-06, - "loss": 1.1635, + "epoch": 0.1023351009396223, + "grad_norm": 0.012948377057909966, + "learning_rate": 1.999969615124717e-05, + "loss": 0.6554, "step": 275 }, { - "epoch": 0.012980992118683357, - "grad_norm": 9.211454391479492, - "learning_rate": 2.5961984237366715e-06, - "loss": 1.1646, + "epoch": 0.10419573913852452, + "grad_norm": 0.013341420330107212, + "learning_rate": 1.9998978742792098e-05, + "loss": 0.636, "step": 280 }, { - "epoch": 0.01321279554937413, - "grad_norm": 5.8738322257995605, - "learning_rate": 2.6425591098748268e-06, - "loss": 1.0619, + "epoch": 0.10605637733742673, + "grad_norm": 0.012272336520254612, + "learning_rate": 1.999783936462566e-05, + "loss": 0.6182, "step": 285 }, { - "epoch": 0.013444598980064905, - "grad_norm": 5.047887325286865, - "learning_rate": 2.6889197960129816e-06, - "loss": 0.8132, + "epoch": 0.10791701553632896, + "grad_norm": 0.012329615652561188, + "learning_rate": 1.999627806483107e-05, + "loss": 0.601, "step": 290 }, { - "epoch": 0.013676402410755678, - "grad_norm": 6.454254150390625, - "learning_rate": 2.7352804821511357e-06, - "loss": 1.0644, + "epoch": 0.10977765373523118, + "grad_norm": 0.01313008088618517, + "learning_rate": 1.999429490929718e-05, + "loss": 0.7002, "step": 295 }, { - "epoch": 0.013908205841446454, - "grad_norm": 12.117624282836914, - "learning_rate": 2.7816411682892906e-06, - "loss": 1.1196, + "epoch": 0.11163829193413341, + "grad_norm": 0.01170238945633173, + "learning_rate": 1.9991889981715696e-05, + "loss": 0.6784, "step": 300 }, { - "epoch": 0.013908205841446454, - "eval_loss": 1.1029967069625854, - "eval_runtime": 11.2775, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, + "epoch": 0.11163829193413341, + "eval_loss": 0.676296055316925, + "eval_runtime": 26.7211, + "eval_samples_per_second": 4.753, + "eval_steps_per_second": 4.753, "step": 300 }, { - "epoch": 0.014140009272137228, - "grad_norm": 4.559245586395264, - "learning_rate": 2.828001854427446e-06, - "loss": 1.0169, + "epoch": 0.11349893013303564, + "grad_norm": 0.012631393037736416, + "learning_rate": 1.9989063383577644e-05, + "loss": 0.6332, "step": 305 }, { - "epoch": 0.014371812702828002, - "grad_norm": 6.905066013336182, - "learning_rate": 2.8743625405656008e-06, - "loss": 1.1846, + "epoch": 0.11535956833193785, + "grad_norm": 0.012392008677124977, + "learning_rate": 1.998581523416908e-05, + "loss": 0.713, "step": 310 }, { - "epoch": 0.014603616133518776, - "grad_norm": 5.575958251953125, - "learning_rate": 2.9207232267037557e-06, - "loss": 1.0439, + "epoch": 0.11722020653084007, + "grad_norm": 0.012342042289674282, + "learning_rate": 1.998214567056607e-05, + "loss": 0.6072, "step": 315 }, { - "epoch": 0.01483541956420955, - "grad_norm": 7.027660369873047, - "learning_rate": 2.9670839128419105e-06, - "loss": 1.2515, + "epoch": 0.1190808447297423, + "grad_norm": 0.011801215820014477, + "learning_rate": 1.9978054847628908e-05, + "loss": 0.6553, "step": 320 }, { - "epoch": 0.015067222994900325, - "grad_norm": 5.862566947937012, - "learning_rate": 3.0134445989800654e-06, - "loss": 1.067, + "epoch": 0.12094148292864453, + "grad_norm": 0.013543626293540001, + "learning_rate": 1.997354293799555e-05, + "loss": 0.638, "step": 325 }, { - "epoch": 0.015299026425591099, - "grad_norm": 5.865589618682861, - "learning_rate": 3.05980528511822e-06, - "loss": 1.1834, + "epoch": 0.12280212112754675, + "grad_norm": 0.012367943301796913, + "learning_rate": 1.9968610132074372e-05, + "loss": 0.6867, "step": 330 }, { - "epoch": 0.015530829856281873, - "grad_norm": 6.330904960632324, - "learning_rate": 3.1061659712563748e-06, - "loss": 1.2675, + "epoch": 0.12466275932644898, + "grad_norm": 0.01382706593722105, + "learning_rate": 1.99632566380361e-05, + "loss": 0.67, "step": 335 }, { - "epoch": 0.01576263328697265, - "grad_norm": 6.37352180480957, - "learning_rate": 3.1525266573945297e-06, - "loss": 1.0757, + "epoch": 0.1265233975253512, + "grad_norm": 0.01336714904755354, + "learning_rate": 1.9957482681805036e-05, + "loss": 0.6615, "step": 340 }, { - "epoch": 0.015994436717663423, - "grad_norm": 6.554245948791504, - "learning_rate": 3.1988873435326845e-06, - "loss": 1.0759, + "epoch": 0.12838403572425341, + "grad_norm": 0.012786686420440674, + "learning_rate": 1.9951288507049532e-05, + "loss": 0.6343, "step": 345 }, { - "epoch": 0.016226240148354196, - "grad_norm": 5.791826248168945, - "learning_rate": 3.2452480296708394e-06, - "loss": 1.0212, + "epoch": 0.13024467392315564, + "grad_norm": 0.014750728383660316, + "learning_rate": 1.9944674375171697e-05, + "loss": 0.6478, "step": 350 }, { - "epoch": 0.01645804357904497, - "grad_norm": 5.290635108947754, - "learning_rate": 3.2916087158089943e-06, - "loss": 0.9677, + "epoch": 0.13210531212205787, + "grad_norm": 0.01330367662012577, + "learning_rate": 1.9937640565296372e-05, + "loss": 0.6844, "step": 355 }, { - "epoch": 0.016689847009735744, - "grad_norm": 6.6352739334106445, - "learning_rate": 3.337969401947149e-06, - "loss": 1.1298, + "epoch": 0.1339659503209601, + "grad_norm": 0.011332959868013859, + "learning_rate": 1.9930187374259338e-05, + "loss": 0.6188, "step": 360 }, { - "epoch": 0.016921650440426518, - "grad_norm": 5.054739952087402, - "learning_rate": 3.384330088085304e-06, - "loss": 1.1997, + "epoch": 0.13582658851986232, + "grad_norm": 0.013507647439837456, + "learning_rate": 1.992231511659481e-05, + "loss": 0.6844, "step": 365 }, { - "epoch": 0.017153453871117292, - "grad_norm": 6.105016231536865, - "learning_rate": 3.4306907742234585e-06, - "loss": 1.0421, + "epoch": 0.13768722671876454, + "grad_norm": 0.01494019664824009, + "learning_rate": 1.991402412452214e-05, + "loss": 0.6616, "step": 370 }, { - "epoch": 0.017385257301808066, - "grad_norm": 7.15664529800415, - "learning_rate": 3.4770514603616134e-06, - "loss": 1.0932, + "epoch": 0.13954786491766677, + "grad_norm": 0.012762832455337048, + "learning_rate": 1.9905314747931816e-05, + "loss": 0.6797, "step": 375 }, { - "epoch": 0.01761706073249884, - "grad_norm": 5.2151103019714355, - "learning_rate": 3.5234121464997683e-06, - "loss": 1.0914, + "epoch": 0.141408503116569, + "grad_norm": 0.014913683757185936, + "learning_rate": 1.989618735437069e-05, + "loss": 0.6268, "step": 380 }, { - "epoch": 0.017848864163189614, - "grad_norm": 4.892900466918945, - "learning_rate": 3.569772832637923e-06, - "loss": 0.8586, + "epoch": 0.1432691413154712, + "grad_norm": 0.012221734039485455, + "learning_rate": 1.9886642329026457e-05, + "loss": 0.6587, "step": 385 }, { - "epoch": 0.01808066759388039, - "grad_norm": 5.810879230499268, - "learning_rate": 3.616133518776078e-06, - "loss": 1.009, + "epoch": 0.14512977951437342, + "grad_norm": 0.01553855836391449, + "learning_rate": 1.9876680074711417e-05, + "loss": 0.6403, "step": 390 }, { - "epoch": 0.018312471024571165, - "grad_norm": 5.979573726654053, - "learning_rate": 3.662494204914233e-06, - "loss": 1.0977, + "epoch": 0.14699041771327565, + "grad_norm": 0.013682518154382706, + "learning_rate": 1.986630101184546e-05, + "loss": 0.6287, "step": 395 }, { - "epoch": 0.01854427445526194, - "grad_norm": 6.757267951965332, - "learning_rate": 3.708854891052388e-06, - "loss": 0.9805, + "epoch": 0.14885105591217787, + "grad_norm": 0.013481502421200275, + "learning_rate": 1.9855505578438343e-05, + "loss": 0.6757, "step": 400 }, { - "epoch": 0.01854427445526194, - "eval_loss": 1.0770819187164307, - "eval_runtime": 11.2853, - "eval_samples_per_second": 11.254, - "eval_steps_per_second": 11.254, + "epoch": 0.14885105591217787, + "eval_loss": 0.6647208333015442, + "eval_runtime": 27.129, + "eval_samples_per_second": 4.681, + "eval_steps_per_second": 4.681, "step": 400 }, { - "epoch": 0.018776077885952713, - "grad_norm": 7.137505531311035, - "learning_rate": 3.755215577190543e-06, - "loss": 1.0624, + "epoch": 0.1507116941110801, + "grad_norm": 0.01228385604918003, + "learning_rate": 1.984429423007117e-05, + "loss": 0.6277, "step": 405 }, { - "epoch": 0.019007881316643487, - "grad_norm": 6.788882255554199, - "learning_rate": 3.801576263328697e-06, - "loss": 1.1705, + "epoch": 0.15257233230998232, + "grad_norm": 0.014119746163487434, + "learning_rate": 1.9832667439877217e-05, + "loss": 0.615, "step": 410 }, { - "epoch": 0.01923968474733426, - "grad_norm": 5.606616020202637, - "learning_rate": 3.8479369494668525e-06, - "loss": 1.1273, + "epoch": 0.15443297050888455, + "grad_norm": 0.014395875856280327, + "learning_rate": 1.9820625698521918e-05, + "loss": 0.6417, "step": 415 }, { - "epoch": 0.019471488178025034, - "grad_norm": 6.507837772369385, - "learning_rate": 3.894297635605007e-06, - "loss": 1.1154, + "epoch": 0.15629360870778677, + "grad_norm": 0.013175971806049347, + "learning_rate": 1.9808169514182182e-05, + "loss": 0.6509, "step": 420 }, { - "epoch": 0.019703291608715808, - "grad_norm": 6.080861568450928, - "learning_rate": 3.940658321743162e-06, - "loss": 1.1084, + "epoch": 0.158154246906689, + "grad_norm": 0.015295376069843769, + "learning_rate": 1.9795299412524948e-05, + "loss": 0.6275, "step": 425 }, { - "epoch": 0.019935095039406582, - "grad_norm": 5.980709075927734, - "learning_rate": 3.987019007881317e-06, - "loss": 1.1777, + "epoch": 0.16001488510559123, + "grad_norm": 0.014611025340855122, + "learning_rate": 1.9782015936684987e-05, + "loss": 0.6627, "step": 430 }, { - "epoch": 0.020166898470097356, - "grad_norm": 5.779782772064209, - "learning_rate": 4.033379694019472e-06, - "loss": 1.0581, + "epoch": 0.16187552330449345, + "grad_norm": 0.01412207167595625, + "learning_rate": 1.9768319647242e-05, + "loss": 0.6362, "step": 435 }, { - "epoch": 0.020398701900788133, - "grad_norm": 6.758626461029053, - "learning_rate": 4.0797403801576265e-06, - "loss": 1.1615, + "epoch": 0.16373616150339568, + "grad_norm": 0.012070410884916782, + "learning_rate": 1.9754211122196945e-05, + "loss": 0.6429, "step": 440 }, { - "epoch": 0.020630505331478907, - "grad_norm": 5.383087635040283, - "learning_rate": 4.126101066295782e-06, - "loss": 0.9877, + "epoch": 0.16559679970229788, + "grad_norm": 0.013232079334557056, + "learning_rate": 1.9739690956947652e-05, + "loss": 0.6941, "step": 445 }, { - "epoch": 0.02086230876216968, - "grad_norm": 5.77476692199707, - "learning_rate": 4.172461752433936e-06, - "loss": 1.0031, + "epoch": 0.1674574379012001, + "grad_norm": 0.012606708332896233, + "learning_rate": 1.972475976426369e-05, + "loss": 0.6554, "step": 450 }, { - "epoch": 0.021094112192860455, - "grad_norm": 5.7638726234436035, - "learning_rate": 4.218822438572091e-06, - "loss": 0.9153, + "epoch": 0.16931807610010233, + "grad_norm": 0.012638423591852188, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.645, "step": 455 }, { - "epoch": 0.02132591562355123, - "grad_norm": 6.627535820007324, - "learning_rate": 4.265183124710246e-06, - "loss": 1.1059, + "epoch": 0.17117871429900455, + "grad_norm": 0.013119902461767197, + "learning_rate": 1.9693666834372896e-05, + "loss": 0.6128, "step": 460 }, { - "epoch": 0.021557719054242003, - "grad_norm": 6.557712554931641, - "learning_rate": 4.3115438108484005e-06, - "loss": 1.2056, + "epoch": 0.17303935249790678, + "grad_norm": 0.011363113299012184, + "learning_rate": 1.9677506409327532e-05, + "loss": 0.6294, "step": 465 }, { - "epoch": 0.021789522484932777, - "grad_norm": 5.442972660064697, - "learning_rate": 4.357904496986556e-06, - "loss": 0.9902, + "epoch": 0.174899990696809, + "grad_norm": 0.014238959178328514, + "learning_rate": 1.9660937581115073e-05, + "loss": 0.6647, "step": 470 }, { - "epoch": 0.02202132591562355, - "grad_norm": 6.064192771911621, - "learning_rate": 4.404265183124711e-06, - "loss": 1.3211, + "epoch": 0.17676062889571123, + "grad_norm": 0.013214629143476486, + "learning_rate": 1.9643961048961283e-05, + "loss": 0.6037, "step": 475 }, { - "epoch": 0.022253129346314324, - "grad_norm": 11.156740188598633, - "learning_rate": 4.450625869262866e-06, - "loss": 1.091, + "epoch": 0.17862126709461346, + "grad_norm": 0.012312485836446285, + "learning_rate": 1.9626577529297573e-05, + "loss": 0.6703, "step": 480 }, { - "epoch": 0.0224849327770051, - "grad_norm": 7.064011096954346, - "learning_rate": 4.49698655540102e-06, - "loss": 1.0799, + "epoch": 0.18048190529351568, + "grad_norm": 0.012318914756178856, + "learning_rate": 1.9608787755730746e-05, + "loss": 0.6141, "step": 485 }, { - "epoch": 0.022716736207695872, - "grad_norm": 6.225513458251953, - "learning_rate": 4.543347241539175e-06, - "loss": 1.1523, + "epoch": 0.1823425434924179, + "grad_norm": 0.01374764647334814, + "learning_rate": 1.9590592479012022e-05, + "loss": 0.673, "step": 490 }, { - "epoch": 0.02294853963838665, - "grad_norm": 9.263200759887695, - "learning_rate": 4.58970792767733e-06, - "loss": 1.1255, + "epoch": 0.18420318169132013, + "grad_norm": 0.012634415179491043, + "learning_rate": 1.9571992467005395e-05, + "loss": 0.6135, "step": 495 }, { - "epoch": 0.023180343069077423, - "grad_norm": 5.4971699714660645, - "learning_rate": 4.636068613815485e-06, - "loss": 1.0001, + "epoch": 0.18606381989022236, + "grad_norm": 0.012813772074878216, + "learning_rate": 1.9552988504655194e-05, + "loss": 0.6648, "step": 500 }, { - "epoch": 0.023180343069077423, - "eval_loss": 1.0599620342254639, - "eval_runtime": 11.2721, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, + "epoch": 0.18606381989022236, + "eval_loss": 0.6582558751106262, + "eval_runtime": 26.9621, + "eval_samples_per_second": 4.71, + "eval_steps_per_second": 4.71, "step": 500 }, { - "epoch": 0.023412146499768197, - "grad_norm": 23.34360694885254, - "learning_rate": 4.68242929995364e-06, - "loss": 0.8703, + "epoch": 0.18792445808912456, + "grad_norm": 0.012235240079462528, + "learning_rate": 1.9533581393952978e-05, + "loss": 0.6108, "step": 505 }, { - "epoch": 0.02364394993045897, - "grad_norm": 6.617534637451172, - "learning_rate": 4.728789986091795e-06, - "loss": 1.0301, + "epoch": 0.18978509628802678, + "grad_norm": 0.012589952908456326, + "learning_rate": 1.951377195390367e-05, + "loss": 0.6218, "step": 510 }, { - "epoch": 0.023875753361149745, - "grad_norm": 6.7085771560668945, - "learning_rate": 4.775150672229949e-06, - "loss": 0.9515, + "epoch": 0.191645734486929, + "grad_norm": 0.012783786281943321, + "learning_rate": 1.9493561020491024e-05, + "loss": 0.6668, "step": 515 }, { - "epoch": 0.02410755679184052, - "grad_norm": 7.118703365325928, - "learning_rate": 4.821511358368105e-06, - "loss": 1.1126, + "epoch": 0.19350637268583123, + "grad_norm": 0.013828632421791553, + "learning_rate": 1.9472949446642318e-05, + "loss": 0.6081, "step": 520 }, { - "epoch": 0.024339360222531293, - "grad_norm": 5.29957914352417, - "learning_rate": 4.867872044506259e-06, - "loss": 0.9504, + "epoch": 0.19536701088473346, + "grad_norm": 0.011929171159863472, + "learning_rate": 1.945193810219237e-05, + "loss": 0.6329, "step": 525 }, { - "epoch": 0.024571163653222067, - "grad_norm": 5.3429436683654785, - "learning_rate": 4.914232730644414e-06, - "loss": 0.9477, + "epoch": 0.1972276490836357, + "grad_norm": 0.014584473334252834, + "learning_rate": 1.9430527873846826e-05, + "loss": 0.7017, "step": 530 }, { - "epoch": 0.02480296708391284, - "grad_norm": 5.861160755157471, - "learning_rate": 4.960593416782569e-06, - "loss": 0.9679, + "epoch": 0.1990882872825379, + "grad_norm": 0.01474926806986332, + "learning_rate": 1.9408719665144756e-05, + "loss": 0.632, "step": 535 }, { - "epoch": 0.025034770514603615, - "grad_norm": 5.540627956390381, - "learning_rate": 5.006954102920723e-06, - "loss": 1.049, + "epoch": 0.20094892548144014, + "grad_norm": 0.015552829019725323, + "learning_rate": 1.9386514396420503e-05, + "loss": 0.6757, "step": 540 }, { - "epoch": 0.025266573945294392, - "grad_norm": 5.6925950050354, - "learning_rate": 5.053314789058879e-06, - "loss": 1.08, + "epoch": 0.20280956368034236, + "grad_norm": 0.013232480734586716, + "learning_rate": 1.9363913004764847e-05, + "loss": 0.6722, "step": 545 }, { - "epoch": 0.025498377375985166, - "grad_norm": 5.670673847198486, - "learning_rate": 5.099675475197033e-06, - "loss": 0.993, + "epoch": 0.2046702018792446, + "grad_norm": 0.012858827598392963, + "learning_rate": 1.9340916443985465e-05, + "loss": 0.6231, "step": 550 }, { - "epoch": 0.02573018080667594, - "grad_norm": 6.216073513031006, - "learning_rate": 5.1460361613351884e-06, - "loss": 1.18, + "epoch": 0.20653084007814682, + "grad_norm": 0.012365566566586494, + "learning_rate": 1.9317525684566686e-05, + "loss": 0.5986, "step": 555 }, { - "epoch": 0.025961984237366714, - "grad_norm": 6.3086113929748535, - "learning_rate": 5.192396847473343e-06, - "loss": 0.9918, + "epoch": 0.20839147827704904, + "grad_norm": 0.01528852991759777, + "learning_rate": 1.9293741713628518e-05, + "loss": 0.6537, "step": 560 }, { - "epoch": 0.026193787668057487, - "grad_norm": 7.024346828460693, - "learning_rate": 5.238757533611498e-06, - "loss": 1.0553, + "epoch": 0.21025211647595124, + "grad_norm": 0.014512522146105766, + "learning_rate": 1.9269565534885003e-05, + "loss": 0.6527, "step": 565 }, { - "epoch": 0.02642559109874826, - "grad_norm": 4.924084186553955, - "learning_rate": 5.2851182197496535e-06, - "loss": 0.9521, + "epoch": 0.21211275467485347, + "grad_norm": 0.013798325322568417, + "learning_rate": 1.9244998168601848e-05, + "loss": 0.6148, "step": 570 }, { - "epoch": 0.026657394529439035, - "grad_norm": 5.455867290496826, - "learning_rate": 5.331478905887808e-06, - "loss": 0.875, + "epoch": 0.2139733928737557, + "grad_norm": 0.013186248019337654, + "learning_rate": 1.9220040651553388e-05, + "loss": 0.6106, "step": 575 }, { - "epoch": 0.02688919796012981, - "grad_norm": 4.741636276245117, - "learning_rate": 5.377839592025963e-06, - "loss": 0.9555, + "epoch": 0.21583403107265792, + "grad_norm": 0.013986771926283836, + "learning_rate": 1.9194694036978807e-05, + "loss": 0.6654, "step": 580 }, { - "epoch": 0.027121001390820583, - "grad_norm": 6.165698528289795, - "learning_rate": 5.424200278164117e-06, - "loss": 1.0817, + "epoch": 0.21769466927156014, + "grad_norm": 0.016201818361878395, + "learning_rate": 1.9168959394537708e-05, + "loss": 0.6306, "step": 585 }, { - "epoch": 0.027352804821511357, - "grad_norm": 5.713921070098877, - "learning_rate": 5.470560964302271e-06, - "loss": 0.9599, + "epoch": 0.21955530747046237, + "grad_norm": 0.013889294117689133, + "learning_rate": 1.9142837810264972e-05, + "loss": 0.6749, "step": 590 }, { - "epoch": 0.027584608252202134, - "grad_norm": 4.89811372756958, - "learning_rate": 5.516921650440427e-06, - "loss": 1.0632, + "epoch": 0.2214159456693646, + "grad_norm": 0.013025142252445221, + "learning_rate": 1.911633038652491e-05, + "loss": 0.6075, "step": 595 }, { - "epoch": 0.027816411682892908, - "grad_norm": 6.775177478790283, - "learning_rate": 5.563282336578581e-06, - "loss": 0.9467, + "epoch": 0.22327658386826682, + "grad_norm": 0.013716059736907482, + "learning_rate": 1.9089438241964764e-05, + "loss": 0.6516, "step": 600 }, { - "epoch": 0.027816411682892908, - "eval_loss": 1.0482524633407593, - "eval_runtime": 11.1015, - "eval_samples_per_second": 11.44, - "eval_steps_per_second": 11.44, + "epoch": 0.22327658386826682, + "eval_loss": 0.6547934412956238, + "eval_runtime": 26.7928, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 4.74, "step": 600 }, { - "epoch": 0.028048215113583682, - "grad_norm": 5.133644104003906, - "learning_rate": 5.6096430227167365e-06, - "loss": 1.0033, + "epoch": 0.22513722206716905, + "grad_norm": 0.016251811757683754, + "learning_rate": 1.906216251146748e-05, + "loss": 0.6265, "step": 605 }, { - "epoch": 0.028280018544274456, - "grad_norm": 6.307579040527344, - "learning_rate": 5.656003708854892e-06, - "loss": 1.0006, + "epoch": 0.22699786026607127, + "grad_norm": 0.013359563425183296, + "learning_rate": 1.9034504346103825e-05, + "loss": 0.6052, "step": 610 }, { - "epoch": 0.02851182197496523, - "grad_norm": 6.110098838806152, - "learning_rate": 5.702364394993046e-06, - "loss": 1.1813, + "epoch": 0.2288584984649735, + "grad_norm": 0.012794552370905876, + "learning_rate": 1.9006464913083807e-05, + "loss": 0.613, "step": 615 }, { - "epoch": 0.028743625405656004, - "grad_norm": 7.244579315185547, - "learning_rate": 5.7487250811312015e-06, - "loss": 1.1448, + "epoch": 0.2307191366638757, + "grad_norm": 0.012533072382211685, + "learning_rate": 1.897804539570742e-05, + "loss": 0.6735, "step": 620 }, { - "epoch": 0.028975428836346778, - "grad_norm": 5.6296305656433105, - "learning_rate": 5.795085767269356e-06, - "loss": 1.0722, + "epoch": 0.23257977486277792, + "grad_norm": 0.013286220841109753, + "learning_rate": 1.8949246993314694e-05, + "loss": 0.6692, "step": 625 }, { - "epoch": 0.02920723226703755, - "grad_norm": 5.689414978027344, - "learning_rate": 5.841446453407511e-06, - "loss": 1.0578, + "epoch": 0.23444041306168015, + "grad_norm": 0.013466808013617992, + "learning_rate": 1.892007092123511e-05, + "loss": 0.6513, "step": 630 }, { - "epoch": 0.029439035697728325, - "grad_norm": 7.251938343048096, - "learning_rate": 5.887807139545666e-06, - "loss": 0.9588, + "epoch": 0.23630105126058237, + "grad_norm": 0.012991335242986679, + "learning_rate": 1.8890518410736275e-05, + "loss": 0.6405, "step": 635 }, { - "epoch": 0.0296708391284191, - "grad_norm": 6.593122482299805, - "learning_rate": 5.934167825683821e-06, - "loss": 1.1287, + "epoch": 0.2381616894594846, + "grad_norm": 0.013223089277744293, + "learning_rate": 1.8860590708971997e-05, + "loss": 0.6488, "step": 640 }, { - "epoch": 0.029902642559109873, - "grad_norm": 5.741578102111816, - "learning_rate": 5.9805285118219755e-06, - "loss": 0.9135, + "epoch": 0.24002232765838682, + "grad_norm": 0.012394067831337452, + "learning_rate": 1.8830289078929618e-05, + "loss": 0.6131, "step": 645 }, { - "epoch": 0.03013444598980065, - "grad_norm": 7.281170845031738, - "learning_rate": 6.026889197960131e-06, - "loss": 0.9585, + "epoch": 0.24188296585728905, + "grad_norm": 0.013721502386033535, + "learning_rate": 1.8799614799376743e-05, + "loss": 0.681, "step": 650 }, { - "epoch": 0.030366249420491424, - "grad_norm": 5.263773441314697, - "learning_rate": 6.073249884098285e-06, - "loss": 1.0363, + "epoch": 0.24374360405619128, + "grad_norm": 0.012726777233183384, + "learning_rate": 1.8768569164807272e-05, + "loss": 0.6837, "step": 655 }, { - "epoch": 0.030598052851182198, - "grad_norm": 6.059519290924072, - "learning_rate": 6.11961057023644e-06, - "loss": 0.9297, + "epoch": 0.2456042422550935, + "grad_norm": 0.013720668852329254, + "learning_rate": 1.8737153485386737e-05, + "loss": 0.6007, "step": 660 }, { - "epoch": 0.030829856281872972, - "grad_norm": 5.903897762298584, - "learning_rate": 6.165971256374594e-06, - "loss": 0.9503, + "epoch": 0.24746488045399573, + "grad_norm": 0.012646087445318699, + "learning_rate": 1.8705369086897063e-05, + "loss": 0.6545, "step": 665 }, { - "epoch": 0.031061659712563746, - "grad_norm": 5.2086944580078125, - "learning_rate": 6.2123319425127495e-06, - "loss": 0.9917, + "epoch": 0.24932551865289795, + "grad_norm": 0.013658811338245869, + "learning_rate": 1.8673217310680578e-05, + "loss": 0.6379, "step": 670 }, { - "epoch": 0.03129346314325452, - "grad_norm": 4.655348777770996, - "learning_rate": 6.258692628650904e-06, - "loss": 0.9246, + "epoch": 0.2511861568518002, + "grad_norm": 0.012248256243765354, + "learning_rate": 1.864069951358342e-05, + "loss": 0.6356, "step": 675 }, { - "epoch": 0.0315252665739453, - "grad_norm": 6.055633544921875, - "learning_rate": 6.305053314789059e-06, - "loss": 0.9003, + "epoch": 0.2530467950507024, + "grad_norm": 0.0133894681930542, + "learning_rate": 1.860781706789829e-05, + "loss": 0.6737, "step": 680 }, { - "epoch": 0.03175707000463607, - "grad_norm": 5.528895378112793, - "learning_rate": 6.351414000927214e-06, - "loss": 1.1059, + "epoch": 0.25490743324960463, + "grad_norm": 0.01430124044418335, + "learning_rate": 1.857457136130651e-05, + "loss": 0.6169, "step": 685 }, { - "epoch": 0.031988873435326845, - "grad_norm": 5.239342212677002, - "learning_rate": 6.397774687065369e-06, - "loss": 1.0502, + "epoch": 0.25676807144850683, + "grad_norm": 0.013437042012810707, + "learning_rate": 1.854096379681949e-05, + "loss": 0.6021, "step": 690 }, { - "epoch": 0.032220676866017615, - "grad_norm": 6.7139763832092285, - "learning_rate": 6.4441353732035235e-06, - "loss": 1.0828, + "epoch": 0.2586287096474091, + "grad_norm": 0.011497768573462963, + "learning_rate": 1.8506995792719498e-05, + "loss": 0.6119, "step": 695 }, { - "epoch": 0.03245248029670839, - "grad_norm": 5.635594844818115, - "learning_rate": 6.490496059341679e-06, - "loss": 1.0016, + "epoch": 0.2604893478463113, + "grad_norm": 0.01319235097616911, + "learning_rate": 1.8472668782499817e-05, + "loss": 0.627, "step": 700 }, { - "epoch": 0.03245248029670839, - "eval_loss": 1.0429729223251343, - "eval_runtime": 11.2781, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, + "epoch": 0.2604893478463113, + "eval_loss": 0.6522720456123352, + "eval_runtime": 26.7201, + "eval_samples_per_second": 4.753, + "eval_steps_per_second": 4.753, "step": 700 }, { - "epoch": 0.03268428372739916, - "grad_norm": 5.803088665008545, - "learning_rate": 6.536856745479834e-06, - "loss": 1.1353, + "epoch": 0.26234998604521353, + "grad_norm": 0.014044429175555706, + "learning_rate": 1.843798421480426e-05, + "loss": 0.6244, "step": 705 }, { - "epoch": 0.03291608715808994, - "grad_norm": 7.0808305740356445, - "learning_rate": 6.583217431617989e-06, - "loss": 0.814, + "epoch": 0.26421062424411573, + "grad_norm": 0.011311609297990799, + "learning_rate": 1.8402943553365998e-05, + "loss": 0.5975, "step": 710 }, { - "epoch": 0.03314789058878071, - "grad_norm": 5.607944011688232, - "learning_rate": 6.629578117756144e-06, - "loss": 1.0398, + "epoch": 0.26607126244301793, + "grad_norm": 0.012355692684650421, + "learning_rate": 1.8367548276945846e-05, + "loss": 0.6009, "step": 715 }, { - "epoch": 0.03337969401947149, - "grad_norm": 4.771095275878906, - "learning_rate": 6.675938803894298e-06, - "loss": 0.975, + "epoch": 0.2679319006419202, + "grad_norm": 0.012264437042176723, + "learning_rate": 1.83317998792698e-05, + "loss": 0.6103, "step": 720 }, { - "epoch": 0.033611497450162266, - "grad_norm": 6.333477020263672, - "learning_rate": 6.722299490032454e-06, - "loss": 0.8296, + "epoch": 0.2697925388408224, + "grad_norm": 0.012037084437906742, + "learning_rate": 1.8295699868966038e-05, + "loss": 0.5602, "step": 725 }, { - "epoch": 0.033843300880853036, - "grad_norm": 5.489449501037598, - "learning_rate": 6.768660176170608e-06, - "loss": 1.0049, + "epoch": 0.27165317703972464, + "grad_norm": 0.012773050926625729, + "learning_rate": 1.8259249769501237e-05, + "loss": 0.6215, "step": 730 }, { - "epoch": 0.034075104311543813, - "grad_norm": 6.153010845184326, - "learning_rate": 6.8150208623087635e-06, - "loss": 0.921, + "epoch": 0.27351381523862683, + "grad_norm": 0.012415550649166107, + "learning_rate": 1.8222451119116288e-05, + "loss": 0.6364, "step": 735 }, { - "epoch": 0.034306907742234584, - "grad_norm": 5.430294036865234, - "learning_rate": 6.861381548446917e-06, - "loss": 0.9989, + "epoch": 0.2753744534375291, + "grad_norm": 0.01302468404173851, + "learning_rate": 1.8185305470761366e-05, + "loss": 0.5994, "step": 740 }, { - "epoch": 0.03453871117292536, - "grad_norm": 5.40978479385376, - "learning_rate": 6.907742234585072e-06, - "loss": 0.9716, + "epoch": 0.2772350916364313, + "grad_norm": 0.013447316363453865, + "learning_rate": 1.814781439203043e-05, + "loss": 0.6458, "step": 745 }, { - "epoch": 0.03477051460361613, - "grad_norm": 7.3993754386901855, - "learning_rate": 6.954102920723227e-06, - "loss": 0.9539, + "epoch": 0.27909572983533354, + "grad_norm": 0.012098093517124653, + "learning_rate": 1.8109979465095014e-05, + "loss": 0.6357, "step": 750 }, { - "epoch": 0.03500231803430691, - "grad_norm": 6.530559539794922, - "learning_rate": 7.000463606861382e-06, - "loss": 0.8866, + "epoch": 0.28095636803423574, + "grad_norm": 0.012987499125301838, + "learning_rate": 1.8071802286637505e-05, + "loss": 0.6248, "step": 755 }, { - "epoch": 0.03523412146499768, - "grad_norm": 5.703915596008301, - "learning_rate": 7.046824292999537e-06, - "loss": 0.9307, + "epoch": 0.282817006233138, + "grad_norm": 0.011747024022042751, + "learning_rate": 1.8033284467783742e-05, + "loss": 0.6202, "step": 760 }, { - "epoch": 0.03546592489568846, - "grad_norm": 5.743514060974121, - "learning_rate": 7.093184979137692e-06, - "loss": 0.9864, + "epoch": 0.2846776444320402, + "grad_norm": 0.01332057174295187, + "learning_rate": 1.7994427634035016e-05, + "loss": 0.6347, "step": 765 }, { - "epoch": 0.03569772832637923, - "grad_norm": 5.572900295257568, - "learning_rate": 7.139545665275846e-06, - "loss": 0.9106, + "epoch": 0.2865382826309424, + "grad_norm": 0.013383504003286362, + "learning_rate": 1.795523342519948e-05, + "loss": 0.6001, "step": 770 }, { - "epoch": 0.035929531757070005, - "grad_norm": 6.122991561889648, - "learning_rate": 7.185906351414002e-06, - "loss": 1.051, + "epoch": 0.28839892082984464, + "grad_norm": 0.013648821040987968, + "learning_rate": 1.7915703495322967e-05, + "loss": 0.6399, "step": 775 }, { - "epoch": 0.03616133518776078, - "grad_norm": 6.111961364746094, - "learning_rate": 7.232267037552156e-06, - "loss": 1.1246, + "epoch": 0.29025955902874684, + "grad_norm": 0.012947522103786469, + "learning_rate": 1.7875839512619148e-05, + "loss": 0.6298, "step": 780 }, { - "epoch": 0.03639313861845155, - "grad_norm": 6.494778633117676, - "learning_rate": 7.2786277236903115e-06, - "loss": 1.057, + "epoch": 0.2921201972276491, + "grad_norm": 0.01334394421428442, + "learning_rate": 1.7835643159399156e-05, + "loss": 0.6418, "step": 785 }, { - "epoch": 0.03662494204914233, - "grad_norm": 5.405693054199219, - "learning_rate": 7.324988409828466e-06, - "loss": 1.0847, + "epoch": 0.2939808354265513, + "grad_norm": 0.014045110903680325, + "learning_rate": 1.7795116132000587e-05, + "loss": 0.6403, "step": 790 }, { - "epoch": 0.0368567454798331, - "grad_norm": 5.207508087158203, - "learning_rate": 7.371349095966621e-06, - "loss": 1.0974, + "epoch": 0.29584147362545354, + "grad_norm": 0.015219368040561676, + "learning_rate": 1.7754260140715918e-05, + "loss": 0.6277, "step": 795 }, { - "epoch": 0.03708854891052388, - "grad_norm": 6.345035076141357, - "learning_rate": 7.417709782104776e-06, - "loss": 1.0643, + "epoch": 0.29770211182435574, + "grad_norm": 0.01307649165391922, + "learning_rate": 1.771307690972031e-05, + "loss": 0.6271, "step": 800 }, { - "epoch": 0.03708854891052388, - "eval_loss": 1.040871024131775, - "eval_runtime": 11.2912, - "eval_samples_per_second": 11.248, - "eval_steps_per_second": 11.248, + "epoch": 0.29770211182435574, + "eval_loss": 0.6502260565757751, + "eval_runtime": 26.7042, + "eval_samples_per_second": 4.756, + "eval_steps_per_second": 4.756, "step": 800 }, { - "epoch": 0.03732035234121465, - "grad_norm": 5.8186492919921875, - "learning_rate": 7.464070468242931e-06, - "loss": 1.0505, + "epoch": 0.299562750023258, + "grad_norm": 0.013835963793098927, + "learning_rate": 1.7671568176998865e-05, + "loss": 0.6286, "step": 805 }, { - "epoch": 0.037552155771905425, - "grad_norm": 6.559715747833252, - "learning_rate": 7.510431154381086e-06, - "loss": 0.9892, + "epoch": 0.3014233882221602, + "grad_norm": 0.013574733398854733, + "learning_rate": 1.762973569427328e-05, + "loss": 0.6462, "step": 810 }, { - "epoch": 0.037783959202596196, - "grad_norm": 5.379362106323242, - "learning_rate": 7.55679184051924e-06, - "loss": 1.1113, + "epoch": 0.30328402642106245, + "grad_norm": 0.01131366565823555, + "learning_rate": 1.758758122692791e-05, + "loss": 0.6167, "step": 815 }, { - "epoch": 0.03801576263328697, - "grad_norm": 5.991211414337158, - "learning_rate": 7.603152526657394e-06, - "loss": 1.0382, + "epoch": 0.30514466461996465, + "grad_norm": 0.013482702895998955, + "learning_rate": 1.7545106553935277e-05, + "loss": 0.6413, "step": 820 }, { - "epoch": 0.03824756606397774, - "grad_norm": 10.996150016784668, - "learning_rate": 7.64951321279555e-06, - "loss": 0.963, + "epoch": 0.3070053028188669, + "grad_norm": 0.012861824594438076, + "learning_rate": 1.7502313467780988e-05, + "loss": 0.6027, "step": 825 }, { - "epoch": 0.03847936949466852, - "grad_norm": 6.428567409515381, - "learning_rate": 7.695873898933705e-06, - "loss": 1.0694, + "epoch": 0.3088659410177691, + "grad_norm": 0.012556380592286587, + "learning_rate": 1.7459203774388097e-05, + "loss": 0.6603, "step": 830 }, { - "epoch": 0.0387111729253593, - "grad_norm": 6.335524082183838, - "learning_rate": 7.742234585071859e-06, - "loss": 1.15, + "epoch": 0.3107265792166713, + "grad_norm": 0.012278062291443348, + "learning_rate": 1.7415779293040887e-05, + "loss": 0.5803, "step": 835 }, { - "epoch": 0.03894297635605007, - "grad_norm": 6.979407787322998, - "learning_rate": 7.788595271210014e-06, - "loss": 1.0347, + "epoch": 0.31258721741557355, + "grad_norm": 0.012213567271828651, + "learning_rate": 1.7372041856308098e-05, + "loss": 0.6624, "step": 840 }, { - "epoch": 0.039174779786740846, - "grad_norm": 6.700367450714111, - "learning_rate": 7.83495595734817e-06, - "loss": 0.9949, + "epoch": 0.31444785561447575, + "grad_norm": 0.0131307952105999, + "learning_rate": 1.7327993309965583e-05, + "loss": 0.6447, "step": 845 }, { - "epoch": 0.039406583217431616, - "grad_norm": 5.5337605476379395, - "learning_rate": 7.881316643486325e-06, - "loss": 0.9346, + "epoch": 0.316308493813378, + "grad_norm": 0.0121999466791749, + "learning_rate": 1.7283635512918423e-05, + "loss": 0.6451, "step": 850 }, { - "epoch": 0.039638386648122394, - "grad_norm": 6.536638259887695, - "learning_rate": 7.92767732962448e-06, - "loss": 0.8486, + "epoch": 0.3181691320122802, + "grad_norm": 0.012872702442109585, + "learning_rate": 1.7238970337122484e-05, + "loss": 0.5724, "step": 855 }, { - "epoch": 0.039870190078813164, - "grad_norm": 5.657130241394043, - "learning_rate": 7.974038015762633e-06, - "loss": 0.9584, + "epoch": 0.32002977021118245, + "grad_norm": 0.013137550093233585, + "learning_rate": 1.7193999667505387e-05, + "loss": 0.6459, "step": 860 }, { - "epoch": 0.04010199350950394, - "grad_norm": 5.246485233306885, - "learning_rate": 8.020398701900789e-06, - "loss": 0.9707, + "epoch": 0.32189040841008465, + "grad_norm": 0.013948196545243263, + "learning_rate": 1.7148725401887002e-05, + "loss": 0.651, "step": 865 }, { - "epoch": 0.04033379694019471, - "grad_norm": 8.451269149780273, - "learning_rate": 8.066759388038944e-06, - "loss": 0.9778, + "epoch": 0.3237510466089869, + "grad_norm": 0.012517811730504036, + "learning_rate": 1.710314945089933e-05, + "loss": 0.6114, "step": 870 }, { - "epoch": 0.04056560037088549, - "grad_norm": 5.41778564453125, - "learning_rate": 8.1131200741771e-06, - "loss": 1.0868, + "epoch": 0.3256116848078891, + "grad_norm": 0.014199101366102695, + "learning_rate": 1.7057273737905887e-05, + "loss": 0.6405, "step": 875 }, { - "epoch": 0.04079740380157627, - "grad_norm": 5.614444255828857, - "learning_rate": 8.159480760315253e-06, - "loss": 1.0862, + "epoch": 0.32747232300679135, + "grad_norm": 0.012591714970767498, + "learning_rate": 1.7011100198920528e-05, + "loss": 0.6767, "step": 880 }, { - "epoch": 0.04102920723226704, - "grad_norm": 5.727239608764648, - "learning_rate": 8.205841446453408e-06, - "loss": 1.028, + "epoch": 0.32933296120569355, + "grad_norm": 0.012114683166146278, + "learning_rate": 1.6964630782525743e-05, + "loss": 0.6037, "step": 885 }, { - "epoch": 0.041261010662957814, - "grad_norm": 6.3700056076049805, - "learning_rate": 8.252202132591564e-06, - "loss": 1.1344, + "epoch": 0.33119359940459575, + "grad_norm": 0.013139299117028713, + "learning_rate": 1.6917867449790432e-05, + "loss": 0.643, "step": 890 }, { - "epoch": 0.041492814093648585, - "grad_norm": 5.90902042388916, - "learning_rate": 8.298562818729717e-06, - "loss": 1.0694, + "epoch": 0.333054237603498, + "grad_norm": 0.012391779571771622, + "learning_rate": 1.6870812174187136e-05, + "loss": 0.647, "step": 895 }, { - "epoch": 0.04172461752433936, - "grad_norm": 7.204585075378418, - "learning_rate": 8.344923504867873e-06, - "loss": 1.1645, + "epoch": 0.3349148758024002, + "grad_norm": 0.013360656797885895, + "learning_rate": 1.6823466941508762e-05, + "loss": 0.725, "step": 900 }, { - "epoch": 0.04172461752433936, - "eval_loss": 1.035333275794983, - "eval_runtime": 11.2783, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, + "epoch": 0.3349148758024002, + "eval_loss": 0.6488396525382996, + "eval_runtime": 27.3774, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 4.639, "step": 900 }, { - "epoch": 0.04195642095503013, - "grad_norm": 7.7777018547058105, - "learning_rate": 8.391284191006028e-06, - "loss": 1.1797, + "epoch": 0.33677551400130246, + "grad_norm": 0.013424807228147984, + "learning_rate": 1.677583374978478e-05, + "loss": 0.6342, "step": 905 }, { - "epoch": 0.04218822438572091, - "grad_norm": 6.48301362991333, - "learning_rate": 8.437644877144181e-06, - "loss": 0.9009, + "epoch": 0.33863615220020465, + "grad_norm": 0.01573541946709156, + "learning_rate": 1.6727914609196895e-05, + "loss": 0.6562, "step": 910 }, { - "epoch": 0.04242002781641168, - "grad_norm": 5.617315769195557, - "learning_rate": 8.484005563282337e-06, - "loss": 0.9737, + "epoch": 0.3404967903991069, + "grad_norm": 0.011363287456333637, + "learning_rate": 1.6679711541994227e-05, + "loss": 0.6492, "step": 915 }, { - "epoch": 0.04265183124710246, - "grad_norm": 5.954869747161865, - "learning_rate": 8.530366249420492e-06, - "loss": 0.9795, + "epoch": 0.3423574285980091, + "grad_norm": 0.012154373340308666, + "learning_rate": 1.6631226582407954e-05, + "loss": 0.602, "step": 920 }, { - "epoch": 0.04288363467779323, - "grad_norm": 5.577536582946777, - "learning_rate": 8.576726935558647e-06, - "loss": 1.1313, + "epoch": 0.34421806679691136, + "grad_norm": 0.01744014024734497, + "learning_rate": 1.658246177656548e-05, + "loss": 0.6318, "step": 925 }, { - "epoch": 0.043115438108484005, - "grad_norm": 5.356098175048828, - "learning_rate": 8.623087621696801e-06, - "loss": 0.9889, + "epoch": 0.34607870499581356, + "grad_norm": 0.012465902604162693, + "learning_rate": 1.6533419182404078e-05, + "loss": 0.6522, "step": 930 }, { - "epoch": 0.04334724153917478, - "grad_norm": 6.9520087242126465, - "learning_rate": 8.669448307834956e-06, - "loss": 1.1183, + "epoch": 0.3479393431947158, + "grad_norm": 0.015055039897561073, + "learning_rate": 1.6484100869584044e-05, + "loss": 0.6376, "step": 935 }, { - "epoch": 0.04357904496986555, - "grad_norm": 5.181753635406494, - "learning_rate": 8.715808993973112e-06, - "loss": 0.9116, + "epoch": 0.349799981393618, + "grad_norm": 0.012351201847195625, + "learning_rate": 1.6434508919401357e-05, + "loss": 0.6206, "step": 940 }, { - "epoch": 0.04381084840055633, - "grad_norm": 6.152126312255859, - "learning_rate": 8.762169680111267e-06, - "loss": 1.0747, + "epoch": 0.3516606195925202, + "grad_norm": 0.012793191708624363, + "learning_rate": 1.6384645424699835e-05, + "loss": 0.6182, "step": 945 }, { - "epoch": 0.0440426518312471, - "grad_norm": 6.403101921081543, - "learning_rate": 8.808530366249422e-06, - "loss": 0.9084, + "epoch": 0.35352125779142246, + "grad_norm": 0.012946651317179203, + "learning_rate": 1.6334512489782833e-05, + "loss": 0.5839, "step": 950 }, { - "epoch": 0.04427445526193788, - "grad_norm": 5.676062107086182, - "learning_rate": 8.854891052387576e-06, - "loss": 0.8152, + "epoch": 0.35538189599032466, + "grad_norm": 0.012998082675039768, + "learning_rate": 1.628411223032442e-05, + "loss": 0.6517, "step": 955 }, { - "epoch": 0.04450625869262865, - "grad_norm": 5.6627936363220215, - "learning_rate": 8.901251738525731e-06, - "loss": 1.0508, + "epoch": 0.3572425341892269, + "grad_norm": 0.012614963576197624, + "learning_rate": 1.6233446773280113e-05, + "loss": 0.6235, "step": 960 }, { - "epoch": 0.044738062123319426, - "grad_norm": 5.71261739730835, - "learning_rate": 8.947612424663886e-06, - "loss": 1.0253, + "epoch": 0.3591031723881291, + "grad_norm": 0.012318151071667671, + "learning_rate": 1.6182518256797095e-05, + "loss": 0.664, "step": 965 }, { - "epoch": 0.0449698655540102, - "grad_norm": 7.736510753631592, - "learning_rate": 8.99397311080204e-06, - "loss": 1.0148, + "epoch": 0.36096381058703136, + "grad_norm": 0.012551162391901016, + "learning_rate": 1.6131328830123997e-05, + "loss": 0.6317, "step": 970 }, { - "epoch": 0.045201668984700974, - "grad_norm": 6.378748416900635, - "learning_rate": 9.040333796940195e-06, - "loss": 1.0871, + "epoch": 0.36282444878593356, + "grad_norm": 0.013372802175581455, + "learning_rate": 1.60798806535202e-05, + "loss": 0.6418, "step": 975 }, { - "epoch": 0.045433472415391744, - "grad_norm": 6.469996452331543, - "learning_rate": 9.08669448307835e-06, - "loss": 0.9835, + "epoch": 0.3646850869848358, + "grad_norm": 0.011675640940666199, + "learning_rate": 1.6028175898164665e-05, + "loss": 0.6118, "step": 980 }, { - "epoch": 0.04566527584608252, - "grad_norm": 5.87582540512085, - "learning_rate": 9.133055169216504e-06, - "loss": 0.8729, + "epoch": 0.366545725183738, + "grad_norm": 0.013295911252498627, + "learning_rate": 1.5976216746064294e-05, + "loss": 0.6217, "step": 985 }, { - "epoch": 0.0458970792767733, - "grad_norm": 6.7339606285095215, - "learning_rate": 9.17941585535466e-06, - "loss": 1.0141, + "epoch": 0.36840636338264027, + "grad_norm": 0.012895721010863781, + "learning_rate": 1.5924005389961866e-05, + "loss": 0.6436, "step": 990 }, { - "epoch": 0.04612888270746407, - "grad_norm": 6.447714328765869, - "learning_rate": 9.225776541492815e-06, - "loss": 0.8944, + "epoch": 0.37026700158154247, + "grad_norm": 0.014572090469300747, + "learning_rate": 1.5871544033243488e-05, + "loss": 0.6342, "step": 995 }, { - "epoch": 0.04636068613815485, - "grad_norm": 5.313723564147949, - "learning_rate": 9.27213722763097e-06, - "loss": 0.926, + "epoch": 0.3721276397804447, + "grad_norm": 0.012393898330628872, + "learning_rate": 1.581883488984562e-05, + "loss": 0.6218, "step": 1000 }, { - "epoch": 0.04636068613815485, - "eval_loss": 1.0359811782836914, - "eval_runtime": 11.2916, - "eval_samples_per_second": 11.247, - "eval_steps_per_second": 11.247, + "epoch": 0.3721276397804447, + "eval_loss": 0.6476932168006897, + "eval_runtime": 27.1488, + "eval_samples_per_second": 4.678, + "eval_steps_per_second": 4.678, "step": 1000 }, { - "epoch": 0.04659248956884562, - "grad_norm": 5.054633140563965, - "learning_rate": 9.318497913769124e-06, - "loss": 0.9767, + "epoch": 0.3739882779793469, + "grad_norm": 0.014692210592329502, + "learning_rate": 1.5765880184161625e-05, + "loss": 0.6216, "step": 1005 }, { - "epoch": 0.046824292999536395, - "grad_norm": 5.397629737854004, - "learning_rate": 9.36485859990728e-06, - "loss": 1.0489, + "epoch": 0.3758489161782491, + "grad_norm": 0.012152746319770813, + "learning_rate": 1.5712682150947926e-05, + "loss": 0.6243, "step": 1010 }, { - "epoch": 0.047056096430227165, - "grad_norm": 4.538703441619873, - "learning_rate": 9.411219286045434e-06, - "loss": 0.854, + "epoch": 0.37770955437715137, + "grad_norm": 0.012929155491292477, + "learning_rate": 1.5659243035229657e-05, + "loss": 0.6493, "step": 1015 }, { - "epoch": 0.04728789986091794, - "grad_norm": 5.5513529777526855, - "learning_rate": 9.45757997218359e-06, - "loss": 0.9744, + "epoch": 0.37957019257605357, + "grad_norm": 0.0136475944891572, + "learning_rate": 1.5605565092205973e-05, + "loss": 0.6506, "step": 1020 }, { - "epoch": 0.04751970329160871, - "grad_norm": 5.287125110626221, - "learning_rate": 9.503940658321743e-06, - "loss": 0.993, + "epoch": 0.3814308307749558, + "grad_norm": 0.014008302241563797, + "learning_rate": 1.5551650587154815e-05, + "loss": 0.6429, "step": 1025 }, { - "epoch": 0.04775150672229949, - "grad_norm": 5.260387897491455, - "learning_rate": 9.550301344459899e-06, - "loss": 0.9027, + "epoch": 0.383291468973858, + "grad_norm": 0.014000017195940018, + "learning_rate": 1.5497501795337366e-05, + "loss": 0.6277, "step": 1030 }, { - "epoch": 0.04798331015299027, - "grad_norm": 7.19069242477417, - "learning_rate": 9.596662030598054e-06, - "loss": 0.783, + "epoch": 0.38515210717276027, + "grad_norm": 0.012146887369453907, + "learning_rate": 1.5443121001901994e-05, + "loss": 0.635, "step": 1035 }, { - "epoch": 0.04821511358368104, - "grad_norm": 5.297065734863281, - "learning_rate": 9.64302271673621e-06, - "loss": 0.9357, + "epoch": 0.38701274537166247, + "grad_norm": 0.013878699392080307, + "learning_rate": 1.5388510501787855e-05, + "loss": 0.6416, "step": 1040 }, { - "epoch": 0.048446917014371815, - "grad_norm": 5.547972202301025, - "learning_rate": 9.689383402874365e-06, - "loss": 1.1059, + "epoch": 0.3888733835705647, + "grad_norm": 0.011823480948805809, + "learning_rate": 1.5333672599628005e-05, + "loss": 0.637, "step": 1045 }, { - "epoch": 0.048678720445062586, - "grad_norm": 5.573083400726318, - "learning_rate": 9.735744089012518e-06, - "loss": 1.0051, + "epoch": 0.3907340217694669, + "grad_norm": 0.012524113990366459, + "learning_rate": 1.527860960965216e-05, + "loss": 0.6763, "step": 1050 }, { - "epoch": 0.04891052387575336, - "grad_norm": 6.3478546142578125, - "learning_rate": 9.782104775150672e-06, - "loss": 1.2446, + "epoch": 0.3925946599683692, + "grad_norm": 0.013192784041166306, + "learning_rate": 1.5223323855589027e-05, + "loss": 0.6501, "step": 1055 }, { - "epoch": 0.049142327306444133, - "grad_norm": 5.411722183227539, - "learning_rate": 9.828465461288827e-06, - "loss": 0.9051, + "epoch": 0.3944552981672714, + "grad_norm": 0.012439992278814316, + "learning_rate": 1.5167817670568253e-05, + "loss": 0.5886, "step": 1060 }, { - "epoch": 0.04937413073713491, - "grad_norm": 5.237289905548096, - "learning_rate": 9.874826147426983e-06, - "loss": 1.1089, + "epoch": 0.39631593636617357, + "grad_norm": 0.013605669140815735, + "learning_rate": 1.5112093397021945e-05, + "loss": 0.5925, "step": 1065 }, { - "epoch": 0.04960593416782568, - "grad_norm": 6.6244354248046875, - "learning_rate": 9.921186833565138e-06, - "loss": 1.0403, + "epoch": 0.3981765745650758, + "grad_norm": 0.01384530495852232, + "learning_rate": 1.5056153386585828e-05, + "loss": 0.6607, "step": 1070 }, { - "epoch": 0.04983773759851646, - "grad_norm": 5.800417423248291, - "learning_rate": 9.967547519703291e-06, - "loss": 1.2312, + "epoch": 0.400037212763978, + "grad_norm": 0.014060786925256252, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.6458, "step": 1075 }, { - "epoch": 0.05006954102920723, - "grad_norm": 5.320492744445801, - "learning_rate": 1.0013908205841447e-05, - "loss": 0.8337, + "epoch": 0.4018978509628803, + "grad_norm": 0.012853951193392277, + "learning_rate": 1.494363560700931e-05, + "loss": 0.6028, "step": 1080 }, { - "epoch": 0.050301344459898006, - "grad_norm": 5.400928974151611, - "learning_rate": 1.0060268891979604e-05, - "loss": 0.884, + "epoch": 0.4037584891617825, + "grad_norm": 0.012846381403505802, + "learning_rate": 1.4887062586263334e-05, + "loss": 0.6543, "step": 1085 }, { - "epoch": 0.050533147890588784, - "grad_norm": 5.387668132781982, - "learning_rate": 1.0106629578117757e-05, - "loss": 0.9599, + "epoch": 0.40561912736068473, + "grad_norm": 0.013001542538404465, + "learning_rate": 1.4830283325216026e-05, + "loss": 0.5654, "step": 1090 }, { - "epoch": 0.050764951321279554, - "grad_norm": 5.861815929412842, - "learning_rate": 1.0152990264255911e-05, - "loss": 0.9738, + "epoch": 0.4074797655595869, + "grad_norm": 0.012333175167441368, + "learning_rate": 1.477330022002493e-05, + "loss": 0.6465, "step": 1095 }, { - "epoch": 0.05099675475197033, - "grad_norm": 6.088605880737305, - "learning_rate": 1.0199350950394066e-05, - "loss": 1.1038, + "epoch": 0.4093404037584892, + "grad_norm": 0.01342159602791071, + "learning_rate": 1.4716115675450078e-05, + "loss": 0.6168, "step": 1100 }, { - "epoch": 0.05099675475197033, - "eval_loss": 1.0365773439407349, - "eval_runtime": 11.2883, - "eval_samples_per_second": 11.251, - "eval_steps_per_second": 11.251, + "epoch": 0.4093404037584892, + "eval_loss": 0.6468775272369385, + "eval_runtime": 26.8263, + "eval_samples_per_second": 4.734, + "eval_steps_per_second": 4.734, "step": 1100 }, { - "epoch": 0.0512285581826611, - "grad_norm": 5.065865516662598, - "learning_rate": 1.024571163653222e-05, - "loss": 0.8697, + "epoch": 0.4112010419573914, + "grad_norm": 0.0139292748644948, + "learning_rate": 1.4658732104752507e-05, + "loss": 0.634, "step": 1105 }, { - "epoch": 0.05146036161335188, - "grad_norm": 6.684823513031006, - "learning_rate": 1.0292072322670377e-05, - "loss": 1.1078, + "epoch": 0.41306168015629363, + "grad_norm": 0.013169731944799423, + "learning_rate": 1.4601151929592403e-05, + "loss": 0.6227, "step": 1110 }, { - "epoch": 0.05169216504404265, - "grad_norm": 5.413392066955566, - "learning_rate": 1.033843300880853e-05, - "loss": 0.9671, + "epoch": 0.41492231835519583, + "grad_norm": 0.013705245219171047, + "learning_rate": 1.4543377579926915e-05, + "loss": 0.6441, "step": 1115 }, { - "epoch": 0.05192396847473343, - "grad_norm": 5.247165679931641, - "learning_rate": 1.0384793694946686e-05, - "loss": 0.8268, + "epoch": 0.4167829565540981, + "grad_norm": 0.013035726733505726, + "learning_rate": 1.4485411493907617e-05, + "loss": 0.6498, "step": 1120 }, { - "epoch": 0.0521557719054242, - "grad_norm": 5.909581661224365, - "learning_rate": 1.0431154381084841e-05, - "loss": 1.0279, + "epoch": 0.4186435947530003, + "grad_norm": 0.01190096139907837, + "learning_rate": 1.442725611777758e-05, + "loss": 0.6285, "step": 1125 }, { - "epoch": 0.052387575336114975, - "grad_norm": 5.5670366287231445, - "learning_rate": 1.0477515067222996e-05, - "loss": 1.0836, + "epoch": 0.4205042329519025, + "grad_norm": 0.013753347098827362, + "learning_rate": 1.4368913905768178e-05, + "loss": 0.6541, "step": 1130 }, { - "epoch": 0.052619378766805745, - "grad_norm": 6.892451763153076, - "learning_rate": 1.052387575336115e-05, - "loss": 0.8755, + "epoch": 0.42236487115080473, + "grad_norm": 0.012330746278166771, + "learning_rate": 1.4310387319995492e-05, + "loss": 0.6721, "step": 1135 }, { - "epoch": 0.05285118219749652, - "grad_norm": 7.612436294555664, - "learning_rate": 1.0570236439499307e-05, - "loss": 1.0173, + "epoch": 0.42422550934970693, + "grad_norm": 0.01278294064104557, + "learning_rate": 1.4251678830356408e-05, + "loss": 0.6589, "step": 1140 }, { - "epoch": 0.0530829856281873, - "grad_norm": 5.493641376495361, - "learning_rate": 1.061659712563746e-05, - "loss": 1.0388, + "epoch": 0.4260861475486092, + "grad_norm": 0.012772184796631336, + "learning_rate": 1.41927909144244e-05, + "loss": 0.6411, "step": 1145 }, { - "epoch": 0.05331478905887807, - "grad_norm": 5.2113471031188965, - "learning_rate": 1.0662957811775616e-05, - "loss": 0.9213, + "epoch": 0.4279467857475114, + "grad_norm": 0.012047790922224522, + "learning_rate": 1.413372605734495e-05, + "loss": 0.5759, "step": 1150 }, { - "epoch": 0.05354659248956885, - "grad_norm": 6.035247802734375, - "learning_rate": 1.070931849791377e-05, - "loss": 0.9501, + "epoch": 0.42980742394641364, + "grad_norm": 0.014543715864419937, + "learning_rate": 1.4074486751730687e-05, + "loss": 0.6578, "step": 1155 }, { - "epoch": 0.05377839592025962, - "grad_norm": 6.905342102050781, - "learning_rate": 1.0755679184051927e-05, - "loss": 1.0902, + "epoch": 0.43166806214531583, + "grad_norm": 0.013436605222523212, + "learning_rate": 1.4015075497556193e-05, + "loss": 0.5876, "step": 1160 }, { - "epoch": 0.054010199350950396, - "grad_norm": 4.778216361999512, - "learning_rate": 1.080203987019008e-05, - "loss": 1.0163, + "epoch": 0.4335287003442181, + "grad_norm": 0.011584432795643806, + "learning_rate": 1.3955494802052498e-05, + "loss": 0.656, "step": 1165 }, { - "epoch": 0.054242002781641166, - "grad_norm": 5.389718055725098, - "learning_rate": 1.0848400556328234e-05, - "loss": 0.9602, + "epoch": 0.4353893385431203, + "grad_norm": 0.012196795083582401, + "learning_rate": 1.3895747179601275e-05, + "loss": 0.6562, "step": 1170 }, { - "epoch": 0.05447380621233194, - "grad_norm": 5.472878932952881, - "learning_rate": 1.0894761242466389e-05, - "loss": 1.1119, + "epoch": 0.43724997674202254, + "grad_norm": 0.011762428097426891, + "learning_rate": 1.3835835151628728e-05, + "loss": 0.5918, "step": 1175 }, { - "epoch": 0.054705609643022714, - "grad_norm": 6.158339500427246, - "learning_rate": 1.0941121928604543e-05, - "loss": 1.152, + "epoch": 0.43911061494092474, + "grad_norm": 0.014100808650255203, + "learning_rate": 1.3775761246499177e-05, + "loss": 0.6216, "step": 1180 }, { - "epoch": 0.05493741307371349, - "grad_norm": 6.443728923797607, - "learning_rate": 1.09874826147427e-05, - "loss": 1.0942, + "epoch": 0.44097125313982694, + "grad_norm": 0.011724433861672878, + "learning_rate": 1.3715527999408376e-05, + "loss": 0.6434, "step": 1185 }, { - "epoch": 0.05516921650440427, - "grad_norm": 6.741264343261719, - "learning_rate": 1.1033843300880853e-05, - "loss": 1.0458, + "epoch": 0.4428318913387292, + "grad_norm": 0.013045977801084518, + "learning_rate": 1.365513795227651e-05, + "loss": 0.5915, "step": 1190 }, { - "epoch": 0.05540101993509504, - "grad_norm": 5.246371269226074, - "learning_rate": 1.1080203987019009e-05, - "loss": 0.849, + "epoch": 0.4446925295376314, + "grad_norm": 0.013339078053832054, + "learning_rate": 1.359459365364092e-05, + "loss": 0.6148, "step": 1195 }, { - "epoch": 0.055632823365785816, - "grad_norm": 7.0242509841918945, - "learning_rate": 1.1126564673157162e-05, - "loss": 1.0244, + "epoch": 0.44655316773653364, + "grad_norm": 0.011851554736495018, + "learning_rate": 1.3533897658548571e-05, + "loss": 0.6294, "step": 1200 }, { - "epoch": 0.055632823365785816, - "eval_loss": 1.035953164100647, - "eval_runtime": 11.2796, - "eval_samples_per_second": 11.259, - "eval_steps_per_second": 11.259, + "epoch": 0.44655316773653364, + "eval_loss": 0.6459712982177734, + "eval_runtime": 27.1858, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 4.672, "step": 1200 }, { - "epoch": 0.05586462679647659, - "grad_norm": 5.66998815536499, - "learning_rate": 1.117292535929532e-05, - "loss": 0.9911, + "epoch": 0.44841380593543584, + "grad_norm": 0.014516811817884445, + "learning_rate": 1.3473052528448203e-05, + "loss": 0.6052, "step": 1205 }, { - "epoch": 0.056096430227167364, - "grad_norm": 6.682615756988525, - "learning_rate": 1.1219286045433473e-05, - "loss": 0.9718, + "epoch": 0.4502744441343381, + "grad_norm": 0.013740907423198223, + "learning_rate": 1.341206083108225e-05, + "loss": 0.6035, "step": 1210 }, { - "epoch": 0.056328233657858134, - "grad_norm": 6.9051513671875, - "learning_rate": 1.1265646731571628e-05, - "loss": 1.1582, + "epoch": 0.4521350823332403, + "grad_norm": 0.013699905015528202, + "learning_rate": 1.3350925140378465e-05, + "loss": 0.64, "step": 1215 }, { - "epoch": 0.05656003708854891, - "grad_norm": 5.607714653015137, - "learning_rate": 1.1312007417709784e-05, - "loss": 0.9506, + "epoch": 0.45399572053214254, + "grad_norm": 0.013219136744737625, + "learning_rate": 1.328964803634131e-05, + "loss": 0.6102, "step": 1220 }, { - "epoch": 0.05679184051923968, - "grad_norm": 6.658040523529053, - "learning_rate": 1.1358368103847939e-05, - "loss": 1.1766, + "epoch": 0.45585635873104474, + "grad_norm": 0.012603058479726315, + "learning_rate": 1.3228232104943073e-05, + "loss": 0.5452, "step": 1225 }, { - "epoch": 0.05702364394993046, - "grad_norm": 7.252627849578857, - "learning_rate": 1.1404728789986092e-05, - "loss": 0.9402, + "epoch": 0.457716996929947, + "grad_norm": 0.012072132900357246, + "learning_rate": 1.3166679938014728e-05, + "loss": 0.5864, "step": 1230 }, { - "epoch": 0.05725544738062123, - "grad_norm": 6.398477554321289, - "learning_rate": 1.1451089476124248e-05, - "loss": 0.8712, + "epoch": 0.4595776351288492, + "grad_norm": 0.01093310210853815, + "learning_rate": 1.3104994133136563e-05, + "loss": 0.6122, "step": 1235 }, { - "epoch": 0.05748725081131201, - "grad_norm": 6.501800060272217, - "learning_rate": 1.1497450162262403e-05, - "loss": 1.164, + "epoch": 0.4614382733277514, + "grad_norm": 0.01245942059904337, + "learning_rate": 1.3043177293528571e-05, + "loss": 0.5889, "step": 1240 }, { - "epoch": 0.057719054242002785, - "grad_norm": 5.963634967803955, - "learning_rate": 1.1543810848400557e-05, - "loss": 1.0071, + "epoch": 0.46329891152665365, + "grad_norm": 0.013127041980624199, + "learning_rate": 1.2981232027940562e-05, + "loss": 0.6225, "step": 1245 }, { - "epoch": 0.057950857672693555, - "grad_norm": 6.0244317054748535, - "learning_rate": 1.1590171534538712e-05, - "loss": 1.1575, + "epoch": 0.46515954972555584, + "grad_norm": 0.018618909642100334, + "learning_rate": 1.2919160950542095e-05, + "loss": 0.6189, "step": 1250 }, { - "epoch": 0.05818266110338433, - "grad_norm": 7.449973106384277, - "learning_rate": 1.1636532220676866e-05, - "loss": 0.8669, + "epoch": 0.4670201879244581, + "grad_norm": 0.013042682781815529, + "learning_rate": 1.2856966680812148e-05, + "loss": 0.674, "step": 1255 }, { - "epoch": 0.0584144645340751, - "grad_norm": 6.4726080894470215, - "learning_rate": 1.1682892906815023e-05, - "loss": 0.9558, + "epoch": 0.4688808261233603, + "grad_norm": 0.013208975084125996, + "learning_rate": 1.2794651843428575e-05, + "loss": 0.6084, "step": 1260 }, { - "epoch": 0.05864626796476588, - "grad_norm": 4.924577713012695, - "learning_rate": 1.1729253592953176e-05, - "loss": 0.9858, + "epoch": 0.47074146432226255, + "grad_norm": 0.012473770417273045, + "learning_rate": 1.2732219068157335e-05, + "loss": 0.5748, "step": 1265 }, { - "epoch": 0.05887807139545665, - "grad_norm": 7.104029178619385, - "learning_rate": 1.1775614279091332e-05, - "loss": 1.0259, + "epoch": 0.47260210252116475, + "grad_norm": 0.013629582710564137, + "learning_rate": 1.2669670989741519e-05, + "loss": 0.6358, "step": 1270 }, { - "epoch": 0.05910987482614743, - "grad_norm": 5.916851997375488, - "learning_rate": 1.1821974965229485e-05, - "loss": 1.0703, + "epoch": 0.474462740720067, + "grad_norm": 0.014257396571338177, + "learning_rate": 1.2607010247790158e-05, + "loss": 0.6794, "step": 1275 }, { - "epoch": 0.0593416782568382, - "grad_norm": 5.41799783706665, - "learning_rate": 1.1868335651367642e-05, - "loss": 0.9557, + "epoch": 0.4763233789189692, + "grad_norm": 0.0164741612970829, + "learning_rate": 1.2544239486666831e-05, + "loss": 0.6647, "step": 1280 }, { - "epoch": 0.059573481687528976, - "grad_norm": 5.087590217590332, - "learning_rate": 1.1914696337505796e-05, - "loss": 0.8486, + "epoch": 0.47818401711787145, + "grad_norm": 0.012896180152893066, + "learning_rate": 1.2481361355378066e-05, + "loss": 0.6413, "step": 1285 }, { - "epoch": 0.059805285118219746, - "grad_norm": 4.912267208099365, - "learning_rate": 1.1961057023643951e-05, - "loss": 1.0576, + "epoch": 0.48004465531677365, + "grad_norm": 0.012296337634325027, + "learning_rate": 1.2418378507461544e-05, + "loss": 0.62, "step": 1290 }, { - "epoch": 0.060037088548910524, - "grad_norm": 5.457602024078369, - "learning_rate": 1.2007417709782105e-05, - "loss": 1.0238, + "epoch": 0.4819052935156759, + "grad_norm": 0.01213445421308279, + "learning_rate": 1.2355293600874132e-05, + "loss": 0.6611, "step": 1295 }, { - "epoch": 0.0602688919796013, - "grad_norm": 5.606520652770996, - "learning_rate": 1.2053778395920262e-05, - "loss": 0.9371, + "epoch": 0.4837659317145781, + "grad_norm": 0.01198558695614338, + "learning_rate": 1.229210929787969e-05, + "loss": 0.6438, "step": 1300 }, { - "epoch": 0.0602688919796013, - "eval_loss": 1.0377291440963745, - "eval_runtime": 11.2632, - "eval_samples_per_second": 11.276, - "eval_steps_per_second": 11.276, + "epoch": 0.4837659317145781, + "eval_loss": 0.6453238725662231, + "eval_runtime": 26.6037, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 4.774, "step": 1300 }, { - "epoch": 0.06050069541029207, - "grad_norm": 5.769026279449463, - "learning_rate": 1.2100139082058415e-05, - "loss": 0.9785, + "epoch": 0.4856265699134803, + "grad_norm": 0.012105841189622879, + "learning_rate": 1.2228828264936755e-05, + "loss": 0.675, "step": 1305 }, { - "epoch": 0.06073249884098285, - "grad_norm": 4.725026607513428, - "learning_rate": 1.214649976819657e-05, - "loss": 0.7743, + "epoch": 0.48748720811238255, + "grad_norm": 0.012657279148697853, + "learning_rate": 1.2165453172585964e-05, + "loss": 0.6066, "step": 1310 }, { - "epoch": 0.06096430227167362, - "grad_norm": 5.6914567947387695, - "learning_rate": 1.2192860454334726e-05, - "loss": 0.926, + "epoch": 0.48934784631128475, + "grad_norm": 0.01320530753582716, + "learning_rate": 1.2101986695337407e-05, + "loss": 0.6578, "step": 1315 }, { - "epoch": 0.061196105702364396, - "grad_norm": 5.947300434112549, - "learning_rate": 1.223922114047288e-05, - "loss": 0.8786, + "epoch": 0.491208484510187, + "grad_norm": 0.012736879289150238, + "learning_rate": 1.2038431511557715e-05, + "loss": 0.6596, "step": 1320 }, { - "epoch": 0.06142790913305517, - "grad_norm": 4.879319190979004, - "learning_rate": 1.2285581826611035e-05, - "loss": 0.9271, + "epoch": 0.4930691227090892, + "grad_norm": 0.013182558119297028, + "learning_rate": 1.197479030335706e-05, + "loss": 0.595, "step": 1325 }, { - "epoch": 0.061659712563745944, - "grad_norm": 5.405600070953369, - "learning_rate": 1.2331942512749188e-05, - "loss": 1.0455, + "epoch": 0.49492976090799146, + "grad_norm": 0.013970241881906986, + "learning_rate": 1.1911065756475953e-05, + "loss": 0.6525, "step": 1330 }, { - "epoch": 0.061891515994436715, - "grad_norm": 6.316072463989258, - "learning_rate": 1.2378303198887345e-05, - "loss": 0.9825, + "epoch": 0.49679039910689365, + "grad_norm": 0.012158108875155449, + "learning_rate": 1.1847260560171895e-05, + "loss": 0.576, "step": 1335 }, { - "epoch": 0.06212331942512749, - "grad_norm": 5.218293190002441, - "learning_rate": 1.2424663885025499e-05, - "loss": 0.9407, + "epoch": 0.4986510373057959, + "grad_norm": 0.012398924678564072, + "learning_rate": 1.1783377407105907e-05, + "loss": 0.6039, "step": 1340 }, { - "epoch": 0.06235512285581827, - "grad_norm": 5.403647422790527, - "learning_rate": 1.2471024571163654e-05, - "loss": 0.9509, + "epoch": 0.5005116755046981, + "grad_norm": 0.013791786506772041, + "learning_rate": 1.1719418993228883e-05, + "loss": 0.6585, "step": 1345 }, { - "epoch": 0.06258692628650904, - "grad_norm": 5.6688666343688965, - "learning_rate": 1.2517385257301808e-05, - "loss": 0.9351, + "epoch": 0.5023723137036004, + "grad_norm": 0.011280239559710026, + "learning_rate": 1.1655388017667812e-05, + "loss": 0.5919, "step": 1350 }, { - "epoch": 0.06281872971719982, - "grad_norm": 4.872259616851807, - "learning_rate": 1.2563745943439965e-05, - "loss": 0.9504, + "epoch": 0.5042329519025026, + "grad_norm": 0.015436794608831406, + "learning_rate": 1.159128718261189e-05, + "loss": 0.6632, "step": 1355 }, { - "epoch": 0.0630505331478906, - "grad_norm": 5.114622592926025, - "learning_rate": 1.2610106629578119e-05, - "loss": 0.9407, + "epoch": 0.5060935901014048, + "grad_norm": 0.015739573165774345, + "learning_rate": 1.1527119193198466e-05, + "loss": 0.6384, "step": 1360 }, { - "epoch": 0.06328233657858136, - "grad_norm": 6.169310092926025, - "learning_rate": 1.2656467315716274e-05, - "loss": 1.2146, + "epoch": 0.507954228300307, + "grad_norm": 0.01318281702697277, + "learning_rate": 1.146288675739889e-05, + "loss": 0.6312, "step": 1365 }, { - "epoch": 0.06351414000927214, - "grad_norm": 6.065832138061523, - "learning_rate": 1.2702828001854428e-05, - "loss": 1.0371, + "epoch": 0.5098148664992093, + "grad_norm": 0.01480270829051733, + "learning_rate": 1.1398592585904234e-05, + "loss": 0.6453, "step": 1370 }, { - "epoch": 0.06374594343996291, - "grad_norm": 7.362551689147949, - "learning_rate": 1.2749188687992585e-05, - "loss": 0.9644, + "epoch": 0.5116755046981114, + "grad_norm": 0.011967113241553307, + "learning_rate": 1.133423939201089e-05, + "loss": 0.6335, "step": 1375 }, { - "epoch": 0.06397774687065369, - "grad_norm": 6.0073065757751465, - "learning_rate": 1.2795549374130738e-05, - "loss": 1.1034, + "epoch": 0.5135361428970137, + "grad_norm": 0.013129732571542263, + "learning_rate": 1.1269829891506081e-05, + "loss": 0.5852, "step": 1380 }, { - "epoch": 0.06420955030134445, - "grad_norm": 6.123709201812744, - "learning_rate": 1.2841910060268893e-05, - "loss": 1.0149, + "epoch": 0.5153967810959159, + "grad_norm": 0.0132956113666296, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.6477, "step": 1385 }, { - "epoch": 0.06444135373203523, - "grad_norm": 5.83848237991333, - "learning_rate": 1.2888270746407047e-05, - "loss": 1.1497, + "epoch": 0.5172574192948182, + "grad_norm": 0.01314165536314249, + "learning_rate": 1.1140852845577273e-05, + "loss": 0.6441, "step": 1390 }, { - "epoch": 0.06467315716272601, - "grad_norm": 5.201710224151611, - "learning_rate": 1.2934631432545204e-05, - "loss": 0.8791, + "epoch": 0.5191180574937203, + "grad_norm": 0.01178740430623293, + "learning_rate": 1.1076290743149827e-05, + "loss": 0.6035, "step": 1395 }, { - "epoch": 0.06490496059341679, - "grad_norm": 5.282418727874756, - "learning_rate": 1.2980992118683358e-05, - "loss": 0.9191, + "epoch": 0.5209786956926226, + "grad_norm": 0.013940893113613129, + "learning_rate": 1.1011683219874324e-05, + "loss": 0.6492, "step": 1400 }, { - "epoch": 0.06490496059341679, - "eval_loss": 1.0392991304397583, - "eval_runtime": 11.2723, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, + "epoch": 0.5209786956926226, + "eval_loss": 0.644782543182373, + "eval_runtime": 26.5697, + "eval_samples_per_second": 4.78, + "eval_steps_per_second": 4.78, "step": 1400 }, { - "epoch": 0.06513676402410756, - "grad_norm": 6.327620506286621, - "learning_rate": 1.3027352804821511e-05, - "loss": 0.9029, + "epoch": 0.5228393338915248, + "grad_norm": 0.013468287885189056, + "learning_rate": 1.0947033002271001e-05, + "loss": 0.6135, "step": 1405 }, { - "epoch": 0.06536856745479833, - "grad_norm": 5.448641777038574, - "learning_rate": 1.3073713490959668e-05, - "loss": 1.0854, + "epoch": 0.5246999720904271, + "grad_norm": 0.015101495198905468, + "learning_rate": 1.0882342818661859e-05, + "loss": 0.6449, "step": 1410 }, { - "epoch": 0.0656003708854891, - "grad_norm": 6.067873001098633, - "learning_rate": 1.3120074177097822e-05, - "loss": 1.0033, + "epoch": 0.5265606102893292, + "grad_norm": 0.013072527013719082, + "learning_rate": 1.0817615399055513e-05, + "loss": 0.6252, "step": 1415 }, { - "epoch": 0.06583217431617988, - "grad_norm": 4.918057918548584, - "learning_rate": 1.3166434863235977e-05, - "loss": 1.1301, + "epoch": 0.5284212484882315, + "grad_norm": 0.01259040180593729, + "learning_rate": 1.075285347503198e-05, + "loss": 0.6274, "step": 1420 }, { - "epoch": 0.06606397774687066, - "grad_norm": 5.890987873077393, - "learning_rate": 1.3212795549374131e-05, - "loss": 0.9813, + "epoch": 0.5302818866871337, + "grad_norm": 0.014084907248616219, + "learning_rate": 1.0688059779627417e-05, + "loss": 0.6298, "step": 1425 }, { - "epoch": 0.06629578117756142, - "grad_norm": 5.978640556335449, - "learning_rate": 1.3259156235512288e-05, - "loss": 1.0217, + "epoch": 0.5321425248860359, + "grad_norm": 0.015118095092475414, + "learning_rate": 1.0623237047218771e-05, + "loss": 0.6638, "step": 1430 }, { - "epoch": 0.0665275846082522, - "grad_norm": 7.252511024475098, - "learning_rate": 1.3305516921650441e-05, - "loss": 0.9992, + "epoch": 0.5340031630849381, + "grad_norm": 0.013168774545192719, + "learning_rate": 1.0558388013408378e-05, + "loss": 0.6134, "step": 1435 }, { - "epoch": 0.06675938803894298, - "grad_norm": 5.379759788513184, - "learning_rate": 1.3351877607788597e-05, - "loss": 0.9253, + "epoch": 0.5358638012838404, + "grad_norm": 0.011875185184180737, + "learning_rate": 1.0493515414908542e-05, + "loss": 0.6396, "step": 1440 }, { - "epoch": 0.06699119146963375, - "grad_norm": 6.269161701202393, - "learning_rate": 1.339823829392675e-05, - "loss": 1.0769, + "epoch": 0.5377244394827426, + "grad_norm": 0.013676362112164497, + "learning_rate": 1.0428621989426016e-05, + "loss": 0.6286, "step": 1445 }, { - "epoch": 0.06722299490032453, - "grad_norm": 6.233831882476807, - "learning_rate": 1.3444598980064907e-05, - "loss": 1.0805, + "epoch": 0.5395850776816448, + "grad_norm": 0.012775209732353687, + "learning_rate": 1.0363710475546483e-05, + "loss": 0.6156, "step": 1450 }, { - "epoch": 0.0674547983310153, - "grad_norm": 6.35414981842041, - "learning_rate": 1.3490959666203061e-05, - "loss": 0.9839, + "epoch": 0.541445715880547, + "grad_norm": 0.014763396233320236, + "learning_rate": 1.0298783612618977e-05, + "loss": 0.6713, "step": 1455 }, { - "epoch": 0.06768660176170607, - "grad_norm": 5.051734924316406, - "learning_rate": 1.3537320352341216e-05, - "loss": 1.1025, + "epoch": 0.5433063540794493, + "grad_norm": 0.014012233354151249, + "learning_rate": 1.0233844140640287e-05, + "loss": 0.5887, "step": 1460 }, { - "epoch": 0.06791840519239685, - "grad_norm": 6.6913886070251465, - "learning_rate": 1.358368103847937e-05, - "loss": 1.2106, + "epoch": 0.5451669922783515, + "grad_norm": 0.013994456268846989, + "learning_rate": 1.0168894800139311e-05, + "loss": 0.6509, "step": 1465 }, { - "epoch": 0.06815020862308763, - "grad_norm": 5.920834064483643, - "learning_rate": 1.3630041724617527e-05, - "loss": 0.9518, + "epoch": 0.5470276304772537, + "grad_norm": 0.014792009256780148, + "learning_rate": 1.0103938332061422e-05, + "loss": 0.6434, "step": 1470 }, { - "epoch": 0.06838201205377839, - "grad_norm": 4.299671173095703, - "learning_rate": 1.367640241075568e-05, - "loss": 0.7529, + "epoch": 0.5488882686761559, + "grad_norm": 0.01255644578486681, + "learning_rate": 1.0038977477652779e-05, + "loss": 0.6407, "step": 1475 }, { - "epoch": 0.06861381548446917, - "grad_norm": 5.374516010284424, - "learning_rate": 1.3722763096893834e-05, - "loss": 1.1624, + "epoch": 0.5507489068750582, + "grad_norm": 0.013431582599878311, + "learning_rate": 9.974014978344646e-06, + "loss": 0.6528, "step": 1480 }, { - "epoch": 0.06884561891515995, - "grad_norm": 6.2488932609558105, - "learning_rate": 1.376912378303199e-05, - "loss": 1.0323, + "epoch": 0.5526095450739603, + "grad_norm": 0.0123568931594491, + "learning_rate": 9.909053575637717e-06, + "loss": 0.602, "step": 1485 }, { - "epoch": 0.06907742234585072, - "grad_norm": 6.374792575836182, - "learning_rate": 1.3815484469170145e-05, - "loss": 1.1237, + "epoch": 0.5544701832728626, + "grad_norm": 0.014569776132702827, + "learning_rate": 9.844096010986392e-06, + "loss": 0.6268, "step": 1490 }, { - "epoch": 0.06930922577654149, - "grad_norm": 5.536932468414307, - "learning_rate": 1.38618451553083e-05, - "loss": 1.1524, + "epoch": 0.5563308214717648, + "grad_norm": 0.012518184259533882, + "learning_rate": 9.779145025683114e-06, + "loss": 0.5936, "step": 1495 }, { - "epoch": 0.06954102920723226, - "grad_norm": 5.977290153503418, - "learning_rate": 1.3908205841446454e-05, - "loss": 0.9357, + "epoch": 0.5581914596706671, + "grad_norm": 0.011615007184445858, + "learning_rate": 9.714203360742666e-06, + "loss": 0.6275, "step": 1500 }, { - "epoch": 0.06954102920723226, - "eval_loss": 1.0378950834274292, - "eval_runtime": 11.2663, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, + "epoch": 0.5581914596706671, + "eval_loss": 0.6442868709564209, + "eval_runtime": 26.5597, + "eval_samples_per_second": 4.782, + "eval_steps_per_second": 4.782, "step": 1500 }, { - "epoch": 0.06977283263792304, - "grad_norm": 5.865034580230713, - "learning_rate": 1.3954566527584609e-05, - "loss": 1.0989, + "epoch": 0.5600520978695692, + "grad_norm": 0.013426556251943111, + "learning_rate": 9.649273756786486e-06, + "loss": 0.6291, "step": 1505 }, { - "epoch": 0.07000463606861382, - "grad_norm": 5.796088695526123, - "learning_rate": 1.4000927213722764e-05, - "loss": 0.9742, + "epoch": 0.5619127360684715, + "grad_norm": 0.01306986529380083, + "learning_rate": 9.584358953927043e-06, + "loss": 0.6211, "step": 1510 }, { - "epoch": 0.0702364394993046, - "grad_norm": 0.5797013640403748, - "learning_rate": 1.404728789986092e-05, - "loss": 0.9583, + "epoch": 0.5637733742673737, + "grad_norm": 0.012594708241522312, + "learning_rate": 9.519461691652169e-06, + "loss": 0.5803, "step": 1515 }, { - "epoch": 0.07046824292999536, - "grad_norm": 6.2530341148376465, - "learning_rate": 1.4093648585999073e-05, - "loss": 1.0374, + "epoch": 0.565634012466276, + "grad_norm": 0.014830299653112888, + "learning_rate": 9.454584708709462e-06, + "loss": 0.5976, "step": 1520 }, { - "epoch": 0.07070004636068614, - "grad_norm": 5.679171562194824, - "learning_rate": 1.414000927213723e-05, - "loss": 1.1167, + "epoch": 0.5674946506651781, + "grad_norm": 0.013333864510059357, + "learning_rate": 9.389730742990714e-06, + "loss": 0.6154, "step": 1525 }, { - "epoch": 0.07093184979137691, - "grad_norm": 4.820891380310059, - "learning_rate": 1.4186369958275384e-05, - "loss": 0.9248, + "epoch": 0.5693552888640804, + "grad_norm": 0.013150627724826336, + "learning_rate": 9.324902531416348e-06, + "loss": 0.581, "step": 1530 }, { - "epoch": 0.07116365322206769, - "grad_norm": 5.5291948318481445, - "learning_rate": 1.423273064441354e-05, - "loss": 1.014, + "epoch": 0.5712159270629826, + "grad_norm": 0.012874056585133076, + "learning_rate": 9.260102809819939e-06, + "loss": 0.6224, "step": 1535 }, { - "epoch": 0.07139545665275845, - "grad_norm": 5.817046165466309, - "learning_rate": 1.4279091330551693e-05, - "loss": 1.2119, + "epoch": 0.5730765652618848, + "grad_norm": 0.012678616680204868, + "learning_rate": 9.195334312832742e-06, + "loss": 0.6705, "step": 1540 }, { - "epoch": 0.07162726008344923, - "grad_norm": 5.100951194763184, - "learning_rate": 1.432545201668985e-05, - "loss": 0.8246, + "epoch": 0.574937203460787, + "grad_norm": 0.011814710684120655, + "learning_rate": 9.1305997737683e-06, + "loss": 0.6103, "step": 1545 }, { - "epoch": 0.07185906351414001, - "grad_norm": 8.208178520202637, - "learning_rate": 1.4371812702828003e-05, - "loss": 1.1136, + "epoch": 0.5767978416596893, + "grad_norm": 0.013010908849537373, + "learning_rate": 9.065901924507085e-06, + "loss": 0.655, "step": 1550 }, { - "epoch": 0.07209086694483079, - "grad_norm": 5.660605430603027, - "learning_rate": 1.4418173388966157e-05, - "loss": 0.8556, + "epoch": 0.5786584798585915, + "grad_norm": 0.013622297905385494, + "learning_rate": 9.001243495381207e-06, + "loss": 0.5961, "step": 1555 }, { - "epoch": 0.07232267037552156, - "grad_norm": 7.057506084442139, - "learning_rate": 1.4464534075104312e-05, - "loss": 1.0404, + "epoch": 0.5805191180574937, + "grad_norm": 0.013876644894480705, + "learning_rate": 8.936627215059206e-06, + "loss": 0.6789, "step": 1560 }, { - "epoch": 0.07255447380621233, - "grad_norm": 5.514895915985107, - "learning_rate": 1.4510894761242466e-05, - "loss": 0.9745, + "epoch": 0.5823797562563959, + "grad_norm": 0.01281541958451271, + "learning_rate": 8.872055810430881e-06, + "loss": 0.6567, "step": 1565 }, { - "epoch": 0.0727862772369031, - "grad_norm": 5.650478363037109, - "learning_rate": 1.4557255447380623e-05, - "loss": 0.9403, + "epoch": 0.5842403944552982, + "grad_norm": 0.012182512320578098, + "learning_rate": 8.80753200649222e-06, + "loss": 0.6338, "step": 1570 }, { - "epoch": 0.07301808066759388, - "grad_norm": 5.063937187194824, - "learning_rate": 1.4603616133518777e-05, - "loss": 1.0381, + "epoch": 0.5861010326542004, + "grad_norm": 0.012718496844172478, + "learning_rate": 8.743058526230409e-06, + "loss": 0.6151, "step": 1575 }, { - "epoch": 0.07324988409828466, - "grad_norm": 4.689387798309326, - "learning_rate": 1.4649976819656932e-05, - "loss": 0.9246, + "epoch": 0.5879616708531026, + "grad_norm": 0.01301120687276125, + "learning_rate": 8.678638090508897e-06, + "loss": 0.6147, "step": 1580 }, { - "epoch": 0.07348168752897542, - "grad_norm": 6.5369720458984375, - "learning_rate": 1.4696337505795087e-05, - "loss": 1.1306, + "epoch": 0.5898223090520048, + "grad_norm": 0.011715607717633247, + "learning_rate": 8.614273417952593e-06, + "loss": 0.5776, "step": 1585 }, { - "epoch": 0.0737134909596662, - "grad_norm": 6.205171585083008, - "learning_rate": 1.4742698191933242e-05, - "loss": 1.0382, + "epoch": 0.5916829472509071, + "grad_norm": 0.013846187852323055, + "learning_rate": 8.549967224833131e-06, + "loss": 0.6604, "step": 1590 }, { - "epoch": 0.07394529439035698, - "grad_norm": 4.940855026245117, - "learning_rate": 1.4789058878071396e-05, - "loss": 0.9282, + "epoch": 0.5935435854498092, + "grad_norm": 0.013312350027263165, + "learning_rate": 8.485722224954237e-06, + "loss": 0.6395, "step": 1595 }, { - "epoch": 0.07417709782104775, - "grad_norm": 5.930394649505615, - "learning_rate": 1.4835419564209551e-05, - "loss": 0.9031, + "epoch": 0.5954042236487115, + "grad_norm": 0.014313463121652603, + "learning_rate": 8.421541129537194e-06, + "loss": 0.6848, "step": 1600 }, { - "epoch": 0.07417709782104775, - "eval_loss": 1.041509985923767, - "eval_runtime": 11.2652, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, + "epoch": 0.5954042236487115, + "eval_loss": 0.6439012885093689, + "eval_runtime": 26.5935, + "eval_samples_per_second": 4.776, + "eval_steps_per_second": 4.776, "step": 1600 }, { - "epoch": 0.07440890125173853, - "grad_norm": 5.346956729888916, - "learning_rate": 1.4881780250347707e-05, - "loss": 0.9515, + "epoch": 0.5972648618476137, + "grad_norm": 0.012196023017168045, + "learning_rate": 8.357426647106451e-06, + "loss": 0.6079, "step": 1605 }, { - "epoch": 0.0746407046824293, - "grad_norm": 7.964709758758545, - "learning_rate": 1.4928140936485862e-05, - "loss": 1.0475, + "epoch": 0.599125500046516, + "grad_norm": 0.0135956397280097, + "learning_rate": 8.293381483375293e-06, + "loss": 0.6463, "step": 1610 }, { - "epoch": 0.07487250811312007, - "grad_norm": 5.0009331703186035, - "learning_rate": 1.4974501622624016e-05, - "loss": 1.1008, + "epoch": 0.6009861382454181, + "grad_norm": 0.011982797645032406, + "learning_rate": 8.229408341131665e-06, + "loss": 0.6113, "step": 1615 }, { - "epoch": 0.07510431154381085, - "grad_norm": 5.313481330871582, - "learning_rate": 1.5020862308762173e-05, - "loss": 0.9793, + "epoch": 0.6028467764443204, + "grad_norm": 0.014127896167337894, + "learning_rate": 8.165509920124125e-06, + "loss": 0.6602, "step": 1620 }, { - "epoch": 0.07533611497450163, - "grad_norm": 4.500079154968262, - "learning_rate": 1.5067222994900326e-05, - "loss": 1.067, + "epoch": 0.6047074146432226, + "grad_norm": 0.012588880024850368, + "learning_rate": 8.10168891694789e-06, + "loss": 0.6376, "step": 1625 }, { - "epoch": 0.07556791840519239, - "grad_norm": 5.636900901794434, - "learning_rate": 1.511358368103848e-05, - "loss": 1.081, + "epoch": 0.6065680528421249, + "grad_norm": 0.012559432536363602, + "learning_rate": 8.037948024931039e-06, + "loss": 0.6336, "step": 1630 }, { - "epoch": 0.07579972183588317, - "grad_norm": 5.819832801818848, - "learning_rate": 1.5159944367176635e-05, - "loss": 1.0421, + "epoch": 0.608428691041027, + "grad_norm": 0.012455436401069164, + "learning_rate": 7.974289934020879e-06, + "loss": 0.6403, "step": 1635 }, { - "epoch": 0.07603152526657395, - "grad_norm": 4.920659065246582, - "learning_rate": 1.5206305053314789e-05, - "loss": 0.8251, + "epoch": 0.6102893292399293, + "grad_norm": 0.01241991762071848, + "learning_rate": 7.91071733067038e-06, + "loss": 0.6518, "step": 1640 }, { - "epoch": 0.07626332869726472, - "grad_norm": 7.103638172149658, - "learning_rate": 1.5252665739452946e-05, - "loss": 1.1794, + "epoch": 0.6121499674388315, + "grad_norm": 0.012328005395829678, + "learning_rate": 7.84723289772484e-06, + "loss": 0.6162, "step": 1645 }, { - "epoch": 0.07649513212795549, - "grad_norm": 6.366265296936035, - "learning_rate": 1.52990264255911e-05, - "loss": 0.9501, + "epoch": 0.6140106056377338, + "grad_norm": 0.012782512232661247, + "learning_rate": 7.783839314308656e-06, + "loss": 0.6624, "step": 1650 }, { - "epoch": 0.07672693555864626, - "grad_norm": 7.341405868530273, - "learning_rate": 1.5345387111729255e-05, - "loss": 0.8921, + "epoch": 0.6158712438366359, + "grad_norm": 0.012050081044435501, + "learning_rate": 7.720539255712252e-06, + "loss": 0.6565, "step": 1655 }, { - "epoch": 0.07695873898933704, - "grad_norm": 8.745707511901855, - "learning_rate": 1.539174779786741e-05, - "loss": 0.9825, + "epoch": 0.6177318820355382, + "grad_norm": 0.013794245198369026, + "learning_rate": 7.657335393279179e-06, + "loss": 0.6475, "step": 1660 }, { - "epoch": 0.07719054242002782, - "grad_norm": 6.279496192932129, - "learning_rate": 1.5438108484005565e-05, - "loss": 1.1137, + "epoch": 0.6195925202344404, + "grad_norm": 0.012430761009454727, + "learning_rate": 7.594230394293404e-06, + "loss": 0.5821, "step": 1665 }, { - "epoch": 0.0774223458507186, - "grad_norm": 4.467720031738281, - "learning_rate": 1.5484469170143717e-05, - "loss": 0.8885, + "epoch": 0.6214531584333426, + "grad_norm": 0.013150406070053577, + "learning_rate": 7.531226921866715e-06, + "loss": 0.6023, "step": 1670 }, { - "epoch": 0.07765414928140936, - "grad_norm": 4.987480163574219, - "learning_rate": 1.5530829856281876e-05, - "loss": 1.022, + "epoch": 0.6233137966322448, + "grad_norm": 0.013466687873005867, + "learning_rate": 7.468327634826354e-06, + "loss": 0.637, "step": 1675 }, { - "epoch": 0.07788595271210014, - "grad_norm": 5.3170061111450195, - "learning_rate": 1.5577190542420028e-05, - "loss": 1.0107, + "epoch": 0.6251744348311471, + "grad_norm": 0.014000273309648037, + "learning_rate": 7.405535187602809e-06, + "loss": 0.6113, "step": 1680 }, { - "epoch": 0.07811775614279091, - "grad_norm": 5.079998970031738, - "learning_rate": 1.5623551228558183e-05, - "loss": 1.0645, + "epoch": 0.6270350730300494, + "grad_norm": 0.013736380264163017, + "learning_rate": 7.3428522301177894e-06, + "loss": 0.5914, "step": 1685 }, { - "epoch": 0.07834955957348169, - "grad_norm": 6.98558235168457, - "learning_rate": 1.566991191469634e-05, - "loss": 1.0992, + "epoch": 0.6288957112289515, + "grad_norm": 0.013854081742465496, + "learning_rate": 7.2802814076723896e-06, + "loss": 0.6744, "step": 1690 }, { - "epoch": 0.07858136300417246, - "grad_norm": 5.634239673614502, - "learning_rate": 1.5716272600834494e-05, - "loss": 0.8674, + "epoch": 0.6307563494278537, + "grad_norm": 0.012866591103374958, + "learning_rate": 7.217825360835475e-06, + "loss": 0.6209, "step": 1695 }, { - "epoch": 0.07881316643486323, - "grad_norm": 6.679068088531494, - "learning_rate": 1.576263328697265e-05, - "loss": 0.9038, + "epoch": 0.632616987626756, + "grad_norm": 0.012788123451173306, + "learning_rate": 7.155486725332224e-06, + "loss": 0.5764, "step": 1700 }, { - "epoch": 0.07881316643486323, - "eval_loss": 1.0476100444793701, - "eval_runtime": 11.2706, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, + "epoch": 0.632616987626756, + "eval_loss": 0.6434745192527771, + "eval_runtime": 26.5737, + "eval_samples_per_second": 4.779, + "eval_steps_per_second": 4.779, "step": 1700 }, { - "epoch": 0.07904496986555401, - "grad_norm": 4.895448207855225, - "learning_rate": 1.5808993973110804e-05, - "loss": 0.9676, + "epoch": 0.6344776258256583, + "grad_norm": 0.013442011550068855, + "learning_rate": 7.093268131932905e-06, + "loss": 0.6522, "step": 1705 }, { - "epoch": 0.07927677329624479, - "grad_norm": 5.62677001953125, - "learning_rate": 1.585535465924896e-05, - "loss": 1.1671, + "epoch": 0.6363382640245604, + "grad_norm": 0.013659958727657795, + "learning_rate": 7.03117220634187e-06, + "loss": 0.5949, "step": 1710 }, { - "epoch": 0.07950857672693556, - "grad_norm": 4.569333076477051, - "learning_rate": 1.590171534538711e-05, - "loss": 0.8581, + "epoch": 0.6381989022234626, + "grad_norm": 0.013367819599807262, + "learning_rate": 6.9692015690867135e-06, + "loss": 0.5959, "step": 1715 }, { - "epoch": 0.07974038015762633, - "grad_norm": 6.019289970397949, - "learning_rate": 1.5948076031525267e-05, - "loss": 0.9214, + "epoch": 0.6400595404223649, + "grad_norm": 0.013543561100959778, + "learning_rate": 6.9073588354077125e-06, + "loss": 0.6539, "step": 1720 }, { - "epoch": 0.0799721835883171, - "grad_norm": 5.4483537673950195, - "learning_rate": 1.5994436717663422e-05, - "loss": 1.16, + "epoch": 0.641920178621267, + "grad_norm": 0.014200146310031414, + "learning_rate": 6.845646615147445e-06, + "loss": 0.6438, "step": 1725 }, { - "epoch": 0.08020398701900788, - "grad_norm": 5.327470302581787, - "learning_rate": 1.6040797403801578e-05, - "loss": 0.8904, + "epoch": 0.6437808168201693, + "grad_norm": 0.012410931289196014, + "learning_rate": 6.784067512640666e-06, + "loss": 0.6035, "step": 1730 }, { - "epoch": 0.08043579044969866, - "grad_norm": 4.499557971954346, - "learning_rate": 1.6087158089939733e-05, - "loss": 1.0687, + "epoch": 0.6456414550190716, + "grad_norm": 0.01299245934933424, + "learning_rate": 6.7226241266043735e-06, + "loss": 0.6507, "step": 1735 }, { - "epoch": 0.08066759388038942, - "grad_norm": 4.95378303527832, - "learning_rate": 1.6133518776077888e-05, - "loss": 0.9192, + "epoch": 0.6475020932179738, + "grad_norm": 0.013046164996922016, + "learning_rate": 6.661319050028167e-06, + "loss": 0.6277, "step": 1740 }, { - "epoch": 0.0808993973110802, - "grad_norm": 5.148226737976074, - "learning_rate": 1.617987946221604e-05, - "loss": 0.9066, + "epoch": 0.649362731416876, + "grad_norm": 0.013011117465794086, + "learning_rate": 6.600154870064812e-06, + "loss": 0.6415, "step": 1745 }, { - "epoch": 0.08113120074177098, - "grad_norm": 5.059067726135254, - "learning_rate": 1.62262401483542e-05, - "loss": 1.0501, + "epoch": 0.6512233696157782, + "grad_norm": 0.012195507064461708, + "learning_rate": 6.53913416792105e-06, + "loss": 0.5718, "step": 1750 }, { - "epoch": 0.08136300417246176, - "grad_norm": 4.736515998840332, - "learning_rate": 1.627260083449235e-05, - "loss": 1.0785, + "epoch": 0.6530840078146805, + "grad_norm": 0.012789854779839516, + "learning_rate": 6.478259518748675e-06, + "loss": 0.5963, "step": 1755 }, { - "epoch": 0.08159480760315253, - "grad_norm": 5.576144695281982, - "learning_rate": 1.6318961520630506e-05, - "loss": 1.0075, + "epoch": 0.6549446460135827, + "grad_norm": 0.01217294204980135, + "learning_rate": 6.41753349153587e-06, + "loss": 0.6361, "step": 1760 }, { - "epoch": 0.0818266110338433, - "grad_norm": 4.682522773742676, - "learning_rate": 1.636532220676866e-05, - "loss": 1.0356, + "epoch": 0.6568052842124849, + "grad_norm": 0.014139696955680847, + "learning_rate": 6.356958648998762e-06, + "loss": 0.6321, "step": 1765 }, { - "epoch": 0.08205841446453407, - "grad_norm": 5.505351543426514, - "learning_rate": 1.6411682892906817e-05, - "loss": 0.8688, + "epoch": 0.6586659224113871, + "grad_norm": 0.014064906165003777, + "learning_rate": 6.296537547473302e-06, + "loss": 0.6519, "step": 1770 }, { - "epoch": 0.08229021789522485, - "grad_norm": 5.349116325378418, - "learning_rate": 1.6458043579044972e-05, - "loss": 1.0601, + "epoch": 0.6605265606102894, + "grad_norm": 0.012787656858563423, + "learning_rate": 6.236272736807378e-06, + "loss": 0.6033, "step": 1775 }, { - "epoch": 0.08252202132591563, - "grad_norm": 4.7216572761535645, - "learning_rate": 1.6504404265183127e-05, - "loss": 0.8931, + "epoch": 0.6623871988091915, + "grad_norm": 0.012475831434130669, + "learning_rate": 6.176166760253196e-06, + "loss": 0.5947, "step": 1780 }, { - "epoch": 0.08275382475660639, - "grad_norm": 6.225747585296631, - "learning_rate": 1.6550764951321283e-05, - "loss": 1.1836, + "epoch": 0.6642478370080938, + "grad_norm": 0.013439202681183815, + "learning_rate": 6.116222154359952e-06, + "loss": 0.636, "step": 1785 }, { - "epoch": 0.08298562818729717, - "grad_norm": 4.889942646026611, - "learning_rate": 1.6597125637459435e-05, - "loss": 0.9129, + "epoch": 0.666108475206996, + "grad_norm": 0.01295219361782074, + "learning_rate": 6.056441448866817e-06, + "loss": 0.6203, "step": 1790 }, { - "epoch": 0.08321743161798795, - "grad_norm": 4.839940071105957, - "learning_rate": 1.664348632359759e-05, - "loss": 0.9188, + "epoch": 0.6679691134058983, + "grad_norm": 0.012275603599846363, + "learning_rate": 5.996827166596129e-06, + "loss": 0.6743, "step": 1795 }, { - "epoch": 0.08344923504867872, - "grad_norm": 4.898179531097412, - "learning_rate": 1.6689847009735745e-05, - "loss": 1.1024, + "epoch": 0.6698297516048004, + "grad_norm": 0.01329441275447607, + "learning_rate": 5.937381823346964e-06, + "loss": 0.5975, "step": 1800 }, { - "epoch": 0.08344923504867872, - "eval_loss": 1.0464279651641846, - "eval_runtime": 11.2783, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, + "epoch": 0.6698297516048004, + "eval_loss": 0.6431623101234436, + "eval_runtime": 26.5644, + "eval_samples_per_second": 4.781, + "eval_steps_per_second": 4.781, "step": 1800 }, { - "epoch": 0.08368103847936949, - "grad_norm": 6.300966739654541, - "learning_rate": 1.67362076958739e-05, - "loss": 1.006, + "epoch": 0.6716903898037027, + "grad_norm": 0.013007073663175106, + "learning_rate": 5.878107927788962e-06, + "loss": 0.6165, "step": 1805 }, { - "epoch": 0.08391284191006027, - "grad_norm": 5.7721476554870605, - "learning_rate": 1.6782568382012056e-05, - "loss": 0.9676, + "epoch": 0.6735510280026049, + "grad_norm": 0.013040522113442421, + "learning_rate": 5.819007981356441e-06, + "loss": 0.6107, "step": 1810 }, { - "epoch": 0.08414464534075104, - "grad_norm": 5.889309406280518, - "learning_rate": 1.682892906815021e-05, - "loss": 1.0873, + "epoch": 0.6754116662015072, + "grad_norm": 0.014542641118168831, + "learning_rate": 5.760084478142842e-06, + "loss": 0.6284, "step": 1815 }, { - "epoch": 0.08437644877144182, - "grad_norm": 6.087542533874512, - "learning_rate": 1.6875289754288363e-05, - "loss": 1.129, + "epoch": 0.6772723044004093, + "grad_norm": 0.01339266262948513, + "learning_rate": 5.701339904795486e-06, + "loss": 0.6228, "step": 1820 }, { - "epoch": 0.0846082522021326, - "grad_norm": 5.6989545822143555, - "learning_rate": 1.692165044042652e-05, - "loss": 1.0761, + "epoch": 0.6791329425993116, + "grad_norm": 0.013091943226754665, + "learning_rate": 5.642776740410618e-06, + "loss": 0.5995, "step": 1825 }, { - "epoch": 0.08484005563282336, - "grad_norm": 6.22220516204834, - "learning_rate": 1.6968011126564674e-05, - "loss": 1.1475, + "epoch": 0.6809935807982138, + "grad_norm": 0.013321259059011936, + "learning_rate": 5.584397456428785e-06, + "loss": 0.627, "step": 1830 }, { - "epoch": 0.08507185906351414, - "grad_norm": 5.317427635192871, - "learning_rate": 1.701437181270283e-05, - "loss": 0.9838, + "epoch": 0.682854218997116, + "grad_norm": 0.012838364578783512, + "learning_rate": 5.5262045165305615e-06, + "loss": 0.658, "step": 1835 }, { - "epoch": 0.08530366249420492, - "grad_norm": 4.751984119415283, - "learning_rate": 1.7060732498840984e-05, - "loss": 1.0279, + "epoch": 0.6847148571960182, + "grad_norm": 0.01306592021137476, + "learning_rate": 5.468200376532552e-06, + "loss": 0.6756, "step": 1840 }, { - "epoch": 0.08553546592489569, - "grad_norm": 6.037285804748535, - "learning_rate": 1.710709318497914e-05, - "loss": 1.0937, + "epoch": 0.6865754953949205, + "grad_norm": 0.014314945787191391, + "learning_rate": 5.410387484283767e-06, + "loss": 0.6598, "step": 1845 }, { - "epoch": 0.08576726935558646, - "grad_norm": 4.684612274169922, - "learning_rate": 1.7153453871117295e-05, - "loss": 0.9422, + "epoch": 0.6884361335938227, + "grad_norm": 0.013973386958241463, + "learning_rate": 5.352768279562315e-06, + "loss": 0.6535, "step": 1850 }, { - "epoch": 0.08599907278627723, - "grad_norm": 4.810619831085205, - "learning_rate": 1.719981455725545e-05, - "loss": 0.9663, + "epoch": 0.6902967717927249, + "grad_norm": 0.012864621356129646, + "learning_rate": 5.295345193972445e-06, + "loss": 0.6422, "step": 1855 }, { - "epoch": 0.08623087621696801, - "grad_norm": 5.514533042907715, - "learning_rate": 1.7246175243393602e-05, - "loss": 1.0154, + "epoch": 0.6921574099916271, + "grad_norm": 0.014392906799912453, + "learning_rate": 5.238120650841925e-06, + "loss": 0.664, "step": 1860 }, { - "epoch": 0.08646267964765879, - "grad_norm": 6.909518718719482, - "learning_rate": 1.7292535929531757e-05, - "loss": 1.0768, + "epoch": 0.6940180481905294, + "grad_norm": 0.01327193807810545, + "learning_rate": 5.18109706511978e-06, + "loss": 0.6179, "step": 1865 }, { - "epoch": 0.08669448307834957, - "grad_norm": 4.488744258880615, - "learning_rate": 1.7338896615669913e-05, - "loss": 1.0446, + "epoch": 0.6958786863894316, + "grad_norm": 0.01355487760156393, + "learning_rate": 5.124276843274372e-06, + "loss": 0.605, "step": 1870 }, { - "epoch": 0.08692628650904033, - "grad_norm": 4.692446708679199, - "learning_rate": 1.7385257301808068e-05, - "loss": 1.0563, + "epoch": 0.6977393245883338, + "grad_norm": 0.012646518647670746, + "learning_rate": 5.067662383191845e-06, + "loss": 0.6608, "step": 1875 }, { - "epoch": 0.0871580899397311, - "grad_norm": 5.291960716247559, - "learning_rate": 1.7431617987946223e-05, - "loss": 1.0662, + "epoch": 0.699599962787236, + "grad_norm": 0.013363865204155445, + "learning_rate": 5.011256074074945e-06, + "loss": 0.6975, "step": 1880 }, { - "epoch": 0.08738989337042188, - "grad_norm": 5.532649517059326, - "learning_rate": 1.747797867408438e-05, - "loss": 1.1875, + "epoch": 0.7014606009861383, + "grad_norm": 0.013032359071075916, + "learning_rate": 4.955060296342163e-06, + "loss": 0.6041, "step": 1885 }, { - "epoch": 0.08762169680111266, - "grad_norm": 5.574441909790039, - "learning_rate": 1.7524339360222534e-05, - "loss": 1.1473, + "epoch": 0.7033212391850404, + "grad_norm": 0.012049228884279728, + "learning_rate": 4.899077421527304e-06, + "loss": 0.6078, "step": 1890 }, { - "epoch": 0.08785350023180342, - "grad_norm": 6.044393539428711, - "learning_rate": 1.7570700046360686e-05, - "loss": 1.1049, + "epoch": 0.7051818773839427, + "grad_norm": 0.013416007161140442, + "learning_rate": 4.843309812179405e-06, + "loss": 0.6514, "step": 1895 }, { - "epoch": 0.0880853036624942, - "grad_norm": 4.733377456665039, - "learning_rate": 1.7617060732498845e-05, - "loss": 0.9169, + "epoch": 0.7070425155828449, + "grad_norm": 0.013701760210096836, + "learning_rate": 4.787759821763017e-06, + "loss": 0.6606, "step": 1900 }, { - "epoch": 0.0880853036624942, - "eval_loss": 1.0482640266418457, - "eval_runtime": 11.2598, - "eval_samples_per_second": 11.279, - "eval_steps_per_second": 11.279, + "epoch": 0.7070425155828449, + "eval_loss": 0.6429811716079712, + "eval_runtime": 26.563, + "eval_samples_per_second": 4.781, + "eval_steps_per_second": 4.781, "step": 1900 }, { - "epoch": 0.08831710709318498, - "grad_norm": 6.185121536254883, - "learning_rate": 1.7663421418636996e-05, - "loss": 0.8974, + "epoch": 0.7089031537817472, + "grad_norm": 0.013226517476141453, + "learning_rate": 4.732429794558887e-06, + "loss": 0.6391, "step": 1905 }, { - "epoch": 0.08854891052387576, - "grad_norm": 4.563868999481201, - "learning_rate": 1.7709782104775152e-05, - "loss": 0.9664, + "epoch": 0.7107637919806493, + "grad_norm": 0.012964668683707714, + "learning_rate": 4.677322065565039e-06, + "loss": 0.6768, "step": 1910 }, { - "epoch": 0.08878071395456653, - "grad_norm": 4.728658676147461, - "learning_rate": 1.7756142790913307e-05, - "loss": 0.9351, + "epoch": 0.7126244301795516, + "grad_norm": 0.01224041823297739, + "learning_rate": 4.622438960398234e-06, + "loss": 0.645, "step": 1915 }, { - "epoch": 0.0890125173852573, - "grad_norm": 5.058661460876465, - "learning_rate": 1.7802503477051462e-05, - "loss": 1.0738, + "epoch": 0.7144850683784538, + "grad_norm": 0.013205330818891525, + "learning_rate": 4.567782795195816e-06, + "loss": 0.6051, "step": 1920 }, { - "epoch": 0.08924432081594807, - "grad_norm": 5.769558906555176, - "learning_rate": 1.7848864163189618e-05, - "loss": 0.8531, + "epoch": 0.7163457065773561, + "grad_norm": 0.01323428563773632, + "learning_rate": 4.5133558765179576e-06, + "loss": 0.6113, "step": 1925 }, { - "epoch": 0.08947612424663885, - "grad_norm": 5.377395153045654, - "learning_rate": 1.7895224849327773e-05, - "loss": 0.958, + "epoch": 0.7182063447762582, + "grad_norm": 0.012786868028342724, + "learning_rate": 4.459160501250358e-06, + "loss": 0.6677, "step": 1930 }, { - "epoch": 0.08970792767732963, - "grad_norm": 4.951549530029297, - "learning_rate": 1.7941585535465925e-05, - "loss": 0.9338, + "epoch": 0.7200669829751605, + "grad_norm": 0.010466697625815868, + "learning_rate": 4.405198956507272e-06, + "loss": 0.586, "step": 1935 }, { - "epoch": 0.0899397311080204, - "grad_norm": 4.409064769744873, - "learning_rate": 1.798794622160408e-05, - "loss": 0.9537, + "epoch": 0.7219276211740627, + "grad_norm": 0.012410039082169533, + "learning_rate": 4.35147351953501e-06, + "loss": 0.6111, "step": 1940 }, { - "epoch": 0.09017153453871117, - "grad_norm": 6.281864166259766, - "learning_rate": 1.8034306907742236e-05, - "loss": 1.0452, + "epoch": 0.723788259372965, + "grad_norm": 0.013155271299183369, + "learning_rate": 4.297986457615836e-06, + "loss": 0.6404, "step": 1945 }, { - "epoch": 0.09040333796940195, - "grad_norm": 5.833890438079834, - "learning_rate": 1.808066759388039e-05, - "loss": 1.3076, + "epoch": 0.7256488975718671, + "grad_norm": 0.01472384575754404, + "learning_rate": 4.244740027972275e-06, + "loss": 0.6553, "step": 1950 }, { - "epoch": 0.09063514140009273, - "grad_norm": 4.426032066345215, - "learning_rate": 1.8127028280018546e-05, - "loss": 1.0576, + "epoch": 0.7275095357707694, + "grad_norm": 0.012972739525139332, + "learning_rate": 4.191736477671864e-06, + "loss": 0.6613, "step": 1955 }, { - "epoch": 0.09086694483078349, - "grad_norm": 4.635809898376465, - "learning_rate": 1.81733889661567e-05, - "loss": 0.8691, + "epoch": 0.7293701739696716, + "grad_norm": 0.013699792325496674, + "learning_rate": 4.138978043532332e-06, + "loss": 0.6178, "step": 1960 }, { - "epoch": 0.09109874826147427, - "grad_norm": 5.994819641113281, - "learning_rate": 1.8219749652294857e-05, - "loss": 1.1195, + "epoch": 0.7312308121685738, + "grad_norm": 0.012072841636836529, + "learning_rate": 4.086466952027171e-06, + "loss": 0.5865, "step": 1965 }, { - "epoch": 0.09133055169216504, - "grad_norm": 5.217397689819336, - "learning_rate": 1.826611033843301e-05, - "loss": 0.9707, + "epoch": 0.733091450367476, + "grad_norm": 0.013510013930499554, + "learning_rate": 4.034205419191709e-06, + "loss": 0.6387, "step": 1970 }, { - "epoch": 0.09156235512285582, - "grad_norm": 5.001087665557861, - "learning_rate": 1.8312471024571164e-05, - "loss": 0.9911, + "epoch": 0.7349520885663783, + "grad_norm": 0.01405192632228136, + "learning_rate": 3.982195650529583e-06, + "loss": 0.5677, "step": 1975 }, { - "epoch": 0.0917941585535466, - "grad_norm": 4.645782470703125, - "learning_rate": 1.835883171070932e-05, - "loss": 0.8967, + "epoch": 0.7368127267652805, + "grad_norm": 0.012013067491352558, + "learning_rate": 3.930439840919652e-06, + "loss": 0.5868, "step": 1980 }, { - "epoch": 0.09202596198423736, - "grad_norm": 5.851307392120361, - "learning_rate": 1.8405192396847475e-05, - "loss": 1.0551, + "epoch": 0.7386733649641827, + "grad_norm": 0.01283415500074625, + "learning_rate": 3.878940174523371e-06, + "loss": 0.597, "step": 1985 }, { - "epoch": 0.09225776541492814, - "grad_norm": 5.592276096343994, - "learning_rate": 1.845155308298563e-05, - "loss": 1.1209, + "epoch": 0.7405340031630849, + "grad_norm": 0.013413486070930958, + "learning_rate": 3.827698824692643e-06, + "loss": 0.6074, "step": 1990 }, { - "epoch": 0.09248956884561892, - "grad_norm": 5.643247604370117, - "learning_rate": 1.8497913769123785e-05, - "loss": 1.1347, + "epoch": 0.7423946413619872, + "grad_norm": 0.013137887232005596, + "learning_rate": 3.776717953878064e-06, + "loss": 0.6599, "step": 1995 }, { - "epoch": 0.0927213722763097, - "grad_norm": 6.246877670288086, - "learning_rate": 1.854427445526194e-05, - "loss": 0.9848, + "epoch": 0.7442552795608894, + "grad_norm": 0.013435564935207367, + "learning_rate": 3.725999713537689e-06, + "loss": 0.6191, "step": 2000 }, { - "epoch": 0.0927213722763097, - "eval_loss": 1.0543617010116577, - "eval_runtime": 11.2709, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, + "epoch": 0.7442552795608894, + "eval_loss": 0.6428595185279846, + "eval_runtime": 26.6061, + "eval_samples_per_second": 4.773, + "eval_steps_per_second": 4.773, "step": 2000 }, { - "epoch": 0.09295317570700046, - "grad_norm": 6.111345291137695, - "learning_rate": 1.8590635141400096e-05, - "loss": 1.1706, + "epoch": 0.7461159177597916, + "grad_norm": 0.013417122885584831, + "learning_rate": 3.6755462440462288e-06, + "loss": 0.6012, "step": 2005 }, { - "epoch": 0.09318497913769123, - "grad_norm": 4.53964900970459, - "learning_rate": 1.8636995827538248e-05, - "loss": 1.0575, + "epoch": 0.7479765559586938, + "grad_norm": 0.01378430612385273, + "learning_rate": 3.625359674604725e-06, + "loss": 0.607, "step": 2010 }, { - "epoch": 0.09341678256838201, - "grad_norm": 5.4626922607421875, - "learning_rate": 1.8683356513676406e-05, - "loss": 1.0724, + "epoch": 0.7498371941575961, + "grad_norm": 0.012614194303750992, + "learning_rate": 3.5754421231506953e-06, + "loss": 0.6364, "step": 2015 }, { - "epoch": 0.09364858599907279, - "grad_norm": 5.114150047302246, - "learning_rate": 1.872971719981456e-05, - "loss": 0.9194, + "epoch": 0.7516978323564982, + "grad_norm": 0.013113941997289658, + "learning_rate": 3.5257956962687545e-06, + "loss": 0.6148, "step": 2020 }, { - "epoch": 0.09388038942976357, - "grad_norm": 5.33641242980957, - "learning_rate": 1.8776077885952714e-05, - "loss": 1.2183, + "epoch": 0.7535584705554005, + "grad_norm": 0.012877865694463253, + "learning_rate": 3.476422489101713e-06, + "loss": 0.6369, "step": 2025 }, { - "epoch": 0.09411219286045433, - "grad_norm": 5.04241418838501, - "learning_rate": 1.882243857209087e-05, - "loss": 1.046, + "epoch": 0.7554191087543027, + "grad_norm": 0.013343026861548424, + "learning_rate": 3.427324585262156e-06, + "loss": 0.6257, "step": 2030 }, { - "epoch": 0.09434399629114511, - "grad_norm": 5.5946431159973145, - "learning_rate": 1.886879925822902e-05, - "loss": 1.12, + "epoch": 0.757279746953205, + "grad_norm": 0.01313395518809557, + "learning_rate": 3.3785040567445282e-06, + "loss": 0.6301, "step": 2035 }, { - "epoch": 0.09457579972183588, - "grad_norm": 4.880713939666748, - "learning_rate": 1.891515994436718e-05, - "loss": 0.9931, + "epoch": 0.7591403851521071, + "grad_norm": 0.013584131374955177, + "learning_rate": 3.329962963837661e-06, + "loss": 0.6244, "step": 2040 }, { - "epoch": 0.09480760315252666, - "grad_norm": 5.912775993347168, - "learning_rate": 1.896152063050533e-05, - "loss": 0.9105, + "epoch": 0.7610010233510094, + "grad_norm": 0.011685586534440517, + "learning_rate": 3.281703355037854e-06, + "loss": 0.5818, "step": 2045 }, { - "epoch": 0.09503940658321743, - "grad_norm": 5.8085198402404785, - "learning_rate": 1.9007881316643487e-05, - "loss": 1.1281, + "epoch": 0.7628616615499116, + "grad_norm": 0.013454140163958073, + "learning_rate": 3.233727266962425e-06, + "loss": 0.6151, "step": 2050 }, { - "epoch": 0.0952712100139082, - "grad_norm": 5.165389060974121, - "learning_rate": 1.9054242002781642e-05, - "loss": 0.9402, + "epoch": 0.7647222997488139, + "grad_norm": 0.012276149354875088, + "learning_rate": 3.186036724263748e-06, + "loss": 0.621, "step": 2055 }, { - "epoch": 0.09550301344459898, - "grad_norm": 4.676323890686035, - "learning_rate": 1.9100602688919797e-05, - "loss": 1.0616, + "epoch": 0.766582937947716, + "grad_norm": 0.013796020299196243, + "learning_rate": 3.138633739543805e-06, + "loss": 0.6466, "step": 2060 }, { - "epoch": 0.09573481687528976, - "grad_norm": 4.0118088722229, - "learning_rate": 1.9146963375057953e-05, - "loss": 0.8473, + "epoch": 0.7684435761466183, + "grad_norm": 0.013075289316475391, + "learning_rate": 3.0915203132692805e-06, + "loss": 0.6116, "step": 2065 }, { - "epoch": 0.09596662030598054, - "grad_norm": 5.034766674041748, - "learning_rate": 1.9193324061196108e-05, - "loss": 1.167, + "epoch": 0.7703042143455205, + "grad_norm": 0.014096383936703205, + "learning_rate": 3.0446984336871144e-06, + "loss": 0.5877, "step": 2070 }, { - "epoch": 0.0961984237366713, - "grad_norm": 5.5457763671875, - "learning_rate": 1.9239684747334263e-05, - "loss": 1.3238, + "epoch": 0.7721648525444227, + "grad_norm": 0.013074836693704128, + "learning_rate": 2.998170076740601e-06, + "loss": 0.5829, "step": 2075 }, { - "epoch": 0.09643022716736208, - "grad_norm": 5.167206764221191, - "learning_rate": 1.928604543347242e-05, - "loss": 1.2318, + "epoch": 0.7740254907433249, + "grad_norm": 0.01671191304922104, + "learning_rate": 2.951937205986004e-06, + "loss": 0.6439, "step": 2080 }, { - "epoch": 0.09666203059805285, - "grad_norm": 5.160457134246826, - "learning_rate": 1.933240611961057e-05, - "loss": 1.0846, + "epoch": 0.7758861289422272, + "grad_norm": 0.013207321055233479, + "learning_rate": 2.9060017725096943e-06, + "loss": 0.621, "step": 2085 }, { - "epoch": 0.09689383402874363, - "grad_norm": 6.138757705688477, - "learning_rate": 1.937876680574873e-05, - "loss": 0.84, + "epoch": 0.7777467671411294, + "grad_norm": 0.014768741093575954, + "learning_rate": 2.8603657148458053e-06, + "loss": 0.6272, "step": 2090 }, { - "epoch": 0.0971256374594344, - "grad_norm": 5.376162052154541, - "learning_rate": 1.942512749188688e-05, - "loss": 0.8957, + "epoch": 0.7796074053400316, + "grad_norm": 0.015293040312826633, + "learning_rate": 2.8150309588944304e-06, + "loss": 0.6388, "step": 2095 }, { - "epoch": 0.09735744089012517, - "grad_norm": 6.612864971160889, - "learning_rate": 1.9471488178025037e-05, - "loss": 0.983, + "epoch": 0.7814680435389338, + "grad_norm": 0.01271377969533205, + "learning_rate": 2.769999417840341e-06, + "loss": 0.6249, "step": 2100 }, { - "epoch": 0.09735744089012517, - "eval_loss": 1.0578044652938843, - "eval_runtime": 11.2841, - "eval_samples_per_second": 11.255, - "eval_steps_per_second": 11.255, + "epoch": 0.7814680435389338, + "eval_loss": 0.6426796913146973, + "eval_runtime": 26.6067, + "eval_samples_per_second": 4.773, + "eval_steps_per_second": 4.773, "step": 2100 }, { - "epoch": 0.09758924432081595, - "grad_norm": 4.397307395935059, - "learning_rate": 1.9517848864163192e-05, - "loss": 0.9275, + "epoch": 0.7833286817378361, + "grad_norm": 0.013834511861205101, + "learning_rate": 2.7252729920722564e-06, + "loss": 0.6254, "step": 2105 }, { - "epoch": 0.09782104775150673, - "grad_norm": 5.631065368652344, - "learning_rate": 1.9564209550301344e-05, - "loss": 1.1583, + "epoch": 0.7851893199367384, + "grad_norm": 0.020366957411170006, + "learning_rate": 2.680853569102633e-06, + "loss": 0.632, "step": 2110 }, { - "epoch": 0.09805285118219749, - "grad_norm": 4.9445600509643555, - "learning_rate": 1.9610570236439502e-05, - "loss": 1.1222, + "epoch": 0.7870499581356405, + "grad_norm": 0.012656195089221, + "learning_rate": 2.6367430234880286e-06, + "loss": 0.6274, "step": 2115 }, { - "epoch": 0.09828465461288827, - "grad_norm": 5.1628265380859375, - "learning_rate": 1.9656930922577654e-05, - "loss": 1.2511, + "epoch": 0.7889105963345427, + "grad_norm": 0.01308165118098259, + "learning_rate": 2.5929432167499658e-06, + "loss": 0.6457, "step": 2120 }, { - "epoch": 0.09851645804357904, - "grad_norm": 4.909499168395996, - "learning_rate": 1.970329160871581e-05, - "loss": 1.0582, + "epoch": 0.790771234533445, + "grad_norm": 0.012838170863687992, + "learning_rate": 2.5494559972963928e-06, + "loss": 0.6436, "step": 2125 }, { - "epoch": 0.09874826147426982, - "grad_norm": 6.675722599029541, - "learning_rate": 1.9749652294853965e-05, - "loss": 1.1201, + "epoch": 0.7926318727323471, + "grad_norm": 0.012966095469892025, + "learning_rate": 2.5062832003436833e-06, + "loss": 0.6449, "step": 2130 }, { - "epoch": 0.0989800649049606, - "grad_norm": 4.577010154724121, - "learning_rate": 1.979601298099212e-05, - "loss": 0.8593, + "epoch": 0.7944925109312494, + "grad_norm": 0.013781096786260605, + "learning_rate": 2.463426647839173e-06, + "loss": 0.584, "step": 2135 }, { - "epoch": 0.09921186833565136, - "grad_norm": 5.183391571044922, - "learning_rate": 1.9842373667130276e-05, - "loss": 0.8958, + "epoch": 0.7963531491301516, + "grad_norm": 0.012445746921002865, + "learning_rate": 2.420888148384265e-06, + "loss": 0.6397, "step": 2140 }, { - "epoch": 0.09944367176634214, - "grad_norm": 4.433844089508057, - "learning_rate": 1.988873435326843e-05, - "loss": 1.0494, + "epoch": 0.7982137873290539, + "grad_norm": 0.012599524110555649, + "learning_rate": 2.378669497158138e-06, + "loss": 0.5974, "step": 2145 }, { - "epoch": 0.09967547519703292, - "grad_norm": 4.70166015625, - "learning_rate": 1.9935095039406583e-05, - "loss": 1.0033, + "epoch": 0.800074425527956, + "grad_norm": 0.011499716900289059, + "learning_rate": 2.3367724758419495e-06, + "loss": 0.5552, "step": 2150 }, { - "epoch": 0.0999072786277237, - "grad_norm": 5.131468772888184, - "learning_rate": 1.998145572554474e-05, - "loss": 1.0701, + "epoch": 0.8019350637268583, + "grad_norm": 0.012771397829055786, + "learning_rate": 2.2951988525436695e-06, + "loss": 0.666, "step": 2155 }, { - "epoch": 0.10013908205841446, - "grad_norm": 5.585656642913818, - "learning_rate": 1.999999882150718e-05, - "loss": 1.1144, + "epoch": 0.8037957019257606, + "grad_norm": 0.013215843588113785, + "learning_rate": 2.2539503817234553e-06, + "loss": 0.5925, "step": 2160 }, { - "epoch": 0.10037088548910524, - "grad_norm": 4.321651935577393, - "learning_rate": 1.999999161960761e-05, - "loss": 1.0022, + "epoch": 0.8056563401246628, + "grad_norm": 0.014011417515575886, + "learning_rate": 2.2130288041196135e-06, + "loss": 0.6216, "step": 2165 }, { - "epoch": 0.10060268891979601, - "grad_norm": 4.363278388977051, - "learning_rate": 1.9999977870531412e-05, - "loss": 0.8248, + "epoch": 0.807516978323565, + "grad_norm": 0.014169846661388874, + "learning_rate": 2.1724358466751394e-06, + "loss": 0.625, "step": 2170 }, { - "epoch": 0.10083449235048679, - "grad_norm": 4.530508041381836, - "learning_rate": 1.9999957574287587e-05, - "loss": 0.8732, + "epoch": 0.8093776165224672, + "grad_norm": 0.013908619992434978, + "learning_rate": 2.132173222464834e-06, + "loss": 0.6641, "step": 2175 }, { - "epoch": 0.10106629578117757, - "grad_norm": 5.886618137359619, - "learning_rate": 1.9999930730889426e-05, - "loss": 1.0238, + "epoch": 0.8112382547213695, + "grad_norm": 0.01255475077778101, + "learning_rate": 2.092242630623016e-06, + "loss": 0.6135, "step": 2180 }, { - "epoch": 0.10129809921186833, - "grad_norm": 5.83167028427124, - "learning_rate": 1.99998973403545e-05, - "loss": 1.1206, + "epoch": 0.8130988929202716, + "grad_norm": 0.01320129819214344, + "learning_rate": 2.0526457562718074e-06, + "loss": 0.5893, "step": 2185 }, { - "epoch": 0.10152990264255911, - "grad_norm": 4.87026834487915, - "learning_rate": 1.9999857402704675e-05, - "loss": 1.1022, + "epoch": 0.8149595311191739, + "grad_norm": 0.012163982726633549, + "learning_rate": 2.013384270450036e-06, + "loss": 0.6552, "step": 2190 }, { - "epoch": 0.10176170607324989, - "grad_norm": 5.260519504547119, - "learning_rate": 1.9999810917966097e-05, - "loss": 0.8485, + "epoch": 0.8168201693180761, + "grad_norm": 0.012990654446184635, + "learning_rate": 1.974459830042691e-06, + "loss": 0.6309, "step": 2195 }, { - "epoch": 0.10199350950394066, - "grad_norm": 5.333953857421875, - "learning_rate": 1.99997578861692e-05, - "loss": 1.1512, + "epoch": 0.8186808075169784, + "grad_norm": 0.015373739413917065, + "learning_rate": 1.9358740777110154e-06, + "loss": 0.6761, "step": 2200 }, { - "epoch": 0.10199350950394066, - "eval_loss": 1.0549308061599731, - "eval_runtime": 11.2621, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, + "epoch": 0.8186808075169784, + "eval_loss": 0.6425639390945435, + "eval_runtime": 26.5817, + "eval_samples_per_second": 4.778, + "eval_steps_per_second": 4.778, "step": 2200 }, { - "epoch": 0.10222531293463143, - "grad_norm": 4.999898910522461, - "learning_rate": 1.9999698307348702e-05, - "loss": 1.0005, + "epoch": 0.8205414457158805, + "grad_norm": 0.012748058885335922, + "learning_rate": 1.8976286418231916e-06, + "loss": 0.6313, "step": 2205 }, { - "epoch": 0.1024571163653222, - "grad_norm": 5.3397932052612305, - "learning_rate": 1.9999632181543614e-05, - "loss": 1.048, + "epoch": 0.8224020839147828, + "grad_norm": 0.014108446426689625, + "learning_rate": 1.8597251363856061e-06, + "loss": 0.5997, "step": 2210 }, { - "epoch": 0.10268891979601298, - "grad_norm": 4.57288122177124, - "learning_rate": 1.9999559508797233e-05, - "loss": 1.0049, + "epoch": 0.824262722113685, + "grad_norm": 0.0133186811581254, + "learning_rate": 1.8221651609747337e-06, + "loss": 0.6732, "step": 2215 }, { - "epoch": 0.10292072322670376, - "grad_norm": 4.630931854248047, - "learning_rate": 1.9999480289157135e-05, - "loss": 0.9457, + "epoch": 0.8261233603125873, + "grad_norm": 0.012968887574970722, + "learning_rate": 1.7849503006696566e-06, + "loss": 0.6129, "step": 2220 }, { - "epoch": 0.10315252665739454, - "grad_norm": 5.614922046661377, - "learning_rate": 1.9999394522675186e-05, - "loss": 1.082, + "epoch": 0.8279839985114894, + "grad_norm": 0.012473355047404766, + "learning_rate": 1.7480821259851488e-06, + "loss": 0.6131, "step": 2225 }, { - "epoch": 0.1033843300880853, - "grad_norm": 5.246216297149658, - "learning_rate": 1.999930220940754e-05, - "loss": 0.9773, + "epoch": 0.8298446367103917, + "grad_norm": 0.012497167102992535, + "learning_rate": 1.7115621928054105e-06, + "loss": 0.6199, "step": 2230 }, { - "epoch": 0.10361613351877608, - "grad_norm": 4.738265514373779, - "learning_rate": 1.9999203349414635e-05, - "loss": 0.9073, + "epoch": 0.8317052749092939, + "grad_norm": 0.014741111546754837, + "learning_rate": 1.6753920423184022e-06, + "loss": 0.6159, "step": 2235 }, { - "epoch": 0.10384793694946685, - "grad_norm": 5.955806255340576, - "learning_rate": 1.9999097942761197e-05, - "loss": 0.9572, + "epoch": 0.8335659131081962, + "grad_norm": 0.013057067058980465, + "learning_rate": 1.6395732009508058e-06, + "loss": 0.5898, "step": 2240 }, { - "epoch": 0.10407974038015763, - "grad_norm": 5.307494163513184, - "learning_rate": 1.999898598951624e-05, - "loss": 1.1539, + "epoch": 0.8354265513070983, + "grad_norm": 0.014299440197646618, + "learning_rate": 1.60410718030361e-06, + "loss": 0.6486, "step": 2245 }, { - "epoch": 0.1043115438108484, - "grad_norm": 5.4604172706604, - "learning_rate": 1.9998867489753057e-05, - "loss": 1.1064, + "epoch": 0.8372871895060006, + "grad_norm": 0.013178296387195587, + "learning_rate": 1.568995477088323e-06, + "loss": 0.6254, "step": 2250 }, { - "epoch": 0.10454334724153917, - "grad_norm": 5.4470319747924805, - "learning_rate": 1.9998742443549237e-05, - "loss": 1.1014, + "epoch": 0.8391478277049028, + "grad_norm": 0.013072814792394638, + "learning_rate": 1.5342395730637904e-06, + "loss": 0.6385, "step": 2255 }, { - "epoch": 0.10477515067222995, - "grad_norm": 4.63338565826416, - "learning_rate": 1.999861085098665e-05, - "loss": 1.0084, + "epoch": 0.841008465903805, + "grad_norm": 0.014187455177307129, + "learning_rate": 1.4998409349736841e-06, + "loss": 0.6458, "step": 2260 }, { - "epoch": 0.10500695410292073, - "grad_norm": 4.390408039093018, - "learning_rate": 1.9998472712151446e-05, - "loss": 1.0617, + "epoch": 0.8428691041027072, + "grad_norm": 0.011844666674733162, + "learning_rate": 1.4658010144846001e-06, + "loss": 0.6556, "step": 2265 }, { - "epoch": 0.10523875753361149, - "grad_norm": 5.576091766357422, - "learning_rate": 1.999832802713407e-05, - "loss": 1.1452, + "epoch": 0.8447297423016095, + "grad_norm": 0.013700997456908226, + "learning_rate": 1.432121248124786e-06, + "loss": 0.6381, "step": 2270 }, { - "epoch": 0.10547056096430227, - "grad_norm": 5.20993185043335, - "learning_rate": 1.9998176796029253e-05, - "loss": 1.0616, + "epoch": 0.8465903805005117, + "grad_norm": 0.014204096049070358, + "learning_rate": 1.3988030572235212e-06, + "loss": 0.624, "step": 2275 }, { - "epoch": 0.10570236439499305, - "grad_norm": 4.806192874908447, - "learning_rate": 1.9998019018936007e-05, - "loss": 1.0339, + "epoch": 0.8484510186994139, + "grad_norm": 0.013039126060903072, + "learning_rate": 1.3658478478511416e-06, + "loss": 0.593, "step": 2280 }, { - "epoch": 0.10593416782568382, - "grad_norm": 4.287393569946289, - "learning_rate": 1.999785469595763e-05, - "loss": 0.9953, + "epoch": 0.8503116568983161, + "grad_norm": 0.014153816737234592, + "learning_rate": 1.333257010759702e-06, + "loss": 0.5909, "step": 2285 }, { - "epoch": 0.1061659712563746, - "grad_norm": 5.184103488922119, - "learning_rate": 1.999768382720171e-05, - "loss": 0.9038, + "epoch": 0.8521722950972184, + "grad_norm": 0.012623702175915241, + "learning_rate": 1.3010319213242762e-06, + "loss": 0.6006, "step": 2290 }, { - "epoch": 0.10639777468706536, - "grad_norm": 6.341686725616455, - "learning_rate": 1.9997506412780117e-05, - "loss": 1.1514, + "epoch": 0.8540329332961206, + "grad_norm": 0.013684497214853764, + "learning_rate": 1.2691739394849089e-06, + "loss": 0.6112, "step": 2295 }, { - "epoch": 0.10662957811775614, - "grad_norm": 5.4436750411987305, - "learning_rate": 1.9997322452809003e-05, - "loss": 1.1623, + "epoch": 0.8558935714950228, + "grad_norm": 0.012923210859298706, + "learning_rate": 1.2376844096892526e-06, + "loss": 0.6239, "step": 2300 }, { - "epoch": 0.10662957811775614, - "eval_loss": 1.0587830543518066, - "eval_runtime": 11.261, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, + "epoch": 0.8558935714950228, + "eval_loss": 0.6425209641456604, + "eval_runtime": 26.5582, + "eval_samples_per_second": 4.782, + "eval_steps_per_second": 4.782, "step": 2300 }, { - "epoch": 0.10686138154844692, - "grad_norm": 5.4230427742004395, - "learning_rate": 1.9997131947408818e-05, - "loss": 0.967, + "epoch": 0.857754209693925, + "grad_norm": 0.01325127761811018, + "learning_rate": 1.2065646608357972e-06, + "loss": 0.6537, "step": 2305 }, { - "epoch": 0.1070931849791377, - "grad_norm": 4.601907253265381, - "learning_rate": 1.9996934896704282e-05, - "loss": 1.1419, + "epoch": 0.8596148478928273, + "grad_norm": 0.01215402316302061, + "learning_rate": 1.1758160062178093e-06, + "loss": 0.634, "step": 2310 }, { - "epoch": 0.10732498840982846, - "grad_norm": 4.844226837158203, - "learning_rate": 1.999673130082441e-05, - "loss": 0.9171, + "epoch": 0.8614754860917294, + "grad_norm": 0.015918321907520294, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.6296, "step": 2315 }, { - "epoch": 0.10755679184051924, - "grad_norm": 5.482775688171387, - "learning_rate": 1.9996521159902502e-05, - "loss": 1.1142, + "epoch": 0.8633361242906317, + "grad_norm": 0.011971482075750828, + "learning_rate": 1.1154371545032738e-06, + "loss": 0.5983, "step": 2320 }, { - "epoch": 0.10778859527121001, - "grad_norm": 5.575770854949951, - "learning_rate": 1.999630447407614e-05, - "loss": 1.0038, + "epoch": 0.8651967624895339, + "grad_norm": 0.01542737614363432, + "learning_rate": 1.0858095054716111e-06, + "loss": 0.6468, "step": 2325 }, { - "epoch": 0.10802039870190079, - "grad_norm": 4.347164630889893, - "learning_rate": 1.9996081243487194e-05, - "loss": 0.9837, + "epoch": 0.8670574006884362, + "grad_norm": 0.012823596596717834, + "learning_rate": 1.0565580466976566e-06, + "loss": 0.6222, "step": 2330 }, { - "epoch": 0.10825220213259157, - "grad_norm": 5.454910755157471, - "learning_rate": 1.999585146828181e-05, - "loss": 1.11, + "epoch": 0.8689180388873383, + "grad_norm": 0.012822597287595272, + "learning_rate": 1.027684012630441e-06, + "loss": 0.6385, "step": 2335 }, { - "epoch": 0.10848400556328233, - "grad_norm": 4.132719039916992, - "learning_rate": 1.9995615148610436e-05, - "loss": 0.8614, + "epoch": 0.8707786770862406, + "grad_norm": 0.014365943148732185, + "learning_rate": 9.991886217911851e-07, + "loss": 0.6541, "step": 2340 }, { - "epoch": 0.10871580899397311, - "grad_norm": 4.780175685882568, - "learning_rate": 1.999537228462779e-05, - "loss": 0.9514, + "epoch": 0.8726393152851428, + "grad_norm": 0.013635417446494102, + "learning_rate": 9.710730767218913e-07, + "loss": 0.6323, "step": 2345 }, { - "epoch": 0.10894761242466389, - "grad_norm": 5.469396114349365, - "learning_rate": 1.9995122876492873e-05, - "loss": 0.8916, + "epoch": 0.8744999534840451, + "grad_norm": 0.012839280068874359, + "learning_rate": 9.433385639345705e-07, + "loss": 0.5944, "step": 2350 }, { - "epoch": 0.10917941585535466, - "grad_norm": 5.702508449554443, - "learning_rate": 1.9994866924368988e-05, - "loss": 1.1915, + "epoch": 0.8763605916829472, + "grad_norm": 0.0141257019713521, + "learning_rate": 9.159862538611908e-07, + "loss": 0.6632, "step": 2355 }, { - "epoch": 0.10941121928604543, - "grad_norm": 4.359443187713623, - "learning_rate": 1.9994604428423706e-05, - "loss": 0.9538, + "epoch": 0.8782212298818495, + "grad_norm": 0.013571621850132942, + "learning_rate": 8.890173008042768e-07, + "loss": 0.6086, "step": 2360 }, { - "epoch": 0.1096430227167362, - "grad_norm": 5.570619106292725, - "learning_rate": 1.999433538882889e-05, - "loss": 0.9006, + "epoch": 0.8800818680807517, + "grad_norm": 0.01433682069182396, + "learning_rate": 8.624328428881945e-07, + "loss": 0.6482, "step": 2365 }, { - "epoch": 0.10987482614742698, - "grad_norm": 4.556485176086426, - "learning_rate": 1.999405980576068e-05, - "loss": 1.0132, + "epoch": 0.8819425062796539, + "grad_norm": 0.014090972021222115, + "learning_rate": 8.36234002011117e-07, + "loss": 0.6335, "step": 2370 }, { - "epoch": 0.11010662957811776, - "grad_norm": 4.931366443634033, - "learning_rate": 1.999377767939951e-05, - "loss": 0.9837, + "epoch": 0.8838031444785561, + "grad_norm": 0.013004067353904247, + "learning_rate": 8.10421883797694e-07, + "loss": 0.6145, "step": 2375 }, { - "epoch": 0.11033843300880854, - "grad_norm": 5.512187480926514, - "learning_rate": 1.9993489009930097e-05, - "loss": 0.9279, + "epoch": 0.8856637826774584, + "grad_norm": 0.01391025260090828, + "learning_rate": 7.849975775523777e-07, + "loss": 0.6415, "step": 2380 }, { - "epoch": 0.1105702364394993, - "grad_norm": 5.395881652832031, - "learning_rate": 1.999319379754143e-05, - "loss": 1.0187, + "epoch": 0.8875244208763606, + "grad_norm": 0.012305272743105888, + "learning_rate": 7.599621562134596e-07, + "loss": 0.6462, "step": 2385 }, { - "epoch": 0.11080203987019008, - "grad_norm": 5.412979602813721, - "learning_rate": 1.9992892042426794e-05, - "loss": 1.1533, + "epoch": 0.8893850590752628, + "grad_norm": 0.013947544619441032, + "learning_rate": 7.35316676307789e-07, + "loss": 0.6266, "step": 2390 }, { - "epoch": 0.11103384330088086, - "grad_norm": 4.9955735206604, - "learning_rate": 1.9992583744783754e-05, - "loss": 1.0025, + "epoch": 0.891245697274165, + "grad_norm": 0.016055511310696602, + "learning_rate": 7.110621779061889e-07, + "loss": 0.6501, "step": 2395 }, { - "epoch": 0.11126564673157163, - "grad_norm": 5.231233596801758, - "learning_rate": 1.9992268904814154e-05, - "loss": 1.0544, + "epoch": 0.8931063354730673, + "grad_norm": 0.01238927897065878, + "learning_rate": 6.871996845795581e-07, + "loss": 0.5895, "step": 2400 }, { - "epoch": 0.11126564673157163, - "eval_loss": 1.052207589149475, - "eval_runtime": 11.2784, - "eval_samples_per_second": 11.26, - "eval_steps_per_second": 11.26, + "epoch": 0.8931063354730673, + "eval_loss": 0.6424703598022461, + "eval_runtime": 26.6049, + "eval_samples_per_second": 4.774, + "eval_steps_per_second": 4.774, "step": 2400 }, { - "epoch": 0.1114974501622624, - "grad_norm": 4.9348015785217285, - "learning_rate": 1.999194752272413e-05, - "loss": 1.1822, + "epoch": 0.8949669736719695, + "grad_norm": 0.012343904934823513, + "learning_rate": 6.637302033556891e-07, + "loss": 0.6105, "step": 2405 }, { - "epoch": 0.11172925359295317, - "grad_norm": 4.779728412628174, - "learning_rate": 1.9991619598724096e-05, - "loss": 1.08, + "epoch": 0.8968276118708717, + "grad_norm": 0.01289752684533596, + "learning_rate": 6.40654724676748e-07, + "loss": 0.6265, "step": 2410 }, { - "epoch": 0.11196105702364395, - "grad_norm": 4.476454734802246, - "learning_rate": 1.999128513302875e-05, - "loss": 1.0141, + "epoch": 0.8986882500697739, + "grad_norm": 0.013334677554666996, + "learning_rate": 6.179742223574936e-07, + "loss": 0.6261, "step": 2415 }, { - "epoch": 0.11219286045433473, - "grad_norm": 4.818485736846924, - "learning_rate": 1.999094412585707e-05, - "loss": 0.9972, + "epoch": 0.9005488882686762, + "grad_norm": 0.013512643985450268, + "learning_rate": 5.956896535441803e-07, + "loss": 0.5797, "step": 2420 }, { - "epoch": 0.11242466388502549, - "grad_norm": 4.220043182373047, - "learning_rate": 1.9990596577432325e-05, - "loss": 0.9463, + "epoch": 0.9024095264675783, + "grad_norm": 0.01403987966477871, + "learning_rate": 5.738019586741573e-07, + "loss": 0.616, "step": 2425 }, { - "epoch": 0.11265646731571627, - "grad_norm": 4.722184181213379, - "learning_rate": 1.9990242487982056e-05, - "loss": 1.0425, + "epoch": 0.9042701646664806, + "grad_norm": 0.013224196620285511, + "learning_rate": 5.523120614361821e-07, + "loss": 0.608, "step": 2430 }, { - "epoch": 0.11288827074640705, - "grad_norm": 4.847781658172607, - "learning_rate": 1.9989881857738093e-05, - "loss": 1.0659, + "epoch": 0.9061308028653828, + "grad_norm": 0.01317799836397171, + "learning_rate": 5.312208687314502e-07, + "loss": 0.6206, "step": 2435 }, { - "epoch": 0.11312007417709782, - "grad_norm": 4.4079813957214355, - "learning_rate": 1.9989514686936545e-05, - "loss": 1.0153, + "epoch": 0.9079914410642851, + "grad_norm": 0.01357815321534872, + "learning_rate": 5.105292706353093e-07, + "loss": 0.6759, "step": 2440 }, { - "epoch": 0.1133518776077886, - "grad_norm": 3.875089168548584, - "learning_rate": 1.9989140975817813e-05, - "loss": 0.9605, + "epoch": 0.9098520792631872, + "grad_norm": 0.012360199354588985, + "learning_rate": 4.902381403597046e-07, + "loss": 0.6182, "step": 2445 }, { - "epoch": 0.11358368103847936, - "grad_norm": 4.662908554077148, - "learning_rate": 1.9988760724626564e-05, - "loss": 1.0402, + "epoch": 0.9117127174620895, + "grad_norm": 0.012978832237422466, + "learning_rate": 4.703483342163262e-07, + "loss": 0.5892, "step": 2450 }, { - "epoch": 0.11381548446917014, - "grad_norm": 5.138006687164307, - "learning_rate": 1.998837393361176e-05, - "loss": 1.1952, + "epoch": 0.9135733556609917, + "grad_norm": 0.013323403894901276, + "learning_rate": 4.5086069158047143e-07, + "loss": 0.636, "step": 2455 }, { - "epoch": 0.11404728789986092, - "grad_norm": 5.774982452392578, - "learning_rate": 1.998798060302664e-05, - "loss": 0.9564, + "epoch": 0.915433993859894, + "grad_norm": 0.013127562589943409, + "learning_rate": 4.3177603485562327e-07, + "loss": 0.5847, "step": 2460 }, { - "epoch": 0.1142790913305517, - "grad_norm": 4.6387553215026855, - "learning_rate": 1.998758073312872e-05, - "loss": 0.9307, + "epoch": 0.9172946320587961, + "grad_norm": 0.012560434639453888, + "learning_rate": 4.1309516943874196e-07, + "loss": 0.6073, "step": 2465 }, { - "epoch": 0.11451089476124246, - "grad_norm": 4.885598659515381, - "learning_rate": 1.9987174324179814e-05, - "loss": 0.8062, + "epoch": 0.9191552702576984, + "grad_norm": 0.01229447964578867, + "learning_rate": 3.9481888368627764e-07, + "loss": 0.5846, "step": 2470 }, { - "epoch": 0.11474269819193324, - "grad_norm": 5.628473281860352, - "learning_rate": 1.9986761376445992e-05, - "loss": 1.1763, + "epoch": 0.9210159084566006, + "grad_norm": 0.012554515153169632, + "learning_rate": 3.7694794888090025e-07, + "loss": 0.6186, "step": 2475 }, { - "epoch": 0.11497450162262401, - "grad_norm": 5.386045932769775, - "learning_rate": 1.9986341890197622e-05, - "loss": 0.9168, + "epoch": 0.9228765466555028, + "grad_norm": 0.011958773247897625, + "learning_rate": 3.594831191989523e-07, + "loss": 0.6217, "step": 2480 }, { - "epoch": 0.11520630505331479, - "grad_norm": 4.536678791046143, - "learning_rate": 1.9985915865709352e-05, - "loss": 1.0792, + "epoch": 0.924737184854405, + "grad_norm": 0.01367896143347025, + "learning_rate": 3.424251316786165e-07, + "loss": 0.6572, "step": 2485 }, { - "epoch": 0.11543810848400557, - "grad_norm": 5.205998420715332, - "learning_rate": 1.998548330326011e-05, - "loss": 1.1665, + "epoch": 0.9265978230533073, + "grad_norm": 0.013063084334135056, + "learning_rate": 3.2577470618881726e-07, + "loss": 0.5973, "step": 2490 }, { - "epoch": 0.11566991191469633, - "grad_norm": 3.94810152053833, - "learning_rate": 1.9985044203133096e-05, - "loss": 1.0869, + "epoch": 0.9284584612522095, + "grad_norm": 0.012596881948411465, + "learning_rate": 3.095325453988385e-07, + "loss": 0.6153, "step": 2495 }, { - "epoch": 0.11590171534538711, - "grad_norm": 4.870349407196045, - "learning_rate": 1.99845985656158e-05, - "loss": 1.0749, + "epoch": 0.9303190994511117, + "grad_norm": 0.013333328068256378, + "learning_rate": 2.9369933474867496e-07, + "loss": 0.6337, "step": 2500 }, { - "epoch": 0.11590171534538711, - "eval_loss": 1.0539507865905762, - "eval_runtime": 11.2662, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, + "epoch": 0.9303190994511117, + "eval_loss": 0.6424322128295898, + "eval_runtime": 26.5954, + "eval_samples_per_second": 4.775, + "eval_steps_per_second": 4.775, "step": 2500 }, { - "epoch": 0.11613351877607789, - "grad_norm": 4.539355278015137, - "learning_rate": 1.9984146390999993e-05, - "loss": 0.9024, + "epoch": 0.9321797376500139, + "grad_norm": 0.012263627722859383, + "learning_rate": 2.7827574242009434e-07, + "loss": 0.6258, "step": 2505 }, { - "epoch": 0.11636532220676866, - "grad_norm": 5.9270219802856445, - "learning_rate": 1.9983687679581717e-05, - "loss": 1.0351, + "epoch": 0.9340403758489162, + "grad_norm": 0.01295017171651125, + "learning_rate": 2.632624193084499e-07, + "loss": 0.6071, "step": 2510 }, { - "epoch": 0.11659712563745943, - "grad_norm": 5.80509614944458, - "learning_rate": 1.99832224316613e-05, - "loss": 0.9956, + "epoch": 0.9359010140478184, + "grad_norm": 0.013952870853245258, + "learning_rate": 2.48659998995211e-07, + "loss": 0.6594, "step": 2515 }, { - "epoch": 0.1168289290681502, - "grad_norm": 4.957856178283691, - "learning_rate": 1.9982750647543347e-05, - "loss": 1.0678, + "epoch": 0.9377616522467206, + "grad_norm": 0.01351676881313324, + "learning_rate": 2.344690977212205e-07, + "loss": 0.6072, "step": 2520 }, { - "epoch": 0.11706073249884098, - "grad_norm": 5.07832670211792, - "learning_rate": 1.9982272327536748e-05, - "loss": 1.209, + "epoch": 0.9396222904456228, + "grad_norm": 0.012661050073802471, + "learning_rate": 2.2069031436068643e-07, + "loss": 0.6152, "step": 2525 }, { - "epoch": 0.11729253592953176, - "grad_norm": 5.460022926330566, - "learning_rate": 1.9981787471954663e-05, - "loss": 0.8889, + "epoch": 0.9414829286445251, + "grad_norm": 0.013936568051576614, + "learning_rate": 2.0732423039591998e-07, + "loss": 0.6149, "step": 2530 }, { - "epoch": 0.11752433936022254, - "grad_norm": 5.039451599121094, - "learning_rate": 1.9981296081114535e-05, - "loss": 1.1051, + "epoch": 0.9433435668434274, + "grad_norm": 0.012601389549672604, + "learning_rate": 1.9437140989278624e-07, + "loss": 0.6205, "step": 2535 }, { - "epoch": 0.1177561427909133, - "grad_norm": 5.151773452758789, - "learning_rate": 1.9980798155338092e-05, - "loss": 1.0199, + "epoch": 0.9452042050423295, + "grad_norm": 0.011978083290159702, + "learning_rate": 1.8183239947690112e-07, + "loss": 0.6252, "step": 2540 }, { - "epoch": 0.11798794622160408, - "grad_norm": 5.761076927185059, - "learning_rate": 1.9980293694951333e-05, - "loss": 1.0285, + "epoch": 0.9470648432412317, + "grad_norm": 0.012075323611497879, + "learning_rate": 1.6970772831056637e-07, + "loss": 0.6438, "step": 2545 }, { - "epoch": 0.11821974965229486, - "grad_norm": 5.472416400909424, - "learning_rate": 1.9979782700284532e-05, - "loss": 0.9648, + "epoch": 0.948925481440134, + "grad_norm": 0.012837120331823826, + "learning_rate": 1.5799790807043857e-07, + "loss": 0.6304, "step": 2550 }, { - "epoch": 0.11845155308298563, - "grad_norm": 3.8476479053497314, - "learning_rate": 1.997926517167225e-05, - "loss": 1.0118, + "epoch": 0.9507861196390361, + "grad_norm": 0.012269029393792152, + "learning_rate": 1.467034329259287e-07, + "loss": 0.5938, "step": 2555 }, { - "epoch": 0.1186833565136764, - "grad_norm": 4.49735164642334, - "learning_rate": 1.997874110945333e-05, - "loss": 1.0989, + "epoch": 0.9526467578379384, + "grad_norm": 0.01306453812867403, + "learning_rate": 1.358247795183587e-07, + "loss": 0.5981, "step": 2560 }, { - "epoch": 0.11891515994436717, - "grad_norm": 3.996696710586548, - "learning_rate": 1.9978210513970877e-05, - "loss": 0.8726, + "epoch": 0.9545073960368406, + "grad_norm": 0.012171389535069466, + "learning_rate": 1.2536240694083658e-07, + "loss": 0.5599, "step": 2565 }, { - "epoch": 0.11914696337505795, - "grad_norm": 4.320407390594482, - "learning_rate": 1.9977673385572284e-05, - "loss": 0.9728, + "epoch": 0.9563680342357429, + "grad_norm": 0.013338044285774231, + "learning_rate": 1.1531675671888621e-07, + "loss": 0.6366, "step": 2570 }, { - "epoch": 0.11937876680574873, - "grad_norm": 7.207583427429199, - "learning_rate": 1.9977129724609214e-05, - "loss": 1.1305, + "epoch": 0.958228672434645, + "grad_norm": 0.012387475930154324, + "learning_rate": 1.0568825279181572e-07, + "loss": 0.6376, "step": 2575 }, { - "epoch": 0.11961057023643949, - "grad_norm": 5.2364821434021, - "learning_rate": 1.9976579531437616e-05, - "loss": 1.0501, + "epoch": 0.9600893106335473, + "grad_norm": 0.013054094277322292, + "learning_rate": 9.647730149482614e-08, + "loss": 0.6386, "step": 2580 }, { - "epoch": 0.11984237366713027, - "grad_norm": 4.274256706237793, - "learning_rate": 1.9976022806417714e-05, - "loss": 1.0153, + "epoch": 0.9619499488324496, + "grad_norm": 0.01319506112486124, + "learning_rate": 8.768429154185853e-08, + "loss": 0.6067, "step": 2585 }, { - "epoch": 0.12007417709782105, - "grad_norm": 5.584084510803223, - "learning_rate": 1.9975459549914e-05, - "loss": 0.974, + "epoch": 0.9638105870313518, + "grad_norm": 0.013576803728938103, + "learning_rate": 7.930959400919924e-08, + "loss": 0.6258, "step": 2590 }, { - "epoch": 0.12030598052851182, - "grad_norm": 5.529228687286377, - "learning_rate": 1.9974889762295254e-05, - "loss": 1.0517, + "epoch": 0.965671225230254, + "grad_norm": 0.013341645710170269, + "learning_rate": 7.135356231981028e-08, + "loss": 0.6344, "step": 2595 }, { - "epoch": 0.1205377839592026, - "grad_norm": 4.252645015716553, - "learning_rate": 1.997431344393452e-05, - "loss": 1.0758, + "epoch": 0.9675318634291562, + "grad_norm": 0.013685373589396477, + "learning_rate": 6.381653222842011e-08, + "loss": 0.6144, "step": 2600 }, { - "epoch": 0.1205377839592026, - "eval_loss": 1.0526373386383057, - "eval_runtime": 11.2657, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, + "epoch": 0.9675318634291562, + "eval_loss": 0.642424464225769, + "eval_runtime": 26.5716, + "eval_samples_per_second": 4.78, + "eval_steps_per_second": 4.78, "step": 2600 }, { - "epoch": 0.12076958738989337, - "grad_norm": 4.629552364349365, - "learning_rate": 1.9973730595209134e-05, - "loss": 1.0771, + "epoch": 0.9693925016280585, + "grad_norm": 0.018206071108579636, + "learning_rate": 5.6698821807354975e-08, + "loss": 0.5716, "step": 2605 }, { - "epoch": 0.12100139082058414, - "grad_norm": 4.153298377990723, - "learning_rate": 1.9973141216500685e-05, - "loss": 1.1439, + "epoch": 0.9712531398269606, + "grad_norm": 0.014082850888371468, + "learning_rate": 5.000073143310969e-08, + "loss": 0.641, "step": 2610 }, { - "epoch": 0.12123319425127492, - "grad_norm": 4.761916637420654, - "learning_rate": 1.997254530819506e-05, - "loss": 1.0997, + "epoch": 0.9731137780258629, + "grad_norm": 0.01310745719820261, + "learning_rate": 4.3722543773681016e-08, + "loss": 0.6206, "step": 2615 }, { - "epoch": 0.1214649976819657, - "grad_norm": 6.3712544441223145, - "learning_rate": 1.9971942870682404e-05, - "loss": 1.0405, + "epoch": 0.9749744162247651, + "grad_norm": 0.013652559369802475, + "learning_rate": 3.7864523776628414e-08, + "loss": 0.6405, "step": 2620 }, { - "epoch": 0.12169680111265646, - "grad_norm": 5.3124518394470215, - "learning_rate": 1.9971333904357152e-05, - "loss": 1.1737, + "epoch": 0.9768350544236674, + "grad_norm": 0.01430275198072195, + "learning_rate": 3.242691865790071e-08, + "loss": 0.6191, "step": 2625 }, { - "epoch": 0.12192860454334724, - "grad_norm": 4.761479377746582, - "learning_rate": 1.9970718409618e-05, - "loss": 1.0439, + "epoch": 0.9786956926225695, + "grad_norm": 0.013149100355803967, + "learning_rate": 2.7409957891397775e-08, + "loss": 0.6676, "step": 2630 }, { - "epoch": 0.12216040797403802, - "grad_norm": 5.314613342285156, - "learning_rate": 1.9970096386867922e-05, - "loss": 1.0017, + "epoch": 0.9805563308214718, + "grad_norm": 0.01429106667637825, + "learning_rate": 2.2813853199292745e-08, + "loss": 0.656, "step": 2635 }, { - "epoch": 0.12239221140472879, - "grad_norm": 5.509566307067871, - "learning_rate": 1.996946783651417e-05, - "loss": 1.0059, + "epoch": 0.982416969020374, + "grad_norm": 0.012532561086118221, + "learning_rate": 1.8638798543090253e-08, + "loss": 0.6065, "step": 2640 }, { - "epoch": 0.12262401483541957, - "grad_norm": 5.131368637084961, - "learning_rate": 1.996883275896827e-05, - "loss": 0.8546, + "epoch": 0.9842776072192763, + "grad_norm": 0.01296111661940813, + "learning_rate": 1.4884970115444097e-08, + "loss": 0.604, "step": 2645 }, { - "epoch": 0.12285581826611033, - "grad_norm": 4.782067775726318, - "learning_rate": 1.9968191154646012e-05, - "loss": 0.9647, + "epoch": 0.9861382454181784, + "grad_norm": 0.012804310768842697, + "learning_rate": 1.1552526332723191e-08, + "loss": 0.6236, "step": 2650 }, { - "epoch": 0.12308762169680111, - "grad_norm": 5.031082630157471, - "learning_rate": 1.996754302396747e-05, - "loss": 1.0906, + "epoch": 0.9879988836170807, + "grad_norm": 0.012863220646977425, + "learning_rate": 8.641607828324682e-09, + "loss": 0.6245, "step": 2655 }, { - "epoch": 0.12331942512749189, - "grad_norm": 4.008341312408447, - "learning_rate": 1.996688836735699e-05, - "loss": 1.0645, + "epoch": 0.9898595218159829, + "grad_norm": 0.013312343508005142, + "learning_rate": 6.152337446736489e-09, + "loss": 0.6561, "step": 2660 }, { - "epoch": 0.12355122855818267, - "grad_norm": 4.894102096557617, - "learning_rate": 1.9966227185243184e-05, - "loss": 0.9666, + "epoch": 0.991720160014885, + "grad_norm": 0.011470803059637547, + "learning_rate": 4.0848202383581e-09, + "loss": 0.5847, "step": 2665 }, { - "epoch": 0.12378303198887343, - "grad_norm": 4.749420166015625, - "learning_rate": 1.996555947805894e-05, - "loss": 1.2034, + "epoch": 0.9935807982137873, + "grad_norm": 0.013984150253236294, + "learning_rate": 2.4391434550652403e-09, + "loss": 0.5881, "step": 2670 }, { - "epoch": 0.1240148354195642, - "grad_norm": 5.051076889038086, - "learning_rate": 1.9964885246241417e-05, - "loss": 0.9919, + "epoch": 0.9954414364126896, + "grad_norm": 0.012489933520555496, + "learning_rate": 1.2153765465250378e-09, + "loss": 0.6653, "step": 2675 }, { - "epoch": 0.12424663885025498, - "grad_norm": 5.225306987762451, - "learning_rate": 1.996420449023205e-05, - "loss": 0.9872, + "epoch": 0.9973020746115918, + "grad_norm": 0.012180610559880733, + "learning_rate": 4.1357115726947674e-10, + "loss": 0.629, "step": 2680 }, { - "epoch": 0.12447844228094576, - "grad_norm": 4.617201328277588, - "learning_rate": 1.9963517210476543e-05, - "loss": 1.0147, + "epoch": 0.999162712810494, + "grad_norm": 0.014413848519325256, + "learning_rate": 3.376112451158875e-11, + "loss": 0.6525, "step": 2685 }, { - "epoch": 0.12471024571163654, - "grad_norm": 4.746542453765869, - "learning_rate": 1.9962823407424868e-05, - "loss": 0.8903, - "step": 2690 - }, - { - "epoch": 0.1249420491423273, - "grad_norm": 4.461585521697998, - "learning_rate": 1.9962123081531268e-05, - "loss": 1.0642, - "step": 2695 - }, - { - "epoch": 0.12517385257301808, - "grad_norm": 4.527534008026123, - "learning_rate": 1.996141623325426e-05, - "loss": 1.001, - "step": 2700 - }, - { - "epoch": 0.12517385257301808, - "eval_loss": 1.0466318130493164, - "eval_runtime": 11.2744, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 2700 - }, - { - "epoch": 0.12540565600370884, - "grad_norm": 5.0327982902526855, - "learning_rate": 1.996070286305664e-05, - "loss": 0.9662, - "step": 2705 - }, - { - "epoch": 0.12563745943439963, - "grad_norm": 4.870184421539307, - "learning_rate": 1.9959982971405454e-05, - "loss": 0.9642, - "step": 2710 - }, - { - "epoch": 0.1258692628650904, - "grad_norm": 4.229808807373047, - "learning_rate": 1.995925655877203e-05, - "loss": 1.1574, - "step": 2715 - }, - { - "epoch": 0.1261010662957812, - "grad_norm": 4.645317077636719, - "learning_rate": 1.9958523625631964e-05, - "loss": 0.8886, - "step": 2720 - }, - { - "epoch": 0.12633286972647195, - "grad_norm": 4.935416221618652, - "learning_rate": 1.9957784172465124e-05, - "loss": 0.9621, - "step": 2725 - }, - { - "epoch": 0.12656467315716272, - "grad_norm": 4.857961654663086, - "learning_rate": 1.9957038199755637e-05, - "loss": 1.0029, - "step": 2730 - }, - { - "epoch": 0.1267964765878535, - "grad_norm": 5.052342891693115, - "learning_rate": 1.9956285707991913e-05, - "loss": 1.0644, - "step": 2735 - }, - { - "epoch": 0.12702828001854427, - "grad_norm": 4.7845354080200195, - "learning_rate": 1.9955526697666615e-05, - "loss": 1.0014, - "step": 2740 - }, - { - "epoch": 0.12726008344923506, - "grad_norm": 5.164909362792969, - "learning_rate": 1.9954761169276686e-05, - "loss": 1.0553, - "step": 2745 - }, - { - "epoch": 0.12749188687992583, - "grad_norm": 4.757340431213379, - "learning_rate": 1.9953989123323327e-05, - "loss": 1.1321, - "step": 2750 - }, - { - "epoch": 0.1277236903106166, - "grad_norm": 5.6578688621521, - "learning_rate": 1.995321056031201e-05, - "loss": 0.9902, - "step": 2755 - }, - { - "epoch": 0.12795549374130738, - "grad_norm": 4.892259120941162, - "learning_rate": 1.9952425480752483e-05, - "loss": 1.061, - "step": 2760 - }, - { - "epoch": 0.12818729717199814, - "grad_norm": 5.429172992706299, - "learning_rate": 1.9951633885158745e-05, - "loss": 1.0976, - "step": 2765 - }, - { - "epoch": 0.1284191006026889, - "grad_norm": 3.9769465923309326, - "learning_rate": 1.9950835774049066e-05, - "loss": 0.9902, - "step": 2770 - }, - { - "epoch": 0.1286509040333797, - "grad_norm": 5.092761993408203, - "learning_rate": 1.995003114794599e-05, - "loss": 1.114, - "step": 2775 - }, - { - "epoch": 0.12888270746407046, - "grad_norm": 5.163008689880371, - "learning_rate": 1.9949220007376312e-05, - "loss": 1.2063, - "step": 2780 - }, - { - "epoch": 0.12911451089476125, - "grad_norm": 4.68748140335083, - "learning_rate": 1.994840235287111e-05, - "loss": 1.1674, - "step": 2785 - }, - { - "epoch": 0.12934631432545202, - "grad_norm": 5.405124664306641, - "learning_rate": 1.9947578184965712e-05, - "loss": 1.0991, - "step": 2790 - }, - { - "epoch": 0.12957811775614278, - "grad_norm": 5.273200511932373, - "learning_rate": 1.9946747504199718e-05, - "loss": 0.8909, - "step": 2795 - }, - { - "epoch": 0.12980992118683357, - "grad_norm": 4.340039253234863, - "learning_rate": 1.9945910311116986e-05, - "loss": 0.9397, - "step": 2800 - }, - { - "epoch": 0.12980992118683357, - "eval_loss": 1.0486783981323242, - "eval_runtime": 11.2737, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 2800 - }, - { - "epoch": 0.13004172461752433, - "grad_norm": 4.297397136688232, - "learning_rate": 1.9945066606265645e-05, - "loss": 0.9343, - "step": 2805 - }, - { - "epoch": 0.13027352804821513, - "grad_norm": 4.575728416442871, - "learning_rate": 1.9944216390198082e-05, - "loss": 0.9489, - "step": 2810 - }, - { - "epoch": 0.1305053314789059, - "grad_norm": 5.547889232635498, - "learning_rate": 1.9943359663470954e-05, - "loss": 1.0477, - "step": 2815 - }, - { - "epoch": 0.13073713490959665, - "grad_norm": 5.181078910827637, - "learning_rate": 1.994249642664517e-05, - "loss": 1.0363, - "step": 2820 - }, - { - "epoch": 0.13096893834028744, - "grad_norm": 4.966522693634033, - "learning_rate": 1.9941626680285906e-05, - "loss": 1.0665, - "step": 2825 - }, - { - "epoch": 0.1312007417709782, - "grad_norm": 6.055009841918945, - "learning_rate": 1.9940750424962603e-05, - "loss": 0.9614, - "step": 2830 - }, - { - "epoch": 0.13143254520166897, - "grad_norm": 4.3711628913879395, - "learning_rate": 1.9939867661248964e-05, - "loss": 1.0237, - "step": 2835 - }, - { - "epoch": 0.13166434863235976, - "grad_norm": 4.811582565307617, - "learning_rate": 1.993897838972295e-05, - "loss": 0.9741, - "step": 2840 - }, - { - "epoch": 0.13189615206305053, - "grad_norm": 4.890336036682129, - "learning_rate": 1.9938082610966775e-05, - "loss": 0.9146, - "step": 2845 - }, - { - "epoch": 0.13212795549374132, - "grad_norm": 4.723270416259766, - "learning_rate": 1.9937180325566934e-05, - "loss": 1.1937, - "step": 2850 - }, - { - "epoch": 0.13235975892443208, - "grad_norm": 3.966965675354004, - "learning_rate": 1.9936271534114157e-05, - "loss": 0.9725, - "step": 2855 - }, - { - "epoch": 0.13259156235512284, - "grad_norm": 5.717922687530518, - "learning_rate": 1.9935356237203457e-05, - "loss": 1.0297, - "step": 2860 - }, - { - "epoch": 0.13282336578581364, - "grad_norm": 5.139970302581787, - "learning_rate": 1.9934434435434095e-05, - "loss": 1.0494, - "step": 2865 - }, - { - "epoch": 0.1330551692165044, - "grad_norm": 4.475926399230957, - "learning_rate": 1.9933506129409583e-05, - "loss": 0.928, - "step": 2870 - }, - { - "epoch": 0.1332869726471952, - "grad_norm": 4.684488296508789, - "learning_rate": 1.9932571319737706e-05, - "loss": 1.1672, - "step": 2875 - }, - { - "epoch": 0.13351877607788595, - "grad_norm": 4.571727275848389, - "learning_rate": 1.99316300070305e-05, - "loss": 0.9693, - "step": 2880 - }, - { - "epoch": 0.13375057950857672, - "grad_norm": 3.749621629714966, - "learning_rate": 1.9930682191904262e-05, - "loss": 0.8496, - "step": 2885 - }, - { - "epoch": 0.1339823829392675, - "grad_norm": 4.513935565948486, - "learning_rate": 1.9929727874979536e-05, - "loss": 1.1165, - "step": 2890 - }, - { - "epoch": 0.13421418636995827, - "grad_norm": 4.019639492034912, - "learning_rate": 1.9928767056881137e-05, - "loss": 1.0072, - "step": 2895 - }, - { - "epoch": 0.13444598980064906, - "grad_norm": 5.7689948081970215, - "learning_rate": 1.9927799738238128e-05, - "loss": 1.0774, - "step": 2900 - }, - { - "epoch": 0.13444598980064906, - "eval_loss": 1.0431135892868042, - "eval_runtime": 11.2686, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 2900 - }, - { - "epoch": 0.13467779323133983, - "grad_norm": 4.070760726928711, - "learning_rate": 1.9926825919683836e-05, - "loss": 0.9854, - "step": 2905 - }, - { - "epoch": 0.1349095966620306, - "grad_norm": 4.487087249755859, - "learning_rate": 1.992584560185583e-05, - "loss": 0.9863, - "step": 2910 - }, - { - "epoch": 0.13514140009272138, - "grad_norm": 4.818826675415039, - "learning_rate": 1.9924858785395943e-05, - "loss": 0.9824, - "step": 2915 - }, - { - "epoch": 0.13537320352341214, - "grad_norm": 4.67568826675415, - "learning_rate": 1.9923865470950266e-05, - "loss": 1.0129, - "step": 2920 - }, - { - "epoch": 0.1356050069541029, - "grad_norm": 4.589109897613525, - "learning_rate": 1.9922865659169133e-05, - "loss": 1.073, - "step": 2925 - }, - { - "epoch": 0.1358368103847937, - "grad_norm": 4.670289039611816, - "learning_rate": 1.992185935070715e-05, - "loss": 1.058, - "step": 2930 - }, - { - "epoch": 0.13606861381548446, - "grad_norm": 4.4648542404174805, - "learning_rate": 1.9920846546223155e-05, - "loss": 0.8634, - "step": 2935 - }, - { - "epoch": 0.13630041724617525, - "grad_norm": 5.334418296813965, - "learning_rate": 1.9919827246380254e-05, - "loss": 0.9094, - "step": 2940 - }, - { - "epoch": 0.13653222067686602, - "grad_norm": 4.683016777038574, - "learning_rate": 1.9918801451845798e-05, - "loss": 0.9466, - "step": 2945 - }, - { - "epoch": 0.13676402410755678, - "grad_norm": 5.748208045959473, - "learning_rate": 1.99177691632914e-05, - "loss": 1.061, - "step": 2950 - }, - { - "epoch": 0.13699582753824757, - "grad_norm": 4.584070205688477, - "learning_rate": 1.9916730381392913e-05, - "loss": 1.0056, - "step": 2955 - }, - { - "epoch": 0.13722763096893834, - "grad_norm": 5.5280561447143555, - "learning_rate": 1.9915685106830444e-05, - "loss": 1.0739, - "step": 2960 - }, - { - "epoch": 0.13745943439962913, - "grad_norm": 4.314013481140137, - "learning_rate": 1.9914633340288362e-05, - "loss": 1.0037, - "step": 2965 - }, - { - "epoch": 0.1376912378303199, - "grad_norm": 5.107608795166016, - "learning_rate": 1.991357508245527e-05, - "loss": 1.0779, - "step": 2970 - }, - { - "epoch": 0.13792304126101065, - "grad_norm": 5.307722091674805, - "learning_rate": 1.9912510334024034e-05, - "loss": 1.1289, - "step": 2975 - }, - { - "epoch": 0.13815484469170144, - "grad_norm": 6.47243595123291, - "learning_rate": 1.991143909569176e-05, - "loss": 1.1177, - "step": 2980 - }, - { - "epoch": 0.1383866481223922, - "grad_norm": 5.38716459274292, - "learning_rate": 1.9910361368159803e-05, - "loss": 1.0031, - "step": 2985 - }, - { - "epoch": 0.13861845155308297, - "grad_norm": 4.5754194259643555, - "learning_rate": 1.990927715213378e-05, - "loss": 1.0479, - "step": 2990 - }, - { - "epoch": 0.13885025498377376, - "grad_norm": 5.582353591918945, - "learning_rate": 1.9908186448323546e-05, - "loss": 0.9586, - "step": 2995 - }, - { - "epoch": 0.13908205841446453, - "grad_norm": 7.415589809417725, - "learning_rate": 1.99070892574432e-05, - "loss": 0.9956, - "step": 3000 - }, - { - "epoch": 0.13908205841446453, - "eval_loss": 1.0443520545959473, - "eval_runtime": 11.2822, - "eval_samples_per_second": 11.257, - "eval_steps_per_second": 11.257, - "step": 3000 - }, - { - "epoch": 0.13931386184515532, - "grad_norm": 4.259057521820068, - "learning_rate": 1.99059855802111e-05, - "loss": 0.9444, - "step": 3005 - }, - { - "epoch": 0.13954566527584608, - "grad_norm": 4.621634006500244, - "learning_rate": 1.990487541734983e-05, - "loss": 0.9122, - "step": 3010 - }, - { - "epoch": 0.13977746870653684, - "grad_norm": 4.235317707061768, - "learning_rate": 1.9903758769586247e-05, - "loss": 1.0184, - "step": 3015 - }, - { - "epoch": 0.14000927213722764, - "grad_norm": 4.515482425689697, - "learning_rate": 1.9902635637651435e-05, - "loss": 1.0877, - "step": 3020 - }, - { - "epoch": 0.1402410755679184, - "grad_norm": 4.261685371398926, - "learning_rate": 1.990150602228073e-05, - "loss": 1.034, - "step": 3025 - }, - { - "epoch": 0.1404728789986092, - "grad_norm": 5.3254289627075195, - "learning_rate": 1.990036992421371e-05, - "loss": 1.1411, - "step": 3030 - }, - { - "epoch": 0.14070468242929995, - "grad_norm": 5.06146764755249, - "learning_rate": 1.9899227344194204e-05, - "loss": 0.9846, - "step": 3035 - }, - { - "epoch": 0.14093648585999072, - "grad_norm": 4.180016994476318, - "learning_rate": 1.9898078282970274e-05, - "loss": 1.0153, - "step": 3040 - }, - { - "epoch": 0.1411682892906815, - "grad_norm": 4.471304893493652, - "learning_rate": 1.989692274129424e-05, - "loss": 1.0025, - "step": 3045 - }, - { - "epoch": 0.14140009272137227, - "grad_norm": 4.3698811531066895, - "learning_rate": 1.9895760719922644e-05, - "loss": 0.9501, - "step": 3050 - }, - { - "epoch": 0.14163189615206306, - "grad_norm": 4.26015567779541, - "learning_rate": 1.989459221961629e-05, - "loss": 1.0604, - "step": 3055 - }, - { - "epoch": 0.14186369958275383, - "grad_norm": 6.512139320373535, - "learning_rate": 1.9893417241140215e-05, - "loss": 1.0173, - "step": 3060 - }, - { - "epoch": 0.1420955030134446, - "grad_norm": 3.8038687705993652, - "learning_rate": 1.98922357852637e-05, - "loss": 1.0969, - "step": 3065 - }, - { - "epoch": 0.14232730644413538, - "grad_norm": 4.5116143226623535, - "learning_rate": 1.9891047852760264e-05, - "loss": 1.0758, - "step": 3070 - }, - { - "epoch": 0.14255910987482615, - "grad_norm": 5.065197944641113, - "learning_rate": 1.9889853444407666e-05, - "loss": 1.2611, - "step": 3075 - }, - { - "epoch": 0.1427909133055169, - "grad_norm": 5.6167778968811035, - "learning_rate": 1.9888652560987915e-05, - "loss": 0.9903, - "step": 3080 - }, - { - "epoch": 0.1430227167362077, - "grad_norm": 4.782162666320801, - "learning_rate": 1.9887445203287242e-05, - "loss": 1.0061, - "step": 3085 - }, - { - "epoch": 0.14325452016689846, - "grad_norm": 4.842726230621338, - "learning_rate": 1.9886231372096133e-05, - "loss": 1.2085, - "step": 3090 - }, - { - "epoch": 0.14348632359758925, - "grad_norm": 4.182819843292236, - "learning_rate": 1.98850110682093e-05, - "loss": 0.9167, - "step": 3095 - }, - { - "epoch": 0.14371812702828002, - "grad_norm": 4.709084510803223, - "learning_rate": 1.98837842924257e-05, - "loss": 1.0333, - "step": 3100 - }, - { - "epoch": 0.14371812702828002, - "eval_loss": 1.0421561002731323, - "eval_runtime": 11.2658, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 3100 - }, - { - "epoch": 0.14394993045897078, - "grad_norm": 4.043697834014893, - "learning_rate": 1.988255104554853e-05, - "loss": 1.1004, - "step": 3105 - }, - { - "epoch": 0.14418173388966157, - "grad_norm": 5.348196029663086, - "learning_rate": 1.9881311328385213e-05, - "loss": 1.152, - "step": 3110 - }, - { - "epoch": 0.14441353732035234, - "grad_norm": 4.3684821128845215, - "learning_rate": 1.9880065141747417e-05, - "loss": 1.0153, - "step": 3115 - }, - { - "epoch": 0.14464534075104313, - "grad_norm": 4.7167253494262695, - "learning_rate": 1.987881248645104e-05, - "loss": 0.9573, - "step": 3120 - }, - { - "epoch": 0.1448771441817339, - "grad_norm": 5.030665874481201, - "learning_rate": 1.987755336331622e-05, - "loss": 1.0171, - "step": 3125 - }, - { - "epoch": 0.14510894761242465, - "grad_norm": 5.634954929351807, - "learning_rate": 1.9876287773167333e-05, - "loss": 0.9415, - "step": 3130 - }, - { - "epoch": 0.14534075104311545, - "grad_norm": 4.053722858428955, - "learning_rate": 1.9875015716832978e-05, - "loss": 1.039, - "step": 3135 - }, - { - "epoch": 0.1455725544738062, - "grad_norm": 6.290874481201172, - "learning_rate": 1.9873737195145993e-05, - "loss": 1.0583, - "step": 3140 - }, - { - "epoch": 0.14580435790449697, - "grad_norm": 4.683097839355469, - "learning_rate": 1.9872452208943453e-05, - "loss": 1.0219, - "step": 3145 - }, - { - "epoch": 0.14603616133518776, - "grad_norm": 5.337057590484619, - "learning_rate": 1.9871160759066662e-05, - "loss": 0.9462, - "step": 3150 - }, - { - "epoch": 0.14626796476587853, - "grad_norm": 5.12256383895874, - "learning_rate": 1.9869862846361152e-05, - "loss": 0.9631, - "step": 3155 - }, - { - "epoch": 0.14649976819656932, - "grad_norm": 4.650876522064209, - "learning_rate": 1.9868558471676693e-05, - "loss": 1.1056, - "step": 3160 - }, - { - "epoch": 0.14673157162726008, - "grad_norm": 5.170388698577881, - "learning_rate": 1.9867247635867285e-05, - "loss": 1.2239, - "step": 3165 - }, - { - "epoch": 0.14696337505795085, - "grad_norm": 3.694185495376587, - "learning_rate": 1.9865930339791147e-05, - "loss": 0.924, - "step": 3170 - }, - { - "epoch": 0.14719517848864164, - "grad_norm": 7.866819858551025, - "learning_rate": 1.986460658431075e-05, - "loss": 1.1734, - "step": 3175 - }, - { - "epoch": 0.1474269819193324, - "grad_norm": 4.429941654205322, - "learning_rate": 1.9863276370292766e-05, - "loss": 1.0446, - "step": 3180 - }, - { - "epoch": 0.1476587853500232, - "grad_norm": 5.235711097717285, - "learning_rate": 1.9861939698608123e-05, - "loss": 1.053, - "step": 3185 - }, - { - "epoch": 0.14789058878071396, - "grad_norm": 4.496561527252197, - "learning_rate": 1.986059657013196e-05, - "loss": 1.0764, - "step": 3190 - }, - { - "epoch": 0.14812239221140472, - "grad_norm": 4.243244647979736, - "learning_rate": 1.9859246985743646e-05, - "loss": 0.9841, - "step": 3195 - }, - { - "epoch": 0.1483541956420955, - "grad_norm": 4.924827575683594, - "learning_rate": 1.985789094632678e-05, - "loss": 0.9833, - "step": 3200 - }, - { - "epoch": 0.1483541956420955, - "eval_loss": 1.0363918542861938, - "eval_runtime": 11.2718, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 3200 - }, - { - "epoch": 0.14858599907278627, - "grad_norm": 4.187185287475586, - "learning_rate": 1.9856528452769182e-05, - "loss": 0.941, - "step": 3205 - }, - { - "epoch": 0.14881780250347706, - "grad_norm": 4.020816802978516, - "learning_rate": 1.9855159505962907e-05, - "loss": 0.95, - "step": 3210 - }, - { - "epoch": 0.14904960593416783, - "grad_norm": 4.336887359619141, - "learning_rate": 1.9853784106804227e-05, - "loss": 0.7706, - "step": 3215 - }, - { - "epoch": 0.1492814093648586, - "grad_norm": 4.014132976531982, - "learning_rate": 1.9852402256193638e-05, - "loss": 0.917, - "step": 3220 - }, - { - "epoch": 0.14951321279554938, - "grad_norm": 3.605818510055542, - "learning_rate": 1.9851013955035868e-05, - "loss": 1.0043, - "step": 3225 - }, - { - "epoch": 0.14974501622624015, - "grad_norm": 4.072787284851074, - "learning_rate": 1.9849619204239863e-05, - "loss": 0.9526, - "step": 3230 - }, - { - "epoch": 0.1499768196569309, - "grad_norm": 4.913269519805908, - "learning_rate": 1.9848218004718783e-05, - "loss": 1.0963, - "step": 3235 - }, - { - "epoch": 0.1502086230876217, - "grad_norm": 4.653753280639648, - "learning_rate": 1.9846810357390033e-05, - "loss": 1.1541, - "step": 3240 - }, - { - "epoch": 0.15044042651831246, - "grad_norm": 4.507899761199951, - "learning_rate": 1.9845396263175214e-05, - "loss": 1.108, - "step": 3245 - }, - { - "epoch": 0.15067222994900326, - "grad_norm": 6.352303504943848, - "learning_rate": 1.984397572300016e-05, - "loss": 0.9772, - "step": 3250 - }, - { - "epoch": 0.15090403337969402, - "grad_norm": 4.838869094848633, - "learning_rate": 1.984254873779493e-05, - "loss": 1.0617, - "step": 3255 - }, - { - "epoch": 0.15113583681038478, - "grad_norm": 4.51560115814209, - "learning_rate": 1.9841115308493797e-05, - "loss": 0.7006, - "step": 3260 - }, - { - "epoch": 0.15136764024107557, - "grad_norm": 5.014158725738525, - "learning_rate": 1.9839675436035245e-05, - "loss": 0.9927, - "step": 3265 - }, - { - "epoch": 0.15159944367176634, - "grad_norm": 4.209564685821533, - "learning_rate": 1.9838229121361995e-05, - "loss": 1.0364, - "step": 3270 - }, - { - "epoch": 0.15183124710245713, - "grad_norm": 4.790988445281982, - "learning_rate": 1.983677636542097e-05, - "loss": 0.9416, - "step": 3275 - }, - { - "epoch": 0.1520630505331479, - "grad_norm": 3.9592926502227783, - "learning_rate": 1.9835317169163314e-05, - "loss": 1.053, - "step": 3280 - }, - { - "epoch": 0.15229485396383866, - "grad_norm": 3.99648118019104, - "learning_rate": 1.9833851533544396e-05, - "loss": 0.9706, - "step": 3285 - }, - { - "epoch": 0.15252665739452945, - "grad_norm": 4.372700214385986, - "learning_rate": 1.9832379459523793e-05, - "loss": 0.9676, - "step": 3290 - }, - { - "epoch": 0.1527584608252202, - "grad_norm": 5.660383224487305, - "learning_rate": 1.983090094806529e-05, - "loss": 0.9255, - "step": 3295 - }, - { - "epoch": 0.15299026425591097, - "grad_norm": 4.162616729736328, - "learning_rate": 1.9829416000136905e-05, - "loss": 1.0264, - "step": 3300 - }, - { - "epoch": 0.15299026425591097, - "eval_loss": 1.038940668106079, - "eval_runtime": 11.2668, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 3300 - }, - { - "epoch": 0.15322206768660177, - "grad_norm": 4.065362453460693, - "learning_rate": 1.9827924616710858e-05, - "loss": 0.909, - "step": 3305 - }, - { - "epoch": 0.15345387111729253, - "grad_norm": 4.5612592697143555, - "learning_rate": 1.9826426798763577e-05, - "loss": 1.2016, - "step": 3310 - }, - { - "epoch": 0.15368567454798332, - "grad_norm": 4.262414932250977, - "learning_rate": 1.9824922547275726e-05, - "loss": 0.9091, - "step": 3315 - }, - { - "epoch": 0.15391747797867408, - "grad_norm": 5.065141201019287, - "learning_rate": 1.9823411863232155e-05, - "loss": 1.0608, - "step": 3320 - }, - { - "epoch": 0.15414928140936485, - "grad_norm": 4.390530109405518, - "learning_rate": 1.9821894747621936e-05, - "loss": 1.1014, - "step": 3325 - }, - { - "epoch": 0.15438108484005564, - "grad_norm": 5.454830169677734, - "learning_rate": 1.9820371201438358e-05, - "loss": 1.0328, - "step": 3330 - }, - { - "epoch": 0.1546128882707464, - "grad_norm": 3.939898729324341, - "learning_rate": 1.981884122567891e-05, - "loss": 0.9734, - "step": 3335 - }, - { - "epoch": 0.1548446917014372, - "grad_norm": 4.209561824798584, - "learning_rate": 1.98173048213453e-05, - "loss": 0.7803, - "step": 3340 - }, - { - "epoch": 0.15507649513212796, - "grad_norm": 4.415044784545898, - "learning_rate": 1.981576198944343e-05, - "loss": 0.9674, - "step": 3345 - }, - { - "epoch": 0.15530829856281872, - "grad_norm": 4.023775100708008, - "learning_rate": 1.981421273098343e-05, - "loss": 0.9258, - "step": 3350 - }, - { - "epoch": 0.1555401019935095, - "grad_norm": 4.346290588378906, - "learning_rate": 1.9812657046979625e-05, - "loss": 1.1134, - "step": 3355 - }, - { - "epoch": 0.15577190542420027, - "grad_norm": 3.649505853652954, - "learning_rate": 1.9811094938450548e-05, - "loss": 0.9605, - "step": 3360 - }, - { - "epoch": 0.15600370885489107, - "grad_norm": 5.957110404968262, - "learning_rate": 1.9809526406418943e-05, - "loss": 1.1297, - "step": 3365 - }, - { - "epoch": 0.15623551228558183, - "grad_norm": 4.172776699066162, - "learning_rate": 1.980795145191175e-05, - "loss": 1.0034, - "step": 3370 - }, - { - "epoch": 0.1564673157162726, - "grad_norm": 4.410923004150391, - "learning_rate": 1.9806370075960128e-05, - "loss": 1.0254, - "step": 3375 - }, - { - "epoch": 0.15669911914696338, - "grad_norm": 4.170181751251221, - "learning_rate": 1.9804782279599424e-05, - "loss": 1.1244, - "step": 3380 - }, - { - "epoch": 0.15693092257765415, - "grad_norm": 4.1127119064331055, - "learning_rate": 1.980318806386921e-05, - "loss": 0.9039, - "step": 3385 - }, - { - "epoch": 0.1571627260083449, - "grad_norm": 4.409833908081055, - "learning_rate": 1.9801587429813237e-05, - "loss": 1.0573, - "step": 3390 - }, - { - "epoch": 0.1573945294390357, - "grad_norm": 4.1619086265563965, - "learning_rate": 1.979998037847947e-05, - "loss": 0.9595, - "step": 3395 - }, - { - "epoch": 0.15762633286972647, - "grad_norm": 4.631487846374512, - "learning_rate": 1.9798366910920082e-05, - "loss": 1.0714, - "step": 3400 - }, - { - "epoch": 0.15762633286972647, - "eval_loss": 1.0348224639892578, - "eval_runtime": 11.2693, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 3400 - }, - { - "epoch": 0.15785813630041726, - "grad_norm": 3.5335793495178223, - "learning_rate": 1.9796747028191433e-05, - "loss": 0.6737, - "step": 3405 - }, - { - "epoch": 0.15808993973110802, - "grad_norm": 4.976848602294922, - "learning_rate": 1.9795120731354092e-05, - "loss": 1.1097, - "step": 3410 - }, - { - "epoch": 0.15832174316179878, - "grad_norm": 4.47586727142334, - "learning_rate": 1.9793488021472827e-05, - "loss": 0.9923, - "step": 3415 - }, - { - "epoch": 0.15855354659248957, - "grad_norm": 4.697174072265625, - "learning_rate": 1.97918488996166e-05, - "loss": 1.0532, - "step": 3420 - }, - { - "epoch": 0.15878535002318034, - "grad_norm": 4.547231674194336, - "learning_rate": 1.9790203366858572e-05, - "loss": 0.9237, - "step": 3425 - }, - { - "epoch": 0.15901715345387113, - "grad_norm": 4.481298923492432, - "learning_rate": 1.978855142427611e-05, - "loss": 1.1366, - "step": 3430 - }, - { - "epoch": 0.1592489568845619, - "grad_norm": 4.639220714569092, - "learning_rate": 1.9786893072950766e-05, - "loss": 1.0086, - "step": 3435 - }, - { - "epoch": 0.15948076031525266, - "grad_norm": 4.7241339683532715, - "learning_rate": 1.978522831396829e-05, - "loss": 1.0335, - "step": 3440 - }, - { - "epoch": 0.15971256374594345, - "grad_norm": 6.761361598968506, - "learning_rate": 1.978355714841864e-05, - "loss": 1.0096, - "step": 3445 - }, - { - "epoch": 0.1599443671766342, - "grad_norm": 11.095553398132324, - "learning_rate": 1.978187957739595e-05, - "loss": 1.0299, - "step": 3450 - }, - { - "epoch": 0.16017617060732497, - "grad_norm": 4.084702491760254, - "learning_rate": 1.978019560199856e-05, - "loss": 1.0295, - "step": 3455 - }, - { - "epoch": 0.16040797403801577, - "grad_norm": 4.840928554534912, - "learning_rate": 1.9778505223328996e-05, - "loss": 0.9896, - "step": 3460 - }, - { - "epoch": 0.16063977746870653, - "grad_norm": 4.553187370300293, - "learning_rate": 1.977680844249398e-05, - "loss": 0.9175, - "step": 3465 - }, - { - "epoch": 0.16087158089939732, - "grad_norm": 3.998581647872925, - "learning_rate": 1.977510526060443e-05, - "loss": 1.0071, - "step": 3470 - }, - { - "epoch": 0.16110338433008808, - "grad_norm": 3.791153907775879, - "learning_rate": 1.977339567877545e-05, - "loss": 1.0841, - "step": 3475 - }, - { - "epoch": 0.16133518776077885, - "grad_norm": 5.333462715148926, - "learning_rate": 1.9771679698126325e-05, - "loss": 1.0645, - "step": 3480 - }, - { - "epoch": 0.16156699119146964, - "grad_norm": 4.272615909576416, - "learning_rate": 1.976995731978055e-05, - "loss": 0.8177, - "step": 3485 - }, - { - "epoch": 0.1617987946221604, - "grad_norm": 4.884770393371582, - "learning_rate": 1.9768228544865793e-05, - "loss": 0.9687, - "step": 3490 - }, - { - "epoch": 0.1620305980528512, - "grad_norm": 4.274031162261963, - "learning_rate": 1.9766493374513912e-05, - "loss": 1.0413, - "step": 3495 - }, - { - "epoch": 0.16226240148354196, - "grad_norm": 3.577754497528076, - "learning_rate": 1.9764751809860957e-05, - "loss": 1.0992, - "step": 3500 - }, - { - "epoch": 0.16226240148354196, - "eval_loss": 1.0341907739639282, - "eval_runtime": 11.2731, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 3500 - }, - { - "epoch": 0.16249420491423272, - "grad_norm": 3.879619598388672, - "learning_rate": 1.9763003852047164e-05, - "loss": 0.9959, - "step": 3505 - }, - { - "epoch": 0.1627260083449235, - "grad_norm": 4.179292678833008, - "learning_rate": 1.976124950221695e-05, - "loss": 1.1132, - "step": 3510 - }, - { - "epoch": 0.16295781177561428, - "grad_norm": 4.479405879974365, - "learning_rate": 1.975948876151892e-05, - "loss": 1.1255, - "step": 3515 - }, - { - "epoch": 0.16318961520630507, - "grad_norm": 4.490816116333008, - "learning_rate": 1.9757721631105865e-05, - "loss": 0.8673, - "step": 3520 - }, - { - "epoch": 0.16342141863699583, - "grad_norm": 6.789510726928711, - "learning_rate": 1.9755948112134758e-05, - "loss": 0.9336, - "step": 3525 - }, - { - "epoch": 0.1636532220676866, - "grad_norm": 3.9455666542053223, - "learning_rate": 1.975416820576675e-05, - "loss": 0.8529, - "step": 3530 - }, - { - "epoch": 0.16388502549837738, - "grad_norm": 4.85823917388916, - "learning_rate": 1.9752381913167178e-05, - "loss": 1.1189, - "step": 3535 - }, - { - "epoch": 0.16411682892906815, - "grad_norm": 5.231497764587402, - "learning_rate": 1.9750589235505565e-05, - "loss": 0.9141, - "step": 3540 - }, - { - "epoch": 0.1643486323597589, - "grad_norm": 5.203330993652344, - "learning_rate": 1.9748790173955606e-05, - "loss": 0.8673, - "step": 3545 - }, - { - "epoch": 0.1645804357904497, - "grad_norm": 3.671243190765381, - "learning_rate": 1.974698472969518e-05, - "loss": 0.9806, - "step": 3550 - }, - { - "epoch": 0.16481223922114047, - "grad_norm": 4.87835693359375, - "learning_rate": 1.974517290390635e-05, - "loss": 1.1254, - "step": 3555 - }, - { - "epoch": 0.16504404265183126, - "grad_norm": 4.269375801086426, - "learning_rate": 1.9743354697775343e-05, - "loss": 0.875, - "step": 3560 - }, - { - "epoch": 0.16527584608252202, - "grad_norm": 4.071610927581787, - "learning_rate": 1.974153011249257e-05, - "loss": 1.0405, - "step": 3565 - }, - { - "epoch": 0.16550764951321278, - "grad_norm": 4.637951374053955, - "learning_rate": 1.9739699149252634e-05, - "loss": 1.088, - "step": 3570 - }, - { - "epoch": 0.16573945294390358, - "grad_norm": 3.6990163326263428, - "learning_rate": 1.9737861809254285e-05, - "loss": 0.9877, - "step": 3575 - }, - { - "epoch": 0.16597125637459434, - "grad_norm": 3.8444910049438477, - "learning_rate": 1.9736018093700477e-05, - "loss": 0.882, - "step": 3580 - }, - { - "epoch": 0.16620305980528513, - "grad_norm": 4.383355617523193, - "learning_rate": 1.973416800379831e-05, - "loss": 1.1287, - "step": 3585 - }, - { - "epoch": 0.1664348632359759, - "grad_norm": 4.884894371032715, - "learning_rate": 1.9732311540759075e-05, - "loss": 0.9952, - "step": 3590 - }, - { - "epoch": 0.16666666666666666, - "grad_norm": 5.813953399658203, - "learning_rate": 1.973044870579824e-05, - "loss": 0.9972, - "step": 3595 - }, - { - "epoch": 0.16689847009735745, - "grad_norm": 6.32355260848999, - "learning_rate": 1.972857950013543e-05, - "loss": 1.1215, - "step": 3600 - }, - { - "epoch": 0.16689847009735745, - "eval_loss": 1.0363401174545288, - "eval_runtime": 11.2664, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 3600 - }, - { - "epoch": 0.1671302735280482, - "grad_norm": 4.845508575439453, - "learning_rate": 1.972670392499445e-05, - "loss": 0.9931, - "step": 3605 - }, - { - "epoch": 0.16736207695873898, - "grad_norm": 4.5534515380859375, - "learning_rate": 1.9724821981603276e-05, - "loss": 1.0293, - "step": 3610 - }, - { - "epoch": 0.16759388038942977, - "grad_norm": 4.300433158874512, - "learning_rate": 1.9722933671194045e-05, - "loss": 0.933, - "step": 3615 - }, - { - "epoch": 0.16782568382012053, - "grad_norm": 4.074310302734375, - "learning_rate": 1.972103899500307e-05, - "loss": 0.9394, - "step": 3620 - }, - { - "epoch": 0.16805748725081132, - "grad_norm": 4.055570602416992, - "learning_rate": 1.9719137954270832e-05, - "loss": 1.0515, - "step": 3625 - }, - { - "epoch": 0.16828929068150209, - "grad_norm": 3.8710527420043945, - "learning_rate": 1.9717230550241978e-05, - "loss": 1.118, - "step": 3630 - }, - { - "epoch": 0.16852109411219285, - "grad_norm": 5.327306747436523, - "learning_rate": 1.9715316784165316e-05, - "loss": 1.1147, - "step": 3635 - }, - { - "epoch": 0.16875289754288364, - "grad_norm": 3.6472907066345215, - "learning_rate": 1.971339665729383e-05, - "loss": 0.9233, - "step": 3640 - }, - { - "epoch": 0.1689847009735744, - "grad_norm": 3.9819483757019043, - "learning_rate": 1.9711470170884652e-05, - "loss": 1.0589, - "step": 3645 - }, - { - "epoch": 0.1692165044042652, - "grad_norm": 4.43086051940918, - "learning_rate": 1.9709537326199096e-05, - "loss": 0.9735, - "step": 3650 - }, - { - "epoch": 0.16944830783495596, - "grad_norm": 5.653796195983887, - "learning_rate": 1.9707598124502627e-05, - "loss": 1.087, - "step": 3655 - }, - { - "epoch": 0.16968011126564672, - "grad_norm": 4.229031562805176, - "learning_rate": 1.9705652567064877e-05, - "loss": 1.0886, - "step": 3660 - }, - { - "epoch": 0.1699119146963375, - "grad_norm": 4.481167316436768, - "learning_rate": 1.9703700655159638e-05, - "loss": 0.9242, - "step": 3665 - }, - { - "epoch": 0.17014371812702828, - "grad_norm": 5.274721622467041, - "learning_rate": 1.970174239006486e-05, - "loss": 0.9642, - "step": 3670 - }, - { - "epoch": 0.17037552155771907, - "grad_norm": 5.67561674118042, - "learning_rate": 1.969977777306266e-05, - "loss": 1.1584, - "step": 3675 - }, - { - "epoch": 0.17060732498840983, - "grad_norm": 4.051698207855225, - "learning_rate": 1.9697806805439302e-05, - "loss": 0.9111, - "step": 3680 - }, - { - "epoch": 0.1708391284191006, - "grad_norm": 4.867814064025879, - "learning_rate": 1.969582948848522e-05, - "loss": 0.9635, - "step": 3685 - }, - { - "epoch": 0.17107093184979139, - "grad_norm": 4.764296531677246, - "learning_rate": 1.9693845823494997e-05, - "loss": 0.8434, - "step": 3690 - }, - { - "epoch": 0.17130273528048215, - "grad_norm": 3.813321352005005, - "learning_rate": 1.969185581176737e-05, - "loss": 0.9985, - "step": 3695 - }, - { - "epoch": 0.1715345387111729, - "grad_norm": 4.2770280838012695, - "learning_rate": 1.9689859454605246e-05, - "loss": 0.9844, - "step": 3700 - }, - { - "epoch": 0.1715345387111729, - "eval_loss": 1.0277271270751953, - "eval_runtime": 11.275, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 3700 - }, - { - "epoch": 0.1717663421418637, - "grad_norm": 4.8459391593933105, - "learning_rate": 1.968785675331567e-05, - "loss": 1.0516, - "step": 3705 - }, - { - "epoch": 0.17199814557255447, - "grad_norm": 4.204828262329102, - "learning_rate": 1.9685847709209847e-05, - "loss": 1.08, - "step": 3710 - }, - { - "epoch": 0.17222994900324526, - "grad_norm": 4.045187950134277, - "learning_rate": 1.9683832323603137e-05, - "loss": 0.9182, - "step": 3715 - }, - { - "epoch": 0.17246175243393602, - "grad_norm": 4.368062973022461, - "learning_rate": 1.9681810597815047e-05, - "loss": 0.9901, - "step": 3720 - }, - { - "epoch": 0.17269355586462679, - "grad_norm": 4.184208393096924, - "learning_rate": 1.9679782533169237e-05, - "loss": 0.9044, - "step": 3725 - }, - { - "epoch": 0.17292535929531758, - "grad_norm": 4.531180381774902, - "learning_rate": 1.9677748130993524e-05, - "loss": 0.9238, - "step": 3730 - }, - { - "epoch": 0.17315716272600834, - "grad_norm": 3.6048779487609863, - "learning_rate": 1.9675707392619864e-05, - "loss": 1.026, - "step": 3735 - }, - { - "epoch": 0.17338896615669913, - "grad_norm": 4.324881553649902, - "learning_rate": 1.9673660319384362e-05, - "loss": 0.8431, - "step": 3740 - }, - { - "epoch": 0.1736207695873899, - "grad_norm": 5.622114181518555, - "learning_rate": 1.967160691262728e-05, - "loss": 0.9655, - "step": 3745 - }, - { - "epoch": 0.17385257301808066, - "grad_norm": 4.113274097442627, - "learning_rate": 1.9669547173693017e-05, - "loss": 0.7703, - "step": 3750 - }, - { - "epoch": 0.17408437644877145, - "grad_norm": 4.276374340057373, - "learning_rate": 1.9667481103930125e-05, - "loss": 1.1605, - "step": 3755 - }, - { - "epoch": 0.1743161798794622, - "grad_norm": 3.935689687728882, - "learning_rate": 1.9665408704691295e-05, - "loss": 1.0532, - "step": 3760 - }, - { - "epoch": 0.17454798331015298, - "grad_norm": 3.852186679840088, - "learning_rate": 1.9663329977333365e-05, - "loss": 0.8158, - "step": 3765 - }, - { - "epoch": 0.17477978674084377, - "grad_norm": 5.546504497528076, - "learning_rate": 1.9661244923217314e-05, - "loss": 1.0553, - "step": 3770 - }, - { - "epoch": 0.17501159017153453, - "grad_norm": 4.326759338378906, - "learning_rate": 1.9659153543708268e-05, - "loss": 1.0558, - "step": 3775 - }, - { - "epoch": 0.17524339360222532, - "grad_norm": 5.423635959625244, - "learning_rate": 1.9657055840175493e-05, - "loss": 0.9978, - "step": 3780 - }, - { - "epoch": 0.17547519703291609, - "grad_norm": 4.628658771514893, - "learning_rate": 1.9654951813992387e-05, - "loss": 1.0361, - "step": 3785 - }, - { - "epoch": 0.17570700046360685, - "grad_norm": 5.572815418243408, - "learning_rate": 1.96528414665365e-05, - "loss": 0.8554, - "step": 3790 - }, - { - "epoch": 0.17593880389429764, - "grad_norm": 3.796617269515991, - "learning_rate": 1.965072479918951e-05, - "loss": 1.012, - "step": 3795 - }, - { - "epoch": 0.1761706073249884, - "grad_norm": 4.33388614654541, - "learning_rate": 1.9648601813337244e-05, - "loss": 0.9588, - "step": 3800 - }, - { - "epoch": 0.1761706073249884, - "eval_loss": 1.0285202264785767, - "eval_runtime": 11.2827, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 3800 - }, - { - "epoch": 0.1764024107556792, - "grad_norm": 8.51945972442627, - "learning_rate": 1.9646472510369656e-05, - "loss": 1.0992, - "step": 3805 - }, - { - "epoch": 0.17663421418636996, - "grad_norm": 4.519212245941162, - "learning_rate": 1.9644336891680836e-05, - "loss": 1.0471, - "step": 3810 - }, - { - "epoch": 0.17686601761706072, - "grad_norm": 3.9827346801757812, - "learning_rate": 1.964219495866902e-05, - "loss": 0.8874, - "step": 3815 - }, - { - "epoch": 0.1770978210477515, - "grad_norm": 4.4609694480896, - "learning_rate": 1.9640046712736565e-05, - "loss": 1.0301, - "step": 3820 - }, - { - "epoch": 0.17732962447844228, - "grad_norm": 3.9472250938415527, - "learning_rate": 1.9637892155289967e-05, - "loss": 0.8735, - "step": 3825 - }, - { - "epoch": 0.17756142790913307, - "grad_norm": 3.6204352378845215, - "learning_rate": 1.9635731287739857e-05, - "loss": 0.9376, - "step": 3830 - }, - { - "epoch": 0.17779323133982383, - "grad_norm": 3.9136033058166504, - "learning_rate": 1.963356411150099e-05, - "loss": 0.9396, - "step": 3835 - }, - { - "epoch": 0.1780250347705146, - "grad_norm": 3.8638482093811035, - "learning_rate": 1.963139062799226e-05, - "loss": 0.9546, - "step": 3840 - }, - { - "epoch": 0.1782568382012054, - "grad_norm": 3.9721662998199463, - "learning_rate": 1.9629210838636685e-05, - "loss": 1.0491, - "step": 3845 - }, - { - "epoch": 0.17848864163189615, - "grad_norm": 5.257063865661621, - "learning_rate": 1.962702474486141e-05, - "loss": 1.1241, - "step": 3850 - }, - { - "epoch": 0.1787204450625869, - "grad_norm": 4.413024425506592, - "learning_rate": 1.9624832348097717e-05, - "loss": 1.1912, - "step": 3855 - }, - { - "epoch": 0.1789522484932777, - "grad_norm": 5.4773759841918945, - "learning_rate": 1.9622633649780998e-05, - "loss": 0.9792, - "step": 3860 - }, - { - "epoch": 0.17918405192396847, - "grad_norm": 4.147509574890137, - "learning_rate": 1.962042865135079e-05, - "loss": 0.923, - "step": 3865 - }, - { - "epoch": 0.17941585535465926, - "grad_norm": 3.813749313354492, - "learning_rate": 1.9618217354250742e-05, - "loss": 0.9268, - "step": 3870 - }, - { - "epoch": 0.17964765878535002, - "grad_norm": 5.082292079925537, - "learning_rate": 1.9615999759928626e-05, - "loss": 1.0358, - "step": 3875 - }, - { - "epoch": 0.1798794622160408, - "grad_norm": 4.669640064239502, - "learning_rate": 1.9613775869836355e-05, - "loss": 1.0233, - "step": 3880 - }, - { - "epoch": 0.18011126564673158, - "grad_norm": 4.9730424880981445, - "learning_rate": 1.9611545685429937e-05, - "loss": 1.0794, - "step": 3885 - }, - { - "epoch": 0.18034306907742234, - "grad_norm": 4.652693271636963, - "learning_rate": 1.9609309208169518e-05, - "loss": 0.9949, - "step": 3890 - }, - { - "epoch": 0.18057487250811313, - "grad_norm": 3.6250641345977783, - "learning_rate": 1.9607066439519363e-05, - "loss": 0.8387, - "step": 3895 - }, - { - "epoch": 0.1808066759388039, - "grad_norm": 4.949042320251465, - "learning_rate": 1.960481738094785e-05, - "loss": 0.9968, - "step": 3900 - }, - { - "epoch": 0.1808066759388039, - "eval_loss": 1.0276681184768677, - "eval_runtime": 11.2655, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 3900 - }, - { - "epoch": 0.18103847936949466, - "grad_norm": 3.579059362411499, - "learning_rate": 1.9602562033927478e-05, - "loss": 0.9648, - "step": 3905 - }, - { - "epoch": 0.18127028280018545, - "grad_norm": 4.2248358726501465, - "learning_rate": 1.960030039993487e-05, - "loss": 0.9858, - "step": 3910 - }, - { - "epoch": 0.18150208623087621, - "grad_norm": 4.166600227355957, - "learning_rate": 1.9598032480450757e-05, - "loss": 1.079, - "step": 3915 - }, - { - "epoch": 0.18173388966156698, - "grad_norm": 4.941333293914795, - "learning_rate": 1.9595758276959982e-05, - "loss": 0.9412, - "step": 3920 - }, - { - "epoch": 0.18196569309225777, - "grad_norm": 5.1348981857299805, - "learning_rate": 1.959347779095151e-05, - "loss": 1.0761, - "step": 3925 - }, - { - "epoch": 0.18219749652294853, - "grad_norm": 4.545997142791748, - "learning_rate": 1.9591191023918417e-05, - "loss": 1.055, - "step": 3930 - }, - { - "epoch": 0.18242929995363932, - "grad_norm": 4.699060916900635, - "learning_rate": 1.9588897977357894e-05, - "loss": 1.1684, - "step": 3935 - }, - { - "epoch": 0.1826611033843301, - "grad_norm": 5.560849666595459, - "learning_rate": 1.9586598652771237e-05, - "loss": 0.9208, - "step": 3940 - }, - { - "epoch": 0.18289290681502085, - "grad_norm": 4.416356086730957, - "learning_rate": 1.9584293051663855e-05, - "loss": 0.9471, - "step": 3945 - }, - { - "epoch": 0.18312471024571164, - "grad_norm": 3.3139140605926514, - "learning_rate": 1.958198117554527e-05, - "loss": 1.0265, - "step": 3950 - }, - { - "epoch": 0.1833565136764024, - "grad_norm": 3.708087205886841, - "learning_rate": 1.957966302592911e-05, - "loss": 1.0605, - "step": 3955 - }, - { - "epoch": 0.1835883171070932, - "grad_norm": 4.243253707885742, - "learning_rate": 1.9577338604333102e-05, - "loss": 1.0028, - "step": 3960 - }, - { - "epoch": 0.18382012053778396, - "grad_norm": 4.022925853729248, - "learning_rate": 1.9575007912279096e-05, - "loss": 0.8775, - "step": 3965 - }, - { - "epoch": 0.18405192396847472, - "grad_norm": 4.368210315704346, - "learning_rate": 1.9572670951293034e-05, - "loss": 1.1574, - "step": 3970 - }, - { - "epoch": 0.18428372739916551, - "grad_norm": 3.763110399246216, - "learning_rate": 1.957032772290497e-05, - "loss": 0.9621, - "step": 3975 - }, - { - "epoch": 0.18451553082985628, - "grad_norm": 4.240961074829102, - "learning_rate": 1.9567978228649052e-05, - "loss": 1.1428, - "step": 3980 - }, - { - "epoch": 0.18474733426054707, - "grad_norm": 4.0695037841796875, - "learning_rate": 1.9565622470063544e-05, - "loss": 0.8761, - "step": 3985 - }, - { - "epoch": 0.18497913769123783, - "grad_norm": 4.663343906402588, - "learning_rate": 1.9563260448690805e-05, - "loss": 1.1419, - "step": 3990 - }, - { - "epoch": 0.1852109411219286, - "grad_norm": 3.7018463611602783, - "learning_rate": 1.9560892166077285e-05, - "loss": 0.9401, - "step": 3995 - }, - { - "epoch": 0.1854427445526194, - "grad_norm": 4.6532511711120605, - "learning_rate": 1.9558517623773547e-05, - "loss": 0.8992, - "step": 4000 - }, - { - "epoch": 0.1854427445526194, - "eval_loss": 1.0249594449996948, - "eval_runtime": 11.2634, - "eval_samples_per_second": 11.275, - "eval_steps_per_second": 11.275, - "step": 4000 - }, - { - "epoch": 0.18567454798331015, - "grad_norm": 4.233008861541748, - "learning_rate": 1.955613682333425e-05, - "loss": 1.0099, - "step": 4005 - }, - { - "epoch": 0.18590635141400091, - "grad_norm": 4.297733306884766, - "learning_rate": 1.955374976631814e-05, - "loss": 0.8721, - "step": 4010 - }, - { - "epoch": 0.1861381548446917, - "grad_norm": 5.088667869567871, - "learning_rate": 1.9551356454288073e-05, - "loss": 0.9625, - "step": 4015 - }, - { - "epoch": 0.18636995827538247, - "grad_norm": 4.837838172912598, - "learning_rate": 1.954895688881099e-05, - "loss": 1.07, - "step": 4020 - }, - { - "epoch": 0.18660176170607326, - "grad_norm": 3.7905993461608887, - "learning_rate": 1.954655107145793e-05, - "loss": 1.0712, - "step": 4025 - }, - { - "epoch": 0.18683356513676402, - "grad_norm": 3.836812734603882, - "learning_rate": 1.954413900380403e-05, - "loss": 1.1433, - "step": 4030 - }, - { - "epoch": 0.1870653685674548, - "grad_norm": 3.796400547027588, - "learning_rate": 1.9541720687428512e-05, - "loss": 0.7793, - "step": 4035 - }, - { - "epoch": 0.18729717199814558, - "grad_norm": 4.4649553298950195, - "learning_rate": 1.9539296123914693e-05, - "loss": 1.0256, - "step": 4040 - }, - { - "epoch": 0.18752897542883634, - "grad_norm": 3.9846534729003906, - "learning_rate": 1.9536865314849973e-05, - "loss": 0.9158, - "step": 4045 - }, - { - "epoch": 0.18776077885952713, - "grad_norm": 5.033636093139648, - "learning_rate": 1.9534428261825857e-05, - "loss": 1.1844, - "step": 4050 - }, - { - "epoch": 0.1879925822902179, - "grad_norm": 4.519948959350586, - "learning_rate": 1.9531984966437916e-05, - "loss": 1.1649, - "step": 4055 - }, - { - "epoch": 0.18822438572090866, - "grad_norm": 5.361178874969482, - "learning_rate": 1.952953543028583e-05, - "loss": 0.9993, - "step": 4060 - }, - { - "epoch": 0.18845618915159945, - "grad_norm": 3.7113776206970215, - "learning_rate": 1.9527079654973353e-05, - "loss": 0.9374, - "step": 4065 - }, - { - "epoch": 0.18868799258229021, - "grad_norm": 4.310559272766113, - "learning_rate": 1.952461764210832e-05, - "loss": 0.8647, - "step": 4070 - }, - { - "epoch": 0.18891979601298098, - "grad_norm": 3.929408073425293, - "learning_rate": 1.9522149393302664e-05, - "loss": 0.9928, - "step": 4075 - }, - { - "epoch": 0.18915159944367177, - "grad_norm": 4.187215805053711, - "learning_rate": 1.9519674910172387e-05, - "loss": 1.2086, - "step": 4080 - }, - { - "epoch": 0.18938340287436253, - "grad_norm": 5.6810302734375, - "learning_rate": 1.9517194194337578e-05, - "loss": 1.0479, - "step": 4085 - }, - { - "epoch": 0.18961520630505332, - "grad_norm": 4.767360687255859, - "learning_rate": 1.951470724742241e-05, - "loss": 1.1135, - "step": 4090 - }, - { - "epoch": 0.1898470097357441, - "grad_norm": 5.183935642242432, - "learning_rate": 1.951221407105513e-05, - "loss": 1.1425, - "step": 4095 - }, - { - "epoch": 0.19007881316643485, - "grad_norm": 4.624146938323975, - "learning_rate": 1.9509714666868066e-05, - "loss": 1.1632, - "step": 4100 - }, - { - "epoch": 0.19007881316643485, - "eval_loss": 1.0201714038848877, - "eval_runtime": 11.2685, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 4100 - }, - { - "epoch": 0.19031061659712564, - "grad_norm": 6.600108623504639, - "learning_rate": 1.9507209036497626e-05, - "loss": 1.063, - "step": 4105 - }, - { - "epoch": 0.1905424200278164, - "grad_norm": 7.14796257019043, - "learning_rate": 1.950469718158429e-05, - "loss": 0.9141, - "step": 4110 - }, - { - "epoch": 0.1907742234585072, - "grad_norm": 5.579740524291992, - "learning_rate": 1.9502179103772612e-05, - "loss": 0.9236, - "step": 4115 - }, - { - "epoch": 0.19100602688919796, - "grad_norm": 3.9887657165527344, - "learning_rate": 1.9499654804711227e-05, - "loss": 0.972, - "step": 4120 - }, - { - "epoch": 0.19123783031988872, - "grad_norm": 5.2029595375061035, - "learning_rate": 1.949712428605284e-05, - "loss": 1.1145, - "step": 4125 - }, - { - "epoch": 0.19146963375057952, - "grad_norm": 3.8869788646698, - "learning_rate": 1.949458754945423e-05, - "loss": 0.8691, - "step": 4130 - }, - { - "epoch": 0.19170143718127028, - "grad_norm": 4.547691822052002, - "learning_rate": 1.9492044596576238e-05, - "loss": 0.8942, - "step": 4135 - }, - { - "epoch": 0.19193324061196107, - "grad_norm": 4.375545978546143, - "learning_rate": 1.9489495429083787e-05, - "loss": 1.0702, - "step": 4140 - }, - { - "epoch": 0.19216504404265183, - "grad_norm": 4.513903617858887, - "learning_rate": 1.948694004864586e-05, - "loss": 1.0058, - "step": 4145 - }, - { - "epoch": 0.1923968474733426, - "grad_norm": 3.5358970165252686, - "learning_rate": 1.948437845693551e-05, - "loss": 0.9213, - "step": 4150 - }, - { - "epoch": 0.1926286509040334, - "grad_norm": 4.308315277099609, - "learning_rate": 1.9481810655629862e-05, - "loss": 0.9508, - "step": 4155 - }, - { - "epoch": 0.19286045433472415, - "grad_norm": 4.383856773376465, - "learning_rate": 1.9479236646410104e-05, - "loss": 1.0954, - "step": 4160 - }, - { - "epoch": 0.19309225776541492, - "grad_norm": 4.728183746337891, - "learning_rate": 1.947665643096148e-05, - "loss": 1.0281, - "step": 4165 - }, - { - "epoch": 0.1933240611961057, - "grad_norm": 4.869402885437012, - "learning_rate": 1.9474070010973307e-05, - "loss": 0.8735, - "step": 4170 - }, - { - "epoch": 0.19355586462679647, - "grad_norm": 3.7020456790924072, - "learning_rate": 1.9471477388138962e-05, - "loss": 0.8899, - "step": 4175 - }, - { - "epoch": 0.19378766805748726, - "grad_norm": 4.01472806930542, - "learning_rate": 1.9468878564155882e-05, - "loss": 0.973, - "step": 4180 - }, - { - "epoch": 0.19401947148817802, - "grad_norm": 4.646851062774658, - "learning_rate": 1.9466273540725565e-05, - "loss": 1.1544, - "step": 4185 - }, - { - "epoch": 0.1942512749188688, - "grad_norm": 4.423065662384033, - "learning_rate": 1.9463662319553568e-05, - "loss": 0.9554, - "step": 4190 - }, - { - "epoch": 0.19448307834955958, - "grad_norm": 4.54693078994751, - "learning_rate": 1.9461044902349497e-05, - "loss": 0.923, - "step": 4195 - }, - { - "epoch": 0.19471488178025034, - "grad_norm": 4.460963726043701, - "learning_rate": 1.9458421290827035e-05, - "loss": 1.0284, - "step": 4200 - }, - { - "epoch": 0.19471488178025034, - "eval_loss": 1.022770881652832, - "eval_runtime": 11.2826, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 4200 - }, - { - "epoch": 0.19494668521094113, - "grad_norm": 3.523135185241699, - "learning_rate": 1.9455791486703905e-05, - "loss": 0.9056, - "step": 4205 - }, - { - "epoch": 0.1951784886416319, - "grad_norm": 3.8507754802703857, - "learning_rate": 1.945315549170188e-05, - "loss": 0.7916, - "step": 4210 - }, - { - "epoch": 0.19541029207232266, - "grad_norm": 14.194623947143555, - "learning_rate": 1.94505133075468e-05, - "loss": 1.075, - "step": 4215 - }, - { - "epoch": 0.19564209550301345, - "grad_norm": 3.8128092288970947, - "learning_rate": 1.944786493596855e-05, - "loss": 1.0419, - "step": 4220 - }, - { - "epoch": 0.19587389893370422, - "grad_norm": 4.251338481903076, - "learning_rate": 1.9445210378701067e-05, - "loss": 1.1187, - "step": 4225 - }, - { - "epoch": 0.19610570236439498, - "grad_norm": 4.296945571899414, - "learning_rate": 1.9442549637482342e-05, - "loss": 0.9089, - "step": 4230 - }, - { - "epoch": 0.19633750579508577, - "grad_norm": 4.671785831451416, - "learning_rate": 1.9439882714054403e-05, - "loss": 0.9709, - "step": 4235 - }, - { - "epoch": 0.19656930922577653, - "grad_norm": 4.192035675048828, - "learning_rate": 1.943720961016334e-05, - "loss": 0.9624, - "step": 4240 - }, - { - "epoch": 0.19680111265646733, - "grad_norm": 5.137444019317627, - "learning_rate": 1.943453032755928e-05, - "loss": 1.0318, - "step": 4245 - }, - { - "epoch": 0.1970329160871581, - "grad_norm": 4.235700607299805, - "learning_rate": 1.9431844867996396e-05, - "loss": 0.897, - "step": 4250 - }, - { - "epoch": 0.19726471951784885, - "grad_norm": 4.324668884277344, - "learning_rate": 1.9429153233232912e-05, - "loss": 0.9075, - "step": 4255 - }, - { - "epoch": 0.19749652294853964, - "grad_norm": 5.024466514587402, - "learning_rate": 1.9426455425031085e-05, - "loss": 1.1617, - "step": 4260 - }, - { - "epoch": 0.1977283263792304, - "grad_norm": 3.738459348678589, - "learning_rate": 1.9423751445157225e-05, - "loss": 0.8962, - "step": 4265 - }, - { - "epoch": 0.1979601298099212, - "grad_norm": 4.7824625968933105, - "learning_rate": 1.9421041295381674e-05, - "loss": 1.0028, - "step": 4270 - }, - { - "epoch": 0.19819193324061196, - "grad_norm": 5.4419264793396, - "learning_rate": 1.9418324977478815e-05, - "loss": 1.1338, - "step": 4275 - }, - { - "epoch": 0.19842373667130273, - "grad_norm": 3.802666425704956, - "learning_rate": 1.941560249322707e-05, - "loss": 1.0708, - "step": 4280 - }, - { - "epoch": 0.19865554010199352, - "grad_norm": 4.5152788162231445, - "learning_rate": 1.9412873844408905e-05, - "loss": 1.0177, - "step": 4285 - }, - { - "epoch": 0.19888734353268428, - "grad_norm": 4.837648391723633, - "learning_rate": 1.941013903281081e-05, - "loss": 1.099, - "step": 4290 - }, - { - "epoch": 0.19911914696337507, - "grad_norm": 6.359385013580322, - "learning_rate": 1.940739806022332e-05, - "loss": 1.1031, - "step": 4295 - }, - { - "epoch": 0.19935095039406583, - "grad_norm": 3.1812539100646973, - "learning_rate": 1.9404650928440996e-05, - "loss": 0.9961, - "step": 4300 - }, - { - "epoch": 0.19935095039406583, - "eval_loss": 1.0196547508239746, - "eval_runtime": 11.2671, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 4300 - }, - { - "epoch": 0.1995827538247566, - "grad_norm": 3.9984710216522217, - "learning_rate": 1.940189763926244e-05, - "loss": 1.1403, - "step": 4305 - }, - { - "epoch": 0.1998145572554474, - "grad_norm": 4.785546779632568, - "learning_rate": 1.9399138194490273e-05, - "loss": 1.0475, - "step": 4310 - }, - { - "epoch": 0.20004636068613815, - "grad_norm": 4.264800548553467, - "learning_rate": 1.9396372595931165e-05, - "loss": 1.01, - "step": 4315 - }, - { - "epoch": 0.20027816411682892, - "grad_norm": 4.709249496459961, - "learning_rate": 1.9393600845395792e-05, - "loss": 0.8559, - "step": 4320 - }, - { - "epoch": 0.2005099675475197, - "grad_norm": 4.430532455444336, - "learning_rate": 1.9390822944698873e-05, - "loss": 0.8757, - "step": 4325 - }, - { - "epoch": 0.20074177097821047, - "grad_norm": 3.601813554763794, - "learning_rate": 1.9388038895659156e-05, - "loss": 1.0746, - "step": 4330 - }, - { - "epoch": 0.20097357440890126, - "grad_norm": 5.1407880783081055, - "learning_rate": 1.9385248700099402e-05, - "loss": 0.9955, - "step": 4335 - }, - { - "epoch": 0.20120537783959203, - "grad_norm": 4.2756829261779785, - "learning_rate": 1.9382452359846408e-05, - "loss": 0.9426, - "step": 4340 - }, - { - "epoch": 0.2014371812702828, - "grad_norm": 4.615556716918945, - "learning_rate": 1.937964987673098e-05, - "loss": 1.0964, - "step": 4345 - }, - { - "epoch": 0.20166898470097358, - "grad_norm": 4.2694244384765625, - "learning_rate": 1.937684125258797e-05, - "loss": 0.9078, - "step": 4350 - }, - { - "epoch": 0.20190078813166434, - "grad_norm": 3.894665241241455, - "learning_rate": 1.9374026489256217e-05, - "loss": 0.7885, - "step": 4355 - }, - { - "epoch": 0.20213259156235514, - "grad_norm": 3.197622537612915, - "learning_rate": 1.9371205588578613e-05, - "loss": 0.9278, - "step": 4360 - }, - { - "epoch": 0.2023643949930459, - "grad_norm": 4.220120906829834, - "learning_rate": 1.9368378552402046e-05, - "loss": 0.8669, - "step": 4365 - }, - { - "epoch": 0.20259619842373666, - "grad_norm": 4.15202522277832, - "learning_rate": 1.9365545382577433e-05, - "loss": 0.9815, - "step": 4370 - }, - { - "epoch": 0.20282800185442745, - "grad_norm": 4.307468414306641, - "learning_rate": 1.9362706080959694e-05, - "loss": 1.1502, - "step": 4375 - }, - { - "epoch": 0.20305980528511822, - "grad_norm": 4.232968807220459, - "learning_rate": 1.9359860649407775e-05, - "loss": 1.1307, - "step": 4380 - }, - { - "epoch": 0.20329160871580898, - "grad_norm": 4.220963954925537, - "learning_rate": 1.935700908978463e-05, - "loss": 1.1395, - "step": 4385 - }, - { - "epoch": 0.20352341214649977, - "grad_norm": 3.8981432914733887, - "learning_rate": 1.9354151403957234e-05, - "loss": 1.035, - "step": 4390 - }, - { - "epoch": 0.20375521557719053, - "grad_norm": 5.082888603210449, - "learning_rate": 1.935128759379656e-05, - "loss": 0.9727, - "step": 4395 - }, - { - "epoch": 0.20398701900788133, - "grad_norm": 4.414380073547363, - "learning_rate": 1.9348417661177597e-05, - "loss": 1.0242, - "step": 4400 - }, - { - "epoch": 0.20398701900788133, - "eval_loss": 1.0201265811920166, - "eval_runtime": 11.2759, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 4400 - }, - { - "epoch": 0.2042188224385721, - "grad_norm": 3.861715793609619, - "learning_rate": 1.9345541607979347e-05, - "loss": 0.9632, - "step": 4405 - }, - { - "epoch": 0.20445062586926285, - "grad_norm": 3.9358503818511963, - "learning_rate": 1.9342659436084808e-05, - "loss": 0.8809, - "step": 4410 - }, - { - "epoch": 0.20468242929995364, - "grad_norm": 4.636179447174072, - "learning_rate": 1.9339771147380993e-05, - "loss": 1.0324, - "step": 4415 - }, - { - "epoch": 0.2049142327306444, - "grad_norm": 3.6778595447540283, - "learning_rate": 1.9336876743758918e-05, - "loss": 1.1124, - "step": 4420 - }, - { - "epoch": 0.2051460361613352, - "grad_norm": 3.6706385612487793, - "learning_rate": 1.9333976227113598e-05, - "loss": 0.9638, - "step": 4425 - }, - { - "epoch": 0.20537783959202596, - "grad_norm": 4.206617832183838, - "learning_rate": 1.9331069599344058e-05, - "loss": 1.0878, - "step": 4430 - }, - { - "epoch": 0.20560964302271673, - "grad_norm": 4.7787017822265625, - "learning_rate": 1.9328156862353322e-05, - "loss": 0.9389, - "step": 4435 - }, - { - "epoch": 0.20584144645340752, - "grad_norm": 4.6298933029174805, - "learning_rate": 1.9325238018048405e-05, - "loss": 1.1185, - "step": 4440 - }, - { - "epoch": 0.20607324988409828, - "grad_norm": 4.0797858238220215, - "learning_rate": 1.9322313068340338e-05, - "loss": 0.9152, - "step": 4445 - }, - { - "epoch": 0.20630505331478907, - "grad_norm": 3.8898723125457764, - "learning_rate": 1.9319382015144127e-05, - "loss": 1.027, - "step": 4450 - }, - { - "epoch": 0.20653685674547984, - "grad_norm": 3.4612269401550293, - "learning_rate": 1.9316444860378792e-05, - "loss": 0.9548, - "step": 4455 - }, - { - "epoch": 0.2067686601761706, - "grad_norm": 3.6337077617645264, - "learning_rate": 1.9313501605967343e-05, - "loss": 0.8258, - "step": 4460 - }, - { - "epoch": 0.2070004636068614, - "grad_norm": 4.511351585388184, - "learning_rate": 1.9310552253836784e-05, - "loss": 1.0118, - "step": 4465 - }, - { - "epoch": 0.20723226703755215, - "grad_norm": 4.938009738922119, - "learning_rate": 1.93075968059181e-05, - "loss": 0.9311, - "step": 4470 - }, - { - "epoch": 0.20746407046824292, - "grad_norm": 4.323369026184082, - "learning_rate": 1.9304635264146287e-05, - "loss": 1.0904, - "step": 4475 - }, - { - "epoch": 0.2076958738989337, - "grad_norm": 4.58251428604126, - "learning_rate": 1.9301667630460312e-05, - "loss": 1.0012, - "step": 4480 - }, - { - "epoch": 0.20792767732962447, - "grad_norm": 3.8780105113983154, - "learning_rate": 1.9298693906803146e-05, - "loss": 0.978, - "step": 4485 - }, - { - "epoch": 0.20815948076031526, - "grad_norm": 4.129722595214844, - "learning_rate": 1.9295714095121733e-05, - "loss": 1.1244, - "step": 4490 - }, - { - "epoch": 0.20839128419100603, - "grad_norm": 4.224957466125488, - "learning_rate": 1.9292728197367017e-05, - "loss": 0.9721, - "step": 4495 - }, - { - "epoch": 0.2086230876216968, - "grad_norm": 4.470384120941162, - "learning_rate": 1.9289736215493914e-05, - "loss": 1.0162, - "step": 4500 - }, - { - "epoch": 0.2086230876216968, - "eval_loss": 1.0174404382705688, - "eval_runtime": 11.2755, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 4500 - }, - { - "epoch": 0.20885489105238758, - "grad_norm": 3.653148651123047, - "learning_rate": 1.9286738151461333e-05, - "loss": 0.8662, - "step": 4505 - }, - { - "epoch": 0.20908669448307834, - "grad_norm": 4.83407735824585, - "learning_rate": 1.9283734007232156e-05, - "loss": 1.0202, - "step": 4510 - }, - { - "epoch": 0.20931849791376914, - "grad_norm": 5.023587226867676, - "learning_rate": 1.9280723784773257e-05, - "loss": 1.0915, - "step": 4515 - }, - { - "epoch": 0.2095503013444599, - "grad_norm": 5.459712982177734, - "learning_rate": 1.9277707486055487e-05, - "loss": 1.1079, - "step": 4520 - }, - { - "epoch": 0.20978210477515066, - "grad_norm": 4.731924057006836, - "learning_rate": 1.9274685113053656e-05, - "loss": 1.1386, - "step": 4525 - }, - { - "epoch": 0.21001390820584145, - "grad_norm": 3.6431119441986084, - "learning_rate": 1.9271656667746583e-05, - "loss": 0.9088, - "step": 4530 - }, - { - "epoch": 0.21024571163653222, - "grad_norm": 3.588373899459839, - "learning_rate": 1.9268622152117035e-05, - "loss": 0.8291, - "step": 4535 - }, - { - "epoch": 0.21047751506722298, - "grad_norm": 5.668548107147217, - "learning_rate": 1.926558156815177e-05, - "loss": 1.1071, - "step": 4540 - }, - { - "epoch": 0.21070931849791377, - "grad_norm": 5.053746223449707, - "learning_rate": 1.9262534917841514e-05, - "loss": 0.8661, - "step": 4545 - }, - { - "epoch": 0.21094112192860454, - "grad_norm": 3.778334140777588, - "learning_rate": 1.9259482203180963e-05, - "loss": 0.9204, - "step": 4550 - }, - { - "epoch": 0.21117292535929533, - "grad_norm": 4.436891555786133, - "learning_rate": 1.9256423426168785e-05, - "loss": 0.8769, - "step": 4555 - }, - { - "epoch": 0.2114047287899861, - "grad_norm": 4.076458930969238, - "learning_rate": 1.9253358588807617e-05, - "loss": 0.9674, - "step": 4560 - }, - { - "epoch": 0.21163653222067685, - "grad_norm": 4.523109436035156, - "learning_rate": 1.9250287693104068e-05, - "loss": 0.904, - "step": 4565 - }, - { - "epoch": 0.21186833565136765, - "grad_norm": 4.488249778747559, - "learning_rate": 1.92472107410687e-05, - "loss": 0.9186, - "step": 4570 - }, - { - "epoch": 0.2121001390820584, - "grad_norm": 4.525340557098389, - "learning_rate": 1.9244127734716058e-05, - "loss": 0.8192, - "step": 4575 - }, - { - "epoch": 0.2123319425127492, - "grad_norm": 3.548717498779297, - "learning_rate": 1.9241038676064637e-05, - "loss": 0.9903, - "step": 4580 - }, - { - "epoch": 0.21256374594343996, - "grad_norm": 4.380881309509277, - "learning_rate": 1.9237943567136905e-05, - "loss": 1.0492, - "step": 4585 - }, - { - "epoch": 0.21279554937413073, - "grad_norm": 4.449493408203125, - "learning_rate": 1.9234842409959283e-05, - "loss": 1.1464, - "step": 4590 - }, - { - "epoch": 0.21302735280482152, - "grad_norm": 4.247632026672363, - "learning_rate": 1.9231735206562154e-05, - "loss": 0.8259, - "step": 4595 - }, - { - "epoch": 0.21325915623551228, - "grad_norm": 3.8416435718536377, - "learning_rate": 1.9228621958979862e-05, - "loss": 1.0981, - "step": 4600 - }, - { - "epoch": 0.21325915623551228, - "eval_loss": 1.0158852338790894, - "eval_runtime": 11.2758, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 4600 - }, - { - "epoch": 0.21349095966620307, - "grad_norm": 4.114621162414551, - "learning_rate": 1.922550266925071e-05, - "loss": 0.9712, - "step": 4605 - }, - { - "epoch": 0.21372276309689384, - "grad_norm": 4.465586185455322, - "learning_rate": 1.9222377339416945e-05, - "loss": 0.9613, - "step": 4610 - }, - { - "epoch": 0.2139545665275846, - "grad_norm": 5.107663154602051, - "learning_rate": 1.921924597152479e-05, - "loss": 1.0306, - "step": 4615 - }, - { - "epoch": 0.2141863699582754, - "grad_norm": 3.389920949935913, - "learning_rate": 1.9216108567624397e-05, - "loss": 0.7988, - "step": 4620 - }, - { - "epoch": 0.21441817338896615, - "grad_norm": 4.314112186431885, - "learning_rate": 1.921296512976989e-05, - "loss": 0.9554, - "step": 4625 - }, - { - "epoch": 0.21464997681965692, - "grad_norm": 4.409841060638428, - "learning_rate": 1.920981566001933e-05, - "loss": 0.9666, - "step": 4630 - }, - { - "epoch": 0.2148817802503477, - "grad_norm": 3.8194494247436523, - "learning_rate": 1.9206660160434733e-05, - "loss": 1.0888, - "step": 4635 - }, - { - "epoch": 0.21511358368103847, - "grad_norm": 4.668557167053223, - "learning_rate": 1.9203498633082062e-05, - "loss": 0.9607, - "step": 4640 - }, - { - "epoch": 0.21534538711172926, - "grad_norm": 4.180658340454102, - "learning_rate": 1.9200331080031228e-05, - "loss": 1.1876, - "step": 4645 - }, - { - "epoch": 0.21557719054242003, - "grad_norm": 3.8917407989501953, - "learning_rate": 1.9197157503356084e-05, - "loss": 0.8905, - "step": 4650 - }, - { - "epoch": 0.2158089939731108, - "grad_norm": 3.4583301544189453, - "learning_rate": 1.919397790513443e-05, - "loss": 1.0806, - "step": 4655 - }, - { - "epoch": 0.21604079740380158, - "grad_norm": 4.301894664764404, - "learning_rate": 1.9190792287448007e-05, - "loss": 1.0075, - "step": 4660 - }, - { - "epoch": 0.21627260083449235, - "grad_norm": 3.784499168395996, - "learning_rate": 1.9187600652382495e-05, - "loss": 0.8791, - "step": 4665 - }, - { - "epoch": 0.21650440426518314, - "grad_norm": 4.241847515106201, - "learning_rate": 1.9184403002027516e-05, - "loss": 0.8495, - "step": 4670 - }, - { - "epoch": 0.2167362076958739, - "grad_norm": 5.300185203552246, - "learning_rate": 1.918119933847663e-05, - "loss": 0.9325, - "step": 4675 - }, - { - "epoch": 0.21696801112656466, - "grad_norm": 4.148271560668945, - "learning_rate": 1.9177989663827336e-05, - "loss": 1.131, - "step": 4680 - }, - { - "epoch": 0.21719981455725546, - "grad_norm": 4.2981696128845215, - "learning_rate": 1.9174773980181066e-05, - "loss": 1.1398, - "step": 4685 - }, - { - "epoch": 0.21743161798794622, - "grad_norm": 4.249345302581787, - "learning_rate": 1.917155228964318e-05, - "loss": 0.9213, - "step": 4690 - }, - { - "epoch": 0.21766342141863698, - "grad_norm": 3.92807674407959, - "learning_rate": 1.9168324594322988e-05, - "loss": 0.9589, - "step": 4695 - }, - { - "epoch": 0.21789522484932777, - "grad_norm": 4.179681777954102, - "learning_rate": 1.9165090896333717e-05, - "loss": 1.0402, - "step": 4700 - }, - { - "epoch": 0.21789522484932777, - "eval_loss": 1.0108238458633423, - "eval_runtime": 11.2618, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 4700 - }, - { - "epoch": 0.21812702828001854, - "grad_norm": 4.779577732086182, - "learning_rate": 1.9161851197792523e-05, - "loss": 0.9088, - "step": 4705 - }, - { - "epoch": 0.21835883171070933, - "grad_norm": 4.182905673980713, - "learning_rate": 1.91586055008205e-05, - "loss": 1.0959, - "step": 4710 - }, - { - "epoch": 0.2185906351414001, - "grad_norm": 5.201434135437012, - "learning_rate": 1.9155353807542666e-05, - "loss": 1.1631, - "step": 4715 - }, - { - "epoch": 0.21882243857209085, - "grad_norm": 4.404604911804199, - "learning_rate": 1.915209612008796e-05, - "loss": 1.1433, - "step": 4720 - }, - { - "epoch": 0.21905424200278165, - "grad_norm": 3.8333659172058105, - "learning_rate": 1.9148832440589252e-05, - "loss": 0.9584, - "step": 4725 - }, - { - "epoch": 0.2192860454334724, - "grad_norm": 4.002883434295654, - "learning_rate": 1.9145562771183333e-05, - "loss": 1.0783, - "step": 4730 - }, - { - "epoch": 0.2195178488641632, - "grad_norm": 4.002244472503662, - "learning_rate": 1.9142287114010915e-05, - "loss": 1.1606, - "step": 4735 - }, - { - "epoch": 0.21974965229485396, - "grad_norm": 4.178402900695801, - "learning_rate": 1.9139005471216627e-05, - "loss": 1.0444, - "step": 4740 - }, - { - "epoch": 0.21998145572554473, - "grad_norm": 4.380372047424316, - "learning_rate": 1.9135717844949023e-05, - "loss": 1.001, - "step": 4745 - }, - { - "epoch": 0.22021325915623552, - "grad_norm": 5.0303168296813965, - "learning_rate": 1.9132424237360572e-05, - "loss": 0.9649, - "step": 4750 - }, - { - "epoch": 0.22044506258692628, - "grad_norm": 4.722301483154297, - "learning_rate": 1.912912465060766e-05, - "loss": 1.0605, - "step": 4755 - }, - { - "epoch": 0.22067686601761707, - "grad_norm": 4.6185102462768555, - "learning_rate": 1.9125819086850582e-05, - "loss": 0.9996, - "step": 4760 - }, - { - "epoch": 0.22090866944830784, - "grad_norm": 4.793702602386475, - "learning_rate": 1.9122507548253555e-05, - "loss": 1.0098, - "step": 4765 - }, - { - "epoch": 0.2211404728789986, - "grad_norm": 4.518040180206299, - "learning_rate": 1.9119190036984702e-05, - "loss": 0.8762, - "step": 4770 - }, - { - "epoch": 0.2213722763096894, - "grad_norm": 3.88997745513916, - "learning_rate": 1.9115866555216058e-05, - "loss": 1.1243, - "step": 4775 - }, - { - "epoch": 0.22160407974038016, - "grad_norm": 0.39716479182243347, - "learning_rate": 1.9112537105123567e-05, - "loss": 0.9426, - "step": 4780 - }, - { - "epoch": 0.22183588317107092, - "grad_norm": 3.8836007118225098, - "learning_rate": 1.910920168888708e-05, - "loss": 0.9133, - "step": 4785 - }, - { - "epoch": 0.2220676866017617, - "grad_norm": 4.020860195159912, - "learning_rate": 1.910586030869036e-05, - "loss": 1.0884, - "step": 4790 - }, - { - "epoch": 0.22229949003245247, - "grad_norm": 3.6284425258636475, - "learning_rate": 1.9102512966721063e-05, - "loss": 1.0384, - "step": 4795 - }, - { - "epoch": 0.22253129346314326, - "grad_norm": 4.15256929397583, - "learning_rate": 1.9099159665170758e-05, - "loss": 0.9094, - "step": 4800 - }, - { - "epoch": 0.22253129346314326, - "eval_loss": 1.0125683546066284, - "eval_runtime": 11.2696, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 4800 - }, - { - "epoch": 0.22276309689383403, - "grad_norm": 5.169234275817871, - "learning_rate": 1.909580040623491e-05, - "loss": 1.1009, - "step": 4805 - }, - { - "epoch": 0.2229949003245248, - "grad_norm": 3.830695867538452, - "learning_rate": 1.909243519211289e-05, - "loss": 0.8057, - "step": 4810 - }, - { - "epoch": 0.22322670375521558, - "grad_norm": 3.351536989212036, - "learning_rate": 1.9089064025007958e-05, - "loss": 0.8803, - "step": 4815 - }, - { - "epoch": 0.22345850718590635, - "grad_norm": 3.44328236579895, - "learning_rate": 1.908568690712729e-05, - "loss": 0.8897, - "step": 4820 - }, - { - "epoch": 0.22369031061659714, - "grad_norm": 3.884352922439575, - "learning_rate": 1.908230384068194e-05, - "loss": 1.0575, - "step": 4825 - }, - { - "epoch": 0.2239221140472879, - "grad_norm": 3.6071789264678955, - "learning_rate": 1.907891482788686e-05, - "loss": 1.0257, - "step": 4830 - }, - { - "epoch": 0.22415391747797866, - "grad_norm": 4.853216648101807, - "learning_rate": 1.9075519870960906e-05, - "loss": 0.9984, - "step": 4835 - }, - { - "epoch": 0.22438572090866946, - "grad_norm": 4.0743632316589355, - "learning_rate": 1.9072118972126814e-05, - "loss": 1.0508, - "step": 4840 - }, - { - "epoch": 0.22461752433936022, - "grad_norm": 3.989567995071411, - "learning_rate": 1.9068712133611216e-05, - "loss": 0.9234, - "step": 4845 - }, - { - "epoch": 0.22484932777005098, - "grad_norm": 4.845396518707275, - "learning_rate": 1.9065299357644628e-05, - "loss": 0.8949, - "step": 4850 - }, - { - "epoch": 0.22508113120074177, - "grad_norm": 3.8650968074798584, - "learning_rate": 1.9061880646461457e-05, - "loss": 0.9957, - "step": 4855 - }, - { - "epoch": 0.22531293463143254, - "grad_norm": 4.8118181228637695, - "learning_rate": 1.90584560023e-05, - "loss": 0.8788, - "step": 4860 - }, - { - "epoch": 0.22554473806212333, - "grad_norm": 3.8270719051361084, - "learning_rate": 1.905502542740243e-05, - "loss": 1.0374, - "step": 4865 - }, - { - "epoch": 0.2257765414928141, - "grad_norm": 4.53631067276001, - "learning_rate": 1.9051588924014804e-05, - "loss": 0.9582, - "step": 4870 - }, - { - "epoch": 0.22600834492350486, - "grad_norm": 3.948395013809204, - "learning_rate": 1.904814649438707e-05, - "loss": 0.9951, - "step": 4875 - }, - { - "epoch": 0.22624014835419565, - "grad_norm": 7.741011142730713, - "learning_rate": 1.904469814077305e-05, - "loss": 0.9671, - "step": 4880 - }, - { - "epoch": 0.2264719517848864, - "grad_norm": 4.1785712242126465, - "learning_rate": 1.9041243865430436e-05, - "loss": 1.0577, - "step": 4885 - }, - { - "epoch": 0.2267037552155772, - "grad_norm": 4.016406059265137, - "learning_rate": 1.903778367062081e-05, - "loss": 0.8957, - "step": 4890 - }, - { - "epoch": 0.22693555864626797, - "grad_norm": 3.7880759239196777, - "learning_rate": 1.9034317558609624e-05, - "loss": 1.0002, - "step": 4895 - }, - { - "epoch": 0.22716736207695873, - "grad_norm": 3.6266255378723145, - "learning_rate": 1.9030845531666203e-05, - "loss": 0.7714, - "step": 4900 - }, - { - "epoch": 0.22716736207695873, - "eval_loss": 1.0120409727096558, - "eval_runtime": 11.2606, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 4900 - }, - { - "epoch": 0.22739916550764952, - "grad_norm": 4.234470367431641, - "learning_rate": 1.902736759206375e-05, - "loss": 1.0156, - "step": 4905 - }, - { - "epoch": 0.22763096893834028, - "grad_norm": 3.4304628372192383, - "learning_rate": 1.9023883742079337e-05, - "loss": 1.0283, - "step": 4910 - }, - { - "epoch": 0.22786277236903107, - "grad_norm": 4.131404399871826, - "learning_rate": 1.9020393983993895e-05, - "loss": 0.9577, - "step": 4915 - }, - { - "epoch": 0.22809457579972184, - "grad_norm": 3.733011484146118, - "learning_rate": 1.9016898320092242e-05, - "loss": 0.9742, - "step": 4920 - }, - { - "epoch": 0.2283263792304126, - "grad_norm": 4.668416976928711, - "learning_rate": 1.9013396752663044e-05, - "loss": 1.0222, - "step": 4925 - }, - { - "epoch": 0.2285581826611034, - "grad_norm": 4.214738845825195, - "learning_rate": 1.9009889283998847e-05, - "loss": 0.9623, - "step": 4930 - }, - { - "epoch": 0.22878998609179416, - "grad_norm": 3.838366985321045, - "learning_rate": 1.900637591639605e-05, - "loss": 0.9191, - "step": 4935 - }, - { - "epoch": 0.22902178952248492, - "grad_norm": 3.8003344535827637, - "learning_rate": 1.9002856652154925e-05, - "loss": 0.8639, - "step": 4940 - }, - { - "epoch": 0.2292535929531757, - "grad_norm": 3.651313543319702, - "learning_rate": 1.8999331493579594e-05, - "loss": 1.012, - "step": 4945 - }, - { - "epoch": 0.22948539638386647, - "grad_norm": 4.221668720245361, - "learning_rate": 1.8995800442978044e-05, - "loss": 0.9683, - "step": 4950 - }, - { - "epoch": 0.22971719981455727, - "grad_norm": 4.224164009094238, - "learning_rate": 1.8992263502662114e-05, - "loss": 1.0671, - "step": 4955 - }, - { - "epoch": 0.22994900324524803, - "grad_norm": 4.996068000793457, - "learning_rate": 1.8988720674947512e-05, - "loss": 1.1553, - "step": 4960 - }, - { - "epoch": 0.2301808066759388, - "grad_norm": 4.583253860473633, - "learning_rate": 1.8985171962153784e-05, - "loss": 0.9906, - "step": 4965 - }, - { - "epoch": 0.23041261010662958, - "grad_norm": 4.157360553741455, - "learning_rate": 1.898161736660434e-05, - "loss": 1.017, - "step": 4970 - }, - { - "epoch": 0.23064441353732035, - "grad_norm": 3.966968297958374, - "learning_rate": 1.897805689062644e-05, - "loss": 1.0456, - "step": 4975 - }, - { - "epoch": 0.23087621696801114, - "grad_norm": 5.055805683135986, - "learning_rate": 1.8974490536551186e-05, - "loss": 0.9645, - "step": 4980 - }, - { - "epoch": 0.2311080203987019, - "grad_norm": 5.330160617828369, - "learning_rate": 1.897091830671354e-05, - "loss": 0.9688, - "step": 4985 - }, - { - "epoch": 0.23133982382939267, - "grad_norm": 4.764847755432129, - "learning_rate": 1.896734020345231e-05, - "loss": 1.0223, - "step": 4990 - }, - { - "epoch": 0.23157162726008346, - "grad_norm": 3.9281156063079834, - "learning_rate": 1.896375622911014e-05, - "loss": 0.9495, - "step": 4995 - }, - { - "epoch": 0.23180343069077422, - "grad_norm": 4.123696327209473, - "learning_rate": 1.896016638603352e-05, - "loss": 0.8876, - "step": 5000 - }, - { - "epoch": 0.23180343069077422, - "eval_loss": 1.0095674991607666, - "eval_runtime": 11.2932, - "eval_samples_per_second": 11.246, - "eval_steps_per_second": 11.246, - "step": 5000 - }, - { - "epoch": 0.23203523412146498, - "grad_norm": 4.128748416900635, - "learning_rate": 1.8956570676572793e-05, - "loss": 0.9017, - "step": 5005 - }, - { - "epoch": 0.23226703755215578, - "grad_norm": 4.058050155639648, - "learning_rate": 1.895296910308213e-05, - "loss": 0.9553, - "step": 5010 - }, - { - "epoch": 0.23249884098284654, - "grad_norm": 4.250258445739746, - "learning_rate": 1.8949361667919552e-05, - "loss": 0.9679, - "step": 5015 - }, - { - "epoch": 0.23273064441353733, - "grad_norm": 5.300534248352051, - "learning_rate": 1.8945748373446904e-05, - "loss": 0.9784, - "step": 5020 - }, - { - "epoch": 0.2329624478442281, - "grad_norm": 4.7261881828308105, - "learning_rate": 1.894212922202989e-05, - "loss": 1.0984, - "step": 5025 - }, - { - "epoch": 0.23319425127491886, - "grad_norm": 4.31865930557251, - "learning_rate": 1.893850421603802e-05, - "loss": 1.1129, - "step": 5030 - }, - { - "epoch": 0.23342605470560965, - "grad_norm": 3.7286322116851807, - "learning_rate": 1.8934873357844654e-05, - "loss": 0.9771, - "step": 5035 - }, - { - "epoch": 0.2336578581363004, - "grad_norm": 3.7164711952209473, - "learning_rate": 1.8931236649826988e-05, - "loss": 0.8744, - "step": 5040 - }, - { - "epoch": 0.2338896615669912, - "grad_norm": 4.368716716766357, - "learning_rate": 1.892759409436603e-05, - "loss": 0.9397, - "step": 5045 - }, - { - "epoch": 0.23412146499768197, - "grad_norm": 3.451289415359497, - "learning_rate": 1.8923945693846642e-05, - "loss": 0.9761, - "step": 5050 - }, - { - "epoch": 0.23435326842837273, - "grad_norm": 4.144486904144287, - "learning_rate": 1.8920291450657487e-05, - "loss": 0.959, - "step": 5055 - }, - { - "epoch": 0.23458507185906352, - "grad_norm": 4.257894515991211, - "learning_rate": 1.8916631367191072e-05, - "loss": 0.8989, - "step": 5060 - }, - { - "epoch": 0.23481687528975428, - "grad_norm": 4.326287269592285, - "learning_rate": 1.891296544584371e-05, - "loss": 0.9968, - "step": 5065 - }, - { - "epoch": 0.23504867872044508, - "grad_norm": 5.085713863372803, - "learning_rate": 1.890929368901556e-05, - "loss": 0.9061, - "step": 5070 - }, - { - "epoch": 0.23528048215113584, - "grad_norm": 3.4437763690948486, - "learning_rate": 1.8905616099110578e-05, - "loss": 0.9122, - "step": 5075 - }, - { - "epoch": 0.2355122855818266, - "grad_norm": 3.675459384918213, - "learning_rate": 1.890193267853655e-05, - "loss": 0.9841, - "step": 5080 - }, - { - "epoch": 0.2357440890125174, - "grad_norm": 3.7504916191101074, - "learning_rate": 1.8898243429705082e-05, - "loss": 0.9003, - "step": 5085 - }, - { - "epoch": 0.23597589244320816, - "grad_norm": 5.602362155914307, - "learning_rate": 1.889454835503159e-05, - "loss": 1.0204, - "step": 5090 - }, - { - "epoch": 0.23620769587389892, - "grad_norm": 4.032191753387451, - "learning_rate": 1.8890847456935305e-05, - "loss": 0.8905, - "step": 5095 - }, - { - "epoch": 0.2364394993045897, - "grad_norm": 3.700024127960205, - "learning_rate": 1.888714073783928e-05, - "loss": 0.9313, - "step": 5100 - }, - { - "epoch": 0.2364394993045897, - "eval_loss": 1.0069371461868286, - "eval_runtime": 11.262, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 5100 - }, - { - "epoch": 0.23667130273528048, - "grad_norm": 3.9474759101867676, - "learning_rate": 1.8883428200170365e-05, - "loss": 0.9212, - "step": 5105 - }, - { - "epoch": 0.23690310616597127, - "grad_norm": 4.089510440826416, - "learning_rate": 1.8879709846359227e-05, - "loss": 1.0069, - "step": 5110 - }, - { - "epoch": 0.23713490959666203, - "grad_norm": 3.7710094451904297, - "learning_rate": 1.887598567884034e-05, - "loss": 0.9985, - "step": 5115 - }, - { - "epoch": 0.2373667130273528, - "grad_norm": 4.190286636352539, - "learning_rate": 1.8872255700051988e-05, - "loss": 0.8677, - "step": 5120 - }, - { - "epoch": 0.23759851645804358, - "grad_norm": 3.9296810626983643, - "learning_rate": 1.8868519912436247e-05, - "loss": 0.9758, - "step": 5125 - }, - { - "epoch": 0.23783031988873435, - "grad_norm": 3.989487409591675, - "learning_rate": 1.8864778318439013e-05, - "loss": 0.9516, - "step": 5130 - }, - { - "epoch": 0.23806212331942514, - "grad_norm": 4.058485984802246, - "learning_rate": 1.8861030920509976e-05, - "loss": 0.9176, - "step": 5135 - }, - { - "epoch": 0.2382939267501159, - "grad_norm": 4.617326736450195, - "learning_rate": 1.885727772110262e-05, - "loss": 0.9109, - "step": 5140 - }, - { - "epoch": 0.23852573018080667, - "grad_norm": 4.54904317855835, - "learning_rate": 1.8853518722674237e-05, - "loss": 1.1414, - "step": 5145 - }, - { - "epoch": 0.23875753361149746, - "grad_norm": 3.805340528488159, - "learning_rate": 1.8849753927685915e-05, - "loss": 1.0677, - "step": 5150 - }, - { - "epoch": 0.23898933704218822, - "grad_norm": 4.674151420593262, - "learning_rate": 1.8845983338602524e-05, - "loss": 0.9734, - "step": 5155 - }, - { - "epoch": 0.23922114047287898, - "grad_norm": 4.656925201416016, - "learning_rate": 1.884220695789275e-05, - "loss": 1.083, - "step": 5160 - }, - { - "epoch": 0.23945294390356978, - "grad_norm": 4.5245256423950195, - "learning_rate": 1.8838424788029047e-05, - "loss": 1.0778, - "step": 5165 - }, - { - "epoch": 0.23968474733426054, - "grad_norm": 6.588644981384277, - "learning_rate": 1.8834636831487673e-05, - "loss": 1.0181, - "step": 5170 - }, - { - "epoch": 0.23991655076495133, - "grad_norm": 3.7076587677001953, - "learning_rate": 1.8830843090748678e-05, - "loss": 0.9473, - "step": 5175 - }, - { - "epoch": 0.2401483541956421, - "grad_norm": 4.623571395874023, - "learning_rate": 1.8827043568295887e-05, - "loss": 1.0638, - "step": 5180 - }, - { - "epoch": 0.24038015762633286, - "grad_norm": 4.6283135414123535, - "learning_rate": 1.8823238266616922e-05, - "loss": 1.0713, - "step": 5185 - }, - { - "epoch": 0.24061196105702365, - "grad_norm": 3.885566473007202, - "learning_rate": 1.8819427188203177e-05, - "loss": 0.9161, - "step": 5190 - }, - { - "epoch": 0.2408437644877144, - "grad_norm": 4.2143402099609375, - "learning_rate": 1.881561033554984e-05, - "loss": 1.1575, - "step": 5195 - }, - { - "epoch": 0.2410755679184052, - "grad_norm": 4.28688907623291, - "learning_rate": 1.881178771115587e-05, - "loss": 1.0844, - "step": 5200 - }, - { - "epoch": 0.2410755679184052, - "eval_loss": 1.0090265274047852, - "eval_runtime": 11.2654, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 5200 - }, - { - "epoch": 0.24130737134909597, - "grad_norm": 6.4583306312561035, - "learning_rate": 1.880795931752401e-05, - "loss": 1.1072, - "step": 5205 - }, - { - "epoch": 0.24153917477978673, - "grad_norm": 3.783980369567871, - "learning_rate": 1.8804125157160777e-05, - "loss": 1.0074, - "step": 5210 - }, - { - "epoch": 0.24177097821047752, - "grad_norm": 3.996311902999878, - "learning_rate": 1.8800285232576473e-05, - "loss": 1.0447, - "step": 5215 - }, - { - "epoch": 0.24200278164116829, - "grad_norm": 4.2546467781066895, - "learning_rate": 1.879643954628516e-05, - "loss": 0.9928, - "step": 5220 - }, - { - "epoch": 0.24223458507185908, - "grad_norm": 3.8486084938049316, - "learning_rate": 1.8792588100804677e-05, - "loss": 1.0198, - "step": 5225 - }, - { - "epoch": 0.24246638850254984, - "grad_norm": 3.533219575881958, - "learning_rate": 1.8788730898656645e-05, - "loss": 0.9151, - "step": 5230 - }, - { - "epoch": 0.2426981919332406, - "grad_norm": 4.661604404449463, - "learning_rate": 1.878486794236643e-05, - "loss": 1.0086, - "step": 5235 - }, - { - "epoch": 0.2429299953639314, - "grad_norm": 3.634969472885132, - "learning_rate": 1.8780999234463195e-05, - "loss": 1.144, - "step": 5240 - }, - { - "epoch": 0.24316179879462216, - "grad_norm": 4.38676643371582, - "learning_rate": 1.8777124777479846e-05, - "loss": 1.0801, - "step": 5245 - }, - { - "epoch": 0.24339360222531292, - "grad_norm": 3.5356669425964355, - "learning_rate": 1.8773244573953058e-05, - "loss": 0.6998, - "step": 5250 - }, - { - "epoch": 0.2436254056560037, - "grad_norm": 3.8418920040130615, - "learning_rate": 1.8769358626423276e-05, - "loss": 0.818, - "step": 5255 - }, - { - "epoch": 0.24385720908669448, - "grad_norm": 4.476934432983398, - "learning_rate": 1.87654669374347e-05, - "loss": 1.0859, - "step": 5260 - }, - { - "epoch": 0.24408901251738527, - "grad_norm": 4.2773566246032715, - "learning_rate": 1.8761569509535284e-05, - "loss": 1.1232, - "step": 5265 - }, - { - "epoch": 0.24432081594807603, - "grad_norm": 3.773895740509033, - "learning_rate": 1.8757666345276754e-05, - "loss": 0.8429, - "step": 5270 - }, - { - "epoch": 0.2445526193787668, - "grad_norm": 3.9892804622650146, - "learning_rate": 1.8753757447214574e-05, - "loss": 1.0794, - "step": 5275 - }, - { - "epoch": 0.24478442280945759, - "grad_norm": 4.0765767097473145, - "learning_rate": 1.874984281790798e-05, - "loss": 0.9931, - "step": 5280 - }, - { - "epoch": 0.24501622624014835, - "grad_norm": 3.730342388153076, - "learning_rate": 1.8745922459919944e-05, - "loss": 0.8929, - "step": 5285 - }, - { - "epoch": 0.24524802967083914, - "grad_norm": 4.401835918426514, - "learning_rate": 1.8741996375817196e-05, - "loss": 1.0135, - "step": 5290 - }, - { - "epoch": 0.2454798331015299, - "grad_norm": 3.8997461795806885, - "learning_rate": 1.8738064568170217e-05, - "loss": 0.9449, - "step": 5295 - }, - { - "epoch": 0.24571163653222067, - "grad_norm": 4.102304935455322, - "learning_rate": 1.8734127039553235e-05, - "loss": 0.8591, - "step": 5300 - }, - { - "epoch": 0.24571163653222067, - "eval_loss": 1.0059479475021362, - "eval_runtime": 11.265, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 5300 - }, - { - "epoch": 0.24594343996291146, - "grad_norm": 4.207913875579834, - "learning_rate": 1.8730183792544218e-05, - "loss": 1.1101, - "step": 5305 - }, - { - "epoch": 0.24617524339360222, - "grad_norm": 3.844038963317871, - "learning_rate": 1.8726234829724883e-05, - "loss": 0.9573, - "step": 5310 - }, - { - "epoch": 0.24640704682429299, - "grad_norm": 4.369620323181152, - "learning_rate": 1.8722280153680688e-05, - "loss": 0.9645, - "step": 5315 - }, - { - "epoch": 0.24663885025498378, - "grad_norm": 5.992620944976807, - "learning_rate": 1.871831976700083e-05, - "loss": 1.0597, - "step": 5320 - }, - { - "epoch": 0.24687065368567454, - "grad_norm": 5.34641695022583, - "learning_rate": 1.8714353672278246e-05, - "loss": 1.0403, - "step": 5325 - }, - { - "epoch": 0.24710245711636533, - "grad_norm": 3.386537551879883, - "learning_rate": 1.8710381872109616e-05, - "loss": 0.8237, - "step": 5330 - }, - { - "epoch": 0.2473342605470561, - "grad_norm": 3.749030828475952, - "learning_rate": 1.8706404369095344e-05, - "loss": 0.9373, - "step": 5335 - }, - { - "epoch": 0.24756606397774686, - "grad_norm": 3.7837107181549072, - "learning_rate": 1.8702421165839578e-05, - "loss": 1.0784, - "step": 5340 - }, - { - "epoch": 0.24779786740843765, - "grad_norm": 4.194408416748047, - "learning_rate": 1.869843226495019e-05, - "loss": 1.0148, - "step": 5345 - }, - { - "epoch": 0.2480296708391284, - "grad_norm": 4.585747241973877, - "learning_rate": 1.869443766903878e-05, - "loss": 0.8059, - "step": 5350 - }, - { - "epoch": 0.2482614742698192, - "grad_norm": 3.960101842880249, - "learning_rate": 1.8690437380720697e-05, - "loss": 0.9932, - "step": 5355 - }, - { - "epoch": 0.24849327770050997, - "grad_norm": 3.7180113792419434, - "learning_rate": 1.8686431402614996e-05, - "loss": 1.0071, - "step": 5360 - }, - { - "epoch": 0.24872508113120073, - "grad_norm": 3.8417141437530518, - "learning_rate": 1.8682419737344465e-05, - "loss": 0.9517, - "step": 5365 - }, - { - "epoch": 0.24895688456189152, - "grad_norm": 4.174343109130859, - "learning_rate": 1.8678402387535608e-05, - "loss": 0.9137, - "step": 5370 - }, - { - "epoch": 0.2491886879925823, - "grad_norm": 3.7449638843536377, - "learning_rate": 1.8674379355818664e-05, - "loss": 0.8484, - "step": 5375 - }, - { - "epoch": 0.24942049142327308, - "grad_norm": 3.8778748512268066, - "learning_rate": 1.8670350644827585e-05, - "loss": 0.9259, - "step": 5380 - }, - { - "epoch": 0.24965229485396384, - "grad_norm": 4.755029678344727, - "learning_rate": 1.8666316257200035e-05, - "loss": 1.1386, - "step": 5385 - }, - { - "epoch": 0.2498840982846546, - "grad_norm": 4.226649761199951, - "learning_rate": 1.8662276195577407e-05, - "loss": 0.9777, - "step": 5390 - }, - { - "epoch": 0.25011590171534537, - "grad_norm": 4.788735866546631, - "learning_rate": 1.8658230462604804e-05, - "loss": 0.9213, - "step": 5395 - }, - { - "epoch": 0.25034770514603616, - "grad_norm": 4.801352500915527, - "learning_rate": 1.8654179060931035e-05, - "loss": 1.0886, - "step": 5400 - }, - { - "epoch": 0.25034770514603616, - "eval_loss": 1.0038655996322632, - "eval_runtime": 11.266, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 5400 - }, - { - "epoch": 0.25057950857672695, - "grad_norm": 4.064159870147705, - "learning_rate": 1.8650121993208632e-05, - "loss": 0.8917, - "step": 5405 - }, - { - "epoch": 0.2508113120074177, - "grad_norm": 3.9093940258026123, - "learning_rate": 1.864605926209383e-05, - "loss": 1.015, - "step": 5410 - }, - { - "epoch": 0.2510431154381085, - "grad_norm": 3.832493305206299, - "learning_rate": 1.8641990870246568e-05, - "loss": 0.893, - "step": 5415 - }, - { - "epoch": 0.25127491886879927, - "grad_norm": 3.6580631732940674, - "learning_rate": 1.86379168203305e-05, - "loss": 0.9733, - "step": 5420 - }, - { - "epoch": 0.25150672229949006, - "grad_norm": 4.236906051635742, - "learning_rate": 1.8633837115012985e-05, - "loss": 0.8722, - "step": 5425 - }, - { - "epoch": 0.2517385257301808, - "grad_norm": 4.54565954208374, - "learning_rate": 1.8629751756965073e-05, - "loss": 1.0042, - "step": 5430 - }, - { - "epoch": 0.2519703291608716, - "grad_norm": 4.301955699920654, - "learning_rate": 1.8625660748861528e-05, - "loss": 0.8718, - "step": 5435 - }, - { - "epoch": 0.2522021325915624, - "grad_norm": 3.71239972114563, - "learning_rate": 1.8621564093380805e-05, - "loss": 1.0586, - "step": 5440 - }, - { - "epoch": 0.2524339360222531, - "grad_norm": 4.631463050842285, - "learning_rate": 1.861746179320506e-05, - "loss": 0.8433, - "step": 5445 - }, - { - "epoch": 0.2526657394529439, - "grad_norm": 3.705151319503784, - "learning_rate": 1.861335385102014e-05, - "loss": 0.9236, - "step": 5450 - }, - { - "epoch": 0.2528975428836347, - "grad_norm": 4.138430595397949, - "learning_rate": 1.8609240269515595e-05, - "loss": 0.8659, - "step": 5455 - }, - { - "epoch": 0.25312934631432543, - "grad_norm": 3.9012696743011475, - "learning_rate": 1.8605121051384657e-05, - "loss": 0.8742, - "step": 5460 - }, - { - "epoch": 0.2533611497450162, - "grad_norm": 5.521048069000244, - "learning_rate": 1.8600996199324253e-05, - "loss": 1.1732, - "step": 5465 - }, - { - "epoch": 0.253592953175707, - "grad_norm": 3.5360827445983887, - "learning_rate": 1.8596865716035005e-05, - "loss": 0.9477, - "step": 5470 - }, - { - "epoch": 0.25382475660639775, - "grad_norm": 3.5689191818237305, - "learning_rate": 1.859272960422121e-05, - "loss": 0.9334, - "step": 5475 - }, - { - "epoch": 0.25405656003708854, - "grad_norm": 4.756504058837891, - "learning_rate": 1.8588587866590858e-05, - "loss": 1.0424, - "step": 5480 - }, - { - "epoch": 0.25428836346777933, - "grad_norm": 3.3674094676971436, - "learning_rate": 1.8584440505855622e-05, - "loss": 0.7284, - "step": 5485 - }, - { - "epoch": 0.2545201668984701, - "grad_norm": 4.216386795043945, - "learning_rate": 1.8580287524730846e-05, - "loss": 0.9278, - "step": 5490 - }, - { - "epoch": 0.25475197032916086, - "grad_norm": 6.581062316894531, - "learning_rate": 1.857612892593557e-05, - "loss": 1.0525, - "step": 5495 - }, - { - "epoch": 0.25498377375985165, - "grad_norm": 3.360948085784912, - "learning_rate": 1.8571964712192504e-05, - "loss": 0.8063, - "step": 5500 - }, - { - "epoch": 0.25498377375985165, - "eval_loss": 1.003278136253357, - "eval_runtime": 11.2614, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 5500 - }, - { - "epoch": 0.25521557719054244, - "grad_norm": 3.8659751415252686, - "learning_rate": 1.8567794886228034e-05, - "loss": 0.8668, - "step": 5505 - }, - { - "epoch": 0.2554473806212332, - "grad_norm": 4.503422737121582, - "learning_rate": 1.856361945077222e-05, - "loss": 1.0136, - "step": 5510 - }, - { - "epoch": 0.25567918405192397, - "grad_norm": 3.9764323234558105, - "learning_rate": 1.8559438408558795e-05, - "loss": 1.0744, - "step": 5515 - }, - { - "epoch": 0.25591098748261476, - "grad_norm": 3.8709752559661865, - "learning_rate": 1.8555251762325164e-05, - "loss": 0.8849, - "step": 5520 - }, - { - "epoch": 0.2561427909133055, - "grad_norm": 4.205710411071777, - "learning_rate": 1.8551059514812402e-05, - "loss": 0.841, - "step": 5525 - }, - { - "epoch": 0.2563745943439963, - "grad_norm": 3.547140121459961, - "learning_rate": 1.8546861668765245e-05, - "loss": 1.0945, - "step": 5530 - }, - { - "epoch": 0.2566063977746871, - "grad_norm": 3.594226360321045, - "learning_rate": 1.8542658226932107e-05, - "loss": 1.0526, - "step": 5535 - }, - { - "epoch": 0.2568382012053778, - "grad_norm": 3.7059407234191895, - "learning_rate": 1.8538449192065052e-05, - "loss": 1.078, - "step": 5540 - }, - { - "epoch": 0.2570700046360686, - "grad_norm": 3.7842459678649902, - "learning_rate": 1.8534234566919815e-05, - "loss": 0.8654, - "step": 5545 - }, - { - "epoch": 0.2573018080667594, - "grad_norm": 3.989053964614868, - "learning_rate": 1.8530014354255788e-05, - "loss": 1.0133, - "step": 5550 - }, - { - "epoch": 0.2575336114974502, - "grad_norm": 4.875503063201904, - "learning_rate": 1.8525788556836016e-05, - "loss": 0.9915, - "step": 5555 - }, - { - "epoch": 0.2577654149281409, - "grad_norm": 4.440192699432373, - "learning_rate": 1.852155717742721e-05, - "loss": 1.0343, - "step": 5560 - }, - { - "epoch": 0.2579972183588317, - "grad_norm": 4.262231349945068, - "learning_rate": 1.851732021879973e-05, - "loss": 1.0658, - "step": 5565 - }, - { - "epoch": 0.2582290217895225, - "grad_norm": 3.8692004680633545, - "learning_rate": 1.851307768372759e-05, - "loss": 0.9289, - "step": 5570 - }, - { - "epoch": 0.25846082522021324, - "grad_norm": 3.9327476024627686, - "learning_rate": 1.850882957498846e-05, - "loss": 1.0081, - "step": 5575 - }, - { - "epoch": 0.25869262865090403, - "grad_norm": 4.598284721374512, - "learning_rate": 1.8504575895363645e-05, - "loss": 1.076, - "step": 5580 - }, - { - "epoch": 0.2589244320815948, - "grad_norm": 4.525602340698242, - "learning_rate": 1.850031664763811e-05, - "loss": 0.9856, - "step": 5585 - }, - { - "epoch": 0.25915623551228556, - "grad_norm": 4.555290222167969, - "learning_rate": 1.8496051834600465e-05, - "loss": 1.101, - "step": 5590 - }, - { - "epoch": 0.25938803894297635, - "grad_norm": 4.705940246582031, - "learning_rate": 1.8491781459042956e-05, - "loss": 1.0118, - "step": 5595 - }, - { - "epoch": 0.25961984237366714, - "grad_norm": 4.4292521476745605, - "learning_rate": 1.848750552376148e-05, - "loss": 1.0232, - "step": 5600 - }, - { - "epoch": 0.25961984237366714, - "eval_loss": 1.0037119388580322, - "eval_runtime": 11.2651, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 5600 - }, - { - "epoch": 0.2598516458043579, - "grad_norm": 3.5477733612060547, - "learning_rate": 1.848322403155557e-05, - "loss": 1.073, - "step": 5605 - }, - { - "epoch": 0.26008344923504867, - "grad_norm": 4.175229549407959, - "learning_rate": 1.8478936985228394e-05, - "loss": 1.0311, - "step": 5610 - }, - { - "epoch": 0.26031525266573946, - "grad_norm": 4.131260395050049, - "learning_rate": 1.8474644387586757e-05, - "loss": 0.9146, - "step": 5615 - }, - { - "epoch": 0.26054705609643025, - "grad_norm": 3.8669774532318115, - "learning_rate": 1.8470346241441105e-05, - "loss": 0.9011, - "step": 5620 - }, - { - "epoch": 0.260778859527121, - "grad_norm": 4.5490546226501465, - "learning_rate": 1.8466042549605518e-05, - "loss": 0.9329, - "step": 5625 - }, - { - "epoch": 0.2610106629578118, - "grad_norm": 4.513428688049316, - "learning_rate": 1.8461733314897688e-05, - "loss": 1.1291, - "step": 5630 - }, - { - "epoch": 0.26124246638850257, - "grad_norm": 4.3655500411987305, - "learning_rate": 1.8457418540138958e-05, - "loss": 1.094, - "step": 5635 - }, - { - "epoch": 0.2614742698191933, - "grad_norm": 4.182011127471924, - "learning_rate": 1.8453098228154287e-05, - "loss": 1.0215, - "step": 5640 - }, - { - "epoch": 0.2617060732498841, - "grad_norm": 4.210841655731201, - "learning_rate": 1.8448772381772266e-05, - "loss": 1.0594, - "step": 5645 - }, - { - "epoch": 0.2619378766805749, - "grad_norm": 3.889146327972412, - "learning_rate": 1.84444410038251e-05, - "loss": 1.0411, - "step": 5650 - }, - { - "epoch": 0.2621696801112656, - "grad_norm": 4.405303955078125, - "learning_rate": 1.8440104097148625e-05, - "loss": 1.0183, - "step": 5655 - }, - { - "epoch": 0.2624014835419564, - "grad_norm": 3.846829891204834, - "learning_rate": 1.843576166458229e-05, - "loss": 0.8977, - "step": 5660 - }, - { - "epoch": 0.2626332869726472, - "grad_norm": 3.1990158557891846, - "learning_rate": 1.843141370896917e-05, - "loss": 0.9122, - "step": 5665 - }, - { - "epoch": 0.26286509040333794, - "grad_norm": 4.093268394470215, - "learning_rate": 1.842706023315594e-05, - "loss": 1.1009, - "step": 5670 - }, - { - "epoch": 0.26309689383402873, - "grad_norm": 4.303419589996338, - "learning_rate": 1.8422701239992916e-05, - "loss": 0.8344, - "step": 5675 - }, - { - "epoch": 0.2633286972647195, - "grad_norm": 3.7140135765075684, - "learning_rate": 1.841833673233399e-05, - "loss": 1.0971, - "step": 5680 - }, - { - "epoch": 0.2635605006954103, - "grad_norm": 4.93548059463501, - "learning_rate": 1.8413966713036703e-05, - "loss": 0.9318, - "step": 5685 - }, - { - "epoch": 0.26379230412610105, - "grad_norm": 3.852540969848633, - "learning_rate": 1.8409591184962176e-05, - "loss": 0.8139, - "step": 5690 - }, - { - "epoch": 0.26402410755679184, - "grad_norm": 4.363709449768066, - "learning_rate": 1.840521015097515e-05, - "loss": 0.9556, - "step": 5695 - }, - { - "epoch": 0.26425591098748263, - "grad_norm": 4.695869445800781, - "learning_rate": 1.840082361394397e-05, - "loss": 1.0201, - "step": 5700 - }, - { - "epoch": 0.26425591098748263, - "eval_loss": 1.0051652193069458, - "eval_runtime": 11.2696, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 5700 - }, - { - "epoch": 0.26448771441817337, - "grad_norm": 3.8231186866760254, - "learning_rate": 1.8396431576740574e-05, - "loss": 1.0544, - "step": 5705 - }, - { - "epoch": 0.26471951784886416, - "grad_norm": 3.3638997077941895, - "learning_rate": 1.8392034042240516e-05, - "loss": 0.9932, - "step": 5710 - }, - { - "epoch": 0.26495132127955495, - "grad_norm": 3.88324236869812, - "learning_rate": 1.838763101332294e-05, - "loss": 0.8402, - "step": 5715 - }, - { - "epoch": 0.2651831247102457, - "grad_norm": 4.111605644226074, - "learning_rate": 1.8383222492870587e-05, - "loss": 0.9163, - "step": 5720 - }, - { - "epoch": 0.2654149281409365, - "grad_norm": 3.5883824825286865, - "learning_rate": 1.83788084837698e-05, - "loss": 1.0904, - "step": 5725 - }, - { - "epoch": 0.26564673157162727, - "grad_norm": 3.9498956203460693, - "learning_rate": 1.8374388988910507e-05, - "loss": 1.1557, - "step": 5730 - }, - { - "epoch": 0.26587853500231806, - "grad_norm": 3.6644833087921143, - "learning_rate": 1.8369964011186236e-05, - "loss": 0.9459, - "step": 5735 - }, - { - "epoch": 0.2661103384330088, - "grad_norm": 4.248614311218262, - "learning_rate": 1.8365533553494092e-05, - "loss": 1.0257, - "step": 5740 - }, - { - "epoch": 0.2663421418636996, - "grad_norm": 3.679971694946289, - "learning_rate": 1.8361097618734787e-05, - "loss": 1.1086, - "step": 5745 - }, - { - "epoch": 0.2665739452943904, - "grad_norm": 4.014171600341797, - "learning_rate": 1.8356656209812602e-05, - "loss": 0.8715, - "step": 5750 - }, - { - "epoch": 0.2668057487250811, - "grad_norm": 4.839527130126953, - "learning_rate": 1.8352209329635412e-05, - "loss": 1.1297, - "step": 5755 - }, - { - "epoch": 0.2670375521557719, - "grad_norm": 3.6321723461151123, - "learning_rate": 1.8347756981114665e-05, - "loss": 0.9076, - "step": 5760 - }, - { - "epoch": 0.2672693555864627, - "grad_norm": 3.7051403522491455, - "learning_rate": 1.83432991671654e-05, - "loss": 0.9269, - "step": 5765 - }, - { - "epoch": 0.26750115901715343, - "grad_norm": 4.2225341796875, - "learning_rate": 1.8338835890706227e-05, - "loss": 1.0015, - "step": 5770 - }, - { - "epoch": 0.2677329624478442, - "grad_norm": 4.238531112670898, - "learning_rate": 1.8334367154659334e-05, - "loss": 0.997, - "step": 5775 - }, - { - "epoch": 0.267964765878535, - "grad_norm": 4.749455451965332, - "learning_rate": 1.832989296195048e-05, - "loss": 0.9658, - "step": 5780 - }, - { - "epoch": 0.26819656930922575, - "grad_norm": 4.021186351776123, - "learning_rate": 1.832541331550901e-05, - "loss": 1.0967, - "step": 5785 - }, - { - "epoch": 0.26842837273991654, - "grad_norm": 4.332131385803223, - "learning_rate": 1.8320928218267827e-05, - "loss": 0.9285, - "step": 5790 - }, - { - "epoch": 0.26866017617060733, - "grad_norm": 3.860887050628662, - "learning_rate": 1.8316437673163396e-05, - "loss": 1.0157, - "step": 5795 - }, - { - "epoch": 0.2688919796012981, - "grad_norm": 3.988381862640381, - "learning_rate": 1.831194168313577e-05, - "loss": 0.9248, - "step": 5800 - }, - { - "epoch": 0.2688919796012981, - "eval_loss": 1.0011706352233887, - "eval_runtime": 11.2794, - "eval_samples_per_second": 11.259, - "eval_steps_per_second": 11.259, - "step": 5800 - }, - { - "epoch": 0.26912378303198886, - "grad_norm": 4.354172706604004, - "learning_rate": 1.830744025112855e-05, - "loss": 0.9361, - "step": 5805 - }, - { - "epoch": 0.26935558646267965, - "grad_norm": 3.436056137084961, - "learning_rate": 1.8302933380088912e-05, - "loss": 1.0889, - "step": 5810 - }, - { - "epoch": 0.26958738989337044, - "grad_norm": 4.711020469665527, - "learning_rate": 1.8298421072967577e-05, - "loss": 0.9565, - "step": 5815 - }, - { - "epoch": 0.2698191933240612, - "grad_norm": 4.042435169219971, - "learning_rate": 1.829390333271884e-05, - "loss": 1.0529, - "step": 5820 - }, - { - "epoch": 0.27005099675475197, - "grad_norm": 4.613515853881836, - "learning_rate": 1.8289380162300552e-05, - "loss": 0.9848, - "step": 5825 - }, - { - "epoch": 0.27028280018544276, - "grad_norm": 4.459861755371094, - "learning_rate": 1.8284851564674105e-05, - "loss": 0.859, - "step": 5830 - }, - { - "epoch": 0.2705146036161335, - "grad_norm": 4.675287246704102, - "learning_rate": 1.828031754280446e-05, - "loss": 1.1043, - "step": 5835 - }, - { - "epoch": 0.2707464070468243, - "grad_norm": 4.442194938659668, - "learning_rate": 1.8275778099660127e-05, - "loss": 0.9316, - "step": 5840 - }, - { - "epoch": 0.2709782104775151, - "grad_norm": 3.762488842010498, - "learning_rate": 1.8271233238213155e-05, - "loss": 1.006, - "step": 5845 - }, - { - "epoch": 0.2712100139082058, - "grad_norm": 4.232035160064697, - "learning_rate": 1.826668296143915e-05, - "loss": 1.0029, - "step": 5850 - }, - { - "epoch": 0.2714418173388966, - "grad_norm": 4.225643634796143, - "learning_rate": 1.8262127272317264e-05, - "loss": 1.1048, - "step": 5855 - }, - { - "epoch": 0.2716736207695874, - "grad_norm": 4.082033157348633, - "learning_rate": 1.8257566173830192e-05, - "loss": 0.8627, - "step": 5860 - }, - { - "epoch": 0.2719054242002782, - "grad_norm": 4.129724025726318, - "learning_rate": 1.825299966896416e-05, - "loss": 1.0083, - "step": 5865 - }, - { - "epoch": 0.2721372276309689, - "grad_norm": 3.868680953979492, - "learning_rate": 1.8248427760708943e-05, - "loss": 0.9134, - "step": 5870 - }, - { - "epoch": 0.2723690310616597, - "grad_norm": 4.29145622253418, - "learning_rate": 1.824385045205786e-05, - "loss": 1.0508, - "step": 5875 - }, - { - "epoch": 0.2726008344923505, - "grad_norm": 3.5361058712005615, - "learning_rate": 1.823926774600775e-05, - "loss": 0.9143, - "step": 5880 - }, - { - "epoch": 0.27283263792304124, - "grad_norm": 3.730649948120117, - "learning_rate": 1.8234679645559e-05, - "loss": 0.9608, - "step": 5885 - }, - { - "epoch": 0.27306444135373203, - "grad_norm": 4.100008487701416, - "learning_rate": 1.8230086153715523e-05, - "loss": 0.9398, - "step": 5890 - }, - { - "epoch": 0.2732962447844228, - "grad_norm": 4.048903465270996, - "learning_rate": 1.8225487273484758e-05, - "loss": 0.9586, - "step": 5895 - }, - { - "epoch": 0.27352804821511356, - "grad_norm": 3.6291110515594482, - "learning_rate": 1.822088300787767e-05, - "loss": 0.9457, - "step": 5900 - }, - { - "epoch": 0.27352804821511356, - "eval_loss": 1.0006332397460938, - "eval_runtime": 11.2615, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 5900 - }, - { - "epoch": 0.27375985164580435, - "grad_norm": 4.81033182144165, - "learning_rate": 1.8216273359908775e-05, - "loss": 0.9129, - "step": 5905 - }, - { - "epoch": 0.27399165507649514, - "grad_norm": 3.627592086791992, - "learning_rate": 1.8211658332596075e-05, - "loss": 0.8996, - "step": 5910 - }, - { - "epoch": 0.2742234585071859, - "grad_norm": 4.121501445770264, - "learning_rate": 1.8207037928961116e-05, - "loss": 0.9794, - "step": 5915 - }, - { - "epoch": 0.27445526193787667, - "grad_norm": 4.973855018615723, - "learning_rate": 1.8202412152028966e-05, - "loss": 0.8467, - "step": 5920 - }, - { - "epoch": 0.27468706536856746, - "grad_norm": 4.146650314331055, - "learning_rate": 1.8197781004828205e-05, - "loss": 0.9895, - "step": 5925 - }, - { - "epoch": 0.27491886879925825, - "grad_norm": 3.674971580505371, - "learning_rate": 1.819314449039092e-05, - "loss": 0.9533, - "step": 5930 - }, - { - "epoch": 0.275150672229949, - "grad_norm": 4.400362014770508, - "learning_rate": 1.8188502611752733e-05, - "loss": 0.9267, - "step": 5935 - }, - { - "epoch": 0.2753824756606398, - "grad_norm": 4.569714069366455, - "learning_rate": 1.8183855371952764e-05, - "loss": 1.0855, - "step": 5940 - }, - { - "epoch": 0.27561427909133057, - "grad_norm": 4.336934566497803, - "learning_rate": 1.817920277403364e-05, - "loss": 1.104, - "step": 5945 - }, - { - "epoch": 0.2758460825220213, - "grad_norm": 4.304383277893066, - "learning_rate": 1.8174544821041508e-05, - "loss": 0.9737, - "step": 5950 - }, - { - "epoch": 0.2760778859527121, - "grad_norm": 4.5948991775512695, - "learning_rate": 1.8169881516026005e-05, - "loss": 1.0474, - "step": 5955 - }, - { - "epoch": 0.2763096893834029, - "grad_norm": 3.7741198539733887, - "learning_rate": 1.8165212862040294e-05, - "loss": 1.0487, - "step": 5960 - }, - { - "epoch": 0.2765414928140936, - "grad_norm": 5.039603233337402, - "learning_rate": 1.816053886214102e-05, - "loss": 1.0309, - "step": 5965 - }, - { - "epoch": 0.2767732962447844, - "grad_norm": 4.001100540161133, - "learning_rate": 1.8155859519388343e-05, - "loss": 0.9451, - "step": 5970 - }, - { - "epoch": 0.2770050996754752, - "grad_norm": 3.5335066318511963, - "learning_rate": 1.81511748368459e-05, - "loss": 0.9249, - "step": 5975 - }, - { - "epoch": 0.27723690310616594, - "grad_norm": 3.4955151081085205, - "learning_rate": 1.8146484817580852e-05, - "loss": 0.9852, - "step": 5980 - }, - { - "epoch": 0.27746870653685674, - "grad_norm": 6.319854736328125, - "learning_rate": 1.8141789464663833e-05, - "loss": 1.0451, - "step": 5985 - }, - { - "epoch": 0.2777005099675475, - "grad_norm": 4.266619682312012, - "learning_rate": 1.813708878116898e-05, - "loss": 0.9854, - "step": 5990 - }, - { - "epoch": 0.2779323133982383, - "grad_norm": 3.541940927505493, - "learning_rate": 1.8132382770173913e-05, - "loss": 1.021, - "step": 5995 - }, - { - "epoch": 0.27816411682892905, - "grad_norm": 3.38006591796875, - "learning_rate": 1.8127671434759745e-05, - "loss": 1.0177, - "step": 6000 - }, - { - "epoch": 0.27816411682892905, - "eval_loss": 1.0024383068084717, - "eval_runtime": 11.261, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 6000 - }, - { - "epoch": 0.27839592025961984, - "grad_norm": 4.122401714324951, - "learning_rate": 1.812295477801107e-05, - "loss": 0.8534, - "step": 6005 - }, - { - "epoch": 0.27862772369031064, - "grad_norm": 5.3609137535095215, - "learning_rate": 1.8118232803015972e-05, - "loss": 1.0622, - "step": 6010 - }, - { - "epoch": 0.27885952712100137, - "grad_norm": 4.3532185554504395, - "learning_rate": 1.8113505512866014e-05, - "loss": 1.0705, - "step": 6015 - }, - { - "epoch": 0.27909133055169216, - "grad_norm": 4.290230751037598, - "learning_rate": 1.8108772910656237e-05, - "loss": 0.8422, - "step": 6020 - }, - { - "epoch": 0.27932313398238295, - "grad_norm": 4.357093334197998, - "learning_rate": 1.8104034999485164e-05, - "loss": 0.9788, - "step": 6025 - }, - { - "epoch": 0.2795549374130737, - "grad_norm": 4.413539409637451, - "learning_rate": 1.809929178245479e-05, - "loss": 1.0253, - "step": 6030 - }, - { - "epoch": 0.2797867408437645, - "grad_norm": 3.7482516765594482, - "learning_rate": 1.809454326267059e-05, - "loss": 0.9227, - "step": 6035 - }, - { - "epoch": 0.28001854427445527, - "grad_norm": 3.96517014503479, - "learning_rate": 1.80897894432415e-05, - "loss": 1.0275, - "step": 6040 - }, - { - "epoch": 0.28025034770514606, - "grad_norm": 3.922147512435913, - "learning_rate": 1.8085030327279933e-05, - "loss": 0.945, - "step": 6045 - }, - { - "epoch": 0.2804821511358368, - "grad_norm": 4.065891742706299, - "learning_rate": 1.8080265917901773e-05, - "loss": 0.8729, - "step": 6050 - }, - { - "epoch": 0.2807139545665276, - "grad_norm": 5.4653191566467285, - "learning_rate": 1.8075496218226362e-05, - "loss": 1.1503, - "step": 6055 - }, - { - "epoch": 0.2809457579972184, - "grad_norm": 4.193495750427246, - "learning_rate": 1.807072123137651e-05, - "loss": 0.9621, - "step": 6060 - }, - { - "epoch": 0.2811775614279091, - "grad_norm": 7.294194221496582, - "learning_rate": 1.806594096047849e-05, - "loss": 0.9805, - "step": 6065 - }, - { - "epoch": 0.2814093648585999, - "grad_norm": 3.062164783477783, - "learning_rate": 1.8061155408662032e-05, - "loss": 0.9644, - "step": 6070 - }, - { - "epoch": 0.2816411682892907, - "grad_norm": 3.978872776031494, - "learning_rate": 1.8056364579060322e-05, - "loss": 0.9448, - "step": 6075 - }, - { - "epoch": 0.28187297171998144, - "grad_norm": 3.817619800567627, - "learning_rate": 1.8051568474810005e-05, - "loss": 1.0054, - "step": 6080 - }, - { - "epoch": 0.2821047751506722, - "grad_norm": 3.854076385498047, - "learning_rate": 1.8046767099051177e-05, - "loss": 0.982, - "step": 6085 - }, - { - "epoch": 0.282336578581363, - "grad_norm": 4.013674259185791, - "learning_rate": 1.8041960454927384e-05, - "loss": 0.7673, - "step": 6090 - }, - { - "epoch": 0.28256838201205375, - "grad_norm": 4.976582050323486, - "learning_rate": 1.803714854558563e-05, - "loss": 1.0388, - "step": 6095 - }, - { - "epoch": 0.28280018544274454, - "grad_norm": 4.081470489501953, - "learning_rate": 1.803233137417635e-05, - "loss": 0.9529, - "step": 6100 - }, - { - "epoch": 0.28280018544274454, - "eval_loss": 0.9999383687973022, - "eval_runtime": 11.265, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 6100 - }, - { - "epoch": 0.28303198887343534, - "grad_norm": 10.968564987182617, - "learning_rate": 1.8027508943853443e-05, - "loss": 0.9481, - "step": 6105 - }, - { - "epoch": 0.2832637923041261, - "grad_norm": 3.7944722175598145, - "learning_rate": 1.8022681257774234e-05, - "loss": 1.2888, - "step": 6110 - }, - { - "epoch": 0.28349559573481686, - "grad_norm": 4.5122551918029785, - "learning_rate": 1.8017848319099504e-05, - "loss": 0.9062, - "step": 6115 - }, - { - "epoch": 0.28372739916550765, - "grad_norm": 3.6060822010040283, - "learning_rate": 1.801301013099346e-05, - "loss": 1.1642, - "step": 6120 - }, - { - "epoch": 0.28395920259619845, - "grad_norm": 4.519002914428711, - "learning_rate": 1.800816669662376e-05, - "loss": 1.1781, - "step": 6125 - }, - { - "epoch": 0.2841910060268892, - "grad_norm": 3.604435443878174, - "learning_rate": 1.800331801916148e-05, - "loss": 0.8987, - "step": 6130 - }, - { - "epoch": 0.28442280945758, - "grad_norm": 4.327789783477783, - "learning_rate": 1.7998464101781145e-05, - "loss": 0.9225, - "step": 6135 - }, - { - "epoch": 0.28465461288827076, - "grad_norm": 4.079405784606934, - "learning_rate": 1.7993604947660693e-05, - "loss": 0.9651, - "step": 6140 - }, - { - "epoch": 0.2848864163189615, - "grad_norm": 4.200125217437744, - "learning_rate": 1.798874055998151e-05, - "loss": 0.9448, - "step": 6145 - }, - { - "epoch": 0.2851182197496523, - "grad_norm": 4.063922882080078, - "learning_rate": 1.7983870941928397e-05, - "loss": 0.8086, - "step": 6150 - }, - { - "epoch": 0.2853500231803431, - "grad_norm": 4.742971897125244, - "learning_rate": 1.7978996096689578e-05, - "loss": 0.9615, - "step": 6155 - }, - { - "epoch": 0.2855818266110338, - "grad_norm": 3.8731369972229004, - "learning_rate": 1.7974116027456707e-05, - "loss": 1.1075, - "step": 6160 - }, - { - "epoch": 0.2858136300417246, - "grad_norm": 4.0479044914245605, - "learning_rate": 1.796923073742485e-05, - "loss": 1.0039, - "step": 6165 - }, - { - "epoch": 0.2860454334724154, - "grad_norm": 4.324695587158203, - "learning_rate": 1.7964340229792506e-05, - "loss": 1.0035, - "step": 6170 - }, - { - "epoch": 0.2862772369031062, - "grad_norm": 4.0023627281188965, - "learning_rate": 1.7959444507761564e-05, - "loss": 1.0187, - "step": 6175 - }, - { - "epoch": 0.2865090403337969, - "grad_norm": 3.887049913406372, - "learning_rate": 1.7954543574537356e-05, - "loss": 0.9609, - "step": 6180 - }, - { - "epoch": 0.2867408437644877, - "grad_norm": 4.166080474853516, - "learning_rate": 1.79496374333286e-05, - "loss": 0.9904, - "step": 6185 - }, - { - "epoch": 0.2869726471951785, - "grad_norm": 4.174224853515625, - "learning_rate": 1.7944726087347446e-05, - "loss": 0.966, - "step": 6190 - }, - { - "epoch": 0.28720445062586925, - "grad_norm": 5.26371955871582, - "learning_rate": 1.7939809539809437e-05, - "loss": 0.9457, - "step": 6195 - }, - { - "epoch": 0.28743625405656004, - "grad_norm": 3.897738218307495, - "learning_rate": 1.7934887793933527e-05, - "loss": 0.8693, - "step": 6200 - }, - { - "epoch": 0.28743625405656004, - "eval_loss": 0.9928064942359924, - "eval_runtime": 11.2728, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 6200 - }, - { - "epoch": 0.28766805748725083, - "grad_norm": 4.036500453948975, - "learning_rate": 1.7929960852942073e-05, - "loss": 0.8726, - "step": 6205 - }, - { - "epoch": 0.28789986091794156, - "grad_norm": 3.5420334339141846, - "learning_rate": 1.7925028720060833e-05, - "loss": 0.9567, - "step": 6210 - }, - { - "epoch": 0.28813166434863235, - "grad_norm": 3.9448046684265137, - "learning_rate": 1.7920091398518963e-05, - "loss": 0.8335, - "step": 6215 - }, - { - "epoch": 0.28836346777932315, - "grad_norm": 3.8117644786834717, - "learning_rate": 1.7915148891549016e-05, - "loss": 0.8017, - "step": 6220 - }, - { - "epoch": 0.2885952712100139, - "grad_norm": 3.843318462371826, - "learning_rate": 1.791020120238695e-05, - "loss": 0.931, - "step": 6225 - }, - { - "epoch": 0.2888270746407047, - "grad_norm": 4.147006511688232, - "learning_rate": 1.7905248334272094e-05, - "loss": 1.0015, - "step": 6230 - }, - { - "epoch": 0.28905887807139546, - "grad_norm": 3.9577856063842773, - "learning_rate": 1.790029029044719e-05, - "loss": 0.9748, - "step": 6235 - }, - { - "epoch": 0.28929068150208626, - "grad_norm": 4.157485008239746, - "learning_rate": 1.7895327074158356e-05, - "loss": 0.989, - "step": 6240 - }, - { - "epoch": 0.289522484932777, - "grad_norm": 4.6592020988464355, - "learning_rate": 1.7890358688655105e-05, - "loss": 0.8493, - "step": 6245 - }, - { - "epoch": 0.2897542883634678, - "grad_norm": 3.6401262283325195, - "learning_rate": 1.7885385137190324e-05, - "loss": 0.9636, - "step": 6250 - }, - { - "epoch": 0.2899860917941586, - "grad_norm": 3.0835177898406982, - "learning_rate": 1.788040642302029e-05, - "loss": 0.9383, - "step": 6255 - }, - { - "epoch": 0.2902178952248493, - "grad_norm": 3.7407875061035156, - "learning_rate": 1.7875422549404657e-05, - "loss": 0.9174, - "step": 6260 - }, - { - "epoch": 0.2904496986555401, - "grad_norm": 3.3547024726867676, - "learning_rate": 1.7870433519606457e-05, - "loss": 0.8579, - "step": 6265 - }, - { - "epoch": 0.2906815020862309, - "grad_norm": 5.221685886383057, - "learning_rate": 1.78654393368921e-05, - "loss": 1.237, - "step": 6270 - }, - { - "epoch": 0.2909133055169216, - "grad_norm": 3.961782693862915, - "learning_rate": 1.786044000453137e-05, - "loss": 0.7314, - "step": 6275 - }, - { - "epoch": 0.2911451089476124, - "grad_norm": 3.9514427185058594, - "learning_rate": 1.785543552579742e-05, - "loss": 0.9881, - "step": 6280 - }, - { - "epoch": 0.2913769123783032, - "grad_norm": 3.950453281402588, - "learning_rate": 1.785042590396677e-05, - "loss": 0.9165, - "step": 6285 - }, - { - "epoch": 0.29160871580899395, - "grad_norm": 4.681061267852783, - "learning_rate": 1.7845411142319316e-05, - "loss": 0.9462, - "step": 6290 - }, - { - "epoch": 0.29184051923968474, - "grad_norm": 4.256533145904541, - "learning_rate": 1.784039124413831e-05, - "loss": 1.0572, - "step": 6295 - }, - { - "epoch": 0.29207232267037553, - "grad_norm": 4.432834148406982, - "learning_rate": 1.7835366212710372e-05, - "loss": 0.9016, - "step": 6300 - }, - { - "epoch": 0.29207232267037553, - "eval_loss": 0.9896853566169739, - "eval_runtime": 11.2569, - "eval_samples_per_second": 11.282, - "eval_steps_per_second": 11.282, - "step": 6300 - }, - { - "epoch": 0.2923041261010663, - "grad_norm": 3.48752760887146, - "learning_rate": 1.783033605132548e-05, - "loss": 0.7874, - "step": 6305 - }, - { - "epoch": 0.29253592953175706, - "grad_norm": 3.671842336654663, - "learning_rate": 1.7825300763276974e-05, - "loss": 0.7982, - "step": 6310 - }, - { - "epoch": 0.29276773296244785, - "grad_norm": 4.298096656799316, - "learning_rate": 1.7820260351861545e-05, - "loss": 1.0207, - "step": 6315 - }, - { - "epoch": 0.29299953639313864, - "grad_norm": 4.248196125030518, - "learning_rate": 1.7815214820379253e-05, - "loss": 0.9549, - "step": 6320 - }, - { - "epoch": 0.2932313398238294, - "grad_norm": 3.7347469329833984, - "learning_rate": 1.781016417213349e-05, - "loss": 0.9005, - "step": 6325 - }, - { - "epoch": 0.29346314325452016, - "grad_norm": 3.76611328125, - "learning_rate": 1.7805108410431006e-05, - "loss": 0.9894, - "step": 6330 - }, - { - "epoch": 0.29369494668521096, - "grad_norm": 3.8931164741516113, - "learning_rate": 1.7800047538581902e-05, - "loss": 0.9377, - "step": 6335 - }, - { - "epoch": 0.2939267501159017, - "grad_norm": 3.719175100326538, - "learning_rate": 1.779498155989963e-05, - "loss": 0.856, - "step": 6340 - }, - { - "epoch": 0.2941585535465925, - "grad_norm": 4.287708282470703, - "learning_rate": 1.778991047770097e-05, - "loss": 1.0496, - "step": 6345 - }, - { - "epoch": 0.2943903569772833, - "grad_norm": 3.886796474456787, - "learning_rate": 1.7784834295306056e-05, - "loss": 0.956, - "step": 6350 - }, - { - "epoch": 0.29462216040797407, - "grad_norm": 3.6003739833831787, - "learning_rate": 1.7779753016038356e-05, - "loss": 0.9431, - "step": 6355 - }, - { - "epoch": 0.2948539638386648, - "grad_norm": 4.9724297523498535, - "learning_rate": 1.7774666643224675e-05, - "loss": 1.1131, - "step": 6360 - }, - { - "epoch": 0.2950857672693556, - "grad_norm": 4.584454536437988, - "learning_rate": 1.776957518019515e-05, - "loss": 0.9687, - "step": 6365 - }, - { - "epoch": 0.2953175707000464, - "grad_norm": 4.142701625823975, - "learning_rate": 1.776447863028327e-05, - "loss": 1.0858, - "step": 6370 - }, - { - "epoch": 0.2955493741307371, - "grad_norm": 5.801587104797363, - "learning_rate": 1.775937699682582e-05, - "loss": 1.0839, - "step": 6375 - }, - { - "epoch": 0.2957811775614279, - "grad_norm": 3.5847275257110596, - "learning_rate": 1.7754270283162943e-05, - "loss": 0.9185, - "step": 6380 - }, - { - "epoch": 0.2960129809921187, - "grad_norm": 3.590195417404175, - "learning_rate": 1.77491584926381e-05, - "loss": 0.9499, - "step": 6385 - }, - { - "epoch": 0.29624478442280944, - "grad_norm": 3.7471694946289062, - "learning_rate": 1.774404162859806e-05, - "loss": 1.0203, - "step": 6390 - }, - { - "epoch": 0.29647658785350023, - "grad_norm": 7.472535610198975, - "learning_rate": 1.7738919694392944e-05, - "loss": 1.1356, - "step": 6395 - }, - { - "epoch": 0.296708391284191, - "grad_norm": 3.6303653717041016, - "learning_rate": 1.7733792693376167e-05, - "loss": 0.8752, - "step": 6400 - }, - { - "epoch": 0.296708391284191, - "eval_loss": 0.989250898361206, - "eval_runtime": 11.2677, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 6400 - }, - { - "epoch": 0.29694019471488176, - "grad_norm": 4.146617889404297, - "learning_rate": 1.7728660628904463e-05, - "loss": 0.9494, - "step": 6405 - }, - { - "epoch": 0.29717199814557255, - "grad_norm": 4.808352947235107, - "learning_rate": 1.77235235043379e-05, - "loss": 0.969, - "step": 6410 - }, - { - "epoch": 0.29740380157626334, - "grad_norm": 4.137024402618408, - "learning_rate": 1.7718381323039844e-05, - "loss": 1.027, - "step": 6415 - }, - { - "epoch": 0.29763560500695413, - "grad_norm": 4.234049320220947, - "learning_rate": 1.7713234088376973e-05, - "loss": 1.0522, - "step": 6420 - }, - { - "epoch": 0.29786740843764486, - "grad_norm": 3.660369873046875, - "learning_rate": 1.770808180371927e-05, - "loss": 0.848, - "step": 6425 - }, - { - "epoch": 0.29809921186833566, - "grad_norm": 3.89237904548645, - "learning_rate": 1.7702924472440038e-05, - "loss": 1.0321, - "step": 6430 - }, - { - "epoch": 0.29833101529902645, - "grad_norm": 4.689309597015381, - "learning_rate": 1.7697762097915875e-05, - "loss": 0.9966, - "step": 6435 - }, - { - "epoch": 0.2985628187297172, - "grad_norm": 5.034631729125977, - "learning_rate": 1.769259468352668e-05, - "loss": 1.1245, - "step": 6440 - }, - { - "epoch": 0.298794622160408, - "grad_norm": 3.535649299621582, - "learning_rate": 1.768742223265565e-05, - "loss": 0.8608, - "step": 6445 - }, - { - "epoch": 0.29902642559109877, - "grad_norm": 3.9463391304016113, - "learning_rate": 1.7682244748689283e-05, - "loss": 0.8687, - "step": 6450 - }, - { - "epoch": 0.2992582290217895, - "grad_norm": 3.97773814201355, - "learning_rate": 1.767706223501738e-05, - "loss": 0.9634, - "step": 6455 - }, - { - "epoch": 0.2994900324524803, - "grad_norm": 3.951277017593384, - "learning_rate": 1.7671874695033015e-05, - "loss": 0.9828, - "step": 6460 - }, - { - "epoch": 0.2997218358831711, - "grad_norm": 3.4690914154052734, - "learning_rate": 1.7666682132132575e-05, - "loss": 0.9453, - "step": 6465 - }, - { - "epoch": 0.2999536393138618, - "grad_norm": 4.310832977294922, - "learning_rate": 1.766148454971572e-05, - "loss": 1.0056, - "step": 6470 - }, - { - "epoch": 0.3001854427445526, - "grad_norm": 5.2137861251831055, - "learning_rate": 1.7656281951185406e-05, - "loss": 1.1368, - "step": 6475 - }, - { - "epoch": 0.3004172461752434, - "grad_norm": 3.1809844970703125, - "learning_rate": 1.7651074339947864e-05, - "loss": 0.7993, - "step": 6480 - }, - { - "epoch": 0.3006490496059342, - "grad_norm": 4.750578880310059, - "learning_rate": 1.7645861719412617e-05, - "loss": 0.9543, - "step": 6485 - }, - { - "epoch": 0.30088085303662493, - "grad_norm": 3.670433282852173, - "learning_rate": 1.764064409299246e-05, - "loss": 0.9679, - "step": 6490 - }, - { - "epoch": 0.3011126564673157, - "grad_norm": 4.144705772399902, - "learning_rate": 1.7635421464103468e-05, - "loss": 1.0732, - "step": 6495 - }, - { - "epoch": 0.3013444598980065, - "grad_norm": 3.8694875240325928, - "learning_rate": 1.7630193836164996e-05, - "loss": 1.1033, - "step": 6500 - }, - { - "epoch": 0.3013444598980065, - "eval_loss": 0.9887648820877075, - "eval_runtime": 11.2829, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 6500 - }, - { - "epoch": 0.30157626332869725, - "grad_norm": 3.5212910175323486, - "learning_rate": 1.7624961212599654e-05, - "loss": 0.871, - "step": 6505 - }, - { - "epoch": 0.30180806675938804, - "grad_norm": 3.8312153816223145, - "learning_rate": 1.761972359683335e-05, - "loss": 1.0764, - "step": 6510 - }, - { - "epoch": 0.30203987019007883, - "grad_norm": 4.861583709716797, - "learning_rate": 1.761448099229524e-05, - "loss": 0.9775, - "step": 6515 - }, - { - "epoch": 0.30227167362076957, - "grad_norm": 4.358705043792725, - "learning_rate": 1.7609233402417755e-05, - "loss": 0.9921, - "step": 6520 - }, - { - "epoch": 0.30250347705146036, - "grad_norm": 4.940891742706299, - "learning_rate": 1.7603980830636586e-05, - "loss": 0.9865, - "step": 6525 - }, - { - "epoch": 0.30273528048215115, - "grad_norm": 4.035562992095947, - "learning_rate": 1.759872328039069e-05, - "loss": 1.0127, - "step": 6530 - }, - { - "epoch": 0.3029670839128419, - "grad_norm": 3.8091330528259277, - "learning_rate": 1.7593460755122274e-05, - "loss": 1.0808, - "step": 6535 - }, - { - "epoch": 0.3031988873435327, - "grad_norm": 4.551588535308838, - "learning_rate": 1.7588193258276815e-05, - "loss": 0.9015, - "step": 6540 - }, - { - "epoch": 0.30343069077422347, - "grad_norm": 6.152681350708008, - "learning_rate": 1.758292079330304e-05, - "loss": 1.126, - "step": 6545 - }, - { - "epoch": 0.30366249420491426, - "grad_norm": 4.223917007446289, - "learning_rate": 1.7577643363652925e-05, - "loss": 0.9735, - "step": 6550 - }, - { - "epoch": 0.303894297635605, - "grad_norm": 4.873682975769043, - "learning_rate": 1.75723609727817e-05, - "loss": 0.9722, - "step": 6555 - }, - { - "epoch": 0.3041261010662958, - "grad_norm": 4.19580602645874, - "learning_rate": 1.756707362414784e-05, - "loss": 1.0676, - "step": 6560 - }, - { - "epoch": 0.3043579044969866, - "grad_norm": 4.814520835876465, - "learning_rate": 1.7561781321213078e-05, - "loss": 1.0338, - "step": 6565 - }, - { - "epoch": 0.3045897079276773, - "grad_norm": 3.395010471343994, - "learning_rate": 1.755648406744237e-05, - "loss": 0.973, - "step": 6570 - }, - { - "epoch": 0.3048215113583681, - "grad_norm": 3.860635757446289, - "learning_rate": 1.7551181866303926e-05, - "loss": 0.8789, - "step": 6575 - }, - { - "epoch": 0.3050533147890589, - "grad_norm": 4.463065147399902, - "learning_rate": 1.7545874721269198e-05, - "loss": 1.0307, - "step": 6580 - }, - { - "epoch": 0.30528511821974963, - "grad_norm": 5.383005619049072, - "learning_rate": 1.754056263581287e-05, - "loss": 0.9404, - "step": 6585 - }, - { - "epoch": 0.3055169216504404, - "grad_norm": 4.1449360847473145, - "learning_rate": 1.7535245613412862e-05, - "loss": 0.9562, - "step": 6590 - }, - { - "epoch": 0.3057487250811312, - "grad_norm": 5.013772487640381, - "learning_rate": 1.752992365755032e-05, - "loss": 1.0953, - "step": 6595 - }, - { - "epoch": 0.30598052851182195, - "grad_norm": 3.9730842113494873, - "learning_rate": 1.7524596771709628e-05, - "loss": 1.0541, - "step": 6600 - }, - { - "epoch": 0.30598052851182195, - "eval_loss": 0.9859225749969482, - "eval_runtime": 11.2765, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 6600 - }, - { - "epoch": 0.30621233194251274, - "grad_norm": 3.646376132965088, - "learning_rate": 1.7519264959378398e-05, - "loss": 0.8244, - "step": 6605 - }, - { - "epoch": 0.30644413537320353, - "grad_norm": 3.8406198024749756, - "learning_rate": 1.7513928224047464e-05, - "loss": 0.9946, - "step": 6610 - }, - { - "epoch": 0.3066759388038943, - "grad_norm": 3.9793317317962646, - "learning_rate": 1.7508586569210883e-05, - "loss": 0.8445, - "step": 6615 - }, - { - "epoch": 0.30690774223458506, - "grad_norm": 4.22384786605835, - "learning_rate": 1.750323999836593e-05, - "loss": 0.9408, - "step": 6620 - }, - { - "epoch": 0.30713954566527585, - "grad_norm": 3.759800910949707, - "learning_rate": 1.7497888515013106e-05, - "loss": 0.8005, - "step": 6625 - }, - { - "epoch": 0.30737134909596664, - "grad_norm": 3.72318959236145, - "learning_rate": 1.7492532122656124e-05, - "loss": 0.9359, - "step": 6630 - }, - { - "epoch": 0.3076031525266574, - "grad_norm": 3.6016530990600586, - "learning_rate": 1.7487170824801912e-05, - "loss": 1.0068, - "step": 6635 - }, - { - "epoch": 0.30783495595734817, - "grad_norm": 4.295738220214844, - "learning_rate": 1.7481804624960607e-05, - "loss": 1.0878, - "step": 6640 - }, - { - "epoch": 0.30806675938803896, - "grad_norm": 3.8684656620025635, - "learning_rate": 1.7476433526645562e-05, - "loss": 1.0067, - "step": 6645 - }, - { - "epoch": 0.3082985628187297, - "grad_norm": 3.390023708343506, - "learning_rate": 1.7471057533373332e-05, - "loss": 0.903, - "step": 6650 - }, - { - "epoch": 0.3085303662494205, - "grad_norm": 4.495607376098633, - "learning_rate": 1.746567664866367e-05, - "loss": 0.7868, - "step": 6655 - }, - { - "epoch": 0.3087621696801113, - "grad_norm": 4.227192401885986, - "learning_rate": 1.7460290876039552e-05, - "loss": 1.1199, - "step": 6660 - }, - { - "epoch": 0.30899397311080207, - "grad_norm": 3.549609422683716, - "learning_rate": 1.7454900219027133e-05, - "loss": 0.9409, - "step": 6665 - }, - { - "epoch": 0.3092257765414928, - "grad_norm": 5.658492088317871, - "learning_rate": 1.744950468115578e-05, - "loss": 1.086, - "step": 6670 - }, - { - "epoch": 0.3094575799721836, - "grad_norm": 3.934952974319458, - "learning_rate": 1.744410426595804e-05, - "loss": 0.998, - "step": 6675 - }, - { - "epoch": 0.3096893834028744, - "grad_norm": 3.8127615451812744, - "learning_rate": 1.743869897696967e-05, - "loss": 0.9748, - "step": 6680 - }, - { - "epoch": 0.3099211868335651, - "grad_norm": 3.870283603668213, - "learning_rate": 1.7433288817729614e-05, - "loss": 1.1462, - "step": 6685 - }, - { - "epoch": 0.3101529902642559, - "grad_norm": 4.023786544799805, - "learning_rate": 1.7427873791779994e-05, - "loss": 0.9699, - "step": 6690 - }, - { - "epoch": 0.3103847936949467, - "grad_norm": 3.8653597831726074, - "learning_rate": 1.7422453902666135e-05, - "loss": 0.8618, - "step": 6695 - }, - { - "epoch": 0.31061659712563744, - "grad_norm": 4.728411674499512, - "learning_rate": 1.7417029153936528e-05, - "loss": 1.0816, - "step": 6700 - }, - { - "epoch": 0.31061659712563744, - "eval_loss": 0.9867631196975708, - "eval_runtime": 11.2651, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 6700 - }, - { - "epoch": 0.31084840055632823, - "grad_norm": 4.564522743225098, - "learning_rate": 1.7411599549142863e-05, - "loss": 1.1131, - "step": 6705 - }, - { - "epoch": 0.311080203987019, - "grad_norm": 4.069855213165283, - "learning_rate": 1.740616509183999e-05, - "loss": 0.9792, - "step": 6710 - }, - { - "epoch": 0.31131200741770976, - "grad_norm": 3.651477575302124, - "learning_rate": 1.7400725785585965e-05, - "loss": 0.9195, - "step": 6715 - }, - { - "epoch": 0.31154381084840055, - "grad_norm": 4.551967144012451, - "learning_rate": 1.7395281633941986e-05, - "loss": 0.9555, - "step": 6720 - }, - { - "epoch": 0.31177561427909134, - "grad_norm": 3.895969867706299, - "learning_rate": 1.738983264047244e-05, - "loss": 1.0506, - "step": 6725 - }, - { - "epoch": 0.31200741770978213, - "grad_norm": 3.545581817626953, - "learning_rate": 1.738437880874489e-05, - "loss": 0.8803, - "step": 6730 - }, - { - "epoch": 0.31223922114047287, - "grad_norm": 3.2424161434173584, - "learning_rate": 1.737892014233005e-05, - "loss": 1.0651, - "step": 6735 - }, - { - "epoch": 0.31247102457116366, - "grad_norm": 4.444201469421387, - "learning_rate": 1.737345664480181e-05, - "loss": 0.9039, - "step": 6740 - }, - { - "epoch": 0.31270282800185445, - "grad_norm": 4.671426773071289, - "learning_rate": 1.736798831973723e-05, - "loss": 1.0772, - "step": 6745 - }, - { - "epoch": 0.3129346314325452, - "grad_norm": 3.8473031520843506, - "learning_rate": 1.7362515170716512e-05, - "loss": 1.0927, - "step": 6750 - }, - { - "epoch": 0.313166434863236, - "grad_norm": 4.373749732971191, - "learning_rate": 1.735703720132303e-05, - "loss": 0.992, - "step": 6755 - }, - { - "epoch": 0.31339823829392677, - "grad_norm": 4.11835241317749, - "learning_rate": 1.735155441514331e-05, - "loss": 0.8926, - "step": 6760 - }, - { - "epoch": 0.3136300417246175, - "grad_norm": 3.815171957015991, - "learning_rate": 1.7346066815767033e-05, - "loss": 0.9104, - "step": 6765 - }, - { - "epoch": 0.3138618451553083, - "grad_norm": 3.835723638534546, - "learning_rate": 1.734057440678703e-05, - "loss": 0.9115, - "step": 6770 - }, - { - "epoch": 0.3140936485859991, - "grad_norm": 3.634661912918091, - "learning_rate": 1.7335077191799277e-05, - "loss": 0.7149, - "step": 6775 - }, - { - "epoch": 0.3143254520166898, - "grad_norm": 4.546589374542236, - "learning_rate": 1.7329575174402907e-05, - "loss": 1.0683, - "step": 6780 - }, - { - "epoch": 0.3145572554473806, - "grad_norm": 3.3564836978912354, - "learning_rate": 1.732406835820019e-05, - "loss": 0.8881, - "step": 6785 - }, - { - "epoch": 0.3147890588780714, - "grad_norm": 3.6788740158081055, - "learning_rate": 1.7318556746796532e-05, - "loss": 1.0333, - "step": 6790 - }, - { - "epoch": 0.3150208623087622, - "grad_norm": 3.0859086513519287, - "learning_rate": 1.7313040343800493e-05, - "loss": 0.8799, - "step": 6795 - }, - { - "epoch": 0.31525266573945293, - "grad_norm": 4.490921497344971, - "learning_rate": 1.7307519152823763e-05, - "loss": 0.8285, - "step": 6800 - }, - { - "epoch": 0.31525266573945293, - "eval_loss": 0.9875900745391846, - "eval_runtime": 11.2755, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 6800 - }, - { - "epoch": 0.3154844691701437, - "grad_norm": 4.825845241546631, - "learning_rate": 1.730199317748116e-05, - "loss": 1.0106, - "step": 6805 - }, - { - "epoch": 0.3157162726008345, - "grad_norm": 4.728207588195801, - "learning_rate": 1.7296462421390647e-05, - "loss": 0.9964, - "step": 6810 - }, - { - "epoch": 0.31594807603152525, - "grad_norm": 4.3029279708862305, - "learning_rate": 1.7290926888173306e-05, - "loss": 0.9904, - "step": 6815 - }, - { - "epoch": 0.31617987946221604, - "grad_norm": 3.814422130584717, - "learning_rate": 1.7285386581453353e-05, - "loss": 0.9931, - "step": 6820 - }, - { - "epoch": 0.31641168289290683, - "grad_norm": 4.1385393142700195, - "learning_rate": 1.7279841504858127e-05, - "loss": 1.0007, - "step": 6825 - }, - { - "epoch": 0.31664348632359757, - "grad_norm": 3.7476789951324463, - "learning_rate": 1.727429166201809e-05, - "loss": 0.9299, - "step": 6830 - }, - { - "epoch": 0.31687528975428836, - "grad_norm": 3.564824104309082, - "learning_rate": 1.726873705656683e-05, - "loss": 0.9108, - "step": 6835 - }, - { - "epoch": 0.31710709318497915, - "grad_norm": 4.3677215576171875, - "learning_rate": 1.7263177692141045e-05, - "loss": 0.9922, - "step": 6840 - }, - { - "epoch": 0.3173388966156699, - "grad_norm": 3.237433433532715, - "learning_rate": 1.725761357238055e-05, - "loss": 0.7999, - "step": 6845 - }, - { - "epoch": 0.3175707000463607, - "grad_norm": 5.139328479766846, - "learning_rate": 1.7252044700928275e-05, - "loss": 0.9433, - "step": 6850 - }, - { - "epoch": 0.31780250347705147, - "grad_norm": 3.4519400596618652, - "learning_rate": 1.724647108143027e-05, - "loss": 0.8964, - "step": 6855 - }, - { - "epoch": 0.31803430690774226, - "grad_norm": 3.80947208404541, - "learning_rate": 1.724089271753568e-05, - "loss": 1.0493, - "step": 6860 - }, - { - "epoch": 0.318266110338433, - "grad_norm": 4.166898250579834, - "learning_rate": 1.7235309612896756e-05, - "loss": 1.0334, - "step": 6865 - }, - { - "epoch": 0.3184979137691238, - "grad_norm": 3.7955734729766846, - "learning_rate": 1.7229721771168864e-05, - "loss": 1.0212, - "step": 6870 - }, - { - "epoch": 0.3187297171998146, - "grad_norm": 3.6304619312286377, - "learning_rate": 1.7224129196010467e-05, - "loss": 0.948, - "step": 6875 - }, - { - "epoch": 0.3189615206305053, - "grad_norm": 3.9213900566101074, - "learning_rate": 1.7218531891083122e-05, - "loss": 0.97, - "step": 6880 - }, - { - "epoch": 0.3191933240611961, - "grad_norm": 5.918650150299072, - "learning_rate": 1.721292986005149e-05, - "loss": 1.0927, - "step": 6885 - }, - { - "epoch": 0.3194251274918869, - "grad_norm": 3.4678218364715576, - "learning_rate": 1.720732310658332e-05, - "loss": 0.8423, - "step": 6890 - }, - { - "epoch": 0.31965693092257763, - "grad_norm": 4.282207489013672, - "learning_rate": 1.7201711634349457e-05, - "loss": 1.029, - "step": 6895 - }, - { - "epoch": 0.3198887343532684, - "grad_norm": 4.10146951675415, - "learning_rate": 1.7196095447023833e-05, - "loss": 0.8673, - "step": 6900 - }, - { - "epoch": 0.3198887343532684, - "eval_loss": 0.9845543503761292, - "eval_runtime": 11.2805, - "eval_samples_per_second": 11.258, - "eval_steps_per_second": 11.258, - "step": 6900 - }, - { - "epoch": 0.3201205377839592, - "grad_norm": 4.028499603271484, - "learning_rate": 1.7190474548283468e-05, - "loss": 0.8934, - "step": 6905 - }, - { - "epoch": 0.32035234121464995, - "grad_norm": 3.83030104637146, - "learning_rate": 1.718484894180847e-05, - "loss": 1.0745, - "step": 6910 - }, - { - "epoch": 0.32058414464534074, - "grad_norm": 3.5020344257354736, - "learning_rate": 1.717921863128202e-05, - "loss": 0.8391, - "step": 6915 - }, - { - "epoch": 0.32081594807603153, - "grad_norm": 3.863657236099243, - "learning_rate": 1.717358362039039e-05, - "loss": 0.9929, - "step": 6920 - }, - { - "epoch": 0.3210477515067223, - "grad_norm": 3.93176007270813, - "learning_rate": 1.716794391282292e-05, - "loss": 1.0097, - "step": 6925 - }, - { - "epoch": 0.32127955493741306, - "grad_norm": 4.715558052062988, - "learning_rate": 1.7162299512272036e-05, - "loss": 0.9322, - "step": 6930 - }, - { - "epoch": 0.32151135836810385, - "grad_norm": 3.9522275924682617, - "learning_rate": 1.7156650422433225e-05, - "loss": 0.8993, - "step": 6935 - }, - { - "epoch": 0.32174316179879464, - "grad_norm": 5.151059627532959, - "learning_rate": 1.715099664700505e-05, - "loss": 0.8975, - "step": 6940 - }, - { - "epoch": 0.3219749652294854, - "grad_norm": 3.8080761432647705, - "learning_rate": 1.7145338189689136e-05, - "loss": 0.8875, - "step": 6945 - }, - { - "epoch": 0.32220676866017617, - "grad_norm": 3.906095504760742, - "learning_rate": 1.7139675054190186e-05, - "loss": 0.944, - "step": 6950 - }, - { - "epoch": 0.32243857209086696, - "grad_norm": 4.30042028427124, - "learning_rate": 1.7134007244215955e-05, - "loss": 0.9607, - "step": 6955 - }, - { - "epoch": 0.3226703755215577, - "grad_norm": 3.7272627353668213, - "learning_rate": 1.7128334763477256e-05, - "loss": 0.9211, - "step": 6960 - }, - { - "epoch": 0.3229021789522485, - "grad_norm": 4.208070755004883, - "learning_rate": 1.7122657615687975e-05, - "loss": 0.9681, - "step": 6965 - }, - { - "epoch": 0.3231339823829393, - "grad_norm": 3.3243024349212646, - "learning_rate": 1.711697580456504e-05, - "loss": 0.9908, - "step": 6970 - }, - { - "epoch": 0.32336578581363007, - "grad_norm": 3.6350746154785156, - "learning_rate": 1.711128933382843e-05, - "loss": 1.0272, - "step": 6975 - }, - { - "epoch": 0.3235975892443208, - "grad_norm": 5.2530198097229, - "learning_rate": 1.710559820720119e-05, - "loss": 1.0747, - "step": 6980 - }, - { - "epoch": 0.3238293926750116, - "grad_norm": 3.2672336101531982, - "learning_rate": 1.7099902428409404e-05, - "loss": 0.8968, - "step": 6985 - }, - { - "epoch": 0.3240611961057024, - "grad_norm": 3.2137134075164795, - "learning_rate": 1.7094202001182192e-05, - "loss": 1.0142, - "step": 6990 - }, - { - "epoch": 0.3242929995363931, - "grad_norm": 4.092966079711914, - "learning_rate": 1.708849692925174e-05, - "loss": 0.9918, - "step": 6995 - }, - { - "epoch": 0.3245248029670839, - "grad_norm": 5.9110331535339355, - "learning_rate": 1.7082787216353255e-05, - "loss": 1.0513, - "step": 7000 - }, - { - "epoch": 0.3245248029670839, - "eval_loss": 0.9831182956695557, - "eval_runtime": 11.2664, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 7000 - }, - { - "epoch": 0.3247566063977747, - "grad_norm": 4.733374118804932, - "learning_rate": 1.707707286622499e-05, - "loss": 1.0291, - "step": 7005 - }, - { - "epoch": 0.32498840982846544, - "grad_norm": 3.4680347442626953, - "learning_rate": 1.7071353882608238e-05, - "loss": 1.0134, - "step": 7010 - }, - { - "epoch": 0.32522021325915623, - "grad_norm": 3.5740184783935547, - "learning_rate": 1.706563026924732e-05, - "loss": 0.9281, - "step": 7015 - }, - { - "epoch": 0.325452016689847, - "grad_norm": 4.178410530090332, - "learning_rate": 1.7059902029889585e-05, - "loss": 0.9851, - "step": 7020 - }, - { - "epoch": 0.32568382012053776, - "grad_norm": 3.8983752727508545, - "learning_rate": 1.7054169168285425e-05, - "loss": 0.9198, - "step": 7025 - }, - { - "epoch": 0.32591562355122855, - "grad_norm": 3.6942498683929443, - "learning_rate": 1.7048431688188244e-05, - "loss": 0.9314, - "step": 7030 - }, - { - "epoch": 0.32614742698191934, - "grad_norm": 3.4959239959716797, - "learning_rate": 1.7042689593354472e-05, - "loss": 0.8442, - "step": 7035 - }, - { - "epoch": 0.32637923041261013, - "grad_norm": 3.6562867164611816, - "learning_rate": 1.7036942887543566e-05, - "loss": 1.0384, - "step": 7040 - }, - { - "epoch": 0.32661103384330087, - "grad_norm": 3.508683204650879, - "learning_rate": 1.7031191574518e-05, - "loss": 1.0261, - "step": 7045 - }, - { - "epoch": 0.32684283727399166, - "grad_norm": 4.6104278564453125, - "learning_rate": 1.702543565804326e-05, - "loss": 0.9351, - "step": 7050 - }, - { - "epoch": 0.32707464070468245, - "grad_norm": 3.4227206707000732, - "learning_rate": 1.7019675141887855e-05, - "loss": 0.8968, - "step": 7055 - }, - { - "epoch": 0.3273064441353732, - "grad_norm": 3.3952324390411377, - "learning_rate": 1.7013910029823292e-05, - "loss": 0.8983, - "step": 7060 - }, - { - "epoch": 0.327538247566064, - "grad_norm": 4.351296424865723, - "learning_rate": 1.7008140325624104e-05, - "loss": 0.9035, - "step": 7065 - }, - { - "epoch": 0.32777005099675477, - "grad_norm": 3.7643845081329346, - "learning_rate": 1.7002366033067813e-05, - "loss": 0.8031, - "step": 7070 - }, - { - "epoch": 0.3280018544274455, - "grad_norm": 4.087846279144287, - "learning_rate": 1.699658715593496e-05, - "loss": 1.1498, - "step": 7075 - }, - { - "epoch": 0.3282336578581363, - "grad_norm": 4.362499237060547, - "learning_rate": 1.699080369800908e-05, - "loss": 0.9128, - "step": 7080 - }, - { - "epoch": 0.3284654612888271, - "grad_norm": 3.442183494567871, - "learning_rate": 1.6985015663076702e-05, - "loss": 0.8206, - "step": 7085 - }, - { - "epoch": 0.3286972647195178, - "grad_norm": 3.769672155380249, - "learning_rate": 1.697922305492736e-05, - "loss": 1.019, - "step": 7090 - }, - { - "epoch": 0.3289290681502086, - "grad_norm": 4.2746381759643555, - "learning_rate": 1.6973425877353587e-05, - "loss": 0.8872, - "step": 7095 - }, - { - "epoch": 0.3291608715808994, - "grad_norm": 4.174293041229248, - "learning_rate": 1.6967624134150895e-05, - "loss": 0.9736, - "step": 7100 - }, - { - "epoch": 0.3291608715808994, - "eval_loss": 0.9836241602897644, - "eval_runtime": 11.2694, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 7100 - }, - { - "epoch": 0.3293926750115902, - "grad_norm": 2.938143491744995, - "learning_rate": 1.696181782911779e-05, - "loss": 0.8206, - "step": 7105 - }, - { - "epoch": 0.32962447844228093, - "grad_norm": 3.7101032733917236, - "learning_rate": 1.6956006966055772e-05, - "loss": 0.9207, - "step": 7110 - }, - { - "epoch": 0.3298562818729717, - "grad_norm": 3.974308490753174, - "learning_rate": 1.6950191548769314e-05, - "loss": 0.9115, - "step": 7115 - }, - { - "epoch": 0.3300880853036625, - "grad_norm": 5.249458312988281, - "learning_rate": 1.6944371581065873e-05, - "loss": 1.0737, - "step": 7120 - }, - { - "epoch": 0.33031988873435325, - "grad_norm": 4.166738033294678, - "learning_rate": 1.693854706675589e-05, - "loss": 0.9348, - "step": 7125 - }, - { - "epoch": 0.33055169216504404, - "grad_norm": 4.688779830932617, - "learning_rate": 1.6932718009652785e-05, - "loss": 0.8953, - "step": 7130 - }, - { - "epoch": 0.33078349559573483, - "grad_norm": 3.903956890106201, - "learning_rate": 1.6926884413572942e-05, - "loss": 0.9307, - "step": 7135 - }, - { - "epoch": 0.33101529902642557, - "grad_norm": 4.132503509521484, - "learning_rate": 1.6921046282335724e-05, - "loss": 0.969, - "step": 7140 - }, - { - "epoch": 0.33124710245711636, - "grad_norm": 3.627603054046631, - "learning_rate": 1.691520361976347e-05, - "loss": 0.8196, - "step": 7145 - }, - { - "epoch": 0.33147890588780715, - "grad_norm": 3.7588562965393066, - "learning_rate": 1.690935642968146e-05, - "loss": 0.8648, - "step": 7150 - }, - { - "epoch": 0.3317107093184979, - "grad_norm": 4.168130874633789, - "learning_rate": 1.6903504715917972e-05, - "loss": 1.1597, - "step": 7155 - }, - { - "epoch": 0.3319425127491887, - "grad_norm": 3.6999406814575195, - "learning_rate": 1.6897648482304226e-05, - "loss": 0.7546, - "step": 7160 - }, - { - "epoch": 0.33217431617987947, - "grad_norm": 3.8351409435272217, - "learning_rate": 1.68917877326744e-05, - "loss": 0.9968, - "step": 7165 - }, - { - "epoch": 0.33240611961057026, - "grad_norm": 3.701868772506714, - "learning_rate": 1.6885922470865636e-05, - "loss": 0.9861, - "step": 7170 - }, - { - "epoch": 0.332637923041261, - "grad_norm": 3.982545852661133, - "learning_rate": 1.6880052700718028e-05, - "loss": 1.0263, - "step": 7175 - }, - { - "epoch": 0.3328697264719518, - "grad_norm": 4.025231838226318, - "learning_rate": 1.687417842607462e-05, - "loss": 0.8908, - "step": 7180 - }, - { - "epoch": 0.3331015299026426, - "grad_norm": 3.8443965911865234, - "learning_rate": 1.686829965078141e-05, - "loss": 0.8625, - "step": 7185 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 4.3348517417907715, - "learning_rate": 1.686241637868734e-05, - "loss": 1.0526, - "step": 7190 - }, - { - "epoch": 0.3335651367640241, - "grad_norm": 4.766390800476074, - "learning_rate": 1.6856528613644284e-05, - "loss": 0.9305, - "step": 7195 - }, - { - "epoch": 0.3337969401947149, - "grad_norm": 4.653823375701904, - "learning_rate": 1.6850636359507084e-05, - "loss": 0.985, - "step": 7200 - }, - { - "epoch": 0.3337969401947149, - "eval_loss": 0.9839121103286743, - "eval_runtime": 11.2603, - "eval_samples_per_second": 11.279, - "eval_steps_per_second": 11.279, - "step": 7200 - }, - { - "epoch": 0.33402874362540563, - "grad_norm": 4.698121070861816, - "learning_rate": 1.68447396201335e-05, - "loss": 0.8599, - "step": 7205 - }, - { - "epoch": 0.3342605470560964, - "grad_norm": 3.780442953109741, - "learning_rate": 1.6838838399384233e-05, - "loss": 0.8712, - "step": 7210 - }, - { - "epoch": 0.3344923504867872, - "grad_norm": 3.3134045600891113, - "learning_rate": 1.6832932701122917e-05, - "loss": 0.9061, - "step": 7215 - }, - { - "epoch": 0.33472415391747795, - "grad_norm": 4.624858379364014, - "learning_rate": 1.682702252921613e-05, - "loss": 0.8832, - "step": 7220 - }, - { - "epoch": 0.33495595734816874, - "grad_norm": 3.465853214263916, - "learning_rate": 1.682110788753336e-05, - "loss": 0.7583, - "step": 7225 - }, - { - "epoch": 0.33518776077885953, - "grad_norm": 4.856706142425537, - "learning_rate": 1.6815188779947035e-05, - "loss": 1.1742, - "step": 7230 - }, - { - "epoch": 0.3354195642095503, - "grad_norm": 3.952514171600342, - "learning_rate": 1.68092652103325e-05, - "loss": 1.0064, - "step": 7235 - }, - { - "epoch": 0.33565136764024106, - "grad_norm": 3.868814706802368, - "learning_rate": 1.6803337182568025e-05, - "loss": 0.9385, - "step": 7240 - }, - { - "epoch": 0.33588317107093185, - "grad_norm": 3.863393783569336, - "learning_rate": 1.67974047005348e-05, - "loss": 1.0178, - "step": 7245 - }, - { - "epoch": 0.33611497450162264, - "grad_norm": 4.632596492767334, - "learning_rate": 1.6791467768116927e-05, - "loss": 1.0524, - "step": 7250 - }, - { - "epoch": 0.3363467779323134, - "grad_norm": 3.71608829498291, - "learning_rate": 1.6785526389201422e-05, - "loss": 1.0079, - "step": 7255 - }, - { - "epoch": 0.33657858136300417, - "grad_norm": 4.460151672363281, - "learning_rate": 1.6779580567678218e-05, - "loss": 0.8563, - "step": 7260 - }, - { - "epoch": 0.33681038479369496, - "grad_norm": 3.5773184299468994, - "learning_rate": 1.677363030744015e-05, - "loss": 0.9136, - "step": 7265 - }, - { - "epoch": 0.3370421882243857, - "grad_norm": 4.082240104675293, - "learning_rate": 1.6767675612382964e-05, - "loss": 1.0243, - "step": 7270 - }, - { - "epoch": 0.3372739916550765, - "grad_norm": 4.144321918487549, - "learning_rate": 1.6761716486405305e-05, - "loss": 0.9109, - "step": 7275 - }, - { - "epoch": 0.3375057950857673, - "grad_norm": 3.912290334701538, - "learning_rate": 1.6755752933408717e-05, - "loss": 0.8532, - "step": 7280 - }, - { - "epoch": 0.33773759851645807, - "grad_norm": 4.518337249755859, - "learning_rate": 1.6749784957297656e-05, - "loss": 0.8114, - "step": 7285 - }, - { - "epoch": 0.3379694019471488, - "grad_norm": 4.082379341125488, - "learning_rate": 1.674381256197946e-05, - "loss": 0.9259, - "step": 7290 - }, - { - "epoch": 0.3382012053778396, - "grad_norm": 3.4231996536254883, - "learning_rate": 1.6737835751364364e-05, - "loss": 0.9061, - "step": 7295 - }, - { - "epoch": 0.3384330088085304, - "grad_norm": 4.965606212615967, - "learning_rate": 1.6731854529365494e-05, - "loss": 1.0355, - "step": 7300 - }, - { - "epoch": 0.3384330088085304, - "eval_loss": 0.9803537130355835, - "eval_runtime": 11.265, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 7300 - }, - { - "epoch": 0.3386648122392211, - "grad_norm": 4.2138237953186035, - "learning_rate": 1.6725868899898873e-05, - "loss": 0.9846, - "step": 7305 - }, - { - "epoch": 0.3388966156699119, - "grad_norm": 4.691622734069824, - "learning_rate": 1.671987886688339e-05, - "loss": 1.0037, - "step": 7310 - }, - { - "epoch": 0.3391284191006027, - "grad_norm": 3.772075653076172, - "learning_rate": 1.6713884434240838e-05, - "loss": 0.9368, - "step": 7315 - }, - { - "epoch": 0.33936022253129344, - "grad_norm": 3.8454747200012207, - "learning_rate": 1.6707885605895876e-05, - "loss": 0.9976, - "step": 7320 - }, - { - "epoch": 0.33959202596198423, - "grad_norm": 3.014082193374634, - "learning_rate": 1.6701882385776047e-05, - "loss": 0.9104, - "step": 7325 - }, - { - "epoch": 0.339823829392675, - "grad_norm": 4.067466735839844, - "learning_rate": 1.6695874777811768e-05, - "loss": 1.014, - "step": 7330 - }, - { - "epoch": 0.34005563282336576, - "grad_norm": 4.160797595977783, - "learning_rate": 1.6689862785936336e-05, - "loss": 0.9069, - "step": 7335 - }, - { - "epoch": 0.34028743625405655, - "grad_norm": 3.3378775119781494, - "learning_rate": 1.66838464140859e-05, - "loss": 0.7564, - "step": 7340 - }, - { - "epoch": 0.34051923968474734, - "grad_norm": 3.7165188789367676, - "learning_rate": 1.6677825666199497e-05, - "loss": 0.8907, - "step": 7345 - }, - { - "epoch": 0.34075104311543813, - "grad_norm": 4.068546772003174, - "learning_rate": 1.667180054621902e-05, - "loss": 0.8771, - "step": 7350 - }, - { - "epoch": 0.34098284654612887, - "grad_norm": 4.538022994995117, - "learning_rate": 1.6665771058089218e-05, - "loss": 1.0441, - "step": 7355 - }, - { - "epoch": 0.34121464997681966, - "grad_norm": 3.9702048301696777, - "learning_rate": 1.6659737205757713e-05, - "loss": 0.989, - "step": 7360 - }, - { - "epoch": 0.34144645340751045, - "grad_norm": 3.565910816192627, - "learning_rate": 1.6653698993174975e-05, - "loss": 0.939, - "step": 7365 - }, - { - "epoch": 0.3416782568382012, - "grad_norm": 3.3991103172302246, - "learning_rate": 1.6647656424294334e-05, - "loss": 0.9982, - "step": 7370 - }, - { - "epoch": 0.341910060268892, - "grad_norm": 3.2477567195892334, - "learning_rate": 1.6641609503071968e-05, - "loss": 0.9894, - "step": 7375 - }, - { - "epoch": 0.34214186369958277, - "grad_norm": 3.702672004699707, - "learning_rate": 1.663555823346691e-05, - "loss": 0.9833, - "step": 7380 - }, - { - "epoch": 0.3423736671302735, - "grad_norm": 3.8544015884399414, - "learning_rate": 1.6629502619441028e-05, - "loss": 1.0198, - "step": 7385 - }, - { - "epoch": 0.3426054705609643, - "grad_norm": 3.860546827316284, - "learning_rate": 1.6623442664959054e-05, - "loss": 0.849, - "step": 7390 - }, - { - "epoch": 0.3428372739916551, - "grad_norm": 4.790958881378174, - "learning_rate": 1.6617378373988544e-05, - "loss": 0.8904, - "step": 7395 - }, - { - "epoch": 0.3430690774223458, - "grad_norm": 4.495256423950195, - "learning_rate": 1.6611309750499898e-05, - "loss": 1.0719, - "step": 7400 - }, - { - "epoch": 0.3430690774223458, - "eval_loss": 0.9776688814163208, - "eval_runtime": 11.2736, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 7400 - }, - { - "epoch": 0.3433008808530366, - "grad_norm": 3.7262792587280273, - "learning_rate": 1.6605236798466358e-05, - "loss": 0.8616, - "step": 7405 - }, - { - "epoch": 0.3435326842837274, - "grad_norm": 3.547213554382324, - "learning_rate": 1.6599159521863995e-05, - "loss": 0.9388, - "step": 7410 - }, - { - "epoch": 0.3437644877144182, - "grad_norm": 3.846677541732788, - "learning_rate": 1.6593077924671712e-05, - "loss": 1.0564, - "step": 7415 - }, - { - "epoch": 0.34399629114510893, - "grad_norm": 3.696018695831299, - "learning_rate": 1.6586992010871245e-05, - "loss": 1.0205, - "step": 7420 - }, - { - "epoch": 0.3442280945757997, - "grad_norm": 4.1325249671936035, - "learning_rate": 1.658090178444715e-05, - "loss": 0.8804, - "step": 7425 - }, - { - "epoch": 0.3444598980064905, - "grad_norm": 4.22160005569458, - "learning_rate": 1.657480724938681e-05, - "loss": 0.9673, - "step": 7430 - }, - { - "epoch": 0.34469170143718125, - "grad_norm": 3.7792885303497314, - "learning_rate": 1.6568708409680422e-05, - "loss": 0.9713, - "step": 7435 - }, - { - "epoch": 0.34492350486787204, - "grad_norm": 3.9127721786499023, - "learning_rate": 1.6562605269321017e-05, - "loss": 0.8601, - "step": 7440 - }, - { - "epoch": 0.34515530829856284, - "grad_norm": 3.9979143142700195, - "learning_rate": 1.6556497832304425e-05, - "loss": 1.0057, - "step": 7445 - }, - { - "epoch": 0.34538711172925357, - "grad_norm": 4.151058673858643, - "learning_rate": 1.6550386102629302e-05, - "loss": 1.0317, - "step": 7450 - }, - { - "epoch": 0.34561891515994436, - "grad_norm": 4.163232803344727, - "learning_rate": 1.6544270084297107e-05, - "loss": 1.0535, - "step": 7455 - }, - { - "epoch": 0.34585071859063515, - "grad_norm": 3.3977575302124023, - "learning_rate": 1.6538149781312102e-05, - "loss": 0.9496, - "step": 7460 - }, - { - "epoch": 0.3460825220213259, - "grad_norm": 4.182466983795166, - "learning_rate": 1.6532025197681368e-05, - "loss": 0.9746, - "step": 7465 - }, - { - "epoch": 0.3463143254520167, - "grad_norm": 4.1667256355285645, - "learning_rate": 1.652589633741478e-05, - "loss": 0.9128, - "step": 7470 - }, - { - "epoch": 0.34654612888270747, - "grad_norm": 3.694902181625366, - "learning_rate": 1.6519763204525015e-05, - "loss": 0.8044, - "step": 7475 - }, - { - "epoch": 0.34677793231339826, - "grad_norm": 4.3444085121154785, - "learning_rate": 1.6513625803027545e-05, - "loss": 0.9057, - "step": 7480 - }, - { - "epoch": 0.347009735744089, - "grad_norm": 4.2170090675354, - "learning_rate": 1.650748413694064e-05, - "loss": 1.0288, - "step": 7485 - }, - { - "epoch": 0.3472415391747798, - "grad_norm": 3.5477216243743896, - "learning_rate": 1.650133821028536e-05, - "loss": 0.9128, - "step": 7490 - }, - { - "epoch": 0.3474733426054706, - "grad_norm": 4.178347587585449, - "learning_rate": 1.6495188027085554e-05, - "loss": 1.012, - "step": 7495 - }, - { - "epoch": 0.3477051460361613, - "grad_norm": 4.44584321975708, - "learning_rate": 1.648903359136786e-05, - "loss": 1.1048, - "step": 7500 - }, - { - "epoch": 0.3477051460361613, - "eval_loss": 0.9784466624259949, - "eval_runtime": 11.2673, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 7500 - }, - { - "epoch": 0.3479369494668521, - "grad_norm": 3.767441511154175, - "learning_rate": 1.64828749071617e-05, - "loss": 1.0245, - "step": 7505 - }, - { - "epoch": 0.3481687528975429, - "grad_norm": 3.552576780319214, - "learning_rate": 1.6476711978499275e-05, - "loss": 0.8817, - "step": 7510 - }, - { - "epoch": 0.34840055632823363, - "grad_norm": 4.1061906814575195, - "learning_rate": 1.6470544809415568e-05, - "loss": 1.0079, - "step": 7515 - }, - { - "epoch": 0.3486323597589244, - "grad_norm": 4.06695032119751, - "learning_rate": 1.6464373403948336e-05, - "loss": 0.957, - "step": 7520 - }, - { - "epoch": 0.3488641631896152, - "grad_norm": 3.781661033630371, - "learning_rate": 1.6458197766138115e-05, - "loss": 0.9984, - "step": 7525 - }, - { - "epoch": 0.34909596662030595, - "grad_norm": 3.849099636077881, - "learning_rate": 1.6452017900028198e-05, - "loss": 0.9009, - "step": 7530 - }, - { - "epoch": 0.34932777005099674, - "grad_norm": 4.14891242980957, - "learning_rate": 1.6445833809664664e-05, - "loss": 1.1266, - "step": 7535 - }, - { - "epoch": 0.34955957348168754, - "grad_norm": 5.304938316345215, - "learning_rate": 1.6439645499096347e-05, - "loss": 0.9744, - "step": 7540 - }, - { - "epoch": 0.3497913769123783, - "grad_norm": 3.5674333572387695, - "learning_rate": 1.6433452972374846e-05, - "loss": 0.9666, - "step": 7545 - }, - { - "epoch": 0.35002318034306906, - "grad_norm": 3.3488993644714355, - "learning_rate": 1.6427256233554523e-05, - "loss": 0.8965, - "step": 7550 - }, - { - "epoch": 0.35025498377375985, - "grad_norm": 3.5717201232910156, - "learning_rate": 1.6421055286692492e-05, - "loss": 0.8758, - "step": 7555 - }, - { - "epoch": 0.35048678720445064, - "grad_norm": 4.32782506942749, - "learning_rate": 1.641485013584863e-05, - "loss": 1.021, - "step": 7560 - }, - { - "epoch": 0.3507185906351414, - "grad_norm": 4.122309684753418, - "learning_rate": 1.640864078508556e-05, - "loss": 0.93, - "step": 7565 - }, - { - "epoch": 0.35095039406583217, - "grad_norm": 3.43186616897583, - "learning_rate": 1.640242723846866e-05, - "loss": 0.9018, - "step": 7570 - }, - { - "epoch": 0.35118219749652296, - "grad_norm": 4.272997856140137, - "learning_rate": 1.6396209500066045e-05, - "loss": 0.9095, - "step": 7575 - }, - { - "epoch": 0.3514140009272137, - "grad_norm": 3.9995079040527344, - "learning_rate": 1.6389987573948588e-05, - "loss": 0.9397, - "step": 7580 - }, - { - "epoch": 0.3516458043579045, - "grad_norm": 3.7181596755981445, - "learning_rate": 1.6383761464189895e-05, - "loss": 0.7581, - "step": 7585 - }, - { - "epoch": 0.3518776077885953, - "grad_norm": 3.570861577987671, - "learning_rate": 1.6377531174866315e-05, - "loss": 0.973, - "step": 7590 - }, - { - "epoch": 0.3521094112192861, - "grad_norm": 3.842984676361084, - "learning_rate": 1.637129671005693e-05, - "loss": 0.939, - "step": 7595 - }, - { - "epoch": 0.3523412146499768, - "grad_norm": 3.6865546703338623, - "learning_rate": 1.6365058073843562e-05, - "loss": 0.958, - "step": 7600 - }, - { - "epoch": 0.3523412146499768, - "eval_loss": 0.9766842722892761, - "eval_runtime": 11.2741, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 7600 - }, - { - "epoch": 0.3525730180806676, - "grad_norm": 3.9954993724823, - "learning_rate": 1.6358815270310752e-05, - "loss": 1.032, - "step": 7605 - }, - { - "epoch": 0.3528048215113584, - "grad_norm": 3.9146227836608887, - "learning_rate": 1.6352568303545782e-05, - "loss": 0.9987, - "step": 7610 - }, - { - "epoch": 0.3530366249420491, - "grad_norm": 3.8388328552246094, - "learning_rate": 1.6346317177638658e-05, - "loss": 0.8106, - "step": 7615 - }, - { - "epoch": 0.3532684283727399, - "grad_norm": 3.9061410427093506, - "learning_rate": 1.6340061896682098e-05, - "loss": 1.0071, - "step": 7620 - }, - { - "epoch": 0.3535002318034307, - "grad_norm": 5.165677547454834, - "learning_rate": 1.6333802464771557e-05, - "loss": 1.1079, - "step": 7625 - }, - { - "epoch": 0.35373203523412144, - "grad_norm": 4.240540027618408, - "learning_rate": 1.632753888600519e-05, - "loss": 0.9281, - "step": 7630 - }, - { - "epoch": 0.35396383866481224, - "grad_norm": 3.3363611698150635, - "learning_rate": 1.6321271164483884e-05, - "loss": 0.9122, - "step": 7635 - }, - { - "epoch": 0.354195642095503, - "grad_norm": 3.6181395053863525, - "learning_rate": 1.6314999304311222e-05, - "loss": 0.7473, - "step": 7640 - }, - { - "epoch": 0.35442744552619376, - "grad_norm": 3.771057605743408, - "learning_rate": 1.630872330959352e-05, - "loss": 1.0243, - "step": 7645 - }, - { - "epoch": 0.35465924895688455, - "grad_norm": 4.966143608093262, - "learning_rate": 1.630244318443977e-05, - "loss": 0.9069, - "step": 7650 - }, - { - "epoch": 0.35489105238757535, - "grad_norm": 4.111125946044922, - "learning_rate": 1.629615893296169e-05, - "loss": 0.9339, - "step": 7655 - }, - { - "epoch": 0.35512285581826614, - "grad_norm": 3.705050230026245, - "learning_rate": 1.6289870559273696e-05, - "loss": 0.8188, - "step": 7660 - }, - { - "epoch": 0.3553546592489569, - "grad_norm": 3.890373945236206, - "learning_rate": 1.62835780674929e-05, - "loss": 0.7916, - "step": 7665 - }, - { - "epoch": 0.35558646267964766, - "grad_norm": 3.7870006561279297, - "learning_rate": 1.627728146173911e-05, - "loss": 0.8877, - "step": 7670 - }, - { - "epoch": 0.35581826611033845, - "grad_norm": 4.223922252655029, - "learning_rate": 1.627098074613483e-05, - "loss": 0.9675, - "step": 7675 - }, - { - "epoch": 0.3560500695410292, - "grad_norm": 3.460359573364258, - "learning_rate": 1.626467592480525e-05, - "loss": 0.956, - "step": 7680 - }, - { - "epoch": 0.35628187297172, - "grad_norm": 4.657145977020264, - "learning_rate": 1.625836700187825e-05, - "loss": 0.8836, - "step": 7685 - }, - { - "epoch": 0.3565136764024108, - "grad_norm": 3.332714319229126, - "learning_rate": 1.625205398148441e-05, - "loss": 0.9026, - "step": 7690 - }, - { - "epoch": 0.3567454798331015, - "grad_norm": 3.8605220317840576, - "learning_rate": 1.624573686775696e-05, - "loss": 1.0878, - "step": 7695 - }, - { - "epoch": 0.3569772832637923, - "grad_norm": 3.559641122817993, - "learning_rate": 1.6239415664831844e-05, - "loss": 0.9073, - "step": 7700 - }, - { - "epoch": 0.3569772832637923, - "eval_loss": 0.9736750721931458, - "eval_runtime": 11.2674, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 7700 - }, - { - "epoch": 0.3572090866944831, - "grad_norm": 3.865588426589966, - "learning_rate": 1.623309037684766e-05, - "loss": 1.0368, - "step": 7705 - }, - { - "epoch": 0.3574408901251738, - "grad_norm": 6.948596000671387, - "learning_rate": 1.6226761007945696e-05, - "loss": 0.8942, - "step": 7710 - }, - { - "epoch": 0.3576726935558646, - "grad_norm": 4.2951812744140625, - "learning_rate": 1.62204275622699e-05, - "loss": 0.7803, - "step": 7715 - }, - { - "epoch": 0.3579044969865554, - "grad_norm": 3.882315158843994, - "learning_rate": 1.62140900439669e-05, - "loss": 0.8272, - "step": 7720 - }, - { - "epoch": 0.3581363004172462, - "grad_norm": 4.093032360076904, - "learning_rate": 1.6207748457185974e-05, - "loss": 0.8589, - "step": 7725 - }, - { - "epoch": 0.35836810384793694, - "grad_norm": 3.319401264190674, - "learning_rate": 1.6201402806079086e-05, - "loss": 0.7305, - "step": 7730 - }, - { - "epoch": 0.3585999072786277, - "grad_norm": 3.839540958404541, - "learning_rate": 1.6195053094800844e-05, - "loss": 1.0484, - "step": 7735 - }, - { - "epoch": 0.3588317107093185, - "grad_norm": 3.2904021739959717, - "learning_rate": 1.6188699327508518e-05, - "loss": 0.9034, - "step": 7740 - }, - { - "epoch": 0.35906351414000925, - "grad_norm": 3.4221081733703613, - "learning_rate": 1.6182341508362044e-05, - "loss": 0.9592, - "step": 7745 - }, - { - "epoch": 0.35929531757070005, - "grad_norm": 4.381012916564941, - "learning_rate": 1.6175979641523986e-05, - "loss": 1.008, - "step": 7750 - }, - { - "epoch": 0.35952712100139084, - "grad_norm": 4.044552326202393, - "learning_rate": 1.6169613731159586e-05, - "loss": 0.8802, - "step": 7755 - }, - { - "epoch": 0.3597589244320816, - "grad_norm": 3.523761510848999, - "learning_rate": 1.6163243781436723e-05, - "loss": 0.9593, - "step": 7760 - }, - { - "epoch": 0.35999072786277236, - "grad_norm": 3.548126697540283, - "learning_rate": 1.615686979652591e-05, - "loss": 0.8895, - "step": 7765 - }, - { - "epoch": 0.36022253129346316, - "grad_norm": 4.415576457977295, - "learning_rate": 1.615049178060032e-05, - "loss": 0.9599, - "step": 7770 - }, - { - "epoch": 0.3604543347241539, - "grad_norm": 3.6763060092926025, - "learning_rate": 1.6144109737835748e-05, - "loss": 1.0321, - "step": 7775 - }, - { - "epoch": 0.3606861381548447, - "grad_norm": 3.7065987586975098, - "learning_rate": 1.613772367241064e-05, - "loss": 0.9211, - "step": 7780 - }, - { - "epoch": 0.3609179415855355, - "grad_norm": 3.909865379333496, - "learning_rate": 1.6131333588506064e-05, - "loss": 0.9941, - "step": 7785 - }, - { - "epoch": 0.36114974501622626, - "grad_norm": 3.2999017238616943, - "learning_rate": 1.6124939490305732e-05, - "loss": 0.8126, - "step": 7790 - }, - { - "epoch": 0.361381548446917, - "grad_norm": 4.182840347290039, - "learning_rate": 1.6118541381995966e-05, - "loss": 0.9713, - "step": 7795 - }, - { - "epoch": 0.3616133518776078, - "grad_norm": 4.793496131896973, - "learning_rate": 1.6112139267765737e-05, - "loss": 0.9375, - "step": 7800 - }, - { - "epoch": 0.3616133518776078, - "eval_loss": 0.9727134108543396, - "eval_runtime": 11.2846, - "eval_samples_per_second": 11.254, - "eval_steps_per_second": 11.254, - "step": 7800 - }, - { - "epoch": 0.3618451553082986, - "grad_norm": 3.9969582557678223, - "learning_rate": 1.6105733151806615e-05, - "loss": 1.021, - "step": 7805 - }, - { - "epoch": 0.3620769587389893, - "grad_norm": 3.9675846099853516, - "learning_rate": 1.60993230383128e-05, - "loss": 0.9999, - "step": 7810 - }, - { - "epoch": 0.3623087621696801, - "grad_norm": 3.7798831462860107, - "learning_rate": 1.609290893148112e-05, - "loss": 1.0193, - "step": 7815 - }, - { - "epoch": 0.3625405656003709, - "grad_norm": 4.016881942749023, - "learning_rate": 1.6086490835511e-05, - "loss": 1.0227, - "step": 7820 - }, - { - "epoch": 0.36277236903106164, - "grad_norm": 3.7668068408966064, - "learning_rate": 1.6080068754604486e-05, - "loss": 1.0612, - "step": 7825 - }, - { - "epoch": 0.36300417246175243, - "grad_norm": 3.260951280593872, - "learning_rate": 1.6073642692966233e-05, - "loss": 0.955, - "step": 7830 - }, - { - "epoch": 0.3632359758924432, - "grad_norm": 3.218118190765381, - "learning_rate": 1.6067212654803503e-05, - "loss": 0.7994, - "step": 7835 - }, - { - "epoch": 0.36346777932313395, - "grad_norm": 4.213292121887207, - "learning_rate": 1.606077864432615e-05, - "loss": 0.909, - "step": 7840 - }, - { - "epoch": 0.36369958275382475, - "grad_norm": 4.328354835510254, - "learning_rate": 1.6054340665746645e-05, - "loss": 0.9793, - "step": 7845 - }, - { - "epoch": 0.36393138618451554, - "grad_norm": 3.9034736156463623, - "learning_rate": 1.6047898723280045e-05, - "loss": 0.9643, - "step": 7850 - }, - { - "epoch": 0.36416318961520633, - "grad_norm": 3.579296350479126, - "learning_rate": 1.6041452821144013e-05, - "loss": 0.8492, - "step": 7855 - }, - { - "epoch": 0.36439499304589706, - "grad_norm": 6.530494689941406, - "learning_rate": 1.6035002963558796e-05, - "loss": 1.1487, - "step": 7860 - }, - { - "epoch": 0.36462679647658786, - "grad_norm": 4.108696937561035, - "learning_rate": 1.602854915474723e-05, - "loss": 0.911, - "step": 7865 - }, - { - "epoch": 0.36485859990727865, - "grad_norm": 4.311334133148193, - "learning_rate": 1.6022091398934746e-05, - "loss": 0.9403, - "step": 7870 - }, - { - "epoch": 0.3650904033379694, - "grad_norm": 3.4167885780334473, - "learning_rate": 1.601562970034935e-05, - "loss": 0.9623, - "step": 7875 - }, - { - "epoch": 0.3653222067686602, - "grad_norm": 3.9069101810455322, - "learning_rate": 1.600916406322164e-05, - "loss": 0.8251, - "step": 7880 - }, - { - "epoch": 0.36555401019935096, - "grad_norm": 3.9749691486358643, - "learning_rate": 1.600269449178478e-05, - "loss": 0.8434, - "step": 7885 - }, - { - "epoch": 0.3657858136300417, - "grad_norm": 3.524258852005005, - "learning_rate": 1.599622099027452e-05, - "loss": 0.8075, - "step": 7890 - }, - { - "epoch": 0.3660176170607325, - "grad_norm": 3.6314921379089355, - "learning_rate": 1.5989743562929177e-05, - "loss": 0.9493, - "step": 7895 - }, - { - "epoch": 0.3662494204914233, - "grad_norm": 3.23356294631958, - "learning_rate": 1.598326221398964e-05, - "loss": 0.9669, - "step": 7900 - }, - { - "epoch": 0.3662494204914233, - "eval_loss": 0.9721471071243286, - "eval_runtime": 11.2707, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 7900 - }, - { - "epoch": 0.3664812239221141, - "grad_norm": 4.47834587097168, - "learning_rate": 1.5976776947699368e-05, - "loss": 1.0061, - "step": 7905 - }, - { - "epoch": 0.3667130273528048, - "grad_norm": 4.536191463470459, - "learning_rate": 1.5970287768304385e-05, - "loss": 1.0296, - "step": 7910 - }, - { - "epoch": 0.3669448307834956, - "grad_norm": 3.6170647144317627, - "learning_rate": 1.596379468005327e-05, - "loss": 0.8788, - "step": 7915 - }, - { - "epoch": 0.3671766342141864, - "grad_norm": 3.2994272708892822, - "learning_rate": 1.5957297687197172e-05, - "loss": 0.8066, - "step": 7920 - }, - { - "epoch": 0.36740843764487713, - "grad_norm": 3.139523983001709, - "learning_rate": 1.5950796793989785e-05, - "loss": 0.882, - "step": 7925 - }, - { - "epoch": 0.3676402410755679, - "grad_norm": 3.833423376083374, - "learning_rate": 1.5944292004687362e-05, - "loss": 0.9673, - "step": 7930 - }, - { - "epoch": 0.3678720445062587, - "grad_norm": 4.070364952087402, - "learning_rate": 1.5937783323548715e-05, - "loss": 0.9485, - "step": 7935 - }, - { - "epoch": 0.36810384793694945, - "grad_norm": 4.206231117248535, - "learning_rate": 1.593127075483519e-05, - "loss": 1.0304, - "step": 7940 - }, - { - "epoch": 0.36833565136764024, - "grad_norm": 4.32648229598999, - "learning_rate": 1.5924754302810682e-05, - "loss": 1.0254, - "step": 7945 - }, - { - "epoch": 0.36856745479833103, - "grad_norm": 3.5725724697113037, - "learning_rate": 1.591823397174163e-05, - "loss": 0.9672, - "step": 7950 - }, - { - "epoch": 0.36879925822902176, - "grad_norm": 4.0060954093933105, - "learning_rate": 1.5911709765897027e-05, - "loss": 0.9852, - "step": 7955 - }, - { - "epoch": 0.36903106165971256, - "grad_norm": 4.210195541381836, - "learning_rate": 1.5905181689548373e-05, - "loss": 0.9808, - "step": 7960 - }, - { - "epoch": 0.36926286509040335, - "grad_norm": 4.058366775512695, - "learning_rate": 1.5898649746969728e-05, - "loss": 0.8752, - "step": 7965 - }, - { - "epoch": 0.36949466852109414, - "grad_norm": 3.9418766498565674, - "learning_rate": 1.589211394243767e-05, - "loss": 1.0223, - "step": 7970 - }, - { - "epoch": 0.3697264719517849, - "grad_norm": 3.892645835876465, - "learning_rate": 1.5885574280231308e-05, - "loss": 0.8914, - "step": 7975 - }, - { - "epoch": 0.36995827538247567, - "grad_norm": 3.822187900543213, - "learning_rate": 1.587903076463228e-05, - "loss": 1.0347, - "step": 7980 - }, - { - "epoch": 0.37019007881316646, - "grad_norm": 4.411693096160889, - "learning_rate": 1.587248339992475e-05, - "loss": 1.0486, - "step": 7985 - }, - { - "epoch": 0.3704218822438572, - "grad_norm": 3.2753002643585205, - "learning_rate": 1.5865932190395386e-05, - "loss": 0.801, - "step": 7990 - }, - { - "epoch": 0.370653685674548, - "grad_norm": 3.6129369735717773, - "learning_rate": 1.5859377140333393e-05, - "loss": 0.8998, - "step": 7995 - }, - { - "epoch": 0.3708854891052388, - "grad_norm": 3.9310081005096436, - "learning_rate": 1.5852818254030474e-05, - "loss": 0.922, - "step": 8000 - }, - { - "epoch": 0.3708854891052388, - "eval_loss": 0.9707987904548645, - "eval_runtime": 11.2704, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 8000 - }, - { - "epoch": 0.3711172925359295, - "grad_norm": 4.156801700592041, - "learning_rate": 1.5846255535780863e-05, - "loss": 1.0529, - "step": 8005 - }, - { - "epoch": 0.3713490959666203, - "grad_norm": 5.164330005645752, - "learning_rate": 1.583968898988128e-05, - "loss": 1.0731, - "step": 8010 - }, - { - "epoch": 0.3715808993973111, - "grad_norm": 3.661679983139038, - "learning_rate": 1.5833118620630966e-05, - "loss": 0.9838, - "step": 8015 - }, - { - "epoch": 0.37181270282800183, - "grad_norm": 4.266704082489014, - "learning_rate": 1.5826544432331666e-05, - "loss": 0.8833, - "step": 8020 - }, - { - "epoch": 0.3720445062586926, - "grad_norm": 5.135181903839111, - "learning_rate": 1.5819966429287612e-05, - "loss": 1.0761, - "step": 8025 - }, - { - "epoch": 0.3722763096893834, - "grad_norm": 3.6553714275360107, - "learning_rate": 1.581338461580555e-05, - "loss": 0.7655, - "step": 8030 - }, - { - "epoch": 0.3725081131200742, - "grad_norm": 4.098663806915283, - "learning_rate": 1.5806798996194704e-05, - "loss": 1.0577, - "step": 8035 - }, - { - "epoch": 0.37273991655076494, - "grad_norm": 3.716402769088745, - "learning_rate": 1.5800209574766813e-05, - "loss": 0.9381, - "step": 8040 - }, - { - "epoch": 0.37297171998145573, - "grad_norm": 4.095868110656738, - "learning_rate": 1.5793616355836076e-05, - "loss": 1.0957, - "step": 8045 - }, - { - "epoch": 0.3732035234121465, - "grad_norm": 2.871941328048706, - "learning_rate": 1.5787019343719206e-05, - "loss": 0.6745, - "step": 8050 - }, - { - "epoch": 0.37343532684283726, - "grad_norm": 4.028416633605957, - "learning_rate": 1.578041854273538e-05, - "loss": 1.1041, - "step": 8055 - }, - { - "epoch": 0.37366713027352805, - "grad_norm": 3.7373595237731934, - "learning_rate": 1.5773813957206265e-05, - "loss": 1.1335, - "step": 8060 - }, - { - "epoch": 0.37389893370421884, - "grad_norm": 3.2081689834594727, - "learning_rate": 1.5767205591456e-05, - "loss": 0.8999, - "step": 8065 - }, - { - "epoch": 0.3741307371349096, - "grad_norm": 3.564337730407715, - "learning_rate": 1.576059344981121e-05, - "loss": 0.816, - "step": 8070 - }, - { - "epoch": 0.37436254056560037, - "grad_norm": 4.3315653800964355, - "learning_rate": 1.575397753660098e-05, - "loss": 0.9207, - "step": 8075 - }, - { - "epoch": 0.37459434399629116, - "grad_norm": 4.162485599517822, - "learning_rate": 1.5747357856156864e-05, - "loss": 0.9209, - "step": 8080 - }, - { - "epoch": 0.3748261474269819, - "grad_norm": 3.711111545562744, - "learning_rate": 1.5740734412812894e-05, - "loss": 0.7482, - "step": 8085 - }, - { - "epoch": 0.3750579508576727, - "grad_norm": 3.6291377544403076, - "learning_rate": 1.5734107210905557e-05, - "loss": 0.8207, - "step": 8090 - }, - { - "epoch": 0.3752897542883635, - "grad_norm": 3.9952619075775146, - "learning_rate": 1.5727476254773805e-05, - "loss": 1.0326, - "step": 8095 - }, - { - "epoch": 0.37552155771905427, - "grad_norm": 3.7901878356933594, - "learning_rate": 1.572084154875904e-05, - "loss": 0.8866, - "step": 8100 - }, - { - "epoch": 0.37552155771905427, - "eval_loss": 0.9721558094024658, - "eval_runtime": 11.2673, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 8100 - }, - { - "epoch": 0.375753361149745, - "grad_norm": 3.700965642929077, - "learning_rate": 1.5714203097205126e-05, - "loss": 0.9772, - "step": 8105 - }, - { - "epoch": 0.3759851645804358, - "grad_norm": 3.4462482929229736, - "learning_rate": 1.5707560904458385e-05, - "loss": 0.8462, - "step": 8110 - }, - { - "epoch": 0.3762169680111266, - "grad_norm": 4.038230895996094, - "learning_rate": 1.570091497486758e-05, - "loss": 0.9706, - "step": 8115 - }, - { - "epoch": 0.3764487714418173, - "grad_norm": 3.681793689727783, - "learning_rate": 1.5694265312783915e-05, - "loss": 0.9543, - "step": 8120 - }, - { - "epoch": 0.3766805748725081, - "grad_norm": 3.430973529815674, - "learning_rate": 1.5687611922561048e-05, - "loss": 0.9888, - "step": 8125 - }, - { - "epoch": 0.3769123783031989, - "grad_norm": 4.354652404785156, - "learning_rate": 1.568095480855508e-05, - "loss": 0.9147, - "step": 8130 - }, - { - "epoch": 0.37714418173388964, - "grad_norm": 4.103789806365967, - "learning_rate": 1.5674293975124537e-05, - "loss": 1.02, - "step": 8135 - }, - { - "epoch": 0.37737598516458043, - "grad_norm": 3.2359514236450195, - "learning_rate": 1.5667629426630392e-05, - "loss": 0.8205, - "step": 8140 - }, - { - "epoch": 0.3776077885952712, - "grad_norm": 3.833833932876587, - "learning_rate": 1.5660961167436043e-05, - "loss": 0.8056, - "step": 8145 - }, - { - "epoch": 0.37783959202596196, - "grad_norm": 3.0617690086364746, - "learning_rate": 1.565428920190733e-05, - "loss": 0.9386, - "step": 8150 - }, - { - "epoch": 0.37807139545665275, - "grad_norm": 3.7850069999694824, - "learning_rate": 1.5647613534412495e-05, - "loss": 0.9045, - "step": 8155 - }, - { - "epoch": 0.37830319888734354, - "grad_norm": 4.245758056640625, - "learning_rate": 1.5640934169322236e-05, - "loss": 0.922, - "step": 8160 - }, - { - "epoch": 0.37853500231803433, - "grad_norm": 3.542757749557495, - "learning_rate": 1.5634251111009644e-05, - "loss": 1.0869, - "step": 8165 - }, - { - "epoch": 0.37876680574872507, - "grad_norm": 3.3576743602752686, - "learning_rate": 1.5627564363850244e-05, - "loss": 0.9502, - "step": 8170 - }, - { - "epoch": 0.37899860917941586, - "grad_norm": 3.7136809825897217, - "learning_rate": 1.5620873932221966e-05, - "loss": 0.8285, - "step": 8175 - }, - { - "epoch": 0.37923041261010665, - "grad_norm": 4.323848724365234, - "learning_rate": 1.5614179820505162e-05, - "loss": 1.0334, - "step": 8180 - }, - { - "epoch": 0.3794622160407974, - "grad_norm": 3.9784998893737793, - "learning_rate": 1.5607482033082586e-05, - "loss": 0.9102, - "step": 8185 - }, - { - "epoch": 0.3796940194714882, - "grad_norm": 3.1557223796844482, - "learning_rate": 1.56007805743394e-05, - "loss": 0.8081, - "step": 8190 - }, - { - "epoch": 0.37992582290217897, - "grad_norm": 4.566058158874512, - "learning_rate": 1.5594075448663183e-05, - "loss": 0.9558, - "step": 8195 - }, - { - "epoch": 0.3801576263328697, - "grad_norm": 3.514949321746826, - "learning_rate": 1.5587366660443887e-05, - "loss": 1.0877, - "step": 8200 - }, - { - "epoch": 0.3801576263328697, - "eval_loss": 0.9680636525154114, - "eval_runtime": 11.2709, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 8200 - }, - { - "epoch": 0.3803894297635605, - "grad_norm": 3.738237142562866, - "learning_rate": 1.5580654214073885e-05, - "loss": 0.857, - "step": 8205 - }, - { - "epoch": 0.3806212331942513, - "grad_norm": 4.238804817199707, - "learning_rate": 1.5573938113947938e-05, - "loss": 0.8178, - "step": 8210 - }, - { - "epoch": 0.3808530366249421, - "grad_norm": 4.3957438468933105, - "learning_rate": 1.5567218364463198e-05, - "loss": 0.9658, - "step": 8215 - }, - { - "epoch": 0.3810848400556328, - "grad_norm": 3.63651967048645, - "learning_rate": 1.556049497001921e-05, - "loss": 0.9853, - "step": 8220 - }, - { - "epoch": 0.3813166434863236, - "grad_norm": 3.9356980323791504, - "learning_rate": 1.5553767935017895e-05, - "loss": 0.7605, - "step": 8225 - }, - { - "epoch": 0.3815484469170144, - "grad_norm": 4.169957637786865, - "learning_rate": 1.5547037263863577e-05, - "loss": 1.0644, - "step": 8230 - }, - { - "epoch": 0.38178025034770513, - "grad_norm": 3.485525131225586, - "learning_rate": 1.554030296096294e-05, - "loss": 0.9829, - "step": 8235 - }, - { - "epoch": 0.3820120537783959, - "grad_norm": 4.001947402954102, - "learning_rate": 1.553356503072506e-05, - "loss": 0.8294, - "step": 8240 - }, - { - "epoch": 0.3822438572090867, - "grad_norm": 3.6511752605438232, - "learning_rate": 1.552682347756138e-05, - "loss": 0.8348, - "step": 8245 - }, - { - "epoch": 0.38247566063977745, - "grad_norm": 3.49690580368042, - "learning_rate": 1.5520078305885712e-05, - "loss": 0.929, - "step": 8250 - }, - { - "epoch": 0.38270746407046824, - "grad_norm": 4.088393211364746, - "learning_rate": 1.5513329520114254e-05, - "loss": 0.9723, - "step": 8255 - }, - { - "epoch": 0.38293926750115903, - "grad_norm": 3.4010307788848877, - "learning_rate": 1.5506577124665552e-05, - "loss": 0.8488, - "step": 8260 - }, - { - "epoch": 0.38317107093184977, - "grad_norm": 4.285269260406494, - "learning_rate": 1.5499821123960525e-05, - "loss": 0.9262, - "step": 8265 - }, - { - "epoch": 0.38340287436254056, - "grad_norm": 4.1062517166137695, - "learning_rate": 1.5493061522422447e-05, - "loss": 0.8384, - "step": 8270 - }, - { - "epoch": 0.38363467779323135, - "grad_norm": 3.7376549243927, - "learning_rate": 1.548629832447696e-05, - "loss": 1.0614, - "step": 8275 - }, - { - "epoch": 0.38386648122392214, - "grad_norm": 4.19257926940918, - "learning_rate": 1.547953153455204e-05, - "loss": 1.16, - "step": 8280 - }, - { - "epoch": 0.3840982846546129, - "grad_norm": 4.338336944580078, - "learning_rate": 1.547276115707803e-05, - "loss": 1.0549, - "step": 8285 - }, - { - "epoch": 0.38433008808530367, - "grad_norm": 3.7592556476593018, - "learning_rate": 1.5465987196487634e-05, - "loss": 0.7895, - "step": 8290 - }, - { - "epoch": 0.38456189151599446, - "grad_norm": 3.735196352005005, - "learning_rate": 1.545920965721587e-05, - "loss": 0.9698, - "step": 8295 - }, - { - "epoch": 0.3847936949466852, - "grad_norm": 3.527911424636841, - "learning_rate": 1.5452428543700126e-05, - "loss": 0.9027, - "step": 8300 - }, - { - "epoch": 0.3847936949466852, - "eval_loss": 0.9673840403556824, - "eval_runtime": 11.2732, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 8300 - }, - { - "epoch": 0.385025498377376, - "grad_norm": 4.203237533569336, - "learning_rate": 1.544564386038012e-05, - "loss": 1.004, - "step": 8305 - }, - { - "epoch": 0.3852573018080668, - "grad_norm": 4.254677772521973, - "learning_rate": 1.5438855611697903e-05, - "loss": 0.8579, - "step": 8310 - }, - { - "epoch": 0.3854891052387575, - "grad_norm": 3.9172377586364746, - "learning_rate": 1.5432063802097866e-05, - "loss": 0.8802, - "step": 8315 - }, - { - "epoch": 0.3857209086694483, - "grad_norm": 4.3555498123168945, - "learning_rate": 1.5425268436026736e-05, - "loss": 0.8698, - "step": 8320 - }, - { - "epoch": 0.3859527121001391, - "grad_norm": 4.431722640991211, - "learning_rate": 1.541846951793356e-05, - "loss": 1.0022, - "step": 8325 - }, - { - "epoch": 0.38618451553082983, - "grad_norm": 4.228847026824951, - "learning_rate": 1.541166705226971e-05, - "loss": 1.0159, - "step": 8330 - }, - { - "epoch": 0.3864163189615206, - "grad_norm": 3.549771547317505, - "learning_rate": 1.5404861043488883e-05, - "loss": 0.8531, - "step": 8335 - }, - { - "epoch": 0.3866481223922114, - "grad_norm": 4.803309917449951, - "learning_rate": 1.5398051496047105e-05, - "loss": 1.0774, - "step": 8340 - }, - { - "epoch": 0.3868799258229022, - "grad_norm": 3.5839805603027344, - "learning_rate": 1.5391238414402704e-05, - "loss": 1.0094, - "step": 8345 - }, - { - "epoch": 0.38711172925359294, - "grad_norm": 3.267014980316162, - "learning_rate": 1.5384421803016337e-05, - "loss": 0.9279, - "step": 8350 - }, - { - "epoch": 0.38734353268428373, - "grad_norm": 4.285528659820557, - "learning_rate": 1.5377601666350953e-05, - "loss": 0.9292, - "step": 8355 - }, - { - "epoch": 0.3875753361149745, - "grad_norm": 4.261288166046143, - "learning_rate": 1.537077800887182e-05, - "loss": 0.9634, - "step": 8360 - }, - { - "epoch": 0.38780713954566526, - "grad_norm": 4.2175517082214355, - "learning_rate": 1.5363950835046524e-05, - "loss": 0.94, - "step": 8365 - }, - { - "epoch": 0.38803894297635605, - "grad_norm": 4.262879371643066, - "learning_rate": 1.5357120149344923e-05, - "loss": 1.0298, - "step": 8370 - }, - { - "epoch": 0.38827074640704684, - "grad_norm": 3.9581894874572754, - "learning_rate": 1.5350285956239204e-05, - "loss": 0.8537, - "step": 8375 - }, - { - "epoch": 0.3885025498377376, - "grad_norm": 3.6455395221710205, - "learning_rate": 1.5343448260203833e-05, - "loss": 0.8997, - "step": 8380 - }, - { - "epoch": 0.38873435326842837, - "grad_norm": 3.51712703704834, - "learning_rate": 1.533660706571557e-05, - "loss": 0.9503, - "step": 8385 - }, - { - "epoch": 0.38896615669911916, - "grad_norm": 3.7901172637939453, - "learning_rate": 1.5329762377253482e-05, - "loss": 0.887, - "step": 8390 - }, - { - "epoch": 0.3891979601298099, - "grad_norm": 3.1369612216949463, - "learning_rate": 1.5322914199298897e-05, - "loss": 0.8935, - "step": 8395 - }, - { - "epoch": 0.3894297635605007, - "grad_norm": 3.3814847469329834, - "learning_rate": 1.5316062536335448e-05, - "loss": 0.8281, - "step": 8400 - }, - { - "epoch": 0.3894297635605007, - "eval_loss": 0.9663224816322327, - "eval_runtime": 11.2673, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 8400 - }, - { - "epoch": 0.3896615669911915, - "grad_norm": 3.347869396209717, - "learning_rate": 1.5309207392849044e-05, - "loss": 1.0086, - "step": 8405 - }, - { - "epoch": 0.38989337042188227, - "grad_norm": 4.276888847351074, - "learning_rate": 1.5302348773327876e-05, - "loss": 1.1375, - "step": 8410 - }, - { - "epoch": 0.390125173852573, - "grad_norm": 4.132003307342529, - "learning_rate": 1.5295486682262394e-05, - "loss": 1.0415, - "step": 8415 - }, - { - "epoch": 0.3903569772832638, - "grad_norm": 3.471198320388794, - "learning_rate": 1.5288621124145352e-05, - "loss": 0.8592, - "step": 8420 - }, - { - "epoch": 0.3905887807139546, - "grad_norm": 4.1366448402404785, - "learning_rate": 1.528175210347174e-05, - "loss": 1.0408, - "step": 8425 - }, - { - "epoch": 0.3908205841446453, - "grad_norm": 3.8777146339416504, - "learning_rate": 1.527487962473884e-05, - "loss": 0.9605, - "step": 8430 - }, - { - "epoch": 0.3910523875753361, - "grad_norm": 4.086522579193115, - "learning_rate": 1.526800369244619e-05, - "loss": 1.0024, - "step": 8435 - }, - { - "epoch": 0.3912841910060269, - "grad_norm": 3.3036468029022217, - "learning_rate": 1.526112431109558e-05, - "loss": 0.9449, - "step": 8440 - }, - { - "epoch": 0.39151599443671764, - "grad_norm": 3.620948314666748, - "learning_rate": 1.525424148519107e-05, - "loss": 0.8556, - "step": 8445 - }, - { - "epoch": 0.39174779786740843, - "grad_norm": 4.023210048675537, - "learning_rate": 1.5247355219238977e-05, - "loss": 1.0359, - "step": 8450 - }, - { - "epoch": 0.3919796012980992, - "grad_norm": 3.1166470050811768, - "learning_rate": 1.5240465517747857e-05, - "loss": 0.8803, - "step": 8455 - }, - { - "epoch": 0.39221140472878996, - "grad_norm": 3.456226110458374, - "learning_rate": 1.5233572385228527e-05, - "loss": 0.9047, - "step": 8460 - }, - { - "epoch": 0.39244320815948075, - "grad_norm": 6.233022689819336, - "learning_rate": 1.5226675826194046e-05, - "loss": 0.9472, - "step": 8465 - }, - { - "epoch": 0.39267501159017154, - "grad_norm": 4.138795375823975, - "learning_rate": 1.5219775845159716e-05, - "loss": 1.0286, - "step": 8470 - }, - { - "epoch": 0.39290681502086233, - "grad_norm": 3.463423728942871, - "learning_rate": 1.5212872446643082e-05, - "loss": 1.0471, - "step": 8475 - }, - { - "epoch": 0.39313861845155307, - "grad_norm": 3.417653799057007, - "learning_rate": 1.5205965635163921e-05, - "loss": 0.7668, - "step": 8480 - }, - { - "epoch": 0.39337042188224386, - "grad_norm": 3.8116729259490967, - "learning_rate": 1.5199055415244253e-05, - "loss": 0.9211, - "step": 8485 - }, - { - "epoch": 0.39360222531293465, - "grad_norm": 3.489842176437378, - "learning_rate": 1.519214179140832e-05, - "loss": 0.8637, - "step": 8490 - }, - { - "epoch": 0.3938340287436254, - "grad_norm": 4.423672676086426, - "learning_rate": 1.51852247681826e-05, - "loss": 1.0781, - "step": 8495 - }, - { - "epoch": 0.3940658321743162, - "grad_norm": 3.542827606201172, - "learning_rate": 1.5178304350095796e-05, - "loss": 0.9339, - "step": 8500 - }, - { - "epoch": 0.3940658321743162, - "eval_loss": 0.9644456505775452, - "eval_runtime": 11.2654, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 8500 - }, - { - "epoch": 0.39429763560500697, - "grad_norm": 3.8117423057556152, - "learning_rate": 1.5171380541678829e-05, - "loss": 0.8443, - "step": 8505 - }, - { - "epoch": 0.3945294390356977, - "grad_norm": 3.482234001159668, - "learning_rate": 1.516445334746484e-05, - "loss": 0.8924, - "step": 8510 - }, - { - "epoch": 0.3947612424663885, - "grad_norm": 3.5414836406707764, - "learning_rate": 1.5157522771989195e-05, - "loss": 1.0507, - "step": 8515 - }, - { - "epoch": 0.3949930458970793, - "grad_norm": 2.974254846572876, - "learning_rate": 1.5150588819789467e-05, - "loss": 0.8076, - "step": 8520 - }, - { - "epoch": 0.3952248493277701, - "grad_norm": 3.456716775894165, - "learning_rate": 1.5143651495405439e-05, - "loss": 0.9779, - "step": 8525 - }, - { - "epoch": 0.3954566527584608, - "grad_norm": 3.6141698360443115, - "learning_rate": 1.5136710803379103e-05, - "loss": 0.9303, - "step": 8530 - }, - { - "epoch": 0.3956884561891516, - "grad_norm": 3.409841537475586, - "learning_rate": 1.5129766748254656e-05, - "loss": 0.8588, - "step": 8535 - }, - { - "epoch": 0.3959202596198424, - "grad_norm": 4.912245750427246, - "learning_rate": 1.5122819334578498e-05, - "loss": 0.8972, - "step": 8540 - }, - { - "epoch": 0.39615206305053313, - "grad_norm": 3.854921340942383, - "learning_rate": 1.511586856689923e-05, - "loss": 1.1203, - "step": 8545 - }, - { - "epoch": 0.3963838664812239, - "grad_norm": 4.43947696685791, - "learning_rate": 1.510891444976764e-05, - "loss": 0.9963, - "step": 8550 - }, - { - "epoch": 0.3966156699119147, - "grad_norm": 3.782301902770996, - "learning_rate": 1.5101956987736724e-05, - "loss": 1.0837, - "step": 8555 - }, - { - "epoch": 0.39684747334260545, - "grad_norm": 3.896822214126587, - "learning_rate": 1.509499618536165e-05, - "loss": 0.8543, - "step": 8560 - }, - { - "epoch": 0.39707927677329624, - "grad_norm": 3.10201096534729, - "learning_rate": 1.5088032047199787e-05, - "loss": 0.8756, - "step": 8565 - }, - { - "epoch": 0.39731108020398703, - "grad_norm": 4.653040885925293, - "learning_rate": 1.508106457781069e-05, - "loss": 0.9539, - "step": 8570 - }, - { - "epoch": 0.39754288363467777, - "grad_norm": 3.215571880340576, - "learning_rate": 1.507409378175607e-05, - "loss": 0.9654, - "step": 8575 - }, - { - "epoch": 0.39777468706536856, - "grad_norm": 3.752126932144165, - "learning_rate": 1.506711966359985e-05, - "loss": 0.9448, - "step": 8580 - }, - { - "epoch": 0.39800649049605935, - "grad_norm": 3.5548033714294434, - "learning_rate": 1.5060142227908106e-05, - "loss": 0.7547, - "step": 8585 - }, - { - "epoch": 0.39823829392675014, - "grad_norm": 2.9906864166259766, - "learning_rate": 1.5053161479249092e-05, - "loss": 0.8918, - "step": 8590 - }, - { - "epoch": 0.3984700973574409, - "grad_norm": 3.0783543586730957, - "learning_rate": 1.5046177422193234e-05, - "loss": 0.8722, - "step": 8595 - }, - { - "epoch": 0.39870190078813167, - "grad_norm": 3.2667911052703857, - "learning_rate": 1.5039190061313118e-05, - "loss": 0.9629, - "step": 8600 - }, - { - "epoch": 0.39870190078813167, - "eval_loss": 0.9621018767356873, - "eval_runtime": 11.278, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, - "step": 8600 - }, - { - "epoch": 0.39893370421882246, - "grad_norm": 3.2560768127441406, - "learning_rate": 1.50321994011835e-05, - "loss": 0.8151, - "step": 8605 - }, - { - "epoch": 0.3991655076495132, - "grad_norm": 3.382565498352051, - "learning_rate": 1.5025205446381286e-05, - "loss": 1.0409, - "step": 8610 - }, - { - "epoch": 0.399397311080204, - "grad_norm": 3.8526535034179688, - "learning_rate": 1.5018208201485555e-05, - "loss": 0.8708, - "step": 8615 - }, - { - "epoch": 0.3996291145108948, - "grad_norm": 3.978084087371826, - "learning_rate": 1.5011207671077523e-05, - "loss": 0.9014, - "step": 8620 - }, - { - "epoch": 0.3998609179415855, - "grad_norm": 3.4769928455352783, - "learning_rate": 1.5004203859740569e-05, - "loss": 0.9161, - "step": 8625 - }, - { - "epoch": 0.4000927213722763, - "grad_norm": 3.813260078430176, - "learning_rate": 1.4997196772060215e-05, - "loss": 0.9638, - "step": 8630 - }, - { - "epoch": 0.4003245248029671, - "grad_norm": 4.105741500854492, - "learning_rate": 1.4990186412624126e-05, - "loss": 1.0067, - "step": 8635 - }, - { - "epoch": 0.40055632823365783, - "grad_norm": 3.5164148807525635, - "learning_rate": 1.4983172786022115e-05, - "loss": 0.9244, - "step": 8640 - }, - { - "epoch": 0.4007881316643486, - "grad_norm": 3.283759593963623, - "learning_rate": 1.4976155896846129e-05, - "loss": 0.8354, - "step": 8645 - }, - { - "epoch": 0.4010199350950394, - "grad_norm": 4.097290992736816, - "learning_rate": 1.4969135749690259e-05, - "loss": 0.8365, - "step": 8650 - }, - { - "epoch": 0.4012517385257302, - "grad_norm": 4.539755344390869, - "learning_rate": 1.4962112349150714e-05, - "loss": 1.0438, - "step": 8655 - }, - { - "epoch": 0.40148354195642094, - "grad_norm": 3.291273355484009, - "learning_rate": 1.4955085699825849e-05, - "loss": 0.8505, - "step": 8660 - }, - { - "epoch": 0.40171534538711173, - "grad_norm": 3.5600454807281494, - "learning_rate": 1.4948055806316138e-05, - "loss": 0.9365, - "step": 8665 - }, - { - "epoch": 0.4019471488178025, - "grad_norm": 3.5624420642852783, - "learning_rate": 1.4941022673224178e-05, - "loss": 0.9191, - "step": 8670 - }, - { - "epoch": 0.40217895224849326, - "grad_norm": 3.605802297592163, - "learning_rate": 1.4933986305154691e-05, - "loss": 1.0921, - "step": 8675 - }, - { - "epoch": 0.40241075567918405, - "grad_norm": 3.824154853820801, - "learning_rate": 1.4926946706714516e-05, - "loss": 0.9617, - "step": 8680 - }, - { - "epoch": 0.40264255910987484, - "grad_norm": 3.4327552318573, - "learning_rate": 1.4919903882512605e-05, - "loss": 0.8381, - "step": 8685 - }, - { - "epoch": 0.4028743625405656, - "grad_norm": 3.498324394226074, - "learning_rate": 1.4912857837160023e-05, - "loss": 0.8875, - "step": 8690 - }, - { - "epoch": 0.40310616597125637, - "grad_norm": 3.320936918258667, - "learning_rate": 1.4905808575269942e-05, - "loss": 0.7844, - "step": 8695 - }, - { - "epoch": 0.40333796940194716, - "grad_norm": 4.480831146240234, - "learning_rate": 1.4898756101457652e-05, - "loss": 0.902, - "step": 8700 - }, - { - "epoch": 0.40333796940194716, - "eval_loss": 0.9607927799224854, - "eval_runtime": 11.2715, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 8700 - }, - { - "epoch": 0.4035697728326379, - "grad_norm": 3.961334466934204, - "learning_rate": 1.4891700420340522e-05, - "loss": 1.0543, - "step": 8705 - }, - { - "epoch": 0.4038015762633287, - "grad_norm": 9.081817626953125, - "learning_rate": 1.4884641536538045e-05, - "loss": 1.1713, - "step": 8710 - }, - { - "epoch": 0.4040333796940195, - "grad_norm": 4.37900447845459, - "learning_rate": 1.48775794546718e-05, - "loss": 0.9349, - "step": 8715 - }, - { - "epoch": 0.40426518312471027, - "grad_norm": 4.0761399269104, - "learning_rate": 1.4870514179365453e-05, - "loss": 0.9451, - "step": 8720 - }, - { - "epoch": 0.404496986555401, - "grad_norm": 3.5387260913848877, - "learning_rate": 1.486344571524478e-05, - "loss": 0.8872, - "step": 8725 - }, - { - "epoch": 0.4047287899860918, - "grad_norm": 3.9236161708831787, - "learning_rate": 1.4856374066937622e-05, - "loss": 0.9091, - "step": 8730 - }, - { - "epoch": 0.4049605934167826, - "grad_norm": 3.637903928756714, - "learning_rate": 1.4849299239073924e-05, - "loss": 0.932, - "step": 8735 - }, - { - "epoch": 0.4051923968474733, - "grad_norm": 3.5043904781341553, - "learning_rate": 1.4842221236285701e-05, - "loss": 0.9343, - "step": 8740 - }, - { - "epoch": 0.4054242002781641, - "grad_norm": 4.224621295928955, - "learning_rate": 1.483514006320705e-05, - "loss": 0.8623, - "step": 8745 - }, - { - "epoch": 0.4056560037088549, - "grad_norm": 3.410630226135254, - "learning_rate": 1.4828055724474144e-05, - "loss": 0.8286, - "step": 8750 - }, - { - "epoch": 0.40588780713954564, - "grad_norm": 4.416448593139648, - "learning_rate": 1.4820968224725229e-05, - "loss": 0.93, - "step": 8755 - }, - { - "epoch": 0.40611961057023643, - "grad_norm": 3.3819997310638428, - "learning_rate": 1.4813877568600625e-05, - "loss": 0.9855, - "step": 8760 - }, - { - "epoch": 0.4063514140009272, - "grad_norm": 3.6573407649993896, - "learning_rate": 1.4806783760742703e-05, - "loss": 0.9538, - "step": 8765 - }, - { - "epoch": 0.40658321743161796, - "grad_norm": 3.8681700229644775, - "learning_rate": 1.4799686805795919e-05, - "loss": 1.0216, - "step": 8770 - }, - { - "epoch": 0.40681502086230875, - "grad_norm": 3.9974756240844727, - "learning_rate": 1.4792586708406776e-05, - "loss": 0.9717, - "step": 8775 - }, - { - "epoch": 0.40704682429299954, - "grad_norm": 4.307816028594971, - "learning_rate": 1.4785483473223829e-05, - "loss": 0.9331, - "step": 8780 - }, - { - "epoch": 0.40727862772369033, - "grad_norm": 4.006975173950195, - "learning_rate": 1.4778377104897703e-05, - "loss": 1.0737, - "step": 8785 - }, - { - "epoch": 0.40751043115438107, - "grad_norm": 3.7386765480041504, - "learning_rate": 1.4771267608081068e-05, - "loss": 0.98, - "step": 8790 - }, - { - "epoch": 0.40774223458507186, - "grad_norm": 3.2965736389160156, - "learning_rate": 1.4764154987428639e-05, - "loss": 0.9451, - "step": 8795 - }, - { - "epoch": 0.40797403801576265, - "grad_norm": 3.6365609169006348, - "learning_rate": 1.4757039247597176e-05, - "loss": 0.8254, - "step": 8800 - }, - { - "epoch": 0.40797403801576265, - "eval_loss": 0.9614850878715515, - "eval_runtime": 11.2707, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 8800 - }, - { - "epoch": 0.4082058414464534, - "grad_norm": 3.285236358642578, - "learning_rate": 1.4749920393245485e-05, - "loss": 1.0057, - "step": 8805 - }, - { - "epoch": 0.4084376448771442, - "grad_norm": 3.88112473487854, - "learning_rate": 1.4742798429034409e-05, - "loss": 0.8818, - "step": 8810 - }, - { - "epoch": 0.40866944830783497, - "grad_norm": 3.2408924102783203, - "learning_rate": 1.4735673359626828e-05, - "loss": 0.8564, - "step": 8815 - }, - { - "epoch": 0.4089012517385257, - "grad_norm": 5.373213291168213, - "learning_rate": 1.4728545189687658e-05, - "loss": 1.0474, - "step": 8820 - }, - { - "epoch": 0.4091330551692165, - "grad_norm": 3.371821880340576, - "learning_rate": 1.4721413923883838e-05, - "loss": 0.8351, - "step": 8825 - }, - { - "epoch": 0.4093648585999073, - "grad_norm": 3.3529322147369385, - "learning_rate": 1.4714279566884335e-05, - "loss": 0.869, - "step": 8830 - }, - { - "epoch": 0.4095966620305981, - "grad_norm": 4.656132698059082, - "learning_rate": 1.4707142123360149e-05, - "loss": 1.1344, - "step": 8835 - }, - { - "epoch": 0.4098284654612888, - "grad_norm": 4.094824314117432, - "learning_rate": 1.4700001597984291e-05, - "loss": 0.9209, - "step": 8840 - }, - { - "epoch": 0.4100602688919796, - "grad_norm": 3.9256341457366943, - "learning_rate": 1.469285799543179e-05, - "loss": 0.8983, - "step": 8845 - }, - { - "epoch": 0.4102920723226704, - "grad_norm": 3.937293529510498, - "learning_rate": 1.4685711320379698e-05, - "loss": 0.9011, - "step": 8850 - }, - { - "epoch": 0.41052387575336113, - "grad_norm": 3.690925359725952, - "learning_rate": 1.4678561577507074e-05, - "loss": 0.9901, - "step": 8855 - }, - { - "epoch": 0.4107556791840519, - "grad_norm": 4.106996536254883, - "learning_rate": 1.467140877149498e-05, - "loss": 0.9249, - "step": 8860 - }, - { - "epoch": 0.4109874826147427, - "grad_norm": 3.8499739170074463, - "learning_rate": 1.4664252907026488e-05, - "loss": 1.0555, - "step": 8865 - }, - { - "epoch": 0.41121928604543345, - "grad_norm": 4.457788944244385, - "learning_rate": 1.4657093988786677e-05, - "loss": 0.9393, - "step": 8870 - }, - { - "epoch": 0.41145108947612424, - "grad_norm": 3.177001476287842, - "learning_rate": 1.4649932021462616e-05, - "loss": 0.8521, - "step": 8875 - }, - { - "epoch": 0.41168289290681503, - "grad_norm": 4.380415916442871, - "learning_rate": 1.4642767009743384e-05, - "loss": 0.9663, - "step": 8880 - }, - { - "epoch": 0.41191469633750577, - "grad_norm": 4.102474212646484, - "learning_rate": 1.463559895832004e-05, - "loss": 0.9828, - "step": 8885 - }, - { - "epoch": 0.41214649976819656, - "grad_norm": 7.0587663650512695, - "learning_rate": 1.4628427871885632e-05, - "loss": 0.9628, - "step": 8890 - }, - { - "epoch": 0.41237830319888735, - "grad_norm": 3.6423287391662598, - "learning_rate": 1.4621253755135207e-05, - "loss": 0.7434, - "step": 8895 - }, - { - "epoch": 0.41261010662957814, - "grad_norm": 4.276022434234619, - "learning_rate": 1.4614076612765788e-05, - "loss": 0.8113, - "step": 8900 - }, - { - "epoch": 0.41261010662957814, - "eval_loss": 0.9591353535652161, - "eval_runtime": 11.2682, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 8900 - }, - { - "epoch": 0.4128419100602689, - "grad_norm": 3.499518871307373, - "learning_rate": 1.4606896449476387e-05, - "loss": 0.8564, - "step": 8905 - }, - { - "epoch": 0.41307371349095967, - "grad_norm": 3.5929625034332275, - "learning_rate": 1.4599713269967976e-05, - "loss": 1.0022, - "step": 8910 - }, - { - "epoch": 0.41330551692165046, - "grad_norm": 4.23135232925415, - "learning_rate": 1.4592527078943523e-05, - "loss": 1.1004, - "step": 8915 - }, - { - "epoch": 0.4135373203523412, - "grad_norm": 3.6771299839019775, - "learning_rate": 1.4585337881107956e-05, - "loss": 1.0565, - "step": 8920 - }, - { - "epoch": 0.413769123783032, - "grad_norm": 4.86899471282959, - "learning_rate": 1.457814568116817e-05, - "loss": 0.9799, - "step": 8925 - }, - { - "epoch": 0.4140009272137228, - "grad_norm": 3.4616496562957764, - "learning_rate": 1.4570950483833036e-05, - "loss": 0.9112, - "step": 8930 - }, - { - "epoch": 0.4142327306444135, - "grad_norm": 3.190037250518799, - "learning_rate": 1.4563752293813371e-05, - "loss": 0.9892, - "step": 8935 - }, - { - "epoch": 0.4144645340751043, - "grad_norm": 4.035621166229248, - "learning_rate": 1.455655111582197e-05, - "loss": 0.8924, - "step": 8940 - }, - { - "epoch": 0.4146963375057951, - "grad_norm": 3.7359254360198975, - "learning_rate": 1.4549346954573574e-05, - "loss": 0.9515, - "step": 8945 - }, - { - "epoch": 0.41492814093648583, - "grad_norm": 4.1890997886657715, - "learning_rate": 1.4542139814784874e-05, - "loss": 0.913, - "step": 8950 - }, - { - "epoch": 0.4151599443671766, - "grad_norm": 4.180643558502197, - "learning_rate": 1.453492970117452e-05, - "loss": 0.8282, - "step": 8955 - }, - { - "epoch": 0.4153917477978674, - "grad_norm": 3.5779945850372314, - "learning_rate": 1.45277166184631e-05, - "loss": 0.9543, - "step": 8960 - }, - { - "epoch": 0.4156235512285582, - "grad_norm": 3.7106773853302, - "learning_rate": 1.4520500571373157e-05, - "loss": 1.0066, - "step": 8965 - }, - { - "epoch": 0.41585535465924894, - "grad_norm": 3.7494590282440186, - "learning_rate": 1.4513281564629163e-05, - "loss": 0.9777, - "step": 8970 - }, - { - "epoch": 0.41608715808993973, - "grad_norm": 3.517127513885498, - "learning_rate": 1.4506059602957537e-05, - "loss": 0.9611, - "step": 8975 - }, - { - "epoch": 0.4163189615206305, - "grad_norm": 4.559821605682373, - "learning_rate": 1.449883469108663e-05, - "loss": 0.9203, - "step": 8980 - }, - { - "epoch": 0.41655076495132126, - "grad_norm": 3.6281211376190186, - "learning_rate": 1.4491606833746714e-05, - "loss": 0.9143, - "step": 8985 - }, - { - "epoch": 0.41678256838201205, - "grad_norm": 4.122176170349121, - "learning_rate": 1.4484376035670008e-05, - "loss": 1.0841, - "step": 8990 - }, - { - "epoch": 0.41701437181270284, - "grad_norm": 4.669403553009033, - "learning_rate": 1.4477142301590647e-05, - "loss": 0.7889, - "step": 8995 - }, - { - "epoch": 0.4172461752433936, - "grad_norm": 3.9678564071655273, - "learning_rate": 1.4469905636244686e-05, - "loss": 0.8821, - "step": 9000 - }, - { - "epoch": 0.4172461752433936, - "eval_loss": 0.9594786167144775, - "eval_runtime": 11.271, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 9000 - }, - { - "epoch": 0.41747797867408437, - "grad_norm": 3.803729295730591, - "learning_rate": 1.44626660443701e-05, - "loss": 0.9246, - "step": 9005 - }, - { - "epoch": 0.41770978210477516, - "grad_norm": 4.379030227661133, - "learning_rate": 1.4455423530706782e-05, - "loss": 1.1268, - "step": 9010 - }, - { - "epoch": 0.4179415855354659, - "grad_norm": 3.691082715988159, - "learning_rate": 1.4448178099996539e-05, - "loss": 0.9361, - "step": 9015 - }, - { - "epoch": 0.4181733889661567, - "grad_norm": 3.965627908706665, - "learning_rate": 1.4440929756983086e-05, - "loss": 0.997, - "step": 9020 - }, - { - "epoch": 0.4184051923968475, - "grad_norm": 4.264866352081299, - "learning_rate": 1.443367850641205e-05, - "loss": 0.8876, - "step": 9025 - }, - { - "epoch": 0.41863699582753827, - "grad_norm": 4.311303615570068, - "learning_rate": 1.4426424353030947e-05, - "loss": 0.8455, - "step": 9030 - }, - { - "epoch": 0.418868799258229, - "grad_norm": 3.237496852874756, - "learning_rate": 1.441916730158921e-05, - "loss": 0.8728, - "step": 9035 - }, - { - "epoch": 0.4191006026889198, - "grad_norm": 2.8361425399780273, - "learning_rate": 1.4411907356838165e-05, - "loss": 0.7104, - "step": 9040 - }, - { - "epoch": 0.4193324061196106, - "grad_norm": 4.360476493835449, - "learning_rate": 1.440464452353102e-05, - "loss": 0.8732, - "step": 9045 - }, - { - "epoch": 0.4195642095503013, - "grad_norm": 3.61860728263855, - "learning_rate": 1.4397378806422895e-05, - "loss": 0.8852, - "step": 9050 - }, - { - "epoch": 0.4197960129809921, - "grad_norm": 4.051429271697998, - "learning_rate": 1.439011021027078e-05, - "loss": 1.1358, - "step": 9055 - }, - { - "epoch": 0.4200278164116829, - "grad_norm": 3.6325385570526123, - "learning_rate": 1.4382838739833562e-05, - "loss": 0.8702, - "step": 9060 - }, - { - "epoch": 0.42025961984237364, - "grad_norm": 3.3029043674468994, - "learning_rate": 1.4375564399872005e-05, - "loss": 0.8015, - "step": 9065 - }, - { - "epoch": 0.42049142327306444, - "grad_norm": 4.105788707733154, - "learning_rate": 1.4368287195148746e-05, - "loss": 0.9538, - "step": 9070 - }, - { - "epoch": 0.4207232267037552, - "grad_norm": 3.321793556213379, - "learning_rate": 1.4361007130428311e-05, - "loss": 0.9167, - "step": 9075 - }, - { - "epoch": 0.42095503013444596, - "grad_norm": 4.301546096801758, - "learning_rate": 1.4353724210477084e-05, - "loss": 0.9535, - "step": 9080 - }, - { - "epoch": 0.42118683356513675, - "grad_norm": 3.8021814823150635, - "learning_rate": 1.4346438440063333e-05, - "loss": 0.876, - "step": 9085 - }, - { - "epoch": 0.42141863699582754, - "grad_norm": 3.5097808837890625, - "learning_rate": 1.433914982395718e-05, - "loss": 0.9506, - "step": 9090 - }, - { - "epoch": 0.42165044042651834, - "grad_norm": 3.8643198013305664, - "learning_rate": 1.4331858366930611e-05, - "loss": 0.9854, - "step": 9095 - }, - { - "epoch": 0.42188224385720907, - "grad_norm": 3.721323251724243, - "learning_rate": 1.4324564073757486e-05, - "loss": 0.8642, - "step": 9100 - }, - { - "epoch": 0.42188224385720907, - "eval_loss": 0.9581558108329773, - "eval_runtime": 11.2675, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 9100 - }, - { - "epoch": 0.42211404728789986, - "grad_norm": 3.54001522064209, - "learning_rate": 1.43172669492135e-05, - "loss": 0.8244, - "step": 9105 - }, - { - "epoch": 0.42234585071859065, - "grad_norm": 3.663177490234375, - "learning_rate": 1.4309966998076222e-05, - "loss": 0.9079, - "step": 9110 - }, - { - "epoch": 0.4225776541492814, - "grad_norm": 3.5418756008148193, - "learning_rate": 1.4302664225125058e-05, - "loss": 0.8807, - "step": 9115 - }, - { - "epoch": 0.4228094575799722, - "grad_norm": 3.5246732234954834, - "learning_rate": 1.429535863514127e-05, - "loss": 0.7437, - "step": 9120 - }, - { - "epoch": 0.423041261010663, - "grad_norm": 3.5707225799560547, - "learning_rate": 1.428805023290796e-05, - "loss": 1.0353, - "step": 9125 - }, - { - "epoch": 0.4232730644413537, - "grad_norm": 3.847414255142212, - "learning_rate": 1.4280739023210067e-05, - "loss": 1.1087, - "step": 9130 - }, - { - "epoch": 0.4235048678720445, - "grad_norm": 4.29426383972168, - "learning_rate": 1.427342501083438e-05, - "loss": 0.7605, - "step": 9135 - }, - { - "epoch": 0.4237366713027353, - "grad_norm": 3.425589084625244, - "learning_rate": 1.426610820056951e-05, - "loss": 0.9359, - "step": 9140 - }, - { - "epoch": 0.4239684747334261, - "grad_norm": 3.226741075515747, - "learning_rate": 1.4258788597205913e-05, - "loss": 0.8194, - "step": 9145 - }, - { - "epoch": 0.4242002781641168, - "grad_norm": 3.313603401184082, - "learning_rate": 1.425146620553586e-05, - "loss": 0.8958, - "step": 9150 - }, - { - "epoch": 0.4244320815948076, - "grad_norm": 4.064041614532471, - "learning_rate": 1.4244141030353458e-05, - "loss": 1.008, - "step": 9155 - }, - { - "epoch": 0.4246638850254984, - "grad_norm": 4.129899024963379, - "learning_rate": 1.4236813076454632e-05, - "loss": 0.8858, - "step": 9160 - }, - { - "epoch": 0.42489568845618914, - "grad_norm": 3.3487892150878906, - "learning_rate": 1.4229482348637125e-05, - "loss": 0.8874, - "step": 9165 - }, - { - "epoch": 0.4251274918868799, - "grad_norm": 3.243873357772827, - "learning_rate": 1.4222148851700504e-05, - "loss": 0.8868, - "step": 9170 - }, - { - "epoch": 0.4253592953175707, - "grad_norm": 4.211264133453369, - "learning_rate": 1.4214812590446135e-05, - "loss": 0.8756, - "step": 9175 - }, - { - "epoch": 0.42559109874826145, - "grad_norm": 3.718165159225464, - "learning_rate": 1.4207473569677203e-05, - "loss": 0.9177, - "step": 9180 - }, - { - "epoch": 0.42582290217895225, - "grad_norm": 3.3769257068634033, - "learning_rate": 1.4200131794198706e-05, - "loss": 0.8602, - "step": 9185 - }, - { - "epoch": 0.42605470560964304, - "grad_norm": 3.4048376083374023, - "learning_rate": 1.4192787268817427e-05, - "loss": 0.9522, - "step": 9190 - }, - { - "epoch": 0.42628650904033377, - "grad_norm": 4.399397850036621, - "learning_rate": 1.418543999834197e-05, - "loss": 0.8503, - "step": 9195 - }, - { - "epoch": 0.42651831247102456, - "grad_norm": 3.5596532821655273, - "learning_rate": 1.4178089987582722e-05, - "loss": 0.7514, - "step": 9200 - }, - { - "epoch": 0.42651831247102456, - "eval_loss": 0.957184374332428, - "eval_runtime": 11.2793, - "eval_samples_per_second": 11.26, - "eval_steps_per_second": 11.26, - "step": 9200 - }, - { - "epoch": 0.42675011590171535, - "grad_norm": 3.185898780822754, - "learning_rate": 1.4170737241351873e-05, - "loss": 0.9724, - "step": 9205 - }, - { - "epoch": 0.42698191933240615, - "grad_norm": 3.8426990509033203, - "learning_rate": 1.4163381764463395e-05, - "loss": 0.841, - "step": 9210 - }, - { - "epoch": 0.4272137227630969, - "grad_norm": 3.9944119453430176, - "learning_rate": 1.4156023561733054e-05, - "loss": 0.9992, - "step": 9215 - }, - { - "epoch": 0.4274455261937877, - "grad_norm": 3.3030200004577637, - "learning_rate": 1.41486626379784e-05, - "loss": 1.078, - "step": 9220 - }, - { - "epoch": 0.42767732962447846, - "grad_norm": 4.052797317504883, - "learning_rate": 1.4141298998018765e-05, - "loss": 0.7569, - "step": 9225 - }, - { - "epoch": 0.4279091330551692, - "grad_norm": 5.390350341796875, - "learning_rate": 1.4133932646675258e-05, - "loss": 0.9488, - "step": 9230 - }, - { - "epoch": 0.42814093648586, - "grad_norm": 3.9626481533050537, - "learning_rate": 1.412656358877076e-05, - "loss": 1.0712, - "step": 9235 - }, - { - "epoch": 0.4283727399165508, - "grad_norm": 3.662187337875366, - "learning_rate": 1.411919182912993e-05, - "loss": 0.8196, - "step": 9240 - }, - { - "epoch": 0.4286045433472415, - "grad_norm": 3.5590271949768066, - "learning_rate": 1.4111817372579198e-05, - "loss": 1.1157, - "step": 9245 - }, - { - "epoch": 0.4288363467779323, - "grad_norm": 3.3557329177856445, - "learning_rate": 1.4104440223946742e-05, - "loss": 0.8617, - "step": 9250 - }, - { - "epoch": 0.4290681502086231, - "grad_norm": 4.487248420715332, - "learning_rate": 1.4097060388062529e-05, - "loss": 0.9544, - "step": 9255 - }, - { - "epoch": 0.42929995363931384, - "grad_norm": 5.409276962280273, - "learning_rate": 1.4089677869758266e-05, - "loss": 1.0187, - "step": 9260 - }, - { - "epoch": 0.4295317570700046, - "grad_norm": 3.230283737182617, - "learning_rate": 1.4082292673867421e-05, - "loss": 0.971, - "step": 9265 - }, - { - "epoch": 0.4297635605006954, - "grad_norm": 4.221251010894775, - "learning_rate": 1.407490480522522e-05, - "loss": 0.928, - "step": 9270 - }, - { - "epoch": 0.4299953639313862, - "grad_norm": 4.350439071655273, - "learning_rate": 1.4067514268668631e-05, - "loss": 1.002, - "step": 9275 - }, - { - "epoch": 0.43022716736207695, - "grad_norm": 3.369349479675293, - "learning_rate": 1.4060121069036379e-05, - "loss": 0.8569, - "step": 9280 - }, - { - "epoch": 0.43045897079276774, - "grad_norm": 3.8549163341522217, - "learning_rate": 1.405272521116892e-05, - "loss": 0.9794, - "step": 9285 - }, - { - "epoch": 0.43069077422345853, - "grad_norm": 4.107554912567139, - "learning_rate": 1.404532669990846e-05, - "loss": 0.9398, - "step": 9290 - }, - { - "epoch": 0.43092257765414926, - "grad_norm": 3.9353766441345215, - "learning_rate": 1.4037925540098937e-05, - "loss": 1.0514, - "step": 9295 - }, - { - "epoch": 0.43115438108484005, - "grad_norm": 3.31166934967041, - "learning_rate": 1.4030521736586025e-05, - "loss": 0.6717, - "step": 9300 - }, - { - "epoch": 0.43115438108484005, - "eval_loss": 0.954317033290863, - "eval_runtime": 11.2662, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 9300 - }, - { - "epoch": 0.43138618451553085, - "grad_norm": 3.7155702114105225, - "learning_rate": 1.4023115294217135e-05, - "loss": 0.9786, - "step": 9305 - }, - { - "epoch": 0.4316179879462216, - "grad_norm": 3.2847890853881836, - "learning_rate": 1.4015706217841392e-05, - "loss": 0.8072, - "step": 9310 - }, - { - "epoch": 0.4318497913769124, - "grad_norm": 4.270799160003662, - "learning_rate": 1.400829451230966e-05, - "loss": 0.9943, - "step": 9315 - }, - { - "epoch": 0.43208159480760316, - "grad_norm": 3.409757614135742, - "learning_rate": 1.4000880182474512e-05, - "loss": 0.8742, - "step": 9320 - }, - { - "epoch": 0.4323133982382939, - "grad_norm": 3.892242908477783, - "learning_rate": 1.3993463233190245e-05, - "loss": 1.1398, - "step": 9325 - }, - { - "epoch": 0.4325452016689847, - "grad_norm": 4.129483222961426, - "learning_rate": 1.3986043669312874e-05, - "loss": 0.945, - "step": 9330 - }, - { - "epoch": 0.4327770050996755, - "grad_norm": 3.7721927165985107, - "learning_rate": 1.397862149570012e-05, - "loss": 1.0127, - "step": 9335 - }, - { - "epoch": 0.4330088085303663, - "grad_norm": 4.086781978607178, - "learning_rate": 1.397119671721142e-05, - "loss": 0.9437, - "step": 9340 - }, - { - "epoch": 0.433240611961057, - "grad_norm": 3.799339532852173, - "learning_rate": 1.3963769338707905e-05, - "loss": 0.8334, - "step": 9345 - }, - { - "epoch": 0.4334724153917478, - "grad_norm": 3.543330669403076, - "learning_rate": 1.3956339365052419e-05, - "loss": 0.9997, - "step": 9350 - }, - { - "epoch": 0.4337042188224386, - "grad_norm": 5.324821949005127, - "learning_rate": 1.3948906801109498e-05, - "loss": 0.8411, - "step": 9355 - }, - { - "epoch": 0.4339360222531293, - "grad_norm": 3.7983787059783936, - "learning_rate": 1.3941471651745382e-05, - "loss": 0.9135, - "step": 9360 - }, - { - "epoch": 0.4341678256838201, - "grad_norm": 3.554536819458008, - "learning_rate": 1.3934033921827994e-05, - "loss": 0.9295, - "step": 9365 - }, - { - "epoch": 0.4343996291145109, - "grad_norm": 4.4289140701293945, - "learning_rate": 1.3926593616226951e-05, - "loss": 1.0013, - "step": 9370 - }, - { - "epoch": 0.43463143254520165, - "grad_norm": 4.294192314147949, - "learning_rate": 1.3919150739813566e-05, - "loss": 1.0827, - "step": 9375 - }, - { - "epoch": 0.43486323597589244, - "grad_norm": 3.2097225189208984, - "learning_rate": 1.3911705297460812e-05, - "loss": 0.9832, - "step": 9380 - }, - { - "epoch": 0.43509503940658323, - "grad_norm": 3.6051383018493652, - "learning_rate": 1.3904257294043362e-05, - "loss": 0.9388, - "step": 9385 - }, - { - "epoch": 0.43532684283727396, - "grad_norm": 4.199221134185791, - "learning_rate": 1.3896806734437563e-05, - "loss": 0.8745, - "step": 9390 - }, - { - "epoch": 0.43555864626796476, - "grad_norm": 3.3234703540802, - "learning_rate": 1.3889353623521426e-05, - "loss": 0.9371, - "step": 9395 - }, - { - "epoch": 0.43579044969865555, - "grad_norm": 3.5717806816101074, - "learning_rate": 1.3881897966174642e-05, - "loss": 0.8621, - "step": 9400 - }, - { - "epoch": 0.43579044969865555, - "eval_loss": 0.9543979167938232, - "eval_runtime": 11.2671, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 9400 - }, - { - "epoch": 0.43602225312934634, - "grad_norm": 3.5517239570617676, - "learning_rate": 1.3874439767278562e-05, - "loss": 0.9488, - "step": 9405 - }, - { - "epoch": 0.4362540565600371, - "grad_norm": 3.392153739929199, - "learning_rate": 1.3866979031716212e-05, - "loss": 0.8369, - "step": 9410 - }, - { - "epoch": 0.43648585999072786, - "grad_norm": 4.374027729034424, - "learning_rate": 1.3859515764372265e-05, - "loss": 1.0121, - "step": 9415 - }, - { - "epoch": 0.43671766342141866, - "grad_norm": 3.541436195373535, - "learning_rate": 1.385204997013306e-05, - "loss": 0.9504, - "step": 9420 - }, - { - "epoch": 0.4369494668521094, - "grad_norm": 3.709672451019287, - "learning_rate": 1.3844581653886587e-05, - "loss": 1.021, - "step": 9425 - }, - { - "epoch": 0.4371812702828002, - "grad_norm": 3.851954460144043, - "learning_rate": 1.383711082052249e-05, - "loss": 1.0206, - "step": 9430 - }, - { - "epoch": 0.437413073713491, - "grad_norm": 4.461718559265137, - "learning_rate": 1.3829637474932058e-05, - "loss": 0.9523, - "step": 9435 - }, - { - "epoch": 0.4376448771441817, - "grad_norm": 3.9250519275665283, - "learning_rate": 1.3822161622008233e-05, - "loss": 0.9426, - "step": 9440 - }, - { - "epoch": 0.4378766805748725, - "grad_norm": 3.6661217212677, - "learning_rate": 1.381468326664558e-05, - "loss": 0.9269, - "step": 9445 - }, - { - "epoch": 0.4381084840055633, - "grad_norm": 4.122355937957764, - "learning_rate": 1.3807202413740329e-05, - "loss": 0.9779, - "step": 9450 - }, - { - "epoch": 0.4383402874362541, - "grad_norm": 3.240402936935425, - "learning_rate": 1.3799719068190318e-05, - "loss": 0.8976, - "step": 9455 - }, - { - "epoch": 0.4385720908669448, - "grad_norm": 4.14400577545166, - "learning_rate": 1.3792233234895037e-05, - "loss": 0.9346, - "step": 9460 - }, - { - "epoch": 0.4388038942976356, - "grad_norm": 3.896681308746338, - "learning_rate": 1.3784744918755593e-05, - "loss": 1.093, - "step": 9465 - }, - { - "epoch": 0.4390356977283264, - "grad_norm": 3.402827262878418, - "learning_rate": 1.3777254124674726e-05, - "loss": 1.0033, - "step": 9470 - }, - { - "epoch": 0.43926750115901714, - "grad_norm": 3.659036874771118, - "learning_rate": 1.3769760857556795e-05, - "loss": 0.9373, - "step": 9475 - }, - { - "epoch": 0.43949930458970793, - "grad_norm": 4.025357723236084, - "learning_rate": 1.3762265122307778e-05, - "loss": 1.0778, - "step": 9480 - }, - { - "epoch": 0.4397311080203987, - "grad_norm": 3.3651130199432373, - "learning_rate": 1.375476692383527e-05, - "loss": 0.8191, - "step": 9485 - }, - { - "epoch": 0.43996291145108946, - "grad_norm": 3.4289755821228027, - "learning_rate": 1.3747266267048475e-05, - "loss": 0.7334, - "step": 9490 - }, - { - "epoch": 0.44019471488178025, - "grad_norm": 3.8313558101654053, - "learning_rate": 1.3739763156858215e-05, - "loss": 0.8722, - "step": 9495 - }, - { - "epoch": 0.44042651831247104, - "grad_norm": 3.7963833808898926, - "learning_rate": 1.3732257598176908e-05, - "loss": 0.832, - "step": 9500 - }, - { - "epoch": 0.44042651831247104, - "eval_loss": 0.9525100588798523, - "eval_runtime": 11.2607, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 9500 - }, - { - "epoch": 0.4406583217431618, - "grad_norm": 4.198210716247559, - "learning_rate": 1.372474959591858e-05, - "loss": 1.0779, - "step": 9505 - }, - { - "epoch": 0.44089012517385257, - "grad_norm": 3.5892603397369385, - "learning_rate": 1.3717239154998862e-05, - "loss": 0.8437, - "step": 9510 - }, - { - "epoch": 0.44112192860454336, - "grad_norm": 4.606617450714111, - "learning_rate": 1.370972628033497e-05, - "loss": 1.0619, - "step": 9515 - }, - { - "epoch": 0.44135373203523415, - "grad_norm": 2.997912883758545, - "learning_rate": 1.3702210976845725e-05, - "loss": 0.8647, - "step": 9520 - }, - { - "epoch": 0.4415855354659249, - "grad_norm": 3.43404483795166, - "learning_rate": 1.3694693249451532e-05, - "loss": 0.766, - "step": 9525 - }, - { - "epoch": 0.4418173388966157, - "grad_norm": 3.4902102947235107, - "learning_rate": 1.3687173103074378e-05, - "loss": 0.8688, - "step": 9530 - }, - { - "epoch": 0.44204914232730647, - "grad_norm": 3.934156656265259, - "learning_rate": 1.3679650542637848e-05, - "loss": 0.9175, - "step": 9535 - }, - { - "epoch": 0.4422809457579972, - "grad_norm": 4.285318374633789, - "learning_rate": 1.3672125573067097e-05, - "loss": 0.918, - "step": 9540 - }, - { - "epoch": 0.442512749188688, - "grad_norm": 3.6897430419921875, - "learning_rate": 1.3664598199288857e-05, - "loss": 0.829, - "step": 9545 - }, - { - "epoch": 0.4427445526193788, - "grad_norm": 4.011903285980225, - "learning_rate": 1.3657068426231439e-05, - "loss": 1.1165, - "step": 9550 - }, - { - "epoch": 0.4429763560500695, - "grad_norm": 4.198677062988281, - "learning_rate": 1.3649536258824722e-05, - "loss": 0.8271, - "step": 9555 - }, - { - "epoch": 0.4432081594807603, - "grad_norm": 4.13018274307251, - "learning_rate": 1.3642001702000155e-05, - "loss": 0.8224, - "step": 9560 - }, - { - "epoch": 0.4434399629114511, - "grad_norm": 3.2623791694641113, - "learning_rate": 1.3634464760690742e-05, - "loss": 0.8145, - "step": 9565 - }, - { - "epoch": 0.44367176634214184, - "grad_norm": 4.026551246643066, - "learning_rate": 1.3626925439831067e-05, - "loss": 0.9373, - "step": 9570 - }, - { - "epoch": 0.44390356977283263, - "grad_norm": 4.671341896057129, - "learning_rate": 1.361938374435725e-05, - "loss": 0.9916, - "step": 9575 - }, - { - "epoch": 0.4441353732035234, - "grad_norm": 3.9089975357055664, - "learning_rate": 1.3611839679206986e-05, - "loss": 0.9412, - "step": 9580 - }, - { - "epoch": 0.4443671766342142, - "grad_norm": 4.026656150817871, - "learning_rate": 1.3604293249319507e-05, - "loss": 0.7684, - "step": 9585 - }, - { - "epoch": 0.44459898006490495, - "grad_norm": 3.824301242828369, - "learning_rate": 1.3596744459635595e-05, - "loss": 1.0024, - "step": 9590 - }, - { - "epoch": 0.44483078349559574, - "grad_norm": 3.724039316177368, - "learning_rate": 1.3589193315097589e-05, - "loss": 0.8391, - "step": 9595 - }, - { - "epoch": 0.44506258692628653, - "grad_norm": 3.2751495838165283, - "learning_rate": 1.358163982064935e-05, - "loss": 0.8337, - "step": 9600 - }, - { - "epoch": 0.44506258692628653, - "eval_loss": 0.9512506127357483, - "eval_runtime": 11.2694, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 9600 - }, - { - "epoch": 0.44529439035697727, - "grad_norm": 3.5212228298187256, - "learning_rate": 1.3574083981236295e-05, - "loss": 1.0335, - "step": 9605 - }, - { - "epoch": 0.44552619378766806, - "grad_norm": 4.416726112365723, - "learning_rate": 1.3566525801805369e-05, - "loss": 0.9456, - "step": 9610 - }, - { - "epoch": 0.44575799721835885, - "grad_norm": 3.1660563945770264, - "learning_rate": 1.3558965287305049e-05, - "loss": 0.7768, - "step": 9615 - }, - { - "epoch": 0.4459898006490496, - "grad_norm": 3.384840488433838, - "learning_rate": 1.3551402442685342e-05, - "loss": 0.9298, - "step": 9620 - }, - { - "epoch": 0.4462216040797404, - "grad_norm": 3.863919496536255, - "learning_rate": 1.3543837272897778e-05, - "loss": 0.7726, - "step": 9625 - }, - { - "epoch": 0.44645340751043117, - "grad_norm": 3.7253286838531494, - "learning_rate": 1.3536269782895416e-05, - "loss": 0.8242, - "step": 9630 - }, - { - "epoch": 0.4466852109411219, - "grad_norm": 3.352973461151123, - "learning_rate": 1.3528699977632824e-05, - "loss": 0.8054, - "step": 9635 - }, - { - "epoch": 0.4469170143718127, - "grad_norm": 3.536757707595825, - "learning_rate": 1.3521127862066095e-05, - "loss": 0.7579, - "step": 9640 - }, - { - "epoch": 0.4471488178025035, - "grad_norm": 3.5800118446350098, - "learning_rate": 1.351355344115283e-05, - "loss": 0.8839, - "step": 9645 - }, - { - "epoch": 0.4473806212331943, - "grad_norm": 3.774768829345703, - "learning_rate": 1.3505976719852138e-05, - "loss": 0.8917, - "step": 9650 - }, - { - "epoch": 0.447612424663885, - "grad_norm": 3.8715434074401855, - "learning_rate": 1.3498397703124644e-05, - "loss": 0.903, - "step": 9655 - }, - { - "epoch": 0.4478442280945758, - "grad_norm": 4.232203483581543, - "learning_rate": 1.3490816395932456e-05, - "loss": 0.9655, - "step": 9660 - }, - { - "epoch": 0.4480760315252666, - "grad_norm": 3.0055248737335205, - "learning_rate": 1.3483232803239203e-05, - "loss": 0.8359, - "step": 9665 - }, - { - "epoch": 0.44830783495595733, - "grad_norm": 3.7039334774017334, - "learning_rate": 1.3475646930010005e-05, - "loss": 0.8554, - "step": 9670 - }, - { - "epoch": 0.4485396383866481, - "grad_norm": 3.22282338142395, - "learning_rate": 1.346805878121146e-05, - "loss": 0.9855, - "step": 9675 - }, - { - "epoch": 0.4487714418173389, - "grad_norm": 4.586467266082764, - "learning_rate": 1.3460468361811674e-05, - "loss": 0.8068, - "step": 9680 - }, - { - "epoch": 0.44900324524802965, - "grad_norm": 3.3284201622009277, - "learning_rate": 1.345287567678023e-05, - "loss": 0.892, - "step": 9685 - }, - { - "epoch": 0.44923504867872044, - "grad_norm": 3.7659194469451904, - "learning_rate": 1.34452807310882e-05, - "loss": 0.9078, - "step": 9690 - }, - { - "epoch": 0.44946685210941123, - "grad_norm": 4.3242363929748535, - "learning_rate": 1.3437683529708132e-05, - "loss": 0.904, - "step": 9695 - }, - { - "epoch": 0.44969865554010197, - "grad_norm": 4.269758701324463, - "learning_rate": 1.3430084077614052e-05, - "loss": 0.9889, - "step": 9700 - }, - { - "epoch": 0.44969865554010197, - "eval_loss": 0.9509567618370056, - "eval_runtime": 11.2534, - "eval_samples_per_second": 11.285, - "eval_steps_per_second": 11.285, - "step": 9700 - }, - { - "epoch": 0.44993045897079276, - "grad_norm": 3.9716100692749023, - "learning_rate": 1.3422482379781459e-05, - "loss": 0.8453, - "step": 9705 - }, - { - "epoch": 0.45016226240148355, - "grad_norm": 4.124273300170898, - "learning_rate": 1.3414878441187321e-05, - "loss": 0.885, - "step": 9710 - }, - { - "epoch": 0.45039406583217434, - "grad_norm": 3.850370168685913, - "learning_rate": 1.3407272266810082e-05, - "loss": 1.081, - "step": 9715 - }, - { - "epoch": 0.4506258692628651, - "grad_norm": 3.7931642532348633, - "learning_rate": 1.3399663861629638e-05, - "loss": 0.9646, - "step": 9720 - }, - { - "epoch": 0.45085767269355587, - "grad_norm": 3.9382190704345703, - "learning_rate": 1.3392053230627349e-05, - "loss": 0.7776, - "step": 9725 - }, - { - "epoch": 0.45108947612424666, - "grad_norm": 4.051333427429199, - "learning_rate": 1.3384440378786039e-05, - "loss": 0.9445, - "step": 9730 - }, - { - "epoch": 0.4513212795549374, - "grad_norm": 4.055787563323975, - "learning_rate": 1.3376825311089974e-05, - "loss": 0.9952, - "step": 9735 - }, - { - "epoch": 0.4515530829856282, - "grad_norm": 3.4830150604248047, - "learning_rate": 1.3369208032524884e-05, - "loss": 0.9384, - "step": 9740 - }, - { - "epoch": 0.451784886416319, - "grad_norm": 3.9164631366729736, - "learning_rate": 1.3361588548077935e-05, - "loss": 0.8746, - "step": 9745 - }, - { - "epoch": 0.4520166898470097, - "grad_norm": 3.741633415222168, - "learning_rate": 1.3353966862737746e-05, - "loss": 0.9874, - "step": 9750 - }, - { - "epoch": 0.4522484932777005, - "grad_norm": 3.608835220336914, - "learning_rate": 1.3346342981494372e-05, - "loss": 0.9664, - "step": 9755 - }, - { - "epoch": 0.4524802967083913, - "grad_norm": 3.453850746154785, - "learning_rate": 1.3338716909339304e-05, - "loss": 0.8703, - "step": 9760 - }, - { - "epoch": 0.4527121001390821, - "grad_norm": 3.682974100112915, - "learning_rate": 1.3331088651265473e-05, - "loss": 1.0097, - "step": 9765 - }, - { - "epoch": 0.4529439035697728, - "grad_norm": 3.790799140930176, - "learning_rate": 1.3323458212267238e-05, - "loss": 0.8946, - "step": 9770 - }, - { - "epoch": 0.4531757070004636, - "grad_norm": 3.7424209117889404, - "learning_rate": 1.3315825597340388e-05, - "loss": 0.872, - "step": 9775 - }, - { - "epoch": 0.4534075104311544, - "grad_norm": 4.28916597366333, - "learning_rate": 1.3308190811482134e-05, - "loss": 0.9537, - "step": 9780 - }, - { - "epoch": 0.45363931386184514, - "grad_norm": 4.0296220779418945, - "learning_rate": 1.3300553859691103e-05, - "loss": 1.019, - "step": 9785 - }, - { - "epoch": 0.45387111729253593, - "grad_norm": 3.4381442070007324, - "learning_rate": 1.3292914746967361e-05, - "loss": 0.8505, - "step": 9790 - }, - { - "epoch": 0.4541029207232267, - "grad_norm": 3.793199300765991, - "learning_rate": 1.3285273478312359e-05, - "loss": 0.8457, - "step": 9795 - }, - { - "epoch": 0.45433472415391746, - "grad_norm": 3.8284950256347656, - "learning_rate": 1.3277630058728984e-05, - "loss": 1.0618, - "step": 9800 - }, - { - "epoch": 0.45433472415391746, - "eval_loss": 0.9499906301498413, - "eval_runtime": 11.2829, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 9800 - }, - { - "epoch": 0.45456652758460825, - "grad_norm": 3.412783622741699, - "learning_rate": 1.3269984493221516e-05, - "loss": 0.8846, - "step": 9805 - }, - { - "epoch": 0.45479833101529904, - "grad_norm": 3.8301942348480225, - "learning_rate": 1.3262336786795654e-05, - "loss": 0.982, - "step": 9810 - }, - { - "epoch": 0.4550301344459898, - "grad_norm": 3.887139081954956, - "learning_rate": 1.3254686944458484e-05, - "loss": 0.8593, - "step": 9815 - }, - { - "epoch": 0.45526193787668057, - "grad_norm": 3.891221761703491, - "learning_rate": 1.32470349712185e-05, - "loss": 0.9703, - "step": 9820 - }, - { - "epoch": 0.45549374130737136, - "grad_norm": 3.008439540863037, - "learning_rate": 1.3239380872085586e-05, - "loss": 0.7827, - "step": 9825 - }, - { - "epoch": 0.45572554473806215, - "grad_norm": 3.8640429973602295, - "learning_rate": 1.3231724652071021e-05, - "loss": 0.9659, - "step": 9830 - }, - { - "epoch": 0.4559573481687529, - "grad_norm": 3.8008694648742676, - "learning_rate": 1.3224066316187472e-05, - "loss": 0.8439, - "step": 9835 - }, - { - "epoch": 0.4561891515994437, - "grad_norm": 3.950207471847534, - "learning_rate": 1.3216405869448991e-05, - "loss": 1.0957, - "step": 9840 - }, - { - "epoch": 0.45642095503013447, - "grad_norm": 4.314381122589111, - "learning_rate": 1.3208743316871013e-05, - "loss": 0.9362, - "step": 9845 - }, - { - "epoch": 0.4566527584608252, - "grad_norm": 3.4835643768310547, - "learning_rate": 1.3201078663470348e-05, - "loss": 1.0829, - "step": 9850 - }, - { - "epoch": 0.456884561891516, - "grad_norm": 3.5081305503845215, - "learning_rate": 1.3193411914265186e-05, - "loss": 0.8306, - "step": 9855 - }, - { - "epoch": 0.4571163653222068, - "grad_norm": 3.41742205619812, - "learning_rate": 1.318574307427509e-05, - "loss": 0.7945, - "step": 9860 - }, - { - "epoch": 0.4573481687528975, - "grad_norm": 4.183412075042725, - "learning_rate": 1.3178072148520981e-05, - "loss": 0.9223, - "step": 9865 - }, - { - "epoch": 0.4575799721835883, - "grad_norm": 3.7634990215301514, - "learning_rate": 1.3170399142025163e-05, - "loss": 0.7524, - "step": 9870 - }, - { - "epoch": 0.4578117756142791, - "grad_norm": 3.36533784866333, - "learning_rate": 1.3162724059811291e-05, - "loss": 0.8724, - "step": 9875 - }, - { - "epoch": 0.45804357904496984, - "grad_norm": 3.756016254425049, - "learning_rate": 1.3155046906904376e-05, - "loss": 0.9082, - "step": 9880 - }, - { - "epoch": 0.45827538247566063, - "grad_norm": 3.022764205932617, - "learning_rate": 1.3147367688330795e-05, - "loss": 0.8565, - "step": 9885 - }, - { - "epoch": 0.4585071859063514, - "grad_norm": 3.910804271697998, - "learning_rate": 1.3139686409118268e-05, - "loss": 0.994, - "step": 9890 - }, - { - "epoch": 0.4587389893370422, - "grad_norm": 4.257674694061279, - "learning_rate": 1.3132003074295874e-05, - "loss": 0.8303, - "step": 9895 - }, - { - "epoch": 0.45897079276773295, - "grad_norm": 4.139712810516357, - "learning_rate": 1.3124317688894026e-05, - "loss": 0.9909, - "step": 9900 - }, - { - "epoch": 0.45897079276773295, - "eval_loss": 0.948937714099884, - "eval_runtime": 11.2699, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 9900 - }, - { - "epoch": 0.45920259619842374, - "grad_norm": 3.3769872188568115, - "learning_rate": 1.3116630257944492e-05, - "loss": 0.9206, - "step": 9905 - }, - { - "epoch": 0.45943439962911453, - "grad_norm": 3.6999621391296387, - "learning_rate": 1.3108940786480366e-05, - "loss": 0.9053, - "step": 9910 - }, - { - "epoch": 0.45966620305980527, - "grad_norm": 3.6005308628082275, - "learning_rate": 1.3101249279536089e-05, - "loss": 0.8128, - "step": 9915 - }, - { - "epoch": 0.45989800649049606, - "grad_norm": 3.3132901191711426, - "learning_rate": 1.3093555742147435e-05, - "loss": 0.8088, - "step": 9920 - }, - { - "epoch": 0.46012980992118685, - "grad_norm": 3.400157928466797, - "learning_rate": 1.3085860179351493e-05, - "loss": 0.9798, - "step": 9925 - }, - { - "epoch": 0.4603616133518776, - "grad_norm": 3.298585891723633, - "learning_rate": 1.3078162596186697e-05, - "loss": 1.0104, - "step": 9930 - }, - { - "epoch": 0.4605934167825684, - "grad_norm": 4.665667533874512, - "learning_rate": 1.3070462997692794e-05, - "loss": 0.9341, - "step": 9935 - }, - { - "epoch": 0.46082522021325917, - "grad_norm": 3.343885660171509, - "learning_rate": 1.3062761388910842e-05, - "loss": 1.0162, - "step": 9940 - }, - { - "epoch": 0.4610570236439499, - "grad_norm": 3.290959596633911, - "learning_rate": 1.3055057774883237e-05, - "loss": 0.9058, - "step": 9945 - }, - { - "epoch": 0.4612888270746407, - "grad_norm": 3.9871811866760254, - "learning_rate": 1.3047352160653668e-05, - "loss": 1.0865, - "step": 9950 - }, - { - "epoch": 0.4615206305053315, - "grad_norm": 4.483913421630859, - "learning_rate": 1.3039644551267143e-05, - "loss": 0.9857, - "step": 9955 - }, - { - "epoch": 0.4617524339360223, - "grad_norm": 3.8120338916778564, - "learning_rate": 1.3031934951769975e-05, - "loss": 0.9333, - "step": 9960 - }, - { - "epoch": 0.461984237366713, - "grad_norm": 3.768869638442993, - "learning_rate": 1.3024223367209777e-05, - "loss": 0.8497, - "step": 9965 - }, - { - "epoch": 0.4622160407974038, - "grad_norm": 3.1632208824157715, - "learning_rate": 1.3016509802635464e-05, - "loss": 0.8643, - "step": 9970 - }, - { - "epoch": 0.4624478442280946, - "grad_norm": 3.730023145675659, - "learning_rate": 1.3008794263097248e-05, - "loss": 0.7617, - "step": 9975 - }, - { - "epoch": 0.46267964765878533, - "grad_norm": 3.803635597229004, - "learning_rate": 1.3001076753646637e-05, - "loss": 0.8348, - "step": 9980 - }, - { - "epoch": 0.4629114510894761, - "grad_norm": 3.9034042358398438, - "learning_rate": 1.299335727933642e-05, - "loss": 0.9941, - "step": 9985 - }, - { - "epoch": 0.4631432545201669, - "grad_norm": 4.450758934020996, - "learning_rate": 1.2985635845220678e-05, - "loss": 1.0976, - "step": 9990 - }, - { - "epoch": 0.46337505795085765, - "grad_norm": 3.4911108016967773, - "learning_rate": 1.2977912456354778e-05, - "loss": 0.872, - "step": 9995 - }, - { - "epoch": 0.46360686138154844, - "grad_norm": 3.8963191509246826, - "learning_rate": 1.2970187117795357e-05, - "loss": 1.0407, - "step": 10000 - }, - { - "epoch": 0.46360686138154844, - "eval_loss": 0.9476349949836731, - "eval_runtime": 11.2604, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 10000 - }, - { - "epoch": 0.46383866481223923, - "grad_norm": 4.193517208099365, - "learning_rate": 1.2962459834600341e-05, - "loss": 0.9288, - "step": 10005 - }, - { - "epoch": 0.46407046824292997, - "grad_norm": 3.453949213027954, - "learning_rate": 1.295473061182892e-05, - "loss": 0.9205, - "step": 10010 - }, - { - "epoch": 0.46430227167362076, - "grad_norm": 3.316931962966919, - "learning_rate": 1.2946999454541558e-05, - "loss": 0.8677, - "step": 10015 - }, - { - "epoch": 0.46453407510431155, - "grad_norm": 3.576442241668701, - "learning_rate": 1.2939266367799983e-05, - "loss": 0.953, - "step": 10020 - }, - { - "epoch": 0.46476587853500234, - "grad_norm": 3.715101957321167, - "learning_rate": 1.2931531356667191e-05, - "loss": 0.8921, - "step": 10025 - }, - { - "epoch": 0.4649976819656931, - "grad_norm": 4.151893138885498, - "learning_rate": 1.292379442620743e-05, - "loss": 0.8068, - "step": 10030 - }, - { - "epoch": 0.46522948539638387, - "grad_norm": 3.853817939758301, - "learning_rate": 1.2916055581486211e-05, - "loss": 0.9439, - "step": 10035 - }, - { - "epoch": 0.46546128882707466, - "grad_norm": 3.5883684158325195, - "learning_rate": 1.29083148275703e-05, - "loss": 0.8318, - "step": 10040 - }, - { - "epoch": 0.4656930922577654, - "grad_norm": 4.212533950805664, - "learning_rate": 1.2900572169527705e-05, - "loss": 1.0225, - "step": 10045 - }, - { - "epoch": 0.4659248956884562, - "grad_norm": 2.86691951751709, - "learning_rate": 1.289282761242769e-05, - "loss": 0.8968, - "step": 10050 - }, - { - "epoch": 0.466156699119147, - "grad_norm": 2.563044548034668, - "learning_rate": 1.2885081161340751e-05, - "loss": 0.8134, - "step": 10055 - }, - { - "epoch": 0.4663885025498377, - "grad_norm": 3.6845993995666504, - "learning_rate": 1.2877332821338635e-05, - "loss": 0.9026, - "step": 10060 - }, - { - "epoch": 0.4666203059805285, - "grad_norm": 4.9104413986206055, - "learning_rate": 1.2869582597494324e-05, - "loss": 1.0064, - "step": 10065 - }, - { - "epoch": 0.4668521094112193, - "grad_norm": 3.9013454914093018, - "learning_rate": 1.2861830494882021e-05, - "loss": 0.9372, - "step": 10070 - }, - { - "epoch": 0.4670839128419101, - "grad_norm": 3.2289061546325684, - "learning_rate": 1.285407651857718e-05, - "loss": 0.8886, - "step": 10075 - }, - { - "epoch": 0.4673157162726008, - "grad_norm": 3.8512096405029297, - "learning_rate": 1.2846320673656464e-05, - "loss": 0.9159, - "step": 10080 - }, - { - "epoch": 0.4675475197032916, - "grad_norm": 4.084563732147217, - "learning_rate": 1.2838562965197766e-05, - "loss": 0.8895, - "step": 10085 - }, - { - "epoch": 0.4677793231339824, - "grad_norm": 4.185845375061035, - "learning_rate": 1.2830803398280203e-05, - "loss": 0.9423, - "step": 10090 - }, - { - "epoch": 0.46801112656467314, - "grad_norm": 3.8135392665863037, - "learning_rate": 1.2823041977984102e-05, - "loss": 0.8725, - "step": 10095 - }, - { - "epoch": 0.46824292999536393, - "grad_norm": 4.087751865386963, - "learning_rate": 1.2815278709391004e-05, - "loss": 0.9895, - "step": 10100 - }, - { - "epoch": 0.46824292999536393, - "eval_loss": 0.9459433555603027, - "eval_runtime": 11.2767, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 10100 - }, - { - "epoch": 0.4684747334260547, - "grad_norm": 3.648947238922119, - "learning_rate": 1.2807513597583665e-05, - "loss": 1.0151, - "step": 10105 - }, - { - "epoch": 0.46870653685674546, - "grad_norm": 3.339587926864624, - "learning_rate": 1.2799746647646045e-05, - "loss": 0.9785, - "step": 10110 - }, - { - "epoch": 0.46893834028743625, - "grad_norm": 3.7756528854370117, - "learning_rate": 1.279197786466331e-05, - "loss": 0.7827, - "step": 10115 - }, - { - "epoch": 0.46917014371812704, - "grad_norm": 4.143910884857178, - "learning_rate": 1.2784207253721817e-05, - "loss": 0.954, - "step": 10120 - }, - { - "epoch": 0.4694019471488178, - "grad_norm": 2.996795892715454, - "learning_rate": 1.2776434819909136e-05, - "loss": 0.6456, - "step": 10125 - }, - { - "epoch": 0.46963375057950857, - "grad_norm": 4.004779815673828, - "learning_rate": 1.2768660568314012e-05, - "loss": 1.0712, - "step": 10130 - }, - { - "epoch": 0.46986555401019936, - "grad_norm": 3.2952988147735596, - "learning_rate": 1.2760884504026392e-05, - "loss": 0.7785, - "step": 10135 - }, - { - "epoch": 0.47009735744089015, - "grad_norm": 5.342103958129883, - "learning_rate": 1.275310663213741e-05, - "loss": 1.0066, - "step": 10140 - }, - { - "epoch": 0.4703291608715809, - "grad_norm": 4.140252113342285, - "learning_rate": 1.2745326957739375e-05, - "loss": 0.959, - "step": 10145 - }, - { - "epoch": 0.4705609643022717, - "grad_norm": 3.928713321685791, - "learning_rate": 1.2737545485925786e-05, - "loss": 0.9485, - "step": 10150 - }, - { - "epoch": 0.47079276773296247, - "grad_norm": 4.4672136306762695, - "learning_rate": 1.2729762221791312e-05, - "loss": 0.9565, - "step": 10155 - }, - { - "epoch": 0.4710245711636532, - "grad_norm": 3.2409780025482178, - "learning_rate": 1.27219771704318e-05, - "loss": 0.8986, - "step": 10160 - }, - { - "epoch": 0.471256374594344, - "grad_norm": 4.780323505401611, - "learning_rate": 1.2714190336944264e-05, - "loss": 0.9055, - "step": 10165 - }, - { - "epoch": 0.4714881780250348, - "grad_norm": 3.7296030521392822, - "learning_rate": 1.2706401726426884e-05, - "loss": 0.8048, - "step": 10170 - }, - { - "epoch": 0.4717199814557255, - "grad_norm": 3.707808017730713, - "learning_rate": 1.2698611343979007e-05, - "loss": 0.9201, - "step": 10175 - }, - { - "epoch": 0.4719517848864163, - "grad_norm": 3.560077667236328, - "learning_rate": 1.2690819194701134e-05, - "loss": 0.9069, - "step": 10180 - }, - { - "epoch": 0.4721835883171071, - "grad_norm": 4.014420032501221, - "learning_rate": 1.2683025283694936e-05, - "loss": 0.9935, - "step": 10185 - }, - { - "epoch": 0.47241539174779784, - "grad_norm": 3.263787031173706, - "learning_rate": 1.267522961606322e-05, - "loss": 0.8071, - "step": 10190 - }, - { - "epoch": 0.47264719517848863, - "grad_norm": 4.139646053314209, - "learning_rate": 1.2667432196909955e-05, - "loss": 0.9272, - "step": 10195 - }, - { - "epoch": 0.4728789986091794, - "grad_norm": 4.009764671325684, - "learning_rate": 1.2659633031340255e-05, - "loss": 0.9658, - "step": 10200 - }, - { - "epoch": 0.4728789986091794, - "eval_loss": 0.9445049166679382, - "eval_runtime": 11.2699, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 10200 - }, - { - "epoch": 0.4731108020398702, - "grad_norm": 3.5922906398773193, - "learning_rate": 1.2651832124460369e-05, - "loss": 0.9533, - "step": 10205 - }, - { - "epoch": 0.47334260547056095, - "grad_norm": 3.384347677230835, - "learning_rate": 1.26440294813777e-05, - "loss": 0.7947, - "step": 10210 - }, - { - "epoch": 0.47357440890125174, - "grad_norm": 3.7490670680999756, - "learning_rate": 1.2636225107200779e-05, - "loss": 0.7939, - "step": 10215 - }, - { - "epoch": 0.47380621233194253, - "grad_norm": 4.333261966705322, - "learning_rate": 1.262841900703927e-05, - "loss": 0.7959, - "step": 10220 - }, - { - "epoch": 0.47403801576263327, - "grad_norm": 4.149163722991943, - "learning_rate": 1.262061118600397e-05, - "loss": 0.9461, - "step": 10225 - }, - { - "epoch": 0.47426981919332406, - "grad_norm": 3.4442434310913086, - "learning_rate": 1.26128016492068e-05, - "loss": 0.874, - "step": 10230 - }, - { - "epoch": 0.47450162262401485, - "grad_norm": 3.540757894515991, - "learning_rate": 1.260499040176081e-05, - "loss": 0.9761, - "step": 10235 - }, - { - "epoch": 0.4747334260547056, - "grad_norm": 4.07343864440918, - "learning_rate": 1.2597177448780162e-05, - "loss": 1.0432, - "step": 10240 - }, - { - "epoch": 0.4749652294853964, - "grad_norm": 3.954550266265869, - "learning_rate": 1.258936279538014e-05, - "loss": 0.9347, - "step": 10245 - }, - { - "epoch": 0.47519703291608717, - "grad_norm": 4.400870323181152, - "learning_rate": 1.2581546446677142e-05, - "loss": 1.2181, - "step": 10250 - }, - { - "epoch": 0.4754288363467779, - "grad_norm": 3.343297004699707, - "learning_rate": 1.2573728407788667e-05, - "loss": 0.7962, - "step": 10255 - }, - { - "epoch": 0.4756606397774687, - "grad_norm": 3.889265775680542, - "learning_rate": 1.2565908683833337e-05, - "loss": 0.8248, - "step": 10260 - }, - { - "epoch": 0.4758924432081595, - "grad_norm": 3.9012787342071533, - "learning_rate": 1.2558087279930859e-05, - "loss": 0.9967, - "step": 10265 - }, - { - "epoch": 0.4761242466388503, - "grad_norm": 3.298150062561035, - "learning_rate": 1.2550264201202054e-05, - "loss": 0.9277, - "step": 10270 - }, - { - "epoch": 0.476356050069541, - "grad_norm": 3.909010887145996, - "learning_rate": 1.2542439452768833e-05, - "loss": 0.9993, - "step": 10275 - }, - { - "epoch": 0.4765878535002318, - "grad_norm": 3.329361915588379, - "learning_rate": 1.2534613039754201e-05, - "loss": 0.9402, - "step": 10280 - }, - { - "epoch": 0.4768196569309226, - "grad_norm": 3.2942123413085938, - "learning_rate": 1.2526784967282255e-05, - "loss": 0.8344, - "step": 10285 - }, - { - "epoch": 0.47705146036161333, - "grad_norm": 4.618292808532715, - "learning_rate": 1.251895524047817e-05, - "loss": 0.8381, - "step": 10290 - }, - { - "epoch": 0.4772832637923041, - "grad_norm": 3.3553268909454346, - "learning_rate": 1.2511123864468218e-05, - "loss": 0.844, - "step": 10295 - }, - { - "epoch": 0.4775150672229949, - "grad_norm": 4.03507137298584, - "learning_rate": 1.2503290844379739e-05, - "loss": 0.9175, - "step": 10300 - }, - { - "epoch": 0.4775150672229949, - "eval_loss": 0.9433912634849548, - "eval_runtime": 11.2657, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 10300 - }, - { - "epoch": 0.47774687065368565, - "grad_norm": 3.0558102130889893, - "learning_rate": 1.2495456185341157e-05, - "loss": 0.9507, - "step": 10305 - }, - { - "epoch": 0.47797867408437644, - "grad_norm": 3.6690430641174316, - "learning_rate": 1.2487619892481966e-05, - "loss": 0.8518, - "step": 10310 - }, - { - "epoch": 0.47821047751506723, - "grad_norm": 4.014634132385254, - "learning_rate": 1.2479781970932727e-05, - "loss": 0.8619, - "step": 10315 - }, - { - "epoch": 0.47844228094575797, - "grad_norm": 3.3118784427642822, - "learning_rate": 1.247194242582507e-05, - "loss": 0.9213, - "step": 10320 - }, - { - "epoch": 0.47867408437644876, - "grad_norm": 3.962343454360962, - "learning_rate": 1.2464101262291688e-05, - "loss": 0.9235, - "step": 10325 - }, - { - "epoch": 0.47890588780713955, - "grad_norm": 3.672382354736328, - "learning_rate": 1.2456258485466339e-05, - "loss": 0.8238, - "step": 10330 - }, - { - "epoch": 0.47913769123783034, - "grad_norm": 4.043935298919678, - "learning_rate": 1.2448414100483826e-05, - "loss": 0.986, - "step": 10335 - }, - { - "epoch": 0.4793694946685211, - "grad_norm": 3.571040630340576, - "learning_rate": 1.2440568112480009e-05, - "loss": 0.8952, - "step": 10340 - }, - { - "epoch": 0.47960129809921187, - "grad_norm": 4.216655731201172, - "learning_rate": 1.2432720526591808e-05, - "loss": 1.0396, - "step": 10345 - }, - { - "epoch": 0.47983310152990266, - "grad_norm": 3.7450881004333496, - "learning_rate": 1.2424871347957173e-05, - "loss": 0.9586, - "step": 10350 - }, - { - "epoch": 0.4800649049605934, - "grad_norm": 3.8393282890319824, - "learning_rate": 1.2417020581715104e-05, - "loss": 0.8527, - "step": 10355 - }, - { - "epoch": 0.4802967083912842, - "grad_norm": 3.087303400039673, - "learning_rate": 1.2409168233005644e-05, - "loss": 0.8458, - "step": 10360 - }, - { - "epoch": 0.480528511821975, - "grad_norm": 3.759127378463745, - "learning_rate": 1.2401314306969869e-05, - "loss": 1.0019, - "step": 10365 - }, - { - "epoch": 0.4807603152526657, - "grad_norm": 3.2928617000579834, - "learning_rate": 1.2393458808749885e-05, - "loss": 0.8309, - "step": 10370 - }, - { - "epoch": 0.4809921186833565, - "grad_norm": 3.7181479930877686, - "learning_rate": 1.2385601743488831e-05, - "loss": 0.8989, - "step": 10375 - }, - { - "epoch": 0.4812239221140473, - "grad_norm": 3.783705711364746, - "learning_rate": 1.2377743116330869e-05, - "loss": 0.8045, - "step": 10380 - }, - { - "epoch": 0.4814557255447381, - "grad_norm": 3.634629487991333, - "learning_rate": 1.2369882932421184e-05, - "loss": 0.9103, - "step": 10385 - }, - { - "epoch": 0.4816875289754288, - "grad_norm": 3.8915700912475586, - "learning_rate": 1.236202119690599e-05, - "loss": 0.9318, - "step": 10390 - }, - { - "epoch": 0.4819193324061196, - "grad_norm": 4.209451675415039, - "learning_rate": 1.2354157914932501e-05, - "loss": 0.9598, - "step": 10395 - }, - { - "epoch": 0.4821511358368104, - "grad_norm": 4.348424911499023, - "learning_rate": 1.234629309164895e-05, - "loss": 0.915, - "step": 10400 - }, - { - "epoch": 0.4821511358368104, - "eval_loss": 0.9418513178825378, - "eval_runtime": 11.2768, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 10400 - }, - { - "epoch": 0.48238293926750114, - "grad_norm": 3.9658772945404053, - "learning_rate": 1.2338426732204585e-05, - "loss": 0.9433, - "step": 10405 - }, - { - "epoch": 0.48261474269819193, - "grad_norm": 3.0501604080200195, - "learning_rate": 1.2330558841749649e-05, - "loss": 0.6786, - "step": 10410 - }, - { - "epoch": 0.4828465461288827, - "grad_norm": 3.564727306365967, - "learning_rate": 1.2322689425435401e-05, - "loss": 0.9322, - "step": 10415 - }, - { - "epoch": 0.48307834955957346, - "grad_norm": 5.660946846008301, - "learning_rate": 1.2314818488414083e-05, - "loss": 1.0264, - "step": 10420 - }, - { - "epoch": 0.48331015299026425, - "grad_norm": 4.12052583694458, - "learning_rate": 1.2306946035838946e-05, - "loss": 0.8962, - "step": 10425 - }, - { - "epoch": 0.48354195642095504, - "grad_norm": 4.541821479797363, - "learning_rate": 1.229907207286423e-05, - "loss": 1.1351, - "step": 10430 - }, - { - "epoch": 0.4837737598516458, - "grad_norm": 3.592571496963501, - "learning_rate": 1.2291196604645153e-05, - "loss": 0.839, - "step": 10435 - }, - { - "epoch": 0.48400556328233657, - "grad_norm": 3.7811429500579834, - "learning_rate": 1.2283319636337935e-05, - "loss": 0.9498, - "step": 10440 - }, - { - "epoch": 0.48423736671302736, - "grad_norm": 3.5306897163391113, - "learning_rate": 1.227544117309977e-05, - "loss": 0.7831, - "step": 10445 - }, - { - "epoch": 0.48446917014371815, - "grad_norm": 3.1777243614196777, - "learning_rate": 1.2267561220088829e-05, - "loss": 0.7042, - "step": 10450 - }, - { - "epoch": 0.4847009735744089, - "grad_norm": 3.2184152603149414, - "learning_rate": 1.225967978246426e-05, - "loss": 0.8678, - "step": 10455 - }, - { - "epoch": 0.4849327770050997, - "grad_norm": 4.353787422180176, - "learning_rate": 1.2251796865386185e-05, - "loss": 0.9821, - "step": 10460 - }, - { - "epoch": 0.48516458043579047, - "grad_norm": 4.005190372467041, - "learning_rate": 1.2243912474015698e-05, - "loss": 0.9116, - "step": 10465 - }, - { - "epoch": 0.4853963838664812, - "grad_norm": 3.815784454345703, - "learning_rate": 1.2236026613514846e-05, - "loss": 0.8607, - "step": 10470 - }, - { - "epoch": 0.485628187297172, - "grad_norm": 3.319671154022217, - "learning_rate": 1.222813928904665e-05, - "loss": 0.8951, - "step": 10475 - }, - { - "epoch": 0.4858599907278628, - "grad_norm": 3.4613735675811768, - "learning_rate": 1.2220250505775086e-05, - "loss": 0.8038, - "step": 10480 - }, - { - "epoch": 0.4860917941585535, - "grad_norm": 4.234585285186768, - "learning_rate": 1.2212360268865079e-05, - "loss": 0.9614, - "step": 10485 - }, - { - "epoch": 0.4863235975892443, - "grad_norm": 3.492586374282837, - "learning_rate": 1.2204468583482514e-05, - "loss": 0.816, - "step": 10490 - }, - { - "epoch": 0.4865554010199351, - "grad_norm": 3.3907053470611572, - "learning_rate": 1.2196575454794222e-05, - "loss": 1.0767, - "step": 10495 - }, - { - "epoch": 0.48678720445062584, - "grad_norm": 3.7171268463134766, - "learning_rate": 1.2188680887967977e-05, - "loss": 0.9849, - "step": 10500 - }, - { - "epoch": 0.48678720445062584, - "eval_loss": 0.9430806040763855, - "eval_runtime": 11.2641, - "eval_samples_per_second": 11.275, - "eval_steps_per_second": 11.275, - "step": 10500 - }, - { - "epoch": 0.48701900788131663, - "grad_norm": 3.030805826187134, - "learning_rate": 1.2180784888172494e-05, - "loss": 0.8955, - "step": 10505 - }, - { - "epoch": 0.4872508113120074, - "grad_norm": 4.06427001953125, - "learning_rate": 1.2172887460577432e-05, - "loss": 1.0308, - "step": 10510 - }, - { - "epoch": 0.4874826147426982, - "grad_norm": 3.5869393348693848, - "learning_rate": 1.2164988610353375e-05, - "loss": 0.9683, - "step": 10515 - }, - { - "epoch": 0.48771441817338895, - "grad_norm": 3.8638756275177, - "learning_rate": 1.215708834267185e-05, - "loss": 1.0335, - "step": 10520 - }, - { - "epoch": 0.48794622160407974, - "grad_norm": 3.455765724182129, - "learning_rate": 1.21491866627053e-05, - "loss": 0.7771, - "step": 10525 - }, - { - "epoch": 0.48817802503477054, - "grad_norm": 3.3024017810821533, - "learning_rate": 1.2141283575627105e-05, - "loss": 0.9882, - "step": 10530 - }, - { - "epoch": 0.48840982846546127, - "grad_norm": 3.9688782691955566, - "learning_rate": 1.2133379086611559e-05, - "loss": 0.9241, - "step": 10535 - }, - { - "epoch": 0.48864163189615206, - "grad_norm": 4.1364874839782715, - "learning_rate": 1.212547320083387e-05, - "loss": 0.838, - "step": 10540 - }, - { - "epoch": 0.48887343532684285, - "grad_norm": 3.659271717071533, - "learning_rate": 1.211756592347017e-05, - "loss": 0.883, - "step": 10545 - }, - { - "epoch": 0.4891052387575336, - "grad_norm": 3.7890548706054688, - "learning_rate": 1.2109657259697496e-05, - "loss": 0.9718, - "step": 10550 - }, - { - "epoch": 0.4893370421882244, - "grad_norm": 3.4658312797546387, - "learning_rate": 1.210174721469379e-05, - "loss": 0.916, - "step": 10555 - }, - { - "epoch": 0.48956884561891517, - "grad_norm": 3.9163401126861572, - "learning_rate": 1.2093835793637911e-05, - "loss": 0.8764, - "step": 10560 - }, - { - "epoch": 0.4898006490496059, - "grad_norm": 3.979099750518799, - "learning_rate": 1.2085923001709607e-05, - "loss": 1.046, - "step": 10565 - }, - { - "epoch": 0.4900324524802967, - "grad_norm": 4.127171039581299, - "learning_rate": 1.2078008844089523e-05, - "loss": 1.011, - "step": 10570 - }, - { - "epoch": 0.4902642559109875, - "grad_norm": 5.003758907318115, - "learning_rate": 1.2070093325959209e-05, - "loss": 0.9325, - "step": 10575 - }, - { - "epoch": 0.4904960593416783, - "grad_norm": 3.5403106212615967, - "learning_rate": 1.2062176452501093e-05, - "loss": 1.002, - "step": 10580 - }, - { - "epoch": 0.490727862772369, - "grad_norm": 3.0813117027282715, - "learning_rate": 1.2054258228898498e-05, - "loss": 0.7354, - "step": 10585 - }, - { - "epoch": 0.4909596662030598, - "grad_norm": 4.302268981933594, - "learning_rate": 1.204633866033563e-05, - "loss": 1.0057, - "step": 10590 - }, - { - "epoch": 0.4911914696337506, - "grad_norm": 3.079711437225342, - "learning_rate": 1.2038417751997575e-05, - "loss": 0.8065, - "step": 10595 - }, - { - "epoch": 0.49142327306444133, - "grad_norm": 4.152148246765137, - "learning_rate": 1.2030495509070295e-05, - "loss": 0.9679, - "step": 10600 - }, - { - "epoch": 0.49142327306444133, - "eval_loss": 0.9401566982269287, - "eval_runtime": 11.2741, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 10600 - }, - { - "epoch": 0.4916550764951321, - "grad_norm": 3.9036941528320312, - "learning_rate": 1.2022571936740626e-05, - "loss": 0.8449, - "step": 10605 - }, - { - "epoch": 0.4918868799258229, - "grad_norm": 3.7571048736572266, - "learning_rate": 1.201464704019628e-05, - "loss": 1.0602, - "step": 10610 - }, - { - "epoch": 0.49211868335651365, - "grad_norm": 4.245012283325195, - "learning_rate": 1.200672082462582e-05, - "loss": 0.9533, - "step": 10615 - }, - { - "epoch": 0.49235048678720444, - "grad_norm": 4.420340061187744, - "learning_rate": 1.1998793295218696e-05, - "loss": 0.8259, - "step": 10620 - }, - { - "epoch": 0.49258229021789524, - "grad_norm": 3.532597064971924, - "learning_rate": 1.1990864457165198e-05, - "loss": 0.9432, - "step": 10625 - }, - { - "epoch": 0.49281409364858597, - "grad_norm": 3.4388935565948486, - "learning_rate": 1.1982934315656487e-05, - "loss": 0.7395, - "step": 10630 - }, - { - "epoch": 0.49304589707927676, - "grad_norm": 3.3850276470184326, - "learning_rate": 1.1975002875884565e-05, - "loss": 0.9985, - "step": 10635 - }, - { - "epoch": 0.49327770050996755, - "grad_norm": 4.3244099617004395, - "learning_rate": 1.1967070143042291e-05, - "loss": 0.9041, - "step": 10640 - }, - { - "epoch": 0.49350950394065835, - "grad_norm": 3.7923336029052734, - "learning_rate": 1.1959136122323376e-05, - "loss": 0.7961, - "step": 10645 - }, - { - "epoch": 0.4937413073713491, - "grad_norm": 4.479656219482422, - "learning_rate": 1.195120081892236e-05, - "loss": 1.0191, - "step": 10650 - }, - { - "epoch": 0.49397311080203987, - "grad_norm": 4.143479347229004, - "learning_rate": 1.1943264238034633e-05, - "loss": 0.9074, - "step": 10655 - }, - { - "epoch": 0.49420491423273066, - "grad_norm": 4.062614917755127, - "learning_rate": 1.193532638485642e-05, - "loss": 0.7747, - "step": 10660 - }, - { - "epoch": 0.4944367176634214, - "grad_norm": 3.4529573917388916, - "learning_rate": 1.1927387264584778e-05, - "loss": 0.9046, - "step": 10665 - }, - { - "epoch": 0.4946685210941122, - "grad_norm": 3.946471929550171, - "learning_rate": 1.1919446882417595e-05, - "loss": 1.1305, - "step": 10670 - }, - { - "epoch": 0.494900324524803, - "grad_norm": 4.636277675628662, - "learning_rate": 1.191150524355358e-05, - "loss": 0.8582, - "step": 10675 - }, - { - "epoch": 0.4951321279554937, - "grad_norm": 3.944178819656372, - "learning_rate": 1.1903562353192273e-05, - "loss": 0.9054, - "step": 10680 - }, - { - "epoch": 0.4953639313861845, - "grad_norm": 3.4163365364074707, - "learning_rate": 1.1895618216534026e-05, - "loss": 0.9238, - "step": 10685 - }, - { - "epoch": 0.4955957348168753, - "grad_norm": 3.241332769393921, - "learning_rate": 1.1887672838780009e-05, - "loss": 0.8977, - "step": 10690 - }, - { - "epoch": 0.4958275382475661, - "grad_norm": 3.3289830684661865, - "learning_rate": 1.1879726225132208e-05, - "loss": 0.8232, - "step": 10695 - }, - { - "epoch": 0.4960593416782568, - "grad_norm": 4.045788288116455, - "learning_rate": 1.1871778380793418e-05, - "loss": 0.9514, - "step": 10700 - }, - { - "epoch": 0.4960593416782568, - "eval_loss": 0.9392388463020325, - "eval_runtime": 11.2754, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 10700 - }, - { - "epoch": 0.4962911451089476, - "grad_norm": 3.331512928009033, - "learning_rate": 1.1863829310967232e-05, - "loss": 0.9257, - "step": 10705 - }, - { - "epoch": 0.4965229485396384, - "grad_norm": 4.671146392822266, - "learning_rate": 1.1855879020858053e-05, - "loss": 0.8757, - "step": 10710 - }, - { - "epoch": 0.49675475197032914, - "grad_norm": 4.140007019042969, - "learning_rate": 1.1847927515671081e-05, - "loss": 0.9834, - "step": 10715 - }, - { - "epoch": 0.49698655540101994, - "grad_norm": 3.917147636413574, - "learning_rate": 1.183997480061231e-05, - "loss": 0.9667, - "step": 10720 - }, - { - "epoch": 0.4972183588317107, - "grad_norm": 3.7735893726348877, - "learning_rate": 1.1832020880888528e-05, - "loss": 0.9726, - "step": 10725 - }, - { - "epoch": 0.49745016226240146, - "grad_norm": 4.691269397735596, - "learning_rate": 1.1824065761707315e-05, - "loss": 1.0185, - "step": 10730 - }, - { - "epoch": 0.49768196569309225, - "grad_norm": 3.9918999671936035, - "learning_rate": 1.1816109448277029e-05, - "loss": 0.8937, - "step": 10735 - }, - { - "epoch": 0.49791376912378305, - "grad_norm": 3.6287760734558105, - "learning_rate": 1.1808151945806807e-05, - "loss": 0.9186, - "step": 10740 - }, - { - "epoch": 0.4981455725544738, - "grad_norm": 3.8710758686065674, - "learning_rate": 1.1800193259506585e-05, - "loss": 0.8035, - "step": 10745 - }, - { - "epoch": 0.4983773759851646, - "grad_norm": 4.250580787658691, - "learning_rate": 1.1792233394587047e-05, - "loss": 0.9368, - "step": 10750 - }, - { - "epoch": 0.49860917941585536, - "grad_norm": 4.388089656829834, - "learning_rate": 1.1784272356259669e-05, - "loss": 1.0037, - "step": 10755 - }, - { - "epoch": 0.49884098284654615, - "grad_norm": 3.610869884490967, - "learning_rate": 1.1776310149736682e-05, - "loss": 0.8208, - "step": 10760 - }, - { - "epoch": 0.4990727862772369, - "grad_norm": 4.281845569610596, - "learning_rate": 1.1768346780231092e-05, - "loss": 0.891, - "step": 10765 - }, - { - "epoch": 0.4993045897079277, - "grad_norm": 3.3298609256744385, - "learning_rate": 1.1760382252956663e-05, - "loss": 0.7928, - "step": 10770 - }, - { - "epoch": 0.4995363931386185, - "grad_norm": 3.43632173538208, - "learning_rate": 1.1752416573127912e-05, - "loss": 0.9587, - "step": 10775 - }, - { - "epoch": 0.4997681965693092, - "grad_norm": 3.8259291648864746, - "learning_rate": 1.1744449745960119e-05, - "loss": 0.8569, - "step": 10780 - }, - { - "epoch": 0.5, - "grad_norm": 4.613092422485352, - "learning_rate": 1.1736481776669307e-05, - "loss": 0.9346, - "step": 10785 - }, - { - "epoch": 0.5002318034306907, - "grad_norm": 3.216547966003418, - "learning_rate": 1.1728512670472249e-05, - "loss": 0.8897, - "step": 10790 - }, - { - "epoch": 0.5004636068613816, - "grad_norm": 3.9909191131591797, - "learning_rate": 1.172054243258647e-05, - "loss": 1.0385, - "step": 10795 - }, - { - "epoch": 0.5006954102920723, - "grad_norm": 3.6476809978485107, - "learning_rate": 1.1712571068230228e-05, - "loss": 0.8202, - "step": 10800 - }, - { - "epoch": 0.5006954102920723, - "eval_loss": 0.9371610879898071, - "eval_runtime": 11.2585, - "eval_samples_per_second": 11.28, - "eval_steps_per_second": 11.28, - "step": 10800 - }, - { - "epoch": 0.500927213722763, - "grad_norm": 4.106610298156738, - "learning_rate": 1.1704598582622517e-05, - "loss": 0.9352, - "step": 10805 - }, - { - "epoch": 0.5011590171534539, - "grad_norm": 3.3235268592834473, - "learning_rate": 1.169662498098307e-05, - "loss": 0.7574, - "step": 10810 - }, - { - "epoch": 0.5013908205841446, - "grad_norm": 3.4809799194335938, - "learning_rate": 1.1688650268532353e-05, - "loss": 0.8316, - "step": 10815 - }, - { - "epoch": 0.5016226240148354, - "grad_norm": 4.205314636230469, - "learning_rate": 1.168067445049155e-05, - "loss": 0.9359, - "step": 10820 - }, - { - "epoch": 0.5018544274455262, - "grad_norm": 3.303571939468384, - "learning_rate": 1.1672697532082578e-05, - "loss": 0.8145, - "step": 10825 - }, - { - "epoch": 0.502086230876217, - "grad_norm": 4.021134853363037, - "learning_rate": 1.1664719518528068e-05, - "loss": 0.8672, - "step": 10830 - }, - { - "epoch": 0.5023180343069077, - "grad_norm": 3.5206027030944824, - "learning_rate": 1.1656740415051373e-05, - "loss": 0.9901, - "step": 10835 - }, - { - "epoch": 0.5025498377375985, - "grad_norm": 3.848743438720703, - "learning_rate": 1.1648760226876559e-05, - "loss": 0.87, - "step": 10840 - }, - { - "epoch": 0.5027816411682893, - "grad_norm": 4.378417015075684, - "learning_rate": 1.1640778959228396e-05, - "loss": 1.0345, - "step": 10845 - }, - { - "epoch": 0.5030134445989801, - "grad_norm": 3.704587697982788, - "learning_rate": 1.1632796617332366e-05, - "loss": 1.0267, - "step": 10850 - }, - { - "epoch": 0.5032452480296709, - "grad_norm": 4.409409046173096, - "learning_rate": 1.1624813206414658e-05, - "loss": 1.0559, - "step": 10855 - }, - { - "epoch": 0.5034770514603616, - "grad_norm": 3.70389986038208, - "learning_rate": 1.1616828731702153e-05, - "loss": 0.9175, - "step": 10860 - }, - { - "epoch": 0.5037088548910524, - "grad_norm": 3.591284990310669, - "learning_rate": 1.1608843198422432e-05, - "loss": 0.9618, - "step": 10865 - }, - { - "epoch": 0.5039406583217432, - "grad_norm": 4.028764247894287, - "learning_rate": 1.1600856611803769e-05, - "loss": 1.0365, - "step": 10870 - }, - { - "epoch": 0.5041724617524339, - "grad_norm": 3.5418801307678223, - "learning_rate": 1.1592868977075132e-05, - "loss": 0.9485, - "step": 10875 - }, - { - "epoch": 0.5044042651831248, - "grad_norm": 4.499399662017822, - "learning_rate": 1.158488029946616e-05, - "loss": 1.0452, - "step": 10880 - }, - { - "epoch": 0.5046360686138155, - "grad_norm": 3.686279058456421, - "learning_rate": 1.15768905842072e-05, - "loss": 0.7959, - "step": 10885 - }, - { - "epoch": 0.5048678720445062, - "grad_norm": 3.655107021331787, - "learning_rate": 1.1568899836529255e-05, - "loss": 0.8616, - "step": 10890 - }, - { - "epoch": 0.5050996754751971, - "grad_norm": 3.7078745365142822, - "learning_rate": 1.1560908061664014e-05, - "loss": 0.925, - "step": 10895 - }, - { - "epoch": 0.5053314789058878, - "grad_norm": 3.763909101486206, - "learning_rate": 1.1552915264843838e-05, - "loss": 0.8868, - "step": 10900 - }, - { - "epoch": 0.5053314789058878, - "eval_loss": 0.9361811280250549, - "eval_runtime": 11.2635, - "eval_samples_per_second": 11.275, - "eval_steps_per_second": 11.275, - "step": 10900 - }, - { - "epoch": 0.5055632823365785, - "grad_norm": 4.104317665100098, - "learning_rate": 1.1544921451301759e-05, - "loss": 0.8438, - "step": 10905 - }, - { - "epoch": 0.5057950857672694, - "grad_norm": 3.793743848800659, - "learning_rate": 1.153692662627147e-05, - "loss": 1.0021, - "step": 10910 - }, - { - "epoch": 0.5060268891979601, - "grad_norm": 3.819781541824341, - "learning_rate": 1.1528930794987329e-05, - "loss": 1.0518, - "step": 10915 - }, - { - "epoch": 0.5062586926286509, - "grad_norm": 3.5014266967773438, - "learning_rate": 1.1520933962684353e-05, - "loss": 0.7188, - "step": 10920 - }, - { - "epoch": 0.5064904960593417, - "grad_norm": 4.050322532653809, - "learning_rate": 1.1512936134598214e-05, - "loss": 0.9117, - "step": 10925 - }, - { - "epoch": 0.5067222994900324, - "grad_norm": 4.244816303253174, - "learning_rate": 1.1504937315965233e-05, - "loss": 0.9438, - "step": 10930 - }, - { - "epoch": 0.5069541029207232, - "grad_norm": 4.170376300811768, - "learning_rate": 1.1496937512022388e-05, - "loss": 1.0061, - "step": 10935 - }, - { - "epoch": 0.507185906351414, - "grad_norm": 3.4303226470947266, - "learning_rate": 1.148893672800729e-05, - "loss": 0.8894, - "step": 10940 - }, - { - "epoch": 0.5074177097821048, - "grad_norm": 4.357320308685303, - "learning_rate": 1.1480934969158197e-05, - "loss": 1.0696, - "step": 10945 - }, - { - "epoch": 0.5076495132127955, - "grad_norm": 3.495048999786377, - "learning_rate": 1.1472932240714017e-05, - "loss": 0.8701, - "step": 10950 - }, - { - "epoch": 0.5078813166434863, - "grad_norm": 3.7668962478637695, - "learning_rate": 1.1464928547914267e-05, - "loss": 0.8798, - "step": 10955 - }, - { - "epoch": 0.5081131200741771, - "grad_norm": 3.035940647125244, - "learning_rate": 1.1456923895999124e-05, - "loss": 0.8343, - "step": 10960 - }, - { - "epoch": 0.5083449235048678, - "grad_norm": 4.260811805725098, - "learning_rate": 1.144891829020937e-05, - "loss": 0.855, - "step": 10965 - }, - { - "epoch": 0.5085767269355587, - "grad_norm": 3.065490484237671, - "learning_rate": 1.1440911735786426e-05, - "loss": 1.0546, - "step": 10970 - }, - { - "epoch": 0.5088085303662494, - "grad_norm": 3.277255058288574, - "learning_rate": 1.1432904237972329e-05, - "loss": 0.9556, - "step": 10975 - }, - { - "epoch": 0.5090403337969402, - "grad_norm": 3.772494077682495, - "learning_rate": 1.142489580200973e-05, - "loss": 0.9838, - "step": 10980 - }, - { - "epoch": 0.509272137227631, - "grad_norm": 3.4952893257141113, - "learning_rate": 1.1416886433141903e-05, - "loss": 0.9328, - "step": 10985 - }, - { - "epoch": 0.5095039406583217, - "grad_norm": 4.249672889709473, - "learning_rate": 1.1408876136612722e-05, - "loss": 0.8854, - "step": 10990 - }, - { - "epoch": 0.5097357440890126, - "grad_norm": 3.2859761714935303, - "learning_rate": 1.1400864917666679e-05, - "loss": 0.8774, - "step": 10995 - }, - { - "epoch": 0.5099675475197033, - "grad_norm": 3.6365582942962646, - "learning_rate": 1.139285278154886e-05, - "loss": 0.8254, - "step": 11000 - }, - { - "epoch": 0.5099675475197033, - "eval_loss": 0.9347610473632812, - "eval_runtime": 11.2667, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 11000 - }, - { - "epoch": 0.510199350950394, - "grad_norm": 3.846269369125366, - "learning_rate": 1.138483973350496e-05, - "loss": 0.9616, - "step": 11005 - }, - { - "epoch": 0.5104311543810849, - "grad_norm": 3.8627161979675293, - "learning_rate": 1.1376825778781266e-05, - "loss": 1.0072, - "step": 11010 - }, - { - "epoch": 0.5106629578117756, - "grad_norm": 3.971733808517456, - "learning_rate": 1.136881092262466e-05, - "loss": 0.9459, - "step": 11015 - }, - { - "epoch": 0.5108947612424664, - "grad_norm": 3.493520498275757, - "learning_rate": 1.1360795170282619e-05, - "loss": 0.8961, - "step": 11020 - }, - { - "epoch": 0.5111265646731572, - "grad_norm": 3.7606539726257324, - "learning_rate": 1.1352778527003194e-05, - "loss": 0.9448, - "step": 11025 - }, - { - "epoch": 0.5113583681038479, - "grad_norm": 11.092731475830078, - "learning_rate": 1.1344760998035031e-05, - "loss": 1.0258, - "step": 11030 - }, - { - "epoch": 0.5115901715345387, - "grad_norm": 4.570170879364014, - "learning_rate": 1.1336742588627358e-05, - "loss": 0.8798, - "step": 11035 - }, - { - "epoch": 0.5118219749652295, - "grad_norm": 3.612354278564453, - "learning_rate": 1.1328723304029962e-05, - "loss": 0.8494, - "step": 11040 - }, - { - "epoch": 0.5120537783959203, - "grad_norm": 3.9332404136657715, - "learning_rate": 1.1320703149493224e-05, - "loss": 0.9113, - "step": 11045 - }, - { - "epoch": 0.512285581826611, - "grad_norm": 3.531489133834839, - "learning_rate": 1.1312682130268082e-05, - "loss": 0.9557, - "step": 11050 - }, - { - "epoch": 0.5125173852573018, - "grad_norm": 3.8228137493133545, - "learning_rate": 1.1304660251606044e-05, - "loss": 0.7984, - "step": 11055 - }, - { - "epoch": 0.5127491886879926, - "grad_norm": 4.060273170471191, - "learning_rate": 1.129663751875918e-05, - "loss": 0.8869, - "step": 11060 - }, - { - "epoch": 0.5129809921186833, - "grad_norm": 3.425370931625366, - "learning_rate": 1.1288613936980116e-05, - "loss": 0.6703, - "step": 11065 - }, - { - "epoch": 0.5132127955493742, - "grad_norm": 3.5954689979553223, - "learning_rate": 1.1280589511522041e-05, - "loss": 0.7522, - "step": 11070 - }, - { - "epoch": 0.5134445989800649, - "grad_norm": 3.8219289779663086, - "learning_rate": 1.127256424763869e-05, - "loss": 1.0325, - "step": 11075 - }, - { - "epoch": 0.5136764024107556, - "grad_norm": 5.733139991760254, - "learning_rate": 1.1264538150584356e-05, - "loss": 1.0223, - "step": 11080 - }, - { - "epoch": 0.5139082058414465, - "grad_norm": 4.2980170249938965, - "learning_rate": 1.1256511225613863e-05, - "loss": 0.9425, - "step": 11085 - }, - { - "epoch": 0.5141400092721372, - "grad_norm": 5.1117024421691895, - "learning_rate": 1.1248483477982584e-05, - "loss": 0.962, - "step": 11090 - }, - { - "epoch": 0.5143718127028281, - "grad_norm": 3.2566781044006348, - "learning_rate": 1.1240454912946442e-05, - "loss": 0.9778, - "step": 11095 - }, - { - "epoch": 0.5146036161335188, - "grad_norm": 4.791970252990723, - "learning_rate": 1.1232425535761872e-05, - "loss": 0.9794, - "step": 11100 - }, - { - "epoch": 0.5146036161335188, - "eval_loss": 0.9343388676643372, - "eval_runtime": 11.267, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 11100 - }, - { - "epoch": 0.5148354195642095, - "grad_norm": 3.983933687210083, - "learning_rate": 1.1224395351685862e-05, - "loss": 1.025, - "step": 11105 - }, - { - "epoch": 0.5150672229949004, - "grad_norm": 3.460850954055786, - "learning_rate": 1.1216364365975918e-05, - "loss": 0.7973, - "step": 11110 - }, - { - "epoch": 0.5152990264255911, - "grad_norm": 3.478369951248169, - "learning_rate": 1.1208332583890072e-05, - "loss": 0.8024, - "step": 11115 - }, - { - "epoch": 0.5155308298562818, - "grad_norm": 3.2050466537475586, - "learning_rate": 1.1200300010686878e-05, - "loss": 0.926, - "step": 11120 - }, - { - "epoch": 0.5157626332869727, - "grad_norm": 3.6445305347442627, - "learning_rate": 1.1192266651625406e-05, - "loss": 1.006, - "step": 11125 - }, - { - "epoch": 0.5159944367176634, - "grad_norm": 3.9382009506225586, - "learning_rate": 1.1184232511965244e-05, - "loss": 0.8803, - "step": 11130 - }, - { - "epoch": 0.5162262401483542, - "grad_norm": 4.49378776550293, - "learning_rate": 1.117619759696649e-05, - "loss": 1.0766, - "step": 11135 - }, - { - "epoch": 0.516458043579045, - "grad_norm": 3.932445764541626, - "learning_rate": 1.1168161911889753e-05, - "loss": 0.9086, - "step": 11140 - }, - { - "epoch": 0.5166898470097357, - "grad_norm": 3.898226022720337, - "learning_rate": 1.1160125461996137e-05, - "loss": 0.982, - "step": 11145 - }, - { - "epoch": 0.5169216504404265, - "grad_norm": 3.8211286067962646, - "learning_rate": 1.1152088252547249e-05, - "loss": 1.0433, - "step": 11150 - }, - { - "epoch": 0.5171534538711173, - "grad_norm": 3.4513728618621826, - "learning_rate": 1.1144050288805206e-05, - "loss": 0.8361, - "step": 11155 - }, - { - "epoch": 0.5173852573018081, - "grad_norm": 3.6396796703338623, - "learning_rate": 1.11360115760326e-05, - "loss": 0.8701, - "step": 11160 - }, - { - "epoch": 0.5176170607324988, - "grad_norm": 3.4153990745544434, - "learning_rate": 1.112797211949253e-05, - "loss": 0.8483, - "step": 11165 - }, - { - "epoch": 0.5178488641631896, - "grad_norm": 3.43082857131958, - "learning_rate": 1.1119931924448569e-05, - "loss": 0.7802, - "step": 11170 - }, - { - "epoch": 0.5180806675938804, - "grad_norm": 3.3968653678894043, - "learning_rate": 1.1111890996164782e-05, - "loss": 1.055, - "step": 11175 - }, - { - "epoch": 0.5183124710245711, - "grad_norm": 3.6078455448150635, - "learning_rate": 1.1103849339905712e-05, - "loss": 0.9291, - "step": 11180 - }, - { - "epoch": 0.518544274455262, - "grad_norm": 4.045047283172607, - "learning_rate": 1.1095806960936378e-05, - "loss": 1.0033, - "step": 11185 - }, - { - "epoch": 0.5187760778859527, - "grad_norm": 3.687140703201294, - "learning_rate": 1.1087763864522268e-05, - "loss": 0.7571, - "step": 11190 - }, - { - "epoch": 0.5190078813166434, - "grad_norm": 3.655649185180664, - "learning_rate": 1.1079720055929346e-05, - "loss": 0.8994, - "step": 11195 - }, - { - "epoch": 0.5192396847473343, - "grad_norm": 4.835019111633301, - "learning_rate": 1.107167554042404e-05, - "loss": 0.8423, - "step": 11200 - }, - { - "epoch": 0.5192396847473343, - "eval_loss": 0.9321886897087097, - "eval_runtime": 11.2611, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 11200 - }, - { - "epoch": 0.519471488178025, - "grad_norm": 3.382221221923828, - "learning_rate": 1.1063630323273242e-05, - "loss": 0.9049, - "step": 11205 - }, - { - "epoch": 0.5197032916087158, - "grad_norm": 3.561631679534912, - "learning_rate": 1.10555844097443e-05, - "loss": 1.004, - "step": 11210 - }, - { - "epoch": 0.5199350950394066, - "grad_norm": 3.8593273162841797, - "learning_rate": 1.1047537805105025e-05, - "loss": 0.942, - "step": 11215 - }, - { - "epoch": 0.5201668984700973, - "grad_norm": 3.7144737243652344, - "learning_rate": 1.1039490514623665e-05, - "loss": 0.8805, - "step": 11220 - }, - { - "epoch": 0.5203987019007882, - "grad_norm": 4.249030590057373, - "learning_rate": 1.1031442543568943e-05, - "loss": 0.791, - "step": 11225 - }, - { - "epoch": 0.5206305053314789, - "grad_norm": 3.3096988201141357, - "learning_rate": 1.1023393897209996e-05, - "loss": 0.8479, - "step": 11230 - }, - { - "epoch": 0.5208623087621697, - "grad_norm": 4.292798042297363, - "learning_rate": 1.101534458081643e-05, - "loss": 1.0426, - "step": 11235 - }, - { - "epoch": 0.5210941121928605, - "grad_norm": 4.003423690795898, - "learning_rate": 1.1007294599658278e-05, - "loss": 0.97, - "step": 11240 - }, - { - "epoch": 0.5213259156235512, - "grad_norm": 3.7791945934295654, - "learning_rate": 1.0999243959006002e-05, - "loss": 0.8429, - "step": 11245 - }, - { - "epoch": 0.521557719054242, - "grad_norm": 4.104649543762207, - "learning_rate": 1.0991192664130513e-05, - "loss": 0.8678, - "step": 11250 - }, - { - "epoch": 0.5217895224849328, - "grad_norm": 4.052036285400391, - "learning_rate": 1.0983140720303136e-05, - "loss": 0.8715, - "step": 11255 - }, - { - "epoch": 0.5220213259156236, - "grad_norm": 3.6360881328582764, - "learning_rate": 1.0975088132795623e-05, - "loss": 0.9221, - "step": 11260 - }, - { - "epoch": 0.5222531293463143, - "grad_norm": 3.4455959796905518, - "learning_rate": 1.0967034906880151e-05, - "loss": 0.8846, - "step": 11265 - }, - { - "epoch": 0.5224849327770051, - "grad_norm": 4.016299724578857, - "learning_rate": 1.0958981047829317e-05, - "loss": 0.8542, - "step": 11270 - }, - { - "epoch": 0.5227167362076959, - "grad_norm": 3.5622754096984863, - "learning_rate": 1.0950926560916126e-05, - "loss": 0.7955, - "step": 11275 - }, - { - "epoch": 0.5229485396383866, - "grad_norm": 4.078085422515869, - "learning_rate": 1.0942871451413994e-05, - "loss": 0.9445, - "step": 11280 - }, - { - "epoch": 0.5231803430690775, - "grad_norm": 3.5976459980010986, - "learning_rate": 1.0934815724596759e-05, - "loss": 0.8949, - "step": 11285 - }, - { - "epoch": 0.5234121464997682, - "grad_norm": 4.441613674163818, - "learning_rate": 1.092675938573864e-05, - "loss": 0.8814, - "step": 11290 - }, - { - "epoch": 0.5236439499304589, - "grad_norm": 4.4883904457092285, - "learning_rate": 1.0918702440114272e-05, - "loss": 0.8916, - "step": 11295 - }, - { - "epoch": 0.5238757533611498, - "grad_norm": 3.8522751331329346, - "learning_rate": 1.091064489299869e-05, - "loss": 0.8884, - "step": 11300 - }, - { - "epoch": 0.5238757533611498, - "eval_loss": 0.9312301874160767, - "eval_runtime": 11.2698, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 11300 - }, - { - "epoch": 0.5241075567918405, - "grad_norm": 3.6490557193756104, - "learning_rate": 1.0902586749667308e-05, - "loss": 0.8504, - "step": 11305 - }, - { - "epoch": 0.5243393602225312, - "grad_norm": 3.075930595397949, - "learning_rate": 1.0894528015395946e-05, - "loss": 0.7918, - "step": 11310 - }, - { - "epoch": 0.5245711636532221, - "grad_norm": 3.6393370628356934, - "learning_rate": 1.0886468695460802e-05, - "loss": 1.0254, - "step": 11315 - }, - { - "epoch": 0.5248029670839128, - "grad_norm": 4.083436012268066, - "learning_rate": 1.0878408795138458e-05, - "loss": 0.7521, - "step": 11320 - }, - { - "epoch": 0.5250347705146036, - "grad_norm": 4.432519912719727, - "learning_rate": 1.087034831970588e-05, - "loss": 0.948, - "step": 11325 - }, - { - "epoch": 0.5252665739452944, - "grad_norm": 3.2188823223114014, - "learning_rate": 1.0862287274440407e-05, - "loss": 0.9685, - "step": 11330 - }, - { - "epoch": 0.5254983773759851, - "grad_norm": 3.748762607574463, - "learning_rate": 1.085422566461975e-05, - "loss": 0.9177, - "step": 11335 - }, - { - "epoch": 0.5257301808066759, - "grad_norm": 3.7646892070770264, - "learning_rate": 1.0846163495521992e-05, - "loss": 0.9478, - "step": 11340 - }, - { - "epoch": 0.5259619842373667, - "grad_norm": 3.686171531677246, - "learning_rate": 1.0838100772425591e-05, - "loss": 0.975, - "step": 11345 - }, - { - "epoch": 0.5261937876680575, - "grad_norm": 5.668167591094971, - "learning_rate": 1.0830037500609348e-05, - "loss": 0.9199, - "step": 11350 - }, - { - "epoch": 0.5264255910987483, - "grad_norm": 3.6817402839660645, - "learning_rate": 1.0821973685352432e-05, - "loss": 0.9086, - "step": 11355 - }, - { - "epoch": 0.526657394529439, - "grad_norm": 3.8931546211242676, - "learning_rate": 1.0813909331934383e-05, - "loss": 0.879, - "step": 11360 - }, - { - "epoch": 0.5268891979601298, - "grad_norm": 3.524632215499878, - "learning_rate": 1.0805844445635065e-05, - "loss": 0.7434, - "step": 11365 - }, - { - "epoch": 0.5271210013908206, - "grad_norm": 3.135315418243408, - "learning_rate": 1.0797779031734716e-05, - "loss": 0.785, - "step": 11370 - }, - { - "epoch": 0.5273528048215114, - "grad_norm": 4.795115947723389, - "learning_rate": 1.0789713095513905e-05, - "loss": 0.9284, - "step": 11375 - }, - { - "epoch": 0.5275846082522021, - "grad_norm": 3.631051540374756, - "learning_rate": 1.078164664225355e-05, - "loss": 0.8551, - "step": 11380 - }, - { - "epoch": 0.527816411682893, - "grad_norm": 3.488373279571533, - "learning_rate": 1.0773579677234904e-05, - "loss": 0.8433, - "step": 11385 - }, - { - "epoch": 0.5280482151135837, - "grad_norm": 3.9285202026367188, - "learning_rate": 1.0765512205739554e-05, - "loss": 0.8897, - "step": 11390 - }, - { - "epoch": 0.5282800185442744, - "grad_norm": 3.3522801399230957, - "learning_rate": 1.0757444233049421e-05, - "loss": 0.8434, - "step": 11395 - }, - { - "epoch": 0.5285118219749653, - "grad_norm": 3.6392805576324463, - "learning_rate": 1.0749375764446756e-05, - "loss": 0.8998, - "step": 11400 - }, - { - "epoch": 0.5285118219749653, - "eval_loss": 0.9297853112220764, - "eval_runtime": 11.2781, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, - "step": 11400 - }, - { - "epoch": 0.528743625405656, - "grad_norm": 3.962904930114746, - "learning_rate": 1.074130680521413e-05, - "loss": 0.8843, - "step": 11405 - }, - { - "epoch": 0.5289754288363467, - "grad_norm": 4.593536853790283, - "learning_rate": 1.073323736063444e-05, - "loss": 0.8648, - "step": 11410 - }, - { - "epoch": 0.5292072322670376, - "grad_norm": 4.164706230163574, - "learning_rate": 1.0725167435990894e-05, - "loss": 0.8713, - "step": 11415 - }, - { - "epoch": 0.5294390356977283, - "grad_norm": 4.113262176513672, - "learning_rate": 1.0717097036567025e-05, - "loss": 0.8647, - "step": 11420 - }, - { - "epoch": 0.5296708391284191, - "grad_norm": 4.011596202850342, - "learning_rate": 1.0709026167646661e-05, - "loss": 0.9141, - "step": 11425 - }, - { - "epoch": 0.5299026425591099, - "grad_norm": 4.122656345367432, - "learning_rate": 1.0700954834513957e-05, - "loss": 0.8079, - "step": 11430 - }, - { - "epoch": 0.5301344459898006, - "grad_norm": 3.779600143432617, - "learning_rate": 1.069288304245336e-05, - "loss": 0.8531, - "step": 11435 - }, - { - "epoch": 0.5303662494204914, - "grad_norm": 3.971891164779663, - "learning_rate": 1.0684810796749611e-05, - "loss": 0.909, - "step": 11440 - }, - { - "epoch": 0.5305980528511822, - "grad_norm": 8.932957649230957, - "learning_rate": 1.0676738102687767e-05, - "loss": 1.0013, - "step": 11445 - }, - { - "epoch": 0.530829856281873, - "grad_norm": 3.974130153656006, - "learning_rate": 1.0668664965553156e-05, - "loss": 1.0049, - "step": 11450 - }, - { - "epoch": 0.5310616597125637, - "grad_norm": 3.069875478744507, - "learning_rate": 1.066059139063142e-05, - "loss": 0.9278, - "step": 11455 - }, - { - "epoch": 0.5312934631432545, - "grad_norm": 3.6674485206604004, - "learning_rate": 1.065251738320847e-05, - "loss": 0.707, - "step": 11460 - }, - { - "epoch": 0.5315252665739453, - "grad_norm": 3.577423334121704, - "learning_rate": 1.0644442948570506e-05, - "loss": 0.8667, - "step": 11465 - }, - { - "epoch": 0.5317570700046361, - "grad_norm": 4.023151874542236, - "learning_rate": 1.0636368092004005e-05, - "loss": 1.0108, - "step": 11470 - }, - { - "epoch": 0.5319888734353269, - "grad_norm": 3.7723028659820557, - "learning_rate": 1.0628292818795726e-05, - "loss": 0.9597, - "step": 11475 - }, - { - "epoch": 0.5322206768660176, - "grad_norm": 3.417649984359741, - "learning_rate": 1.0620217134232698e-05, - "loss": 0.8539, - "step": 11480 - }, - { - "epoch": 0.5324524802967084, - "grad_norm": 4.218043804168701, - "learning_rate": 1.0612141043602215e-05, - "loss": 0.8892, - "step": 11485 - }, - { - "epoch": 0.5326842837273992, - "grad_norm": 4.164142608642578, - "learning_rate": 1.0604064552191846e-05, - "loss": 0.9493, - "step": 11490 - }, - { - "epoch": 0.5329160871580899, - "grad_norm": 5.829555511474609, - "learning_rate": 1.059598766528941e-05, - "loss": 0.908, - "step": 11495 - }, - { - "epoch": 0.5331478905887808, - "grad_norm": 4.448476791381836, - "learning_rate": 1.0587910388182995e-05, - "loss": 0.8947, - "step": 11500 - }, - { - "epoch": 0.5331478905887808, - "eval_loss": 0.9298225045204163, - "eval_runtime": 11.2793, - "eval_samples_per_second": 11.26, - "eval_steps_per_second": 11.26, - "step": 11500 - }, - { - "epoch": 0.5333796940194715, - "grad_norm": 3.3360562324523926, - "learning_rate": 1.0579832726160948e-05, - "loss": 0.7823, - "step": 11505 - }, - { - "epoch": 0.5336114974501622, - "grad_norm": 3.199880599975586, - "learning_rate": 1.057175468451185e-05, - "loss": 0.8111, - "step": 11510 - }, - { - "epoch": 0.5338433008808531, - "grad_norm": 3.1434779167175293, - "learning_rate": 1.0563676268524551e-05, - "loss": 0.6705, - "step": 11515 - }, - { - "epoch": 0.5340751043115438, - "grad_norm": 3.9120616912841797, - "learning_rate": 1.0555597483488132e-05, - "loss": 1.1042, - "step": 11520 - }, - { - "epoch": 0.5343069077422345, - "grad_norm": 3.9502615928649902, - "learning_rate": 1.0547518334691924e-05, - "loss": 0.9414, - "step": 11525 - }, - { - "epoch": 0.5345387111729254, - "grad_norm": 3.2604763507843018, - "learning_rate": 1.053943882742549e-05, - "loss": 0.7735, - "step": 11530 - }, - { - "epoch": 0.5347705146036161, - "grad_norm": 3.5365970134735107, - "learning_rate": 1.053135896697863e-05, - "loss": 0.7848, - "step": 11535 - }, - { - "epoch": 0.5350023180343069, - "grad_norm": 3.263577461242676, - "learning_rate": 1.052327875864138e-05, - "loss": 0.7864, - "step": 11540 - }, - { - "epoch": 0.5352341214649977, - "grad_norm": 3.6061646938323975, - "learning_rate": 1.0515198207703997e-05, - "loss": 0.8842, - "step": 11545 - }, - { - "epoch": 0.5354659248956884, - "grad_norm": 3.667140245437622, - "learning_rate": 1.0507117319456965e-05, - "loss": 0.9251, - "step": 11550 - }, - { - "epoch": 0.5356977283263792, - "grad_norm": 3.3890786170959473, - "learning_rate": 1.0499036099190986e-05, - "loss": 0.8562, - "step": 11555 - }, - { - "epoch": 0.53592953175707, - "grad_norm": 3.4232475757598877, - "learning_rate": 1.0490954552196986e-05, - "loss": 0.7744, - "step": 11560 - }, - { - "epoch": 0.5361613351877608, - "grad_norm": 4.001134395599365, - "learning_rate": 1.0482872683766102e-05, - "loss": 0.7914, - "step": 11565 - }, - { - "epoch": 0.5363931386184515, - "grad_norm": 3.9395151138305664, - "learning_rate": 1.0474790499189675e-05, - "loss": 0.9072, - "step": 11570 - }, - { - "epoch": 0.5366249420491424, - "grad_norm": 4.937548637390137, - "learning_rate": 1.0466708003759262e-05, - "loss": 0.8502, - "step": 11575 - }, - { - "epoch": 0.5368567454798331, - "grad_norm": 4.098718166351318, - "learning_rate": 1.0458625202766621e-05, - "loss": 0.8955, - "step": 11580 - }, - { - "epoch": 0.5370885489105238, - "grad_norm": 6.9161906242370605, - "learning_rate": 1.0450542101503706e-05, - "loss": 0.9945, - "step": 11585 - }, - { - "epoch": 0.5373203523412147, - "grad_norm": 3.356679677963257, - "learning_rate": 1.0442458705262675e-05, - "loss": 0.8751, - "step": 11590 - }, - { - "epoch": 0.5375521557719054, - "grad_norm": 3.306637763977051, - "learning_rate": 1.043437501933587e-05, - "loss": 0.7993, - "step": 11595 - }, - { - "epoch": 0.5377839592025963, - "grad_norm": 4.0915961265563965, - "learning_rate": 1.0426291049015829e-05, - "loss": 0.6809, - "step": 11600 - }, - { - "epoch": 0.5377839592025963, - "eval_loss": 0.9296634793281555, - "eval_runtime": 11.2604, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 11600 - }, - { - "epoch": 0.538015762633287, - "grad_norm": 3.1307010650634766, - "learning_rate": 1.0418206799595275e-05, - "loss": 0.7849, - "step": 11605 - }, - { - "epoch": 0.5382475660639777, - "grad_norm": 3.243574380874634, - "learning_rate": 1.0410122276367114e-05, - "loss": 0.7722, - "step": 11610 - }, - { - "epoch": 0.5384793694946686, - "grad_norm": 3.598766326904297, - "learning_rate": 1.0402037484624428e-05, - "loss": 0.9738, - "step": 11615 - }, - { - "epoch": 0.5387111729253593, - "grad_norm": 3.394896984100342, - "learning_rate": 1.0393952429660479e-05, - "loss": 0.9443, - "step": 11620 - }, - { - "epoch": 0.53894297635605, - "grad_norm": 3.3461711406707764, - "learning_rate": 1.0385867116768702e-05, - "loss": 0.7564, - "step": 11625 - }, - { - "epoch": 0.5391747797867409, - "grad_norm": 3.3700802326202393, - "learning_rate": 1.0377781551242693e-05, - "loss": 0.9093, - "step": 11630 - }, - { - "epoch": 0.5394065832174316, - "grad_norm": 3.464348554611206, - "learning_rate": 1.0369695738376226e-05, - "loss": 0.929, - "step": 11635 - }, - { - "epoch": 0.5396383866481224, - "grad_norm": 4.16630744934082, - "learning_rate": 1.0361609683463227e-05, - "loss": 1.0442, - "step": 11640 - }, - { - "epoch": 0.5398701900788132, - "grad_norm": 3.8371846675872803, - "learning_rate": 1.035352339179778e-05, - "loss": 0.8529, - "step": 11645 - }, - { - "epoch": 0.5401019935095039, - "grad_norm": 3.6530447006225586, - "learning_rate": 1.034543686867413e-05, - "loss": 0.942, - "step": 11650 - }, - { - "epoch": 0.5403337969401947, - "grad_norm": 3.9036331176757812, - "learning_rate": 1.0337350119386673e-05, - "loss": 0.8973, - "step": 11655 - }, - { - "epoch": 0.5405656003708855, - "grad_norm": 3.5922040939331055, - "learning_rate": 1.032926314922995e-05, - "loss": 0.9248, - "step": 11660 - }, - { - "epoch": 0.5407974038015763, - "grad_norm": 4.615749835968018, - "learning_rate": 1.0321175963498646e-05, - "loss": 0.97, - "step": 11665 - }, - { - "epoch": 0.541029207232267, - "grad_norm": 3.7986438274383545, - "learning_rate": 1.0313088567487589e-05, - "loss": 0.8226, - "step": 11670 - }, - { - "epoch": 0.5412610106629578, - "grad_norm": 3.754406452178955, - "learning_rate": 1.0305000966491746e-05, - "loss": 0.976, - "step": 11675 - }, - { - "epoch": 0.5414928140936486, - "grad_norm": 3.355897903442383, - "learning_rate": 1.0296913165806215e-05, - "loss": 0.8337, - "step": 11680 - }, - { - "epoch": 0.5417246175243393, - "grad_norm": 3.912876844406128, - "learning_rate": 1.0288825170726227e-05, - "loss": 0.9295, - "step": 11685 - }, - { - "epoch": 0.5419564209550302, - "grad_norm": 3.975902557373047, - "learning_rate": 1.0280736986547137e-05, - "loss": 1.0297, - "step": 11690 - }, - { - "epoch": 0.5421882243857209, - "grad_norm": 3.9908456802368164, - "learning_rate": 1.0272648618564436e-05, - "loss": 0.9178, - "step": 11695 - }, - { - "epoch": 0.5424200278164116, - "grad_norm": 3.790532350540161, - "learning_rate": 1.0264560072073716e-05, - "loss": 0.7329, - "step": 11700 - }, - { - "epoch": 0.5424200278164116, - "eval_loss": 0.926837682723999, - "eval_runtime": 11.2701, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 11700 - }, - { - "epoch": 0.5426518312471025, - "grad_norm": 3.8717010021209717, - "learning_rate": 1.0256471352370693e-05, - "loss": 1.0964, - "step": 11705 - }, - { - "epoch": 0.5428836346777932, - "grad_norm": 3.893839120864868, - "learning_rate": 1.0248382464751214e-05, - "loss": 0.9117, - "step": 11710 - }, - { - "epoch": 0.5431154381084841, - "grad_norm": 3.5871944427490234, - "learning_rate": 1.0240293414511207e-05, - "loss": 0.9653, - "step": 11715 - }, - { - "epoch": 0.5433472415391748, - "grad_norm": 3.3438005447387695, - "learning_rate": 1.0232204206946725e-05, - "loss": 0.7898, - "step": 11720 - }, - { - "epoch": 0.5435790449698655, - "grad_norm": 3.8526430130004883, - "learning_rate": 1.0224114847353921e-05, - "loss": 0.9564, - "step": 11725 - }, - { - "epoch": 0.5438108484005564, - "grad_norm": 4.089172840118408, - "learning_rate": 1.0216025341029046e-05, - "loss": 0.9941, - "step": 11730 - }, - { - "epoch": 0.5440426518312471, - "grad_norm": 4.1344451904296875, - "learning_rate": 1.0207935693268444e-05, - "loss": 0.8658, - "step": 11735 - }, - { - "epoch": 0.5442744552619379, - "grad_norm": 4.014649868011475, - "learning_rate": 1.019984590936856e-05, - "loss": 1.0357, - "step": 11740 - }, - { - "epoch": 0.5445062586926287, - "grad_norm": 3.6267828941345215, - "learning_rate": 1.0191755994625917e-05, - "loss": 0.787, - "step": 11745 - }, - { - "epoch": 0.5447380621233194, - "grad_norm": 3.1016199588775635, - "learning_rate": 1.0183665954337134e-05, - "loss": 0.8991, - "step": 11750 - }, - { - "epoch": 0.5449698655540102, - "grad_norm": 4.051199913024902, - "learning_rate": 1.0175575793798906e-05, - "loss": 0.9813, - "step": 11755 - }, - { - "epoch": 0.545201668984701, - "grad_norm": 3.9017701148986816, - "learning_rate": 1.0167485518308006e-05, - "loss": 0.9734, - "step": 11760 - }, - { - "epoch": 0.5454334724153918, - "grad_norm": 4.230993747711182, - "learning_rate": 1.0159395133161287e-05, - "loss": 1.1052, - "step": 11765 - }, - { - "epoch": 0.5456652758460825, - "grad_norm": 3.363537549972534, - "learning_rate": 1.0151304643655674e-05, - "loss": 0.884, - "step": 11770 - }, - { - "epoch": 0.5458970792767733, - "grad_norm": 3.54296612739563, - "learning_rate": 1.0143214055088153e-05, - "loss": 0.8351, - "step": 11775 - }, - { - "epoch": 0.5461288827074641, - "grad_norm": 3.821948766708374, - "learning_rate": 1.0135123372755785e-05, - "loss": 0.9147, - "step": 11780 - }, - { - "epoch": 0.5463606861381548, - "grad_norm": 4.290584087371826, - "learning_rate": 1.0127032601955682e-05, - "loss": 0.9966, - "step": 11785 - }, - { - "epoch": 0.5465924895688457, - "grad_norm": 4.043396949768066, - "learning_rate": 1.011894174798502e-05, - "loss": 1.0158, - "step": 11790 - }, - { - "epoch": 0.5468242929995364, - "grad_norm": 3.4050636291503906, - "learning_rate": 1.011085081614103e-05, - "loss": 0.9174, - "step": 11795 - }, - { - "epoch": 0.5470560964302271, - "grad_norm": 3.734877109527588, - "learning_rate": 1.0102759811720995e-05, - "loss": 0.9592, - "step": 11800 - }, - { - "epoch": 0.5470560964302271, - "eval_loss": 0.9257230758666992, - "eval_runtime": 11.2689, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 11800 - }, - { - "epoch": 0.547287899860918, - "grad_norm": 4.463562488555908, - "learning_rate": 1.0094668740022236e-05, - "loss": 0.7343, - "step": 11805 - }, - { - "epoch": 0.5475197032916087, - "grad_norm": 3.9908478260040283, - "learning_rate": 1.0086577606342131e-05, - "loss": 0.9073, - "step": 11810 - }, - { - "epoch": 0.5477515067222994, - "grad_norm": 3.766899824142456, - "learning_rate": 1.007848641597809e-05, - "loss": 0.8451, - "step": 11815 - }, - { - "epoch": 0.5479833101529903, - "grad_norm": 3.727322578430176, - "learning_rate": 1.0070395174227563e-05, - "loss": 0.7368, - "step": 11820 - }, - { - "epoch": 0.548215113583681, - "grad_norm": 3.233952522277832, - "learning_rate": 1.0062303886388031e-05, - "loss": 0.8511, - "step": 11825 - }, - { - "epoch": 0.5484469170143718, - "grad_norm": 3.7264208793640137, - "learning_rate": 1.0054212557757015e-05, - "loss": 0.7904, - "step": 11830 - }, - { - "epoch": 0.5486787204450626, - "grad_norm": 3.164869785308838, - "learning_rate": 1.0046121193632046e-05, - "loss": 0.8442, - "step": 11835 - }, - { - "epoch": 0.5489105238757533, - "grad_norm": 3.574117660522461, - "learning_rate": 1.0038029799310691e-05, - "loss": 1.0224, - "step": 11840 - }, - { - "epoch": 0.5491423273064442, - "grad_norm": 3.944805145263672, - "learning_rate": 1.002993838009054e-05, - "loss": 0.8456, - "step": 11845 - }, - { - "epoch": 0.5493741307371349, - "grad_norm": 3.691657543182373, - "learning_rate": 1.002184694126918e-05, - "loss": 0.988, - "step": 11850 - }, - { - "epoch": 0.5496059341678257, - "grad_norm": 3.9201769828796387, - "learning_rate": 1.0013755488144233e-05, - "loss": 0.8129, - "step": 11855 - }, - { - "epoch": 0.5498377375985165, - "grad_norm": 3.153733015060425, - "learning_rate": 1.0005664026013315e-05, - "loss": 0.8401, - "step": 11860 - }, - { - "epoch": 0.5500695410292072, - "grad_norm": 3.8798153400421143, - "learning_rate": 9.99757256017406e-06, - "loss": 0.8851, - "step": 11865 - }, - { - "epoch": 0.550301344459898, - "grad_norm": 2.954413414001465, - "learning_rate": 9.989481095924089e-06, - "loss": 0.9418, - "step": 11870 - }, - { - "epoch": 0.5505331478905888, - "grad_norm": 4.20355749130249, - "learning_rate": 9.981389638561038e-06, - "loss": 1.0055, - "step": 11875 - }, - { - "epoch": 0.5507649513212796, - "grad_norm": 3.426891565322876, - "learning_rate": 9.97329819338253e-06, - "loss": 0.8393, - "step": 11880 - }, - { - "epoch": 0.5509967547519703, - "grad_norm": 3.606313705444336, - "learning_rate": 9.96520676568618e-06, - "loss": 0.7815, - "step": 11885 - }, - { - "epoch": 0.5512285581826611, - "grad_norm": 4.329439640045166, - "learning_rate": 9.957115360769595e-06, - "loss": 0.9814, - "step": 11890 - }, - { - "epoch": 0.5514603616133519, - "grad_norm": 3.6122066974639893, - "learning_rate": 9.949023983930365e-06, - "loss": 0.9594, - "step": 11895 - }, - { - "epoch": 0.5516921650440426, - "grad_norm": 4.180810451507568, - "learning_rate": 9.940932640466063e-06, - "loss": 0.8931, - "step": 11900 - }, - { - "epoch": 0.5516921650440426, - "eval_loss": 0.9250540137290955, - "eval_runtime": 11.2655, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 11900 - }, - { - "epoch": 0.5519239684747335, - "grad_norm": 4.088957786560059, - "learning_rate": 9.932841335674234e-06, - "loss": 1.0179, - "step": 11905 - }, - { - "epoch": 0.5521557719054242, - "grad_norm": 3.0207998752593994, - "learning_rate": 9.924750074852403e-06, - "loss": 0.7174, - "step": 11910 - }, - { - "epoch": 0.5523875753361149, - "grad_norm": 3.4280521869659424, - "learning_rate": 9.916658863298072e-06, - "loss": 0.9469, - "step": 11915 - }, - { - "epoch": 0.5526193787668058, - "grad_norm": 3.119837522506714, - "learning_rate": 9.908567706308701e-06, - "loss": 0.7976, - "step": 11920 - }, - { - "epoch": 0.5528511821974965, - "grad_norm": 3.94305419921875, - "learning_rate": 9.900476609181719e-06, - "loss": 1.062, - "step": 11925 - }, - { - "epoch": 0.5530829856281873, - "grad_norm": 3.694404363632202, - "learning_rate": 9.89238557721451e-06, - "loss": 0.9182, - "step": 11930 - }, - { - "epoch": 0.5533147890588781, - "grad_norm": 4.300608158111572, - "learning_rate": 9.884294615704426e-06, - "loss": 0.9331, - "step": 11935 - }, - { - "epoch": 0.5535465924895688, - "grad_norm": 3.950634479522705, - "learning_rate": 9.876203729948762e-06, - "loss": 0.8561, - "step": 11940 - }, - { - "epoch": 0.5537783959202596, - "grad_norm": 3.2765071392059326, - "learning_rate": 9.86811292524477e-06, - "loss": 0.7449, - "step": 11945 - }, - { - "epoch": 0.5540101993509504, - "grad_norm": 3.229647636413574, - "learning_rate": 9.86002220688965e-06, - "loss": 0.8359, - "step": 11950 - }, - { - "epoch": 0.5542420027816412, - "grad_norm": 4.144287109375, - "learning_rate": 9.851931580180543e-06, - "loss": 0.7521, - "step": 11955 - }, - { - "epoch": 0.5544738062123319, - "grad_norm": 3.954238176345825, - "learning_rate": 9.843841050414523e-06, - "loss": 0.8984, - "step": 11960 - }, - { - "epoch": 0.5547056096430227, - "grad_norm": 4.321442127227783, - "learning_rate": 9.835750622888612e-06, - "loss": 0.9637, - "step": 11965 - }, - { - "epoch": 0.5549374130737135, - "grad_norm": 3.347836971282959, - "learning_rate": 9.827660302899758e-06, - "loss": 0.8953, - "step": 11970 - }, - { - "epoch": 0.5551692165044043, - "grad_norm": 5.19721794128418, - "learning_rate": 9.819570095744843e-06, - "loss": 0.9844, - "step": 11975 - }, - { - "epoch": 0.555401019935095, - "grad_norm": 3.556494951248169, - "learning_rate": 9.811480006720674e-06, - "loss": 0.8301, - "step": 11980 - }, - { - "epoch": 0.5556328233657858, - "grad_norm": 3.1306145191192627, - "learning_rate": 9.80339004112398e-06, - "loss": 0.8594, - "step": 11985 - }, - { - "epoch": 0.5558646267964766, - "grad_norm": 3.2993454933166504, - "learning_rate": 9.795300204251401e-06, - "loss": 0.9242, - "step": 11990 - }, - { - "epoch": 0.5560964302271674, - "grad_norm": 3.9028289318084717, - "learning_rate": 9.787210501399507e-06, - "loss": 0.9108, - "step": 11995 - }, - { - "epoch": 0.5563282336578581, - "grad_norm": 3.538569927215576, - "learning_rate": 9.779120937864775e-06, - "loss": 0.8735, - "step": 12000 - }, - { - "epoch": 0.5563282336578581, - "eval_loss": 0.9230943322181702, - "eval_runtime": 11.2638, - "eval_samples_per_second": 11.275, - "eval_steps_per_second": 11.275, - "step": 12000 - }, - { - "epoch": 0.556560037088549, - "grad_norm": 3.572889566421509, - "learning_rate": 9.771031518943588e-06, - "loss": 0.9114, - "step": 12005 - }, - { - "epoch": 0.5567918405192397, - "grad_norm": 3.8642523288726807, - "learning_rate": 9.762942249932232e-06, - "loss": 0.9442, - "step": 12010 - }, - { - "epoch": 0.5570236439499304, - "grad_norm": 4.011014461517334, - "learning_rate": 9.754853136126907e-06, - "loss": 0.8789, - "step": 12015 - }, - { - "epoch": 0.5572554473806213, - "grad_norm": 3.090644359588623, - "learning_rate": 9.746764182823696e-06, - "loss": 0.9078, - "step": 12020 - }, - { - "epoch": 0.557487250811312, - "grad_norm": 4.063168048858643, - "learning_rate": 9.738675395318585e-06, - "loss": 0.9359, - "step": 12025 - }, - { - "epoch": 0.5577190542420027, - "grad_norm": 3.925661325454712, - "learning_rate": 9.73058677890745e-06, - "loss": 0.9003, - "step": 12030 - }, - { - "epoch": 0.5579508576726936, - "grad_norm": 4.365866184234619, - "learning_rate": 9.722498338886061e-06, - "loss": 1.0072, - "step": 12035 - }, - { - "epoch": 0.5581826611033843, - "grad_norm": 3.7354307174682617, - "learning_rate": 9.714410080550066e-06, - "loss": 1.0974, - "step": 12040 - }, - { - "epoch": 0.5584144645340751, - "grad_norm": 3.8771347999572754, - "learning_rate": 9.706322009194989e-06, - "loss": 0.9791, - "step": 12045 - }, - { - "epoch": 0.5586462679647659, - "grad_norm": 3.3021087646484375, - "learning_rate": 9.69823413011624e-06, - "loss": 0.8397, - "step": 12050 - }, - { - "epoch": 0.5588780713954566, - "grad_norm": 4.110974311828613, - "learning_rate": 9.6901464486091e-06, - "loss": 0.905, - "step": 12055 - }, - { - "epoch": 0.5591098748261474, - "grad_norm": 3.842860221862793, - "learning_rate": 9.682058969968724e-06, - "loss": 0.964, - "step": 12060 - }, - { - "epoch": 0.5593416782568382, - "grad_norm": 4.045979976654053, - "learning_rate": 9.673971699490133e-06, - "loss": 0.9184, - "step": 12065 - }, - { - "epoch": 0.559573481687529, - "grad_norm": 4.390701770782471, - "learning_rate": 9.665884642468207e-06, - "loss": 0.9431, - "step": 12070 - }, - { - "epoch": 0.5598052851182197, - "grad_norm": 3.036095380783081, - "learning_rate": 9.657797804197688e-06, - "loss": 0.8122, - "step": 12075 - }, - { - "epoch": 0.5600370885489105, - "grad_norm": 3.978038787841797, - "learning_rate": 9.64971118997318e-06, - "loss": 1.0041, - "step": 12080 - }, - { - "epoch": 0.5602688919796013, - "grad_norm": 3.818854570388794, - "learning_rate": 9.641624805089136e-06, - "loss": 1.0957, - "step": 12085 - }, - { - "epoch": 0.5605006954102921, - "grad_norm": 4.990757942199707, - "learning_rate": 9.633538654839856e-06, - "loss": 1.0599, - "step": 12090 - }, - { - "epoch": 0.5607324988409829, - "grad_norm": 3.926579236984253, - "learning_rate": 9.625452744519494e-06, - "loss": 0.8846, - "step": 12095 - }, - { - "epoch": 0.5609643022716736, - "grad_norm": 4.215625762939453, - "learning_rate": 9.617367079422044e-06, - "loss": 1.0451, - "step": 12100 - }, - { - "epoch": 0.5609643022716736, - "eval_loss": 0.9230334758758545, - "eval_runtime": 11.2713, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 12100 - }, - { - "epoch": 0.5611961057023644, - "grad_norm": 3.8236374855041504, - "learning_rate": 9.609281664841333e-06, - "loss": 0.9031, - "step": 12105 - }, - { - "epoch": 0.5614279091330552, - "grad_norm": 4.061149597167969, - "learning_rate": 9.601196506071032e-06, - "loss": 0.7433, - "step": 12110 - }, - { - "epoch": 0.5616597125637459, - "grad_norm": 4.054765701293945, - "learning_rate": 9.59311160840464e-06, - "loss": 0.9256, - "step": 12115 - }, - { - "epoch": 0.5618915159944368, - "grad_norm": 3.500136137008667, - "learning_rate": 9.585026977135487e-06, - "loss": 0.8239, - "step": 12120 - }, - { - "epoch": 0.5621233194251275, - "grad_norm": 4.4179229736328125, - "learning_rate": 9.576942617556732e-06, - "loss": 0.9409, - "step": 12125 - }, - { - "epoch": 0.5623551228558182, - "grad_norm": 3.1021149158477783, - "learning_rate": 9.568858534961352e-06, - "loss": 0.9298, - "step": 12130 - }, - { - "epoch": 0.5625869262865091, - "grad_norm": 3.5426769256591797, - "learning_rate": 9.560774734642138e-06, - "loss": 0.9236, - "step": 12135 - }, - { - "epoch": 0.5628187297171998, - "grad_norm": 3.6595637798309326, - "learning_rate": 9.552691221891702e-06, - "loss": 0.9955, - "step": 12140 - }, - { - "epoch": 0.5630505331478906, - "grad_norm": 3.189774751663208, - "learning_rate": 9.544608002002471e-06, - "loss": 0.8035, - "step": 12145 - }, - { - "epoch": 0.5632823365785814, - "grad_norm": 4.558648586273193, - "learning_rate": 9.536525080266674e-06, - "loss": 0.9646, - "step": 12150 - }, - { - "epoch": 0.5635141400092721, - "grad_norm": 4.213756084442139, - "learning_rate": 9.528442461976347e-06, - "loss": 0.9128, - "step": 12155 - }, - { - "epoch": 0.5637459434399629, - "grad_norm": 3.8861300945281982, - "learning_rate": 9.52036015242333e-06, - "loss": 0.9403, - "step": 12160 - }, - { - "epoch": 0.5639777468706537, - "grad_norm": 3.3439478874206543, - "learning_rate": 9.512278156899255e-06, - "loss": 0.8125, - "step": 12165 - }, - { - "epoch": 0.5642095503013445, - "grad_norm": 4.4666748046875, - "learning_rate": 9.504196480695553e-06, - "loss": 0.9626, - "step": 12170 - }, - { - "epoch": 0.5644413537320352, - "grad_norm": 3.6612839698791504, - "learning_rate": 9.496115129103442e-06, - "loss": 0.8229, - "step": 12175 - }, - { - "epoch": 0.564673157162726, - "grad_norm": 3.3860251903533936, - "learning_rate": 9.488034107413935e-06, - "loss": 0.9714, - "step": 12180 - }, - { - "epoch": 0.5649049605934168, - "grad_norm": 4.411099433898926, - "learning_rate": 9.479953420917821e-06, - "loss": 0.9219, - "step": 12185 - }, - { - "epoch": 0.5651367640241075, - "grad_norm": 4.0682454109191895, - "learning_rate": 9.471873074905676e-06, - "loss": 0.9739, - "step": 12190 - }, - { - "epoch": 0.5653685674547984, - "grad_norm": 4.332896709442139, - "learning_rate": 9.463793074667845e-06, - "loss": 0.9527, - "step": 12195 - }, - { - "epoch": 0.5656003708854891, - "grad_norm": 3.3936986923217773, - "learning_rate": 9.455713425494449e-06, - "loss": 0.8179, - "step": 12200 - }, - { - "epoch": 0.5656003708854891, - "eval_loss": 0.9221216440200806, - "eval_runtime": 11.2783, - "eval_samples_per_second": 11.261, - "eval_steps_per_second": 11.261, - "step": 12200 - }, - { - "epoch": 0.5658321743161798, - "grad_norm": 3.9779462814331055, - "learning_rate": 9.447634132675389e-06, - "loss": 1.0487, - "step": 12205 - }, - { - "epoch": 0.5660639777468707, - "grad_norm": 3.2726247310638428, - "learning_rate": 9.439555201500321e-06, - "loss": 0.8528, - "step": 12210 - }, - { - "epoch": 0.5662957811775614, - "grad_norm": 4.14183235168457, - "learning_rate": 9.431476637258669e-06, - "loss": 0.9682, - "step": 12215 - }, - { - "epoch": 0.5665275846082523, - "grad_norm": 4.068934440612793, - "learning_rate": 9.423398445239618e-06, - "loss": 0.9826, - "step": 12220 - }, - { - "epoch": 0.566759388038943, - "grad_norm": 3.1168320178985596, - "learning_rate": 9.415320630732104e-06, - "loss": 0.9555, - "step": 12225 - }, - { - "epoch": 0.5669911914696337, - "grad_norm": 4.245713233947754, - "learning_rate": 9.407243199024822e-06, - "loss": 1.0328, - "step": 12230 - }, - { - "epoch": 0.5672229949003246, - "grad_norm": 3.5633294582366943, - "learning_rate": 9.399166155406209e-06, - "loss": 1.0491, - "step": 12235 - }, - { - "epoch": 0.5674547983310153, - "grad_norm": 3.3050804138183594, - "learning_rate": 9.39108950516446e-06, - "loss": 0.7442, - "step": 12240 - }, - { - "epoch": 0.567686601761706, - "grad_norm": 3.608316421508789, - "learning_rate": 9.383013253587499e-06, - "loss": 0.9549, - "step": 12245 - }, - { - "epoch": 0.5679184051923969, - "grad_norm": 3.2159860134124756, - "learning_rate": 9.374937405963002e-06, - "loss": 0.8389, - "step": 12250 - }, - { - "epoch": 0.5681502086230876, - "grad_norm": 4.187016010284424, - "learning_rate": 9.366861967578365e-06, - "loss": 0.8149, - "step": 12255 - }, - { - "epoch": 0.5683820120537784, - "grad_norm": 3.3455278873443604, - "learning_rate": 9.358786943720724e-06, - "loss": 0.8868, - "step": 12260 - }, - { - "epoch": 0.5686138154844692, - "grad_norm": 3.9041545391082764, - "learning_rate": 9.350712339676952e-06, - "loss": 0.9097, - "step": 12265 - }, - { - "epoch": 0.56884561891516, - "grad_norm": 3.5602474212646484, - "learning_rate": 9.342638160733638e-06, - "loss": 0.8567, - "step": 12270 - }, - { - "epoch": 0.5690774223458507, - "grad_norm": 3.781757354736328, - "learning_rate": 9.334564412177091e-06, - "loss": 1.0214, - "step": 12275 - }, - { - "epoch": 0.5693092257765415, - "grad_norm": 4.233317852020264, - "learning_rate": 9.32649109929334e-06, - "loss": 0.9249, - "step": 12280 - }, - { - "epoch": 0.5695410292072323, - "grad_norm": 3.0850915908813477, - "learning_rate": 9.31841822736813e-06, - "loss": 0.6689, - "step": 12285 - }, - { - "epoch": 0.569772832637923, - "grad_norm": 4.0126237869262695, - "learning_rate": 9.310345801686923e-06, - "loss": 0.9143, - "step": 12290 - }, - { - "epoch": 0.5700046360686138, - "grad_norm": 3.6224422454833984, - "learning_rate": 9.302273827534874e-06, - "loss": 0.7326, - "step": 12295 - }, - { - "epoch": 0.5702364394993046, - "grad_norm": 3.5666017532348633, - "learning_rate": 9.29420231019686e-06, - "loss": 0.9441, - "step": 12300 - }, - { - "epoch": 0.5702364394993046, - "eval_loss": 0.9209733009338379, - "eval_runtime": 11.2829, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 12300 - }, - { - "epoch": 0.5704682429299953, - "grad_norm": 3.6667540073394775, - "learning_rate": 9.286131254957451e-06, - "loss": 1.0631, - "step": 12305 - }, - { - "epoch": 0.5707000463606862, - "grad_norm": 4.890436172485352, - "learning_rate": 9.278060667100907e-06, - "loss": 0.8892, - "step": 12310 - }, - { - "epoch": 0.5709318497913769, - "grad_norm": 4.06868839263916, - "learning_rate": 9.26999055191119e-06, - "loss": 0.9446, - "step": 12315 - }, - { - "epoch": 0.5711636532220676, - "grad_norm": 3.7493627071380615, - "learning_rate": 9.261920914671952e-06, - "loss": 0.8918, - "step": 12320 - }, - { - "epoch": 0.5713954566527585, - "grad_norm": 3.5982697010040283, - "learning_rate": 9.253851760666536e-06, - "loss": 0.8466, - "step": 12325 - }, - { - "epoch": 0.5716272600834492, - "grad_norm": 4.513077259063721, - "learning_rate": 9.245783095177959e-06, - "loss": 0.9269, - "step": 12330 - }, - { - "epoch": 0.5718590635141401, - "grad_norm": 3.5556976795196533, - "learning_rate": 9.237714923488929e-06, - "loss": 1.0331, - "step": 12335 - }, - { - "epoch": 0.5720908669448308, - "grad_norm": 3.3687074184417725, - "learning_rate": 9.229647250881817e-06, - "loss": 0.6496, - "step": 12340 - }, - { - "epoch": 0.5723226703755215, - "grad_norm": 3.337689161300659, - "learning_rate": 9.221580082638678e-06, - "loss": 0.8649, - "step": 12345 - }, - { - "epoch": 0.5725544738062124, - "grad_norm": 4.0120530128479, - "learning_rate": 9.213513424041236e-06, - "loss": 1.0376, - "step": 12350 - }, - { - "epoch": 0.5727862772369031, - "grad_norm": 4.060422897338867, - "learning_rate": 9.205447280370879e-06, - "loss": 0.9098, - "step": 12355 - }, - { - "epoch": 0.5730180806675939, - "grad_norm": 9.308208465576172, - "learning_rate": 9.197381656908654e-06, - "loss": 0.8756, - "step": 12360 - }, - { - "epoch": 0.5732498840982847, - "grad_norm": 3.8436498641967773, - "learning_rate": 9.189316558935282e-06, - "loss": 0.8995, - "step": 12365 - }, - { - "epoch": 0.5734816875289754, - "grad_norm": 3.3367080688476562, - "learning_rate": 9.181251991731119e-06, - "loss": 0.9903, - "step": 12370 - }, - { - "epoch": 0.5737134909596662, - "grad_norm": 3.9300827980041504, - "learning_rate": 9.173187960576189e-06, - "loss": 0.9093, - "step": 12375 - }, - { - "epoch": 0.573945294390357, - "grad_norm": 4.047109127044678, - "learning_rate": 9.165124470750153e-06, - "loss": 0.9035, - "step": 12380 - }, - { - "epoch": 0.5741770978210478, - "grad_norm": 3.9684131145477295, - "learning_rate": 9.157061527532335e-06, - "loss": 0.7974, - "step": 12385 - }, - { - "epoch": 0.5744089012517385, - "grad_norm": 3.2714669704437256, - "learning_rate": 9.148999136201686e-06, - "loss": 0.9637, - "step": 12390 - }, - { - "epoch": 0.5746407046824293, - "grad_norm": 4.41608190536499, - "learning_rate": 9.140937302036803e-06, - "loss": 0.8028, - "step": 12395 - }, - { - "epoch": 0.5748725081131201, - "grad_norm": 4.140214920043945, - "learning_rate": 9.13287603031591e-06, - "loss": 0.958, - "step": 12400 - }, - { - "epoch": 0.5748725081131201, - "eval_loss": 0.9194921851158142, - "eval_runtime": 11.2653, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 12400 - }, - { - "epoch": 0.5751043115438108, - "grad_norm": 3.3030049800872803, - "learning_rate": 9.124815326316868e-06, - "loss": 0.9745, - "step": 12405 - }, - { - "epoch": 0.5753361149745017, - "grad_norm": 3.330787181854248, - "learning_rate": 9.116755195317176e-06, - "loss": 0.8783, - "step": 12410 - }, - { - "epoch": 0.5755679184051924, - "grad_norm": 3.8961637020111084, - "learning_rate": 9.10869564259394e-06, - "loss": 0.8882, - "step": 12415 - }, - { - "epoch": 0.5757997218358831, - "grad_norm": 3.4725751876831055, - "learning_rate": 9.100636673423898e-06, - "loss": 1.0014, - "step": 12420 - }, - { - "epoch": 0.576031525266574, - "grad_norm": 3.758121967315674, - "learning_rate": 9.092578293083408e-06, - "loss": 0.9907, - "step": 12425 - }, - { - "epoch": 0.5762633286972647, - "grad_norm": 3.889312267303467, - "learning_rate": 9.084520506848433e-06, - "loss": 0.8723, - "step": 12430 - }, - { - "epoch": 0.5764951321279554, - "grad_norm": 3.362896203994751, - "learning_rate": 9.076463319994554e-06, - "loss": 0.7634, - "step": 12435 - }, - { - "epoch": 0.5767269355586463, - "grad_norm": 3.958284854888916, - "learning_rate": 9.068406737796957e-06, - "loss": 0.804, - "step": 12440 - }, - { - "epoch": 0.576958738989337, - "grad_norm": 3.408536911010742, - "learning_rate": 9.060350765530436e-06, - "loss": 0.8166, - "step": 12445 - }, - { - "epoch": 0.5771905424200278, - "grad_norm": 3.5486555099487305, - "learning_rate": 9.052295408469382e-06, - "loss": 0.6871, - "step": 12450 - }, - { - "epoch": 0.5774223458507186, - "grad_norm": 3.405040740966797, - "learning_rate": 9.044240671887784e-06, - "loss": 0.7373, - "step": 12455 - }, - { - "epoch": 0.5776541492814093, - "grad_norm": 3.590054988861084, - "learning_rate": 9.036186561059222e-06, - "loss": 0.8471, - "step": 12460 - }, - { - "epoch": 0.5778859527121002, - "grad_norm": 3.6733016967773438, - "learning_rate": 9.02813308125687e-06, - "loss": 0.8568, - "step": 12465 - }, - { - "epoch": 0.5781177561427909, - "grad_norm": 3.4835875034332275, - "learning_rate": 9.020080237753489e-06, - "loss": 0.8211, - "step": 12470 - }, - { - "epoch": 0.5783495595734817, - "grad_norm": 3.9289448261260986, - "learning_rate": 9.012028035821423e-06, - "loss": 0.8948, - "step": 12475 - }, - { - "epoch": 0.5785813630041725, - "grad_norm": 6.000369071960449, - "learning_rate": 9.003976480732592e-06, - "loss": 0.9534, - "step": 12480 - }, - { - "epoch": 0.5788131664348632, - "grad_norm": 3.9709556102752686, - "learning_rate": 8.995925577758505e-06, - "loss": 0.8551, - "step": 12485 - }, - { - "epoch": 0.579044969865554, - "grad_norm": 3.99529767036438, - "learning_rate": 8.98787533217022e-06, - "loss": 0.732, - "step": 12490 - }, - { - "epoch": 0.5792767732962448, - "grad_norm": 3.711063861846924, - "learning_rate": 8.97982574923839e-06, - "loss": 0.9214, - "step": 12495 - }, - { - "epoch": 0.5795085767269356, - "grad_norm": 3.617279529571533, - "learning_rate": 8.971776834233223e-06, - "loss": 0.7603, - "step": 12500 - }, - { - "epoch": 0.5795085767269356, - "eval_loss": 0.9186437726020813, - "eval_runtime": 11.2712, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 12500 - }, - { - "epoch": 0.5797403801576263, - "grad_norm": 3.9697864055633545, - "learning_rate": 8.963728592424483e-06, - "loss": 0.8395, - "step": 12505 - }, - { - "epoch": 0.5799721835883171, - "grad_norm": 3.5097768306732178, - "learning_rate": 8.955681029081512e-06, - "loss": 0.8685, - "step": 12510 - }, - { - "epoch": 0.5802039870190079, - "grad_norm": 3.709843635559082, - "learning_rate": 8.947634149473187e-06, - "loss": 1.0262, - "step": 12515 - }, - { - "epoch": 0.5804357904496986, - "grad_norm": 2.8888819217681885, - "learning_rate": 8.939587958867948e-06, - "loss": 0.7361, - "step": 12520 - }, - { - "epoch": 0.5806675938803895, - "grad_norm": 4.29230260848999, - "learning_rate": 8.931542462533783e-06, - "loss": 0.8786, - "step": 12525 - }, - { - "epoch": 0.5808993973110802, - "grad_norm": 4.221256256103516, - "learning_rate": 8.923497665738227e-06, - "loss": 0.9499, - "step": 12530 - }, - { - "epoch": 0.5811312007417709, - "grad_norm": 3.4252729415893555, - "learning_rate": 8.915453573748356e-06, - "loss": 0.7506, - "step": 12535 - }, - { - "epoch": 0.5813630041724618, - "grad_norm": 4.157860279083252, - "learning_rate": 8.907410191830782e-06, - "loss": 1.0024, - "step": 12540 - }, - { - "epoch": 0.5815948076031525, - "grad_norm": 4.022140979766846, - "learning_rate": 8.899367525251652e-06, - "loss": 1.0174, - "step": 12545 - }, - { - "epoch": 0.5818266110338433, - "grad_norm": 4.003377437591553, - "learning_rate": 8.891325579276645e-06, - "loss": 0.8262, - "step": 12550 - }, - { - "epoch": 0.5820584144645341, - "grad_norm": 3.017552614212036, - "learning_rate": 8.883284359170973e-06, - "loss": 0.9769, - "step": 12555 - }, - { - "epoch": 0.5822902178952248, - "grad_norm": 4.014686584472656, - "learning_rate": 8.875243870199368e-06, - "loss": 1.0247, - "step": 12560 - }, - { - "epoch": 0.5825220213259156, - "grad_norm": 3.7590744495391846, - "learning_rate": 8.867204117626083e-06, - "loss": 0.8188, - "step": 12565 - }, - { - "epoch": 0.5827538247566064, - "grad_norm": 4.151451110839844, - "learning_rate": 8.859165106714895e-06, - "loss": 0.9248, - "step": 12570 - }, - { - "epoch": 0.5829856281872972, - "grad_norm": 3.3136253356933594, - "learning_rate": 8.851126842729088e-06, - "loss": 0.7535, - "step": 12575 - }, - { - "epoch": 0.5832174316179879, - "grad_norm": 3.7408132553100586, - "learning_rate": 8.843089330931458e-06, - "loss": 0.7754, - "step": 12580 - }, - { - "epoch": 0.5834492350486787, - "grad_norm": 4.146431922912598, - "learning_rate": 8.83505257658431e-06, - "loss": 0.7483, - "step": 12585 - }, - { - "epoch": 0.5836810384793695, - "grad_norm": 4.425289630889893, - "learning_rate": 8.827016584949458e-06, - "loss": 0.9464, - "step": 12590 - }, - { - "epoch": 0.5839128419100603, - "grad_norm": 3.187209367752075, - "learning_rate": 8.81898136128821e-06, - "loss": 0.8334, - "step": 12595 - }, - { - "epoch": 0.5841446453407511, - "grad_norm": 2.9438233375549316, - "learning_rate": 8.810946910861374e-06, - "loss": 0.7319, - "step": 12600 - }, - { - "epoch": 0.5841446453407511, - "eval_loss": 0.9165540933609009, - "eval_runtime": 11.2682, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 12600 - }, - { - "epoch": 0.5843764487714418, - "grad_norm": 4.2117767333984375, - "learning_rate": 8.802913238929248e-06, - "loss": 0.8086, - "step": 12605 - }, - { - "epoch": 0.5846082522021326, - "grad_norm": 3.517542839050293, - "learning_rate": 8.794880350751624e-06, - "loss": 0.8586, - "step": 12610 - }, - { - "epoch": 0.5848400556328234, - "grad_norm": 3.9067094326019287, - "learning_rate": 8.786848251587783e-06, - "loss": 0.8945, - "step": 12615 - }, - { - "epoch": 0.5850718590635141, - "grad_norm": 3.6777069568634033, - "learning_rate": 8.778816946696484e-06, - "loss": 0.865, - "step": 12620 - }, - { - "epoch": 0.585303662494205, - "grad_norm": 4.279821395874023, - "learning_rate": 8.770786441335967e-06, - "loss": 0.8967, - "step": 12625 - }, - { - "epoch": 0.5855354659248957, - "grad_norm": 3.570180892944336, - "learning_rate": 8.76275674076396e-06, - "loss": 0.9695, - "step": 12630 - }, - { - "epoch": 0.5857672693555864, - "grad_norm": 3.2852587699890137, - "learning_rate": 8.75472785023764e-06, - "loss": 0.943, - "step": 12635 - }, - { - "epoch": 0.5859990727862773, - "grad_norm": 5.42115592956543, - "learning_rate": 8.746699775013679e-06, - "loss": 0.838, - "step": 12640 - }, - { - "epoch": 0.586230876216968, - "grad_norm": 3.4503657817840576, - "learning_rate": 8.738672520348194e-06, - "loss": 0.7896, - "step": 12645 - }, - { - "epoch": 0.5864626796476587, - "grad_norm": 3.632561445236206, - "learning_rate": 8.730646091496783e-06, - "loss": 0.8426, - "step": 12650 - }, - { - "epoch": 0.5866944830783496, - "grad_norm": 3.065523624420166, - "learning_rate": 8.722620493714495e-06, - "loss": 0.7696, - "step": 12655 - }, - { - "epoch": 0.5869262865090403, - "grad_norm": 3.866663694381714, - "learning_rate": 8.714595732255833e-06, - "loss": 0.7732, - "step": 12660 - }, - { - "epoch": 0.5871580899397311, - "grad_norm": 4.122859001159668, - "learning_rate": 8.706571812374754e-06, - "loss": 0.8873, - "step": 12665 - }, - { - "epoch": 0.5873898933704219, - "grad_norm": 4.008296966552734, - "learning_rate": 8.69854873932466e-06, - "loss": 0.8638, - "step": 12670 - }, - { - "epoch": 0.5876216968011126, - "grad_norm": 3.9733011722564697, - "learning_rate": 8.690526518358411e-06, - "loss": 1.0573, - "step": 12675 - }, - { - "epoch": 0.5878535002318034, - "grad_norm": 4.02682638168335, - "learning_rate": 8.682505154728299e-06, - "loss": 0.8987, - "step": 12680 - }, - { - "epoch": 0.5880853036624942, - "grad_norm": 4.271615028381348, - "learning_rate": 8.674484653686052e-06, - "loss": 1.0168, - "step": 12685 - }, - { - "epoch": 0.588317107093185, - "grad_norm": 4.35434627532959, - "learning_rate": 8.666465020482847e-06, - "loss": 0.8627, - "step": 12690 - }, - { - "epoch": 0.5885489105238757, - "grad_norm": 3.5957367420196533, - "learning_rate": 8.658446260369274e-06, - "loss": 0.7889, - "step": 12695 - }, - { - "epoch": 0.5887807139545665, - "grad_norm": 3.276003360748291, - "learning_rate": 8.650428378595368e-06, - "loss": 0.6797, - "step": 12700 - }, - { - "epoch": 0.5887807139545665, - "eval_loss": 0.9164103865623474, - "eval_runtime": 11.2762, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 12700 - }, - { - "epoch": 0.5890125173852573, - "grad_norm": 3.588099718093872, - "learning_rate": 8.64241138041058e-06, - "loss": 0.9398, - "step": 12705 - }, - { - "epoch": 0.5892443208159481, - "grad_norm": 3.816437244415283, - "learning_rate": 8.634395271063782e-06, - "loss": 0.9051, - "step": 12710 - }, - { - "epoch": 0.5894761242466389, - "grad_norm": 3.784233331680298, - "learning_rate": 8.626380055803272e-06, - "loss": 0.8581, - "step": 12715 - }, - { - "epoch": 0.5897079276773296, - "grad_norm": 3.619258165359497, - "learning_rate": 8.618365739876759e-06, - "loss": 0.8342, - "step": 12720 - }, - { - "epoch": 0.5899397311080204, - "grad_norm": 3.3081767559051514, - "learning_rate": 8.610352328531353e-06, - "loss": 0.6449, - "step": 12725 - }, - { - "epoch": 0.5901715345387112, - "grad_norm": 3.5461251735687256, - "learning_rate": 8.602339827013583e-06, - "loss": 0.8356, - "step": 12730 - }, - { - "epoch": 0.5904033379694019, - "grad_norm": 3.2143607139587402, - "learning_rate": 8.594328240569387e-06, - "loss": 0.8567, - "step": 12735 - }, - { - "epoch": 0.5906351414000928, - "grad_norm": 4.731443405151367, - "learning_rate": 8.586317574444088e-06, - "loss": 0.9381, - "step": 12740 - }, - { - "epoch": 0.5908669448307835, - "grad_norm": 3.678004741668701, - "learning_rate": 8.578307833882422e-06, - "loss": 0.7766, - "step": 12745 - }, - { - "epoch": 0.5910987482614742, - "grad_norm": 4.3722147941589355, - "learning_rate": 8.570299024128505e-06, - "loss": 0.9503, - "step": 12750 - }, - { - "epoch": 0.5913305516921651, - "grad_norm": 3.4704501628875732, - "learning_rate": 8.562291150425851e-06, - "loss": 0.7958, - "step": 12755 - }, - { - "epoch": 0.5915623551228558, - "grad_norm": 3.9162561893463135, - "learning_rate": 8.554284218017366e-06, - "loss": 0.8784, - "step": 12760 - }, - { - "epoch": 0.5917941585535466, - "grad_norm": 3.3849542140960693, - "learning_rate": 8.546278232145334e-06, - "loss": 0.9433, - "step": 12765 - }, - { - "epoch": 0.5920259619842374, - "grad_norm": 3.4642109870910645, - "learning_rate": 8.538273198051413e-06, - "loss": 0.9221, - "step": 12770 - }, - { - "epoch": 0.5922577654149281, - "grad_norm": 3.538447380065918, - "learning_rate": 8.530269120976654e-06, - "loss": 0.9911, - "step": 12775 - }, - { - "epoch": 0.5924895688456189, - "grad_norm": 3.6921002864837646, - "learning_rate": 8.522266006161467e-06, - "loss": 1.0559, - "step": 12780 - }, - { - "epoch": 0.5927213722763097, - "grad_norm": 4.147615432739258, - "learning_rate": 8.514263858845638e-06, - "loss": 0.9022, - "step": 12785 - }, - { - "epoch": 0.5929531757070005, - "grad_norm": 4.016373634338379, - "learning_rate": 8.506262684268312e-06, - "loss": 0.9778, - "step": 12790 - }, - { - "epoch": 0.5931849791376912, - "grad_norm": 3.8568859100341797, - "learning_rate": 8.498262487668015e-06, - "loss": 0.9409, - "step": 12795 - }, - { - "epoch": 0.593416782568382, - "grad_norm": 3.5835936069488525, - "learning_rate": 8.490263274282615e-06, - "loss": 0.9539, - "step": 12800 - }, - { - "epoch": 0.593416782568382, - "eval_loss": 0.9164746999740601, - "eval_runtime": 11.2684, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 12800 - }, - { - "epoch": 0.5936485859990728, - "grad_norm": 4.477583885192871, - "learning_rate": 8.482265049349347e-06, - "loss": 0.9882, - "step": 12805 - }, - { - "epoch": 0.5938803894297635, - "grad_norm": 4.2472076416015625, - "learning_rate": 8.474267818104788e-06, - "loss": 1.0207, - "step": 12810 - }, - { - "epoch": 0.5941121928604544, - "grad_norm": 4.307184219360352, - "learning_rate": 8.466271585784872e-06, - "loss": 0.7898, - "step": 12815 - }, - { - "epoch": 0.5943439962911451, - "grad_norm": 4.007270336151123, - "learning_rate": 8.458276357624882e-06, - "loss": 0.8249, - "step": 12820 - }, - { - "epoch": 0.5945757997218358, - "grad_norm": 3.8119564056396484, - "learning_rate": 8.450282138859439e-06, - "loss": 0.8163, - "step": 12825 - }, - { - "epoch": 0.5948076031525267, - "grad_norm": 4.081138610839844, - "learning_rate": 8.4422889347225e-06, - "loss": 0.8888, - "step": 12830 - }, - { - "epoch": 0.5950394065832174, - "grad_norm": 3.449310302734375, - "learning_rate": 8.43429675044737e-06, - "loss": 0.856, - "step": 12835 - }, - { - "epoch": 0.5952712100139083, - "grad_norm": 3.682424783706665, - "learning_rate": 8.426305591266664e-06, - "loss": 1.0172, - "step": 12840 - }, - { - "epoch": 0.595503013444599, - "grad_norm": 4.203467845916748, - "learning_rate": 8.418315462412348e-06, - "loss": 0.7828, - "step": 12845 - }, - { - "epoch": 0.5957348168752897, - "grad_norm": 2.9372358322143555, - "learning_rate": 8.410326369115706e-06, - "loss": 0.8506, - "step": 12850 - }, - { - "epoch": 0.5959666203059806, - "grad_norm": 3.798149347305298, - "learning_rate": 8.402338316607336e-06, - "loss": 0.8294, - "step": 12855 - }, - { - "epoch": 0.5961984237366713, - "grad_norm": 3.2341666221618652, - "learning_rate": 8.394351310117168e-06, - "loss": 0.9562, - "step": 12860 - }, - { - "epoch": 0.596430227167362, - "grad_norm": 3.3644139766693115, - "learning_rate": 8.386365354874442e-06, - "loss": 0.782, - "step": 12865 - }, - { - "epoch": 0.5966620305980529, - "grad_norm": 5.455288410186768, - "learning_rate": 8.378380456107702e-06, - "loss": 0.9533, - "step": 12870 - }, - { - "epoch": 0.5968938340287436, - "grad_norm": 3.402225971221924, - "learning_rate": 8.370396619044806e-06, - "loss": 0.8046, - "step": 12875 - }, - { - "epoch": 0.5971256374594344, - "grad_norm": 3.9545254707336426, - "learning_rate": 8.362413848912925e-06, - "loss": 0.9077, - "step": 12880 - }, - { - "epoch": 0.5973574408901252, - "grad_norm": 3.285334587097168, - "learning_rate": 8.35443215093852e-06, - "loss": 0.8095, - "step": 12885 - }, - { - "epoch": 0.597589244320816, - "grad_norm": 3.7525312900543213, - "learning_rate": 8.34645153034735e-06, - "loss": 0.9235, - "step": 12890 - }, - { - "epoch": 0.5978210477515067, - "grad_norm": 3.530529022216797, - "learning_rate": 8.338471992364484e-06, - "loss": 0.9493, - "step": 12895 - }, - { - "epoch": 0.5980528511821975, - "grad_norm": 3.5442280769348145, - "learning_rate": 8.330493542214257e-06, - "loss": 0.8373, - "step": 12900 - }, - { - "epoch": 0.5980528511821975, - "eval_loss": 0.9144797921180725, - "eval_runtime": 11.2681, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 12900 - }, - { - "epoch": 0.5982846546128883, - "grad_norm": 3.829141855239868, - "learning_rate": 8.322516185120314e-06, - "loss": 0.9114, - "step": 12905 - }, - { - "epoch": 0.598516458043579, - "grad_norm": 3.869734764099121, - "learning_rate": 8.314539926305573e-06, - "loss": 0.8432, - "step": 12910 - }, - { - "epoch": 0.5987482614742698, - "grad_norm": 3.453571081161499, - "learning_rate": 8.306564770992236e-06, - "loss": 0.7266, - "step": 12915 - }, - { - "epoch": 0.5989800649049606, - "grad_norm": 3.4968366622924805, - "learning_rate": 8.298590724401785e-06, - "loss": 0.8843, - "step": 12920 - }, - { - "epoch": 0.5992118683356513, - "grad_norm": 3.363662004470825, - "learning_rate": 8.290617791754975e-06, - "loss": 0.9189, - "step": 12925 - }, - { - "epoch": 0.5994436717663422, - "grad_norm": 6.320411205291748, - "learning_rate": 8.282645978271825e-06, - "loss": 0.9253, - "step": 12930 - }, - { - "epoch": 0.5996754751970329, - "grad_norm": 3.4955732822418213, - "learning_rate": 8.274675289171623e-06, - "loss": 0.8373, - "step": 12935 - }, - { - "epoch": 0.5999072786277236, - "grad_norm": 3.8414204120635986, - "learning_rate": 8.266705729672935e-06, - "loss": 0.9313, - "step": 12940 - }, - { - "epoch": 0.6001390820584145, - "grad_norm": 3.388800859451294, - "learning_rate": 8.25873730499357e-06, - "loss": 0.9714, - "step": 12945 - }, - { - "epoch": 0.6003708854891052, - "grad_norm": 3.6213204860687256, - "learning_rate": 8.250770020350602e-06, - "loss": 0.8529, - "step": 12950 - }, - { - "epoch": 0.6006026889197961, - "grad_norm": 4.339563846588135, - "learning_rate": 8.242803880960362e-06, - "loss": 0.9313, - "step": 12955 - }, - { - "epoch": 0.6008344923504868, - "grad_norm": 3.9930691719055176, - "learning_rate": 8.234838892038417e-06, - "loss": 0.8653, - "step": 12960 - }, - { - "epoch": 0.6010662957811775, - "grad_norm": 4.320202350616455, - "learning_rate": 8.226875058799596e-06, - "loss": 0.8899, - "step": 12965 - }, - { - "epoch": 0.6012980992118684, - "grad_norm": 4.060983180999756, - "learning_rate": 8.218912386457967e-06, - "loss": 0.8887, - "step": 12970 - }, - { - "epoch": 0.6015299026425591, - "grad_norm": 3.39972186088562, - "learning_rate": 8.210950880226833e-06, - "loss": 0.8836, - "step": 12975 - }, - { - "epoch": 0.6017617060732499, - "grad_norm": 4.337881565093994, - "learning_rate": 8.202990545318744e-06, - "loss": 0.8655, - "step": 12980 - }, - { - "epoch": 0.6019935095039407, - "grad_norm": 4.529641151428223, - "learning_rate": 8.195031386945465e-06, - "loss": 0.9368, - "step": 12985 - }, - { - "epoch": 0.6022253129346314, - "grad_norm": 4.606254577636719, - "learning_rate": 8.18707341031801e-06, - "loss": 0.8114, - "step": 12990 - }, - { - "epoch": 0.6024571163653222, - "grad_norm": 3.1904773712158203, - "learning_rate": 8.179116620646607e-06, - "loss": 0.9546, - "step": 12995 - }, - { - "epoch": 0.602688919796013, - "grad_norm": 3.775242805480957, - "learning_rate": 8.171161023140717e-06, - "loss": 0.8536, - "step": 13000 - }, - { - "epoch": 0.602688919796013, - "eval_loss": 0.9141844511032104, - "eval_runtime": 11.2721, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 13000 - }, - { - "epoch": 0.6029207232267038, - "grad_norm": 3.759544849395752, - "learning_rate": 8.163206623009009e-06, - "loss": 0.9482, - "step": 13005 - }, - { - "epoch": 0.6031525266573945, - "grad_norm": 3.72965145111084, - "learning_rate": 8.155253425459379e-06, - "loss": 0.9966, - "step": 13010 - }, - { - "epoch": 0.6033843300880853, - "grad_norm": 3.206998586654663, - "learning_rate": 8.147301435698923e-06, - "loss": 0.8948, - "step": 13015 - }, - { - "epoch": 0.6036161335187761, - "grad_norm": 3.263573169708252, - "learning_rate": 8.139350658933957e-06, - "loss": 0.9882, - "step": 13020 - }, - { - "epoch": 0.6038479369494668, - "grad_norm": 3.676816701889038, - "learning_rate": 8.131401100370002e-06, - "loss": 0.871, - "step": 13025 - }, - { - "epoch": 0.6040797403801577, - "grad_norm": 4.554088115692139, - "learning_rate": 8.123452765211775e-06, - "loss": 1.0203, - "step": 13030 - }, - { - "epoch": 0.6043115438108484, - "grad_norm": 4.035562992095947, - "learning_rate": 8.115505658663193e-06, - "loss": 0.9691, - "step": 13035 - }, - { - "epoch": 0.6045433472415391, - "grad_norm": 4.155014991760254, - "learning_rate": 8.107559785927385e-06, - "loss": 0.83, - "step": 13040 - }, - { - "epoch": 0.60477515067223, - "grad_norm": 3.3499128818511963, - "learning_rate": 8.099615152206644e-06, - "loss": 0.8616, - "step": 13045 - }, - { - "epoch": 0.6050069541029207, - "grad_norm": 4.116346836090088, - "learning_rate": 8.09167176270247e-06, - "loss": 0.8987, - "step": 13050 - }, - { - "epoch": 0.6052387575336114, - "grad_norm": 4.1573100090026855, - "learning_rate": 8.08372962261555e-06, - "loss": 0.9406, - "step": 13055 - }, - { - "epoch": 0.6054705609643023, - "grad_norm": 3.7353973388671875, - "learning_rate": 8.07578873714574e-06, - "loss": 0.7806, - "step": 13060 - }, - { - "epoch": 0.605702364394993, - "grad_norm": 3.6239640712738037, - "learning_rate": 8.06784911149209e-06, - "loss": 0.9823, - "step": 13065 - }, - { - "epoch": 0.6059341678256838, - "grad_norm": 3.2447075843811035, - "learning_rate": 8.059910750852815e-06, - "loss": 0.7974, - "step": 13070 - }, - { - "epoch": 0.6061659712563746, - "grad_norm": 3.9731087684631348, - "learning_rate": 8.0519736604253e-06, - "loss": 0.8716, - "step": 13075 - }, - { - "epoch": 0.6063977746870653, - "grad_norm": 3.3315420150756836, - "learning_rate": 8.044037845406104e-06, - "loss": 1.0138, - "step": 13080 - }, - { - "epoch": 0.6066295781177562, - "grad_norm": 2.920837163925171, - "learning_rate": 8.036103310990954e-06, - "loss": 0.9604, - "step": 13085 - }, - { - "epoch": 0.6068613815484469, - "grad_norm": 3.556678295135498, - "learning_rate": 8.028170062374734e-06, - "loss": 0.9994, - "step": 13090 - }, - { - "epoch": 0.6070931849791377, - "grad_norm": 3.7681658267974854, - "learning_rate": 8.02023810475148e-06, - "loss": 0.9512, - "step": 13095 - }, - { - "epoch": 0.6073249884098285, - "grad_norm": 3.459141492843628, - "learning_rate": 8.012307443314399e-06, - "loss": 0.866, - "step": 13100 - }, - { - "epoch": 0.6073249884098285, - "eval_loss": 0.9133861064910889, - "eval_runtime": 11.2835, - "eval_samples_per_second": 11.255, - "eval_steps_per_second": 11.255, - "step": 13100 - }, - { - "epoch": 0.6075567918405193, - "grad_norm": 3.4361283779144287, - "learning_rate": 8.004378083255828e-06, - "loss": 0.9585, - "step": 13105 - }, - { - "epoch": 0.60778859527121, - "grad_norm": 4.018921375274658, - "learning_rate": 7.996450029767269e-06, - "loss": 1.0037, - "step": 13110 - }, - { - "epoch": 0.6080203987019008, - "grad_norm": 4.195085525512695, - "learning_rate": 7.988523288039365e-06, - "loss": 0.9713, - "step": 13115 - }, - { - "epoch": 0.6082522021325916, - "grad_norm": 3.3644561767578125, - "learning_rate": 7.980597863261895e-06, - "loss": 0.8191, - "step": 13120 - }, - { - "epoch": 0.6084840055632823, - "grad_norm": 4.499396800994873, - "learning_rate": 7.972673760623781e-06, - "loss": 0.8071, - "step": 13125 - }, - { - "epoch": 0.6087158089939732, - "grad_norm": 3.8352174758911133, - "learning_rate": 7.964750985313079e-06, - "loss": 1.066, - "step": 13130 - }, - { - "epoch": 0.6089476124246639, - "grad_norm": 3.380789041519165, - "learning_rate": 7.956829542516967e-06, - "loss": 0.8846, - "step": 13135 - }, - { - "epoch": 0.6091794158553546, - "grad_norm": 4.380323886871338, - "learning_rate": 7.948909437421764e-06, - "loss": 0.872, - "step": 13140 - }, - { - "epoch": 0.6094112192860455, - "grad_norm": 4.038541793823242, - "learning_rate": 7.940990675212905e-06, - "loss": 0.9652, - "step": 13145 - }, - { - "epoch": 0.6096430227167362, - "grad_norm": 3.7945806980133057, - "learning_rate": 7.933073261074951e-06, - "loss": 0.9835, - "step": 13150 - }, - { - "epoch": 0.6098748261474269, - "grad_norm": 4.038233757019043, - "learning_rate": 7.92515720019157e-06, - "loss": 0.8533, - "step": 13155 - }, - { - "epoch": 0.6101066295781178, - "grad_norm": 3.5028693675994873, - "learning_rate": 7.917242497745563e-06, - "loss": 0.7992, - "step": 13160 - }, - { - "epoch": 0.6103384330088085, - "grad_norm": 3.19927978515625, - "learning_rate": 7.909329158918817e-06, - "loss": 0.7886, - "step": 13165 - }, - { - "epoch": 0.6105702364394993, - "grad_norm": 3.3780574798583984, - "learning_rate": 7.901417188892347e-06, - "loss": 0.8571, - "step": 13170 - }, - { - "epoch": 0.6108020398701901, - "grad_norm": 3.4062836170196533, - "learning_rate": 7.893506592846262e-06, - "loss": 0.8582, - "step": 13175 - }, - { - "epoch": 0.6110338433008808, - "grad_norm": 3.2859694957733154, - "learning_rate": 7.88559737595977e-06, - "loss": 0.7769, - "step": 13180 - }, - { - "epoch": 0.6112656467315716, - "grad_norm": 3.8368353843688965, - "learning_rate": 7.877689543411184e-06, - "loss": 0.9078, - "step": 13185 - }, - { - "epoch": 0.6114974501622624, - "grad_norm": 3.6319220066070557, - "learning_rate": 7.869783100377906e-06, - "loss": 0.7553, - "step": 13190 - }, - { - "epoch": 0.6117292535929532, - "grad_norm": 3.9984467029571533, - "learning_rate": 7.861878052036425e-06, - "loss": 0.8557, - "step": 13195 - }, - { - "epoch": 0.6119610570236439, - "grad_norm": 3.637826681137085, - "learning_rate": 7.853974403562324e-06, - "loss": 0.8803, - "step": 13200 - }, - { - "epoch": 0.6119610570236439, - "eval_loss": 0.9121905565261841, - "eval_runtime": 11.273, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 13200 - }, - { - "epoch": 0.6121928604543347, - "grad_norm": 3.8848507404327393, - "learning_rate": 7.846072160130258e-06, - "loss": 0.9607, - "step": 13205 - }, - { - "epoch": 0.6124246638850255, - "grad_norm": 3.7693490982055664, - "learning_rate": 7.838171326913977e-06, - "loss": 0.9417, - "step": 13210 - }, - { - "epoch": 0.6126564673157163, - "grad_norm": 3.1103503704071045, - "learning_rate": 7.8302719090863e-06, - "loss": 0.6954, - "step": 13215 - }, - { - "epoch": 0.6128882707464071, - "grad_norm": 3.501575469970703, - "learning_rate": 7.822373911819119e-06, - "loss": 0.8014, - "step": 13220 - }, - { - "epoch": 0.6131200741770978, - "grad_norm": 4.255192756652832, - "learning_rate": 7.81447734028339e-06, - "loss": 1.0034, - "step": 13225 - }, - { - "epoch": 0.6133518776077886, - "grad_norm": 3.4730281829833984, - "learning_rate": 7.806582199649152e-06, - "loss": 1.0145, - "step": 13230 - }, - { - "epoch": 0.6135836810384794, - "grad_norm": 3.566481828689575, - "learning_rate": 7.798688495085492e-06, - "loss": 0.9147, - "step": 13235 - }, - { - "epoch": 0.6138154844691701, - "grad_norm": 4.164937973022461, - "learning_rate": 7.790796231760563e-06, - "loss": 1.0266, - "step": 13240 - }, - { - "epoch": 0.614047287899861, - "grad_norm": 3.8256633281707764, - "learning_rate": 7.782905414841579e-06, - "loss": 0.9039, - "step": 13245 - }, - { - "epoch": 0.6142790913305517, - "grad_norm": 4.075922012329102, - "learning_rate": 7.77501604949479e-06, - "loss": 0.9964, - "step": 13250 - }, - { - "epoch": 0.6145108947612424, - "grad_norm": 3.1503143310546875, - "learning_rate": 7.767128140885515e-06, - "loss": 0.8551, - "step": 13255 - }, - { - "epoch": 0.6147426981919333, - "grad_norm": 3.707214832305908, - "learning_rate": 7.75924169417811e-06, - "loss": 0.8868, - "step": 13260 - }, - { - "epoch": 0.614974501622624, - "grad_norm": 4.248465538024902, - "learning_rate": 7.751356714535974e-06, - "loss": 0.9955, - "step": 13265 - }, - { - "epoch": 0.6152063050533148, - "grad_norm": 4.484118938446045, - "learning_rate": 7.743473207121549e-06, - "loss": 1.0392, - "step": 13270 - }, - { - "epoch": 0.6154381084840056, - "grad_norm": 3.491764783859253, - "learning_rate": 7.735591177096313e-06, - "loss": 0.9588, - "step": 13275 - }, - { - "epoch": 0.6156699119146963, - "grad_norm": 3.8887925148010254, - "learning_rate": 7.727710629620768e-06, - "loss": 0.8629, - "step": 13280 - }, - { - "epoch": 0.6159017153453871, - "grad_norm": 3.9234886169433594, - "learning_rate": 7.719831569854454e-06, - "loss": 0.7192, - "step": 13285 - }, - { - "epoch": 0.6161335187760779, - "grad_norm": 3.4256324768066406, - "learning_rate": 7.71195400295594e-06, - "loss": 0.8865, - "step": 13290 - }, - { - "epoch": 0.6163653222067687, - "grad_norm": 3.125434160232544, - "learning_rate": 7.704077934082808e-06, - "loss": 0.7298, - "step": 13295 - }, - { - "epoch": 0.6165971256374594, - "grad_norm": 3.844914436340332, - "learning_rate": 7.696203368391662e-06, - "loss": 0.8286, - "step": 13300 - }, - { - "epoch": 0.6165971256374594, - "eval_loss": 0.9110766649246216, - "eval_runtime": 11.2753, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 13300 - }, - { - "epoch": 0.6168289290681502, - "grad_norm": 4.225092887878418, - "learning_rate": 7.688330311038133e-06, - "loss": 0.8003, - "step": 13305 - }, - { - "epoch": 0.617060732498841, - "grad_norm": 3.7063608169555664, - "learning_rate": 7.68045876717684e-06, - "loss": 0.8791, - "step": 13310 - }, - { - "epoch": 0.6172925359295317, - "grad_norm": 4.041941165924072, - "learning_rate": 7.672588741961438e-06, - "loss": 0.7651, - "step": 13315 - }, - { - "epoch": 0.6175243393602226, - "grad_norm": 3.5181355476379395, - "learning_rate": 7.664720240544571e-06, - "loss": 0.7582, - "step": 13320 - }, - { - "epoch": 0.6177561427909133, - "grad_norm": 3.7614009380340576, - "learning_rate": 7.65685326807789e-06, - "loss": 0.8476, - "step": 13325 - }, - { - "epoch": 0.6179879462216041, - "grad_norm": 3.4191246032714844, - "learning_rate": 7.648987829712049e-06, - "loss": 0.7782, - "step": 13330 - }, - { - "epoch": 0.6182197496522949, - "grad_norm": 3.7803595066070557, - "learning_rate": 7.641123930596693e-06, - "loss": 0.9768, - "step": 13335 - }, - { - "epoch": 0.6184515530829856, - "grad_norm": 4.002791881561279, - "learning_rate": 7.633261575880455e-06, - "loss": 0.9629, - "step": 13340 - }, - { - "epoch": 0.6186833565136765, - "grad_norm": 3.3021740913391113, - "learning_rate": 7.625400770710965e-06, - "loss": 0.8824, - "step": 13345 - }, - { - "epoch": 0.6189151599443672, - "grad_norm": 3.8734776973724365, - "learning_rate": 7.617541520234836e-06, - "loss": 0.9005, - "step": 13350 - }, - { - "epoch": 0.6191469633750579, - "grad_norm": 3.9729230403900146, - "learning_rate": 7.609683829597661e-06, - "loss": 0.7664, - "step": 13355 - }, - { - "epoch": 0.6193787668057488, - "grad_norm": 3.6947438716888428, - "learning_rate": 7.601827703944014e-06, - "loss": 0.8933, - "step": 13360 - }, - { - "epoch": 0.6196105702364395, - "grad_norm": 3.5670571327209473, - "learning_rate": 7.5939731484174475e-06, - "loss": 0.8265, - "step": 13365 - }, - { - "epoch": 0.6198423736671302, - "grad_norm": 4.319498538970947, - "learning_rate": 7.586120168160472e-06, - "loss": 0.9813, - "step": 13370 - }, - { - "epoch": 0.6200741770978211, - "grad_norm": 3.1896026134490967, - "learning_rate": 7.578268768314585e-06, - "loss": 0.8378, - "step": 13375 - }, - { - "epoch": 0.6203059805285118, - "grad_norm": 2.9694342613220215, - "learning_rate": 7.570418954020238e-06, - "loss": 0.828, - "step": 13380 - }, - { - "epoch": 0.6205377839592026, - "grad_norm": 4.684661388397217, - "learning_rate": 7.5625707304168475e-06, - "loss": 0.9626, - "step": 13385 - }, - { - "epoch": 0.6207695873898934, - "grad_norm": 3.447885036468506, - "learning_rate": 7.554724102642789e-06, - "loss": 0.8045, - "step": 13390 - }, - { - "epoch": 0.6210013908205841, - "grad_norm": 3.6924703121185303, - "learning_rate": 7.546879075835395e-06, - "loss": 0.9149, - "step": 13395 - }, - { - "epoch": 0.6212331942512749, - "grad_norm": 3.3220014572143555, - "learning_rate": 7.539035655130944e-06, - "loss": 0.7473, - "step": 13400 - }, - { - "epoch": 0.6212331942512749, - "eval_loss": 0.9087728261947632, - "eval_runtime": 11.2696, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 13400 - }, - { - "epoch": 0.6214649976819657, - "grad_norm": 4.143969535827637, - "learning_rate": 7.531193845664666e-06, - "loss": 0.8948, - "step": 13405 - }, - { - "epoch": 0.6216968011126565, - "grad_norm": 3.8429365158081055, - "learning_rate": 7.523353652570735e-06, - "loss": 1.0161, - "step": 13410 - }, - { - "epoch": 0.6219286045433472, - "grad_norm": 5.148918151855469, - "learning_rate": 7.515515080982274e-06, - "loss": 0.841, - "step": 13415 - }, - { - "epoch": 0.622160407974038, - "grad_norm": 4.205752849578857, - "learning_rate": 7.507678136031336e-06, - "loss": 0.8718, - "step": 13420 - }, - { - "epoch": 0.6223922114047288, - "grad_norm": 3.2777302265167236, - "learning_rate": 7.499842822848913e-06, - "loss": 0.9281, - "step": 13425 - }, - { - "epoch": 0.6226240148354195, - "grad_norm": 3.4595041275024414, - "learning_rate": 7.4920091465649205e-06, - "loss": 0.8329, - "step": 13430 - }, - { - "epoch": 0.6228558182661104, - "grad_norm": 4.039912700653076, - "learning_rate": 7.484177112308216e-06, - "loss": 0.9747, - "step": 13435 - }, - { - "epoch": 0.6230876216968011, - "grad_norm": 3.4145519733428955, - "learning_rate": 7.476346725206573e-06, - "loss": 0.8099, - "step": 13440 - }, - { - "epoch": 0.6233194251274918, - "grad_norm": 3.8764584064483643, - "learning_rate": 7.468517990386687e-06, - "loss": 0.951, - "step": 13445 - }, - { - "epoch": 0.6235512285581827, - "grad_norm": 3.5151326656341553, - "learning_rate": 7.46069091297418e-06, - "loss": 0.8281, - "step": 13450 - }, - { - "epoch": 0.6237830319888734, - "grad_norm": 3.4390511512756348, - "learning_rate": 7.452865498093571e-06, - "loss": 1.0219, - "step": 13455 - }, - { - "epoch": 0.6240148354195643, - "grad_norm": 2.892617702484131, - "learning_rate": 7.445041750868308e-06, - "loss": 0.7701, - "step": 13460 - }, - { - "epoch": 0.624246638850255, - "grad_norm": 4.317920684814453, - "learning_rate": 7.437219676420742e-06, - "loss": 0.8255, - "step": 13465 - }, - { - "epoch": 0.6244784422809457, - "grad_norm": 4.20429801940918, - "learning_rate": 7.429399279872123e-06, - "loss": 0.9242, - "step": 13470 - }, - { - "epoch": 0.6247102457116366, - "grad_norm": 3.5195364952087402, - "learning_rate": 7.42158056634261e-06, - "loss": 0.8838, - "step": 13475 - }, - { - "epoch": 0.6249420491423273, - "grad_norm": 3.793839931488037, - "learning_rate": 7.413763540951262e-06, - "loss": 0.7726, - "step": 13480 - }, - { - "epoch": 0.625173852573018, - "grad_norm": 4.641863822937012, - "learning_rate": 7.405948208816018e-06, - "loss": 1.0465, - "step": 13485 - }, - { - "epoch": 0.6254056560037089, - "grad_norm": 3.6203348636627197, - "learning_rate": 7.398134575053724e-06, - "loss": 0.878, - "step": 13490 - }, - { - "epoch": 0.6256374594343996, - "grad_norm": 3.926771640777588, - "learning_rate": 7.390322644780107e-06, - "loss": 1.0517, - "step": 13495 - }, - { - "epoch": 0.6258692628650904, - "grad_norm": 3.5244922637939453, - "learning_rate": 7.382512423109781e-06, - "loss": 0.8058, - "step": 13500 - }, - { - "epoch": 0.6258692628650904, - "eval_loss": 0.9085682034492493, - "eval_runtime": 11.2676, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 13500 - }, - { - "epoch": 0.6261010662957812, - "grad_norm": 4.184100151062012, - "learning_rate": 7.374703915156238e-06, - "loss": 0.937, - "step": 13505 - }, - { - "epoch": 0.626332869726472, - "grad_norm": 3.873671054840088, - "learning_rate": 7.3668971260318574e-06, - "loss": 0.9048, - "step": 13510 - }, - { - "epoch": 0.6265646731571627, - "grad_norm": 4.3199849128723145, - "learning_rate": 7.3590920608478766e-06, - "loss": 0.9245, - "step": 13515 - }, - { - "epoch": 0.6267964765878535, - "grad_norm": 4.238365173339844, - "learning_rate": 7.3512887247144185e-06, - "loss": 0.8239, - "step": 13520 - }, - { - "epoch": 0.6270282800185443, - "grad_norm": 3.413118839263916, - "learning_rate": 7.34348712274047e-06, - "loss": 0.8431, - "step": 13525 - }, - { - "epoch": 0.627260083449235, - "grad_norm": 3.717684268951416, - "learning_rate": 7.3356872600338795e-06, - "loss": 0.6615, - "step": 13530 - }, - { - "epoch": 0.6274918868799259, - "grad_norm": 3.8231186866760254, - "learning_rate": 7.327889141701362e-06, - "loss": 0.891, - "step": 13535 - }, - { - "epoch": 0.6277236903106166, - "grad_norm": 4.464356422424316, - "learning_rate": 7.320092772848488e-06, - "loss": 0.889, - "step": 13540 - }, - { - "epoch": 0.6279554937413073, - "grad_norm": 3.6922123432159424, - "learning_rate": 7.31229815857968e-06, - "loss": 0.8991, - "step": 13545 - }, - { - "epoch": 0.6281872971719982, - "grad_norm": 3.411736488342285, - "learning_rate": 7.3045053039982135e-06, - "loss": 0.821, - "step": 13550 - }, - { - "epoch": 0.6284191006026889, - "grad_norm": 3.353180408477783, - "learning_rate": 7.296714214206211e-06, - "loss": 0.8047, - "step": 13555 - }, - { - "epoch": 0.6286509040333796, - "grad_norm": 3.580899477005005, - "learning_rate": 7.288924894304646e-06, - "loss": 1.0161, - "step": 13560 - }, - { - "epoch": 0.6288827074640705, - "grad_norm": 4.194135665893555, - "learning_rate": 7.281137349393326e-06, - "loss": 0.8003, - "step": 13565 - }, - { - "epoch": 0.6291145108947612, - "grad_norm": 3.760871171951294, - "learning_rate": 7.273351584570898e-06, - "loss": 1.0035, - "step": 13570 - }, - { - "epoch": 0.6293463143254521, - "grad_norm": 3.420745611190796, - "learning_rate": 7.26556760493484e-06, - "loss": 0.7368, - "step": 13575 - }, - { - "epoch": 0.6295781177561428, - "grad_norm": 4.322468280792236, - "learning_rate": 7.257785415581471e-06, - "loss": 0.9997, - "step": 13580 - }, - { - "epoch": 0.6298099211868335, - "grad_norm": 4.65591287612915, - "learning_rate": 7.2500050216059305e-06, - "loss": 0.8858, - "step": 13585 - }, - { - "epoch": 0.6300417246175244, - "grad_norm": 3.4827702045440674, - "learning_rate": 7.2422264281021814e-06, - "loss": 0.7796, - "step": 13590 - }, - { - "epoch": 0.6302735280482151, - "grad_norm": 3.829669952392578, - "learning_rate": 7.234449640163015e-06, - "loss": 0.8724, - "step": 13595 - }, - { - "epoch": 0.6305053314789059, - "grad_norm": 3.7582693099975586, - "learning_rate": 7.2266746628800354e-06, - "loss": 0.7856, - "step": 13600 - }, - { - "epoch": 0.6305053314789059, - "eval_loss": 0.9077388048171997, - "eval_runtime": 11.2719, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 13600 - }, - { - "epoch": 0.6307371349095967, - "grad_norm": 3.175899028778076, - "learning_rate": 7.218901501343658e-06, - "loss": 0.8942, - "step": 13605 - }, - { - "epoch": 0.6309689383402874, - "grad_norm": 4.218183994293213, - "learning_rate": 7.211130160643116e-06, - "loss": 0.9087, - "step": 13610 - }, - { - "epoch": 0.6312007417709782, - "grad_norm": 3.943713903427124, - "learning_rate": 7.203360645866446e-06, - "loss": 0.8711, - "step": 13615 - }, - { - "epoch": 0.631432545201669, - "grad_norm": 4.030193328857422, - "learning_rate": 7.195592962100493e-06, - "loss": 0.9171, - "step": 13620 - }, - { - "epoch": 0.6316643486323598, - "grad_norm": 3.495069742202759, - "learning_rate": 7.1878271144309e-06, - "loss": 0.7155, - "step": 13625 - }, - { - "epoch": 0.6318961520630505, - "grad_norm": 3.8679513931274414, - "learning_rate": 7.18006310794211e-06, - "loss": 0.9703, - "step": 13630 - }, - { - "epoch": 0.6321279554937413, - "grad_norm": 4.323716640472412, - "learning_rate": 7.1723009477173565e-06, - "loss": 0.9051, - "step": 13635 - }, - { - "epoch": 0.6323597589244321, - "grad_norm": 3.993604898452759, - "learning_rate": 7.1645406388386685e-06, - "loss": 0.8736, - "step": 13640 - }, - { - "epoch": 0.6325915623551228, - "grad_norm": 3.447650194168091, - "learning_rate": 7.156782186386864e-06, - "loss": 1.0247, - "step": 13645 - }, - { - "epoch": 0.6328233657858137, - "grad_norm": 3.7543070316314697, - "learning_rate": 7.149025595441537e-06, - "loss": 0.7316, - "step": 13650 - }, - { - "epoch": 0.6330551692165044, - "grad_norm": 3.229228973388672, - "learning_rate": 7.141270871081076e-06, - "loss": 0.8748, - "step": 13655 - }, - { - "epoch": 0.6332869726471951, - "grad_norm": 2.9329261779785156, - "learning_rate": 7.13351801838264e-06, - "loss": 0.7545, - "step": 13660 - }, - { - "epoch": 0.633518776077886, - "grad_norm": 3.830026865005493, - "learning_rate": 7.125767042422158e-06, - "loss": 0.9572, - "step": 13665 - }, - { - "epoch": 0.6337505795085767, - "grad_norm": 4.036183834075928, - "learning_rate": 7.118017948274336e-06, - "loss": 0.8909, - "step": 13670 - }, - { - "epoch": 0.6339823829392675, - "grad_norm": 3.7986652851104736, - "learning_rate": 7.110270741012648e-06, - "loss": 0.923, - "step": 13675 - }, - { - "epoch": 0.6342141863699583, - "grad_norm": 3.986788749694824, - "learning_rate": 7.102525425709332e-06, - "loss": 0.8906, - "step": 13680 - }, - { - "epoch": 0.634445989800649, - "grad_norm": 3.509570598602295, - "learning_rate": 7.09478200743539e-06, - "loss": 0.9297, - "step": 13685 - }, - { - "epoch": 0.6346777932313398, - "grad_norm": 4.8821797370910645, - "learning_rate": 7.087040491260575e-06, - "loss": 0.8174, - "step": 13690 - }, - { - "epoch": 0.6349095966620306, - "grad_norm": 3.297886371612549, - "learning_rate": 7.079300882253397e-06, - "loss": 0.8757, - "step": 13695 - }, - { - "epoch": 0.6351414000927214, - "grad_norm": 3.7602527141571045, - "learning_rate": 7.071563185481121e-06, - "loss": 0.8544, - "step": 13700 - }, - { - "epoch": 0.6351414000927214, - "eval_loss": 0.9052226543426514, - "eval_runtime": 11.2739, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 13700 - }, - { - "epoch": 0.6353732035234122, - "grad_norm": 3.7896673679351807, - "learning_rate": 7.06382740600976e-06, - "loss": 1.032, - "step": 13705 - }, - { - "epoch": 0.6356050069541029, - "grad_norm": 4.140953540802002, - "learning_rate": 7.056093548904066e-06, - "loss": 0.893, - "step": 13710 - }, - { - "epoch": 0.6358368103847937, - "grad_norm": 3.7599258422851562, - "learning_rate": 7.048361619227546e-06, - "loss": 0.8545, - "step": 13715 - }, - { - "epoch": 0.6360686138154845, - "grad_norm": 3.4784529209136963, - "learning_rate": 7.04063162204242e-06, - "loss": 1.0141, - "step": 13720 - }, - { - "epoch": 0.6363004172461753, - "grad_norm": 3.0331788063049316, - "learning_rate": 7.0329035624096675e-06, - "loss": 0.902, - "step": 13725 - }, - { - "epoch": 0.636532220676866, - "grad_norm": 3.8510243892669678, - "learning_rate": 7.025177445388987e-06, - "loss": 0.734, - "step": 13730 - }, - { - "epoch": 0.6367640241075568, - "grad_norm": 3.2795825004577637, - "learning_rate": 7.017453276038808e-06, - "loss": 0.926, - "step": 13735 - }, - { - "epoch": 0.6369958275382476, - "grad_norm": 3.572727680206299, - "learning_rate": 7.009731059416287e-06, - "loss": 0.8886, - "step": 13740 - }, - { - "epoch": 0.6372276309689383, - "grad_norm": 3.5403172969818115, - "learning_rate": 7.0020108005773e-06, - "loss": 0.8549, - "step": 13745 - }, - { - "epoch": 0.6374594343996292, - "grad_norm": 4.365976333618164, - "learning_rate": 6.994292504576439e-06, - "loss": 0.8625, - "step": 13750 - }, - { - "epoch": 0.6376912378303199, - "grad_norm": 3.202730417251587, - "learning_rate": 6.986576176467013e-06, - "loss": 0.8318, - "step": 13755 - }, - { - "epoch": 0.6379230412610106, - "grad_norm": 3.6069822311401367, - "learning_rate": 6.9788618213010394e-06, - "loss": 0.9083, - "step": 13760 - }, - { - "epoch": 0.6381548446917015, - "grad_norm": 4.6335768699646, - "learning_rate": 6.971149444129254e-06, - "loss": 0.8498, - "step": 13765 - }, - { - "epoch": 0.6383866481223922, - "grad_norm": 3.619800329208374, - "learning_rate": 6.963439050001086e-06, - "loss": 0.9175, - "step": 13770 - }, - { - "epoch": 0.6386184515530829, - "grad_norm": 4.0841965675354, - "learning_rate": 6.955730643964675e-06, - "loss": 0.9527, - "step": 13775 - }, - { - "epoch": 0.6388502549837738, - "grad_norm": 2.9466323852539062, - "learning_rate": 6.948024231066849e-06, - "loss": 0.799, - "step": 13780 - }, - { - "epoch": 0.6390820584144645, - "grad_norm": 3.827254295349121, - "learning_rate": 6.94031981635314e-06, - "loss": 0.8248, - "step": 13785 - }, - { - "epoch": 0.6393138618451553, - "grad_norm": 4.052628040313721, - "learning_rate": 6.9326174048677695e-06, - "loss": 0.9989, - "step": 13790 - }, - { - "epoch": 0.6395456652758461, - "grad_norm": 3.57578182220459, - "learning_rate": 6.924917001653642e-06, - "loss": 0.823, - "step": 13795 - }, - { - "epoch": 0.6397774687065368, - "grad_norm": 3.9761085510253906, - "learning_rate": 6.917218611752359e-06, - "loss": 0.8732, - "step": 13800 - }, - { - "epoch": 0.6397774687065368, - "eval_loss": 0.9052022695541382, - "eval_runtime": 11.2744, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 13800 - }, - { - "epoch": 0.6400092721372276, - "grad_norm": 3.8727567195892334, - "learning_rate": 6.909522240204194e-06, - "loss": 1.022, - "step": 13805 - }, - { - "epoch": 0.6402410755679184, - "grad_norm": 3.445838212966919, - "learning_rate": 6.9018278920480984e-06, - "loss": 0.7732, - "step": 13810 - }, - { - "epoch": 0.6404728789986092, - "grad_norm": 3.655327558517456, - "learning_rate": 6.894135572321703e-06, - "loss": 0.9275, - "step": 13815 - }, - { - "epoch": 0.6407046824292999, - "grad_norm": 3.729952096939087, - "learning_rate": 6.88644528606131e-06, - "loss": 0.8728, - "step": 13820 - }, - { - "epoch": 0.6409364858599907, - "grad_norm": 3.2580454349517822, - "learning_rate": 6.8787570383018905e-06, - "loss": 0.8933, - "step": 13825 - }, - { - "epoch": 0.6411682892906815, - "grad_norm": 3.6652071475982666, - "learning_rate": 6.8710708340770826e-06, - "loss": 0.7551, - "step": 13830 - }, - { - "epoch": 0.6414000927213723, - "grad_norm": 3.6860642433166504, - "learning_rate": 6.8633866784191834e-06, - "loss": 0.91, - "step": 13835 - }, - { - "epoch": 0.6416318961520631, - "grad_norm": 3.2946152687072754, - "learning_rate": 6.855704576359144e-06, - "loss": 0.855, - "step": 13840 - }, - { - "epoch": 0.6418636995827538, - "grad_norm": 2.9450178146362305, - "learning_rate": 6.848024532926581e-06, - "loss": 0.785, - "step": 13845 - }, - { - "epoch": 0.6420955030134446, - "grad_norm": 5.349577903747559, - "learning_rate": 6.840346553149761e-06, - "loss": 0.9715, - "step": 13850 - }, - { - "epoch": 0.6423273064441354, - "grad_norm": 3.259477376937866, - "learning_rate": 6.83267064205559e-06, - "loss": 0.8353, - "step": 13855 - }, - { - "epoch": 0.6425591098748261, - "grad_norm": 3.9054973125457764, - "learning_rate": 6.824996804669634e-06, - "loss": 1.0056, - "step": 13860 - }, - { - "epoch": 0.642790913305517, - "grad_norm": 4.273472785949707, - "learning_rate": 6.817325046016095e-06, - "loss": 0.9855, - "step": 13865 - }, - { - "epoch": 0.6430227167362077, - "grad_norm": 3.450132131576538, - "learning_rate": 6.809655371117805e-06, - "loss": 0.9567, - "step": 13870 - }, - { - "epoch": 0.6432545201668984, - "grad_norm": 4.0070624351501465, - "learning_rate": 6.8019877849962446e-06, - "loss": 0.9257, - "step": 13875 - }, - { - "epoch": 0.6434863235975893, - "grad_norm": 3.8173329830169678, - "learning_rate": 6.79432229267152e-06, - "loss": 0.882, - "step": 13880 - }, - { - "epoch": 0.64371812702828, - "grad_norm": 3.4225456714630127, - "learning_rate": 6.786658899162373e-06, - "loss": 0.7999, - "step": 13885 - }, - { - "epoch": 0.6439499304589708, - "grad_norm": 4.218738079071045, - "learning_rate": 6.778997609486166e-06, - "loss": 0.904, - "step": 13890 - }, - { - "epoch": 0.6441817338896616, - "grad_norm": 4.1808671951293945, - "learning_rate": 6.77133842865888e-06, - "loss": 0.999, - "step": 13895 - }, - { - "epoch": 0.6444135373203523, - "grad_norm": 3.5177528858184814, - "learning_rate": 6.763681361695125e-06, - "loss": 0.9198, - "step": 13900 - }, - { - "epoch": 0.6444135373203523, - "eval_loss": 0.9041063189506531, - "eval_runtime": 11.2757, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 13900 - }, - { - "epoch": 0.6446453407510431, - "grad_norm": 3.941197156906128, - "learning_rate": 6.756026413608117e-06, - "loss": 0.805, - "step": 13905 - }, - { - "epoch": 0.6448771441817339, - "grad_norm": 3.7070484161376953, - "learning_rate": 6.748373589409696e-06, - "loss": 0.9117, - "step": 13910 - }, - { - "epoch": 0.6451089476124247, - "grad_norm": 5.314979553222656, - "learning_rate": 6.740722894110303e-06, - "loss": 0.8005, - "step": 13915 - }, - { - "epoch": 0.6453407510431154, - "grad_norm": 3.6489126682281494, - "learning_rate": 6.73307433271899e-06, - "loss": 0.7596, - "step": 13920 - }, - { - "epoch": 0.6455725544738062, - "grad_norm": 3.1416983604431152, - "learning_rate": 6.7254279102434015e-06, - "loss": 0.9408, - "step": 13925 - }, - { - "epoch": 0.645804357904497, - "grad_norm": 3.881897449493408, - "learning_rate": 6.717783631689797e-06, - "loss": 0.7941, - "step": 13930 - }, - { - "epoch": 0.6460361613351877, - "grad_norm": 4.629847049713135, - "learning_rate": 6.710141502063023e-06, - "loss": 0.8509, - "step": 13935 - }, - { - "epoch": 0.6462679647658786, - "grad_norm": 3.130946159362793, - "learning_rate": 6.702501526366518e-06, - "loss": 0.8722, - "step": 13940 - }, - { - "epoch": 0.6464997681965693, - "grad_norm": 3.541328191757202, - "learning_rate": 6.69486370960232e-06, - "loss": 0.9263, - "step": 13945 - }, - { - "epoch": 0.6467315716272601, - "grad_norm": 4.076601505279541, - "learning_rate": 6.6872280567710425e-06, - "loss": 0.8394, - "step": 13950 - }, - { - "epoch": 0.6469633750579509, - "grad_norm": 3.7371718883514404, - "learning_rate": 6.679594572871884e-06, - "loss": 0.9085, - "step": 13955 - }, - { - "epoch": 0.6471951784886416, - "grad_norm": 3.3321588039398193, - "learning_rate": 6.6719632629026285e-06, - "loss": 0.7783, - "step": 13960 - }, - { - "epoch": 0.6474269819193325, - "grad_norm": 3.7667531967163086, - "learning_rate": 6.664334131859631e-06, - "loss": 0.8365, - "step": 13965 - }, - { - "epoch": 0.6476587853500232, - "grad_norm": 3.918442487716675, - "learning_rate": 6.6567071847378275e-06, - "loss": 0.7859, - "step": 13970 - }, - { - "epoch": 0.6478905887807139, - "grad_norm": 3.935377836227417, - "learning_rate": 6.649082426530715e-06, - "loss": 0.8009, - "step": 13975 - }, - { - "epoch": 0.6481223922114048, - "grad_norm": 4.223479270935059, - "learning_rate": 6.641459862230366e-06, - "loss": 0.9154, - "step": 13980 - }, - { - "epoch": 0.6483541956420955, - "grad_norm": 3.7977757453918457, - "learning_rate": 6.6338394968274055e-06, - "loss": 0.9013, - "step": 13985 - }, - { - "epoch": 0.6485859990727862, - "grad_norm": 4.023576736450195, - "learning_rate": 6.626221335311029e-06, - "loss": 0.986, - "step": 13990 - }, - { - "epoch": 0.6488178025034771, - "grad_norm": 4.236316204071045, - "learning_rate": 6.618605382668988e-06, - "loss": 1.0194, - "step": 13995 - }, - { - "epoch": 0.6490496059341678, - "grad_norm": 3.5158629417419434, - "learning_rate": 6.610991643887579e-06, - "loss": 0.8858, - "step": 14000 - }, - { - "epoch": 0.6490496059341678, - "eval_loss": 0.9039557576179504, - "eval_runtime": 11.2818, - "eval_samples_per_second": 11.257, - "eval_steps_per_second": 11.257, - "step": 14000 - }, - { - "epoch": 0.6492814093648586, - "grad_norm": 4.082876205444336, - "learning_rate": 6.603380123951663e-06, - "loss": 0.9116, - "step": 14005 - }, - { - "epoch": 0.6495132127955494, - "grad_norm": 3.2537522315979004, - "learning_rate": 6.595770827844637e-06, - "loss": 0.8609, - "step": 14010 - }, - { - "epoch": 0.6497450162262401, - "grad_norm": 3.975658893585205, - "learning_rate": 6.588163760548446e-06, - "loss": 1.0489, - "step": 14015 - }, - { - "epoch": 0.6499768196569309, - "grad_norm": 3.7245466709136963, - "learning_rate": 6.580558927043575e-06, - "loss": 0.799, - "step": 14020 - }, - { - "epoch": 0.6502086230876217, - "grad_norm": 2.8868415355682373, - "learning_rate": 6.572956332309044e-06, - "loss": 0.8372, - "step": 14025 - }, - { - "epoch": 0.6504404265183125, - "grad_norm": 3.727860689163208, - "learning_rate": 6.5653559813224165e-06, - "loss": 0.7826, - "step": 14030 - }, - { - "epoch": 0.6506722299490032, - "grad_norm": 3.944013833999634, - "learning_rate": 6.557757879059779e-06, - "loss": 0.8501, - "step": 14035 - }, - { - "epoch": 0.650904033379694, - "grad_norm": 3.464038610458374, - "learning_rate": 6.550162030495746e-06, - "loss": 0.995, - "step": 14040 - }, - { - "epoch": 0.6511358368103848, - "grad_norm": 4.141759872436523, - "learning_rate": 6.5425684406034565e-06, - "loss": 0.8872, - "step": 14045 - }, - { - "epoch": 0.6513676402410755, - "grad_norm": 3.4732260704040527, - "learning_rate": 6.534977114354573e-06, - "loss": 1.038, - "step": 14050 - }, - { - "epoch": 0.6515994436717664, - "grad_norm": 3.323402166366577, - "learning_rate": 6.527388056719277e-06, - "loss": 0.7998, - "step": 14055 - }, - { - "epoch": 0.6518312471024571, - "grad_norm": 3.9810140132904053, - "learning_rate": 6.519801272666259e-06, - "loss": 0.9836, - "step": 14060 - }, - { - "epoch": 0.6520630505331478, - "grad_norm": 3.6072423458099365, - "learning_rate": 6.512216767162729e-06, - "loss": 0.8642, - "step": 14065 - }, - { - "epoch": 0.6522948539638387, - "grad_norm": 4.4568610191345215, - "learning_rate": 6.504634545174402e-06, - "loss": 0.888, - "step": 14070 - }, - { - "epoch": 0.6525266573945294, - "grad_norm": 3.5873544216156006, - "learning_rate": 6.497054611665492e-06, - "loss": 0.795, - "step": 14075 - }, - { - "epoch": 0.6527584608252203, - "grad_norm": 3.70351505279541, - "learning_rate": 6.489476971598721e-06, - "loss": 0.9346, - "step": 14080 - }, - { - "epoch": 0.652990264255911, - "grad_norm": 3.1498491764068604, - "learning_rate": 6.481901629935305e-06, - "loss": 0.8551, - "step": 14085 - }, - { - "epoch": 0.6532220676866017, - "grad_norm": 4.010249137878418, - "learning_rate": 6.4743285916349664e-06, - "loss": 0.8376, - "step": 14090 - }, - { - "epoch": 0.6534538711172926, - "grad_norm": 4.265456676483154, - "learning_rate": 6.466757861655903e-06, - "loss": 0.9528, - "step": 14095 - }, - { - "epoch": 0.6536856745479833, - "grad_norm": 3.2373321056365967, - "learning_rate": 6.459189444954817e-06, - "loss": 0.9482, - "step": 14100 - }, - { - "epoch": 0.6536856745479833, - "eval_loss": 0.9028105139732361, - "eval_runtime": 11.283, - "eval_samples_per_second": 11.256, - "eval_steps_per_second": 11.256, - "step": 14100 - }, - { - "epoch": 0.653917477978674, - "grad_norm": 3.4183366298675537, - "learning_rate": 6.451623346486883e-06, - "loss": 0.9387, - "step": 14105 - }, - { - "epoch": 0.6541492814093649, - "grad_norm": 3.709038019180298, - "learning_rate": 6.444059571205762e-06, - "loss": 0.8929, - "step": 14110 - }, - { - "epoch": 0.6543810848400556, - "grad_norm": 3.6893160343170166, - "learning_rate": 6.436498124063601e-06, - "loss": 0.8333, - "step": 14115 - }, - { - "epoch": 0.6546128882707464, - "grad_norm": 4.0160346031188965, - "learning_rate": 6.428939010011014e-06, - "loss": 0.963, - "step": 14120 - }, - { - "epoch": 0.6548446917014372, - "grad_norm": 3.783250570297241, - "learning_rate": 6.421382233997094e-06, - "loss": 0.7792, - "step": 14125 - }, - { - "epoch": 0.655076495132128, - "grad_norm": 4.110098838806152, - "learning_rate": 6.413827800969392e-06, - "loss": 0.8569, - "step": 14130 - }, - { - "epoch": 0.6553082985628187, - "grad_norm": 3.531893730163574, - "learning_rate": 6.406275715873941e-06, - "loss": 0.8379, - "step": 14135 - }, - { - "epoch": 0.6555401019935095, - "grad_norm": 3.7876014709472656, - "learning_rate": 6.398725983655225e-06, - "loss": 0.8394, - "step": 14140 - }, - { - "epoch": 0.6557719054242003, - "grad_norm": 3.642598867416382, - "learning_rate": 6.39117860925619e-06, - "loss": 0.8898, - "step": 14145 - }, - { - "epoch": 0.656003708854891, - "grad_norm": 3.1661229133605957, - "learning_rate": 6.383633597618244e-06, - "loss": 0.8901, - "step": 14150 - }, - { - "epoch": 0.6562355122855819, - "grad_norm": 3.3539295196533203, - "learning_rate": 6.376090953681242e-06, - "loss": 0.757, - "step": 14155 - }, - { - "epoch": 0.6564673157162726, - "grad_norm": 3.8540477752685547, - "learning_rate": 6.368550682383487e-06, - "loss": 0.8076, - "step": 14160 - }, - { - "epoch": 0.6566991191469633, - "grad_norm": 3.6921424865722656, - "learning_rate": 6.361012788661736e-06, - "loss": 0.9312, - "step": 14165 - }, - { - "epoch": 0.6569309225776542, - "grad_norm": 4.235689163208008, - "learning_rate": 6.353477277451179e-06, - "loss": 0.9182, - "step": 14170 - }, - { - "epoch": 0.6571627260083449, - "grad_norm": 3.6121301651000977, - "learning_rate": 6.345944153685461e-06, - "loss": 0.8698, - "step": 14175 - }, - { - "epoch": 0.6573945294390356, - "grad_norm": 3.3108012676239014, - "learning_rate": 6.3384134222966495e-06, - "loss": 0.8356, - "step": 14180 - }, - { - "epoch": 0.6576263328697265, - "grad_norm": 4.003962993621826, - "learning_rate": 6.330885088215255e-06, - "loss": 0.8549, - "step": 14185 - }, - { - "epoch": 0.6578581363004172, - "grad_norm": 3.306988000869751, - "learning_rate": 6.32335915637021e-06, - "loss": 0.7874, - "step": 14190 - }, - { - "epoch": 0.6580899397311081, - "grad_norm": 3.650463342666626, - "learning_rate": 6.315835631688885e-06, - "loss": 0.8735, - "step": 14195 - }, - { - "epoch": 0.6583217431617988, - "grad_norm": 3.5259885787963867, - "learning_rate": 6.308314519097063e-06, - "loss": 0.8242, - "step": 14200 - }, - { - "epoch": 0.6583217431617988, - "eval_loss": 0.9013914465904236, - "eval_runtime": 11.2736, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 14200 - }, - { - "epoch": 0.6585535465924895, - "grad_norm": 3.500894784927368, - "learning_rate": 6.300795823518957e-06, - "loss": 0.8101, - "step": 14205 - }, - { - "epoch": 0.6587853500231804, - "grad_norm": 6.990668773651123, - "learning_rate": 6.293279549877194e-06, - "loss": 0.9567, - "step": 14210 - }, - { - "epoch": 0.6590171534538711, - "grad_norm": 3.8730952739715576, - "learning_rate": 6.285765703092817e-06, - "loss": 0.8529, - "step": 14215 - }, - { - "epoch": 0.6592489568845619, - "grad_norm": 3.886561393737793, - "learning_rate": 6.278254288085272e-06, - "loss": 0.9505, - "step": 14220 - }, - { - "epoch": 0.6594807603152527, - "grad_norm": 3.6404316425323486, - "learning_rate": 6.270745309772424e-06, - "loss": 0.7549, - "step": 14225 - }, - { - "epoch": 0.6597125637459434, - "grad_norm": 4.1507182121276855, - "learning_rate": 6.2632387730705345e-06, - "loss": 0.9244, - "step": 14230 - }, - { - "epoch": 0.6599443671766342, - "grad_norm": 4.353921890258789, - "learning_rate": 6.255734682894275e-06, - "loss": 0.9279, - "step": 14235 - }, - { - "epoch": 0.660176170607325, - "grad_norm": 3.7515358924865723, - "learning_rate": 6.248233044156706e-06, - "loss": 0.8284, - "step": 14240 - }, - { - "epoch": 0.6604079740380158, - "grad_norm": 3.5080502033233643, - "learning_rate": 6.240733861769291e-06, - "loss": 0.7591, - "step": 14245 - }, - { - "epoch": 0.6606397774687065, - "grad_norm": 3.134357452392578, - "learning_rate": 6.233237140641874e-06, - "loss": 0.7692, - "step": 14250 - }, - { - "epoch": 0.6608715808993973, - "grad_norm": 3.8307859897613525, - "learning_rate": 6.225742885682699e-06, - "loss": 0.7785, - "step": 14255 - }, - { - "epoch": 0.6611033843300881, - "grad_norm": 3.3515188694000244, - "learning_rate": 6.2182511017983936e-06, - "loss": 0.7713, - "step": 14260 - }, - { - "epoch": 0.6613351877607788, - "grad_norm": 3.2930147647857666, - "learning_rate": 6.2107617938939625e-06, - "loss": 0.7633, - "step": 14265 - }, - { - "epoch": 0.6615669911914697, - "grad_norm": 3.6122865676879883, - "learning_rate": 6.20327496687279e-06, - "loss": 0.8302, - "step": 14270 - }, - { - "epoch": 0.6617987946221604, - "grad_norm": 2.9412949085235596, - "learning_rate": 6.195790625636644e-06, - "loss": 0.7882, - "step": 14275 - }, - { - "epoch": 0.6620305980528511, - "grad_norm": 3.8114194869995117, - "learning_rate": 6.188308775085654e-06, - "loss": 0.9221, - "step": 14280 - }, - { - "epoch": 0.662262401483542, - "grad_norm": 3.629974842071533, - "learning_rate": 6.180829420118326e-06, - "loss": 0.7898, - "step": 14285 - }, - { - "epoch": 0.6624942049142327, - "grad_norm": 3.917085886001587, - "learning_rate": 6.1733525656315256e-06, - "loss": 0.8577, - "step": 14290 - }, - { - "epoch": 0.6627260083449235, - "grad_norm": 3.4944143295288086, - "learning_rate": 6.165878216520492e-06, - "loss": 0.8692, - "step": 14295 - }, - { - "epoch": 0.6629578117756143, - "grad_norm": 3.9613192081451416, - "learning_rate": 6.158406377678816e-06, - "loss": 0.8492, - "step": 14300 - }, - { - "epoch": 0.6629578117756143, - "eval_loss": 0.9019988179206848, - "eval_runtime": 11.2571, - "eval_samples_per_second": 11.282, - "eval_steps_per_second": 11.282, - "step": 14300 - }, - { - "epoch": 0.663189615206305, - "grad_norm": 3.555412769317627, - "learning_rate": 6.1509370539984455e-06, - "loss": 0.9138, - "step": 14305 - }, - { - "epoch": 0.6634214186369958, - "grad_norm": 3.3379549980163574, - "learning_rate": 6.143470250369682e-06, - "loss": 0.8428, - "step": 14310 - }, - { - "epoch": 0.6636532220676866, - "grad_norm": 3.9386861324310303, - "learning_rate": 6.136005971681176e-06, - "loss": 0.9693, - "step": 14315 - }, - { - "epoch": 0.6638850254983774, - "grad_norm": 3.368528366088867, - "learning_rate": 6.1285442228199325e-06, - "loss": 0.7739, - "step": 14320 - }, - { - "epoch": 0.6641168289290682, - "grad_norm": 3.3850362300872803, - "learning_rate": 6.121085008671288e-06, - "loss": 0.965, - "step": 14325 - }, - { - "epoch": 0.6643486323597589, - "grad_norm": 3.772986888885498, - "learning_rate": 6.113628334118929e-06, - "loss": 0.9258, - "step": 14330 - }, - { - "epoch": 0.6645804357904497, - "grad_norm": 3.1928980350494385, - "learning_rate": 6.10617420404488e-06, - "loss": 0.7586, - "step": 14335 - }, - { - "epoch": 0.6648122392211405, - "grad_norm": 3.8362560272216797, - "learning_rate": 6.098722623329489e-06, - "loss": 0.9274, - "step": 14340 - }, - { - "epoch": 0.6650440426518313, - "grad_norm": 3.8659775257110596, - "learning_rate": 6.091273596851446e-06, - "loss": 0.7597, - "step": 14345 - }, - { - "epoch": 0.665275846082522, - "grad_norm": 3.232468605041504, - "learning_rate": 6.0838271294877586e-06, - "loss": 0.8773, - "step": 14350 - }, - { - "epoch": 0.6655076495132128, - "grad_norm": 3.664357900619507, - "learning_rate": 6.076383226113771e-06, - "loss": 0.9295, - "step": 14355 - }, - { - "epoch": 0.6657394529439036, - "grad_norm": 3.904632091522217, - "learning_rate": 6.0689418916031426e-06, - "loss": 0.9202, - "step": 14360 - }, - { - "epoch": 0.6659712563745943, - "grad_norm": 3.8306760787963867, - "learning_rate": 6.061503130827846e-06, - "loss": 0.9047, - "step": 14365 - }, - { - "epoch": 0.6662030598052852, - "grad_norm": 4.240020751953125, - "learning_rate": 6.054066948658175e-06, - "loss": 0.9071, - "step": 14370 - }, - { - "epoch": 0.6664348632359759, - "grad_norm": 3.161769390106201, - "learning_rate": 6.046633349962731e-06, - "loss": 0.8829, - "step": 14375 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 3.986294984817505, - "learning_rate": 6.039202339608432e-06, - "loss": 0.8351, - "step": 14380 - }, - { - "epoch": 0.6668984700973575, - "grad_norm": 3.352684497833252, - "learning_rate": 6.031773922460493e-06, - "loss": 0.8495, - "step": 14385 - }, - { - "epoch": 0.6671302735280482, - "grad_norm": 4.166463851928711, - "learning_rate": 6.024348103382436e-06, - "loss": 1.0153, - "step": 14390 - }, - { - "epoch": 0.667362076958739, - "grad_norm": 4.052969932556152, - "learning_rate": 6.016924887236077e-06, - "loss": 0.7314, - "step": 14395 - }, - { - "epoch": 0.6675938803894298, - "grad_norm": 3.7992799282073975, - "learning_rate": 6.009504278881529e-06, - "loss": 0.8469, - "step": 14400 - }, - { - "epoch": 0.6675938803894298, - "eval_loss": 0.9014151096343994, - "eval_runtime": 11.2535, - "eval_samples_per_second": 11.285, - "eval_steps_per_second": 11.285, - "step": 14400 - }, - { - "epoch": 0.6678256838201205, - "grad_norm": 4.891984462738037, - "learning_rate": 6.002086283177203e-06, - "loss": 0.9517, - "step": 14405 - }, - { - "epoch": 0.6680574872508113, - "grad_norm": 3.334757089614868, - "learning_rate": 5.994670904979793e-06, - "loss": 0.8671, - "step": 14410 - }, - { - "epoch": 0.6682892906815021, - "grad_norm": 3.49668288230896, - "learning_rate": 5.987258149144289e-06, - "loss": 0.8352, - "step": 14415 - }, - { - "epoch": 0.6685210941121928, - "grad_norm": 4.175472259521484, - "learning_rate": 5.979848020523953e-06, - "loss": 1.0171, - "step": 14420 - }, - { - "epoch": 0.6687528975428836, - "grad_norm": 3.3329074382781982, - "learning_rate": 5.972440523970329e-06, - "loss": 0.9542, - "step": 14425 - }, - { - "epoch": 0.6689847009735744, - "grad_norm": 3.4944443702697754, - "learning_rate": 5.965035664333241e-06, - "loss": 0.7861, - "step": 14430 - }, - { - "epoch": 0.6692165044042652, - "grad_norm": 5.090800762176514, - "learning_rate": 5.957633446460783e-06, - "loss": 1.0441, - "step": 14435 - }, - { - "epoch": 0.6694483078349559, - "grad_norm": 3.310312032699585, - "learning_rate": 5.950233875199328e-06, - "loss": 0.7745, - "step": 14440 - }, - { - "epoch": 0.6696801112656467, - "grad_norm": 3.937577962875366, - "learning_rate": 5.942836955393507e-06, - "loss": 1.1439, - "step": 14445 - }, - { - "epoch": 0.6699119146963375, - "grad_norm": 4.166878700256348, - "learning_rate": 5.93544269188622e-06, - "loss": 0.7785, - "step": 14450 - }, - { - "epoch": 0.6701437181270283, - "grad_norm": 3.7805533409118652, - "learning_rate": 5.928051089518623e-06, - "loss": 0.7695, - "step": 14455 - }, - { - "epoch": 0.6703755215577191, - "grad_norm": 4.2814249992370605, - "learning_rate": 5.92066215313013e-06, - "loss": 0.9514, - "step": 14460 - }, - { - "epoch": 0.6706073249884098, - "grad_norm": 3.207496166229248, - "learning_rate": 5.913275887558418e-06, - "loss": 0.7387, - "step": 14465 - }, - { - "epoch": 0.6708391284191006, - "grad_norm": 3.688148021697998, - "learning_rate": 5.905892297639408e-06, - "loss": 0.9069, - "step": 14470 - }, - { - "epoch": 0.6710709318497914, - "grad_norm": 3.606431245803833, - "learning_rate": 5.898511388207269e-06, - "loss": 0.8267, - "step": 14475 - }, - { - "epoch": 0.6713027352804821, - "grad_norm": 4.078047752380371, - "learning_rate": 5.89113316409442e-06, - "loss": 1.0075, - "step": 14480 - }, - { - "epoch": 0.671534538711173, - "grad_norm": 3.6268293857574463, - "learning_rate": 5.883757630131517e-06, - "loss": 0.8306, - "step": 14485 - }, - { - "epoch": 0.6717663421418637, - "grad_norm": 3.7984752655029297, - "learning_rate": 5.8763847911474555e-06, - "loss": 0.9554, - "step": 14490 - }, - { - "epoch": 0.6719981455725544, - "grad_norm": 3.4812657833099365, - "learning_rate": 5.869014651969366e-06, - "loss": 0.9154, - "step": 14495 - }, - { - "epoch": 0.6722299490032453, - "grad_norm": 3.199309825897217, - "learning_rate": 5.8616472174226165e-06, - "loss": 0.5889, - "step": 14500 - }, - { - "epoch": 0.6722299490032453, - "eval_loss": 0.9003604054450989, - "eval_runtime": 11.3032, - "eval_samples_per_second": 11.236, - "eval_steps_per_second": 11.236, - "step": 14500 - }, - { - "epoch": 0.672461752433936, - "grad_norm": 4.093687057495117, - "learning_rate": 5.8542824923308e-06, - "loss": 0.8131, - "step": 14505 - }, - { - "epoch": 0.6726935558646268, - "grad_norm": 4.430140018463135, - "learning_rate": 5.846920481515736e-06, - "loss": 0.9598, - "step": 14510 - }, - { - "epoch": 0.6729253592953176, - "grad_norm": 3.939739465713501, - "learning_rate": 5.839561189797464e-06, - "loss": 0.8837, - "step": 14515 - }, - { - "epoch": 0.6731571627260083, - "grad_norm": 3.9287960529327393, - "learning_rate": 5.832204621994251e-06, - "loss": 0.6834, - "step": 14520 - }, - { - "epoch": 0.6733889661566991, - "grad_norm": 4.755311012268066, - "learning_rate": 5.82485078292257e-06, - "loss": 0.8228, - "step": 14525 - }, - { - "epoch": 0.6736207695873899, - "grad_norm": 3.7588210105895996, - "learning_rate": 5.817499677397116e-06, - "loss": 0.8636, - "step": 14530 - }, - { - "epoch": 0.6738525730180807, - "grad_norm": 3.697885751724243, - "learning_rate": 5.810151310230798e-06, - "loss": 0.783, - "step": 14535 - }, - { - "epoch": 0.6740843764487714, - "grad_norm": 3.8682544231414795, - "learning_rate": 5.802805686234721e-06, - "loss": 0.9349, - "step": 14540 - }, - { - "epoch": 0.6743161798794622, - "grad_norm": 3.55385684967041, - "learning_rate": 5.795462810218191e-06, - "loss": 0.9123, - "step": 14545 - }, - { - "epoch": 0.674547983310153, - "grad_norm": 3.61362624168396, - "learning_rate": 5.788122686988732e-06, - "loss": 0.8943, - "step": 14550 - }, - { - "epoch": 0.6747797867408437, - "grad_norm": 4.1845245361328125, - "learning_rate": 5.7807853213520584e-06, - "loss": 0.9365, - "step": 14555 - }, - { - "epoch": 0.6750115901715346, - "grad_norm": 3.7930006980895996, - "learning_rate": 5.773450718112068e-06, - "loss": 0.7716, - "step": 14560 - }, - { - "epoch": 0.6752433936022253, - "grad_norm": 3.108151435852051, - "learning_rate": 5.766118882070864e-06, - "loss": 0.7694, - "step": 14565 - }, - { - "epoch": 0.6754751970329161, - "grad_norm": 4.250346660614014, - "learning_rate": 5.758789818028741e-06, - "loss": 0.9389, - "step": 14570 - }, - { - "epoch": 0.6757070004636069, - "grad_norm": 3.8213279247283936, - "learning_rate": 5.751463530784154e-06, - "loss": 0.9436, - "step": 14575 - }, - { - "epoch": 0.6759388038942976, - "grad_norm": 3.573568105697632, - "learning_rate": 5.7441400251337656e-06, - "loss": 0.8428, - "step": 14580 - }, - { - "epoch": 0.6761706073249885, - "grad_norm": 3.095245122909546, - "learning_rate": 5.736819305872407e-06, - "loss": 0.9342, - "step": 14585 - }, - { - "epoch": 0.6764024107556792, - "grad_norm": 3.4886996746063232, - "learning_rate": 5.729501377793094e-06, - "loss": 0.7714, - "step": 14590 - }, - { - "epoch": 0.6766342141863699, - "grad_norm": 3.144732713699341, - "learning_rate": 5.722186245687e-06, - "loss": 0.8221, - "step": 14595 - }, - { - "epoch": 0.6768660176170608, - "grad_norm": 3.8869521617889404, - "learning_rate": 5.71487391434347e-06, - "loss": 0.831, - "step": 14600 - }, - { - "epoch": 0.6768660176170608, - "eval_loss": 0.8991519808769226, - "eval_runtime": 11.2746, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 14600 - }, - { - "epoch": 0.6770978210477515, - "grad_norm": 3.6212191581726074, - "learning_rate": 5.707564388550033e-06, - "loss": 0.8467, - "step": 14605 - }, - { - "epoch": 0.6773296244784422, - "grad_norm": 3.0634665489196777, - "learning_rate": 5.7002576730923575e-06, - "loss": 0.8487, - "step": 14610 - }, - { - "epoch": 0.6775614279091331, - "grad_norm": 3.5993947982788086, - "learning_rate": 5.692953772754287e-06, - "loss": 0.7163, - "step": 14615 - }, - { - "epoch": 0.6777932313398238, - "grad_norm": 3.701159954071045, - "learning_rate": 5.685652692317825e-06, - "loss": 0.8995, - "step": 14620 - }, - { - "epoch": 0.6780250347705146, - "grad_norm": 3.198442220687866, - "learning_rate": 5.6783544365631135e-06, - "loss": 0.6766, - "step": 14625 - }, - { - "epoch": 0.6782568382012054, - "grad_norm": 3.342956304550171, - "learning_rate": 5.671059010268452e-06, - "loss": 0.6306, - "step": 14630 - }, - { - "epoch": 0.6784886416318961, - "grad_norm": 4.532590389251709, - "learning_rate": 5.663766418210293e-06, - "loss": 0.9384, - "step": 14635 - }, - { - "epoch": 0.6787204450625869, - "grad_norm": 3.206352949142456, - "learning_rate": 5.656476665163233e-06, - "loss": 0.7051, - "step": 14640 - }, - { - "epoch": 0.6789522484932777, - "grad_norm": 3.7517313957214355, - "learning_rate": 5.649189755899999e-06, - "loss": 0.9279, - "step": 14645 - }, - { - "epoch": 0.6791840519239685, - "grad_norm": 3.682551383972168, - "learning_rate": 5.641905695191465e-06, - "loss": 0.7678, - "step": 14650 - }, - { - "epoch": 0.6794158553546592, - "grad_norm": 3.4281675815582275, - "learning_rate": 5.634624487806644e-06, - "loss": 0.8488, - "step": 14655 - }, - { - "epoch": 0.67964765878535, - "grad_norm": 3.929577112197876, - "learning_rate": 5.627346138512671e-06, - "loss": 0.7632, - "step": 14660 - }, - { - "epoch": 0.6798794622160408, - "grad_norm": 3.8639705181121826, - "learning_rate": 5.620070652074808e-06, - "loss": 0.9047, - "step": 14665 - }, - { - "epoch": 0.6801112656467315, - "grad_norm": 3.5569334030151367, - "learning_rate": 5.612798033256454e-06, - "loss": 0.8764, - "step": 14670 - }, - { - "epoch": 0.6803430690774224, - "grad_norm": 3.9777040481567383, - "learning_rate": 5.605528286819131e-06, - "loss": 0.9697, - "step": 14675 - }, - { - "epoch": 0.6805748725081131, - "grad_norm": 4.220929145812988, - "learning_rate": 5.598261417522462e-06, - "loss": 0.8711, - "step": 14680 - }, - { - "epoch": 0.6808066759388038, - "grad_norm": 4.128261089324951, - "learning_rate": 5.5909974301242095e-06, - "loss": 0.8907, - "step": 14685 - }, - { - "epoch": 0.6810384793694947, - "grad_norm": 3.572897434234619, - "learning_rate": 5.5837363293802335e-06, - "loss": 0.7123, - "step": 14690 - }, - { - "epoch": 0.6812702828001854, - "grad_norm": 4.047999382019043, - "learning_rate": 5.576478120044506e-06, - "loss": 0.8807, - "step": 14695 - }, - { - "epoch": 0.6815020862308763, - "grad_norm": 4.205711364746094, - "learning_rate": 5.5692228068691125e-06, - "loss": 0.9255, - "step": 14700 - }, - { - "epoch": 0.6815020862308763, - "eval_loss": 0.8986382484436035, - "eval_runtime": 11.2621, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 14700 - }, - { - "epoch": 0.681733889661567, - "grad_norm": 3.3091366291046143, - "learning_rate": 5.5619703946042405e-06, - "loss": 0.6813, - "step": 14705 - }, - { - "epoch": 0.6819656930922577, - "grad_norm": 3.9704837799072266, - "learning_rate": 5.55472088799817e-06, - "loss": 0.8628, - "step": 14710 - }, - { - "epoch": 0.6821974965229486, - "grad_norm": 3.0079431533813477, - "learning_rate": 5.547474291797293e-06, - "loss": 0.8371, - "step": 14715 - }, - { - "epoch": 0.6824292999536393, - "grad_norm": 4.201705455780029, - "learning_rate": 5.540230610746082e-06, - "loss": 0.8795, - "step": 14720 - }, - { - "epoch": 0.6826611033843301, - "grad_norm": 3.8901288509368896, - "learning_rate": 5.53298984958711e-06, - "loss": 1.0424, - "step": 14725 - }, - { - "epoch": 0.6828929068150209, - "grad_norm": 3.0045812129974365, - "learning_rate": 5.525752013061032e-06, - "loss": 0.6785, - "step": 14730 - }, - { - "epoch": 0.6831247102457116, - "grad_norm": 4.07247257232666, - "learning_rate": 5.518517105906593e-06, - "loss": 0.903, - "step": 14735 - }, - { - "epoch": 0.6833565136764024, - "grad_norm": 4.029781341552734, - "learning_rate": 5.5112851328606235e-06, - "loss": 0.9391, - "step": 14740 - }, - { - "epoch": 0.6835883171070932, - "grad_norm": 3.749738931655884, - "learning_rate": 5.504056098658022e-06, - "loss": 0.8587, - "step": 14745 - }, - { - "epoch": 0.683820120537784, - "grad_norm": 3.17695689201355, - "learning_rate": 5.496830008031768e-06, - "loss": 0.7638, - "step": 14750 - }, - { - "epoch": 0.6840519239684747, - "grad_norm": 3.8042185306549072, - "learning_rate": 5.489606865712915e-06, - "loss": 0.8975, - "step": 14755 - }, - { - "epoch": 0.6842837273991655, - "grad_norm": 3.345367670059204, - "learning_rate": 5.482386676430593e-06, - "loss": 0.8314, - "step": 14760 - }, - { - "epoch": 0.6845155308298563, - "grad_norm": 3.19871187210083, - "learning_rate": 5.475169444911982e-06, - "loss": 0.7643, - "step": 14765 - }, - { - "epoch": 0.684747334260547, - "grad_norm": 3.753889799118042, - "learning_rate": 5.467955175882338e-06, - "loss": 0.7962, - "step": 14770 - }, - { - "epoch": 0.6849791376912379, - "grad_norm": 4.27974271774292, - "learning_rate": 5.460743874064984e-06, - "loss": 0.7481, - "step": 14775 - }, - { - "epoch": 0.6852109411219286, - "grad_norm": 3.5102105140686035, - "learning_rate": 5.453535544181274e-06, - "loss": 0.7998, - "step": 14780 - }, - { - "epoch": 0.6854427445526193, - "grad_norm": 3.4035680294036865, - "learning_rate": 5.4463301909506415e-06, - "loss": 0.8603, - "step": 14785 - }, - { - "epoch": 0.6856745479833102, - "grad_norm": 3.453497886657715, - "learning_rate": 5.439127819090566e-06, - "loss": 0.7311, - "step": 14790 - }, - { - "epoch": 0.6859063514140009, - "grad_norm": 4.089419841766357, - "learning_rate": 5.431928433316562e-06, - "loss": 0.7608, - "step": 14795 - }, - { - "epoch": 0.6861381548446917, - "grad_norm": 3.4193642139434814, - "learning_rate": 5.424732038342204e-06, - "loss": 0.9064, - "step": 14800 - }, - { - "epoch": 0.6861381548446917, - "eval_loss": 0.8975695371627808, - "eval_runtime": 11.2774, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 14800 - }, - { - "epoch": 0.6863699582753825, - "grad_norm": 4.701401710510254, - "learning_rate": 5.4175386388791075e-06, - "loss": 0.8915, - "step": 14805 - }, - { - "epoch": 0.6866017617060732, - "grad_norm": 3.8055918216705322, - "learning_rate": 5.410348239636918e-06, - "loss": 0.987, - "step": 14810 - }, - { - "epoch": 0.6868335651367641, - "grad_norm": 3.4142491817474365, - "learning_rate": 5.4031608453233145e-06, - "loss": 0.7781, - "step": 14815 - }, - { - "epoch": 0.6870653685674548, - "grad_norm": 3.7377047538757324, - "learning_rate": 5.39597646064402e-06, - "loss": 0.8501, - "step": 14820 - }, - { - "epoch": 0.6872971719981456, - "grad_norm": 3.767622470855713, - "learning_rate": 5.388795090302788e-06, - "loss": 0.804, - "step": 14825 - }, - { - "epoch": 0.6875289754288364, - "grad_norm": 3.2815511226654053, - "learning_rate": 5.381616739001386e-06, - "loss": 0.8258, - "step": 14830 - }, - { - "epoch": 0.6877607788595271, - "grad_norm": 3.6691720485687256, - "learning_rate": 5.374441411439609e-06, - "loss": 0.8332, - "step": 14835 - }, - { - "epoch": 0.6879925822902179, - "grad_norm": 3.6556670665740967, - "learning_rate": 5.367269112315278e-06, - "loss": 0.8806, - "step": 14840 - }, - { - "epoch": 0.6882243857209087, - "grad_norm": 3.983771324157715, - "learning_rate": 5.360099846324232e-06, - "loss": 0.9924, - "step": 14845 - }, - { - "epoch": 0.6884561891515995, - "grad_norm": 3.640467405319214, - "learning_rate": 5.352933618160311e-06, - "loss": 0.9716, - "step": 14850 - }, - { - "epoch": 0.6886879925822902, - "grad_norm": 3.7547876834869385, - "learning_rate": 5.345770432515381e-06, - "loss": 0.9835, - "step": 14855 - }, - { - "epoch": 0.688919796012981, - "grad_norm": 3.748774528503418, - "learning_rate": 5.338610294079311e-06, - "loss": 0.7202, - "step": 14860 - }, - { - "epoch": 0.6891515994436718, - "grad_norm": 3.7663495540618896, - "learning_rate": 5.331453207539974e-06, - "loss": 0.8268, - "step": 14865 - }, - { - "epoch": 0.6893834028743625, - "grad_norm": 3.5820882320404053, - "learning_rate": 5.324299177583239e-06, - "loss": 0.7453, - "step": 14870 - }, - { - "epoch": 0.6896152063050534, - "grad_norm": 4.511739253997803, - "learning_rate": 5.317148208892982e-06, - "loss": 0.8281, - "step": 14875 - }, - { - "epoch": 0.6898470097357441, - "grad_norm": 3.408353567123413, - "learning_rate": 5.3100003061510785e-06, - "loss": 0.8948, - "step": 14880 - }, - { - "epoch": 0.6900788131664348, - "grad_norm": 3.573390483856201, - "learning_rate": 5.302855474037383e-06, - "loss": 0.8971, - "step": 14885 - }, - { - "epoch": 0.6903106165971257, - "grad_norm": 4.010354518890381, - "learning_rate": 5.295713717229753e-06, - "loss": 0.8969, - "step": 14890 - }, - { - "epoch": 0.6905424200278164, - "grad_norm": 3.8016562461853027, - "learning_rate": 5.288575040404025e-06, - "loss": 0.719, - "step": 14895 - }, - { - "epoch": 0.6907742234585071, - "grad_norm": 3.699970245361328, - "learning_rate": 5.281439448234016e-06, - "loss": 0.8287, - "step": 14900 - }, - { - "epoch": 0.6907742234585071, - "eval_loss": 0.8960445523262024, - "eval_runtime": 11.2607, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 14900 - }, - { - "epoch": 0.691006026889198, - "grad_norm": 4.231236934661865, - "learning_rate": 5.274306945391532e-06, - "loss": 1.005, - "step": 14905 - }, - { - "epoch": 0.6912378303198887, - "grad_norm": 4.223328590393066, - "learning_rate": 5.267177536546355e-06, - "loss": 0.9441, - "step": 14910 - }, - { - "epoch": 0.6914696337505795, - "grad_norm": 3.7817633152008057, - "learning_rate": 5.260051226366234e-06, - "loss": 0.8287, - "step": 14915 - }, - { - "epoch": 0.6917014371812703, - "grad_norm": 3.6774466037750244, - "learning_rate": 5.2529280195168996e-06, - "loss": 0.8102, - "step": 14920 - }, - { - "epoch": 0.691933240611961, - "grad_norm": 3.3471529483795166, - "learning_rate": 5.245807920662038e-06, - "loss": 0.7598, - "step": 14925 - }, - { - "epoch": 0.6921650440426518, - "grad_norm": 3.2454981803894043, - "learning_rate": 5.238690934463315e-06, - "loss": 0.7236, - "step": 14930 - }, - { - "epoch": 0.6923968474733426, - "grad_norm": 4.209705829620361, - "learning_rate": 5.231577065580345e-06, - "loss": 0.976, - "step": 14935 - }, - { - "epoch": 0.6926286509040334, - "grad_norm": 3.823845386505127, - "learning_rate": 5.2244663186707075e-06, - "loss": 0.9594, - "step": 14940 - }, - { - "epoch": 0.6928604543347242, - "grad_norm": 3.8407392501831055, - "learning_rate": 5.217358698389945e-06, - "loss": 0.8413, - "step": 14945 - }, - { - "epoch": 0.6930922577654149, - "grad_norm": 3.701613664627075, - "learning_rate": 5.210254209391543e-06, - "loss": 0.7563, - "step": 14950 - }, - { - "epoch": 0.6933240611961057, - "grad_norm": 3.6802921295166016, - "learning_rate": 5.203152856326933e-06, - "loss": 0.8716, - "step": 14955 - }, - { - "epoch": 0.6935558646267965, - "grad_norm": 3.6844983100891113, - "learning_rate": 5.196054643845505e-06, - "loss": 0.9297, - "step": 14960 - }, - { - "epoch": 0.6937876680574873, - "grad_norm": 3.931682825088501, - "learning_rate": 5.188959576594593e-06, - "loss": 0.8128, - "step": 14965 - }, - { - "epoch": 0.694019471488178, - "grad_norm": 3.934149980545044, - "learning_rate": 5.181867659219457e-06, - "loss": 0.8199, - "step": 14970 - }, - { - "epoch": 0.6942512749188688, - "grad_norm": 3.853133201599121, - "learning_rate": 5.174778896363312e-06, - "loss": 0.9809, - "step": 14975 - }, - { - "epoch": 0.6944830783495596, - "grad_norm": 4.392022609710693, - "learning_rate": 5.167693292667304e-06, - "loss": 1.0305, - "step": 14980 - }, - { - "epoch": 0.6947148817802503, - "grad_norm": 4.075222492218018, - "learning_rate": 5.160610852770493e-06, - "loss": 0.9731, - "step": 14985 - }, - { - "epoch": 0.6949466852109412, - "grad_norm": 4.099298000335693, - "learning_rate": 5.153531581309887e-06, - "loss": 0.9576, - "step": 14990 - }, - { - "epoch": 0.6951784886416319, - "grad_norm": 4.013690948486328, - "learning_rate": 5.146455482920423e-06, - "loss": 0.7349, - "step": 14995 - }, - { - "epoch": 0.6954102920723226, - "grad_norm": 3.4314472675323486, - "learning_rate": 5.139382562234938e-06, - "loss": 0.8236, - "step": 15000 - }, - { - "epoch": 0.6954102920723226, - "eval_loss": 0.8956095576286316, - "eval_runtime": 11.2867, - "eval_samples_per_second": 11.252, - "eval_steps_per_second": 11.252, - "step": 15000 - }, - { - "epoch": 0.6956420955030135, - "grad_norm": 2.756856679916382, - "learning_rate": 5.132312823884209e-06, - "loss": 0.8249, - "step": 15005 - }, - { - "epoch": 0.6958738989337042, - "grad_norm": 3.730117082595825, - "learning_rate": 5.125246272496927e-06, - "loss": 0.8329, - "step": 15010 - }, - { - "epoch": 0.696105702364395, - "grad_norm": 3.479635715484619, - "learning_rate": 5.118182912699684e-06, - "loss": 0.8444, - "step": 15015 - }, - { - "epoch": 0.6963375057950858, - "grad_norm": 3.8835439682006836, - "learning_rate": 5.11112274911699e-06, - "loss": 0.8925, - "step": 15020 - }, - { - "epoch": 0.6965693092257765, - "grad_norm": 4.538135051727295, - "learning_rate": 5.1040657863712666e-06, - "loss": 0.911, - "step": 15025 - }, - { - "epoch": 0.6968011126564673, - "grad_norm": 3.9026095867156982, - "learning_rate": 5.097012029082837e-06, - "loss": 0.9225, - "step": 15030 - }, - { - "epoch": 0.6970329160871581, - "grad_norm": 4.518922805786133, - "learning_rate": 5.0899614818699205e-06, - "loss": 1.0216, - "step": 15035 - }, - { - "epoch": 0.6972647195178489, - "grad_norm": 4.434970855712891, - "learning_rate": 5.0829141493486435e-06, - "loss": 0.7916, - "step": 15040 - }, - { - "epoch": 0.6974965229485396, - "grad_norm": 3.540708541870117, - "learning_rate": 5.075870036133015e-06, - "loss": 0.7286, - "step": 15045 - }, - { - "epoch": 0.6977283263792304, - "grad_norm": 3.813382387161255, - "learning_rate": 5.068829146834955e-06, - "loss": 0.6732, - "step": 15050 - }, - { - "epoch": 0.6979601298099212, - "grad_norm": 3.9284863471984863, - "learning_rate": 5.061791486064251e-06, - "loss": 0.7924, - "step": 15055 - }, - { - "epoch": 0.6981919332406119, - "grad_norm": 3.7859926223754883, - "learning_rate": 5.054757058428595e-06, - "loss": 1.0506, - "step": 15060 - }, - { - "epoch": 0.6984237366713028, - "grad_norm": 4.451103687286377, - "learning_rate": 5.047725868533557e-06, - "loss": 0.789, - "step": 15065 - }, - { - "epoch": 0.6986555401019935, - "grad_norm": 4.454600811004639, - "learning_rate": 5.040697920982576e-06, - "loss": 0.8248, - "step": 15070 - }, - { - "epoch": 0.6988873435326843, - "grad_norm": 3.176328659057617, - "learning_rate": 5.03367322037698e-06, - "loss": 0.8367, - "step": 15075 - }, - { - "epoch": 0.6991191469633751, - "grad_norm": 3.6452035903930664, - "learning_rate": 5.026651771315972e-06, - "loss": 0.9312, - "step": 15080 - }, - { - "epoch": 0.6993509503940658, - "grad_norm": 3.4964804649353027, - "learning_rate": 5.0196335783966255e-06, - "loss": 0.9832, - "step": 15085 - }, - { - "epoch": 0.6995827538247567, - "grad_norm": 3.5905039310455322, - "learning_rate": 5.012618646213869e-06, - "loss": 0.8185, - "step": 15090 - }, - { - "epoch": 0.6998145572554474, - "grad_norm": 4.217673301696777, - "learning_rate": 5.0056069793605175e-06, - "loss": 0.8456, - "step": 15095 - }, - { - "epoch": 0.7000463606861381, - "grad_norm": 3.272371292114258, - "learning_rate": 4.998598582427231e-06, - "loss": 0.8687, - "step": 15100 - }, - { - "epoch": 0.7000463606861381, - "eval_loss": 0.8956237435340881, - "eval_runtime": 11.2682, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 15100 - }, - { - "epoch": 0.700278164116829, - "grad_norm": 3.845468044281006, - "learning_rate": 4.991593460002532e-06, - "loss": 0.8925, - "step": 15105 - }, - { - "epoch": 0.7005099675475197, - "grad_norm": 4.131564617156982, - "learning_rate": 4.984591616672805e-06, - "loss": 0.8506, - "step": 15110 - }, - { - "epoch": 0.7007417709782104, - "grad_norm": 3.886279344558716, - "learning_rate": 4.977593057022286e-06, - "loss": 0.8359, - "step": 15115 - }, - { - "epoch": 0.7009735744089013, - "grad_norm": 3.745542049407959, - "learning_rate": 4.9705977856330566e-06, - "loss": 0.6869, - "step": 15120 - }, - { - "epoch": 0.701205377839592, - "grad_norm": 4.212728977203369, - "learning_rate": 4.963605807085052e-06, - "loss": 0.8538, - "step": 15125 - }, - { - "epoch": 0.7014371812702828, - "grad_norm": 4.162337779998779, - "learning_rate": 4.956617125956042e-06, - "loss": 0.8974, - "step": 15130 - }, - { - "epoch": 0.7016689847009736, - "grad_norm": 4.083733558654785, - "learning_rate": 4.9496317468216505e-06, - "loss": 0.8506, - "step": 15135 - }, - { - "epoch": 0.7019007881316643, - "grad_norm": 3.7863776683807373, - "learning_rate": 4.942649674255326e-06, - "loss": 0.8284, - "step": 15140 - }, - { - "epoch": 0.7021325915623551, - "grad_norm": 3.9084062576293945, - "learning_rate": 4.93567091282836e-06, - "loss": 0.7824, - "step": 15145 - }, - { - "epoch": 0.7023643949930459, - "grad_norm": 4.295900344848633, - "learning_rate": 4.928695467109879e-06, - "loss": 0.8566, - "step": 15150 - }, - { - "epoch": 0.7025961984237367, - "grad_norm": 3.5533487796783447, - "learning_rate": 4.921723341666833e-06, - "loss": 0.9455, - "step": 15155 - }, - { - "epoch": 0.7028280018544274, - "grad_norm": 3.6774909496307373, - "learning_rate": 4.914754541063994e-06, - "loss": 0.8561, - "step": 15160 - }, - { - "epoch": 0.7030598052851182, - "grad_norm": 3.6324589252471924, - "learning_rate": 4.907789069863965e-06, - "loss": 0.662, - "step": 15165 - }, - { - "epoch": 0.703291608715809, - "grad_norm": 4.190369129180908, - "learning_rate": 4.900826932627174e-06, - "loss": 0.9028, - "step": 15170 - }, - { - "epoch": 0.7035234121464997, - "grad_norm": 4.633210182189941, - "learning_rate": 4.893868133911849e-06, - "loss": 0.9034, - "step": 15175 - }, - { - "epoch": 0.7037552155771906, - "grad_norm": 3.308100700378418, - "learning_rate": 4.886912678274046e-06, - "loss": 0.8148, - "step": 15180 - }, - { - "epoch": 0.7039870190078813, - "grad_norm": 3.5198941230773926, - "learning_rate": 4.879960570267638e-06, - "loss": 0.8963, - "step": 15185 - }, - { - "epoch": 0.7042188224385721, - "grad_norm": 3.3143296241760254, - "learning_rate": 4.873011814444279e-06, - "loss": 0.7713, - "step": 15190 - }, - { - "epoch": 0.7044506258692629, - "grad_norm": 3.4104104042053223, - "learning_rate": 4.8660664153534555e-06, - "loss": 0.9218, - "step": 15195 - }, - { - "epoch": 0.7046824292999536, - "grad_norm": 3.8021419048309326, - "learning_rate": 4.8591243775424505e-06, - "loss": 0.7878, - "step": 15200 - }, - { - "epoch": 0.7046824292999536, - "eval_loss": 0.8943904638290405, - "eval_runtime": 11.268, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 15200 - }, - { - "epoch": 0.7049142327306445, - "grad_norm": 3.752955913543701, - "learning_rate": 4.852185705556334e-06, - "loss": 0.8282, - "step": 15205 - }, - { - "epoch": 0.7051460361613352, - "grad_norm": 4.037057399749756, - "learning_rate": 4.845250403937985e-06, - "loss": 0.9621, - "step": 15210 - }, - { - "epoch": 0.7053778395920259, - "grad_norm": 3.99474835395813, - "learning_rate": 4.8383184772280745e-06, - "loss": 0.8936, - "step": 15215 - }, - { - "epoch": 0.7056096430227168, - "grad_norm": 4.396583080291748, - "learning_rate": 4.831389929965059e-06, - "loss": 0.9539, - "step": 15220 - }, - { - "epoch": 0.7058414464534075, - "grad_norm": 4.322121620178223, - "learning_rate": 4.8244647666851815e-06, - "loss": 0.7406, - "step": 15225 - }, - { - "epoch": 0.7060732498840983, - "grad_norm": 3.7498438358306885, - "learning_rate": 4.817542991922475e-06, - "loss": 0.7946, - "step": 15230 - }, - { - "epoch": 0.7063050533147891, - "grad_norm": 3.8820183277130127, - "learning_rate": 4.810624610208754e-06, - "loss": 0.7112, - "step": 15235 - }, - { - "epoch": 0.7065368567454798, - "grad_norm": 3.285693407058716, - "learning_rate": 4.803709626073604e-06, - "loss": 0.7001, - "step": 15240 - }, - { - "epoch": 0.7067686601761706, - "grad_norm": 4.146936416625977, - "learning_rate": 4.796798044044398e-06, - "loss": 0.7498, - "step": 15245 - }, - { - "epoch": 0.7070004636068614, - "grad_norm": 3.7342891693115234, - "learning_rate": 4.789889868646265e-06, - "loss": 0.8432, - "step": 15250 - }, - { - "epoch": 0.7072322670375522, - "grad_norm": 3.5254108905792236, - "learning_rate": 4.782985104402123e-06, - "loss": 0.8564, - "step": 15255 - }, - { - "epoch": 0.7074640704682429, - "grad_norm": 3.4690604209899902, - "learning_rate": 4.776083755832638e-06, - "loss": 0.8843, - "step": 15260 - }, - { - "epoch": 0.7076958738989337, - "grad_norm": 3.0940325260162354, - "learning_rate": 4.769185827456254e-06, - "loss": 0.8507, - "step": 15265 - }, - { - "epoch": 0.7079276773296245, - "grad_norm": 3.7676939964294434, - "learning_rate": 4.762291323789172e-06, - "loss": 0.9011, - "step": 15270 - }, - { - "epoch": 0.7081594807603152, - "grad_norm": 4.3535308837890625, - "learning_rate": 4.755400249345348e-06, - "loss": 0.9, - "step": 15275 - }, - { - "epoch": 0.708391284191006, - "grad_norm": 4.5304083824157715, - "learning_rate": 4.748512608636487e-06, - "loss": 0.8402, - "step": 15280 - }, - { - "epoch": 0.7086230876216968, - "grad_norm": 3.6635782718658447, - "learning_rate": 4.741628406172063e-06, - "loss": 0.7709, - "step": 15285 - }, - { - "epoch": 0.7088548910523875, - "grad_norm": 3.2135374546051025, - "learning_rate": 4.73474764645928e-06, - "loss": 0.9065, - "step": 15290 - }, - { - "epoch": 0.7090866944830784, - "grad_norm": 4.172916412353516, - "learning_rate": 4.7278703340031e-06, - "loss": 0.9198, - "step": 15295 - }, - { - "epoch": 0.7093184979137691, - "grad_norm": 3.894068717956543, - "learning_rate": 4.720996473306231e-06, - "loss": 0.7857, - "step": 15300 - }, - { - "epoch": 0.7093184979137691, - "eval_loss": 0.8935646414756775, - "eval_runtime": 11.2772, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 15300 - }, - { - "epoch": 0.7095503013444598, - "grad_norm": 3.9130654335021973, - "learning_rate": 4.714126068869107e-06, - "loss": 0.9399, - "step": 15305 - }, - { - "epoch": 0.7097821047751507, - "grad_norm": 3.7862632274627686, - "learning_rate": 4.707259125189906e-06, - "loss": 0.8922, - "step": 15310 - }, - { - "epoch": 0.7100139082058414, - "grad_norm": 4.150571823120117, - "learning_rate": 4.700395646764544e-06, - "loss": 0.9313, - "step": 15315 - }, - { - "epoch": 0.7102457116365323, - "grad_norm": 4.1176862716674805, - "learning_rate": 4.693535638086669e-06, - "loss": 0.8162, - "step": 15320 - }, - { - "epoch": 0.710477515067223, - "grad_norm": 3.9354631900787354, - "learning_rate": 4.686679103647647e-06, - "loss": 0.9237, - "step": 15325 - }, - { - "epoch": 0.7107093184979137, - "grad_norm": 3.2577991485595703, - "learning_rate": 4.679826047936582e-06, - "loss": 0.7905, - "step": 15330 - }, - { - "epoch": 0.7109411219286046, - "grad_norm": 3.445051670074463, - "learning_rate": 4.672976475440288e-06, - "loss": 0.8439, - "step": 15335 - }, - { - "epoch": 0.7111729253592953, - "grad_norm": 3.5186078548431396, - "learning_rate": 4.666130390643312e-06, - "loss": 0.8311, - "step": 15340 - }, - { - "epoch": 0.7114047287899861, - "grad_norm": 3.8396527767181396, - "learning_rate": 4.6592877980279025e-06, - "loss": 0.8022, - "step": 15345 - }, - { - "epoch": 0.7116365322206769, - "grad_norm": 3.7967803478240967, - "learning_rate": 4.652448702074034e-06, - "loss": 0.8061, - "step": 15350 - }, - { - "epoch": 0.7118683356513676, - "grad_norm": 3.51462459564209, - "learning_rate": 4.6456131072593905e-06, - "loss": 0.8804, - "step": 15355 - }, - { - "epoch": 0.7121001390820584, - "grad_norm": 3.5005767345428467, - "learning_rate": 4.638781018059357e-06, - "loss": 0.7588, - "step": 15360 - }, - { - "epoch": 0.7123319425127492, - "grad_norm": 2.943761110305786, - "learning_rate": 4.631952438947021e-06, - "loss": 0.8174, - "step": 15365 - }, - { - "epoch": 0.71256374594344, - "grad_norm": 4.694367408752441, - "learning_rate": 4.625127374393185e-06, - "loss": 0.9243, - "step": 15370 - }, - { - "epoch": 0.7127955493741307, - "grad_norm": 4.095063209533691, - "learning_rate": 4.618305828866344e-06, - "loss": 0.8334, - "step": 15375 - }, - { - "epoch": 0.7130273528048215, - "grad_norm": 3.5834052562713623, - "learning_rate": 4.611487806832681e-06, - "loss": 0.7467, - "step": 15380 - }, - { - "epoch": 0.7132591562355123, - "grad_norm": 3.4568352699279785, - "learning_rate": 4.6046733127560835e-06, - "loss": 0.768, - "step": 15385 - }, - { - "epoch": 0.713490959666203, - "grad_norm": 4.342470645904541, - "learning_rate": 4.597862351098132e-06, - "loss": 0.9905, - "step": 15390 - }, - { - "epoch": 0.7137227630968939, - "grad_norm": 3.9144160747528076, - "learning_rate": 4.591054926318074e-06, - "loss": 0.8793, - "step": 15395 - }, - { - "epoch": 0.7139545665275846, - "grad_norm": 3.455382823944092, - "learning_rate": 4.584251042872859e-06, - "loss": 0.7232, - "step": 15400 - }, - { - "epoch": 0.7139545665275846, - "eval_loss": 0.8931986093521118, - "eval_runtime": 11.2766, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 15400 - }, - { - "epoch": 0.7141863699582753, - "grad_norm": 4.558581829071045, - "learning_rate": 4.577450705217118e-06, - "loss": 0.8319, - "step": 15405 - }, - { - "epoch": 0.7144181733889662, - "grad_norm": 3.382648468017578, - "learning_rate": 4.5706539178031504e-06, - "loss": 0.8465, - "step": 15410 - }, - { - "epoch": 0.7146499768196569, - "grad_norm": 4.303096771240234, - "learning_rate": 4.5638606850809374e-06, - "loss": 0.9637, - "step": 15415 - }, - { - "epoch": 0.7148817802503477, - "grad_norm": 3.2249696254730225, - "learning_rate": 4.557071011498138e-06, - "loss": 0.8946, - "step": 15420 - }, - { - "epoch": 0.7151135836810385, - "grad_norm": 4.005526065826416, - "learning_rate": 4.550284901500072e-06, - "loss": 0.842, - "step": 15425 - }, - { - "epoch": 0.7153453871117292, - "grad_norm": 4.499578952789307, - "learning_rate": 4.543502359529724e-06, - "loss": 0.9361, - "step": 15430 - }, - { - "epoch": 0.7155771905424201, - "grad_norm": 3.953172206878662, - "learning_rate": 4.536723390027751e-06, - "loss": 0.9228, - "step": 15435 - }, - { - "epoch": 0.7158089939731108, - "grad_norm": 3.8294663429260254, - "learning_rate": 4.529947997432472e-06, - "loss": 0.7677, - "step": 15440 - }, - { - "epoch": 0.7160407974038016, - "grad_norm": 3.6534347534179688, - "learning_rate": 4.523176186179853e-06, - "loss": 0.8623, - "step": 15445 - }, - { - "epoch": 0.7162726008344924, - "grad_norm": 3.6950314044952393, - "learning_rate": 4.516407960703529e-06, - "loss": 0.8982, - "step": 15450 - }, - { - "epoch": 0.7165044042651831, - "grad_norm": 4.337798595428467, - "learning_rate": 4.509643325434775e-06, - "loss": 0.9184, - "step": 15455 - }, - { - "epoch": 0.7167362076958739, - "grad_norm": 3.7928266525268555, - "learning_rate": 4.502882284802526e-06, - "loss": 0.9644, - "step": 15460 - }, - { - "epoch": 0.7169680111265647, - "grad_norm": 3.2648489475250244, - "learning_rate": 4.496124843233351e-06, - "loss": 0.7637, - "step": 15465 - }, - { - "epoch": 0.7171998145572555, - "grad_norm": 3.6675827503204346, - "learning_rate": 4.489371005151476e-06, - "loss": 0.7412, - "step": 15470 - }, - { - "epoch": 0.7174316179879462, - "grad_norm": 3.5607333183288574, - "learning_rate": 4.482620774978763e-06, - "loss": 0.889, - "step": 15475 - }, - { - "epoch": 0.717663421418637, - "grad_norm": 3.4801552295684814, - "learning_rate": 4.475874157134709e-06, - "loss": 0.8214, - "step": 15480 - }, - { - "epoch": 0.7178952248493278, - "grad_norm": 3.724231004714966, - "learning_rate": 4.469131156036441e-06, - "loss": 0.673, - "step": 15485 - }, - { - "epoch": 0.7181270282800185, - "grad_norm": 4.352002143859863, - "learning_rate": 4.462391776098736e-06, - "loss": 0.9104, - "step": 15490 - }, - { - "epoch": 0.7183588317107094, - "grad_norm": 3.5941615104675293, - "learning_rate": 4.455656021733977e-06, - "loss": 0.8045, - "step": 15495 - }, - { - "epoch": 0.7185906351414001, - "grad_norm": 4.01120662689209, - "learning_rate": 4.448923897352193e-06, - "loss": 0.8604, - "step": 15500 - }, - { - "epoch": 0.7185906351414001, - "eval_loss": 0.8920189738273621, - "eval_runtime": 11.2594, - "eval_samples_per_second": 11.279, - "eval_steps_per_second": 11.279, - "step": 15500 - }, - { - "epoch": 0.7188224385720908, - "grad_norm": 3.968578815460205, - "learning_rate": 4.442195407361029e-06, - "loss": 0.8634, - "step": 15505 - }, - { - "epoch": 0.7190542420027817, - "grad_norm": 3.571563243865967, - "learning_rate": 4.435470556165747e-06, - "loss": 0.9375, - "step": 15510 - }, - { - "epoch": 0.7192860454334724, - "grad_norm": 3.326970100402832, - "learning_rate": 4.428749348169228e-06, - "loss": 0.7336, - "step": 15515 - }, - { - "epoch": 0.7195178488641631, - "grad_norm": 4.140428066253662, - "learning_rate": 4.4220317877719684e-06, - "loss": 0.8382, - "step": 15520 - }, - { - "epoch": 0.719749652294854, - "grad_norm": 3.8773670196533203, - "learning_rate": 4.415317879372086e-06, - "loss": 0.8804, - "step": 15525 - }, - { - "epoch": 0.7199814557255447, - "grad_norm": 4.370347023010254, - "learning_rate": 4.408607627365289e-06, - "loss": 0.9379, - "step": 15530 - }, - { - "epoch": 0.7202132591562355, - "grad_norm": 3.609670400619507, - "learning_rate": 4.401901036144909e-06, - "loss": 0.7642, - "step": 15535 - }, - { - "epoch": 0.7204450625869263, - "grad_norm": 4.242509841918945, - "learning_rate": 4.395198110101866e-06, - "loss": 0.8219, - "step": 15540 - }, - { - "epoch": 0.720676866017617, - "grad_norm": 3.3745124340057373, - "learning_rate": 4.388498853624696e-06, - "loss": 0.8442, - "step": 15545 - }, - { - "epoch": 0.7209086694483078, - "grad_norm": 3.047438859939575, - "learning_rate": 4.381803271099515e-06, - "loss": 0.6754, - "step": 15550 - }, - { - "epoch": 0.7211404728789986, - "grad_norm": 4.263495922088623, - "learning_rate": 4.375111366910049e-06, - "loss": 0.873, - "step": 15555 - }, - { - "epoch": 0.7213722763096894, - "grad_norm": 3.8155529499053955, - "learning_rate": 4.368423145437611e-06, - "loss": 0.9339, - "step": 15560 - }, - { - "epoch": 0.7216040797403802, - "grad_norm": 3.9193153381347656, - "learning_rate": 4.3617386110611e-06, - "loss": 0.9423, - "step": 15565 - }, - { - "epoch": 0.721835883171071, - "grad_norm": 3.6092798709869385, - "learning_rate": 4.355057768156997e-06, - "loss": 0.8167, - "step": 15570 - }, - { - "epoch": 0.7220676866017617, - "grad_norm": 3.869521141052246, - "learning_rate": 4.348380621099376e-06, - "loss": 0.9408, - "step": 15575 - }, - { - "epoch": 0.7222994900324525, - "grad_norm": 4.508119583129883, - "learning_rate": 4.3417071742598895e-06, - "loss": 0.8851, - "step": 15580 - }, - { - "epoch": 0.7225312934631433, - "grad_norm": 3.347935199737549, - "learning_rate": 4.335037432007758e-06, - "loss": 0.6706, - "step": 15585 - }, - { - "epoch": 0.722763096893834, - "grad_norm": 3.8962862491607666, - "learning_rate": 4.328371398709786e-06, - "loss": 0.8095, - "step": 15590 - }, - { - "epoch": 0.7229949003245248, - "grad_norm": 3.721971273422241, - "learning_rate": 4.3217090787303564e-06, - "loss": 0.8029, - "step": 15595 - }, - { - "epoch": 0.7232267037552156, - "grad_norm": 3.0758113861083984, - "learning_rate": 4.315050476431395e-06, - "loss": 0.7492, - "step": 15600 - }, - { - "epoch": 0.7232267037552156, - "eval_loss": 0.8914928436279297, - "eval_runtime": 11.2606, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 15600 - }, - { - "epoch": 0.7234585071859063, - "grad_norm": 3.8786678314208984, - "learning_rate": 4.308395596172418e-06, - "loss": 0.9624, - "step": 15605 - }, - { - "epoch": 0.7236903106165972, - "grad_norm": 3.3441412448883057, - "learning_rate": 4.301744442310499e-06, - "loss": 0.7183, - "step": 15610 - }, - { - "epoch": 0.7239221140472879, - "grad_norm": 4.816915988922119, - "learning_rate": 4.295097019200264e-06, - "loss": 0.8229, - "step": 15615 - }, - { - "epoch": 0.7241539174779786, - "grad_norm": 3.833200216293335, - "learning_rate": 4.288453331193904e-06, - "loss": 0.7384, - "step": 15620 - }, - { - "epoch": 0.7243857209086695, - "grad_norm": 3.196971893310547, - "learning_rate": 4.281813382641169e-06, - "loss": 0.8744, - "step": 15625 - }, - { - "epoch": 0.7246175243393602, - "grad_norm": 3.8103346824645996, - "learning_rate": 4.275177177889347e-06, - "loss": 0.8335, - "step": 15630 - }, - { - "epoch": 0.724849327770051, - "grad_norm": 3.861053228378296, - "learning_rate": 4.268544721283281e-06, - "loss": 0.744, - "step": 15635 - }, - { - "epoch": 0.7250811312007418, - "grad_norm": 3.6687607765197754, - "learning_rate": 4.261916017165364e-06, - "loss": 0.9155, - "step": 15640 - }, - { - "epoch": 0.7253129346314325, - "grad_norm": 3.7486867904663086, - "learning_rate": 4.2552910698755325e-06, - "loss": 0.862, - "step": 15645 - }, - { - "epoch": 0.7255447380621233, - "grad_norm": 3.7146894931793213, - "learning_rate": 4.2486698837512545e-06, - "loss": 0.8686, - "step": 15650 - }, - { - "epoch": 0.7257765414928141, - "grad_norm": 4.370089054107666, - "learning_rate": 4.242052463127545e-06, - "loss": 0.8455, - "step": 15655 - }, - { - "epoch": 0.7260083449235049, - "grad_norm": 3.910057306289673, - "learning_rate": 4.235438812336946e-06, - "loss": 0.9133, - "step": 15660 - }, - { - "epoch": 0.7262401483541956, - "grad_norm": 3.6310927867889404, - "learning_rate": 4.228828935709541e-06, - "loss": 0.874, - "step": 15665 - }, - { - "epoch": 0.7264719517848864, - "grad_norm": 3.224461317062378, - "learning_rate": 4.222222837572929e-06, - "loss": 0.6786, - "step": 15670 - }, - { - "epoch": 0.7267037552155772, - "grad_norm": 3.6823716163635254, - "learning_rate": 4.215620522252247e-06, - "loss": 0.815, - "step": 15675 - }, - { - "epoch": 0.7269355586462679, - "grad_norm": 3.2620420455932617, - "learning_rate": 4.209021994070155e-06, - "loss": 0.7819, - "step": 15680 - }, - { - "epoch": 0.7271673620769588, - "grad_norm": 3.7991809844970703, - "learning_rate": 4.202427257346825e-06, - "loss": 0.8341, - "step": 15685 - }, - { - "epoch": 0.7273991655076495, - "grad_norm": 3.8811824321746826, - "learning_rate": 4.195836316399948e-06, - "loss": 0.8063, - "step": 15690 - }, - { - "epoch": 0.7276309689383403, - "grad_norm": 3.89102840423584, - "learning_rate": 4.18924917554474e-06, - "loss": 0.8389, - "step": 15695 - }, - { - "epoch": 0.7278627723690311, - "grad_norm": 3.7965166568756104, - "learning_rate": 4.182665839093916e-06, - "loss": 0.7412, - "step": 15700 - }, - { - "epoch": 0.7278627723690311, - "eval_loss": 0.8913096785545349, - "eval_runtime": 11.2647, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 15700 - }, - { - "epoch": 0.7280945757997218, - "grad_norm": 3.3673911094665527, - "learning_rate": 4.1760863113577085e-06, - "loss": 0.8514, - "step": 15705 - }, - { - "epoch": 0.7283263792304127, - "grad_norm": 3.435807704925537, - "learning_rate": 4.169510596643859e-06, - "loss": 0.861, - "step": 15710 - }, - { - "epoch": 0.7285581826611034, - "grad_norm": 4.1871137619018555, - "learning_rate": 4.1629386992576025e-06, - "loss": 0.9678, - "step": 15715 - }, - { - "epoch": 0.7287899860917941, - "grad_norm": 4.157339096069336, - "learning_rate": 4.156370623501678e-06, - "loss": 0.8808, - "step": 15720 - }, - { - "epoch": 0.729021789522485, - "grad_norm": 3.5089452266693115, - "learning_rate": 4.149806373676328e-06, - "loss": 0.8607, - "step": 15725 - }, - { - "epoch": 0.7292535929531757, - "grad_norm": 3.601043462753296, - "learning_rate": 4.1432459540792866e-06, - "loss": 0.872, - "step": 15730 - }, - { - "epoch": 0.7294853963838664, - "grad_norm": 3.9285526275634766, - "learning_rate": 4.136689369005776e-06, - "loss": 0.8477, - "step": 15735 - }, - { - "epoch": 0.7297171998145573, - "grad_norm": 3.5868418216705322, - "learning_rate": 4.130136622748516e-06, - "loss": 0.8735, - "step": 15740 - }, - { - "epoch": 0.729949003245248, - "grad_norm": 3.829298496246338, - "learning_rate": 4.12358771959771e-06, - "loss": 0.7792, - "step": 15745 - }, - { - "epoch": 0.7301808066759388, - "grad_norm": 2.9413342475891113, - "learning_rate": 4.1170426638410436e-06, - "loss": 0.7645, - "step": 15750 - }, - { - "epoch": 0.7304126101066296, - "grad_norm": 3.4420745372772217, - "learning_rate": 4.1105014597636775e-06, - "loss": 0.9818, - "step": 15755 - }, - { - "epoch": 0.7306444135373203, - "grad_norm": 3.5504422187805176, - "learning_rate": 4.1039641116482605e-06, - "loss": 0.7684, - "step": 15760 - }, - { - "epoch": 0.7308762169680111, - "grad_norm": 3.317941665649414, - "learning_rate": 4.0974306237749205e-06, - "loss": 0.6928, - "step": 15765 - }, - { - "epoch": 0.7311080203987019, - "grad_norm": 3.58900785446167, - "learning_rate": 4.090901000421244e-06, - "loss": 0.7373, - "step": 15770 - }, - { - "epoch": 0.7313398238293927, - "grad_norm": 3.978079080581665, - "learning_rate": 4.084375245862293e-06, - "loss": 0.7678, - "step": 15775 - }, - { - "epoch": 0.7315716272600834, - "grad_norm": 3.7968006134033203, - "learning_rate": 4.0778533643706e-06, - "loss": 0.6788, - "step": 15780 - }, - { - "epoch": 0.7318034306907742, - "grad_norm": 3.2984938621520996, - "learning_rate": 4.071335360216164e-06, - "loss": 0.749, - "step": 15785 - }, - { - "epoch": 0.732035234121465, - "grad_norm": 3.3127284049987793, - "learning_rate": 4.064821237666433e-06, - "loss": 0.8255, - "step": 15790 - }, - { - "epoch": 0.7322670375521557, - "grad_norm": 3.829406499862671, - "learning_rate": 4.058311000986326e-06, - "loss": 0.8941, - "step": 15795 - }, - { - "epoch": 0.7324988409828466, - "grad_norm": 3.6632189750671387, - "learning_rate": 4.05180465443822e-06, - "loss": 0.9017, - "step": 15800 - }, - { - "epoch": 0.7324988409828466, - "eval_loss": 0.890110969543457, - "eval_runtime": 11.284, - "eval_samples_per_second": 11.255, - "eval_steps_per_second": 11.255, - "step": 15800 - }, - { - "epoch": 0.7327306444135373, - "grad_norm": 3.796661138534546, - "learning_rate": 4.045302202281926e-06, - "loss": 0.8491, - "step": 15805 - }, - { - "epoch": 0.7329624478442281, - "grad_norm": 4.159597396850586, - "learning_rate": 4.0388036487747225e-06, - "loss": 0.8807, - "step": 15810 - }, - { - "epoch": 0.7331942512749189, - "grad_norm": 4.575572490692139, - "learning_rate": 4.032308998171336e-06, - "loss": 0.8274, - "step": 15815 - }, - { - "epoch": 0.7334260547056096, - "grad_norm": 3.7240734100341797, - "learning_rate": 4.025818254723925e-06, - "loss": 1.0875, - "step": 15820 - }, - { - "epoch": 0.7336578581363005, - "grad_norm": 3.9850006103515625, - "learning_rate": 4.019331422682101e-06, - "loss": 1.015, - "step": 15825 - }, - { - "epoch": 0.7338896615669912, - "grad_norm": 3.4893457889556885, - "learning_rate": 4.0128485062929145e-06, - "loss": 0.794, - "step": 15830 - }, - { - "epoch": 0.7341214649976819, - "grad_norm": 3.174949884414673, - "learning_rate": 4.006369509800846e-06, - "loss": 0.7296, - "step": 15835 - }, - { - "epoch": 0.7343532684283728, - "grad_norm": 3.194807529449463, - "learning_rate": 3.999894437447807e-06, - "loss": 0.8823, - "step": 15840 - }, - { - "epoch": 0.7345850718590635, - "grad_norm": 4.552024841308594, - "learning_rate": 3.993423293473152e-06, - "loss": 0.9614, - "step": 15845 - }, - { - "epoch": 0.7348168752897543, - "grad_norm": 3.7308084964752197, - "learning_rate": 3.986956082113658e-06, - "loss": 0.9514, - "step": 15850 - }, - { - "epoch": 0.7350486787204451, - "grad_norm": 3.8906126022338867, - "learning_rate": 3.980492807603519e-06, - "loss": 0.8865, - "step": 15855 - }, - { - "epoch": 0.7352804821511358, - "grad_norm": 3.130915641784668, - "learning_rate": 3.974033474174367e-06, - "loss": 0.7193, - "step": 15860 - }, - { - "epoch": 0.7355122855818266, - "grad_norm": 4.078449726104736, - "learning_rate": 3.967578086055237e-06, - "loss": 0.7484, - "step": 15865 - }, - { - "epoch": 0.7357440890125174, - "grad_norm": 3.706937074661255, - "learning_rate": 3.961126647472596e-06, - "loss": 0.8345, - "step": 15870 - }, - { - "epoch": 0.7359758924432082, - "grad_norm": 3.131194591522217, - "learning_rate": 3.954679162650313e-06, - "loss": 0.6977, - "step": 15875 - }, - { - "epoch": 0.7362076958738989, - "grad_norm": 3.336714029312134, - "learning_rate": 3.948235635809675e-06, - "loss": 0.7781, - "step": 15880 - }, - { - "epoch": 0.7364394993045897, - "grad_norm": 3.725440502166748, - "learning_rate": 3.941796071169383e-06, - "loss": 0.7968, - "step": 15885 - }, - { - "epoch": 0.7366713027352805, - "grad_norm": 3.469977378845215, - "learning_rate": 3.9353604729455296e-06, - "loss": 0.9139, - "step": 15890 - }, - { - "epoch": 0.7369031061659712, - "grad_norm": 4.227980136871338, - "learning_rate": 3.928928845351617e-06, - "loss": 0.8296, - "step": 15895 - }, - { - "epoch": 0.7371349095966621, - "grad_norm": 3.965374708175659, - "learning_rate": 3.922501192598557e-06, - "loss": 0.8702, - "step": 15900 - }, - { - "epoch": 0.7371349095966621, - "eval_loss": 0.8892138004302979, - "eval_runtime": 11.2674, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 15900 - }, - { - "epoch": 0.7373667130273528, - "grad_norm": 3.8569962978363037, - "learning_rate": 3.916077518894642e-06, - "loss": 0.8873, - "step": 15905 - }, - { - "epoch": 0.7375985164580435, - "grad_norm": 3.320935010910034, - "learning_rate": 3.909657828445572e-06, - "loss": 0.8956, - "step": 15910 - }, - { - "epoch": 0.7378303198887344, - "grad_norm": 3.896636962890625, - "learning_rate": 3.903242125454439e-06, - "loss": 1.0005, - "step": 15915 - }, - { - "epoch": 0.7380621233194251, - "grad_norm": 3.972095489501953, - "learning_rate": 3.896830414121717e-06, - "loss": 0.9519, - "step": 15920 - }, - { - "epoch": 0.7382939267501158, - "grad_norm": 3.7143542766571045, - "learning_rate": 3.890422698645267e-06, - "loss": 0.8063, - "step": 15925 - }, - { - "epoch": 0.7385257301808067, - "grad_norm": 3.7164463996887207, - "learning_rate": 3.88401898322034e-06, - "loss": 0.7474, - "step": 15930 - }, - { - "epoch": 0.7387575336114974, - "grad_norm": 4.113378047943115, - "learning_rate": 3.87761927203957e-06, - "loss": 0.9298, - "step": 15935 - }, - { - "epoch": 0.7389893370421883, - "grad_norm": 3.508679151535034, - "learning_rate": 3.871223569292955e-06, - "loss": 0.9423, - "step": 15940 - }, - { - "epoch": 0.739221140472879, - "grad_norm": 3.7724037170410156, - "learning_rate": 3.8648318791678844e-06, - "loss": 0.9195, - "step": 15945 - }, - { - "epoch": 0.7394529439035697, - "grad_norm": 4.135608673095703, - "learning_rate": 3.8584442058491145e-06, - "loss": 0.7762, - "step": 15950 - }, - { - "epoch": 0.7396847473342606, - "grad_norm": 4.278160095214844, - "learning_rate": 3.852060553518771e-06, - "loss": 0.8267, - "step": 15955 - }, - { - "epoch": 0.7399165507649513, - "grad_norm": 4.063554286956787, - "learning_rate": 3.8456809263563425e-06, - "loss": 0.941, - "step": 15960 - }, - { - "epoch": 0.7401483541956421, - "grad_norm": 4.410874843597412, - "learning_rate": 3.839305328538688e-06, - "loss": 0.8817, - "step": 15965 - }, - { - "epoch": 0.7403801576263329, - "grad_norm": 4.527900218963623, - "learning_rate": 3.832933764240036e-06, - "loss": 1.025, - "step": 15970 - }, - { - "epoch": 0.7406119610570236, - "grad_norm": 3.8349719047546387, - "learning_rate": 3.8265662376319545e-06, - "loss": 0.6803, - "step": 15975 - }, - { - "epoch": 0.7408437644877144, - "grad_norm": 4.201470375061035, - "learning_rate": 3.820202752883389e-06, - "loss": 0.7962, - "step": 15980 - }, - { - "epoch": 0.7410755679184052, - "grad_norm": 4.27456521987915, - "learning_rate": 3.8138433141606233e-06, - "loss": 0.83, - "step": 15985 - }, - { - "epoch": 0.741307371349096, - "grad_norm": 3.525296926498413, - "learning_rate": 3.8074879256272958e-06, - "loss": 0.8171, - "step": 15990 - }, - { - "epoch": 0.7415391747797867, - "grad_norm": 3.365690231323242, - "learning_rate": 3.8011365914443977e-06, - "loss": 0.7481, - "step": 15995 - }, - { - "epoch": 0.7417709782104775, - "grad_norm": 3.8495423793792725, - "learning_rate": 3.7947893157702663e-06, - "loss": 0.8634, - "step": 16000 - }, - { - "epoch": 0.7417709782104775, - "eval_loss": 0.8883469700813293, - "eval_runtime": 11.2669, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 16000 - }, - { - "epoch": 0.7420027816411683, - "grad_norm": 3.318895101547241, - "learning_rate": 3.7884461027605756e-06, - "loss": 0.7934, - "step": 16005 - }, - { - "epoch": 0.742234585071859, - "grad_norm": 4.090661525726318, - "learning_rate": 3.7821069565683388e-06, - "loss": 0.7891, - "step": 16010 - }, - { - "epoch": 0.7424663885025499, - "grad_norm": 3.940528631210327, - "learning_rate": 3.7757718813439147e-06, - "loss": 0.8408, - "step": 16015 - }, - { - "epoch": 0.7426981919332406, - "grad_norm": 3.4445431232452393, - "learning_rate": 3.769440881234996e-06, - "loss": 0.7104, - "step": 16020 - }, - { - "epoch": 0.7429299953639313, - "grad_norm": 3.6801605224609375, - "learning_rate": 3.763113960386595e-06, - "loss": 0.8792, - "step": 16025 - }, - { - "epoch": 0.7431617987946222, - "grad_norm": 3.427544116973877, - "learning_rate": 3.756791122941068e-06, - "loss": 0.754, - "step": 16030 - }, - { - "epoch": 0.7433936022253129, - "grad_norm": 4.70984411239624, - "learning_rate": 3.750472373038093e-06, - "loss": 0.868, - "step": 16035 - }, - { - "epoch": 0.7436254056560037, - "grad_norm": 3.9222209453582764, - "learning_rate": 3.744157714814669e-06, - "loss": 0.8452, - "step": 16040 - }, - { - "epoch": 0.7438572090866945, - "grad_norm": 3.9630048274993896, - "learning_rate": 3.737847152405113e-06, - "loss": 0.8538, - "step": 16045 - }, - { - "epoch": 0.7440890125173852, - "grad_norm": 4.065320014953613, - "learning_rate": 3.73154068994107e-06, - "loss": 0.8741, - "step": 16050 - }, - { - "epoch": 0.7443208159480761, - "grad_norm": 3.2823739051818848, - "learning_rate": 3.7252383315514974e-06, - "loss": 0.7499, - "step": 16055 - }, - { - "epoch": 0.7445526193787668, - "grad_norm": 4.034680366516113, - "learning_rate": 3.7189400813626577e-06, - "loss": 1.0117, - "step": 16060 - }, - { - "epoch": 0.7447844228094576, - "grad_norm": 3.311304807662964, - "learning_rate": 3.7126459434981377e-06, - "loss": 0.6851, - "step": 16065 - }, - { - "epoch": 0.7450162262401484, - "grad_norm": 3.439509391784668, - "learning_rate": 3.706355922078817e-06, - "loss": 0.948, - "step": 16070 - }, - { - "epoch": 0.7452480296708391, - "grad_norm": 3.8146088123321533, - "learning_rate": 3.7000700212228937e-06, - "loss": 0.814, - "step": 16075 - }, - { - "epoch": 0.7454798331015299, - "grad_norm": 3.7652645111083984, - "learning_rate": 3.693788245045854e-06, - "loss": 0.917, - "step": 16080 - }, - { - "epoch": 0.7457116365322207, - "grad_norm": 3.8624465465545654, - "learning_rate": 3.687510597660495e-06, - "loss": 0.8091, - "step": 16085 - }, - { - "epoch": 0.7459434399629115, - "grad_norm": 3.9942777156829834, - "learning_rate": 3.6812370831769117e-06, - "loss": 0.7945, - "step": 16090 - }, - { - "epoch": 0.7461752433936022, - "grad_norm": 3.569929361343384, - "learning_rate": 3.6749677057024834e-06, - "loss": 0.8493, - "step": 16095 - }, - { - "epoch": 0.746407046824293, - "grad_norm": 3.523303508758545, - "learning_rate": 3.6687024693418838e-06, - "loss": 0.803, - "step": 16100 - }, - { - "epoch": 0.746407046824293, - "eval_loss": 0.8879801034927368, - "eval_runtime": 11.2743, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 16100 - }, - { - "epoch": 0.7466388502549838, - "grad_norm": 3.59873366355896, - "learning_rate": 3.6624413781970815e-06, - "loss": 0.8011, - "step": 16105 - }, - { - "epoch": 0.7468706536856745, - "grad_norm": 3.186870813369751, - "learning_rate": 3.6561844363673215e-06, - "loss": 0.6644, - "step": 16110 - }, - { - "epoch": 0.7471024571163654, - "grad_norm": 4.058816432952881, - "learning_rate": 3.6499316479491408e-06, - "loss": 0.9662, - "step": 16115 - }, - { - "epoch": 0.7473342605470561, - "grad_norm": 4.230071067810059, - "learning_rate": 3.643683017036358e-06, - "loss": 0.9362, - "step": 16120 - }, - { - "epoch": 0.7475660639777468, - "grad_norm": 4.078927516937256, - "learning_rate": 3.6374385477200613e-06, - "loss": 0.9531, - "step": 16125 - }, - { - "epoch": 0.7477978674084377, - "grad_norm": 3.8070027828216553, - "learning_rate": 3.631198244088615e-06, - "loss": 0.864, - "step": 16130 - }, - { - "epoch": 0.7480296708391284, - "grad_norm": 4.256518840789795, - "learning_rate": 3.6249621102276643e-06, - "loss": 0.816, - "step": 16135 - }, - { - "epoch": 0.7482614742698191, - "grad_norm": 4.053735256195068, - "learning_rate": 3.618730150220121e-06, - "loss": 0.8013, - "step": 16140 - }, - { - "epoch": 0.74849327770051, - "grad_norm": 4.204226970672607, - "learning_rate": 3.6125023681461566e-06, - "loss": 0.8703, - "step": 16145 - }, - { - "epoch": 0.7487250811312007, - "grad_norm": 3.6390607357025146, - "learning_rate": 3.6062787680832178e-06, - "loss": 0.7165, - "step": 16150 - }, - { - "epoch": 0.7489568845618915, - "grad_norm": 3.5119662284851074, - "learning_rate": 3.6000593541060123e-06, - "loss": 0.8844, - "step": 16155 - }, - { - "epoch": 0.7491886879925823, - "grad_norm": 3.674628496170044, - "learning_rate": 3.593844130286499e-06, - "loss": 0.817, - "step": 16160 - }, - { - "epoch": 0.749420491423273, - "grad_norm": 3.410306215286255, - "learning_rate": 3.5876331006938957e-06, - "loss": 0.8219, - "step": 16165 - }, - { - "epoch": 0.7496522948539638, - "grad_norm": 3.05379056930542, - "learning_rate": 3.581426269394679e-06, - "loss": 0.8094, - "step": 16170 - }, - { - "epoch": 0.7498840982846546, - "grad_norm": 3.567718267440796, - "learning_rate": 3.5752236404525797e-06, - "loss": 0.9139, - "step": 16175 - }, - { - "epoch": 0.7501159017153454, - "grad_norm": 3.88759708404541, - "learning_rate": 3.569025217928563e-06, - "loss": 0.9061, - "step": 16180 - }, - { - "epoch": 0.7503477051460362, - "grad_norm": 4.398759841918945, - "learning_rate": 3.562831005880857e-06, - "loss": 1.0038, - "step": 16185 - }, - { - "epoch": 0.750579508576727, - "grad_norm": 4.027905464172363, - "learning_rate": 3.5566410083649218e-06, - "loss": 0.9197, - "step": 16190 - }, - { - "epoch": 0.7508113120074177, - "grad_norm": 3.7599446773529053, - "learning_rate": 3.550455229433457e-06, - "loss": 0.9728, - "step": 16195 - }, - { - "epoch": 0.7510431154381085, - "grad_norm": 3.042257070541382, - "learning_rate": 3.544273673136409e-06, - "loss": 0.7215, - "step": 16200 - }, - { - "epoch": 0.7510431154381085, - "eval_loss": 0.8876115083694458, - "eval_runtime": 11.2651, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 16200 - }, - { - "epoch": 0.7512749188687993, - "grad_norm": 3.53696870803833, - "learning_rate": 3.538096343520959e-06, - "loss": 0.7741, - "step": 16205 - }, - { - "epoch": 0.75150672229949, - "grad_norm": 3.3951306343078613, - "learning_rate": 3.53192324463151e-06, - "loss": 0.9269, - "step": 16210 - }, - { - "epoch": 0.7517385257301809, - "grad_norm": 3.8003907203674316, - "learning_rate": 3.5257543805097096e-06, - "loss": 0.807, - "step": 16215 - }, - { - "epoch": 0.7519703291608716, - "grad_norm": 3.8604257106781006, - "learning_rate": 3.519589755194417e-06, - "loss": 0.8091, - "step": 16220 - }, - { - "epoch": 0.7522021325915623, - "grad_norm": 4.3903398513793945, - "learning_rate": 3.513429372721733e-06, - "loss": 0.8131, - "step": 16225 - }, - { - "epoch": 0.7524339360222532, - "grad_norm": 3.688931703567505, - "learning_rate": 3.507273237124965e-06, - "loss": 0.9296, - "step": 16230 - }, - { - "epoch": 0.7526657394529439, - "grad_norm": 3.84499454498291, - "learning_rate": 3.501121352434651e-06, - "loss": 1.002, - "step": 16235 - }, - { - "epoch": 0.7528975428836346, - "grad_norm": 3.5966436862945557, - "learning_rate": 3.494973722678544e-06, - "loss": 0.8786, - "step": 16240 - }, - { - "epoch": 0.7531293463143255, - "grad_norm": 4.628900051116943, - "learning_rate": 3.4888303518816068e-06, - "loss": 0.9503, - "step": 16245 - }, - { - "epoch": 0.7533611497450162, - "grad_norm": 4.280179500579834, - "learning_rate": 3.482691244066012e-06, - "loss": 0.8784, - "step": 16250 - }, - { - "epoch": 0.753592953175707, - "grad_norm": 4.282437801361084, - "learning_rate": 3.47655640325115e-06, - "loss": 0.8438, - "step": 16255 - }, - { - "epoch": 0.7538247566063978, - "grad_norm": 4.042704105377197, - "learning_rate": 3.4704258334536156e-06, - "loss": 0.8948, - "step": 16260 - }, - { - "epoch": 0.7540565600370885, - "grad_norm": 4.1212239265441895, - "learning_rate": 3.464299538687197e-06, - "loss": 0.8409, - "step": 16265 - }, - { - "epoch": 0.7542883634677793, - "grad_norm": 4.124472141265869, - "learning_rate": 3.458177522962898e-06, - "loss": 0.9342, - "step": 16270 - }, - { - "epoch": 0.7545201668984701, - "grad_norm": 3.704339027404785, - "learning_rate": 3.4520597902889077e-06, - "loss": 0.8526, - "step": 16275 - }, - { - "epoch": 0.7547519703291609, - "grad_norm": 3.406463384628296, - "learning_rate": 3.445946344670623e-06, - "loss": 0.9123, - "step": 16280 - }, - { - "epoch": 0.7549837737598516, - "grad_norm": 4.044938087463379, - "learning_rate": 3.439837190110622e-06, - "loss": 1.0306, - "step": 16285 - }, - { - "epoch": 0.7552155771905424, - "grad_norm": 3.2226245403289795, - "learning_rate": 3.433732330608682e-06, - "loss": 0.7938, - "step": 16290 - }, - { - "epoch": 0.7554473806212332, - "grad_norm": 3.456693649291992, - "learning_rate": 3.42763177016177e-06, - "loss": 0.8515, - "step": 16295 - }, - { - "epoch": 0.7556791840519239, - "grad_norm": 4.75590181350708, - "learning_rate": 3.4215355127640303e-06, - "loss": 1.0084, - "step": 16300 - }, - { - "epoch": 0.7556791840519239, - "eval_loss": 0.8864190578460693, - "eval_runtime": 11.2633, - "eval_samples_per_second": 11.276, - "eval_steps_per_second": 11.276, - "step": 16300 - }, - { - "epoch": 0.7559109874826148, - "grad_norm": 3.5325708389282227, - "learning_rate": 3.4154435624067907e-06, - "loss": 0.8667, - "step": 16305 - }, - { - "epoch": 0.7561427909133055, - "grad_norm": 4.027612686157227, - "learning_rate": 3.4093559230785676e-06, - "loss": 0.9299, - "step": 16310 - }, - { - "epoch": 0.7563745943439963, - "grad_norm": 3.76712965965271, - "learning_rate": 3.403272598765044e-06, - "loss": 1.0016, - "step": 16315 - }, - { - "epoch": 0.7566063977746871, - "grad_norm": 3.77254581451416, - "learning_rate": 3.3971935934490852e-06, - "loss": 1.0005, - "step": 16320 - }, - { - "epoch": 0.7568382012053778, - "grad_norm": 3.5619187355041504, - "learning_rate": 3.39111891111073e-06, - "loss": 0.8961, - "step": 16325 - }, - { - "epoch": 0.7570700046360687, - "grad_norm": 3.8188278675079346, - "learning_rate": 3.385048555727182e-06, - "loss": 0.8626, - "step": 16330 - }, - { - "epoch": 0.7573018080667594, - "grad_norm": 3.299271583557129, - "learning_rate": 3.3789825312728086e-06, - "loss": 0.8992, - "step": 16335 - }, - { - "epoch": 0.7575336114974501, - "grad_norm": 4.4377360343933105, - "learning_rate": 3.3729208417191505e-06, - "loss": 0.8702, - "step": 16340 - }, - { - "epoch": 0.757765414928141, - "grad_norm": 3.683051347732544, - "learning_rate": 3.3668634910349085e-06, - "loss": 0.9598, - "step": 16345 - }, - { - "epoch": 0.7579972183588317, - "grad_norm": 4.029439926147461, - "learning_rate": 3.3608104831859344e-06, - "loss": 0.9332, - "step": 16350 - }, - { - "epoch": 0.7582290217895225, - "grad_norm": 3.681857109069824, - "learning_rate": 3.3547618221352473e-06, - "loss": 0.8892, - "step": 16355 - }, - { - "epoch": 0.7584608252202133, - "grad_norm": 3.72904896736145, - "learning_rate": 3.348717511843016e-06, - "loss": 0.8558, - "step": 16360 - }, - { - "epoch": 0.758692628650904, - "grad_norm": 4.226546764373779, - "learning_rate": 3.3426775562665605e-06, - "loss": 0.8188, - "step": 16365 - }, - { - "epoch": 0.7589244320815948, - "grad_norm": 4.173439025878906, - "learning_rate": 3.336641959360345e-06, - "loss": 0.9299, - "step": 16370 - }, - { - "epoch": 0.7591562355122856, - "grad_norm": 2.8630075454711914, - "learning_rate": 3.3306107250759867e-06, - "loss": 0.761, - "step": 16375 - }, - { - "epoch": 0.7593880389429764, - "grad_norm": 3.6998066902160645, - "learning_rate": 3.3245838573622503e-06, - "loss": 0.74, - "step": 16380 - }, - { - "epoch": 0.7596198423736671, - "grad_norm": 3.66335129737854, - "learning_rate": 3.318561360165027e-06, - "loss": 0.758, - "step": 16385 - }, - { - "epoch": 0.7598516458043579, - "grad_norm": 4.026821136474609, - "learning_rate": 3.3125432374273638e-06, - "loss": 0.8289, - "step": 16390 - }, - { - "epoch": 0.7600834492350487, - "grad_norm": 3.9597976207733154, - "learning_rate": 3.3065294930894308e-06, - "loss": 0.8404, - "step": 16395 - }, - { - "epoch": 0.7603152526657394, - "grad_norm": 3.870403528213501, - "learning_rate": 3.3005201310885326e-06, - "loss": 0.8141, - "step": 16400 - }, - { - "epoch": 0.7603152526657394, - "eval_loss": 0.8859652280807495, - "eval_runtime": 11.2773, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 16400 - }, - { - "epoch": 0.7605470560964303, - "grad_norm": 3.7531723976135254, - "learning_rate": 3.2945151553591127e-06, - "loss": 0.9562, - "step": 16405 - }, - { - "epoch": 0.760778859527121, - "grad_norm": 3.4687469005584717, - "learning_rate": 3.288514569832739e-06, - "loss": 0.9494, - "step": 16410 - }, - { - "epoch": 0.7610106629578117, - "grad_norm": 4.315207004547119, - "learning_rate": 3.2825183784380997e-06, - "loss": 0.9185, - "step": 16415 - }, - { - "epoch": 0.7612424663885026, - "grad_norm": 5.365420818328857, - "learning_rate": 3.2765265851010164e-06, - "loss": 0.9603, - "step": 16420 - }, - { - "epoch": 0.7614742698191933, - "grad_norm": 3.6974503993988037, - "learning_rate": 3.2705391937444188e-06, - "loss": 0.7098, - "step": 16425 - }, - { - "epoch": 0.7617060732498842, - "grad_norm": 3.763038396835327, - "learning_rate": 3.2645562082883665e-06, - "loss": 0.7566, - "step": 16430 - }, - { - "epoch": 0.7619378766805749, - "grad_norm": 3.7737200260162354, - "learning_rate": 3.258577632650024e-06, - "loss": 0.7182, - "step": 16435 - }, - { - "epoch": 0.7621696801112656, - "grad_norm": 3.475022554397583, - "learning_rate": 3.252603470743676e-06, - "loss": 0.7762, - "step": 16440 - }, - { - "epoch": 0.7624014835419565, - "grad_norm": 3.917085647583008, - "learning_rate": 3.246633726480719e-06, - "loss": 0.807, - "step": 16445 - }, - { - "epoch": 0.7626332869726472, - "grad_norm": 3.9613709449768066, - "learning_rate": 3.240668403769649e-06, - "loss": 0.9799, - "step": 16450 - }, - { - "epoch": 0.7628650904033379, - "grad_norm": 3.5182406902313232, - "learning_rate": 3.2347075065160693e-06, - "loss": 0.7412, - "step": 16455 - }, - { - "epoch": 0.7630968938340288, - "grad_norm": 3.8503899574279785, - "learning_rate": 3.2287510386226916e-06, - "loss": 0.9763, - "step": 16460 - }, - { - "epoch": 0.7633286972647195, - "grad_norm": 3.6205215454101562, - "learning_rate": 3.2227990039893253e-06, - "loss": 0.932, - "step": 16465 - }, - { - "epoch": 0.7635605006954103, - "grad_norm": 4.192770481109619, - "learning_rate": 3.2168514065128697e-06, - "loss": 0.8804, - "step": 16470 - }, - { - "epoch": 0.7637923041261011, - "grad_norm": 3.577810049057007, - "learning_rate": 3.2109082500873334e-06, - "loss": 0.7342, - "step": 16475 - }, - { - "epoch": 0.7640241075567918, - "grad_norm": 3.62418532371521, - "learning_rate": 3.204969538603805e-06, - "loss": 0.8044, - "step": 16480 - }, - { - "epoch": 0.7642559109874826, - "grad_norm": 3.3772356510162354, - "learning_rate": 3.199035275950463e-06, - "loss": 0.8625, - "step": 16485 - }, - { - "epoch": 0.7644877144181734, - "grad_norm": 3.894425868988037, - "learning_rate": 3.1931054660125802e-06, - "loss": 0.9266, - "step": 16490 - }, - { - "epoch": 0.7647195178488642, - "grad_norm": 3.513700485229492, - "learning_rate": 3.1871801126725133e-06, - "loss": 0.8347, - "step": 16495 - }, - { - "epoch": 0.7649513212795549, - "grad_norm": 3.5783920288085938, - "learning_rate": 3.1812592198097016e-06, - "loss": 0.7945, - "step": 16500 - }, - { - "epoch": 0.7649513212795549, - "eval_loss": 0.8850834965705872, - "eval_runtime": 11.2607, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 16500 - }, - { - "epoch": 0.7651831247102457, - "grad_norm": 4.362255573272705, - "learning_rate": 3.1753427913006563e-06, - "loss": 0.8344, - "step": 16505 - }, - { - "epoch": 0.7654149281409365, - "grad_norm": 3.7136001586914062, - "learning_rate": 3.169430831018969e-06, - "loss": 0.722, - "step": 16510 - }, - { - "epoch": 0.7656467315716272, - "grad_norm": 3.7046449184417725, - "learning_rate": 3.163523342835315e-06, - "loss": 0.7368, - "step": 16515 - }, - { - "epoch": 0.7658785350023181, - "grad_norm": 3.598536729812622, - "learning_rate": 3.1576203306174268e-06, - "loss": 0.9693, - "step": 16520 - }, - { - "epoch": 0.7661103384330088, - "grad_norm": 3.1772992610931396, - "learning_rate": 3.1517217982301184e-06, - "loss": 0.7942, - "step": 16525 - }, - { - "epoch": 0.7663421418636995, - "grad_norm": 4.3848395347595215, - "learning_rate": 3.1458277495352687e-06, - "loss": 0.9413, - "step": 16530 - }, - { - "epoch": 0.7665739452943904, - "grad_norm": 3.851550340652466, - "learning_rate": 3.1399381883918144e-06, - "loss": 1.0156, - "step": 16535 - }, - { - "epoch": 0.7668057487250811, - "grad_norm": 3.9418606758117676, - "learning_rate": 3.134053118655758e-06, - "loss": 0.9909, - "step": 16540 - }, - { - "epoch": 0.7670375521557719, - "grad_norm": 3.4677114486694336, - "learning_rate": 3.1281725441801625e-06, - "loss": 0.7909, - "step": 16545 - }, - { - "epoch": 0.7672693555864627, - "grad_norm": 4.036655902862549, - "learning_rate": 3.1222964688151515e-06, - "loss": 0.8848, - "step": 16550 - }, - { - "epoch": 0.7675011590171534, - "grad_norm": 4.098992824554443, - "learning_rate": 3.1164248964078913e-06, - "loss": 0.8305, - "step": 16555 - }, - { - "epoch": 0.7677329624478443, - "grad_norm": 3.2788567543029785, - "learning_rate": 3.110557830802612e-06, - "loss": 0.9747, - "step": 16560 - }, - { - "epoch": 0.767964765878535, - "grad_norm": 3.436088800430298, - "learning_rate": 3.1046952758405913e-06, - "loss": 0.73, - "step": 16565 - }, - { - "epoch": 0.7681965693092258, - "grad_norm": 3.365654468536377, - "learning_rate": 3.098837235360146e-06, - "loss": 0.7866, - "step": 16570 - }, - { - "epoch": 0.7684283727399166, - "grad_norm": 3.64009428024292, - "learning_rate": 3.092983713196639e-06, - "loss": 0.9828, - "step": 16575 - }, - { - "epoch": 0.7686601761706073, - "grad_norm": 4.071859359741211, - "learning_rate": 3.0871347131824823e-06, - "loss": 0.8831, - "step": 16580 - }, - { - "epoch": 0.7688919796012981, - "grad_norm": 3.6441633701324463, - "learning_rate": 3.0812902391471244e-06, - "loss": 0.8001, - "step": 16585 - }, - { - "epoch": 0.7691237830319889, - "grad_norm": 3.392503023147583, - "learning_rate": 3.075450294917044e-06, - "loss": 0.772, - "step": 16590 - }, - { - "epoch": 0.7693555864626797, - "grad_norm": 3.7266502380371094, - "learning_rate": 3.0696148843157646e-06, - "loss": 0.8001, - "step": 16595 - }, - { - "epoch": 0.7695873898933704, - "grad_norm": 3.742562770843506, - "learning_rate": 3.063784011163833e-06, - "loss": 0.9027, - "step": 16600 - }, - { - "epoch": 0.7695873898933704, - "eval_loss": 0.8843169808387756, - "eval_runtime": 11.2638, - "eval_samples_per_second": 11.275, - "eval_steps_per_second": 11.275, - "step": 16600 - }, - { - "epoch": 0.7698191933240612, - "grad_norm": 3.8710248470306396, - "learning_rate": 3.0579576792788234e-06, - "loss": 0.8078, - "step": 16605 - }, - { - "epoch": 0.770050996754752, - "grad_norm": 2.7604763507843018, - "learning_rate": 3.0521358924753466e-06, - "loss": 0.7083, - "step": 16610 - }, - { - "epoch": 0.7702828001854427, - "grad_norm": 3.607058525085449, - "learning_rate": 3.0463186545650346e-06, - "loss": 0.8634, - "step": 16615 - }, - { - "epoch": 0.7705146036161336, - "grad_norm": 2.935387372970581, - "learning_rate": 3.040505969356532e-06, - "loss": 0.7493, - "step": 16620 - }, - { - "epoch": 0.7707464070468243, - "grad_norm": 3.4781177043914795, - "learning_rate": 3.0346978406555172e-06, - "loss": 0.8851, - "step": 16625 - }, - { - "epoch": 0.770978210477515, - "grad_norm": 3.8345789909362793, - "learning_rate": 3.028894272264671e-06, - "loss": 0.7353, - "step": 16630 - }, - { - "epoch": 0.7712100139082059, - "grad_norm": 3.4135794639587402, - "learning_rate": 3.0230952679837023e-06, - "loss": 0.7298, - "step": 16635 - }, - { - "epoch": 0.7714418173388966, - "grad_norm": 3.7546191215515137, - "learning_rate": 3.0173008316093166e-06, - "loss": 0.9568, - "step": 16640 - }, - { - "epoch": 0.7716736207695873, - "grad_norm": 4.952125072479248, - "learning_rate": 3.0115109669352426e-06, - "loss": 0.9374, - "step": 16645 - }, - { - "epoch": 0.7719054242002782, - "grad_norm": 4.161298751831055, - "learning_rate": 3.0057256777522116e-06, - "loss": 0.8104, - "step": 16650 - }, - { - "epoch": 0.7721372276309689, - "grad_norm": 3.4128384590148926, - "learning_rate": 2.999944967847954e-06, - "loss": 0.7943, - "step": 16655 - }, - { - "epoch": 0.7723690310616597, - "grad_norm": 3.6398515701293945, - "learning_rate": 2.9941688410072057e-06, - "loss": 0.7745, - "step": 16660 - }, - { - "epoch": 0.7726008344923505, - "grad_norm": 4.075809478759766, - "learning_rate": 2.9883973010117017e-06, - "loss": 0.7606, - "step": 16665 - }, - { - "epoch": 0.7728326379230412, - "grad_norm": 3.837707281112671, - "learning_rate": 2.98263035164018e-06, - "loss": 0.7089, - "step": 16670 - }, - { - "epoch": 0.7730644413537321, - "grad_norm": 3.544987440109253, - "learning_rate": 2.97686799666836e-06, - "loss": 0.8661, - "step": 16675 - }, - { - "epoch": 0.7732962447844228, - "grad_norm": 4.513082027435303, - "learning_rate": 2.971110239868964e-06, - "loss": 0.8233, - "step": 16680 - }, - { - "epoch": 0.7735280482151136, - "grad_norm": 3.475184917449951, - "learning_rate": 2.9653570850117065e-06, - "loss": 0.9276, - "step": 16685 - }, - { - "epoch": 0.7737598516458044, - "grad_norm": 3.9753215312957764, - "learning_rate": 2.95960853586327e-06, - "loss": 0.795, - "step": 16690 - }, - { - "epoch": 0.7739916550764951, - "grad_norm": 3.1066231727600098, - "learning_rate": 2.95386459618734e-06, - "loss": 0.85, - "step": 16695 - }, - { - "epoch": 0.7742234585071859, - "grad_norm": 3.46185564994812, - "learning_rate": 2.948125269744583e-06, - "loss": 0.7633, - "step": 16700 - }, - { - "epoch": 0.7742234585071859, - "eval_loss": 0.8845146298408508, - "eval_runtime": 11.2686, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 16700 - }, - { - "epoch": 0.7744552619378767, - "grad_norm": 3.0039331912994385, - "learning_rate": 2.942390560292633e-06, - "loss": 0.8481, - "step": 16705 - }, - { - "epoch": 0.7746870653685675, - "grad_norm": 3.653935432434082, - "learning_rate": 2.936660471586116e-06, - "loss": 0.9245, - "step": 16710 - }, - { - "epoch": 0.7749188687992582, - "grad_norm": 3.9792356491088867, - "learning_rate": 2.9309350073766173e-06, - "loss": 0.8936, - "step": 16715 - }, - { - "epoch": 0.775150672229949, - "grad_norm": 3.7885968685150146, - "learning_rate": 2.9252141714127115e-06, - "loss": 0.8169, - "step": 16720 - }, - { - "epoch": 0.7753824756606398, - "grad_norm": 4.647716045379639, - "learning_rate": 2.919497967439926e-06, - "loss": 0.9129, - "step": 16725 - }, - { - "epoch": 0.7756142790913305, - "grad_norm": 4.3390398025512695, - "learning_rate": 2.9137863992007666e-06, - "loss": 0.8678, - "step": 16730 - }, - { - "epoch": 0.7758460825220214, - "grad_norm": 3.9097447395324707, - "learning_rate": 2.908079470434706e-06, - "loss": 0.9015, - "step": 16735 - }, - { - "epoch": 0.7760778859527121, - "grad_norm": 3.679063320159912, - "learning_rate": 2.902377184878169e-06, - "loss": 0.8466, - "step": 16740 - }, - { - "epoch": 0.7763096893834028, - "grad_norm": 4.423166275024414, - "learning_rate": 2.8966795462645448e-06, - "loss": 0.8707, - "step": 16745 - }, - { - "epoch": 0.7765414928140937, - "grad_norm": 3.802765130996704, - "learning_rate": 2.8909865583241825e-06, - "loss": 0.8169, - "step": 16750 - }, - { - "epoch": 0.7767732962447844, - "grad_norm": 4.170059680938721, - "learning_rate": 2.8852982247843886e-06, - "loss": 0.9063, - "step": 16755 - }, - { - "epoch": 0.7770050996754752, - "grad_norm": 4.481255054473877, - "learning_rate": 2.8796145493694127e-06, - "loss": 0.9702, - "step": 16760 - }, - { - "epoch": 0.777236903106166, - "grad_norm": 3.3113086223602295, - "learning_rate": 2.873935535800465e-06, - "loss": 0.861, - "step": 16765 - }, - { - "epoch": 0.7774687065368567, - "grad_norm": 4.658487796783447, - "learning_rate": 2.8682611877957003e-06, - "loss": 0.8002, - "step": 16770 - }, - { - "epoch": 0.7777005099675475, - "grad_norm": 3.762125015258789, - "learning_rate": 2.862591509070214e-06, - "loss": 0.8989, - "step": 16775 - }, - { - "epoch": 0.7779323133982383, - "grad_norm": 3.8254103660583496, - "learning_rate": 2.856926503336047e-06, - "loss": 0.8452, - "step": 16780 - }, - { - "epoch": 0.778164116828929, - "grad_norm": 3.9982810020446777, - "learning_rate": 2.851266174302183e-06, - "loss": 0.9509, - "step": 16785 - }, - { - "epoch": 0.7783959202596198, - "grad_norm": 3.761439561843872, - "learning_rate": 2.845610525674547e-06, - "loss": 0.7328, - "step": 16790 - }, - { - "epoch": 0.7786277236903106, - "grad_norm": 3.8584911823272705, - "learning_rate": 2.8399595611559892e-06, - "loss": 1.0582, - "step": 16795 - }, - { - "epoch": 0.7788595271210014, - "grad_norm": 4.529983997344971, - "learning_rate": 2.834313284446303e-06, - "loss": 0.7333, - "step": 16800 - }, - { - "epoch": 0.7788595271210014, - "eval_loss": 0.883889377117157, - "eval_runtime": 11.2662, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 16800 - }, - { - "epoch": 0.7790913305516922, - "grad_norm": 3.4581003189086914, - "learning_rate": 2.8286716992422083e-06, - "loss": 0.7859, - "step": 16805 - }, - { - "epoch": 0.779323133982383, - "grad_norm": 3.182137966156006, - "learning_rate": 2.8230348092373474e-06, - "loss": 0.8461, - "step": 16810 - }, - { - "epoch": 0.7795549374130737, - "grad_norm": 4.005838394165039, - "learning_rate": 2.8174026181223e-06, - "loss": 0.8896, - "step": 16815 - }, - { - "epoch": 0.7797867408437645, - "grad_norm": 3.5771877765655518, - "learning_rate": 2.8117751295845673e-06, - "loss": 0.8756, - "step": 16820 - }, - { - "epoch": 0.7800185442744553, - "grad_norm": 3.877868890762329, - "learning_rate": 2.8061523473085626e-06, - "loss": 0.7942, - "step": 16825 - }, - { - "epoch": 0.780250347705146, - "grad_norm": 3.636414051055908, - "learning_rate": 2.8005342749756292e-06, - "loss": 0.7895, - "step": 16830 - }, - { - "epoch": 0.7804821511358369, - "grad_norm": 3.0762102603912354, - "learning_rate": 2.7949209162640144e-06, - "loss": 0.7918, - "step": 16835 - }, - { - "epoch": 0.7807139545665276, - "grad_norm": 4.274107933044434, - "learning_rate": 2.7893122748488943e-06, - "loss": 0.9119, - "step": 16840 - }, - { - "epoch": 0.7809457579972183, - "grad_norm": 4.0760602951049805, - "learning_rate": 2.7837083544023425e-06, - "loss": 0.9507, - "step": 16845 - }, - { - "epoch": 0.7811775614279092, - "grad_norm": 3.145632028579712, - "learning_rate": 2.7781091585933484e-06, - "loss": 0.7351, - "step": 16850 - }, - { - "epoch": 0.7814093648585999, - "grad_norm": 4.766406536102295, - "learning_rate": 2.772514691087813e-06, - "loss": 0.9983, - "step": 16855 - }, - { - "epoch": 0.7816411682892906, - "grad_norm": 2.734017848968506, - "learning_rate": 2.7669249555485323e-06, - "loss": 0.7706, - "step": 16860 - }, - { - "epoch": 0.7818729717199815, - "grad_norm": 3.6416406631469727, - "learning_rate": 2.761339955635205e-06, - "loss": 0.8816, - "step": 16865 - }, - { - "epoch": 0.7821047751506722, - "grad_norm": 3.407658576965332, - "learning_rate": 2.755759695004434e-06, - "loss": 0.8622, - "step": 16870 - }, - { - "epoch": 0.782336578581363, - "grad_norm": 3.256417989730835, - "learning_rate": 2.7501841773097227e-06, - "loss": 0.684, - "step": 16875 - }, - { - "epoch": 0.7825683820120538, - "grad_norm": 4.0953826904296875, - "learning_rate": 2.7446134062014563e-06, - "loss": 0.8825, - "step": 16880 - }, - { - "epoch": 0.7828001854427445, - "grad_norm": 3.547276258468628, - "learning_rate": 2.739047385326924e-06, - "loss": 0.6165, - "step": 16885 - }, - { - "epoch": 0.7830319888734353, - "grad_norm": 3.827667474746704, - "learning_rate": 2.7334861183303064e-06, - "loss": 0.8814, - "step": 16890 - }, - { - "epoch": 0.7832637923041261, - "grad_norm": 4.139094352722168, - "learning_rate": 2.727929608852655e-06, - "loss": 0.9706, - "step": 16895 - }, - { - "epoch": 0.7834955957348169, - "grad_norm": 4.088076591491699, - "learning_rate": 2.722377860531924e-06, - "loss": 1.0389, - "step": 16900 - }, - { - "epoch": 0.7834955957348169, - "eval_loss": 0.8829403519630432, - "eval_runtime": 11.2748, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 16900 - }, - { - "epoch": 0.7837273991655076, - "grad_norm": 3.537174940109253, - "learning_rate": 2.716830877002946e-06, - "loss": 0.9587, - "step": 16905 - }, - { - "epoch": 0.7839592025961984, - "grad_norm": 4.048293113708496, - "learning_rate": 2.7112886618974264e-06, - "loss": 0.9287, - "step": 16910 - }, - { - "epoch": 0.7841910060268892, - "grad_norm": 4.15169095993042, - "learning_rate": 2.7057512188439615e-06, - "loss": 0.8293, - "step": 16915 - }, - { - "epoch": 0.7844228094575799, - "grad_norm": 3.689326763153076, - "learning_rate": 2.700218551468009e-06, - "loss": 0.7086, - "step": 16920 - }, - { - "epoch": 0.7846546128882708, - "grad_norm": 3.1314663887023926, - "learning_rate": 2.6946906633919136e-06, - "loss": 0.9349, - "step": 16925 - }, - { - "epoch": 0.7848864163189615, - "grad_norm": 4.037022113800049, - "learning_rate": 2.6891675582348774e-06, - "loss": 0.9397, - "step": 16930 - }, - { - "epoch": 0.7851182197496523, - "grad_norm": 3.389939785003662, - "learning_rate": 2.683649239612982e-06, - "loss": 0.8126, - "step": 16935 - }, - { - "epoch": 0.7853500231803431, - "grad_norm": 4.172562599182129, - "learning_rate": 2.6781357111391725e-06, - "loss": 0.9107, - "step": 16940 - }, - { - "epoch": 0.7855818266110338, - "grad_norm": 3.2886476516723633, - "learning_rate": 2.672626976423256e-06, - "loss": 0.6793, - "step": 16945 - }, - { - "epoch": 0.7858136300417247, - "grad_norm": 3.162684679031372, - "learning_rate": 2.6671230390718962e-06, - "loss": 0.8093, - "step": 16950 - }, - { - "epoch": 0.7860454334724154, - "grad_norm": 4.041141033172607, - "learning_rate": 2.661623902688625e-06, - "loss": 1.0565, - "step": 16955 - }, - { - "epoch": 0.7862772369031061, - "grad_norm": 3.6500637531280518, - "learning_rate": 2.65612957087383e-06, - "loss": 0.7986, - "step": 16960 - }, - { - "epoch": 0.786509040333797, - "grad_norm": 3.767634868621826, - "learning_rate": 2.650640047224744e-06, - "loss": 0.8563, - "step": 16965 - }, - { - "epoch": 0.7867408437644877, - "grad_norm": 3.3875231742858887, - "learning_rate": 2.645155335335461e-06, - "loss": 0.7401, - "step": 16970 - }, - { - "epoch": 0.7869726471951785, - "grad_norm": 3.8744959831237793, - "learning_rate": 2.639675438796926e-06, - "loss": 0.7653, - "step": 16975 - }, - { - "epoch": 0.7872044506258693, - "grad_norm": 4.187319278717041, - "learning_rate": 2.6342003611969225e-06, - "loss": 0.6953, - "step": 16980 - }, - { - "epoch": 0.78743625405656, - "grad_norm": 3.9528913497924805, - "learning_rate": 2.6287301061200822e-06, - "loss": 0.7537, - "step": 16985 - }, - { - "epoch": 0.7876680574872508, - "grad_norm": 3.991596221923828, - "learning_rate": 2.623264677147882e-06, - "loss": 0.9745, - "step": 16990 - }, - { - "epoch": 0.7878998609179416, - "grad_norm": 3.6371634006500244, - "learning_rate": 2.6178040778586424e-06, - "loss": 0.921, - "step": 16995 - }, - { - "epoch": 0.7881316643486324, - "grad_norm": 4.425895690917969, - "learning_rate": 2.6123483118275097e-06, - "loss": 0.8142, - "step": 17000 - }, - { - "epoch": 0.7881316643486324, - "eval_loss": 0.8824594020843506, - "eval_runtime": 11.268, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 17000 - }, - { - "epoch": 0.7883634677793231, - "grad_norm": 3.481616735458374, - "learning_rate": 2.60689738262648e-06, - "loss": 0.7512, - "step": 17005 - }, - { - "epoch": 0.7885952712100139, - "grad_norm": 3.797632932662964, - "learning_rate": 2.601451293824372e-06, - "loss": 0.7542, - "step": 17010 - }, - { - "epoch": 0.7888270746407047, - "grad_norm": 4.2333173751831055, - "learning_rate": 2.5960100489868377e-06, - "loss": 0.9087, - "step": 17015 - }, - { - "epoch": 0.7890588780713954, - "grad_norm": 4.05795955657959, - "learning_rate": 2.5905736516763626e-06, - "loss": 0.8283, - "step": 17020 - }, - { - "epoch": 0.7892906815020863, - "grad_norm": 3.3479185104370117, - "learning_rate": 2.5851421054522552e-06, - "loss": 0.697, - "step": 17025 - }, - { - "epoch": 0.789522484932777, - "grad_norm": 4.0138139724731445, - "learning_rate": 2.579715413870645e-06, - "loss": 0.7351, - "step": 17030 - }, - { - "epoch": 0.7897542883634677, - "grad_norm": 3.8981614112854004, - "learning_rate": 2.5742935804844894e-06, - "loss": 0.9307, - "step": 17035 - }, - { - "epoch": 0.7899860917941586, - "grad_norm": 4.282942295074463, - "learning_rate": 2.568876608843556e-06, - "loss": 0.941, - "step": 17040 - }, - { - "epoch": 0.7902178952248493, - "grad_norm": 4.372350215911865, - "learning_rate": 2.5634645024944414e-06, - "loss": 1.047, - "step": 17045 - }, - { - "epoch": 0.7904496986555402, - "grad_norm": 4.30950927734375, - "learning_rate": 2.5580572649805445e-06, - "loss": 0.9546, - "step": 17050 - }, - { - "epoch": 0.7906815020862309, - "grad_norm": 3.705162286758423, - "learning_rate": 2.5526548998420843e-06, - "loss": 0.8446, - "step": 17055 - }, - { - "epoch": 0.7909133055169216, - "grad_norm": 3.317826271057129, - "learning_rate": 2.5472574106160907e-06, - "loss": 0.9309, - "step": 17060 - }, - { - "epoch": 0.7911451089476125, - "grad_norm": 4.324466705322266, - "learning_rate": 2.5418648008363955e-06, - "loss": 0.835, - "step": 17065 - }, - { - "epoch": 0.7913769123783032, - "grad_norm": 3.8251116275787354, - "learning_rate": 2.5364770740336364e-06, - "loss": 0.8074, - "step": 17070 - }, - { - "epoch": 0.791608715808994, - "grad_norm": 3.2614023685455322, - "learning_rate": 2.531094233735257e-06, - "loss": 0.8442, - "step": 17075 - }, - { - "epoch": 0.7918405192396848, - "grad_norm": 3.841665267944336, - "learning_rate": 2.5257162834655057e-06, - "loss": 0.7189, - "step": 17080 - }, - { - "epoch": 0.7920723226703755, - "grad_norm": 3.531162977218628, - "learning_rate": 2.5203432267454188e-06, - "loss": 0.9309, - "step": 17085 - }, - { - "epoch": 0.7923041261010663, - "grad_norm": 4.039501667022705, - "learning_rate": 2.514975067092835e-06, - "loss": 0.8324, - "step": 17090 - }, - { - "epoch": 0.7925359295317571, - "grad_norm": 3.6314001083374023, - "learning_rate": 2.509611808022394e-06, - "loss": 0.8618, - "step": 17095 - }, - { - "epoch": 0.7927677329624478, - "grad_norm": 4.1385178565979, - "learning_rate": 2.5042534530455076e-06, - "loss": 0.8456, - "step": 17100 - }, - { - "epoch": 0.7927677329624478, - "eval_loss": 0.8818289637565613, - "eval_runtime": 11.2697, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 17100 - }, - { - "epoch": 0.7929995363931386, - "grad_norm": 3.2137322425842285, - "learning_rate": 2.498900005670394e-06, - "loss": 0.7141, - "step": 17105 - }, - { - "epoch": 0.7932313398238294, - "grad_norm": 4.110595226287842, - "learning_rate": 2.4935514694020545e-06, - "loss": 0.867, - "step": 17110 - }, - { - "epoch": 0.7934631432545202, - "grad_norm": 3.7244606018066406, - "learning_rate": 2.488207847742269e-06, - "loss": 0.9114, - "step": 17115 - }, - { - "epoch": 0.7936949466852109, - "grad_norm": 2.8758745193481445, - "learning_rate": 2.4828691441896056e-06, - "loss": 0.7327, - "step": 17120 - }, - { - "epoch": 0.7939267501159017, - "grad_norm": 3.3576953411102295, - "learning_rate": 2.477535362239414e-06, - "loss": 0.9152, - "step": 17125 - }, - { - "epoch": 0.7941585535465925, - "grad_norm": 3.668461561203003, - "learning_rate": 2.472206505383815e-06, - "loss": 0.9391, - "step": 17130 - }, - { - "epoch": 0.7943903569772832, - "grad_norm": 3.574362277984619, - "learning_rate": 2.4668825771117065e-06, - "loss": 0.8111, - "step": 17135 - }, - { - "epoch": 0.7946221604079741, - "grad_norm": 3.9505579471588135, - "learning_rate": 2.4615635809087646e-06, - "loss": 0.9073, - "step": 17140 - }, - { - "epoch": 0.7948539638386648, - "grad_norm": 3.6357901096343994, - "learning_rate": 2.456249520257433e-06, - "loss": 0.8859, - "step": 17145 - }, - { - "epoch": 0.7950857672693555, - "grad_norm": 4.1312127113342285, - "learning_rate": 2.4509403986369242e-06, - "loss": 1.0306, - "step": 17150 - }, - { - "epoch": 0.7953175707000464, - "grad_norm": 3.519533634185791, - "learning_rate": 2.445636219523212e-06, - "loss": 0.8652, - "step": 17155 - }, - { - "epoch": 0.7955493741307371, - "grad_norm": 3.3949859142303467, - "learning_rate": 2.4403369863890425e-06, - "loss": 0.7245, - "step": 17160 - }, - { - "epoch": 0.7957811775614279, - "grad_norm": 3.7756998538970947, - "learning_rate": 2.4350427027039234e-06, - "loss": 0.8728, - "step": 17165 - }, - { - "epoch": 0.7960129809921187, - "grad_norm": 3.843841075897217, - "learning_rate": 2.429753371934114e-06, - "loss": 0.856, - "step": 17170 - }, - { - "epoch": 0.7962447844228094, - "grad_norm": 3.760075330734253, - "learning_rate": 2.424468997542635e-06, - "loss": 1.0026, - "step": 17175 - }, - { - "epoch": 0.7964765878535003, - "grad_norm": 3.853175163269043, - "learning_rate": 2.4191895829892707e-06, - "loss": 0.7359, - "step": 17180 - }, - { - "epoch": 0.796708391284191, - "grad_norm": 3.496713638305664, - "learning_rate": 2.4139151317305386e-06, - "loss": 0.7392, - "step": 17185 - }, - { - "epoch": 0.7969401947148818, - "grad_norm": 3.5978150367736816, - "learning_rate": 2.4086456472197227e-06, - "loss": 0.9091, - "step": 17190 - }, - { - "epoch": 0.7971719981455726, - "grad_norm": 3.749814033508301, - "learning_rate": 2.403381132906851e-06, - "loss": 0.794, - "step": 17195 - }, - { - "epoch": 0.7974038015762633, - "grad_norm": 4.097865581512451, - "learning_rate": 2.3981215922386993e-06, - "loss": 0.9018, - "step": 17200 - }, - { - "epoch": 0.7974038015762633, - "eval_loss": 0.8813273906707764, - "eval_runtime": 11.2722, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 17200 - }, - { - "epoch": 0.7976356050069541, - "grad_norm": 4.155472755432129, - "learning_rate": 2.3928670286587797e-06, - "loss": 0.8676, - "step": 17205 - }, - { - "epoch": 0.7978674084376449, - "grad_norm": 3.5118024349212646, - "learning_rate": 2.3876174456073564e-06, - "loss": 0.8646, - "step": 17210 - }, - { - "epoch": 0.7980992118683357, - "grad_norm": 3.5498898029327393, - "learning_rate": 2.3823728465214235e-06, - "loss": 0.85, - "step": 17215 - }, - { - "epoch": 0.7983310152990264, - "grad_norm": 4.195731163024902, - "learning_rate": 2.377133234834713e-06, - "loss": 0.8185, - "step": 17220 - }, - { - "epoch": 0.7985628187297172, - "grad_norm": 4.20908260345459, - "learning_rate": 2.3718986139776967e-06, - "loss": 0.8682, - "step": 17225 - }, - { - "epoch": 0.798794622160408, - "grad_norm": 4.149493217468262, - "learning_rate": 2.3666689873775793e-06, - "loss": 1.0926, - "step": 17230 - }, - { - "epoch": 0.7990264255910987, - "grad_norm": 4.024598598480225, - "learning_rate": 2.361444358458288e-06, - "loss": 0.9183, - "step": 17235 - }, - { - "epoch": 0.7992582290217896, - "grad_norm": 3.8815884590148926, - "learning_rate": 2.356224730640486e-06, - "loss": 0.8183, - "step": 17240 - }, - { - "epoch": 0.7994900324524803, - "grad_norm": 4.009396553039551, - "learning_rate": 2.3510101073415537e-06, - "loss": 0.7657, - "step": 17245 - }, - { - "epoch": 0.799721835883171, - "grad_norm": 3.6904358863830566, - "learning_rate": 2.345800491975606e-06, - "loss": 0.9904, - "step": 17250 - }, - { - "epoch": 0.7999536393138619, - "grad_norm": 4.349132537841797, - "learning_rate": 2.3405958879534687e-06, - "loss": 0.7577, - "step": 17255 - }, - { - "epoch": 0.8001854427445526, - "grad_norm": 3.6113533973693848, - "learning_rate": 2.3353962986826904e-06, - "loss": 0.8239, - "step": 17260 - }, - { - "epoch": 0.8004172461752433, - "grad_norm": 3.7421815395355225, - "learning_rate": 2.3302017275675427e-06, - "loss": 0.8305, - "step": 17265 - }, - { - "epoch": 0.8006490496059342, - "grad_norm": 3.947587013244629, - "learning_rate": 2.325012178009002e-06, - "loss": 0.9255, - "step": 17270 - }, - { - "epoch": 0.8008808530366249, - "grad_norm": 4.150144577026367, - "learning_rate": 2.319827653404757e-06, - "loss": 0.8958, - "step": 17275 - }, - { - "epoch": 0.8011126564673157, - "grad_norm": 3.0124239921569824, - "learning_rate": 2.314648157149214e-06, - "loss": 0.6897, - "step": 17280 - }, - { - "epoch": 0.8013444598980065, - "grad_norm": 3.2395570278167725, - "learning_rate": 2.3094736926334883e-06, - "loss": 0.8505, - "step": 17285 - }, - { - "epoch": 0.8015762633286972, - "grad_norm": 3.67964768409729, - "learning_rate": 2.304304263245387e-06, - "loss": 0.7883, - "step": 17290 - }, - { - "epoch": 0.8018080667593881, - "grad_norm": 3.6921372413635254, - "learning_rate": 2.2991398723694346e-06, - "loss": 0.7569, - "step": 17295 - }, - { - "epoch": 0.8020398701900788, - "grad_norm": 3.7016260623931885, - "learning_rate": 2.2939805233868575e-06, - "loss": 0.7654, - "step": 17300 - }, - { - "epoch": 0.8020398701900788, - "eval_loss": 0.8810060620307922, - "eval_runtime": 11.2796, - "eval_samples_per_second": 11.259, - "eval_steps_per_second": 11.259, - "step": 17300 - }, - { - "epoch": 0.8022716736207696, - "grad_norm": 3.850266933441162, - "learning_rate": 2.288826219675563e-06, - "loss": 0.8341, - "step": 17305 - }, - { - "epoch": 0.8025034770514604, - "grad_norm": 3.525607109069824, - "learning_rate": 2.2836769646101743e-06, - "loss": 0.8223, - "step": 17310 - }, - { - "epoch": 0.8027352804821511, - "grad_norm": 4.057685375213623, - "learning_rate": 2.2785327615620055e-06, - "loss": 0.8506, - "step": 17315 - }, - { - "epoch": 0.8029670839128419, - "grad_norm": 4.184149742126465, - "learning_rate": 2.273393613899053e-06, - "loss": 1.0103, - "step": 17320 - }, - { - "epoch": 0.8031988873435327, - "grad_norm": 3.58420729637146, - "learning_rate": 2.2682595249860138e-06, - "loss": 0.828, - "step": 17325 - }, - { - "epoch": 0.8034306907742235, - "grad_norm": 3.695439577102661, - "learning_rate": 2.263130498184274e-06, - "loss": 0.8649, - "step": 17330 - }, - { - "epoch": 0.8036624942049142, - "grad_norm": 3.5146336555480957, - "learning_rate": 2.2580065368518943e-06, - "loss": 0.834, - "step": 17335 - }, - { - "epoch": 0.803894297635605, - "grad_norm": 3.4850876331329346, - "learning_rate": 2.2528876443436266e-06, - "loss": 0.6973, - "step": 17340 - }, - { - "epoch": 0.8041261010662958, - "grad_norm": 3.5119123458862305, - "learning_rate": 2.247773824010903e-06, - "loss": 0.6997, - "step": 17345 - }, - { - "epoch": 0.8043579044969865, - "grad_norm": 4.371307373046875, - "learning_rate": 2.242665079201839e-06, - "loss": 0.8097, - "step": 17350 - }, - { - "epoch": 0.8045897079276774, - "grad_norm": 3.5886342525482178, - "learning_rate": 2.237561413261218e-06, - "loss": 0.8238, - "step": 17355 - }, - { - "epoch": 0.8048215113583681, - "grad_norm": 3.4891817569732666, - "learning_rate": 2.232462829530506e-06, - "loss": 0.7724, - "step": 17360 - }, - { - "epoch": 0.8050533147890588, - "grad_norm": 3.905630588531494, - "learning_rate": 2.2273693313478352e-06, - "loss": 0.708, - "step": 17365 - }, - { - "epoch": 0.8052851182197497, - "grad_norm": 3.9939043521881104, - "learning_rate": 2.222280922048017e-06, - "loss": 0.8429, - "step": 17370 - }, - { - "epoch": 0.8055169216504404, - "grad_norm": 3.5375664234161377, - "learning_rate": 2.2171976049625186e-06, - "loss": 0.9396, - "step": 17375 - }, - { - "epoch": 0.8057487250811312, - "grad_norm": 3.547438144683838, - "learning_rate": 2.2121193834194855e-06, - "loss": 0.7913, - "step": 17380 - }, - { - "epoch": 0.805980528511822, - "grad_norm": 4.114473342895508, - "learning_rate": 2.207046260743726e-06, - "loss": 0.8347, - "step": 17385 - }, - { - "epoch": 0.8062123319425127, - "grad_norm": 3.792337656021118, - "learning_rate": 2.2019782402566934e-06, - "loss": 0.8177, - "step": 17390 - }, - { - "epoch": 0.8064441353732035, - "grad_norm": 3.6505558490753174, - "learning_rate": 2.196915325276522e-06, - "loss": 0.8452, - "step": 17395 - }, - { - "epoch": 0.8066759388038943, - "grad_norm": 4.075436592102051, - "learning_rate": 2.1918575191179937e-06, - "loss": 0.7933, - "step": 17400 - }, - { - "epoch": 0.8066759388038943, - "eval_loss": 0.8806984424591064, - "eval_runtime": 11.2762, - "eval_samples_per_second": 11.263, - "eval_steps_per_second": 11.263, - "step": 17400 - }, - { - "epoch": 0.8069077422345851, - "grad_norm": 4.127367973327637, - "learning_rate": 2.186804825092542e-06, - "loss": 0.8318, - "step": 17405 - }, - { - "epoch": 0.8071395456652758, - "grad_norm": 3.078382968902588, - "learning_rate": 2.1817572465082627e-06, - "loss": 0.7779, - "step": 17410 - }, - { - "epoch": 0.8073713490959666, - "grad_norm": 3.11124587059021, - "learning_rate": 2.1767147866698967e-06, - "loss": 0.8903, - "step": 17415 - }, - { - "epoch": 0.8076031525266574, - "grad_norm": 4.386844635009766, - "learning_rate": 2.171677448878833e-06, - "loss": 0.9116, - "step": 17420 - }, - { - "epoch": 0.8078349559573482, - "grad_norm": 3.5533015727996826, - "learning_rate": 2.166645236433106e-06, - "loss": 0.8415, - "step": 17425 - }, - { - "epoch": 0.808066759388039, - "grad_norm": 3.453176736831665, - "learning_rate": 2.1616181526273994e-06, - "loss": 0.8109, - "step": 17430 - }, - { - "epoch": 0.8082985628187297, - "grad_norm": 3.496314525604248, - "learning_rate": 2.156596200753038e-06, - "loss": 0.976, - "step": 17435 - }, - { - "epoch": 0.8085303662494205, - "grad_norm": 3.6104514598846436, - "learning_rate": 2.1515793840979827e-06, - "loss": 0.7953, - "step": 17440 - }, - { - "epoch": 0.8087621696801113, - "grad_norm": 3.6141369342803955, - "learning_rate": 2.146567705946837e-06, - "loss": 0.7809, - "step": 17445 - }, - { - "epoch": 0.808993973110802, - "grad_norm": 4.1143293380737305, - "learning_rate": 2.1415611695808334e-06, - "loss": 0.9777, - "step": 17450 - }, - { - "epoch": 0.8092257765414929, - "grad_norm": 3.966641426086426, - "learning_rate": 2.1365597782778493e-06, - "loss": 0.9165, - "step": 17455 - }, - { - "epoch": 0.8094575799721836, - "grad_norm": 3.243546962738037, - "learning_rate": 2.13156353531238e-06, - "loss": 0.856, - "step": 17460 - }, - { - "epoch": 0.8096893834028743, - "grad_norm": 3.5764763355255127, - "learning_rate": 2.1265724439555592e-06, - "loss": 0.7839, - "step": 17465 - }, - { - "epoch": 0.8099211868335652, - "grad_norm": 3.726209878921509, - "learning_rate": 2.1215865074751487e-06, - "loss": 0.7766, - "step": 17470 - }, - { - "epoch": 0.8101529902642559, - "grad_norm": 4.007201671600342, - "learning_rate": 2.116605729135528e-06, - "loss": 0.6973, - "step": 17475 - }, - { - "epoch": 0.8103847936949466, - "grad_norm": 3.5548746585845947, - "learning_rate": 2.1116301121977024e-06, - "loss": 0.9117, - "step": 17480 - }, - { - "epoch": 0.8106165971256375, - "grad_norm": 4.227999210357666, - "learning_rate": 2.1066596599193e-06, - "loss": 0.9328, - "step": 17485 - }, - { - "epoch": 0.8108484005563282, - "grad_norm": 4.0675201416015625, - "learning_rate": 2.1016943755545704e-06, - "loss": 1.0801, - "step": 17490 - }, - { - "epoch": 0.811080203987019, - "grad_norm": 4.169510364532471, - "learning_rate": 2.0967342623543695e-06, - "loss": 0.8303, - "step": 17495 - }, - { - "epoch": 0.8113120074177098, - "grad_norm": 3.53352952003479, - "learning_rate": 2.0917793235661777e-06, - "loss": 0.7763, - "step": 17500 - }, - { - "epoch": 0.8113120074177098, - "eval_loss": 0.880153477191925, - "eval_runtime": 11.2652, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 17500 - }, - { - "epoch": 0.8115438108484005, - "grad_norm": 3.3654439449310303, - "learning_rate": 2.0868295624340874e-06, - "loss": 0.932, - "step": 17505 - }, - { - "epoch": 0.8117756142790913, - "grad_norm": 4.071439266204834, - "learning_rate": 2.0818849821987875e-06, - "loss": 0.9639, - "step": 17510 - }, - { - "epoch": 0.8120074177097821, - "grad_norm": 3.366882085800171, - "learning_rate": 2.0769455860975896e-06, - "loss": 0.9948, - "step": 17515 - }, - { - "epoch": 0.8122392211404729, - "grad_norm": 4.544328212738037, - "learning_rate": 2.07201137736441e-06, - "loss": 0.9075, - "step": 17520 - }, - { - "epoch": 0.8124710245711636, - "grad_norm": 4.452793598175049, - "learning_rate": 2.0670823592297583e-06, - "loss": 0.8756, - "step": 17525 - }, - { - "epoch": 0.8127028280018544, - "grad_norm": 3.932635545730591, - "learning_rate": 2.0621585349207562e-06, - "loss": 0.9399, - "step": 17530 - }, - { - "epoch": 0.8129346314325452, - "grad_norm": 3.453287363052368, - "learning_rate": 2.0572399076611228e-06, - "loss": 0.7789, - "step": 17535 - }, - { - "epoch": 0.8131664348632359, - "grad_norm": 3.538092613220215, - "learning_rate": 2.052326480671171e-06, - "loss": 0.8716, - "step": 17540 - }, - { - "epoch": 0.8133982382939268, - "grad_norm": 4.095542907714844, - "learning_rate": 2.047418257167808e-06, - "loss": 0.8383, - "step": 17545 - }, - { - "epoch": 0.8136300417246175, - "grad_norm": 3.764967918395996, - "learning_rate": 2.04251524036454e-06, - "loss": 0.8201, - "step": 17550 - }, - { - "epoch": 0.8138618451553083, - "grad_norm": 3.590008497238159, - "learning_rate": 2.037617433471464e-06, - "loss": 0.6739, - "step": 17555 - }, - { - "epoch": 0.8140936485859991, - "grad_norm": 3.986955404281616, - "learning_rate": 2.0327248396952582e-06, - "loss": 0.8388, - "step": 17560 - }, - { - "epoch": 0.8143254520166898, - "grad_norm": 3.728408098220825, - "learning_rate": 2.0278374622391983e-06, - "loss": 0.9149, - "step": 17565 - }, - { - "epoch": 0.8145572554473807, - "grad_norm": 3.5650875568389893, - "learning_rate": 2.0229553043031324e-06, - "loss": 0.8783, - "step": 17570 - }, - { - "epoch": 0.8147890588780714, - "grad_norm": 4.118309497833252, - "learning_rate": 2.018078369083506e-06, - "loss": 0.7925, - "step": 17575 - }, - { - "epoch": 0.8150208623087621, - "grad_norm": 4.11269998550415, - "learning_rate": 2.0132066597733302e-06, - "loss": 0.9272, - "step": 17580 - }, - { - "epoch": 0.815252665739453, - "grad_norm": 3.2680740356445312, - "learning_rate": 2.0083401795622046e-06, - "loss": 0.8929, - "step": 17585 - }, - { - "epoch": 0.8154844691701437, - "grad_norm": 3.561012029647827, - "learning_rate": 2.0034789316363067e-06, - "loss": 0.7216, - "step": 17590 - }, - { - "epoch": 0.8157162726008345, - "grad_norm": 3.8684680461883545, - "learning_rate": 1.99862291917838e-06, - "loss": 0.8087, - "step": 17595 - }, - { - "epoch": 0.8159480760315253, - "grad_norm": 3.5848188400268555, - "learning_rate": 1.993772145367742e-06, - "loss": 0.9077, - "step": 17600 - }, - { - "epoch": 0.8159480760315253, - "eval_loss": 0.8796182870864868, - "eval_runtime": 11.273, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 17600 - }, - { - "epoch": 0.816179879462216, - "grad_norm": 3.892946243286133, - "learning_rate": 1.9889266133802877e-06, - "loss": 0.8927, - "step": 17605 - }, - { - "epoch": 0.8164116828929068, - "grad_norm": 3.616286277770996, - "learning_rate": 1.98408632638847e-06, - "loss": 0.838, - "step": 17610 - }, - { - "epoch": 0.8166434863235976, - "grad_norm": 3.7281265258789062, - "learning_rate": 1.979251287561316e-06, - "loss": 0.7584, - "step": 17615 - }, - { - "epoch": 0.8168752897542884, - "grad_norm": 3.5958240032196045, - "learning_rate": 1.9744215000644153e-06, - "loss": 0.8676, - "step": 17620 - }, - { - "epoch": 0.8171070931849791, - "grad_norm": 3.500471591949463, - "learning_rate": 1.9695969670599156e-06, - "loss": 0.7494, - "step": 17625 - }, - { - "epoch": 0.8173388966156699, - "grad_norm": 3.6047792434692383, - "learning_rate": 1.9647776917065253e-06, - "loss": 0.8516, - "step": 17630 - }, - { - "epoch": 0.8175707000463607, - "grad_norm": 3.827798366546631, - "learning_rate": 1.959963677159512e-06, - "loss": 0.9751, - "step": 17635 - }, - { - "epoch": 0.8178025034770514, - "grad_norm": 3.7230641841888428, - "learning_rate": 1.9551549265707025e-06, - "loss": 0.8542, - "step": 17640 - }, - { - "epoch": 0.8180343069077423, - "grad_norm": 3.4928386211395264, - "learning_rate": 1.9503514430884685e-06, - "loss": 0.7689, - "step": 17645 - }, - { - "epoch": 0.818266110338433, - "grad_norm": 3.538079261779785, - "learning_rate": 1.9455532298577417e-06, - "loss": 0.6581, - "step": 17650 - }, - { - "epoch": 0.8184979137691237, - "grad_norm": 3.569852590560913, - "learning_rate": 1.9407602900199975e-06, - "loss": 0.9138, - "step": 17655 - }, - { - "epoch": 0.8187297171998146, - "grad_norm": 4.027120590209961, - "learning_rate": 1.935972626713263e-06, - "loss": 0.9222, - "step": 17660 - }, - { - "epoch": 0.8189615206305053, - "grad_norm": 3.9751973152160645, - "learning_rate": 1.931190243072105e-06, - "loss": 0.8335, - "step": 17665 - }, - { - "epoch": 0.8191933240611962, - "grad_norm": 3.7292730808258057, - "learning_rate": 1.9264131422276387e-06, - "loss": 0.8459, - "step": 17670 - }, - { - "epoch": 0.8194251274918869, - "grad_norm": 4.147140979766846, - "learning_rate": 1.921641327307523e-06, - "loss": 0.7935, - "step": 17675 - }, - { - "epoch": 0.8196569309225776, - "grad_norm": 4.169584274291992, - "learning_rate": 1.9168748014359494e-06, - "loss": 0.8263, - "step": 17680 - }, - { - "epoch": 0.8198887343532685, - "grad_norm": 3.447660446166992, - "learning_rate": 1.9121135677336445e-06, - "loss": 1.0089, - "step": 17685 - }, - { - "epoch": 0.8201205377839592, - "grad_norm": 3.7443501949310303, - "learning_rate": 1.9073576293178797e-06, - "loss": 0.8516, - "step": 17690 - }, - { - "epoch": 0.82035234121465, - "grad_norm": 3.317669630050659, - "learning_rate": 1.9026069893024556e-06, - "loss": 0.7569, - "step": 17695 - }, - { - "epoch": 0.8205841446453408, - "grad_norm": 3.989811897277832, - "learning_rate": 1.8978616507976989e-06, - "loss": 0.6587, - "step": 17700 - }, - { - "epoch": 0.8205841446453408, - "eval_loss": 0.8793162703514099, - "eval_runtime": 11.2717, - "eval_samples_per_second": 11.267, - "eval_steps_per_second": 11.267, - "step": 17700 - }, - { - "epoch": 0.8208159480760315, - "grad_norm": 3.4570274353027344, - "learning_rate": 1.8931216169104704e-06, - "loss": 0.8214, - "step": 17705 - }, - { - "epoch": 0.8210477515067223, - "grad_norm": 3.902951240539551, - "learning_rate": 1.8883868907441615e-06, - "loss": 0.7536, - "step": 17710 - }, - { - "epoch": 0.8212795549374131, - "grad_norm": 3.0927488803863525, - "learning_rate": 1.8836574753986758e-06, - "loss": 0.7152, - "step": 17715 - }, - { - "epoch": 0.8215113583681039, - "grad_norm": 4.087670803070068, - "learning_rate": 1.8789333739704496e-06, - "loss": 0.9579, - "step": 17720 - }, - { - "epoch": 0.8217431617987946, - "grad_norm": 3.9503560066223145, - "learning_rate": 1.8742145895524432e-06, - "loss": 0.7369, - "step": 17725 - }, - { - "epoch": 0.8219749652294854, - "grad_norm": 2.7508480548858643, - "learning_rate": 1.8695011252341244e-06, - "loss": 0.7558, - "step": 17730 - }, - { - "epoch": 0.8222067686601762, - "grad_norm": 7.382952690124512, - "learning_rate": 1.8647929841014867e-06, - "loss": 0.9415, - "step": 17735 - }, - { - "epoch": 0.8224385720908669, - "grad_norm": 3.3182077407836914, - "learning_rate": 1.8600901692370377e-06, - "loss": 0.8364, - "step": 17740 - }, - { - "epoch": 0.8226703755215578, - "grad_norm": 4.234379291534424, - "learning_rate": 1.8553926837197956e-06, - "loss": 1.0117, - "step": 17745 - }, - { - "epoch": 0.8229021789522485, - "grad_norm": 3.2763216495513916, - "learning_rate": 1.8507005306252856e-06, - "loss": 0.8022, - "step": 17750 - }, - { - "epoch": 0.8231339823829392, - "grad_norm": 4.160790920257568, - "learning_rate": 1.8460137130255472e-06, - "loss": 0.9042, - "step": 17755 - }, - { - "epoch": 0.8233657858136301, - "grad_norm": 3.7351162433624268, - "learning_rate": 1.8413322339891303e-06, - "loss": 0.8661, - "step": 17760 - }, - { - "epoch": 0.8235975892443208, - "grad_norm": 3.6955184936523438, - "learning_rate": 1.8366560965810787e-06, - "loss": 0.8112, - "step": 17765 - }, - { - "epoch": 0.8238293926750115, - "grad_norm": 3.1400656700134277, - "learning_rate": 1.8319853038629486e-06, - "loss": 0.8455, - "step": 17770 - }, - { - "epoch": 0.8240611961057024, - "grad_norm": 3.6449131965637207, - "learning_rate": 1.8273198588927887e-06, - "loss": 0.7387, - "step": 17775 - }, - { - "epoch": 0.8242929995363931, - "grad_norm": 3.8718135356903076, - "learning_rate": 1.8226597647251564e-06, - "loss": 0.8486, - "step": 17780 - }, - { - "epoch": 0.8245248029670839, - "grad_norm": 3.834111452102661, - "learning_rate": 1.8180050244110947e-06, - "loss": 0.7541, - "step": 17785 - }, - { - "epoch": 0.8247566063977747, - "grad_norm": 3.8849096298217773, - "learning_rate": 1.813355640998149e-06, - "loss": 0.9113, - "step": 17790 - }, - { - "epoch": 0.8249884098284654, - "grad_norm": 4.433188438415527, - "learning_rate": 1.8087116175303577e-06, - "loss": 0.8947, - "step": 17795 - }, - { - "epoch": 0.8252202132591563, - "grad_norm": 3.6504225730895996, - "learning_rate": 1.8040729570482452e-06, - "loss": 0.8461, - "step": 17800 - }, - { - "epoch": 0.8252202132591563, - "eval_loss": 0.8788275718688965, - "eval_runtime": 11.2707, - "eval_samples_per_second": 11.268, - "eval_steps_per_second": 11.268, - "step": 17800 - }, - { - "epoch": 0.825452016689847, - "grad_norm": 3.6747710704803467, - "learning_rate": 1.7994396625888255e-06, - "loss": 0.865, - "step": 17805 - }, - { - "epoch": 0.8256838201205378, - "grad_norm": 3.45422101020813, - "learning_rate": 1.7948117371856045e-06, - "loss": 0.9433, - "step": 17810 - }, - { - "epoch": 0.8259156235512286, - "grad_norm": 3.293832302093506, - "learning_rate": 1.7901891838685659e-06, - "loss": 0.8554, - "step": 17815 - }, - { - "epoch": 0.8261474269819193, - "grad_norm": 3.420264959335327, - "learning_rate": 1.78557200566418e-06, - "loss": 0.8656, - "step": 17820 - }, - { - "epoch": 0.8263792304126101, - "grad_norm": 4.223310470581055, - "learning_rate": 1.7809602055954012e-06, - "loss": 0.8573, - "step": 17825 - }, - { - "epoch": 0.8266110338433009, - "grad_norm": 3.8550260066986084, - "learning_rate": 1.776353786681656e-06, - "loss": 0.8986, - "step": 17830 - }, - { - "epoch": 0.8268428372739917, - "grad_norm": 3.8833189010620117, - "learning_rate": 1.771752751938849e-06, - "loss": 0.7821, - "step": 17835 - }, - { - "epoch": 0.8270746407046824, - "grad_norm": 3.930795669555664, - "learning_rate": 1.7671571043793634e-06, - "loss": 0.8693, - "step": 17840 - }, - { - "epoch": 0.8273064441353732, - "grad_norm": 4.167591571807861, - "learning_rate": 1.762566847012055e-06, - "loss": 0.7985, - "step": 17845 - }, - { - "epoch": 0.827538247566064, - "grad_norm": 3.4307236671447754, - "learning_rate": 1.7579819828422451e-06, - "loss": 0.6935, - "step": 17850 - }, - { - "epoch": 0.8277700509967547, - "grad_norm": 4.414237976074219, - "learning_rate": 1.7534025148717327e-06, - "loss": 0.7815, - "step": 17855 - }, - { - "epoch": 0.8280018544274456, - "grad_norm": 3.4899954795837402, - "learning_rate": 1.7488284460987737e-06, - "loss": 0.9855, - "step": 17860 - }, - { - "epoch": 0.8282336578581363, - "grad_norm": 3.925497055053711, - "learning_rate": 1.7442597795180994e-06, - "loss": 0.8605, - "step": 17865 - }, - { - "epoch": 0.828465461288827, - "grad_norm": 4.437492370605469, - "learning_rate": 1.7396965181208935e-06, - "loss": 0.9315, - "step": 17870 - }, - { - "epoch": 0.8286972647195179, - "grad_norm": 3.858232021331787, - "learning_rate": 1.7351386648948088e-06, - "loss": 0.7953, - "step": 17875 - }, - { - "epoch": 0.8289290681502086, - "grad_norm": 3.927126884460449, - "learning_rate": 1.7305862228239579e-06, - "loss": 0.9262, - "step": 17880 - }, - { - "epoch": 0.8291608715808994, - "grad_norm": 3.5855329036712646, - "learning_rate": 1.7260391948889043e-06, - "loss": 0.8873, - "step": 17885 - }, - { - "epoch": 0.8293926750115902, - "grad_norm": 4.403600215911865, - "learning_rate": 1.7214975840666681e-06, - "loss": 0.8922, - "step": 17890 - }, - { - "epoch": 0.8296244784422809, - "grad_norm": 3.798386812210083, - "learning_rate": 1.7169613933307294e-06, - "loss": 0.934, - "step": 17895 - }, - { - "epoch": 0.8298562818729717, - "grad_norm": 3.3800482749938965, - "learning_rate": 1.7124306256510104e-06, - "loss": 0.873, - "step": 17900 - }, - { - "epoch": 0.8298562818729717, - "eval_loss": 0.8784139752388, - "eval_runtime": 11.2612, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 17900 - }, - { - "epoch": 0.8300880853036625, - "grad_norm": 3.2892932891845703, - "learning_rate": 1.7079052839938881e-06, - "loss": 0.8444, - "step": 17905 - }, - { - "epoch": 0.8303198887343533, - "grad_norm": 4.069420337677002, - "learning_rate": 1.703385371322188e-06, - "loss": 0.737, - "step": 17910 - }, - { - "epoch": 0.8305516921650441, - "grad_norm": 3.6989574432373047, - "learning_rate": 1.6988708905951834e-06, - "loss": 0.8348, - "step": 17915 - }, - { - "epoch": 0.8307834955957348, - "grad_norm": 3.9831721782684326, - "learning_rate": 1.6943618447685773e-06, - "loss": 0.7892, - "step": 17920 - }, - { - "epoch": 0.8310152990264256, - "grad_norm": 3.6940042972564697, - "learning_rate": 1.6898582367945292e-06, - "loss": 0.9604, - "step": 17925 - }, - { - "epoch": 0.8312471024571164, - "grad_norm": 3.789471387863159, - "learning_rate": 1.6853600696216344e-06, - "loss": 0.9228, - "step": 17930 - }, - { - "epoch": 0.8314789058878072, - "grad_norm": 3.592510461807251, - "learning_rate": 1.6808673461949222e-06, - "loss": 0.8276, - "step": 17935 - }, - { - "epoch": 0.8317107093184979, - "grad_norm": 3.432971715927124, - "learning_rate": 1.6763800694558608e-06, - "loss": 0.8355, - "step": 17940 - }, - { - "epoch": 0.8319425127491887, - "grad_norm": 3.7236011028289795, - "learning_rate": 1.6718982423423557e-06, - "loss": 0.8081, - "step": 17945 - }, - { - "epoch": 0.8321743161798795, - "grad_norm": 3.5183401107788086, - "learning_rate": 1.6674218677887388e-06, - "loss": 0.7581, - "step": 17950 - }, - { - "epoch": 0.8324061196105702, - "grad_norm": 3.4557225704193115, - "learning_rate": 1.66295094872577e-06, - "loss": 0.8102, - "step": 17955 - }, - { - "epoch": 0.832637923041261, - "grad_norm": 3.7359535694122314, - "learning_rate": 1.6584854880806445e-06, - "loss": 0.969, - "step": 17960 - }, - { - "epoch": 0.8328697264719518, - "grad_norm": 3.518160343170166, - "learning_rate": 1.6540254887769824e-06, - "loss": 0.7597, - "step": 17965 - }, - { - "epoch": 0.8331015299026425, - "grad_norm": 3.740760087966919, - "learning_rate": 1.6495709537348238e-06, - "loss": 0.7805, - "step": 17970 - }, - { - "epoch": 0.8333333333333334, - "grad_norm": 3.5426766872406006, - "learning_rate": 1.6451218858706374e-06, - "loss": 0.807, - "step": 17975 - }, - { - "epoch": 0.8335651367640241, - "grad_norm": 3.917860746383667, - "learning_rate": 1.6406782880973038e-06, - "loss": 0.8023, - "step": 17980 - }, - { - "epoch": 0.8337969401947148, - "grad_norm": 3.597020149230957, - "learning_rate": 1.636240163324132e-06, - "loss": 0.8634, - "step": 17985 - }, - { - "epoch": 0.8340287436254057, - "grad_norm": 3.77585506439209, - "learning_rate": 1.631807514456839e-06, - "loss": 0.8499, - "step": 17990 - }, - { - "epoch": 0.8342605470560964, - "grad_norm": 3.657649278640747, - "learning_rate": 1.6273803443975633e-06, - "loss": 0.8618, - "step": 17995 - }, - { - "epoch": 0.8344923504867872, - "grad_norm": 4.247122287750244, - "learning_rate": 1.6229586560448562e-06, - "loss": 0.908, - "step": 18000 - }, - { - "epoch": 0.8344923504867872, - "eval_loss": 0.8780138492584229, - "eval_runtime": 11.2702, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 18000 - }, - { - "epoch": 0.834724153917478, - "grad_norm": 3.2867918014526367, - "learning_rate": 1.6185424522936743e-06, - "loss": 0.8368, - "step": 18005 - }, - { - "epoch": 0.8349559573481687, - "grad_norm": 3.2922523021698, - "learning_rate": 1.6141317360353847e-06, - "loss": 0.741, - "step": 18010 - }, - { - "epoch": 0.8351877607788595, - "grad_norm": 3.9418952465057373, - "learning_rate": 1.6097265101577697e-06, - "loss": 0.7602, - "step": 18015 - }, - { - "epoch": 0.8354195642095503, - "grad_norm": 3.6202547550201416, - "learning_rate": 1.6053267775450032e-06, - "loss": 0.8347, - "step": 18020 - }, - { - "epoch": 0.8356513676402411, - "grad_norm": 3.360456705093384, - "learning_rate": 1.6009325410776754e-06, - "loss": 0.7795, - "step": 18025 - }, - { - "epoch": 0.8358831710709318, - "grad_norm": 3.2623724937438965, - "learning_rate": 1.5965438036327741e-06, - "loss": 0.89, - "step": 18030 - }, - { - "epoch": 0.8361149745016226, - "grad_norm": 4.737086772918701, - "learning_rate": 1.5921605680836838e-06, - "loss": 0.96, - "step": 18035 - }, - { - "epoch": 0.8363467779323134, - "grad_norm": 3.349141836166382, - "learning_rate": 1.5877828373001857e-06, - "loss": 0.8303, - "step": 18040 - }, - { - "epoch": 0.8365785813630042, - "grad_norm": 4.2403950691223145, - "learning_rate": 1.5834106141484628e-06, - "loss": 0.9327, - "step": 18045 - }, - { - "epoch": 0.836810384793695, - "grad_norm": 3.949309825897217, - "learning_rate": 1.5790439014910908e-06, - "loss": 0.7971, - "step": 18050 - }, - { - "epoch": 0.8370421882243857, - "grad_norm": 3.6255528926849365, - "learning_rate": 1.5746827021870315e-06, - "loss": 0.8399, - "step": 18055 - }, - { - "epoch": 0.8372739916550765, - "grad_norm": 3.5447537899017334, - "learning_rate": 1.5703270190916453e-06, - "loss": 0.721, - "step": 18060 - }, - { - "epoch": 0.8375057950857673, - "grad_norm": 4.385354042053223, - "learning_rate": 1.5659768550566768e-06, - "loss": 0.8497, - "step": 18065 - }, - { - "epoch": 0.837737598516458, - "grad_norm": 3.67063307762146, - "learning_rate": 1.5616322129302586e-06, - "loss": 0.9161, - "step": 18070 - }, - { - "epoch": 0.8379694019471489, - "grad_norm": 3.8790194988250732, - "learning_rate": 1.557293095556901e-06, - "loss": 0.9169, - "step": 18075 - }, - { - "epoch": 0.8382012053778396, - "grad_norm": 3.567552089691162, - "learning_rate": 1.5529595057775093e-06, - "loss": 0.7768, - "step": 18080 - }, - { - "epoch": 0.8384330088085303, - "grad_norm": 3.9901583194732666, - "learning_rate": 1.5486314464293627e-06, - "loss": 0.8621, - "step": 18085 - }, - { - "epoch": 0.8386648122392212, - "grad_norm": 3.9711081981658936, - "learning_rate": 1.5443089203461203e-06, - "loss": 0.8611, - "step": 18090 - }, - { - "epoch": 0.8388966156699119, - "grad_norm": 3.800506353378296, - "learning_rate": 1.539991930357816e-06, - "loss": 0.8638, - "step": 18095 - }, - { - "epoch": 0.8391284191006027, - "grad_norm": 3.651494026184082, - "learning_rate": 1.5356804792908652e-06, - "loss": 0.9013, - "step": 18100 - }, - { - "epoch": 0.8391284191006027, - "eval_loss": 0.8775888681411743, - "eval_runtime": 11.2697, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 18100 - }, - { - "epoch": 0.8393602225312935, - "grad_norm": 4.302680969238281, - "learning_rate": 1.5313745699680505e-06, - "loss": 0.9629, - "step": 18105 - }, - { - "epoch": 0.8395920259619842, - "grad_norm": 3.702319860458374, - "learning_rate": 1.5270742052085296e-06, - "loss": 0.8363, - "step": 18110 - }, - { - "epoch": 0.839823829392675, - "grad_norm": 3.423970937728882, - "learning_rate": 1.522779387827833e-06, - "loss": 0.7962, - "step": 18115 - }, - { - "epoch": 0.8400556328233658, - "grad_norm": 3.711111068725586, - "learning_rate": 1.5184901206378545e-06, - "loss": 0.838, - "step": 18120 - }, - { - "epoch": 0.8402874362540566, - "grad_norm": 3.613584041595459, - "learning_rate": 1.514206406446851e-06, - "loss": 0.7964, - "step": 18125 - }, - { - "epoch": 0.8405192396847473, - "grad_norm": 3.8854293823242188, - "learning_rate": 1.5099282480594523e-06, - "loss": 0.7295, - "step": 18130 - }, - { - "epoch": 0.8407510431154381, - "grad_norm": 3.463650941848755, - "learning_rate": 1.5056556482766483e-06, - "loss": 0.7444, - "step": 18135 - }, - { - "epoch": 0.8409828465461289, - "grad_norm": 3.2122750282287598, - "learning_rate": 1.501388609895782e-06, - "loss": 0.7282, - "step": 18140 - }, - { - "epoch": 0.8412146499768196, - "grad_norm": 4.111193656921387, - "learning_rate": 1.4971271357105655e-06, - "loss": 0.9369, - "step": 18145 - }, - { - "epoch": 0.8414464534075105, - "grad_norm": 3.4979441165924072, - "learning_rate": 1.4928712285110647e-06, - "loss": 0.8797, - "step": 18150 - }, - { - "epoch": 0.8416782568382012, - "grad_norm": 4.0167341232299805, - "learning_rate": 1.4886208910836986e-06, - "loss": 0.7936, - "step": 18155 - }, - { - "epoch": 0.8419100602688919, - "grad_norm": 3.7691385746002197, - "learning_rate": 1.4843761262112355e-06, - "loss": 0.7724, - "step": 18160 - }, - { - "epoch": 0.8421418636995828, - "grad_norm": 4.039342880249023, - "learning_rate": 1.4801369366728059e-06, - "loss": 0.9082, - "step": 18165 - }, - { - "epoch": 0.8423736671302735, - "grad_norm": 4.11240816116333, - "learning_rate": 1.4759033252438837e-06, - "loss": 0.9212, - "step": 18170 - }, - { - "epoch": 0.8426054705609644, - "grad_norm": 3.6150293350219727, - "learning_rate": 1.4716752946962886e-06, - "loss": 1.0067, - "step": 18175 - }, - { - "epoch": 0.8428372739916551, - "grad_norm": 3.085090398788452, - "learning_rate": 1.4674528477981931e-06, - "loss": 0.7283, - "step": 18180 - }, - { - "epoch": 0.8430690774223458, - "grad_norm": 3.6002490520477295, - "learning_rate": 1.4632359873141055e-06, - "loss": 0.7917, - "step": 18185 - }, - { - "epoch": 0.8433008808530367, - "grad_norm": 3.630927324295044, - "learning_rate": 1.4590247160048865e-06, - "loss": 0.8492, - "step": 18190 - }, - { - "epoch": 0.8435326842837274, - "grad_norm": 3.3591744899749756, - "learning_rate": 1.454819036627727e-06, - "loss": 0.8766, - "step": 18195 - }, - { - "epoch": 0.8437644877144181, - "grad_norm": 3.287750005722046, - "learning_rate": 1.4506189519361636e-06, - "loss": 0.7674, - "step": 18200 - }, - { - "epoch": 0.8437644877144181, - "eval_loss": 0.8770650029182434, - "eval_runtime": 11.2617, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 18200 - }, - { - "epoch": 0.843996291145109, - "grad_norm": 3.973710536956787, - "learning_rate": 1.4464244646800718e-06, - "loss": 0.7988, - "step": 18205 - }, - { - "epoch": 0.8442280945757997, - "grad_norm": 4.015002250671387, - "learning_rate": 1.4422355776056552e-06, - "loss": 1.0665, - "step": 18210 - }, - { - "epoch": 0.8444598980064905, - "grad_norm": 3.847734212875366, - "learning_rate": 1.438052293455452e-06, - "loss": 0.8656, - "step": 18215 - }, - { - "epoch": 0.8446917014371813, - "grad_norm": 4.58140230178833, - "learning_rate": 1.4338746149683414e-06, - "loss": 0.9376, - "step": 18220 - }, - { - "epoch": 0.844923504867872, - "grad_norm": 3.9086010456085205, - "learning_rate": 1.4297025448795187e-06, - "loss": 0.789, - "step": 18225 - }, - { - "epoch": 0.8451553082985628, - "grad_norm": 4.11086368560791, - "learning_rate": 1.425536085920517e-06, - "loss": 0.7864, - "step": 18230 - }, - { - "epoch": 0.8453871117292536, - "grad_norm": 3.6161892414093018, - "learning_rate": 1.421375240819196e-06, - "loss": 0.9303, - "step": 18235 - }, - { - "epoch": 0.8456189151599444, - "grad_norm": 3.4047515392303467, - "learning_rate": 1.4172200122997327e-06, - "loss": 0.7881, - "step": 18240 - }, - { - "epoch": 0.8458507185906351, - "grad_norm": 4.340506553649902, - "learning_rate": 1.4130704030826304e-06, - "loss": 0.9115, - "step": 18245 - }, - { - "epoch": 0.846082522021326, - "grad_norm": 3.772585391998291, - "learning_rate": 1.408926415884716e-06, - "loss": 0.9117, - "step": 18250 - }, - { - "epoch": 0.8463143254520167, - "grad_norm": 4.270310878753662, - "learning_rate": 1.4047880534191338e-06, - "loss": 0.927, - "step": 18255 - }, - { - "epoch": 0.8465461288827074, - "grad_norm": 3.5970466136932373, - "learning_rate": 1.400655318395343e-06, - "loss": 0.8635, - "step": 18260 - }, - { - "epoch": 0.8467779323133983, - "grad_norm": 2.8994784355163574, - "learning_rate": 1.3965282135191205e-06, - "loss": 0.8529, - "step": 18265 - }, - { - "epoch": 0.847009735744089, - "grad_norm": 3.8191888332366943, - "learning_rate": 1.3924067414925613e-06, - "loss": 0.719, - "step": 18270 - }, - { - "epoch": 0.8472415391747797, - "grad_norm": 3.4908547401428223, - "learning_rate": 1.3882909050140646e-06, - "loss": 0.8734, - "step": 18275 - }, - { - "epoch": 0.8474733426054706, - "grad_norm": 4.079155921936035, - "learning_rate": 1.3841807067783419e-06, - "loss": 0.9573, - "step": 18280 - }, - { - "epoch": 0.8477051460361613, - "grad_norm": 4.186222553253174, - "learning_rate": 1.380076149476417e-06, - "loss": 0.8626, - "step": 18285 - }, - { - "epoch": 0.8479369494668522, - "grad_norm": 3.282341718673706, - "learning_rate": 1.375977235795619e-06, - "loss": 0.7254, - "step": 18290 - }, - { - "epoch": 0.8481687528975429, - "grad_norm": 4.016537666320801, - "learning_rate": 1.3718839684195795e-06, - "loss": 0.8116, - "step": 18295 - }, - { - "epoch": 0.8484005563282336, - "grad_norm": 3.811028480529785, - "learning_rate": 1.3677963500282377e-06, - "loss": 0.876, - "step": 18300 - }, - { - "epoch": 0.8484005563282336, - "eval_loss": 0.8769089579582214, - "eval_runtime": 11.2618, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 18300 - }, - { - "epoch": 0.8486323597589245, - "grad_norm": 3.929299831390381, - "learning_rate": 1.3637143832978305e-06, - "loss": 0.9984, - "step": 18305 - }, - { - "epoch": 0.8488641631896152, - "grad_norm": 3.347557544708252, - "learning_rate": 1.3596380709008938e-06, - "loss": 0.7783, - "step": 18310 - }, - { - "epoch": 0.849095966620306, - "grad_norm": 3.877534866333008, - "learning_rate": 1.355567415506266e-06, - "loss": 0.8746, - "step": 18315 - }, - { - "epoch": 0.8493277700509968, - "grad_norm": 3.770537853240967, - "learning_rate": 1.3515024197790794e-06, - "loss": 0.9068, - "step": 18320 - }, - { - "epoch": 0.8495595734816875, - "grad_norm": 4.099272727966309, - "learning_rate": 1.3474430863807608e-06, - "loss": 0.8611, - "step": 18325 - }, - { - "epoch": 0.8497913769123783, - "grad_norm": 4.282232284545898, - "learning_rate": 1.3433894179690266e-06, - "loss": 0.8107, - "step": 18330 - }, - { - "epoch": 0.8500231803430691, - "grad_norm": 3.5612881183624268, - "learning_rate": 1.3393414171978892e-06, - "loss": 0.8929, - "step": 18335 - }, - { - "epoch": 0.8502549837737599, - "grad_norm": 3.6753652095794678, - "learning_rate": 1.3352990867176519e-06, - "loss": 0.7928, - "step": 18340 - }, - { - "epoch": 0.8504867872044506, - "grad_norm": 3.9664688110351562, - "learning_rate": 1.3312624291748965e-06, - "loss": 0.9438, - "step": 18345 - }, - { - "epoch": 0.8507185906351414, - "grad_norm": 4.463581085205078, - "learning_rate": 1.3272314472124992e-06, - "loss": 0.8554, - "step": 18350 - }, - { - "epoch": 0.8509503940658322, - "grad_norm": 4.024453163146973, - "learning_rate": 1.3232061434696186e-06, - "loss": 0.8932, - "step": 18355 - }, - { - "epoch": 0.8511821974965229, - "grad_norm": 3.6871092319488525, - "learning_rate": 1.3191865205816923e-06, - "loss": 0.8562, - "step": 18360 - }, - { - "epoch": 0.8514140009272138, - "grad_norm": 3.467263698577881, - "learning_rate": 1.3151725811804395e-06, - "loss": 0.8576, - "step": 18365 - }, - { - "epoch": 0.8516458043579045, - "grad_norm": 3.4049699306488037, - "learning_rate": 1.3111643278938613e-06, - "loss": 0.7381, - "step": 18370 - }, - { - "epoch": 0.8518776077885952, - "grad_norm": 3.9009854793548584, - "learning_rate": 1.3071617633462351e-06, - "loss": 0.8384, - "step": 18375 - }, - { - "epoch": 0.8521094112192861, - "grad_norm": 4.61009407043457, - "learning_rate": 1.3031648901581084e-06, - "loss": 0.7304, - "step": 18380 - }, - { - "epoch": 0.8523412146499768, - "grad_norm": 3.888120412826538, - "learning_rate": 1.2991737109463132e-06, - "loss": 0.983, - "step": 18385 - }, - { - "epoch": 0.8525730180806675, - "grad_norm": 3.6879374980926514, - "learning_rate": 1.2951882283239414e-06, - "loss": 0.9002, - "step": 18390 - }, - { - "epoch": 0.8528048215113584, - "grad_norm": 3.665576457977295, - "learning_rate": 1.291208444900366e-06, - "loss": 0.7605, - "step": 18395 - }, - { - "epoch": 0.8530366249420491, - "grad_norm": 3.582505226135254, - "learning_rate": 1.287234363281219e-06, - "loss": 0.8616, - "step": 18400 - }, - { - "epoch": 0.8530366249420491, - "eval_loss": 0.8765570521354675, - "eval_runtime": 11.2681, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 18400 - }, - { - "epoch": 0.8532684283727399, - "grad_norm": 3.894113063812256, - "learning_rate": 1.283265986068406e-06, - "loss": 0.7478, - "step": 18405 - }, - { - "epoch": 0.8535002318034307, - "grad_norm": 3.673455238342285, - "learning_rate": 1.2793033158600987e-06, - "loss": 0.7809, - "step": 18410 - }, - { - "epoch": 0.8537320352341214, - "grad_norm": 3.5018301010131836, - "learning_rate": 1.2753463552507262e-06, - "loss": 0.971, - "step": 18415 - }, - { - "epoch": 0.8539638386648123, - "grad_norm": 3.3875620365142822, - "learning_rate": 1.2713951068309815e-06, - "loss": 0.9165, - "step": 18420 - }, - { - "epoch": 0.854195642095503, - "grad_norm": 3.3227803707122803, - "learning_rate": 1.2674495731878233e-06, - "loss": 0.7961, - "step": 18425 - }, - { - "epoch": 0.8544274455261938, - "grad_norm": 4.07461404800415, - "learning_rate": 1.2635097569044585e-06, - "loss": 0.7933, - "step": 18430 - }, - { - "epoch": 0.8546592489568846, - "grad_norm": 4.2566914558410645, - "learning_rate": 1.2595756605603604e-06, - "loss": 0.9242, - "step": 18435 - }, - { - "epoch": 0.8548910523875753, - "grad_norm": 3.8578896522521973, - "learning_rate": 1.2556472867312553e-06, - "loss": 0.8824, - "step": 18440 - }, - { - "epoch": 0.8551228558182661, - "grad_norm": 3.371788501739502, - "learning_rate": 1.251724637989118e-06, - "loss": 0.8345, - "step": 18445 - }, - { - "epoch": 0.8553546592489569, - "grad_norm": 3.8980839252471924, - "learning_rate": 1.2478077169021773e-06, - "loss": 0.8757, - "step": 18450 - }, - { - "epoch": 0.8555864626796477, - "grad_norm": 3.4333198070526123, - "learning_rate": 1.2438965260349134e-06, - "loss": 0.7938, - "step": 18455 - }, - { - "epoch": 0.8558182661103384, - "grad_norm": 3.738865852355957, - "learning_rate": 1.2399910679480576e-06, - "loss": 0.7182, - "step": 18460 - }, - { - "epoch": 0.8560500695410292, - "grad_norm": 4.250709533691406, - "learning_rate": 1.23609134519858e-06, - "loss": 0.8313, - "step": 18465 - }, - { - "epoch": 0.85628187297172, - "grad_norm": 5.0909743309021, - "learning_rate": 1.232197360339702e-06, - "loss": 1.0371, - "step": 18470 - }, - { - "epoch": 0.8565136764024107, - "grad_norm": 3.838352680206299, - "learning_rate": 1.2283091159208882e-06, - "loss": 0.9044, - "step": 18475 - }, - { - "epoch": 0.8567454798331016, - "grad_norm": 3.0013608932495117, - "learning_rate": 1.2244266144878415e-06, - "loss": 0.7732, - "step": 18480 - }, - { - "epoch": 0.8569772832637923, - "grad_norm": 3.4243435859680176, - "learning_rate": 1.2205498585825037e-06, - "loss": 0.6783, - "step": 18485 - }, - { - "epoch": 0.857209086694483, - "grad_norm": 3.4078502655029297, - "learning_rate": 1.2166788507430593e-06, - "loss": 0.7527, - "step": 18490 - }, - { - "epoch": 0.8574408901251739, - "grad_norm": 3.5167553424835205, - "learning_rate": 1.212813593503931e-06, - "loss": 0.8503, - "step": 18495 - }, - { - "epoch": 0.8576726935558646, - "grad_norm": 4.191205978393555, - "learning_rate": 1.2089540893957675e-06, - "loss": 0.7204, - "step": 18500 - }, - { - "epoch": 0.8576726935558646, - "eval_loss": 0.8762512803077698, - "eval_runtime": 11.2696, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 18500 - }, - { - "epoch": 0.8579044969865554, - "grad_norm": 3.3614768981933594, - "learning_rate": 1.205100340945462e-06, - "loss": 0.6965, - "step": 18505 - }, - { - "epoch": 0.8581363004172462, - "grad_norm": 3.390779733657837, - "learning_rate": 1.2012523506761299e-06, - "loss": 0.768, - "step": 18510 - }, - { - "epoch": 0.8583681038479369, - "grad_norm": 4.055974960327148, - "learning_rate": 1.1974101211071197e-06, - "loss": 0.937, - "step": 18515 - }, - { - "epoch": 0.8585999072786277, - "grad_norm": 3.6975340843200684, - "learning_rate": 1.1935736547540122e-06, - "loss": 0.7743, - "step": 18520 - }, - { - "epoch": 0.8588317107093185, - "grad_norm": 4.171230792999268, - "learning_rate": 1.189742954128611e-06, - "loss": 0.9207, - "step": 18525 - }, - { - "epoch": 0.8590635141400093, - "grad_norm": 4.331693649291992, - "learning_rate": 1.1859180217389433e-06, - "loss": 0.821, - "step": 18530 - }, - { - "epoch": 0.8592953175707001, - "grad_norm": 4.08601713180542, - "learning_rate": 1.1820988600892669e-06, - "loss": 0.9393, - "step": 18535 - }, - { - "epoch": 0.8595271210013908, - "grad_norm": 3.40236234664917, - "learning_rate": 1.1782854716800496e-06, - "loss": 0.8371, - "step": 18540 - }, - { - "epoch": 0.8597589244320816, - "grad_norm": 3.1915442943573, - "learning_rate": 1.174477859007993e-06, - "loss": 0.8073, - "step": 18545 - }, - { - "epoch": 0.8599907278627724, - "grad_norm": 3.60099720954895, - "learning_rate": 1.1706760245660053e-06, - "loss": 0.8328, - "step": 18550 - }, - { - "epoch": 0.8602225312934632, - "grad_norm": 3.87766170501709, - "learning_rate": 1.16687997084322e-06, - "loss": 0.8903, - "step": 18555 - }, - { - "epoch": 0.8604543347241539, - "grad_norm": 4.071938991546631, - "learning_rate": 1.163089700324982e-06, - "loss": 0.815, - "step": 18560 - }, - { - "epoch": 0.8606861381548447, - "grad_norm": 4.054932117462158, - "learning_rate": 1.1593052154928508e-06, - "loss": 0.9528, - "step": 18565 - }, - { - "epoch": 0.8609179415855355, - "grad_norm": 3.4975640773773193, - "learning_rate": 1.1555265188245934e-06, - "loss": 0.8449, - "step": 18570 - }, - { - "epoch": 0.8611497450162262, - "grad_norm": 4.184132099151611, - "learning_rate": 1.1517536127941952e-06, - "loss": 0.9859, - "step": 18575 - }, - { - "epoch": 0.8613815484469171, - "grad_norm": 4.251339435577393, - "learning_rate": 1.1479864998718482e-06, - "loss": 0.9542, - "step": 18580 - }, - { - "epoch": 0.8616133518776078, - "grad_norm": 4.361698627471924, - "learning_rate": 1.1442251825239459e-06, - "loss": 0.9167, - "step": 18585 - }, - { - "epoch": 0.8618451553082985, - "grad_norm": 3.473511219024658, - "learning_rate": 1.140469663213094e-06, - "loss": 0.902, - "step": 18590 - }, - { - "epoch": 0.8620769587389894, - "grad_norm": 3.937403440475464, - "learning_rate": 1.1367199443980991e-06, - "loss": 0.9195, - "step": 18595 - }, - { - "epoch": 0.8623087621696801, - "grad_norm": 3.817412853240967, - "learning_rate": 1.1329760285339675e-06, - "loss": 0.7739, - "step": 18600 - }, - { - "epoch": 0.8623087621696801, - "eval_loss": 0.8760548233985901, - "eval_runtime": 11.2847, - "eval_samples_per_second": 11.254, - "eval_steps_per_second": 11.254, - "step": 18600 - }, - { - "epoch": 0.8625405656003708, - "grad_norm": 3.9234488010406494, - "learning_rate": 1.1292379180719104e-06, - "loss": 0.7825, - "step": 18605 - }, - { - "epoch": 0.8627723690310617, - "grad_norm": 3.362407684326172, - "learning_rate": 1.125505615459338e-06, - "loss": 0.5777, - "step": 18610 - }, - { - "epoch": 0.8630041724617524, - "grad_norm": 3.790205717086792, - "learning_rate": 1.1217791231398578e-06, - "loss": 0.8194, - "step": 18615 - }, - { - "epoch": 0.8632359758924432, - "grad_norm": 3.6676652431488037, - "learning_rate": 1.1180584435532716e-06, - "loss": 0.8417, - "step": 18620 - }, - { - "epoch": 0.863467779323134, - "grad_norm": 4.089963436126709, - "learning_rate": 1.1143435791355728e-06, - "loss": 0.9652, - "step": 18625 - }, - { - "epoch": 0.8636995827538247, - "grad_norm": 3.81626033782959, - "learning_rate": 1.1106345323189548e-06, - "loss": 0.8223, - "step": 18630 - }, - { - "epoch": 0.8639313861845155, - "grad_norm": 4.048824310302734, - "learning_rate": 1.1069313055317953e-06, - "loss": 0.9674, - "step": 18635 - }, - { - "epoch": 0.8641631896152063, - "grad_norm": 4.039358139038086, - "learning_rate": 1.103233901198666e-06, - "loss": 0.8702, - "step": 18640 - }, - { - "epoch": 0.8643949930458971, - "grad_norm": 3.317441463470459, - "learning_rate": 1.0995423217403245e-06, - "loss": 0.7957, - "step": 18645 - }, - { - "epoch": 0.8646267964765878, - "grad_norm": 4.214531421661377, - "learning_rate": 1.0958565695737167e-06, - "loss": 0.8281, - "step": 18650 - }, - { - "epoch": 0.8648585999072786, - "grad_norm": 3.687647581100464, - "learning_rate": 1.0921766471119665e-06, - "loss": 0.877, - "step": 18655 - }, - { - "epoch": 0.8650904033379694, - "grad_norm": 3.635422945022583, - "learning_rate": 1.0885025567643913e-06, - "loss": 0.8062, - "step": 18660 - }, - { - "epoch": 0.8653222067686602, - "grad_norm": 3.2454099655151367, - "learning_rate": 1.0848343009364837e-06, - "loss": 0.7843, - "step": 18665 - }, - { - "epoch": 0.865554010199351, - "grad_norm": 4.3716139793396, - "learning_rate": 1.0811718820299155e-06, - "loss": 0.7872, - "step": 18670 - }, - { - "epoch": 0.8657858136300417, - "grad_norm": 4.972971439361572, - "learning_rate": 1.0775153024425411e-06, - "loss": 1.0013, - "step": 18675 - }, - { - "epoch": 0.8660176170607325, - "grad_norm": 3.9282686710357666, - "learning_rate": 1.0738645645683921e-06, - "loss": 0.8474, - "step": 18680 - }, - { - "epoch": 0.8662494204914233, - "grad_norm": 4.049283027648926, - "learning_rate": 1.0702196707976686e-06, - "loss": 0.8647, - "step": 18685 - }, - { - "epoch": 0.866481223922114, - "grad_norm": 3.5909154415130615, - "learning_rate": 1.0665806235167498e-06, - "loss": 0.8355, - "step": 18690 - }, - { - "epoch": 0.8667130273528049, - "grad_norm": 4.021440029144287, - "learning_rate": 1.0629474251081873e-06, - "loss": 0.8835, - "step": 18695 - }, - { - "epoch": 0.8669448307834956, - "grad_norm": 3.9168529510498047, - "learning_rate": 1.0593200779507028e-06, - "loss": 0.801, - "step": 18700 - }, - { - "epoch": 0.8669448307834956, - "eval_loss": 0.8755388855934143, - "eval_runtime": 11.2609, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 18700 - }, - { - "epoch": 0.8671766342141863, - "grad_norm": 4.553606986999512, - "learning_rate": 1.0556985844191848e-06, - "loss": 0.8309, - "step": 18705 - }, - { - "epoch": 0.8674084376448772, - "grad_norm": 4.2659478187561035, - "learning_rate": 1.0520829468846938e-06, - "loss": 0.9314, - "step": 18710 - }, - { - "epoch": 0.8676402410755679, - "grad_norm": 3.4218521118164062, - "learning_rate": 1.0484731677144521e-06, - "loss": 0.8183, - "step": 18715 - }, - { - "epoch": 0.8678720445062587, - "grad_norm": 4.528285026550293, - "learning_rate": 1.044869249271845e-06, - "loss": 0.8522, - "step": 18720 - }, - { - "epoch": 0.8681038479369495, - "grad_norm": 3.9115729331970215, - "learning_rate": 1.0412711939164266e-06, - "loss": 0.7733, - "step": 18725 - }, - { - "epoch": 0.8683356513676402, - "grad_norm": 3.4151837825775146, - "learning_rate": 1.0376790040039098e-06, - "loss": 0.9614, - "step": 18730 - }, - { - "epoch": 0.868567454798331, - "grad_norm": 4.008971214294434, - "learning_rate": 1.0340926818861642e-06, - "loss": 0.8216, - "step": 18735 - }, - { - "epoch": 0.8687992582290218, - "grad_norm": 3.121033191680908, - "learning_rate": 1.0305122299112246e-06, - "loss": 0.8541, - "step": 18740 - }, - { - "epoch": 0.8690310616597126, - "grad_norm": 3.4723434448242188, - "learning_rate": 1.026937650423272e-06, - "loss": 0.7798, - "step": 18745 - }, - { - "epoch": 0.8692628650904033, - "grad_norm": 4.1530961990356445, - "learning_rate": 1.023368945762655e-06, - "loss": 0.9794, - "step": 18750 - }, - { - "epoch": 0.8694946685210941, - "grad_norm": 4.090208053588867, - "learning_rate": 1.0198061182658637e-06, - "loss": 0.7676, - "step": 18755 - }, - { - "epoch": 0.8697264719517849, - "grad_norm": 3.8618550300598145, - "learning_rate": 1.0162491702655486e-06, - "loss": 0.7873, - "step": 18760 - }, - { - "epoch": 0.8699582753824756, - "grad_norm": 3.4474642276763916, - "learning_rate": 1.0126981040905116e-06, - "loss": 0.9073, - "step": 18765 - }, - { - "epoch": 0.8701900788131665, - "grad_norm": 3.3244881629943848, - "learning_rate": 1.009152922065697e-06, - "loss": 0.8019, - "step": 18770 - }, - { - "epoch": 0.8704218822438572, - "grad_norm": 3.623584747314453, - "learning_rate": 1.0056136265121985e-06, - "loss": 0.8823, - "step": 18775 - }, - { - "epoch": 0.8706536856745479, - "grad_norm": 3.2864067554473877, - "learning_rate": 1.002080219747259e-06, - "loss": 0.7796, - "step": 18780 - }, - { - "epoch": 0.8708854891052388, - "grad_norm": 3.941572666168213, - "learning_rate": 9.985527040842669e-07, - "loss": 0.7145, - "step": 18785 - }, - { - "epoch": 0.8711172925359295, - "grad_norm": 4.011714935302734, - "learning_rate": 9.950310818327469e-07, - "loss": 0.8252, - "step": 18790 - }, - { - "epoch": 0.8713490959666204, - "grad_norm": 3.8345000743865967, - "learning_rate": 9.915153552983725e-07, - "loss": 0.9874, - "step": 18795 - }, - { - "epoch": 0.8715808993973111, - "grad_norm": 3.2887487411499023, - "learning_rate": 9.880055267829524e-07, - "loss": 0.8881, - "step": 18800 - }, - { - "epoch": 0.8715808993973111, - "eval_loss": 0.8754202723503113, - "eval_runtime": 11.2653, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 18800 - }, - { - "epoch": 0.8718127028280018, - "grad_norm": 4.1776652336120605, - "learning_rate": 9.84501598584433e-07, - "loss": 0.9445, - "step": 18805 - }, - { - "epoch": 0.8720445062586927, - "grad_norm": 4.137359619140625, - "learning_rate": 9.810035729969026e-07, - "loss": 0.8771, - "step": 18810 - }, - { - "epoch": 0.8722763096893834, - "grad_norm": 4.066986083984375, - "learning_rate": 9.775114523105833e-07, - "loss": 1.0272, - "step": 18815 - }, - { - "epoch": 0.8725081131200741, - "grad_norm": 3.746918201446533, - "learning_rate": 9.740252388118266e-07, - "loss": 0.7733, - "step": 18820 - }, - { - "epoch": 0.872739916550765, - "grad_norm": 3.638258457183838, - "learning_rate": 9.70544934783123e-07, - "loss": 0.7531, - "step": 18825 - }, - { - "epoch": 0.8729717199814557, - "grad_norm": 3.6451056003570557, - "learning_rate": 9.67070542503089e-07, - "loss": 0.9688, - "step": 18830 - }, - { - "epoch": 0.8732035234121465, - "grad_norm": 3.232944965362549, - "learning_rate": 9.63602064246475e-07, - "loss": 0.9892, - "step": 18835 - }, - { - "epoch": 0.8734353268428373, - "grad_norm": 3.4214277267456055, - "learning_rate": 9.601395022841542e-07, - "loss": 0.8214, - "step": 18840 - }, - { - "epoch": 0.873667130273528, - "grad_norm": 4.174017906188965, - "learning_rate": 9.56682858883129e-07, - "loss": 0.8695, - "step": 18845 - }, - { - "epoch": 0.8738989337042188, - "grad_norm": 3.725480556488037, - "learning_rate": 9.5323213630653e-07, - "loss": 0.815, - "step": 18850 - }, - { - "epoch": 0.8741307371349096, - "grad_norm": 3.5639865398406982, - "learning_rate": 9.497873368136056e-07, - "loss": 0.7668, - "step": 18855 - }, - { - "epoch": 0.8743625405656004, - "grad_norm": 4.303888320922852, - "learning_rate": 9.463484626597275e-07, - "loss": 0.9614, - "step": 18860 - }, - { - "epoch": 0.8745943439962911, - "grad_norm": 3.6800920963287354, - "learning_rate": 9.429155160963899e-07, - "loss": 0.7031, - "step": 18865 - }, - { - "epoch": 0.874826147426982, - "grad_norm": 3.8146042823791504, - "learning_rate": 9.394884993712095e-07, - "loss": 0.8, - "step": 18870 - }, - { - "epoch": 0.8750579508576727, - "grad_norm": 4.3860368728637695, - "learning_rate": 9.360674147279103e-07, - "loss": 0.9592, - "step": 18875 - }, - { - "epoch": 0.8752897542883634, - "grad_norm": 3.643028736114502, - "learning_rate": 9.326522644063429e-07, - "loss": 0.7462, - "step": 18880 - }, - { - "epoch": 0.8755215577190543, - "grad_norm": 3.8522374629974365, - "learning_rate": 9.292430506424688e-07, - "loss": 0.8639, - "step": 18885 - }, - { - "epoch": 0.875753361149745, - "grad_norm": 4.47439432144165, - "learning_rate": 9.258397756683601e-07, - "loss": 0.7049, - "step": 18890 - }, - { - "epoch": 0.8759851645804357, - "grad_norm": 3.729813814163208, - "learning_rate": 9.224424417122036e-07, - "loss": 0.7833, - "step": 18895 - }, - { - "epoch": 0.8762169680111266, - "grad_norm": 3.2106189727783203, - "learning_rate": 9.190510509982942e-07, - "loss": 0.8653, - "step": 18900 - }, - { - "epoch": 0.8762169680111266, - "eval_loss": 0.8750000596046448, - "eval_runtime": 11.2681, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 18900 - }, - { - "epoch": 0.8764487714418173, - "grad_norm": 3.6238574981689453, - "learning_rate": 9.156656057470415e-07, - "loss": 0.8184, - "step": 18905 - }, - { - "epoch": 0.8766805748725082, - "grad_norm": 4.165507793426514, - "learning_rate": 9.122861081749524e-07, - "loss": 0.802, - "step": 18910 - }, - { - "epoch": 0.8769123783031989, - "grad_norm": 3.3286075592041016, - "learning_rate": 9.089125604946514e-07, - "loss": 0.7304, - "step": 18915 - }, - { - "epoch": 0.8771441817338896, - "grad_norm": 3.1397790908813477, - "learning_rate": 9.055449649148585e-07, - "loss": 0.7384, - "step": 18920 - }, - { - "epoch": 0.8773759851645805, - "grad_norm": 3.556872844696045, - "learning_rate": 9.021833236403977e-07, - "loss": 0.7844, - "step": 18925 - }, - { - "epoch": 0.8776077885952712, - "grad_norm": 3.5317461490631104, - "learning_rate": 8.988276388721995e-07, - "loss": 0.9778, - "step": 18930 - }, - { - "epoch": 0.877839592025962, - "grad_norm": 3.8792641162872314, - "learning_rate": 8.954779128072932e-07, - "loss": 0.8529, - "step": 18935 - }, - { - "epoch": 0.8780713954566528, - "grad_norm": 3.7412779331207275, - "learning_rate": 8.921341476388024e-07, - "loss": 0.8977, - "step": 18940 - }, - { - "epoch": 0.8783031988873435, - "grad_norm": 3.8396799564361572, - "learning_rate": 8.887963455559534e-07, - "loss": 0.7698, - "step": 18945 - }, - { - "epoch": 0.8785350023180343, - "grad_norm": 3.3994853496551514, - "learning_rate": 8.854645087440628e-07, - "loss": 0.973, - "step": 18950 - }, - { - "epoch": 0.8787668057487251, - "grad_norm": 3.9767489433288574, - "learning_rate": 8.8213863938455e-07, - "loss": 1.01, - "step": 18955 - }, - { - "epoch": 0.8789986091794159, - "grad_norm": 3.4473788738250732, - "learning_rate": 8.788187396549164e-07, - "loss": 0.7616, - "step": 18960 - }, - { - "epoch": 0.8792304126101066, - "grad_norm": 3.0410356521606445, - "learning_rate": 8.755048117287646e-07, - "loss": 0.8133, - "step": 18965 - }, - { - "epoch": 0.8794622160407974, - "grad_norm": 4.893867015838623, - "learning_rate": 8.721968577757833e-07, - "loss": 0.8119, - "step": 18970 - }, - { - "epoch": 0.8796940194714882, - "grad_norm": 3.6503591537475586, - "learning_rate": 8.688948799617502e-07, - "loss": 0.8597, - "step": 18975 - }, - { - "epoch": 0.8799258229021789, - "grad_norm": 3.970219612121582, - "learning_rate": 8.655988804485283e-07, - "loss": 0.8174, - "step": 18980 - }, - { - "epoch": 0.8801576263328698, - "grad_norm": 3.8891892433166504, - "learning_rate": 8.6230886139407e-07, - "loss": 0.8486, - "step": 18985 - }, - { - "epoch": 0.8803894297635605, - "grad_norm": 3.3773927688598633, - "learning_rate": 8.590248249524114e-07, - "loss": 0.9104, - "step": 18990 - }, - { - "epoch": 0.8806212331942512, - "grad_norm": 4.802641868591309, - "learning_rate": 8.557467732736679e-07, - "loss": 0.797, - "step": 18995 - }, - { - "epoch": 0.8808530366249421, - "grad_norm": 3.5772879123687744, - "learning_rate": 8.52474708504043e-07, - "loss": 0.8144, - "step": 19000 - }, - { - "epoch": 0.8808530366249421, - "eval_loss": 0.8745938539505005, - "eval_runtime": 11.2692, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 19000 - }, - { - "epoch": 0.8810848400556328, - "grad_norm": 3.7284469604492188, - "learning_rate": 8.492086327858184e-07, - "loss": 0.905, - "step": 19005 - }, - { - "epoch": 0.8813166434863235, - "grad_norm": 3.670050621032715, - "learning_rate": 8.459485482573481e-07, - "loss": 0.8371, - "step": 19010 - }, - { - "epoch": 0.8815484469170144, - "grad_norm": 3.598022222518921, - "learning_rate": 8.426944570530715e-07, - "loss": 0.8005, - "step": 19015 - }, - { - "epoch": 0.8817802503477051, - "grad_norm": 4.089553356170654, - "learning_rate": 8.394463613035031e-07, - "loss": 0.9309, - "step": 19020 - }, - { - "epoch": 0.8820120537783959, - "grad_norm": 3.748443365097046, - "learning_rate": 8.362042631352274e-07, - "loss": 0.9283, - "step": 19025 - }, - { - "epoch": 0.8822438572090867, - "grad_norm": 3.325829267501831, - "learning_rate": 8.329681646709076e-07, - "loss": 0.8398, - "step": 19030 - }, - { - "epoch": 0.8824756606397774, - "grad_norm": 4.052063465118408, - "learning_rate": 8.297380680292732e-07, - "loss": 0.8329, - "step": 19035 - }, - { - "epoch": 0.8827074640704683, - "grad_norm": 4.558441162109375, - "learning_rate": 8.265139753251306e-07, - "loss": 0.9717, - "step": 19040 - }, - { - "epoch": 0.882939267501159, - "grad_norm": 3.689945697784424, - "learning_rate": 8.232958886693498e-07, - "loss": 0.7946, - "step": 19045 - }, - { - "epoch": 0.8831710709318498, - "grad_norm": 3.7369065284729004, - "learning_rate": 8.200838101688713e-07, - "loss": 1.0788, - "step": 19050 - }, - { - "epoch": 0.8834028743625406, - "grad_norm": 4.071127891540527, - "learning_rate": 8.168777419267026e-07, - "loss": 0.9555, - "step": 19055 - }, - { - "epoch": 0.8836346777932313, - "grad_norm": 3.3217930793762207, - "learning_rate": 8.136776860419149e-07, - "loss": 0.8766, - "step": 19060 - }, - { - "epoch": 0.8838664812239221, - "grad_norm": 3.262667179107666, - "learning_rate": 8.104836446096409e-07, - "loss": 0.7637, - "step": 19065 - }, - { - "epoch": 0.8840982846546129, - "grad_norm": 3.5461394786834717, - "learning_rate": 8.07295619721078e-07, - "loss": 0.8512, - "step": 19070 - }, - { - "epoch": 0.8843300880853037, - "grad_norm": 3.659705638885498, - "learning_rate": 8.041136134634864e-07, - "loss": 0.9289, - "step": 19075 - }, - { - "epoch": 0.8845618915159944, - "grad_norm": 3.627200126647949, - "learning_rate": 8.009376279201819e-07, - "loss": 0.8783, - "step": 19080 - }, - { - "epoch": 0.8847936949466852, - "grad_norm": 3.535076856613159, - "learning_rate": 7.977676651705391e-07, - "loss": 0.8446, - "step": 19085 - }, - { - "epoch": 0.885025498377376, - "grad_norm": 3.2673180103302, - "learning_rate": 7.946037272899965e-07, - "loss": 0.9139, - "step": 19090 - }, - { - "epoch": 0.8852573018080667, - "grad_norm": 3.439281702041626, - "learning_rate": 7.914458163500327e-07, - "loss": 0.8172, - "step": 19095 - }, - { - "epoch": 0.8854891052387576, - "grad_norm": 3.935387134552002, - "learning_rate": 7.882939344181939e-07, - "loss": 0.7661, - "step": 19100 - }, - { - "epoch": 0.8854891052387576, - "eval_loss": 0.8745239973068237, - "eval_runtime": 11.2651, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 19100 - }, - { - "epoch": 0.8857209086694483, - "grad_norm": 4.236382484436035, - "learning_rate": 7.851480835580749e-07, - "loss": 0.9492, - "step": 19105 - }, - { - "epoch": 0.885952712100139, - "grad_norm": 3.854919672012329, - "learning_rate": 7.820082658293227e-07, - "loss": 0.8794, - "step": 19110 - }, - { - "epoch": 0.8861845155308299, - "grad_norm": 4.203670024871826, - "learning_rate": 7.788744832876294e-07, - "loss": 0.9396, - "step": 19115 - }, - { - "epoch": 0.8864163189615206, - "grad_norm": 3.6348228454589844, - "learning_rate": 7.757467379847438e-07, - "loss": 0.8706, - "step": 19120 - }, - { - "epoch": 0.8866481223922114, - "grad_norm": 3.4574408531188965, - "learning_rate": 7.726250319684558e-07, - "loss": 0.8792, - "step": 19125 - }, - { - "epoch": 0.8868799258229022, - "grad_norm": 3.5718040466308594, - "learning_rate": 7.695093672826015e-07, - "loss": 0.7707, - "step": 19130 - }, - { - "epoch": 0.8871117292535929, - "grad_norm": 3.444611072540283, - "learning_rate": 7.663997459670647e-07, - "loss": 0.8391, - "step": 19135 - }, - { - "epoch": 0.8873435326842837, - "grad_norm": 4.593543529510498, - "learning_rate": 7.632961700577723e-07, - "loss": 0.882, - "step": 19140 - }, - { - "epoch": 0.8875753361149745, - "grad_norm": 4.171180248260498, - "learning_rate": 7.601986415866903e-07, - "loss": 0.9814, - "step": 19145 - }, - { - "epoch": 0.8878071395456653, - "grad_norm": 3.3400700092315674, - "learning_rate": 7.571071625818283e-07, - "loss": 0.592, - "step": 19150 - }, - { - "epoch": 0.8880389429763561, - "grad_norm": 3.698866128921509, - "learning_rate": 7.540217350672318e-07, - "loss": 0.9165, - "step": 19155 - }, - { - "epoch": 0.8882707464070468, - "grad_norm": 4.187894821166992, - "learning_rate": 7.509423610629896e-07, - "loss": 0.8477, - "step": 19160 - }, - { - "epoch": 0.8885025498377376, - "grad_norm": 3.720407724380493, - "learning_rate": 7.478690425852197e-07, - "loss": 0.9191, - "step": 19165 - }, - { - "epoch": 0.8887343532684284, - "grad_norm": 4.027552604675293, - "learning_rate": 7.448017816460829e-07, - "loss": 0.8655, - "step": 19170 - }, - { - "epoch": 0.8889661566991192, - "grad_norm": 3.9079010486602783, - "learning_rate": 7.417405802537725e-07, - "loss": 0.7091, - "step": 19175 - }, - { - "epoch": 0.8891979601298099, - "grad_norm": 3.516188144683838, - "learning_rate": 7.386854404125088e-07, - "loss": 0.7183, - "step": 19180 - }, - { - "epoch": 0.8894297635605007, - "grad_norm": 2.9960403442382812, - "learning_rate": 7.356363641225484e-07, - "loss": 0.7832, - "step": 19185 - }, - { - "epoch": 0.8896615669911915, - "grad_norm": 4.731221675872803, - "learning_rate": 7.325933533801766e-07, - "loss": 0.8783, - "step": 19190 - }, - { - "epoch": 0.8898933704218822, - "grad_norm": 3.7551865577697754, - "learning_rate": 7.29556410177712e-07, - "loss": 0.9412, - "step": 19195 - }, - { - "epoch": 0.8901251738525731, - "grad_norm": 3.833738327026367, - "learning_rate": 7.265255365034907e-07, - "loss": 0.7348, - "step": 19200 - }, - { - "epoch": 0.8901251738525731, - "eval_loss": 0.8742764592170715, - "eval_runtime": 11.2535, - "eval_samples_per_second": 11.285, - "eval_steps_per_second": 11.285, - "step": 19200 - }, - { - "epoch": 0.8903569772832638, - "grad_norm": 3.7898571491241455, - "learning_rate": 7.235007343418843e-07, - "loss": 0.8018, - "step": 19205 - }, - { - "epoch": 0.8905887807139545, - "grad_norm": 3.9264822006225586, - "learning_rate": 7.204820056732897e-07, - "loss": 0.9964, - "step": 19210 - }, - { - "epoch": 0.8908205841446454, - "grad_norm": 3.960806369781494, - "learning_rate": 7.17469352474115e-07, - "loss": 0.7303, - "step": 19215 - }, - { - "epoch": 0.8910523875753361, - "grad_norm": 3.6674251556396484, - "learning_rate": 7.144627767168033e-07, - "loss": 1.031, - "step": 19220 - }, - { - "epoch": 0.8912841910060268, - "grad_norm": 3.619192123413086, - "learning_rate": 7.114622803698157e-07, - "loss": 0.8463, - "step": 19225 - }, - { - "epoch": 0.8915159944367177, - "grad_norm": 3.470325469970703, - "learning_rate": 7.084678653976295e-07, - "loss": 0.8261, - "step": 19230 - }, - { - "epoch": 0.8917477978674084, - "grad_norm": 4.503883361816406, - "learning_rate": 7.054795337607445e-07, - "loss": 0.9888, - "step": 19235 - }, - { - "epoch": 0.8919796012980992, - "grad_norm": 3.7686662673950195, - "learning_rate": 7.024972874156766e-07, - "loss": 0.9952, - "step": 19240 - }, - { - "epoch": 0.89221140472879, - "grad_norm": 2.989572286605835, - "learning_rate": 6.99521128314955e-07, - "loss": 0.821, - "step": 19245 - }, - { - "epoch": 0.8924432081594807, - "grad_norm": 3.647934913635254, - "learning_rate": 6.965510584071234e-07, - "loss": 1.0117, - "step": 19250 - }, - { - "epoch": 0.8926750115901715, - "grad_norm": 4.123228549957275, - "learning_rate": 6.935870796367438e-07, - "loss": 0.811, - "step": 19255 - }, - { - "epoch": 0.8929068150208623, - "grad_norm": 3.645017385482788, - "learning_rate": 6.906291939443877e-07, - "loss": 0.8398, - "step": 19260 - }, - { - "epoch": 0.8931386184515531, - "grad_norm": 3.1021242141723633, - "learning_rate": 6.876774032666356e-07, - "loss": 0.6563, - "step": 19265 - }, - { - "epoch": 0.8933704218822438, - "grad_norm": 3.449209451675415, - "learning_rate": 6.847317095360762e-07, - "loss": 0.6527, - "step": 19270 - }, - { - "epoch": 0.8936022253129347, - "grad_norm": 3.8125498294830322, - "learning_rate": 6.817921146813111e-07, - "loss": 0.8687, - "step": 19275 - }, - { - "epoch": 0.8938340287436254, - "grad_norm": 4.294896125793457, - "learning_rate": 6.788586206269476e-07, - "loss": 0.7868, - "step": 19280 - }, - { - "epoch": 0.8940658321743162, - "grad_norm": 4.346077919006348, - "learning_rate": 6.759312292935949e-07, - "loss": 0.9154, - "step": 19285 - }, - { - "epoch": 0.894297635605007, - "grad_norm": 3.7401599884033203, - "learning_rate": 6.730099425978709e-07, - "loss": 0.9543, - "step": 19290 - }, - { - "epoch": 0.8945294390356977, - "grad_norm": 3.5999746322631836, - "learning_rate": 6.70094762452399e-07, - "loss": 0.7718, - "step": 19295 - }, - { - "epoch": 0.8947612424663886, - "grad_norm": 3.3646798133850098, - "learning_rate": 6.671856907657936e-07, - "loss": 0.9172, - "step": 19300 - }, - { - "epoch": 0.8947612424663886, - "eval_loss": 0.8741331100463867, - "eval_runtime": 11.2872, - "eval_samples_per_second": 11.252, - "eval_steps_per_second": 11.252, - "step": 19300 - }, - { - "epoch": 0.8949930458970793, - "grad_norm": 3.7011406421661377, - "learning_rate": 6.642827294426801e-07, - "loss": 0.7227, - "step": 19305 - }, - { - "epoch": 0.89522484932777, - "grad_norm": 3.1988883018493652, - "learning_rate": 6.613858803836814e-07, - "loss": 0.6719, - "step": 19310 - }, - { - "epoch": 0.8954566527584609, - "grad_norm": 3.0587880611419678, - "learning_rate": 6.58495145485416e-07, - "loss": 0.7688, - "step": 19315 - }, - { - "epoch": 0.8956884561891516, - "grad_norm": 4.00978422164917, - "learning_rate": 6.556105266404999e-07, - "loss": 0.7579, - "step": 19320 - }, - { - "epoch": 0.8959202596198423, - "grad_norm": 4.0303802490234375, - "learning_rate": 6.527320257375491e-07, - "loss": 0.8285, - "step": 19325 - }, - { - "epoch": 0.8961520630505332, - "grad_norm": 3.5370168685913086, - "learning_rate": 6.498596446611671e-07, - "loss": 1.005, - "step": 19330 - }, - { - "epoch": 0.8963838664812239, - "grad_norm": 3.032008647918701, - "learning_rate": 6.469933852919541e-07, - "loss": 0.7647, - "step": 19335 - }, - { - "epoch": 0.8966156699119147, - "grad_norm": 3.647174596786499, - "learning_rate": 6.441332495065034e-07, - "loss": 0.8656, - "step": 19340 - }, - { - "epoch": 0.8968474733426055, - "grad_norm": 3.3170273303985596, - "learning_rate": 6.412792391773992e-07, - "loss": 1.0113, - "step": 19345 - }, - { - "epoch": 0.8970792767732962, - "grad_norm": 3.2985119819641113, - "learning_rate": 6.384313561732114e-07, - "loss": 0.9544, - "step": 19350 - }, - { - "epoch": 0.897311080203987, - "grad_norm": 3.6995089054107666, - "learning_rate": 6.35589602358504e-07, - "loss": 1.0398, - "step": 19355 - }, - { - "epoch": 0.8975428836346778, - "grad_norm": 3.910785436630249, - "learning_rate": 6.32753979593821e-07, - "loss": 0.9002, - "step": 19360 - }, - { - "epoch": 0.8977746870653686, - "grad_norm": 3.369471788406372, - "learning_rate": 6.299244897357004e-07, - "loss": 0.8634, - "step": 19365 - }, - { - "epoch": 0.8980064904960593, - "grad_norm": 4.074563980102539, - "learning_rate": 6.271011346366574e-07, - "loss": 0.9008, - "step": 19370 - }, - { - "epoch": 0.8982382939267501, - "grad_norm": 3.610403299331665, - "learning_rate": 6.242839161451952e-07, - "loss": 0.8019, - "step": 19375 - }, - { - "epoch": 0.8984700973574409, - "grad_norm": 3.7776875495910645, - "learning_rate": 6.214728361057997e-07, - "loss": 0.9037, - "step": 19380 - }, - { - "epoch": 0.8987019007881316, - "grad_norm": 4.211784839630127, - "learning_rate": 6.186678963589354e-07, - "loss": 0.7532, - "step": 19385 - }, - { - "epoch": 0.8989337042188225, - "grad_norm": 3.3378918170928955, - "learning_rate": 6.158690987410443e-07, - "loss": 0.8697, - "step": 19390 - }, - { - "epoch": 0.8991655076495132, - "grad_norm": 3.1902554035186768, - "learning_rate": 6.130764450845539e-07, - "loss": 0.7719, - "step": 19395 - }, - { - "epoch": 0.8993973110802039, - "grad_norm": 4.117619514465332, - "learning_rate": 6.102899372178639e-07, - "loss": 1.0541, - "step": 19400 - }, - { - "epoch": 0.8993973110802039, - "eval_loss": 0.8739257454872131, - "eval_runtime": 11.2619, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 19400 - }, - { - "epoch": 0.8996291145108948, - "grad_norm": 3.4233851432800293, - "learning_rate": 6.075095769653516e-07, - "loss": 0.8865, - "step": 19405 - }, - { - "epoch": 0.8998609179415855, - "grad_norm": 3.4738340377807617, - "learning_rate": 6.047353661473698e-07, - "loss": 0.7926, - "step": 19410 - }, - { - "epoch": 0.9000927213722764, - "grad_norm": 3.7461628913879395, - "learning_rate": 6.019673065802479e-07, - "loss": 0.8197, - "step": 19415 - }, - { - "epoch": 0.9003245248029671, - "grad_norm": 3.637523889541626, - "learning_rate": 5.992054000762793e-07, - "loss": 0.8103, - "step": 19420 - }, - { - "epoch": 0.9005563282336578, - "grad_norm": 4.224597930908203, - "learning_rate": 5.964496484437365e-07, - "loss": 0.8117, - "step": 19425 - }, - { - "epoch": 0.9007881316643487, - "grad_norm": 3.7092161178588867, - "learning_rate": 5.937000534868631e-07, - "loss": 0.8398, - "step": 19430 - }, - { - "epoch": 0.9010199350950394, - "grad_norm": 3.013432502746582, - "learning_rate": 5.909566170058644e-07, - "loss": 0.8795, - "step": 19435 - }, - { - "epoch": 0.9012517385257302, - "grad_norm": 3.9475176334381104, - "learning_rate": 5.882193407969216e-07, - "loss": 0.7751, - "step": 19440 - }, - { - "epoch": 0.901483541956421, - "grad_norm": 3.6993296146392822, - "learning_rate": 5.854882266521788e-07, - "loss": 0.8293, - "step": 19445 - }, - { - "epoch": 0.9017153453871117, - "grad_norm": 3.6457648277282715, - "learning_rate": 5.827632763597468e-07, - "loss": 0.8728, - "step": 19450 - }, - { - "epoch": 0.9019471488178025, - "grad_norm": 3.7621054649353027, - "learning_rate": 5.800444917036962e-07, - "loss": 0.8707, - "step": 19455 - }, - { - "epoch": 0.9021789522484933, - "grad_norm": 4.108449935913086, - "learning_rate": 5.773318744640688e-07, - "loss": 0.8549, - "step": 19460 - }, - { - "epoch": 0.902410755679184, - "grad_norm": 3.445681095123291, - "learning_rate": 5.746254264168649e-07, - "loss": 0.6915, - "step": 19465 - }, - { - "epoch": 0.9026425591098748, - "grad_norm": 3.283756732940674, - "learning_rate": 5.719251493340417e-07, - "loss": 0.8681, - "step": 19470 - }, - { - "epoch": 0.9028743625405656, - "grad_norm": 3.5738472938537598, - "learning_rate": 5.692310449835237e-07, - "loss": 0.9259, - "step": 19475 - }, - { - "epoch": 0.9031061659712564, - "grad_norm": 3.6439244747161865, - "learning_rate": 5.665431151291867e-07, - "loss": 0.7761, - "step": 19480 - }, - { - "epoch": 0.9033379694019471, - "grad_norm": 3.921151876449585, - "learning_rate": 5.638613615308708e-07, - "loss": 0.8634, - "step": 19485 - }, - { - "epoch": 0.903569772832638, - "grad_norm": 3.847851514816284, - "learning_rate": 5.61185785944366e-07, - "loss": 0.7767, - "step": 19490 - }, - { - "epoch": 0.9038015762633287, - "grad_norm": 4.24960994720459, - "learning_rate": 5.585163901214197e-07, - "loss": 0.7588, - "step": 19495 - }, - { - "epoch": 0.9040333796940194, - "grad_norm": 3.390794515609741, - "learning_rate": 5.558531758097396e-07, - "loss": 0.9196, - "step": 19500 - }, - { - "epoch": 0.9040333796940194, - "eval_loss": 0.8735858201980591, - "eval_runtime": 11.2655, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 19500 - }, - { - "epoch": 0.9042651831247103, - "grad_norm": 3.261413097381592, - "learning_rate": 5.531961447529743e-07, - "loss": 0.7741, - "step": 19505 - }, - { - "epoch": 0.904496986555401, - "grad_norm": 3.9807848930358887, - "learning_rate": 5.505452986907322e-07, - "loss": 0.709, - "step": 19510 - }, - { - "epoch": 0.9047287899860917, - "grad_norm": 4.0768632888793945, - "learning_rate": 5.479006393585728e-07, - "loss": 0.8462, - "step": 19515 - }, - { - "epoch": 0.9049605934167826, - "grad_norm": 3.7398440837860107, - "learning_rate": 5.452621684879988e-07, - "loss": 0.8092, - "step": 19520 - }, - { - "epoch": 0.9051923968474733, - "grad_norm": 3.3626298904418945, - "learning_rate": 5.426298878064684e-07, - "loss": 0.7902, - "step": 19525 - }, - { - "epoch": 0.9054242002781642, - "grad_norm": 3.6051082611083984, - "learning_rate": 5.40003799037383e-07, - "loss": 0.8525, - "step": 19530 - }, - { - "epoch": 0.9056560037088549, - "grad_norm": 3.7399423122406006, - "learning_rate": 5.373839039000905e-07, - "loss": 0.8411, - "step": 19535 - }, - { - "epoch": 0.9058878071395456, - "grad_norm": 3.000722646713257, - "learning_rate": 5.347702041098834e-07, - "loss": 0.6594, - "step": 19540 - }, - { - "epoch": 0.9061196105702365, - "grad_norm": 3.077620029449463, - "learning_rate": 5.321627013779984e-07, - "loss": 0.7542, - "step": 19545 - }, - { - "epoch": 0.9063514140009272, - "grad_norm": 3.0676279067993164, - "learning_rate": 5.295613974116165e-07, - "loss": 0.6199, - "step": 19550 - }, - { - "epoch": 0.906583217431618, - "grad_norm": 3.53662371635437, - "learning_rate": 5.269662939138576e-07, - "loss": 0.6298, - "step": 19555 - }, - { - "epoch": 0.9068150208623088, - "grad_norm": 3.4631190299987793, - "learning_rate": 5.243773925837847e-07, - "loss": 0.8718, - "step": 19560 - }, - { - "epoch": 0.9070468242929995, - "grad_norm": 3.2487220764160156, - "learning_rate": 5.217946951163955e-07, - "loss": 0.7724, - "step": 19565 - }, - { - "epoch": 0.9072786277236903, - "grad_norm": 4.004088401794434, - "learning_rate": 5.192182032026327e-07, - "loss": 0.9907, - "step": 19570 - }, - { - "epoch": 0.9075104311543811, - "grad_norm": 3.633103132247925, - "learning_rate": 5.166479185293694e-07, - "loss": 0.9067, - "step": 19575 - }, - { - "epoch": 0.9077422345850719, - "grad_norm": 4.3104095458984375, - "learning_rate": 5.140838427794192e-07, - "loss": 0.8647, - "step": 19580 - }, - { - "epoch": 0.9079740380157626, - "grad_norm": 4.145888805389404, - "learning_rate": 5.115259776315307e-07, - "loss": 0.817, - "step": 19585 - }, - { - "epoch": 0.9082058414464534, - "grad_norm": 3.9163827896118164, - "learning_rate": 5.089743247603829e-07, - "loss": 0.8899, - "step": 19590 - }, - { - "epoch": 0.9084376448771442, - "grad_norm": 3.5132017135620117, - "learning_rate": 5.064288858365885e-07, - "loss": 0.747, - "step": 19595 - }, - { - "epoch": 0.9086694483078349, - "grad_norm": 3.710824966430664, - "learning_rate": 5.038896625266942e-07, - "loss": 0.9545, - "step": 19600 - }, - { - "epoch": 0.9086694483078349, - "eval_loss": 0.8735297918319702, - "eval_runtime": 11.2733, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 19600 - }, - { - "epoch": 0.9089012517385258, - "grad_norm": 3.6324193477630615, - "learning_rate": 5.013566564931771e-07, - "loss": 0.8615, - "step": 19605 - }, - { - "epoch": 0.9091330551692165, - "grad_norm": 3.605224370956421, - "learning_rate": 4.988298693944382e-07, - "loss": 0.7686, - "step": 19610 - }, - { - "epoch": 0.9093648585999072, - "grad_norm": 4.282202243804932, - "learning_rate": 4.963093028848154e-07, - "loss": 0.8672, - "step": 19615 - }, - { - "epoch": 0.9095966620305981, - "grad_norm": 3.4218647480010986, - "learning_rate": 4.937949586145696e-07, - "loss": 0.7751, - "step": 19620 - }, - { - "epoch": 0.9098284654612888, - "grad_norm": 4.3748345375061035, - "learning_rate": 4.91286838229883e-07, - "loss": 1.0064, - "step": 19625 - }, - { - "epoch": 0.9100602688919796, - "grad_norm": 3.9138522148132324, - "learning_rate": 4.887849433728709e-07, - "loss": 0.7735, - "step": 19630 - }, - { - "epoch": 0.9102920723226704, - "grad_norm": 3.642190933227539, - "learning_rate": 4.862892756815707e-07, - "loss": 0.9095, - "step": 19635 - }, - { - "epoch": 0.9105238757533611, - "grad_norm": 3.8553028106689453, - "learning_rate": 4.837998367899388e-07, - "loss": 0.7353, - "step": 19640 - }, - { - "epoch": 0.9107556791840519, - "grad_norm": 3.8063559532165527, - "learning_rate": 4.813166283278569e-07, - "loss": 0.995, - "step": 19645 - }, - { - "epoch": 0.9109874826147427, - "grad_norm": 3.4844539165496826, - "learning_rate": 4.78839651921128e-07, - "loss": 0.7777, - "step": 19650 - }, - { - "epoch": 0.9112192860454335, - "grad_norm": 3.6754894256591797, - "learning_rate": 4.763689091914736e-07, - "loss": 0.8534, - "step": 19655 - }, - { - "epoch": 0.9114510894761243, - "grad_norm": 3.5442609786987305, - "learning_rate": 4.73904401756532e-07, - "loss": 0.7036, - "step": 19660 - }, - { - "epoch": 0.911682892906815, - "grad_norm": 3.768289804458618, - "learning_rate": 4.714461312298624e-07, - "loss": 0.8011, - "step": 19665 - }, - { - "epoch": 0.9119146963375058, - "grad_norm": 3.958264112472534, - "learning_rate": 4.689940992209407e-07, - "loss": 0.9242, - "step": 19670 - }, - { - "epoch": 0.9121464997681966, - "grad_norm": 3.9602081775665283, - "learning_rate": 4.66548307335154e-07, - "loss": 1.0384, - "step": 19675 - }, - { - "epoch": 0.9123783031988874, - "grad_norm": 3.245852470397949, - "learning_rate": 4.6410875717380897e-07, - "loss": 0.7956, - "step": 19680 - }, - { - "epoch": 0.9126101066295781, - "grad_norm": 3.358444929122925, - "learning_rate": 4.616754503341214e-07, - "loss": 0.7481, - "step": 19685 - }, - { - "epoch": 0.9128419100602689, - "grad_norm": 3.517886161804199, - "learning_rate": 4.592483884092247e-07, - "loss": 0.872, - "step": 19690 - }, - { - "epoch": 0.9130737134909597, - "grad_norm": 3.7229044437408447, - "learning_rate": 4.568275729881577e-07, - "loss": 0.755, - "step": 19695 - }, - { - "epoch": 0.9133055169216504, - "grad_norm": 3.6528728008270264, - "learning_rate": 4.5441300565587264e-07, - "loss": 0.7834, - "step": 19700 - }, - { - "epoch": 0.9133055169216504, - "eval_loss": 0.8733378052711487, - "eval_runtime": 11.2608, - "eval_samples_per_second": 11.278, - "eval_steps_per_second": 11.278, - "step": 19700 - }, - { - "epoch": 0.9135373203523413, - "grad_norm": 3.802203893661499, - "learning_rate": 4.5200468799323383e-07, - "loss": 0.9832, - "step": 19705 - }, - { - "epoch": 0.913769123783032, - "grad_norm": 4.232209205627441, - "learning_rate": 4.496026215770077e-07, - "loss": 0.7035, - "step": 19710 - }, - { - "epoch": 0.9140009272137227, - "grad_norm": 4.124170780181885, - "learning_rate": 4.472068079798708e-07, - "loss": 1.0385, - "step": 19715 - }, - { - "epoch": 0.9142327306444136, - "grad_norm": 3.3067452907562256, - "learning_rate": 4.4481724877040834e-07, - "loss": 0.9504, - "step": 19720 - }, - { - "epoch": 0.9144645340751043, - "grad_norm": 3.844074249267578, - "learning_rate": 4.424339455131044e-07, - "loss": 0.9187, - "step": 19725 - }, - { - "epoch": 0.914696337505795, - "grad_norm": 3.3239314556121826, - "learning_rate": 4.4005689976835297e-07, - "loss": 0.8161, - "step": 19730 - }, - { - "epoch": 0.9149281409364859, - "grad_norm": 3.4764716625213623, - "learning_rate": 4.3768611309245145e-07, - "loss": 0.7984, - "step": 19735 - }, - { - "epoch": 0.9151599443671766, - "grad_norm": 3.828855276107788, - "learning_rate": 4.3532158703759487e-07, - "loss": 0.7868, - "step": 19740 - }, - { - "epoch": 0.9153917477978674, - "grad_norm": 3.310119390487671, - "learning_rate": 4.329633231518804e-07, - "loss": 0.8396, - "step": 19745 - }, - { - "epoch": 0.9156235512285582, - "grad_norm": 3.7126951217651367, - "learning_rate": 4.306113229793074e-07, - "loss": 0.8955, - "step": 19750 - }, - { - "epoch": 0.9158553546592489, - "grad_norm": 3.6136951446533203, - "learning_rate": 4.2826558805977527e-07, - "loss": 0.775, - "step": 19755 - }, - { - "epoch": 0.9160871580899397, - "grad_norm": 3.726264715194702, - "learning_rate": 4.259261199290765e-07, - "loss": 0.7419, - "step": 19760 - }, - { - "epoch": 0.9163189615206305, - "grad_norm": 3.7436885833740234, - "learning_rate": 4.2359292011890484e-07, - "loss": 0.7245, - "step": 19765 - }, - { - "epoch": 0.9165507649513213, - "grad_norm": 3.374852180480957, - "learning_rate": 4.2126599015684723e-07, - "loss": 0.7779, - "step": 19770 - }, - { - "epoch": 0.9167825683820121, - "grad_norm": 3.1694605350494385, - "learning_rate": 4.189453315663905e-07, - "loss": 0.942, - "step": 19775 - }, - { - "epoch": 0.9170143718127028, - "grad_norm": 4.185464382171631, - "learning_rate": 4.1663094586690824e-07, - "loss": 0.7565, - "step": 19780 - }, - { - "epoch": 0.9172461752433936, - "grad_norm": 4.132409572601318, - "learning_rate": 4.143228345736716e-07, - "loss": 0.8305, - "step": 19785 - }, - { - "epoch": 0.9174779786740844, - "grad_norm": 3.8933820724487305, - "learning_rate": 4.1202099919784633e-07, - "loss": 0.9198, - "step": 19790 - }, - { - "epoch": 0.9177097821047752, - "grad_norm": 3.0863237380981445, - "learning_rate": 4.097254412464824e-07, - "loss": 0.7378, - "step": 19795 - }, - { - "epoch": 0.9179415855354659, - "grad_norm": 3.9306046962738037, - "learning_rate": 4.0743616222252424e-07, - "loss": 0.8401, - "step": 19800 - }, - { - "epoch": 0.9179415855354659, - "eval_loss": 0.8732080459594727, - "eval_runtime": 11.2654, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 19800 - }, - { - "epoch": 0.9181733889661567, - "grad_norm": 3.6145589351654053, - "learning_rate": 4.0515316362480516e-07, - "loss": 0.8457, - "step": 19805 - }, - { - "epoch": 0.9184051923968475, - "grad_norm": 3.4796619415283203, - "learning_rate": 4.028764469480473e-07, - "loss": 0.8131, - "step": 19810 - }, - { - "epoch": 0.9186369958275382, - "grad_norm": 4.206612586975098, - "learning_rate": 4.006060136828549e-07, - "loss": 0.7882, - "step": 19815 - }, - { - "epoch": 0.9188687992582291, - "grad_norm": 3.936595916748047, - "learning_rate": 3.983418653157245e-07, - "loss": 0.7965, - "step": 19820 - }, - { - "epoch": 0.9191006026889198, - "grad_norm": 4.071559906005859, - "learning_rate": 3.960840033290358e-07, - "loss": 0.9554, - "step": 19825 - }, - { - "epoch": 0.9193324061196105, - "grad_norm": 3.226321220397949, - "learning_rate": 3.938324292010498e-07, - "loss": 0.7675, - "step": 19830 - }, - { - "epoch": 0.9195642095503014, - "grad_norm": 4.5919036865234375, - "learning_rate": 3.9158714440591496e-07, - "loss": 0.7775, - "step": 19835 - }, - { - "epoch": 0.9197960129809921, - "grad_norm": 3.887747049331665, - "learning_rate": 3.893481504136598e-07, - "loss": 0.7748, - "step": 19840 - }, - { - "epoch": 0.9200278164116829, - "grad_norm": 3.5264813899993896, - "learning_rate": 3.8711544869019404e-07, - "loss": 0.6458, - "step": 19845 - }, - { - "epoch": 0.9202596198423737, - "grad_norm": 4.041652679443359, - "learning_rate": 3.8488904069730713e-07, - "loss": 0.7981, - "step": 19850 - }, - { - "epoch": 0.9204914232730644, - "grad_norm": 3.4928979873657227, - "learning_rate": 3.826689278926721e-07, - "loss": 0.79, - "step": 19855 - }, - { - "epoch": 0.9207232267037552, - "grad_norm": 3.8515238761901855, - "learning_rate": 3.8045511172983493e-07, - "loss": 0.8458, - "step": 19860 - }, - { - "epoch": 0.920955030134446, - "grad_norm": 4.285694122314453, - "learning_rate": 3.782475936582208e-07, - "loss": 0.8627, - "step": 19865 - }, - { - "epoch": 0.9211868335651368, - "grad_norm": 3.648986577987671, - "learning_rate": 3.7604637512313245e-07, - "loss": 0.881, - "step": 19870 - }, - { - "epoch": 0.9214186369958275, - "grad_norm": 3.3540894985198975, - "learning_rate": 3.7385145756575037e-07, - "loss": 0.9356, - "step": 19875 - }, - { - "epoch": 0.9216504404265183, - "grad_norm": 4.117437362670898, - "learning_rate": 3.716628424231228e-07, - "loss": 0.9502, - "step": 19880 - }, - { - "epoch": 0.9218822438572091, - "grad_norm": 3.526902675628662, - "learning_rate": 3.694805311281802e-07, - "loss": 0.7778, - "step": 19885 - }, - { - "epoch": 0.9221140472878998, - "grad_norm": 3.7025606632232666, - "learning_rate": 3.6730452510971736e-07, - "loss": 0.7941, - "step": 19890 - }, - { - "epoch": 0.9223458507185907, - "grad_norm": 3.383410930633545, - "learning_rate": 3.6513482579240813e-07, - "loss": 0.6289, - "step": 19895 - }, - { - "epoch": 0.9225776541492814, - "grad_norm": 4.023029327392578, - "learning_rate": 3.629714345967916e-07, - "loss": 0.9294, - "step": 19900 - }, - { - "epoch": 0.9225776541492814, - "eval_loss": 0.8729880452156067, - "eval_runtime": 11.2731, - "eval_samples_per_second": 11.266, - "eval_steps_per_second": 11.266, - "step": 19900 - }, - { - "epoch": 0.9228094575799722, - "grad_norm": 3.646693229675293, - "learning_rate": 3.608143529392816e-07, - "loss": 0.8774, - "step": 19905 - }, - { - "epoch": 0.923041261010663, - "grad_norm": 3.4620437622070312, - "learning_rate": 3.586635822321594e-07, - "loss": 0.8828, - "step": 19910 - }, - { - "epoch": 0.9232730644413537, - "grad_norm": 3.016693353652954, - "learning_rate": 3.565191238835719e-07, - "loss": 0.8007, - "step": 19915 - }, - { - "epoch": 0.9235048678720446, - "grad_norm": 3.731323003768921, - "learning_rate": 3.54380979297535e-07, - "loss": 0.94, - "step": 19920 - }, - { - "epoch": 0.9237366713027353, - "grad_norm": 3.6027114391326904, - "learning_rate": 3.522491498739311e-07, - "loss": 0.9447, - "step": 19925 - }, - { - "epoch": 0.923968474733426, - "grad_norm": 3.1686642169952393, - "learning_rate": 3.5012363700850815e-07, - "loss": 0.8824, - "step": 19930 - }, - { - "epoch": 0.9242002781641169, - "grad_norm": 3.9111225605010986, - "learning_rate": 3.4800444209287744e-07, - "loss": 0.7697, - "step": 19935 - }, - { - "epoch": 0.9244320815948076, - "grad_norm": 3.5328216552734375, - "learning_rate": 3.458915665145168e-07, - "loss": 0.8081, - "step": 19940 - }, - { - "epoch": 0.9246638850254983, - "grad_norm": 4.168152332305908, - "learning_rate": 3.4378501165676313e-07, - "loss": 0.7272, - "step": 19945 - }, - { - "epoch": 0.9248956884561892, - "grad_norm": 3.5316083431243896, - "learning_rate": 3.416847788988131e-07, - "loss": 0.8657, - "step": 19950 - }, - { - "epoch": 0.9251274918868799, - "grad_norm": 3.775172472000122, - "learning_rate": 3.395908696157313e-07, - "loss": 0.908, - "step": 19955 - }, - { - "epoch": 0.9253592953175707, - "grad_norm": 3.273514986038208, - "learning_rate": 3.3750328517843766e-07, - "loss": 0.9044, - "step": 19960 - }, - { - "epoch": 0.9255910987482615, - "grad_norm": 3.712826728820801, - "learning_rate": 3.354220269537101e-07, - "loss": 0.9362, - "step": 19965 - }, - { - "epoch": 0.9258229021789522, - "grad_norm": 3.922048568725586, - "learning_rate": 3.333470963041874e-07, - "loss": 0.8773, - "step": 19970 - }, - { - "epoch": 0.926054705609643, - "grad_norm": 4.05618953704834, - "learning_rate": 3.3127849458836404e-07, - "loss": 0.8257, - "step": 19975 - }, - { - "epoch": 0.9262865090403338, - "grad_norm": 3.3972959518432617, - "learning_rate": 3.292162231605922e-07, - "loss": 0.836, - "step": 19980 - }, - { - "epoch": 0.9265183124710246, - "grad_norm": 4.221245765686035, - "learning_rate": 3.271602833710774e-07, - "loss": 0.8406, - "step": 19985 - }, - { - "epoch": 0.9267501159017153, - "grad_norm": 2.942352056503296, - "learning_rate": 3.251106765658807e-07, - "loss": 0.7356, - "step": 19990 - }, - { - "epoch": 0.9269819193324061, - "grad_norm": 4.593667507171631, - "learning_rate": 3.230674040869186e-07, - "loss": 0.9098, - "step": 19995 - }, - { - "epoch": 0.9272137227630969, - "grad_norm": 3.9805760383605957, - "learning_rate": 3.210304672719577e-07, - "loss": 0.8497, - "step": 20000 - }, - { - "epoch": 0.9272137227630969, - "eval_loss": 0.8728947639465332, - "eval_runtime": 11.2692, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 20000 - }, - { - "epoch": 0.9274455261937876, - "grad_norm": 3.8057963848114014, - "learning_rate": 3.1899986745461685e-07, - "loss": 0.7294, - "step": 20005 - }, - { - "epoch": 0.9276773296244785, - "grad_norm": 4.51212739944458, - "learning_rate": 3.1697560596436915e-07, - "loss": 0.8166, - "step": 20010 - }, - { - "epoch": 0.9279091330551692, - "grad_norm": 4.221890449523926, - "learning_rate": 3.149576841265323e-07, - "loss": 0.826, - "step": 20015 - }, - { - "epoch": 0.9281409364858599, - "grad_norm": 3.759281873703003, - "learning_rate": 3.1294610326227825e-07, - "loss": 0.7127, - "step": 20020 - }, - { - "epoch": 0.9283727399165508, - "grad_norm": 3.804107189178467, - "learning_rate": 3.109408646886247e-07, - "loss": 0.8837, - "step": 20025 - }, - { - "epoch": 0.9286045433472415, - "grad_norm": 3.8112025260925293, - "learning_rate": 3.089419697184415e-07, - "loss": 0.7981, - "step": 20030 - }, - { - "epoch": 0.9288363467779324, - "grad_norm": 3.5084173679351807, - "learning_rate": 3.0694941966043613e-07, - "loss": 0.7923, - "step": 20035 - }, - { - "epoch": 0.9290681502086231, - "grad_norm": 4.406299114227295, - "learning_rate": 3.0496321581917066e-07, - "loss": 0.8866, - "step": 20040 - }, - { - "epoch": 0.9292999536393138, - "grad_norm": 4.219856262207031, - "learning_rate": 3.029833594950493e-07, - "loss": 0.6969, - "step": 20045 - }, - { - "epoch": 0.9295317570700047, - "grad_norm": 4.252161979675293, - "learning_rate": 3.010098519843174e-07, - "loss": 0.8416, - "step": 20050 - }, - { - "epoch": 0.9297635605006954, - "grad_norm": 4.353281497955322, - "learning_rate": 2.990426945790681e-07, - "loss": 0.9764, - "step": 20055 - }, - { - "epoch": 0.9299953639313862, - "grad_norm": 3.8172295093536377, - "learning_rate": 2.970818885672355e-07, - "loss": 0.9966, - "step": 20060 - }, - { - "epoch": 0.930227167362077, - "grad_norm": 3.732015609741211, - "learning_rate": 2.95127435232595e-07, - "loss": 0.9422, - "step": 20065 - }, - { - "epoch": 0.9304589707927677, - "grad_norm": 3.7781100273132324, - "learning_rate": 2.9317933585476075e-07, - "loss": 0.8624, - "step": 20070 - }, - { - "epoch": 0.9306907742234585, - "grad_norm": 3.4985451698303223, - "learning_rate": 2.912375917091903e-07, - "loss": 0.6331, - "step": 20075 - }, - { - "epoch": 0.9309225776541493, - "grad_norm": 3.749589204788208, - "learning_rate": 2.8930220406718e-07, - "loss": 0.7681, - "step": 20080 - }, - { - "epoch": 0.93115438108484, - "grad_norm": 4.194093704223633, - "learning_rate": 2.8737317419586185e-07, - "loss": 0.8769, - "step": 20085 - }, - { - "epoch": 0.9313861845155308, - "grad_norm": 3.5070695877075195, - "learning_rate": 2.854505033582078e-07, - "loss": 0.9185, - "step": 20090 - }, - { - "epoch": 0.9316179879462216, - "grad_norm": 3.497375011444092, - "learning_rate": 2.835341928130242e-07, - "loss": 0.9454, - "step": 20095 - }, - { - "epoch": 0.9318497913769124, - "grad_norm": 3.9200212955474854, - "learning_rate": 2.8162424381495633e-07, - "loss": 0.8779, - "step": 20100 - }, - { - "epoch": 0.9318497913769124, - "eval_loss": 0.8727548718452454, - "eval_runtime": 11.2653, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 20100 - }, - { - "epoch": 0.9320815948076031, - "grad_norm": 3.960589647293091, - "learning_rate": 2.7972065761448065e-07, - "loss": 0.8635, - "step": 20105 - }, - { - "epoch": 0.932313398238294, - "grad_norm": 3.295830011367798, - "learning_rate": 2.7782343545791014e-07, - "loss": 0.7369, - "step": 20110 - }, - { - "epoch": 0.9325452016689847, - "grad_norm": 3.6758456230163574, - "learning_rate": 2.759325785873923e-07, - "loss": 0.7372, - "step": 20115 - }, - { - "epoch": 0.9327770050996754, - "grad_norm": 4.0003180503845215, - "learning_rate": 2.740480882409047e-07, - "loss": 0.8826, - "step": 20120 - }, - { - "epoch": 0.9330088085303663, - "grad_norm": 3.0322558879852295, - "learning_rate": 2.7216996565225583e-07, - "loss": 0.7696, - "step": 20125 - }, - { - "epoch": 0.933240611961057, - "grad_norm": 3.4727134704589844, - "learning_rate": 2.7029821205108885e-07, - "loss": 0.9406, - "step": 20130 - }, - { - "epoch": 0.9334724153917477, - "grad_norm": 4.067834377288818, - "learning_rate": 2.6843282866287344e-07, - "loss": 0.8498, - "step": 20135 - }, - { - "epoch": 0.9337042188224386, - "grad_norm": 3.396549701690674, - "learning_rate": 2.6657381670891047e-07, - "loss": 0.9788, - "step": 20140 - }, - { - "epoch": 0.9339360222531293, - "grad_norm": 3.457141399383545, - "learning_rate": 2.6472117740633074e-07, - "loss": 0.8151, - "step": 20145 - }, - { - "epoch": 0.9341678256838202, - "grad_norm": 4.950340747833252, - "learning_rate": 2.6287491196808957e-07, - "loss": 0.9633, - "step": 20150 - }, - { - "epoch": 0.9343996291145109, - "grad_norm": 3.4806978702545166, - "learning_rate": 2.610350216029689e-07, - "loss": 0.8388, - "step": 20155 - }, - { - "epoch": 0.9346314325452016, - "grad_norm": 4.318567276000977, - "learning_rate": 2.592015075155807e-07, - "loss": 0.8218, - "step": 20160 - }, - { - "epoch": 0.9348632359758925, - "grad_norm": 3.5549590587615967, - "learning_rate": 2.573743709063603e-07, - "loss": 0.9195, - "step": 20165 - }, - { - "epoch": 0.9350950394065832, - "grad_norm": 3.809382915496826, - "learning_rate": 2.555536129715652e-07, - "loss": 0.7518, - "step": 20170 - }, - { - "epoch": 0.935326842837274, - "grad_norm": 3.2903544902801514, - "learning_rate": 2.537392349032808e-07, - "loss": 0.8851, - "step": 20175 - }, - { - "epoch": 0.9355586462679648, - "grad_norm": 3.7616612911224365, - "learning_rate": 2.519312378894112e-07, - "loss": 0.9809, - "step": 20180 - }, - { - "epoch": 0.9357904496986555, - "grad_norm": 3.5184905529022217, - "learning_rate": 2.5012962311368737e-07, - "loss": 0.8735, - "step": 20185 - }, - { - "epoch": 0.9360222531293463, - "grad_norm": 4.265666961669922, - "learning_rate": 2.48334391755658e-07, - "loss": 0.8784, - "step": 20190 - }, - { - "epoch": 0.9362540565600371, - "grad_norm": 3.5565555095672607, - "learning_rate": 2.4654554499069395e-07, - "loss": 0.769, - "step": 20195 - }, - { - "epoch": 0.9364858599907279, - "grad_norm": 3.7593798637390137, - "learning_rate": 2.4476308398998727e-07, - "loss": 0.6453, - "step": 20200 - }, - { - "epoch": 0.9364858599907279, - "eval_loss": 0.8727805614471436, - "eval_runtime": 11.2633, - "eval_samples_per_second": 11.276, - "eval_steps_per_second": 11.276, - "step": 20200 - }, - { - "epoch": 0.9367176634214186, - "grad_norm": 4.56554651260376, - "learning_rate": 2.429870099205456e-07, - "loss": 0.8942, - "step": 20205 - }, - { - "epoch": 0.9369494668521094, - "grad_norm": 3.816410541534424, - "learning_rate": 2.4121732394519646e-07, - "loss": 0.919, - "step": 20210 - }, - { - "epoch": 0.9371812702828002, - "grad_norm": 3.6307473182678223, - "learning_rate": 2.394540272225887e-07, - "loss": 0.7905, - "step": 20215 - }, - { - "epoch": 0.9374130737134909, - "grad_norm": 4.1224684715271, - "learning_rate": 2.37697120907181e-07, - "loss": 0.7759, - "step": 20220 - }, - { - "epoch": 0.9376448771441818, - "grad_norm": 4.303830623626709, - "learning_rate": 2.3594660614925326e-07, - "loss": 0.9309, - "step": 20225 - }, - { - "epoch": 0.9378766805748725, - "grad_norm": 3.734203815460205, - "learning_rate": 2.3420248409490086e-07, - "loss": 1.0117, - "step": 20230 - }, - { - "epoch": 0.9381084840055632, - "grad_norm": 3.7188565731048584, - "learning_rate": 2.3246475588603267e-07, - "loss": 0.8211, - "step": 20235 - }, - { - "epoch": 0.9383402874362541, - "grad_norm": 4.2501983642578125, - "learning_rate": 2.3073342266036748e-07, - "loss": 0.8411, - "step": 20240 - }, - { - "epoch": 0.9385720908669448, - "grad_norm": 4.016298294067383, - "learning_rate": 2.2900848555144295e-07, - "loss": 0.8139, - "step": 20245 - }, - { - "epoch": 0.9388038942976356, - "grad_norm": 3.4987635612487793, - "learning_rate": 2.2728994568860685e-07, - "loss": 0.8471, - "step": 20250 - }, - { - "epoch": 0.9390356977283264, - "grad_norm": 4.0096049308776855, - "learning_rate": 2.2557780419701802e-07, - "loss": 0.7444, - "step": 20255 - }, - { - "epoch": 0.9392675011590171, - "grad_norm": 3.374089002609253, - "learning_rate": 2.2387206219764645e-07, - "loss": 0.8296, - "step": 20260 - }, - { - "epoch": 0.9394993045897079, - "grad_norm": 3.474876642227173, - "learning_rate": 2.2217272080727436e-07, - "loss": 0.9066, - "step": 20265 - }, - { - "epoch": 0.9397311080203987, - "grad_norm": 3.869378089904785, - "learning_rate": 2.2047978113849068e-07, - "loss": 0.7102, - "step": 20270 - }, - { - "epoch": 0.9399629114510895, - "grad_norm": 3.837383270263672, - "learning_rate": 2.1879324429969095e-07, - "loss": 0.8952, - "step": 20275 - }, - { - "epoch": 0.9401947148817803, - "grad_norm": 3.9950919151306152, - "learning_rate": 2.1711311139508528e-07, - "loss": 0.9188, - "step": 20280 - }, - { - "epoch": 0.940426518312471, - "grad_norm": 3.3735032081604004, - "learning_rate": 2.1543938352468597e-07, - "loss": 0.8943, - "step": 20285 - }, - { - "epoch": 0.9406583217431618, - "grad_norm": 3.7731034755706787, - "learning_rate": 2.137720617843131e-07, - "loss": 0.8451, - "step": 20290 - }, - { - "epoch": 0.9408901251738526, - "grad_norm": 3.9715733528137207, - "learning_rate": 2.1211114726559233e-07, - "loss": 0.8222, - "step": 20295 - }, - { - "epoch": 0.9411219286045434, - "grad_norm": 3.0871033668518066, - "learning_rate": 2.1045664105595387e-07, - "loss": 0.8157, - "step": 20300 - }, - { - "epoch": 0.9411219286045434, - "eval_loss": 0.8726610541343689, - "eval_runtime": 11.2667, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 20300 - }, - { - "epoch": 0.9413537320352341, - "grad_norm": 4.362879276275635, - "learning_rate": 2.088085442386356e-07, - "loss": 0.9497, - "step": 20305 - }, - { - "epoch": 0.9415855354659249, - "grad_norm": 3.8125417232513428, - "learning_rate": 2.0716685789267445e-07, - "loss": 0.9322, - "step": 20310 - }, - { - "epoch": 0.9418173388966157, - "grad_norm": 3.584050416946411, - "learning_rate": 2.0553158309291167e-07, - "loss": 0.7188, - "step": 20315 - }, - { - "epoch": 0.9420491423273064, - "grad_norm": 3.3716397285461426, - "learning_rate": 2.039027209099942e-07, - "loss": 0.8026, - "step": 20320 - }, - { - "epoch": 0.9422809457579973, - "grad_norm": 3.3109800815582275, - "learning_rate": 2.0228027241036675e-07, - "loss": 0.8779, - "step": 20325 - }, - { - "epoch": 0.942512749188688, - "grad_norm": 3.862812042236328, - "learning_rate": 2.0066423865627405e-07, - "loss": 0.9238, - "step": 20330 - }, - { - "epoch": 0.9427445526193787, - "grad_norm": 3.545383930206299, - "learning_rate": 1.9905462070576532e-07, - "loss": 0.9785, - "step": 20335 - }, - { - "epoch": 0.9429763560500696, - "grad_norm": 3.8723387718200684, - "learning_rate": 1.974514196126842e-07, - "loss": 0.8293, - "step": 20340 - }, - { - "epoch": 0.9432081594807603, - "grad_norm": 3.7321619987487793, - "learning_rate": 1.9585463642667668e-07, - "loss": 0.8364, - "step": 20345 - }, - { - "epoch": 0.943439962911451, - "grad_norm": 3.676527261734009, - "learning_rate": 1.9426427219318755e-07, - "loss": 0.8345, - "step": 20350 - }, - { - "epoch": 0.9436717663421419, - "grad_norm": 3.3420934677124023, - "learning_rate": 1.9268032795345615e-07, - "loss": 0.7551, - "step": 20355 - }, - { - "epoch": 0.9439035697728326, - "grad_norm": 3.7380433082580566, - "learning_rate": 1.9110280474451847e-07, - "loss": 0.8608, - "step": 20360 - }, - { - "epoch": 0.9441353732035234, - "grad_norm": 3.892584800720215, - "learning_rate": 1.8953170359920835e-07, - "loss": 0.9361, - "step": 20365 - }, - { - "epoch": 0.9443671766342142, - "grad_norm": 3.387708902359009, - "learning_rate": 1.8796702554615521e-07, - "loss": 0.6998, - "step": 20370 - }, - { - "epoch": 0.944598980064905, - "grad_norm": 4.322428226470947, - "learning_rate": 1.8640877160978066e-07, - "loss": 0.8976, - "step": 20375 - }, - { - "epoch": 0.9448307834955957, - "grad_norm": 3.898622989654541, - "learning_rate": 1.8485694281030197e-07, - "loss": 0.8889, - "step": 20380 - }, - { - "epoch": 0.9450625869262865, - "grad_norm": 4.003245830535889, - "learning_rate": 1.833115401637331e-07, - "loss": 0.8091, - "step": 20385 - }, - { - "epoch": 0.9452943903569773, - "grad_norm": 3.791826009750366, - "learning_rate": 1.8177256468187354e-07, - "loss": 0.8516, - "step": 20390 - }, - { - "epoch": 0.9455261937876681, - "grad_norm": 4.986959457397461, - "learning_rate": 1.8024001737231957e-07, - "loss": 0.9104, - "step": 20395 - }, - { - "epoch": 0.9457579972183588, - "grad_norm": 3.9102840423583984, - "learning_rate": 1.7871389923845738e-07, - "loss": 0.8164, - "step": 20400 - }, - { - "epoch": 0.9457579972183588, - "eval_loss": 0.8726429343223572, - "eval_runtime": 11.267, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 20400 - }, - { - "epoch": 0.9459898006490496, - "grad_norm": 3.6218159198760986, - "learning_rate": 1.7719421127946668e-07, - "loss": 0.7913, - "step": 20405 - }, - { - "epoch": 0.9462216040797404, - "grad_norm": 3.935687780380249, - "learning_rate": 1.7568095449031263e-07, - "loss": 0.8897, - "step": 20410 - }, - { - "epoch": 0.9464534075104312, - "grad_norm": 4.482246398925781, - "learning_rate": 1.7417412986175163e-07, - "loss": 0.7946, - "step": 20415 - }, - { - "epoch": 0.9466852109411219, - "grad_norm": 3.267892599105835, - "learning_rate": 1.726737383803312e-07, - "loss": 0.8567, - "step": 20420 - }, - { - "epoch": 0.9469170143718127, - "grad_norm": 4.164371013641357, - "learning_rate": 1.7117978102838328e-07, - "loss": 0.8608, - "step": 20425 - }, - { - "epoch": 0.9471488178025035, - "grad_norm": 3.210048198699951, - "learning_rate": 1.6969225878402883e-07, - "loss": 0.8283, - "step": 20430 - }, - { - "epoch": 0.9473806212331942, - "grad_norm": 4.08260440826416, - "learning_rate": 1.682111726211777e-07, - "loss": 0.8282, - "step": 20435 - }, - { - "epoch": 0.9476124246638851, - "grad_norm": 3.357116222381592, - "learning_rate": 1.6673652350952307e-07, - "loss": 0.7393, - "step": 20440 - }, - { - "epoch": 0.9478442280945758, - "grad_norm": 4.292599678039551, - "learning_rate": 1.6526831241454268e-07, - "loss": 0.8278, - "step": 20445 - }, - { - "epoch": 0.9480760315252665, - "grad_norm": 4.124458312988281, - "learning_rate": 1.638065402975031e-07, - "loss": 0.8579, - "step": 20450 - }, - { - "epoch": 0.9483078349559574, - "grad_norm": 3.8831825256347656, - "learning_rate": 1.6235120811545325e-07, - "loss": 0.834, - "step": 20455 - }, - { - "epoch": 0.9485396383866481, - "grad_norm": 4.089710235595703, - "learning_rate": 1.6090231682122536e-07, - "loss": 0.8033, - "step": 20460 - }, - { - "epoch": 0.9487714418173389, - "grad_norm": 4.102946758270264, - "learning_rate": 1.5945986736343287e-07, - "loss": 0.7532, - "step": 20465 - }, - { - "epoch": 0.9490032452480297, - "grad_norm": 3.067202091217041, - "learning_rate": 1.58023860686477e-07, - "loss": 0.8644, - "step": 20470 - }, - { - "epoch": 0.9492350486787204, - "grad_norm": 3.8552489280700684, - "learning_rate": 1.565942977305368e-07, - "loss": 0.9556, - "step": 20475 - }, - { - "epoch": 0.9494668521094112, - "grad_norm": 4.418707370758057, - "learning_rate": 1.5517117943157022e-07, - "loss": 0.8926, - "step": 20480 - }, - { - "epoch": 0.949698655540102, - "grad_norm": 3.372945785522461, - "learning_rate": 1.5375450672132198e-07, - "loss": 0.886, - "step": 20485 - }, - { - "epoch": 0.9499304589707928, - "grad_norm": 3.378730297088623, - "learning_rate": 1.5234428052731233e-07, - "loss": 0.8458, - "step": 20490 - }, - { - "epoch": 0.9501622624014835, - "grad_norm": 3.5803749561309814, - "learning_rate": 1.509405017728416e-07, - "loss": 0.7429, - "step": 20495 - }, - { - "epoch": 0.9503940658321743, - "grad_norm": 3.7786428928375244, - "learning_rate": 1.4954317137699016e-07, - "loss": 0.755, - "step": 20500 - }, - { - "epoch": 0.9503940658321743, - "eval_loss": 0.8725816011428833, - "eval_runtime": 11.2693, - "eval_samples_per_second": 11.27, - "eval_steps_per_second": 11.27, - "step": 20500 - }, - { - "epoch": 0.9506258692628651, - "grad_norm": 4.169706344604492, - "learning_rate": 1.481522902546151e-07, - "loss": 0.943, - "step": 20505 - }, - { - "epoch": 0.9508576726935558, - "grad_norm": 3.189598560333252, - "learning_rate": 1.467678593163524e-07, - "loss": 0.8133, - "step": 20510 - }, - { - "epoch": 0.9510894761242467, - "grad_norm": 4.087679386138916, - "learning_rate": 1.453898794686126e-07, - "loss": 0.7238, - "step": 20515 - }, - { - "epoch": 0.9513212795549374, - "grad_norm": 4.687533378601074, - "learning_rate": 1.4401835161358512e-07, - "loss": 0.8274, - "step": 20520 - }, - { - "epoch": 0.9515530829856282, - "grad_norm": 4.421091079711914, - "learning_rate": 1.42653276649235e-07, - "loss": 1.0243, - "step": 20525 - }, - { - "epoch": 0.951784886416319, - "grad_norm": 3.2969958782196045, - "learning_rate": 1.4129465546930177e-07, - "loss": 0.7684, - "step": 20530 - }, - { - "epoch": 0.9520166898470097, - "grad_norm": 3.84316086769104, - "learning_rate": 1.399424889632972e-07, - "loss": 0.8361, - "step": 20535 - }, - { - "epoch": 0.9522484932777006, - "grad_norm": 3.4402241706848145, - "learning_rate": 1.3859677801651094e-07, - "loss": 0.7908, - "step": 20540 - }, - { - "epoch": 0.9524802967083913, - "grad_norm": 3.5798709392547607, - "learning_rate": 1.3725752351000488e-07, - "loss": 0.6813, - "step": 20545 - }, - { - "epoch": 0.952712100139082, - "grad_norm": 3.9807188510894775, - "learning_rate": 1.3592472632061204e-07, - "loss": 0.7419, - "step": 20550 - }, - { - "epoch": 0.9529439035697729, - "grad_norm": 3.7711613178253174, - "learning_rate": 1.345983873209411e-07, - "loss": 0.9607, - "step": 20555 - }, - { - "epoch": 0.9531757070004636, - "grad_norm": 3.3518753051757812, - "learning_rate": 1.3327850737936853e-07, - "loss": 0.7509, - "step": 20560 - }, - { - "epoch": 0.9534075104311543, - "grad_norm": 5.417386531829834, - "learning_rate": 1.319650873600431e-07, - "loss": 0.9085, - "step": 20565 - }, - { - "epoch": 0.9536393138618452, - "grad_norm": 5.1365790367126465, - "learning_rate": 1.3065812812288692e-07, - "loss": 0.8834, - "step": 20570 - }, - { - "epoch": 0.9538711172925359, - "grad_norm": 3.5701076984405518, - "learning_rate": 1.2935763052358886e-07, - "loss": 0.8912, - "step": 20575 - }, - { - "epoch": 0.9541029207232267, - "grad_norm": 4.482658863067627, - "learning_rate": 1.280635954136089e-07, - "loss": 0.932, - "step": 20580 - }, - { - "epoch": 0.9543347241539175, - "grad_norm": 4.094351291656494, - "learning_rate": 1.2677602364017382e-07, - "loss": 0.9277, - "step": 20585 - }, - { - "epoch": 0.9545665275846082, - "grad_norm": 3.1283581256866455, - "learning_rate": 1.254949160462826e-07, - "loss": 0.7529, - "step": 20590 - }, - { - "epoch": 0.954798331015299, - "grad_norm": 3.5420258045196533, - "learning_rate": 1.2422027347069877e-07, - "loss": 0.7797, - "step": 20595 - }, - { - "epoch": 0.9550301344459898, - "grad_norm": 3.7095041275024414, - "learning_rate": 1.229520967479536e-07, - "loss": 0.9017, - "step": 20600 - }, - { - "epoch": 0.9550301344459898, - "eval_loss": 0.872434139251709, - "eval_runtime": 11.2645, - "eval_samples_per_second": 11.274, - "eval_steps_per_second": 11.274, - "step": 20600 - }, - { - "epoch": 0.9552619378766806, - "grad_norm": 4.508840084075928, - "learning_rate": 1.2169038670834522e-07, - "loss": 0.9815, - "step": 20605 - }, - { - "epoch": 0.9554937413073713, - "grad_norm": 3.8109028339385986, - "learning_rate": 1.2043514417794055e-07, - "loss": 0.8162, - "step": 20610 - }, - { - "epoch": 0.9557255447380621, - "grad_norm": 3.4969725608825684, - "learning_rate": 1.1918636997856669e-07, - "loss": 0.6752, - "step": 20615 - }, - { - "epoch": 0.9559573481687529, - "grad_norm": 3.203023910522461, - "learning_rate": 1.1794406492781962e-07, - "loss": 0.9055, - "step": 20620 - }, - { - "epoch": 0.9561891515994436, - "grad_norm": 4.043225288391113, - "learning_rate": 1.1670822983906094e-07, - "loss": 0.9176, - "step": 20625 - }, - { - "epoch": 0.9564209550301345, - "grad_norm": 3.5347437858581543, - "learning_rate": 1.1547886552141229e-07, - "loss": 0.8326, - "step": 20630 - }, - { - "epoch": 0.9566527584608252, - "grad_norm": 4.313322067260742, - "learning_rate": 1.1425597277976097e-07, - "loss": 1.0108, - "step": 20635 - }, - { - "epoch": 0.9568845618915159, - "grad_norm": 2.926300525665283, - "learning_rate": 1.1303955241475872e-07, - "loss": 0.6825, - "step": 20640 - }, - { - "epoch": 0.9571163653222068, - "grad_norm": 3.7507739067077637, - "learning_rate": 1.1182960522281739e-07, - "loss": 0.6986, - "step": 20645 - }, - { - "epoch": 0.9573481687528975, - "grad_norm": 3.813779354095459, - "learning_rate": 1.1062613199610994e-07, - "loss": 0.9632, - "step": 20650 - }, - { - "epoch": 0.9575799721835884, - "grad_norm": 3.9066879749298096, - "learning_rate": 1.0942913352257389e-07, - "loss": 0.7497, - "step": 20655 - }, - { - "epoch": 0.9578117756142791, - "grad_norm": 3.8484296798706055, - "learning_rate": 1.0823861058590568e-07, - "loss": 0.8, - "step": 20660 - }, - { - "epoch": 0.9580435790449698, - "grad_norm": 3.3053674697875977, - "learning_rate": 1.0705456396556179e-07, - "loss": 0.775, - "step": 20665 - }, - { - "epoch": 0.9582753824756607, - "grad_norm": 3.2667057514190674, - "learning_rate": 1.0587699443675992e-07, - "loss": 0.7743, - "step": 20670 - }, - { - "epoch": 0.9585071859063514, - "grad_norm": 4.044334411621094, - "learning_rate": 1.0470590277047554e-07, - "loss": 0.9362, - "step": 20675 - }, - { - "epoch": 0.9587389893370422, - "grad_norm": 3.7937705516815186, - "learning_rate": 1.0354128973344536e-07, - "loss": 0.9165, - "step": 20680 - }, - { - "epoch": 0.958970792767733, - "grad_norm": 4.389861106872559, - "learning_rate": 1.0238315608815941e-07, - "loss": 1.0114, - "step": 20685 - }, - { - "epoch": 0.9592025961984237, - "grad_norm": 3.973416805267334, - "learning_rate": 1.0123150259287229e-07, - "loss": 0.8243, - "step": 20690 - }, - { - "epoch": 0.9594343996291145, - "grad_norm": 4.305580139160156, - "learning_rate": 1.0008633000159085e-07, - "loss": 0.9289, - "step": 20695 - }, - { - "epoch": 0.9596662030598053, - "grad_norm": 4.440845489501953, - "learning_rate": 9.894763906407978e-08, - "loss": 1.033, - "step": 20700 - }, - { - "epoch": 0.9596662030598053, - "eval_loss": 0.8724677562713623, - "eval_runtime": 11.2771, - "eval_samples_per_second": 11.262, - "eval_steps_per_second": 11.262, - "step": 20700 - }, - { - "epoch": 0.9598980064904961, - "grad_norm": 4.005496025085449, - "learning_rate": 9.781543052586273e-08, - "loss": 0.798, - "step": 20705 - }, - { - "epoch": 0.9601298099211868, - "grad_norm": 3.747995615005493, - "learning_rate": 9.668970512821674e-08, - "loss": 0.8338, - "step": 20710 - }, - { - "epoch": 0.9603616133518776, - "grad_norm": 3.5021567344665527, - "learning_rate": 9.557046360817335e-08, - "loss": 0.8593, - "step": 20715 - }, - { - "epoch": 0.9605934167825684, - "grad_norm": 2.960855484008789, - "learning_rate": 9.445770669852084e-08, - "loss": 0.7712, - "step": 20720 - }, - { - "epoch": 0.9608252202132591, - "grad_norm": 3.840736150741577, - "learning_rate": 9.335143512780198e-08, - "loss": 0.8052, - "step": 20725 - }, - { - "epoch": 0.96105702364395, - "grad_norm": 3.2714478969573975, - "learning_rate": 9.225164962031297e-08, - "loss": 0.8372, - "step": 20730 - }, - { - "epoch": 0.9612888270746407, - "grad_norm": 3.7272298336029053, - "learning_rate": 9.115835089610337e-08, - "loss": 0.8361, - "step": 20735 - }, - { - "epoch": 0.9615206305053314, - "grad_norm": 3.4043381214141846, - "learning_rate": 9.007153967097504e-08, - "loss": 0.8576, - "step": 20740 - }, - { - "epoch": 0.9617524339360223, - "grad_norm": 3.917623281478882, - "learning_rate": 8.899121665648436e-08, - "loss": 0.7097, - "step": 20745 - }, - { - "epoch": 0.961984237366713, - "grad_norm": 3.294656276702881, - "learning_rate": 8.791738255993665e-08, - "loss": 0.8041, - "step": 20750 - }, - { - "epoch": 0.9622160407974037, - "grad_norm": 4.912676811218262, - "learning_rate": 8.685003808439174e-08, - "loss": 0.9048, - "step": 20755 - }, - { - "epoch": 0.9624478442280946, - "grad_norm": 3.2419793605804443, - "learning_rate": 8.578918392866064e-08, - "loss": 0.7305, - "step": 20760 - }, - { - "epoch": 0.9626796476587853, - "grad_norm": 3.819314956665039, - "learning_rate": 8.473482078730221e-08, - "loss": 1.0409, - "step": 20765 - }, - { - "epoch": 0.9629114510894762, - "grad_norm": 2.945345640182495, - "learning_rate": 8.36869493506276e-08, - "loss": 0.7376, - "step": 20770 - }, - { - "epoch": 0.9631432545201669, - "grad_norm": 3.5658223628997803, - "learning_rate": 8.264557030469689e-08, - "loss": 0.8783, - "step": 20775 - }, - { - "epoch": 0.9633750579508576, - "grad_norm": 4.4376444816589355, - "learning_rate": 8.161068433132135e-08, - "loss": 0.8467, - "step": 20780 - }, - { - "epoch": 0.9636068613815485, - "grad_norm": 3.707390069961548, - "learning_rate": 8.058229210805901e-08, - "loss": 0.8506, - "step": 20785 - }, - { - "epoch": 0.9638386648122392, - "grad_norm": 3.9231960773468018, - "learning_rate": 7.956039430821572e-08, - "loss": 0.9538, - "step": 20790 - }, - { - "epoch": 0.96407046824293, - "grad_norm": 3.662692070007324, - "learning_rate": 7.85449916008485e-08, - "loss": 0.8462, - "step": 20795 - }, - { - "epoch": 0.9643022716736208, - "grad_norm": 3.5197982788085938, - "learning_rate": 7.75360846507589e-08, - "loss": 0.8184, - "step": 20800 - }, - { - "epoch": 0.9643022716736208, - "eval_loss": 0.8724554181098938, - "eval_runtime": 11.2675, - "eval_samples_per_second": 11.271, - "eval_steps_per_second": 11.271, - "step": 20800 - }, - { - "epoch": 0.9645340751043116, - "grad_norm": 3.512238025665283, - "learning_rate": 7.653367411849632e-08, - "loss": 0.8036, - "step": 20805 - }, - { - "epoch": 0.9647658785350023, - "grad_norm": 3.1053659915924072, - "learning_rate": 7.5537760660358e-08, - "loss": 0.7613, - "step": 20810 - }, - { - "epoch": 0.9649976819656931, - "grad_norm": 4.269003391265869, - "learning_rate": 7.454834492838681e-08, - "loss": 1.027, - "step": 20815 - }, - { - "epoch": 0.9652294853963839, - "grad_norm": 4.014410018920898, - "learning_rate": 7.356542757037011e-08, - "loss": 0.8457, - "step": 20820 - }, - { - "epoch": 0.9654612888270746, - "grad_norm": 4.1713738441467285, - "learning_rate": 7.258900922984313e-08, - "loss": 0.8088, - "step": 20825 - }, - { - "epoch": 0.9656930922577655, - "grad_norm": 4.075442790985107, - "learning_rate": 7.161909054608451e-08, - "loss": 0.9568, - "step": 20830 - }, - { - "epoch": 0.9659248956884562, - "grad_norm": 3.7142698764801025, - "learning_rate": 7.065567215411629e-08, - "loss": 0.9301, - "step": 20835 - }, - { - "epoch": 0.9661566991191469, - "grad_norm": 3.9417366981506348, - "learning_rate": 6.969875468470721e-08, - "loss": 0.7984, - "step": 20840 - }, - { - "epoch": 0.9663885025498378, - "grad_norm": 4.075379371643066, - "learning_rate": 6.874833876436948e-08, - "loss": 0.838, - "step": 20845 - }, - { - "epoch": 0.9666203059805285, - "grad_norm": 3.971217393875122, - "learning_rate": 6.780442501535534e-08, - "loss": 0.7235, - "step": 20850 - }, - { - "epoch": 0.9668521094112192, - "grad_norm": 3.244826078414917, - "learning_rate": 6.686701405566376e-08, - "loss": 0.7689, - "step": 20855 - }, - { - "epoch": 0.9670839128419101, - "grad_norm": 3.7413337230682373, - "learning_rate": 6.593610649903493e-08, - "loss": 0.8759, - "step": 20860 - }, - { - "epoch": 0.9673157162726008, - "grad_norm": 3.7818310260772705, - "learning_rate": 6.501170295495019e-08, - "loss": 0.8278, - "step": 20865 - }, - { - "epoch": 0.9675475197032916, - "grad_norm": 3.9411723613739014, - "learning_rate": 6.40938040286343e-08, - "loss": 0.783, - "step": 20870 - }, - { - "epoch": 0.9677793231339824, - "grad_norm": 3.7038698196411133, - "learning_rate": 6.318241032105099e-08, - "loss": 0.8504, - "step": 20875 - }, - { - "epoch": 0.9680111265646731, - "grad_norm": 3.660369396209717, - "learning_rate": 6.22775224289085e-08, - "loss": 0.9106, - "step": 20880 - }, - { - "epoch": 0.9682429299953639, - "grad_norm": 3.6254231929779053, - "learning_rate": 6.13791409446518e-08, - "loss": 0.8556, - "step": 20885 - }, - { - "epoch": 0.9684747334260547, - "grad_norm": 3.7178094387054443, - "learning_rate": 6.048726645646819e-08, - "loss": 0.7655, - "step": 20890 - }, - { - "epoch": 0.9687065368567455, - "grad_norm": 3.7253708839416504, - "learning_rate": 5.960189954828277e-08, - "loss": 0.9205, - "step": 20895 - }, - { - "epoch": 0.9689383402874363, - "grad_norm": 5.241983413696289, - "learning_rate": 5.87230407997641e-08, - "loss": 0.8271, - "step": 20900 - }, - { - "epoch": 0.9689383402874363, - "eval_loss": 0.8723733425140381, - "eval_runtime": 11.274, - "eval_samples_per_second": 11.265, - "eval_steps_per_second": 11.265, - "step": 20900 - }, - { - "epoch": 0.969170143718127, - "grad_norm": 3.9776298999786377, - "learning_rate": 5.785069078631411e-08, - "loss": 1.0213, - "step": 20905 - }, - { - "epoch": 0.9694019471488178, - "grad_norm": 4.242099285125732, - "learning_rate": 5.698485007907817e-08, - "loss": 0.7948, - "step": 20910 - }, - { - "epoch": 0.9696337505795086, - "grad_norm": 3.6130335330963135, - "learning_rate": 5.612551924493836e-08, - "loss": 0.8865, - "step": 20915 - }, - { - "epoch": 0.9698655540101994, - "grad_norm": 3.73136305809021, - "learning_rate": 5.527269884651132e-08, - "loss": 0.8146, - "step": 20920 - }, - { - "epoch": 0.9700973574408901, - "grad_norm": 3.5985543727874756, - "learning_rate": 5.442638944215706e-08, - "loss": 0.7532, - "step": 20925 - }, - { - "epoch": 0.9703291608715809, - "grad_norm": 3.584609031677246, - "learning_rate": 5.3586591585969016e-08, - "loss": 0.8114, - "step": 20930 - }, - { - "epoch": 0.9705609643022717, - "grad_norm": 3.8732564449310303, - "learning_rate": 5.2753305827776266e-08, - "loss": 0.8789, - "step": 20935 - }, - { - "epoch": 0.9707927677329624, - "grad_norm": 4.232132434844971, - "learning_rate": 5.192653271314907e-08, - "loss": 0.8396, - "step": 20940 - }, - { - "epoch": 0.9710245711636533, - "grad_norm": 3.9535317420959473, - "learning_rate": 5.1106272783388865e-08, - "loss": 0.7687, - "step": 20945 - }, - { - "epoch": 0.971256374594344, - "grad_norm": 3.3350656032562256, - "learning_rate": 5.029252657553607e-08, - "loss": 0.7718, - "step": 20950 - }, - { - "epoch": 0.9714881780250347, - "grad_norm": 3.9200267791748047, - "learning_rate": 4.948529462236451e-08, - "loss": 0.8927, - "step": 20955 - }, - { - "epoch": 0.9717199814557256, - "grad_norm": 4.135104656219482, - "learning_rate": 4.868457745238253e-08, - "loss": 0.7815, - "step": 20960 - }, - { - "epoch": 0.9719517848864163, - "grad_norm": 3.703777551651001, - "learning_rate": 4.7890375589836334e-08, - "loss": 0.8333, - "step": 20965 - }, - { - "epoch": 0.972183588317107, - "grad_norm": 4.566959381103516, - "learning_rate": 4.710268955470332e-08, - "loss": 1.0059, - "step": 20970 - }, - { - "epoch": 0.9724153917477979, - "grad_norm": 3.848745107650757, - "learning_rate": 4.63215198626954e-08, - "loss": 0.8187, - "step": 20975 - }, - { - "epoch": 0.9726471951784886, - "grad_norm": 4.270590305328369, - "learning_rate": 4.554686702525901e-08, - "loss": 0.9922, - "step": 20980 - }, - { - "epoch": 0.9728789986091794, - "grad_norm": 4.593664646148682, - "learning_rate": 4.477873154957402e-08, - "loss": 0.8601, - "step": 20985 - }, - { - "epoch": 0.9731108020398702, - "grad_norm": 4.01714563369751, - "learning_rate": 4.401711393855257e-08, - "loss": 0.9009, - "step": 20990 - }, - { - "epoch": 0.973342605470561, - "grad_norm": 3.4283859729766846, - "learning_rate": 4.326201469083802e-08, - "loss": 0.7851, - "step": 20995 - }, - { - "epoch": 0.9735744089012517, - "grad_norm": 3.7679734230041504, - "learning_rate": 4.251343430080934e-08, - "loss": 0.8801, - "step": 21000 - }, - { - "epoch": 0.9735744089012517, - "eval_loss": 0.8723198771476746, - "eval_runtime": 11.2701, - "eval_samples_per_second": 11.269, - "eval_steps_per_second": 11.269, - "step": 21000 - }, - { - "epoch": 0.9738062123319425, - "grad_norm": 3.2241997718811035, - "learning_rate": 4.177137325857672e-08, - "loss": 0.8128, - "step": 21005 - }, - { - "epoch": 0.9740380157626333, - "grad_norm": 4.312016010284424, - "learning_rate": 4.103583204997708e-08, - "loss": 0.8834, - "step": 21010 - }, - { - "epoch": 0.9742698191933241, - "grad_norm": 4.0609941482543945, - "learning_rate": 4.03068111565863e-08, - "loss": 0.9945, - "step": 21015 - }, - { - "epoch": 0.9745016226240149, - "grad_norm": 4.250798225402832, - "learning_rate": 3.9584311055707036e-08, - "loss": 0.9721, - "step": 21020 - }, - { - "epoch": 0.9747334260547056, - "grad_norm": 3.8703458309173584, - "learning_rate": 3.886833222037201e-08, - "loss": 0.7808, - "step": 21025 - }, - { - "epoch": 0.9749652294853964, - "grad_norm": 4.010364055633545, - "learning_rate": 3.815887511934735e-08, - "loss": 0.7971, - "step": 21030 - }, - { - "epoch": 0.9751970329160872, - "grad_norm": 3.7243025302886963, - "learning_rate": 3.745594021712484e-08, - "loss": 0.8468, - "step": 21035 - }, - { - "epoch": 0.9754288363467779, - "grad_norm": 3.7182421684265137, - "learning_rate": 3.6759527973931894e-08, - "loss": 0.8744, - "step": 21040 - }, - { - "epoch": 0.9756606397774688, - "grad_norm": 3.662433624267578, - "learning_rate": 3.6069638845719347e-08, - "loss": 0.7838, - "step": 21045 - }, - { - "epoch": 0.9758924432081595, - "grad_norm": 3.870577096939087, - "learning_rate": 3.538627328417255e-08, - "loss": 0.7402, - "step": 21050 - }, - { - "epoch": 0.9761242466388502, - "grad_norm": 4.221757888793945, - "learning_rate": 3.470943173670249e-08, - "loss": 0.7746, - "step": 21055 - }, - { - "epoch": 0.9763560500695411, - "grad_norm": 3.4447455406188965, - "learning_rate": 3.4039114646449154e-08, - "loss": 0.6891, - "step": 21060 - }, - { - "epoch": 0.9765878535002318, - "grad_norm": 3.708122491836548, - "learning_rate": 3.337532245228148e-08, - "loss": 0.7792, - "step": 21065 - }, - { - "epoch": 0.9768196569309225, - "grad_norm": 4.2107744216918945, - "learning_rate": 3.271805558879737e-08, - "loss": 0.8349, - "step": 21070 - }, - { - "epoch": 0.9770514603616134, - "grad_norm": 4.240828037261963, - "learning_rate": 3.2067314486320386e-08, - "loss": 0.8192, - "step": 21075 - }, - { - "epoch": 0.9772832637923041, - "grad_norm": 3.749157428741455, - "learning_rate": 3.1423099570903057e-08, - "loss": 0.9146, - "step": 21080 - }, - { - "epoch": 0.9775150672229949, - "grad_norm": 5.120777130126953, - "learning_rate": 3.078541126432466e-08, - "loss": 0.9231, - "step": 21085 - }, - { - "epoch": 0.9777468706536857, - "grad_norm": 3.7648401260375977, - "learning_rate": 3.0154249984090114e-08, - "loss": 0.8833, - "step": 21090 - }, - { - "epoch": 0.9779786740843764, - "grad_norm": 3.9840023517608643, - "learning_rate": 2.9529616143434416e-08, - "loss": 0.7592, - "step": 21095 - }, - { - "epoch": 0.9782104775150672, - "grad_norm": 3.708495855331421, - "learning_rate": 2.8911510151314883e-08, - "loss": 0.7202, - "step": 21100 - }, - { - "epoch": 0.9782104775150672, - "eval_loss": 0.8723410367965698, - "eval_runtime": 11.2623, - "eval_samples_per_second": 11.277, - "eval_steps_per_second": 11.277, - "step": 21100 - }, - { - "epoch": 0.978442280945758, - "grad_norm": 3.607025384902954, - "learning_rate": 2.8299932412416686e-08, - "loss": 0.8083, - "step": 21105 - }, - { - "epoch": 0.9786740843764488, - "grad_norm": 3.52121901512146, - "learning_rate": 2.7694883327152867e-08, - "loss": 0.8049, - "step": 21110 - }, - { - "epoch": 0.9789058878071395, - "grad_norm": 3.9384076595306396, - "learning_rate": 2.7096363291657657e-08, - "loss": 0.9872, - "step": 21115 - }, - { - "epoch": 0.9791376912378303, - "grad_norm": 3.609402656555176, - "learning_rate": 2.6504372697795377e-08, - "loss": 0.8411, - "step": 21120 - }, - { - "epoch": 0.9793694946685211, - "grad_norm": 4.334535598754883, - "learning_rate": 2.5918911933151548e-08, - "loss": 0.8178, - "step": 21125 - }, - { - "epoch": 0.9796012980992118, - "grad_norm": 3.8995320796966553, - "learning_rate": 2.5339981381037326e-08, - "loss": 0.935, - "step": 21130 - }, - { - "epoch": 0.9798331015299027, - "grad_norm": 4.084055423736572, - "learning_rate": 2.4767581420491738e-08, - "loss": 0.96, - "step": 21135 - }, - { - "epoch": 0.9800649049605934, - "grad_norm": 4.670334815979004, - "learning_rate": 2.4201712426272783e-08, - "loss": 0.7796, - "step": 21140 - }, - { - "epoch": 0.9802967083912842, - "grad_norm": 3.5102531909942627, - "learning_rate": 2.3642374768866326e-08, - "loss": 0.8136, - "step": 21145 - }, - { - "epoch": 0.980528511821975, - "grad_norm": 4.083783149719238, - "learning_rate": 2.3089568814480546e-08, - "loss": 0.9431, - "step": 21150 - }, - { - "epoch": 0.9807603152526657, - "grad_norm": 4.43557596206665, - "learning_rate": 2.2543294925048142e-08, - "loss": 0.9378, - "step": 21155 - }, - { - "epoch": 0.9809921186833566, - "grad_norm": 3.0960304737091064, - "learning_rate": 2.2003553458224135e-08, - "loss": 0.8348, - "step": 21160 - }, - { - "epoch": 0.9812239221140473, - "grad_norm": 4.227958679199219, - "learning_rate": 2.1470344767386962e-08, - "loss": 0.849, - "step": 21165 - }, - { - "epoch": 0.981455725544738, - "grad_norm": 3.6003923416137695, - "learning_rate": 2.0943669201638482e-08, - "loss": 0.8085, - "step": 21170 - }, - { - "epoch": 0.9816875289754289, - "grad_norm": 3.7972798347473145, - "learning_rate": 2.0423527105802865e-08, - "loss": 0.8498, - "step": 21175 - }, - { - "epoch": 0.9819193324061196, - "grad_norm": 3.423555850982666, - "learning_rate": 1.9909918820425476e-08, - "loss": 0.8486, - "step": 21180 - }, - { - "epoch": 0.9821511358368104, - "grad_norm": 3.361279010772705, - "learning_rate": 1.9402844681777334e-08, - "loss": 0.819, - "step": 21185 - }, - { - "epoch": 0.9823829392675012, - "grad_norm": 3.7809674739837646, - "learning_rate": 1.89023050218462e-08, - "loss": 0.8435, - "step": 21190 - }, - { - "epoch": 0.9826147426981919, - "grad_norm": 3.679137945175171, - "learning_rate": 1.8408300168346605e-08, - "loss": 0.8949, - "step": 21195 - }, - { - "epoch": 0.9828465461288827, - "grad_norm": 3.5145680904388428, - "learning_rate": 1.7920830444712045e-08, - "loss": 0.7488, - "step": 21200 - }, - { - "epoch": 0.9828465461288827, - "eval_loss": 0.872364342212677, - "eval_runtime": 11.2794, - "eval_samples_per_second": 11.259, - "eval_steps_per_second": 11.259, - "step": 21200 - }, - { - "epoch": 0.9830783495595735, - "grad_norm": 3.847689628601074, - "learning_rate": 1.7439896170098337e-08, - "loss": 0.7922, - "step": 21205 - }, - { - "epoch": 0.9833101529902643, - "grad_norm": 3.7140440940856934, - "learning_rate": 1.696549765938027e-08, - "loss": 0.7843, - "step": 21210 - }, - { - "epoch": 0.983541956420955, - "grad_norm": 3.18188738822937, - "learning_rate": 1.649763522315606e-08, - "loss": 0.861, - "step": 21215 - }, - { - "epoch": 0.9837737598516458, - "grad_norm": 3.7658166885375977, - "learning_rate": 1.6036309167745122e-08, - "loss": 0.6235, - "step": 21220 - }, - { - "epoch": 0.9840055632823366, - "grad_norm": 3.599701404571533, - "learning_rate": 1.558151979518363e-08, - "loss": 0.7665, - "step": 21225 - }, - { - "epoch": 0.9842373667130273, - "grad_norm": 4.016284942626953, - "learning_rate": 1.5133267403232287e-08, - "loss": 0.9517, - "step": 21230 - }, - { - "epoch": 0.9844691701437182, - "grad_norm": 3.475090265274048, - "learning_rate": 1.469155228536856e-08, - "loss": 0.8615, - "step": 21235 - }, - { - "epoch": 0.9847009735744089, - "grad_norm": 4.081587791442871, - "learning_rate": 1.4256374730793333e-08, - "loss": 0.8005, - "step": 21240 - }, - { - "epoch": 0.9849327770050996, - "grad_norm": 3.612630605697632, - "learning_rate": 1.382773502442314e-08, - "loss": 0.8847, - "step": 21245 - }, - { - "epoch": 0.9851645804357905, - "grad_norm": 3.211310863494873, - "learning_rate": 1.3405633446897936e-08, - "loss": 0.7803, - "step": 21250 - }, - { - "epoch": 0.9853963838664812, - "grad_norm": 3.2925093173980713, - "learning_rate": 1.299007027457444e-08, - "loss": 0.8834, - "step": 21255 - }, - { - "epoch": 0.9856281872971719, - "grad_norm": 3.815549850463867, - "learning_rate": 1.2581045779529455e-08, - "loss": 0.8248, - "step": 21260 - }, - { - "epoch": 0.9858599907278628, - "grad_norm": 3.699127435684204, - "learning_rate": 1.2178560229558766e-08, - "loss": 0.9212, - "step": 21265 - }, - { - "epoch": 0.9860917941585535, - "grad_norm": 3.7043263912200928, - "learning_rate": 1.1782613888177142e-08, - "loss": 0.9181, - "step": 21270 - }, - { - "epoch": 0.9863235975892444, - "grad_norm": 4.019752502441406, - "learning_rate": 1.1393207014618324e-08, - "loss": 0.9589, - "step": 21275 - }, - { - "epoch": 0.9865554010199351, - "grad_norm": 4.226020812988281, - "learning_rate": 1.101033986383393e-08, - "loss": 0.9858, - "step": 21280 - }, - { - "epoch": 0.9867872044506258, - "grad_norm": 3.518124580383301, - "learning_rate": 1.0634012686493444e-08, - "loss": 0.8142, - "step": 21285 - }, - { - "epoch": 0.9870190078813167, - "grad_norm": 4.09403657913208, - "learning_rate": 1.0264225728985333e-08, - "loss": 0.9373, - "step": 21290 - }, - { - "epoch": 0.9872508113120074, - "grad_norm": 3.8443639278411865, - "learning_rate": 9.90097923341593e-09, - "loss": 0.8152, - "step": 21295 - }, - { - "epoch": 0.9874826147426982, - "grad_norm": 4.340132713317871, - "learning_rate": 9.544273437609441e-09, - "loss": 0.7602, - "step": 21300 - }, - { - "epoch": 0.9874826147426982, - "eval_loss": 0.8722533583641052, - "eval_runtime": 11.2748, - "eval_samples_per_second": 11.264, - "eval_steps_per_second": 11.264, - "step": 21300 - }, - { - "epoch": 0.987714418173389, - "grad_norm": 3.7367873191833496, - "learning_rate": 9.194108575106831e-09, - "loss": 0.8509, - "step": 21305 - }, - { - "epoch": 0.9879462216040797, - "grad_norm": 3.9204158782958984, - "learning_rate": 8.850484875169152e-09, - "loss": 0.9756, - "step": 21310 - }, - { - "epoch": 0.9881780250347705, - "grad_norm": 4.005619049072266, - "learning_rate": 8.513402562772e-09, - "loss": 0.7471, - "step": 21315 - }, - { - "epoch": 0.9884098284654613, - "grad_norm": 3.505582332611084, - "learning_rate": 8.182861858608837e-09, - "loss": 0.802, - "step": 21320 - }, - { - "epoch": 0.9886416318961521, - "grad_norm": 3.2862749099731445, - "learning_rate": 7.858862979092107e-09, - "loss": 0.8035, - "step": 21325 - }, - { - "epoch": 0.9888734353268428, - "grad_norm": 3.761301040649414, - "learning_rate": 7.541406136348794e-09, - "loss": 0.8301, - "step": 21330 - }, - { - "epoch": 0.9891052387575336, - "grad_norm": 2.987476348876953, - "learning_rate": 7.230491538222639e-09, - "loss": 0.7306, - "step": 21335 - }, - { - "epoch": 0.9893370421882244, - "grad_norm": 3.9521970748901367, - "learning_rate": 6.9261193882774746e-09, - "loss": 0.8284, - "step": 21340 - }, - { - "epoch": 0.9895688456189151, - "grad_norm": 3.9067275524139404, - "learning_rate": 6.628289885789452e-09, - "loss": 0.8549, - "step": 21345 - }, - { - "epoch": 0.989800649049606, - "grad_norm": 3.8883721828460693, - "learning_rate": 6.337003225753702e-09, - "loss": 0.8476, - "step": 21350 - }, - { - "epoch": 0.9900324524802967, - "grad_norm": 4.2398295402526855, - "learning_rate": 6.052259598881005e-09, - "loss": 0.9362, - "step": 21355 - }, - { - "epoch": 0.9902642559109874, - "grad_norm": 3.8248987197875977, - "learning_rate": 5.774059191597792e-09, - "loss": 0.8736, - "step": 21360 - }, - { - "epoch": 0.9904960593416783, - "grad_norm": 4.027286052703857, - "learning_rate": 5.502402186047251e-09, - "loss": 0.8286, - "step": 21365 - }, - { - "epoch": 0.990727862772369, - "grad_norm": 3.1771819591522217, - "learning_rate": 5.237288760087111e-09, - "loss": 0.8632, - "step": 21370 - }, - { - "epoch": 0.9909596662030598, - "grad_norm": 3.260469675064087, - "learning_rate": 4.9787190872940815e-09, - "loss": 0.769, - "step": 21375 - }, - { - "epoch": 0.9911914696337506, - "grad_norm": 3.7272191047668457, - "learning_rate": 4.726693336957189e-09, - "loss": 0.9642, - "step": 21380 - }, - { - "epoch": 0.9914232730644413, - "grad_norm": 3.7813467979431152, - "learning_rate": 4.48121167408111e-09, - "loss": 0.8887, - "step": 21385 - }, - { - "epoch": 0.9916550764951322, - "grad_norm": 3.121617078781128, - "learning_rate": 4.242274259389501e-09, - "loss": 0.775, - "step": 21390 - }, - { - "epoch": 0.9918868799258229, - "grad_norm": 3.8441171646118164, - "learning_rate": 4.009881249318337e-09, - "loss": 0.8256, - "step": 21395 - }, - { - "epoch": 0.9921186833565137, - "grad_norm": 4.062152862548828, - "learning_rate": 3.7840327960181336e-09, - "loss": 0.744, - "step": 21400 - }, - { - "epoch": 0.9921186833565137, - "eval_loss": 0.8722884654998779, - "eval_runtime": 11.2657, - "eval_samples_per_second": 11.273, - "eval_steps_per_second": 11.273, - "step": 21400 - }, - { - "epoch": 0.9923504867872045, - "grad_norm": 3.2572579383850098, - "learning_rate": 3.564729047357274e-09, - "loss": 0.7409, - "step": 21405 - }, - { - "epoch": 0.9925822902178952, - "grad_norm": 4.003219127655029, - "learning_rate": 3.3519701469175714e-09, - "loss": 0.8981, - "step": 21410 - }, - { - "epoch": 0.992814093648586, - "grad_norm": 3.633725881576538, - "learning_rate": 3.145756233996489e-09, - "loss": 0.7598, - "step": 21415 - }, - { - "epoch": 0.9930458970792768, - "grad_norm": 3.757777214050293, - "learning_rate": 2.946087443606027e-09, - "loss": 0.9012, - "step": 21420 - }, - { - "epoch": 0.9932777005099676, - "grad_norm": 3.0937254428863525, - "learning_rate": 2.7529639064716175e-09, - "loss": 0.7436, - "step": 21425 - }, - { - "epoch": 0.9935095039406583, - "grad_norm": 4.3624677658081055, - "learning_rate": 2.5663857490365597e-09, - "loss": 0.8341, - "step": 21430 - }, - { - "epoch": 0.9937413073713491, - "grad_norm": 3.44085693359375, - "learning_rate": 2.3863530934564727e-09, - "loss": 0.6942, - "step": 21435 - }, - { - "epoch": 0.9939731108020399, - "grad_norm": 4.770618438720703, - "learning_rate": 2.212866057601515e-09, - "loss": 0.8691, - "step": 21440 - }, - { - "epoch": 0.9942049142327306, - "grad_norm": 4.3806915283203125, - "learning_rate": 2.0459247550574933e-09, - "loss": 0.8095, - "step": 21445 - }, - { - "epoch": 0.9944367176634215, - "grad_norm": 3.6676011085510254, - "learning_rate": 1.885529295123645e-09, - "loss": 0.8438, - "step": 21450 - }, - { - "epoch": 0.9946685210941122, - "grad_norm": 4.355186939239502, - "learning_rate": 1.7316797828126341e-09, - "loss": 0.9678, - "step": 21455 - }, - { - "epoch": 0.9949003245248029, - "grad_norm": 3.5051472187042236, - "learning_rate": 1.5843763188538863e-09, - "loss": 0.8384, - "step": 21460 - }, - { - "epoch": 0.9951321279554938, - "grad_norm": 3.705413579940796, - "learning_rate": 1.4436189996902549e-09, - "loss": 0.9325, - "step": 21465 - }, - { - "epoch": 0.9953639313861845, - "grad_norm": 3.9275357723236084, - "learning_rate": 1.3094079174769126e-09, - "loss": 0.9495, - "step": 21470 - }, - { - "epoch": 0.9955957348168752, - "grad_norm": 3.814878225326538, - "learning_rate": 1.1817431600835706e-09, - "loss": 0.8951, - "step": 21475 - }, - { - "epoch": 0.9958275382475661, - "grad_norm": 3.2377772331237793, - "learning_rate": 1.0606248110967e-09, - "loss": 0.7856, - "step": 21480 - }, - { - "epoch": 0.9960593416782568, - "grad_norm": 3.7217166423797607, - "learning_rate": 9.460529498139803e-10, - "loss": 0.9289, - "step": 21485 - }, - { - "epoch": 0.9962911451089476, - "grad_norm": 3.9231810569763184, - "learning_rate": 8.380276512465202e-10, - "loss": 0.8453, - "step": 21490 - }, - { - "epoch": 0.9965229485396384, - "grad_norm": 3.294024705886841, - "learning_rate": 7.365489861221875e-10, - "loss": 0.8054, - "step": 21495 - }, - { - "epoch": 0.9967547519703291, - "grad_norm": 3.8653006553649902, - "learning_rate": 6.416170208789485e-10, - "loss": 0.8285, - "step": 21500 - }, - { - "epoch": 0.9967547519703291, - "eval_loss": 0.8723641037940979, - "eval_runtime": 11.2669, - "eval_samples_per_second": 11.272, - "eval_steps_per_second": 11.272, - "step": 21500 - }, - { - "epoch": 0.9969865554010199, - "grad_norm": 3.1132254600524902, - "learning_rate": 5.532318176726393e-10, - "loss": 0.8889, - "step": 21505 - }, - { - "epoch": 0.9972183588317107, - "grad_norm": 3.57822322845459, - "learning_rate": 4.713934343691939e-10, - "loss": 0.8632, - "step": 21510 - }, - { - "epoch": 0.9974501622624015, - "grad_norm": 4.240692615509033, - "learning_rate": 3.9610192455019624e-10, - "loss": 0.8483, - "step": 21515 - }, - { - "epoch": 0.9976819656930923, - "grad_norm": 4.356186866760254, - "learning_rate": 3.2735733751065866e-10, - "loss": 0.9656, - "step": 21520 - }, - { - "epoch": 0.997913769123783, - "grad_norm": 3.6947021484375, - "learning_rate": 2.6515971825791243e-10, - "loss": 0.681, - "step": 21525 - }, - { - "epoch": 0.9981455725544738, - "grad_norm": 3.716294288635254, - "learning_rate": 2.0950910751493802e-10, - "loss": 0.7692, - "step": 21530 - }, - { - "epoch": 0.9983773759851646, - "grad_norm": 4.128090858459473, - "learning_rate": 1.604055417170347e-10, - "loss": 0.8413, - "step": 21535 - }, - { - "epoch": 0.9986091794158554, - "grad_norm": 4.070624828338623, - "learning_rate": 1.1784905301182036e-10, - "loss": 0.8496, - "step": 21540 - }, - { - "epoch": 0.9988409828465461, - "grad_norm": 3.773017406463623, - "learning_rate": 8.183966926367249e-11, - "loss": 0.8795, - "step": 21545 - }, - { - "epoch": 0.999072786277237, - "grad_norm": 4.658735275268555, - "learning_rate": 5.237741404817698e-11, - "loss": 0.9189, - "step": 21550 - }, - { - "epoch": 0.9993045897079277, - "grad_norm": 3.159343957901001, - "learning_rate": 2.946230665434868e-11, - "loss": 0.8484, - "step": 21555 - }, - { - "epoch": 0.9995363931386184, - "grad_norm": 3.9690911769866943, - "learning_rate": 1.3094362085741553e-11, - "loss": 0.9825, - "step": 21560 - }, - { - "epoch": 0.9997681965693093, - "grad_norm": 3.8402626514434814, - "learning_rate": 3.2735910571179976e-12, - "loss": 0.7572, - "step": 21565 - }, - { - "epoch": 1.0, - "grad_norm": 3.908137321472168, - "learning_rate": 0.0, - "loss": 0.787, - "step": 21570 - }, - { - "epoch": 1.0, - "step": 21570, - "total_flos": 37930024304640.0, - "train_loss": 0.9211775431878377, - "train_runtime": 29016.4089, - "train_samples_per_second": 0.743, - "train_steps_per_second": 0.743 + "epoch": 0.9999069680900549, + "step": 2687, + "total_flos": 8.247315480402985e+17, + "train_loss": 0.638783668202896, + "train_runtime": 14784.0018, + "train_samples_per_second": 1.454, + "train_steps_per_second": 0.182 } ], "logging_steps": 5, - "max_steps": 21570, + "max_steps": 2687, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -31953,7 +4002,7 @@ "attributes": {} } }, - "total_flos": 37930024304640.0, + "total_flos": 8.247315480402985e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null