{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 24920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.4730186462402344, "learning_rate": 2e-05, "loss": 0.3314, "step": 100 }, { "epoch": 0.02, "grad_norm": 3.50852108001709, "learning_rate": 1.9999872447769624e-05, "loss": 0.0313, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.40423300862312317, "learning_rate": 1.9999489794332404e-05, "loss": 0.0226, "step": 300 }, { "epoch": 0.03, "grad_norm": 0.4923788011074066, "learning_rate": 1.9998852049449998e-05, "loss": 0.0144, "step": 400 }, { "epoch": 0.04, "grad_norm": 0.039026230573654175, "learning_rate": 1.9997959229391567e-05, "loss": 0.0088, "step": 500 }, { "epoch": 0.05, "grad_norm": 0.13339382410049438, "learning_rate": 1.9996811356933346e-05, "loss": 0.0097, "step": 600 }, { "epoch": 0.06, "grad_norm": 0.03016798384487629, "learning_rate": 1.9995408461358074e-05, "loss": 0.0063, "step": 700 }, { "epoch": 0.06, "grad_norm": 0.12673313915729523, "learning_rate": 1.9993750578454248e-05, "loss": 0.0084, "step": 800 }, { "epoch": 0.07, "grad_norm": 0.012224284932017326, "learning_rate": 1.999183775051519e-05, "loss": 0.006, "step": 900 }, { "epoch": 0.08, "grad_norm": 2.144702434539795, "learning_rate": 1.9989670026338002e-05, "loss": 0.0076, "step": 1000 }, { "epoch": 0.09, "grad_norm": 0.02996153011918068, "learning_rate": 1.9987247461222297e-05, "loss": 0.0052, "step": 1100 }, { "epoch": 0.1, "grad_norm": 0.008165411651134491, "learning_rate": 1.9984570116968785e-05, "loss": 0.0052, "step": 1200 }, { "epoch": 0.1, "grad_norm": 0.07781720906496048, "learning_rate": 1.9981638061877714e-05, "loss": 0.0056, "step": 1300 }, { "epoch": 0.11, "grad_norm": 0.016230745241045952, "learning_rate": 1.9978451370747122e-05, "loss": 0.0052, "step": 1400 }, { "epoch": 0.12, "grad_norm": 2.58638858795166, "learning_rate": 1.997501012487091e-05, "loss": 0.0037, "step": 1500 }, { "epoch": 0.13, "grad_norm": 0.029858984053134918, "learning_rate": 1.9971314412036807e-05, "loss": 0.0037, "step": 1600 }, { "epoch": 0.14, "grad_norm": 0.2630612850189209, "learning_rate": 1.996736432652409e-05, "loss": 0.0046, "step": 1700 }, { "epoch": 0.14, "grad_norm": 0.013704606331884861, "learning_rate": 1.9963159969101207e-05, "loss": 0.0056, "step": 1800 }, { "epoch": 0.15, "grad_norm": 0.09422887861728668, "learning_rate": 1.9958701447023188e-05, "loss": 0.0041, "step": 1900 }, { "epoch": 0.16, "grad_norm": 0.02264581061899662, "learning_rate": 1.9953988874028917e-05, "loss": 0.0039, "step": 2000 }, { "epoch": 0.17, "grad_norm": 0.020697351545095444, "learning_rate": 1.994902237033824e-05, "loss": 0.0048, "step": 2100 }, { "epoch": 0.18, "grad_norm": 0.1434525102376938, "learning_rate": 1.9943802062648877e-05, "loss": 0.0051, "step": 2200 }, { "epoch": 0.18, "grad_norm": 0.06430936604738235, "learning_rate": 1.9938328084133206e-05, "loss": 0.0023, "step": 2300 }, { "epoch": 0.19, "grad_norm": 0.008740647695958614, "learning_rate": 1.9932600574434864e-05, "loss": 0.004, "step": 2400 }, { "epoch": 0.2, "grad_norm": 0.014448602683842182, "learning_rate": 1.9926619679665175e-05, "loss": 0.0043, "step": 2500 }, { "epoch": 0.21, "grad_norm": 0.02901746705174446, "learning_rate": 1.9920385552399434e-05, "loss": 0.0019, "step": 2600 }, { "epoch": 0.22, "grad_norm": 1.6094048023223877, "learning_rate": 1.9913898351673006e-05, "loss": 0.0022, "step": 2700 }, { "epoch": 0.22, "grad_norm": 0.36414438486099243, "learning_rate": 1.990715824297728e-05, "loss": 0.0056, "step": 2800 }, { "epoch": 0.23, "grad_norm": 0.1421067714691162, "learning_rate": 1.9900165398255434e-05, "loss": 0.003, "step": 2900 }, { "epoch": 0.24, "grad_norm": 0.028148509562015533, "learning_rate": 1.9892919995898052e-05, "loss": 0.0016, "step": 3000 }, { "epoch": 0.25, "grad_norm": 0.015177663415670395, "learning_rate": 1.9885422220738583e-05, "loss": 0.004, "step": 3100 }, { "epoch": 0.26, "grad_norm": 0.25167518854141235, "learning_rate": 1.9877672264048618e-05, "loss": 0.0027, "step": 3200 }, { "epoch": 0.26, "grad_norm": 0.03997454047203064, "learning_rate": 1.9869670323533005e-05, "loss": 0.003, "step": 3300 }, { "epoch": 0.27, "grad_norm": 0.0028598604258149862, "learning_rate": 1.986141660332482e-05, "loss": 0.0019, "step": 3400 }, { "epoch": 0.28, "grad_norm": 0.1343453824520111, "learning_rate": 1.9852911313980146e-05, "loss": 0.0034, "step": 3500 }, { "epoch": 0.29, "grad_norm": 0.12534217536449432, "learning_rate": 1.9844154672472707e-05, "loss": 0.0026, "step": 3600 }, { "epoch": 0.3, "grad_norm": 0.24615176022052765, "learning_rate": 1.9835146902188336e-05, "loss": 0.0018, "step": 3700 }, { "epoch": 0.3, "grad_norm": 0.013632328249514103, "learning_rate": 1.9825888232919268e-05, "loss": 0.0019, "step": 3800 }, { "epoch": 0.31, "grad_norm": 0.4895065724849701, "learning_rate": 1.9816378900858288e-05, "loss": 0.0037, "step": 3900 }, { "epoch": 0.32, "grad_norm": 1.3317159414291382, "learning_rate": 1.98066191485927e-05, "loss": 0.0022, "step": 4000 }, { "epoch": 0.33, "grad_norm": 0.008369619026780128, "learning_rate": 1.9796609225098136e-05, "loss": 0.0008, "step": 4100 }, { "epoch": 0.34, "grad_norm": 0.007471293676644564, "learning_rate": 1.9786349385732212e-05, "loss": 0.0031, "step": 4200 }, { "epoch": 0.35, "grad_norm": 0.011221354827284813, "learning_rate": 1.9775839892228004e-05, "loss": 0.0013, "step": 4300 }, { "epoch": 0.35, "grad_norm": 0.006684939842671156, "learning_rate": 1.976508101268738e-05, "loss": 0.0024, "step": 4400 }, { "epoch": 0.36, "grad_norm": 0.7115832567214966, "learning_rate": 1.9754073021574153e-05, "loss": 0.0027, "step": 4500 }, { "epoch": 0.37, "grad_norm": 0.031222190707921982, "learning_rate": 1.9742816199707096e-05, "loss": 0.0013, "step": 4600 }, { "epoch": 0.38, "grad_norm": 0.033377427607774734, "learning_rate": 1.9731310834252747e-05, "loss": 0.0046, "step": 4700 }, { "epoch": 0.39, "grad_norm": 0.3707186281681061, "learning_rate": 1.9719557218718116e-05, "loss": 0.0025, "step": 4800 }, { "epoch": 0.39, "grad_norm": 0.030659371986985207, "learning_rate": 1.970755565294318e-05, "loss": 0.0033, "step": 4900 }, { "epoch": 0.4, "grad_norm": 0.031716570258140564, "learning_rate": 1.969530644309323e-05, "loss": 0.0017, "step": 5000 }, { "epoch": 0.41, "grad_norm": 0.018681248649954796, "learning_rate": 1.9682809901651074e-05, "loss": 0.0025, "step": 5100 }, { "epoch": 0.42, "grad_norm": 0.031807415187358856, "learning_rate": 1.9670066347409063e-05, "loss": 0.0029, "step": 5200 }, { "epoch": 0.43, "grad_norm": 0.00439372006803751, "learning_rate": 1.9657076105460945e-05, "loss": 0.0024, "step": 5300 }, { "epoch": 0.43, "grad_norm": 0.2656748592853546, "learning_rate": 1.964383950719359e-05, "loss": 0.0019, "step": 5400 }, { "epoch": 0.44, "grad_norm": 0.18905937671661377, "learning_rate": 1.9630356890278527e-05, "loss": 0.0015, "step": 5500 }, { "epoch": 0.45, "grad_norm": 0.1290869414806366, "learning_rate": 1.9616628598663322e-05, "loss": 0.0034, "step": 5600 }, { "epoch": 0.46, "grad_norm": 0.004318041726946831, "learning_rate": 1.9602654982562822e-05, "loss": 0.0014, "step": 5700 }, { "epoch": 0.47, "grad_norm": 0.15103724598884583, "learning_rate": 1.9588436398450206e-05, "loss": 0.0013, "step": 5800 }, { "epoch": 0.47, "grad_norm": 0.039078064262866974, "learning_rate": 1.9573973209047893e-05, "loss": 0.0015, "step": 5900 }, { "epoch": 0.48, "grad_norm": 2.7625222206115723, "learning_rate": 1.9559265783318304e-05, "loss": 0.0018, "step": 6000 }, { "epoch": 0.49, "grad_norm": 0.32574662566185, "learning_rate": 1.9544314496454423e-05, "loss": 0.0023, "step": 6100 }, { "epoch": 0.5, "grad_norm": 0.014155671931803226, "learning_rate": 1.9529119729870253e-05, "loss": 0.0011, "step": 6200 }, { "epoch": 0.51, "grad_norm": 0.007221277803182602, "learning_rate": 1.9513681871191063e-05, "loss": 0.0033, "step": 6300 }, { "epoch": 0.51, "grad_norm": 0.013470535166561604, "learning_rate": 1.949800131424352e-05, "loss": 0.003, "step": 6400 }, { "epoch": 0.52, "grad_norm": 0.003437698120251298, "learning_rate": 1.9482078459045617e-05, "loss": 0.0012, "step": 6500 }, { "epoch": 0.53, "grad_norm": 0.006796371191740036, "learning_rate": 1.9465913711796502e-05, "loss": 0.001, "step": 6600 }, { "epoch": 0.54, "grad_norm": 0.7563895583152771, "learning_rate": 1.9449507484866084e-05, "loss": 0.0018, "step": 6700 }, { "epoch": 0.55, "grad_norm": 0.016867786645889282, "learning_rate": 1.9432860196784533e-05, "loss": 0.0016, "step": 6800 }, { "epoch": 0.55, "grad_norm": 0.006416236516088247, "learning_rate": 1.941597227223159e-05, "loss": 0.0008, "step": 6900 }, { "epoch": 0.56, "grad_norm": 0.0008421270758844912, "learning_rate": 1.9398844142025746e-05, "loss": 0.0011, "step": 7000 }, { "epoch": 0.57, "grad_norm": 0.012384007684886456, "learning_rate": 1.9381476243113243e-05, "loss": 0.0017, "step": 7100 }, { "epoch": 0.58, "grad_norm": 0.004778598435223103, "learning_rate": 1.9363869018556928e-05, "loss": 0.0016, "step": 7200 }, { "epoch": 0.59, "grad_norm": 0.00394839234650135, "learning_rate": 1.9346022917524958e-05, "loss": 0.0017, "step": 7300 }, { "epoch": 0.59, "grad_norm": 0.006818379741162062, "learning_rate": 1.9327938395279325e-05, "loss": 0.0019, "step": 7400 }, { "epoch": 0.6, "grad_norm": 0.028212955221533775, "learning_rate": 1.9309615913164262e-05, "loss": 0.0013, "step": 7500 }, { "epoch": 0.61, "grad_norm": 0.017115216702222824, "learning_rate": 1.9291055938594464e-05, "loss": 0.0017, "step": 7600 }, { "epoch": 0.62, "grad_norm": 0.02033141627907753, "learning_rate": 1.9272258945043154e-05, "loss": 0.0006, "step": 7700 }, { "epoch": 0.63, "grad_norm": 1.2443268299102783, "learning_rate": 1.9253225412030028e-05, "loss": 0.0011, "step": 7800 }, { "epoch": 0.63, "grad_norm": 0.0025956807658076286, "learning_rate": 1.9233955825109e-05, "loss": 0.001, "step": 7900 }, { "epoch": 0.64, "grad_norm": 13.584095001220703, "learning_rate": 1.9214450675855832e-05, "loss": 0.0029, "step": 8000 }, { "epoch": 0.65, "grad_norm": 0.0022025350481271744, "learning_rate": 1.919471046185558e-05, "loss": 0.001, "step": 8100 }, { "epoch": 0.66, "grad_norm": 0.002522263675928116, "learning_rate": 1.917473568668991e-05, "loss": 0.0024, "step": 8200 }, { "epoch": 0.67, "grad_norm": 1.3481580018997192, "learning_rate": 1.9154526859924242e-05, "loss": 0.0017, "step": 8300 }, { "epoch": 0.67, "grad_norm": 0.010974978096783161, "learning_rate": 1.9134084497094766e-05, "loss": 0.0008, "step": 8400 }, { "epoch": 0.68, "grad_norm": 0.005469560623168945, "learning_rate": 1.9113409119695276e-05, "loss": 0.0012, "step": 8500 }, { "epoch": 0.69, "grad_norm": 0.08603531867265701, "learning_rate": 1.9092501255163874e-05, "loss": 0.0025, "step": 8600 }, { "epoch": 0.7, "grad_norm": 0.0022059655748307705, "learning_rate": 1.907136143686951e-05, "loss": 0.0013, "step": 8700 }, { "epoch": 0.71, "grad_norm": 0.0005606790073215961, "learning_rate": 1.904999020409837e-05, "loss": 0.0014, "step": 8800 }, { "epoch": 0.71, "grad_norm": 0.012699578888714314, "learning_rate": 1.902838810204015e-05, "loss": 0.0012, "step": 8900 }, { "epoch": 0.72, "grad_norm": 0.00312532065436244, "learning_rate": 1.90065556817741e-05, "loss": 0.0012, "step": 9000 }, { "epoch": 0.73, "grad_norm": 0.008270219899713993, "learning_rate": 1.8984493500255e-05, "loss": 0.0014, "step": 9100 }, { "epoch": 0.74, "grad_norm": 0.01274858694523573, "learning_rate": 1.8962202120298948e-05, "loss": 0.0013, "step": 9200 }, { "epoch": 0.75, "grad_norm": 0.012512357905507088, "learning_rate": 1.8939682110568982e-05, "loss": 0.001, "step": 9300 }, { "epoch": 0.75, "grad_norm": 0.0043404679745435715, "learning_rate": 1.8916934045560603e-05, "loss": 0.0023, "step": 9400 }, { "epoch": 0.76, "grad_norm": 3.700385332107544, "learning_rate": 1.8893958505587093e-05, "loss": 0.0031, "step": 9500 }, { "epoch": 0.77, "grad_norm": 0.016660619527101517, "learning_rate": 1.8870756076764728e-05, "loss": 0.0019, "step": 9600 }, { "epoch": 0.78, "grad_norm": 0.047528013586997986, "learning_rate": 1.8847327350997814e-05, "loss": 0.0008, "step": 9700 }, { "epoch": 0.79, "grad_norm": 0.01734699122607708, "learning_rate": 1.8823672925963598e-05, "loss": 0.0022, "step": 9800 }, { "epoch": 0.79, "grad_norm": 0.019478172063827515, "learning_rate": 1.879979340509701e-05, "loss": 0.0009, "step": 9900 }, { "epoch": 0.8, "grad_norm": 0.1385781317949295, "learning_rate": 1.877568939757529e-05, "loss": 0.0015, "step": 10000 }, { "epoch": 0.81, "grad_norm": 0.143458753824234, "learning_rate": 1.8751361518302413e-05, "loss": 0.0008, "step": 10100 }, { "epoch": 0.82, "grad_norm": 0.003875449998304248, "learning_rate": 1.8726810387893438e-05, "loss": 0.0029, "step": 10200 }, { "epoch": 0.83, "grad_norm": 0.0030744324903935194, "learning_rate": 1.8702036632658646e-05, "loss": 0.0012, "step": 10300 }, { "epoch": 0.83, "grad_norm": 0.012408553622663021, "learning_rate": 1.867704088458759e-05, "loss": 0.0017, "step": 10400 }, { "epoch": 0.84, "grad_norm": 0.001399656874127686, "learning_rate": 1.8651823781332948e-05, "loss": 0.0006, "step": 10500 }, { "epoch": 0.85, "grad_norm": 0.001192159834317863, "learning_rate": 1.8626385966194275e-05, "loss": 0.001, "step": 10600 }, { "epoch": 0.86, "grad_norm": 0.6697074770927429, "learning_rate": 1.8600728088101587e-05, "loss": 0.0021, "step": 10700 }, { "epoch": 0.87, "grad_norm": 0.007071709726005793, "learning_rate": 1.857485080159879e-05, "loss": 0.0016, "step": 10800 }, { "epoch": 0.87, "grad_norm": 0.005609508138149977, "learning_rate": 1.8548754766827016e-05, "loss": 0.0022, "step": 10900 }, { "epoch": 0.88, "grad_norm": 0.0166325643658638, "learning_rate": 1.852244064950775e-05, "loss": 0.0032, "step": 11000 }, { "epoch": 0.89, "grad_norm": 0.0896034687757492, "learning_rate": 1.8495909120925857e-05, "loss": 0.0015, "step": 11100 }, { "epoch": 0.9, "grad_norm": 0.24882763624191284, "learning_rate": 1.846916085791247e-05, "loss": 0.0017, "step": 11200 }, { "epoch": 0.91, "grad_norm": 0.013551519252359867, "learning_rate": 1.8442196542827712e-05, "loss": 0.0007, "step": 11300 }, { "epoch": 0.91, "grad_norm": 0.01764736883342266, "learning_rate": 1.8415016863543286e-05, "loss": 0.0005, "step": 11400 }, { "epoch": 0.92, "grad_norm": 0.001765413791872561, "learning_rate": 1.8387622513424942e-05, "loss": 0.0044, "step": 11500 }, { "epoch": 0.93, "grad_norm": 0.0014637404819950461, "learning_rate": 1.836001419131476e-05, "loss": 0.0012, "step": 11600 }, { "epoch": 0.94, "grad_norm": 0.005813998635858297, "learning_rate": 1.8332192601513358e-05, "loss": 0.0006, "step": 11700 }, { "epoch": 0.95, "grad_norm": 0.0008558441768400371, "learning_rate": 1.8304158453761904e-05, "loss": 0.002, "step": 11800 }, { "epoch": 0.96, "grad_norm": 0.0033321972005069256, "learning_rate": 1.827591246322401e-05, "loss": 0.0005, "step": 11900 }, { "epoch": 0.96, "grad_norm": 0.5678846836090088, "learning_rate": 1.8247455350467496e-05, "loss": 0.0014, "step": 12000 }, { "epoch": 0.97, "grad_norm": 0.009241198189556599, "learning_rate": 1.8218787841446003e-05, "loss": 0.0004, "step": 12100 }, { "epoch": 0.98, "grad_norm": 0.07260189205408096, "learning_rate": 1.8189910667480476e-05, "loss": 0.0015, "step": 12200 }, { "epoch": 0.99, "grad_norm": 0.0022115109022706747, "learning_rate": 1.8160824565240495e-05, "loss": 0.0029, "step": 12300 }, { "epoch": 1.0, "grad_norm": 0.020148636773228645, "learning_rate": 1.8131530276725514e-05, "loss": 0.0015, "step": 12400 }, { "epoch": 1.0, "grad_norm": 0.2839891314506531, "learning_rate": 1.8102028549245894e-05, "loss": 0.0015, "step": 12500 }, { "epoch": 1.01, "grad_norm": 0.004473233129829168, "learning_rate": 1.8072320135403862e-05, "loss": 0.0012, "step": 12600 }, { "epoch": 1.02, "grad_norm": 0.0016079695196822286, "learning_rate": 1.804240579307431e-05, "loss": 0.0009, "step": 12700 }, { "epoch": 1.03, "grad_norm": 0.015555166639387608, "learning_rate": 1.8012286285385456e-05, "loss": 0.0015, "step": 12800 }, { "epoch": 1.04, "grad_norm": 0.014301074668765068, "learning_rate": 1.7981962380699376e-05, "loss": 0.0006, "step": 12900 }, { "epoch": 1.04, "grad_norm": 0.005016946699470282, "learning_rate": 1.7951434852592406e-05, "loss": 0.0008, "step": 13000 }, { "epoch": 1.05, "grad_norm": 0.006782655604183674, "learning_rate": 1.79207044798354e-05, "loss": 0.0004, "step": 13100 }, { "epoch": 1.06, "grad_norm": 0.006875937804579735, "learning_rate": 1.788977204637388e-05, "loss": 0.0027, "step": 13200 }, { "epoch": 1.07, "grad_norm": 0.007184536661952734, "learning_rate": 1.7858638341308026e-05, "loss": 0.0013, "step": 13300 }, { "epoch": 1.08, "grad_norm": 0.0019093825249001384, "learning_rate": 1.7827304158872538e-05, "loss": 0.0014, "step": 13400 }, { "epoch": 1.08, "grad_norm": 1.7438774108886719, "learning_rate": 1.779577029841638e-05, "loss": 0.0027, "step": 13500 }, { "epoch": 1.09, "grad_norm": 0.0496426597237587, "learning_rate": 1.776403756438241e-05, "loss": 0.0016, "step": 13600 }, { "epoch": 1.1, "grad_norm": 0.005185109097510576, "learning_rate": 1.773210676628682e-05, "loss": 0.0004, "step": 13700 }, { "epoch": 1.11, "grad_norm": 0.0010710001224651933, "learning_rate": 1.769997871869852e-05, "loss": 0.0009, "step": 13800 }, { "epoch": 1.12, "grad_norm": 0.013586360029876232, "learning_rate": 1.7667654241218332e-05, "loss": 0.0028, "step": 13900 }, { "epoch": 1.12, "grad_norm": 0.021679196506738663, "learning_rate": 1.7635134158458095e-05, "loss": 0.0011, "step": 14000 }, { "epoch": 1.13, "grad_norm": 0.2464137077331543, "learning_rate": 1.7602419300019627e-05, "loss": 0.0014, "step": 14100 }, { "epoch": 1.14, "grad_norm": 0.0015559865860268474, "learning_rate": 1.7569510500473566e-05, "loss": 0.0003, "step": 14200 }, { "epoch": 1.15, "grad_norm": 0.007820005528628826, "learning_rate": 1.753640859933806e-05, "loss": 0.001, "step": 14300 }, { "epoch": 1.16, "grad_norm": 0.0021381524857133627, "learning_rate": 1.7503114441057374e-05, "loss": 0.0019, "step": 14400 }, { "epoch": 1.16, "grad_norm": 0.004830517340451479, "learning_rate": 1.746962887498034e-05, "loss": 0.0006, "step": 14500 }, { "epoch": 1.17, "grad_norm": 0.004048046190291643, "learning_rate": 1.743595275533869e-05, "loss": 0.001, "step": 14600 }, { "epoch": 1.18, "grad_norm": 0.0010604397393763065, "learning_rate": 1.7402086941225246e-05, "loss": 0.0007, "step": 14700 }, { "epoch": 1.19, "grad_norm": 0.10798583179712296, "learning_rate": 1.736803229657204e-05, "loss": 0.0016, "step": 14800 }, { "epoch": 1.2, "grad_norm": 0.00545172905549407, "learning_rate": 1.7333789690128252e-05, "loss": 0.0003, "step": 14900 }, { "epoch": 1.2, "grad_norm": 0.051401443779468536, "learning_rate": 1.7299359995438046e-05, "loss": 0.0007, "step": 15000 }, { "epoch": 1.21, "grad_norm": 0.0022059613838791847, "learning_rate": 1.7264744090818284e-05, "loss": 0.0009, "step": 15100 }, { "epoch": 1.22, "grad_norm": 0.0031678322702646255, "learning_rate": 1.7229942859336142e-05, "loss": 0.0003, "step": 15200 }, { "epoch": 1.23, "grad_norm": 0.4575800597667694, "learning_rate": 1.719495718878655e-05, "loss": 0.0008, "step": 15300 }, { "epoch": 1.24, "grad_norm": 0.0009642325458116829, "learning_rate": 1.7159787971669586e-05, "loss": 0.001, "step": 15400 }, { "epoch": 1.24, "grad_norm": 0.0024611325934529305, "learning_rate": 1.712443610516765e-05, "loss": 0.0007, "step": 15500 }, { "epoch": 1.25, "grad_norm": 0.007828389294445515, "learning_rate": 1.7088902491122636e-05, "loss": 0.0003, "step": 15600 }, { "epoch": 1.26, "grad_norm": 0.0010621993569657207, "learning_rate": 1.7053188036012885e-05, "loss": 0.0022, "step": 15700 }, { "epoch": 1.27, "grad_norm": 0.003037663409486413, "learning_rate": 1.7017293650930083e-05, "loss": 0.0011, "step": 15800 }, { "epoch": 1.28, "grad_norm": 0.021871333941817284, "learning_rate": 1.6981220251555996e-05, "loss": 0.0005, "step": 15900 }, { "epoch": 1.28, "grad_norm": 0.2676248848438263, "learning_rate": 1.6944968758139144e-05, "loss": 0.0012, "step": 16000 }, { "epoch": 1.29, "grad_norm": 0.002139603951945901, "learning_rate": 1.6908540095471288e-05, "loss": 0.0016, "step": 16100 }, { "epoch": 1.3, "grad_norm": 0.00880183931440115, "learning_rate": 1.6871935192863862e-05, "loss": 0.0007, "step": 16200 }, { "epoch": 1.31, "grad_norm": 0.006027602590620518, "learning_rate": 1.6835154984124266e-05, "loss": 0.0027, "step": 16300 }, { "epoch": 1.32, "grad_norm": 0.002160316100344062, "learning_rate": 1.6798200407532025e-05, "loss": 0.0004, "step": 16400 }, { "epoch": 1.32, "grad_norm": 0.004634434822946787, "learning_rate": 1.676107240581488e-05, "loss": 0.0005, "step": 16500 }, { "epoch": 1.33, "grad_norm": 0.013142119161784649, "learning_rate": 1.6723771926124704e-05, "loss": 0.0012, "step": 16600 }, { "epoch": 1.34, "grad_norm": 0.0017228337237611413, "learning_rate": 1.6686299920013388e-05, "loss": 0.001, "step": 16700 }, { "epoch": 1.35, "grad_norm": 0.011891527101397514, "learning_rate": 1.6648657343408517e-05, "loss": 0.0004, "step": 16800 }, { "epoch": 1.36, "grad_norm": 0.03485196456313133, "learning_rate": 1.661084515658901e-05, "loss": 0.0008, "step": 16900 }, { "epoch": 1.36, "grad_norm": 0.006045693065971136, "learning_rate": 1.6572864324160617e-05, "loss": 0.0016, "step": 17000 }, { "epoch": 1.37, "grad_norm": 0.0319632813334465, "learning_rate": 1.6534715815031325e-05, "loss": 0.001, "step": 17100 }, { "epoch": 1.38, "grad_norm": 0.000651439419016242, "learning_rate": 1.649640060238661e-05, "loss": 0.0008, "step": 17200 }, { "epoch": 1.39, "grad_norm": 0.013289058580994606, "learning_rate": 1.645791966366464e-05, "loss": 0.0004, "step": 17300 }, { "epoch": 1.4, "grad_norm": 1.1903555393218994, "learning_rate": 1.6419273980531333e-05, "loss": 0.0009, "step": 17400 }, { "epoch": 1.4, "grad_norm": 0.0004181715485174209, "learning_rate": 1.63804645388553e-05, "loss": 0.0008, "step": 17500 }, { "epoch": 1.41, "grad_norm": 0.0005597823183052242, "learning_rate": 1.6341492328682703e-05, "loss": 0.0015, "step": 17600 }, { "epoch": 1.42, "grad_norm": 0.003250558627769351, "learning_rate": 1.6302358344212025e-05, "loss": 0.0015, "step": 17700 }, { "epoch": 1.43, "grad_norm": 0.00633718678727746, "learning_rate": 1.6263063583768652e-05, "loss": 0.0007, "step": 17800 }, { "epoch": 1.44, "grad_norm": 0.0012848949991166592, "learning_rate": 1.622360904977946e-05, "loss": 0.001, "step": 17900 }, { "epoch": 1.44, "grad_norm": 0.03510229289531708, "learning_rate": 1.6183995748747204e-05, "loss": 0.001, "step": 18000 }, { "epoch": 1.45, "grad_norm": 0.00044461427023634315, "learning_rate": 1.6144224691224868e-05, "loss": 0.0005, "step": 18100 }, { "epoch": 1.46, "grad_norm": 0.056695085018873215, "learning_rate": 1.6104296891789867e-05, "loss": 0.0011, "step": 18200 }, { "epoch": 1.47, "grad_norm": 0.002499540336430073, "learning_rate": 1.606421336901818e-05, "loss": 0.0005, "step": 18300 }, { "epoch": 1.48, "grad_norm": 0.00433159526437521, "learning_rate": 1.6023975145458352e-05, "loss": 0.0007, "step": 18400 }, { "epoch": 1.48, "grad_norm": 0.0020198116544634104, "learning_rate": 1.5983583247605414e-05, "loss": 0.0005, "step": 18500 }, { "epoch": 1.49, "grad_norm": 0.004110960755497217, "learning_rate": 1.5943038705874697e-05, "loss": 0.001, "step": 18600 }, { "epoch": 1.5, "grad_norm": 0.0036564720794558525, "learning_rate": 1.590234255457555e-05, "loss": 0.0014, "step": 18700 }, { "epoch": 1.51, "grad_norm": 0.0010110210860148072, "learning_rate": 1.5861495831884942e-05, "loss": 0.0008, "step": 18800 }, { "epoch": 1.52, "grad_norm": 0.11147645115852356, "learning_rate": 1.582049957982099e-05, "loss": 0.0004, "step": 18900 }, { "epoch": 1.52, "grad_norm": 0.002563257934525609, "learning_rate": 1.5779354844216377e-05, "loss": 0.0003, "step": 19000 }, { "epoch": 1.53, "grad_norm": 0.002472821157425642, "learning_rate": 1.5738062674691657e-05, "loss": 0.0003, "step": 19100 }, { "epoch": 1.54, "grad_norm": 0.08172155171632767, "learning_rate": 1.5696624124628495e-05, "loss": 0.0005, "step": 19200 }, { "epoch": 1.55, "grad_norm": 0.009228968061506748, "learning_rate": 1.5655040251142787e-05, "loss": 0.0008, "step": 19300 }, { "epoch": 1.56, "grad_norm": 0.027170058339834213, "learning_rate": 1.5613312115057697e-05, "loss": 0.0002, "step": 19400 }, { "epoch": 1.57, "grad_norm": 0.003953781444579363, "learning_rate": 1.5571440780876588e-05, "loss": 0.0009, "step": 19500 }, { "epoch": 1.57, "grad_norm": 0.2073652297258377, "learning_rate": 1.5529427316755876e-05, "loss": 0.001, "step": 19600 }, { "epoch": 1.58, "grad_norm": 0.000581290340051055, "learning_rate": 1.548727279447777e-05, "loss": 0.0007, "step": 19700 }, { "epoch": 1.59, "grad_norm": 0.001082689268514514, "learning_rate": 1.5444978289422937e-05, "loss": 0.0011, "step": 19800 }, { "epoch": 1.6, "grad_norm": 0.020316652953624725, "learning_rate": 1.540254488054307e-05, "loss": 0.0007, "step": 19900 }, { "epoch": 1.61, "grad_norm": 0.0026674780528992414, "learning_rate": 1.5359973650333352e-05, "loss": 0.0006, "step": 20000 }, { "epoch": 1.61, "grad_norm": 0.007332425098866224, "learning_rate": 1.5317265684804865e-05, "loss": 0.001, "step": 20100 }, { "epoch": 1.62, "grad_norm": 0.0030178299639374018, "learning_rate": 1.5274422073456853e-05, "loss": 0.0002, "step": 20200 }, { "epoch": 1.63, "grad_norm": 0.0009229824645444751, "learning_rate": 1.5231443909248956e-05, "loss": 0.0006, "step": 20300 }, { "epoch": 1.64, "grad_norm": 0.000619510596152395, "learning_rate": 1.5188332288573313e-05, "loss": 0.0011, "step": 20400 }, { "epoch": 1.65, "grad_norm": 0.0273954626172781, "learning_rate": 1.5145088311226599e-05, "loss": 0.0004, "step": 20500 }, { "epoch": 1.65, "grad_norm": 0.009902927093207836, "learning_rate": 1.510171308038197e-05, "loss": 0.0006, "step": 20600 }, { "epoch": 1.66, "grad_norm": 0.012104487977921963, "learning_rate": 1.5058207702560907e-05, "loss": 0.0004, "step": 20700 }, { "epoch": 1.67, "grad_norm": 0.8639243245124817, "learning_rate": 1.501457328760501e-05, "loss": 0.0007, "step": 20800 }, { "epoch": 1.68, "grad_norm": 0.07374394685029984, "learning_rate": 1.4970810948647664e-05, "loss": 0.0007, "step": 20900 }, { "epoch": 1.69, "grad_norm": 0.0010848238598555326, "learning_rate": 1.4926921802085662e-05, "loss": 0.0008, "step": 21000 }, { "epoch": 1.69, "grad_norm": 0.03615230694413185, "learning_rate": 1.4882906967550708e-05, "loss": 0.0002, "step": 21100 }, { "epoch": 1.7, "grad_norm": 0.00893151480704546, "learning_rate": 1.4838767567880865e-05, "loss": 0.0012, "step": 21200 }, { "epoch": 1.71, "grad_norm": 0.0023355227895081043, "learning_rate": 1.479450472909191e-05, "loss": 0.0023, "step": 21300 }, { "epoch": 1.72, "grad_norm": 0.7856515645980835, "learning_rate": 1.4750119580348601e-05, "loss": 0.0008, "step": 21400 }, { "epoch": 1.73, "grad_norm": 0.0007943073869682848, "learning_rate": 1.4705613253935886e-05, "loss": 0.0002, "step": 21500 }, { "epoch": 1.73, "grad_norm": 0.000991353183053434, "learning_rate": 1.4660986885230002e-05, "loss": 0.0005, "step": 21600 }, { "epoch": 1.74, "grad_norm": 0.0023457545321434736, "learning_rate": 1.4616241612669523e-05, "loss": 0.0035, "step": 21700 }, { "epoch": 1.75, "grad_norm": 0.0007867334061302245, "learning_rate": 1.4571378577726317e-05, "loss": 0.0006, "step": 21800 }, { "epoch": 1.76, "grad_norm": 0.0013897059252485633, "learning_rate": 1.4526398924876407e-05, "loss": 0.0005, "step": 21900 }, { "epoch": 1.77, "grad_norm": 0.0020729138050228357, "learning_rate": 1.4481303801570805e-05, "loss": 0.0009, "step": 22000 }, { "epoch": 1.77, "grad_norm": 0.002158819232136011, "learning_rate": 1.4436094358206224e-05, "loss": 0.0005, "step": 22100 }, { "epoch": 1.78, "grad_norm": 0.0038599702529609203, "learning_rate": 1.4390771748095735e-05, "loss": 0.0007, "step": 22200 }, { "epoch": 1.79, "grad_norm": 0.0022886607330292463, "learning_rate": 1.4345337127439333e-05, "loss": 0.0002, "step": 22300 }, { "epoch": 1.8, "grad_norm": 0.011882874183356762, "learning_rate": 1.4299791655294461e-05, "loss": 0.0021, "step": 22400 }, { "epoch": 1.81, "grad_norm": 0.011625411920249462, "learning_rate": 1.4254136493546432e-05, "loss": 0.0003, "step": 22500 }, { "epoch": 1.81, "grad_norm": 0.004132369067519903, "learning_rate": 1.4208372806878782e-05, "loss": 0.0004, "step": 22600 }, { "epoch": 1.82, "grad_norm": 0.0008625888731330633, "learning_rate": 1.4162501762743579e-05, "loss": 0.0013, "step": 22700 }, { "epoch": 1.83, "grad_norm": 0.02038603462278843, "learning_rate": 1.4116524531331616e-05, "loss": 0.001, "step": 22800 }, { "epoch": 1.84, "grad_norm": 0.021469444036483765, "learning_rate": 1.4070442285542579e-05, "loss": 0.0004, "step": 22900 }, { "epoch": 1.85, "grad_norm": 0.7221536040306091, "learning_rate": 1.402425620095511e-05, "loss": 0.0007, "step": 23000 }, { "epoch": 1.85, "grad_norm": 0.0006885859766043723, "learning_rate": 1.3977967455796828e-05, "loss": 0.0009, "step": 23100 }, { "epoch": 1.86, "grad_norm": 0.0010358589934185147, "learning_rate": 1.393157723091428e-05, "loss": 0.0003, "step": 23200 }, { "epoch": 1.87, "grad_norm": 0.0008139715064316988, "learning_rate": 1.3885086709742788e-05, "loss": 0.0005, "step": 23300 }, { "epoch": 1.88, "grad_norm": 0.6740529537200928, "learning_rate": 1.3838497078276288e-05, "loss": 0.0018, "step": 23400 }, { "epoch": 1.89, "grad_norm": 0.0022279045078903437, "learning_rate": 1.3791809525037057e-05, "loss": 0.0005, "step": 23500 }, { "epoch": 1.89, "grad_norm": 0.0020128132309764624, "learning_rate": 1.3745025241045414e-05, "loss": 0.0002, "step": 23600 }, { "epoch": 1.9, "grad_norm": 0.000531968311406672, "learning_rate": 1.3698145419789302e-05, "loss": 0.0007, "step": 23700 }, { "epoch": 1.91, "grad_norm": 0.00039000410470180213, "learning_rate": 1.3651171257193883e-05, "loss": 0.0006, "step": 23800 }, { "epoch": 1.92, "grad_norm": 0.041364409029483795, "learning_rate": 1.3604103951590993e-05, "loss": 0.0003, "step": 23900 }, { "epoch": 1.93, "grad_norm": 0.0011203879257664084, "learning_rate": 1.3556944703688592e-05, "loss": 0.0003, "step": 24000 }, { "epoch": 1.93, "grad_norm": 0.001194770447909832, "learning_rate": 1.3509694716540135e-05, "loss": 0.0002, "step": 24100 }, { "epoch": 1.94, "grad_norm": 0.002378718461841345, "learning_rate": 1.3462355195513868e-05, "loss": 0.0006, "step": 24200 }, { "epoch": 1.95, "grad_norm": 0.32328903675079346, "learning_rate": 1.341492734826209e-05, "loss": 0.001, "step": 24300 }, { "epoch": 1.96, "grad_norm": 0.002817463595420122, "learning_rate": 1.3367412384690346e-05, "loss": 0.0017, "step": 24400 }, { "epoch": 1.97, "grad_norm": 0.0024801292456686497, "learning_rate": 1.3319811516926541e-05, "loss": 0.0005, "step": 24500 }, { "epoch": 1.97, "grad_norm": 0.004569421522319317, "learning_rate": 1.3272125959290059e-05, "loss": 0.0008, "step": 24600 }, { "epoch": 1.98, "grad_norm": 0.1658785045146942, "learning_rate": 1.3224356928260735e-05, "loss": 0.0005, "step": 24700 }, { "epoch": 1.99, "grad_norm": 0.001219902653247118, "learning_rate": 1.317650564244787e-05, "loss": 0.0005, "step": 24800 }, { "epoch": 2.0, "grad_norm": 0.002142696175724268, "learning_rate": 1.3128573322559097e-05, "loss": 0.0022, "step": 24900 } ], "logging_steps": 100, "max_steps": 62300, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.706415322300416e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }